Check in pre-generated perlasm and error data files

This adds a tool for managing pre-generated files, aligning our CMake
and non-CMake builds. The plan is roughly:

The source of truth for the file lists will (eventually) be build.json.
This describes the build in terms of the files that we directly edit.

However, we have a two-phase build. First, a pregeneration step
transforms some of the less convenient inputs into checked-in files.
Notably, perlasm files get expanded. This produces an equivalent JSON
structure with fewer inputs. The same tool then outputs that structure
into whatever build systems we want.
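
For example, a single perlasm entry in build.json such as

    {"src": "crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl"}

expands into one checked-in file per target platform. Roughly (the
-apple.S and -linux.S outputs are added in this change; the -win.asm
name is inferred from the old perlasm.cmake convention):

    gen/bcm/aesni-gcm-x86_64-apple.S
    gen/bcm/aesni-gcm-x86_64-linux.S
    gen/bcm/aesni-gcm-x86_64-win.asm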

This initial version pre-generates err_data.c and perlasm files. I've
not wired up the various build formats, except for CMake (for the CMake
build to consume) and JSON (for generate_build_files.py to parse).
build.json is also, for now, only a subset of the build. Later changes
will expand it to cover the rest.
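
As a rough sketch of how the CMake side fits together (mirroring the
CMakeLists.txt changes below; the variable names are defined by the
generated gen/sources.cmake):

    include(gen/sources.cmake)
    if(OPENSSL_ASM)
      list(APPEND CRYPTO_SOURCES_ASM_USED ${CRYPTO_SOURCES_ASM})
    endif()
    if(OPENSSL_NASM)
      list(APPEND CRYPTO_SOURCES_ASM_USED ${CRYPTO_SOURCES_NASM})
    endif()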

The upshot of all this is that we no longer have a Perl build
dependency! Perl is now only needed when working on BoringSSL. It nearly
removes the Go dependency too, but Go is still needed to run and (for
now) build the tests.

To keep the generated files up-to-date, once this lands, I'll update our
CI to run `go run ./util/pregenerate -check`, which asserts that all
generated files are correct. From there we can land the later changes in
this patch series that use this more extensively. My eventual goal is to
replace generate_build_files.py and the "master-with-bazel" branch
altogether. Instead, we'll just have sources.bzl, sources.gni, etc.
checked into the tree directly, and the normal branch will have both
CMake and Bazel builds in it.

Update-Note: generate_build_files.py no longer generates assembly files
or err_data.c. Those are now checked into the tree directly.

Bug: 542
Change-Id: I71f5ff7417be811f8b7888b345279474e6b38ee9
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/67288
Reviewed-by: Bob Beck <bbe@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/BUILDING.md b/BUILDING.md
index fb28e89..e10d964 100644
--- a/BUILDING.md
+++ b/BUILDING.md
@@ -16,15 +16,6 @@
 
   * [CMake](https://cmake.org/download/) 3.12 or later is required.
 
-  * A recent version of Perl is required. On Windows,
-    [Active State Perl](http://www.activestate.com/activeperl/) has been
-    reported to work, as has MSYS Perl.
-    [Strawberry Perl](http://strawberryperl.com/) also works but it adds GCC
-    to `PATH`, which can confuse some build tools when identifying the compiler
-    (removing `C:\Strawberry\c\bin` from `PATH` should resolve any problems).
-    If Perl is not found by CMake, it may be configured explicitly by setting
-    `PERL_EXECUTABLE`.
-
   * Building with [Ninja](https://ninja-build.org/) instead of Make is
     recommended, because it makes builds faster. On Windows, CMake's Visual
    Studio generator may also work, but it is not tested regularly and requires
@@ -211,3 +202,17 @@
 
 Both sets of tests may also be run with `ninja -C build run_tests`, but CMake
 3.2 or later is required to avoid Ninja's output buffering.
+
+# Pre-generated Files
+
+If modifying perlasm files or `util/pregenerate/build.json`, you will need to
+run `go run ./util/pregenerate` to refresh some pre-generated files. To do
+this, you will need a recent version of Perl.
+
+On Windows, [Active State Perl](http://www.activestate.com/activeperl/) has been
+reported to work, as has MSYS Perl.
+[Strawberry Perl](http://strawberryperl.com/) also works but it adds GCC
+to `PATH`, which can confuse some build tools when identifying the compiler
+(removing `C:\Strawberry\c\bin` from `PATH` should resolve any problems).
+
+See [gen/README.md](./gen/README.md) for more details.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 59623e0..1410c43 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -18,7 +18,7 @@
 include(sources.cmake)
 include(cmake/go.cmake)
 include(cmake/paths.cmake)
-include(cmake/perlasm.cmake)
+include(gen/sources.cmake)
 
 enable_language(C)
 enable_language(CXX)
@@ -43,8 +43,6 @@
   endif()
 endfunction()
 
-find_package(Perl REQUIRED)
-
 if(CMAKE_SYSTEM_NAME STREQUAL "Linux" AND NOT CMAKE_CROSSCOMPILING)
   find_package(PkgConfig QUIET)
   if (PkgConfig_FOUND)
@@ -530,7 +528,13 @@
 target_link_libraries(decrepit crypto ssl)
 
 add_library(test_support_lib STATIC ${TEST_SUPPORT_SOURCES})
-if (LIBUNWIND_FOUND)
+if(OPENSSL_ASM)
+  target_sources(test_support_lib PRIVATE ${TEST_SUPPORT_SOURCES_ASM})
+endif()
+if(OPENSSL_NASM)
+  target_sources(test_support_lib PRIVATE ${TEST_SUPPORT_SOURCES_NASM})
+endif()
+if(LIBUNWIND_FOUND)
   target_compile_options(test_support_lib PRIVATE ${LIBUNWIND_CFLAGS_OTHER})
   target_include_directories(test_support_lib PRIVATE ${LIBUNWIND_INCLUDE_DIRS})
   target_link_libraries(test_support_lib ${LIBUNWIND_LDFLAGS})
diff --git a/build.json b/build.json
new file mode 100644
index 0000000..0bf49a7
--- /dev/null
+++ b/build.json
@@ -0,0 +1,136 @@
+// This file defines BoringSSL's build, expressed in terms of the input source
+// files that BoringSSL developers edit. It is a JSON file with line comments;
+// the comments are removed before parsing. It drives ./util/pregenerate, which
+// converts some of those inputs (e.g. perlasm files) into pre-generated
+// outputs. This produces a simpler build, which is then converted into
+// build files of various syntaxes.
+//
+// When modifying this file, run `go run ./util/pregenerate`. See gen/README.md
+// for more details, and util/pregenerate/build.go for the schema.
+//
+// TODO(crbug.com/boringssl/542): Moving build inputs to this file is still work
+// in progress, so this file is currently incomplete.
+{
+    "bcm": {
+        "perlasm_aarch64": [
+            {"src": "crypto/fipsmodule/aes/asm/aesv8-armx.pl", "dst": "aesv8-armv8"},
+            {"src": "crypto/fipsmodule/modes/asm/aesv8-gcm-armv8.pl"},
+            {"src": "crypto/fipsmodule/bn/asm/armv8-mont.pl"},
+            {"src": "crypto/fipsmodule/bn/asm/bn-armv8.pl"},
+            {"src": "crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl"},
+            {"src": "crypto/fipsmodule/modes/asm/ghashv8-armx.pl", "dst": "ghashv8-armv8"},
+            {"src": "crypto/fipsmodule/ec/asm/p256_beeu-armv8-asm.pl"},
+            {"src": "crypto/fipsmodule/ec/asm/p256-armv8-asm.pl"},
+            {"src": "crypto/fipsmodule/sha/asm/sha1-armv8.pl"},
+            {"src": "crypto/fipsmodule/sha/asm/sha512-armv8.pl", "args": ["sha256"], "dst": "sha256-armv8"},
+            {"src": "crypto/fipsmodule/sha/asm/sha512-armv8.pl", "args": ["sha512"]},
+            {"src": "crypto/fipsmodule/aes/asm/vpaes-armv8.pl"}
+        ],
+        "perlasm_arm": [
+            {"src": "crypto/fipsmodule/aes/asm/aesv8-armx.pl", "dst": "aesv8-armv7"},
+            {"src": "crypto/fipsmodule/bn/asm/armv4-mont.pl"},
+            {"src": "crypto/fipsmodule/aes/asm/bsaes-armv7.pl"},
+            {"src": "crypto/fipsmodule/modes/asm/ghash-armv4.pl"},
+            {"src": "crypto/fipsmodule/modes/asm/ghashv8-armx.pl", "dst": "ghashv8-armv7"},
+            {"src": "crypto/fipsmodule/sha/asm/sha1-armv4-large.pl"},
+            {"src": "crypto/fipsmodule/sha/asm/sha256-armv4.pl"},
+            {"src": "crypto/fipsmodule/sha/asm/sha512-armv4.pl"},
+            {"src": "crypto/fipsmodule/aes/asm/vpaes-armv7.pl"}
+        ],
+        "perlasm_x86": [
+            {"src": "crypto/fipsmodule/aes/asm/aesni-x86.pl"},
+            {"src": "crypto/fipsmodule/bn/asm/bn-586.pl"},
+            {"src": "crypto/fipsmodule/bn/asm/co-586.pl"},
+            {"src": "crypto/fipsmodule/modes/asm/ghash-ssse3-x86.pl"},
+            {"src": "crypto/fipsmodule/modes/asm/ghash-x86.pl"},
+            {"src": "crypto/fipsmodule/md5/asm/md5-586.pl"},
+            {"src": "crypto/fipsmodule/sha/asm/sha1-586.pl"},
+            {"src": "crypto/fipsmodule/sha/asm/sha256-586.pl"},
+            {"src": "crypto/fipsmodule/sha/asm/sha512-586.pl"},
+            {"src": "crypto/fipsmodule/aes/asm/vpaes-x86.pl"},
+            {"src": "crypto/fipsmodule/bn/asm/x86-mont.pl"}
+        ],
+        "perlasm_x86_64": [
+            {"src": "crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl"},
+            {"src": "crypto/fipsmodule/aes/asm/aesni-x86_64.pl"},
+            {"src": "crypto/fipsmodule/modes/asm/ghash-ssse3-x86_64.pl"},
+            {"src": "crypto/fipsmodule/modes/asm/ghash-x86_64.pl"},
+            {"src": "crypto/fipsmodule/md5/asm/md5-x86_64.pl"},
+            {"src": "crypto/fipsmodule/ec/asm/p256_beeu-x86_64-asm.pl"},
+            {"src": "crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl"},
+            {"src": "crypto/fipsmodule/rand/asm/rdrand-x86_64.pl"},
+            {"src": "crypto/fipsmodule/bn/asm/rsaz-avx2.pl"},
+            {"src": "crypto/fipsmodule/sha/asm/sha1-x86_64.pl"},
+            {"src": "crypto/fipsmodule/sha/asm/sha512-x86_64.pl", "args": ["sha256"], "dst": "sha256-x86_64"},
+            {"src": "crypto/fipsmodule/sha/asm/sha512-x86_64.pl", "args": ["sha512"]},
+            {"src": "crypto/fipsmodule/aes/asm/vpaes-x86_64.pl"},
+            {"src": "crypto/fipsmodule/bn/asm/x86_64-mont.pl"},
+            {"src": "crypto/fipsmodule/bn/asm/x86_64-mont5.pl"}
+        ]
+    },
+    "crypto": {
+        "err_data": [
+            "crypto/err/asn1.errordata",
+            "crypto/err/bio.errordata",
+            "crypto/err/bn.errordata",
+            "crypto/err/cipher.errordata",
+            "crypto/err/conf.errordata",
+            "crypto/err/dh.errordata",
+            "crypto/err/digest.errordata",
+            "crypto/err/dsa.errordata",
+            "crypto/err/ecdh.errordata",
+            "crypto/err/ecdsa.errordata",
+            "crypto/err/ec.errordata",
+            "crypto/err/engine.errordata",
+            "crypto/err/evp.errordata",
+            "crypto/err/hkdf.errordata",
+            "crypto/err/obj.errordata",
+            "crypto/err/pem.errordata",
+            "crypto/err/pkcs7.errordata",
+            "crypto/err/pkcs8.errordata",
+            "crypto/err/rsa.errordata",
+            "crypto/err/ssl.errordata",
+            "crypto/err/trust_token.errordata",
+            "crypto/err/x509.errordata",
+            "crypto/err/x509v3.errordata"
+        ],
+        "asm": [
+            "crypto/curve25519/asm/x25519-asm-arm.S",
+            "crypto/hrss/asm/poly_rq_mul.S",
+            "crypto/poly1305/poly1305_arm_asm.S",
+            "third_party/fiat/asm/fiat_curve25519_adx_mul.S",
+            "third_party/fiat/asm/fiat_curve25519_adx_square.S",
+            "third_party/fiat/asm/fiat_p256_adx_mul.S",
+            "third_party/fiat/asm/fiat_p256_adx_sqr.S"
+        ],
+        "perlasm_aarch64": [
+            {"src": "crypto/chacha/asm/chacha-armv8.pl"},
+            {"src": "crypto/cipher_extra/asm/chacha20_poly1305_armv8.pl"}
+        ],
+        "perlasm_arm": [
+            {"src": "crypto/chacha/asm/chacha-armv4.pl"}
+        ],
+        "perlasm_x86": [
+            {"src": "crypto/chacha/asm/chacha-x86.pl"}
+        ],
+        "perlasm_x86_64": [
+            {"src": "crypto/chacha/asm/chacha-x86_64.pl"},
+            {"src": "crypto/cipher_extra/asm/aes128gcmsiv-x86_64.pl"},
+            {"src": "crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl"}
+        ]
+    },
+    "test_support": {
+        "perlasm_aarch64": [
+            {"src": "crypto/test/asm/trampoline-armv8.pl"}
+        ],
+        "perlasm_arm": [
+            {"src": "crypto/test/asm/trampoline-armv4.pl"}
+        ],
+        "perlasm_x86": [
+            {"src": "crypto/test/asm/trampoline-x86.pl"}
+        ],
+        "perlasm_x86_64": [
+            {"src": "crypto/test/asm/trampoline-x86_64.pl"}
+        ]
+    }
+}
diff --git a/cmake/perlasm.cmake b/cmake/perlasm.cmake
deleted file mode 100644
index 17a47b9..0000000
--- a/cmake/perlasm.cmake
+++ /dev/null
@@ -1,57 +0,0 @@
-macro(append_to_parent_scope var)
-  list(APPEND ${var} ${ARGN})
-  set(${var} "${${var}}" PARENT_SCOPE)
-endmacro()
-
-function(add_perlasm_target dest src)
-  get_filename_component(dir ${dest} DIRECTORY)
-  if(dir STREQUAL "")
-    set(dir ".")
-  endif()
-
-  add_custom_command(
-    OUTPUT ${dest}
-    COMMAND ${CMAKE_COMMAND} -E make_directory ${dir}
-    COMMAND ${PERL_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/${src} ${ARGN}
-            ${dest}
-    DEPENDS
-    ${src}
-    ${PROJECT_SOURCE_DIR}/crypto/perlasm/arm-xlate.pl
-    ${PROJECT_SOURCE_DIR}/crypto/perlasm/x86_64-xlate.pl
-    ${PROJECT_SOURCE_DIR}/crypto/perlasm/x86asm.pl
-    ${PROJECT_SOURCE_DIR}/crypto/perlasm/x86gas.pl
-    ${PROJECT_SOURCE_DIR}/crypto/perlasm/x86masm.pl
-    ${PROJECT_SOURCE_DIR}/crypto/perlasm/x86nasm.pl
-    WORKING_DIRECTORY .
-  )
-endfunction()
-
-# perlasm generates perlasm output from a given file. arch specifies the
-# architecture. dest specifies the basename of the output file. The list of
-# generated files will be appended to ${var}_ASM and ${var}_NASM depending on
-# the assembler used. Extra arguments are passed to the perlasm script.
-function(perlasm var arch dest src)
-  if(arch STREQUAL "aarch64")
-    add_perlasm_target("${dest}-apple.S" ${src} ios64 ${ARGN})
-    add_perlasm_target("${dest}-linux.S" ${src} linux64 ${ARGN})
-    add_perlasm_target("${dest}-win.S" ${src} win64 ${ARGN})
-    append_to_parent_scope("${var}_ASM" "${dest}-apple.S" "${dest}-linux.S" "${dest}-win.S")
-  elseif(arch STREQUAL "arm")
-    add_perlasm_target("${dest}-linux.S" ${src} linux32 ${ARGN})
-    append_to_parent_scope("${var}_ASM" "${dest}-linux.S")
-  elseif(arch STREQUAL "x86")
-    add_perlasm_target("${dest}-apple.S" ${src} macosx -fPIC ${ARGN})
-    add_perlasm_target("${dest}-linux.S" ${src} elf -fPIC ${ARGN})
-    add_perlasm_target("${dest}-win.asm" ${src} win32n ${ARGN})
-    append_to_parent_scope("${var}_ASM" "${dest}-apple.S" "${dest}-linux.S")
-    append_to_parent_scope("${var}_NASM" "${dest}-win.asm")
-  elseif(arch STREQUAL "x86_64")
-    add_perlasm_target("${dest}-apple.S" ${src} macosx ${ARGN})
-    add_perlasm_target("${dest}-linux.S" ${src} elf ${ARGN})
-    add_perlasm_target("${dest}-win.asm" ${src} nasm ${ARGN})
-    append_to_parent_scope("${var}_ASM" "${dest}-apple.S" "${dest}-linux.S")
-    append_to_parent_scope("${var}_NASM" "${dest}-win.asm")
-  else()
-    message(FATAL_ERROR "Unknown perlasm architecture: $arch")
-  endif()
-endfunction()
diff --git a/crypto/CMakeLists.txt b/crypto/CMakeLists.txt
index bc32ef0..dbed8cb 100644
--- a/crypto/CMakeLists.txt
+++ b/crypto/CMakeLists.txt
@@ -1,72 +1,9 @@
-set(
-  CRYPTO_SOURCES_ASM
-  curve25519/asm/x25519-asm-arm.S
-  hrss/asm/poly_rq_mul.S
-  poly1305/poly1305_arm_asm.S
-  ../third_party/fiat/asm/fiat_curve25519_adx_mul.S
-  ../third_party/fiat/asm/fiat_curve25519_adx_square.S
-  ../third_party/fiat/asm/fiat_p256_adx_mul.S
-  ../third_party/fiat/asm/fiat_p256_adx_sqr.S
-)
-perlasm(CRYPTO_SOURCES aarch64 chacha/chacha-armv8 chacha/asm/chacha-armv8.pl)
-perlasm(CRYPTO_SOURCES aarch64 cipher_extra/chacha20_poly1305_armv8 cipher_extra/asm/chacha20_poly1305_armv8.pl)
-perlasm(CRYPTO_SOURCES aarch64 test/trampoline-armv8 test/asm/trampoline-armv8.pl)
-perlasm(CRYPTO_SOURCES arm chacha/chacha-armv4 chacha/asm/chacha-armv4.pl)
-perlasm(CRYPTO_SOURCES arm test/trampoline-armv4 test/asm/trampoline-armv4.pl)
-perlasm(CRYPTO_SOURCES x86 chacha/chacha-x86 chacha/asm/chacha-x86.pl)
-perlasm(CRYPTO_SOURCES x86 test/trampoline-x86 test/asm/trampoline-x86.pl)
-perlasm(CRYPTO_SOURCES x86_64 chacha/chacha-x86_64 chacha/asm/chacha-x86_64.pl)
-perlasm(CRYPTO_SOURCES x86_64 cipher_extra/aes128gcmsiv-x86_64 cipher_extra/asm/aes128gcmsiv-x86_64.pl)
-perlasm(CRYPTO_SOURCES x86_64 cipher_extra/chacha20_poly1305_x86_64 cipher_extra/asm/chacha20_poly1305_x86_64.pl)
-perlasm(CRYPTO_SOURCES x86_64 test/trampoline-x86_64 test/asm/trampoline-x86_64.pl)
-
-perlasm(BCM_SOURCES aarch64 fipsmodule/aesv8-armv8 fipsmodule/aes/asm/aesv8-armx.pl)
-perlasm(BCM_SOURCES aarch64 fipsmodule/aesv8-gcm-armv8 fipsmodule/modes/asm/aesv8-gcm-armv8.pl)
-perlasm(BCM_SOURCES aarch64 fipsmodule/armv8-mont fipsmodule/bn/asm/armv8-mont.pl)
-perlasm(BCM_SOURCES aarch64 fipsmodule/bn-armv8 fipsmodule/bn/asm/bn-armv8.pl)
-perlasm(BCM_SOURCES aarch64 fipsmodule/ghash-neon-armv8 fipsmodule/modes/asm/ghash-neon-armv8.pl)
-perlasm(BCM_SOURCES aarch64 fipsmodule/ghashv8-armv8 fipsmodule/modes/asm/ghashv8-armx.pl)
-perlasm(BCM_SOURCES aarch64 fipsmodule/p256_beeu-armv8-asm fipsmodule/ec/asm/p256_beeu-armv8-asm.pl)
-perlasm(BCM_SOURCES aarch64 fipsmodule/p256-armv8-asm fipsmodule/ec/asm/p256-armv8-asm.pl)
-perlasm(BCM_SOURCES aarch64 fipsmodule/sha1-armv8 fipsmodule/sha/asm/sha1-armv8.pl)
-perlasm(BCM_SOURCES aarch64 fipsmodule/sha256-armv8 fipsmodule/sha/asm/sha512-armv8.pl sha256)
-perlasm(BCM_SOURCES aarch64 fipsmodule/sha512-armv8 fipsmodule/sha/asm/sha512-armv8.pl sha512)
-perlasm(BCM_SOURCES aarch64 fipsmodule/vpaes-armv8 fipsmodule/aes/asm/vpaes-armv8.pl)
-perlasm(BCM_SOURCES arm fipsmodule/aesv8-armv7 fipsmodule/aes/asm/aesv8-armx.pl)
-perlasm(BCM_SOURCES arm fipsmodule/armv4-mont fipsmodule/bn/asm/armv4-mont.pl)
-perlasm(BCM_SOURCES arm fipsmodule/bsaes-armv7 fipsmodule/aes/asm/bsaes-armv7.pl)
-perlasm(BCM_SOURCES arm fipsmodule/ghash-armv4 fipsmodule/modes/asm/ghash-armv4.pl)
-perlasm(BCM_SOURCES arm fipsmodule/ghashv8-armv7 fipsmodule/modes/asm/ghashv8-armx.pl)
-perlasm(BCM_SOURCES arm fipsmodule/sha1-armv4-large fipsmodule/sha/asm/sha1-armv4-large.pl)
-perlasm(BCM_SOURCES arm fipsmodule/sha256-armv4 fipsmodule/sha/asm/sha256-armv4.pl)
-perlasm(BCM_SOURCES arm fipsmodule/sha512-armv4 fipsmodule/sha/asm/sha512-armv4.pl)
-perlasm(BCM_SOURCES arm fipsmodule/vpaes-armv7 fipsmodule/aes/asm/vpaes-armv7.pl)
-perlasm(BCM_SOURCES x86 fipsmodule/aesni-x86 fipsmodule/aes/asm/aesni-x86.pl)
-perlasm(BCM_SOURCES x86 fipsmodule/bn-586 fipsmodule/bn/asm/bn-586.pl)
-perlasm(BCM_SOURCES x86 fipsmodule/co-586 fipsmodule/bn/asm/co-586.pl)
-perlasm(BCM_SOURCES x86 fipsmodule/ghash-ssse3-x86 fipsmodule/modes/asm/ghash-ssse3-x86.pl)
-perlasm(BCM_SOURCES x86 fipsmodule/ghash-x86 fipsmodule/modes/asm/ghash-x86.pl)
-perlasm(BCM_SOURCES x86 fipsmodule/md5-586 fipsmodule/md5/asm/md5-586.pl)
-perlasm(BCM_SOURCES x86 fipsmodule/sha1-586 fipsmodule/sha/asm/sha1-586.pl)
-perlasm(BCM_SOURCES x86 fipsmodule/sha256-586 fipsmodule/sha/asm/sha256-586.pl)
-perlasm(BCM_SOURCES x86 fipsmodule/sha512-586 fipsmodule/sha/asm/sha512-586.pl)
-perlasm(BCM_SOURCES x86 fipsmodule/vpaes-x86 fipsmodule/aes/asm/vpaes-x86.pl)
-perlasm(BCM_SOURCES x86 fipsmodule/x86-mont fipsmodule/bn/asm/x86-mont.pl)
-perlasm(BCM_SOURCES x86_64 fipsmodule/aesni-gcm-x86_64 fipsmodule/modes/asm/aesni-gcm-x86_64.pl)
-perlasm(BCM_SOURCES x86_64 fipsmodule/aesni-x86_64 fipsmodule/aes/asm/aesni-x86_64.pl)
-perlasm(BCM_SOURCES x86_64 fipsmodule/ghash-ssse3-x86_64 fipsmodule/modes/asm/ghash-ssse3-x86_64.pl)
-perlasm(BCM_SOURCES x86_64 fipsmodule/ghash-x86_64 fipsmodule/modes/asm/ghash-x86_64.pl)
-perlasm(BCM_SOURCES x86_64 fipsmodule/md5-x86_64 fipsmodule/md5/asm/md5-x86_64.pl)
-perlasm(BCM_SOURCES x86_64 fipsmodule/p256_beeu-x86_64-asm fipsmodule/ec/asm/p256_beeu-x86_64-asm.pl)
-perlasm(BCM_SOURCES x86_64 fipsmodule/p256-x86_64-asm fipsmodule/ec/asm/p256-x86_64-asm.pl)
-perlasm(BCM_SOURCES x86_64 fipsmodule/rdrand-x86_64 fipsmodule/rand/asm/rdrand-x86_64.pl)
-perlasm(BCM_SOURCES x86_64 fipsmodule/rsaz-avx2 fipsmodule/bn/asm/rsaz-avx2.pl)
-perlasm(BCM_SOURCES x86_64 fipsmodule/sha1-x86_64 fipsmodule/sha/asm/sha1-x86_64.pl)
-perlasm(BCM_SOURCES x86_64 fipsmodule/sha256-x86_64 fipsmodule/sha/asm/sha512-x86_64.pl sha256)
-perlasm(BCM_SOURCES x86_64 fipsmodule/sha512-x86_64 fipsmodule/sha/asm/sha512-x86_64.pl sha512)
-perlasm(BCM_SOURCES x86_64 fipsmodule/vpaes-x86_64 fipsmodule/aes/asm/vpaes-x86_64.pl)
-perlasm(BCM_SOURCES x86_64 fipsmodule/x86_64-mont fipsmodule/bn/asm/x86_64-mont.pl)
-perlasm(BCM_SOURCES x86_64 fipsmodule/x86_64-mont5 fipsmodule/bn/asm/x86_64-mont5.pl)
+# TODO(crbug.com/boringssl/524): Avoid needing this transform by instead moving
+# this up a directory.
+list(TRANSFORM BCM_SOURCES_ASM PREPEND "../")
+list(TRANSFORM BCM_SOURCES_NASM PREPEND "../")
+list(TRANSFORM CRYPTO_SOURCES_ASM PREPEND "../")
+list(TRANSFORM CRYPTO_SOURCES_NASM PREPEND "../")
 
 if(OPENSSL_ASM)
   list(APPEND CRYPTO_SOURCES_ASM_USED ${CRYPTO_SOURCES_ASM})
@@ -77,37 +14,6 @@
   list(APPEND BCM_SOURCES_ASM_USED ${BCM_SOURCES_NASM})
 endif()
 
-add_custom_command(
-  OUTPUT err_data.c
-  COMMAND ${GO_EXECUTABLE} run err_data_generate.go > ${CMAKE_CURRENT_BINARY_DIR}/err_data.c
-  DEPENDS
-  err/err_data_generate.go
-  err/asn1.errordata
-  err/bio.errordata
-  err/bn.errordata
-  err/cipher.errordata
-  err/conf.errordata
-  err/dh.errordata
-  err/digest.errordata
-  err/dsa.errordata
-  err/ecdh.errordata
-  err/ecdsa.errordata
-  err/ec.errordata
-  err/engine.errordata
-  err/evp.errordata
-  err/hkdf.errordata
-  err/obj.errordata
-  err/pem.errordata
-  err/pkcs7.errordata
-  err/pkcs8.errordata
-  err/rsa.errordata
-  err/ssl.errordata
-  err/trust_token.errordata
-  err/x509.errordata
-  err/x509v3.errordata
-  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/err
-)
-
 if(FIPS_DELOCATE AND FIPS_SHARED)
   message(FATAL_ERROR "Can't set both delocate and shared mode for FIPS build")
 endif()
@@ -128,9 +34,9 @@
   add_custom_command(
     OUTPUT bcm-delocated.S
     COMMAND
-    ./delocate
+    ${CMAKE_CURRENT_BINARY_DIR}/delocate
     -a $<TARGET_FILE:bcm_c_generated_asm>
-    -o bcm-delocated.S
+    -o ${CMAKE_CURRENT_BINARY_DIR}/bcm-delocated.S
     -cc ${CMAKE_ASM_COMPILER}
     -cc-flags "${TARGET_FLAG} ${CMAKE_ASM_FLAGS}"
     ${PROJECT_SOURCE_DIR}/include/openssl/arm_arch.h
@@ -144,7 +50,7 @@
     ${PROJECT_SOURCE_DIR}/include/openssl/arm_arch.h
     ${PROJECT_SOURCE_DIR}/include/openssl/asm_base.h
     ${PROJECT_SOURCE_DIR}/include/openssl/target.h
-    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
   )
 
   add_library(bcm_hashunset STATIC bcm-delocated.S)
@@ -272,7 +178,6 @@
   ec_extra/ec_derive.c
   ec_extra/hash_to_curve.c
   err/err.c
-  err_data.c
   engine/engine.c
   evp/evp.c
   evp/evp_asn1.c
@@ -419,6 +324,9 @@
   x509/x509name.c
   x509/x509rset.c
   x509/x509spki.c
+  # TODO(crbug.com/boringssl/542): Pick up this and the rest of the source list
+  # from util/pregenerate.
+  ../gen/crypto/err_data.c
 
   ${CRYPTO_FIPS_OBJECTS}
   ${CRYPTO_SOURCES_ASM_USED}
diff --git a/gen/README.md b/gen/README.md
new file mode 100644
index 0000000..3ab6ec4
--- /dev/null
+++ b/gen/README.md
@@ -0,0 +1,26 @@
+# Pre-generated files
+
+This directory contains a number of pre-generated build artifacts. To simplify
+downstream builds, they are checked into the repository, rather than dynamically
+generated as part of the build.
+
+When developing on BoringSSL, if any inputs to these files are modified, you
+must run the following command to update the generated files:
+
+    go run ./util/pregenerate
+
+To check that the files are up-to-date without updating them, run:
+
+    go run ./util/pregenerate -check
+
+This is run on CI to ensure the generated files remain up-to-date.
+
+To speed up local iteration, the tool accepts additional arguments to filter
+which files are generated. For example, when editing `aesni-x86_64.pl`, the
+following command will only update files whose names contain "aesni-x86_64":
+
+    go run ./util/pregenerate aesni-x86_64
+
+For convenience, all files in this directory, including this README, are managed
+by the tool. This means the whole directory may be deleted and regenerated from
+scratch at any time.
diff --git a/gen/bcm/aesni-gcm-x86_64-apple.S b/gen/bcm/aesni-gcm-x86_64-apple.S
new file mode 100644
index 0000000..e1247bc
--- /dev/null
+++ b/gen/bcm/aesni-gcm-x86_64-apple.S
@@ -0,0 +1,868 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text	
+
+
+.p2align	5
+_aesni_ctr32_ghash_6x:
+
+	vmovdqu	32(%r11),%xmm2
+	subq	$6,%rdx
+	vpxor	%xmm4,%xmm4,%xmm4
+	vmovdqu	0-128(%rcx),%xmm15
+	vpaddb	%xmm2,%xmm1,%xmm10
+	vpaddb	%xmm2,%xmm10,%xmm11
+	vpaddb	%xmm2,%xmm11,%xmm12
+	vpaddb	%xmm2,%xmm12,%xmm13
+	vpaddb	%xmm2,%xmm13,%xmm14
+	vpxor	%xmm15,%xmm1,%xmm9
+	vmovdqu	%xmm4,16+8(%rsp)
+	jmp	L$oop6x
+
+.p2align	5
+L$oop6x:
+	addl	$100663296,%ebx
+	jc	L$handle_ctr32
+	vmovdqu	0-32(%r9),%xmm3
+	vpaddb	%xmm2,%xmm14,%xmm1
+	vpxor	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm15,%xmm11,%xmm11
+
+L$resume_ctr32:
+	vmovdqu	%xmm1,(%r8)
+	vpclmulqdq	$0x10,%xmm3,%xmm7,%xmm5
+	vpxor	%xmm15,%xmm12,%xmm12
+	vmovups	16-128(%rcx),%xmm2
+	vpclmulqdq	$0x01,%xmm3,%xmm7,%xmm6
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	xorq	%r12,%r12
+	cmpq	%r14,%r15
+
+	vaesenc	%xmm2,%xmm9,%xmm9
+	vmovdqu	48+8(%rsp),%xmm0
+	vpxor	%xmm15,%xmm13,%xmm13
+	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm1
+	vaesenc	%xmm2,%xmm10,%xmm10
+	vpxor	%xmm15,%xmm14,%xmm14
+	setnc	%r12b
+	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vmovdqu	16-32(%r9),%xmm3
+	negq	%r12
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm3,%xmm0,%xmm5
+	vpxor	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm2,%xmm13,%xmm13
+	vpxor	%xmm5,%xmm1,%xmm4
+	andq	$0x60,%r12
+	vmovups	32-128(%rcx),%xmm15
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm1
+	vaesenc	%xmm2,%xmm14,%xmm14
+
+	vpclmulqdq	$0x01,%xmm3,%xmm0,%xmm2
+	leaq	(%r14,%r12,1),%r14
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	16+8(%rsp),%xmm8,%xmm8
+	vpclmulqdq	$0x11,%xmm3,%xmm0,%xmm3
+	vmovdqu	64+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	88(%r14),%r13
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	80(%r14),%r12
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,32+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,40+8(%rsp)
+	vmovdqu	48-32(%r9),%xmm5
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	48-128(%rcx),%xmm15
+	vpxor	%xmm1,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm5,%xmm0,%xmm1
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm5,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm3,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm5,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vpclmulqdq	$0x11,%xmm5,%xmm0,%xmm5
+	vmovdqu	80+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vpxor	%xmm1,%xmm4,%xmm4
+	vmovdqu	64-32(%r9),%xmm1
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	64-128(%rcx),%xmm15
+	vpxor	%xmm2,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm1,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm3,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm1,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	72(%r14),%r13
+	vpxor	%xmm5,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm1,%xmm0,%xmm5
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	64(%r14),%r12
+	vpclmulqdq	$0x11,%xmm1,%xmm0,%xmm1
+	vmovdqu	96+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,48+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,56+8(%rsp)
+	vpxor	%xmm2,%xmm4,%xmm4
+	vmovdqu	96-32(%r9),%xmm2
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	80-128(%rcx),%xmm15
+	vpxor	%xmm3,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm5
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	56(%r14),%r13
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm1
+	vpxor	112+8(%rsp),%xmm8,%xmm8
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	48(%r14),%r12
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,64+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,72+8(%rsp)
+	vpxor	%xmm3,%xmm4,%xmm4
+	vmovdqu	112-32(%r9),%xmm3
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	96-128(%rcx),%xmm15
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm5
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm1,%xmm6,%xmm6
+	vpclmulqdq	$0x01,%xmm3,%xmm8,%xmm1
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	40(%r14),%r13
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpclmulqdq	$0x00,%xmm3,%xmm8,%xmm2
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	32(%r14),%r12
+	vpclmulqdq	$0x11,%xmm3,%xmm8,%xmm8
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,80+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,88+8(%rsp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm6,%xmm6
+
+	vmovups	112-128(%rcx),%xmm15
+	vpslldq	$8,%xmm6,%xmm5
+	vpxor	%xmm2,%xmm4,%xmm4
+	vmovdqu	16(%r11),%xmm3
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm8,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm5,%xmm4,%xmm4
+	movbeq	24(%r14),%r13
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	16(%r14),%r12
+	vpalignr	$8,%xmm4,%xmm4,%xmm0
+	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
+	movq	%r13,96+8(%rsp)
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r12,104+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vmovups	128-128(%rcx),%xmm1
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vmovups	144-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vpsrldq	$8,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vpxor	%xmm6,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vpxor	%xmm0,%xmm4,%xmm4
+	movbeq	8(%r14),%r13
+	vaesenc	%xmm1,%xmm13,%xmm13
+	movbeq	0(%r14),%r12
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	160-128(%rcx),%xmm1
+	cmpl	$11,%r10d
+	jb	L$enc_tail
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+	vmovups	176-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	192-128(%rcx),%xmm1
+	je	L$enc_tail
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+	vmovups	208-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	224-128(%rcx),%xmm1
+	jmp	L$enc_tail
+
+.p2align	5
+L$handle_ctr32:
+	vmovdqu	(%r11),%xmm0
+	vpshufb	%xmm0,%xmm1,%xmm6
+	vmovdqu	48(%r11),%xmm5
+	vpaddd	64(%r11),%xmm6,%xmm10
+	vpaddd	%xmm5,%xmm6,%xmm11
+	vmovdqu	0-32(%r9),%xmm3
+	vpaddd	%xmm5,%xmm10,%xmm12
+	vpshufb	%xmm0,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm11,%xmm13
+	vpshufb	%xmm0,%xmm11,%xmm11
+	vpxor	%xmm15,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm12,%xmm14
+	vpshufb	%xmm0,%xmm12,%xmm12
+	vpxor	%xmm15,%xmm11,%xmm11
+	vpaddd	%xmm5,%xmm13,%xmm1
+	vpshufb	%xmm0,%xmm13,%xmm13
+	vpshufb	%xmm0,%xmm14,%xmm14
+	vpshufb	%xmm0,%xmm1,%xmm1
+	jmp	L$resume_ctr32
+
+.p2align	5
+L$enc_tail:
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vmovdqu	%xmm7,16+8(%rsp)
+	vpalignr	$8,%xmm4,%xmm4,%xmm8
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
+	vpxor	0(%rdi),%xmm1,%xmm2
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vpxor	16(%rdi),%xmm1,%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vpxor	32(%rdi),%xmm1,%xmm5
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vpxor	48(%rdi),%xmm1,%xmm6
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vpxor	64(%rdi),%xmm1,%xmm7
+	vpxor	80(%rdi),%xmm1,%xmm3
+	vmovdqu	(%r8),%xmm1
+
+	vaesenclast	%xmm2,%xmm9,%xmm9
+	vmovdqu	32(%r11),%xmm2
+	vaesenclast	%xmm0,%xmm10,%xmm10
+	vpaddb	%xmm2,%xmm1,%xmm0
+	movq	%r13,112+8(%rsp)
+	leaq	96(%rdi),%rdi
+
+	prefetcht0	512(%rdi)
+	prefetcht0	576(%rdi)
+	vaesenclast	%xmm5,%xmm11,%xmm11
+	vpaddb	%xmm2,%xmm0,%xmm5
+	movq	%r12,120+8(%rsp)
+	leaq	96(%rsi),%rsi
+	vmovdqu	0-128(%rcx),%xmm15
+	vaesenclast	%xmm6,%xmm12,%xmm12
+	vpaddb	%xmm2,%xmm5,%xmm6
+	vaesenclast	%xmm7,%xmm13,%xmm13
+	vpaddb	%xmm2,%xmm6,%xmm7
+	vaesenclast	%xmm3,%xmm14,%xmm14
+	vpaddb	%xmm2,%xmm7,%xmm3
+
+	addq	$0x60,%rax
+	subq	$0x6,%rdx
+	jc	L$6x_done
+
+	vmovups	%xmm9,-96(%rsi)
+	vpxor	%xmm15,%xmm1,%xmm9
+	vmovups	%xmm10,-80(%rsi)
+	vmovdqa	%xmm0,%xmm10
+	vmovups	%xmm11,-64(%rsi)
+	vmovdqa	%xmm5,%xmm11
+	vmovups	%xmm12,-48(%rsi)
+	vmovdqa	%xmm6,%xmm12
+	vmovups	%xmm13,-32(%rsi)
+	vmovdqa	%xmm7,%xmm13
+	vmovups	%xmm14,-16(%rsi)
+	vmovdqa	%xmm3,%xmm14
+	vmovdqu	32+8(%rsp),%xmm7
+	jmp	L$oop6x
+
+L$6x_done:
+	vpxor	16+8(%rsp),%xmm8,%xmm8
+	vpxor	%xmm4,%xmm8,%xmm8
+
+	ret
+
+
+.globl	_aesni_gcm_decrypt
+.private_extern _aesni_gcm_decrypt
+
+.p2align	5
+_aesni_gcm_decrypt:
+
+
+_CET_ENDBR
+	xorq	%rax,%rax
+
+
+
+	cmpq	$0x60,%rdx
+	jb	L$gcm_dec_abort
+
+	pushq	%rbp
+
+
+	movq	%rsp,%rbp
+
+	pushq	%rbx
+
+
+	pushq	%r12
+
+
+	pushq	%r13
+
+
+	pushq	%r14
+
+
+	pushq	%r15
+
+
+	vzeroupper
+
+	movq	16(%rbp),%r12
+	vmovdqu	(%r8),%xmm1
+	addq	$-128,%rsp
+	movl	12(%r8),%ebx
+	leaq	L$bswap_mask(%rip),%r11
+	leaq	-128(%rcx),%r14
+	movq	$0xf80,%r15
+	vmovdqu	(%r12),%xmm8
+	andq	$-128,%rsp
+	vmovdqu	(%r11),%xmm0
+	leaq	128(%rcx),%rcx
+	leaq	32(%r9),%r9
+	movl	240-128(%rcx),%r10d
+	vpshufb	%xmm0,%xmm8,%xmm8
+
+	andq	%r15,%r14
+	andq	%rsp,%r15
+	subq	%r14,%r15
+	jc	L$dec_no_key_aliasing
+	cmpq	$768,%r15
+	jnc	L$dec_no_key_aliasing
+	subq	%r15,%rsp
+L$dec_no_key_aliasing:
+
+	vmovdqu	80(%rdi),%xmm7
+	movq	%rdi,%r14
+	vmovdqu	64(%rdi),%xmm4
+
+
+
+
+
+
+
+	leaq	-192(%rdi,%rdx,1),%r15
+
+	vmovdqu	48(%rdi),%xmm5
+	shrq	$4,%rdx
+	xorq	%rax,%rax
+	vmovdqu	32(%rdi),%xmm6
+	vpshufb	%xmm0,%xmm7,%xmm7
+	vmovdqu	16(%rdi),%xmm2
+	vpshufb	%xmm0,%xmm4,%xmm4
+	vmovdqu	(%rdi),%xmm3
+	vpshufb	%xmm0,%xmm5,%xmm5
+	vmovdqu	%xmm4,48(%rsp)
+	vpshufb	%xmm0,%xmm6,%xmm6
+	vmovdqu	%xmm5,64(%rsp)
+	vpshufb	%xmm0,%xmm2,%xmm2
+	vmovdqu	%xmm6,80(%rsp)
+	vpshufb	%xmm0,%xmm3,%xmm3
+	vmovdqu	%xmm2,96(%rsp)
+	vmovdqu	%xmm3,112(%rsp)
+
+	call	_aesni_ctr32_ghash_6x
+
+	movq	16(%rbp),%r12
+	vmovups	%xmm9,-96(%rsi)
+	vmovups	%xmm10,-80(%rsi)
+	vmovups	%xmm11,-64(%rsi)
+	vmovups	%xmm12,-48(%rsi)
+	vmovups	%xmm13,-32(%rsi)
+	vmovups	%xmm14,-16(%rsi)
+
+	vpshufb	(%r11),%xmm8,%xmm8
+	vmovdqu	%xmm8,(%r12)
+
+	vzeroupper
+	leaq	-40(%rbp),%rsp
+
+	popq	%r15
+
+	popq	%r14
+
+	popq	%r13
+
+	popq	%r12
+
+	popq	%rbx
+
+	popq	%rbp
+
+L$gcm_dec_abort:
+	ret
+
+
+
+
+.p2align	5
+_aesni_ctr32_6x:
+
+	vmovdqu	0-128(%rcx),%xmm4
+	vmovdqu	32(%r11),%xmm2
+	leaq	-1(%r10),%r13
+	vmovups	16-128(%rcx),%xmm15
+	leaq	32-128(%rcx),%r12
+	vpxor	%xmm4,%xmm1,%xmm9
+	addl	$100663296,%ebx
+	jc	L$handle_ctr32_2
+	vpaddb	%xmm2,%xmm1,%xmm10
+	vpaddb	%xmm2,%xmm10,%xmm11
+	vpxor	%xmm4,%xmm10,%xmm10
+	vpaddb	%xmm2,%xmm11,%xmm12
+	vpxor	%xmm4,%xmm11,%xmm11
+	vpaddb	%xmm2,%xmm12,%xmm13
+	vpxor	%xmm4,%xmm12,%xmm12
+	vpaddb	%xmm2,%xmm13,%xmm14
+	vpxor	%xmm4,%xmm13,%xmm13
+	vpaddb	%xmm2,%xmm14,%xmm1
+	vpxor	%xmm4,%xmm14,%xmm14
+	jmp	L$oop_ctr32
+
+.p2align	4
+L$oop_ctr32:
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vmovups	(%r12),%xmm15
+	leaq	16(%r12),%r12
+	decl	%r13d
+	jnz	L$oop_ctr32
+
+	vmovdqu	(%r12),%xmm3
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	0(%rdi),%xmm3,%xmm4
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpxor	16(%rdi),%xmm3,%xmm5
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vpxor	32(%rdi),%xmm3,%xmm6
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vpxor	48(%rdi),%xmm3,%xmm8
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vpxor	64(%rdi),%xmm3,%xmm2
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vpxor	80(%rdi),%xmm3,%xmm3
+	leaq	96(%rdi),%rdi
+
+	vaesenclast	%xmm4,%xmm9,%xmm9
+	vaesenclast	%xmm5,%xmm10,%xmm10
+	vaesenclast	%xmm6,%xmm11,%xmm11
+	vaesenclast	%xmm8,%xmm12,%xmm12
+	vaesenclast	%xmm2,%xmm13,%xmm13
+	vaesenclast	%xmm3,%xmm14,%xmm14
+	vmovups	%xmm9,0(%rsi)
+	vmovups	%xmm10,16(%rsi)
+	vmovups	%xmm11,32(%rsi)
+	vmovups	%xmm12,48(%rsi)
+	vmovups	%xmm13,64(%rsi)
+	vmovups	%xmm14,80(%rsi)
+	leaq	96(%rsi),%rsi
+
+	ret
+.p2align	5
+L$handle_ctr32_2:
+	vpshufb	%xmm0,%xmm1,%xmm6
+	vmovdqu	48(%r11),%xmm5
+	vpaddd	64(%r11),%xmm6,%xmm10
+	vpaddd	%xmm5,%xmm6,%xmm11
+	vpaddd	%xmm5,%xmm10,%xmm12
+	vpshufb	%xmm0,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm11,%xmm13
+	vpshufb	%xmm0,%xmm11,%xmm11
+	vpxor	%xmm4,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm12,%xmm14
+	vpshufb	%xmm0,%xmm12,%xmm12
+	vpxor	%xmm4,%xmm11,%xmm11
+	vpaddd	%xmm5,%xmm13,%xmm1
+	vpshufb	%xmm0,%xmm13,%xmm13
+	vpxor	%xmm4,%xmm12,%xmm12
+	vpshufb	%xmm0,%xmm14,%xmm14
+	vpxor	%xmm4,%xmm13,%xmm13
+	vpshufb	%xmm0,%xmm1,%xmm1
+	vpxor	%xmm4,%xmm14,%xmm14
+	jmp	L$oop_ctr32
+
+
+
+.globl	_aesni_gcm_encrypt
+.private_extern _aesni_gcm_encrypt
+
+.p2align	5
+_aesni_gcm_encrypt:
+
+
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+
+	movb	$1,_BORINGSSL_function_hit+2(%rip)
+#endif
+	xorq	%rax,%rax
+
+
+
+
+	cmpq	$288,%rdx
+	jb	L$gcm_enc_abort
+
+	pushq	%rbp
+
+
+	movq	%rsp,%rbp
+
+	pushq	%rbx
+
+
+	pushq	%r12
+
+
+	pushq	%r13
+
+
+	pushq	%r14
+
+
+	pushq	%r15
+
+
+	vzeroupper
+
+	vmovdqu	(%r8),%xmm1
+	addq	$-128,%rsp
+	movl	12(%r8),%ebx
+	leaq	L$bswap_mask(%rip),%r11
+	leaq	-128(%rcx),%r14
+	movq	$0xf80,%r15
+	leaq	128(%rcx),%rcx
+	vmovdqu	(%r11),%xmm0
+	andq	$-128,%rsp
+	movl	240-128(%rcx),%r10d
+
+	andq	%r15,%r14
+	andq	%rsp,%r15
+	subq	%r14,%r15
+	jc	L$enc_no_key_aliasing
+	cmpq	$768,%r15
+	jnc	L$enc_no_key_aliasing
+	subq	%r15,%rsp
+L$enc_no_key_aliasing:
+
+	movq	%rsi,%r14
+
+
+
+
+
+
+
+
+	leaq	-192(%rsi,%rdx,1),%r15
+
+	shrq	$4,%rdx
+
+	call	_aesni_ctr32_6x
+	vpshufb	%xmm0,%xmm9,%xmm8
+	vpshufb	%xmm0,%xmm10,%xmm2
+	vmovdqu	%xmm8,112(%rsp)
+	vpshufb	%xmm0,%xmm11,%xmm4
+	vmovdqu	%xmm2,96(%rsp)
+	vpshufb	%xmm0,%xmm12,%xmm5
+	vmovdqu	%xmm4,80(%rsp)
+	vpshufb	%xmm0,%xmm13,%xmm6
+	vmovdqu	%xmm5,64(%rsp)
+	vpshufb	%xmm0,%xmm14,%xmm7
+	vmovdqu	%xmm6,48(%rsp)
+
+	call	_aesni_ctr32_6x
+
+	movq	16(%rbp),%r12
+	leaq	32(%r9),%r9
+	vmovdqu	(%r12),%xmm8
+	subq	$12,%rdx
+	movq	$192,%rax
+	vpshufb	%xmm0,%xmm8,%xmm8
+
+	call	_aesni_ctr32_ghash_6x
+	vmovdqu	32(%rsp),%xmm7
+	vmovdqu	(%r11),%xmm0
+	vmovdqu	0-32(%r9),%xmm3
+	vpunpckhqdq	%xmm7,%xmm7,%xmm1
+	vmovdqu	32-32(%r9),%xmm15
+	vmovups	%xmm9,-96(%rsi)
+	vpshufb	%xmm0,%xmm9,%xmm9
+	vpxor	%xmm7,%xmm1,%xmm1
+	vmovups	%xmm10,-80(%rsi)
+	vpshufb	%xmm0,%xmm10,%xmm10
+	vmovups	%xmm11,-64(%rsi)
+	vpshufb	%xmm0,%xmm11,%xmm11
+	vmovups	%xmm12,-48(%rsi)
+	vpshufb	%xmm0,%xmm12,%xmm12
+	vmovups	%xmm13,-32(%rsi)
+	vpshufb	%xmm0,%xmm13,%xmm13
+	vmovups	%xmm14,-16(%rsi)
+	vpshufb	%xmm0,%xmm14,%xmm14
+	vmovdqu	%xmm9,16(%rsp)
+	vmovdqu	48(%rsp),%xmm6
+	vmovdqu	16-32(%r9),%xmm0
+	vpunpckhqdq	%xmm6,%xmm6,%xmm2
+	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm5
+	vpxor	%xmm6,%xmm2,%xmm2
+	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
+	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1
+
+	vmovdqu	64(%rsp),%xmm9
+	vpclmulqdq	$0x00,%xmm0,%xmm6,%xmm4
+	vmovdqu	48-32(%r9),%xmm3
+	vpxor	%xmm5,%xmm4,%xmm4
+	vpunpckhqdq	%xmm9,%xmm9,%xmm5
+	vpclmulqdq	$0x11,%xmm0,%xmm6,%xmm6
+	vpxor	%xmm9,%xmm5,%xmm5
+	vpxor	%xmm7,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
+	vmovdqu	80-32(%r9),%xmm15
+	vpxor	%xmm1,%xmm2,%xmm2
+
+	vmovdqu	80(%rsp),%xmm1
+	vpclmulqdq	$0x00,%xmm3,%xmm9,%xmm7
+	vmovdqu	64-32(%r9),%xmm0
+	vpxor	%xmm4,%xmm7,%xmm7
+	vpunpckhqdq	%xmm1,%xmm1,%xmm4
+	vpclmulqdq	$0x11,%xmm3,%xmm9,%xmm9
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpxor	%xmm6,%xmm9,%xmm9
+	vpclmulqdq	$0x00,%xmm15,%xmm5,%xmm5
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	96(%rsp),%xmm2
+	vpclmulqdq	$0x00,%xmm0,%xmm1,%xmm6
+	vmovdqu	96-32(%r9),%xmm3
+	vpxor	%xmm7,%xmm6,%xmm6
+	vpunpckhqdq	%xmm2,%xmm2,%xmm7
+	vpclmulqdq	$0x11,%xmm0,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpxor	%xmm9,%xmm1,%xmm1
+	vpclmulqdq	$0x10,%xmm15,%xmm4,%xmm4
+	vmovdqu	128-32(%r9),%xmm15
+	vpxor	%xmm5,%xmm4,%xmm4
+
+	vpxor	112(%rsp),%xmm8,%xmm8
+	vpclmulqdq	$0x00,%xmm3,%xmm2,%xmm5
+	vmovdqu	112-32(%r9),%xmm0
+	vpunpckhqdq	%xmm8,%xmm8,%xmm9
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x11,%xmm3,%xmm2,%xmm2
+	vpxor	%xmm8,%xmm9,%xmm9
+	vpxor	%xmm1,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm15,%xmm7,%xmm7
+	vpxor	%xmm4,%xmm7,%xmm4
+
+	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm6
+	vmovdqu	0-32(%r9),%xmm3
+	vpunpckhqdq	%xmm14,%xmm14,%xmm1
+	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm8
+	vpxor	%xmm14,%xmm1,%xmm1
+	vpxor	%xmm5,%xmm6,%xmm5
+	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm9
+	vmovdqu	32-32(%r9),%xmm15
+	vpxor	%xmm2,%xmm8,%xmm7
+	vpxor	%xmm4,%xmm9,%xmm6
+
+	vmovdqu	16-32(%r9),%xmm0
+	vpxor	%xmm5,%xmm7,%xmm9
+	vpclmulqdq	$0x00,%xmm3,%xmm14,%xmm4
+	vpxor	%xmm9,%xmm6,%xmm6
+	vpunpckhqdq	%xmm13,%xmm13,%xmm2
+	vpclmulqdq	$0x11,%xmm3,%xmm14,%xmm14
+	vpxor	%xmm13,%xmm2,%xmm2
+	vpslldq	$8,%xmm6,%xmm9
+	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1
+	vpxor	%xmm9,%xmm5,%xmm8
+	vpsrldq	$8,%xmm6,%xmm6
+	vpxor	%xmm6,%xmm7,%xmm7
+
+	vpclmulqdq	$0x00,%xmm0,%xmm13,%xmm5
+	vmovdqu	48-32(%r9),%xmm3
+	vpxor	%xmm4,%xmm5,%xmm5
+	vpunpckhqdq	%xmm12,%xmm12,%xmm9
+	vpclmulqdq	$0x11,%xmm0,%xmm13,%xmm13
+	vpxor	%xmm12,%xmm9,%xmm9
+	vpxor	%xmm14,%xmm13,%xmm13
+	vpalignr	$8,%xmm8,%xmm8,%xmm14
+	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
+	vmovdqu	80-32(%r9),%xmm15
+	vpxor	%xmm1,%xmm2,%xmm2
+
+	vpclmulqdq	$0x00,%xmm3,%xmm12,%xmm4
+	vmovdqu	64-32(%r9),%xmm0
+	vpxor	%xmm5,%xmm4,%xmm4
+	vpunpckhqdq	%xmm11,%xmm11,%xmm1
+	vpclmulqdq	$0x11,%xmm3,%xmm12,%xmm12
+	vpxor	%xmm11,%xmm1,%xmm1
+	vpxor	%xmm13,%xmm12,%xmm12
+	vxorps	16(%rsp),%xmm7,%xmm7
+	vpclmulqdq	$0x00,%xmm15,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm9,%xmm9
+
+	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
+	vxorps	%xmm14,%xmm8,%xmm8
+
+	vpclmulqdq	$0x00,%xmm0,%xmm11,%xmm5
+	vmovdqu	96-32(%r9),%xmm3
+	vpxor	%xmm4,%xmm5,%xmm5
+	vpunpckhqdq	%xmm10,%xmm10,%xmm2
+	vpclmulqdq	$0x11,%xmm0,%xmm11,%xmm11
+	vpxor	%xmm10,%xmm2,%xmm2
+	vpalignr	$8,%xmm8,%xmm8,%xmm14
+	vpxor	%xmm12,%xmm11,%xmm11
+	vpclmulqdq	$0x10,%xmm15,%xmm1,%xmm1
+	vmovdqu	128-32(%r9),%xmm15
+	vpxor	%xmm9,%xmm1,%xmm1
+
+	vxorps	%xmm7,%xmm14,%xmm14
+	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
+	vxorps	%xmm14,%xmm8,%xmm8
+
+	vpclmulqdq	$0x00,%xmm3,%xmm10,%xmm4
+	vmovdqu	112-32(%r9),%xmm0
+	vpxor	%xmm5,%xmm4,%xmm4
+	vpunpckhqdq	%xmm8,%xmm8,%xmm9
+	vpclmulqdq	$0x11,%xmm3,%xmm10,%xmm10
+	vpxor	%xmm8,%xmm9,%xmm9
+	vpxor	%xmm11,%xmm10,%xmm10
+	vpclmulqdq	$0x00,%xmm15,%xmm2,%xmm2
+	vpxor	%xmm1,%xmm2,%xmm2
+
+	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm5
+	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm7
+	vpxor	%xmm4,%xmm5,%xmm5
+	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm6
+	vpxor	%xmm10,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm6,%xmm6
+
+	vpxor	%xmm5,%xmm7,%xmm4
+	vpxor	%xmm4,%xmm6,%xmm6
+	vpslldq	$8,%xmm6,%xmm1
+	vmovdqu	16(%r11),%xmm3
+	vpsrldq	$8,%xmm6,%xmm6
+	vpxor	%xmm1,%xmm5,%xmm8
+	vpxor	%xmm6,%xmm7,%xmm7
+
+	vpalignr	$8,%xmm8,%xmm8,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
+	vpxor	%xmm2,%xmm8,%xmm8
+
+	vpalignr	$8,%xmm8,%xmm8,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
+	vpxor	%xmm7,%xmm2,%xmm2
+	vpxor	%xmm2,%xmm8,%xmm8
+	movq	16(%rbp),%r12
+	vpshufb	(%r11),%xmm8,%xmm8
+	vmovdqu	%xmm8,(%r12)
+
+	vzeroupper
+	leaq	-40(%rbp),%rsp
+
+	popq	%r15
+
+	popq	%r14
+
+	popq	%r13
+
+	popq	%r12
+
+	popq	%rbx
+
+	popq	%rbp
+
+L$gcm_enc_abort:
+	ret
+
+
+
+.section	__DATA,__const
+.p2align	6
+L$bswap_mask:
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+L$poly:
+.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+L$one_msb:
+.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+L$two_lsb:
+.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+L$one_lsb:
+.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.byte	65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.p2align	6
+.text	
+#endif
diff --git a/gen/bcm/aesni-gcm-x86_64-linux.S b/gen/bcm/aesni-gcm-x86_64-linux.S
new file mode 100644
index 0000000..774a8d1
--- /dev/null
+++ b/gen/bcm/aesni-gcm-x86_64-linux.S
@@ -0,0 +1,883 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text	
+
+.type	_aesni_ctr32_ghash_6x,@function
+.align	32
+_aesni_ctr32_ghash_6x:
+.cfi_startproc	
+	vmovdqu	32(%r11),%xmm2
+	subq	$6,%rdx
+	vpxor	%xmm4,%xmm4,%xmm4
+	vmovdqu	0-128(%rcx),%xmm15
+	vpaddb	%xmm2,%xmm1,%xmm10
+	vpaddb	%xmm2,%xmm10,%xmm11
+	vpaddb	%xmm2,%xmm11,%xmm12
+	vpaddb	%xmm2,%xmm12,%xmm13
+	vpaddb	%xmm2,%xmm13,%xmm14
+	vpxor	%xmm15,%xmm1,%xmm9
+	vmovdqu	%xmm4,16+8(%rsp)
+	jmp	.Loop6x
+
+.align	32
+.Loop6x:
+	addl	$100663296,%ebx
+	jc	.Lhandle_ctr32
+	vmovdqu	0-32(%r9),%xmm3
+	vpaddb	%xmm2,%xmm14,%xmm1
+	vpxor	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm15,%xmm11,%xmm11
+
+.Lresume_ctr32:
+	vmovdqu	%xmm1,(%r8)
+	vpclmulqdq	$0x10,%xmm3,%xmm7,%xmm5
+	vpxor	%xmm15,%xmm12,%xmm12
+	vmovups	16-128(%rcx),%xmm2
+	vpclmulqdq	$0x01,%xmm3,%xmm7,%xmm6
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	xorq	%r12,%r12
+	cmpq	%r14,%r15
+
+	vaesenc	%xmm2,%xmm9,%xmm9
+	vmovdqu	48+8(%rsp),%xmm0
+	vpxor	%xmm15,%xmm13,%xmm13
+	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm1
+	vaesenc	%xmm2,%xmm10,%xmm10
+	vpxor	%xmm15,%xmm14,%xmm14
+	setnc	%r12b
+	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vmovdqu	16-32(%r9),%xmm3
+	negq	%r12
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm3,%xmm0,%xmm5
+	vpxor	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm2,%xmm13,%xmm13
+	vpxor	%xmm5,%xmm1,%xmm4
+	andq	$0x60,%r12
+	vmovups	32-128(%rcx),%xmm15
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm1
+	vaesenc	%xmm2,%xmm14,%xmm14
+
+	vpclmulqdq	$0x01,%xmm3,%xmm0,%xmm2
+	leaq	(%r14,%r12,1),%r14
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	16+8(%rsp),%xmm8,%xmm8
+	vpclmulqdq	$0x11,%xmm3,%xmm0,%xmm3
+	vmovdqu	64+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	88(%r14),%r13
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	80(%r14),%r12
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,32+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,40+8(%rsp)
+	vmovdqu	48-32(%r9),%xmm5
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	48-128(%rcx),%xmm15
+	vpxor	%xmm1,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm5,%xmm0,%xmm1
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm5,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm3,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm5,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vpclmulqdq	$0x11,%xmm5,%xmm0,%xmm5
+	vmovdqu	80+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vpxor	%xmm1,%xmm4,%xmm4
+	vmovdqu	64-32(%r9),%xmm1
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	64-128(%rcx),%xmm15
+	vpxor	%xmm2,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm1,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm3,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm1,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	72(%r14),%r13
+	vpxor	%xmm5,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm1,%xmm0,%xmm5
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	64(%r14),%r12
+	vpclmulqdq	$0x11,%xmm1,%xmm0,%xmm1
+	vmovdqu	96+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,48+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,56+8(%rsp)
+	vpxor	%xmm2,%xmm4,%xmm4
+	vmovdqu	96-32(%r9),%xmm2
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	80-128(%rcx),%xmm15
+	vpxor	%xmm3,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm5
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	56(%r14),%r13
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm1
+	vpxor	112+8(%rsp),%xmm8,%xmm8
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	48(%r14),%r12
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,64+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,72+8(%rsp)
+	vpxor	%xmm3,%xmm4,%xmm4
+	vmovdqu	112-32(%r9),%xmm3
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	96-128(%rcx),%xmm15
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm5
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm1,%xmm6,%xmm6
+	vpclmulqdq	$0x01,%xmm3,%xmm8,%xmm1
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	40(%r14),%r13
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpclmulqdq	$0x00,%xmm3,%xmm8,%xmm2
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	32(%r14),%r12
+	vpclmulqdq	$0x11,%xmm3,%xmm8,%xmm8
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,80+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,88+8(%rsp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm6,%xmm6
+
+	vmovups	112-128(%rcx),%xmm15
+	vpslldq	$8,%xmm6,%xmm5
+	vpxor	%xmm2,%xmm4,%xmm4
+	vmovdqu	16(%r11),%xmm3
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm8,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm5,%xmm4,%xmm4
+	movbeq	24(%r14),%r13
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	16(%r14),%r12
+	vpalignr	$8,%xmm4,%xmm4,%xmm0
+	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
+	movq	%r13,96+8(%rsp)
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r12,104+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vmovups	128-128(%rcx),%xmm1
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vmovups	144-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vpsrldq	$8,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vpxor	%xmm6,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vpxor	%xmm0,%xmm4,%xmm4
+	movbeq	8(%r14),%r13
+	vaesenc	%xmm1,%xmm13,%xmm13
+	movbeq	0(%r14),%r12
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	160-128(%rcx),%xmm1
+	cmpl	$11,%r10d
+	jb	.Lenc_tail
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+	vmovups	176-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	192-128(%rcx),%xmm1
+	je	.Lenc_tail
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+	vmovups	208-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	224-128(%rcx),%xmm1
+	jmp	.Lenc_tail
+
+.align	32
+.Lhandle_ctr32:
+	vmovdqu	(%r11),%xmm0
+	vpshufb	%xmm0,%xmm1,%xmm6
+	vmovdqu	48(%r11),%xmm5
+	vpaddd	64(%r11),%xmm6,%xmm10
+	vpaddd	%xmm5,%xmm6,%xmm11
+	vmovdqu	0-32(%r9),%xmm3
+	vpaddd	%xmm5,%xmm10,%xmm12
+	vpshufb	%xmm0,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm11,%xmm13
+	vpshufb	%xmm0,%xmm11,%xmm11
+	vpxor	%xmm15,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm12,%xmm14
+	vpshufb	%xmm0,%xmm12,%xmm12
+	vpxor	%xmm15,%xmm11,%xmm11
+	vpaddd	%xmm5,%xmm13,%xmm1
+	vpshufb	%xmm0,%xmm13,%xmm13
+	vpshufb	%xmm0,%xmm14,%xmm14
+	vpshufb	%xmm0,%xmm1,%xmm1
+	jmp	.Lresume_ctr32
+
+.align	32
+.Lenc_tail:
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vmovdqu	%xmm7,16+8(%rsp)
+	vpalignr	$8,%xmm4,%xmm4,%xmm8
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
+	vpxor	0(%rdi),%xmm1,%xmm2
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vpxor	16(%rdi),%xmm1,%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vpxor	32(%rdi),%xmm1,%xmm5
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vpxor	48(%rdi),%xmm1,%xmm6
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vpxor	64(%rdi),%xmm1,%xmm7
+	vpxor	80(%rdi),%xmm1,%xmm3
+	vmovdqu	(%r8),%xmm1
+
+	vaesenclast	%xmm2,%xmm9,%xmm9
+	vmovdqu	32(%r11),%xmm2
+	vaesenclast	%xmm0,%xmm10,%xmm10
+	vpaddb	%xmm2,%xmm1,%xmm0
+	movq	%r13,112+8(%rsp)
+	leaq	96(%rdi),%rdi
+
+	prefetcht0	512(%rdi)
+	prefetcht0	576(%rdi)
+	vaesenclast	%xmm5,%xmm11,%xmm11
+	vpaddb	%xmm2,%xmm0,%xmm5
+	movq	%r12,120+8(%rsp)
+	leaq	96(%rsi),%rsi
+	vmovdqu	0-128(%rcx),%xmm15
+	vaesenclast	%xmm6,%xmm12,%xmm12
+	vpaddb	%xmm2,%xmm5,%xmm6
+	vaesenclast	%xmm7,%xmm13,%xmm13
+	vpaddb	%xmm2,%xmm6,%xmm7
+	vaesenclast	%xmm3,%xmm14,%xmm14
+	vpaddb	%xmm2,%xmm7,%xmm3
+
+	addq	$0x60,%rax
+	subq	$0x6,%rdx
+	jc	.L6x_done
+
+	vmovups	%xmm9,-96(%rsi)
+	vpxor	%xmm15,%xmm1,%xmm9
+	vmovups	%xmm10,-80(%rsi)
+	vmovdqa	%xmm0,%xmm10
+	vmovups	%xmm11,-64(%rsi)
+	vmovdqa	%xmm5,%xmm11
+	vmovups	%xmm12,-48(%rsi)
+	vmovdqa	%xmm6,%xmm12
+	vmovups	%xmm13,-32(%rsi)
+	vmovdqa	%xmm7,%xmm13
+	vmovups	%xmm14,-16(%rsi)
+	vmovdqa	%xmm3,%xmm14
+	vmovdqu	32+8(%rsp),%xmm7
+	jmp	.Loop6x
+
+.L6x_done:
+	vpxor	16+8(%rsp),%xmm8,%xmm8
+	vpxor	%xmm4,%xmm8,%xmm8
+
+	ret
+.cfi_endproc	
+.size	_aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
+.globl	aesni_gcm_decrypt
+.hidden aesni_gcm_decrypt
+.type	aesni_gcm_decrypt,@function
+.align	32
+aesni_gcm_decrypt:
+.cfi_startproc	
+
+_CET_ENDBR
+	xorq	%rax,%rax
+
+
+
+	cmpq	$0x60,%rdx
+	jb	.Lgcm_dec_abort
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+
+	movq	%rsp,%rbp
+.cfi_def_cfa_register	%rbp
+	pushq	%rbx
+.cfi_offset	%rbx,-24
+
+	pushq	%r12
+.cfi_offset	%r12,-32
+
+	pushq	%r13
+.cfi_offset	%r13,-40
+
+	pushq	%r14
+.cfi_offset	%r14,-48
+
+	pushq	%r15
+.cfi_offset	%r15,-56
+
+	vzeroupper
+
+	movq	16(%rbp),%r12
+	vmovdqu	(%r8),%xmm1
+	addq	$-128,%rsp
+	movl	12(%r8),%ebx
+	leaq	.Lbswap_mask(%rip),%r11
+	leaq	-128(%rcx),%r14
+	movq	$0xf80,%r15
+	vmovdqu	(%r12),%xmm8
+	andq	$-128,%rsp
+	vmovdqu	(%r11),%xmm0
+	leaq	128(%rcx),%rcx
+	leaq	32(%r9),%r9
+	movl	240-128(%rcx),%r10d
+	vpshufb	%xmm0,%xmm8,%xmm8
+
+	andq	%r15,%r14
+	andq	%rsp,%r15
+	subq	%r14,%r15
+	jc	.Ldec_no_key_aliasing
+	cmpq	$768,%r15
+	jnc	.Ldec_no_key_aliasing
+	subq	%r15,%rsp
+.Ldec_no_key_aliasing:
+
+	vmovdqu	80(%rdi),%xmm7
+	movq	%rdi,%r14
+	vmovdqu	64(%rdi),%xmm4
+
+
+
+
+
+
+
+	leaq	-192(%rdi,%rdx,1),%r15
+
+	vmovdqu	48(%rdi),%xmm5
+	shrq	$4,%rdx
+	xorq	%rax,%rax
+	vmovdqu	32(%rdi),%xmm6
+	vpshufb	%xmm0,%xmm7,%xmm7
+	vmovdqu	16(%rdi),%xmm2
+	vpshufb	%xmm0,%xmm4,%xmm4
+	vmovdqu	(%rdi),%xmm3
+	vpshufb	%xmm0,%xmm5,%xmm5
+	vmovdqu	%xmm4,48(%rsp)
+	vpshufb	%xmm0,%xmm6,%xmm6
+	vmovdqu	%xmm5,64(%rsp)
+	vpshufb	%xmm0,%xmm2,%xmm2
+	vmovdqu	%xmm6,80(%rsp)
+	vpshufb	%xmm0,%xmm3,%xmm3
+	vmovdqu	%xmm2,96(%rsp)
+	vmovdqu	%xmm3,112(%rsp)
+
+	call	_aesni_ctr32_ghash_6x
+
+	movq	16(%rbp),%r12
+	vmovups	%xmm9,-96(%rsi)
+	vmovups	%xmm10,-80(%rsi)
+	vmovups	%xmm11,-64(%rsi)
+	vmovups	%xmm12,-48(%rsi)
+	vmovups	%xmm13,-32(%rsi)
+	vmovups	%xmm14,-16(%rsi)
+
+	vpshufb	(%r11),%xmm8,%xmm8
+	vmovdqu	%xmm8,(%r12)
+
+	vzeroupper
+	leaq	-40(%rbp),%rsp
+.cfi_def_cfa	%rsp, 0x38
+	popq	%r15
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r15
+	popq	%r14
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r14
+	popq	%r13
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r13
+	popq	%r12
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r12
+	popq	%rbx
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%rbx
+	popq	%rbp
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%rbp
+.Lgcm_dec_abort:
+	ret
+
+.cfi_endproc	
+.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
+.type	_aesni_ctr32_6x,@function
+.align	32
+_aesni_ctr32_6x:
+.cfi_startproc	
+	vmovdqu	0-128(%rcx),%xmm4
+	vmovdqu	32(%r11),%xmm2
+	leaq	-1(%r10),%r13
+	vmovups	16-128(%rcx),%xmm15
+	leaq	32-128(%rcx),%r12
+	vpxor	%xmm4,%xmm1,%xmm9
+	addl	$100663296,%ebx
+	jc	.Lhandle_ctr32_2
+	vpaddb	%xmm2,%xmm1,%xmm10
+	vpaddb	%xmm2,%xmm10,%xmm11
+	vpxor	%xmm4,%xmm10,%xmm10
+	vpaddb	%xmm2,%xmm11,%xmm12
+	vpxor	%xmm4,%xmm11,%xmm11
+	vpaddb	%xmm2,%xmm12,%xmm13
+	vpxor	%xmm4,%xmm12,%xmm12
+	vpaddb	%xmm2,%xmm13,%xmm14
+	vpxor	%xmm4,%xmm13,%xmm13
+	vpaddb	%xmm2,%xmm14,%xmm1
+	vpxor	%xmm4,%xmm14,%xmm14
+	jmp	.Loop_ctr32
+
+.align	16
+.Loop_ctr32:
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vmovups	(%r12),%xmm15
+	leaq	16(%r12),%r12
+	decl	%r13d
+	jnz	.Loop_ctr32
+
+	vmovdqu	(%r12),%xmm3
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	0(%rdi),%xmm3,%xmm4
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpxor	16(%rdi),%xmm3,%xmm5
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vpxor	32(%rdi),%xmm3,%xmm6
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vpxor	48(%rdi),%xmm3,%xmm8
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vpxor	64(%rdi),%xmm3,%xmm2
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vpxor	80(%rdi),%xmm3,%xmm3
+	leaq	96(%rdi),%rdi
+
+	vaesenclast	%xmm4,%xmm9,%xmm9
+	vaesenclast	%xmm5,%xmm10,%xmm10
+	vaesenclast	%xmm6,%xmm11,%xmm11
+	vaesenclast	%xmm8,%xmm12,%xmm12
+	vaesenclast	%xmm2,%xmm13,%xmm13
+	vaesenclast	%xmm3,%xmm14,%xmm14
+	vmovups	%xmm9,0(%rsi)
+	vmovups	%xmm10,16(%rsi)
+	vmovups	%xmm11,32(%rsi)
+	vmovups	%xmm12,48(%rsi)
+	vmovups	%xmm13,64(%rsi)
+	vmovups	%xmm14,80(%rsi)
+	leaq	96(%rsi),%rsi
+
+	ret
+.align	32
+.Lhandle_ctr32_2:
+	vpshufb	%xmm0,%xmm1,%xmm6
+	vmovdqu	48(%r11),%xmm5
+	vpaddd	64(%r11),%xmm6,%xmm10
+	vpaddd	%xmm5,%xmm6,%xmm11
+	vpaddd	%xmm5,%xmm10,%xmm12
+	vpshufb	%xmm0,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm11,%xmm13
+	vpshufb	%xmm0,%xmm11,%xmm11
+	vpxor	%xmm4,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm12,%xmm14
+	vpshufb	%xmm0,%xmm12,%xmm12
+	vpxor	%xmm4,%xmm11,%xmm11
+	vpaddd	%xmm5,%xmm13,%xmm1
+	vpshufb	%xmm0,%xmm13,%xmm13
+	vpxor	%xmm4,%xmm12,%xmm12
+	vpshufb	%xmm0,%xmm14,%xmm14
+	vpxor	%xmm4,%xmm13,%xmm13
+	vpshufb	%xmm0,%xmm1,%xmm1
+	vpxor	%xmm4,%xmm14,%xmm14
+	jmp	.Loop_ctr32
+.cfi_endproc	
+.size	_aesni_ctr32_6x,.-_aesni_ctr32_6x
+
+.globl	aesni_gcm_encrypt
+.hidden aesni_gcm_encrypt
+.type	aesni_gcm_encrypt,@function
+.align	32
+aesni_gcm_encrypt:
+.cfi_startproc	
+
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+.extern	BORINGSSL_function_hit
+.hidden BORINGSSL_function_hit
+	movb	$1,BORINGSSL_function_hit+2(%rip)
+#endif
+	xorq	%rax,%rax
+
+
+
+
+	cmpq	$288,%rdx
+	jb	.Lgcm_enc_abort
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+
+	movq	%rsp,%rbp
+.cfi_def_cfa_register	%rbp
+	pushq	%rbx
+.cfi_offset	%rbx,-24
+
+	pushq	%r12
+.cfi_offset	%r12,-32
+
+	pushq	%r13
+.cfi_offset	%r13,-40
+
+	pushq	%r14
+.cfi_offset	%r14,-48
+
+	pushq	%r15
+.cfi_offset	%r15,-56
+
+	vzeroupper
+
+	vmovdqu	(%r8),%xmm1
+	addq	$-128,%rsp
+	movl	12(%r8),%ebx
+	leaq	.Lbswap_mask(%rip),%r11
+	leaq	-128(%rcx),%r14
+	movq	$0xf80,%r15
+	leaq	128(%rcx),%rcx
+	vmovdqu	(%r11),%xmm0
+	andq	$-128,%rsp
+	movl	240-128(%rcx),%r10d
+
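+// If the stack scratch area would land too close to the key schedule
+// modulo 4 KiB, nudge %rsp out of the way; this appears to guard against
+// cache aliasing between key-schedule loads and stack stores.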
+	andq	%r15,%r14
+	andq	%rsp,%r15
+	subq	%r14,%r15
+	jc	.Lenc_no_key_aliasing
+	cmpq	$768,%r15
+	jnc	.Lenc_no_key_aliasing
+	subq	%r15,%rsp
+.Lenc_no_key_aliasing:
+
+	movq	%rsi,%r14
+
+
+
+
+
+
+
+
+	leaq	-192(%rsi,%rdx,1),%r15
+
+	shrq	$4,%rdx
+
+	call	_aesni_ctr32_6x
+	vpshufb	%xmm0,%xmm9,%xmm8
+	vpshufb	%xmm0,%xmm10,%xmm2
+	vmovdqu	%xmm8,112(%rsp)
+	vpshufb	%xmm0,%xmm11,%xmm4
+	vmovdqu	%xmm2,96(%rsp)
+	vpshufb	%xmm0,%xmm12,%xmm5
+	vmovdqu	%xmm4,80(%rsp)
+	vpshufb	%xmm0,%xmm13,%xmm6
+	vmovdqu	%xmm5,64(%rsp)
+	vpshufb	%xmm0,%xmm14,%xmm7
+	vmovdqu	%xmm6,48(%rsp)
+
+	call	_aesni_ctr32_6x
+
+	movq	16(%rbp),%r12
+	leaq	32(%r9),%r9
+	vmovdqu	(%r12),%xmm8
+	subq	$12,%rdx
+	movq	$192,%rax
+	vpshufb	%xmm0,%xmm8,%xmm8
+
+	call	_aesni_ctr32_ghash_6x
+	vmovdqu	32(%rsp),%xmm7
+	vmovdqu	(%r11),%xmm0
+	vmovdqu	0-32(%r9),%xmm3
+	vpunpckhqdq	%xmm7,%xmm7,%xmm1
+	vmovdqu	32-32(%r9),%xmm15
+	vmovups	%xmm9,-96(%rsi)
+	vpshufb	%xmm0,%xmm9,%xmm9
+	vpxor	%xmm7,%xmm1,%xmm1
+	vmovups	%xmm10,-80(%rsi)
+	vpshufb	%xmm0,%xmm10,%xmm10
+	vmovups	%xmm11,-64(%rsi)
+	vpshufb	%xmm0,%xmm11,%xmm11
+	vmovups	%xmm12,-48(%rsi)
+	vpshufb	%xmm0,%xmm12,%xmm12
+	vmovups	%xmm13,-32(%rsi)
+	vpshufb	%xmm0,%xmm13,%xmm13
+	vmovups	%xmm14,-16(%rsi)
+	vpshufb	%xmm0,%xmm14,%xmm14
+	vmovdqu	%xmm9,16(%rsp)
+	vmovdqu	48(%rsp),%xmm6
+	vmovdqu	16-32(%r9),%xmm0
+	vpunpckhqdq	%xmm6,%xmm6,%xmm2
+	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm5
+	vpxor	%xmm6,%xmm2,%xmm2
+	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
+	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1
+
+	vmovdqu	64(%rsp),%xmm9
+	vpclmulqdq	$0x00,%xmm0,%xmm6,%xmm4
+	vmovdqu	48-32(%r9),%xmm3
+	vpxor	%xmm5,%xmm4,%xmm4
+	vpunpckhqdq	%xmm9,%xmm9,%xmm5
+	vpclmulqdq	$0x11,%xmm0,%xmm6,%xmm6
+	vpxor	%xmm9,%xmm5,%xmm5
+	vpxor	%xmm7,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
+	vmovdqu	80-32(%r9),%xmm15
+	vpxor	%xmm1,%xmm2,%xmm2
+
+	vmovdqu	80(%rsp),%xmm1
+	vpclmulqdq	$0x00,%xmm3,%xmm9,%xmm7
+	vmovdqu	64-32(%r9),%xmm0
+	vpxor	%xmm4,%xmm7,%xmm7
+	vpunpckhqdq	%xmm1,%xmm1,%xmm4
+	vpclmulqdq	$0x11,%xmm3,%xmm9,%xmm9
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpxor	%xmm6,%xmm9,%xmm9
+	vpclmulqdq	$0x00,%xmm15,%xmm5,%xmm5
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	96(%rsp),%xmm2
+	vpclmulqdq	$0x00,%xmm0,%xmm1,%xmm6
+	vmovdqu	96-32(%r9),%xmm3
+	vpxor	%xmm7,%xmm6,%xmm6
+	vpunpckhqdq	%xmm2,%xmm2,%xmm7
+	vpclmulqdq	$0x11,%xmm0,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpxor	%xmm9,%xmm1,%xmm1
+	vpclmulqdq	$0x10,%xmm15,%xmm4,%xmm4
+	vmovdqu	128-32(%r9),%xmm15
+	vpxor	%xmm5,%xmm4,%xmm4
+
+	vpxor	112(%rsp),%xmm8,%xmm8
+	vpclmulqdq	$0x00,%xmm3,%xmm2,%xmm5
+	vmovdqu	112-32(%r9),%xmm0
+	vpunpckhqdq	%xmm8,%xmm8,%xmm9
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x11,%xmm3,%xmm2,%xmm2
+	vpxor	%xmm8,%xmm9,%xmm9
+	vpxor	%xmm1,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm15,%xmm7,%xmm7
+	vpxor	%xmm4,%xmm7,%xmm4
+
+	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm6
+	vmovdqu	0-32(%r9),%xmm3
+	vpunpckhqdq	%xmm14,%xmm14,%xmm1
+	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm8
+	vpxor	%xmm14,%xmm1,%xmm1
+	vpxor	%xmm5,%xmm6,%xmm5
+	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm9
+	vmovdqu	32-32(%r9),%xmm15
+	vpxor	%xmm2,%xmm8,%xmm7
+	vpxor	%xmm4,%xmm9,%xmm6
+
+	vmovdqu	16-32(%r9),%xmm0
+	vpxor	%xmm5,%xmm7,%xmm9
+	vpclmulqdq	$0x00,%xmm3,%xmm14,%xmm4
+	vpxor	%xmm9,%xmm6,%xmm6
+	vpunpckhqdq	%xmm13,%xmm13,%xmm2
+	vpclmulqdq	$0x11,%xmm3,%xmm14,%xmm14
+	vpxor	%xmm13,%xmm2,%xmm2
+	vpslldq	$8,%xmm6,%xmm9
+	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1
+	vpxor	%xmm9,%xmm5,%xmm8
+	vpsrldq	$8,%xmm6,%xmm6
+	vpxor	%xmm6,%xmm7,%xmm7
+
+	vpclmulqdq	$0x00,%xmm0,%xmm13,%xmm5
+	vmovdqu	48-32(%r9),%xmm3
+	vpxor	%xmm4,%xmm5,%xmm5
+	vpunpckhqdq	%xmm12,%xmm12,%xmm9
+	vpclmulqdq	$0x11,%xmm0,%xmm13,%xmm13
+	vpxor	%xmm12,%xmm9,%xmm9
+	vpxor	%xmm14,%xmm13,%xmm13
+	vpalignr	$8,%xmm8,%xmm8,%xmm14
+	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
+	vmovdqu	80-32(%r9),%xmm15
+	vpxor	%xmm1,%xmm2,%xmm2
+
+	vpclmulqdq	$0x00,%xmm3,%xmm12,%xmm4
+	vmovdqu	64-32(%r9),%xmm0
+	vpxor	%xmm5,%xmm4,%xmm4
+	vpunpckhqdq	%xmm11,%xmm11,%xmm1
+	vpclmulqdq	$0x11,%xmm3,%xmm12,%xmm12
+	vpxor	%xmm11,%xmm1,%xmm1
+	vpxor	%xmm13,%xmm12,%xmm12
+	vxorps	16(%rsp),%xmm7,%xmm7
+	vpclmulqdq	$0x00,%xmm15,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm9,%xmm9
+
+	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
+	vxorps	%xmm14,%xmm8,%xmm8
+
+	vpclmulqdq	$0x00,%xmm0,%xmm11,%xmm5
+	vmovdqu	96-32(%r9),%xmm3
+	vpxor	%xmm4,%xmm5,%xmm5
+	vpunpckhqdq	%xmm10,%xmm10,%xmm2
+	vpclmulqdq	$0x11,%xmm0,%xmm11,%xmm11
+	vpxor	%xmm10,%xmm2,%xmm2
+	vpalignr	$8,%xmm8,%xmm8,%xmm14
+	vpxor	%xmm12,%xmm11,%xmm11
+	vpclmulqdq	$0x10,%xmm15,%xmm1,%xmm1
+	vmovdqu	128-32(%r9),%xmm15
+	vpxor	%xmm9,%xmm1,%xmm1
+
+	vxorps	%xmm7,%xmm14,%xmm14
+	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
+	vxorps	%xmm14,%xmm8,%xmm8
+
+	vpclmulqdq	$0x00,%xmm3,%xmm10,%xmm4
+	vmovdqu	112-32(%r9),%xmm0
+	vpxor	%xmm5,%xmm4,%xmm4
+	vpunpckhqdq	%xmm8,%xmm8,%xmm9
+	vpclmulqdq	$0x11,%xmm3,%xmm10,%xmm10
+	vpxor	%xmm8,%xmm9,%xmm9
+	vpxor	%xmm11,%xmm10,%xmm10
+	vpclmulqdq	$0x00,%xmm15,%xmm2,%xmm2
+	vpxor	%xmm1,%xmm2,%xmm2
+
+	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm5
+	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm7
+	vpxor	%xmm4,%xmm5,%xmm5
+	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm6
+	vpxor	%xmm10,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm6,%xmm6
+
+	vpxor	%xmm5,%xmm7,%xmm4
+	vpxor	%xmm4,%xmm6,%xmm6
+	vpslldq	$8,%xmm6,%xmm1
+	vmovdqu	16(%r11),%xmm3
+	vpsrldq	$8,%xmm6,%xmm6
+	vpxor	%xmm1,%xmm5,%xmm8
+	vpxor	%xmm6,%xmm7,%xmm7
+
+	vpalignr	$8,%xmm8,%xmm8,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
+	vpxor	%xmm2,%xmm8,%xmm8
+
+	vpalignr	$8,%xmm8,%xmm8,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
+	vpxor	%xmm7,%xmm2,%xmm2
+	vpxor	%xmm2,%xmm8,%xmm8
+	movq	16(%rbp),%r12
+	vpshufb	(%r11),%xmm8,%xmm8
+	vmovdqu	%xmm8,(%r12)
+
+	vzeroupper
+	leaq	-40(%rbp),%rsp
+.cfi_def_cfa	%rsp, 0x38
+	popq	%r15
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r15
+	popq	%r14
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r14
+	popq	%r13
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r13
+	popq	%r12
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r12
+	popq	%rbx
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%rbx
+	popq	%rbp
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%rbp
+.Lgcm_enc_abort:
+	ret
+
+.cfi_endproc	
+.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
+.section	.rodata
+.align	64
+.Lbswap_mask:
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lpoly:
+.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.Lone_msb:
+.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Ltwo_lsb:
+.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.Lone_lsb:
+.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.byte	65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	64
+.text	
+#endif
diff --git a/gen/bcm/aesni-gcm-x86_64-win.asm b/gen/bcm/aesni-gcm-x86_64-win.asm
new file mode 100644
index 0000000..d7a2665
--- /dev/null
+++ b/gen/bcm/aesni-gcm-x86_64-win.asm
@@ -0,0 +1,1101 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
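+; The win64 flavor of the stitched AES-NI/PCLMULQDQ AES-GCM code: the same
+; perlasm source emitted in NASM syntax, with SEH unwind tables standing in
+; for the CFI directives used in the ELF flavor.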
+%ifidn __OUTPUT_FORMAT__, win64
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section	.text code align=64
+
+
+
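+; _aesni_ctr32_ghash_6x is the stitched core: each trip through $L$oop6x
+; encrypts six counter blocks while folding six earlier ciphertext blocks
+; into the GHASH accumulator with PCLMULQDQ, overlapping the two chains.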
+ALIGN	32
+_aesni_ctr32_ghash_6x:
+
+	vmovdqu	xmm2,XMMWORD[32+r11]
+	sub	r8,6
+	vpxor	xmm4,xmm4,xmm4
+	vmovdqu	xmm15,XMMWORD[((0-128))+r9]
+	vpaddb	xmm10,xmm1,xmm2
+	vpaddb	xmm11,xmm10,xmm2
+	vpaddb	xmm12,xmm11,xmm2
+	vpaddb	xmm13,xmm12,xmm2
+	vpaddb	xmm14,xmm13,xmm2
+	vpxor	xmm9,xmm1,xmm15
+	vmovdqu	XMMWORD[(16+8)+rsp],xmm4
+	jmp	NEAR $L$oop6x
+
+ALIGN	32
+$L$oop6x:
+	add	ebx,100663296
+	jc	NEAR $L$handle_ctr32
+	vmovdqu	xmm3,XMMWORD[((0-32))+rsi]
+	vpaddb	xmm1,xmm14,xmm2
+	vpxor	xmm10,xmm10,xmm15
+	vpxor	xmm11,xmm11,xmm15
+
+$L$resume_ctr32:
+	vmovdqu	XMMWORD[rdi],xmm1
+	vpclmulqdq	xmm5,xmm7,xmm3,0x10
+	vpxor	xmm12,xmm12,xmm15
+	vmovups	xmm2,XMMWORD[((16-128))+r9]
+	vpclmulqdq	xmm6,xmm7,xmm3,0x01
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	xor	r12,r12
+	cmp	r15,r14
+
+	vaesenc	xmm9,xmm9,xmm2
+	vmovdqu	xmm0,XMMWORD[((48+8))+rsp]
+	vpxor	xmm13,xmm13,xmm15
+	vpclmulqdq	xmm1,xmm7,xmm3,0x00
+	vaesenc	xmm10,xmm10,xmm2
+	vpxor	xmm14,xmm14,xmm15
+	setnc	r12b
+	vpclmulqdq	xmm7,xmm7,xmm3,0x11
+	vaesenc	xmm11,xmm11,xmm2
+	vmovdqu	xmm3,XMMWORD[((16-32))+rsi]
+	neg	r12
+	vaesenc	xmm12,xmm12,xmm2
+	vpxor	xmm6,xmm6,xmm5
+	vpclmulqdq	xmm5,xmm0,xmm3,0x00
+	vpxor	xmm8,xmm8,xmm4
+	vaesenc	xmm13,xmm13,xmm2
+	vpxor	xmm4,xmm1,xmm5
+	and	r12,0x60
+	vmovups	xmm15,XMMWORD[((32-128))+r9]
+	vpclmulqdq	xmm1,xmm0,xmm3,0x10
+	vaesenc	xmm14,xmm14,xmm2
+
+	vpclmulqdq	xmm2,xmm0,xmm3,0x01
+	lea	r14,[r12*1+r14]
+	vaesenc	xmm9,xmm9,xmm15
+	vpxor	xmm8,xmm8,XMMWORD[((16+8))+rsp]
+	vpclmulqdq	xmm3,xmm0,xmm3,0x11
+	vmovdqu	xmm0,XMMWORD[((64+8))+rsp]
+	vaesenc	xmm10,xmm10,xmm15
+	movbe	r13,QWORD[88+r14]
+	vaesenc	xmm11,xmm11,xmm15
+	movbe	r12,QWORD[80+r14]
+	vaesenc	xmm12,xmm12,xmm15
+	mov	QWORD[((32+8))+rsp],r13
+	vaesenc	xmm13,xmm13,xmm15
+	mov	QWORD[((40+8))+rsp],r12
+	vmovdqu	xmm5,XMMWORD[((48-32))+rsi]
+	vaesenc	xmm14,xmm14,xmm15
+
+	vmovups	xmm15,XMMWORD[((48-128))+r9]
+	vpxor	xmm6,xmm6,xmm1
+	vpclmulqdq	xmm1,xmm0,xmm5,0x00
+	vaesenc	xmm9,xmm9,xmm15
+	vpxor	xmm6,xmm6,xmm2
+	vpclmulqdq	xmm2,xmm0,xmm5,0x10
+	vaesenc	xmm10,xmm10,xmm15
+	vpxor	xmm7,xmm7,xmm3
+	vpclmulqdq	xmm3,xmm0,xmm5,0x01
+	vaesenc	xmm11,xmm11,xmm15
+	vpclmulqdq	xmm5,xmm0,xmm5,0x11
+	vmovdqu	xmm0,XMMWORD[((80+8))+rsp]
+	vaesenc	xmm12,xmm12,xmm15
+	vaesenc	xmm13,xmm13,xmm15
+	vpxor	xmm4,xmm4,xmm1
+	vmovdqu	xmm1,XMMWORD[((64-32))+rsi]
+	vaesenc	xmm14,xmm14,xmm15
+
+	vmovups	xmm15,XMMWORD[((64-128))+r9]
+	vpxor	xmm6,xmm6,xmm2
+	vpclmulqdq	xmm2,xmm0,xmm1,0x00
+	vaesenc	xmm9,xmm9,xmm15
+	vpxor	xmm6,xmm6,xmm3
+	vpclmulqdq	xmm3,xmm0,xmm1,0x10
+	vaesenc	xmm10,xmm10,xmm15
+	movbe	r13,QWORD[72+r14]
+	vpxor	xmm7,xmm7,xmm5
+	vpclmulqdq	xmm5,xmm0,xmm1,0x01
+	vaesenc	xmm11,xmm11,xmm15
+	movbe	r12,QWORD[64+r14]
+	vpclmulqdq	xmm1,xmm0,xmm1,0x11
+	vmovdqu	xmm0,XMMWORD[((96+8))+rsp]
+	vaesenc	xmm12,xmm12,xmm15
+	mov	QWORD[((48+8))+rsp],r13
+	vaesenc	xmm13,xmm13,xmm15
+	mov	QWORD[((56+8))+rsp],r12
+	vpxor	xmm4,xmm4,xmm2
+	vmovdqu	xmm2,XMMWORD[((96-32))+rsi]
+	vaesenc	xmm14,xmm14,xmm15
+
+	vmovups	xmm15,XMMWORD[((80-128))+r9]
+	vpxor	xmm6,xmm6,xmm3
+	vpclmulqdq	xmm3,xmm0,xmm2,0x00
+	vaesenc	xmm9,xmm9,xmm15
+	vpxor	xmm6,xmm6,xmm5
+	vpclmulqdq	xmm5,xmm0,xmm2,0x10
+	vaesenc	xmm10,xmm10,xmm15
+	movbe	r13,QWORD[56+r14]
+	vpxor	xmm7,xmm7,xmm1
+	vpclmulqdq	xmm1,xmm0,xmm2,0x01
+	vpxor	xmm8,xmm8,XMMWORD[((112+8))+rsp]
+	vaesenc	xmm11,xmm11,xmm15
+	movbe	r12,QWORD[48+r14]
+	vpclmulqdq	xmm2,xmm0,xmm2,0x11
+	vaesenc	xmm12,xmm12,xmm15
+	mov	QWORD[((64+8))+rsp],r13
+	vaesenc	xmm13,xmm13,xmm15
+	mov	QWORD[((72+8))+rsp],r12
+	vpxor	xmm4,xmm4,xmm3
+	vmovdqu	xmm3,XMMWORD[((112-32))+rsi]
+	vaesenc	xmm14,xmm14,xmm15
+
+	vmovups	xmm15,XMMWORD[((96-128))+r9]
+	vpxor	xmm6,xmm6,xmm5
+	vpclmulqdq	xmm5,xmm8,xmm3,0x10
+	vaesenc	xmm9,xmm9,xmm15
+	vpxor	xmm6,xmm6,xmm1
+	vpclmulqdq	xmm1,xmm8,xmm3,0x01
+	vaesenc	xmm10,xmm10,xmm15
+	movbe	r13,QWORD[40+r14]
+	vpxor	xmm7,xmm7,xmm2
+	vpclmulqdq	xmm2,xmm8,xmm3,0x00
+	vaesenc	xmm11,xmm11,xmm15
+	movbe	r12,QWORD[32+r14]
+	vpclmulqdq	xmm8,xmm8,xmm3,0x11
+	vaesenc	xmm12,xmm12,xmm15
+	mov	QWORD[((80+8))+rsp],r13
+	vaesenc	xmm13,xmm13,xmm15
+	mov	QWORD[((88+8))+rsp],r12
+	vpxor	xmm6,xmm6,xmm5
+	vaesenc	xmm14,xmm14,xmm15
+	vpxor	xmm6,xmm6,xmm1
+
+	vmovups	xmm15,XMMWORD[((112-128))+r9]
+	vpslldq	xmm5,xmm6,8
+	vpxor	xmm4,xmm4,xmm2
+	vmovdqu	xmm3,XMMWORD[16+r11]
+
+	vaesenc	xmm9,xmm9,xmm15
+	vpxor	xmm7,xmm7,xmm8
+	vaesenc	xmm10,xmm10,xmm15
+	vpxor	xmm4,xmm4,xmm5
+	movbe	r13,QWORD[24+r14]
+	vaesenc	xmm11,xmm11,xmm15
+	movbe	r12,QWORD[16+r14]
+	vpalignr	xmm0,xmm4,xmm4,8
+	vpclmulqdq	xmm4,xmm4,xmm3,0x10
+	mov	QWORD[((96+8))+rsp],r13
+	vaesenc	xmm12,xmm12,xmm15
+	mov	QWORD[((104+8))+rsp],r12
+	vaesenc	xmm13,xmm13,xmm15
+	vmovups	xmm1,XMMWORD[((128-128))+r9]
+	vaesenc	xmm14,xmm14,xmm15
+
+	vaesenc	xmm9,xmm9,xmm1
+	vmovups	xmm15,XMMWORD[((144-128))+r9]
+	vaesenc	xmm10,xmm10,xmm1
+	vpsrldq	xmm6,xmm6,8
+	vaesenc	xmm11,xmm11,xmm1
+	vpxor	xmm7,xmm7,xmm6
+	vaesenc	xmm12,xmm12,xmm1
+	vpxor	xmm4,xmm4,xmm0
+	movbe	r13,QWORD[8+r14]
+	vaesenc	xmm13,xmm13,xmm1
+	movbe	r12,QWORD[r14]
+	vaesenc	xmm14,xmm14,xmm1
+	vmovups	xmm1,XMMWORD[((160-128))+r9]
+	cmp	r10d,11
+	jb	NEAR $L$enc_tail
+
+	vaesenc	xmm9,xmm9,xmm15
+	vaesenc	xmm10,xmm10,xmm15
+	vaesenc	xmm11,xmm11,xmm15
+	vaesenc	xmm12,xmm12,xmm15
+	vaesenc	xmm13,xmm13,xmm15
+	vaesenc	xmm14,xmm14,xmm15
+
+	vaesenc	xmm9,xmm9,xmm1
+	vaesenc	xmm10,xmm10,xmm1
+	vaesenc	xmm11,xmm11,xmm1
+	vaesenc	xmm12,xmm12,xmm1
+	vaesenc	xmm13,xmm13,xmm1
+	vmovups	xmm15,XMMWORD[((176-128))+r9]
+	vaesenc	xmm14,xmm14,xmm1
+	vmovups	xmm1,XMMWORD[((192-128))+r9]
+	je	NEAR $L$enc_tail
+
+	vaesenc	xmm9,xmm9,xmm15
+	vaesenc	xmm10,xmm10,xmm15
+	vaesenc	xmm11,xmm11,xmm15
+	vaesenc	xmm12,xmm12,xmm15
+	vaesenc	xmm13,xmm13,xmm15
+	vaesenc	xmm14,xmm14,xmm15
+
+	vaesenc	xmm9,xmm9,xmm1
+	vaesenc	xmm10,xmm10,xmm1
+	vaesenc	xmm11,xmm11,xmm1
+	vaesenc	xmm12,xmm12,xmm1
+	vaesenc	xmm13,xmm13,xmm1
+	vmovups	xmm15,XMMWORD[((208-128))+r9]
+	vaesenc	xmm14,xmm14,xmm1
+	vmovups	xmm1,XMMWORD[((224-128))+r9]
+	jmp	NEAR $L$enc_tail
+
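+; Slow path for generating counter blocks, taken when adding 6 to the low
+; byte of the big-endian counter carries (the add of 6<<24 to ebx in
+; $L$oop6x sets CF): byte-swap, increment with 32-bit adds, swap back.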
+ALIGN	32
+$L$handle_ctr32:
+	vmovdqu	xmm0,XMMWORD[r11]
+	vpshufb	xmm6,xmm1,xmm0
+	vmovdqu	xmm5,XMMWORD[48+r11]
+	vpaddd	xmm10,xmm6,XMMWORD[64+r11]
+	vpaddd	xmm11,xmm6,xmm5
+	vmovdqu	xmm3,XMMWORD[((0-32))+rsi]
+	vpaddd	xmm12,xmm10,xmm5
+	vpshufb	xmm10,xmm10,xmm0
+	vpaddd	xmm13,xmm11,xmm5
+	vpshufb	xmm11,xmm11,xmm0
+	vpxor	xmm10,xmm10,xmm15
+	vpaddd	xmm14,xmm12,xmm5
+	vpshufb	xmm12,xmm12,xmm0
+	vpxor	xmm11,xmm11,xmm15
+	vpaddd	xmm1,xmm13,xmm5
+	vpshufb	xmm13,xmm13,xmm0
+	vpshufb	xmm14,xmm14,xmm0
+	vpshufb	xmm1,xmm1,xmm0
+	jmp	NEAR $L$resume_ctr32
+
+ALIGN	32
+$L$enc_tail:
+	vaesenc	xmm9,xmm9,xmm15
+	vmovdqu	XMMWORD[(16+8)+rsp],xmm7
+	vpalignr	xmm8,xmm4,xmm4,8
+	vaesenc	xmm10,xmm10,xmm15
+	vpclmulqdq	xmm4,xmm4,xmm3,0x10
+	vpxor	xmm2,xmm1,XMMWORD[rcx]
+	vaesenc	xmm11,xmm11,xmm15
+	vpxor	xmm0,xmm1,XMMWORD[16+rcx]
+	vaesenc	xmm12,xmm12,xmm15
+	vpxor	xmm5,xmm1,XMMWORD[32+rcx]
+	vaesenc	xmm13,xmm13,xmm15
+	vpxor	xmm6,xmm1,XMMWORD[48+rcx]
+	vaesenc	xmm14,xmm14,xmm15
+	vpxor	xmm7,xmm1,XMMWORD[64+rcx]
+	vpxor	xmm3,xmm1,XMMWORD[80+rcx]
+	vmovdqu	xmm1,XMMWORD[rdi]
+
+	vaesenclast	xmm9,xmm9,xmm2
+	vmovdqu	xmm2,XMMWORD[32+r11]
+	vaesenclast	xmm10,xmm10,xmm0
+	vpaddb	xmm0,xmm1,xmm2
+	mov	QWORD[((112+8))+rsp],r13
+	lea	rcx,[96+rcx]
+
+	prefetcht0	[512+rcx]
+	prefetcht0	[576+rcx]
+	vaesenclast	xmm11,xmm11,xmm5
+	vpaddb	xmm5,xmm0,xmm2
+	mov	QWORD[((120+8))+rsp],r12
+	lea	rdx,[96+rdx]
+	vmovdqu	xmm15,XMMWORD[((0-128))+r9]
+	vaesenclast	xmm12,xmm12,xmm6
+	vpaddb	xmm6,xmm5,xmm2
+	vaesenclast	xmm13,xmm13,xmm7
+	vpaddb	xmm7,xmm6,xmm2
+	vaesenclast	xmm14,xmm14,xmm3
+	vpaddb	xmm3,xmm7,xmm2
+
+	add	rax,0x60
+	sub	r8,0x6
+	jc	NEAR $L$6x_done
+
+	vmovups	XMMWORD[(-96)+rdx],xmm9
+	vpxor	xmm9,xmm1,xmm15
+	vmovups	XMMWORD[(-80)+rdx],xmm10
+	vmovdqa	xmm10,xmm0
+	vmovups	XMMWORD[(-64)+rdx],xmm11
+	vmovdqa	xmm11,xmm5
+	vmovups	XMMWORD[(-48)+rdx],xmm12
+	vmovdqa	xmm12,xmm6
+	vmovups	XMMWORD[(-32)+rdx],xmm13
+	vmovdqa	xmm13,xmm7
+	vmovups	XMMWORD[(-16)+rdx],xmm14
+	vmovdqa	xmm14,xmm3
+	vmovdqu	xmm7,XMMWORD[((32+8))+rsp]
+	jmp	NEAR $L$oop6x
+
+$L$6x_done:
+	vpxor	xmm8,xmm8,XMMWORD[((16+8))+rsp]
+	vpxor	xmm8,xmm8,xmm4
+
+	ret
+
+
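+; aesni_gcm_decrypt hashes and decrypts r8 bytes, refusing inputs shorter
+; than 96 bytes (0x60). GHASH is computed over ciphertext, so the first six
+; blocks are byte-swapped and stashed on the stack before decryption starts.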
+global	aesni_gcm_decrypt
+
+ALIGN	32
+aesni_gcm_decrypt:
+
+$L$SEH_begin_aesni_gcm_decrypt_1:
+_CET_ENDBR
+	xor	rax,rax
+
+
+
+	cmp	r8,0x60
+	jb	NEAR $L$gcm_dec_abort
+
+	push	rbp
+
+$L$SEH_prolog_aesni_gcm_decrypt_2:
+	mov	rbp,rsp
+
+	push	rbx
+
+$L$SEH_prolog_aesni_gcm_decrypt_3:
+	push	r12
+
+$L$SEH_prolog_aesni_gcm_decrypt_4:
+	push	r13
+
+$L$SEH_prolog_aesni_gcm_decrypt_5:
+	push	r14
+
+$L$SEH_prolog_aesni_gcm_decrypt_6:
+	push	r15
+
+$L$SEH_prolog_aesni_gcm_decrypt_7:
+	lea	rsp,[((-168))+rsp]
+$L$SEH_prolog_aesni_gcm_decrypt_8:
+$L$SEH_prolog_aesni_gcm_decrypt_9:
+
+
+
+	mov	QWORD[16+rbp],rdi
+$L$SEH_prolog_aesni_gcm_decrypt_10:
+	mov	QWORD[24+rbp],rsi
+$L$SEH_prolog_aesni_gcm_decrypt_11:
+	mov	rdi,QWORD[48+rbp]
+	mov	rsi,QWORD[56+rbp]
+
+	movaps	XMMWORD[(-208)+rbp],xmm6
+$L$SEH_prolog_aesni_gcm_decrypt_12:
+	movaps	XMMWORD[(-192)+rbp],xmm7
+$L$SEH_prolog_aesni_gcm_decrypt_13:
+	movaps	XMMWORD[(-176)+rbp],xmm8
+$L$SEH_prolog_aesni_gcm_decrypt_14:
+	movaps	XMMWORD[(-160)+rbp],xmm9
+$L$SEH_prolog_aesni_gcm_decrypt_15:
+	movaps	XMMWORD[(-144)+rbp],xmm10
+$L$SEH_prolog_aesni_gcm_decrypt_16:
+	movaps	XMMWORD[(-128)+rbp],xmm11
+$L$SEH_prolog_aesni_gcm_decrypt_17:
+	movaps	XMMWORD[(-112)+rbp],xmm12
+$L$SEH_prolog_aesni_gcm_decrypt_18:
+	movaps	XMMWORD[(-96)+rbp],xmm13
+$L$SEH_prolog_aesni_gcm_decrypt_19:
+	movaps	XMMWORD[(-80)+rbp],xmm14
+$L$SEH_prolog_aesni_gcm_decrypt_20:
+	movaps	XMMWORD[(-64)+rbp],xmm15
+$L$SEH_prolog_aesni_gcm_decrypt_21:
+	vzeroupper
+
+	mov	r12,QWORD[64+rbp]
+	vmovdqu	xmm1,XMMWORD[rdi]
+	add	rsp,-128
+	mov	ebx,DWORD[12+rdi]
+	lea	r11,[$L$bswap_mask]
+	lea	r14,[((-128))+r9]
+	mov	r15,0xf80
+	vmovdqu	xmm8,XMMWORD[r12]
+	and	rsp,-128
+	vmovdqu	xmm0,XMMWORD[r11]
+	lea	r9,[128+r9]
+	lea	rsi,[32+rsi]
+	mov	r10d,DWORD[((240-128))+r9]
+	vpshufb	xmm8,xmm8,xmm0
+
+	and	r14,r15
+	and	r15,rsp
+	sub	r15,r14
+	jc	NEAR $L$dec_no_key_aliasing
+	cmp	r15,768
+	jnc	NEAR $L$dec_no_key_aliasing
+	sub	rsp,r15
+$L$dec_no_key_aliasing:
+
+	vmovdqu	xmm7,XMMWORD[80+rcx]
+	mov	r14,rcx
+	vmovdqu	xmm4,XMMWORD[64+rcx]
+
+
+
+
+
+
+
+	lea	r15,[((-192))+r8*1+rcx]
+
+	vmovdqu	xmm5,XMMWORD[48+rcx]
+	shr	r8,4
+	xor	rax,rax
+	vmovdqu	xmm6,XMMWORD[32+rcx]
+	vpshufb	xmm7,xmm7,xmm0
+	vmovdqu	xmm2,XMMWORD[16+rcx]
+	vpshufb	xmm4,xmm4,xmm0
+	vmovdqu	xmm3,XMMWORD[rcx]
+	vpshufb	xmm5,xmm5,xmm0
+	vmovdqu	XMMWORD[48+rsp],xmm4
+	vpshufb	xmm6,xmm6,xmm0
+	vmovdqu	XMMWORD[64+rsp],xmm5
+	vpshufb	xmm2,xmm2,xmm0
+	vmovdqu	XMMWORD[80+rsp],xmm6
+	vpshufb	xmm3,xmm3,xmm0
+	vmovdqu	XMMWORD[96+rsp],xmm2
+	vmovdqu	XMMWORD[112+rsp],xmm3
+
+	call	_aesni_ctr32_ghash_6x
+
+	mov	r12,QWORD[64+rbp]
+	vmovups	XMMWORD[(-96)+rdx],xmm9
+	vmovups	XMMWORD[(-80)+rdx],xmm10
+	vmovups	XMMWORD[(-64)+rdx],xmm11
+	vmovups	XMMWORD[(-48)+rdx],xmm12
+	vmovups	XMMWORD[(-32)+rdx],xmm13
+	vmovups	XMMWORD[(-16)+rdx],xmm14
+
+	vpshufb	xmm8,xmm8,XMMWORD[r11]
+	vmovdqu	XMMWORD[r12],xmm8
+
+	vzeroupper
+	movaps	xmm6,XMMWORD[((-208))+rbp]
+	movaps	xmm7,XMMWORD[((-192))+rbp]
+	movaps	xmm8,XMMWORD[((-176))+rbp]
+	movaps	xmm9,XMMWORD[((-160))+rbp]
+	movaps	xmm10,XMMWORD[((-144))+rbp]
+	movaps	xmm11,XMMWORD[((-128))+rbp]
+	movaps	xmm12,XMMWORD[((-112))+rbp]
+	movaps	xmm13,XMMWORD[((-96))+rbp]
+	movaps	xmm14,XMMWORD[((-80))+rbp]
+	movaps	xmm15,XMMWORD[((-64))+rbp]
+	mov	rdi,QWORD[16+rbp]
+	mov	rsi,QWORD[24+rbp]
+	lea	rsp,[((-40))+rbp]
+
+	pop	r15
+
+	pop	r14
+
+	pop	r13
+
+	pop	r12
+
+	pop	rbx
+
+	pop	rbp
+
+$L$gcm_dec_abort:
+	ret
+$L$SEH_end_aesni_gcm_decrypt_22:
+
+
+
+ALIGN	32
+_aesni_ctr32_6x:
+
+	vmovdqu	xmm4,XMMWORD[((0-128))+r9]
+	vmovdqu	xmm2,XMMWORD[32+r11]
+	lea	r13,[((-1))+r10]
+	vmovups	xmm15,XMMWORD[((16-128))+r9]
+	lea	r12,[((32-128))+r9]
+	vpxor	xmm9,xmm1,xmm4
+	add	ebx,100663296
+	jc	NEAR $L$handle_ctr32_2
+	vpaddb	xmm10,xmm1,xmm2
+	vpaddb	xmm11,xmm10,xmm2
+	vpxor	xmm10,xmm10,xmm4
+	vpaddb	xmm12,xmm11,xmm2
+	vpxor	xmm11,xmm11,xmm4
+	vpaddb	xmm13,xmm12,xmm2
+	vpxor	xmm12,xmm12,xmm4
+	vpaddb	xmm14,xmm13,xmm2
+	vpxor	xmm13,xmm13,xmm4
+	vpaddb	xmm1,xmm14,xmm2
+	vpxor	xmm14,xmm14,xmm4
+	jmp	NEAR $L$oop_ctr32
+
+ALIGN	16
+$L$oop_ctr32:
+	vaesenc	xmm9,xmm9,xmm15
+	vaesenc	xmm10,xmm10,xmm15
+	vaesenc	xmm11,xmm11,xmm15
+	vaesenc	xmm12,xmm12,xmm15
+	vaesenc	xmm13,xmm13,xmm15
+	vaesenc	xmm14,xmm14,xmm15
+	vmovups	xmm15,XMMWORD[r12]
+	lea	r12,[16+r12]
+	dec	r13d
+	jnz	NEAR $L$oop_ctr32
+
+	vmovdqu	xmm3,XMMWORD[r12]
+	vaesenc	xmm9,xmm9,xmm15
+	vpxor	xmm4,xmm3,XMMWORD[rcx]
+	vaesenc	xmm10,xmm10,xmm15
+	vpxor	xmm5,xmm3,XMMWORD[16+rcx]
+	vaesenc	xmm11,xmm11,xmm15
+	vpxor	xmm6,xmm3,XMMWORD[32+rcx]
+	vaesenc	xmm12,xmm12,xmm15
+	vpxor	xmm8,xmm3,XMMWORD[48+rcx]
+	vaesenc	xmm13,xmm13,xmm15
+	vpxor	xmm2,xmm3,XMMWORD[64+rcx]
+	vaesenc	xmm14,xmm14,xmm15
+	vpxor	xmm3,xmm3,XMMWORD[80+rcx]
+	lea	rcx,[96+rcx]
+
+	vaesenclast	xmm9,xmm9,xmm4
+	vaesenclast	xmm10,xmm10,xmm5
+	vaesenclast	xmm11,xmm11,xmm6
+	vaesenclast	xmm12,xmm12,xmm8
+	vaesenclast	xmm13,xmm13,xmm2
+	vaesenclast	xmm14,xmm14,xmm3
+	vmovups	XMMWORD[rdx],xmm9
+	vmovups	XMMWORD[16+rdx],xmm10
+	vmovups	XMMWORD[32+rdx],xmm11
+	vmovups	XMMWORD[48+rdx],xmm12
+	vmovups	XMMWORD[64+rdx],xmm13
+	vmovups	XMMWORD[80+rdx],xmm14
+	lea	rdx,[96+rdx]
+
+	ret
+ALIGN	32
+$L$handle_ctr32_2:
+	vpshufb	xmm6,xmm1,xmm0
+	vmovdqu	xmm5,XMMWORD[48+r11]
+	vpaddd	xmm10,xmm6,XMMWORD[64+r11]
+	vpaddd	xmm11,xmm6,xmm5
+	vpaddd	xmm12,xmm10,xmm5
+	vpshufb	xmm10,xmm10,xmm0
+	vpaddd	xmm13,xmm11,xmm5
+	vpshufb	xmm11,xmm11,xmm0
+	vpxor	xmm10,xmm10,xmm4
+	vpaddd	xmm14,xmm12,xmm5
+	vpshufb	xmm12,xmm12,xmm0
+	vpxor	xmm11,xmm11,xmm4
+	vpaddd	xmm1,xmm13,xmm5
+	vpshufb	xmm13,xmm13,xmm0
+	vpxor	xmm12,xmm12,xmm4
+	vpshufb	xmm14,xmm14,xmm0
+	vpxor	xmm13,xmm13,xmm4
+	vpshufb	xmm1,xmm1,xmm0
+	vpxor	xmm14,xmm14,xmm4
+	jmp	NEAR $L$oop_ctr32
+
+
+
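+; aesni_gcm_encrypt, win64 flavor: as in the ELF version, inputs shorter
+; than 288 bytes (0x60*3) fall through to $L$gcm_enc_abort and return zero.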
+global	aesni_gcm_encrypt
+
+ALIGN	32
+aesni_gcm_encrypt:
+
+$L$SEH_begin_aesni_gcm_encrypt_1:
+_CET_ENDBR
+%ifdef BORINGSSL_DISPATCH_TEST
+EXTERN	BORINGSSL_function_hit
+	mov	BYTE[((BORINGSSL_function_hit+2))],1
+%endif
+	xor	rax,rax
+
+
+
+
+	cmp	r8,0x60*3
+	jb	NEAR $L$gcm_enc_abort
+
+	push	rbp
+
+$L$SEH_prolog_aesni_gcm_encrypt_2:
+	mov	rbp,rsp
+
+	push	rbx
+
+$L$SEH_prolog_aesni_gcm_encrypt_3:
+	push	r12
+
+$L$SEH_prolog_aesni_gcm_encrypt_4:
+	push	r13
+
+$L$SEH_prolog_aesni_gcm_encrypt_5:
+	push	r14
+
+$L$SEH_prolog_aesni_gcm_encrypt_6:
+	push	r15
+
+$L$SEH_prolog_aesni_gcm_encrypt_7:
+	lea	rsp,[((-168))+rsp]
+$L$SEH_prolog_aesni_gcm_encrypt_8:
+$L$SEH_prolog_aesni_gcm_encrypt_9:
+
+
+
+	mov	QWORD[16+rbp],rdi
+$L$SEH_prolog_aesni_gcm_encrypt_10:
+	mov	QWORD[24+rbp],rsi
+$L$SEH_prolog_aesni_gcm_encrypt_11:
+	mov	rdi,QWORD[48+rbp]
+	mov	rsi,QWORD[56+rbp]
+
+	movaps	XMMWORD[(-208)+rbp],xmm6
+$L$SEH_prolog_aesni_gcm_encrypt_12:
+	movaps	XMMWORD[(-192)+rbp],xmm7
+$L$SEH_prolog_aesni_gcm_encrypt_13:
+	movaps	XMMWORD[(-176)+rbp],xmm8
+$L$SEH_prolog_aesni_gcm_encrypt_14:
+	movaps	XMMWORD[(-160)+rbp],xmm9
+$L$SEH_prolog_aesni_gcm_encrypt_15:
+	movaps	XMMWORD[(-144)+rbp],xmm10
+$L$SEH_prolog_aesni_gcm_encrypt_16:
+	movaps	XMMWORD[(-128)+rbp],xmm11
+$L$SEH_prolog_aesni_gcm_encrypt_17:
+	movaps	XMMWORD[(-112)+rbp],xmm12
+$L$SEH_prolog_aesni_gcm_encrypt_18:
+	movaps	XMMWORD[(-96)+rbp],xmm13
+$L$SEH_prolog_aesni_gcm_encrypt_19:
+	movaps	XMMWORD[(-80)+rbp],xmm14
+$L$SEH_prolog_aesni_gcm_encrypt_20:
+	movaps	XMMWORD[(-64)+rbp],xmm15
+$L$SEH_prolog_aesni_gcm_encrypt_21:
+	vzeroupper
+
+	vmovdqu	xmm1,XMMWORD[rdi]
+	add	rsp,-128
+	mov	ebx,DWORD[12+rdi]
+	lea	r11,[$L$bswap_mask]
+	lea	r14,[((-128))+r9]
+	mov	r15,0xf80
+	lea	r9,[128+r9]
+	vmovdqu	xmm0,XMMWORD[r11]
+	and	rsp,-128
+	mov	r10d,DWORD[((240-128))+r9]
+
+	and	r14,r15
+	and	r15,rsp
+	sub	r15,r14
+	jc	NEAR $L$enc_no_key_aliasing
+	cmp	r15,768
+	jnc	NEAR $L$enc_no_key_aliasing
+	sub	rsp,r15
+$L$enc_no_key_aliasing:
+
+	mov	r14,rdx
+
+
+
+
+
+
+
+
+	lea	r15,[((-192))+r8*1+rdx]
+
+	shr	r8,4
+
+	call	_aesni_ctr32_6x
+	vpshufb	xmm8,xmm9,xmm0
+	vpshufb	xmm2,xmm10,xmm0
+	vmovdqu	XMMWORD[112+rsp],xmm8
+	vpshufb	xmm4,xmm11,xmm0
+	vmovdqu	XMMWORD[96+rsp],xmm2
+	vpshufb	xmm5,xmm12,xmm0
+	vmovdqu	XMMWORD[80+rsp],xmm4
+	vpshufb	xmm6,xmm13,xmm0
+	vmovdqu	XMMWORD[64+rsp],xmm5
+	vpshufb	xmm7,xmm14,xmm0
+	vmovdqu	XMMWORD[48+rsp],xmm6
+
+	call	_aesni_ctr32_6x
+
+	mov	r12,QWORD[64+rbp]
+	lea	rsi,[32+rsi]
+	vmovdqu	xmm8,XMMWORD[r12]
+	sub	r8,12
+	mov	rax,0x60*2
+	vpshufb	xmm8,xmm8,xmm0
+
+	call	_aesni_ctr32_ghash_6x
+	vmovdqu	xmm7,XMMWORD[32+rsp]
+	vmovdqu	xmm0,XMMWORD[r11]
+	vmovdqu	xmm3,XMMWORD[((0-32))+rsi]
+	vpunpckhqdq	xmm1,xmm7,xmm7
+	vmovdqu	xmm15,XMMWORD[((32-32))+rsi]
+	vmovups	XMMWORD[(-96)+rdx],xmm9
+	vpshufb	xmm9,xmm9,xmm0
+	vpxor	xmm1,xmm1,xmm7
+	vmovups	XMMWORD[(-80)+rdx],xmm10
+	vpshufb	xmm10,xmm10,xmm0
+	vmovups	XMMWORD[(-64)+rdx],xmm11
+	vpshufb	xmm11,xmm11,xmm0
+	vmovups	XMMWORD[(-48)+rdx],xmm12
+	vpshufb	xmm12,xmm12,xmm0
+	vmovups	XMMWORD[(-32)+rdx],xmm13
+	vpshufb	xmm13,xmm13,xmm0
+	vmovups	XMMWORD[(-16)+rdx],xmm14
+	vpshufb	xmm14,xmm14,xmm0
+	vmovdqu	XMMWORD[16+rsp],xmm9
+	vmovdqu	xmm6,XMMWORD[48+rsp]
+	vmovdqu	xmm0,XMMWORD[((16-32))+rsi]
+	vpunpckhqdq	xmm2,xmm6,xmm6
+	vpclmulqdq	xmm5,xmm7,xmm3,0x00
+	vpxor	xmm2,xmm2,xmm6
+	vpclmulqdq	xmm7,xmm7,xmm3,0x11
+	vpclmulqdq	xmm1,xmm1,xmm15,0x00
+
+	vmovdqu	xmm9,XMMWORD[64+rsp]
+	vpclmulqdq	xmm4,xmm6,xmm0,0x00
+	vmovdqu	xmm3,XMMWORD[((48-32))+rsi]
+	vpxor	xmm4,xmm4,xmm5
+	vpunpckhqdq	xmm5,xmm9,xmm9
+	vpclmulqdq	xmm6,xmm6,xmm0,0x11
+	vpxor	xmm5,xmm5,xmm9
+	vpxor	xmm6,xmm6,xmm7
+	vpclmulqdq	xmm2,xmm2,xmm15,0x10
+	vmovdqu	xmm15,XMMWORD[((80-32))+rsi]
+	vpxor	xmm2,xmm2,xmm1
+
+	vmovdqu	xmm1,XMMWORD[80+rsp]
+	vpclmulqdq	xmm7,xmm9,xmm3,0x00
+	vmovdqu	xmm0,XMMWORD[((64-32))+rsi]
+	vpxor	xmm7,xmm7,xmm4
+	vpunpckhqdq	xmm4,xmm1,xmm1
+	vpclmulqdq	xmm9,xmm9,xmm3,0x11
+	vpxor	xmm4,xmm4,xmm1
+	vpxor	xmm9,xmm9,xmm6
+	vpclmulqdq	xmm5,xmm5,xmm15,0x00
+	vpxor	xmm5,xmm5,xmm2
+
+	vmovdqu	xmm2,XMMWORD[96+rsp]
+	vpclmulqdq	xmm6,xmm1,xmm0,0x00
+	vmovdqu	xmm3,XMMWORD[((96-32))+rsi]
+	vpxor	xmm6,xmm6,xmm7
+	vpunpckhqdq	xmm7,xmm2,xmm2
+	vpclmulqdq	xmm1,xmm1,xmm0,0x11
+	vpxor	xmm7,xmm7,xmm2
+	vpxor	xmm1,xmm1,xmm9
+	vpclmulqdq	xmm4,xmm4,xmm15,0x10
+	vmovdqu	xmm15,XMMWORD[((128-32))+rsi]
+	vpxor	xmm4,xmm4,xmm5
+
+	vpxor	xmm8,xmm8,XMMWORD[112+rsp]
+	vpclmulqdq	xmm5,xmm2,xmm3,0x00
+	vmovdqu	xmm0,XMMWORD[((112-32))+rsi]
+	vpunpckhqdq	xmm9,xmm8,xmm8
+	vpxor	xmm5,xmm5,xmm6
+	vpclmulqdq	xmm2,xmm2,xmm3,0x11
+	vpxor	xmm9,xmm9,xmm8
+	vpxor	xmm2,xmm2,xmm1
+	vpclmulqdq	xmm7,xmm7,xmm15,0x00
+	vpxor	xmm4,xmm7,xmm4
+
+	vpclmulqdq	xmm6,xmm8,xmm0,0x00
+	vmovdqu	xmm3,XMMWORD[((0-32))+rsi]
+	vpunpckhqdq	xmm1,xmm14,xmm14
+	vpclmulqdq	xmm8,xmm8,xmm0,0x11
+	vpxor	xmm1,xmm1,xmm14
+	vpxor	xmm5,xmm6,xmm5
+	vpclmulqdq	xmm9,xmm9,xmm15,0x10
+	vmovdqu	xmm15,XMMWORD[((32-32))+rsi]
+	vpxor	xmm7,xmm8,xmm2
+	vpxor	xmm6,xmm9,xmm4
+
+	vmovdqu	xmm0,XMMWORD[((16-32))+rsi]
+	vpxor	xmm9,xmm7,xmm5
+	vpclmulqdq	xmm4,xmm14,xmm3,0x00
+	vpxor	xmm6,xmm6,xmm9
+	vpunpckhqdq	xmm2,xmm13,xmm13
+	vpclmulqdq	xmm14,xmm14,xmm3,0x11
+	vpxor	xmm2,xmm2,xmm13
+	vpslldq	xmm9,xmm6,8
+	vpclmulqdq	xmm1,xmm1,xmm15,0x00
+	vpxor	xmm8,xmm5,xmm9
+	vpsrldq	xmm6,xmm6,8
+	vpxor	xmm7,xmm7,xmm6
+
+	vpclmulqdq	xmm5,xmm13,xmm0,0x00
+	vmovdqu	xmm3,XMMWORD[((48-32))+rsi]
+	vpxor	xmm5,xmm5,xmm4
+	vpunpckhqdq	xmm9,xmm12,xmm12
+	vpclmulqdq	xmm13,xmm13,xmm0,0x11
+	vpxor	xmm9,xmm9,xmm12
+	vpxor	xmm13,xmm13,xmm14
+	vpalignr	xmm14,xmm8,xmm8,8
+	vpclmulqdq	xmm2,xmm2,xmm15,0x10
+	vmovdqu	xmm15,XMMWORD[((80-32))+rsi]
+	vpxor	xmm2,xmm2,xmm1
+
+	vpclmulqdq	xmm4,xmm12,xmm3,0x00
+	vmovdqu	xmm0,XMMWORD[((64-32))+rsi]
+	vpxor	xmm4,xmm4,xmm5
+	vpunpckhqdq	xmm1,xmm11,xmm11
+	vpclmulqdq	xmm12,xmm12,xmm3,0x11
+	vpxor	xmm1,xmm1,xmm11
+	vpxor	xmm12,xmm12,xmm13
+	vxorps	xmm7,xmm7,XMMWORD[16+rsp]
+	vpclmulqdq	xmm9,xmm9,xmm15,0x00
+	vpxor	xmm9,xmm9,xmm2
+
+	vpclmulqdq	xmm8,xmm8,XMMWORD[16+r11],0x10
+	vxorps	xmm8,xmm8,xmm14
+
+	vpclmulqdq	xmm5,xmm11,xmm0,0x00
+	vmovdqu	xmm3,XMMWORD[((96-32))+rsi]
+	vpxor	xmm5,xmm5,xmm4
+	vpunpckhqdq	xmm2,xmm10,xmm10
+	vpclmulqdq	xmm11,xmm11,xmm0,0x11
+	vpxor	xmm2,xmm2,xmm10
+	vpalignr	xmm14,xmm8,xmm8,8
+	vpxor	xmm11,xmm11,xmm12
+	vpclmulqdq	xmm1,xmm1,xmm15,0x10
+	vmovdqu	xmm15,XMMWORD[((128-32))+rsi]
+	vpxor	xmm1,xmm1,xmm9
+
+	vxorps	xmm14,xmm14,xmm7
+	vpclmulqdq	xmm8,xmm8,XMMWORD[16+r11],0x10
+	vxorps	xmm8,xmm8,xmm14
+
+	vpclmulqdq	xmm4,xmm10,xmm3,0x00
+	vmovdqu	xmm0,XMMWORD[((112-32))+rsi]
+	vpxor	xmm4,xmm4,xmm5
+	vpunpckhqdq	xmm9,xmm8,xmm8
+	vpclmulqdq	xmm10,xmm10,xmm3,0x11
+	vpxor	xmm9,xmm9,xmm8
+	vpxor	xmm10,xmm10,xmm11
+	vpclmulqdq	xmm2,xmm2,xmm15,0x00
+	vpxor	xmm2,xmm2,xmm1
+
+	vpclmulqdq	xmm5,xmm8,xmm0,0x00
+	vpclmulqdq	xmm7,xmm8,xmm0,0x11
+	vpxor	xmm5,xmm5,xmm4
+	vpclmulqdq	xmm6,xmm9,xmm15,0x10
+	vpxor	xmm7,xmm7,xmm10
+	vpxor	xmm6,xmm6,xmm2
+
+	vpxor	xmm4,xmm7,xmm5
+	vpxor	xmm6,xmm6,xmm4
+	vpslldq	xmm1,xmm6,8
+	vmovdqu	xmm3,XMMWORD[16+r11]
+	vpsrldq	xmm6,xmm6,8
+	vpxor	xmm8,xmm5,xmm1
+	vpxor	xmm7,xmm7,xmm6
+
+	vpalignr	xmm2,xmm8,xmm8,8
+	vpclmulqdq	xmm8,xmm8,xmm3,0x10
+	vpxor	xmm8,xmm8,xmm2
+
+	vpalignr	xmm2,xmm8,xmm8,8
+	vpclmulqdq	xmm8,xmm8,xmm3,0x10
+	vpxor	xmm2,xmm2,xmm7
+	vpxor	xmm8,xmm8,xmm2
+	mov	r12,QWORD[64+rbp]
+	vpshufb	xmm8,xmm8,XMMWORD[r11]
+	vmovdqu	XMMWORD[r12],xmm8
+
+	vzeroupper
+	movaps	xmm6,XMMWORD[((-208))+rbp]
+	movaps	xmm7,XMMWORD[((-192))+rbp]
+	movaps	xmm8,XMMWORD[((-176))+rbp]
+	movaps	xmm9,XMMWORD[((-160))+rbp]
+	movaps	xmm10,XMMWORD[((-144))+rbp]
+	movaps	xmm11,XMMWORD[((-128))+rbp]
+	movaps	xmm12,XMMWORD[((-112))+rbp]
+	movaps	xmm13,XMMWORD[((-96))+rbp]
+	movaps	xmm14,XMMWORD[((-80))+rbp]
+	movaps	xmm15,XMMWORD[((-64))+rbp]
+	mov	rdi,QWORD[16+rbp]
+	mov	rsi,QWORD[24+rbp]
+	lea	rsp,[((-40))+rbp]
+
+	pop	r15
+
+	pop	r14
+
+	pop	r13
+
+	pop	r12
+
+	pop	rbx
+
+	pop	rbp
+
+$L$gcm_enc_abort:
+	ret
+$L$SEH_end_aesni_gcm_encrypt_22:
+
+
+section	.rdata rdata align=8
+ALIGN	64
+$L$bswap_mask:
+	DB	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+$L$poly:
+	DB	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+$L$one_msb:
+	DB	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+$L$two_lsb:
+	DB	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+$L$one_lsb:
+	DB	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+	DB	65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108
+	DB	101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82
+	DB	89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
+	DB	114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+ALIGN	64
+section	.text
+
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_aesni_gcm_decrypt_1 wrt ..imagebase
+	DD	$L$SEH_end_aesni_gcm_decrypt_22 wrt ..imagebase
+	DD	$L$SEH_info_aesni_gcm_decrypt_0 wrt ..imagebase
+
+	DD	$L$SEH_begin_aesni_gcm_encrypt_1 wrt ..imagebase
+	DD	$L$SEH_end_aesni_gcm_encrypt_22 wrt ..imagebase
+	DD	$L$SEH_info_aesni_gcm_encrypt_0 wrt ..imagebase
+
+
+section	.xdata rdata align=8
+ALIGN	4
+$L$SEH_info_aesni_gcm_decrypt_0:
+	DB	1
+	DB	$L$SEH_prolog_aesni_gcm_decrypt_21-$L$SEH_begin_aesni_gcm_decrypt_1
+	DB	33
+	DB	213
+	DB	$L$SEH_prolog_aesni_gcm_decrypt_21-$L$SEH_begin_aesni_gcm_decrypt_1
+	DB	248
+	DW	9
+	DB	$L$SEH_prolog_aesni_gcm_decrypt_20-$L$SEH_begin_aesni_gcm_decrypt_1
+	DB	232
+	DW	8
+	DB	$L$SEH_prolog_aesni_gcm_decrypt_19-$L$SEH_begin_aesni_gcm_decrypt_1
+	DB	216
+	DW	7
+	DB	$L$SEH_prolog_aesni_gcm_decrypt_18-$L$SEH_begin_aesni_gcm_decrypt_1
+	DB	200
+	DW	6
+	DB	$L$SEH_prolog_aesni_gcm_decrypt_17-$L$SEH_begin_aesni_gcm_decrypt_1
+	DB	184
+	DW	5
+	DB	$L$SEH_prolog_aesni_gcm_decrypt_16-$L$SEH_begin_aesni_gcm_decrypt_1
+	DB	168
+	DW	4
+	DB	$L$SEH_prolog_aesni_gcm_decrypt_15-$L$SEH_begin_aesni_gcm_decrypt_1
+	DB	152
+	DW	3
+	DB	$L$SEH_prolog_aesni_gcm_decrypt_14-$L$SEH_begin_aesni_gcm_decrypt_1
+	DB	136
+	DW	2
+	DB	$L$SEH_prolog_aesni_gcm_decrypt_13-$L$SEH_begin_aesni_gcm_decrypt_1
+	DB	120
+	DW	1
+	DB	$L$SEH_prolog_aesni_gcm_decrypt_12-$L$SEH_begin_aesni_gcm_decrypt_1
+	DB	104
+	DW	0
+	DB	$L$SEH_prolog_aesni_gcm_decrypt_11-$L$SEH_begin_aesni_gcm_decrypt_1
+	DB	100
+	DW	29
+	DB	$L$SEH_prolog_aesni_gcm_decrypt_10-$L$SEH_begin_aesni_gcm_decrypt_1
+	DB	116
+	DW	28
+	DB	$L$SEH_prolog_aesni_gcm_decrypt_9-$L$SEH_begin_aesni_gcm_decrypt_1
+	DB	3
+	DB	$L$SEH_prolog_aesni_gcm_decrypt_8-$L$SEH_begin_aesni_gcm_decrypt_1
+	DB	1
+	DW	21
+	DB	$L$SEH_prolog_aesni_gcm_decrypt_7-$L$SEH_begin_aesni_gcm_decrypt_1
+	DB	240
+	DB	$L$SEH_prolog_aesni_gcm_decrypt_6-$L$SEH_begin_aesni_gcm_decrypt_1
+	DB	224
+	DB	$L$SEH_prolog_aesni_gcm_decrypt_5-$L$SEH_begin_aesni_gcm_decrypt_1
+	DB	208
+	DB	$L$SEH_prolog_aesni_gcm_decrypt_4-$L$SEH_begin_aesni_gcm_decrypt_1
+	DB	192
+	DB	$L$SEH_prolog_aesni_gcm_decrypt_3-$L$SEH_begin_aesni_gcm_decrypt_1
+	DB	48
+	DB	$L$SEH_prolog_aesni_gcm_decrypt_2-$L$SEH_begin_aesni_gcm_decrypt_1
+	DB	80
+
+$L$SEH_info_aesni_gcm_encrypt_0:
+	DB	1
+	DB	$L$SEH_prolog_aesni_gcm_encrypt_21-$L$SEH_begin_aesni_gcm_encrypt_1
+	DB	33
+	DB	213
+	DB	$L$SEH_prolog_aesni_gcm_encrypt_21-$L$SEH_begin_aesni_gcm_encrypt_1
+	DB	248
+	DW	9
+	DB	$L$SEH_prolog_aesni_gcm_encrypt_20-$L$SEH_begin_aesni_gcm_encrypt_1
+	DB	232
+	DW	8
+	DB	$L$SEH_prolog_aesni_gcm_encrypt_19-$L$SEH_begin_aesni_gcm_encrypt_1
+	DB	216
+	DW	7
+	DB	$L$SEH_prolog_aesni_gcm_encrypt_18-$L$SEH_begin_aesni_gcm_encrypt_1
+	DB	200
+	DW	6
+	DB	$L$SEH_prolog_aesni_gcm_encrypt_17-$L$SEH_begin_aesni_gcm_encrypt_1
+	DB	184
+	DW	5
+	DB	$L$SEH_prolog_aesni_gcm_encrypt_16-$L$SEH_begin_aesni_gcm_encrypt_1
+	DB	168
+	DW	4
+	DB	$L$SEH_prolog_aesni_gcm_encrypt_15-$L$SEH_begin_aesni_gcm_encrypt_1
+	DB	152
+	DW	3
+	DB	$L$SEH_prolog_aesni_gcm_encrypt_14-$L$SEH_begin_aesni_gcm_encrypt_1
+	DB	136
+	DW	2
+	DB	$L$SEH_prolog_aesni_gcm_encrypt_13-$L$SEH_begin_aesni_gcm_encrypt_1
+	DB	120
+	DW	1
+	DB	$L$SEH_prolog_aesni_gcm_encrypt_12-$L$SEH_begin_aesni_gcm_encrypt_1
+	DB	104
+	DW	0
+	DB	$L$SEH_prolog_aesni_gcm_encrypt_11-$L$SEH_begin_aesni_gcm_encrypt_1
+	DB	100
+	DW	29
+	DB	$L$SEH_prolog_aesni_gcm_encrypt_10-$L$SEH_begin_aesni_gcm_encrypt_1
+	DB	116
+	DW	28
+	DB	$L$SEH_prolog_aesni_gcm_encrypt_9-$L$SEH_begin_aesni_gcm_encrypt_1
+	DB	3
+	DB	$L$SEH_prolog_aesni_gcm_encrypt_8-$L$SEH_begin_aesni_gcm_encrypt_1
+	DB	1
+	DW	21
+	DB	$L$SEH_prolog_aesni_gcm_encrypt_7-$L$SEH_begin_aesni_gcm_encrypt_1
+	DB	240
+	DB	$L$SEH_prolog_aesni_gcm_encrypt_6-$L$SEH_begin_aesni_gcm_encrypt_1
+	DB	224
+	DB	$L$SEH_prolog_aesni_gcm_encrypt_5-$L$SEH_begin_aesni_gcm_encrypt_1
+	DB	208
+	DB	$L$SEH_prolog_aesni_gcm_encrypt_4-$L$SEH_begin_aesni_gcm_encrypt_1
+	DB	192
+	DB	$L$SEH_prolog_aesni_gcm_encrypt_3-$L$SEH_begin_aesni_gcm_encrypt_1
+	DB	48
+	DB	$L$SEH_prolog_aesni_gcm_encrypt_2-$L$SEH_begin_aesni_gcm_encrypt_1
+	DB	80
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/aesni-x86-apple.S b/gen/bcm/aesni-x86-apple.S
new file mode 100644
index 0000000..4467604
--- /dev/null
+++ b/gen/bcm/aesni-x86-apple.S
@@ -0,0 +1,2475 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
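+// 32-bit x86, Mach-O flavor of the AES-NI primitives: single-block
+// encrypt/decrypt, ECB, CCM64, CTR32 and XTS entry points, plus internal
+// helpers that pipeline 2, 3, 4 or 6 blocks through the AES rounds.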
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
+.text
+#ifdef BORINGSSL_DISPATCH_TEST
+#endif
+.globl	_aes_hw_encrypt
+.private_extern	_aes_hw_encrypt
+.align	4
+_aes_hw_encrypt:
+L_aes_hw_encrypt_begin:
+#ifdef BORINGSSL_DISPATCH_TEST
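+	// The call/pop pair recovers EIP so BORINGSSL_function_hit can be
+	// addressed position-independently in 32-bit code.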
+	pushl	%ebx
+	pushl	%edx
+	call	L000pic
+L000pic:
+	popl	%ebx
+	leal	_BORINGSSL_function_hit+1-L000pic(%ebx),%ebx
+	movl	$1,%edx
+	movb	%dl,(%ebx)
+	popl	%edx
+	popl	%ebx
+#endif
+	movl	4(%esp),%eax
+	movl	12(%esp),%edx
+	movups	(%eax),%xmm2
+	movl	240(%edx),%ecx
+	movl	8(%esp),%eax
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+L001enc1_loop_1:
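+	// 102,15,56,220,209 encodes aesenc %xmm1,%xmm2; AES-NI and SSSE3
+	// instructions appear as raw bytes throughout this file, apparently so
+	// it assembles with toolchains that predate those mnemonics.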
+.byte	102,15,56,220,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	L001enc1_loop_1
+.byte	102,15,56,221,209
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	movups	%xmm2,(%eax)
+	pxor	%xmm2,%xmm2
+	ret
+.globl	_aes_hw_decrypt
+.private_extern	_aes_hw_decrypt
+.align	4
+_aes_hw_decrypt:
+L_aes_hw_decrypt_begin:
+	movl	4(%esp),%eax
+	movl	12(%esp),%edx
+	movups	(%eax),%xmm2
+	movl	240(%edx),%ecx
+	movl	8(%esp),%eax
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+L002dec1_loop_2:
+.byte	102,15,56,222,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	L002dec1_loop_2
+.byte	102,15,56,223,209
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	movups	%xmm2,(%eax)
+	pxor	%xmm2,%xmm2
+	ret
+.private_extern	__aesni_encrypt2
+.align	4
+__aesni_encrypt2:
+	movups	(%edx),%xmm0
+	shll	$4,%ecx
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+	addl	$16,%ecx
+L003enc2_loop:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	L003enc2_loop
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+	ret
+.private_extern	__aesni_decrypt2
+.align	4
+__aesni_decrypt2:
+	movups	(%edx),%xmm0
+	shll	$4,%ecx
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+	addl	$16,%ecx
+L004dec2_loop:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	L004dec2_loop
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+	ret
+.private_extern	__aesni_encrypt3
+.align	4
+__aesni_encrypt3:
+	movups	(%edx),%xmm0
+	shll	$4,%ecx
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+	addl	$16,%ecx
+L005enc3_loop:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	L005enc3_loop
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+.byte	102,15,56,221,224
+	ret
+.private_extern	__aesni_decrypt3
+.align	4
+__aesni_decrypt3:
+	movups	(%edx),%xmm0
+	shll	$4,%ecx
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+	addl	$16,%ecx
+L006dec3_loop:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	L006dec3_loop
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+.byte	102,15,56,223,224
+	ret
+.private_extern	__aesni_encrypt4
+.align	4
+__aesni_encrypt4:
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	shll	$4,%ecx
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+	pxor	%xmm0,%xmm5
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
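+	// 15,31,64,0 is a four-byte NOP (nopl 0(%eax)), presumably padding to
+	// align the loop entry that follows.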
+.byte	15,31,64,0
+	addl	$16,%ecx
+L007enc4_loop:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	L007enc4_loop
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+.byte	102,15,56,221,224
+.byte	102,15,56,221,232
+	ret
+.private_extern	__aesni_decrypt4
+.align	4
+__aesni_decrypt4:
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	shll	$4,%ecx
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+	pxor	%xmm0,%xmm5
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+.byte	15,31,64,0
+	addl	$16,%ecx
+L008dec4_loop:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	L008dec4_loop
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+.byte	102,15,56,223,224
+.byte	102,15,56,223,232
+	ret
+.private_extern	__aesni_encrypt6
+.align	4
+__aesni_encrypt6:
+	movups	(%edx),%xmm0
+	shll	$4,%ecx
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+.byte	102,15,56,220,209
+	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
+.byte	102,15,56,220,217
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+.byte	102,15,56,220,225
+	pxor	%xmm0,%xmm7
+	movups	(%edx,%ecx,1),%xmm0
+	addl	$16,%ecx
+	jmp	L009_aesni_encrypt6_inner
+.align	4,0x90
+L010enc6_loop:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+L009_aesni_encrypt6_inner:
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+L_aesni_encrypt6_enter:
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	L010enc6_loop
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+.byte	102,15,56,221,224
+.byte	102,15,56,221,232
+.byte	102,15,56,221,240
+.byte	102,15,56,221,248
+	ret
+.private_extern	__aesni_decrypt6
+.align	4
+__aesni_decrypt6:
+	movups	(%edx),%xmm0
+	shll	$4,%ecx
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+.byte	102,15,56,222,209
+	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
+.byte	102,15,56,222,217
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+.byte	102,15,56,222,225
+	pxor	%xmm0,%xmm7
+	movups	(%edx,%ecx,1),%xmm0
+	addl	$16,%ecx
+	jmp	L011_aesni_decrypt6_inner
+.align	4,0x90
+L012dec6_loop:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+L011_aesni_decrypt6_inner:
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+L_aesni_decrypt6_enter:
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	L012dec6_loop
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+.byte	102,15,56,223,224
+.byte	102,15,56,223,232
+.byte	102,15,56,223,240
+.byte	102,15,56,223,248
+	ret
+.globl	_aes_hw_ecb_encrypt
+.private_extern	_aes_hw_ecb_encrypt
+.align	4
+_aes_hw_ecb_encrypt:
+L_aes_hw_ecb_encrypt_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	32(%esp),%edx
+	movl	36(%esp),%ebx
+	andl	$-16,%eax
+	jz	L013ecb_ret
+	movl	240(%edx),%ecx
+	testl	%ebx,%ebx
+	jz	L014ecb_decrypt
+	movl	%edx,%ebp
+	movl	%ecx,%ebx
+	cmpl	$96,%eax
+	jb	L015ecb_enc_tail
+	movdqu	(%esi),%xmm2
+	movdqu	16(%esi),%xmm3
+	movdqu	32(%esi),%xmm4
+	movdqu	48(%esi),%xmm5
+	movdqu	64(%esi),%xmm6
+	movdqu	80(%esi),%xmm7
+	leal	96(%esi),%esi
+	subl	$96,%eax
+	jmp	L016ecb_enc_loop6_enter
+.align	4,0x90
+L017ecb_enc_loop6:
+	movups	%xmm2,(%edi)
+	movdqu	(%esi),%xmm2
+	movups	%xmm3,16(%edi)
+	movdqu	16(%esi),%xmm3
+	movups	%xmm4,32(%edi)
+	movdqu	32(%esi),%xmm4
+	movups	%xmm5,48(%edi)
+	movdqu	48(%esi),%xmm5
+	movups	%xmm6,64(%edi)
+	movdqu	64(%esi),%xmm6
+	movups	%xmm7,80(%edi)
+	leal	96(%edi),%edi
+	movdqu	80(%esi),%xmm7
+	leal	96(%esi),%esi
+L016ecb_enc_loop6_enter:
+	call	__aesni_encrypt6
+	movl	%ebp,%edx
+	movl	%ebx,%ecx
+	subl	$96,%eax
+	jnc	L017ecb_enc_loop6
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	movups	%xmm6,64(%edi)
+	movups	%xmm7,80(%edi)
+	leal	96(%edi),%edi
+	addl	$96,%eax
+	jz	L013ecb_ret
+L015ecb_enc_tail:
+	movups	(%esi),%xmm2
+	cmpl	$32,%eax
+	jb	L018ecb_enc_one
+	movups	16(%esi),%xmm3
+	je	L019ecb_enc_two
+	movups	32(%esi),%xmm4
+	cmpl	$64,%eax
+	jb	L020ecb_enc_three
+	movups	48(%esi),%xmm5
+	je	L021ecb_enc_four
+	movups	64(%esi),%xmm6
+	xorps	%xmm7,%xmm7
+	call	__aesni_encrypt6
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	movups	%xmm6,64(%edi)
+	jmp	L013ecb_ret
+.align	4,0x90
+L018ecb_enc_one:
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+L022enc1_loop_3:
+.byte	102,15,56,220,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	L022enc1_loop_3
+.byte	102,15,56,221,209
+	movups	%xmm2,(%edi)
+	jmp	L013ecb_ret
+.align	4,0x90
+L019ecb_enc_two:
+	call	__aesni_encrypt2
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	jmp	L013ecb_ret
+.align	4,0x90
+L020ecb_enc_three:
+	call	__aesni_encrypt3
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	jmp	L013ecb_ret
+.align	4,0x90
+L021ecb_enc_four:
+	call	__aesni_encrypt4
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	jmp	L013ecb_ret
+.align	4,0x90
+L014ecb_decrypt:
+	movl	%edx,%ebp
+	movl	%ecx,%ebx
+	cmpl	$96,%eax
+	jb	L023ecb_dec_tail
+	movdqu	(%esi),%xmm2
+	movdqu	16(%esi),%xmm3
+	movdqu	32(%esi),%xmm4
+	movdqu	48(%esi),%xmm5
+	movdqu	64(%esi),%xmm6
+	movdqu	80(%esi),%xmm7
+	leal	96(%esi),%esi
+	subl	$96,%eax
+	jmp	L024ecb_dec_loop6_enter
+.align	4,0x90
+L025ecb_dec_loop6:
+	movups	%xmm2,(%edi)
+	movdqu	(%esi),%xmm2
+	movups	%xmm3,16(%edi)
+	movdqu	16(%esi),%xmm3
+	movups	%xmm4,32(%edi)
+	movdqu	32(%esi),%xmm4
+	movups	%xmm5,48(%edi)
+	movdqu	48(%esi),%xmm5
+	movups	%xmm6,64(%edi)
+	movdqu	64(%esi),%xmm6
+	movups	%xmm7,80(%edi)
+	leal	96(%edi),%edi
+	movdqu	80(%esi),%xmm7
+	leal	96(%esi),%esi
+L024ecb_dec_loop6_enter:
+	call	__aesni_decrypt6
+	movl	%ebp,%edx
+	movl	%ebx,%ecx
+	subl	$96,%eax
+	jnc	L025ecb_dec_loop6
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	movups	%xmm6,64(%edi)
+	movups	%xmm7,80(%edi)
+	leal	96(%edi),%edi
+	addl	$96,%eax
+	jz	L013ecb_ret
+L023ecb_dec_tail:
+	movups	(%esi),%xmm2
+	cmpl	$32,%eax
+	jb	L026ecb_dec_one
+	movups	16(%esi),%xmm3
+	je	L027ecb_dec_two
+	movups	32(%esi),%xmm4
+	cmpl	$64,%eax
+	jb	L028ecb_dec_three
+	movups	48(%esi),%xmm5
+	je	L029ecb_dec_four
+	movups	64(%esi),%xmm6
+	xorps	%xmm7,%xmm7
+	call	__aesni_decrypt6
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	movups	%xmm6,64(%edi)
+	jmp	L013ecb_ret
+.align	4,0x90
+L026ecb_dec_one:
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+L030dec1_loop_4:
+.byte	102,15,56,222,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	L030dec1_loop_4
+.byte	102,15,56,223,209
+	movups	%xmm2,(%edi)
+	jmp	L013ecb_ret
+.align	4,0x90
+L027ecb_dec_two:
+	call	__aesni_decrypt2
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	jmp	L013ecb_ret
+.align	4,0x90
+L028ecb_dec_three:
+	call	__aesni_decrypt3
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	jmp	L013ecb_ret
+.align	4,0x90
+L029ecb_dec_four:
+	call	__aesni_decrypt4
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+L013ecb_ret:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.globl	_aes_hw_ccm64_encrypt_blocks
+.private_extern	_aes_hw_ccm64_encrypt_blocks
+.align	4
+_aes_hw_ccm64_encrypt_blocks:
+L_aes_hw_ccm64_encrypt_blocks_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	32(%esp),%edx
+	movl	36(%esp),%ebx
+	movl	40(%esp),%ecx
+	movl	%esp,%ebp
+	subl	$60,%esp
+	andl	$-16,%esp
+	movl	%ebp,48(%esp)
+	movdqu	(%ebx),%xmm7
+	movdqu	(%ecx),%xmm3
+	movl	240(%edx),%ecx
+	movl	$202182159,(%esp)
+	movl	$134810123,4(%esp)
+	movl	$67438087,8(%esp)
+	movl	$66051,12(%esp)
+	movl	$1,%ebx
+	xorl	%ebp,%ebp
+	movl	%ebx,16(%esp)
+	movl	%ebp,20(%esp)
+	movl	%ebp,24(%esp)
+	movl	%ebp,28(%esp)
+	shll	$4,%ecx
+	movl	$16,%ebx
+	leal	(%edx),%ebp
+	movdqa	(%esp),%xmm5
+	movdqa	%xmm7,%xmm2
+	leal	32(%edx,%ecx,1),%edx
+	subl	%ecx,%ebx
+.byte	102,15,56,0,253
+L031ccm64_enc_outer:
+	movups	(%ebp),%xmm0
+	movl	%ebx,%ecx
+	movups	(%esi),%xmm6
+	xorps	%xmm0,%xmm2
+	movups	16(%ebp),%xmm1
+	xorps	%xmm6,%xmm0
+	xorps	%xmm0,%xmm3
+	movups	32(%ebp),%xmm0
+L032ccm64_enc2_loop:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	L032ccm64_enc2_loop
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	paddq	16(%esp),%xmm7
+	decl	%eax
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+	leal	16(%esi),%esi
+	xorps	%xmm2,%xmm6
+	movdqa	%xmm7,%xmm2
+	movups	%xmm6,(%edi)
+.byte	102,15,56,0,213
+	leal	16(%edi),%edi
+	jnz	L031ccm64_enc_outer
+	movl	48(%esp),%esp
+	movl	40(%esp),%edi
+	movups	%xmm3,(%edi)
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.globl	_aes_hw_ccm64_decrypt_blocks
+.private_extern	_aes_hw_ccm64_decrypt_blocks
+.align	4
+_aes_hw_ccm64_decrypt_blocks:
+L_aes_hw_ccm64_decrypt_blocks_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	32(%esp),%edx
+	movl	36(%esp),%ebx
+	movl	40(%esp),%ecx
+	movl	%esp,%ebp
+	subl	$60,%esp
+	andl	$-16,%esp
+	movl	%ebp,48(%esp)
+	movdqu	(%ebx),%xmm7
+	movdqu	(%ecx),%xmm3
+	movl	240(%edx),%ecx
+	movl	$202182159,(%esp)
+	movl	$134810123,4(%esp)
+	movl	$67438087,8(%esp)
+	movl	$66051,12(%esp)
+	movl	$1,%ebx
+	xorl	%ebp,%ebp
+	movl	%ebx,16(%esp)
+	movl	%ebp,20(%esp)
+	movl	%ebp,24(%esp)
+	movl	%ebp,28(%esp)
+	movdqa	(%esp),%xmm5
+	movdqa	%xmm7,%xmm2
+	movl	%edx,%ebp
+	movl	%ecx,%ebx
+.byte	102,15,56,0,253
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+L033enc1_loop_5:
+.byte	102,15,56,220,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	L033enc1_loop_5
+.byte	102,15,56,221,209
+	shll	$4,%ebx
+	movl	$16,%ecx
+	movups	(%esi),%xmm6
+	paddq	16(%esp),%xmm7
+	leal	16(%esi),%esi
+	subl	%ebx,%ecx
+	leal	32(%ebp,%ebx,1),%edx
+	movl	%ecx,%ebx
+	jmp	L034ccm64_dec_outer
+.align	4,0x90
+L034ccm64_dec_outer:
+	xorps	%xmm2,%xmm6
+	movdqa	%xmm7,%xmm2
+	movups	%xmm6,(%edi)
+	leal	16(%edi),%edi
+.byte	102,15,56,0,213
+	subl	$1,%eax
+	jz	L035ccm64_dec_break
+	movups	(%ebp),%xmm0
+	movl	%ebx,%ecx
+	movups	16(%ebp),%xmm1
+	xorps	%xmm0,%xmm6
+	xorps	%xmm0,%xmm2
+	xorps	%xmm6,%xmm3
+	movups	32(%ebp),%xmm0
+L036ccm64_dec2_loop:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	L036ccm64_dec2_loop
+	movups	(%esi),%xmm6
+	paddq	16(%esp),%xmm7
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+	leal	16(%esi),%esi
+	jmp	L034ccm64_dec_outer
+.align	4,0x90
+L035ccm64_dec_break:
+	movl	240(%ebp),%ecx
+	movl	%ebp,%edx
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm6
+	leal	32(%edx),%edx
+	xorps	%xmm6,%xmm3
+L037enc1_loop_6:
+.byte	102,15,56,220,217
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	L037enc1_loop_6
+.byte	102,15,56,221,217
+	movl	48(%esp),%esp
+	movl	40(%esp),%edi
+	movups	%xmm3,(%edi)
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.globl	_aes_hw_ctr32_encrypt_blocks
+.private_extern	_aes_hw_ctr32_encrypt_blocks
+.align	4
+_aes_hw_ctr32_encrypt_blocks:
+L_aes_hw_ctr32_encrypt_blocks_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+#ifdef BORINGSSL_DISPATCH_TEST
+	pushl	%ebx
+	pushl	%edx
+	call	L038pic
+L038pic:
+	popl	%ebx
+	leal	_BORINGSSL_function_hit+0-L038pic(%ebx),%ebx
+	movl	$1,%edx
+	movb	%dl,(%ebx)
+	popl	%edx
+	popl	%ebx
+#endif
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	32(%esp),%edx
+	movl	36(%esp),%ebx
+	movl	%esp,%ebp
+	subl	$88,%esp
+	andl	$-16,%esp
+	movl	%ebp,80(%esp)
+	cmpl	$1,%eax
+	je	L039ctr32_one_shortcut
+	movdqu	(%ebx),%xmm7
+	movl	$202182159,(%esp)
+	movl	$134810123,4(%esp)
+	movl	$67438087,8(%esp)
+	movl	$66051,12(%esp)
+	movl	$6,%ecx
+	xorl	%ebp,%ebp
+	movl	%ecx,16(%esp)
+	movl	%ecx,20(%esp)
+	movl	%ecx,24(%esp)
+	movl	%ebp,28(%esp)
+.byte	102,15,58,22,251,3
+.byte	102,15,58,34,253,3
+	movl	240(%edx),%ecx
+	bswap	%ebx
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	movdqa	(%esp),%xmm2
+.byte	102,15,58,34,195,0
+	leal	3(%ebx),%ebp
+.byte	102,15,58,34,205,0
+	incl	%ebx
+.byte	102,15,58,34,195,1
+	incl	%ebp
+.byte	102,15,58,34,205,1
+	incl	%ebx
+.byte	102,15,58,34,195,2
+	incl	%ebp
+.byte	102,15,58,34,205,2
+	movdqa	%xmm0,48(%esp)
+.byte	102,15,56,0,194
+	movdqu	(%edx),%xmm6
+	movdqa	%xmm1,64(%esp)
+.byte	102,15,56,0,202
+	pshufd	$192,%xmm0,%xmm2
+	pshufd	$128,%xmm0,%xmm3
+	cmpl	$6,%eax
+	jb	L040ctr32_tail
+	pxor	%xmm6,%xmm7
+	shll	$4,%ecx
+	movl	$16,%ebx
+	movdqa	%xmm7,32(%esp)
+	movl	%edx,%ebp
+	subl	%ecx,%ebx
+	leal	32(%edx,%ecx,1),%edx
+	subl	$6,%eax
+	jmp	L041ctr32_loop6
+.align	4,0x90
+L041ctr32_loop6:
+	pshufd	$64,%xmm0,%xmm4
+	movdqa	32(%esp),%xmm0
+	pshufd	$192,%xmm1,%xmm5
+	pxor	%xmm0,%xmm2
+	pshufd	$128,%xmm1,%xmm6
+	pxor	%xmm0,%xmm3
+	pshufd	$64,%xmm1,%xmm7
+	movups	16(%ebp),%xmm1
+	pxor	%xmm0,%xmm4
+	pxor	%xmm0,%xmm5
+.byte	102,15,56,220,209
+	pxor	%xmm0,%xmm6
+	pxor	%xmm0,%xmm7
+.byte	102,15,56,220,217
+	movups	32(%ebp),%xmm0
+	movl	%ebx,%ecx
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+	call	L_aesni_encrypt6_enter
+	movups	(%esi),%xmm1
+	movups	16(%esi),%xmm0
+	xorps	%xmm1,%xmm2
+	movups	32(%esi),%xmm1
+	xorps	%xmm0,%xmm3
+	movups	%xmm2,(%edi)
+	movdqa	16(%esp),%xmm0
+	xorps	%xmm1,%xmm4
+	movdqa	64(%esp),%xmm1
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	paddd	%xmm0,%xmm1
+	paddd	48(%esp),%xmm0
+	movdqa	(%esp),%xmm2
+	movups	48(%esi),%xmm3
+	movups	64(%esi),%xmm4
+	xorps	%xmm3,%xmm5
+	movups	80(%esi),%xmm3
+	leal	96(%esi),%esi
+	movdqa	%xmm0,48(%esp)
+.byte	102,15,56,0,194
+	xorps	%xmm4,%xmm6
+	movups	%xmm5,48(%edi)
+	xorps	%xmm3,%xmm7
+	movdqa	%xmm1,64(%esp)
+.byte	102,15,56,0,202
+	movups	%xmm6,64(%edi)
+	pshufd	$192,%xmm0,%xmm2
+	movups	%xmm7,80(%edi)
+	leal	96(%edi),%edi
+	pshufd	$128,%xmm0,%xmm3
+	subl	$6,%eax
+	jnc	L041ctr32_loop6
+	addl	$6,%eax
+	jz	L042ctr32_ret
+	movdqu	(%ebp),%xmm7
+	movl	%ebp,%edx
+	pxor	32(%esp),%xmm7
+	movl	240(%ebp),%ecx
+L040ctr32_tail:
+	por	%xmm7,%xmm2
+	cmpl	$2,%eax
+	jb	L043ctr32_one
+	pshufd	$64,%xmm0,%xmm4
+	por	%xmm7,%xmm3
+	je	L044ctr32_two
+	pshufd	$192,%xmm1,%xmm5
+	por	%xmm7,%xmm4
+	cmpl	$4,%eax
+	jb	L045ctr32_three
+	pshufd	$128,%xmm1,%xmm6
+	por	%xmm7,%xmm5
+	je	L046ctr32_four
+	por	%xmm7,%xmm6
+	call	__aesni_encrypt6
+	movups	(%esi),%xmm1
+	movups	16(%esi),%xmm0
+	xorps	%xmm1,%xmm2
+	movups	32(%esi),%xmm1
+	xorps	%xmm0,%xmm3
+	movups	48(%esi),%xmm0
+	xorps	%xmm1,%xmm4
+	movups	64(%esi),%xmm1
+	xorps	%xmm0,%xmm5
+	movups	%xmm2,(%edi)
+	xorps	%xmm1,%xmm6
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	movups	%xmm6,64(%edi)
+	jmp	L042ctr32_ret
+.align	4,0x90
+L039ctr32_one_shortcut:
+	movups	(%ebx),%xmm2
+	movl	240(%edx),%ecx
+L043ctr32_one:
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+L047enc1_loop_7:
+.byte	102,15,56,220,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	L047enc1_loop_7
+.byte	102,15,56,221,209
+	movups	(%esi),%xmm6
+	xorps	%xmm2,%xmm6
+	movups	%xmm6,(%edi)
+	jmp	L042ctr32_ret
+.align	4,0x90
+L044ctr32_two:
+	call	__aesni_encrypt2
+	movups	(%esi),%xmm5
+	movups	16(%esi),%xmm6
+	xorps	%xmm5,%xmm2
+	xorps	%xmm6,%xmm3
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	jmp	L042ctr32_ret
+.align	4,0x90
+L045ctr32_three:
+	call	__aesni_encrypt3
+	movups	(%esi),%xmm5
+	movups	16(%esi),%xmm6
+	xorps	%xmm5,%xmm2
+	movups	32(%esi),%xmm7
+	xorps	%xmm6,%xmm3
+	movups	%xmm2,(%edi)
+	xorps	%xmm7,%xmm4
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	jmp	L042ctr32_ret
+.align	4,0x90
+L046ctr32_four:
+	call	__aesni_encrypt4
+	movups	(%esi),%xmm6
+	movups	16(%esi),%xmm7
+	movups	32(%esi),%xmm1
+	xorps	%xmm6,%xmm2
+	movups	48(%esi),%xmm0
+	xorps	%xmm7,%xmm3
+	movups	%xmm2,(%edi)
+	xorps	%xmm1,%xmm4
+	movups	%xmm3,16(%edi)
+	xorps	%xmm0,%xmm5
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+L042ctr32_ret:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	movdqa	%xmm0,32(%esp)
+	pxor	%xmm5,%xmm5
+	movdqa	%xmm0,48(%esp)
+	pxor	%xmm6,%xmm6
+	movdqa	%xmm0,64(%esp)
+	pxor	%xmm7,%xmm7
+	movl	80(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.globl	_aes_hw_xts_encrypt
+.private_extern	_aes_hw_xts_encrypt
+.align	4
+_aes_hw_xts_encrypt:
+L_aes_hw_xts_encrypt_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	36(%esp),%edx
+	movl	40(%esp),%esi
+	movl	240(%edx),%ecx
+	movups	(%esi),%xmm2
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+L048enc1_loop_8:
+.byte	102,15,56,220,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	L048enc1_loop_8
+.byte	102,15,56,221,209
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	32(%esp),%edx
+	movl	%esp,%ebp
+	subl	$120,%esp
+	movl	240(%edx),%ecx
+	andl	$-16,%esp
+	movl	$135,96(%esp)
+	movl	$0,100(%esp)
+	movl	$1,104(%esp)
+	movl	$0,108(%esp)
+	movl	%eax,112(%esp)
+	movl	%ebp,116(%esp)
+	movdqa	%xmm2,%xmm1
+	pxor	%xmm0,%xmm0
+	movdqa	96(%esp),%xmm3
+	pcmpgtd	%xmm1,%xmm0
+	andl	$-16,%eax
+	movl	%edx,%ebp
+	movl	%ecx,%ebx
+	subl	$96,%eax
+	jc	L049xts_enc_short
+	shll	$4,%ecx
+	movl	$16,%ebx
+	subl	%ecx,%ebx
+	leal	32(%edx,%ecx,1),%edx
+	jmp	L050xts_enc_loop6
+.align	4,0x90
+L050xts_enc_loop6:
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,16(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,32(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,48(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	pshufd	$19,%xmm0,%xmm7
+	movdqa	%xmm1,64(%esp)
+	paddq	%xmm1,%xmm1
+	movups	(%ebp),%xmm0
+	pand	%xmm3,%xmm7
+	movups	(%esi),%xmm2
+	pxor	%xmm1,%xmm7
+	movl	%ebx,%ecx
+	movdqu	16(%esi),%xmm3
+	xorps	%xmm0,%xmm2
+	movdqu	32(%esi),%xmm4
+	pxor	%xmm0,%xmm3
+	movdqu	48(%esi),%xmm5
+	pxor	%xmm0,%xmm4
+	movdqu	64(%esi),%xmm6
+	pxor	%xmm0,%xmm5
+	movdqu	80(%esi),%xmm1
+	pxor	%xmm0,%xmm6
+	leal	96(%esi),%esi
+	pxor	(%esp),%xmm2
+	movdqa	%xmm7,80(%esp)
+	pxor	%xmm1,%xmm7
+	movups	16(%ebp),%xmm1
+	pxor	16(%esp),%xmm3
+	pxor	32(%esp),%xmm4
+.byte	102,15,56,220,209
+	pxor	48(%esp),%xmm5
+	pxor	64(%esp),%xmm6
+.byte	102,15,56,220,217
+	pxor	%xmm0,%xmm7
+	movups	32(%ebp),%xmm0
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+	call	L_aesni_encrypt6_enter
+	movdqa	80(%esp),%xmm1
+	pxor	%xmm0,%xmm0
+	xorps	(%esp),%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	xorps	16(%esp),%xmm3
+	movups	%xmm2,(%edi)
+	xorps	32(%esp),%xmm4
+	movups	%xmm3,16(%edi)
+	xorps	48(%esp),%xmm5
+	movups	%xmm4,32(%edi)
+	xorps	64(%esp),%xmm6
+	movups	%xmm5,48(%edi)
+	xorps	%xmm1,%xmm7
+	movups	%xmm6,64(%edi)
+	pshufd	$19,%xmm0,%xmm2
+	movups	%xmm7,80(%edi)
+	leal	96(%edi),%edi
+	movdqa	96(%esp),%xmm3
+	pxor	%xmm0,%xmm0
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	subl	$96,%eax
+	jnc	L050xts_enc_loop6
+	movl	240(%ebp),%ecx
+	movl	%ebp,%edx
+	movl	%ecx,%ebx
+L049xts_enc_short:
+	addl	$96,%eax
+	jz	L051xts_enc_done6x
+	movdqa	%xmm1,%xmm5
+	cmpl	$32,%eax
+	jb	L052xts_enc_one
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	je	L053xts_enc_two
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,%xmm6
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	cmpl	$64,%eax
+	jb	L054xts_enc_three
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,%xmm7
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm5,(%esp)
+	movdqa	%xmm6,16(%esp)
+	je	L055xts_enc_four
+	movdqa	%xmm7,32(%esp)
+	pshufd	$19,%xmm0,%xmm7
+	movdqa	%xmm1,48(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm7
+	pxor	%xmm1,%xmm7
+	movdqu	(%esi),%xmm2
+	movdqu	16(%esi),%xmm3
+	movdqu	32(%esi),%xmm4
+	pxor	(%esp),%xmm2
+	movdqu	48(%esi),%xmm5
+	pxor	16(%esp),%xmm3
+	movdqu	64(%esi),%xmm6
+	pxor	32(%esp),%xmm4
+	leal	80(%esi),%esi
+	pxor	48(%esp),%xmm5
+	movdqa	%xmm7,64(%esp)
+	pxor	%xmm7,%xmm6
+	call	__aesni_encrypt6
+	movaps	64(%esp),%xmm1
+	xorps	(%esp),%xmm2
+	xorps	16(%esp),%xmm3
+	xorps	32(%esp),%xmm4
+	movups	%xmm2,(%edi)
+	xorps	48(%esp),%xmm5
+	movups	%xmm3,16(%edi)
+	xorps	%xmm1,%xmm6
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	movups	%xmm6,64(%edi)
+	leal	80(%edi),%edi
+	jmp	L056xts_enc_done
+.align	4,0x90
+L052xts_enc_one:
+	movups	(%esi),%xmm2
+	leal	16(%esi),%esi
+	xorps	%xmm5,%xmm2
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+L057enc1_loop_9:
+.byte	102,15,56,220,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	L057enc1_loop_9
+.byte	102,15,56,221,209
+	xorps	%xmm5,%xmm2
+	movups	%xmm2,(%edi)
+	leal	16(%edi),%edi
+	movdqa	%xmm5,%xmm1
+	jmp	L056xts_enc_done
+.align	4,0x90
+L053xts_enc_two:
+	movaps	%xmm1,%xmm6
+	movups	(%esi),%xmm2
+	movups	16(%esi),%xmm3
+	leal	32(%esi),%esi
+	xorps	%xmm5,%xmm2
+	xorps	%xmm6,%xmm3
+	call	__aesni_encrypt2
+	xorps	%xmm5,%xmm2
+	xorps	%xmm6,%xmm3
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	leal	32(%edi),%edi
+	movdqa	%xmm6,%xmm1
+	jmp	L056xts_enc_done
+.align	4,0x90
+L054xts_enc_three:
+	movaps	%xmm1,%xmm7
+	movups	(%esi),%xmm2
+	movups	16(%esi),%xmm3
+	movups	32(%esi),%xmm4
+	leal	48(%esi),%esi
+	xorps	%xmm5,%xmm2
+	xorps	%xmm6,%xmm3
+	xorps	%xmm7,%xmm4
+	call	__aesni_encrypt3
+	xorps	%xmm5,%xmm2
+	xorps	%xmm6,%xmm3
+	xorps	%xmm7,%xmm4
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	leal	48(%edi),%edi
+	movdqa	%xmm7,%xmm1
+	jmp	L056xts_enc_done
+.align	4,0x90
+L055xts_enc_four:
+	movaps	%xmm1,%xmm6
+	movups	(%esi),%xmm2
+	movups	16(%esi),%xmm3
+	movups	32(%esi),%xmm4
+	xorps	(%esp),%xmm2
+	movups	48(%esi),%xmm5
+	leal	64(%esi),%esi
+	xorps	16(%esp),%xmm3
+	xorps	%xmm7,%xmm4
+	xorps	%xmm6,%xmm5
+	call	__aesni_encrypt4
+	xorps	(%esp),%xmm2
+	xorps	16(%esp),%xmm3
+	xorps	%xmm7,%xmm4
+	movups	%xmm2,(%edi)
+	xorps	%xmm6,%xmm5
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	leal	64(%edi),%edi
+	movdqa	%xmm6,%xmm1
+	jmp	L056xts_enc_done
+.align	4,0x90
+L051xts_enc_done6x:
+	movl	112(%esp),%eax
+	andl	$15,%eax
+	jz	L058xts_enc_ret
+	movdqa	%xmm1,%xmm5
+	movl	%eax,112(%esp)
+	jmp	L059xts_enc_steal
+.align	4,0x90
+L056xts_enc_done:
+	movl	112(%esp),%eax
+	pxor	%xmm0,%xmm0
+	andl	$15,%eax
+	jz	L058xts_enc_ret
+	pcmpgtd	%xmm1,%xmm0
+	movl	%eax,112(%esp)
+	pshufd	$19,%xmm0,%xmm5
+	paddq	%xmm1,%xmm1
+	pand	96(%esp),%xmm5
+	pxor	%xmm1,%xmm5
+L059xts_enc_steal:
+	movzbl	(%esi),%ecx
+	movzbl	-16(%edi),%edx
+	leal	1(%esi),%esi
+	movb	%cl,-16(%edi)
+	movb	%dl,(%edi)
+	leal	1(%edi),%edi
+	subl	$1,%eax
+	jnz	L059xts_enc_steal
+	subl	112(%esp),%edi
+	movl	%ebp,%edx
+	movl	%ebx,%ecx
+	movups	-16(%edi),%xmm2
+	xorps	%xmm5,%xmm2
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+L060enc1_loop_10:
+.byte	102,15,56,220,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	L060enc1_loop_10
+.byte	102,15,56,221,209
+	xorps	%xmm5,%xmm2
+	movups	%xmm2,-16(%edi)
+L058xts_enc_ret:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	movdqa	%xmm0,(%esp)
+	pxor	%xmm3,%xmm3
+	movdqa	%xmm0,16(%esp)
+	pxor	%xmm4,%xmm4
+	movdqa	%xmm0,32(%esp)
+	pxor	%xmm5,%xmm5
+	movdqa	%xmm0,48(%esp)
+	pxor	%xmm6,%xmm6
+	movdqa	%xmm0,64(%esp)
+	pxor	%xmm7,%xmm7
+	movdqa	%xmm0,80(%esp)
+	movl	116(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.globl	_aes_hw_xts_decrypt
+.private_extern	_aes_hw_xts_decrypt
+.align	4
+_aes_hw_xts_decrypt:
+L_aes_hw_xts_decrypt_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	36(%esp),%edx
+	movl	40(%esp),%esi
+	movl	240(%edx),%ecx
+	movups	(%esi),%xmm2
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+L061enc1_loop_11:
+.byte	102,15,56,220,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	L061enc1_loop_11
+.byte	102,15,56,221,209
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	32(%esp),%edx
+	movl	%esp,%ebp
+	subl	$120,%esp
+	andl	$-16,%esp
+	xorl	%ebx,%ebx
+	testl	$15,%eax
+	setnz	%bl
+	shll	$4,%ebx
+	subl	%ebx,%eax
+	movl	$135,96(%esp)
+	movl	$0,100(%esp)
+	movl	$1,104(%esp)
+	movl	$0,108(%esp)
+	movl	%eax,112(%esp)
+	movl	%ebp,116(%esp)
+	movl	240(%edx),%ecx
+	movl	%edx,%ebp
+	movl	%ecx,%ebx
+	movdqa	%xmm2,%xmm1
+	pxor	%xmm0,%xmm0
+	movdqa	96(%esp),%xmm3
+	pcmpgtd	%xmm1,%xmm0
+	andl	$-16,%eax
+	subl	$96,%eax
+	jc	L062xts_dec_short
+	shll	$4,%ecx
+	movl	$16,%ebx
+	subl	%ecx,%ebx
+	leal	32(%edx,%ecx,1),%edx
+	jmp	L063xts_dec_loop6
+.align	4,0x90
+L063xts_dec_loop6:
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,16(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,32(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,48(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	pshufd	$19,%xmm0,%xmm7
+	movdqa	%xmm1,64(%esp)
+	paddq	%xmm1,%xmm1
+	movups	(%ebp),%xmm0
+	pand	%xmm3,%xmm7
+	movups	(%esi),%xmm2
+	pxor	%xmm1,%xmm7
+	movl	%ebx,%ecx
+	movdqu	16(%esi),%xmm3
+	xorps	%xmm0,%xmm2
+	movdqu	32(%esi),%xmm4
+	pxor	%xmm0,%xmm3
+	movdqu	48(%esi),%xmm5
+	pxor	%xmm0,%xmm4
+	movdqu	64(%esi),%xmm6
+	pxor	%xmm0,%xmm5
+	movdqu	80(%esi),%xmm1
+	pxor	%xmm0,%xmm6
+	leal	96(%esi),%esi
+	pxor	(%esp),%xmm2
+	movdqa	%xmm7,80(%esp)
+	pxor	%xmm1,%xmm7
+	movups	16(%ebp),%xmm1
+	pxor	16(%esp),%xmm3
+	pxor	32(%esp),%xmm4
+.byte	102,15,56,222,209
+	pxor	48(%esp),%xmm5
+	pxor	64(%esp),%xmm6
+.byte	102,15,56,222,217
+	pxor	%xmm0,%xmm7
+	movups	32(%ebp),%xmm0
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+	call	L_aesni_decrypt6_enter
+	movdqa	80(%esp),%xmm1
+	pxor	%xmm0,%xmm0
+	xorps	(%esp),%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	xorps	16(%esp),%xmm3
+	movups	%xmm2,(%edi)
+	xorps	32(%esp),%xmm4
+	movups	%xmm3,16(%edi)
+	xorps	48(%esp),%xmm5
+	movups	%xmm4,32(%edi)
+	xorps	64(%esp),%xmm6
+	movups	%xmm5,48(%edi)
+	xorps	%xmm1,%xmm7
+	movups	%xmm6,64(%edi)
+	pshufd	$19,%xmm0,%xmm2
+	movups	%xmm7,80(%edi)
+	leal	96(%edi),%edi
+	movdqa	96(%esp),%xmm3
+	pxor	%xmm0,%xmm0
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	subl	$96,%eax
+	jnc	L063xts_dec_loop6
+	movl	240(%ebp),%ecx
+	movl	%ebp,%edx
+	movl	%ecx,%ebx
+L062xts_dec_short:
+	addl	$96,%eax
+	jz	L064xts_dec_done6x
+	movdqa	%xmm1,%xmm5
+	cmpl	$32,%eax
+	jb	L065xts_dec_one
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	je	L066xts_dec_two
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,%xmm6
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	cmpl	$64,%eax
+	jb	L067xts_dec_three
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,%xmm7
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm5,(%esp)
+	movdqa	%xmm6,16(%esp)
+	je	L068xts_dec_four
+	movdqa	%xmm7,32(%esp)
+	pshufd	$19,%xmm0,%xmm7
+	movdqa	%xmm1,48(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm7
+	pxor	%xmm1,%xmm7
+	movdqu	(%esi),%xmm2
+	movdqu	16(%esi),%xmm3
+	movdqu	32(%esi),%xmm4
+	pxor	(%esp),%xmm2
+	movdqu	48(%esi),%xmm5
+	pxor	16(%esp),%xmm3
+	movdqu	64(%esi),%xmm6
+	pxor	32(%esp),%xmm4
+	leal	80(%esi),%esi
+	pxor	48(%esp),%xmm5
+	movdqa	%xmm7,64(%esp)
+	pxor	%xmm7,%xmm6
+	call	__aesni_decrypt6
+	movaps	64(%esp),%xmm1
+	xorps	(%esp),%xmm2
+	xorps	16(%esp),%xmm3
+	xorps	32(%esp),%xmm4
+	movups	%xmm2,(%edi)
+	xorps	48(%esp),%xmm5
+	movups	%xmm3,16(%edi)
+	xorps	%xmm1,%xmm6
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	movups	%xmm6,64(%edi)
+	leal	80(%edi),%edi
+	jmp	L069xts_dec_done
+.align	4,0x90
+L065xts_dec_one:
+	movups	(%esi),%xmm2
+	leal	16(%esi),%esi
+	xorps	%xmm5,%xmm2
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+L070dec1_loop_12:
+.byte	102,15,56,222,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	L070dec1_loop_12
+.byte	102,15,56,223,209
+	xorps	%xmm5,%xmm2
+	movups	%xmm2,(%edi)
+	leal	16(%edi),%edi
+	movdqa	%xmm5,%xmm1
+	jmp	L069xts_dec_done
+.align	4,0x90
+L066xts_dec_two:
+	movaps	%xmm1,%xmm6
+	movups	(%esi),%xmm2
+	movups	16(%esi),%xmm3
+	leal	32(%esi),%esi
+	xorps	%xmm5,%xmm2
+	xorps	%xmm6,%xmm3
+	call	__aesni_decrypt2
+	xorps	%xmm5,%xmm2
+	xorps	%xmm6,%xmm3
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	leal	32(%edi),%edi
+	movdqa	%xmm6,%xmm1
+	jmp	L069xts_dec_done
+.align	4,0x90
+L067xts_dec_three:
+	movaps	%xmm1,%xmm7
+	movups	(%esi),%xmm2
+	movups	16(%esi),%xmm3
+	movups	32(%esi),%xmm4
+	leal	48(%esi),%esi
+	xorps	%xmm5,%xmm2
+	xorps	%xmm6,%xmm3
+	xorps	%xmm7,%xmm4
+	call	__aesni_decrypt3
+	xorps	%xmm5,%xmm2
+	xorps	%xmm6,%xmm3
+	xorps	%xmm7,%xmm4
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	leal	48(%edi),%edi
+	movdqa	%xmm7,%xmm1
+	jmp	L069xts_dec_done
+.align	4,0x90
+L068xts_dec_four:
+	movaps	%xmm1,%xmm6
+	movups	(%esi),%xmm2
+	movups	16(%esi),%xmm3
+	movups	32(%esi),%xmm4
+	xorps	(%esp),%xmm2
+	movups	48(%esi),%xmm5
+	leal	64(%esi),%esi
+	xorps	16(%esp),%xmm3
+	xorps	%xmm7,%xmm4
+	xorps	%xmm6,%xmm5
+	call	__aesni_decrypt4
+	xorps	(%esp),%xmm2
+	xorps	16(%esp),%xmm3
+	xorps	%xmm7,%xmm4
+	movups	%xmm2,(%edi)
+	xorps	%xmm6,%xmm5
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	leal	64(%edi),%edi
+	movdqa	%xmm6,%xmm1
+	jmp	L069xts_dec_done
+.align	4,0x90
+L064xts_dec_done6x:
+	movl	112(%esp),%eax
+	andl	$15,%eax
+	jz	L071xts_dec_ret
+	movl	%eax,112(%esp)
+	jmp	L072xts_dec_only_one_more
+.align	4,0x90
+L069xts_dec_done:
+	movl	112(%esp),%eax
+	pxor	%xmm0,%xmm0
+	andl	$15,%eax
+	jz	L071xts_dec_ret
+	pcmpgtd	%xmm1,%xmm0
+	movl	%eax,112(%esp)
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	96(%esp),%xmm3
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+L072xts_dec_only_one_more:
+	pshufd	$19,%xmm0,%xmm5
+	movdqa	%xmm1,%xmm6
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm5
+	pxor	%xmm1,%xmm5
+	movl	%ebp,%edx
+	movl	%ebx,%ecx
+	movups	(%esi),%xmm2
+	xorps	%xmm5,%xmm2
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+L073dec1_loop_13:
+.byte	102,15,56,222,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	L073dec1_loop_13
+.byte	102,15,56,223,209
+	xorps	%xmm5,%xmm2
+	movups	%xmm2,(%edi)
+L074xts_dec_steal:
+	movzbl	16(%esi),%ecx
+	movzbl	(%edi),%edx
+	leal	1(%esi),%esi
+	movb	%cl,(%edi)
+	movb	%dl,16(%edi)
+	leal	1(%edi),%edi
+	subl	$1,%eax
+	jnz	L074xts_dec_steal
+	subl	112(%esp),%edi
+	movl	%ebp,%edx
+	movl	%ebx,%ecx
+	movups	(%edi),%xmm2
+	xorps	%xmm6,%xmm2
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+L075dec1_loop_14:
+.byte	102,15,56,222,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	L075dec1_loop_14
+.byte	102,15,56,223,209
+	xorps	%xmm6,%xmm2
+	movups	%xmm2,(%edi)
+L071xts_dec_ret:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	movdqa	%xmm0,(%esp)
+	pxor	%xmm3,%xmm3
+	movdqa	%xmm0,16(%esp)
+	pxor	%xmm4,%xmm4
+	movdqa	%xmm0,32(%esp)
+	pxor	%xmm5,%xmm5
+	movdqa	%xmm0,48(%esp)
+	pxor	%xmm6,%xmm6
+	movdqa	%xmm0,64(%esp)
+	pxor	%xmm7,%xmm7
+	movdqa	%xmm0,80(%esp)
+	movl	116(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.globl	_aes_hw_cbc_encrypt
+.private_extern	_aes_hw_cbc_encrypt
+.align	4
+_aes_hw_cbc_encrypt:
+L_aes_hw_cbc_encrypt_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	movl	%esp,%ebx
+	movl	24(%esp),%edi
+	subl	$24,%ebx
+	movl	28(%esp),%eax
+	andl	$-16,%ebx
+	movl	32(%esp),%edx
+	movl	36(%esp),%ebp
+	testl	%eax,%eax
+	jz	L076cbc_abort
+	cmpl	$0,40(%esp)
+	xchgl	%esp,%ebx
+	movups	(%ebp),%xmm7
+	movl	240(%edx),%ecx
+	movl	%edx,%ebp
+	movl	%ebx,16(%esp)
+	movl	%ecx,%ebx
+	je	L077cbc_decrypt
+	movaps	%xmm7,%xmm2
+	cmpl	$16,%eax
+	jb	L078cbc_enc_tail
+	subl	$16,%eax
+	jmp	L079cbc_enc_loop
+.align	4,0x90
+L079cbc_enc_loop:
+	movups	(%esi),%xmm7
+	leal	16(%esi),%esi
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm7
+	leal	32(%edx),%edx
+	xorps	%xmm7,%xmm2
+L080enc1_loop_15:
+.byte	102,15,56,220,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	L080enc1_loop_15
+.byte	102,15,56,221,209
+	movl	%ebx,%ecx
+	movl	%ebp,%edx
+	movups	%xmm2,(%edi)
+	leal	16(%edi),%edi
+	subl	$16,%eax
+	jnc	L079cbc_enc_loop
+	addl	$16,%eax
+	jnz	L078cbc_enc_tail
+	movaps	%xmm2,%xmm7
+	pxor	%xmm2,%xmm2
+	jmp	L081cbc_ret
+L078cbc_enc_tail:
+	movl	%eax,%ecx
+.long	2767451785
+	movl	$16,%ecx
+	subl	%eax,%ecx
+	xorl	%eax,%eax
+.long	2868115081
+	leal	-16(%edi),%edi
+	movl	%ebx,%ecx
+	movl	%edi,%esi
+	movl	%ebp,%edx
+	jmp	L079cbc_enc_loop
+.align	4,0x90
+L077cbc_decrypt:
+	cmpl	$80,%eax
+	jbe	L082cbc_dec_tail
+	movaps	%xmm7,(%esp)
+	subl	$80,%eax
+	jmp	L083cbc_dec_loop6_enter
+.align	4,0x90
+L084cbc_dec_loop6:
+	movaps	%xmm0,(%esp)
+	movups	%xmm7,(%edi)
+	leal	16(%edi),%edi
+L083cbc_dec_loop6_enter:
+	movdqu	(%esi),%xmm2
+	movdqu	16(%esi),%xmm3
+	movdqu	32(%esi),%xmm4
+	movdqu	48(%esi),%xmm5
+	movdqu	64(%esi),%xmm6
+	movdqu	80(%esi),%xmm7
+	call	__aesni_decrypt6
+	movups	(%esi),%xmm1
+	movups	16(%esi),%xmm0
+	xorps	(%esp),%xmm2
+	xorps	%xmm1,%xmm3
+	movups	32(%esi),%xmm1
+	xorps	%xmm0,%xmm4
+	movups	48(%esi),%xmm0
+	xorps	%xmm1,%xmm5
+	movups	64(%esi),%xmm1
+	xorps	%xmm0,%xmm6
+	movups	80(%esi),%xmm0
+	xorps	%xmm1,%xmm7
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	leal	96(%esi),%esi
+	movups	%xmm4,32(%edi)
+	movl	%ebx,%ecx
+	movups	%xmm5,48(%edi)
+	movl	%ebp,%edx
+	movups	%xmm6,64(%edi)
+	leal	80(%edi),%edi
+	subl	$96,%eax
+	ja	L084cbc_dec_loop6
+	movaps	%xmm7,%xmm2
+	movaps	%xmm0,%xmm7
+	addl	$80,%eax
+	jle	L085cbc_dec_clear_tail_collected
+	movups	%xmm2,(%edi)
+	leal	16(%edi),%edi
+L082cbc_dec_tail:
+	movups	(%esi),%xmm2
+	movaps	%xmm2,%xmm6
+	cmpl	$16,%eax
+	jbe	L086cbc_dec_one
+	movups	16(%esi),%xmm3
+	movaps	%xmm3,%xmm5
+	cmpl	$32,%eax
+	jbe	L087cbc_dec_two
+	movups	32(%esi),%xmm4
+	cmpl	$48,%eax
+	jbe	L088cbc_dec_three
+	movups	48(%esi),%xmm5
+	cmpl	$64,%eax
+	jbe	L089cbc_dec_four
+	movups	64(%esi),%xmm6
+	movaps	%xmm7,(%esp)
+	movups	(%esi),%xmm2
+	xorps	%xmm7,%xmm7
+	call	__aesni_decrypt6
+	movups	(%esi),%xmm1
+	movups	16(%esi),%xmm0
+	xorps	(%esp),%xmm2
+	xorps	%xmm1,%xmm3
+	movups	32(%esi),%xmm1
+	xorps	%xmm0,%xmm4
+	movups	48(%esi),%xmm0
+	xorps	%xmm1,%xmm5
+	movups	64(%esi),%xmm7
+	xorps	%xmm0,%xmm6
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	pxor	%xmm3,%xmm3
+	movups	%xmm4,32(%edi)
+	pxor	%xmm4,%xmm4
+	movups	%xmm5,48(%edi)
+	pxor	%xmm5,%xmm5
+	leal	64(%edi),%edi
+	movaps	%xmm6,%xmm2
+	pxor	%xmm6,%xmm6
+	subl	$80,%eax
+	jmp	L090cbc_dec_tail_collected
+.align	4,0x90
+L086cbc_dec_one:
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+L091dec1_loop_16:
+.byte	102,15,56,222,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	L091dec1_loop_16
+.byte	102,15,56,223,209
+	xorps	%xmm7,%xmm2
+	movaps	%xmm6,%xmm7
+	subl	$16,%eax
+	jmp	L090cbc_dec_tail_collected
+.align	4,0x90
+L087cbc_dec_two:
+	call	__aesni_decrypt2
+	xorps	%xmm7,%xmm2
+	xorps	%xmm6,%xmm3
+	movups	%xmm2,(%edi)
+	movaps	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	leal	16(%edi),%edi
+	movaps	%xmm5,%xmm7
+	subl	$32,%eax
+	jmp	L090cbc_dec_tail_collected
+.align	4,0x90
+L088cbc_dec_three:
+	call	__aesni_decrypt3
+	xorps	%xmm7,%xmm2
+	xorps	%xmm6,%xmm3
+	xorps	%xmm5,%xmm4
+	movups	%xmm2,(%edi)
+	movaps	%xmm4,%xmm2
+	pxor	%xmm4,%xmm4
+	movups	%xmm3,16(%edi)
+	pxor	%xmm3,%xmm3
+	leal	32(%edi),%edi
+	movups	32(%esi),%xmm7
+	subl	$48,%eax
+	jmp	L090cbc_dec_tail_collected
+.align	4,0x90
+L089cbc_dec_four:
+	call	__aesni_decrypt4
+	movups	16(%esi),%xmm1
+	movups	32(%esi),%xmm0
+	xorps	%xmm7,%xmm2
+	movups	48(%esi),%xmm7
+	xorps	%xmm6,%xmm3
+	movups	%xmm2,(%edi)
+	xorps	%xmm1,%xmm4
+	movups	%xmm3,16(%edi)
+	pxor	%xmm3,%xmm3
+	xorps	%xmm0,%xmm5
+	movups	%xmm4,32(%edi)
+	pxor	%xmm4,%xmm4
+	leal	48(%edi),%edi
+	movaps	%xmm5,%xmm2
+	pxor	%xmm5,%xmm5
+	subl	$64,%eax
+	jmp	L090cbc_dec_tail_collected
+.align	4,0x90
+L085cbc_dec_clear_tail_collected:
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+L090cbc_dec_tail_collected:
+	andl	$15,%eax
+	jnz	L092cbc_dec_tail_partial
+	movups	%xmm2,(%edi)
+	pxor	%xmm0,%xmm0
+	jmp	L081cbc_ret
+.align	4,0x90
+L092cbc_dec_tail_partial:
+	movaps	%xmm2,(%esp)
+	pxor	%xmm0,%xmm0
+	movl	$16,%ecx
+	movl	%esp,%esi
+	subl	%eax,%ecx
+.long	2767451785
+	movdqa	%xmm2,(%esp)
+L081cbc_ret:
+	movl	16(%esp),%esp
+	movl	36(%esp),%ebp
+	pxor	%xmm2,%xmm2
+	pxor	%xmm1,%xmm1
+	movups	%xmm7,(%ebp)
+	pxor	%xmm7,%xmm7
+L076cbc_abort:
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.private_extern	__aesni_set_encrypt_key
+.align	4
+__aesni_set_encrypt_key:
+	pushl	%ebp
+	pushl	%ebx
+	testl	%eax,%eax
+	jz	L093bad_pointer
+	testl	%edx,%edx
+	jz	L093bad_pointer
+	call	L094pic
+L094pic:
+	popl	%ebx
+	leal	Lkey_const-L094pic(%ebx),%ebx
+	movl	L_OPENSSL_ia32cap_P$non_lazy_ptr-Lkey_const(%ebx),%ebp
+	movups	(%eax),%xmm0
+	xorps	%xmm4,%xmm4
+	movl	4(%ebp),%ebp
+	leal	16(%edx),%edx
+	andl	$268437504,%ebp
+	cmpl	$256,%ecx
+	je	L09514rounds
+	cmpl	$192,%ecx
+	je	L09612rounds
+	cmpl	$128,%ecx
+	jne	L097bad_keybits
+.align	4,0x90
+L09810rounds:
+	cmpl	$268435456,%ebp
+	je	L09910rounds_alt
+	movl	$9,%ecx
+	movups	%xmm0,-16(%edx)
+.byte	102,15,58,223,200,1
+	call	L100key_128_cold
+.byte	102,15,58,223,200,2
+	call	L101key_128
+.byte	102,15,58,223,200,4
+	call	L101key_128
+.byte	102,15,58,223,200,8
+	call	L101key_128
+.byte	102,15,58,223,200,16
+	call	L101key_128
+.byte	102,15,58,223,200,32
+	call	L101key_128
+.byte	102,15,58,223,200,64
+	call	L101key_128
+.byte	102,15,58,223,200,128
+	call	L101key_128
+.byte	102,15,58,223,200,27
+	call	L101key_128
+.byte	102,15,58,223,200,54
+	call	L101key_128
+	movups	%xmm0,(%edx)
+	movl	%ecx,80(%edx)
+	jmp	L102good_key
+.align	4,0x90
+L101key_128:
+	movups	%xmm0,(%edx)
+	leal	16(%edx),%edx
+L100key_128_cold:
+	shufps	$16,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$140,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$255,%xmm1,%xmm1
+	xorps	%xmm1,%xmm0
+	ret
+.align	4,0x90
+L09910rounds_alt:
+	movdqa	(%ebx),%xmm5
+	movl	$8,%ecx
+	movdqa	32(%ebx),%xmm4
+	movdqa	%xmm0,%xmm2
+	movdqu	%xmm0,-16(%edx)
+L103loop_key128:
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+	pslld	$1,%xmm4
+	leal	16(%edx),%edx
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,-16(%edx)
+	movdqa	%xmm0,%xmm2
+	decl	%ecx
+	jnz	L103loop_key128
+	movdqa	48(%ebx),%xmm4
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+	pslld	$1,%xmm4
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,(%edx)
+	movdqa	%xmm0,%xmm2
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,16(%edx)
+	movl	$9,%ecx
+	movl	%ecx,96(%edx)
+	jmp	L102good_key
+.align	4,0x90
+L09612rounds:
+	movq	16(%eax),%xmm2
+	cmpl	$268435456,%ebp
+	je	L10412rounds_alt
+	movl	$11,%ecx
+	movups	%xmm0,-16(%edx)
+.byte	102,15,58,223,202,1
+	call	L105key_192a_cold
+.byte	102,15,58,223,202,2
+	call	L106key_192b
+.byte	102,15,58,223,202,4
+	call	L107key_192a
+.byte	102,15,58,223,202,8
+	call	L106key_192b
+.byte	102,15,58,223,202,16
+	call	L107key_192a
+.byte	102,15,58,223,202,32
+	call	L106key_192b
+.byte	102,15,58,223,202,64
+	call	L107key_192a
+.byte	102,15,58,223,202,128
+	call	L106key_192b
+	movups	%xmm0,(%edx)
+	movl	%ecx,48(%edx)
+	jmp	L102good_key
+.align	4,0x90
+L107key_192a:
+	movups	%xmm0,(%edx)
+	leal	16(%edx),%edx
+.align	4,0x90
+L105key_192a_cold:
+	movaps	%xmm2,%xmm5
+L108key_192b_warm:
+	shufps	$16,%xmm0,%xmm4
+	movdqa	%xmm2,%xmm3
+	xorps	%xmm4,%xmm0
+	shufps	$140,%xmm0,%xmm4
+	pslldq	$4,%xmm3
+	xorps	%xmm4,%xmm0
+	pshufd	$85,%xmm1,%xmm1
+	pxor	%xmm3,%xmm2
+	pxor	%xmm1,%xmm0
+	pshufd	$255,%xmm0,%xmm3
+	pxor	%xmm3,%xmm2
+	ret
+.align	4,0x90
+L106key_192b:
+	movaps	%xmm0,%xmm3
+	shufps	$68,%xmm0,%xmm5
+	movups	%xmm5,(%edx)
+	shufps	$78,%xmm2,%xmm3
+	movups	%xmm3,16(%edx)
+	leal	32(%edx),%edx
+	jmp	L108key_192b_warm
+.align	4,0x90
+L10412rounds_alt:
+	movdqa	16(%ebx),%xmm5
+	movdqa	32(%ebx),%xmm4
+	movl	$8,%ecx
+	movdqu	%xmm0,-16(%edx)
+L109loop_key192:
+	movq	%xmm2,(%edx)
+	movdqa	%xmm2,%xmm1
+.byte	102,15,56,0,213
+.byte	102,15,56,221,212
+	pslld	$1,%xmm4
+	leal	24(%edx),%edx
+	movdqa	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm3,%xmm0
+	pshufd	$255,%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+	pxor	%xmm2,%xmm0
+	pxor	%xmm3,%xmm2
+	movdqu	%xmm0,-16(%edx)
+	decl	%ecx
+	jnz	L109loop_key192
+	movl	$11,%ecx
+	movl	%ecx,32(%edx)
+	jmp	L102good_key
+.align	4,0x90
+L09514rounds:
+	movups	16(%eax),%xmm2
+	leal	16(%edx),%edx
+	cmpl	$268435456,%ebp
+	je	L11014rounds_alt
+	movl	$13,%ecx
+	movups	%xmm0,-32(%edx)
+	movups	%xmm2,-16(%edx)
+.byte	102,15,58,223,202,1
+	call	L111key_256a_cold
+.byte	102,15,58,223,200,1
+	call	L112key_256b
+.byte	102,15,58,223,202,2
+	call	L113key_256a
+.byte	102,15,58,223,200,2
+	call	L112key_256b
+.byte	102,15,58,223,202,4
+	call	L113key_256a
+.byte	102,15,58,223,200,4
+	call	L112key_256b
+.byte	102,15,58,223,202,8
+	call	L113key_256a
+.byte	102,15,58,223,200,8
+	call	L112key_256b
+.byte	102,15,58,223,202,16
+	call	L113key_256a
+.byte	102,15,58,223,200,16
+	call	L112key_256b
+.byte	102,15,58,223,202,32
+	call	L113key_256a
+.byte	102,15,58,223,200,32
+	call	L112key_256b
+.byte	102,15,58,223,202,64
+	call	L113key_256a
+	movups	%xmm0,(%edx)
+	movl	%ecx,16(%edx)
+	xorl	%eax,%eax
+	jmp	L102good_key
+.align	4,0x90
+L113key_256a:
+	movups	%xmm2,(%edx)
+	leal	16(%edx),%edx
+L111key_256a_cold:
+	shufps	$16,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$140,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$255,%xmm1,%xmm1
+	xorps	%xmm1,%xmm0
+	ret
+.align	4,0x90
+L112key_256b:
+	movups	%xmm0,(%edx)
+	leal	16(%edx),%edx
+	shufps	$16,%xmm2,%xmm4
+	xorps	%xmm4,%xmm2
+	shufps	$140,%xmm2,%xmm4
+	xorps	%xmm4,%xmm2
+	shufps	$170,%xmm1,%xmm1
+	xorps	%xmm1,%xmm2
+	ret
+.align	4,0x90
+L11014rounds_alt:
+	movdqa	(%ebx),%xmm5
+	movdqa	32(%ebx),%xmm4
+	movl	$7,%ecx
+	movdqu	%xmm0,-32(%edx)
+	movdqa	%xmm2,%xmm1
+	movdqu	%xmm2,-16(%edx)
+L114loop_key256:
+.byte	102,15,56,0,213
+.byte	102,15,56,221,212
+	movdqa	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm3,%xmm0
+	pslld	$1,%xmm4
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,(%edx)
+	decl	%ecx
+	jz	L115done_key256
+	pshufd	$255,%xmm0,%xmm2
+	pxor	%xmm3,%xmm3
+.byte	102,15,56,221,211
+	movdqa	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm3,%xmm1
+	pxor	%xmm1,%xmm2
+	movdqu	%xmm2,16(%edx)
+	leal	32(%edx),%edx
+	movdqa	%xmm2,%xmm1
+	jmp	L114loop_key256
+L115done_key256:
+	movl	$13,%ecx
+	movl	%ecx,16(%edx)
+L102good_key:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	xorl	%eax,%eax
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	2,0x90
+L093bad_pointer:
+	movl	$-1,%eax
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	2,0x90
+L097bad_keybits:
+	pxor	%xmm0,%xmm0
+	movl	$-2,%eax
+	popl	%ebx
+	popl	%ebp
+	ret
+.globl	_aes_hw_set_encrypt_key
+.private_extern	_aes_hw_set_encrypt_key
+.align	4
+_aes_hw_set_encrypt_key:
+L_aes_hw_set_encrypt_key_begin:
+#ifdef BORINGSSL_DISPATCH_TEST
+	pushl	%ebx
+	pushl	%edx
+	call	L116pic
+L116pic:
+	popl	%ebx
+	leal	_BORINGSSL_function_hit+3-L116pic(%ebx),%ebx
+	movl	$1,%edx
+	movb	%dl,(%ebx)
+	popl	%edx
+	popl	%ebx
+#endif
+	movl	4(%esp),%eax
+	movl	8(%esp),%ecx
+	movl	12(%esp),%edx
+	call	__aesni_set_encrypt_key
+	ret
+.globl	_aes_hw_set_decrypt_key
+.private_extern	_aes_hw_set_decrypt_key
+.align	4
+_aes_hw_set_decrypt_key:
+L_aes_hw_set_decrypt_key_begin:
+	movl	4(%esp),%eax
+	movl	8(%esp),%ecx
+	movl	12(%esp),%edx
+	call	__aesni_set_encrypt_key
+	movl	12(%esp),%edx
+	shll	$4,%ecx
+	testl	%eax,%eax
+	jnz	L117dec_key_ret
+	leal	16(%edx,%ecx,1),%eax
+	movups	(%edx),%xmm0
+	movups	(%eax),%xmm1
+	movups	%xmm0,(%eax)
+	movups	%xmm1,(%edx)
+	leal	16(%edx),%edx
+	leal	-16(%eax),%eax
+L118dec_key_inverse:
+	movups	(%edx),%xmm0
+	movups	(%eax),%xmm1
+.byte	102,15,56,219,192
+.byte	102,15,56,219,201
+	leal	16(%edx),%edx
+	leal	-16(%eax),%eax
+	movups	%xmm0,16(%eax)
+	movups	%xmm1,-16(%edx)
+	cmpl	%edx,%eax
+	ja	L118dec_key_inverse
+	movups	(%edx),%xmm0
+.byte	102,15,56,219,192
+	movups	%xmm0,(%edx)
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	xorl	%eax,%eax
+L117dec_key_ret:
+	ret
+.align	6,0x90
+Lkey_const:
+.long	202313229,202313229,202313229,202313229
+.long	67569157,67569157,67569157,67569157
+.long	1,1,1,1
+.long	27,27,27,27
+.byte	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
+.byte	83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
+.byte	32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
+.byte	115,108,46,111,114,103,62,0
+.section __IMPORT,__pointers,non_lazy_symbol_pointers
+L_OPENSSL_ia32cap_P$non_lazy_ptr:
+.indirect_symbol	_OPENSSL_ia32cap_P
+.long	0
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/bcm/aesni-x86-linux.S b/gen/bcm/aesni-x86-linux.S
new file mode 100644
index 0000000..54daf18
--- /dev/null
+++ b/gen/bcm/aesni-x86-linux.S
@@ -0,0 +1,2511 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
+.text
+#ifdef BORINGSSL_DISPATCH_TEST
+#endif
+.globl	aes_hw_encrypt
+.hidden	aes_hw_encrypt
+.type	aes_hw_encrypt,@function
+.align	16
+aes_hw_encrypt:
+.L_aes_hw_encrypt_begin:
+#ifdef BORINGSSL_DISPATCH_TEST
+	pushl	%ebx
+	pushl	%edx
+	call	.L000pic
+.L000pic:
+	popl	%ebx
+	leal	BORINGSSL_function_hit+1-.L000pic(%ebx),%ebx
+	movl	$1,%edx
+	movb	%dl,(%ebx)
+	popl	%edx
+	popl	%ebx
+#endif
+	movl	4(%esp),%eax
+	movl	12(%esp),%edx
+	movups	(%eax),%xmm2
+	movl	240(%edx),%ecx
+	movl	8(%esp),%eax
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L001enc1_loop_1:
+.byte	102,15,56,220,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L001enc1_loop_1
+.byte	102,15,56,221,209
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	movups	%xmm2,(%eax)
+	pxor	%xmm2,%xmm2
+	ret
+.size	aes_hw_encrypt,.-.L_aes_hw_encrypt_begin
+.globl	aes_hw_decrypt
+.hidden	aes_hw_decrypt
+.type	aes_hw_decrypt,@function
+.align	16
+aes_hw_decrypt:
+.L_aes_hw_decrypt_begin:
+	movl	4(%esp),%eax
+	movl	12(%esp),%edx
+	movups	(%eax),%xmm2
+	movl	240(%edx),%ecx
+	movl	8(%esp),%eax
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L002dec1_loop_2:
+.byte	102,15,56,222,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L002dec1_loop_2
+.byte	102,15,56,223,209
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	movups	%xmm2,(%eax)
+	pxor	%xmm2,%xmm2
+	ret
+.size	aes_hw_decrypt,.-.L_aes_hw_decrypt_begin
+.hidden	_aesni_encrypt2
+.type	_aesni_encrypt2,@function
+.align	16
+_aesni_encrypt2:
+	movups	(%edx),%xmm0
+	shll	$4,%ecx
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+	addl	$16,%ecx
+.L003enc2_loop:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L003enc2_loop
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+	ret
+.size	_aesni_encrypt2,.-_aesni_encrypt2
+.hidden	_aesni_decrypt2
+.type	_aesni_decrypt2,@function
+.align	16
+_aesni_decrypt2:
+	movups	(%edx),%xmm0
+	shll	$4,%ecx
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+	addl	$16,%ecx
+.L004dec2_loop:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L004dec2_loop
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+	ret
+.size	_aesni_decrypt2,.-_aesni_decrypt2
+.hidden	_aesni_encrypt3
+.type	_aesni_encrypt3,@function
+.align	16
+_aesni_encrypt3:
+	movups	(%edx),%xmm0
+	shll	$4,%ecx
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+	addl	$16,%ecx
+.L005enc3_loop:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L005enc3_loop
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+.byte	102,15,56,221,224
+	ret
+.size	_aesni_encrypt3,.-_aesni_encrypt3
+.hidden	_aesni_decrypt3
+.type	_aesni_decrypt3,@function
+.align	16
+_aesni_decrypt3:
+	movups	(%edx),%xmm0
+	shll	$4,%ecx
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+	addl	$16,%ecx
+.L006dec3_loop:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L006dec3_loop
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+.byte	102,15,56,223,224
+	ret
+.size	_aesni_decrypt3,.-_aesni_decrypt3
+.hidden	_aesni_encrypt4
+.type	_aesni_encrypt4,@function
+.align	16
+_aesni_encrypt4:
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	shll	$4,%ecx
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+	pxor	%xmm0,%xmm5
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+.byte	15,31,64,0
+	addl	$16,%ecx
+.L007enc4_loop:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L007enc4_loop
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+.byte	102,15,56,221,224
+.byte	102,15,56,221,232
+	ret
+.size	_aesni_encrypt4,.-_aesni_encrypt4
+.hidden	_aesni_decrypt4
+.type	_aesni_decrypt4,@function
+.align	16
+_aesni_decrypt4:
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	shll	$4,%ecx
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+	pxor	%xmm0,%xmm5
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+.byte	15,31,64,0
+	addl	$16,%ecx
+.L008dec4_loop:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L008dec4_loop
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+.byte	102,15,56,223,224
+.byte	102,15,56,223,232
+	ret
+.size	_aesni_decrypt4,.-_aesni_decrypt4
+.hidden	_aesni_encrypt6
+.type	_aesni_encrypt6,@function
+.align	16
+_aesni_encrypt6:
+	movups	(%edx),%xmm0
+	shll	$4,%ecx
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+.byte	102,15,56,220,209
+	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
+.byte	102,15,56,220,217
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+.byte	102,15,56,220,225
+	pxor	%xmm0,%xmm7
+	movups	(%edx,%ecx,1),%xmm0
+	addl	$16,%ecx
+	jmp	.L009_aesni_encrypt6_inner
+.align	16
+.L010enc6_loop:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.L009_aesni_encrypt6_inner:
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.L_aesni_encrypt6_enter:
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L010enc6_loop
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+.byte	102,15,56,221,224
+.byte	102,15,56,221,232
+.byte	102,15,56,221,240
+.byte	102,15,56,221,248
+	ret
+.size	_aesni_encrypt6,.-_aesni_encrypt6
+.hidden	_aesni_decrypt6
+.type	_aesni_decrypt6,@function
+.align	16
+_aesni_decrypt6:
+	movups	(%edx),%xmm0
+	shll	$4,%ecx
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+.byte	102,15,56,222,209
+	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
+.byte	102,15,56,222,217
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+.byte	102,15,56,222,225
+	pxor	%xmm0,%xmm7
+	movups	(%edx,%ecx,1),%xmm0
+	addl	$16,%ecx
+	jmp	.L011_aesni_decrypt6_inner
+.align	16
+.L012dec6_loop:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.L011_aesni_decrypt6_inner:
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.L_aesni_decrypt6_enter:
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L012dec6_loop
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+.byte	102,15,56,223,224
+.byte	102,15,56,223,232
+.byte	102,15,56,223,240
+.byte	102,15,56,223,248
+	ret
+.size	_aesni_decrypt6,.-_aesni_decrypt6
+.globl	aes_hw_ecb_encrypt
+.hidden	aes_hw_ecb_encrypt
+.type	aes_hw_ecb_encrypt,@function
+.align	16
+aes_hw_ecb_encrypt:
+.L_aes_hw_ecb_encrypt_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	32(%esp),%edx
+	movl	36(%esp),%ebx
+	andl	$-16,%eax
+	jz	.L013ecb_ret
+	movl	240(%edx),%ecx
+	testl	%ebx,%ebx
+	jz	.L014ecb_decrypt
+	movl	%edx,%ebp
+	movl	%ecx,%ebx
+	cmpl	$96,%eax
+	jb	.L015ecb_enc_tail
+	movdqu	(%esi),%xmm2
+	movdqu	16(%esi),%xmm3
+	movdqu	32(%esi),%xmm4
+	movdqu	48(%esi),%xmm5
+	movdqu	64(%esi),%xmm6
+	movdqu	80(%esi),%xmm7
+	leal	96(%esi),%esi
+	subl	$96,%eax
+	jmp	.L016ecb_enc_loop6_enter
+.align	16
+.L017ecb_enc_loop6:
+	movups	%xmm2,(%edi)
+	movdqu	(%esi),%xmm2
+	movups	%xmm3,16(%edi)
+	movdqu	16(%esi),%xmm3
+	movups	%xmm4,32(%edi)
+	movdqu	32(%esi),%xmm4
+	movups	%xmm5,48(%edi)
+	movdqu	48(%esi),%xmm5
+	movups	%xmm6,64(%edi)
+	movdqu	64(%esi),%xmm6
+	movups	%xmm7,80(%edi)
+	leal	96(%edi),%edi
+	movdqu	80(%esi),%xmm7
+	leal	96(%esi),%esi
+.L016ecb_enc_loop6_enter:
+	call	_aesni_encrypt6
+	movl	%ebp,%edx
+	movl	%ebx,%ecx
+	subl	$96,%eax
+	jnc	.L017ecb_enc_loop6
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	movups	%xmm6,64(%edi)
+	movups	%xmm7,80(%edi)
+	leal	96(%edi),%edi
+	addl	$96,%eax
+	jz	.L013ecb_ret
+.L015ecb_enc_tail:
+	movups	(%esi),%xmm2
+	cmpl	$32,%eax
+	jb	.L018ecb_enc_one
+	movups	16(%esi),%xmm3
+	je	.L019ecb_enc_two
+	movups	32(%esi),%xmm4
+	cmpl	$64,%eax
+	jb	.L020ecb_enc_three
+	movups	48(%esi),%xmm5
+	je	.L021ecb_enc_four
+	movups	64(%esi),%xmm6
+	xorps	%xmm7,%xmm7
+	call	_aesni_encrypt6
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	movups	%xmm6,64(%edi)
+	jmp	.L013ecb_ret
+.align	16
+.L018ecb_enc_one:
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L022enc1_loop_3:
+.byte	102,15,56,220,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L022enc1_loop_3
+.byte	102,15,56,221,209
+	movups	%xmm2,(%edi)
+	jmp	.L013ecb_ret
+.align	16
+.L019ecb_enc_two:
+	call	_aesni_encrypt2
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	jmp	.L013ecb_ret
+.align	16
+.L020ecb_enc_three:
+	call	_aesni_encrypt3
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	jmp	.L013ecb_ret
+.align	16
+.L021ecb_enc_four:
+	call	_aesni_encrypt4
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	jmp	.L013ecb_ret
+.align	16
+.L014ecb_decrypt:
+	movl	%edx,%ebp
+	movl	%ecx,%ebx
+	cmpl	$96,%eax
+	jb	.L023ecb_dec_tail
+	movdqu	(%esi),%xmm2
+	movdqu	16(%esi),%xmm3
+	movdqu	32(%esi),%xmm4
+	movdqu	48(%esi),%xmm5
+	movdqu	64(%esi),%xmm6
+	movdqu	80(%esi),%xmm7
+	leal	96(%esi),%esi
+	subl	$96,%eax
+	jmp	.L024ecb_dec_loop6_enter
+.align	16
+.L025ecb_dec_loop6:
+	movups	%xmm2,(%edi)
+	movdqu	(%esi),%xmm2
+	movups	%xmm3,16(%edi)
+	movdqu	16(%esi),%xmm3
+	movups	%xmm4,32(%edi)
+	movdqu	32(%esi),%xmm4
+	movups	%xmm5,48(%edi)
+	movdqu	48(%esi),%xmm5
+	movups	%xmm6,64(%edi)
+	movdqu	64(%esi),%xmm6
+	movups	%xmm7,80(%edi)
+	leal	96(%edi),%edi
+	movdqu	80(%esi),%xmm7
+	leal	96(%esi),%esi
+.L024ecb_dec_loop6_enter:
+	call	_aesni_decrypt6
+	movl	%ebp,%edx
+	movl	%ebx,%ecx
+	subl	$96,%eax
+	jnc	.L025ecb_dec_loop6
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	movups	%xmm6,64(%edi)
+	movups	%xmm7,80(%edi)
+	leal	96(%edi),%edi
+	addl	$96,%eax
+	jz	.L013ecb_ret
+.L023ecb_dec_tail:
+	movups	(%esi),%xmm2
+	cmpl	$32,%eax
+	jb	.L026ecb_dec_one
+	movups	16(%esi),%xmm3
+	je	.L027ecb_dec_two
+	movups	32(%esi),%xmm4
+	cmpl	$64,%eax
+	jb	.L028ecb_dec_three
+	movups	48(%esi),%xmm5
+	je	.L029ecb_dec_four
+	movups	64(%esi),%xmm6
+	xorps	%xmm7,%xmm7
+	call	_aesni_decrypt6
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	movups	%xmm6,64(%edi)
+	jmp	.L013ecb_ret
+.align	16
+.L026ecb_dec_one:
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L030dec1_loop_4:
+.byte	102,15,56,222,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L030dec1_loop_4
+.byte	102,15,56,223,209
+	movups	%xmm2,(%edi)
+	jmp	.L013ecb_ret
+.align	16
+.L027ecb_dec_two:
+	call	_aesni_decrypt2
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	jmp	.L013ecb_ret
+.align	16
+.L028ecb_dec_three:
+	call	_aesni_decrypt3
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	jmp	.L013ecb_ret
+.align	16
+.L029ecb_dec_four:
+	call	_aesni_decrypt4
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+.L013ecb_ret:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	aes_hw_ecb_encrypt,.-.L_aes_hw_ecb_encrypt_begin
+.globl	aes_hw_ccm64_encrypt_blocks
+.hidden	aes_hw_ccm64_encrypt_blocks
+.type	aes_hw_ccm64_encrypt_blocks,@function
+.align	16
+aes_hw_ccm64_encrypt_blocks:
+.L_aes_hw_ccm64_encrypt_blocks_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	32(%esp),%edx
+	movl	36(%esp),%ebx
+	movl	40(%esp),%ecx
+	movl	%esp,%ebp
+	subl	$60,%esp
+	andl	$-16,%esp
+	movl	%ebp,48(%esp)
+	movdqu	(%ebx),%xmm7
+	movdqu	(%ecx),%xmm3
+	movl	240(%edx),%ecx
+	movl	$202182159,(%esp)
+	movl	$134810123,4(%esp)
+	movl	$67438087,8(%esp)
+	movl	$66051,12(%esp)
+	movl	$1,%ebx
+	xorl	%ebp,%ebp
+	movl	%ebx,16(%esp)
+	movl	%ebp,20(%esp)
+	movl	%ebp,24(%esp)
+	movl	%ebp,28(%esp)
+	shll	$4,%ecx
+	movl	$16,%ebx
+	leal	(%edx),%ebp
+	movdqa	(%esp),%xmm5
+	movdqa	%xmm7,%xmm2
+	leal	32(%edx,%ecx,1),%edx
+	subl	%ecx,%ebx
+.byte	102,15,56,0,253
+.L031ccm64_enc_outer:
+	movups	(%ebp),%xmm0
+	movl	%ebx,%ecx
+	movups	(%esi),%xmm6
+	xorps	%xmm0,%xmm2
+	movups	16(%ebp),%xmm1
+	xorps	%xmm6,%xmm0
+	xorps	%xmm0,%xmm3
+	movups	32(%ebp),%xmm0
+.L032ccm64_enc2_loop:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L032ccm64_enc2_loop
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	paddq	16(%esp),%xmm7
+	decl	%eax
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+	leal	16(%esi),%esi
+	xorps	%xmm2,%xmm6
+	movdqa	%xmm7,%xmm2
+	movups	%xmm6,(%edi)
+.byte	102,15,56,0,213
+	leal	16(%edi),%edi
+	jnz	.L031ccm64_enc_outer
+	movl	48(%esp),%esp
+	movl	40(%esp),%edi
+	movups	%xmm3,(%edi)
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	aes_hw_ccm64_encrypt_blocks,.-.L_aes_hw_ccm64_encrypt_blocks_begin
+.globl	aes_hw_ccm64_decrypt_blocks
+.hidden	aes_hw_ccm64_decrypt_blocks
+.type	aes_hw_ccm64_decrypt_blocks,@function
+.align	16
+aes_hw_ccm64_decrypt_blocks:
+.L_aes_hw_ccm64_decrypt_blocks_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	32(%esp),%edx
+	movl	36(%esp),%ebx
+	movl	40(%esp),%ecx
+	movl	%esp,%ebp
+	subl	$60,%esp
+	andl	$-16,%esp
+	movl	%ebp,48(%esp)
+	movdqu	(%ebx),%xmm7
+	movdqu	(%ecx),%xmm3
+	movl	240(%edx),%ecx
+	movl	$202182159,(%esp)
+	movl	$134810123,4(%esp)
+	movl	$67438087,8(%esp)
+	movl	$66051,12(%esp)
+	movl	$1,%ebx
+	xorl	%ebp,%ebp
+	movl	%ebx,16(%esp)
+	movl	%ebp,20(%esp)
+	movl	%ebp,24(%esp)
+	movl	%ebp,28(%esp)
+	movdqa	(%esp),%xmm5
+	movdqa	%xmm7,%xmm2
+	movl	%edx,%ebp
+	movl	%ecx,%ebx
+.byte	102,15,56,0,253
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L033enc1_loop_5:
+.byte	102,15,56,220,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L033enc1_loop_5
+.byte	102,15,56,221,209
+	shll	$4,%ebx
+	movl	$16,%ecx
+	movups	(%esi),%xmm6
+	paddq	16(%esp),%xmm7
+	leal	16(%esi),%esi
+	subl	%ebx,%ecx
+	leal	32(%ebp,%ebx,1),%edx
+	movl	%ecx,%ebx
+	jmp	.L034ccm64_dec_outer
+.align	16
+.L034ccm64_dec_outer:
+	xorps	%xmm2,%xmm6
+	movdqa	%xmm7,%xmm2
+	movups	%xmm6,(%edi)
+	leal	16(%edi),%edi
+.byte	102,15,56,0,213
+	subl	$1,%eax
+	jz	.L035ccm64_dec_break
+	movups	(%ebp),%xmm0
+	movl	%ebx,%ecx
+	movups	16(%ebp),%xmm1
+	xorps	%xmm0,%xmm6
+	xorps	%xmm0,%xmm2
+	xorps	%xmm6,%xmm3
+	movups	32(%ebp),%xmm0
+.L036ccm64_dec2_loop:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L036ccm64_dec2_loop
+	movups	(%esi),%xmm6
+	paddq	16(%esp),%xmm7
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+	leal	16(%esi),%esi
+	jmp	.L034ccm64_dec_outer
+.align	16
+.L035ccm64_dec_break:
+	movl	240(%ebp),%ecx
+	movl	%ebp,%edx
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm6
+	leal	32(%edx),%edx
+	xorps	%xmm6,%xmm3
+.L037enc1_loop_6:
+.byte	102,15,56,220,217
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L037enc1_loop_6
+.byte	102,15,56,221,217
+	movl	48(%esp),%esp
+	movl	40(%esp),%edi
+	movups	%xmm3,(%edi)
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	aes_hw_ccm64_decrypt_blocks,.-.L_aes_hw_ccm64_decrypt_blocks_begin
+.globl	aes_hw_ctr32_encrypt_blocks
+.hidden	aes_hw_ctr32_encrypt_blocks
+.type	aes_hw_ctr32_encrypt_blocks,@function
+.align	16
+aes_hw_ctr32_encrypt_blocks:
+.L_aes_hw_ctr32_encrypt_blocks_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+#ifdef BORINGSSL_DISPATCH_TEST
+	pushl	%ebx
+	pushl	%edx
+	call	.L038pic
+.L038pic:
+	popl	%ebx
+	leal	BORINGSSL_function_hit+0-.L038pic(%ebx),%ebx
+	movl	$1,%edx
+	movb	%dl,(%ebx)
+	popl	%edx
+	popl	%ebx
+#endif
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	32(%esp),%edx
+	movl	36(%esp),%ebx
+	movl	%esp,%ebp
+	subl	$88,%esp
+	andl	$-16,%esp
+	movl	%ebp,80(%esp)
+	cmpl	$1,%eax
+	je	.L039ctr32_one_shortcut
+	movdqu	(%ebx),%xmm7
+	movl	$202182159,(%esp)
+	movl	$134810123,4(%esp)
+	movl	$67438087,8(%esp)
+	movl	$66051,12(%esp)
+	movl	$6,%ecx
+	xorl	%ebp,%ebp
+	movl	%ecx,16(%esp)
+	movl	%ecx,20(%esp)
+	movl	%ecx,24(%esp)
+	movl	%ebp,28(%esp)
+.byte	102,15,58,22,251,3
+.byte	102,15,58,34,253,3
+	movl	240(%edx),%ecx
+	bswap	%ebx
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	movdqa	(%esp),%xmm2
+.byte	102,15,58,34,195,0
+	leal	3(%ebx),%ebp
+.byte	102,15,58,34,205,0
+	incl	%ebx
+.byte	102,15,58,34,195,1
+	incl	%ebp
+.byte	102,15,58,34,205,1
+	incl	%ebx
+.byte	102,15,58,34,195,2
+	incl	%ebp
+.byte	102,15,58,34,205,2
+	movdqa	%xmm0,48(%esp)
+.byte	102,15,56,0,194
+	movdqu	(%edx),%xmm6
+	movdqa	%xmm1,64(%esp)
+.byte	102,15,56,0,202
+	pshufd	$192,%xmm0,%xmm2
+	pshufd	$128,%xmm0,%xmm3
+	cmpl	$6,%eax
+	jb	.L040ctr32_tail
+	pxor	%xmm6,%xmm7
+	shll	$4,%ecx
+	movl	$16,%ebx
+	movdqa	%xmm7,32(%esp)
+	movl	%edx,%ebp
+	subl	%ecx,%ebx
+	leal	32(%edx,%ecx,1),%edx
+	subl	$6,%eax
+	jmp	.L041ctr32_loop6
+.align	16
+.L041ctr32_loop6:
+	pshufd	$64,%xmm0,%xmm4
+	movdqa	32(%esp),%xmm0
+	pshufd	$192,%xmm1,%xmm5
+	pxor	%xmm0,%xmm2
+	pshufd	$128,%xmm1,%xmm6
+	pxor	%xmm0,%xmm3
+	pshufd	$64,%xmm1,%xmm7
+	movups	16(%ebp),%xmm1
+	pxor	%xmm0,%xmm4
+	pxor	%xmm0,%xmm5
+.byte	102,15,56,220,209
+	pxor	%xmm0,%xmm6
+	pxor	%xmm0,%xmm7
+.byte	102,15,56,220,217
+	movups	32(%ebp),%xmm0
+	movl	%ebx,%ecx
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+	call	.L_aesni_encrypt6_enter
+	movups	(%esi),%xmm1
+	movups	16(%esi),%xmm0
+	xorps	%xmm1,%xmm2
+	movups	32(%esi),%xmm1
+	xorps	%xmm0,%xmm3
+	movups	%xmm2,(%edi)
+	movdqa	16(%esp),%xmm0
+	xorps	%xmm1,%xmm4
+	movdqa	64(%esp),%xmm1
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	paddd	%xmm0,%xmm1
+	paddd	48(%esp),%xmm0
+	movdqa	(%esp),%xmm2
+	movups	48(%esi),%xmm3
+	movups	64(%esi),%xmm4
+	xorps	%xmm3,%xmm5
+	movups	80(%esi),%xmm3
+	leal	96(%esi),%esi
+	movdqa	%xmm0,48(%esp)
+.byte	102,15,56,0,194
+	xorps	%xmm4,%xmm6
+	movups	%xmm5,48(%edi)
+	xorps	%xmm3,%xmm7
+	movdqa	%xmm1,64(%esp)
+.byte	102,15,56,0,202
+	movups	%xmm6,64(%edi)
+	pshufd	$192,%xmm0,%xmm2
+	movups	%xmm7,80(%edi)
+	leal	96(%edi),%edi
+	pshufd	$128,%xmm0,%xmm3
+	subl	$6,%eax
+	jnc	.L041ctr32_loop6
+	addl	$6,%eax
+	jz	.L042ctr32_ret
+	movdqu	(%ebp),%xmm7
+	movl	%ebp,%edx
+	pxor	32(%esp),%xmm7
+	movl	240(%ebp),%ecx
+.L040ctr32_tail:
+	por	%xmm7,%xmm2
+	cmpl	$2,%eax
+	jb	.L043ctr32_one
+	pshufd	$64,%xmm0,%xmm4
+	por	%xmm7,%xmm3
+	je	.L044ctr32_two
+	pshufd	$192,%xmm1,%xmm5
+	por	%xmm7,%xmm4
+	cmpl	$4,%eax
+	jb	.L045ctr32_three
+	pshufd	$128,%xmm1,%xmm6
+	por	%xmm7,%xmm5
+	je	.L046ctr32_four
+	por	%xmm7,%xmm6
+	call	_aesni_encrypt6
+	movups	(%esi),%xmm1
+	movups	16(%esi),%xmm0
+	xorps	%xmm1,%xmm2
+	movups	32(%esi),%xmm1
+	xorps	%xmm0,%xmm3
+	movups	48(%esi),%xmm0
+	xorps	%xmm1,%xmm4
+	movups	64(%esi),%xmm1
+	xorps	%xmm0,%xmm5
+	movups	%xmm2,(%edi)
+	xorps	%xmm1,%xmm6
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	movups	%xmm6,64(%edi)
+	jmp	.L042ctr32_ret
+.align	16
+.L039ctr32_one_shortcut:
+	movups	(%ebx),%xmm2
+	movl	240(%edx),%ecx
+.L043ctr32_one:
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L047enc1_loop_7:
+.byte	102,15,56,220,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L047enc1_loop_7
+.byte	102,15,56,221,209
+	movups	(%esi),%xmm6
+	xorps	%xmm2,%xmm6
+	movups	%xmm6,(%edi)
+	jmp	.L042ctr32_ret
+.align	16
+.L044ctr32_two:
+	call	_aesni_encrypt2
+	movups	(%esi),%xmm5
+	movups	16(%esi),%xmm6
+	xorps	%xmm5,%xmm2
+	xorps	%xmm6,%xmm3
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	jmp	.L042ctr32_ret
+.align	16
+.L045ctr32_three:
+	call	_aesni_encrypt3
+	movups	(%esi),%xmm5
+	movups	16(%esi),%xmm6
+	xorps	%xmm5,%xmm2
+	movups	32(%esi),%xmm7
+	xorps	%xmm6,%xmm3
+	movups	%xmm2,(%edi)
+	xorps	%xmm7,%xmm4
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	jmp	.L042ctr32_ret
+.align	16
+.L046ctr32_four:
+	call	_aesni_encrypt4
+	movups	(%esi),%xmm6
+	movups	16(%esi),%xmm7
+	movups	32(%esi),%xmm1
+	xorps	%xmm6,%xmm2
+	movups	48(%esi),%xmm0
+	xorps	%xmm7,%xmm3
+	movups	%xmm2,(%edi)
+	xorps	%xmm1,%xmm4
+	movups	%xmm3,16(%edi)
+	xorps	%xmm0,%xmm5
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+.L042ctr32_ret:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	movdqa	%xmm0,32(%esp)
+	pxor	%xmm5,%xmm5
+	movdqa	%xmm0,48(%esp)
+	pxor	%xmm6,%xmm6
+	movdqa	%xmm0,64(%esp)
+	pxor	%xmm7,%xmm7
+	movl	80(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	aes_hw_ctr32_encrypt_blocks,.-.L_aes_hw_ctr32_encrypt_blocks_begin
+.globl	aes_hw_xts_encrypt
+.hidden	aes_hw_xts_encrypt
+.type	aes_hw_xts_encrypt,@function
+.align	16
+aes_hw_xts_encrypt:
+.L_aes_hw_xts_encrypt_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	36(%esp),%edx
+	movl	40(%esp),%esi
+	movl	240(%edx),%ecx
+	movups	(%esi),%xmm2
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L048enc1_loop_8:
+.byte	102,15,56,220,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L048enc1_loop_8
+.byte	102,15,56,221,209
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	32(%esp),%edx
+	movl	%esp,%ebp
+	subl	$120,%esp
+	movl	240(%edx),%ecx
+	andl	$-16,%esp
+	movl	$135,96(%esp)
+	movl	$0,100(%esp)
+	movl	$1,104(%esp)
+	movl	$0,108(%esp)
+	movl	%eax,112(%esp)
+	movl	%ebp,116(%esp)
+	movdqa	%xmm2,%xmm1
+	pxor	%xmm0,%xmm0
+	movdqa	96(%esp),%xmm3
+	pcmpgtd	%xmm1,%xmm0
+	andl	$-16,%eax
+	movl	%edx,%ebp
+	movl	%ecx,%ebx
+	subl	$96,%eax
+	jc	.L049xts_enc_short
+	shll	$4,%ecx
+	movl	$16,%ebx
+	subl	%ecx,%ebx
+	leal	32(%edx,%ecx,1),%edx
+	jmp	.L050xts_enc_loop6
+.align	16
+.L050xts_enc_loop6:
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,16(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,32(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,48(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	pshufd	$19,%xmm0,%xmm7
+	movdqa	%xmm1,64(%esp)
+	paddq	%xmm1,%xmm1
+	movups	(%ebp),%xmm0
+	pand	%xmm3,%xmm7
+	movups	(%esi),%xmm2
+	pxor	%xmm1,%xmm7
+	movl	%ebx,%ecx
+	movdqu	16(%esi),%xmm3
+	xorps	%xmm0,%xmm2
+	movdqu	32(%esi),%xmm4
+	pxor	%xmm0,%xmm3
+	movdqu	48(%esi),%xmm5
+	pxor	%xmm0,%xmm4
+	movdqu	64(%esi),%xmm6
+	pxor	%xmm0,%xmm5
+	movdqu	80(%esi),%xmm1
+	pxor	%xmm0,%xmm6
+	leal	96(%esi),%esi
+	pxor	(%esp),%xmm2
+	movdqa	%xmm7,80(%esp)
+	pxor	%xmm1,%xmm7
+	movups	16(%ebp),%xmm1
+	pxor	16(%esp),%xmm3
+	pxor	32(%esp),%xmm4
+.byte	102,15,56,220,209
+	pxor	48(%esp),%xmm5
+	pxor	64(%esp),%xmm6
+.byte	102,15,56,220,217
+	pxor	%xmm0,%xmm7
+	movups	32(%ebp),%xmm0
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+	call	.L_aesni_encrypt6_enter
+	movdqa	80(%esp),%xmm1
+	pxor	%xmm0,%xmm0
+	xorps	(%esp),%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	xorps	16(%esp),%xmm3
+	movups	%xmm2,(%edi)
+	xorps	32(%esp),%xmm4
+	movups	%xmm3,16(%edi)
+	xorps	48(%esp),%xmm5
+	movups	%xmm4,32(%edi)
+	xorps	64(%esp),%xmm6
+	movups	%xmm5,48(%edi)
+	xorps	%xmm1,%xmm7
+	movups	%xmm6,64(%edi)
+	pshufd	$19,%xmm0,%xmm2
+	movups	%xmm7,80(%edi)
+	leal	96(%edi),%edi
+	movdqa	96(%esp),%xmm3
+	pxor	%xmm0,%xmm0
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	subl	$96,%eax
+	jnc	.L050xts_enc_loop6
+	movl	240(%ebp),%ecx
+	movl	%ebp,%edx
+	movl	%ecx,%ebx
+.L049xts_enc_short:
+	addl	$96,%eax
+	jz	.L051xts_enc_done6x
+	movdqa	%xmm1,%xmm5
+	cmpl	$32,%eax
+	jb	.L052xts_enc_one
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	je	.L053xts_enc_two
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,%xmm6
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	cmpl	$64,%eax
+	jb	.L054xts_enc_three
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,%xmm7
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm5,(%esp)
+	movdqa	%xmm6,16(%esp)
+	je	.L055xts_enc_four
+	movdqa	%xmm7,32(%esp)
+	pshufd	$19,%xmm0,%xmm7
+	movdqa	%xmm1,48(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm7
+	pxor	%xmm1,%xmm7
+	movdqu	(%esi),%xmm2
+	movdqu	16(%esi),%xmm3
+	movdqu	32(%esi),%xmm4
+	pxor	(%esp),%xmm2
+	movdqu	48(%esi),%xmm5
+	pxor	16(%esp),%xmm3
+	movdqu	64(%esi),%xmm6
+	pxor	32(%esp),%xmm4
+	leal	80(%esi),%esi
+	pxor	48(%esp),%xmm5
+	movdqa	%xmm7,64(%esp)
+	pxor	%xmm7,%xmm6
+	call	_aesni_encrypt6
+	movaps	64(%esp),%xmm1
+	xorps	(%esp),%xmm2
+	xorps	16(%esp),%xmm3
+	xorps	32(%esp),%xmm4
+	movups	%xmm2,(%edi)
+	xorps	48(%esp),%xmm5
+	movups	%xmm3,16(%edi)
+	xorps	%xmm1,%xmm6
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	movups	%xmm6,64(%edi)
+	leal	80(%edi),%edi
+	jmp	.L056xts_enc_done
+.align	16
+.L052xts_enc_one:
+	movups	(%esi),%xmm2
+	leal	16(%esi),%esi
+	xorps	%xmm5,%xmm2
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L057enc1_loop_9:
+.byte	102,15,56,220,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L057enc1_loop_9
+.byte	102,15,56,221,209
+	xorps	%xmm5,%xmm2
+	movups	%xmm2,(%edi)
+	leal	16(%edi),%edi
+	movdqa	%xmm5,%xmm1
+	jmp	.L056xts_enc_done
+.align	16
+.L053xts_enc_two:
+	movaps	%xmm1,%xmm6
+	movups	(%esi),%xmm2
+	movups	16(%esi),%xmm3
+	leal	32(%esi),%esi
+	xorps	%xmm5,%xmm2
+	xorps	%xmm6,%xmm3
+	call	_aesni_encrypt2
+	xorps	%xmm5,%xmm2
+	xorps	%xmm6,%xmm3
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	leal	32(%edi),%edi
+	movdqa	%xmm6,%xmm1
+	jmp	.L056xts_enc_done
+.align	16
+.L054xts_enc_three:
+	movaps	%xmm1,%xmm7
+	movups	(%esi),%xmm2
+	movups	16(%esi),%xmm3
+	movups	32(%esi),%xmm4
+	leal	48(%esi),%esi
+	xorps	%xmm5,%xmm2
+	xorps	%xmm6,%xmm3
+	xorps	%xmm7,%xmm4
+	call	_aesni_encrypt3
+	xorps	%xmm5,%xmm2
+	xorps	%xmm6,%xmm3
+	xorps	%xmm7,%xmm4
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	leal	48(%edi),%edi
+	movdqa	%xmm7,%xmm1
+	jmp	.L056xts_enc_done
+.align	16
+.L055xts_enc_four:
+	movaps	%xmm1,%xmm6
+	movups	(%esi),%xmm2
+	movups	16(%esi),%xmm3
+	movups	32(%esi),%xmm4
+	xorps	(%esp),%xmm2
+	movups	48(%esi),%xmm5
+	leal	64(%esi),%esi
+	xorps	16(%esp),%xmm3
+	xorps	%xmm7,%xmm4
+	xorps	%xmm6,%xmm5
+	call	_aesni_encrypt4
+	xorps	(%esp),%xmm2
+	xorps	16(%esp),%xmm3
+	xorps	%xmm7,%xmm4
+	movups	%xmm2,(%edi)
+	xorps	%xmm6,%xmm5
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	leal	64(%edi),%edi
+	movdqa	%xmm6,%xmm1
+	jmp	.L056xts_enc_done
+.align	16
+.L051xts_enc_done6x:
+	movl	112(%esp),%eax
+	andl	$15,%eax
+	jz	.L058xts_enc_ret
+	movdqa	%xmm1,%xmm5
+	movl	%eax,112(%esp)
+	jmp	.L059xts_enc_steal
+.align	16
+.L056xts_enc_done:
+	movl	112(%esp),%eax
+	pxor	%xmm0,%xmm0
+	andl	$15,%eax
+	jz	.L058xts_enc_ret
+	pcmpgtd	%xmm1,%xmm0
+	movl	%eax,112(%esp)
+	pshufd	$19,%xmm0,%xmm5
+	paddq	%xmm1,%xmm1
+	pand	96(%esp),%xmm5
+	pxor	%xmm1,%xmm5
+.L059xts_enc_steal:
+	movzbl	(%esi),%ecx
+	movzbl	-16(%edi),%edx
+	leal	1(%esi),%esi
+	movb	%cl,-16(%edi)
+	movb	%dl,(%edi)
+	leal	1(%edi),%edi
+	subl	$1,%eax
+	jnz	.L059xts_enc_steal
+	subl	112(%esp),%edi
+	movl	%ebp,%edx
+	movl	%ebx,%ecx
+	movups	-16(%edi),%xmm2
+	xorps	%xmm5,%xmm2
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L060enc1_loop_10:
+.byte	102,15,56,220,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L060enc1_loop_10
+.byte	102,15,56,221,209
+	xorps	%xmm5,%xmm2
+	movups	%xmm2,-16(%edi)
+.L058xts_enc_ret:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	movdqa	%xmm0,(%esp)
+	pxor	%xmm3,%xmm3
+	movdqa	%xmm0,16(%esp)
+	pxor	%xmm4,%xmm4
+	movdqa	%xmm0,32(%esp)
+	pxor	%xmm5,%xmm5
+	movdqa	%xmm0,48(%esp)
+	pxor	%xmm6,%xmm6
+	movdqa	%xmm0,64(%esp)
+	pxor	%xmm7,%xmm7
+	movdqa	%xmm0,80(%esp)
+	movl	116(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	aes_hw_xts_encrypt,.-.L_aes_hw_xts_encrypt_begin
+.globl	aes_hw_xts_decrypt
+.hidden	aes_hw_xts_decrypt
+.type	aes_hw_xts_decrypt,@function
+.align	16
+aes_hw_xts_decrypt:
+.L_aes_hw_xts_decrypt_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	36(%esp),%edx
+	movl	40(%esp),%esi
+	movl	240(%edx),%ecx
+	movups	(%esi),%xmm2
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L061enc1_loop_11:
+.byte	102,15,56,220,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L061enc1_loop_11
+.byte	102,15,56,221,209
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	32(%esp),%edx
+	movl	%esp,%ebp
+	subl	$120,%esp
+	andl	$-16,%esp
+	xorl	%ebx,%ebx
+	testl	$15,%eax
+	setnz	%bl
+	shll	$4,%ebx
+	subl	%ebx,%eax
+	movl	$135,96(%esp)
+	movl	$0,100(%esp)
+	movl	$1,104(%esp)
+	movl	$0,108(%esp)
+	movl	%eax,112(%esp)
+	movl	%ebp,116(%esp)
+	movl	240(%edx),%ecx
+	movl	%edx,%ebp
+	movl	%ecx,%ebx
+	movdqa	%xmm2,%xmm1
+	pxor	%xmm0,%xmm0
+	movdqa	96(%esp),%xmm3
+	pcmpgtd	%xmm1,%xmm0
+	andl	$-16,%eax
+	subl	$96,%eax
+	jc	.L062xts_dec_short
+	shll	$4,%ecx
+	movl	$16,%ebx
+	subl	%ecx,%ebx
+	leal	32(%edx,%ecx,1),%edx
+	jmp	.L063xts_dec_loop6
+.align	16
+.L063xts_dec_loop6:
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,16(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,32(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,48(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	pshufd	$19,%xmm0,%xmm7
+	movdqa	%xmm1,64(%esp)
+	paddq	%xmm1,%xmm1
+	movups	(%ebp),%xmm0
+	pand	%xmm3,%xmm7
+	movups	(%esi),%xmm2
+	pxor	%xmm1,%xmm7
+	movl	%ebx,%ecx
+	movdqu	16(%esi),%xmm3
+	xorps	%xmm0,%xmm2
+	movdqu	32(%esi),%xmm4
+	pxor	%xmm0,%xmm3
+	movdqu	48(%esi),%xmm5
+	pxor	%xmm0,%xmm4
+	movdqu	64(%esi),%xmm6
+	pxor	%xmm0,%xmm5
+	movdqu	80(%esi),%xmm1
+	pxor	%xmm0,%xmm6
+	leal	96(%esi),%esi
+	pxor	(%esp),%xmm2
+	movdqa	%xmm7,80(%esp)
+	pxor	%xmm1,%xmm7
+	movups	16(%ebp),%xmm1
+	pxor	16(%esp),%xmm3
+	pxor	32(%esp),%xmm4
+.byte	102,15,56,222,209
+	pxor	48(%esp),%xmm5
+	pxor	64(%esp),%xmm6
+.byte	102,15,56,222,217
+	pxor	%xmm0,%xmm7
+	movups	32(%ebp),%xmm0
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+	call	.L_aesni_decrypt6_enter
+	movdqa	80(%esp),%xmm1
+	pxor	%xmm0,%xmm0
+	xorps	(%esp),%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	xorps	16(%esp),%xmm3
+	movups	%xmm2,(%edi)
+	xorps	32(%esp),%xmm4
+	movups	%xmm3,16(%edi)
+	xorps	48(%esp),%xmm5
+	movups	%xmm4,32(%edi)
+	xorps	64(%esp),%xmm6
+	movups	%xmm5,48(%edi)
+	xorps	%xmm1,%xmm7
+	movups	%xmm6,64(%edi)
+	pshufd	$19,%xmm0,%xmm2
+	movups	%xmm7,80(%edi)
+	leal	96(%edi),%edi
+	movdqa	96(%esp),%xmm3
+	pxor	%xmm0,%xmm0
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	subl	$96,%eax
+	jnc	.L063xts_dec_loop6
+	movl	240(%ebp),%ecx
+	movl	%ebp,%edx
+	movl	%ecx,%ebx
+.L062xts_dec_short:
+	addl	$96,%eax
+	jz	.L064xts_dec_done6x
+	movdqa	%xmm1,%xmm5
+	cmpl	$32,%eax
+	jb	.L065xts_dec_one
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	je	.L066xts_dec_two
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,%xmm6
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	cmpl	$64,%eax
+	jb	.L067xts_dec_three
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,%xmm7
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm5,(%esp)
+	movdqa	%xmm6,16(%esp)
+	je	.L068xts_dec_four
+	movdqa	%xmm7,32(%esp)
+	pshufd	$19,%xmm0,%xmm7
+	movdqa	%xmm1,48(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm7
+	pxor	%xmm1,%xmm7
+	movdqu	(%esi),%xmm2
+	movdqu	16(%esi),%xmm3
+	movdqu	32(%esi),%xmm4
+	pxor	(%esp),%xmm2
+	movdqu	48(%esi),%xmm5
+	pxor	16(%esp),%xmm3
+	movdqu	64(%esi),%xmm6
+	pxor	32(%esp),%xmm4
+	leal	80(%esi),%esi
+	pxor	48(%esp),%xmm5
+	movdqa	%xmm7,64(%esp)
+	pxor	%xmm7,%xmm6
+	call	_aesni_decrypt6
+	movaps	64(%esp),%xmm1
+	xorps	(%esp),%xmm2
+	xorps	16(%esp),%xmm3
+	xorps	32(%esp),%xmm4
+	movups	%xmm2,(%edi)
+	xorps	48(%esp),%xmm5
+	movups	%xmm3,16(%edi)
+	xorps	%xmm1,%xmm6
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	movups	%xmm6,64(%edi)
+	leal	80(%edi),%edi
+	jmp	.L069xts_dec_done
+.align	16
+.L065xts_dec_one:
+	movups	(%esi),%xmm2
+	leal	16(%esi),%esi
+	xorps	%xmm5,%xmm2
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L070dec1_loop_12:
+.byte	102,15,56,222,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L070dec1_loop_12
+.byte	102,15,56,223,209
+	xorps	%xmm5,%xmm2
+	movups	%xmm2,(%edi)
+	leal	16(%edi),%edi
+	movdqa	%xmm5,%xmm1
+	jmp	.L069xts_dec_done
+.align	16
+.L066xts_dec_two:
+	movaps	%xmm1,%xmm6
+	movups	(%esi),%xmm2
+	movups	16(%esi),%xmm3
+	leal	32(%esi),%esi
+	xorps	%xmm5,%xmm2
+	xorps	%xmm6,%xmm3
+	call	_aesni_decrypt2
+	xorps	%xmm5,%xmm2
+	xorps	%xmm6,%xmm3
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	leal	32(%edi),%edi
+	movdqa	%xmm6,%xmm1
+	jmp	.L069xts_dec_done
+.align	16
+.L067xts_dec_three:
+	movaps	%xmm1,%xmm7
+	movups	(%esi),%xmm2
+	movups	16(%esi),%xmm3
+	movups	32(%esi),%xmm4
+	leal	48(%esi),%esi
+	xorps	%xmm5,%xmm2
+	xorps	%xmm6,%xmm3
+	xorps	%xmm7,%xmm4
+	call	_aesni_decrypt3
+	xorps	%xmm5,%xmm2
+	xorps	%xmm6,%xmm3
+	xorps	%xmm7,%xmm4
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	leal	48(%edi),%edi
+	movdqa	%xmm7,%xmm1
+	jmp	.L069xts_dec_done
+.align	16
+.L068xts_dec_four:
+	movaps	%xmm1,%xmm6
+	movups	(%esi),%xmm2
+	movups	16(%esi),%xmm3
+	movups	32(%esi),%xmm4
+	xorps	(%esp),%xmm2
+	movups	48(%esi),%xmm5
+	leal	64(%esi),%esi
+	xorps	16(%esp),%xmm3
+	xorps	%xmm7,%xmm4
+	xorps	%xmm6,%xmm5
+	call	_aesni_decrypt4
+	xorps	(%esp),%xmm2
+	xorps	16(%esp),%xmm3
+	xorps	%xmm7,%xmm4
+	movups	%xmm2,(%edi)
+	xorps	%xmm6,%xmm5
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	leal	64(%edi),%edi
+	movdqa	%xmm6,%xmm1
+	jmp	.L069xts_dec_done
+.align	16
+.L064xts_dec_done6x:
+	movl	112(%esp),%eax
+	andl	$15,%eax
+	jz	.L071xts_dec_ret
+	movl	%eax,112(%esp)
+	jmp	.L072xts_dec_only_one_more
+.align	16
+.L069xts_dec_done:
+	movl	112(%esp),%eax
+	pxor	%xmm0,%xmm0
+	andl	$15,%eax
+	jz	.L071xts_dec_ret
+	pcmpgtd	%xmm1,%xmm0
+	movl	%eax,112(%esp)
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	96(%esp),%xmm3
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+.L072xts_dec_only_one_more:
+	pshufd	$19,%xmm0,%xmm5
+	movdqa	%xmm1,%xmm6
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm5
+	pxor	%xmm1,%xmm5
+	movl	%ebp,%edx
+	movl	%ebx,%ecx
+	movups	(%esi),%xmm2
+	xorps	%xmm5,%xmm2
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L073dec1_loop_13:
+.byte	102,15,56,222,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L073dec1_loop_13
+.byte	102,15,56,223,209
+	xorps	%xmm5,%xmm2
+	movups	%xmm2,(%edi)
+.L074xts_dec_steal:
+	movzbl	16(%esi),%ecx
+	movzbl	(%edi),%edx
+	leal	1(%esi),%esi
+	movb	%cl,(%edi)
+	movb	%dl,16(%edi)
+	leal	1(%edi),%edi
+	subl	$1,%eax
+	jnz	.L074xts_dec_steal
+	subl	112(%esp),%edi
+	movl	%ebp,%edx
+	movl	%ebx,%ecx
+	movups	(%edi),%xmm2
+	xorps	%xmm6,%xmm2
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L075dec1_loop_14:
+.byte	102,15,56,222,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L075dec1_loop_14
+.byte	102,15,56,223,209
+	xorps	%xmm6,%xmm2
+	movups	%xmm2,(%edi)
+.L071xts_dec_ret:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	movdqa	%xmm0,(%esp)
+	pxor	%xmm3,%xmm3
+	movdqa	%xmm0,16(%esp)
+	pxor	%xmm4,%xmm4
+	movdqa	%xmm0,32(%esp)
+	pxor	%xmm5,%xmm5
+	movdqa	%xmm0,48(%esp)
+	pxor	%xmm6,%xmm6
+	movdqa	%xmm0,64(%esp)
+	pxor	%xmm7,%xmm7
+	movdqa	%xmm0,80(%esp)
+	movl	116(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	aes_hw_xts_decrypt,.-.L_aes_hw_xts_decrypt_begin
+.globl	aes_hw_cbc_encrypt
+.hidden	aes_hw_cbc_encrypt
+.type	aes_hw_cbc_encrypt,@function
+.align	16
+aes_hw_cbc_encrypt:
+.L_aes_hw_cbc_encrypt_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	movl	%esp,%ebx
+	movl	24(%esp),%edi
+	subl	$24,%ebx
+	movl	28(%esp),%eax
+	andl	$-16,%ebx
+	movl	32(%esp),%edx
+	movl	36(%esp),%ebp
+	testl	%eax,%eax
+	jz	.L076cbc_abort
+	cmpl	$0,40(%esp)
+	xchgl	%esp,%ebx
+	movups	(%ebp),%xmm7
+	movl	240(%edx),%ecx
+	movl	%edx,%ebp
+	movl	%ebx,16(%esp)
+	movl	%ecx,%ebx
+	je	.L077cbc_decrypt
+	movaps	%xmm7,%xmm2
+	cmpl	$16,%eax
+	jb	.L078cbc_enc_tail
+	subl	$16,%eax
+	jmp	.L079cbc_enc_loop
+.align	16
+.L079cbc_enc_loop:
+	movups	(%esi),%xmm7
+	leal	16(%esi),%esi
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm7
+	leal	32(%edx),%edx
+	xorps	%xmm7,%xmm2
+.L080enc1_loop_15:
+.byte	102,15,56,220,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L080enc1_loop_15
+.byte	102,15,56,221,209
+	movl	%ebx,%ecx
+	movl	%ebp,%edx
+	movups	%xmm2,(%edi)
+	leal	16(%edi),%edi
+	subl	$16,%eax
+	jnc	.L079cbc_enc_loop
+	addl	$16,%eax
+	jnz	.L078cbc_enc_tail
+	movaps	%xmm2,%xmm7
+	pxor	%xmm2,%xmm2
+	jmp	.L081cbc_ret
+.L078cbc_enc_tail:
+	movl	%eax,%ecx
+.long	2767451785
+	movl	$16,%ecx
+	subl	%eax,%ecx
+	xorl	%eax,%eax
+.long	2868115081
+	leal	-16(%edi),%edi
+	movl	%ebx,%ecx
+	movl	%edi,%esi
+	movl	%ebp,%edx
+	jmp	.L079cbc_enc_loop
+.align	16
+.L077cbc_decrypt:
+	cmpl	$80,%eax
+	jbe	.L082cbc_dec_tail
+	movaps	%xmm7,(%esp)
+	subl	$80,%eax
+	jmp	.L083cbc_dec_loop6_enter
+.align	16
+.L084cbc_dec_loop6:
+	movaps	%xmm0,(%esp)
+	movups	%xmm7,(%edi)
+	leal	16(%edi),%edi
+.L083cbc_dec_loop6_enter:
+	movdqu	(%esi),%xmm2
+	movdqu	16(%esi),%xmm3
+	movdqu	32(%esi),%xmm4
+	movdqu	48(%esi),%xmm5
+	movdqu	64(%esi),%xmm6
+	movdqu	80(%esi),%xmm7
+	call	_aesni_decrypt6
+	movups	(%esi),%xmm1
+	movups	16(%esi),%xmm0
+	xorps	(%esp),%xmm2
+	xorps	%xmm1,%xmm3
+	movups	32(%esi),%xmm1
+	xorps	%xmm0,%xmm4
+	movups	48(%esi),%xmm0
+	xorps	%xmm1,%xmm5
+	movups	64(%esi),%xmm1
+	xorps	%xmm0,%xmm6
+	movups	80(%esi),%xmm0
+	xorps	%xmm1,%xmm7
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	leal	96(%esi),%esi
+	movups	%xmm4,32(%edi)
+	movl	%ebx,%ecx
+	movups	%xmm5,48(%edi)
+	movl	%ebp,%edx
+	movups	%xmm6,64(%edi)
+	leal	80(%edi),%edi
+	subl	$96,%eax
+	ja	.L084cbc_dec_loop6
+	movaps	%xmm7,%xmm2
+	movaps	%xmm0,%xmm7
+	addl	$80,%eax
+	jle	.L085cbc_dec_clear_tail_collected
+	movups	%xmm2,(%edi)
+	leal	16(%edi),%edi
+.L082cbc_dec_tail:
+	movups	(%esi),%xmm2
+	movaps	%xmm2,%xmm6
+	cmpl	$16,%eax
+	jbe	.L086cbc_dec_one
+	movups	16(%esi),%xmm3
+	movaps	%xmm3,%xmm5
+	cmpl	$32,%eax
+	jbe	.L087cbc_dec_two
+	movups	32(%esi),%xmm4
+	cmpl	$48,%eax
+	jbe	.L088cbc_dec_three
+	movups	48(%esi),%xmm5
+	cmpl	$64,%eax
+	jbe	.L089cbc_dec_four
+	movups	64(%esi),%xmm6
+	movaps	%xmm7,(%esp)
+	movups	(%esi),%xmm2
+	xorps	%xmm7,%xmm7
+	call	_aesni_decrypt6
+	movups	(%esi),%xmm1
+	movups	16(%esi),%xmm0
+	xorps	(%esp),%xmm2
+	xorps	%xmm1,%xmm3
+	movups	32(%esi),%xmm1
+	xorps	%xmm0,%xmm4
+	movups	48(%esi),%xmm0
+	xorps	%xmm1,%xmm5
+	movups	64(%esi),%xmm7
+	xorps	%xmm0,%xmm6
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	pxor	%xmm3,%xmm3
+	movups	%xmm4,32(%edi)
+	pxor	%xmm4,%xmm4
+	movups	%xmm5,48(%edi)
+	pxor	%xmm5,%xmm5
+	leal	64(%edi),%edi
+	movaps	%xmm6,%xmm2
+	pxor	%xmm6,%xmm6
+	subl	$80,%eax
+	jmp	.L090cbc_dec_tail_collected
+.align	16
+.L086cbc_dec_one:
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L091dec1_loop_16:
+.byte	102,15,56,222,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L091dec1_loop_16
+.byte	102,15,56,223,209
+	xorps	%xmm7,%xmm2
+	movaps	%xmm6,%xmm7
+	subl	$16,%eax
+	jmp	.L090cbc_dec_tail_collected
+.align	16
+.L087cbc_dec_two:
+	call	_aesni_decrypt2
+	xorps	%xmm7,%xmm2
+	xorps	%xmm6,%xmm3
+	movups	%xmm2,(%edi)
+	movaps	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	leal	16(%edi),%edi
+	movaps	%xmm5,%xmm7
+	subl	$32,%eax
+	jmp	.L090cbc_dec_tail_collected
+.align	16
+.L088cbc_dec_three:
+	call	_aesni_decrypt3
+	xorps	%xmm7,%xmm2
+	xorps	%xmm6,%xmm3
+	xorps	%xmm5,%xmm4
+	movups	%xmm2,(%edi)
+	movaps	%xmm4,%xmm2
+	pxor	%xmm4,%xmm4
+	movups	%xmm3,16(%edi)
+	pxor	%xmm3,%xmm3
+	leal	32(%edi),%edi
+	movups	32(%esi),%xmm7
+	subl	$48,%eax
+	jmp	.L090cbc_dec_tail_collected
+.align	16
+.L089cbc_dec_four:
+	call	_aesni_decrypt4
+	movups	16(%esi),%xmm1
+	movups	32(%esi),%xmm0
+	xorps	%xmm7,%xmm2
+	movups	48(%esi),%xmm7
+	xorps	%xmm6,%xmm3
+	movups	%xmm2,(%edi)
+	xorps	%xmm1,%xmm4
+	movups	%xmm3,16(%edi)
+	pxor	%xmm3,%xmm3
+	xorps	%xmm0,%xmm5
+	movups	%xmm4,32(%edi)
+	pxor	%xmm4,%xmm4
+	leal	48(%edi),%edi
+	movaps	%xmm5,%xmm2
+	pxor	%xmm5,%xmm5
+	subl	$64,%eax
+	jmp	.L090cbc_dec_tail_collected
+.align	16
+.L085cbc_dec_clear_tail_collected:
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+.L090cbc_dec_tail_collected:
+	andl	$15,%eax
+	jnz	.L092cbc_dec_tail_partial
+	movups	%xmm2,(%edi)
+	pxor	%xmm0,%xmm0
+	jmp	.L081cbc_ret
+.align	16
+.L092cbc_dec_tail_partial:
+	movaps	%xmm2,(%esp)
+	pxor	%xmm0,%xmm0
+	movl	$16,%ecx
+	movl	%esp,%esi
+	subl	%eax,%ecx
+.long	2767451785
+	movdqa	%xmm2,(%esp)
+.L081cbc_ret:
+	movl	16(%esp),%esp
+	movl	36(%esp),%ebp
+	pxor	%xmm2,%xmm2
+	pxor	%xmm1,%xmm1
+	movups	%xmm7,(%ebp)
+	pxor	%xmm7,%xmm7
+.L076cbc_abort:
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	aes_hw_cbc_encrypt,.-.L_aes_hw_cbc_encrypt_begin
+.hidden	_aesni_set_encrypt_key
+.type	_aesni_set_encrypt_key,@function
+.align	16
+_aesni_set_encrypt_key:
+	pushl	%ebp
+	pushl	%ebx
+	testl	%eax,%eax
+	jz	.L093bad_pointer
+	testl	%edx,%edx
+	jz	.L093bad_pointer
+	call	.L094pic
+.L094pic:
+	popl	%ebx
+	leal	.Lkey_const-.L094pic(%ebx),%ebx
+	leal	OPENSSL_ia32cap_P-.Lkey_const(%ebx),%ebp
+	movups	(%eax),%xmm0
+	xorps	%xmm4,%xmm4
+	movl	4(%ebp),%ebp
+	leal	16(%edx),%edx
+	andl	$268437504,%ebp
+	cmpl	$256,%ecx
+	je	.L09514rounds
+	cmpl	$192,%ecx
+	je	.L09612rounds
+	cmpl	$128,%ecx
+	jne	.L097bad_keybits
+.align	16
+.L09810rounds:
+	cmpl	$268435456,%ebp
+	je	.L09910rounds_alt
+	movl	$9,%ecx
+	movups	%xmm0,-16(%edx)
+.byte	102,15,58,223,200,1
+	call	.L100key_128_cold
+.byte	102,15,58,223,200,2
+	call	.L101key_128
+.byte	102,15,58,223,200,4
+	call	.L101key_128
+.byte	102,15,58,223,200,8
+	call	.L101key_128
+.byte	102,15,58,223,200,16
+	call	.L101key_128
+.byte	102,15,58,223,200,32
+	call	.L101key_128
+.byte	102,15,58,223,200,64
+	call	.L101key_128
+.byte	102,15,58,223,200,128
+	call	.L101key_128
+.byte	102,15,58,223,200,27
+	call	.L101key_128
+.byte	102,15,58,223,200,54
+	call	.L101key_128
+	movups	%xmm0,(%edx)
+	movl	%ecx,80(%edx)
+	jmp	.L102good_key
+.align	16
+.L101key_128:
+	movups	%xmm0,(%edx)
+	leal	16(%edx),%edx
+.L100key_128_cold:
+	shufps	$16,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$140,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$255,%xmm1,%xmm1
+	xorps	%xmm1,%xmm0
+	ret
+.align	16
+.L09910rounds_alt:
+	movdqa	(%ebx),%xmm5
+	movl	$8,%ecx
+	movdqa	32(%ebx),%xmm4
+	movdqa	%xmm0,%xmm2
+	movdqu	%xmm0,-16(%edx)
+.L103loop_key128:
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+	pslld	$1,%xmm4
+	leal	16(%edx),%edx
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,-16(%edx)
+	movdqa	%xmm0,%xmm2
+	decl	%ecx
+	jnz	.L103loop_key128
+	movdqa	48(%ebx),%xmm4
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+	pslld	$1,%xmm4
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,(%edx)
+	movdqa	%xmm0,%xmm2
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,16(%edx)
+	movl	$9,%ecx
+	movl	%ecx,96(%edx)
+	jmp	.L102good_key
+.align	16
+.L09612rounds:
+	movq	16(%eax),%xmm2
+	cmpl	$268435456,%ebp
+	je	.L10412rounds_alt
+	movl	$11,%ecx
+	movups	%xmm0,-16(%edx)
+.byte	102,15,58,223,202,1
+	call	.L105key_192a_cold
+.byte	102,15,58,223,202,2
+	call	.L106key_192b
+.byte	102,15,58,223,202,4
+	call	.L107key_192a
+.byte	102,15,58,223,202,8
+	call	.L106key_192b
+.byte	102,15,58,223,202,16
+	call	.L107key_192a
+.byte	102,15,58,223,202,32
+	call	.L106key_192b
+.byte	102,15,58,223,202,64
+	call	.L107key_192a
+.byte	102,15,58,223,202,128
+	call	.L106key_192b
+	movups	%xmm0,(%edx)
+	movl	%ecx,48(%edx)
+	jmp	.L102good_key
+.align	16
+.L107key_192a:
+	movups	%xmm0,(%edx)
+	leal	16(%edx),%edx
+.align	16
+.L105key_192a_cold:
+	movaps	%xmm2,%xmm5
+.L108key_192b_warm:
+	shufps	$16,%xmm0,%xmm4
+	movdqa	%xmm2,%xmm3
+	xorps	%xmm4,%xmm0
+	shufps	$140,%xmm0,%xmm4
+	pslldq	$4,%xmm3
+	xorps	%xmm4,%xmm0
+	pshufd	$85,%xmm1,%xmm1
+	pxor	%xmm3,%xmm2
+	pxor	%xmm1,%xmm0
+	pshufd	$255,%xmm0,%xmm3
+	pxor	%xmm3,%xmm2
+	ret
+.align	16
+.L106key_192b:
+	movaps	%xmm0,%xmm3
+	shufps	$68,%xmm0,%xmm5
+	movups	%xmm5,(%edx)
+	shufps	$78,%xmm2,%xmm3
+	movups	%xmm3,16(%edx)
+	leal	32(%edx),%edx
+	jmp	.L108key_192b_warm
+.align	16
+.L10412rounds_alt:
+	movdqa	16(%ebx),%xmm5
+	movdqa	32(%ebx),%xmm4
+	movl	$8,%ecx
+	movdqu	%xmm0,-16(%edx)
+.L109loop_key192:
+	movq	%xmm2,(%edx)
+	movdqa	%xmm2,%xmm1
+.byte	102,15,56,0,213
+.byte	102,15,56,221,212
+	pslld	$1,%xmm4
+	leal	24(%edx),%edx
+	movdqa	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm3,%xmm0
+	pshufd	$255,%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+	pxor	%xmm2,%xmm0
+	pxor	%xmm3,%xmm2
+	movdqu	%xmm0,-16(%edx)
+	decl	%ecx
+	jnz	.L109loop_key192
+	movl	$11,%ecx
+	movl	%ecx,32(%edx)
+	jmp	.L102good_key
+.align	16
+.L09514rounds:
+	movups	16(%eax),%xmm2
+	leal	16(%edx),%edx
+	cmpl	$268435456,%ebp
+	je	.L11014rounds_alt
+	movl	$13,%ecx
+	movups	%xmm0,-32(%edx)
+	movups	%xmm2,-16(%edx)
+.byte	102,15,58,223,202,1
+	call	.L111key_256a_cold
+.byte	102,15,58,223,200,1
+	call	.L112key_256b
+.byte	102,15,58,223,202,2
+	call	.L113key_256a
+.byte	102,15,58,223,200,2
+	call	.L112key_256b
+.byte	102,15,58,223,202,4
+	call	.L113key_256a
+.byte	102,15,58,223,200,4
+	call	.L112key_256b
+.byte	102,15,58,223,202,8
+	call	.L113key_256a
+.byte	102,15,58,223,200,8
+	call	.L112key_256b
+.byte	102,15,58,223,202,16
+	call	.L113key_256a
+.byte	102,15,58,223,200,16
+	call	.L112key_256b
+.byte	102,15,58,223,202,32
+	call	.L113key_256a
+.byte	102,15,58,223,200,32
+	call	.L112key_256b
+.byte	102,15,58,223,202,64
+	call	.L113key_256a
+	movups	%xmm0,(%edx)
+	movl	%ecx,16(%edx)
+	xorl	%eax,%eax
+	jmp	.L102good_key
+.align	16
+.L113key_256a:
+	movups	%xmm2,(%edx)
+	leal	16(%edx),%edx
+.L111key_256a_cold:
+	shufps	$16,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$140,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$255,%xmm1,%xmm1
+	xorps	%xmm1,%xmm0
+	ret
+.align	16
+.L112key_256b:
+	movups	%xmm0,(%edx)
+	leal	16(%edx),%edx
+	shufps	$16,%xmm2,%xmm4
+	xorps	%xmm4,%xmm2
+	shufps	$140,%xmm2,%xmm4
+	xorps	%xmm4,%xmm2
+	shufps	$170,%xmm1,%xmm1
+	xorps	%xmm1,%xmm2
+	ret
+.align	16
+.L11014rounds_alt:
+	movdqa	(%ebx),%xmm5
+	movdqa	32(%ebx),%xmm4
+	movl	$7,%ecx
+	movdqu	%xmm0,-32(%edx)
+	movdqa	%xmm2,%xmm1
+	movdqu	%xmm2,-16(%edx)
+.L114loop_key256:
+.byte	102,15,56,0,213
+.byte	102,15,56,221,212
+	movdqa	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm3,%xmm0
+	pslld	$1,%xmm4
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,(%edx)
+	decl	%ecx
+	jz	.L115done_key256
+	pshufd	$255,%xmm0,%xmm2
+	pxor	%xmm3,%xmm3
+.byte	102,15,56,221,211
+	movdqa	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm3,%xmm1
+	pxor	%xmm1,%xmm2
+	movdqu	%xmm2,16(%edx)
+	leal	32(%edx),%edx
+	movdqa	%xmm2,%xmm1
+	jmp	.L114loop_key256
+.L115done_key256:
+	movl	$13,%ecx
+	movl	%ecx,16(%edx)
+.L102good_key:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	xorl	%eax,%eax
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	4
+.L093bad_pointer:
+	movl	$-1,%eax
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	4
+.L097bad_keybits:
+	pxor	%xmm0,%xmm0
+	movl	$-2,%eax
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	_aesni_set_encrypt_key,.-_aesni_set_encrypt_key
+.globl	aes_hw_set_encrypt_key
+.hidden	aes_hw_set_encrypt_key
+.type	aes_hw_set_encrypt_key,@function
+.align	16
+aes_hw_set_encrypt_key:
+.L_aes_hw_set_encrypt_key_begin:
+#ifdef BORINGSSL_DISPATCH_TEST
+	pushl	%ebx
+	pushl	%edx
+	call	.L116pic
+.L116pic:
+	popl	%ebx
+	leal	BORINGSSL_function_hit+3-.L116pic(%ebx),%ebx
+	movl	$1,%edx
+	movb	%dl,(%ebx)
+	popl	%edx
+	popl	%ebx
+#endif
+	movl	4(%esp),%eax
+	movl	8(%esp),%ecx
+	movl	12(%esp),%edx
+	call	_aesni_set_encrypt_key
+	ret
+.size	aes_hw_set_encrypt_key,.-.L_aes_hw_set_encrypt_key_begin
+.globl	aes_hw_set_decrypt_key
+.hidden	aes_hw_set_decrypt_key
+.type	aes_hw_set_decrypt_key,@function
+.align	16
+aes_hw_set_decrypt_key:
+.L_aes_hw_set_decrypt_key_begin:
+	movl	4(%esp),%eax
+	movl	8(%esp),%ecx
+	movl	12(%esp),%edx
+	call	_aesni_set_encrypt_key
+	movl	12(%esp),%edx
+	shll	$4,%ecx
+	testl	%eax,%eax
+	jnz	.L117dec_key_ret
+	leal	16(%edx,%ecx,1),%eax
+	movups	(%edx),%xmm0
+	movups	(%eax),%xmm1
+	movups	%xmm0,(%eax)
+	movups	%xmm1,(%edx)
+	leal	16(%edx),%edx
+	leal	-16(%eax),%eax
+.L118dec_key_inverse:
+	movups	(%edx),%xmm0
+	movups	(%eax),%xmm1
+.byte	102,15,56,219,192
+.byte	102,15,56,219,201
+	leal	16(%edx),%edx
+	leal	-16(%eax),%eax
+	movups	%xmm0,16(%eax)
+	movups	%xmm1,-16(%edx)
+	cmpl	%edx,%eax
+	ja	.L118dec_key_inverse
+	movups	(%edx),%xmm0
+.byte	102,15,56,219,192
+	movups	%xmm0,(%edx)
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	xorl	%eax,%eax
+.L117dec_key_ret:
+	ret
+.size	aes_hw_set_decrypt_key,.-.L_aes_hw_set_decrypt_key_begin
+.align	64
+.Lkey_const:
+.long	202313229,202313229,202313229,202313229
+.long	67569157,67569157,67569157,67569157
+.long	1,1,1,1
+.long	27,27,27,27
+.byte	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
+.byte	83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
+.byte	32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
+.byte	115,108,46,111,114,103,62,0
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
diff --git a/gen/bcm/aesni-x86-win.asm b/gen/bcm/aesni-x86-win.asm
new file mode 100644
index 0000000..19b1d98
--- /dev/null
+++ b/gen/bcm/aesni-x86-win.asm
@@ -0,0 +1,2466 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__, win32
+%ifidn __OUTPUT_FORMAT__,obj
+section	code	use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+$@feat.00 equ 1
+section	.text	code align=64
+%else
+section	.text	code
+%endif
+;extern	_OPENSSL_ia32cap_P
+%ifdef BORINGSSL_DISPATCH_TEST
+extern	_BORINGSSL_function_hit
+%endif
+global	_aes_hw_encrypt
+align	16
+_aes_hw_encrypt:
+L$_aes_hw_encrypt_begin:
+%ifdef BORINGSSL_DISPATCH_TEST
+	push	ebx
+	push	edx
+	call	L$000pic
+L$000pic:
+	pop	ebx
+	lea	ebx,[(_BORINGSSL_function_hit+1-L$000pic)+ebx]
+	mov	edx,1
+	mov	BYTE [ebx],dl
+	pop	edx
+	pop	ebx
+%endif
+	mov	eax,DWORD [4+esp]
+	mov	edx,DWORD [12+esp]
+	movups	xmm2,[eax]
+	mov	ecx,DWORD [240+edx]
+	mov	eax,DWORD [8+esp]
+	movups	xmm0,[edx]
+	movups	xmm1,[16+edx]
+	lea	edx,[32+edx]
+	xorps	xmm2,xmm0
+L$001enc1_loop_1:
+db	102,15,56,220,209
+	dec	ecx
+	movups	xmm1,[edx]
+	lea	edx,[16+edx]
+	jnz	NEAR L$001enc1_loop_1
+db	102,15,56,221,209
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	movups	[eax],xmm2
+	pxor	xmm2,xmm2
+	ret
+global	_aes_hw_decrypt
+align	16
+_aes_hw_decrypt:
+L$_aes_hw_decrypt_begin:
+	mov	eax,DWORD [4+esp]
+	mov	edx,DWORD [12+esp]
+	movups	xmm2,[eax]
+	mov	ecx,DWORD [240+edx]
+	mov	eax,DWORD [8+esp]
+	movups	xmm0,[edx]
+	movups	xmm1,[16+edx]
+	lea	edx,[32+edx]
+	xorps	xmm2,xmm0
+L$002dec1_loop_2:
+db	102,15,56,222,209
+	dec	ecx
+	movups	xmm1,[edx]
+	lea	edx,[16+edx]
+	jnz	NEAR L$002dec1_loop_2
+db	102,15,56,223,209
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	movups	[eax],xmm2
+	pxor	xmm2,xmm2
+	ret
+align	16
+__aesni_encrypt2:
+	movups	xmm0,[edx]
+	shl	ecx,4
+	movups	xmm1,[16+edx]
+	xorps	xmm2,xmm0
+	pxor	xmm3,xmm0
+	movups	xmm0,[32+edx]
+	lea	edx,[32+ecx*1+edx]
+	neg	ecx
+	add	ecx,16
+L$003enc2_loop:
+db	102,15,56,220,209
+db	102,15,56,220,217
+	movups	xmm1,[ecx*1+edx]
+	add	ecx,32
+db	102,15,56,220,208
+db	102,15,56,220,216
+	movups	xmm0,[ecx*1+edx-16]
+	jnz	NEAR L$003enc2_loop
+db	102,15,56,220,209
+db	102,15,56,220,217
+db	102,15,56,221,208
+db	102,15,56,221,216
+	ret
+align	16
+__aesni_decrypt2:
+	movups	xmm0,[edx]
+	shl	ecx,4
+	movups	xmm1,[16+edx]
+	xorps	xmm2,xmm0
+	pxor	xmm3,xmm0
+	movups	xmm0,[32+edx]
+	lea	edx,[32+ecx*1+edx]
+	neg	ecx
+	add	ecx,16
+L$004dec2_loop:
+db	102,15,56,222,209
+db	102,15,56,222,217
+	movups	xmm1,[ecx*1+edx]
+	add	ecx,32
+db	102,15,56,222,208
+db	102,15,56,222,216
+	movups	xmm0,[ecx*1+edx-16]
+	jnz	NEAR L$004dec2_loop
+db	102,15,56,222,209
+db	102,15,56,222,217
+db	102,15,56,223,208
+db	102,15,56,223,216
+	ret
+align	16
+__aesni_encrypt3:
+	movups	xmm0,[edx]
+	shl	ecx,4
+	movups	xmm1,[16+edx]
+	xorps	xmm2,xmm0
+	pxor	xmm3,xmm0
+	pxor	xmm4,xmm0
+	movups	xmm0,[32+edx]
+	lea	edx,[32+ecx*1+edx]
+	neg	ecx
+	add	ecx,16
+L$005enc3_loop:
+db	102,15,56,220,209
+db	102,15,56,220,217
+db	102,15,56,220,225
+	movups	xmm1,[ecx*1+edx]
+	add	ecx,32
+db	102,15,56,220,208
+db	102,15,56,220,216
+db	102,15,56,220,224
+	movups	xmm0,[ecx*1+edx-16]
+	jnz	NEAR L$005enc3_loop
+db	102,15,56,220,209
+db	102,15,56,220,217
+db	102,15,56,220,225
+db	102,15,56,221,208
+db	102,15,56,221,216
+db	102,15,56,221,224
+	ret
+align	16
+__aesni_decrypt3:
+	movups	xmm0,[edx]
+	shl	ecx,4
+	movups	xmm1,[16+edx]
+	xorps	xmm2,xmm0
+	pxor	xmm3,xmm0
+	pxor	xmm4,xmm0
+	movups	xmm0,[32+edx]
+	lea	edx,[32+ecx*1+edx]
+	neg	ecx
+	add	ecx,16
+L$006dec3_loop:
+db	102,15,56,222,209
+db	102,15,56,222,217
+db	102,15,56,222,225
+	movups	xmm1,[ecx*1+edx]
+	add	ecx,32
+db	102,15,56,222,208
+db	102,15,56,222,216
+db	102,15,56,222,224
+	movups	xmm0,[ecx*1+edx-16]
+	jnz	NEAR L$006dec3_loop
+db	102,15,56,222,209
+db	102,15,56,222,217
+db	102,15,56,222,225
+db	102,15,56,223,208
+db	102,15,56,223,216
+db	102,15,56,223,224
+	ret
+align	16
+__aesni_encrypt4:
+	movups	xmm0,[edx]
+	movups	xmm1,[16+edx]
+	shl	ecx,4
+	xorps	xmm2,xmm0
+	pxor	xmm3,xmm0
+	pxor	xmm4,xmm0
+	pxor	xmm5,xmm0
+	movups	xmm0,[32+edx]
+	lea	edx,[32+ecx*1+edx]
+	neg	ecx
+db	15,31,64,0
+	add	ecx,16
+L$007enc4_loop:
+db	102,15,56,220,209
+db	102,15,56,220,217
+db	102,15,56,220,225
+db	102,15,56,220,233
+	movups	xmm1,[ecx*1+edx]
+	add	ecx,32
+db	102,15,56,220,208
+db	102,15,56,220,216
+db	102,15,56,220,224
+db	102,15,56,220,232
+	movups	xmm0,[ecx*1+edx-16]
+	jnz	NEAR L$007enc4_loop
+db	102,15,56,220,209
+db	102,15,56,220,217
+db	102,15,56,220,225
+db	102,15,56,220,233
+db	102,15,56,221,208
+db	102,15,56,221,216
+db	102,15,56,221,224
+db	102,15,56,221,232
+	ret
+align	16
+__aesni_decrypt4:
+	movups	xmm0,[edx]
+	movups	xmm1,[16+edx]
+	shl	ecx,4
+	xorps	xmm2,xmm0
+	pxor	xmm3,xmm0
+	pxor	xmm4,xmm0
+	pxor	xmm5,xmm0
+	movups	xmm0,[32+edx]
+	lea	edx,[32+ecx*1+edx]
+	neg	ecx
+db	15,31,64,0
+	add	ecx,16
+L$008dec4_loop:
+db	102,15,56,222,209
+db	102,15,56,222,217
+db	102,15,56,222,225
+db	102,15,56,222,233
+	movups	xmm1,[ecx*1+edx]
+	add	ecx,32
+db	102,15,56,222,208
+db	102,15,56,222,216
+db	102,15,56,222,224
+db	102,15,56,222,232
+	movups	xmm0,[ecx*1+edx-16]
+	jnz	NEAR L$008dec4_loop
+db	102,15,56,222,209
+db	102,15,56,222,217
+db	102,15,56,222,225
+db	102,15,56,222,233
+db	102,15,56,223,208
+db	102,15,56,223,216
+db	102,15,56,223,224
+db	102,15,56,223,232
+	ret
+align	16
+__aesni_encrypt6:
+	movups	xmm0,[edx]
+	shl	ecx,4
+	movups	xmm1,[16+edx]
+	xorps	xmm2,xmm0
+	pxor	xmm3,xmm0
+	pxor	xmm4,xmm0
+db	102,15,56,220,209
+	pxor	xmm5,xmm0
+	pxor	xmm6,xmm0
+db	102,15,56,220,217
+	lea	edx,[32+ecx*1+edx]
+	neg	ecx
+db	102,15,56,220,225
+	pxor	xmm7,xmm0
+	movups	xmm0,[ecx*1+edx]
+	add	ecx,16
+	jmp	NEAR L$009_aesni_encrypt6_inner
+align	16
+L$010enc6_loop:
+db	102,15,56,220,209
+db	102,15,56,220,217
+db	102,15,56,220,225
+L$009_aesni_encrypt6_inner:
+db	102,15,56,220,233
+db	102,15,56,220,241
+db	102,15,56,220,249
+L$_aesni_encrypt6_enter:
+	movups	xmm1,[ecx*1+edx]
+	add	ecx,32
+db	102,15,56,220,208
+db	102,15,56,220,216
+db	102,15,56,220,224
+db	102,15,56,220,232
+db	102,15,56,220,240
+db	102,15,56,220,248
+	movups	xmm0,[ecx*1+edx-16]
+	jnz	NEAR L$010enc6_loop
+db	102,15,56,220,209
+db	102,15,56,220,217
+db	102,15,56,220,225
+db	102,15,56,220,233
+db	102,15,56,220,241
+db	102,15,56,220,249
+db	102,15,56,221,208
+db	102,15,56,221,216
+db	102,15,56,221,224
+db	102,15,56,221,232
+db	102,15,56,221,240
+db	102,15,56,221,248
+	ret
+align	16
+__aesni_decrypt6:
+	movups	xmm0,[edx]
+	shl	ecx,4
+	movups	xmm1,[16+edx]
+	xorps	xmm2,xmm0
+	pxor	xmm3,xmm0
+	pxor	xmm4,xmm0
+db	102,15,56,222,209
+	pxor	xmm5,xmm0
+	pxor	xmm6,xmm0
+db	102,15,56,222,217
+	lea	edx,[32+ecx*1+edx]
+	neg	ecx
+db	102,15,56,222,225
+	pxor	xmm7,xmm0
+	movups	xmm0,[ecx*1+edx]
+	add	ecx,16
+	jmp	NEAR L$011_aesni_decrypt6_inner
+align	16
+L$012dec6_loop:
+db	102,15,56,222,209
+db	102,15,56,222,217
+db	102,15,56,222,225
+L$011_aesni_decrypt6_inner:
+db	102,15,56,222,233
+db	102,15,56,222,241
+db	102,15,56,222,249
+L$_aesni_decrypt6_enter:
+	movups	xmm1,[ecx*1+edx]
+	add	ecx,32
+db	102,15,56,222,208
+db	102,15,56,222,216
+db	102,15,56,222,224
+db	102,15,56,222,232
+db	102,15,56,222,240
+db	102,15,56,222,248
+	movups	xmm0,[ecx*1+edx-16]
+	jnz	NEAR L$012dec6_loop
+db	102,15,56,222,209
+db	102,15,56,222,217
+db	102,15,56,222,225
+db	102,15,56,222,233
+db	102,15,56,222,241
+db	102,15,56,222,249
+db	102,15,56,223,208
+db	102,15,56,223,216
+db	102,15,56,223,224
+db	102,15,56,223,232
+db	102,15,56,223,240
+db	102,15,56,223,248
+	ret
+global	_aes_hw_ecb_encrypt
+align	16
+_aes_hw_ecb_encrypt:
+L$_aes_hw_ecb_encrypt_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	mov	esi,DWORD [20+esp]
+	mov	edi,DWORD [24+esp]
+	mov	eax,DWORD [28+esp]
+	mov	edx,DWORD [32+esp]
+	mov	ebx,DWORD [36+esp]
+	and	eax,-16
+	jz	NEAR L$013ecb_ret
+	mov	ecx,DWORD [240+edx]
+	test	ebx,ebx
+	jz	NEAR L$014ecb_decrypt
+	mov	ebp,edx
+	mov	ebx,ecx
+	cmp	eax,96
+	jb	NEAR L$015ecb_enc_tail
+	movdqu	xmm2,[esi]
+	movdqu	xmm3,[16+esi]
+	movdqu	xmm4,[32+esi]
+	movdqu	xmm5,[48+esi]
+	movdqu	xmm6,[64+esi]
+	movdqu	xmm7,[80+esi]
+	lea	esi,[96+esi]
+	sub	eax,96
+	jmp	NEAR L$016ecb_enc_loop6_enter
+align	16
+L$017ecb_enc_loop6:
+	movups	[edi],xmm2
+	movdqu	xmm2,[esi]
+	movups	[16+edi],xmm3
+	movdqu	xmm3,[16+esi]
+	movups	[32+edi],xmm4
+	movdqu	xmm4,[32+esi]
+	movups	[48+edi],xmm5
+	movdqu	xmm5,[48+esi]
+	movups	[64+edi],xmm6
+	movdqu	xmm6,[64+esi]
+	movups	[80+edi],xmm7
+	lea	edi,[96+edi]
+	movdqu	xmm7,[80+esi]
+	lea	esi,[96+esi]
+L$016ecb_enc_loop6_enter:
+	call	__aesni_encrypt6
+	mov	edx,ebp
+	mov	ecx,ebx
+	sub	eax,96
+	jnc	NEAR L$017ecb_enc_loop6
+	movups	[edi],xmm2
+	movups	[16+edi],xmm3
+	movups	[32+edi],xmm4
+	movups	[48+edi],xmm5
+	movups	[64+edi],xmm6
+	movups	[80+edi],xmm7
+	lea	edi,[96+edi]
+	add	eax,96
+	jz	NEAR L$013ecb_ret
+L$015ecb_enc_tail:
+	movups	xmm2,[esi]
+	cmp	eax,32
+	jb	NEAR L$018ecb_enc_one
+	movups	xmm3,[16+esi]
+	je	NEAR L$019ecb_enc_two
+	movups	xmm4,[32+esi]
+	cmp	eax,64
+	jb	NEAR L$020ecb_enc_three
+	movups	xmm5,[48+esi]
+	je	NEAR L$021ecb_enc_four
+	movups	xmm6,[64+esi]
+	xorps	xmm7,xmm7
+	call	__aesni_encrypt6
+	movups	[edi],xmm2
+	movups	[16+edi],xmm3
+	movups	[32+edi],xmm4
+	movups	[48+edi],xmm5
+	movups	[64+edi],xmm6
+	jmp	NEAR L$013ecb_ret
+align	16
+L$018ecb_enc_one:
+	movups	xmm0,[edx]
+	movups	xmm1,[16+edx]
+	lea	edx,[32+edx]
+	xorps	xmm2,xmm0
+L$022enc1_loop_3:
+db	102,15,56,220,209
+	dec	ecx
+	movups	xmm1,[edx]
+	lea	edx,[16+edx]
+	jnz	NEAR L$022enc1_loop_3
+db	102,15,56,221,209
+	movups	[edi],xmm2
+	jmp	NEAR L$013ecb_ret
+align	16
+L$019ecb_enc_two:
+	call	__aesni_encrypt2
+	movups	[edi],xmm2
+	movups	[16+edi],xmm3
+	jmp	NEAR L$013ecb_ret
+align	16
+L$020ecb_enc_three:
+	call	__aesni_encrypt3
+	movups	[edi],xmm2
+	movups	[16+edi],xmm3
+	movups	[32+edi],xmm4
+	jmp	NEAR L$013ecb_ret
+align	16
+L$021ecb_enc_four:
+	call	__aesni_encrypt4
+	movups	[edi],xmm2
+	movups	[16+edi],xmm3
+	movups	[32+edi],xmm4
+	movups	[48+edi],xmm5
+	jmp	NEAR L$013ecb_ret
+align	16
+L$014ecb_decrypt:
+	mov	ebp,edx
+	mov	ebx,ecx
+	cmp	eax,96
+	jb	NEAR L$023ecb_dec_tail
+	movdqu	xmm2,[esi]
+	movdqu	xmm3,[16+esi]
+	movdqu	xmm4,[32+esi]
+	movdqu	xmm5,[48+esi]
+	movdqu	xmm6,[64+esi]
+	movdqu	xmm7,[80+esi]
+	lea	esi,[96+esi]
+	sub	eax,96
+	jmp	NEAR L$024ecb_dec_loop6_enter
+align	16
+L$025ecb_dec_loop6:
+	movups	[edi],xmm2
+	movdqu	xmm2,[esi]
+	movups	[16+edi],xmm3
+	movdqu	xmm3,[16+esi]
+	movups	[32+edi],xmm4
+	movdqu	xmm4,[32+esi]
+	movups	[48+edi],xmm5
+	movdqu	xmm5,[48+esi]
+	movups	[64+edi],xmm6
+	movdqu	xmm6,[64+esi]
+	movups	[80+edi],xmm7
+	lea	edi,[96+edi]
+	movdqu	xmm7,[80+esi]
+	lea	esi,[96+esi]
+L$024ecb_dec_loop6_enter:
+	call	__aesni_decrypt6
+	mov	edx,ebp
+	mov	ecx,ebx
+	sub	eax,96
+	jnc	NEAR L$025ecb_dec_loop6
+	movups	[edi],xmm2
+	movups	[16+edi],xmm3
+	movups	[32+edi],xmm4
+	movups	[48+edi],xmm5
+	movups	[64+edi],xmm6
+	movups	[80+edi],xmm7
+	lea	edi,[96+edi]
+	add	eax,96
+	jz	NEAR L$013ecb_ret
+L$023ecb_dec_tail:
+	movups	xmm2,[esi]
+	cmp	eax,32
+	jb	NEAR L$026ecb_dec_one
+	movups	xmm3,[16+esi]
+	je	NEAR L$027ecb_dec_two
+	movups	xmm4,[32+esi]
+	cmp	eax,64
+	jb	NEAR L$028ecb_dec_three
+	movups	xmm5,[48+esi]
+	je	NEAR L$029ecb_dec_four
+	movups	xmm6,[64+esi]
+	xorps	xmm7,xmm7
+	call	__aesni_decrypt6
+	movups	[edi],xmm2
+	movups	[16+edi],xmm3
+	movups	[32+edi],xmm4
+	movups	[48+edi],xmm5
+	movups	[64+edi],xmm6
+	jmp	NEAR L$013ecb_ret
+align	16
+L$026ecb_dec_one:
+	movups	xmm0,[edx]
+	movups	xmm1,[16+edx]
+	lea	edx,[32+edx]
+	xorps	xmm2,xmm0
+L$030dec1_loop_4:
+db	102,15,56,222,209
+	dec	ecx
+	movups	xmm1,[edx]
+	lea	edx,[16+edx]
+	jnz	NEAR L$030dec1_loop_4
+db	102,15,56,223,209
+	movups	[edi],xmm2
+	jmp	NEAR L$013ecb_ret
+align	16
+L$027ecb_dec_two:
+	call	__aesni_decrypt2
+	movups	[edi],xmm2
+	movups	[16+edi],xmm3
+	jmp	NEAR L$013ecb_ret
+align	16
+L$028ecb_dec_three:
+	call	__aesni_decrypt3
+	movups	[edi],xmm2
+	movups	[16+edi],xmm3
+	movups	[32+edi],xmm4
+	jmp	NEAR L$013ecb_ret
+align	16
+L$029ecb_dec_four:
+	call	__aesni_decrypt4
+	movups	[edi],xmm2
+	movups	[16+edi],xmm3
+	movups	[32+edi],xmm4
+	movups	[48+edi],xmm5
+L$013ecb_ret:
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	pxor	xmm6,xmm6
+	pxor	xmm7,xmm7
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+global	_aes_hw_ccm64_encrypt_blocks
+align	16
+_aes_hw_ccm64_encrypt_blocks:
+L$_aes_hw_ccm64_encrypt_blocks_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	mov	esi,DWORD [20+esp]
+	mov	edi,DWORD [24+esp]
+	mov	eax,DWORD [28+esp]
+	mov	edx,DWORD [32+esp]
+	mov	ebx,DWORD [36+esp]
+	mov	ecx,DWORD [40+esp]
+	mov	ebp,esp
+	sub	esp,60
+	and	esp,-16
+	mov	DWORD [48+esp],ebp
+	movdqu	xmm7,[ebx]
+	movdqu	xmm3,[ecx]
+	mov	ecx,DWORD [240+edx]
+	mov	DWORD [esp],202182159
+	mov	DWORD [4+esp],134810123
+	mov	DWORD [8+esp],67438087
+	mov	DWORD [12+esp],66051
+	mov	ebx,1
+	xor	ebp,ebp
+	mov	DWORD [16+esp],ebx
+	mov	DWORD [20+esp],ebp
+	mov	DWORD [24+esp],ebp
+	mov	DWORD [28+esp],ebp
+	shl	ecx,4
+	mov	ebx,16
+	lea	ebp,[edx]
+	movdqa	xmm5,[esp]
+	movdqa	xmm2,xmm7
+	lea	edx,[32+ecx*1+edx]
+	sub	ebx,ecx
+db	102,15,56,0,253
+L$031ccm64_enc_outer:
+	movups	xmm0,[ebp]
+	mov	ecx,ebx
+	movups	xmm6,[esi]
+	xorps	xmm2,xmm0
+	movups	xmm1,[16+ebp]
+	xorps	xmm0,xmm6
+	xorps	xmm3,xmm0
+	movups	xmm0,[32+ebp]
+L$032ccm64_enc2_loop:
+db	102,15,56,220,209
+db	102,15,56,220,217
+	movups	xmm1,[ecx*1+edx]
+	add	ecx,32
+db	102,15,56,220,208
+db	102,15,56,220,216
+	movups	xmm0,[ecx*1+edx-16]
+	jnz	NEAR L$032ccm64_enc2_loop
+db	102,15,56,220,209
+db	102,15,56,220,217
+	paddq	xmm7,[16+esp]
+	dec	eax
+db	102,15,56,221,208
+db	102,15,56,221,216
+	lea	esi,[16+esi]
+	xorps	xmm6,xmm2
+	movdqa	xmm2,xmm7
+	movups	[edi],xmm6
+db	102,15,56,0,213
+	lea	edi,[16+edi]
+	jnz	NEAR L$031ccm64_enc_outer
+	mov	esp,DWORD [48+esp]
+	mov	edi,DWORD [40+esp]
+	movups	[edi],xmm3
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	pxor	xmm6,xmm6
+	pxor	xmm7,xmm7
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+global	_aes_hw_ccm64_decrypt_blocks
+align	16
+_aes_hw_ccm64_decrypt_blocks:
+L$_aes_hw_ccm64_decrypt_blocks_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	mov	esi,DWORD [20+esp]
+	mov	edi,DWORD [24+esp]
+	mov	eax,DWORD [28+esp]
+	mov	edx,DWORD [32+esp]
+	mov	ebx,DWORD [36+esp]
+	mov	ecx,DWORD [40+esp]
+	mov	ebp,esp
+	sub	esp,60
+	and	esp,-16
+	mov	DWORD [48+esp],ebp
+	movdqu	xmm7,[ebx]
+	movdqu	xmm3,[ecx]
+	mov	ecx,DWORD [240+edx]
+	mov	DWORD [esp],202182159
+	mov	DWORD [4+esp],134810123
+	mov	DWORD [8+esp],67438087
+	mov	DWORD [12+esp],66051
+	mov	ebx,1
+	xor	ebp,ebp
+	mov	DWORD [16+esp],ebx
+	mov	DWORD [20+esp],ebp
+	mov	DWORD [24+esp],ebp
+	mov	DWORD [28+esp],ebp
+	movdqa	xmm5,[esp]
+	movdqa	xmm2,xmm7
+	mov	ebp,edx
+	mov	ebx,ecx
+db	102,15,56,0,253
+	movups	xmm0,[edx]
+	movups	xmm1,[16+edx]
+	lea	edx,[32+edx]
+	xorps	xmm2,xmm0
+L$033enc1_loop_5:
+db	102,15,56,220,209
+	dec	ecx
+	movups	xmm1,[edx]
+	lea	edx,[16+edx]
+	jnz	NEAR L$033enc1_loop_5
+db	102,15,56,221,209
+	shl	ebx,4
+	mov	ecx,16
+	movups	xmm6,[esi]
+	paddq	xmm7,[16+esp]
+	lea	esi,[16+esi]
+	sub	ecx,ebx
+	lea	edx,[32+ebx*1+ebp]
+	mov	ebx,ecx
+	jmp	NEAR L$034ccm64_dec_outer
+align	16
+L$034ccm64_dec_outer:
+	xorps	xmm6,xmm2
+	movdqa	xmm2,xmm7
+	movups	[edi],xmm6
+	lea	edi,[16+edi]
+db	102,15,56,0,213
+	sub	eax,1
+	jz	NEAR L$035ccm64_dec_break
+	movups	xmm0,[ebp]
+	mov	ecx,ebx
+	movups	xmm1,[16+ebp]
+	xorps	xmm6,xmm0
+	xorps	xmm2,xmm0
+	xorps	xmm3,xmm6
+	movups	xmm0,[32+ebp]
+L$036ccm64_dec2_loop:
+db	102,15,56,220,209
+db	102,15,56,220,217
+	movups	xmm1,[ecx*1+edx]
+	add	ecx,32
+db	102,15,56,220,208
+db	102,15,56,220,216
+	movups	xmm0,[ecx*1+edx-16]
+	jnz	NEAR L$036ccm64_dec2_loop
+	movups	xmm6,[esi]
+	paddq	xmm7,[16+esp]
+db	102,15,56,220,209
+db	102,15,56,220,217
+db	102,15,56,221,208
+db	102,15,56,221,216
+	lea	esi,[16+esi]
+	jmp	NEAR L$034ccm64_dec_outer
+align	16
+L$035ccm64_dec_break:
+	mov	ecx,DWORD [240+ebp]
+	mov	edx,ebp
+	movups	xmm0,[edx]
+	movups	xmm1,[16+edx]
+	xorps	xmm6,xmm0
+	lea	edx,[32+edx]
+	xorps	xmm3,xmm6
+L$037enc1_loop_6:
+db	102,15,56,220,217
+	dec	ecx
+	movups	xmm1,[edx]
+	lea	edx,[16+edx]
+	jnz	NEAR L$037enc1_loop_6
+db	102,15,56,221,217
+	mov	esp,DWORD [48+esp]
+	mov	edi,DWORD [40+esp]
+	movups	[edi],xmm3
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	pxor	xmm6,xmm6
+	pxor	xmm7,xmm7
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+global	_aes_hw_ctr32_encrypt_blocks
+align	16
+_aes_hw_ctr32_encrypt_blocks:
+L$_aes_hw_ctr32_encrypt_blocks_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+%ifdef BORINGSSL_DISPATCH_TEST
+	push	ebx
+	push	edx
+	call	L$038pic
+L$038pic:
+	pop	ebx
+	lea	ebx,[(_BORINGSSL_function_hit+0-L$038pic)+ebx]
+	mov	edx,1
+	mov	BYTE [ebx],dl
+	pop	edx
+	pop	ebx
+%endif
+	mov	esi,DWORD [20+esp]
+	mov	edi,DWORD [24+esp]
+	mov	eax,DWORD [28+esp]
+	mov	edx,DWORD [32+esp]
+	mov	ebx,DWORD [36+esp]
+	mov	ebp,esp
+	sub	esp,88
+	and	esp,-16
+	mov	DWORD [80+esp],ebp
+	cmp	eax,1
+	je	NEAR L$039ctr32_one_shortcut
+	movdqu	xmm7,[ebx]
+	mov	DWORD [esp],202182159
+	mov	DWORD [4+esp],134810123
+	mov	DWORD [8+esp],67438087
+	mov	DWORD [12+esp],66051
+	mov	ecx,6
+	xor	ebp,ebp
+	mov	DWORD [16+esp],ecx
+	mov	DWORD [20+esp],ecx
+	mov	DWORD [24+esp],ecx
+	mov	DWORD [28+esp],ebp
+db	102,15,58,22,251,3
+db	102,15,58,34,253,3
+	mov	ecx,DWORD [240+edx]
+	bswap	ebx
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	movdqa	xmm2,[esp]
+db	102,15,58,34,195,0
+	lea	ebp,[3+ebx]
+db	102,15,58,34,205,0
+	inc	ebx
+db	102,15,58,34,195,1
+	inc	ebp
+db	102,15,58,34,205,1
+	inc	ebx
+db	102,15,58,34,195,2
+	inc	ebp
+db	102,15,58,34,205,2
+	movdqa	[48+esp],xmm0
+db	102,15,56,0,194
+	movdqu	xmm6,[edx]
+	movdqa	[64+esp],xmm1
+db	102,15,56,0,202
+	pshufd	xmm2,xmm0,192
+	pshufd	xmm3,xmm0,128
+	cmp	eax,6
+	jb	NEAR L$040ctr32_tail
+	pxor	xmm7,xmm6
+	shl	ecx,4
+	mov	ebx,16
+	movdqa	[32+esp],xmm7
+	mov	ebp,edx
+	sub	ebx,ecx
+	lea	edx,[32+ecx*1+edx]
+	sub	eax,6
+	jmp	NEAR L$041ctr32_loop6
+align	16
+L$041ctr32_loop6:
+	pshufd	xmm4,xmm0,64
+	movdqa	xmm0,[32+esp]
+	pshufd	xmm5,xmm1,192
+	pxor	xmm2,xmm0
+	pshufd	xmm6,xmm1,128
+	pxor	xmm3,xmm0
+	pshufd	xmm7,xmm1,64
+	movups	xmm1,[16+ebp]
+	pxor	xmm4,xmm0
+	pxor	xmm5,xmm0
+db	102,15,56,220,209
+	pxor	xmm6,xmm0
+	pxor	xmm7,xmm0
+db	102,15,56,220,217
+	movups	xmm0,[32+ebp]
+	mov	ecx,ebx
+db	102,15,56,220,225
+db	102,15,56,220,233
+db	102,15,56,220,241
+db	102,15,56,220,249
+	call	L$_aesni_encrypt6_enter
+	movups	xmm1,[esi]
+	movups	xmm0,[16+esi]
+	xorps	xmm2,xmm1
+	movups	xmm1,[32+esi]
+	xorps	xmm3,xmm0
+	movups	[edi],xmm2
+	movdqa	xmm0,[16+esp]
+	xorps	xmm4,xmm1
+	movdqa	xmm1,[64+esp]
+	movups	[16+edi],xmm3
+	movups	[32+edi],xmm4
+	paddd	xmm1,xmm0
+	paddd	xmm0,[48+esp]
+	movdqa	xmm2,[esp]
+	movups	xmm3,[48+esi]
+	movups	xmm4,[64+esi]
+	xorps	xmm5,xmm3
+	movups	xmm3,[80+esi]
+	lea	esi,[96+esi]
+	movdqa	[48+esp],xmm0
+db	102,15,56,0,194
+	xorps	xmm6,xmm4
+	movups	[48+edi],xmm5
+	xorps	xmm7,xmm3
+	movdqa	[64+esp],xmm1
+db	102,15,56,0,202
+	movups	[64+edi],xmm6
+	pshufd	xmm2,xmm0,192
+	movups	[80+edi],xmm7
+	lea	edi,[96+edi]
+	pshufd	xmm3,xmm0,128
+	sub	eax,6
+	jnc	NEAR L$041ctr32_loop6
+	add	eax,6
+	jz	NEAR L$042ctr32_ret
+	movdqu	xmm7,[ebp]
+	mov	edx,ebp
+	pxor	xmm7,[32+esp]
+	mov	ecx,DWORD [240+ebp]
+L$040ctr32_tail:
+	por	xmm2,xmm7
+	cmp	eax,2
+	jb	NEAR L$043ctr32_one
+	pshufd	xmm4,xmm0,64
+	por	xmm3,xmm7
+	je	NEAR L$044ctr32_two
+	pshufd	xmm5,xmm1,192
+	por	xmm4,xmm7
+	cmp	eax,4
+	jb	NEAR L$045ctr32_three
+	pshufd	xmm6,xmm1,128
+	por	xmm5,xmm7
+	je	NEAR L$046ctr32_four
+	por	xmm6,xmm7
+	call	__aesni_encrypt6
+	movups	xmm1,[esi]
+	movups	xmm0,[16+esi]
+	xorps	xmm2,xmm1
+	movups	xmm1,[32+esi]
+	xorps	xmm3,xmm0
+	movups	xmm0,[48+esi]
+	xorps	xmm4,xmm1
+	movups	xmm1,[64+esi]
+	xorps	xmm5,xmm0
+	movups	[edi],xmm2
+	xorps	xmm6,xmm1
+	movups	[16+edi],xmm3
+	movups	[32+edi],xmm4
+	movups	[48+edi],xmm5
+	movups	[64+edi],xmm6
+	jmp	NEAR L$042ctr32_ret
+align	16
+L$039ctr32_one_shortcut:
+	movups	xmm2,[ebx]
+	mov	ecx,DWORD [240+edx]
+L$043ctr32_one:
+	movups	xmm0,[edx]
+	movups	xmm1,[16+edx]
+	lea	edx,[32+edx]
+	xorps	xmm2,xmm0
+L$047enc1_loop_7:
+db	102,15,56,220,209
+	dec	ecx
+	movups	xmm1,[edx]
+	lea	edx,[16+edx]
+	jnz	NEAR L$047enc1_loop_7
+db	102,15,56,221,209
+	movups	xmm6,[esi]
+	xorps	xmm6,xmm2
+	movups	[edi],xmm6
+	jmp	NEAR L$042ctr32_ret
+align	16
+L$044ctr32_two:
+	call	__aesni_encrypt2
+	movups	xmm5,[esi]
+	movups	xmm6,[16+esi]
+	xorps	xmm2,xmm5
+	xorps	xmm3,xmm6
+	movups	[edi],xmm2
+	movups	[16+edi],xmm3
+	jmp	NEAR L$042ctr32_ret
+align	16
+L$045ctr32_three:
+	call	__aesni_encrypt3
+	movups	xmm5,[esi]
+	movups	xmm6,[16+esi]
+	xorps	xmm2,xmm5
+	movups	xmm7,[32+esi]
+	xorps	xmm3,xmm6
+	movups	[edi],xmm2
+	xorps	xmm4,xmm7
+	movups	[16+edi],xmm3
+	movups	[32+edi],xmm4
+	jmp	NEAR L$042ctr32_ret
+align	16
+L$046ctr32_four:
+	call	__aesni_encrypt4
+	movups	xmm6,[esi]
+	movups	xmm7,[16+esi]
+	movups	xmm1,[32+esi]
+	xorps	xmm2,xmm6
+	movups	xmm0,[48+esi]
+	xorps	xmm3,xmm7
+	movups	[edi],xmm2
+	xorps	xmm4,xmm1
+	movups	[16+edi],xmm3
+	xorps	xmm5,xmm0
+	movups	[32+edi],xmm4
+	movups	[48+edi],xmm5
+L$042ctr32_ret:
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	movdqa	[32+esp],xmm0
+	pxor	xmm5,xmm5
+	movdqa	[48+esp],xmm0
+	pxor	xmm6,xmm6
+	movdqa	[64+esp],xmm0
+	pxor	xmm7,xmm7
+	mov	esp,DWORD [80+esp]
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+global	_aes_hw_xts_encrypt
+align	16
+_aes_hw_xts_encrypt:
+L$_aes_hw_xts_encrypt_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	mov	edx,DWORD [36+esp]
+	mov	esi,DWORD [40+esp]
+	mov	ecx,DWORD [240+edx]
+	movups	xmm2,[esi]
+	movups	xmm0,[edx]
+	movups	xmm1,[16+edx]
+	lea	edx,[32+edx]
+	xorps	xmm2,xmm0
+L$048enc1_loop_8:
+db	102,15,56,220,209
+	dec	ecx
+	movups	xmm1,[edx]
+	lea	edx,[16+edx]
+	jnz	NEAR L$048enc1_loop_8
+db	102,15,56,221,209
+	mov	esi,DWORD [20+esp]
+	mov	edi,DWORD [24+esp]
+	mov	eax,DWORD [28+esp]
+	mov	edx,DWORD [32+esp]
+	mov	ebp,esp
+	sub	esp,120
+	mov	ecx,DWORD [240+edx]
+	and	esp,-16
+	mov	DWORD [96+esp],135
+	mov	DWORD [100+esp],0
+	mov	DWORD [104+esp],1
+	mov	DWORD [108+esp],0
+	mov	DWORD [112+esp],eax
+	mov	DWORD [116+esp],ebp
+	movdqa	xmm1,xmm2
+	pxor	xmm0,xmm0
+	movdqa	xmm3,[96+esp]
+	pcmpgtd	xmm0,xmm1
+	and	eax,-16
+	mov	ebp,edx
+	mov	ebx,ecx
+	sub	eax,96
+	jc	NEAR L$049xts_enc_short
+	shl	ecx,4
+	mov	ebx,16
+	sub	ebx,ecx
+	lea	edx,[32+ecx*1+edx]
+	jmp	NEAR L$050xts_enc_loop6
+align	16
+L$050xts_enc_loop6:
+	pshufd	xmm2,xmm0,19
+	pxor	xmm0,xmm0
+	movdqa	[esp],xmm1
+	paddq	xmm1,xmm1
+	pand	xmm2,xmm3
+	pcmpgtd	xmm0,xmm1
+	pxor	xmm1,xmm2
+	pshufd	xmm2,xmm0,19
+	pxor	xmm0,xmm0
+	movdqa	[16+esp],xmm1
+	paddq	xmm1,xmm1
+	pand	xmm2,xmm3
+	pcmpgtd	xmm0,xmm1
+	pxor	xmm1,xmm2
+	pshufd	xmm2,xmm0,19
+	pxor	xmm0,xmm0
+	movdqa	[32+esp],xmm1
+	paddq	xmm1,xmm1
+	pand	xmm2,xmm3
+	pcmpgtd	xmm0,xmm1
+	pxor	xmm1,xmm2
+	pshufd	xmm2,xmm0,19
+	pxor	xmm0,xmm0
+	movdqa	[48+esp],xmm1
+	paddq	xmm1,xmm1
+	pand	xmm2,xmm3
+	pcmpgtd	xmm0,xmm1
+	pxor	xmm1,xmm2
+	pshufd	xmm7,xmm0,19
+	movdqa	[64+esp],xmm1
+	paddq	xmm1,xmm1
+	movups	xmm0,[ebp]
+	pand	xmm7,xmm3
+	movups	xmm2,[esi]
+	pxor	xmm7,xmm1
+	mov	ecx,ebx
+	movdqu	xmm3,[16+esi]
+	xorps	xmm2,xmm0
+	movdqu	xmm4,[32+esi]
+	pxor	xmm3,xmm0
+	movdqu	xmm5,[48+esi]
+	pxor	xmm4,xmm0
+	movdqu	xmm6,[64+esi]
+	pxor	xmm5,xmm0
+	movdqu	xmm1,[80+esi]
+	pxor	xmm6,xmm0
+	lea	esi,[96+esi]
+	pxor	xmm2,[esp]
+	movdqa	[80+esp],xmm7
+	pxor	xmm7,xmm1
+	movups	xmm1,[16+ebp]
+	pxor	xmm3,[16+esp]
+	pxor	xmm4,[32+esp]
+db	102,15,56,220,209
+	pxor	xmm5,[48+esp]
+	pxor	xmm6,[64+esp]
+db	102,15,56,220,217
+	pxor	xmm7,xmm0
+	movups	xmm0,[32+ebp]
+db	102,15,56,220,225
+db	102,15,56,220,233
+db	102,15,56,220,241
+db	102,15,56,220,249
+	call	L$_aesni_encrypt6_enter
+	movdqa	xmm1,[80+esp]
+	pxor	xmm0,xmm0
+	xorps	xmm2,[esp]
+	pcmpgtd	xmm0,xmm1
+	xorps	xmm3,[16+esp]
+	movups	[edi],xmm2
+	xorps	xmm4,[32+esp]
+	movups	[16+edi],xmm3
+	xorps	xmm5,[48+esp]
+	movups	[32+edi],xmm4
+	xorps	xmm6,[64+esp]
+	movups	[48+edi],xmm5
+	xorps	xmm7,xmm1
+	movups	[64+edi],xmm6
+	pshufd	xmm2,xmm0,19
+	movups	[80+edi],xmm7
+	lea	edi,[96+edi]
+	movdqa	xmm3,[96+esp]
+	pxor	xmm0,xmm0
+	paddq	xmm1,xmm1
+	pand	xmm2,xmm3
+	pcmpgtd	xmm0,xmm1
+	pxor	xmm1,xmm2
+	sub	eax,96
+	jnc	NEAR L$050xts_enc_loop6
+	mov	ecx,DWORD [240+ebp]
+	mov	edx,ebp
+	mov	ebx,ecx
+L$049xts_enc_short:
+	add	eax,96
+	jz	NEAR L$051xts_enc_done6x
+	movdqa	xmm5,xmm1
+	cmp	eax,32
+	jb	NEAR L$052xts_enc_one
+	pshufd	xmm2,xmm0,19
+	pxor	xmm0,xmm0
+	paddq	xmm1,xmm1
+	pand	xmm2,xmm3
+	pcmpgtd	xmm0,xmm1
+	pxor	xmm1,xmm2
+	je	NEAR L$053xts_enc_two
+	pshufd	xmm2,xmm0,19
+	pxor	xmm0,xmm0
+	movdqa	xmm6,xmm1
+	paddq	xmm1,xmm1
+	pand	xmm2,xmm3
+	pcmpgtd	xmm0,xmm1
+	pxor	xmm1,xmm2
+	cmp	eax,64
+	jb	NEAR L$054xts_enc_three
+	pshufd	xmm2,xmm0,19
+	pxor	xmm0,xmm0
+	movdqa	xmm7,xmm1
+	paddq	xmm1,xmm1
+	pand	xmm2,xmm3
+	pcmpgtd	xmm0,xmm1
+	pxor	xmm1,xmm2
+	movdqa	[esp],xmm5
+	movdqa	[16+esp],xmm6
+	je	NEAR L$055xts_enc_four
+	movdqa	[32+esp],xmm7
+	pshufd	xmm7,xmm0,19
+	movdqa	[48+esp],xmm1
+	paddq	xmm1,xmm1
+	pand	xmm7,xmm3
+	pxor	xmm7,xmm1
+	movdqu	xmm2,[esi]
+	movdqu	xmm3,[16+esi]
+	movdqu	xmm4,[32+esi]
+	pxor	xmm2,[esp]
+	movdqu	xmm5,[48+esi]
+	pxor	xmm3,[16+esp]
+	movdqu	xmm6,[64+esi]
+	pxor	xmm4,[32+esp]
+	lea	esi,[80+esi]
+	pxor	xmm5,[48+esp]
+	movdqa	[64+esp],xmm7
+	pxor	xmm6,xmm7
+	call	__aesni_encrypt6
+	movaps	xmm1,[64+esp]
+	xorps	xmm2,[esp]
+	xorps	xmm3,[16+esp]
+	xorps	xmm4,[32+esp]
+	movups	[edi],xmm2
+	xorps	xmm5,[48+esp]
+	movups	[16+edi],xmm3
+	xorps	xmm6,xmm1
+	movups	[32+edi],xmm4
+	movups	[48+edi],xmm5
+	movups	[64+edi],xmm6
+	lea	edi,[80+edi]
+	jmp	NEAR L$056xts_enc_done
+align	16
+L$052xts_enc_one:
+	movups	xmm2,[esi]
+	lea	esi,[16+esi]
+	xorps	xmm2,xmm5
+	movups	xmm0,[edx]
+	movups	xmm1,[16+edx]
+	lea	edx,[32+edx]
+	xorps	xmm2,xmm0
+L$057enc1_loop_9:
+db	102,15,56,220,209
+	dec	ecx
+	movups	xmm1,[edx]
+	lea	edx,[16+edx]
+	jnz	NEAR L$057enc1_loop_9
+db	102,15,56,221,209
+	xorps	xmm2,xmm5
+	movups	[edi],xmm2
+	lea	edi,[16+edi]
+	movdqa	xmm1,xmm5
+	jmp	NEAR L$056xts_enc_done
+align	16
+L$053xts_enc_two:
+	movaps	xmm6,xmm1
+	movups	xmm2,[esi]
+	movups	xmm3,[16+esi]
+	lea	esi,[32+esi]
+	xorps	xmm2,xmm5
+	xorps	xmm3,xmm6
+	call	__aesni_encrypt2
+	xorps	xmm2,xmm5
+	xorps	xmm3,xmm6
+	movups	[edi],xmm2
+	movups	[16+edi],xmm3
+	lea	edi,[32+edi]
+	movdqa	xmm1,xmm6
+	jmp	NEAR L$056xts_enc_done
+align	16
+L$054xts_enc_three:
+	movaps	xmm7,xmm1
+	movups	xmm2,[esi]
+	movups	xmm3,[16+esi]
+	movups	xmm4,[32+esi]
+	lea	esi,[48+esi]
+	xorps	xmm2,xmm5
+	xorps	xmm3,xmm6
+	xorps	xmm4,xmm7
+	call	__aesni_encrypt3
+	xorps	xmm2,xmm5
+	xorps	xmm3,xmm6
+	xorps	xmm4,xmm7
+	movups	[edi],xmm2
+	movups	[16+edi],xmm3
+	movups	[32+edi],xmm4
+	lea	edi,[48+edi]
+	movdqa	xmm1,xmm7
+	jmp	NEAR L$056xts_enc_done
+align	16
+L$055xts_enc_four:
+	movaps	xmm6,xmm1
+	movups	xmm2,[esi]
+	movups	xmm3,[16+esi]
+	movups	xmm4,[32+esi]
+	xorps	xmm2,[esp]
+	movups	xmm5,[48+esi]
+	lea	esi,[64+esi]
+	xorps	xmm3,[16+esp]
+	xorps	xmm4,xmm7
+	xorps	xmm5,xmm6
+	call	__aesni_encrypt4
+	xorps	xmm2,[esp]
+	xorps	xmm3,[16+esp]
+	xorps	xmm4,xmm7
+	movups	[edi],xmm2
+	xorps	xmm5,xmm6
+	movups	[16+edi],xmm3
+	movups	[32+edi],xmm4
+	movups	[48+edi],xmm5
+	lea	edi,[64+edi]
+	movdqa	xmm1,xmm6
+	jmp	NEAR L$056xts_enc_done
+align	16
+L$051xts_enc_done6x:
+	mov	eax,DWORD [112+esp]
+	and	eax,15
+	jz	NEAR L$058xts_enc_ret
+	movdqa	xmm5,xmm1
+	mov	DWORD [112+esp],eax
+	jmp	NEAR L$059xts_enc_steal
+align	16
+L$056xts_enc_done:
+	mov	eax,DWORD [112+esp]
+	pxor	xmm0,xmm0
+	and	eax,15
+	jz	NEAR L$058xts_enc_ret
+	pcmpgtd	xmm0,xmm1
+	mov	DWORD [112+esp],eax
+	pshufd	xmm5,xmm0,19
+	paddq	xmm1,xmm1
+	pand	xmm5,[96+esp]
+	pxor	xmm5,xmm1
+L$059xts_enc_steal:
+	movzx	ecx,BYTE [esi]
+	movzx	edx,BYTE [edi-16]
+	lea	esi,[1+esi]
+	mov	BYTE [edi-16],cl
+	mov	BYTE [edi],dl
+	lea	edi,[1+edi]
+	sub	eax,1
+	jnz	NEAR L$059xts_enc_steal
+	sub	edi,DWORD [112+esp]
+	mov	edx,ebp
+	mov	ecx,ebx
+	movups	xmm2,[edi-16]
+	xorps	xmm2,xmm5
+	movups	xmm0,[edx]
+	movups	xmm1,[16+edx]
+	lea	edx,[32+edx]
+	xorps	xmm2,xmm0
+L$060enc1_loop_10:
+db	102,15,56,220,209
+	dec	ecx
+	movups	xmm1,[edx]
+	lea	edx,[16+edx]
+	jnz	NEAR L$060enc1_loop_10
+db	102,15,56,221,209
+	xorps	xmm2,xmm5
+	movups	[edi-16],xmm2
+L$058xts_enc_ret:
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	pxor	xmm2,xmm2
+	movdqa	[esp],xmm0
+	pxor	xmm3,xmm3
+	movdqa	[16+esp],xmm0
+	pxor	xmm4,xmm4
+	movdqa	[32+esp],xmm0
+	pxor	xmm5,xmm5
+	movdqa	[48+esp],xmm0
+	pxor	xmm6,xmm6
+	movdqa	[64+esp],xmm0
+	pxor	xmm7,xmm7
+	movdqa	[80+esp],xmm0
+	mov	esp,DWORD [116+esp]
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+global	_aes_hw_xts_decrypt
+align	16
+_aes_hw_xts_decrypt:
+L$_aes_hw_xts_decrypt_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	mov	edx,DWORD [36+esp]
+	mov	esi,DWORD [40+esp]
+	mov	ecx,DWORD [240+edx]
+	movups	xmm2,[esi]
+	movups	xmm0,[edx]
+	movups	xmm1,[16+edx]
+	lea	edx,[32+edx]
+	xorps	xmm2,xmm0
+L$061enc1_loop_11:
+db	102,15,56,220,209
+	dec	ecx
+	movups	xmm1,[edx]
+	lea	edx,[16+edx]
+	jnz	NEAR L$061enc1_loop_11
+db	102,15,56,221,209
+	mov	esi,DWORD [20+esp]
+	mov	edi,DWORD [24+esp]
+	mov	eax,DWORD [28+esp]
+	mov	edx,DWORD [32+esp]
+	mov	ebp,esp
+	sub	esp,120
+	and	esp,-16
+	xor	ebx,ebx
+	test	eax,15
+	setnz	bl
+	shl	ebx,4
+	sub	eax,ebx
+	mov	DWORD [96+esp],135
+	mov	DWORD [100+esp],0
+	mov	DWORD [104+esp],1
+	mov	DWORD [108+esp],0
+	mov	DWORD [112+esp],eax
+	mov	DWORD [116+esp],ebp
+	mov	ecx,DWORD [240+edx]
+	mov	ebp,edx
+	mov	ebx,ecx
+	movdqa	xmm1,xmm2
+	pxor	xmm0,xmm0
+	movdqa	xmm3,[96+esp]
+	pcmpgtd	xmm0,xmm1
+	and	eax,-16
+	sub	eax,96
+	jc	NEAR L$062xts_dec_short
+	shl	ecx,4
+	mov	ebx,16
+	sub	ebx,ecx
+	lea	edx,[32+ecx*1+edx]
+	jmp	NEAR L$063xts_dec_loop6
+align	16
+L$063xts_dec_loop6:
+	pshufd	xmm2,xmm0,19
+	pxor	xmm0,xmm0
+	movdqa	[esp],xmm1
+	paddq	xmm1,xmm1
+	pand	xmm2,xmm3
+	pcmpgtd	xmm0,xmm1
+	pxor	xmm1,xmm2
+	pshufd	xmm2,xmm0,19
+	pxor	xmm0,xmm0
+	movdqa	[16+esp],xmm1
+	paddq	xmm1,xmm1
+	pand	xmm2,xmm3
+	pcmpgtd	xmm0,xmm1
+	pxor	xmm1,xmm2
+	pshufd	xmm2,xmm0,19
+	pxor	xmm0,xmm0
+	movdqa	[32+esp],xmm1
+	paddq	xmm1,xmm1
+	pand	xmm2,xmm3
+	pcmpgtd	xmm0,xmm1
+	pxor	xmm1,xmm2
+	pshufd	xmm2,xmm0,19
+	pxor	xmm0,xmm0
+	movdqa	[48+esp],xmm1
+	paddq	xmm1,xmm1
+	pand	xmm2,xmm3
+	pcmpgtd	xmm0,xmm1
+	pxor	xmm1,xmm2
+	pshufd	xmm7,xmm0,19
+	movdqa	[64+esp],xmm1
+	paddq	xmm1,xmm1
+	movups	xmm0,[ebp]
+	pand	xmm7,xmm3
+	movups	xmm2,[esi]
+	pxor	xmm7,xmm1
+	mov	ecx,ebx
+	movdqu	xmm3,[16+esi]
+	xorps	xmm2,xmm0
+	movdqu	xmm4,[32+esi]
+	pxor	xmm3,xmm0
+	movdqu	xmm5,[48+esi]
+	pxor	xmm4,xmm0
+	movdqu	xmm6,[64+esi]
+	pxor	xmm5,xmm0
+	movdqu	xmm1,[80+esi]
+	pxor	xmm6,xmm0
+	lea	esi,[96+esi]
+	pxor	xmm2,[esp]
+	movdqa	[80+esp],xmm7
+	pxor	xmm7,xmm1
+	movups	xmm1,[16+ebp]
+	pxor	xmm3,[16+esp]
+	pxor	xmm4,[32+esp]
+db	102,15,56,222,209
+	pxor	xmm5,[48+esp]
+	pxor	xmm6,[64+esp]
+db	102,15,56,222,217
+	pxor	xmm7,xmm0
+	movups	xmm0,[32+ebp]
+db	102,15,56,222,225
+db	102,15,56,222,233
+db	102,15,56,222,241
+db	102,15,56,222,249
+	call	L$_aesni_decrypt6_enter
+	movdqa	xmm1,[80+esp]
+	pxor	xmm0,xmm0
+	xorps	xmm2,[esp]
+	pcmpgtd	xmm0,xmm1
+	xorps	xmm3,[16+esp]
+	movups	[edi],xmm2
+	xorps	xmm4,[32+esp]
+	movups	[16+edi],xmm3
+	xorps	xmm5,[48+esp]
+	movups	[32+edi],xmm4
+	xorps	xmm6,[64+esp]
+	movups	[48+edi],xmm5
+	xorps	xmm7,xmm1
+	movups	[64+edi],xmm6
+	pshufd	xmm2,xmm0,19
+	movups	[80+edi],xmm7
+	lea	edi,[96+edi]
+	movdqa	xmm3,[96+esp]
+	pxor	xmm0,xmm0
+	paddq	xmm1,xmm1
+	pand	xmm2,xmm3
+	pcmpgtd	xmm0,xmm1
+	pxor	xmm1,xmm2
+	sub	eax,96
+	jnc	NEAR L$063xts_dec_loop6
+	mov	ecx,DWORD [240+ebp]
+	mov	edx,ebp
+	mov	ebx,ecx
+L$062xts_dec_short:
+	add	eax,96
+	jz	NEAR L$064xts_dec_done6x
+	movdqa	xmm5,xmm1
+	cmp	eax,32
+	jb	NEAR L$065xts_dec_one
+	pshufd	xmm2,xmm0,19
+	pxor	xmm0,xmm0
+	paddq	xmm1,xmm1
+	pand	xmm2,xmm3
+	pcmpgtd	xmm0,xmm1
+	pxor	xmm1,xmm2
+	je	NEAR L$066xts_dec_two
+	pshufd	xmm2,xmm0,19
+	pxor	xmm0,xmm0
+	movdqa	xmm6,xmm1
+	paddq	xmm1,xmm1
+	pand	xmm2,xmm3
+	pcmpgtd	xmm0,xmm1
+	pxor	xmm1,xmm2
+	cmp	eax,64
+	jb	NEAR L$067xts_dec_three
+	pshufd	xmm2,xmm0,19
+	pxor	xmm0,xmm0
+	movdqa	xmm7,xmm1
+	paddq	xmm1,xmm1
+	pand	xmm2,xmm3
+	pcmpgtd	xmm0,xmm1
+	pxor	xmm1,xmm2
+	movdqa	[esp],xmm5
+	movdqa	[16+esp],xmm6
+	je	NEAR L$068xts_dec_four
+	movdqa	[32+esp],xmm7
+	pshufd	xmm7,xmm0,19
+	movdqa	[48+esp],xmm1
+	paddq	xmm1,xmm1
+	pand	xmm7,xmm3
+	pxor	xmm7,xmm1
+	movdqu	xmm2,[esi]
+	movdqu	xmm3,[16+esi]
+	movdqu	xmm4,[32+esi]
+	pxor	xmm2,[esp]
+	movdqu	xmm5,[48+esi]
+	pxor	xmm3,[16+esp]
+	movdqu	xmm6,[64+esi]
+	pxor	xmm4,[32+esp]
+	lea	esi,[80+esi]
+	pxor	xmm5,[48+esp]
+	movdqa	[64+esp],xmm7
+	pxor	xmm6,xmm7
+	call	__aesni_decrypt6
+	movaps	xmm1,[64+esp]
+	xorps	xmm2,[esp]
+	xorps	xmm3,[16+esp]
+	xorps	xmm4,[32+esp]
+	movups	[edi],xmm2
+	xorps	xmm5,[48+esp]
+	movups	[16+edi],xmm3
+	xorps	xmm6,xmm1
+	movups	[32+edi],xmm4
+	movups	[48+edi],xmm5
+	movups	[64+edi],xmm6
+	lea	edi,[80+edi]
+	jmp	NEAR L$069xts_dec_done
+align	16
+L$065xts_dec_one:
+	movups	xmm2,[esi]
+	lea	esi,[16+esi]
+	xorps	xmm2,xmm5
+	movups	xmm0,[edx]
+	movups	xmm1,[16+edx]
+	lea	edx,[32+edx]
+	xorps	xmm2,xmm0
+L$070dec1_loop_12:
+db	102,15,56,222,209
+	dec	ecx
+	movups	xmm1,[edx]
+	lea	edx,[16+edx]
+	jnz	NEAR L$070dec1_loop_12
+db	102,15,56,223,209
+	xorps	xmm2,xmm5
+	movups	[edi],xmm2
+	lea	edi,[16+edi]
+	movdqa	xmm1,xmm5
+	jmp	NEAR L$069xts_dec_done
+align	16
+L$066xts_dec_two:
+	movaps	xmm6,xmm1
+	movups	xmm2,[esi]
+	movups	xmm3,[16+esi]
+	lea	esi,[32+esi]
+	xorps	xmm2,xmm5
+	xorps	xmm3,xmm6
+	call	__aesni_decrypt2
+	xorps	xmm2,xmm5
+	xorps	xmm3,xmm6
+	movups	[edi],xmm2
+	movups	[16+edi],xmm3
+	lea	edi,[32+edi]
+	movdqa	xmm1,xmm6
+	jmp	NEAR L$069xts_dec_done
+align	16
+L$067xts_dec_three:
+	movaps	xmm7,xmm1
+	movups	xmm2,[esi]
+	movups	xmm3,[16+esi]
+	movups	xmm4,[32+esi]
+	lea	esi,[48+esi]
+	xorps	xmm2,xmm5
+	xorps	xmm3,xmm6
+	xorps	xmm4,xmm7
+	call	__aesni_decrypt3
+	xorps	xmm2,xmm5
+	xorps	xmm3,xmm6
+	xorps	xmm4,xmm7
+	movups	[edi],xmm2
+	movups	[16+edi],xmm3
+	movups	[32+edi],xmm4
+	lea	edi,[48+edi]
+	movdqa	xmm1,xmm7
+	jmp	NEAR L$069xts_dec_done
+align	16
+L$068xts_dec_four:
+	movaps	xmm6,xmm1
+	movups	xmm2,[esi]
+	movups	xmm3,[16+esi]
+	movups	xmm4,[32+esi]
+	xorps	xmm2,[esp]
+	movups	xmm5,[48+esi]
+	lea	esi,[64+esi]
+	xorps	xmm3,[16+esp]
+	xorps	xmm4,xmm7
+	xorps	xmm5,xmm6
+	call	__aesni_decrypt4
+	xorps	xmm2,[esp]
+	xorps	xmm3,[16+esp]
+	xorps	xmm4,xmm7
+	movups	[edi],xmm2
+	xorps	xmm5,xmm6
+	movups	[16+edi],xmm3
+	movups	[32+edi],xmm4
+	movups	[48+edi],xmm5
+	lea	edi,[64+edi]
+	movdqa	xmm1,xmm6
+	jmp	NEAR L$069xts_dec_done
+align	16
+L$064xts_dec_done6x:
+	mov	eax,DWORD [112+esp]
+	and	eax,15
+	jz	NEAR L$071xts_dec_ret
+	mov	DWORD [112+esp],eax
+	jmp	NEAR L$072xts_dec_only_one_more
+align	16
+L$069xts_dec_done:
+	mov	eax,DWORD [112+esp]
+	pxor	xmm0,xmm0
+	and	eax,15
+	jz	NEAR L$071xts_dec_ret
+	pcmpgtd	xmm0,xmm1
+	mov	DWORD [112+esp],eax
+	pshufd	xmm2,xmm0,19
+	pxor	xmm0,xmm0
+	movdqa	xmm3,[96+esp]
+	paddq	xmm1,xmm1
+	pand	xmm2,xmm3
+	pcmpgtd	xmm0,xmm1
+	pxor	xmm1,xmm2
+L$072xts_dec_only_one_more:
+	pshufd	xmm5,xmm0,19
+	movdqa	xmm6,xmm1
+	paddq	xmm1,xmm1
+	pand	xmm5,xmm3
+	pxor	xmm5,xmm1
+	mov	edx,ebp
+	mov	ecx,ebx
+	movups	xmm2,[esi]
+	xorps	xmm2,xmm5
+	movups	xmm0,[edx]
+	movups	xmm1,[16+edx]
+	lea	edx,[32+edx]
+	xorps	xmm2,xmm0
+L$073dec1_loop_13:
+db	102,15,56,222,209
+	dec	ecx
+	movups	xmm1,[edx]
+	lea	edx,[16+edx]
+	jnz	NEAR L$073dec1_loop_13
+db	102,15,56,223,209
+	xorps	xmm2,xmm5
+	movups	[edi],xmm2
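+; Ciphertext stealing (decrypt side): the last full block was decrypted with
+; the later tweak (xmm5) above; the byte loop below swaps its head with the
+; partial input tail, and the merged block is decrypted with the earlier
+; tweak (xmm6).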
+L$074xts_dec_steal:
+	movzx	ecx,BYTE [16+esi]
+	movzx	edx,BYTE [edi]
+	lea	esi,[1+esi]
+	mov	BYTE [edi],cl
+	mov	BYTE [16+edi],dl
+	lea	edi,[1+edi]
+	sub	eax,1
+	jnz	NEAR L$074xts_dec_steal
+	sub	edi,DWORD [112+esp]
+	mov	edx,ebp
+	mov	ecx,ebx
+	movups	xmm2,[edi]
+	xorps	xmm2,xmm6
+	movups	xmm0,[edx]
+	movups	xmm1,[16+edx]
+	lea	edx,[32+edx]
+	xorps	xmm2,xmm0
+L$075dec1_loop_14:
+db	102,15,56,222,209
+	dec	ecx
+	movups	xmm1,[edx]
+	lea	edx,[16+edx]
+	jnz	NEAR L$075dec1_loop_14
+db	102,15,56,223,209
+	xorps	xmm2,xmm6
+	movups	[edi],xmm2
+L$071xts_dec_ret:
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	pxor	xmm2,xmm2
+	movdqa	[esp],xmm0
+	pxor	xmm3,xmm3
+	movdqa	[16+esp],xmm0
+	pxor	xmm4,xmm4
+	movdqa	[32+esp],xmm0
+	pxor	xmm5,xmm5
+	movdqa	[48+esp],xmm0
+	pxor	xmm6,xmm6
+	movdqa	[64+esp],xmm0
+	pxor	xmm7,xmm7
+	movdqa	[80+esp],xmm0
+	mov	esp,DWORD [116+esp]
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+global	_aes_hw_cbc_encrypt
+align	16
+_aes_hw_cbc_encrypt:
+L$_aes_hw_cbc_encrypt_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	mov	esi,DWORD [20+esp]
+	mov	ebx,esp
+	mov	edi,DWORD [24+esp]
+	sub	ebx,24
+	mov	eax,DWORD [28+esp]
+	and	ebx,-16
+	mov	edx,DWORD [32+esp]
+	mov	ebp,DWORD [36+esp]
+	test	eax,eax
+	jz	NEAR L$076cbc_abort
+	cmp	DWORD [40+esp],0
+	xchg	ebx,esp
+	movups	xmm7,[ebp]
+	mov	ecx,DWORD [240+edx]
+	mov	ebp,edx
+	mov	DWORD [16+esp],ebx
+	mov	ebx,ecx
+	je	NEAR L$077cbc_decrypt
+	movaps	xmm2,xmm7
+	cmp	eax,16
+	jb	NEAR L$078cbc_enc_tail
+	sub	eax,16
+	jmp	NEAR L$079cbc_enc_loop
+align	16
+L$079cbc_enc_loop:
+	movups	xmm7,[esi]
+	lea	esi,[16+esi]
+	movups	xmm0,[edx]
+	movups	xmm1,[16+edx]
+	xorps	xmm7,xmm0
+	lea	edx,[32+edx]
+	xorps	xmm2,xmm7
+L$080enc1_loop_15:
+db	102,15,56,220,209
+	dec	ecx
+	movups	xmm1,[edx]
+	lea	edx,[16+edx]
+	jnz	NEAR L$080enc1_loop_15
+db	102,15,56,221,209
+	mov	ecx,ebx
+	mov	edx,ebp
+	movups	[edi],xmm2
+	lea	edi,[16+edi]
+	sub	eax,16
+	jnc	NEAR L$079cbc_enc_loop
+	add	eax,16
+	jnz	NEAR L$078cbc_enc_tail
+	movaps	xmm7,xmm2
+	pxor	xmm2,xmm2
+	jmp	NEAR L$081cbc_ret
+L$078cbc_enc_tail:
+	mov	ecx,eax
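+; dd 2767451785 (0xA4F3F689) embeds rep movsb to copy the partial block;
+; dd 2868115081 (0xAAF3F689) below embeds rep stosb to zero-pad the rest.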
+dd	2767451785
+	mov	ecx,16
+	sub	ecx,eax
+	xor	eax,eax
+dd	2868115081
+	lea	edi,[edi-16]
+	mov	ecx,ebx
+	mov	esi,edi
+	mov	edx,ebp
+	jmp	NEAR L$079cbc_enc_loop
+align	16
+L$077cbc_decrypt:
+	cmp	eax,80
+	jbe	NEAR L$082cbc_dec_tail
+	movaps	[esp],xmm7
+	sub	eax,80
+	jmp	NEAR L$083cbc_dec_loop6_enter
+align	16
+L$084cbc_dec_loop6:
+	movaps	[esp],xmm0
+	movups	[edi],xmm7
+	lea	edi,[16+edi]
+L$083cbc_dec_loop6_enter:
+	movdqu	xmm2,[esi]
+	movdqu	xmm3,[16+esi]
+	movdqu	xmm4,[32+esi]
+	movdqu	xmm5,[48+esi]
+	movdqu	xmm6,[64+esi]
+	movdqu	xmm7,[80+esi]
+	call	__aesni_decrypt6
+	movups	xmm1,[esi]
+	movups	xmm0,[16+esi]
+	xorps	xmm2,[esp]
+	xorps	xmm3,xmm1
+	movups	xmm1,[32+esi]
+	xorps	xmm4,xmm0
+	movups	xmm0,[48+esi]
+	xorps	xmm5,xmm1
+	movups	xmm1,[64+esi]
+	xorps	xmm6,xmm0
+	movups	xmm0,[80+esi]
+	xorps	xmm7,xmm1
+	movups	[edi],xmm2
+	movups	[16+edi],xmm3
+	lea	esi,[96+esi]
+	movups	[32+edi],xmm4
+	mov	ecx,ebx
+	movups	[48+edi],xmm5
+	mov	edx,ebp
+	movups	[64+edi],xmm6
+	lea	edi,[80+edi]
+	sub	eax,96
+	ja	NEAR L$084cbc_dec_loop6
+	movaps	xmm2,xmm7
+	movaps	xmm7,xmm0
+	add	eax,80
+	jle	NEAR L$085cbc_dec_clear_tail_collected
+	movups	[edi],xmm2
+	lea	edi,[16+edi]
+L$082cbc_dec_tail:
+	movups	xmm2,[esi]
+	movaps	xmm6,xmm2
+	cmp	eax,16
+	jbe	NEAR L$086cbc_dec_one
+	movups	xmm3,[16+esi]
+	movaps	xmm5,xmm3
+	cmp	eax,32
+	jbe	NEAR L$087cbc_dec_two
+	movups	xmm4,[32+esi]
+	cmp	eax,48
+	jbe	NEAR L$088cbc_dec_three
+	movups	xmm5,[48+esi]
+	cmp	eax,64
+	jbe	NEAR L$089cbc_dec_four
+	movups	xmm6,[64+esi]
+	movaps	[esp],xmm7
+	movups	xmm2,[esi]
+	xorps	xmm7,xmm7
+	call	__aesni_decrypt6
+	movups	xmm1,[esi]
+	movups	xmm0,[16+esi]
+	xorps	xmm2,[esp]
+	xorps	xmm3,xmm1
+	movups	xmm1,[32+esi]
+	xorps	xmm4,xmm0
+	movups	xmm0,[48+esi]
+	xorps	xmm5,xmm1
+	movups	xmm7,[64+esi]
+	xorps	xmm6,xmm0
+	movups	[edi],xmm2
+	movups	[16+edi],xmm3
+	pxor	xmm3,xmm3
+	movups	[32+edi],xmm4
+	pxor	xmm4,xmm4
+	movups	[48+edi],xmm5
+	pxor	xmm5,xmm5
+	lea	edi,[64+edi]
+	movaps	xmm2,xmm6
+	pxor	xmm6,xmm6
+	sub	eax,80
+	jmp	NEAR L$090cbc_dec_tail_collected
+align	16
+L$086cbc_dec_one:
+	movups	xmm0,[edx]
+	movups	xmm1,[16+edx]
+	lea	edx,[32+edx]
+	xorps	xmm2,xmm0
+L$091dec1_loop_16:
+db	102,15,56,222,209
+	dec	ecx
+	movups	xmm1,[edx]
+	lea	edx,[16+edx]
+	jnz	NEAR L$091dec1_loop_16
+db	102,15,56,223,209
+	xorps	xmm2,xmm7
+	movaps	xmm7,xmm6
+	sub	eax,16
+	jmp	NEAR L$090cbc_dec_tail_collected
+align	16
+L$087cbc_dec_two:
+	call	__aesni_decrypt2
+	xorps	xmm2,xmm7
+	xorps	xmm3,xmm6
+	movups	[edi],xmm2
+	movaps	xmm2,xmm3
+	pxor	xmm3,xmm3
+	lea	edi,[16+edi]
+	movaps	xmm7,xmm5
+	sub	eax,32
+	jmp	NEAR L$090cbc_dec_tail_collected
+align	16
+L$088cbc_dec_three:
+	call	__aesni_decrypt3
+	xorps	xmm2,xmm7
+	xorps	xmm3,xmm6
+	xorps	xmm4,xmm5
+	movups	[edi],xmm2
+	movaps	xmm2,xmm4
+	pxor	xmm4,xmm4
+	movups	[16+edi],xmm3
+	pxor	xmm3,xmm3
+	lea	edi,[32+edi]
+	movups	xmm7,[32+esi]
+	sub	eax,48
+	jmp	NEAR L$090cbc_dec_tail_collected
+align	16
+L$089cbc_dec_four:
+	call	__aesni_decrypt4
+	movups	xmm1,[16+esi]
+	movups	xmm0,[32+esi]
+	xorps	xmm2,xmm7
+	movups	xmm7,[48+esi]
+	xorps	xmm3,xmm6
+	movups	[edi],xmm2
+	xorps	xmm4,xmm1
+	movups	[16+edi],xmm3
+	pxor	xmm3,xmm3
+	xorps	xmm5,xmm0
+	movups	[32+edi],xmm4
+	pxor	xmm4,xmm4
+	lea	edi,[48+edi]
+	movaps	xmm2,xmm5
+	pxor	xmm5,xmm5
+	sub	eax,64
+	jmp	NEAR L$090cbc_dec_tail_collected
+align	16
+L$085cbc_dec_clear_tail_collected:
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	pxor	xmm6,xmm6
+L$090cbc_dec_tail_collected:
+	and	eax,15
+	jnz	NEAR L$092cbc_dec_tail_partial
+	movups	[edi],xmm2
+	pxor	xmm0,xmm0
+	jmp	NEAR L$081cbc_ret
+align	16
+L$092cbc_dec_tail_partial:
+	movaps	[esp],xmm2
+	pxor	xmm0,xmm0
+	mov	ecx,16
+	mov	esi,esp
+	sub	ecx,eax
+dd	2767451785
+	movdqa	[esp],xmm2
+L$081cbc_ret:
+	mov	esp,DWORD [16+esp]
+	mov	ebp,DWORD [36+esp]
+	pxor	xmm2,xmm2
+	pxor	xmm1,xmm1
+	movups	[ebp],xmm7
+	pxor	xmm7,xmm7
+L$076cbc_abort:
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+align	16
+__aesni_set_encrypt_key:
+	push	ebp
+	push	ebx
+	test	eax,eax
+	jz	NEAR L$093bad_pointer
+	test	edx,edx
+	jz	NEAR L$093bad_pointer
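+; Recover our own address via call/pop to reach L$key_const
+; position-independently (32-bit x86 has no PC-relative data addressing).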
+	call	L$094pic
+L$094pic:
+	pop	ebx
+	lea	ebx,[(L$key_const-L$094pic)+ebx]
+	lea	ebp,[_OPENSSL_ia32cap_P]
+	movups	xmm0,[eax]
+	xorps	xmm4,xmm4
+	mov	ebp,DWORD [4+ebp]
+	lea	edx,[16+edx]
+	and	ebp,268437504
+	cmp	ecx,256
+	je	NEAR L$09514rounds
+	cmp	ecx,192
+	je	NEAR L$09612rounds
+	cmp	ecx,128
+	jne	NEAR L$097bad_keybits
+align	16
+L$09810rounds:
+	cmp	ebp,268435456
+	je	NEAR L$09910rounds_alt
+	mov	ecx,9
+	movups	[edx-16],xmm0
+db	102,15,58,223,200,1
+	call	L$100key_128_cold
+db	102,15,58,223,200,2
+	call	L$101key_128
+db	102,15,58,223,200,4
+	call	L$101key_128
+db	102,15,58,223,200,8
+	call	L$101key_128
+db	102,15,58,223,200,16
+	call	L$101key_128
+db	102,15,58,223,200,32
+	call	L$101key_128
+db	102,15,58,223,200,64
+	call	L$101key_128
+db	102,15,58,223,200,128
+	call	L$101key_128
+db	102,15,58,223,200,27
+	call	L$101key_128
+db	102,15,58,223,200,54
+	call	L$101key_128
+	movups	[edx],xmm0
+	mov	DWORD [80+edx],ecx
+	jmp	NEAR L$102good_key
+align	16
+L$101key_128:
+	movups	[edx],xmm0
+	lea	edx,[16+edx]
+L$100key_128_cold:
+	shufps	xmm4,xmm0,16
+	xorps	xmm0,xmm4
+	shufps	xmm4,xmm0,140
+	xorps	xmm0,xmm4
+	shufps	xmm1,xmm1,255
+	xorps	xmm0,xmm1
+	ret
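+; Alternate schedule, selected by the OPENSSL_ia32cap_P check above: expand
+; the key with pshufb and aesenclast (the db 102,15,56,... sequences) rather
+; than aeskeygenassist.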
+align	16
+L$09910rounds_alt:
+	movdqa	xmm5,[ebx]
+	mov	ecx,8
+	movdqa	xmm4,[32+ebx]
+	movdqa	xmm2,xmm0
+	movdqu	[edx-16],xmm0
+L$103loop_key128:
+db	102,15,56,0,197
+db	102,15,56,221,196
+	pslld	xmm4,1
+	lea	edx,[16+edx]
+	movdqa	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm2,xmm3
+	pxor	xmm0,xmm2
+	movdqu	[edx-16],xmm0
+	movdqa	xmm2,xmm0
+	dec	ecx
+	jnz	NEAR L$103loop_key128
+	movdqa	xmm4,[48+ebx]
+db	102,15,56,0,197
+db	102,15,56,221,196
+	pslld	xmm4,1
+	movdqa	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm2,xmm3
+	pxor	xmm0,xmm2
+	movdqu	[edx],xmm0
+	movdqa	xmm2,xmm0
+db	102,15,56,0,197
+db	102,15,56,221,196
+	movdqa	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm2,xmm3
+	pxor	xmm0,xmm2
+	movdqu	[16+edx],xmm0
+	mov	ecx,9
+	mov	DWORD [96+edx],ecx
+	jmp	NEAR L$102good_key
+align	16
+L$09612rounds:
+	movq	xmm2,[16+eax]
+	cmp	ebp,268435456
+	je	NEAR L$10412rounds_alt
+	mov	ecx,11
+	movups	[edx-16],xmm0
+db	102,15,58,223,202,1
+	call	L$105key_192a_cold
+db	102,15,58,223,202,2
+	call	L$106key_192b
+db	102,15,58,223,202,4
+	call	L$107key_192a
+db	102,15,58,223,202,8
+	call	L$106key_192b
+db	102,15,58,223,202,16
+	call	L$107key_192a
+db	102,15,58,223,202,32
+	call	L$106key_192b
+db	102,15,58,223,202,64
+	call	L$107key_192a
+db	102,15,58,223,202,128
+	call	L$106key_192b
+	movups	[edx],xmm0
+	mov	DWORD [48+edx],ecx
+	jmp	NEAR L$102good_key
+align	16
+L$107key_192a:
+	movups	[edx],xmm0
+	lea	edx,[16+edx]
+align	16
+L$105key_192a_cold:
+	movaps	xmm5,xmm2
+L$108key_192b_warm:
+	shufps	xmm4,xmm0,16
+	movdqa	xmm3,xmm2
+	xorps	xmm0,xmm4
+	shufps	xmm4,xmm0,140
+	pslldq	xmm3,4
+	xorps	xmm0,xmm4
+	pshufd	xmm1,xmm1,85
+	pxor	xmm2,xmm3
+	pxor	xmm0,xmm1
+	pshufd	xmm3,xmm0,255
+	pxor	xmm2,xmm3
+	ret
+align	16
+L$106key_192b:
+	movaps	xmm3,xmm0
+	shufps	xmm5,xmm0,68
+	movups	[edx],xmm5
+	shufps	xmm3,xmm2,78
+	movups	[16+edx],xmm3
+	lea	edx,[32+edx]
+	jmp	NEAR L$108key_192b_warm
+align	16
+L$10412rounds_alt:
+	movdqa	xmm5,[16+ebx]
+	movdqa	xmm4,[32+ebx]
+	mov	ecx,8
+	movdqu	[edx-16],xmm0
+L$109loop_key192:
+	movq	[edx],xmm2
+	movdqa	xmm1,xmm2
+db	102,15,56,0,213
+db	102,15,56,221,212
+	pslld	xmm4,1
+	lea	edx,[24+edx]
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,4
+	pxor	xmm3,xmm0
+	pslldq	xmm0,4
+	pxor	xmm3,xmm0
+	pslldq	xmm0,4
+	pxor	xmm0,xmm3
+	pshufd	xmm3,xmm0,255
+	pxor	xmm3,xmm1
+	pslldq	xmm1,4
+	pxor	xmm3,xmm1
+	pxor	xmm0,xmm2
+	pxor	xmm2,xmm3
+	movdqu	[edx-16],xmm0
+	dec	ecx
+	jnz	NEAR L$109loop_key192
+	mov	ecx,11
+	mov	DWORD [32+edx],ecx
+	jmp	NEAR L$102good_key
+align	16
+L$09514rounds:
+	movups	xmm2,[16+eax]
+	lea	edx,[16+edx]
+	cmp	ebp,268435456
+	je	NEAR L$11014rounds_alt
+	mov	ecx,13
+	movups	[edx-32],xmm0
+	movups	[edx-16],xmm2
+db	102,15,58,223,202,1
+	call	L$111key_256a_cold
+db	102,15,58,223,200,1
+	call	L$112key_256b
+db	102,15,58,223,202,2
+	call	L$113key_256a
+db	102,15,58,223,200,2
+	call	L$112key_256b
+db	102,15,58,223,202,4
+	call	L$113key_256a
+db	102,15,58,223,200,4
+	call	L$112key_256b
+db	102,15,58,223,202,8
+	call	L$113key_256a
+db	102,15,58,223,200,8
+	call	L$112key_256b
+db	102,15,58,223,202,16
+	call	L$113key_256a
+db	102,15,58,223,200,16
+	call	L$112key_256b
+db	102,15,58,223,202,32
+	call	L$113key_256a
+db	102,15,58,223,200,32
+	call	L$112key_256b
+db	102,15,58,223,202,64
+	call	L$113key_256a
+	movups	[edx],xmm0
+	mov	DWORD [16+edx],ecx
+	xor	eax,eax
+	jmp	NEAR L$102good_key
+align	16
+L$113key_256a:
+	movups	[edx],xmm2
+	lea	edx,[16+edx]
+L$111key_256a_cold:
+	shufps	xmm4,xmm0,16
+	xorps	xmm0,xmm4
+	shufps	xmm4,xmm0,140
+	xorps	xmm0,xmm4
+	shufps	xmm1,xmm1,255
+	xorps	xmm0,xmm1
+	ret
+align	16
+L$112key_256b:
+	movups	[edx],xmm0
+	lea	edx,[16+edx]
+	shufps	xmm4,xmm2,16
+	xorps	xmm2,xmm4
+	shufps	xmm4,xmm2,140
+	xorps	xmm2,xmm4
+	shufps	xmm1,xmm1,170
+	xorps	xmm2,xmm1
+	ret
+align	16
+L$11014rounds_alt:
+	movdqa	xmm5,[ebx]
+	movdqa	xmm4,[32+ebx]
+	mov	ecx,7
+	movdqu	[edx-32],xmm0
+	movdqa	xmm1,xmm2
+	movdqu	[edx-16],xmm2
+L$114loop_key256:
+db	102,15,56,0,213
+db	102,15,56,221,212
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,4
+	pxor	xmm3,xmm0
+	pslldq	xmm0,4
+	pxor	xmm3,xmm0
+	pslldq	xmm0,4
+	pxor	xmm0,xmm3
+	pslld	xmm4,1
+	pxor	xmm0,xmm2
+	movdqu	[edx],xmm0
+	dec	ecx
+	jz	NEAR L$115done_key256
+	pshufd	xmm2,xmm0,255
+	pxor	xmm3,xmm3
+db	102,15,56,221,211
+	movdqa	xmm3,xmm1
+	pslldq	xmm1,4
+	pxor	xmm3,xmm1
+	pslldq	xmm1,4
+	pxor	xmm3,xmm1
+	pslldq	xmm1,4
+	pxor	xmm1,xmm3
+	pxor	xmm2,xmm1
+	movdqu	[16+edx],xmm2
+	lea	edx,[32+edx]
+	movdqa	xmm1,xmm2
+	jmp	NEAR L$114loop_key256
+L$115done_key256:
+	mov	ecx,13
+	mov	DWORD [16+edx],ecx
+L$102good_key:
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	xor	eax,eax
+	pop	ebx
+	pop	ebp
+	ret
+align	4
+L$093bad_pointer:
+	mov	eax,-1
+	pop	ebx
+	pop	ebp
+	ret
+align	4
+L$097bad_keybits:
+	pxor	xmm0,xmm0
+	mov	eax,-2
+	pop	ebx
+	pop	ebp
+	ret
+global	_aes_hw_set_encrypt_key
+align	16
+_aes_hw_set_encrypt_key:
+L$_aes_hw_set_encrypt_key_begin:
+%ifdef BORINGSSL_DISPATCH_TEST
+	push	ebx
+	push	edx
+	call	L$116pic
+L$116pic:
+	pop	ebx
+	lea	ebx,[(_BORINGSSL_function_hit+3-L$116pic)+ebx]
+	mov	edx,1
+	mov	BYTE [ebx],dl
+	pop	edx
+	pop	ebx
+%endif
+	mov	eax,DWORD [4+esp]
+	mov	ecx,DWORD [8+esp]
+	mov	edx,DWORD [12+esp]
+	call	__aesni_set_encrypt_key
+	ret
+global	_aes_hw_set_decrypt_key
+align	16
+_aes_hw_set_decrypt_key:
+L$_aes_hw_set_decrypt_key_begin:
+	mov	eax,DWORD [4+esp]
+	mov	ecx,DWORD [8+esp]
+	mov	edx,DWORD [12+esp]
+	call	__aesni_set_encrypt_key
+	mov	edx,DWORD [12+esp]
+	shl	ecx,4
+	test	eax,eax
+	jnz	NEAR L$117dec_key_ret
+	lea	eax,[16+ecx*1+edx]
+	movups	xmm0,[edx]
+	movups	xmm1,[eax]
+	movups	[eax],xmm0
+	movups	[edx],xmm1
+	lea	edx,[16+edx]
+	lea	eax,[eax-16]
+L$118dec_key_inverse:
+	movups	xmm0,[edx]
+	movups	xmm1,[eax]
+db	102,15,56,219,192
+db	102,15,56,219,201
+	lea	edx,[16+edx]
+	lea	eax,[eax-16]
+	movups	[16+eax],xmm0
+	movups	[edx-16],xmm1
+	cmp	eax,edx
+	ja	NEAR L$118dec_key_inverse
+	movups	xmm0,[edx]
+db	102,15,56,219,192
+	movups	[edx],xmm0
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	xor	eax,eax
+L$117dec_key_ret:
+	ret
+align	64
+L$key_const:
+dd	202313229,202313229,202313229,202313229
+dd	67569157,67569157,67569157,67569157
+dd	1,1,1,1
+dd	27,27,27,27
+db	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
+db	83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
+db	32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
+db	115,108,46,111,114,103,62,0
+segment	.bss
+common	_OPENSSL_ia32cap_P 16
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/aesni-x86_64-apple.S b/gen/bcm/aesni-x86_64-apple.S
new file mode 100644
index 0000000..f3505b9
--- /dev/null
+++ b/gen/bcm/aesni-x86_64-apple.S
@@ -0,0 +1,2359 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text	
+
+.globl	_aes_hw_encrypt
+.private_extern _aes_hw_encrypt
+
+.p2align	4
+_aes_hw_encrypt:
+
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+
+	movb	$1,_BORINGSSL_function_hit+1(%rip)
+#endif
+	movups	(%rdi),%xmm2
+	movl	240(%rdx),%eax
+	movups	(%rdx),%xmm0
+	movups	16(%rdx),%xmm1
+	leaq	32(%rdx),%rdx
+	xorps	%xmm0,%xmm2
+L$oop_enc1_1:
+.byte	102,15,56,220,209
+	decl	%eax
+	movups	(%rdx),%xmm1
+	leaq	16(%rdx),%rdx
+	jnz	L$oop_enc1_1
+.byte	102,15,56,221,209
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	ret
+
+
+
+.globl	_aes_hw_decrypt
+.private_extern _aes_hw_decrypt
+
+.p2align	4
+_aes_hw_decrypt:
+
+_CET_ENDBR
+	movups	(%rdi),%xmm2
+	movl	240(%rdx),%eax
+	movups	(%rdx),%xmm0
+	movups	16(%rdx),%xmm1
+	leaq	32(%rdx),%rdx
+	xorps	%xmm0,%xmm2
+L$oop_dec1_2:
+.byte	102,15,56,222,209
+	decl	%eax
+	movups	(%rdx),%xmm1
+	leaq	16(%rdx),%rdx
+	jnz	L$oop_dec1_2
+.byte	102,15,56,223,209
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	ret
+
+
+
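+// The _aesni_encryptN/_aesni_decryptN helpers below run N blocks through the
+// round loop in parallel; %eax carries the round count and %rcx the key
+// schedule. The .byte runs encode aesenc/aesenclast (0x38,0xDC/0xDD) and
+// aesdec/aesdeclast (0x38,0xDE/0xDF) so the code assembles on toolchains
+// that predate the AES-NI mnemonics.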
+.p2align	4
+_aesni_encrypt2:
+
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	movups	32(%rcx),%xmm0
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+	addq	$16,%rax
+
+L$enc_loop2:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	L$enc_loop2
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+	ret
+
+
+
+.p2align	4
+_aesni_decrypt2:
+
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	movups	32(%rcx),%xmm0
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+	addq	$16,%rax
+
+L$dec_loop2:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	L$dec_loop2
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+	ret
+
+
+
+.p2align	4
+_aesni_encrypt3:
+
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	xorps	%xmm0,%xmm4
+	movups	32(%rcx),%xmm0
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+	addq	$16,%rax
+
+L$enc_loop3:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	L$enc_loop3
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+.byte	102,15,56,221,224
+	ret
+
+
+
+.p2align	4
+_aesni_decrypt3:
+
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	xorps	%xmm0,%xmm4
+	movups	32(%rcx),%xmm0
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+	addq	$16,%rax
+
+L$dec_loop3:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	L$dec_loop3
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+.byte	102,15,56,223,224
+	ret
+
+
+
+.p2align	4
+_aesni_encrypt4:
+
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	xorps	%xmm0,%xmm4
+	xorps	%xmm0,%xmm5
+	movups	32(%rcx),%xmm0
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+.byte	0x0f,0x1f,0x00
+	addq	$16,%rax
+
+L$enc_loop4:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	L$enc_loop4
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+.byte	102,15,56,221,224
+.byte	102,15,56,221,232
+	ret
+
+
+
+.p2align	4
+_aesni_decrypt4:
+
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	xorps	%xmm0,%xmm4
+	xorps	%xmm0,%xmm5
+	movups	32(%rcx),%xmm0
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+.byte	0x0f,0x1f,0x00
+	addq	$16,%rax
+
+L$dec_loop4:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	L$dec_loop4
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+.byte	102,15,56,223,224
+.byte	102,15,56,223,232
+	ret
+
+
+
+.p2align	4
+_aesni_encrypt6:
+
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+.byte	102,15,56,220,209
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+.byte	102,15,56,220,217
+	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
+.byte	102,15,56,220,225
+	pxor	%xmm0,%xmm7
+	movups	(%rcx,%rax,1),%xmm0
+	addq	$16,%rax
+	jmp	L$enc_loop6_enter
+.p2align	4
+L$enc_loop6:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+L$enc_loop6_enter:
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	L$enc_loop6
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+.byte	102,15,56,221,224
+.byte	102,15,56,221,232
+.byte	102,15,56,221,240
+.byte	102,15,56,221,248
+	ret
+
+
+
+.p2align	4
+_aesni_decrypt6:
+
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+.byte	102,15,56,222,209
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+.byte	102,15,56,222,217
+	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
+.byte	102,15,56,222,225
+	pxor	%xmm0,%xmm7
+	movups	(%rcx,%rax,1),%xmm0
+	addq	$16,%rax
+	jmp	L$dec_loop6_enter
+.p2align	4
+L$dec_loop6:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+L$dec_loop6_enter:
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	L$dec_loop6
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+.byte	102,15,56,223,224
+.byte	102,15,56,223,232
+.byte	102,15,56,223,240
+.byte	102,15,56,223,248
+	ret
+
+
+
+.p2align	4
+_aesni_encrypt8:
+
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+.byte	102,15,56,220,209
+	pxor	%xmm0,%xmm7
+	pxor	%xmm0,%xmm8
+.byte	102,15,56,220,217
+	pxor	%xmm0,%xmm9
+	movups	(%rcx,%rax,1),%xmm0
+	addq	$16,%rax
+	jmp	L$enc_loop8_inner
+.p2align	4
+L$enc_loop8:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+L$enc_loop8_inner:
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+L$enc_loop8_enter:
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	L$enc_loop8
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+.byte	102,15,56,221,224
+.byte	102,15,56,221,232
+.byte	102,15,56,221,240
+.byte	102,15,56,221,248
+.byte	102,68,15,56,221,192
+.byte	102,68,15,56,221,200
+	ret
+
+
+
+.p2align	4
+_aesni_decrypt8:
+
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+.byte	102,15,56,222,209
+	pxor	%xmm0,%xmm7
+	pxor	%xmm0,%xmm8
+.byte	102,15,56,222,217
+	pxor	%xmm0,%xmm9
+	movups	(%rcx,%rax,1),%xmm0
+	addq	$16,%rax
+	jmp	L$dec_loop8_inner
+.p2align	4
+L$dec_loop8:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+L$dec_loop8_inner:
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+L$dec_loop8_enter:
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	L$dec_loop8
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+.byte	102,15,56,223,224
+.byte	102,15,56,223,232
+.byte	102,15,56,223,240
+.byte	102,15,56,223,248
+.byte	102,68,15,56,223,192
+.byte	102,68,15,56,223,200
+	ret
+
+
+.globl	_aes_hw_ecb_encrypt
+.private_extern _aes_hw_ecb_encrypt
+
+.p2align	4
+_aes_hw_ecb_encrypt:
+
+_CET_ENDBR
+	andq	$-16,%rdx
+	jz	L$ecb_ret
+
+	movl	240(%rcx),%eax
+	movups	(%rcx),%xmm0
+	movq	%rcx,%r11
+	movl	%eax,%r10d
+	testl	%r8d,%r8d
+	jz	L$ecb_decrypt
+
+	cmpq	$0x80,%rdx
+	jb	L$ecb_enc_tail
+
+	movdqu	(%rdi),%xmm2
+	movdqu	16(%rdi),%xmm3
+	movdqu	32(%rdi),%xmm4
+	movdqu	48(%rdi),%xmm5
+	movdqu	64(%rdi),%xmm6
+	movdqu	80(%rdi),%xmm7
+	movdqu	96(%rdi),%xmm8
+	movdqu	112(%rdi),%xmm9
+	leaq	128(%rdi),%rdi
+	subq	$0x80,%rdx
+	jmp	L$ecb_enc_loop8_enter
+.p2align	4
+L$ecb_enc_loop8:
+	movups	%xmm2,(%rsi)
+	movq	%r11,%rcx
+	movdqu	(%rdi),%xmm2
+	movl	%r10d,%eax
+	movups	%xmm3,16(%rsi)
+	movdqu	16(%rdi),%xmm3
+	movups	%xmm4,32(%rsi)
+	movdqu	32(%rdi),%xmm4
+	movups	%xmm5,48(%rsi)
+	movdqu	48(%rdi),%xmm5
+	movups	%xmm6,64(%rsi)
+	movdqu	64(%rdi),%xmm6
+	movups	%xmm7,80(%rsi)
+	movdqu	80(%rdi),%xmm7
+	movups	%xmm8,96(%rsi)
+	movdqu	96(%rdi),%xmm8
+	movups	%xmm9,112(%rsi)
+	leaq	128(%rsi),%rsi
+	movdqu	112(%rdi),%xmm9
+	leaq	128(%rdi),%rdi
+L$ecb_enc_loop8_enter:
+
+	call	_aesni_encrypt8
+
+	subq	$0x80,%rdx
+	jnc	L$ecb_enc_loop8
+
+	movups	%xmm2,(%rsi)
+	movq	%r11,%rcx
+	movups	%xmm3,16(%rsi)
+	movl	%r10d,%eax
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	movups	%xmm6,64(%rsi)
+	movups	%xmm7,80(%rsi)
+	movups	%xmm8,96(%rsi)
+	movups	%xmm9,112(%rsi)
+	leaq	128(%rsi),%rsi
+	addq	$0x80,%rdx
+	jz	L$ecb_ret
+
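+// Tail dispatch: the cmp/jb/je ladder below picks a handler for the 1-7
+// remaining blocks (the 5-block case zeroes %xmm7 and reuses the 6-block
+// helper).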
+L$ecb_enc_tail:
+	movups	(%rdi),%xmm2
+	cmpq	$0x20,%rdx
+	jb	L$ecb_enc_one
+	movups	16(%rdi),%xmm3
+	je	L$ecb_enc_two
+	movups	32(%rdi),%xmm4
+	cmpq	$0x40,%rdx
+	jb	L$ecb_enc_three
+	movups	48(%rdi),%xmm5
+	je	L$ecb_enc_four
+	movups	64(%rdi),%xmm6
+	cmpq	$0x60,%rdx
+	jb	L$ecb_enc_five
+	movups	80(%rdi),%xmm7
+	je	L$ecb_enc_six
+	movdqu	96(%rdi),%xmm8
+	xorps	%xmm9,%xmm9
+	call	_aesni_encrypt8
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	movups	%xmm6,64(%rsi)
+	movups	%xmm7,80(%rsi)
+	movups	%xmm8,96(%rsi)
+	jmp	L$ecb_ret
+.p2align	4
+L$ecb_enc_one:
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+L$oop_enc1_3:
+.byte	102,15,56,220,209
+	decl	%eax
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	L$oop_enc1_3
+.byte	102,15,56,221,209
+	movups	%xmm2,(%rsi)
+	jmp	L$ecb_ret
+.p2align	4
+L$ecb_enc_two:
+	call	_aesni_encrypt2
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	jmp	L$ecb_ret
+.p2align	4
+L$ecb_enc_three:
+	call	_aesni_encrypt3
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	jmp	L$ecb_ret
+.p2align	4
+L$ecb_enc_four:
+	call	_aesni_encrypt4
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	jmp	L$ecb_ret
+.p2align	4
+L$ecb_enc_five:
+	xorps	%xmm7,%xmm7
+	call	_aesni_encrypt6
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	movups	%xmm6,64(%rsi)
+	jmp	L$ecb_ret
+.p2align	4
+L$ecb_enc_six:
+	call	_aesni_encrypt6
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	movups	%xmm6,64(%rsi)
+	movups	%xmm7,80(%rsi)
+	jmp	L$ecb_ret
+
+.p2align	4
+L$ecb_decrypt:
+	cmpq	$0x80,%rdx
+	jb	L$ecb_dec_tail
+
+	movdqu	(%rdi),%xmm2
+	movdqu	16(%rdi),%xmm3
+	movdqu	32(%rdi),%xmm4
+	movdqu	48(%rdi),%xmm5
+	movdqu	64(%rdi),%xmm6
+	movdqu	80(%rdi),%xmm7
+	movdqu	96(%rdi),%xmm8
+	movdqu	112(%rdi),%xmm9
+	leaq	128(%rdi),%rdi
+	subq	$0x80,%rdx
+	jmp	L$ecb_dec_loop8_enter
+.p2align	4
+L$ecb_dec_loop8:
+	movups	%xmm2,(%rsi)
+	movq	%r11,%rcx
+	movdqu	(%rdi),%xmm2
+	movl	%r10d,%eax
+	movups	%xmm3,16(%rsi)
+	movdqu	16(%rdi),%xmm3
+	movups	%xmm4,32(%rsi)
+	movdqu	32(%rdi),%xmm4
+	movups	%xmm5,48(%rsi)
+	movdqu	48(%rdi),%xmm5
+	movups	%xmm6,64(%rsi)
+	movdqu	64(%rdi),%xmm6
+	movups	%xmm7,80(%rsi)
+	movdqu	80(%rdi),%xmm7
+	movups	%xmm8,96(%rsi)
+	movdqu	96(%rdi),%xmm8
+	movups	%xmm9,112(%rsi)
+	leaq	128(%rsi),%rsi
+	movdqu	112(%rdi),%xmm9
+	leaq	128(%rdi),%rdi
+L$ecb_dec_loop8_enter:
+
+	call	_aesni_decrypt8
+
+	movups	(%r11),%xmm0
+	subq	$0x80,%rdx
+	jnc	L$ecb_dec_loop8
+
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	movq	%r11,%rcx
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	movl	%r10d,%eax
+	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	movups	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
+	movups	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm6
+	movups	%xmm7,80(%rsi)
+	pxor	%xmm7,%xmm7
+	movups	%xmm8,96(%rsi)
+	pxor	%xmm8,%xmm8
+	movups	%xmm9,112(%rsi)
+	pxor	%xmm9,%xmm9
+	leaq	128(%rsi),%rsi
+	addq	$0x80,%rdx
+	jz	L$ecb_ret
+
+L$ecb_dec_tail:
+	movups	(%rdi),%xmm2
+	cmpq	$0x20,%rdx
+	jb	L$ecb_dec_one
+	movups	16(%rdi),%xmm3
+	je	L$ecb_dec_two
+	movups	32(%rdi),%xmm4
+	cmpq	$0x40,%rdx
+	jb	L$ecb_dec_three
+	movups	48(%rdi),%xmm5
+	je	L$ecb_dec_four
+	movups	64(%rdi),%xmm6
+	cmpq	$0x60,%rdx
+	jb	L$ecb_dec_five
+	movups	80(%rdi),%xmm7
+	je	L$ecb_dec_six
+	movups	96(%rdi),%xmm8
+	movups	(%rcx),%xmm0
+	xorps	%xmm9,%xmm9
+	call	_aesni_decrypt8
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	movups	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
+	movups	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm6
+	movups	%xmm7,80(%rsi)
+	pxor	%xmm7,%xmm7
+	movups	%xmm8,96(%rsi)
+	pxor	%xmm8,%xmm8
+	pxor	%xmm9,%xmm9
+	jmp	L$ecb_ret
+.p2align	4
+L$ecb_dec_one:
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+L$oop_dec1_4:
+.byte	102,15,56,222,209
+	decl	%eax
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	L$oop_dec1_4
+.byte	102,15,56,223,209
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	jmp	L$ecb_ret
+.p2align	4
+L$ecb_dec_two:
+	call	_aesni_decrypt2
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	jmp	L$ecb_ret
+.p2align	4
+L$ecb_dec_three:
+	call	_aesni_decrypt3
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	jmp	L$ecb_ret
+.p2align	4
+L$ecb_dec_four:
+	call	_aesni_decrypt4
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	movups	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
+	jmp	L$ecb_ret
+.p2align	4
+L$ecb_dec_five:
+	xorps	%xmm7,%xmm7
+	call	_aesni_decrypt6
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	movups	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
+	movups	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	jmp	L$ecb_ret
+.p2align	4
+L$ecb_dec_six:
+	call	_aesni_decrypt6
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	movups	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
+	movups	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm6
+	movups	%xmm7,80(%rsi)
+	pxor	%xmm7,%xmm7
+
+L$ecb_ret:
+	xorps	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	ret
+
+
+.globl	_aes_hw_ctr32_encrypt_blocks
+.private_extern _aes_hw_ctr32_encrypt_blocks
+
+.p2align	4
+_aes_hw_ctr32_encrypt_blocks:
+
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+	movb	$1,_BORINGSSL_function_hit(%rip)
+#endif
+	cmpq	$1,%rdx
+	jne	L$ctr32_bulk
+
+
+
+	movups	(%r8),%xmm2
+	movups	(%rdi),%xmm3
+	movl	240(%rcx),%edx
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+L$oop_enc1_5:
+.byte	102,15,56,220,209
+	decl	%edx
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	L$oop_enc1_5
+.byte	102,15,56,221,209
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	xorps	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movups	%xmm2,(%rsi)
+	xorps	%xmm2,%xmm2
+	jmp	L$ctr32_epilogue
+
+.p2align	4
+L$ctr32_bulk:
+	leaq	(%rsp),%r11
+
+	pushq	%rbp
+
+	subq	$128,%rsp
+	andq	$-16,%rsp
+
+
+
+
+	movdqu	(%r8),%xmm2
+	movdqu	(%rcx),%xmm0
+	movl	12(%r8),%r8d
+	pxor	%xmm0,%xmm2
+	movl	12(%rcx),%ebp
+	movdqa	%xmm2,0(%rsp)
+	bswapl	%r8d
+	movdqa	%xmm2,%xmm3
+	movdqa	%xmm2,%xmm4
+	movdqa	%xmm2,%xmm5
+	movdqa	%xmm2,64(%rsp)
+	movdqa	%xmm2,80(%rsp)
+	movdqa	%xmm2,96(%rsp)
+	movq	%rdx,%r10
+	movdqa	%xmm2,112(%rsp)
+
+	leaq	1(%r8),%rax
+	leaq	2(%r8),%rdx
+	bswapl	%eax
+	bswapl	%edx
+	xorl	%ebp,%eax
+	xorl	%ebp,%edx
+.byte	102,15,58,34,216,3
+	leaq	3(%r8),%rax
+	movdqa	%xmm3,16(%rsp)
+.byte	102,15,58,34,226,3
+	bswapl	%eax
+	movq	%r10,%rdx
+	leaq	4(%r8),%r10
+	movdqa	%xmm4,32(%rsp)
+	xorl	%ebp,%eax
+	bswapl	%r10d
+.byte	102,15,58,34,232,3
+	xorl	%ebp,%r10d
+	movdqa	%xmm5,48(%rsp)
+	leaq	5(%r8),%r9
+	movl	%r10d,64+12(%rsp)
+	bswapl	%r9d
+	leaq	6(%r8),%r10
+	movl	240(%rcx),%eax
+	xorl	%ebp,%r9d
+	bswapl	%r10d
+	movl	%r9d,80+12(%rsp)
+	xorl	%ebp,%r10d
+	leaq	7(%r8),%r9
+	movl	%r10d,96+12(%rsp)
+	bswapl	%r9d
+	xorl	%ebp,%r9d
+	movl	%r9d,112+12(%rsp)
+
+	movups	16(%rcx),%xmm1
+
+	movdqa	64(%rsp),%xmm6
+	movdqa	80(%rsp),%xmm7
+
+	cmpq	$8,%rdx
+	jb	L$ctr32_tail
+
+	leaq	128(%rcx),%rcx
+	subq	$8,%rdx
+	jmp	L$ctr32_loop8
+
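+// Bulk CTR path: eight counter blocks per iteration. Each 16-byte slot in
+// the stack scratch area holds one pre-whitened input block; its counter
+// word is byte-swapped, XORed with the last word of round key 0 (%ebp), and
+// refreshed via the movl stores inside the loop.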
+.p2align	5
+L$ctr32_loop8:
+	addl	$8,%r8d
+	movdqa	96(%rsp),%xmm8
+.byte	102,15,56,220,209
+	movl	%r8d,%r9d
+	movdqa	112(%rsp),%xmm9
+.byte	102,15,56,220,217
+	bswapl	%r9d
+	movups	32-128(%rcx),%xmm0
+.byte	102,15,56,220,225
+	xorl	%ebp,%r9d
+	nop
+.byte	102,15,56,220,233
+	movl	%r9d,0+12(%rsp)
+	leaq	1(%r8),%r9
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movups	48-128(%rcx),%xmm1
+	bswapl	%r9d
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	xorl	%ebp,%r9d
+.byte	0x66,0x90
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+	movl	%r9d,16+12(%rsp)
+	leaq	2(%r8),%r9
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	64-128(%rcx),%xmm0
+	bswapl	%r9d
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	xorl	%ebp,%r9d
+.byte	0x66,0x90
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movl	%r9d,32+12(%rsp)
+	leaq	3(%r8),%r9
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movups	80-128(%rcx),%xmm1
+	bswapl	%r9d
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	xorl	%ebp,%r9d
+.byte	0x66,0x90
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+	movl	%r9d,48+12(%rsp)
+	leaq	4(%r8),%r9
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	96-128(%rcx),%xmm0
+	bswapl	%r9d
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	xorl	%ebp,%r9d
+.byte	0x66,0x90
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movl	%r9d,64+12(%rsp)
+	leaq	5(%r8),%r9
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movups	112-128(%rcx),%xmm1
+	bswapl	%r9d
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	xorl	%ebp,%r9d
+.byte	0x66,0x90
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+	movl	%r9d,80+12(%rsp)
+	leaq	6(%r8),%r9
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	128-128(%rcx),%xmm0
+	bswapl	%r9d
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	xorl	%ebp,%r9d
+.byte	0x66,0x90
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movl	%r9d,96+12(%rsp)
+	leaq	7(%r8),%r9
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movups	144-128(%rcx),%xmm1
+	bswapl	%r9d
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+	xorl	%ebp,%r9d
+	movdqu	0(%rdi),%xmm10
+.byte	102,15,56,220,232
+	movl	%r9d,112+12(%rsp)
+	cmpl	$11,%eax
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	160-128(%rcx),%xmm0
+
+	jb	L$ctr32_enc_done
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movups	176-128(%rcx),%xmm1
+
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	192-128(%rcx),%xmm0
+	je	L$ctr32_enc_done
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movups	208-128(%rcx),%xmm1
+
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	224-128(%rcx),%xmm0
+	jmp	L$ctr32_enc_done
+
+.p2align	4
+L$ctr32_enc_done:
+	movdqu	16(%rdi),%xmm11
+	pxor	%xmm0,%xmm10
+	movdqu	32(%rdi),%xmm12
+	pxor	%xmm0,%xmm11
+	movdqu	48(%rdi),%xmm13
+	pxor	%xmm0,%xmm12
+	movdqu	64(%rdi),%xmm14
+	pxor	%xmm0,%xmm13
+	movdqu	80(%rdi),%xmm15
+	pxor	%xmm0,%xmm14
+	prefetcht0	448(%rdi)
+	prefetcht0	512(%rdi)
+	pxor	%xmm0,%xmm15
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movdqu	96(%rdi),%xmm1
+	leaq	128(%rdi),%rdi
+
+.byte	102,65,15,56,221,210
+	pxor	%xmm0,%xmm1
+	movdqu	112-128(%rdi),%xmm10
+.byte	102,65,15,56,221,219
+	pxor	%xmm0,%xmm10
+	movdqa	0(%rsp),%xmm11
+.byte	102,65,15,56,221,228
+.byte	102,65,15,56,221,237
+	movdqa	16(%rsp),%xmm12
+	movdqa	32(%rsp),%xmm13
+.byte	102,65,15,56,221,246
+.byte	102,65,15,56,221,255
+	movdqa	48(%rsp),%xmm14
+	movdqa	64(%rsp),%xmm15
+.byte	102,68,15,56,221,193
+	movdqa	80(%rsp),%xmm0
+	movups	16-128(%rcx),%xmm1
+.byte	102,69,15,56,221,202
+
+	movups	%xmm2,(%rsi)
+	movdqa	%xmm11,%xmm2
+	movups	%xmm3,16(%rsi)
+	movdqa	%xmm12,%xmm3
+	movups	%xmm4,32(%rsi)
+	movdqa	%xmm13,%xmm4
+	movups	%xmm5,48(%rsi)
+	movdqa	%xmm14,%xmm5
+	movups	%xmm6,64(%rsi)
+	movdqa	%xmm15,%xmm6
+	movups	%xmm7,80(%rsi)
+	movdqa	%xmm0,%xmm7
+	movups	%xmm8,96(%rsi)
+	movups	%xmm9,112(%rsi)
+	leaq	128(%rsi),%rsi
+
+	subq	$8,%rdx
+	jnc	L$ctr32_loop8
+
+	addq	$8,%rdx
+	jz	L$ctr32_done
+	leaq	-128(%rcx),%rcx
+
+L$ctr32_tail:
+
+
+	leaq	16(%rcx),%rcx
+	cmpq	$4,%rdx
+	jb	L$ctr32_loop3
+	je	L$ctr32_loop4
+
+
+	shll	$4,%eax
+	movdqa	96(%rsp),%xmm8
+	pxor	%xmm9,%xmm9
+
+	movups	16(%rcx),%xmm0
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	leaq	32-16(%rcx,%rax,1),%rcx
+	negq	%rax
+.byte	102,15,56,220,225
+	addq	$16,%rax
+	movups	(%rdi),%xmm10
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+	movups	16(%rdi),%xmm11
+	movups	32(%rdi),%xmm12
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+
+	call	L$enc_loop8_enter
+
+	movdqu	48(%rdi),%xmm13
+	pxor	%xmm10,%xmm2
+	movdqu	64(%rdi),%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm10,%xmm6
+	movdqu	%xmm5,48(%rsi)
+	movdqu	%xmm6,64(%rsi)
+	cmpq	$6,%rdx
+	jb	L$ctr32_done
+
+	movups	80(%rdi),%xmm11
+	xorps	%xmm11,%xmm7
+	movups	%xmm7,80(%rsi)
+	je	L$ctr32_done
+
+	movups	96(%rdi),%xmm12
+	xorps	%xmm12,%xmm8
+	movups	%xmm8,96(%rsi)
+	jmp	L$ctr32_done
+
+.p2align	5
+L$ctr32_loop4:
+.byte	102,15,56,220,209
+	leaq	16(%rcx),%rcx
+	decl	%eax
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movups	(%rcx),%xmm1
+	jnz	L$ctr32_loop4
+.byte	102,15,56,221,209
+.byte	102,15,56,221,217
+	movups	(%rdi),%xmm10
+	movups	16(%rdi),%xmm11
+.byte	102,15,56,221,225
+.byte	102,15,56,221,233
+	movups	32(%rdi),%xmm12
+	movups	48(%rdi),%xmm13
+
+	xorps	%xmm10,%xmm2
+	movups	%xmm2,(%rsi)
+	xorps	%xmm11,%xmm3
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm5,48(%rsi)
+	jmp	L$ctr32_done
+
+.p2align	5
+L$ctr32_loop3:
+.byte	102,15,56,220,209
+	leaq	16(%rcx),%rcx
+	decl	%eax
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+	movups	(%rcx),%xmm1
+	jnz	L$ctr32_loop3
+.byte	102,15,56,221,209
+.byte	102,15,56,221,217
+.byte	102,15,56,221,225
+
+	movups	(%rdi),%xmm10
+	xorps	%xmm10,%xmm2
+	movups	%xmm2,(%rsi)
+	cmpq	$2,%rdx
+	jb	L$ctr32_done
+
+	movups	16(%rdi),%xmm11
+	xorps	%xmm11,%xmm3
+	movups	%xmm3,16(%rsi)
+	je	L$ctr32_done
+
+	movups	32(%rdi),%xmm12
+	xorps	%xmm12,%xmm4
+	movups	%xmm4,32(%rsi)
+
+L$ctr32_done:
+	xorps	%xmm0,%xmm0
+	xorl	%ebp,%ebp
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	movaps	%xmm0,0(%rsp)
+	pxor	%xmm8,%xmm8
+	movaps	%xmm0,16(%rsp)
+	pxor	%xmm9,%xmm9
+	movaps	%xmm0,32(%rsp)
+	pxor	%xmm10,%xmm10
+	movaps	%xmm0,48(%rsp)
+	pxor	%xmm11,%xmm11
+	movaps	%xmm0,64(%rsp)
+	pxor	%xmm12,%xmm12
+	movaps	%xmm0,80(%rsp)
+	pxor	%xmm13,%xmm13
+	movaps	%xmm0,96(%rsp)
+	pxor	%xmm14,%xmm14
+	movaps	%xmm0,112(%rsp)
+	pxor	%xmm15,%xmm15
+	movq	-8(%r11),%rbp
+
+	leaq	(%r11),%rsp
+
+L$ctr32_epilogue:
+	ret
+
+
+.globl	_aes_hw_cbc_encrypt
+.private_extern _aes_hw_cbc_encrypt
+
+.p2align	4
+_aes_hw_cbc_encrypt:
+
+_CET_ENDBR
+	testq	%rdx,%rdx
+	jz	L$cbc_ret
+
+	movl	240(%rcx),%r10d
+	movq	%rcx,%r11
+	testl	%r9d,%r9d
+	jz	L$cbc_decrypt
+
+	movups	(%r8),%xmm2
+	movl	%r10d,%eax
+	cmpq	$16,%rdx
+	jb	L$cbc_enc_tail
+	subq	$16,%rdx
+	jmp	L$cbc_enc_loop
+.p2align	4
+L$cbc_enc_loop:
+	movups	(%rdi),%xmm3
+	leaq	16(%rdi),%rdi
+
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm3
+	leaq	32(%rcx),%rcx
+	xorps	%xmm3,%xmm2
+L$oop_enc1_6:
+.byte	102,15,56,220,209
+	decl	%eax
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	L$oop_enc1_6
+.byte	102,15,56,221,209
+	movl	%r10d,%eax
+	movq	%r11,%rcx
+	movups	%xmm2,0(%rsi)
+	leaq	16(%rsi),%rsi
+	subq	$16,%rdx
+	jnc	L$cbc_enc_loop
+	addq	$16,%rdx
+	jnz	L$cbc_enc_tail
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	movups	%xmm2,(%r8)
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	jmp	L$cbc_ret
+
+L$cbc_enc_tail:
+	movq	%rdx,%rcx
+	xchgq	%rdi,%rsi
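+// .long 0x9066A4F3 encodes rep movsb plus a two-byte nop (copy the partial
+// block); 0x9066AAF3 below likewise encodes rep stosb (zero the tail).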
+.long	0x9066A4F3
+	movl	$16,%ecx
+	subq	%rdx,%rcx
+	xorl	%eax,%eax
+.long	0x9066AAF3
+	leaq	-16(%rdi),%rdi
+	movl	%r10d,%eax
+	movq	%rdi,%rsi
+	movq	%r11,%rcx
+	xorq	%rdx,%rdx
+	jmp	L$cbc_enc_loop
+
+.p2align	4
+L$cbc_decrypt:
+	cmpq	$16,%rdx
+	jne	L$cbc_decrypt_bulk
+
+
+
+	movdqu	(%rdi),%xmm2
+	movdqu	(%r8),%xmm3
+	movdqa	%xmm2,%xmm4
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+L$oop_dec1_7:
+.byte	102,15,56,222,209
+	decl	%r10d
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	L$oop_dec1_7
+.byte	102,15,56,223,209
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	movdqu	%xmm4,(%r8)
+	xorps	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	jmp	L$cbc_ret
+.p2align	4
+L$cbc_decrypt_bulk:
+	leaq	(%rsp),%r11
+
+	pushq	%rbp
+
+	subq	$16,%rsp
+	andq	$-16,%rsp
+	movq	%rcx,%rbp
+	movups	(%r8),%xmm10
+	movl	%r10d,%eax
+	cmpq	$0x50,%rdx
+	jbe	L$cbc_dec_tail
+
+	movups	(%rcx),%xmm0
+	movdqu	0(%rdi),%xmm2
+	movdqu	16(%rdi),%xmm3
+	movdqa	%xmm2,%xmm11
+	movdqu	32(%rdi),%xmm4
+	movdqa	%xmm3,%xmm12
+	movdqu	48(%rdi),%xmm5
+	movdqa	%xmm4,%xmm13
+	movdqu	64(%rdi),%xmm6
+	movdqa	%xmm5,%xmm14
+	movdqu	80(%rdi),%xmm7
+	movdqa	%xmm6,%xmm15
+	cmpq	$0x70,%rdx
+	jbe	L$cbc_dec_six_or_seven
+
+	subq	$0x70,%rdx
+	leaq	112(%rcx),%rcx
+	jmp	L$cbc_dec_loop8_enter
+.p2align	4
+L$cbc_dec_loop8:
+	movups	%xmm9,(%rsi)
+	leaq	16(%rsi),%rsi
+L$cbc_dec_loop8_enter:
+	movdqu	96(%rdi),%xmm8
+	pxor	%xmm0,%xmm2
+	movdqu	112(%rdi),%xmm9
+	pxor	%xmm0,%xmm3
+	movups	16-112(%rcx),%xmm1
+	pxor	%xmm0,%xmm4
+	movq	$-1,%rbp
+	cmpq	$0x70,%rdx
+	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
+	pxor	%xmm0,%xmm7
+	pxor	%xmm0,%xmm8
+
+.byte	102,15,56,222,209
+	pxor	%xmm0,%xmm9
+	movups	32-112(%rcx),%xmm0
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+	adcq	$0,%rbp
+	andq	$128,%rbp
+.byte	102,68,15,56,222,201
+	addq	%rdi,%rbp
+	movups	48-112(%rcx),%xmm1
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	64-112(%rcx),%xmm0
+	nop
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	80-112(%rcx),%xmm1
+	nop
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	96-112(%rcx),%xmm0
+	nop
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	112-112(%rcx),%xmm1
+	nop
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	128-112(%rcx),%xmm0
+	nop
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	144-112(%rcx),%xmm1
+	cmpl	$11,%eax
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	160-112(%rcx),%xmm0
+	jb	L$cbc_dec_done
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	176-112(%rcx),%xmm1
+	nop
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	192-112(%rcx),%xmm0
+	je	L$cbc_dec_done
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	208-112(%rcx),%xmm1
+	nop
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	224-112(%rcx),%xmm0
+	jmp	L$cbc_dec_done
+.p2align	4
+L$cbc_dec_done:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+	pxor	%xmm0,%xmm10
+	pxor	%xmm0,%xmm11
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+	pxor	%xmm0,%xmm12
+	pxor	%xmm0,%xmm13
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+	pxor	%xmm0,%xmm14
+	pxor	%xmm0,%xmm15
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movdqu	80(%rdi),%xmm1
+
+.byte	102,65,15,56,223,210
+	movdqu	96(%rdi),%xmm10
+	pxor	%xmm0,%xmm1
+.byte	102,65,15,56,223,219
+	pxor	%xmm0,%xmm10
+	movdqu	112(%rdi),%xmm0
+.byte	102,65,15,56,223,228
+	leaq	128(%rdi),%rdi
+	movdqu	0(%rbp),%xmm11
+.byte	102,65,15,56,223,237
+.byte	102,65,15,56,223,246
+	movdqu	16(%rbp),%xmm12
+	movdqu	32(%rbp),%xmm13
+.byte	102,65,15,56,223,255
+.byte	102,68,15,56,223,193
+	movdqu	48(%rbp),%xmm14
+	movdqu	64(%rbp),%xmm15
+.byte	102,69,15,56,223,202
+	movdqa	%xmm0,%xmm10
+	movdqu	80(%rbp),%xmm1
+	movups	-112(%rcx),%xmm0
+
+	movups	%xmm2,(%rsi)
+	movdqa	%xmm11,%xmm2
+	movups	%xmm3,16(%rsi)
+	movdqa	%xmm12,%xmm3
+	movups	%xmm4,32(%rsi)
+	movdqa	%xmm13,%xmm4
+	movups	%xmm5,48(%rsi)
+	movdqa	%xmm14,%xmm5
+	movups	%xmm6,64(%rsi)
+	movdqa	%xmm15,%xmm6
+	movups	%xmm7,80(%rsi)
+	movdqa	%xmm1,%xmm7
+	movups	%xmm8,96(%rsi)
+	leaq	112(%rsi),%rsi
+
+	subq	$0x80,%rdx
+	ja	L$cbc_dec_loop8
+
+	movaps	%xmm9,%xmm2
+	leaq	-112(%rcx),%rcx
+	addq	$0x70,%rdx
+	jle	L$cbc_dec_clear_tail_collected
+	movups	%xmm9,(%rsi)
+	leaq	16(%rsi),%rsi
+	cmpq	$0x50,%rdx
+	jbe	L$cbc_dec_tail
+
+	movaps	%xmm11,%xmm2
+L$cbc_dec_six_or_seven:
+	cmpq	$0x60,%rdx
+	ja	L$cbc_dec_seven
+
+	movaps	%xmm7,%xmm8
+	call	_aesni_decrypt6
+	pxor	%xmm10,%xmm2
+	movaps	%xmm8,%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	pxor	%xmm14,%xmm6
+	movdqu	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
+	pxor	%xmm15,%xmm7
+	movdqu	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm6
+	leaq	80(%rsi),%rsi
+	movdqa	%xmm7,%xmm2
+	pxor	%xmm7,%xmm7
+	jmp	L$cbc_dec_tail_collected
+
+.p2align	4
+L$cbc_dec_seven:
+	movups	96(%rdi),%xmm8
+	xorps	%xmm9,%xmm9
+	call	_aesni_decrypt8
+	movups	80(%rdi),%xmm9
+	pxor	%xmm10,%xmm2
+	movups	96(%rdi),%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	pxor	%xmm14,%xmm6
+	movdqu	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
+	pxor	%xmm15,%xmm7
+	movdqu	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm6
+	pxor	%xmm9,%xmm8
+	movdqu	%xmm7,80(%rsi)
+	pxor	%xmm7,%xmm7
+	leaq	96(%rsi),%rsi
+	movdqa	%xmm8,%xmm2
+	pxor	%xmm8,%xmm8
+	pxor	%xmm9,%xmm9
+	jmp	L$cbc_dec_tail_collected
+
+L$cbc_dec_tail:
+	movups	(%rdi),%xmm2
+	subq	$0x10,%rdx
+	jbe	L$cbc_dec_one
+
+	movups	16(%rdi),%xmm3
+	movaps	%xmm2,%xmm11
+	subq	$0x10,%rdx
+	jbe	L$cbc_dec_two
+
+	movups	32(%rdi),%xmm4
+	movaps	%xmm3,%xmm12
+	subq	$0x10,%rdx
+	jbe	L$cbc_dec_three
+
+	movups	48(%rdi),%xmm5
+	movaps	%xmm4,%xmm13
+	subq	$0x10,%rdx
+	jbe	L$cbc_dec_four
+
+	movups	64(%rdi),%xmm6
+	movaps	%xmm5,%xmm14
+	movaps	%xmm6,%xmm15
+	xorps	%xmm7,%xmm7
+	call	_aesni_decrypt6
+	pxor	%xmm10,%xmm2
+	movaps	%xmm15,%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	pxor	%xmm14,%xmm6
+	movdqu	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
+	leaq	64(%rsi),%rsi
+	movdqa	%xmm6,%xmm2
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	subq	$0x10,%rdx
+	jmp	L$cbc_dec_tail_collected
+
+.p2align	4
+L$cbc_dec_one:
+	movaps	%xmm2,%xmm11
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+L$oop_dec1_8:
+.byte	102,15,56,222,209
+	decl	%eax
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	L$oop_dec1_8
+.byte	102,15,56,223,209
+	xorps	%xmm10,%xmm2
+	movaps	%xmm11,%xmm10
+	jmp	L$cbc_dec_tail_collected
+.p2align	4
+L$cbc_dec_two:
+	movaps	%xmm3,%xmm12
+	call	_aesni_decrypt2
+	pxor	%xmm10,%xmm2
+	movaps	%xmm12,%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	movdqa	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	leaq	16(%rsi),%rsi
+	jmp	L$cbc_dec_tail_collected
+.p2align	4
+L$cbc_dec_three:
+	movaps	%xmm4,%xmm13
+	call	_aesni_decrypt3
+	pxor	%xmm10,%xmm2
+	movaps	%xmm13,%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	movdqa	%xmm4,%xmm2
+	pxor	%xmm4,%xmm4
+	leaq	32(%rsi),%rsi
+	jmp	L$cbc_dec_tail_collected
+.p2align	4
+L$cbc_dec_four:
+	movaps	%xmm5,%xmm14
+	call	_aesni_decrypt4
+	pxor	%xmm10,%xmm2
+	movaps	%xmm14,%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	movdqa	%xmm5,%xmm2
+	pxor	%xmm5,%xmm5
+	leaq	48(%rsi),%rsi
+	jmp	L$cbc_dec_tail_collected
+
+.p2align	4
+L$cbc_dec_clear_tail_collected:
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	pxor	%xmm8,%xmm8
+	pxor	%xmm9,%xmm9
+L$cbc_dec_tail_collected:
+	movups	%xmm10,(%r8)
+	andq	$15,%rdx
+	jnz	L$cbc_dec_tail_partial
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	jmp	L$cbc_dec_ret
+.p2align	4
+L$cbc_dec_tail_partial:
+	movaps	%xmm2,(%rsp)
+	pxor	%xmm2,%xmm2
+	movq	$16,%rcx
+	movq	%rsi,%rdi
+	subq	%rdx,%rcx
+	leaq	(%rsp),%rsi
+.long	0x9066A4F3
+	movdqa	%xmm2,(%rsp)
+
+L$cbc_dec_ret:
+	xorps	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	movq	-8(%r11),%rbp
+
+	leaq	(%r11),%rsp
+
+L$cbc_ret:
+	ret
+
+
+.globl	_aes_hw_set_decrypt_key
+.private_extern _aes_hw_set_decrypt_key
+
+.p2align	4
+_aes_hw_set_decrypt_key:
+
+_CET_ENDBR
+.byte	0x48,0x83,0xEC,0x08
+
+	call	__aesni_set_encrypt_key
+	shll	$4,%esi
+	testl	%eax,%eax
+	jnz	L$dec_key_ret
+	leaq	16(%rdx,%rsi,1),%rdi
+
+	movups	(%rdx),%xmm0
+	movups	(%rdi),%xmm1
+	movups	%xmm0,(%rdi)
+	movups	%xmm1,(%rdx)
+	leaq	16(%rdx),%rdx
+	leaq	-16(%rdi),%rdi
+
+L$dec_key_inverse:
+	movups	(%rdx),%xmm0
+	movups	(%rdi),%xmm1
+.byte	102,15,56,219,192
+.byte	102,15,56,219,201
+	leaq	16(%rdx),%rdx
+	leaq	-16(%rdi),%rdi
+	movups	%xmm0,16(%rdi)
+	movups	%xmm1,-16(%rdx)
+	cmpq	%rdx,%rdi
+	ja	L$dec_key_inverse
+
+	movups	(%rdx),%xmm0
+.byte	102,15,56,219,192
+	pxor	%xmm1,%xmm1
+	movups	%xmm0,(%rdi)
+	pxor	%xmm0,%xmm0
+L$dec_key_ret:
+	addq	$8,%rsp
+
+	ret
+
+L$SEH_end_set_decrypt_key:
+
+.globl	_aes_hw_set_encrypt_key
+.private_extern _aes_hw_set_encrypt_key
+
+.p2align	4
+_aes_hw_set_encrypt_key:
+__aesni_set_encrypt_key:
+
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+	movb	$1,_BORINGSSL_function_hit+3(%rip)
+#endif
+.byte	0x48,0x83,0xEC,0x08
+
+	movq	$-1,%rax
+	testq	%rdi,%rdi
+	jz	L$enc_key_ret
+	testq	%rdx,%rdx
+	jz	L$enc_key_ret
+
+	movups	(%rdi),%xmm0
+	xorps	%xmm4,%xmm4
+	leaq	_OPENSSL_ia32cap_P(%rip),%r10
+	movl	4(%r10),%r10d
+	andl	$268437504,%r10d
+	leaq	16(%rdx),%rax
+	cmpl	$256,%esi
+	je	L$14rounds
+	cmpl	$192,%esi
+	je	L$12rounds
+	cmpl	$128,%esi
+	jne	L$bad_keybits
+
+L$10rounds:
+	movl	$9,%esi
+	cmpl	$268435456,%r10d
+	je	L$10rounds_alt
+
+	movups	%xmm0,(%rdx)
+.byte	102,15,58,223,200,1
+	call	L$key_expansion_128_cold
+.byte	102,15,58,223,200,2
+	call	L$key_expansion_128
+.byte	102,15,58,223,200,4
+	call	L$key_expansion_128
+.byte	102,15,58,223,200,8
+	call	L$key_expansion_128
+.byte	102,15,58,223,200,16
+	call	L$key_expansion_128
+.byte	102,15,58,223,200,32
+	call	L$key_expansion_128
+.byte	102,15,58,223,200,64
+	call	L$key_expansion_128
+.byte	102,15,58,223,200,128
+	call	L$key_expansion_128
+.byte	102,15,58,223,200,27
+	call	L$key_expansion_128
+.byte	102,15,58,223,200,54
+	call	L$key_expansion_128
+	movups	%xmm0,(%rax)
+	movl	%esi,80(%rax)
+	xorl	%eax,%eax
+	jmp	L$enc_key_ret
+
+.p2align	4
+L$10rounds_alt:
+	movdqa	L$key_rotate(%rip),%xmm5
+	movl	$8,%r10d
+	movdqa	L$key_rcon1(%rip),%xmm4
+	movdqa	%xmm0,%xmm2
+	movdqu	%xmm0,(%rdx)
+	jmp	L$oop_key128
+
+.p2align	4
+L$oop_key128:
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+	pslld	$1,%xmm4
+	leaq	16(%rax),%rax
+
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,-16(%rax)
+	movdqa	%xmm0,%xmm2
+
+	decl	%r10d
+	jnz	L$oop_key128
+
+	movdqa	L$key_rcon1b(%rip),%xmm4
+
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+	pslld	$1,%xmm4
+
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,(%rax)
+
+	movdqa	%xmm0,%xmm2
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,16(%rax)
+
+	movl	%esi,96(%rax)
+	xorl	%eax,%eax
+	jmp	L$enc_key_ret
+
+.p2align	4
+L$12rounds:
+	movq	16(%rdi),%xmm2
+	movl	$11,%esi
+	cmpl	$268435456,%r10d
+	je	L$12rounds_alt
+
+	movups	%xmm0,(%rdx)
+.byte	102,15,58,223,202,1
+	call	L$key_expansion_192a_cold
+.byte	102,15,58,223,202,2
+	call	L$key_expansion_192b
+.byte	102,15,58,223,202,4
+	call	L$key_expansion_192a
+.byte	102,15,58,223,202,8
+	call	L$key_expansion_192b
+.byte	102,15,58,223,202,16
+	call	L$key_expansion_192a
+.byte	102,15,58,223,202,32
+	call	L$key_expansion_192b
+.byte	102,15,58,223,202,64
+	call	L$key_expansion_192a
+.byte	102,15,58,223,202,128
+	call	L$key_expansion_192b
+	movups	%xmm0,(%rax)
+	movl	%esi,48(%rax)
+	xorq	%rax,%rax
+	jmp	L$enc_key_ret
+
+.p2align	4
+L$12rounds_alt:
+	movdqa	L$key_rotate192(%rip),%xmm5
+	movdqa	L$key_rcon1(%rip),%xmm4
+	movl	$8,%r10d
+	movdqu	%xmm0,(%rdx)
+	jmp	L$oop_key192
+
+.p2align	4
+L$oop_key192:
+	movq	%xmm2,0(%rax)
+	movdqa	%xmm2,%xmm1
+.byte	102,15,56,0,213
+.byte	102,15,56,221,212
+	pslld	$1,%xmm4
+	leaq	24(%rax),%rax
+
+	movdqa	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm3,%xmm0
+
+	pshufd	$0xff,%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+
+	pxor	%xmm2,%xmm0
+	pxor	%xmm3,%xmm2
+	movdqu	%xmm0,-16(%rax)
+
+	decl	%r10d
+	jnz	L$oop_key192
+
+	movl	%esi,32(%rax)
+	xorl	%eax,%eax
+	jmp	L$enc_key_ret
+
+.p2align	4
+L$14rounds:
+	movups	16(%rdi),%xmm2
+	movl	$13,%esi
+	leaq	16(%rax),%rax
+	cmpl	$268435456,%r10d
+	je	L$14rounds_alt
+
+	movups	%xmm0,(%rdx)
+	movups	%xmm2,16(%rdx)
+.byte	102,15,58,223,202,1
+	call	L$key_expansion_256a_cold
+.byte	102,15,58,223,200,1
+	call	L$key_expansion_256b
+.byte	102,15,58,223,202,2
+	call	L$key_expansion_256a
+.byte	102,15,58,223,200,2
+	call	L$key_expansion_256b
+.byte	102,15,58,223,202,4
+	call	L$key_expansion_256a
+.byte	102,15,58,223,200,4
+	call	L$key_expansion_256b
+.byte	102,15,58,223,202,8
+	call	L$key_expansion_256a
+.byte	102,15,58,223,200,8
+	call	L$key_expansion_256b
+.byte	102,15,58,223,202,16
+	call	L$key_expansion_256a
+.byte	102,15,58,223,200,16
+	call	L$key_expansion_256b
+.byte	102,15,58,223,202,32
+	call	L$key_expansion_256a
+.byte	102,15,58,223,200,32
+	call	L$key_expansion_256b
+.byte	102,15,58,223,202,64
+	call	L$key_expansion_256a
+	movups	%xmm0,(%rax)
+	movl	%esi,16(%rax)
+	xorq	%rax,%rax
+	jmp	L$enc_key_ret
+
+.p2align	4
+L$14rounds_alt:
+	movdqa	L$key_rotate(%rip),%xmm5
+	movdqa	L$key_rcon1(%rip),%xmm4
+	movl	$7,%r10d
+	movdqu	%xmm0,0(%rdx)
+	movdqa	%xmm2,%xmm1
+	movdqu	%xmm2,16(%rdx)
+	jmp	L$oop_key256
+
+.p2align	4
+L$oop_key256:
+.byte	102,15,56,0,213
+.byte	102,15,56,221,212
+
+	movdqa	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm3,%xmm0
+	pslld	$1,%xmm4
+
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,(%rax)
+
+	decl	%r10d
+	jz	L$done_key256
+
+	pshufd	$0xff,%xmm0,%xmm2
+	pxor	%xmm3,%xmm3
+.byte	102,15,56,221,211
+
+	movdqa	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm3,%xmm1
+
+	pxor	%xmm1,%xmm2
+	movdqu	%xmm2,16(%rax)
+	leaq	32(%rax),%rax
+	movdqa	%xmm2,%xmm1
+
+	jmp	L$oop_key256
+
+L$done_key256:
+	movl	%esi,16(%rax)
+	xorl	%eax,%eax
+	jmp	L$enc_key_ret
+
+.p2align	4
+L$bad_keybits:
+	movq	$-2,%rax
+L$enc_key_ret:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	addq	$8,%rsp
+
+	ret
+
+L$SEH_end_set_encrypt_key:
+
+.p2align	4
+L$key_expansion_128:
+	movups	%xmm0,(%rax)
+	leaq	16(%rax),%rax
+L$key_expansion_128_cold:
+	shufps	$16,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$140,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$255,%xmm1,%xmm1
+	xorps	%xmm1,%xmm0
+	ret
+
+.p2align	4
+L$key_expansion_192a:
+	movups	%xmm0,(%rax)
+	leaq	16(%rax),%rax
+L$key_expansion_192a_cold:
+	movaps	%xmm2,%xmm5
+L$key_expansion_192b_warm:
+	shufps	$16,%xmm0,%xmm4
+	movdqa	%xmm2,%xmm3
+	xorps	%xmm4,%xmm0
+	shufps	$140,%xmm0,%xmm4
+	pslldq	$4,%xmm3
+	xorps	%xmm4,%xmm0
+	pshufd	$85,%xmm1,%xmm1
+	pxor	%xmm3,%xmm2
+	pxor	%xmm1,%xmm0
+	pshufd	$255,%xmm0,%xmm3
+	pxor	%xmm3,%xmm2
+	ret
+
+.p2align	4
+L$key_expansion_192b:
+	movaps	%xmm0,%xmm3
+	shufps	$68,%xmm0,%xmm5
+	movups	%xmm5,(%rax)
+	shufps	$78,%xmm2,%xmm3
+	movups	%xmm3,16(%rax)
+	leaq	32(%rax),%rax
+	jmp	L$key_expansion_192b_warm
+
+.p2align	4
+L$key_expansion_256a:
+	movups	%xmm2,(%rax)
+	leaq	16(%rax),%rax
+L$key_expansion_256a_cold:
+	shufps	$16,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$140,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$255,%xmm1,%xmm1
+	xorps	%xmm1,%xmm0
+	ret
+
+.p2align	4
+L$key_expansion_256b:
+	movups	%xmm0,(%rax)
+	leaq	16(%rax),%rax
+
+	shufps	$16,%xmm2,%xmm4
+	xorps	%xmm4,%xmm2
+	shufps	$140,%xmm2,%xmm4
+	xorps	%xmm4,%xmm2
+	shufps	$170,%xmm1,%xmm1
+	xorps	%xmm1,%xmm2
+	ret
+
+
+.section	__DATA,__const
+.p2align	6
+L$bswap_mask:
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+L$increment32:
+.long	6,6,6,0
+L$increment64:
+.long	1,0,0,0
+L$xts_magic:
+.long	0x87,0,1,0
+L$increment1:
+.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+L$key_rotate:
+.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+L$key_rotate192:
+.long	0x04070605,0x04070605,0x04070605,0x04070605
+L$key_rcon1:
+.long	1,1,1,1
+L$key_rcon1b:
+.long	0x1b,0x1b,0x1b,0x1b
+
+.byte	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.p2align	6
+.text	
+#endif
diff --git a/gen/bcm/aesni-x86_64-linux.S b/gen/bcm/aesni-x86_64-linux.S
new file mode 100644
index 0000000..68742fb
--- /dev/null
+++ b/gen/bcm/aesni-x86_64-linux.S
@@ -0,0 +1,2361 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text	
+.extern	OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+.globl	aes_hw_encrypt
+.hidden aes_hw_encrypt
+.type	aes_hw_encrypt,@function
+.align	16
+aes_hw_encrypt:
+.cfi_startproc	
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+.extern	BORINGSSL_function_hit
+.hidden BORINGSSL_function_hit
+	movb	$1,BORINGSSL_function_hit+1(%rip)
+#endif
+	movups	(%rdi),%xmm2
+	movl	240(%rdx),%eax
+	movups	(%rdx),%xmm0
+	movups	16(%rdx),%xmm1
+	leaq	32(%rdx),%rdx
+	xorps	%xmm0,%xmm2
+.Loop_enc1_1:
+.byte	102,15,56,220,209
+	decl	%eax
+	movups	(%rdx),%xmm1
+	leaq	16(%rdx),%rdx
+	jnz	.Loop_enc1_1
+.byte	102,15,56,221,209
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	ret
+.cfi_endproc	
+.size	aes_hw_encrypt,.-aes_hw_encrypt
+
+.globl	aes_hw_decrypt
+.hidden aes_hw_decrypt
+.type	aes_hw_decrypt,@function
+.align	16
+aes_hw_decrypt:
+.cfi_startproc	
+_CET_ENDBR
+	movups	(%rdi),%xmm2
+	movl	240(%rdx),%eax
+	movups	(%rdx),%xmm0
+	movups	16(%rdx),%xmm1
+	leaq	32(%rdx),%rdx
+	xorps	%xmm0,%xmm2
+.Loop_dec1_2:
+.byte	102,15,56,222,209
+	decl	%eax
+	movups	(%rdx),%xmm1
+	leaq	16(%rdx),%rdx
+	jnz	.Loop_dec1_2
+.byte	102,15,56,223,209
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	ret
+.cfi_endproc	
+.size	aes_hw_decrypt, .-aes_hw_decrypt
+.type	_aesni_encrypt2,@function
+.align	16
+_aesni_encrypt2:
+.cfi_startproc	
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	movups	32(%rcx),%xmm0
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+	addq	$16,%rax
+
+.Lenc_loop2:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Lenc_loop2
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+	ret
+.cfi_endproc	
+.size	_aesni_encrypt2,.-_aesni_encrypt2
+.type	_aesni_decrypt2,@function
+.align	16
+_aesni_decrypt2:
+.cfi_startproc	
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	movups	32(%rcx),%xmm0
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+	addq	$16,%rax
+
+.Ldec_loop2:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Ldec_loop2
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+	ret
+.cfi_endproc	
+.size	_aesni_decrypt2,.-_aesni_decrypt2
+.type	_aesni_encrypt3,@function
+.align	16
+_aesni_encrypt3:
+.cfi_startproc	
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	xorps	%xmm0,%xmm4
+	movups	32(%rcx),%xmm0
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+	addq	$16,%rax
+
+.Lenc_loop3:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Lenc_loop3
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+.byte	102,15,56,221,224
+	ret
+.cfi_endproc	
+.size	_aesni_encrypt3,.-_aesni_encrypt3
+.type	_aesni_decrypt3,@function
+.align	16
+_aesni_decrypt3:
+.cfi_startproc	
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	xorps	%xmm0,%xmm4
+	movups	32(%rcx),%xmm0
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+	addq	$16,%rax
+
+.Ldec_loop3:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Ldec_loop3
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+.byte	102,15,56,223,224
+	ret
+.cfi_endproc	
+.size	_aesni_decrypt3,.-_aesni_decrypt3
+.type	_aesni_encrypt4,@function
+.align	16
+_aesni_encrypt4:
+.cfi_startproc	
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	xorps	%xmm0,%xmm4
+	xorps	%xmm0,%xmm5
+	movups	32(%rcx),%xmm0
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+.byte	0x0f,0x1f,0x00
+	addq	$16,%rax
+
+.Lenc_loop4:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Lenc_loop4
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+.byte	102,15,56,221,224
+.byte	102,15,56,221,232
+	ret
+.cfi_endproc	
+.size	_aesni_encrypt4,.-_aesni_encrypt4
+.type	_aesni_decrypt4,@function
+.align	16
+_aesni_decrypt4:
+.cfi_startproc	
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	xorps	%xmm0,%xmm4
+	xorps	%xmm0,%xmm5
+	movups	32(%rcx),%xmm0
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+.byte	0x0f,0x1f,0x00
+	addq	$16,%rax
+
+.Ldec_loop4:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Ldec_loop4
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+.byte	102,15,56,223,224
+.byte	102,15,56,223,232
+	ret
+.cfi_endproc	
+.size	_aesni_decrypt4,.-_aesni_decrypt4
+.type	_aesni_encrypt6,@function
+.align	16
+_aesni_encrypt6:
+.cfi_startproc	
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+.byte	102,15,56,220,209
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+.byte	102,15,56,220,217
+	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
+.byte	102,15,56,220,225
+	pxor	%xmm0,%xmm7
+	movups	(%rcx,%rax,1),%xmm0
+	addq	$16,%rax
+	jmp	.Lenc_loop6_enter
+.align	16
+.Lenc_loop6:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.Lenc_loop6_enter:
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Lenc_loop6
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+.byte	102,15,56,221,224
+.byte	102,15,56,221,232
+.byte	102,15,56,221,240
+.byte	102,15,56,221,248
+	ret
+.cfi_endproc	
+.size	_aesni_encrypt6,.-_aesni_encrypt6
+.type	_aesni_decrypt6,@function
+.align	16
+_aesni_decrypt6:
+.cfi_startproc	
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+.byte	102,15,56,222,209
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+.byte	102,15,56,222,217
+	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
+.byte	102,15,56,222,225
+	pxor	%xmm0,%xmm7
+	movups	(%rcx,%rax,1),%xmm0
+	addq	$16,%rax
+	jmp	.Ldec_loop6_enter
+.align	16
+.Ldec_loop6:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.Ldec_loop6_enter:
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Ldec_loop6
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+.byte	102,15,56,223,224
+.byte	102,15,56,223,232
+.byte	102,15,56,223,240
+.byte	102,15,56,223,248
+	ret
+.cfi_endproc	
+.size	_aesni_decrypt6,.-_aesni_decrypt6
+.type	_aesni_encrypt8,@function
+.align	16
+_aesni_encrypt8:
+.cfi_startproc	
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+.byte	102,15,56,220,209
+	pxor	%xmm0,%xmm7
+	pxor	%xmm0,%xmm8
+.byte	102,15,56,220,217
+	pxor	%xmm0,%xmm9
+	movups	(%rcx,%rax,1),%xmm0
+	addq	$16,%rax
+	jmp	.Lenc_loop8_inner
+.align	16
+.Lenc_loop8:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.Lenc_loop8_inner:
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+.Lenc_loop8_enter:
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Lenc_loop8
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+.byte	102,15,56,221,224
+.byte	102,15,56,221,232
+.byte	102,15,56,221,240
+.byte	102,15,56,221,248
+.byte	102,68,15,56,221,192
+.byte	102,68,15,56,221,200
+	ret
+.cfi_endproc	
+.size	_aesni_encrypt8,.-_aesni_encrypt8
+.type	_aesni_decrypt8,@function
+.align	16
+_aesni_decrypt8:
+.cfi_startproc	
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+.byte	102,15,56,222,209
+	pxor	%xmm0,%xmm7
+	pxor	%xmm0,%xmm8
+.byte	102,15,56,222,217
+	pxor	%xmm0,%xmm9
+	movups	(%rcx,%rax,1),%xmm0
+	addq	$16,%rax
+	jmp	.Ldec_loop8_inner
+.align	16
+.Ldec_loop8:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.Ldec_loop8_inner:
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+.Ldec_loop8_enter:
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Ldec_loop8
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+.byte	102,15,56,223,224
+.byte	102,15,56,223,232
+.byte	102,15,56,223,240
+.byte	102,15,56,223,248
+.byte	102,68,15,56,223,192
+.byte	102,68,15,56,223,200
+	ret
+.cfi_endproc	
+.size	_aesni_decrypt8,.-_aesni_decrypt8
+.globl	aes_hw_ecb_encrypt
+.hidden aes_hw_ecb_encrypt
+.type	aes_hw_ecb_encrypt,@function
+.align	16
+aes_hw_ecb_encrypt:
+.cfi_startproc	
+_CET_ENDBR
+	andq	$-16,%rdx
+	jz	.Lecb_ret
+
+	movl	240(%rcx),%eax
+	movups	(%rcx),%xmm0
+	movq	%rcx,%r11
+	movl	%eax,%r10d
+	testl	%r8d,%r8d
+	jz	.Lecb_decrypt
+
+	cmpq	$0x80,%rdx
+	jb	.Lecb_enc_tail
+
+	movdqu	(%rdi),%xmm2
+	movdqu	16(%rdi),%xmm3
+	movdqu	32(%rdi),%xmm4
+	movdqu	48(%rdi),%xmm5
+	movdqu	64(%rdi),%xmm6
+	movdqu	80(%rdi),%xmm7
+	movdqu	96(%rdi),%xmm8
+	movdqu	112(%rdi),%xmm9
+	leaq	128(%rdi),%rdi
+	subq	$0x80,%rdx
+	jmp	.Lecb_enc_loop8_enter
+.align	16
+.Lecb_enc_loop8:
+	movups	%xmm2,(%rsi)
+	movq	%r11,%rcx
+	movdqu	(%rdi),%xmm2
+	movl	%r10d,%eax
+	movups	%xmm3,16(%rsi)
+	movdqu	16(%rdi),%xmm3
+	movups	%xmm4,32(%rsi)
+	movdqu	32(%rdi),%xmm4
+	movups	%xmm5,48(%rsi)
+	movdqu	48(%rdi),%xmm5
+	movups	%xmm6,64(%rsi)
+	movdqu	64(%rdi),%xmm6
+	movups	%xmm7,80(%rsi)
+	movdqu	80(%rdi),%xmm7
+	movups	%xmm8,96(%rsi)
+	movdqu	96(%rdi),%xmm8
+	movups	%xmm9,112(%rsi)
+	leaq	128(%rsi),%rsi
+	movdqu	112(%rdi),%xmm9
+	leaq	128(%rdi),%rdi
+.Lecb_enc_loop8_enter:
+
+	call	_aesni_encrypt8
+
+	subq	$0x80,%rdx
+	jnc	.Lecb_enc_loop8
+
+	movups	%xmm2,(%rsi)
+	movq	%r11,%rcx
+	movups	%xmm3,16(%rsi)
+	movl	%r10d,%eax
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	movups	%xmm6,64(%rsi)
+	movups	%xmm7,80(%rsi)
+	movups	%xmm8,96(%rsi)
+	movups	%xmm9,112(%rsi)
+	leaq	128(%rsi),%rsi
+	addq	$0x80,%rdx
+	jz	.Lecb_ret
+
+.Lecb_enc_tail:
+	movups	(%rdi),%xmm2
+	cmpq	$0x20,%rdx
+	jb	.Lecb_enc_one
+	movups	16(%rdi),%xmm3
+	je	.Lecb_enc_two
+	movups	32(%rdi),%xmm4
+	cmpq	$0x40,%rdx
+	jb	.Lecb_enc_three
+	movups	48(%rdi),%xmm5
+	je	.Lecb_enc_four
+	movups	64(%rdi),%xmm6
+	cmpq	$0x60,%rdx
+	jb	.Lecb_enc_five
+	movups	80(%rdi),%xmm7
+	je	.Lecb_enc_six
+	movdqu	96(%rdi),%xmm8
+	xorps	%xmm9,%xmm9
+	call	_aesni_encrypt8
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	movups	%xmm6,64(%rsi)
+	movups	%xmm7,80(%rsi)
+	movups	%xmm8,96(%rsi)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_enc_one:
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+.Loop_enc1_3:
+.byte	102,15,56,220,209
+	decl	%eax
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	.Loop_enc1_3
+.byte	102,15,56,221,209
+	movups	%xmm2,(%rsi)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_enc_two:
+	call	_aesni_encrypt2
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_enc_three:
+	call	_aesni_encrypt3
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_enc_four:
+	call	_aesni_encrypt4
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_enc_five:
+	xorps	%xmm7,%xmm7
+	call	_aesni_encrypt6
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	movups	%xmm6,64(%rsi)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_enc_six:
+	call	_aesni_encrypt6
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	movups	%xmm6,64(%rsi)
+	movups	%xmm7,80(%rsi)
+	jmp	.Lecb_ret
+
+.align	16
+.Lecb_decrypt:
+	cmpq	$0x80,%rdx
+	jb	.Lecb_dec_tail
+
+	movdqu	(%rdi),%xmm2
+	movdqu	16(%rdi),%xmm3
+	movdqu	32(%rdi),%xmm4
+	movdqu	48(%rdi),%xmm5
+	movdqu	64(%rdi),%xmm6
+	movdqu	80(%rdi),%xmm7
+	movdqu	96(%rdi),%xmm8
+	movdqu	112(%rdi),%xmm9
+	leaq	128(%rdi),%rdi
+	subq	$0x80,%rdx
+	jmp	.Lecb_dec_loop8_enter
+.align	16
+.Lecb_dec_loop8:
+	movups	%xmm2,(%rsi)
+	movq	%r11,%rcx
+	movdqu	(%rdi),%xmm2
+	movl	%r10d,%eax
+	movups	%xmm3,16(%rsi)
+	movdqu	16(%rdi),%xmm3
+	movups	%xmm4,32(%rsi)
+	movdqu	32(%rdi),%xmm4
+	movups	%xmm5,48(%rsi)
+	movdqu	48(%rdi),%xmm5
+	movups	%xmm6,64(%rsi)
+	movdqu	64(%rdi),%xmm6
+	movups	%xmm7,80(%rsi)
+	movdqu	80(%rdi),%xmm7
+	movups	%xmm8,96(%rsi)
+	movdqu	96(%rdi),%xmm8
+	movups	%xmm9,112(%rsi)
+	leaq	128(%rsi),%rsi
+	movdqu	112(%rdi),%xmm9
+	leaq	128(%rdi),%rdi
+.Lecb_dec_loop8_enter:
+
+	call	_aesni_decrypt8
+
+	movups	(%r11),%xmm0
+	subq	$0x80,%rdx
+	jnc	.Lecb_dec_loop8
+
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	movq	%r11,%rcx
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	movl	%r10d,%eax
+	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	movups	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
+	movups	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm6
+	movups	%xmm7,80(%rsi)
+	pxor	%xmm7,%xmm7
+	movups	%xmm8,96(%rsi)
+	pxor	%xmm8,%xmm8
+	movups	%xmm9,112(%rsi)
+	pxor	%xmm9,%xmm9
+	leaq	128(%rsi),%rsi
+	addq	$0x80,%rdx
+	jz	.Lecb_ret
+
+.Lecb_dec_tail:
+	movups	(%rdi),%xmm2
+	cmpq	$0x20,%rdx
+	jb	.Lecb_dec_one
+	movups	16(%rdi),%xmm3
+	je	.Lecb_dec_two
+	movups	32(%rdi),%xmm4
+	cmpq	$0x40,%rdx
+	jb	.Lecb_dec_three
+	movups	48(%rdi),%xmm5
+	je	.Lecb_dec_four
+	movups	64(%rdi),%xmm6
+	cmpq	$0x60,%rdx
+	jb	.Lecb_dec_five
+	movups	80(%rdi),%xmm7
+	je	.Lecb_dec_six
+	movups	96(%rdi),%xmm8
+	movups	(%rcx),%xmm0
+	xorps	%xmm9,%xmm9
+	call	_aesni_decrypt8
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	movups	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
+	movups	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm6
+	movups	%xmm7,80(%rsi)
+	pxor	%xmm7,%xmm7
+	movups	%xmm8,96(%rsi)
+	pxor	%xmm8,%xmm8
+	pxor	%xmm9,%xmm9
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_one:
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+.Loop_dec1_4:
+.byte	102,15,56,222,209
+	decl	%eax
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	.Loop_dec1_4
+.byte	102,15,56,223,209
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_two:
+	call	_aesni_decrypt2
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_three:
+	call	_aesni_decrypt3
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_four:
+	call	_aesni_decrypt4
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	movups	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_five:
+	xorps	%xmm7,%xmm7
+	call	_aesni_decrypt6
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	movups	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
+	movups	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_six:
+	call	_aesni_decrypt6
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	movups	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
+	movups	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm6
+	movups	%xmm7,80(%rsi)
+	pxor	%xmm7,%xmm7
+
+.Lecb_ret:
+	xorps	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	ret
+.cfi_endproc	
+.size	aes_hw_ecb_encrypt,.-aes_hw_ecb_encrypt
+.globl	aes_hw_ctr32_encrypt_blocks
+.hidden aes_hw_ctr32_encrypt_blocks
+.type	aes_hw_ctr32_encrypt_blocks,@function
+.align	16
+aes_hw_ctr32_encrypt_blocks:
+.cfi_startproc	
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+	movb	$1,BORINGSSL_function_hit(%rip)
+#endif
+	cmpq	$1,%rdx
+	jne	.Lctr32_bulk
+
+
+
+	movups	(%r8),%xmm2
+	movups	(%rdi),%xmm3
+	movl	240(%rcx),%edx
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+.Loop_enc1_5:
+.byte	102,15,56,220,209
+	decl	%edx
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	.Loop_enc1_5
+.byte	102,15,56,221,209
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	xorps	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movups	%xmm2,(%rsi)
+	xorps	%xmm2,%xmm2
+	jmp	.Lctr32_epilogue
+
+.align	16
+.Lctr32_bulk:
+	leaq	(%rsp),%r11
+.cfi_def_cfa_register	%r11
+	pushq	%rbp
+.cfi_offset	%rbp,-16
+	subq	$128,%rsp
+	andq	$-16,%rsp
+
+
+
+
+	movdqu	(%r8),%xmm2
+	movdqu	(%rcx),%xmm0
+	movl	12(%r8),%r8d
+	pxor	%xmm0,%xmm2
+	movl	12(%rcx),%ebp
+	movdqa	%xmm2,0(%rsp)
+	bswapl	%r8d
+	movdqa	%xmm2,%xmm3
+	movdqa	%xmm2,%xmm4
+	movdqa	%xmm2,%xmm5
+	movdqa	%xmm2,64(%rsp)
+	movdqa	%xmm2,80(%rsp)
+	movdqa	%xmm2,96(%rsp)
+	movq	%rdx,%r10
+	movdqa	%xmm2,112(%rsp)
+
+	leaq	1(%r8),%rax
+	leaq	2(%r8),%rdx
+	bswapl	%eax
+	bswapl	%edx
+	xorl	%ebp,%eax
+	xorl	%ebp,%edx
+.byte	102,15,58,34,216,3
+	leaq	3(%r8),%rax
+	movdqa	%xmm3,16(%rsp)
+.byte	102,15,58,34,226,3
+	bswapl	%eax
+	movq	%r10,%rdx
+	leaq	4(%r8),%r10
+	movdqa	%xmm4,32(%rsp)
+	xorl	%ebp,%eax
+	bswapl	%r10d
+.byte	102,15,58,34,232,3
+	xorl	%ebp,%r10d
+	movdqa	%xmm5,48(%rsp)
+	leaq	5(%r8),%r9
+	movl	%r10d,64+12(%rsp)
+	bswapl	%r9d
+	leaq	6(%r8),%r10
+	movl	240(%rcx),%eax
+	xorl	%ebp,%r9d
+	bswapl	%r10d
+	movl	%r9d,80+12(%rsp)
+	xorl	%ebp,%r10d
+	leaq	7(%r8),%r9
+	movl	%r10d,96+12(%rsp)
+	bswapl	%r9d
+	xorl	%ebp,%r9d
+	movl	%r9d,112+12(%rsp)
+
+	movups	16(%rcx),%xmm1
+
+	movdqa	64(%rsp),%xmm6
+	movdqa	80(%rsp),%xmm7
+
+	cmpq	$8,%rdx
+	jb	.Lctr32_tail
+
+	leaq	128(%rcx),%rcx
+	subq	$8,%rdx
+	jmp	.Lctr32_loop8
+
+.align	32
+.Lctr32_loop8:
+	addl	$8,%r8d
+	movdqa	96(%rsp),%xmm8
+.byte	102,15,56,220,209
+	movl	%r8d,%r9d
+	movdqa	112(%rsp),%xmm9
+.byte	102,15,56,220,217
+	bswapl	%r9d
+	movups	32-128(%rcx),%xmm0
+.byte	102,15,56,220,225
+	xorl	%ebp,%r9d
+	nop
+.byte	102,15,56,220,233
+	movl	%r9d,0+12(%rsp)
+	leaq	1(%r8),%r9
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movups	48-128(%rcx),%xmm1
+	bswapl	%r9d
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	xorl	%ebp,%r9d
+.byte	0x66,0x90
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+	movl	%r9d,16+12(%rsp)
+	leaq	2(%r8),%r9
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	64-128(%rcx),%xmm0
+	bswapl	%r9d
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	xorl	%ebp,%r9d
+.byte	0x66,0x90
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movl	%r9d,32+12(%rsp)
+	leaq	3(%r8),%r9
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movups	80-128(%rcx),%xmm1
+	bswapl	%r9d
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	xorl	%ebp,%r9d
+.byte	0x66,0x90
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+	movl	%r9d,48+12(%rsp)
+	leaq	4(%r8),%r9
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	96-128(%rcx),%xmm0
+	bswapl	%r9d
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	xorl	%ebp,%r9d
+.byte	0x66,0x90
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movl	%r9d,64+12(%rsp)
+	leaq	5(%r8),%r9
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movups	112-128(%rcx),%xmm1
+	bswapl	%r9d
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	xorl	%ebp,%r9d
+.byte	0x66,0x90
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+	movl	%r9d,80+12(%rsp)
+	leaq	6(%r8),%r9
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	128-128(%rcx),%xmm0
+	bswapl	%r9d
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	xorl	%ebp,%r9d
+.byte	0x66,0x90
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movl	%r9d,96+12(%rsp)
+	leaq	7(%r8),%r9
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movups	144-128(%rcx),%xmm1
+	bswapl	%r9d
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+	xorl	%ebp,%r9d
+	movdqu	0(%rdi),%xmm10
+.byte	102,15,56,220,232
+	movl	%r9d,112+12(%rsp)
+	cmpl	$11,%eax
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	160-128(%rcx),%xmm0
+
+	jb	.Lctr32_enc_done
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movups	176-128(%rcx),%xmm1
+
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	192-128(%rcx),%xmm0
+	je	.Lctr32_enc_done
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movups	208-128(%rcx),%xmm1
+
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	224-128(%rcx),%xmm0
+	jmp	.Lctr32_enc_done
+
+.align	16
+.Lctr32_enc_done:
+	movdqu	16(%rdi),%xmm11
+	pxor	%xmm0,%xmm10
+	movdqu	32(%rdi),%xmm12
+	pxor	%xmm0,%xmm11
+	movdqu	48(%rdi),%xmm13
+	pxor	%xmm0,%xmm12
+	movdqu	64(%rdi),%xmm14
+	pxor	%xmm0,%xmm13
+	movdqu	80(%rdi),%xmm15
+	pxor	%xmm0,%xmm14
+	prefetcht0	448(%rdi)
+	prefetcht0	512(%rdi)
+	pxor	%xmm0,%xmm15
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movdqu	96(%rdi),%xmm1
+	leaq	128(%rdi),%rdi
+
+.byte	102,65,15,56,221,210
+	pxor	%xmm0,%xmm1
+	movdqu	112-128(%rdi),%xmm10
+.byte	102,65,15,56,221,219
+	pxor	%xmm0,%xmm10
+	movdqa	0(%rsp),%xmm11
+.byte	102,65,15,56,221,228
+.byte	102,65,15,56,221,237
+	movdqa	16(%rsp),%xmm12
+	movdqa	32(%rsp),%xmm13
+.byte	102,65,15,56,221,246
+.byte	102,65,15,56,221,255
+	movdqa	48(%rsp),%xmm14
+	movdqa	64(%rsp),%xmm15
+.byte	102,68,15,56,221,193
+	movdqa	80(%rsp),%xmm0
+	movups	16-128(%rcx),%xmm1
+.byte	102,69,15,56,221,202
+
+	movups	%xmm2,(%rsi)
+	movdqa	%xmm11,%xmm2
+	movups	%xmm3,16(%rsi)
+	movdqa	%xmm12,%xmm3
+	movups	%xmm4,32(%rsi)
+	movdqa	%xmm13,%xmm4
+	movups	%xmm5,48(%rsi)
+	movdqa	%xmm14,%xmm5
+	movups	%xmm6,64(%rsi)
+	movdqa	%xmm15,%xmm6
+	movups	%xmm7,80(%rsi)
+	movdqa	%xmm0,%xmm7
+	movups	%xmm8,96(%rsi)
+	movups	%xmm9,112(%rsi)
+	leaq	128(%rsi),%rsi
+
+	subq	$8,%rdx
+	jnc	.Lctr32_loop8
+
+	addq	$8,%rdx
+	jz	.Lctr32_done
+	leaq	-128(%rcx),%rcx
+
+.Lctr32_tail:
+
+
+	leaq	16(%rcx),%rcx
+	cmpq	$4,%rdx
+	jb	.Lctr32_loop3
+	je	.Lctr32_loop4
+
+
+	shll	$4,%eax
+	movdqa	96(%rsp),%xmm8
+	pxor	%xmm9,%xmm9
+
+	movups	16(%rcx),%xmm0
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	leaq	32-16(%rcx,%rax,1),%rcx
+	negq	%rax
+.byte	102,15,56,220,225
+	addq	$16,%rax
+	movups	(%rdi),%xmm10
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+	movups	16(%rdi),%xmm11
+	movups	32(%rdi),%xmm12
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+
+	call	.Lenc_loop8_enter
+
+	movdqu	48(%rdi),%xmm13
+	pxor	%xmm10,%xmm2
+	movdqu	64(%rdi),%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm10,%xmm6
+	movdqu	%xmm5,48(%rsi)
+	movdqu	%xmm6,64(%rsi)
+	cmpq	$6,%rdx
+	jb	.Lctr32_done
+
+	movups	80(%rdi),%xmm11
+	xorps	%xmm11,%xmm7
+	movups	%xmm7,80(%rsi)
+	je	.Lctr32_done
+
+	movups	96(%rdi),%xmm12
+	xorps	%xmm12,%xmm8
+	movups	%xmm8,96(%rsi)
+	jmp	.Lctr32_done
+
+.align	32
+.Lctr32_loop4:
+.byte	102,15,56,220,209
+	leaq	16(%rcx),%rcx
+	decl	%eax
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movups	(%rcx),%xmm1
+	jnz	.Lctr32_loop4
+.byte	102,15,56,221,209
+.byte	102,15,56,221,217
+	movups	(%rdi),%xmm10
+	movups	16(%rdi),%xmm11
+.byte	102,15,56,221,225
+.byte	102,15,56,221,233
+	movups	32(%rdi),%xmm12
+	movups	48(%rdi),%xmm13
+
+	xorps	%xmm10,%xmm2
+	movups	%xmm2,(%rsi)
+	xorps	%xmm11,%xmm3
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm5,48(%rsi)
+	jmp	.Lctr32_done
+
+.align	32
+.Lctr32_loop3:
+.byte	102,15,56,220,209
+	leaq	16(%rcx),%rcx
+	decl	%eax
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+	movups	(%rcx),%xmm1
+	jnz	.Lctr32_loop3
+.byte	102,15,56,221,209
+.byte	102,15,56,221,217
+.byte	102,15,56,221,225
+
+	movups	(%rdi),%xmm10
+	xorps	%xmm10,%xmm2
+	movups	%xmm2,(%rsi)
+	cmpq	$2,%rdx
+	jb	.Lctr32_done
+
+	movups	16(%rdi),%xmm11
+	xorps	%xmm11,%xmm3
+	movups	%xmm3,16(%rsi)
+	je	.Lctr32_done
+
+	movups	32(%rdi),%xmm12
+	xorps	%xmm12,%xmm4
+	movups	%xmm4,32(%rsi)
+
+.Lctr32_done:
+	xorps	%xmm0,%xmm0
+	xorl	%ebp,%ebp
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	movaps	%xmm0,0(%rsp)
+	pxor	%xmm8,%xmm8
+	movaps	%xmm0,16(%rsp)
+	pxor	%xmm9,%xmm9
+	movaps	%xmm0,32(%rsp)
+	pxor	%xmm10,%xmm10
+	movaps	%xmm0,48(%rsp)
+	pxor	%xmm11,%xmm11
+	movaps	%xmm0,64(%rsp)
+	pxor	%xmm12,%xmm12
+	movaps	%xmm0,80(%rsp)
+	pxor	%xmm13,%xmm13
+	movaps	%xmm0,96(%rsp)
+	pxor	%xmm14,%xmm14
+	movaps	%xmm0,112(%rsp)
+	pxor	%xmm15,%xmm15
+	movq	-8(%r11),%rbp
+.cfi_restore	%rbp
+	leaq	(%r11),%rsp
+.cfi_def_cfa_register	%rsp
+.Lctr32_epilogue:
+	ret
+.cfi_endproc	
+.size	aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks
+.globl	aes_hw_cbc_encrypt
+.hidden aes_hw_cbc_encrypt
+.type	aes_hw_cbc_encrypt,@function
+.align	16
+aes_hw_cbc_encrypt:
+.cfi_startproc	
+_CET_ENDBR
+	testq	%rdx,%rdx
+	jz	.Lcbc_ret
+
+	movl	240(%rcx),%r10d
+	movq	%rcx,%r11
+	testl	%r9d,%r9d
+	jz	.Lcbc_decrypt
+
+	movups	(%r8),%xmm2
+	movl	%r10d,%eax
+	cmpq	$16,%rdx
+	jb	.Lcbc_enc_tail
+	subq	$16,%rdx
+	jmp	.Lcbc_enc_loop
+.align	16
+.Lcbc_enc_loop:
+	movups	(%rdi),%xmm3
+	leaq	16(%rdi),%rdi
+
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm3
+	leaq	32(%rcx),%rcx
+	xorps	%xmm3,%xmm2
+.Loop_enc1_6:
+.byte	102,15,56,220,209
+	decl	%eax
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	.Loop_enc1_6
+.byte	102,15,56,221,209
+	movl	%r10d,%eax
+	movq	%r11,%rcx
+	movups	%xmm2,0(%rsi)
+	leaq	16(%rsi),%rsi
+	subq	$16,%rdx
+	jnc	.Lcbc_enc_loop
+	addq	$16,%rdx
+	jnz	.Lcbc_enc_tail
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	movups	%xmm2,(%r8)
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	jmp	.Lcbc_ret
+
+.Lcbc_enc_tail:
+	movq	%rdx,%rcx
+	xchgq	%rdi,%rsi
+.long	0x9066A4F3
+	movl	$16,%ecx
+	subq	%rdx,%rcx
+	xorl	%eax,%eax
+.long	0x9066AAF3
+	leaq	-16(%rdi),%rdi
+	movl	%r10d,%eax
+	movq	%rdi,%rsi
+	movq	%r11,%rcx
+	xorq	%rdx,%rdx
+	jmp	.Lcbc_enc_loop
+
+.align	16
+.Lcbc_decrypt:
+	cmpq	$16,%rdx
+	jne	.Lcbc_decrypt_bulk
+
+
+
+	movdqu	(%rdi),%xmm2
+	movdqu	(%r8),%xmm3
+	movdqa	%xmm2,%xmm4
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+.Loop_dec1_7:
+.byte	102,15,56,222,209
+	decl	%r10d
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	.Loop_dec1_7
+.byte	102,15,56,223,209
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	movdqu	%xmm4,(%r8)
+	xorps	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	jmp	.Lcbc_ret
+.align	16
+.Lcbc_decrypt_bulk:
+	leaq	(%rsp),%r11
+.cfi_def_cfa_register	%r11
+	pushq	%rbp
+.cfi_offset	%rbp,-16
+	subq	$16,%rsp
+	andq	$-16,%rsp
+	movq	%rcx,%rbp
+	movups	(%r8),%xmm10
+	movl	%r10d,%eax
+	cmpq	$0x50,%rdx
+	jbe	.Lcbc_dec_tail
+
+	movups	(%rcx),%xmm0
+	movdqu	0(%rdi),%xmm2
+	movdqu	16(%rdi),%xmm3
+	movdqa	%xmm2,%xmm11
+	movdqu	32(%rdi),%xmm4
+	movdqa	%xmm3,%xmm12
+	movdqu	48(%rdi),%xmm5
+	movdqa	%xmm4,%xmm13
+	movdqu	64(%rdi),%xmm6
+	movdqa	%xmm5,%xmm14
+	movdqu	80(%rdi),%xmm7
+	movdqa	%xmm6,%xmm15
+	cmpq	$0x70,%rdx
+	jbe	.Lcbc_dec_six_or_seven
+
+	subq	$0x70,%rdx
+	leaq	112(%rcx),%rcx
+	jmp	.Lcbc_dec_loop8_enter
+.align	16
+.Lcbc_dec_loop8:
+	movups	%xmm9,(%rsi)
+	leaq	16(%rsi),%rsi
+.Lcbc_dec_loop8_enter:
+	movdqu	96(%rdi),%xmm8
+	pxor	%xmm0,%xmm2
+	movdqu	112(%rdi),%xmm9
+	pxor	%xmm0,%xmm3
+	movups	16-112(%rcx),%xmm1
+	pxor	%xmm0,%xmm4
+	movq	$-1,%rbp
+	cmpq	$0x70,%rdx
+	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
+	pxor	%xmm0,%xmm7
+	pxor	%xmm0,%xmm8
+
+.byte	102,15,56,222,209
+	pxor	%xmm0,%xmm9
+	movups	32-112(%rcx),%xmm0
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+	adcq	$0,%rbp
+	andq	$128,%rbp
+.byte	102,68,15,56,222,201
+	addq	%rdi,%rbp
+	movups	48-112(%rcx),%xmm1
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	64-112(%rcx),%xmm0
+	nop
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	80-112(%rcx),%xmm1
+	nop
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	96-112(%rcx),%xmm0
+	nop
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	112-112(%rcx),%xmm1
+	nop
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	128-112(%rcx),%xmm0
+	nop
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	144-112(%rcx),%xmm1
+	cmpl	$11,%eax
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	160-112(%rcx),%xmm0
+	jb	.Lcbc_dec_done
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	176-112(%rcx),%xmm1
+	nop
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	192-112(%rcx),%xmm0
+	je	.Lcbc_dec_done
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	208-112(%rcx),%xmm1
+	nop
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	224-112(%rcx),%xmm0
+	jmp	.Lcbc_dec_done
+.align	16
+.Lcbc_dec_done:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+	pxor	%xmm0,%xmm10
+	pxor	%xmm0,%xmm11
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+	pxor	%xmm0,%xmm12
+	pxor	%xmm0,%xmm13
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+	pxor	%xmm0,%xmm14
+	pxor	%xmm0,%xmm15
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movdqu	80(%rdi),%xmm1
+
+.byte	102,65,15,56,223,210
+	movdqu	96(%rdi),%xmm10
+	pxor	%xmm0,%xmm1
+.byte	102,65,15,56,223,219
+	pxor	%xmm0,%xmm10
+	movdqu	112(%rdi),%xmm0
+.byte	102,65,15,56,223,228
+	leaq	128(%rdi),%rdi
+	movdqu	0(%rbp),%xmm11
+.byte	102,65,15,56,223,237
+.byte	102,65,15,56,223,246
+	movdqu	16(%rbp),%xmm12
+	movdqu	32(%rbp),%xmm13
+.byte	102,65,15,56,223,255
+.byte	102,68,15,56,223,193
+	movdqu	48(%rbp),%xmm14
+	movdqu	64(%rbp),%xmm15
+.byte	102,69,15,56,223,202
+	movdqa	%xmm0,%xmm10
+	movdqu	80(%rbp),%xmm1
+	movups	-112(%rcx),%xmm0
+
+	movups	%xmm2,(%rsi)
+	movdqa	%xmm11,%xmm2
+	movups	%xmm3,16(%rsi)
+	movdqa	%xmm12,%xmm3
+	movups	%xmm4,32(%rsi)
+	movdqa	%xmm13,%xmm4
+	movups	%xmm5,48(%rsi)
+	movdqa	%xmm14,%xmm5
+	movups	%xmm6,64(%rsi)
+	movdqa	%xmm15,%xmm6
+	movups	%xmm7,80(%rsi)
+	movdqa	%xmm1,%xmm7
+	movups	%xmm8,96(%rsi)
+	leaq	112(%rsi),%rsi
+
+	subq	$0x80,%rdx
+	ja	.Lcbc_dec_loop8
+
+	movaps	%xmm9,%xmm2
+	leaq	-112(%rcx),%rcx
+	addq	$0x70,%rdx
+	jle	.Lcbc_dec_clear_tail_collected
+	movups	%xmm9,(%rsi)
+	leaq	16(%rsi),%rsi
+	cmpq	$0x50,%rdx
+	jbe	.Lcbc_dec_tail
+
+	movaps	%xmm11,%xmm2
+.Lcbc_dec_six_or_seven:
+	cmpq	$0x60,%rdx
+	ja	.Lcbc_dec_seven
+
+	movaps	%xmm7,%xmm8
+	call	_aesni_decrypt6
+	pxor	%xmm10,%xmm2
+	movaps	%xmm8,%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	pxor	%xmm14,%xmm6
+	movdqu	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
+	pxor	%xmm15,%xmm7
+	movdqu	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm6
+	leaq	80(%rsi),%rsi
+	movdqa	%xmm7,%xmm2
+	pxor	%xmm7,%xmm7
+	jmp	.Lcbc_dec_tail_collected
+
+.align	16
+.Lcbc_dec_seven:
+	movups	96(%rdi),%xmm8
+	xorps	%xmm9,%xmm9
+	call	_aesni_decrypt8
+	movups	80(%rdi),%xmm9
+	pxor	%xmm10,%xmm2
+	movups	96(%rdi),%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	pxor	%xmm14,%xmm6
+	movdqu	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
+	pxor	%xmm15,%xmm7
+	movdqu	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm6
+	pxor	%xmm9,%xmm8
+	movdqu	%xmm7,80(%rsi)
+	pxor	%xmm7,%xmm7
+	leaq	96(%rsi),%rsi
+	movdqa	%xmm8,%xmm2
+	pxor	%xmm8,%xmm8
+	pxor	%xmm9,%xmm9
+	jmp	.Lcbc_dec_tail_collected
+
+.Lcbc_dec_tail:
+	movups	(%rdi),%xmm2
+	subq	$0x10,%rdx
+	jbe	.Lcbc_dec_one
+
+	movups	16(%rdi),%xmm3
+	movaps	%xmm2,%xmm11
+	subq	$0x10,%rdx
+	jbe	.Lcbc_dec_two
+
+	movups	32(%rdi),%xmm4
+	movaps	%xmm3,%xmm12
+	subq	$0x10,%rdx
+	jbe	.Lcbc_dec_three
+
+	movups	48(%rdi),%xmm5
+	movaps	%xmm4,%xmm13
+	subq	$0x10,%rdx
+	jbe	.Lcbc_dec_four
+
+	movups	64(%rdi),%xmm6
+	movaps	%xmm5,%xmm14
+	movaps	%xmm6,%xmm15
+	xorps	%xmm7,%xmm7
+	call	_aesni_decrypt6
+	pxor	%xmm10,%xmm2
+	movaps	%xmm15,%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	pxor	%xmm14,%xmm6
+	movdqu	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
+	leaq	64(%rsi),%rsi
+	movdqa	%xmm6,%xmm2
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	subq	$0x10,%rdx
+	jmp	.Lcbc_dec_tail_collected
+
+.align	16
+.Lcbc_dec_one:
+	movaps	%xmm2,%xmm11
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+.Loop_dec1_8:
+.byte	102,15,56,222,209
+	decl	%eax
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	.Loop_dec1_8
+.byte	102,15,56,223,209
+	xorps	%xmm10,%xmm2
+	movaps	%xmm11,%xmm10
+	jmp	.Lcbc_dec_tail_collected
+.align	16
+.Lcbc_dec_two:
+	movaps	%xmm3,%xmm12
+	call	_aesni_decrypt2
+	pxor	%xmm10,%xmm2
+	movaps	%xmm12,%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	movdqa	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	leaq	16(%rsi),%rsi
+	jmp	.Lcbc_dec_tail_collected
+.align	16
+.Lcbc_dec_three:
+	movaps	%xmm4,%xmm13
+	call	_aesni_decrypt3
+	pxor	%xmm10,%xmm2
+	movaps	%xmm13,%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	movdqa	%xmm4,%xmm2
+	pxor	%xmm4,%xmm4
+	leaq	32(%rsi),%rsi
+	jmp	.Lcbc_dec_tail_collected
+.align	16
+.Lcbc_dec_four:
+	movaps	%xmm5,%xmm14
+	call	_aesni_decrypt4
+	pxor	%xmm10,%xmm2
+	movaps	%xmm14,%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	movdqa	%xmm5,%xmm2
+	pxor	%xmm5,%xmm5
+	leaq	48(%rsi),%rsi
+	jmp	.Lcbc_dec_tail_collected
+
+.align	16
+.Lcbc_dec_clear_tail_collected:
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	pxor	%xmm8,%xmm8
+	pxor	%xmm9,%xmm9
+.Lcbc_dec_tail_collected:
+	movups	%xmm10,(%r8)
+	andq	$15,%rdx
+	jnz	.Lcbc_dec_tail_partial
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	jmp	.Lcbc_dec_ret
+.align	16
+.Lcbc_dec_tail_partial:
+	movaps	%xmm2,(%rsp)
+	pxor	%xmm2,%xmm2
+	movq	$16,%rcx
+	movq	%rsi,%rdi
+	subq	%rdx,%rcx
+	leaq	(%rsp),%rsi
+.long	0x9066A4F3
+	movdqa	%xmm2,(%rsp)
+
+.Lcbc_dec_ret:
+	xorps	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	movq	-8(%r11),%rbp
+.cfi_restore	%rbp
+	leaq	(%r11),%rsp
+.cfi_def_cfa_register	%rsp
+.Lcbc_ret:
+	ret
+.cfi_endproc	
+.size	aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt
+.globl	aes_hw_set_decrypt_key
+.hidden aes_hw_set_decrypt_key
+.type	aes_hw_set_decrypt_key,@function
+.align	16
+aes_hw_set_decrypt_key:
+.cfi_startproc	
+_CET_ENDBR
+.byte	0x48,0x83,0xEC,0x08
+.cfi_adjust_cfa_offset	8
+	call	__aesni_set_encrypt_key
+	shll	$4,%esi
+	testl	%eax,%eax
+	jnz	.Ldec_key_ret
+	leaq	16(%rdx,%rsi,1),%rdi
+
+	movups	(%rdx),%xmm0
+	movups	(%rdi),%xmm1
+	movups	%xmm0,(%rdi)
+	movups	%xmm1,(%rdx)
+	leaq	16(%rdx),%rdx
+	leaq	-16(%rdi),%rdi
+
+.Ldec_key_inverse:
+	movups	(%rdx),%xmm0
+	movups	(%rdi),%xmm1
+.byte	102,15,56,219,192
+.byte	102,15,56,219,201
+	leaq	16(%rdx),%rdx
+	leaq	-16(%rdi),%rdi
+	movups	%xmm0,16(%rdi)
+	movups	%xmm1,-16(%rdx)
+	cmpq	%rdx,%rdi
+	ja	.Ldec_key_inverse
+
+	movups	(%rdx),%xmm0
+.byte	102,15,56,219,192
+	pxor	%xmm1,%xmm1
+	movups	%xmm0,(%rdi)
+	pxor	%xmm0,%xmm0
+.Ldec_key_ret:
+	addq	$8,%rsp
+.cfi_adjust_cfa_offset	-8
+	ret
+.cfi_endproc	
+.LSEH_end_set_decrypt_key:
+.size	aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key
+.globl	aes_hw_set_encrypt_key
+.hidden aes_hw_set_encrypt_key
+.type	aes_hw_set_encrypt_key,@function
+.align	16
+aes_hw_set_encrypt_key:
+__aesni_set_encrypt_key:
+.cfi_startproc	
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+	movb	$1,BORINGSSL_function_hit+3(%rip)
+#endif
+.byte	0x48,0x83,0xEC,0x08
+.cfi_adjust_cfa_offset	8
+	movq	$-1,%rax
+	testq	%rdi,%rdi
+	jz	.Lenc_key_ret
+	testq	%rdx,%rdx
+	jz	.Lenc_key_ret
+
+	movups	(%rdi),%xmm0
+	xorps	%xmm4,%xmm4
+	leaq	OPENSSL_ia32cap_P(%rip),%r10
+	movl	4(%r10),%r10d
+	andl	$268437504,%r10d
+	leaq	16(%rdx),%rax
+	cmpl	$256,%esi
+	je	.L14rounds
+	cmpl	$192,%esi
+	je	.L12rounds
+	cmpl	$128,%esi
+	jne	.Lbad_keybits
+
+.L10rounds:
+	movl	$9,%esi
+	cmpl	$268435456,%r10d
+	je	.L10rounds_alt
+
+	movups	%xmm0,(%rdx)
+.byte	102,15,58,223,200,1
+	call	.Lkey_expansion_128_cold
+.byte	102,15,58,223,200,2
+	call	.Lkey_expansion_128
+.byte	102,15,58,223,200,4
+	call	.Lkey_expansion_128
+.byte	102,15,58,223,200,8
+	call	.Lkey_expansion_128
+.byte	102,15,58,223,200,16
+	call	.Lkey_expansion_128
+.byte	102,15,58,223,200,32
+	call	.Lkey_expansion_128
+.byte	102,15,58,223,200,64
+	call	.Lkey_expansion_128
+.byte	102,15,58,223,200,128
+	call	.Lkey_expansion_128
+.byte	102,15,58,223,200,27
+	call	.Lkey_expansion_128
+.byte	102,15,58,223,200,54
+	call	.Lkey_expansion_128
+	movups	%xmm0,(%rax)
+	movl	%esi,80(%rax)
+	xorl	%eax,%eax
+	jmp	.Lenc_key_ret
+
+.align	16
+.L10rounds_alt:
+	movdqa	.Lkey_rotate(%rip),%xmm5
+	movl	$8,%r10d
+	movdqa	.Lkey_rcon1(%rip),%xmm4
+	movdqa	%xmm0,%xmm2
+	movdqu	%xmm0,(%rdx)
+	jmp	.Loop_key128
+
+.align	16
+.Loop_key128:
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+	pslld	$1,%xmm4
+	leaq	16(%rax),%rax
+
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,-16(%rax)
+	movdqa	%xmm0,%xmm2
+
+	decl	%r10d
+	jnz	.Loop_key128
+
+	movdqa	.Lkey_rcon1b(%rip),%xmm4
+
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+	pslld	$1,%xmm4
+
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,(%rax)
+
+	movdqa	%xmm0,%xmm2
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,16(%rax)
+
+	movl	%esi,96(%rax)
+	xorl	%eax,%eax
+	jmp	.Lenc_key_ret
+
+.align	16
+.L12rounds:
+	movq	16(%rdi),%xmm2
+	movl	$11,%esi
+	cmpl	$268435456,%r10d
+	je	.L12rounds_alt
+
+	movups	%xmm0,(%rdx)
+.byte	102,15,58,223,202,1
+	call	.Lkey_expansion_192a_cold
+.byte	102,15,58,223,202,2
+	call	.Lkey_expansion_192b
+.byte	102,15,58,223,202,4
+	call	.Lkey_expansion_192a
+.byte	102,15,58,223,202,8
+	call	.Lkey_expansion_192b
+.byte	102,15,58,223,202,16
+	call	.Lkey_expansion_192a
+.byte	102,15,58,223,202,32
+	call	.Lkey_expansion_192b
+.byte	102,15,58,223,202,64
+	call	.Lkey_expansion_192a
+.byte	102,15,58,223,202,128
+	call	.Lkey_expansion_192b
+	movups	%xmm0,(%rax)
+	movl	%esi,48(%rax)
+	xorq	%rax,%rax
+	jmp	.Lenc_key_ret
+
+.align	16
+.L12rounds_alt:
+	movdqa	.Lkey_rotate192(%rip),%xmm5
+	movdqa	.Lkey_rcon1(%rip),%xmm4
+	movl	$8,%r10d
+	movdqu	%xmm0,(%rdx)
+	jmp	.Loop_key192
+
+.align	16
+.Loop_key192:
+	movq	%xmm2,0(%rax)
+	movdqa	%xmm2,%xmm1
+.byte	102,15,56,0,213
+.byte	102,15,56,221,212
+	pslld	$1,%xmm4
+	leaq	24(%rax),%rax
+
+	movdqa	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm3,%xmm0
+
+	pshufd	$0xff,%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+
+	pxor	%xmm2,%xmm0
+	pxor	%xmm3,%xmm2
+	movdqu	%xmm0,-16(%rax)
+
+	decl	%r10d
+	jnz	.Loop_key192
+
+	movl	%esi,32(%rax)
+	xorl	%eax,%eax
+	jmp	.Lenc_key_ret
+
+.align	16
+.L14rounds:
+	movups	16(%rdi),%xmm2
+	movl	$13,%esi
+	leaq	16(%rax),%rax
+	cmpl	$268435456,%r10d
+	je	.L14rounds_alt
+
+	movups	%xmm0,(%rdx)
+	movups	%xmm2,16(%rdx)
+.byte	102,15,58,223,202,1
+	call	.Lkey_expansion_256a_cold
+.byte	102,15,58,223,200,1
+	call	.Lkey_expansion_256b
+.byte	102,15,58,223,202,2
+	call	.Lkey_expansion_256a
+.byte	102,15,58,223,200,2
+	call	.Lkey_expansion_256b
+.byte	102,15,58,223,202,4
+	call	.Lkey_expansion_256a
+.byte	102,15,58,223,200,4
+	call	.Lkey_expansion_256b
+.byte	102,15,58,223,202,8
+	call	.Lkey_expansion_256a
+.byte	102,15,58,223,200,8
+	call	.Lkey_expansion_256b
+.byte	102,15,58,223,202,16
+	call	.Lkey_expansion_256a
+.byte	102,15,58,223,200,16
+	call	.Lkey_expansion_256b
+.byte	102,15,58,223,202,32
+	call	.Lkey_expansion_256a
+.byte	102,15,58,223,200,32
+	call	.Lkey_expansion_256b
+.byte	102,15,58,223,202,64
+	call	.Lkey_expansion_256a
+	movups	%xmm0,(%rax)
+	movl	%esi,16(%rax)
+	xorq	%rax,%rax
+	jmp	.Lenc_key_ret
+
+.align	16
+.L14rounds_alt:
+	movdqa	.Lkey_rotate(%rip),%xmm5
+	movdqa	.Lkey_rcon1(%rip),%xmm4
+	movl	$7,%r10d
+	movdqu	%xmm0,0(%rdx)
+	movdqa	%xmm2,%xmm1
+	movdqu	%xmm2,16(%rdx)
+	jmp	.Loop_key256
+
+.align	16
+.Loop_key256:
+.byte	102,15,56,0,213
+.byte	102,15,56,221,212
+
+	movdqa	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm3,%xmm0
+	pslld	$1,%xmm4
+
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,(%rax)
+
+	decl	%r10d
+	jz	.Ldone_key256
+
+	pshufd	$0xff,%xmm0,%xmm2
+	pxor	%xmm3,%xmm3
+.byte	102,15,56,221,211
+
+	movdqa	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm3,%xmm1
+
+	pxor	%xmm1,%xmm2
+	movdqu	%xmm2,16(%rax)
+	leaq	32(%rax),%rax
+	movdqa	%xmm2,%xmm1
+
+	jmp	.Loop_key256
+
+.Ldone_key256:
+	movl	%esi,16(%rax)
+	xorl	%eax,%eax
+	jmp	.Lenc_key_ret
+
+.align	16
+.Lbad_keybits:
+	movq	$-2,%rax
+.Lenc_key_ret:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	addq	$8,%rsp
+.cfi_adjust_cfa_offset	-8
+	ret
+.cfi_endproc	
+.LSEH_end_set_encrypt_key:
+
+.align	16
+.Lkey_expansion_128:
+	movups	%xmm0,(%rax)
+	leaq	16(%rax),%rax
+.Lkey_expansion_128_cold:
+	shufps	$16,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$140,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$255,%xmm1,%xmm1
+	xorps	%xmm1,%xmm0
+	ret
+
+.align	16
+.Lkey_expansion_192a:
+	movups	%xmm0,(%rax)
+	leaq	16(%rax),%rax
+.Lkey_expansion_192a_cold:
+	movaps	%xmm2,%xmm5
+.Lkey_expansion_192b_warm:
+	shufps	$16,%xmm0,%xmm4
+	movdqa	%xmm2,%xmm3
+	xorps	%xmm4,%xmm0
+	shufps	$140,%xmm0,%xmm4
+	pslldq	$4,%xmm3
+	xorps	%xmm4,%xmm0
+	pshufd	$85,%xmm1,%xmm1
+	pxor	%xmm3,%xmm2
+	pxor	%xmm1,%xmm0
+	pshufd	$255,%xmm0,%xmm3
+	pxor	%xmm3,%xmm2
+	ret
+
+.align	16
+.Lkey_expansion_192b:
+	movaps	%xmm0,%xmm3
+	shufps	$68,%xmm0,%xmm5
+	movups	%xmm5,(%rax)
+	shufps	$78,%xmm2,%xmm3
+	movups	%xmm3,16(%rax)
+	leaq	32(%rax),%rax
+	jmp	.Lkey_expansion_192b_warm
+
+.align	16
+.Lkey_expansion_256a:
+	movups	%xmm2,(%rax)
+	leaq	16(%rax),%rax
+.Lkey_expansion_256a_cold:
+	shufps	$16,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$140,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$255,%xmm1,%xmm1
+	xorps	%xmm1,%xmm0
+	ret
+
+.align	16
+.Lkey_expansion_256b:
+	movups	%xmm0,(%rax)
+	leaq	16(%rax),%rax
+
+	shufps	$16,%xmm2,%xmm4
+	xorps	%xmm4,%xmm2
+	shufps	$140,%xmm2,%xmm4
+	xorps	%xmm4,%xmm2
+	shufps	$170,%xmm1,%xmm1
+	xorps	%xmm1,%xmm2
+	ret
+.size	aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key
+.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
+.section	.rodata
+.align	64
+.Lbswap_mask:
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lincrement32:
+.long	6,6,6,0
+.Lincrement64:
+.long	1,0,0,0
+.Lxts_magic:
+.long	0x87,0,1,0
+.Lincrement1:
+.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Lkey_rotate:
+.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+.Lkey_rotate192:
+.long	0x04070605,0x04070605,0x04070605,0x04070605
+.Lkey_rcon1:
+.long	1,1,1,1
+.Lkey_rcon1b:
+.long	0x1b,0x1b,0x1b,0x1b
+
+.byte	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	64
+.text	
+#endif
diff --git a/gen/bcm/aesni-x86_64-win.asm b/gen/bcm/aesni-x86_64-win.asm
new file mode 100644
index 0000000..6c5d9ad
--- /dev/null
+++ b/gen/bcm/aesni-x86_64-win.asm
@@ -0,0 +1,2676 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section	.text code align=64
+
+EXTERN	OPENSSL_ia32cap_P
+global	aes_hw_encrypt
+
+ALIGN	16
+aes_hw_encrypt:
+
+_CET_ENDBR
+%ifdef BORINGSSL_DISPATCH_TEST
+EXTERN	BORINGSSL_function_hit
+	mov	BYTE[((BORINGSSL_function_hit+1))],1
+%endif
+	movups	xmm2,XMMWORD[rcx]
+	mov	eax,DWORD[240+r8]
+	movups	xmm0,XMMWORD[r8]
+	movups	xmm1,XMMWORD[16+r8]
+	lea	r8,[32+r8]
+	xorps	xmm2,xmm0
+$L$oop_enc1_1:
+	DB	102,15,56,220,209
+	dec	eax
+	movups	xmm1,XMMWORD[r8]
+	lea	r8,[16+r8]
+	jnz	NEAR $L$oop_enc1_1
+	DB	102,15,56,221,209
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	movups	XMMWORD[rdx],xmm2
+	pxor	xmm2,xmm2
+	ret
+
+
+
+global	aes_hw_decrypt
+
+ALIGN	16
+aes_hw_decrypt:
+
+_CET_ENDBR
+	movups	xmm2,XMMWORD[rcx]
+	mov	eax,DWORD[240+r8]
+	movups	xmm0,XMMWORD[r8]
+	movups	xmm1,XMMWORD[16+r8]
+	lea	r8,[32+r8]
+	xorps	xmm2,xmm0
+$L$oop_dec1_2:
+	DB	102,15,56,222,209
+	dec	eax
+	movups	xmm1,XMMWORD[r8]
+	lea	r8,[16+r8]
+	jnz	NEAR $L$oop_dec1_2
+	DB	102,15,56,223,209
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	movups	XMMWORD[rdx],xmm2
+	pxor	xmm2,xmm2
+	ret
+
+
+
+ALIGN	16
+_aesni_encrypt2:
+
+	movups	xmm0,XMMWORD[rcx]
+	shl	eax,4
+	movups	xmm1,XMMWORD[16+rcx]
+	xorps	xmm2,xmm0
+	xorps	xmm3,xmm0
+	movups	xmm0,XMMWORD[32+rcx]
+	lea	rcx,[32+rax*1+rcx]
+	neg	rax
+	add	rax,16
+
+$L$enc_loop2:
+	DB	102,15,56,220,209
+	DB	102,15,56,220,217
+	movups	xmm1,XMMWORD[rax*1+rcx]
+	add	rax,32
+	DB	102,15,56,220,208
+	DB	102,15,56,220,216
+	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
+	jnz	NEAR $L$enc_loop2
+
+	DB	102,15,56,220,209
+	DB	102,15,56,220,217
+	DB	102,15,56,221,208
+	DB	102,15,56,221,216
+	ret
+
+
+
+ALIGN	16
+_aesni_decrypt2:
+
+	movups	xmm0,XMMWORD[rcx]
+	shl	eax,4
+	movups	xmm1,XMMWORD[16+rcx]
+	xorps	xmm2,xmm0
+	xorps	xmm3,xmm0
+	movups	xmm0,XMMWORD[32+rcx]
+	lea	rcx,[32+rax*1+rcx]
+	neg	rax
+	add	rax,16
+
+$L$dec_loop2:
+	DB	102,15,56,222,209
+	DB	102,15,56,222,217
+	movups	xmm1,XMMWORD[rax*1+rcx]
+	add	rax,32
+	DB	102,15,56,222,208
+	DB	102,15,56,222,216
+	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
+	jnz	NEAR $L$dec_loop2
+
+	DB	102,15,56,222,209
+	DB	102,15,56,222,217
+	DB	102,15,56,223,208
+	DB	102,15,56,223,216
+	ret
+
+
+
+ALIGN	16
+_aesni_encrypt3:
+
+	movups	xmm0,XMMWORD[rcx]
+	shl	eax,4
+	movups	xmm1,XMMWORD[16+rcx]
+	xorps	xmm2,xmm0
+	xorps	xmm3,xmm0
+	xorps	xmm4,xmm0
+	movups	xmm0,XMMWORD[32+rcx]
+	lea	rcx,[32+rax*1+rcx]
+	neg	rax
+	add	rax,16
+
+$L$enc_loop3:
+	DB	102,15,56,220,209
+	DB	102,15,56,220,217
+	DB	102,15,56,220,225
+	movups	xmm1,XMMWORD[rax*1+rcx]
+	add	rax,32
+	DB	102,15,56,220,208
+	DB	102,15,56,220,216
+	DB	102,15,56,220,224
+	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
+	jnz	NEAR $L$enc_loop3
+
+	DB	102,15,56,220,209
+	DB	102,15,56,220,217
+	DB	102,15,56,220,225
+	DB	102,15,56,221,208
+	DB	102,15,56,221,216
+	DB	102,15,56,221,224
+	ret
+
+
+
+ALIGN	16
+_aesni_decrypt3:
+
+	movups	xmm0,XMMWORD[rcx]
+	shl	eax,4
+	movups	xmm1,XMMWORD[16+rcx]
+	xorps	xmm2,xmm0
+	xorps	xmm3,xmm0
+	xorps	xmm4,xmm0
+	movups	xmm0,XMMWORD[32+rcx]
+	lea	rcx,[32+rax*1+rcx]
+	neg	rax
+	add	rax,16
+
+$L$dec_loop3:
+	DB	102,15,56,222,209
+	DB	102,15,56,222,217
+	DB	102,15,56,222,225
+	movups	xmm1,XMMWORD[rax*1+rcx]
+	add	rax,32
+	DB	102,15,56,222,208
+	DB	102,15,56,222,216
+	DB	102,15,56,222,224
+	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
+	jnz	NEAR $L$dec_loop3
+
+	DB	102,15,56,222,209
+	DB	102,15,56,222,217
+	DB	102,15,56,222,225
+	DB	102,15,56,223,208
+	DB	102,15,56,223,216
+	DB	102,15,56,223,224
+	ret
+
+
+
+ALIGN	16
+_aesni_encrypt4:
+
+	movups	xmm0,XMMWORD[rcx]
+	shl	eax,4
+	movups	xmm1,XMMWORD[16+rcx]
+	xorps	xmm2,xmm0
+	xorps	xmm3,xmm0
+	xorps	xmm4,xmm0
+	xorps	xmm5,xmm0
+	movups	xmm0,XMMWORD[32+rcx]
+	lea	rcx,[32+rax*1+rcx]
+	neg	rax
+	DB	0x0f,0x1f,0x00
+	add	rax,16
+
+$L$enc_loop4:
+	DB	102,15,56,220,209
+	DB	102,15,56,220,217
+	DB	102,15,56,220,225
+	DB	102,15,56,220,233
+	movups	xmm1,XMMWORD[rax*1+rcx]
+	add	rax,32
+	DB	102,15,56,220,208
+	DB	102,15,56,220,216
+	DB	102,15,56,220,224
+	DB	102,15,56,220,232
+	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
+	jnz	NEAR $L$enc_loop4
+
+	DB	102,15,56,220,209
+	DB	102,15,56,220,217
+	DB	102,15,56,220,225
+	DB	102,15,56,220,233
+	DB	102,15,56,221,208
+	DB	102,15,56,221,216
+	DB	102,15,56,221,224
+	DB	102,15,56,221,232
+	ret
+
+
+
+ALIGN	16
+_aesni_decrypt4:
+
+	movups	xmm0,XMMWORD[rcx]
+	shl	eax,4
+	movups	xmm1,XMMWORD[16+rcx]
+	xorps	xmm2,xmm0
+	xorps	xmm3,xmm0
+	xorps	xmm4,xmm0
+	xorps	xmm5,xmm0
+	movups	xmm0,XMMWORD[32+rcx]
+	lea	rcx,[32+rax*1+rcx]
+	neg	rax
+	DB	0x0f,0x1f,0x00
+	add	rax,16
+
+$L$dec_loop4:
+	DB	102,15,56,222,209
+	DB	102,15,56,222,217
+	DB	102,15,56,222,225
+	DB	102,15,56,222,233
+	movups	xmm1,XMMWORD[rax*1+rcx]
+	add	rax,32
+	DB	102,15,56,222,208
+	DB	102,15,56,222,216
+	DB	102,15,56,222,224
+	DB	102,15,56,222,232
+	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
+	jnz	NEAR $L$dec_loop4
+
+	DB	102,15,56,222,209
+	DB	102,15,56,222,217
+	DB	102,15,56,222,225
+	DB	102,15,56,222,233
+	DB	102,15,56,223,208
+	DB	102,15,56,223,216
+	DB	102,15,56,223,224
+	DB	102,15,56,223,232
+	ret
+
+
+
+ALIGN	16
+_aesni_encrypt6:
+
+	movups	xmm0,XMMWORD[rcx]
+	shl	eax,4
+	movups	xmm1,XMMWORD[16+rcx]
+	xorps	xmm2,xmm0
+	pxor	xmm3,xmm0
+	pxor	xmm4,xmm0
+	DB	102,15,56,220,209
+	lea	rcx,[32+rax*1+rcx]
+	neg	rax
+	DB	102,15,56,220,217
+	pxor	xmm5,xmm0
+	pxor	xmm6,xmm0
+	DB	102,15,56,220,225
+	pxor	xmm7,xmm0
+	movups	xmm0,XMMWORD[rax*1+rcx]
+	add	rax,16
+	jmp	NEAR $L$enc_loop6_enter
+ALIGN	16
+$L$enc_loop6:
+	DB	102,15,56,220,209
+	DB	102,15,56,220,217
+	DB	102,15,56,220,225
+$L$enc_loop6_enter:
+	DB	102,15,56,220,233
+	DB	102,15,56,220,241
+	DB	102,15,56,220,249
+	movups	xmm1,XMMWORD[rax*1+rcx]
+	add	rax,32
+	DB	102,15,56,220,208
+	DB	102,15,56,220,216
+	DB	102,15,56,220,224
+	DB	102,15,56,220,232
+	DB	102,15,56,220,240
+	DB	102,15,56,220,248
+	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
+	jnz	NEAR $L$enc_loop6
+
+	DB	102,15,56,220,209
+	DB	102,15,56,220,217
+	DB	102,15,56,220,225
+	DB	102,15,56,220,233
+	DB	102,15,56,220,241
+	DB	102,15,56,220,249
+	DB	102,15,56,221,208
+	DB	102,15,56,221,216
+	DB	102,15,56,221,224
+	DB	102,15,56,221,232
+	DB	102,15,56,221,240
+	DB	102,15,56,221,248
+	ret
+
+
+
+ALIGN	16
+_aesni_decrypt6:
+
+	movups	xmm0,XMMWORD[rcx]
+	shl	eax,4
+	movups	xmm1,XMMWORD[16+rcx]
+	xorps	xmm2,xmm0
+	pxor	xmm3,xmm0
+	pxor	xmm4,xmm0
+	DB	102,15,56,222,209
+	lea	rcx,[32+rax*1+rcx]
+	neg	rax
+	DB	102,15,56,222,217
+	pxor	xmm5,xmm0
+	pxor	xmm6,xmm0
+	DB	102,15,56,222,225
+	pxor	xmm7,xmm0
+	movups	xmm0,XMMWORD[rax*1+rcx]
+	add	rax,16
+	jmp	NEAR $L$dec_loop6_enter
+ALIGN	16
+$L$dec_loop6:
+	DB	102,15,56,222,209
+	DB	102,15,56,222,217
+	DB	102,15,56,222,225
+$L$dec_loop6_enter:
+	DB	102,15,56,222,233
+	DB	102,15,56,222,241
+	DB	102,15,56,222,249
+	movups	xmm1,XMMWORD[rax*1+rcx]
+	add	rax,32
+	DB	102,15,56,222,208
+	DB	102,15,56,222,216
+	DB	102,15,56,222,224
+	DB	102,15,56,222,232
+	DB	102,15,56,222,240
+	DB	102,15,56,222,248
+	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
+	jnz	NEAR $L$dec_loop6
+
+	DB	102,15,56,222,209
+	DB	102,15,56,222,217
+	DB	102,15,56,222,225
+	DB	102,15,56,222,233
+	DB	102,15,56,222,241
+	DB	102,15,56,222,249
+	DB	102,15,56,223,208
+	DB	102,15,56,223,216
+	DB	102,15,56,223,224
+	DB	102,15,56,223,232
+	DB	102,15,56,223,240
+	DB	102,15,56,223,248
+	ret
+
+
+
+ALIGN	16
+_aesni_encrypt8:
+
+	movups	xmm0,XMMWORD[rcx]
+	shl	eax,4
+	movups	xmm1,XMMWORD[16+rcx]
+	xorps	xmm2,xmm0
+	xorps	xmm3,xmm0
+	pxor	xmm4,xmm0
+	pxor	xmm5,xmm0
+	pxor	xmm6,xmm0
+	lea	rcx,[32+rax*1+rcx]
+	neg	rax
+	DB	102,15,56,220,209
+	pxor	xmm7,xmm0
+	pxor	xmm8,xmm0
+	DB	102,15,56,220,217
+	pxor	xmm9,xmm0
+	movups	xmm0,XMMWORD[rax*1+rcx]
+	add	rax,16
+	jmp	NEAR $L$enc_loop8_inner
+ALIGN	16
+$L$enc_loop8:
+	DB	102,15,56,220,209
+	DB	102,15,56,220,217
+$L$enc_loop8_inner:
+	DB	102,15,56,220,225
+	DB	102,15,56,220,233
+	DB	102,15,56,220,241
+	DB	102,15,56,220,249
+	DB	102,68,15,56,220,193
+	DB	102,68,15,56,220,201
+$L$enc_loop8_enter:
+	movups	xmm1,XMMWORD[rax*1+rcx]
+	add	rax,32
+	DB	102,15,56,220,208
+	DB	102,15,56,220,216
+	DB	102,15,56,220,224
+	DB	102,15,56,220,232
+	DB	102,15,56,220,240
+	DB	102,15,56,220,248
+	DB	102,68,15,56,220,192
+	DB	102,68,15,56,220,200
+	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
+	jnz	NEAR $L$enc_loop8
+
+	DB	102,15,56,220,209
+	DB	102,15,56,220,217
+	DB	102,15,56,220,225
+	DB	102,15,56,220,233
+	DB	102,15,56,220,241
+	DB	102,15,56,220,249
+	DB	102,68,15,56,220,193
+	DB	102,68,15,56,220,201
+	DB	102,15,56,221,208
+	DB	102,15,56,221,216
+	DB	102,15,56,221,224
+	DB	102,15,56,221,232
+	DB	102,15,56,221,240
+	DB	102,15,56,221,248
+	DB	102,68,15,56,221,192
+	DB	102,68,15,56,221,200
+	ret
+
+
+
+ALIGN	16
+_aesni_decrypt8:
+
+	movups	xmm0,XMMWORD[rcx]
+	shl	eax,4
+	movups	xmm1,XMMWORD[16+rcx]
+	xorps	xmm2,xmm0
+	xorps	xmm3,xmm0
+	pxor	xmm4,xmm0
+	pxor	xmm5,xmm0
+	pxor	xmm6,xmm0
+	lea	rcx,[32+rax*1+rcx]
+	neg	rax
+	DB	102,15,56,222,209
+	pxor	xmm7,xmm0
+	pxor	xmm8,xmm0
+	DB	102,15,56,222,217
+	pxor	xmm9,xmm0
+	movups	xmm0,XMMWORD[rax*1+rcx]
+	add	rax,16
+	jmp	NEAR $L$dec_loop8_inner
+ALIGN	16
+$L$dec_loop8:
+	DB	102,15,56,222,209
+	DB	102,15,56,222,217
+$L$dec_loop8_inner:
+	DB	102,15,56,222,225
+	DB	102,15,56,222,233
+	DB	102,15,56,222,241
+	DB	102,15,56,222,249
+	DB	102,68,15,56,222,193
+	DB	102,68,15,56,222,201
+$L$dec_loop8_enter:
+	movups	xmm1,XMMWORD[rax*1+rcx]
+	add	rax,32
+	DB	102,15,56,222,208
+	DB	102,15,56,222,216
+	DB	102,15,56,222,224
+	DB	102,15,56,222,232
+	DB	102,15,56,222,240
+	DB	102,15,56,222,248
+	DB	102,68,15,56,222,192
+	DB	102,68,15,56,222,200
+	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
+	jnz	NEAR $L$dec_loop8
+
+	DB	102,15,56,222,209
+	DB	102,15,56,222,217
+	DB	102,15,56,222,225
+	DB	102,15,56,222,233
+	DB	102,15,56,222,241
+	DB	102,15,56,222,249
+	DB	102,68,15,56,222,193
+	DB	102,68,15,56,222,201
+	DB	102,15,56,223,208
+	DB	102,15,56,223,216
+	DB	102,15,56,223,224
+	DB	102,15,56,223,232
+	DB	102,15,56,223,240
+	DB	102,15,56,223,248
+	DB	102,68,15,56,223,192
+	DB	102,68,15,56,223,200
+	ret
+
+
+global	aes_hw_ecb_encrypt
+
+ALIGN	16
+aes_hw_ecb_encrypt:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_aes_hw_ecb_encrypt:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+
+
+
+_CET_ENDBR
+	lea	rsp,[((-88))+rsp]
+	movaps	XMMWORD[rsp],xmm6
+	movaps	XMMWORD[16+rsp],xmm7
+	movaps	XMMWORD[32+rsp],xmm8
+	movaps	XMMWORD[48+rsp],xmm9
+$L$ecb_enc_body:
+	and	rdx,-16
+	jz	NEAR $L$ecb_ret
+
+	mov	eax,DWORD[240+rcx]
+	movups	xmm0,XMMWORD[rcx]
+	mov	r11,rcx
+	mov	r10d,eax
+	test	r8d,r8d
+	jz	NEAR $L$ecb_decrypt
+
+	cmp	rdx,0x80
+	jb	NEAR $L$ecb_enc_tail
+
+	movdqu	xmm2,XMMWORD[rdi]
+	movdqu	xmm3,XMMWORD[16+rdi]
+	movdqu	xmm4,XMMWORD[32+rdi]
+	movdqu	xmm5,XMMWORD[48+rdi]
+	movdqu	xmm6,XMMWORD[64+rdi]
+	movdqu	xmm7,XMMWORD[80+rdi]
+	movdqu	xmm8,XMMWORD[96+rdi]
+	movdqu	xmm9,XMMWORD[112+rdi]
+	lea	rdi,[128+rdi]
+	sub	rdx,0x80
+	jmp	NEAR $L$ecb_enc_loop8_enter
+ALIGN	16
+$L$ecb_enc_loop8:
+	movups	XMMWORD[rsi],xmm2
+	mov	rcx,r11
+	movdqu	xmm2,XMMWORD[rdi]
+	mov	eax,r10d
+	movups	XMMWORD[16+rsi],xmm3
+	movdqu	xmm3,XMMWORD[16+rdi]
+	movups	XMMWORD[32+rsi],xmm4
+	movdqu	xmm4,XMMWORD[32+rdi]
+	movups	XMMWORD[48+rsi],xmm5
+	movdqu	xmm5,XMMWORD[48+rdi]
+	movups	XMMWORD[64+rsi],xmm6
+	movdqu	xmm6,XMMWORD[64+rdi]
+	movups	XMMWORD[80+rsi],xmm7
+	movdqu	xmm7,XMMWORD[80+rdi]
+	movups	XMMWORD[96+rsi],xmm8
+	movdqu	xmm8,XMMWORD[96+rdi]
+	movups	XMMWORD[112+rsi],xmm9
+	lea	rsi,[128+rsi]
+	movdqu	xmm9,XMMWORD[112+rdi]
+	lea	rdi,[128+rdi]
+$L$ecb_enc_loop8_enter:
+
+	call	_aesni_encrypt8
+
+	sub	rdx,0x80
+	jnc	NEAR $L$ecb_enc_loop8
+
+	movups	XMMWORD[rsi],xmm2
+	mov	rcx,r11
+	movups	XMMWORD[16+rsi],xmm3
+	mov	eax,r10d
+	movups	XMMWORD[32+rsi],xmm4
+	movups	XMMWORD[48+rsi],xmm5
+	movups	XMMWORD[64+rsi],xmm6
+	movups	XMMWORD[80+rsi],xmm7
+	movups	XMMWORD[96+rsi],xmm8
+	movups	XMMWORD[112+rsi],xmm9
+	lea	rsi,[128+rsi]
+	add	rdx,0x80
+	jz	NEAR $L$ecb_ret
+
+$L$ecb_enc_tail:
+	movups	xmm2,XMMWORD[rdi]
+	cmp	rdx,0x20
+	jb	NEAR $L$ecb_enc_one
+	movups	xmm3,XMMWORD[16+rdi]
+	je	NEAR $L$ecb_enc_two
+	movups	xmm4,XMMWORD[32+rdi]
+	cmp	rdx,0x40
+	jb	NEAR $L$ecb_enc_three
+	movups	xmm5,XMMWORD[48+rdi]
+	je	NEAR $L$ecb_enc_four
+	movups	xmm6,XMMWORD[64+rdi]
+	cmp	rdx,0x60
+	jb	NEAR $L$ecb_enc_five
+	movups	xmm7,XMMWORD[80+rdi]
+	je	NEAR $L$ecb_enc_six
+	movdqu	xmm8,XMMWORD[96+rdi]
+	xorps	xmm9,xmm9
+	call	_aesni_encrypt8
+	movups	XMMWORD[rsi],xmm2
+	movups	XMMWORD[16+rsi],xmm3
+	movups	XMMWORD[32+rsi],xmm4
+	movups	XMMWORD[48+rsi],xmm5
+	movups	XMMWORD[64+rsi],xmm6
+	movups	XMMWORD[80+rsi],xmm7
+	movups	XMMWORD[96+rsi],xmm8
+	jmp	NEAR $L$ecb_ret
+ALIGN	16
+$L$ecb_enc_one:
+	movups	xmm0,XMMWORD[rcx]
+	movups	xmm1,XMMWORD[16+rcx]
+	lea	rcx,[32+rcx]
+	xorps	xmm2,xmm0
+$L$oop_enc1_3:
+	DB	102,15,56,220,209
+	dec	eax
+	movups	xmm1,XMMWORD[rcx]
+	lea	rcx,[16+rcx]
+	jnz	NEAR $L$oop_enc1_3
+	DB	102,15,56,221,209
+	movups	XMMWORD[rsi],xmm2
+	jmp	NEAR $L$ecb_ret
+ALIGN	16
+$L$ecb_enc_two:
+	call	_aesni_encrypt2
+	movups	XMMWORD[rsi],xmm2
+	movups	XMMWORD[16+rsi],xmm3
+	jmp	NEAR $L$ecb_ret
+ALIGN	16
+$L$ecb_enc_three:
+	call	_aesni_encrypt3
+	movups	XMMWORD[rsi],xmm2
+	movups	XMMWORD[16+rsi],xmm3
+	movups	XMMWORD[32+rsi],xmm4
+	jmp	NEAR $L$ecb_ret
+ALIGN	16
+$L$ecb_enc_four:
+	call	_aesni_encrypt4
+	movups	XMMWORD[rsi],xmm2
+	movups	XMMWORD[16+rsi],xmm3
+	movups	XMMWORD[32+rsi],xmm4
+	movups	XMMWORD[48+rsi],xmm5
+	jmp	NEAR $L$ecb_ret
+ALIGN	16
+$L$ecb_enc_five:
+	xorps	xmm7,xmm7
+	call	_aesni_encrypt6
+	movups	XMMWORD[rsi],xmm2
+	movups	XMMWORD[16+rsi],xmm3
+	movups	XMMWORD[32+rsi],xmm4
+	movups	XMMWORD[48+rsi],xmm5
+	movups	XMMWORD[64+rsi],xmm6
+	jmp	NEAR $L$ecb_ret
+ALIGN	16
+$L$ecb_enc_six:
+	call	_aesni_encrypt6
+	movups	XMMWORD[rsi],xmm2
+	movups	XMMWORD[16+rsi],xmm3
+	movups	XMMWORD[32+rsi],xmm4
+	movups	XMMWORD[48+rsi],xmm5
+	movups	XMMWORD[64+rsi],xmm6
+	movups	XMMWORD[80+rsi],xmm7
+	jmp	NEAR $L$ecb_ret
+
+ALIGN	16
+$L$ecb_decrypt:
+	cmp	rdx,0x80
+	jb	NEAR $L$ecb_dec_tail
+
+	movdqu	xmm2,XMMWORD[rdi]
+	movdqu	xmm3,XMMWORD[16+rdi]
+	movdqu	xmm4,XMMWORD[32+rdi]
+	movdqu	xmm5,XMMWORD[48+rdi]
+	movdqu	xmm6,XMMWORD[64+rdi]
+	movdqu	xmm7,XMMWORD[80+rdi]
+	movdqu	xmm8,XMMWORD[96+rdi]
+	movdqu	xmm9,XMMWORD[112+rdi]
+	lea	rdi,[128+rdi]
+	sub	rdx,0x80
+	jmp	NEAR $L$ecb_dec_loop8_enter
+ALIGN	16
+$L$ecb_dec_loop8:
+	movups	XMMWORD[rsi],xmm2
+	mov	rcx,r11
+	movdqu	xmm2,XMMWORD[rdi]
+	mov	eax,r10d
+	movups	XMMWORD[16+rsi],xmm3
+	movdqu	xmm3,XMMWORD[16+rdi]
+	movups	XMMWORD[32+rsi],xmm4
+	movdqu	xmm4,XMMWORD[32+rdi]
+	movups	XMMWORD[48+rsi],xmm5
+	movdqu	xmm5,XMMWORD[48+rdi]
+	movups	XMMWORD[64+rsi],xmm6
+	movdqu	xmm6,XMMWORD[64+rdi]
+	movups	XMMWORD[80+rsi],xmm7
+	movdqu	xmm7,XMMWORD[80+rdi]
+	movups	XMMWORD[96+rsi],xmm8
+	movdqu	xmm8,XMMWORD[96+rdi]
+	movups	XMMWORD[112+rsi],xmm9
+	lea	rsi,[128+rsi]
+	movdqu	xmm9,XMMWORD[112+rdi]
+	lea	rdi,[128+rdi]
+$L$ecb_dec_loop8_enter:
+
+	call	_aesni_decrypt8
+
+	movups	xmm0,XMMWORD[r11]
+	sub	rdx,0x80
+	jnc	NEAR $L$ecb_dec_loop8
+
+	movups	XMMWORD[rsi],xmm2
+	pxor	xmm2,xmm2
+	mov	rcx,r11
+	movups	XMMWORD[16+rsi],xmm3
+	pxor	xmm3,xmm3
+	mov	eax,r10d
+	movups	XMMWORD[32+rsi],xmm4
+	pxor	xmm4,xmm4
+	movups	XMMWORD[48+rsi],xmm5
+	pxor	xmm5,xmm5
+	movups	XMMWORD[64+rsi],xmm6
+	pxor	xmm6,xmm6
+	movups	XMMWORD[80+rsi],xmm7
+	pxor	xmm7,xmm7
+	movups	XMMWORD[96+rsi],xmm8
+	pxor	xmm8,xmm8
+	movups	XMMWORD[112+rsi],xmm9
+	pxor	xmm9,xmm9
+	lea	rsi,[128+rsi]
+	add	rdx,0x80
+	jz	NEAR $L$ecb_ret
+
+$L$ecb_dec_tail:
+	movups	xmm2,XMMWORD[rdi]
+	cmp	rdx,0x20
+	jb	NEAR $L$ecb_dec_one
+	movups	xmm3,XMMWORD[16+rdi]
+	je	NEAR $L$ecb_dec_two
+	movups	xmm4,XMMWORD[32+rdi]
+	cmp	rdx,0x40
+	jb	NEAR $L$ecb_dec_three
+	movups	xmm5,XMMWORD[48+rdi]
+	je	NEAR $L$ecb_dec_four
+	movups	xmm6,XMMWORD[64+rdi]
+	cmp	rdx,0x60
+	jb	NEAR $L$ecb_dec_five
+	movups	xmm7,XMMWORD[80+rdi]
+	je	NEAR $L$ecb_dec_six
+	movups	xmm8,XMMWORD[96+rdi]
+	movups	xmm0,XMMWORD[rcx]
+	xorps	xmm9,xmm9
+	call	_aesni_decrypt8
+	movups	XMMWORD[rsi],xmm2
+	pxor	xmm2,xmm2
+	movups	XMMWORD[16+rsi],xmm3
+	pxor	xmm3,xmm3
+	movups	XMMWORD[32+rsi],xmm4
+	pxor	xmm4,xmm4
+	movups	XMMWORD[48+rsi],xmm5
+	pxor	xmm5,xmm5
+	movups	XMMWORD[64+rsi],xmm6
+	pxor	xmm6,xmm6
+	movups	XMMWORD[80+rsi],xmm7
+	pxor	xmm7,xmm7
+	movups	XMMWORD[96+rsi],xmm8
+	pxor	xmm8,xmm8
+	pxor	xmm9,xmm9
+	jmp	NEAR $L$ecb_ret
+ALIGN	16
+$L$ecb_dec_one:
+	movups	xmm0,XMMWORD[rcx]
+	movups	xmm1,XMMWORD[16+rcx]
+	lea	rcx,[32+rcx]
+	xorps	xmm2,xmm0
+$L$oop_dec1_4:
+	DB	102,15,56,222,209
+	dec	eax
+	movups	xmm1,XMMWORD[rcx]
+	lea	rcx,[16+rcx]
+	jnz	NEAR $L$oop_dec1_4
+	DB	102,15,56,223,209
+	movups	XMMWORD[rsi],xmm2
+	pxor	xmm2,xmm2
+	jmp	NEAR $L$ecb_ret
+ALIGN	16
+$L$ecb_dec_two:
+	call	_aesni_decrypt2
+	movups	XMMWORD[rsi],xmm2
+	pxor	xmm2,xmm2
+	movups	XMMWORD[16+rsi],xmm3
+	pxor	xmm3,xmm3
+	jmp	NEAR $L$ecb_ret
+ALIGN	16
+$L$ecb_dec_three:
+	call	_aesni_decrypt3
+	movups	XMMWORD[rsi],xmm2
+	pxor	xmm2,xmm2
+	movups	XMMWORD[16+rsi],xmm3
+	pxor	xmm3,xmm3
+	movups	XMMWORD[32+rsi],xmm4
+	pxor	xmm4,xmm4
+	jmp	NEAR $L$ecb_ret
+ALIGN	16
+$L$ecb_dec_four:
+	call	_aesni_decrypt4
+	movups	XMMWORD[rsi],xmm2
+	pxor	xmm2,xmm2
+	movups	XMMWORD[16+rsi],xmm3
+	pxor	xmm3,xmm3
+	movups	XMMWORD[32+rsi],xmm4
+	pxor	xmm4,xmm4
+	movups	XMMWORD[48+rsi],xmm5
+	pxor	xmm5,xmm5
+	jmp	NEAR $L$ecb_ret
+ALIGN	16
+$L$ecb_dec_five:
+	xorps	xmm7,xmm7
+	call	_aesni_decrypt6
+	movups	XMMWORD[rsi],xmm2
+	pxor	xmm2,xmm2
+	movups	XMMWORD[16+rsi],xmm3
+	pxor	xmm3,xmm3
+	movups	XMMWORD[32+rsi],xmm4
+	pxor	xmm4,xmm4
+	movups	XMMWORD[48+rsi],xmm5
+	pxor	xmm5,xmm5
+	movups	XMMWORD[64+rsi],xmm6
+	pxor	xmm6,xmm6
+	pxor	xmm7,xmm7
+	jmp	NEAR $L$ecb_ret
+ALIGN	16
+$L$ecb_dec_six:
+	call	_aesni_decrypt6
+	movups	XMMWORD[rsi],xmm2
+	pxor	xmm2,xmm2
+	movups	XMMWORD[16+rsi],xmm3
+	pxor	xmm3,xmm3
+	movups	XMMWORD[32+rsi],xmm4
+	pxor	xmm4,xmm4
+	movups	XMMWORD[48+rsi],xmm5
+	pxor	xmm5,xmm5
+	movups	XMMWORD[64+rsi],xmm6
+	pxor	xmm6,xmm6
+	movups	XMMWORD[80+rsi],xmm7
+	pxor	xmm7,xmm7
+
+$L$ecb_ret:
+	xorps	xmm0,xmm0
+	pxor	xmm1,xmm1
+	movaps	xmm6,XMMWORD[rsp]
+	movaps	XMMWORD[rsp],xmm0
+	movaps	xmm7,XMMWORD[16+rsp]
+	movaps	XMMWORD[16+rsp],xmm0
+	movaps	xmm8,XMMWORD[32+rsp]
+	movaps	XMMWORD[32+rsp],xmm0
+	movaps	xmm9,XMMWORD[48+rsp]
+	movaps	XMMWORD[48+rsp],xmm0
+	lea	rsp,[88+rsp]
+$L$ecb_enc_ret:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_aes_hw_ecb_encrypt:
+global	aes_hw_ctr32_encrypt_blocks
+
+ALIGN	16
+aes_hw_ctr32_encrypt_blocks:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_aes_hw_ctr32_encrypt_blocks:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+
+
+
+_CET_ENDBR
+%ifdef BORINGSSL_DISPATCH_TEST
+	mov	BYTE[BORINGSSL_function_hit],1
+%endif
+	cmp	rdx,1
+	jne	NEAR $L$ctr32_bulk
+
+
+
+	movups	xmm2,XMMWORD[r8]
+	movups	xmm3,XMMWORD[rdi]
+	mov	edx,DWORD[240+rcx]
+	movups	xmm0,XMMWORD[rcx]
+	movups	xmm1,XMMWORD[16+rcx]
+	lea	rcx,[32+rcx]
+	xorps	xmm2,xmm0
+$L$oop_enc1_5:
+	DB	102,15,56,220,209
+	dec	edx
+	movups	xmm1,XMMWORD[rcx]
+	lea	rcx,[16+rcx]
+	jnz	NEAR $L$oop_enc1_5
+	DB	102,15,56,221,209
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	xorps	xmm2,xmm3
+	pxor	xmm3,xmm3
+	movups	XMMWORD[rsi],xmm2
+	xorps	xmm2,xmm2
+	jmp	NEAR $L$ctr32_epilogue
+
+ALIGN	16
+$L$ctr32_bulk:
+	lea	r11,[rsp]
+
+	push	rbp
+
+	sub	rsp,288
+	and	rsp,-16
+	movaps	XMMWORD[(-168)+r11],xmm6
+	movaps	XMMWORD[(-152)+r11],xmm7
+	movaps	XMMWORD[(-136)+r11],xmm8
+	movaps	XMMWORD[(-120)+r11],xmm9
+	movaps	XMMWORD[(-104)+r11],xmm10
+	movaps	XMMWORD[(-88)+r11],xmm11
+	movaps	XMMWORD[(-72)+r11],xmm12
+	movaps	XMMWORD[(-56)+r11],xmm13
+	movaps	XMMWORD[(-40)+r11],xmm14
+	movaps	XMMWORD[(-24)+r11],xmm15
+$L$ctr32_body:
+
+
+
+
+	movdqu	xmm2,XMMWORD[r8]
+	movdqu	xmm0,XMMWORD[rcx]
+	mov	r8d,DWORD[12+r8]
+	pxor	xmm2,xmm0
+	mov	ebp,DWORD[12+rcx]
+	movdqa	XMMWORD[rsp],xmm2
+	bswap	r8d
+	movdqa	xmm3,xmm2
+	movdqa	xmm4,xmm2
+	movdqa	xmm5,xmm2
+	movdqa	XMMWORD[64+rsp],xmm2
+	movdqa	XMMWORD[80+rsp],xmm2
+	movdqa	XMMWORD[96+rsp],xmm2
+	mov	r10,rdx
+	movdqa	XMMWORD[112+rsp],xmm2
+
+	lea	rax,[1+r8]
+	lea	rdx,[2+r8]
+	bswap	eax
+	bswap	edx
+	xor	eax,ebp
+	xor	edx,ebp
+DB	102,15,58,34,216,3
+	lea	rax,[3+r8]
+	movdqa	XMMWORD[16+rsp],xmm3
+DB	102,15,58,34,226,3
+	bswap	eax
+	mov	rdx,r10
+	lea	r10,[4+r8]
+	movdqa	XMMWORD[32+rsp],xmm4
+	xor	eax,ebp
+	bswap	r10d
+DB	102,15,58,34,232,3
+	xor	r10d,ebp
+	movdqa	XMMWORD[48+rsp],xmm5
+	lea	r9,[5+r8]
+	mov	DWORD[((64+12))+rsp],r10d
+	bswap	r9d
+	lea	r10,[6+r8]
+	mov	eax,DWORD[240+rcx]
+	xor	r9d,ebp
+	bswap	r10d
+	mov	DWORD[((80+12))+rsp],r9d
+	xor	r10d,ebp
+	lea	r9,[7+r8]
+	mov	DWORD[((96+12))+rsp],r10d
+	bswap	r9d
+	xor	r9d,ebp
+	mov	DWORD[((112+12))+rsp],r9d
+
+	movups	xmm1,XMMWORD[16+rcx]
+
+	movdqa	xmm6,XMMWORD[64+rsp]
+	movdqa	xmm7,XMMWORD[80+rsp]
+
+	cmp	rdx,8
+	jb	NEAR $L$ctr32_tail
+
+	lea	rcx,[128+rcx]
+	sub	rdx,8
+	jmp	NEAR $L$ctr32_loop8
+
+ALIGN	32
+$L$ctr32_loop8:
+	add	r8d,8
+	movdqa	xmm8,XMMWORD[96+rsp]
+	DB	102,15,56,220,209
+	mov	r9d,r8d
+	movdqa	xmm9,XMMWORD[112+rsp]
+	DB	102,15,56,220,217
+	bswap	r9d
+	movups	xmm0,XMMWORD[((32-128))+rcx]
+	DB	102,15,56,220,225
+	xor	r9d,ebp
+	nop
+	DB	102,15,56,220,233
+	mov	DWORD[((0+12))+rsp],r9d
+	lea	r9,[1+r8]
+	DB	102,15,56,220,241
+	DB	102,15,56,220,249
+	DB	102,68,15,56,220,193
+	DB	102,68,15,56,220,201
+	movups	xmm1,XMMWORD[((48-128))+rcx]
+	bswap	r9d
+	DB	102,15,56,220,208
+	DB	102,15,56,220,216
+	xor	r9d,ebp
+	DB	0x66,0x90
+	DB	102,15,56,220,224
+	DB	102,15,56,220,232
+	mov	DWORD[((16+12))+rsp],r9d
+	lea	r9,[2+r8]
+	DB	102,15,56,220,240
+	DB	102,15,56,220,248
+	DB	102,68,15,56,220,192
+	DB	102,68,15,56,220,200
+	movups	xmm0,XMMWORD[((64-128))+rcx]
+	bswap	r9d
+	DB	102,15,56,220,209
+	DB	102,15,56,220,217
+	xor	r9d,ebp
+	DB	0x66,0x90
+	DB	102,15,56,220,225
+	DB	102,15,56,220,233
+	mov	DWORD[((32+12))+rsp],r9d
+	lea	r9,[3+r8]
+	DB	102,15,56,220,241
+	DB	102,15,56,220,249
+	DB	102,68,15,56,220,193
+	DB	102,68,15,56,220,201
+	movups	xmm1,XMMWORD[((80-128))+rcx]
+	bswap	r9d
+	DB	102,15,56,220,208
+	DB	102,15,56,220,216
+	xor	r9d,ebp
+	DB	0x66,0x90
+	DB	102,15,56,220,224
+	DB	102,15,56,220,232
+	mov	DWORD[((48+12))+rsp],r9d
+	lea	r9,[4+r8]
+	DB	102,15,56,220,240
+	DB	102,15,56,220,248
+	DB	102,68,15,56,220,192
+	DB	102,68,15,56,220,200
+	movups	xmm0,XMMWORD[((96-128))+rcx]
+	bswap	r9d
+	DB	102,15,56,220,209
+	DB	102,15,56,220,217
+	xor	r9d,ebp
+	DB	0x66,0x90
+	DB	102,15,56,220,225
+	DB	102,15,56,220,233
+	mov	DWORD[((64+12))+rsp],r9d
+	lea	r9,[5+r8]
+	DB	102,15,56,220,241
+	DB	102,15,56,220,249
+	DB	102,68,15,56,220,193
+	DB	102,68,15,56,220,201
+	movups	xmm1,XMMWORD[((112-128))+rcx]
+	bswap	r9d
+	DB	102,15,56,220,208
+	DB	102,15,56,220,216
+	xor	r9d,ebp
+	DB	0x66,0x90
+	DB	102,15,56,220,224
+	DB	102,15,56,220,232
+	mov	DWORD[((80+12))+rsp],r9d
+	lea	r9,[6+r8]
+	DB	102,15,56,220,240
+	DB	102,15,56,220,248
+	DB	102,68,15,56,220,192
+	DB	102,68,15,56,220,200
+	movups	xmm0,XMMWORD[((128-128))+rcx]
+	bswap	r9d
+	DB	102,15,56,220,209
+	DB	102,15,56,220,217
+	xor	r9d,ebp
+	DB	0x66,0x90
+	DB	102,15,56,220,225
+	DB	102,15,56,220,233
+	mov	DWORD[((96+12))+rsp],r9d
+	lea	r9,[7+r8]
+	DB	102,15,56,220,241
+	DB	102,15,56,220,249
+	DB	102,68,15,56,220,193
+	DB	102,68,15,56,220,201
+	movups	xmm1,XMMWORD[((144-128))+rcx]
+	bswap	r9d
+	DB	102,15,56,220,208
+	DB	102,15,56,220,216
+	DB	102,15,56,220,224
+	xor	r9d,ebp
+	movdqu	xmm10,XMMWORD[rdi]
+	DB	102,15,56,220,232
+	mov	DWORD[((112+12))+rsp],r9d
+	cmp	eax,11
+	DB	102,15,56,220,240
+	DB	102,15,56,220,248
+	DB	102,68,15,56,220,192
+	DB	102,68,15,56,220,200
+	movups	xmm0,XMMWORD[((160-128))+rcx]
+
+	jb	NEAR $L$ctr32_enc_done
+
+	DB	102,15,56,220,209
+	DB	102,15,56,220,217
+	DB	102,15,56,220,225
+	DB	102,15,56,220,233
+	DB	102,15,56,220,241
+	DB	102,15,56,220,249
+	DB	102,68,15,56,220,193
+	DB	102,68,15,56,220,201
+	movups	xmm1,XMMWORD[((176-128))+rcx]
+
+	DB	102,15,56,220,208
+	DB	102,15,56,220,216
+	DB	102,15,56,220,224
+	DB	102,15,56,220,232
+	DB	102,15,56,220,240
+	DB	102,15,56,220,248
+	DB	102,68,15,56,220,192
+	DB	102,68,15,56,220,200
+	movups	xmm0,XMMWORD[((192-128))+rcx]
+	je	NEAR $L$ctr32_enc_done
+
+	DB	102,15,56,220,209
+	DB	102,15,56,220,217
+	DB	102,15,56,220,225
+	DB	102,15,56,220,233
+	DB	102,15,56,220,241
+	DB	102,15,56,220,249
+	DB	102,68,15,56,220,193
+	DB	102,68,15,56,220,201
+	movups	xmm1,XMMWORD[((208-128))+rcx]
+
+	DB	102,15,56,220,208
+	DB	102,15,56,220,216
+	DB	102,15,56,220,224
+	DB	102,15,56,220,232
+	DB	102,15,56,220,240
+	DB	102,15,56,220,248
+	DB	102,68,15,56,220,192
+	DB	102,68,15,56,220,200
+	movups	xmm0,XMMWORD[((224-128))+rcx]
+	jmp	NEAR $L$ctr32_enc_done
+
+ALIGN	16
+$L$ctr32_enc_done:
+	movdqu	xmm11,XMMWORD[16+rdi]
+	pxor	xmm10,xmm0
+	movdqu	xmm12,XMMWORD[32+rdi]
+	pxor	xmm11,xmm0
+	movdqu	xmm13,XMMWORD[48+rdi]
+	pxor	xmm12,xmm0
+	movdqu	xmm14,XMMWORD[64+rdi]
+	pxor	xmm13,xmm0
+	movdqu	xmm15,XMMWORD[80+rdi]
+	pxor	xmm14,xmm0
+	prefetcht0	[448+rdi]
+	prefetcht0	[512+rdi]
+	pxor	xmm15,xmm0
+	DB	102,15,56,220,209
+	DB	102,15,56,220,217
+	DB	102,15,56,220,225
+	DB	102,15,56,220,233
+	DB	102,15,56,220,241
+	DB	102,15,56,220,249
+	DB	102,68,15,56,220,193
+	DB	102,68,15,56,220,201
+	movdqu	xmm1,XMMWORD[96+rdi]
+	lea	rdi,[128+rdi]
+
+	DB	102,65,15,56,221,210
+	pxor	xmm1,xmm0
+	movdqu	xmm10,XMMWORD[((112-128))+rdi]
+	DB	102,65,15,56,221,219
+	pxor	xmm10,xmm0
+	movdqa	xmm11,XMMWORD[rsp]
+	DB	102,65,15,56,221,228
+	DB	102,65,15,56,221,237
+	movdqa	xmm12,XMMWORD[16+rsp]
+	movdqa	xmm13,XMMWORD[32+rsp]
+	DB	102,65,15,56,221,246
+	DB	102,65,15,56,221,255
+	movdqa	xmm14,XMMWORD[48+rsp]
+	movdqa	xmm15,XMMWORD[64+rsp]
+	DB	102,68,15,56,221,193
+	movdqa	xmm0,XMMWORD[80+rsp]
+	movups	xmm1,XMMWORD[((16-128))+rcx]
+	DB	102,69,15,56,221,202
+
+	movups	XMMWORD[rsi],xmm2
+	movdqa	xmm2,xmm11
+	movups	XMMWORD[16+rsi],xmm3
+	movdqa	xmm3,xmm12
+	movups	XMMWORD[32+rsi],xmm4
+	movdqa	xmm4,xmm13
+	movups	XMMWORD[48+rsi],xmm5
+	movdqa	xmm5,xmm14
+	movups	XMMWORD[64+rsi],xmm6
+	movdqa	xmm6,xmm15
+	movups	XMMWORD[80+rsi],xmm7
+	movdqa	xmm7,xmm0
+	movups	XMMWORD[96+rsi],xmm8
+	movups	XMMWORD[112+rsi],xmm9
+	lea	rsi,[128+rsi]
+
+	sub	rdx,8
+	jnc	NEAR $L$ctr32_loop8
+
+	add	rdx,8
+	jz	NEAR $L$ctr32_done
+	lea	rcx,[((-128))+rcx]
+
+$L$ctr32_tail:
+
+
+	lea	rcx,[16+rcx]
+	cmp	rdx,4
+	jb	NEAR $L$ctr32_loop3
+	je	NEAR $L$ctr32_loop4
+
+
+	shl	eax,4
+	movdqa	xmm8,XMMWORD[96+rsp]
+	pxor	xmm9,xmm9
+
+	movups	xmm0,XMMWORD[16+rcx]
+	DB	102,15,56,220,209
+	DB	102,15,56,220,217
+	lea	rcx,[((32-16))+rax*1+rcx]
+	neg	rax
+	DB	102,15,56,220,225
+	add	rax,16
+	movups	xmm10,XMMWORD[rdi]
+	DB	102,15,56,220,233
+	DB	102,15,56,220,241
+	movups	xmm11,XMMWORD[16+rdi]
+	movups	xmm12,XMMWORD[32+rdi]
+	DB	102,15,56,220,249
+	DB	102,68,15,56,220,193
+
+	call	$L$enc_loop8_enter
+
+	movdqu	xmm13,XMMWORD[48+rdi]
+	pxor	xmm2,xmm10
+	movdqu	xmm10,XMMWORD[64+rdi]
+	pxor	xmm3,xmm11
+	movdqu	XMMWORD[rsi],xmm2
+	pxor	xmm4,xmm12
+	movdqu	XMMWORD[16+rsi],xmm3
+	pxor	xmm5,xmm13
+	movdqu	XMMWORD[32+rsi],xmm4
+	pxor	xmm6,xmm10
+	movdqu	XMMWORD[48+rsi],xmm5
+	movdqu	XMMWORD[64+rsi],xmm6
+	cmp	rdx,6
+	jb	NEAR $L$ctr32_done
+
+	movups	xmm11,XMMWORD[80+rdi]
+	xorps	xmm7,xmm11
+	movups	XMMWORD[80+rsi],xmm7
+	je	NEAR $L$ctr32_done
+
+	movups	xmm12,XMMWORD[96+rdi]
+	xorps	xmm8,xmm12
+	movups	XMMWORD[96+rsi],xmm8
+	jmp	NEAR $L$ctr32_done
+
+ALIGN	32
+$L$ctr32_loop4:
+	DB	102,15,56,220,209
+	lea	rcx,[16+rcx]
+	dec	eax
+	DB	102,15,56,220,217
+	DB	102,15,56,220,225
+	DB	102,15,56,220,233
+	movups	xmm1,XMMWORD[rcx]
+	jnz	NEAR $L$ctr32_loop4
+	DB	102,15,56,221,209
+	DB	102,15,56,221,217
+	movups	xmm10,XMMWORD[rdi]
+	movups	xmm11,XMMWORD[16+rdi]
+	DB	102,15,56,221,225
+	DB	102,15,56,221,233
+	movups	xmm12,XMMWORD[32+rdi]
+	movups	xmm13,XMMWORD[48+rdi]
+
+	xorps	xmm2,xmm10
+	movups	XMMWORD[rsi],xmm2
+	xorps	xmm3,xmm11
+	movups	XMMWORD[16+rsi],xmm3
+	pxor	xmm4,xmm12
+	movdqu	XMMWORD[32+rsi],xmm4
+	pxor	xmm5,xmm13
+	movdqu	XMMWORD[48+rsi],xmm5
+	jmp	NEAR $L$ctr32_done
+
+ALIGN	32
+$L$ctr32_loop3:
+	DB	102,15,56,220,209
+	lea	rcx,[16+rcx]
+	dec	eax
+	DB	102,15,56,220,217
+	DB	102,15,56,220,225
+	movups	xmm1,XMMWORD[rcx]
+	jnz	NEAR $L$ctr32_loop3
+	DB	102,15,56,221,209
+	DB	102,15,56,221,217
+	DB	102,15,56,221,225
+
+	movups	xmm10,XMMWORD[rdi]
+	xorps	xmm2,xmm10
+	movups	XMMWORD[rsi],xmm2
+	cmp	rdx,2
+	jb	NEAR $L$ctr32_done
+
+	movups	xmm11,XMMWORD[16+rdi]
+	xorps	xmm3,xmm11
+	movups	XMMWORD[16+rsi],xmm3
+	je	NEAR $L$ctr32_done
+
+	movups	xmm12,XMMWORD[32+rdi]
+	xorps	xmm4,xmm12
+	movups	XMMWORD[32+rsi],xmm4
+
+$L$ctr32_done:
+	xorps	xmm0,xmm0
+	xor	ebp,ebp
+	pxor	xmm1,xmm1
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	movaps	xmm6,XMMWORD[((-168))+r11]
+	movaps	XMMWORD[(-168)+r11],xmm0
+	movaps	xmm7,XMMWORD[((-152))+r11]
+	movaps	XMMWORD[(-152)+r11],xmm0
+	movaps	xmm8,XMMWORD[((-136))+r11]
+	movaps	XMMWORD[(-136)+r11],xmm0
+	movaps	xmm9,XMMWORD[((-120))+r11]
+	movaps	XMMWORD[(-120)+r11],xmm0
+	movaps	xmm10,XMMWORD[((-104))+r11]
+	movaps	XMMWORD[(-104)+r11],xmm0
+	movaps	xmm11,XMMWORD[((-88))+r11]
+	movaps	XMMWORD[(-88)+r11],xmm0
+	movaps	xmm12,XMMWORD[((-72))+r11]
+	movaps	XMMWORD[(-72)+r11],xmm0
+	movaps	xmm13,XMMWORD[((-56))+r11]
+	movaps	XMMWORD[(-56)+r11],xmm0
+	movaps	xmm14,XMMWORD[((-40))+r11]
+	movaps	XMMWORD[(-40)+r11],xmm0
+	movaps	xmm15,XMMWORD[((-24))+r11]
+	movaps	XMMWORD[(-24)+r11],xmm0
+	movaps	XMMWORD[rsp],xmm0
+	movaps	XMMWORD[16+rsp],xmm0
+	movaps	XMMWORD[32+rsp],xmm0
+	movaps	XMMWORD[48+rsp],xmm0
+	movaps	XMMWORD[64+rsp],xmm0
+	movaps	XMMWORD[80+rsp],xmm0
+	movaps	XMMWORD[96+rsp],xmm0
+	movaps	XMMWORD[112+rsp],xmm0
+	mov	rbp,QWORD[((-8))+r11]
+
+	lea	rsp,[r11]
+
+$L$ctr32_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_aes_hw_ctr32_encrypt_blocks:
+global	aes_hw_cbc_encrypt
+
+ALIGN	16
+aes_hw_cbc_encrypt:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_aes_hw_cbc_encrypt:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+_CET_ENDBR
+	test	rdx,rdx
+	jz	NEAR $L$cbc_ret
+
+	mov	r10d,DWORD[240+rcx]
+	mov	r11,rcx
+	test	r9d,r9d
+	jz	NEAR $L$cbc_decrypt
+
+	movups	xmm2,XMMWORD[r8]
+	mov	eax,r10d
+	cmp	rdx,16
+	jb	NEAR $L$cbc_enc_tail
+	sub	rdx,16
+	jmp	NEAR $L$cbc_enc_loop
+ALIGN	16
+$L$cbc_enc_loop:
+	movups	xmm3,XMMWORD[rdi]
+	lea	rdi,[16+rdi]
+
+	movups	xmm0,XMMWORD[rcx]
+	movups	xmm1,XMMWORD[16+rcx]
+	xorps	xmm3,xmm0
+	lea	rcx,[32+rcx]
+	xorps	xmm2,xmm3
+$L$oop_enc1_6:
+	DB	102,15,56,220,209
+	dec	eax
+	movups	xmm1,XMMWORD[rcx]
+	lea	rcx,[16+rcx]
+	jnz	NEAR $L$oop_enc1_6
+	DB	102,15,56,221,209
+	mov	eax,r10d
+	mov	rcx,r11
+	movups	XMMWORD[rsi],xmm2
+	lea	rsi,[16+rsi]
+	sub	rdx,16
+	jnc	NEAR $L$cbc_enc_loop
+	add	rdx,16
+	jnz	NEAR $L$cbc_enc_tail
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	movups	XMMWORD[r8],xmm2
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	jmp	NEAR $L$cbc_ret
+
+$L$cbc_enc_tail:
+	mov	rcx,rdx
+	xchg	rsi,rdi
+	DD	0x9066A4F3
+	mov	ecx,16
+	sub	rcx,rdx
+	xor	eax,eax
+	DD	0x9066AAF3
+	lea	rdi,[((-16))+rdi]
+	mov	eax,r10d
+	mov	rsi,rdi
+	mov	rcx,r11
+	xor	rdx,rdx
+	jmp	NEAR $L$cbc_enc_loop
+
+ALIGN	16
+$L$cbc_decrypt:
+	cmp	rdx,16
+	jne	NEAR $L$cbc_decrypt_bulk
+
+
+
+	movdqu	xmm2,XMMWORD[rdi]
+	movdqu	xmm3,XMMWORD[r8]
+	movdqa	xmm4,xmm2
+	movups	xmm0,XMMWORD[rcx]
+	movups	xmm1,XMMWORD[16+rcx]
+	lea	rcx,[32+rcx]
+	xorps	xmm2,xmm0
+$L$oop_dec1_7:
+	DB	102,15,56,222,209
+	dec	r10d
+	movups	xmm1,XMMWORD[rcx]
+	lea	rcx,[16+rcx]
+	jnz	NEAR $L$oop_dec1_7
+	DB	102,15,56,223,209
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	movdqu	XMMWORD[r8],xmm4
+	xorps	xmm2,xmm3
+	pxor	xmm3,xmm3
+	movups	XMMWORD[rsi],xmm2
+	pxor	xmm2,xmm2
+	jmp	NEAR $L$cbc_ret
+ALIGN	16
+$L$cbc_decrypt_bulk:
+	lea	r11,[rsp]
+
+	push	rbp
+
+	sub	rsp,176
+	and	rsp,-16
+	movaps	XMMWORD[16+rsp],xmm6
+	movaps	XMMWORD[32+rsp],xmm7
+	movaps	XMMWORD[48+rsp],xmm8
+	movaps	XMMWORD[64+rsp],xmm9
+	movaps	XMMWORD[80+rsp],xmm10
+	movaps	XMMWORD[96+rsp],xmm11
+	movaps	XMMWORD[112+rsp],xmm12
+	movaps	XMMWORD[128+rsp],xmm13
+	movaps	XMMWORD[144+rsp],xmm14
+	movaps	XMMWORD[160+rsp],xmm15
+$L$cbc_decrypt_body:
+	mov	rbp,rcx
+	movups	xmm10,XMMWORD[r8]
+	mov	eax,r10d
+	cmp	rdx,0x50
+	jbe	NEAR $L$cbc_dec_tail
+
+	movups	xmm0,XMMWORD[rcx]
+	movdqu	xmm2,XMMWORD[rdi]
+	movdqu	xmm3,XMMWORD[16+rdi]
+	movdqa	xmm11,xmm2
+	movdqu	xmm4,XMMWORD[32+rdi]
+	movdqa	xmm12,xmm3
+	movdqu	xmm5,XMMWORD[48+rdi]
+	movdqa	xmm13,xmm4
+	movdqu	xmm6,XMMWORD[64+rdi]
+	movdqa	xmm14,xmm5
+	movdqu	xmm7,XMMWORD[80+rdi]
+	movdqa	xmm15,xmm6
+	cmp	rdx,0x70
+	jbe	NEAR $L$cbc_dec_six_or_seven
+
+	sub	rdx,0x70
+	lea	rcx,[112+rcx]
+	jmp	NEAR $L$cbc_dec_loop8_enter
+ALIGN	16
+$L$cbc_dec_loop8:
+	movups	XMMWORD[rsi],xmm9
+	lea	rsi,[16+rsi]
+$L$cbc_dec_loop8_enter:
+	movdqu	xmm8,XMMWORD[96+rdi]
+	pxor	xmm2,xmm0
+	movdqu	xmm9,XMMWORD[112+rdi]
+	pxor	xmm3,xmm0
+	movups	xmm1,XMMWORD[((16-112))+rcx]
+	pxor	xmm4,xmm0
+	mov	rbp,-1
+	cmp	rdx,0x70
+	pxor	xmm5,xmm0
+	pxor	xmm6,xmm0
+	pxor	xmm7,xmm0
+	pxor	xmm8,xmm0
+
+	DB	102,15,56,222,209
+	pxor	xmm9,xmm0
+	movups	xmm0,XMMWORD[((32-112))+rcx]
+	DB	102,15,56,222,217
+	DB	102,15,56,222,225
+	DB	102,15,56,222,233
+	DB	102,15,56,222,241
+	DB	102,15,56,222,249
+	DB	102,68,15,56,222,193
+	adc	rbp,0
+	and	rbp,128
+	DB	102,68,15,56,222,201
+	add	rbp,rdi
+	movups	xmm1,XMMWORD[((48-112))+rcx]
+	DB	102,15,56,222,208
+	DB	102,15,56,222,216
+	DB	102,15,56,222,224
+	DB	102,15,56,222,232
+	DB	102,15,56,222,240
+	DB	102,15,56,222,248
+	DB	102,68,15,56,222,192
+	DB	102,68,15,56,222,200
+	movups	xmm0,XMMWORD[((64-112))+rcx]
+	nop
+	DB	102,15,56,222,209
+	DB	102,15,56,222,217
+	DB	102,15,56,222,225
+	DB	102,15,56,222,233
+	DB	102,15,56,222,241
+	DB	102,15,56,222,249
+	DB	102,68,15,56,222,193
+	DB	102,68,15,56,222,201
+	movups	xmm1,XMMWORD[((80-112))+rcx]
+	nop
+	DB	102,15,56,222,208
+	DB	102,15,56,222,216
+	DB	102,15,56,222,224
+	DB	102,15,56,222,232
+	DB	102,15,56,222,240
+	DB	102,15,56,222,248
+	DB	102,68,15,56,222,192
+	DB	102,68,15,56,222,200
+	movups	xmm0,XMMWORD[((96-112))+rcx]
+	nop
+	DB	102,15,56,222,209
+	DB	102,15,56,222,217
+	DB	102,15,56,222,225
+	DB	102,15,56,222,233
+	DB	102,15,56,222,241
+	DB	102,15,56,222,249
+	DB	102,68,15,56,222,193
+	DB	102,68,15,56,222,201
+	movups	xmm1,XMMWORD[((112-112))+rcx]
+	nop
+	DB	102,15,56,222,208
+	DB	102,15,56,222,216
+	DB	102,15,56,222,224
+	DB	102,15,56,222,232
+	DB	102,15,56,222,240
+	DB	102,15,56,222,248
+	DB	102,68,15,56,222,192
+	DB	102,68,15,56,222,200
+	movups	xmm0,XMMWORD[((128-112))+rcx]
+	nop
+	DB	102,15,56,222,209
+	DB	102,15,56,222,217
+	DB	102,15,56,222,225
+	DB	102,15,56,222,233
+	DB	102,15,56,222,241
+	DB	102,15,56,222,249
+	DB	102,68,15,56,222,193
+	DB	102,68,15,56,222,201
+	movups	xmm1,XMMWORD[((144-112))+rcx]
+	cmp	eax,11
+	DB	102,15,56,222,208
+	DB	102,15,56,222,216
+	DB	102,15,56,222,224
+	DB	102,15,56,222,232
+	DB	102,15,56,222,240
+	DB	102,15,56,222,248
+	DB	102,68,15,56,222,192
+	DB	102,68,15,56,222,200
+	movups	xmm0,XMMWORD[((160-112))+rcx]
+	jb	NEAR $L$cbc_dec_done
+	DB	102,15,56,222,209
+	DB	102,15,56,222,217
+	DB	102,15,56,222,225
+	DB	102,15,56,222,233
+	DB	102,15,56,222,241
+	DB	102,15,56,222,249
+	DB	102,68,15,56,222,193
+	DB	102,68,15,56,222,201
+	movups	xmm1,XMMWORD[((176-112))+rcx]
+	nop
+	DB	102,15,56,222,208
+	DB	102,15,56,222,216
+	DB	102,15,56,222,224
+	DB	102,15,56,222,232
+	DB	102,15,56,222,240
+	DB	102,15,56,222,248
+	DB	102,68,15,56,222,192
+	DB	102,68,15,56,222,200
+	movups	xmm0,XMMWORD[((192-112))+rcx]
+	je	NEAR $L$cbc_dec_done
+	DB	102,15,56,222,209
+	DB	102,15,56,222,217
+	DB	102,15,56,222,225
+	DB	102,15,56,222,233
+	DB	102,15,56,222,241
+	DB	102,15,56,222,249
+	DB	102,68,15,56,222,193
+	DB	102,68,15,56,222,201
+	movups	xmm1,XMMWORD[((208-112))+rcx]
+	nop
+	DB	102,15,56,222,208
+	DB	102,15,56,222,216
+	DB	102,15,56,222,224
+	DB	102,15,56,222,232
+	DB	102,15,56,222,240
+	DB	102,15,56,222,248
+	DB	102,68,15,56,222,192
+	DB	102,68,15,56,222,200
+	movups	xmm0,XMMWORD[((224-112))+rcx]
+	jmp	NEAR $L$cbc_dec_done
+ALIGN	16
+$L$cbc_dec_done:
+	DB	102,15,56,222,209
+	DB	102,15,56,222,217
+	pxor	xmm10,xmm0
+	pxor	xmm11,xmm0
+	DB	102,15,56,222,225
+	DB	102,15,56,222,233
+	pxor	xmm12,xmm0
+	pxor	xmm13,xmm0
+	DB	102,15,56,222,241
+	DB	102,15,56,222,249
+	pxor	xmm14,xmm0
+	pxor	xmm15,xmm0
+	DB	102,68,15,56,222,193
+	DB	102,68,15,56,222,201
+	movdqu	xmm1,XMMWORD[80+rdi]
+
+	DB	102,65,15,56,223,210
+	movdqu	xmm10,XMMWORD[96+rdi]
+	pxor	xmm1,xmm0
+	DB	102,65,15,56,223,219
+	pxor	xmm10,xmm0
+	movdqu	xmm0,XMMWORD[112+rdi]
+	DB	102,65,15,56,223,228
+	lea	rdi,[128+rdi]
+	movdqu	xmm11,XMMWORD[rbp]
+	DB	102,65,15,56,223,237
+	DB	102,65,15,56,223,246
+	movdqu	xmm12,XMMWORD[16+rbp]
+	movdqu	xmm13,XMMWORD[32+rbp]
+	DB	102,65,15,56,223,255
+	DB	102,68,15,56,223,193
+	movdqu	xmm14,XMMWORD[48+rbp]
+	movdqu	xmm15,XMMWORD[64+rbp]
+	DB	102,69,15,56,223,202
+	movdqa	xmm10,xmm0
+	movdqu	xmm1,XMMWORD[80+rbp]
+	movups	xmm0,XMMWORD[((-112))+rcx]
+
+	movups	XMMWORD[rsi],xmm2
+	movdqa	xmm2,xmm11
+	movups	XMMWORD[16+rsi],xmm3
+	movdqa	xmm3,xmm12
+	movups	XMMWORD[32+rsi],xmm4
+	movdqa	xmm4,xmm13
+	movups	XMMWORD[48+rsi],xmm5
+	movdqa	xmm5,xmm14
+	movups	XMMWORD[64+rsi],xmm6
+	movdqa	xmm6,xmm15
+	movups	XMMWORD[80+rsi],xmm7
+	movdqa	xmm7,xmm1
+	movups	XMMWORD[96+rsi],xmm8
+	lea	rsi,[112+rsi]
+
+	sub	rdx,0x80
+	ja	NEAR $L$cbc_dec_loop8
+
+	movaps	xmm2,xmm9
+	lea	rcx,[((-112))+rcx]
+	add	rdx,0x70
+	jle	NEAR $L$cbc_dec_clear_tail_collected
+	movups	XMMWORD[rsi],xmm9
+	lea	rsi,[16+rsi]
+	cmp	rdx,0x50
+	jbe	NEAR $L$cbc_dec_tail
+
+	movaps	xmm2,xmm11
+$L$cbc_dec_six_or_seven:
+	cmp	rdx,0x60
+	ja	NEAR $L$cbc_dec_seven
+
+	movaps	xmm8,xmm7
+	call	_aesni_decrypt6
+	pxor	xmm2,xmm10
+	movaps	xmm10,xmm8
+	pxor	xmm3,xmm11
+	movdqu	XMMWORD[rsi],xmm2
+	pxor	xmm4,xmm12
+	movdqu	XMMWORD[16+rsi],xmm3
+	pxor	xmm3,xmm3
+	pxor	xmm5,xmm13
+	movdqu	XMMWORD[32+rsi],xmm4
+	pxor	xmm4,xmm4
+	pxor	xmm6,xmm14
+	movdqu	XMMWORD[48+rsi],xmm5
+	pxor	xmm5,xmm5
+	pxor	xmm7,xmm15
+	movdqu	XMMWORD[64+rsi],xmm6
+	pxor	xmm6,xmm6
+	lea	rsi,[80+rsi]
+	movdqa	xmm2,xmm7
+	pxor	xmm7,xmm7
+	jmp	NEAR $L$cbc_dec_tail_collected
+
+ALIGN	16
+$L$cbc_dec_seven:
+	movups	xmm8,XMMWORD[96+rdi]
+	xorps	xmm9,xmm9
+	call	_aesni_decrypt8
+	movups	xmm9,XMMWORD[80+rdi]
+	pxor	xmm2,xmm10
+	movups	xmm10,XMMWORD[96+rdi]
+	pxor	xmm3,xmm11
+	movdqu	XMMWORD[rsi],xmm2
+	pxor	xmm4,xmm12
+	movdqu	XMMWORD[16+rsi],xmm3
+	pxor	xmm3,xmm3
+	pxor	xmm5,xmm13
+	movdqu	XMMWORD[32+rsi],xmm4
+	pxor	xmm4,xmm4
+	pxor	xmm6,xmm14
+	movdqu	XMMWORD[48+rsi],xmm5
+	pxor	xmm5,xmm5
+	pxor	xmm7,xmm15
+	movdqu	XMMWORD[64+rsi],xmm6
+	pxor	xmm6,xmm6
+	pxor	xmm8,xmm9
+	movdqu	XMMWORD[80+rsi],xmm7
+	pxor	xmm7,xmm7
+	lea	rsi,[96+rsi]
+	movdqa	xmm2,xmm8
+	pxor	xmm8,xmm8
+	pxor	xmm9,xmm9
+	jmp	NEAR $L$cbc_dec_tail_collected
+
+$L$cbc_dec_tail:
+	movups	xmm2,XMMWORD[rdi]
+	sub	rdx,0x10
+	jbe	NEAR $L$cbc_dec_one
+
+	movups	xmm3,XMMWORD[16+rdi]
+	movaps	xmm11,xmm2
+	sub	rdx,0x10
+	jbe	NEAR $L$cbc_dec_two
+
+	movups	xmm4,XMMWORD[32+rdi]
+	movaps	xmm12,xmm3
+	sub	rdx,0x10
+	jbe	NEAR $L$cbc_dec_three
+
+	movups	xmm5,XMMWORD[48+rdi]
+	movaps	xmm13,xmm4
+	sub	rdx,0x10
+	jbe	NEAR $L$cbc_dec_four
+
+	movups	xmm6,XMMWORD[64+rdi]
+	movaps	xmm14,xmm5
+	movaps	xmm15,xmm6
+	xorps	xmm7,xmm7
+	call	_aesni_decrypt6
+	pxor	xmm2,xmm10
+	movaps	xmm10,xmm15
+	pxor	xmm3,xmm11
+	movdqu	XMMWORD[rsi],xmm2
+	pxor	xmm4,xmm12
+	movdqu	XMMWORD[16+rsi],xmm3
+	pxor	xmm3,xmm3
+	pxor	xmm5,xmm13
+	movdqu	XMMWORD[32+rsi],xmm4
+	pxor	xmm4,xmm4
+	pxor	xmm6,xmm14
+	movdqu	XMMWORD[48+rsi],xmm5
+	pxor	xmm5,xmm5
+	lea	rsi,[64+rsi]
+	movdqa	xmm2,xmm6
+	pxor	xmm6,xmm6
+	pxor	xmm7,xmm7
+	sub	rdx,0x10
+	jmp	NEAR $L$cbc_dec_tail_collected
+
+ALIGN	16
+$L$cbc_dec_one:
+	movaps	xmm11,xmm2
+	movups	xmm0,XMMWORD[rcx]
+	movups	xmm1,XMMWORD[16+rcx]
+	lea	rcx,[32+rcx]
+	xorps	xmm2,xmm0
+$L$oop_dec1_8:
+	DB	102,15,56,222,209
+	dec	eax
+	movups	xmm1,XMMWORD[rcx]
+	lea	rcx,[16+rcx]
+	jnz	NEAR $L$oop_dec1_8
+	DB	102,15,56,223,209
+	xorps	xmm2,xmm10
+	movaps	xmm10,xmm11
+	jmp	NEAR $L$cbc_dec_tail_collected
+ALIGN	16
+$L$cbc_dec_two:
+	movaps	xmm12,xmm3
+	call	_aesni_decrypt2
+	pxor	xmm2,xmm10
+	movaps	xmm10,xmm12
+	pxor	xmm3,xmm11
+	movdqu	XMMWORD[rsi],xmm2
+	movdqa	xmm2,xmm3
+	pxor	xmm3,xmm3
+	lea	rsi,[16+rsi]
+	jmp	NEAR $L$cbc_dec_tail_collected
+ALIGN	16
+$L$cbc_dec_three:
+	movaps	xmm13,xmm4
+	call	_aesni_decrypt3
+	pxor	xmm2,xmm10
+	movaps	xmm10,xmm13
+	pxor	xmm3,xmm11
+	movdqu	XMMWORD[rsi],xmm2
+	pxor	xmm4,xmm12
+	movdqu	XMMWORD[16+rsi],xmm3
+	pxor	xmm3,xmm3
+	movdqa	xmm2,xmm4
+	pxor	xmm4,xmm4
+	lea	rsi,[32+rsi]
+	jmp	NEAR $L$cbc_dec_tail_collected
+ALIGN	16
+$L$cbc_dec_four:
+	movaps	xmm14,xmm5
+	call	_aesni_decrypt4
+	pxor	xmm2,xmm10
+	movaps	xmm10,xmm14
+	pxor	xmm3,xmm11
+	movdqu	XMMWORD[rsi],xmm2
+	pxor	xmm4,xmm12
+	movdqu	XMMWORD[16+rsi],xmm3
+	pxor	xmm3,xmm3
+	pxor	xmm5,xmm13
+	movdqu	XMMWORD[32+rsi],xmm4
+	pxor	xmm4,xmm4
+	movdqa	xmm2,xmm5
+	pxor	xmm5,xmm5
+	lea	rsi,[48+rsi]
+	jmp	NEAR $L$cbc_dec_tail_collected
+
+ALIGN	16
+$L$cbc_dec_clear_tail_collected:
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+$L$cbc_dec_tail_collected:
+	movups	XMMWORD[r8],xmm10
+	and	rdx,15
+	jnz	NEAR $L$cbc_dec_tail_partial
+	movups	XMMWORD[rsi],xmm2
+	pxor	xmm2,xmm2
+	jmp	NEAR $L$cbc_dec_ret
+ALIGN	16
+$L$cbc_dec_tail_partial:
+	movaps	XMMWORD[rsp],xmm2
+	pxor	xmm2,xmm2
+	mov	rcx,16
+	mov	rdi,rsi
+	sub	rcx,rdx
+	lea	rsi,[rsp]
+	DD	0x9066A4F3
+	movdqa	XMMWORD[rsp],xmm2
+
+$L$cbc_dec_ret:
+	xorps	xmm0,xmm0
+	pxor	xmm1,xmm1
+	movaps	xmm6,XMMWORD[16+rsp]
+	movaps	XMMWORD[16+rsp],xmm0
+	movaps	xmm7,XMMWORD[32+rsp]
+	movaps	XMMWORD[32+rsp],xmm0
+	movaps	xmm8,XMMWORD[48+rsp]
+	movaps	XMMWORD[48+rsp],xmm0
+	movaps	xmm9,XMMWORD[64+rsp]
+	movaps	XMMWORD[64+rsp],xmm0
+	movaps	xmm10,XMMWORD[80+rsp]
+	movaps	XMMWORD[80+rsp],xmm0
+	movaps	xmm11,XMMWORD[96+rsp]
+	movaps	XMMWORD[96+rsp],xmm0
+	movaps	xmm12,XMMWORD[112+rsp]
+	movaps	XMMWORD[112+rsp],xmm0
+	movaps	xmm13,XMMWORD[128+rsp]
+	movaps	XMMWORD[128+rsp],xmm0
+	movaps	xmm14,XMMWORD[144+rsp]
+	movaps	XMMWORD[144+rsp],xmm0
+	movaps	xmm15,XMMWORD[160+rsp]
+	movaps	XMMWORD[160+rsp],xmm0
+	mov	rbp,QWORD[((-8))+r11]
+
+	lea	rsp,[r11]
+
+$L$cbc_ret:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_aes_hw_cbc_encrypt:
+global	aes_hw_set_decrypt_key
+
+ALIGN	16
+aes_hw_set_decrypt_key:
+
+_CET_ENDBR
+	DB	0x48,0x83,0xEC,0x08
+
+	call	__aesni_set_encrypt_key
+	shl	edx,4
+	test	eax,eax
+	jnz	NEAR $L$dec_key_ret
+	lea	rcx,[16+rdx*1+r8]
+
+	movups	xmm0,XMMWORD[r8]
+	movups	xmm1,XMMWORD[rcx]
+	movups	XMMWORD[rcx],xmm0
+	movups	XMMWORD[r8],xmm1
+	lea	r8,[16+r8]
+	lea	rcx,[((-16))+rcx]
+
+$L$dec_key_inverse:
+	movups	xmm0,XMMWORD[r8]
+	movups	xmm1,XMMWORD[rcx]
+	DB	102,15,56,219,192
+	DB	102,15,56,219,201
+	lea	r8,[16+r8]
+	lea	rcx,[((-16))+rcx]
+	movups	XMMWORD[16+rcx],xmm0
+	movups	XMMWORD[(-16)+r8],xmm1
+	cmp	rcx,r8
+	ja	NEAR $L$dec_key_inverse
+
+	movups	xmm0,XMMWORD[r8]
+	DB	102,15,56,219,192
+	pxor	xmm1,xmm1
+	movups	XMMWORD[rcx],xmm0
+	pxor	xmm0,xmm0
+$L$dec_key_ret:
+	add	rsp,8
+
+	ret
+
+$L$SEH_end_set_decrypt_key:
+
+global	aes_hw_set_encrypt_key
+
+ALIGN	16
+aes_hw_set_encrypt_key:
+__aesni_set_encrypt_key:
+
+_CET_ENDBR
+%ifdef BORINGSSL_DISPATCH_TEST
+	mov	BYTE[((BORINGSSL_function_hit+3))],1
+%endif
+	DB	0x48,0x83,0xEC,0x08
+
+	mov	rax,-1
+	test	rcx,rcx
+	jz	NEAR $L$enc_key_ret
+	test	r8,r8
+	jz	NEAR $L$enc_key_ret
+
+	movups	xmm0,XMMWORD[rcx]
+	xorps	xmm4,xmm4
+	lea	r10,[OPENSSL_ia32cap_P]
+	mov	r10d,DWORD[4+r10]
+	and	r10d,268437504
+	lea	rax,[16+r8]
+	cmp	edx,256
+	je	NEAR $L$14rounds
+	cmp	edx,192
+	je	NEAR $L$12rounds
+	cmp	edx,128
+	jne	NEAR $L$bad_keybits
+
+$L$10rounds:
+	mov	edx,9
+	cmp	r10d,268435456
+	je	NEAR $L$10rounds_alt
+
+	movups	XMMWORD[r8],xmm0
+	DB	102,15,58,223,200,1
+	call	$L$key_expansion_128_cold
+	DB	102,15,58,223,200,2
+	call	$L$key_expansion_128
+	DB	102,15,58,223,200,4
+	call	$L$key_expansion_128
+	DB	102,15,58,223,200,8
+	call	$L$key_expansion_128
+	DB	102,15,58,223,200,16
+	call	$L$key_expansion_128
+	DB	102,15,58,223,200,32
+	call	$L$key_expansion_128
+	DB	102,15,58,223,200,64
+	call	$L$key_expansion_128
+	DB	102,15,58,223,200,128
+	call	$L$key_expansion_128
+	DB	102,15,58,223,200,27
+	call	$L$key_expansion_128
+	DB	102,15,58,223,200,54
+	call	$L$key_expansion_128
+	movups	XMMWORD[rax],xmm0
+	mov	DWORD[80+rax],edx
+	xor	eax,eax
+	jmp	NEAR $L$enc_key_ret
+
+ALIGN	16
+$L$10rounds_alt:
+	movdqa	xmm5,XMMWORD[$L$key_rotate]
+	mov	r10d,8
+	movdqa	xmm4,XMMWORD[$L$key_rcon1]
+	movdqa	xmm2,xmm0
+	movdqu	XMMWORD[r8],xmm0
+	jmp	NEAR $L$oop_key128
+
+ALIGN	16
+$L$oop_key128:
+DB	102,15,56,0,197
+	DB	102,15,56,221,196
+	pslld	xmm4,1
+	lea	rax,[16+rax]
+
+	movdqa	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm2,xmm3
+
+	pxor	xmm0,xmm2
+	movdqu	XMMWORD[(-16)+rax],xmm0
+	movdqa	xmm2,xmm0
+
+	dec	r10d
+	jnz	NEAR $L$oop_key128
+
+	movdqa	xmm4,XMMWORD[$L$key_rcon1b]
+
+DB	102,15,56,0,197
+	DB	102,15,56,221,196
+	pslld	xmm4,1
+
+	movdqa	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm2,xmm3
+
+	pxor	xmm0,xmm2
+	movdqu	XMMWORD[rax],xmm0
+
+	movdqa	xmm2,xmm0
+DB	102,15,56,0,197
+	DB	102,15,56,221,196
+
+	movdqa	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm2,xmm3
+
+	pxor	xmm0,xmm2
+	movdqu	XMMWORD[16+rax],xmm0
+
+	mov	DWORD[96+rax],edx
+	xor	eax,eax
+	jmp	NEAR $L$enc_key_ret
+
+ALIGN	16
+$L$12rounds:
+	movq	xmm2,QWORD[16+rcx]
+	mov	edx,11
+	cmp	r10d,268435456
+	je	NEAR $L$12rounds_alt
+
+	movups	XMMWORD[r8],xmm0
+	DB	102,15,58,223,202,1
+	call	$L$key_expansion_192a_cold
+	DB	102,15,58,223,202,2
+	call	$L$key_expansion_192b
+	DB	102,15,58,223,202,4
+	call	$L$key_expansion_192a
+	DB	102,15,58,223,202,8
+	call	$L$key_expansion_192b
+	DB	102,15,58,223,202,16
+	call	$L$key_expansion_192a
+	DB	102,15,58,223,202,32
+	call	$L$key_expansion_192b
+	DB	102,15,58,223,202,64
+	call	$L$key_expansion_192a
+	DB	102,15,58,223,202,128
+	call	$L$key_expansion_192b
+	movups	XMMWORD[rax],xmm0
+	mov	DWORD[48+rax],edx
+	xor	rax,rax
+	jmp	NEAR $L$enc_key_ret
+
+ALIGN	16
+$L$12rounds_alt:
+	movdqa	xmm5,XMMWORD[$L$key_rotate192]
+	movdqa	xmm4,XMMWORD[$L$key_rcon1]
+	mov	r10d,8
+	movdqu	XMMWORD[r8],xmm0
+	jmp	NEAR $L$oop_key192
+
+ALIGN	16
+$L$oop_key192:
+	movq	QWORD[rax],xmm2
+	movdqa	xmm1,xmm2
+DB	102,15,56,0,213
+	DB	102,15,56,221,212
+	pslld	xmm4,1
+	lea	rax,[24+rax]
+
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,4
+	pxor	xmm3,xmm0
+	pslldq	xmm0,4
+	pxor	xmm3,xmm0
+	pslldq	xmm0,4
+	pxor	xmm0,xmm3
+
+	pshufd	xmm3,xmm0,0xff
+	pxor	xmm3,xmm1
+	pslldq	xmm1,4
+	pxor	xmm3,xmm1
+
+	pxor	xmm0,xmm2
+	pxor	xmm2,xmm3
+	movdqu	XMMWORD[(-16)+rax],xmm0
+
+	dec	r10d
+	jnz	NEAR $L$oop_key192
+
+	mov	DWORD[32+rax],edx
+	xor	eax,eax
+	jmp	NEAR $L$enc_key_ret
+
+ALIGN	16
+$L$14rounds:
+	movups	xmm2,XMMWORD[16+rcx]
+	mov	edx,13
+	lea	rax,[16+rax]
+	cmp	r10d,268435456
+	je	NEAR $L$14rounds_alt
+
+	movups	XMMWORD[r8],xmm0
+	movups	XMMWORD[16+r8],xmm2
+	DB	102,15,58,223,202,1
+	call	$L$key_expansion_256a_cold
+	DB	102,15,58,223,200,1
+	call	$L$key_expansion_256b
+	DB	102,15,58,223,202,2
+	call	$L$key_expansion_256a
+	DB	102,15,58,223,200,2
+	call	$L$key_expansion_256b
+	DB	102,15,58,223,202,4
+	call	$L$key_expansion_256a
+	DB	102,15,58,223,200,4
+	call	$L$key_expansion_256b
+	DB	102,15,58,223,202,8
+	call	$L$key_expansion_256a
+	DB	102,15,58,223,200,8
+	call	$L$key_expansion_256b
+	DB	102,15,58,223,202,16
+	call	$L$key_expansion_256a
+	DB	102,15,58,223,200,16
+	call	$L$key_expansion_256b
+	DB	102,15,58,223,202,32
+	call	$L$key_expansion_256a
+	DB	102,15,58,223,200,32
+	call	$L$key_expansion_256b
+	DB	102,15,58,223,202,64
+	call	$L$key_expansion_256a
+	movups	XMMWORD[rax],xmm0
+	mov	DWORD[16+rax],edx
+	xor	rax,rax
+	jmp	NEAR $L$enc_key_ret
+
+ALIGN	16
+$L$14rounds_alt:
+	movdqa	xmm5,XMMWORD[$L$key_rotate]
+	movdqa	xmm4,XMMWORD[$L$key_rcon1]
+	mov	r10d,7
+	movdqu	XMMWORD[r8],xmm0
+	movdqa	xmm1,xmm2
+	movdqu	XMMWORD[16+r8],xmm2
+	jmp	NEAR $L$oop_key256
+
+ALIGN	16
+$L$oop_key256:
+DB	102,15,56,0,213
+	DB	102,15,56,221,212
+
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,4
+	pxor	xmm3,xmm0
+	pslldq	xmm0,4
+	pxor	xmm3,xmm0
+	pslldq	xmm0,4
+	pxor	xmm0,xmm3
+	pslld	xmm4,1
+
+	pxor	xmm0,xmm2
+	movdqu	XMMWORD[rax],xmm0
+
+	dec	r10d
+	jz	NEAR $L$done_key256
+
+	pshufd	xmm2,xmm0,0xff
+	pxor	xmm3,xmm3
+	DB	102,15,56,221,211
+
+	movdqa	xmm3,xmm1
+	pslldq	xmm1,4
+	pxor	xmm3,xmm1
+	pslldq	xmm1,4
+	pxor	xmm3,xmm1
+	pslldq	xmm1,4
+	pxor	xmm1,xmm3
+
+	pxor	xmm2,xmm1
+	movdqu	XMMWORD[16+rax],xmm2
+	lea	rax,[32+rax]
+	movdqa	xmm1,xmm2
+
+	jmp	NEAR $L$oop_key256
+
+$L$done_key256:
+	mov	DWORD[16+rax],edx
+	xor	eax,eax
+	jmp	NEAR $L$enc_key_ret
+
+ALIGN	16
+$L$bad_keybits:
+	mov	rax,-2
+$L$enc_key_ret:
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	add	rsp,8
+
+	ret
+
+$L$SEH_end_set_encrypt_key:
+
+ALIGN	16
+$L$key_expansion_128:
+	movups	XMMWORD[rax],xmm0
+	lea	rax,[16+rax]
+$L$key_expansion_128_cold:
+	shufps	xmm4,xmm0,16
+	xorps	xmm0,xmm4
+	shufps	xmm4,xmm0,140
+	xorps	xmm0,xmm4
+	shufps	xmm1,xmm1,255
+	xorps	xmm0,xmm1
+	ret
+
+ALIGN	16
+$L$key_expansion_192a:
+	movups	XMMWORD[rax],xmm0
+	lea	rax,[16+rax]
+$L$key_expansion_192a_cold:
+	movaps	xmm5,xmm2
+$L$key_expansion_192b_warm:
+	shufps	xmm4,xmm0,16
+	movdqa	xmm3,xmm2
+	xorps	xmm0,xmm4
+	shufps	xmm4,xmm0,140
+	pslldq	xmm3,4
+	xorps	xmm0,xmm4
+	pshufd	xmm1,xmm1,85
+	pxor	xmm2,xmm3
+	pxor	xmm0,xmm1
+	pshufd	xmm3,xmm0,255
+	pxor	xmm2,xmm3
+	ret
+
+ALIGN	16
+$L$key_expansion_192b:
+	movaps	xmm3,xmm0
+	shufps	xmm5,xmm0,68
+	movups	XMMWORD[rax],xmm5
+	shufps	xmm3,xmm2,78
+	movups	XMMWORD[16+rax],xmm3
+	lea	rax,[32+rax]
+	jmp	NEAR $L$key_expansion_192b_warm
+
+ALIGN	16
+$L$key_expansion_256a:
+	movups	XMMWORD[rax],xmm2
+	lea	rax,[16+rax]
+$L$key_expansion_256a_cold:
+	shufps	xmm4,xmm0,16
+	xorps	xmm0,xmm4
+	shufps	xmm4,xmm0,140
+	xorps	xmm0,xmm4
+	shufps	xmm1,xmm1,255
+	xorps	xmm0,xmm1
+	ret
+
+ALIGN	16
+$L$key_expansion_256b:
+	movups	XMMWORD[rax],xmm0
+	lea	rax,[16+rax]
+
+	shufps	xmm4,xmm2,16
+	xorps	xmm2,xmm4
+	shufps	xmm4,xmm2,140
+	xorps	xmm2,xmm4
+	shufps	xmm1,xmm1,170
+	xorps	xmm2,xmm1
+	ret
+
+
+section	.rdata rdata align=8
+ALIGN	64
+$L$bswap_mask:
+	DB	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+$L$increment32:
+	DD	6,6,6,0
+$L$increment64:
+	DD	1,0,0,0
+$L$xts_magic:
+	DD	0x87,0,1,0
+$L$increment1:
+	DB	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+$L$key_rotate:
+	DD	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+$L$key_rotate192:
+	DD	0x04070605,0x04070605,0x04070605,0x04070605
+$L$key_rcon1:
+	DD	1,1,1,1
+$L$key_rcon1b:
+	DD	0x1b,0x1b,0x1b,0x1b
+
+	DB	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
+	DB	83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
+	DB	32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
+	DB	115,108,46,111,114,103,62,0
+ALIGN	64
+section	.text
+
+EXTERN	__imp_RtlVirtualUnwind
+
+ALIGN	16
+ecb_ccm64_se_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[152+r8]
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	lea	rsi,[rax]
+	lea	rdi,[512+r8]
+	mov	ecx,8
+	DD	0xa548f3fc
+	lea	rax,[88+rax]
+
+	jmp	NEAR $L$common_seh_tail
+
+
+
+ALIGN	16
+ctr_xts_se_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[152+r8]
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[208+r8]
+
+	lea	rsi,[((-168))+rax]
+	lea	rdi,[512+r8]
+	mov	ecx,20
+	DD	0xa548f3fc
+
+	mov	rbp,QWORD[((-8))+rax]
+	mov	QWORD[160+r8],rbp
+	jmp	NEAR $L$common_seh_tail
+
+
+
+ALIGN	16
+cbc_se_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[152+r8]
+	mov	rbx,QWORD[248+r8]
+
+	lea	r10,[$L$cbc_decrypt_bulk]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[120+r8]
+
+	lea	r10,[$L$cbc_decrypt_body]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[152+r8]
+
+	lea	r10,[$L$cbc_ret]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	lea	rsi,[16+rax]
+	lea	rdi,[512+r8]
+	mov	ecx,20
+	DD	0xa548f3fc
+
+	mov	rax,QWORD[208+r8]
+
+	mov	rbp,QWORD[((-8))+rax]
+	mov	QWORD[160+r8],rbp
+
+$L$common_seh_tail:
+	mov	rdi,QWORD[8+rax]
+	mov	rsi,QWORD[16+rax]
+	mov	QWORD[152+r8],rax
+	mov	QWORD[168+r8],rsi
+	mov	QWORD[176+r8],rdi
+
+	mov	rdi,QWORD[40+r9]
+	mov	rsi,r8
+	mov	ecx,154
+	DD	0xa548f3fc
+
+	mov	rsi,r9
+	xor	rcx,rcx
+	mov	rdx,QWORD[8+rsi]
+	mov	r8,QWORD[rsi]
+	mov	r9,QWORD[16+rsi]
+	mov	r10,QWORD[40+rsi]
+	lea	r11,[56+rsi]
+	lea	r12,[24+rsi]
+	mov	QWORD[32+rsp],r10
+	mov	QWORD[40+rsp],r11
+	mov	QWORD[48+rsp],r12
+	mov	QWORD[56+rsp],rcx
+	call	QWORD[__imp_RtlVirtualUnwind]
+
+	mov	eax,1
+	add	rsp,64
+	popfq
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rbp
+	pop	rbx
+	pop	rdi
+	pop	rsi
+	ret
+
+
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_aes_hw_ecb_encrypt wrt ..imagebase
+	DD	$L$SEH_end_aes_hw_ecb_encrypt wrt ..imagebase
+	DD	$L$SEH_info_ecb wrt ..imagebase
+
+	DD	$L$SEH_begin_aes_hw_ctr32_encrypt_blocks wrt ..imagebase
+	DD	$L$SEH_end_aes_hw_ctr32_encrypt_blocks wrt ..imagebase
+	DD	$L$SEH_info_ctr32 wrt ..imagebase
+	DD	$L$SEH_begin_aes_hw_cbc_encrypt wrt ..imagebase
+	DD	$L$SEH_end_aes_hw_cbc_encrypt wrt ..imagebase
+	DD	$L$SEH_info_cbc wrt ..imagebase
+
+	DD	aes_hw_set_decrypt_key wrt ..imagebase
+	DD	$L$SEH_end_set_decrypt_key wrt ..imagebase
+	DD	$L$SEH_info_key wrt ..imagebase
+
+	DD	aes_hw_set_encrypt_key wrt ..imagebase
+	DD	$L$SEH_end_set_encrypt_key wrt ..imagebase
+	DD	$L$SEH_info_key wrt ..imagebase
+section	.xdata rdata align=8
+ALIGN	8
+$L$SEH_info_ecb:
+	DB	9,0,0,0
+	DD	ecb_ccm64_se_handler wrt ..imagebase
+	DD	$L$ecb_enc_body wrt ..imagebase,$L$ecb_enc_ret wrt ..imagebase
+$L$SEH_info_ctr32:
+	DB	9,0,0,0
+	DD	ctr_xts_se_handler wrt ..imagebase
+	DD	$L$ctr32_body wrt ..imagebase,$L$ctr32_epilogue wrt ..imagebase
+$L$SEH_info_cbc:
+	DB	9,0,0,0
+	DD	cbc_se_handler wrt ..imagebase
+$L$SEH_info_key:
+	DB	0x01,0x04,0x01,0x00
+	DB	0x04,0x02,0x00,0x00
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/aesv8-armv7-linux.S b/gen/bcm/aesv8-armv7-linux.S
new file mode 100644
index 0000000..420af9b
--- /dev/null
+++ b/gen/bcm/aesv8-armv7-linux.S
@@ -0,0 +1,789 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+.arch	armv7-a	@ don't confuse not-so-latest binutils with argv8 :-)
+.fpu	neon
+.code	32
+#undef	__thumb2__
+.align	5
+.Lrcon:
+.long	0x01,0x01,0x01,0x01
+.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	@ rotate-n-splat
+.long	0x1b,0x1b,0x1b,0x1b
+
+.text
+
+.globl	aes_hw_set_encrypt_key
+.hidden	aes_hw_set_encrypt_key
+.type	aes_hw_set_encrypt_key,%function
+.align	5
+aes_hw_set_encrypt_key:
+.Lenc_key:
+	mov	r3,#-1
+	cmp	r0,#0
+	beq	.Lenc_key_abort
+	cmp	r2,#0
+	beq	.Lenc_key_abort
+	mov	r3,#-2
+	cmp	r1,#128
+	blt	.Lenc_key_abort
+	cmp	r1,#256
+	bgt	.Lenc_key_abort
+	tst	r1,#0x3f
+	bne	.Lenc_key_abort
+
+	adr	r3,.Lrcon
+	cmp	r1,#192
+
+	veor	q0,q0,q0
+	vld1.8	{q3},[r0]!
+	mov	r1,#8		@ reuse r1
+	vld1.32	{q1,q2},[r3]!
+
+	blt	.Loop128
+	beq	.L192
+	b	.L256
+
+.align	4
+.Loop128:
+	vtbl.8	d20,{q3},d4
+	vtbl.8	d21,{q3},d5
+	vext.8	q9,q0,q3,#12
+	vst1.32	{q3},[r2]!
+.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
+	subs	r1,r1,#1
+
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q10,q10,q1
+	veor	q3,q3,q9
+	vshl.u8	q1,q1,#1
+	veor	q3,q3,q10
+	bne	.Loop128
+
+	vld1.32	{q1},[r3]
+
+	vtbl.8	d20,{q3},d4
+	vtbl.8	d21,{q3},d5
+	vext.8	q9,q0,q3,#12
+	vst1.32	{q3},[r2]!
+.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
+
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q10,q10,q1
+	veor	q3,q3,q9
+	vshl.u8	q1,q1,#1
+	veor	q3,q3,q10
+
+	vtbl.8	d20,{q3},d4
+	vtbl.8	d21,{q3},d5
+	vext.8	q9,q0,q3,#12
+	vst1.32	{q3},[r2]!
+.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
+
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q10,q10,q1
+	veor	q3,q3,q9
+	veor	q3,q3,q10
+	vst1.32	{q3},[r2]
+	add	r2,r2,#0x50
+
+	mov	r12,#10
+	b	.Ldone
+
+.align	4
+.L192:
+	vld1.8	{d16},[r0]!
+	vmov.i8	q10,#8			@ borrow q10
+	vst1.32	{q3},[r2]!
+	vsub.i8	q2,q2,q10	@ adjust the mask
+
+.Loop192:
+	vtbl.8	d20,{q8},d4
+	vtbl.8	d21,{q8},d5
+	vext.8	q9,q0,q3,#12
+	vst1.32	{d16},[r2]!
+.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
+	subs	r1,r1,#1
+
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q3,q3,q9
+
+	vdup.32	q9,d7[1]
+	veor	q9,q9,q8
+	veor	q10,q10,q1
+	vext.8	q8,q0,q8,#12
+	vshl.u8	q1,q1,#1
+	veor	q8,q8,q9
+	veor	q3,q3,q10
+	veor	q8,q8,q10
+	vst1.32	{q3},[r2]!
+	bne	.Loop192
+
+	mov	r12,#12
+	add	r2,r2,#0x20
+	b	.Ldone
+
+.align	4
+.L256:
+	vld1.8	{q8},[r0]
+	mov	r1,#7
+	mov	r12,#14
+	vst1.32	{q3},[r2]!
+
+.Loop256:
+	vtbl.8	d20,{q8},d4
+	vtbl.8	d21,{q8},d5
+	vext.8	q9,q0,q3,#12
+	vst1.32	{q8},[r2]!
+.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
+	subs	r1,r1,#1
+
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q10,q10,q1
+	veor	q3,q3,q9
+	vshl.u8	q1,q1,#1
+	veor	q3,q3,q10
+	vst1.32	{q3},[r2]!
+	beq	.Ldone
+
+	vdup.32	q10,d7[1]
+	vext.8	q9,q0,q8,#12
+.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
+
+	veor	q8,q8,q9
+	vext.8	q9,q0,q9,#12
+	veor	q8,q8,q9
+	vext.8	q9,q0,q9,#12
+	veor	q8,q8,q9
+
+	veor	q8,q8,q10
+	b	.Loop256
+
+.Ldone:
+	str	r12,[r2]
+	mov	r3,#0
+
+.Lenc_key_abort:
+	mov	r0,r3			@ return value
+
+	bx	lr
+.size	aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key
+
+.globl	aes_hw_set_decrypt_key
+.hidden	aes_hw_set_decrypt_key
+.type	aes_hw_set_decrypt_key,%function
+.align	5
+aes_hw_set_decrypt_key:
+	stmdb	sp!,{r4,lr}
+	bl	.Lenc_key
+
+	cmp	r0,#0
+	bne	.Ldec_key_abort
+
+	sub	r2,r2,#240		@ restore original r2
+	mov	r4,#-16
+	add	r0,r2,r12,lsl#4	@ end of key schedule
+
+	vld1.32	{q0},[r2]
+	vld1.32	{q1},[r0]
+	vst1.32	{q0},[r0],r4
+	vst1.32	{q1},[r2]!
+
+.Loop_imc:
+	vld1.32	{q0},[r2]
+	vld1.32	{q1},[r0]
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+	vst1.32	{q0},[r0],r4
+	vst1.32	{q1},[r2]!
+	cmp	r0,r2
+	bhi	.Loop_imc
+
+	vld1.32	{q0},[r2]
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+	vst1.32	{q0},[r0]
+
+	eor	r0,r0,r0		@ return value
+.Ldec_key_abort:
+	ldmia	sp!,{r4,pc}
+.size	aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key
+.globl	aes_hw_encrypt
+.hidden	aes_hw_encrypt
+.type	aes_hw_encrypt,%function
+.align	5
+aes_hw_encrypt:
+	AARCH64_VALID_CALL_TARGET
+	ldr	r3,[r2,#240]
+	vld1.32	{q0},[r2]!
+	vld1.8	{q2},[r0]
+	sub	r3,r3,#2
+	vld1.32	{q1},[r2]!
+
+.Loop_enc:
+.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
+.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
+	vld1.32	{q0},[r2]!
+	subs	r3,r3,#2
+.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
+.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
+	vld1.32	{q1},[r2]!
+	bgt	.Loop_enc
+
+.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
+.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
+	vld1.32	{q0},[r2]
+.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
+	veor	q2,q2,q0
+
+	vst1.8	{q2},[r1]
+	bx	lr
+.size	aes_hw_encrypt,.-aes_hw_encrypt
+.globl	aes_hw_decrypt
+.hidden	aes_hw_decrypt
+.type	aes_hw_decrypt,%function
+.align	5
+aes_hw_decrypt:
+	AARCH64_VALID_CALL_TARGET
+	ldr	r3,[r2,#240]
+	vld1.32	{q0},[r2]!
+	vld1.8	{q2},[r0]
+	sub	r3,r3,#2
+	vld1.32	{q1},[r2]!
+
+.Loop_dec:
+.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
+.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
+	vld1.32	{q0},[r2]!
+	subs	r3,r3,#2
+.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
+.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
+	vld1.32	{q1},[r2]!
+	bgt	.Loop_dec
+
+.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
+.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
+	vld1.32	{q0},[r2]
+.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
+	veor	q2,q2,q0
+
+	vst1.8	{q2},[r1]
+	bx	lr
+.size	aes_hw_decrypt,.-aes_hw_decrypt
+.globl	aes_hw_cbc_encrypt
+.hidden	aes_hw_cbc_encrypt
+.type	aes_hw_cbc_encrypt,%function
+.align	5
+aes_hw_cbc_encrypt:
+	mov	ip,sp
+	stmdb	sp!,{r4,r5,r6,r7,r8,lr}
+	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}            @ ABI specification says so
+	ldmia	ip,{r4,r5}		@ load remaining args
+	subs	r2,r2,#16
+	mov	r8,#16
+	blo	.Lcbc_abort
+	moveq	r8,#0
+
+	cmp	r5,#0			@ en- or decrypting?
+	ldr	r5,[r3,#240]
+	and	r2,r2,#-16
+	vld1.8	{q6},[r4]
+	vld1.8	{q0},[r0],r8
+
+	vld1.32	{q8,q9},[r3]		@ load key schedule...
+	sub	r5,r5,#6
+	add	r7,r3,r5,lsl#4	@ pointer to last 7 round keys
+	sub	r5,r5,#2
+	vld1.32	{q10,q11},[r7]!
+	vld1.32	{q12,q13},[r7]!
+	vld1.32	{q14,q15},[r7]!
+	vld1.32	{q7},[r7]
+
+	add	r7,r3,#32
+	mov	r6,r5
+	beq	.Lcbc_dec
+
+	cmp	r5,#2
+	veor	q0,q0,q6
+	veor	q5,q8,q7
+	beq	.Lcbc_enc128
+
+	vld1.32	{q2,q3},[r7]
+	add	r7,r3,#16
+	add	r6,r3,#16*4
+	add	r12,r3,#16*5
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	add	r14,r3,#16*6
+	add	r3,r3,#16*7
+	b	.Lenter_cbc_enc
+
+.align	4
+.Loop_cbc_enc:
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vst1.8	{q6},[r1]!
+.Lenter_cbc_enc:
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.32	{q8},[r6]
+	cmp	r5,#4
+.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.32	{q9},[r12]
+	beq	.Lcbc_enc192
+
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.32	{q8},[r14]
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.32	{q9},[r3]
+	nop
+
+.Lcbc_enc192:
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	subs	r2,r2,#16
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	moveq	r8,#0
+.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.8	{q8},[r0],r8
+.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	veor	q8,q8,q5
+.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.32	{q9},[r7]		@ re-pre-load rndkey[1]
+.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
+	veor	q6,q0,q7
+	bhs	.Loop_cbc_enc
+
+	vst1.8	{q6},[r1]!
+	b	.Lcbc_done
+
+.align	5
+.Lcbc_enc128:
+	vld1.32	{q2,q3},[r7]
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	b	.Lenter_cbc_enc128
+.Loop_cbc_enc128:
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vst1.8	{q6},[r1]!
+.Lenter_cbc_enc128:
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	subs	r2,r2,#16
+.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	moveq	r8,#0
+.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.8	{q8},[r0],r8
+.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	veor	q8,q8,q5
+.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
+	veor	q6,q0,q7
+	bhs	.Loop_cbc_enc128
+
+	vst1.8	{q6},[r1]!
+	b	.Lcbc_done
+.align	5
+.Lcbc_dec:
+	vld1.8	{q10},[r0]!
+	subs	r2,r2,#32		@ bias
+	add	r6,r5,#2
+	vorr	q3,q0,q0
+	vorr	q1,q0,q0
+	vorr	q11,q10,q10
+	blo	.Lcbc_dec_tail
+
+	vorr	q1,q10,q10
+	vld1.8	{q10},[r0]!
+	vorr	q2,q0,q0
+	vorr	q3,q1,q1
+	vorr	q11,q10,q10
+
+.Loop3x_cbc_dec:
+.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.32	{q8},[r7]!
+	subs	r6,r6,#2
+.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.32	{q9},[r7]!
+	bgt	.Loop3x_cbc_dec
+
+.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	veor	q4,q6,q7
+	subs	r2,r2,#0x30
+	veor	q5,q2,q7
+	movlo	r6,r2			@ r6 is zero at this point
+.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	veor	q9,q3,q7
+	add	r0,r0,r6		@ r0 is adjusted in such a way that
+					@ at exit from the loop q1-q10
+					@ are loaded with last "words"
+	vorr	q6,q11,q11
+	mov	r7,r3
+.byte	0x68,0x03,0xb0,0xf3	@ aesd q0,q12
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x68,0x43,0xf0,0xf3	@ aesd q10,q12
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.8	{q2},[r0]!
+.byte	0x6a,0x03,0xb0,0xf3	@ aesd q0,q13
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x6a,0x43,0xf0,0xf3	@ aesd q10,q13
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.8	{q3},[r0]!
+.byte	0x6c,0x03,0xb0,0xf3	@ aesd q0,q14
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x6c,0x43,0xf0,0xf3	@ aesd q10,q14
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.8	{q11},[r0]!
+.byte	0x6e,0x03,0xb0,0xf3	@ aesd q0,q15
+.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
+.byte	0x6e,0x43,0xf0,0xf3	@ aesd q10,q15
+	vld1.32	{q8},[r7]!	@ re-pre-load rndkey[0]
+	add	r6,r5,#2
+	veor	q4,q4,q0
+	veor	q5,q5,q1
+	veor	q10,q10,q9
+	vld1.32	{q9},[r7]!	@ re-pre-load rndkey[1]
+	vst1.8	{q4},[r1]!
+	vorr	q0,q2,q2
+	vst1.8	{q5},[r1]!
+	vorr	q1,q3,q3
+	vst1.8	{q10},[r1]!
+	vorr	q10,q11,q11
+	bhs	.Loop3x_cbc_dec
+
+	cmn	r2,#0x30
+	beq	.Lcbc_done
+	nop
+
+.Lcbc_dec_tail:
+.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.32	{q8},[r7]!
+	subs	r6,r6,#2
+.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.32	{q9},[r7]!
+	bgt	.Lcbc_dec_tail
+
+.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x68,0x43,0xf0,0xf3	@ aesd q10,q12
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	cmn	r2,#0x20
+.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x6a,0x43,0xf0,0xf3	@ aesd q10,q13
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	veor	q5,q6,q7
+.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x6c,0x43,0xf0,0xf3	@ aesd q10,q14
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	veor	q9,q3,q7
+.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
+.byte	0x6e,0x43,0xf0,0xf3	@ aesd q10,q15
+	beq	.Lcbc_dec_one
+	veor	q5,q5,q1
+	veor	q9,q9,q10
+	vorr	q6,q11,q11
+	vst1.8	{q5},[r1]!
+	vst1.8	{q9},[r1]!
+	b	.Lcbc_done
+
+.Lcbc_dec_one:
+	veor	q5,q5,q10
+	vorr	q6,q11,q11
+	vst1.8	{q5},[r1]!
+
+.Lcbc_done:
+	vst1.8	{q6},[r4]
+.Lcbc_abort:
+	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+	ldmia	sp!,{r4,r5,r6,r7,r8,pc}
+.size	aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt
+.globl	aes_hw_ctr32_encrypt_blocks
+.hidden	aes_hw_ctr32_encrypt_blocks
+.type	aes_hw_ctr32_encrypt_blocks,%function
+.align	5
+aes_hw_ctr32_encrypt_blocks:
+	mov	ip,sp
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,lr}
+	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}            @ ABI specification says so
+	ldr	r4, [ip]		@ load remaining arg
+	ldr	r5,[r3,#240]
+
+	ldr	r8, [r4, #12]
+	vld1.32	{q0},[r4]
+
+	vld1.32	{q8,q9},[r3]		@ load key schedule...
+	sub	r5,r5,#4
+	mov	r12,#16
+	cmp	r2,#2
+	add	r7,r3,r5,lsl#4	@ pointer to last 5 round keys
+	sub	r5,r5,#2
+	vld1.32	{q12,q13},[r7]!
+	vld1.32	{q14,q15},[r7]!
+	vld1.32	{q7},[r7]
+	add	r7,r3,#32
+	mov	r6,r5
+	movlo	r12,#0
+
+	@ ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
+	@ affected by silicon errata #1742098 [0] and #1655431 [1],
+	@ respectively, where the second instruction of an aese/aesmc
+	@ instruction pair may execute twice if an interrupt is taken right
+	@ after the first instruction consumes an input register of which a
+	@ single 32-bit lane has been updated the last time it was modified.
+	@
+	@ This function uses a counter in one 32-bit lane. The vmov.32 lines
+	@ could write to q1 and q10 directly, but that trips these bugs.
+	@ We write to q6 and copy to the final register as a workaround.
+	@
+	@ [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
+	@ [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
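+	@
+	@ A sketch of the workaround pattern as used below (q6 is the
+	@ scratch register; d13 is its high half):
+	@	vmov.32	d13[1],r10	@ update a single 32-bit lane of q6
+	@	vorr	q1,q6,q6	@ whole-register copy into q1
+	@ so the aese inputs are always fully-written registers.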
+#ifndef __ARMEB__
+	rev	r8, r8
+#endif
+	add	r10, r8, #1
+	vorr	q6,q0,q0
+	rev	r10, r10
+	vmov.32	d13[1],r10
+	add	r8, r8, #2
+	vorr	q1,q6,q6
+	bls	.Lctr32_tail
+	rev	r12, r8
+	vmov.32	d13[1],r12
+	sub	r2,r2,#3		@ bias
+	vorr	q10,q6,q6
+	b	.Loop3x_ctr32
+
+.align	4
+.Loop3x_ctr32:
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+.byte	0x20,0x43,0xf0,0xf3	@ aese q10,q8
+.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
+	vld1.32	{q8},[r7]!
+	subs	r6,r6,#2
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+.byte	0x22,0x43,0xf0,0xf3	@ aese q10,q9
+.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
+	vld1.32	{q9},[r7]!
+	bgt	.Loop3x_ctr32
+
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x83,0xb0,0xf3	@ aesmc q4,q0
+.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
+.byte	0x82,0xa3,0xb0,0xf3	@ aesmc q5,q1
+	vld1.8	{q2},[r0]!
+	add	r9,r8,#1
+.byte	0x20,0x43,0xf0,0xf3	@ aese q10,q8
+.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
+	vld1.8	{q3},[r0]!
+	rev	r9,r9
+.byte	0x22,0x83,0xb0,0xf3	@ aese q4,q9
+.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
+.byte	0x22,0xa3,0xb0,0xf3	@ aese q5,q9
+.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
+	vld1.8	{q11},[r0]!
+	mov	r7,r3
+.byte	0x22,0x43,0xf0,0xf3	@ aese q10,q9
+.byte	0xa4,0x23,0xf0,0xf3	@ aesmc q9,q10
+.byte	0x28,0x83,0xb0,0xf3	@ aese q4,q12
+.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
+.byte	0x28,0xa3,0xb0,0xf3	@ aese q5,q12
+.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
+	veor	q2,q2,q7
+	add	r10,r8,#2
+.byte	0x28,0x23,0xf0,0xf3	@ aese q9,q12
+.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
+	veor	q3,q3,q7
+	add	r8,r8,#3
+.byte	0x2a,0x83,0xb0,0xf3	@ aese q4,q13
+.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
+.byte	0x2a,0xa3,0xb0,0xf3	@ aese q5,q13
+.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
+	 @ Note the logic to update q0, q1, and q10 is written to work
+	 @ around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
+	 @ 32-bit mode. See the comment above.
+	veor	q11,q11,q7
+	vmov.32	d13[1], r9
+.byte	0x2a,0x23,0xf0,0xf3	@ aese q9,q13
+.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
+	vorr	q0,q6,q6
+	rev	r10,r10
+.byte	0x2c,0x83,0xb0,0xf3	@ aese q4,q14
+.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
+	vmov.32	d13[1], r10
+	rev	r12,r8
+.byte	0x2c,0xa3,0xb0,0xf3	@ aese q5,q14
+.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
+	vorr	q1,q6,q6
+	vmov.32	d13[1], r12
+.byte	0x2c,0x23,0xf0,0xf3	@ aese q9,q14
+.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
+	vorr	q10,q6,q6
+	subs	r2,r2,#3
+.byte	0x2e,0x83,0xb0,0xf3	@ aese q4,q15
+.byte	0x2e,0xa3,0xb0,0xf3	@ aese q5,q15
+.byte	0x2e,0x23,0xf0,0xf3	@ aese q9,q15
+
+	veor	q2,q2,q4
+	vld1.32	{q8},[r7]!	@ re-pre-load rndkey[0]
+	vst1.8	{q2},[r1]!
+	veor	q3,q3,q5
+	mov	r6,r5
+	vst1.8	{q3},[r1]!
+	veor	q11,q11,q9
+	vld1.32	{q9},[r7]!	@ re-pre-load rndkey[1]
+	vst1.8	{q11},[r1]!
+	bhs	.Loop3x_ctr32
+
+	adds	r2,r2,#3
+	beq	.Lctr32_done
+	cmp	r2,#1
+	mov	r12,#16
+	moveq	r12,#0
+
+.Lctr32_tail:
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+	vld1.32	{q8},[r7]!
+	subs	r6,r6,#2
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+	vld1.32	{q9},[r7]!
+	bgt	.Lctr32_tail
+
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+	vld1.8	{q2},[r0],r12
+.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x28,0x23,0xb0,0xf3	@ aese q1,q12
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+	vld1.8	{q3},[r0]
+.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x2a,0x23,0xb0,0xf3	@ aese q1,q13
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+	veor	q2,q2,q7
+.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x2c,0x23,0xb0,0xf3	@ aese q1,q14
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+	veor	q3,q3,q7
+.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
+.byte	0x2e,0x23,0xb0,0xf3	@ aese q1,q15
+
+	cmp	r2,#1
+	veor	q2,q2,q0
+	veor	q3,q3,q1
+	vst1.8	{q2},[r1]!
+	beq	.Lctr32_done
+	vst1.8	{q3},[r1]
+
+.Lctr32_done:
+	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,pc}
+.size	aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks
+#endif
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__)
diff --git a/gen/bcm/aesv8-armv8-apple.S b/gen/bcm/aesv8-armv8-apple.S
new file mode 100644
index 0000000..144c4af
--- /dev/null
+++ b/gen/bcm/aesv8-armv8-apple.S
@@ -0,0 +1,791 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+
+.section	__TEXT,__const
+.align	5
+Lrcon:
+.long	0x01,0x01,0x01,0x01
+.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
+.long	0x1b,0x1b,0x1b,0x1b
+
+.text
+
+.globl	_aes_hw_set_encrypt_key
+.private_extern	_aes_hw_set_encrypt_key
+
+.align	5
+_aes_hw_set_encrypt_key:
+Lenc_key:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	mov	x3,#-1
+	cmp	x0,#0
+	b.eq	Lenc_key_abort
+	cmp	x2,#0
+	b.eq	Lenc_key_abort
+	mov	x3,#-2
+	cmp	w1,#128
+	b.lt	Lenc_key_abort
+	cmp	w1,#256
+	b.gt	Lenc_key_abort
+	tst	w1,#0x3f
+	b.ne	Lenc_key_abort
+
+	adrp	x3,Lrcon@PAGE
+	add	x3,x3,Lrcon@PAGEOFF
+	cmp	w1,#192
+
+	eor	v0.16b,v0.16b,v0.16b
+	ld1	{v3.16b},[x0],#16
+	mov	w1,#8		// reuse w1
+	ld1	{v1.4s,v2.4s},[x3],#32
+
+	b.lt	Loop128
+	b.eq	L192
+	b	L256
+
+.align	4
+Loop128:
+	tbl	v6.16b,{v3.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v3.4s},[x2],#16
+	aese	v6.16b,v0.16b
+	subs	w1,w1,#1
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	shl	v1.16b,v1.16b,#1
+	eor	v3.16b,v3.16b,v6.16b
+	b.ne	Loop128
+
+	ld1	{v1.4s},[x3]
+
+	tbl	v6.16b,{v3.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v3.4s},[x2],#16
+	aese	v6.16b,v0.16b
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	shl	v1.16b,v1.16b,#1
+	eor	v3.16b,v3.16b,v6.16b
+
+	tbl	v6.16b,{v3.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v3.4s},[x2],#16
+	aese	v6.16b,v0.16b
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	eor	v3.16b,v3.16b,v6.16b
+	st1	{v3.4s},[x2]
+	add	x2,x2,#0x50
+
+	mov	w12,#10
+	b	Ldone
+
+.align	4
+L192:
+	ld1	{v4.8b},[x0],#8
+	movi	v6.16b,#8			// borrow v6.16b
+	st1	{v3.4s},[x2],#16
+	sub	v2.16b,v2.16b,v6.16b	// adjust the mask
+
+Loop192:
+	tbl	v6.16b,{v4.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v4.8b},[x2],#8
+	aese	v6.16b,v0.16b
+	subs	w1,w1,#1
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+
+	dup	v5.4s,v3.s[3]
+	eor	v5.16b,v5.16b,v4.16b
+	eor	v6.16b,v6.16b,v1.16b
+	ext	v4.16b,v0.16b,v4.16b,#12
+	shl	v1.16b,v1.16b,#1
+	eor	v4.16b,v4.16b,v5.16b
+	eor	v3.16b,v3.16b,v6.16b
+	eor	v4.16b,v4.16b,v6.16b
+	st1	{v3.4s},[x2],#16
+	b.ne	Loop192
+
+	mov	w12,#12
+	add	x2,x2,#0x20
+	b	Ldone
+
+.align	4
+L256:
+	ld1	{v4.16b},[x0]
+	mov	w1,#7
+	mov	w12,#14
+	st1	{v3.4s},[x2],#16
+
+Loop256:
+	tbl	v6.16b,{v4.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v4.4s},[x2],#16
+	aese	v6.16b,v0.16b
+	subs	w1,w1,#1
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	shl	v1.16b,v1.16b,#1
+	eor	v3.16b,v3.16b,v6.16b
+	st1	{v3.4s},[x2],#16
+	b.eq	Ldone
+
+	dup	v6.4s,v3.s[3]		// just splat
+	ext	v5.16b,v0.16b,v4.16b,#12
+	aese	v6.16b,v0.16b
+
+	eor	v4.16b,v4.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v4.16b,v4.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v4.16b,v4.16b,v5.16b
+
+	eor	v4.16b,v4.16b,v6.16b
+	b	Loop256
+
+Ldone:
+	str	w12,[x2]
+	mov	x3,#0
+
+Lenc_key_abort:
+	mov	x0,x3			// return value
+	ldr	x29,[sp],#16
+	ret
+
+
+.globl	_aes_hw_set_decrypt_key
+.private_extern	_aes_hw_set_decrypt_key
+
+.align	5
+_aes_hw_set_decrypt_key:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	bl	Lenc_key
+
+	cmp	x0,#0
+	b.ne	Ldec_key_abort
+
+	sub	x2,x2,#240		// restore original x2
+	mov	x4,#-16
+	add	x0,x2,x12,lsl#4	// end of key schedule
+
+	ld1	{v0.4s},[x2]
+	ld1	{v1.4s},[x0]
+	st1	{v0.4s},[x0],x4
+	st1	{v1.4s},[x2],#16
+
+Loop_imc:
+	ld1	{v0.4s},[x2]
+	ld1	{v1.4s},[x0]
+	aesimc	v0.16b,v0.16b
+	aesimc	v1.16b,v1.16b
+	st1	{v0.4s},[x0],x4
+	st1	{v1.4s},[x2],#16
+	cmp	x0,x2
+	b.hi	Loop_imc
+
+	ld1	{v0.4s},[x2]
+	aesimc	v0.16b,v0.16b
+	st1	{v0.4s},[x0]
+
+	eor	x0,x0,x0		// return value
+Ldec_key_abort:
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.globl	_aes_hw_encrypt
+.private_extern	_aes_hw_encrypt
+
+.align	5
+_aes_hw_encrypt:
+	AARCH64_VALID_CALL_TARGET
+	ldr	w3,[x2,#240]
+	ld1	{v0.4s},[x2],#16
+	ld1	{v2.16b},[x0]
+	sub	w3,w3,#2
+	ld1	{v1.4s},[x2],#16
+
+Loop_enc:
+	aese	v2.16b,v0.16b
+	aesmc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2],#16
+	subs	w3,w3,#2
+	aese	v2.16b,v1.16b
+	aesmc	v2.16b,v2.16b
+	ld1	{v1.4s},[x2],#16
+	b.gt	Loop_enc
+
+	aese	v2.16b,v0.16b
+	aesmc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2]
+	aese	v2.16b,v1.16b
+	eor	v2.16b,v2.16b,v0.16b
+
+	st1	{v2.16b},[x1]
+	ret
+
+.globl	_aes_hw_decrypt
+.private_extern	_aes_hw_decrypt
+
+.align	5
+_aes_hw_decrypt:
+	AARCH64_VALID_CALL_TARGET
+	ldr	w3,[x2,#240]
+	ld1	{v0.4s},[x2],#16
+	ld1	{v2.16b},[x0]
+	sub	w3,w3,#2
+	ld1	{v1.4s},[x2],#16
+
+Loop_dec:
+	aesd	v2.16b,v0.16b
+	aesimc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2],#16
+	subs	w3,w3,#2
+	aesd	v2.16b,v1.16b
+	aesimc	v2.16b,v2.16b
+	ld1	{v1.4s},[x2],#16
+	b.gt	Loop_dec
+
+	aesd	v2.16b,v0.16b
+	aesimc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2]
+	aesd	v2.16b,v1.16b
+	eor	v2.16b,v2.16b,v0.16b
+
+	st1	{v2.16b},[x1]
+	ret
+
+.globl	_aes_hw_cbc_encrypt
+.private_extern	_aes_hw_cbc_encrypt
+
+.align	5
+_aes_hw_cbc_encrypt:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	subs	x2,x2,#16
+	mov	x8,#16
+	b.lo	Lcbc_abort
+	csel	x8,xzr,x8,eq
+
+	cmp	w5,#0			// en- or decrypting?
+	ldr	w5,[x3,#240]
+	and	x2,x2,#-16
+	ld1	{v6.16b},[x4]
+	ld1	{v0.16b},[x0],x8
+
+	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
+	sub	w5,w5,#6
+	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
+	sub	w5,w5,#2
+	ld1	{v18.4s,v19.4s},[x7],#32
+	ld1	{v20.4s,v21.4s},[x7],#32
+	ld1	{v22.4s,v23.4s},[x7],#32
+	ld1	{v7.4s},[x7]
+
+	add	x7,x3,#32
+	mov	w6,w5
+	b.eq	Lcbc_dec
+
+	cmp	w5,#2
+	eor	v0.16b,v0.16b,v6.16b
+	eor	v5.16b,v16.16b,v7.16b
+	b.eq	Lcbc_enc128
+
+	ld1	{v2.4s,v3.4s},[x7]
+	add	x7,x3,#16
+	add	x6,x3,#16*4
+	add	x12,x3,#16*5
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	add	x14,x3,#16*6
+	add	x3,x3,#16*7
+	b	Lenter_cbc_enc
+
+.align	4
+Loop_cbc_enc:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	st1	{v6.16b},[x1],#16
+Lenter_cbc_enc:
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v2.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.4s},[x6]
+	cmp	w5,#4
+	aese	v0.16b,v3.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v17.4s},[x12]
+	b.eq	Lcbc_enc192
+
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.4s},[x14]
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v17.4s},[x3]
+	nop
+
+Lcbc_enc192:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	subs	x2,x2,#16
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	csel	x8,xzr,x8,eq
+	aese	v0.16b,v18.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v19.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.16b},[x0],x8
+	aese	v0.16b,v20.16b
+	aesmc	v0.16b,v0.16b
+	eor	v16.16b,v16.16b,v5.16b
+	aese	v0.16b,v21.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
+	aese	v0.16b,v22.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v23.16b
+	eor	v6.16b,v0.16b,v7.16b
+	b.hs	Loop_cbc_enc
+
+	st1	{v6.16b},[x1],#16
+	b	Lcbc_done
+
+.align	5
+Lcbc_enc128:
+	ld1	{v2.4s,v3.4s},[x7]
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	b	Lenter_cbc_enc128
+Loop_cbc_enc128:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	st1	{v6.16b},[x1],#16
+Lenter_cbc_enc128:
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	subs	x2,x2,#16
+	aese	v0.16b,v2.16b
+	aesmc	v0.16b,v0.16b
+	csel	x8,xzr,x8,eq
+	aese	v0.16b,v3.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v18.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v19.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.16b},[x0],x8
+	aese	v0.16b,v20.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v21.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v22.16b
+	aesmc	v0.16b,v0.16b
+	eor	v16.16b,v16.16b,v5.16b
+	aese	v0.16b,v23.16b
+	eor	v6.16b,v0.16b,v7.16b
+	b.hs	Loop_cbc_enc128
+
+	st1	{v6.16b},[x1],#16
+	b	Lcbc_done
+.align	5
+Lcbc_dec:
+	ld1	{v18.16b},[x0],#16
+	subs	x2,x2,#32		// bias
+	add	w6,w5,#2
+	orr	v3.16b,v0.16b,v0.16b
+	orr	v1.16b,v0.16b,v0.16b
+	orr	v19.16b,v18.16b,v18.16b
+	b.lo	Lcbc_dec_tail
+
+	orr	v1.16b,v18.16b,v18.16b
+	ld1	{v18.16b},[x0],#16
+	orr	v2.16b,v0.16b,v0.16b
+	orr	v3.16b,v1.16b,v1.16b
+	orr	v19.16b,v18.16b,v18.16b
+
+Loop3x_cbc_dec:
+	aesd	v0.16b,v16.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v16.4s},[x7],#16
+	subs	w6,w6,#2
+	aesd	v0.16b,v17.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v17.4s},[x7],#16
+	b.gt	Loop3x_cbc_dec
+
+	aesd	v0.16b,v16.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	eor	v4.16b,v6.16b,v7.16b
+	subs	x2,x2,#0x30
+	eor	v5.16b,v2.16b,v7.16b
+	csel	x6,x2,x6,lo			// x6 (w6) is zero at this point
+	aesd	v0.16b,v17.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	eor	v17.16b,v3.16b,v7.16b
+	add	x0,x0,x6		// x0 is adjusted in such a way that
+					// at exit from the loop v1.16b-v18.16b
+					// are loaded with last "words"
+	orr	v6.16b,v19.16b,v19.16b
+	mov	x7,x3
+	aesd	v0.16b,v20.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v20.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v20.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v2.16b},[x0],#16
+	aesd	v0.16b,v21.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v21.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v21.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v3.16b},[x0],#16
+	aesd	v0.16b,v22.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v22.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v22.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v19.16b},[x0],#16
+	aesd	v0.16b,v23.16b
+	aesd	v1.16b,v23.16b
+	aesd	v18.16b,v23.16b
+	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
+	add	w6,w5,#2
+	eor	v4.16b,v4.16b,v0.16b
+	eor	v5.16b,v5.16b,v1.16b
+	eor	v18.16b,v18.16b,v17.16b
+	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
+	st1	{v4.16b},[x1],#16
+	orr	v0.16b,v2.16b,v2.16b
+	st1	{v5.16b},[x1],#16
+	orr	v1.16b,v3.16b,v3.16b
+	st1	{v18.16b},[x1],#16
+	orr	v18.16b,v19.16b,v19.16b
+	b.hs	Loop3x_cbc_dec
+
+	cmn	x2,#0x30
+	b.eq	Lcbc_done
+	nop
+
+Lcbc_dec_tail:
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v16.4s},[x7],#16
+	subs	w6,w6,#2
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v17.4s},[x7],#16
+	b.gt	Lcbc_dec_tail
+
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	aesd	v1.16b,v20.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v20.16b
+	aesimc	v18.16b,v18.16b
+	cmn	x2,#0x20
+	aesd	v1.16b,v21.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v21.16b
+	aesimc	v18.16b,v18.16b
+	eor	v5.16b,v6.16b,v7.16b
+	aesd	v1.16b,v22.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v22.16b
+	aesimc	v18.16b,v18.16b
+	eor	v17.16b,v3.16b,v7.16b
+	aesd	v1.16b,v23.16b
+	aesd	v18.16b,v23.16b
+	b.eq	Lcbc_dec_one
+	eor	v5.16b,v5.16b,v1.16b
+	eor	v17.16b,v17.16b,v18.16b
+	orr	v6.16b,v19.16b,v19.16b
+	st1	{v5.16b},[x1],#16
+	st1	{v17.16b},[x1],#16
+	b	Lcbc_done
+
+Lcbc_dec_one:
+	eor	v5.16b,v5.16b,v18.16b
+	orr	v6.16b,v19.16b,v19.16b
+	st1	{v5.16b},[x1],#16
+
+Lcbc_done:
+	st1	{v6.16b},[x4]
+Lcbc_abort:
+	ldr	x29,[sp],#16
+	ret
+
+.globl	_aes_hw_ctr32_encrypt_blocks
+.private_extern	_aes_hw_ctr32_encrypt_blocks
+
+.align	5
+_aes_hw_ctr32_encrypt_blocks:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	ldr	w5,[x3,#240]
+
+	ldr	w8, [x4, #12]
+	ld1	{v0.4s},[x4]
+
+	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
+	sub	w5,w5,#4
+	mov	x12,#16
+	cmp	x2,#2
+	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
+	sub	w5,w5,#2
+	ld1	{v20.4s,v21.4s},[x7],#32
+	ld1	{v22.4s,v23.4s},[x7],#32
+	ld1	{v7.4s},[x7]
+	add	x7,x3,#32
+	mov	w6,w5
+	csel	x12,xzr,x12,lo
+
+	// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
+	// affected by silicon errata #1742098 [0] and #1655431 [1],
+	// respectively, where the second instruction of an aese/aesmc
+	// instruction pair may execute twice if an interrupt is taken right
+	// after the first instruction consumes an input register of which a
+	// single 32-bit lane has been updated the last time it was modified.
+	//
+	// This function uses a counter in one 32-bit lane. The vmov lines
+	// could write to v1.16b and v18.16b directly, but that trips these bugs.
+	// We write to v6.16b and copy to the final register as a workaround.
+	//
+	// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
+	// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
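+	//
+	// A sketch of the workaround pattern as used below (v6 is the
+	// scratch register):
+	//	mov	v6.s[3],w10		// update a single 32-bit lane of v6
+	//	orr	v1.16b,v6.16b,v6.16b	// whole-register copy into v1
+	// so the aese inputs are always fully-written registers.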
+#ifndef __AARCH64EB__
+	rev	w8, w8
+#endif
+	add	w10, w8, #1
+	orr	v6.16b,v0.16b,v0.16b
+	rev	w10, w10
+	mov	v6.s[3],w10
+	add	w8, w8, #2
+	orr	v1.16b,v6.16b,v6.16b
+	b.ls	Lctr32_tail
+	rev	w12, w8
+	mov	v6.s[3],w12
+	sub	x2,x2,#3		// bias
+	orr	v18.16b,v6.16b,v6.16b
+	b	Loop3x_ctr32
+
+.align	4
+Loop3x_ctr32:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v16.16b
+	aesmc	v1.16b,v1.16b
+	aese	v18.16b,v16.16b
+	aesmc	v18.16b,v18.16b
+	ld1	{v16.4s},[x7],#16
+	subs	w6,w6,#2
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v17.16b
+	aesmc	v1.16b,v1.16b
+	aese	v18.16b,v17.16b
+	aesmc	v18.16b,v18.16b
+	ld1	{v17.4s},[x7],#16
+	b.gt	Loop3x_ctr32
+
+	aese	v0.16b,v16.16b
+	aesmc	v4.16b,v0.16b
+	aese	v1.16b,v16.16b
+	aesmc	v5.16b,v1.16b
+	ld1	{v2.16b},[x0],#16
+	add	w9,w8,#1
+	aese	v18.16b,v16.16b
+	aesmc	v18.16b,v18.16b
+	ld1	{v3.16b},[x0],#16
+	rev	w9,w9
+	aese	v4.16b,v17.16b
+	aesmc	v4.16b,v4.16b
+	aese	v5.16b,v17.16b
+	aesmc	v5.16b,v5.16b
+	ld1	{v19.16b},[x0],#16
+	mov	x7,x3
+	aese	v18.16b,v17.16b
+	aesmc	v17.16b,v18.16b
+	aese	v4.16b,v20.16b
+	aesmc	v4.16b,v4.16b
+	aese	v5.16b,v20.16b
+	aesmc	v5.16b,v5.16b
+	eor	v2.16b,v2.16b,v7.16b
+	add	w10,w8,#2
+	aese	v17.16b,v20.16b
+	aesmc	v17.16b,v17.16b
+	eor	v3.16b,v3.16b,v7.16b
+	add	w8,w8,#3
+	aese	v4.16b,v21.16b
+	aesmc	v4.16b,v4.16b
+	aese	v5.16b,v21.16b
+	aesmc	v5.16b,v5.16b
+	 // Note the logic to update v0.16b, v1.16b, and v18.16b is written to work
+	 // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
+	 // 32-bit mode. See the comment above.
+	eor	v19.16b,v19.16b,v7.16b
+	mov	v6.s[3], w9
+	aese	v17.16b,v21.16b
+	aesmc	v17.16b,v17.16b
+	orr	v0.16b,v6.16b,v6.16b
+	rev	w10,w10
+	aese	v4.16b,v22.16b
+	aesmc	v4.16b,v4.16b
+	mov	v6.s[3], w10
+	rev	w12,w8
+	aese	v5.16b,v22.16b
+	aesmc	v5.16b,v5.16b
+	orr	v1.16b,v6.16b,v6.16b
+	mov	v6.s[3], w12
+	aese	v17.16b,v22.16b
+	aesmc	v17.16b,v17.16b
+	orr	v18.16b,v6.16b,v6.16b
+	subs	x2,x2,#3
+	aese	v4.16b,v23.16b
+	aese	v5.16b,v23.16b
+	aese	v17.16b,v23.16b
+
+	eor	v2.16b,v2.16b,v4.16b
+	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
+	st1	{v2.16b},[x1],#16
+	eor	v3.16b,v3.16b,v5.16b
+	mov	w6,w5
+	st1	{v3.16b},[x1],#16
+	eor	v19.16b,v19.16b,v17.16b
+	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
+	st1	{v19.16b},[x1],#16
+	b.hs	Loop3x_ctr32
+
+	adds	x2,x2,#3
+	b.eq	Lctr32_done
+	cmp	x2,#1
+	mov	x12,#16
+	csel	x12,xzr,x12,eq
+
+Lctr32_tail:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v16.16b
+	aesmc	v1.16b,v1.16b
+	ld1	{v16.4s},[x7],#16
+	subs	w6,w6,#2
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v17.16b
+	aesmc	v1.16b,v1.16b
+	ld1	{v17.4s},[x7],#16
+	b.gt	Lctr32_tail
+
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v16.16b
+	aesmc	v1.16b,v1.16b
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v17.16b
+	aesmc	v1.16b,v1.16b
+	ld1	{v2.16b},[x0],x12
+	aese	v0.16b,v20.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v20.16b
+	aesmc	v1.16b,v1.16b
+	ld1	{v3.16b},[x0]
+	aese	v0.16b,v21.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v21.16b
+	aesmc	v1.16b,v1.16b
+	eor	v2.16b,v2.16b,v7.16b
+	aese	v0.16b,v22.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v22.16b
+	aesmc	v1.16b,v1.16b
+	eor	v3.16b,v3.16b,v7.16b
+	aese	v0.16b,v23.16b
+	aese	v1.16b,v23.16b
+
+	cmp	x2,#1
+	eor	v2.16b,v2.16b,v0.16b
+	eor	v3.16b,v3.16b,v1.16b
+	st1	{v2.16b},[x1],#16
+	b.eq	Lctr32_done
+	st1	{v3.16b},[x1]
+
+Lctr32_done:
+	ldr	x29,[sp],#16
+	ret
+
+#endif
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/gen/bcm/aesv8-armv8-linux.S b/gen/bcm/aesv8-armv8-linux.S
new file mode 100644
index 0000000..7d4bcb4
--- /dev/null
+++ b/gen/bcm/aesv8-armv8-linux.S
@@ -0,0 +1,791 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+.arch	armv8-a+crypto
+.section	.rodata
+.align	5
+.Lrcon:
+.long	0x01,0x01,0x01,0x01
+.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
+.long	0x1b,0x1b,0x1b,0x1b
+
+.text
+
+.globl	aes_hw_set_encrypt_key
+.hidden	aes_hw_set_encrypt_key
+.type	aes_hw_set_encrypt_key,%function
+.align	5
+aes_hw_set_encrypt_key:
+.Lenc_key:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	mov	x3,#-1
+	cmp	x0,#0
+	b.eq	.Lenc_key_abort
+	cmp	x2,#0
+	b.eq	.Lenc_key_abort
+	mov	x3,#-2
+	cmp	w1,#128
+	b.lt	.Lenc_key_abort
+	cmp	w1,#256
+	b.gt	.Lenc_key_abort
+	tst	w1,#0x3f
+	b.ne	.Lenc_key_abort
+
+	adrp	x3,.Lrcon
+	add	x3,x3,:lo12:.Lrcon
+	cmp	w1,#192
+
+	eor	v0.16b,v0.16b,v0.16b
+	ld1	{v3.16b},[x0],#16
+	mov	w1,#8		// reuse w1
+	ld1	{v1.4s,v2.4s},[x3],#32
+
+	b.lt	.Loop128
+	b.eq	.L192
+	b	.L256
+
+.align	4
+.Loop128:
+	tbl	v6.16b,{v3.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v3.4s},[x2],#16
+	aese	v6.16b,v0.16b
+	subs	w1,w1,#1
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	shl	v1.16b,v1.16b,#1
+	eor	v3.16b,v3.16b,v6.16b
+	b.ne	.Loop128
+
+	ld1	{v1.4s},[x3]
+
+	tbl	v6.16b,{v3.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v3.4s},[x2],#16
+	aese	v6.16b,v0.16b
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	shl	v1.16b,v1.16b,#1
+	eor	v3.16b,v3.16b,v6.16b
+
+	tbl	v6.16b,{v3.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v3.4s},[x2],#16
+	aese	v6.16b,v0.16b
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	eor	v3.16b,v3.16b,v6.16b
+	st1	{v3.4s},[x2]
+	add	x2,x2,#0x50
+
+	mov	w12,#10
+	b	.Ldone
+
+.align	4
+.L192:
+	ld1	{v4.8b},[x0],#8
+	movi	v6.16b,#8			// borrow v6.16b
+	st1	{v3.4s},[x2],#16
+	sub	v2.16b,v2.16b,v6.16b	// adjust the mask
+
+.Loop192:
+	tbl	v6.16b,{v4.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v4.8b},[x2],#8
+	aese	v6.16b,v0.16b
+	subs	w1,w1,#1
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+
+	dup	v5.4s,v3.s[3]
+	eor	v5.16b,v5.16b,v4.16b
+	eor	v6.16b,v6.16b,v1.16b
+	ext	v4.16b,v0.16b,v4.16b,#12
+	shl	v1.16b,v1.16b,#1
+	eor	v4.16b,v4.16b,v5.16b
+	eor	v3.16b,v3.16b,v6.16b
+	eor	v4.16b,v4.16b,v6.16b
+	st1	{v3.4s},[x2],#16
+	b.ne	.Loop192
+
+	mov	w12,#12
+	add	x2,x2,#0x20
+	b	.Ldone
+
+.align	4
+.L256:
+	ld1	{v4.16b},[x0]
+	mov	w1,#7
+	mov	w12,#14
+	st1	{v3.4s},[x2],#16
+
+.Loop256:
+	tbl	v6.16b,{v4.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v4.4s},[x2],#16
+	aese	v6.16b,v0.16b
+	subs	w1,w1,#1
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	shl	v1.16b,v1.16b,#1
+	eor	v3.16b,v3.16b,v6.16b
+	st1	{v3.4s},[x2],#16
+	b.eq	.Ldone
+
+	dup	v6.4s,v3.s[3]		// just splat
+	ext	v5.16b,v0.16b,v4.16b,#12
+	aese	v6.16b,v0.16b
+
+	eor	v4.16b,v4.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v4.16b,v4.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v4.16b,v4.16b,v5.16b
+
+	eor	v4.16b,v4.16b,v6.16b
+	b	.Loop256
+
+.Ldone:
+	str	w12,[x2]
+	mov	x3,#0
+
+.Lenc_key_abort:
+	mov	x0,x3			// return value
+	ldr	x29,[sp],#16
+	ret
+.size	aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key
+
+.globl	aes_hw_set_decrypt_key
+.hidden	aes_hw_set_decrypt_key
+.type	aes_hw_set_decrypt_key,%function
+.align	5
+aes_hw_set_decrypt_key:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	bl	.Lenc_key
+
+	cmp	x0,#0
+	b.ne	.Ldec_key_abort
+
+	sub	x2,x2,#240		// restore original x2
+	mov	x4,#-16
+	add	x0,x2,x12,lsl#4	// end of key schedule
+
+	ld1	{v0.4s},[x2]
+	ld1	{v1.4s},[x0]
+	st1	{v0.4s},[x0],x4
+	st1	{v1.4s},[x2],#16
+
+.Loop_imc:
+	ld1	{v0.4s},[x2]
+	ld1	{v1.4s},[x0]
+	aesimc	v0.16b,v0.16b
+	aesimc	v1.16b,v1.16b
+	st1	{v0.4s},[x0],x4
+	st1	{v1.4s},[x2],#16
+	cmp	x0,x2
+	b.hi	.Loop_imc
+
+	ld1	{v0.4s},[x2]
+	aesimc	v0.16b,v0.16b
+	st1	{v0.4s},[x0]
+
+	eor	x0,x0,x0		// return value
+.Ldec_key_abort:
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key
+.globl	aes_hw_encrypt
+.hidden	aes_hw_encrypt
+.type	aes_hw_encrypt,%function
+.align	5
+aes_hw_encrypt:
+	AARCH64_VALID_CALL_TARGET
+	ldr	w3,[x2,#240]
+	ld1	{v0.4s},[x2],#16
+	ld1	{v2.16b},[x0]
+	sub	w3,w3,#2
+	ld1	{v1.4s},[x2],#16
+
+.Loop_enc:
+	aese	v2.16b,v0.16b
+	aesmc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2],#16
+	subs	w3,w3,#2
+	aese	v2.16b,v1.16b
+	aesmc	v2.16b,v2.16b
+	ld1	{v1.4s},[x2],#16
+	b.gt	.Loop_enc
+
+	aese	v2.16b,v0.16b
+	aesmc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2]
+	aese	v2.16b,v1.16b
+	eor	v2.16b,v2.16b,v0.16b
+
+	st1	{v2.16b},[x1]
+	ret
+.size	aes_hw_encrypt,.-aes_hw_encrypt
+.globl	aes_hw_decrypt
+.hidden	aes_hw_decrypt
+.type	aes_hw_decrypt,%function
+.align	5
+aes_hw_decrypt:
+	AARCH64_VALID_CALL_TARGET
+	ldr	w3,[x2,#240]
+	ld1	{v0.4s},[x2],#16
+	ld1	{v2.16b},[x0]
+	sub	w3,w3,#2
+	ld1	{v1.4s},[x2],#16
+
+.Loop_dec:
+	aesd	v2.16b,v0.16b
+	aesimc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2],#16
+	subs	w3,w3,#2
+	aesd	v2.16b,v1.16b
+	aesimc	v2.16b,v2.16b
+	ld1	{v1.4s},[x2],#16
+	b.gt	.Loop_dec
+
+	aesd	v2.16b,v0.16b
+	aesimc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2]
+	aesd	v2.16b,v1.16b
+	eor	v2.16b,v2.16b,v0.16b
+
+	st1	{v2.16b},[x1]
+	ret
+.size	aes_hw_decrypt,.-aes_hw_decrypt
+.globl	aes_hw_cbc_encrypt
+.hidden	aes_hw_cbc_encrypt
+.type	aes_hw_cbc_encrypt,%function
+.align	5
+aes_hw_cbc_encrypt:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	subs	x2,x2,#16
+	mov	x8,#16
+	b.lo	.Lcbc_abort
+	csel	x8,xzr,x8,eq
+
+	cmp	w5,#0			// en- or decrypting?
+	ldr	w5,[x3,#240]
+	and	x2,x2,#-16
+	ld1	{v6.16b},[x4]
+	ld1	{v0.16b},[x0],x8
+
+	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
+	sub	w5,w5,#6
+	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
+	sub	w5,w5,#2
+	ld1	{v18.4s,v19.4s},[x7],#32
+	ld1	{v20.4s,v21.4s},[x7],#32
+	ld1	{v22.4s,v23.4s},[x7],#32
+	ld1	{v7.4s},[x7]
+
+	add	x7,x3,#32
+	mov	w6,w5
+	b.eq	.Lcbc_dec
+
+	cmp	w5,#2
+	eor	v0.16b,v0.16b,v6.16b
+	eor	v5.16b,v16.16b,v7.16b
+	b.eq	.Lcbc_enc128
+
+	ld1	{v2.4s,v3.4s},[x7]
+	add	x7,x3,#16
+	add	x6,x3,#16*4
+	add	x12,x3,#16*5
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	add	x14,x3,#16*6
+	add	x3,x3,#16*7
+	b	.Lenter_cbc_enc
+
+.align	4
+.Loop_cbc_enc:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	st1	{v6.16b},[x1],#16
+.Lenter_cbc_enc:
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v2.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.4s},[x6]
+	cmp	w5,#4
+	aese	v0.16b,v3.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v17.4s},[x12]
+	b.eq	.Lcbc_enc192
+
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.4s},[x14]
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v17.4s},[x3]
+	nop
+
+.Lcbc_enc192:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	subs	x2,x2,#16
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	csel	x8,xzr,x8,eq
+	aese	v0.16b,v18.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v19.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.16b},[x0],x8
+	aese	v0.16b,v20.16b
+	aesmc	v0.16b,v0.16b
+	eor	v16.16b,v16.16b,v5.16b
+	aese	v0.16b,v21.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
+	aese	v0.16b,v22.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v23.16b
+	eor	v6.16b,v0.16b,v7.16b
+	b.hs	.Loop_cbc_enc
+
+	st1	{v6.16b},[x1],#16
+	b	.Lcbc_done
+
+.align	5
+.Lcbc_enc128:
+	ld1	{v2.4s,v3.4s},[x7]
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	b	.Lenter_cbc_enc128
+.Loop_cbc_enc128:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	st1	{v6.16b},[x1],#16
+.Lenter_cbc_enc128:
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	subs	x2,x2,#16
+	aese	v0.16b,v2.16b
+	aesmc	v0.16b,v0.16b
+	csel	x8,xzr,x8,eq
+	aese	v0.16b,v3.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v18.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v19.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.16b},[x0],x8
+	aese	v0.16b,v20.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v21.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v22.16b
+	aesmc	v0.16b,v0.16b
+	eor	v16.16b,v16.16b,v5.16b
+	aese	v0.16b,v23.16b
+	eor	v6.16b,v0.16b,v7.16b
+	b.hs	.Loop_cbc_enc128
+
+	st1	{v6.16b},[x1],#16
+	b	.Lcbc_done
+.align	5
+.Lcbc_dec:
+	ld1	{v18.16b},[x0],#16
+	subs	x2,x2,#32		// bias
+	add	w6,w5,#2
+	orr	v3.16b,v0.16b,v0.16b
+	orr	v1.16b,v0.16b,v0.16b
+	orr	v19.16b,v18.16b,v18.16b
+	b.lo	.Lcbc_dec_tail
+
+	orr	v1.16b,v18.16b,v18.16b
+	ld1	{v18.16b},[x0],#16
+	orr	v2.16b,v0.16b,v0.16b
+	orr	v3.16b,v1.16b,v1.16b
+	orr	v19.16b,v18.16b,v18.16b
+
+.Loop3x_cbc_dec:
+	aesd	v0.16b,v16.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v16.4s},[x7],#16
+	subs	w6,w6,#2
+	aesd	v0.16b,v17.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v17.4s},[x7],#16
+	b.gt	.Loop3x_cbc_dec
+
+	aesd	v0.16b,v16.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	eor	v4.16b,v6.16b,v7.16b
+	subs	x2,x2,#0x30
+	eor	v5.16b,v2.16b,v7.16b
+	csel	x6,x2,x6,lo			// x6 (w6) is zero at this point
+	aesd	v0.16b,v17.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	eor	v17.16b,v3.16b,v7.16b
+	add	x0,x0,x6		// x0 is adjusted in such a way that
+					// at exit from the loop v1.16b-v18.16b
+					// are loaded with last "words"
+	orr	v6.16b,v19.16b,v19.16b
+	mov	x7,x3
+	aesd	v0.16b,v20.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v20.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v20.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v2.16b},[x0],#16
+	aesd	v0.16b,v21.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v21.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v21.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v3.16b},[x0],#16
+	aesd	v0.16b,v22.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v22.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v22.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v19.16b},[x0],#16
+	aesd	v0.16b,v23.16b
+	aesd	v1.16b,v23.16b
+	aesd	v18.16b,v23.16b
+	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
+	add	w6,w5,#2
+	eor	v4.16b,v4.16b,v0.16b
+	eor	v5.16b,v5.16b,v1.16b
+	eor	v18.16b,v18.16b,v17.16b
+	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
+	st1	{v4.16b},[x1],#16
+	orr	v0.16b,v2.16b,v2.16b
+	st1	{v5.16b},[x1],#16
+	orr	v1.16b,v3.16b,v3.16b
+	st1	{v18.16b},[x1],#16
+	orr	v18.16b,v19.16b,v19.16b
+	b.hs	.Loop3x_cbc_dec
+
+	cmn	x2,#0x30
+	b.eq	.Lcbc_done
+	nop
+
+.Lcbc_dec_tail:
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v16.4s},[x7],#16
+	subs	w6,w6,#2
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v17.4s},[x7],#16
+	b.gt	.Lcbc_dec_tail
+
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	aesd	v1.16b,v20.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v20.16b
+	aesimc	v18.16b,v18.16b
+	cmn	x2,#0x20
+	aesd	v1.16b,v21.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v21.16b
+	aesimc	v18.16b,v18.16b
+	eor	v5.16b,v6.16b,v7.16b
+	aesd	v1.16b,v22.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v22.16b
+	aesimc	v18.16b,v18.16b
+	eor	v17.16b,v3.16b,v7.16b
+	aesd	v1.16b,v23.16b
+	aesd	v18.16b,v23.16b
+	b.eq	.Lcbc_dec_one
+	eor	v5.16b,v5.16b,v1.16b
+	eor	v17.16b,v17.16b,v18.16b
+	orr	v6.16b,v19.16b,v19.16b
+	st1	{v5.16b},[x1],#16
+	st1	{v17.16b},[x1],#16
+	b	.Lcbc_done
+
+.Lcbc_dec_one:
+	eor	v5.16b,v5.16b,v18.16b
+	orr	v6.16b,v19.16b,v19.16b
+	st1	{v5.16b},[x1],#16
+
+.Lcbc_done:
+	st1	{v6.16b},[x4]
+.Lcbc_abort:
+	ldr	x29,[sp],#16
+	ret
+.size	aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt
+.globl	aes_hw_ctr32_encrypt_blocks
+.hidden	aes_hw_ctr32_encrypt_blocks
+.type	aes_hw_ctr32_encrypt_blocks,%function
+.align	5
+aes_hw_ctr32_encrypt_blocks:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	ldr	w5,[x3,#240]
+
+	ldr	w8, [x4, #12]
+	ld1	{v0.4s},[x4]
+
+	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
+	sub	w5,w5,#4
+	mov	x12,#16
+	cmp	x2,#2
+	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
+	sub	w5,w5,#2
+	ld1	{v20.4s,v21.4s},[x7],#32
+	ld1	{v22.4s,v23.4s},[x7],#32
+	ld1	{v7.4s},[x7]
+	add	x7,x3,#32
+	mov	w6,w5
+	csel	x12,xzr,x12,lo
+
+	// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
+	// affected by silicon errata #1742098 [0] and #1655431 [1],
+	// respectively, where the second instruction of an aese/aesmc
+	// instruction pair may execute twice if an interrupt is taken right
+	// after the first instruction consumes an input register of which a
+	// single 32-bit lane has been updated the last time it was modified.
+	//
+	// This function uses a counter in one 32-bit lane. The vmov lines
+	// could write to v1.16b and v18.16b directly, but that trips these bugs.
+	// We write to v6.16b and copy to the final register as a workaround.
+	//
+	// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
+	// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
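+	//
+	// A sketch of the workaround pattern as used below (v6 is the
+	// scratch register):
+	//	mov	v6.s[3],w10		// update a single 32-bit lane of v6
+	//	orr	v1.16b,v6.16b,v6.16b	// whole-register copy into v1
+	// so the aese inputs are always fully-written registers.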
+#ifndef __AARCH64EB__
+	rev	w8, w8
+#endif
+	add	w10, w8, #1
+	orr	v6.16b,v0.16b,v0.16b
+	rev	w10, w10
+	mov	v6.s[3],w10
+	add	w8, w8, #2
+	orr	v1.16b,v6.16b,v6.16b
+	b.ls	.Lctr32_tail
+	rev	w12, w8
+	mov	v6.s[3],w12
+	sub	x2,x2,#3		// bias
+	orr	v18.16b,v6.16b,v6.16b
+	b	.Loop3x_ctr32
+
+.align	4
+.Loop3x_ctr32:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v16.16b
+	aesmc	v1.16b,v1.16b
+	aese	v18.16b,v16.16b
+	aesmc	v18.16b,v18.16b
+	ld1	{v16.4s},[x7],#16
+	subs	w6,w6,#2
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v17.16b
+	aesmc	v1.16b,v1.16b
+	aese	v18.16b,v17.16b
+	aesmc	v18.16b,v18.16b
+	ld1	{v17.4s},[x7],#16
+	b.gt	.Loop3x_ctr32
+
+	aese	v0.16b,v16.16b
+	aesmc	v4.16b,v0.16b
+	aese	v1.16b,v16.16b
+	aesmc	v5.16b,v1.16b
+	ld1	{v2.16b},[x0],#16
+	add	w9,w8,#1
+	aese	v18.16b,v16.16b
+	aesmc	v18.16b,v18.16b
+	ld1	{v3.16b},[x0],#16
+	rev	w9,w9
+	aese	v4.16b,v17.16b
+	aesmc	v4.16b,v4.16b
+	aese	v5.16b,v17.16b
+	aesmc	v5.16b,v5.16b
+	ld1	{v19.16b},[x0],#16
+	mov	x7,x3
+	aese	v18.16b,v17.16b
+	aesmc	v17.16b,v18.16b
+	aese	v4.16b,v20.16b
+	aesmc	v4.16b,v4.16b
+	aese	v5.16b,v20.16b
+	aesmc	v5.16b,v5.16b
+	eor	v2.16b,v2.16b,v7.16b
+	add	w10,w8,#2
+	aese	v17.16b,v20.16b
+	aesmc	v17.16b,v17.16b
+	eor	v3.16b,v3.16b,v7.16b
+	add	w8,w8,#3
+	aese	v4.16b,v21.16b
+	aesmc	v4.16b,v4.16b
+	aese	v5.16b,v21.16b
+	aesmc	v5.16b,v5.16b
+	 // Note the logic to update v0.16b, v1.16b, and v18.16b is written to work
+	 // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
+	 // 32-bit mode. See the comment above.
+	eor	v19.16b,v19.16b,v7.16b
+	mov	v6.s[3], w9
+	aese	v17.16b,v21.16b
+	aesmc	v17.16b,v17.16b
+	orr	v0.16b,v6.16b,v6.16b
+	rev	w10,w10
+	aese	v4.16b,v22.16b
+	aesmc	v4.16b,v4.16b
+	mov	v6.s[3], w10
+	rev	w12,w8
+	aese	v5.16b,v22.16b
+	aesmc	v5.16b,v5.16b
+	orr	v1.16b,v6.16b,v6.16b
+	mov	v6.s[3], w12
+	aese	v17.16b,v22.16b
+	aesmc	v17.16b,v17.16b
+	orr	v18.16b,v6.16b,v6.16b
+	subs	x2,x2,#3
+	aese	v4.16b,v23.16b
+	aese	v5.16b,v23.16b
+	aese	v17.16b,v23.16b
+
+	eor	v2.16b,v2.16b,v4.16b
+	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
+	st1	{v2.16b},[x1],#16
+	eor	v3.16b,v3.16b,v5.16b
+	mov	w6,w5
+	st1	{v3.16b},[x1],#16
+	eor	v19.16b,v19.16b,v17.16b
+	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
+	st1	{v19.16b},[x1],#16
+	b.hs	.Loop3x_ctr32
+
+	adds	x2,x2,#3
+	b.eq	.Lctr32_done
+	cmp	x2,#1
+	mov	x12,#16
+	csel	x12,xzr,x12,eq
+
+.Lctr32_tail:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v16.16b
+	aesmc	v1.16b,v1.16b
+	ld1	{v16.4s},[x7],#16
+	subs	w6,w6,#2
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v17.16b
+	aesmc	v1.16b,v1.16b
+	ld1	{v17.4s},[x7],#16
+	b.gt	.Lctr32_tail
+
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v16.16b
+	aesmc	v1.16b,v1.16b
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v17.16b
+	aesmc	v1.16b,v1.16b
+	ld1	{v2.16b},[x0],x12
+	aese	v0.16b,v20.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v20.16b
+	aesmc	v1.16b,v1.16b
+	ld1	{v3.16b},[x0]
+	aese	v0.16b,v21.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v21.16b
+	aesmc	v1.16b,v1.16b
+	eor	v2.16b,v2.16b,v7.16b
+	aese	v0.16b,v22.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v22.16b
+	aesmc	v1.16b,v1.16b
+	eor	v3.16b,v3.16b,v7.16b
+	aese	v0.16b,v23.16b
+	aese	v1.16b,v23.16b
+
+	cmp	x2,#1
+	eor	v2.16b,v2.16b,v0.16b
+	eor	v3.16b,v3.16b,v1.16b
+	st1	{v2.16b},[x1],#16
+	b.eq	.Lctr32_done
+	st1	{v3.16b},[x1]
+
+.Lctr32_done:
+	ldr	x29,[sp],#16
+	ret
+.size	aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks
+#endif
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/bcm/aesv8-armv8-win.S b/gen/bcm/aesv8-armv8-win.S
new file mode 100644
index 0000000..a3ab33a
--- /dev/null
+++ b/gen/bcm/aesv8-armv8-win.S
@@ -0,0 +1,803 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+.arch	armv8-a+crypto
+.section	.rodata
+.align	5
+Lrcon:
+.long	0x01,0x01,0x01,0x01
+.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
+.long	0x1b,0x1b,0x1b,0x1b
+
+.text
+
+.globl	aes_hw_set_encrypt_key
+
+.def aes_hw_set_encrypt_key
+   .type 32
+.endef
+.align	5
+aes_hw_set_encrypt_key:
+Lenc_key:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	mov	x3,#-1
+	cmp	x0,#0
+	b.eq	Lenc_key_abort
+	cmp	x2,#0
+	b.eq	Lenc_key_abort
+	mov	x3,#-2
+	cmp	w1,#128
+	b.lt	Lenc_key_abort
+	cmp	w1,#256
+	b.gt	Lenc_key_abort
+	tst	w1,#0x3f
+	b.ne	Lenc_key_abort
+
+	adrp	x3,Lrcon
+	add	x3,x3,:lo12:Lrcon
+	cmp	w1,#192
+
+	eor	v0.16b,v0.16b,v0.16b
+	ld1	{v3.16b},[x0],#16
+	mov	w1,#8		// reuse w1
+	ld1	{v1.4s,v2.4s},[x3],#32
+
+	b.lt	Loop128
+	b.eq	L192
+	b	L256
+
+.align	4
+Loop128:
+	tbl	v6.16b,{v3.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v3.4s},[x2],#16
+	aese	v6.16b,v0.16b
+	subs	w1,w1,#1
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	shl	v1.16b,v1.16b,#1
+	eor	v3.16b,v3.16b,v6.16b
+	b.ne	Loop128
+
+	ld1	{v1.4s},[x3]
+
+	tbl	v6.16b,{v3.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v3.4s},[x2],#16
+	aese	v6.16b,v0.16b
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	shl	v1.16b,v1.16b,#1
+	eor	v3.16b,v3.16b,v6.16b
+
+	tbl	v6.16b,{v3.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v3.4s},[x2],#16
+	aese	v6.16b,v0.16b
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	eor	v3.16b,v3.16b,v6.16b
+	st1	{v3.4s},[x2]
+	add	x2,x2,#0x50
+
+	mov	w12,#10
+	b	Ldone
+
+.align	4
+L192:
+	ld1	{v4.8b},[x0],#8
+	movi	v6.16b,#8			// borrow v6.16b
+	st1	{v3.4s},[x2],#16
+	sub	v2.16b,v2.16b,v6.16b	// adjust the mask
+
+Loop192:
+	tbl	v6.16b,{v4.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v4.8b},[x2],#8
+	aese	v6.16b,v0.16b
+	subs	w1,w1,#1
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+
+	dup	v5.4s,v3.s[3]
+	eor	v5.16b,v5.16b,v4.16b
+	eor	v6.16b,v6.16b,v1.16b
+	ext	v4.16b,v0.16b,v4.16b,#12
+	shl	v1.16b,v1.16b,#1
+	eor	v4.16b,v4.16b,v5.16b
+	eor	v3.16b,v3.16b,v6.16b
+	eor	v4.16b,v4.16b,v6.16b
+	st1	{v3.4s},[x2],#16
+	b.ne	Loop192
+
+	mov	w12,#12
+	add	x2,x2,#0x20
+	b	Ldone
+
+.align	4
+L256:
+	ld1	{v4.16b},[x0]
+	mov	w1,#7
+	mov	w12,#14
+	st1	{v3.4s},[x2],#16
+
+Loop256:
+	tbl	v6.16b,{v4.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v4.4s},[x2],#16
+	aese	v6.16b,v0.16b
+	subs	w1,w1,#1
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	shl	v1.16b,v1.16b,#1
+	eor	v3.16b,v3.16b,v6.16b
+	st1	{v3.4s},[x2],#16
+	b.eq	Ldone
+
+	dup	v6.4s,v3.s[3]		// just splat
+	ext	v5.16b,v0.16b,v4.16b,#12
+	aese	v6.16b,v0.16b
+
+	eor	v4.16b,v4.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v4.16b,v4.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v4.16b,v4.16b,v5.16b
+
+	eor	v4.16b,v4.16b,v6.16b
+	b	Loop256
+
+Ldone:
+	str	w12,[x2]
+	mov	x3,#0
+
+Lenc_key_abort:
+	mov	x0,x3			// return value
+	ldr	x29,[sp],#16
+	ret
+
+
+.globl	aes_hw_set_decrypt_key
+
+.def aes_hw_set_decrypt_key
+   .type 32
+.endef
+.align	5
+aes_hw_set_decrypt_key:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	bl	Lenc_key
+
+	cmp	x0,#0
+	b.ne	Ldec_key_abort
+
+	sub	x2,x2,#240		// restore original x2
+	mov	x4,#-16
+	add	x0,x2,x12,lsl#4	// end of key schedule
+
+	ld1	{v0.4s},[x2]
+	ld1	{v1.4s},[x0]
+	st1	{v0.4s},[x0],x4
+	st1	{v1.4s},[x2],#16
+
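+	// Walk inward from both ends of the schedule, applying InvMixColumns
+	// (aesimc) to each interior round key and swapping the pairs, so the
+	// encryption schedule can be used for decryption.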
+Loop_imc:
+	ld1	{v0.4s},[x2]
+	ld1	{v1.4s},[x0]
+	aesimc	v0.16b,v0.16b
+	aesimc	v1.16b,v1.16b
+	st1	{v0.4s},[x0],x4
+	st1	{v1.4s},[x2],#16
+	cmp	x0,x2
+	b.hi	Loop_imc
+
+	ld1	{v0.4s},[x2]
+	aesimc	v0.16b,v0.16b
+	st1	{v0.4s},[x0]
+
+	eor	x0,x0,x0		// return value
+Ldec_key_abort:
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.globl	aes_hw_encrypt
+
+.def aes_hw_encrypt
+   .type 32
+.endef
+.align	5
+aes_hw_encrypt:
+	AARCH64_VALID_CALL_TARGET
+	ldr	w3,[x2,#240]
+	ld1	{v0.4s},[x2],#16
+	ld1	{v2.16b},[x0]
+	sub	w3,w3,#2
+	ld1	{v1.4s},[x2],#16
+
+Loop_enc:
+	aese	v2.16b,v0.16b
+	aesmc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2],#16
+	subs	w3,w3,#2
+	aese	v2.16b,v1.16b
+	aesmc	v2.16b,v2.16b
+	ld1	{v1.4s},[x2],#16
+	b.gt	Loop_enc
+
+	aese	v2.16b,v0.16b
+	aesmc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2]
+	aese	v2.16b,v1.16b
+	eor	v2.16b,v2.16b,v0.16b
+
+	st1	{v2.16b},[x1]
+	ret
+
+.globl	aes_hw_decrypt
+
+.def aes_hw_decrypt
+   .type 32
+.endef
+.align	5
+aes_hw_decrypt:
+	AARCH64_VALID_CALL_TARGET
+	ldr	w3,[x2,#240]
+	ld1	{v0.4s},[x2],#16
+	ld1	{v2.16b},[x0]
+	sub	w3,w3,#2
+	ld1	{v1.4s},[x2],#16
+
+Loop_dec:
+	aesd	v2.16b,v0.16b
+	aesimc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2],#16
+	subs	w3,w3,#2
+	aesd	v2.16b,v1.16b
+	aesimc	v2.16b,v2.16b
+	ld1	{v1.4s},[x2],#16
+	b.gt	Loop_dec
+
+	aesd	v2.16b,v0.16b
+	aesimc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2]
+	aesd	v2.16b,v1.16b
+	eor	v2.16b,v2.16b,v0.16b
+
+	st1	{v2.16b},[x1]
+	ret
+
+.globl	aes_hw_cbc_encrypt
+
+.def aes_hw_cbc_encrypt
+   .type 32
+.endef
+.align	5
+aes_hw_cbc_encrypt:
+	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	subs	x2,x2,#16
+	mov	x8,#16
+	b.lo	Lcbc_abort
+	csel	x8,xzr,x8,eq
+
+	cmp	w5,#0			// en- or decrypting?
+	ldr	w5,[x3,#240]
+	and	x2,x2,#-16
+	ld1	{v6.16b},[x4]
+	ld1	{v0.16b},[x0],x8
+
+	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
+	sub	w5,w5,#6
+	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
+	sub	w5,w5,#2
+	ld1	{v18.4s,v19.4s},[x7],#32
+	ld1	{v20.4s,v21.4s},[x7],#32
+	ld1	{v22.4s,v23.4s},[x7],#32
+	ld1	{v7.4s},[x7]
+
+	add	x7,x3,#32
+	mov	w6,w5
+	b.eq	Lcbc_dec
+
+	cmp	w5,#2
+	eor	v0.16b,v0.16b,v6.16b
+	eor	v5.16b,v16.16b,v7.16b
+	b.eq	Lcbc_enc128
+
+	ld1	{v2.4s,v3.4s},[x7]
+	add	x7,x3,#16
+	add	x6,x3,#16*4
+	add	x12,x3,#16*5
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	add	x14,x3,#16*6
+	add	x3,x3,#16*7
+	b	Lenter_cbc_enc
+
+.align	4
+Loop_cbc_enc:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	st1	{v6.16b},[x1],#16
+Lenter_cbc_enc:
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v2.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.4s},[x6]
+	cmp	w5,#4
+	aese	v0.16b,v3.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v17.4s},[x12]
+	b.eq	Lcbc_enc192
+
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.4s},[x14]
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v17.4s},[x3]
+	nop
+
+Lcbc_enc192:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	subs	x2,x2,#16
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	csel	x8,xzr,x8,eq
+	aese	v0.16b,v18.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v19.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.16b},[x0],x8
+	aese	v0.16b,v20.16b
+	aesmc	v0.16b,v0.16b
+	eor	v16.16b,v16.16b,v5.16b
+	aese	v0.16b,v21.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
+	aese	v0.16b,v22.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v23.16b
+	eor	v6.16b,v0.16b,v7.16b
+	b.hs	Loop_cbc_enc
+
+	st1	{v6.16b},[x1],#16
+	b	Lcbc_done
+
+.align	5
+Lcbc_enc128:
+	ld1	{v2.4s,v3.4s},[x7]
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	b	Lenter_cbc_enc128
+Loop_cbc_enc128:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	st1	{v6.16b},[x1],#16
+Lenter_cbc_enc128:
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	subs	x2,x2,#16
+	aese	v0.16b,v2.16b
+	aesmc	v0.16b,v0.16b
+	csel	x8,xzr,x8,eq
+	aese	v0.16b,v3.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v18.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v19.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.16b},[x0],x8
+	aese	v0.16b,v20.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v21.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v22.16b
+	aesmc	v0.16b,v0.16b
+	eor	v16.16b,v16.16b,v5.16b
+	aese	v0.16b,v23.16b
+	eor	v6.16b,v0.16b,v7.16b
+	b.hs	Loop_cbc_enc128
+
+	st1	{v6.16b},[x1],#16
+	b	Lcbc_done
+.align	5
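+	// CBC decryption: decrypt up to three blocks per iteration. v7 holds
+	// the last round key, which is folded into the CBC chaining XOR below.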
+Lcbc_dec:
+	ld1	{v18.16b},[x0],#16
+	subs	x2,x2,#32		// bias
+	add	w6,w5,#2
+	orr	v3.16b,v0.16b,v0.16b
+	orr	v1.16b,v0.16b,v0.16b
+	orr	v19.16b,v18.16b,v18.16b
+	b.lo	Lcbc_dec_tail
+
+	orr	v1.16b,v18.16b,v18.16b
+	ld1	{v18.16b},[x0],#16
+	orr	v2.16b,v0.16b,v0.16b
+	orr	v3.16b,v1.16b,v1.16b
+	orr	v19.16b,v18.16b,v18.16b
+
+Loop3x_cbc_dec:
+	aesd	v0.16b,v16.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v16.4s},[x7],#16
+	subs	w6,w6,#2
+	aesd	v0.16b,v17.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v17.4s},[x7],#16
+	b.gt	Loop3x_cbc_dec
+
+	aesd	v0.16b,v16.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	eor	v4.16b,v6.16b,v7.16b
+	subs	x2,x2,#0x30
+	eor	v5.16b,v2.16b,v7.16b
+	csel	x6,x2,x6,lo			// x6 (w6) is zero at this point
+	aesd	v0.16b,v17.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	eor	v17.16b,v3.16b,v7.16b
+	add	x0,x0,x6		// x0 is adjusted in such a way that
+					// at exit from the loop v1.16b-v18.16b
+					// are loaded with the last "words"
+	orr	v6.16b,v19.16b,v19.16b
+	mov	x7,x3
+	aesd	v0.16b,v20.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v20.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v20.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v2.16b},[x0],#16
+	aesd	v0.16b,v21.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v21.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v21.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v3.16b},[x0],#16
+	aesd	v0.16b,v22.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v22.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v22.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v19.16b},[x0],#16
+	aesd	v0.16b,v23.16b
+	aesd	v1.16b,v23.16b
+	aesd	v18.16b,v23.16b
+	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
+	add	w6,w5,#2
+	eor	v4.16b,v4.16b,v0.16b
+	eor	v5.16b,v5.16b,v1.16b
+	eor	v18.16b,v18.16b,v17.16b
+	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
+	st1	{v4.16b},[x1],#16
+	orr	v0.16b,v2.16b,v2.16b
+	st1	{v5.16b},[x1],#16
+	orr	v1.16b,v3.16b,v3.16b
+	st1	{v18.16b},[x1],#16
+	orr	v18.16b,v19.16b,v19.16b
+	b.hs	Loop3x_cbc_dec
+
+	cmn	x2,#0x30
+	b.eq	Lcbc_done
+	nop
+
+Lcbc_dec_tail:
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v16.4s},[x7],#16
+	subs	w6,w6,#2
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v17.4s},[x7],#16
+	b.gt	Lcbc_dec_tail
+
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	aesd	v1.16b,v20.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v20.16b
+	aesimc	v18.16b,v18.16b
+	cmn	x2,#0x20
+	aesd	v1.16b,v21.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v21.16b
+	aesimc	v18.16b,v18.16b
+	eor	v5.16b,v6.16b,v7.16b
+	aesd	v1.16b,v22.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v22.16b
+	aesimc	v18.16b,v18.16b
+	eor	v17.16b,v3.16b,v7.16b
+	aesd	v1.16b,v23.16b
+	aesd	v18.16b,v23.16b
+	b.eq	Lcbc_dec_one
+	eor	v5.16b,v5.16b,v1.16b
+	eor	v17.16b,v17.16b,v18.16b
+	orr	v6.16b,v19.16b,v19.16b
+	st1	{v5.16b},[x1],#16
+	st1	{v17.16b},[x1],#16
+	b	Lcbc_done
+
+Lcbc_dec_one:
+	eor	v5.16b,v5.16b,v18.16b
+	orr	v6.16b,v19.16b,v19.16b
+	st1	{v5.16b},[x1],#16
+
+Lcbc_done:
+	st1	{v6.16b},[x4]
+Lcbc_abort:
+	ldr	x29,[sp],#16
+	ret
+
+.globl	aes_hw_ctr32_encrypt_blocks
+
+.def aes_hw_ctr32_encrypt_blocks
+   .type 32
+.endef
+.align	5
+aes_hw_ctr32_encrypt_blocks:
+	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	ldr	w5,[x3,#240]
+
+	ldr	w8, [x4, #12]
+	ld1	{v0.4s},[x4]
+
+	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
+	sub	w5,w5,#4
+	mov	x12,#16
+	cmp	x2,#2
+	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
+	sub	w5,w5,#2
+	ld1	{v20.4s,v21.4s},[x7],#32
+	ld1	{v22.4s,v23.4s},[x7],#32
+	ld1	{v7.4s},[x7]
+	add	x7,x3,#32
+	mov	w6,w5
+	csel	x12,xzr,x12,lo
+
+	// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
+	// affected by silicon errata #1742098 [0] and #1655431 [1],
+	// respectively, where the second instruction of an aese/aesmc
+	// instruction pair may execute twice if an interrupt is taken right
+	// after the first instruction consumes an input register of which a
+	// single 32-bit lane has been updated the last time it was modified.
+	//
+	// This function uses a counter in one 32-bit lane. The vmov lines
+	// could write to v1.16b and v18.16b directly, but that trips these bugs.
+	// We write to v6.16b and copy to the final register as a workaround.
+	//
+	// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
+	// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
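+	//
+	// For example, rather than updating the counter lane of v1 in place:
+	//	mov	v1.s[3],w10		// would leave v1 partially updated
+	// the code below stages the update in v6 and copies the whole register:
+	//	mov	v6.s[3],w10
+	//	orr	v1.16b,v6.16b,v6.16b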
+#ifndef __AARCH64EB__
+	rev	w8, w8
+#endif
+	add	w10, w8, #1
+	orr	v6.16b,v0.16b,v0.16b
+	rev	w10, w10
+	mov	v6.s[3],w10
+	add	w8, w8, #2
+	orr	v1.16b,v6.16b,v6.16b
+	b.ls	Lctr32_tail
+	rev	w12, w8
+	mov	v6.s[3],w12
+	sub	x2,x2,#3		// bias
+	orr	v18.16b,v6.16b,v6.16b
+	b	Loop3x_ctr32
+
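+	// Main loop: encrypt three counter blocks (v0, v1, v18) per iteration,
+	// interleaving their AES rounds to keep the pipeline full.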
+.align	4
+Loop3x_ctr32:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v16.16b
+	aesmc	v1.16b,v1.16b
+	aese	v18.16b,v16.16b
+	aesmc	v18.16b,v18.16b
+	ld1	{v16.4s},[x7],#16
+	subs	w6,w6,#2
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v17.16b
+	aesmc	v1.16b,v1.16b
+	aese	v18.16b,v17.16b
+	aesmc	v18.16b,v18.16b
+	ld1	{v17.4s},[x7],#16
+	b.gt	Loop3x_ctr32
+
+	aese	v0.16b,v16.16b
+	aesmc	v4.16b,v0.16b
+	aese	v1.16b,v16.16b
+	aesmc	v5.16b,v1.16b
+	ld1	{v2.16b},[x0],#16
+	add	w9,w8,#1
+	aese	v18.16b,v16.16b
+	aesmc	v18.16b,v18.16b
+	ld1	{v3.16b},[x0],#16
+	rev	w9,w9
+	aese	v4.16b,v17.16b
+	aesmc	v4.16b,v4.16b
+	aese	v5.16b,v17.16b
+	aesmc	v5.16b,v5.16b
+	ld1	{v19.16b},[x0],#16
+	mov	x7,x3
+	aese	v18.16b,v17.16b
+	aesmc	v17.16b,v18.16b
+	aese	v4.16b,v20.16b
+	aesmc	v4.16b,v4.16b
+	aese	v5.16b,v20.16b
+	aesmc	v5.16b,v5.16b
+	eor	v2.16b,v2.16b,v7.16b
+	add	w10,w8,#2
+	aese	v17.16b,v20.16b
+	aesmc	v17.16b,v17.16b
+	eor	v3.16b,v3.16b,v7.16b
+	add	w8,w8,#3
+	aese	v4.16b,v21.16b
+	aesmc	v4.16b,v4.16b
+	aese	v5.16b,v21.16b
+	aesmc	v5.16b,v5.16b
+	 // Note the logic to update v0.16b, v1.16b, and v18.16b is written to work
+	 // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
+	 // 32-bit mode. See the comment above.
+	eor	v19.16b,v19.16b,v7.16b
+	mov	v6.s[3], w9
+	aese	v17.16b,v21.16b
+	aesmc	v17.16b,v17.16b
+	orr	v0.16b,v6.16b,v6.16b
+	rev	w10,w10
+	aese	v4.16b,v22.16b
+	aesmc	v4.16b,v4.16b
+	mov	v6.s[3], w10
+	rev	w12,w8
+	aese	v5.16b,v22.16b
+	aesmc	v5.16b,v5.16b
+	orr	v1.16b,v6.16b,v6.16b
+	mov	v6.s[3], w12
+	aese	v17.16b,v22.16b
+	aesmc	v17.16b,v17.16b
+	orr	v18.16b,v6.16b,v6.16b
+	subs	x2,x2,#3
+	aese	v4.16b,v23.16b
+	aese	v5.16b,v23.16b
+	aese	v17.16b,v23.16b
+
+	eor	v2.16b,v2.16b,v4.16b
+	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
+	st1	{v2.16b},[x1],#16
+	eor	v3.16b,v3.16b,v5.16b
+	mov	w6,w5
+	st1	{v3.16b},[x1],#16
+	eor	v19.16b,v19.16b,v17.16b
+	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
+	st1	{v19.16b},[x1],#16
+	b.hs	Loop3x_ctr32
+
+	adds	x2,x2,#3
+	b.eq	Lctr32_done
+	cmp	x2,#1
+	mov	x12,#16
+	csel	x12,xzr,x12,eq
+
+Lctr32_tail:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v16.16b
+	aesmc	v1.16b,v1.16b
+	ld1	{v16.4s},[x7],#16
+	subs	w6,w6,#2
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v17.16b
+	aesmc	v1.16b,v1.16b
+	ld1	{v17.4s},[x7],#16
+	b.gt	Lctr32_tail
+
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v16.16b
+	aesmc	v1.16b,v1.16b
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v17.16b
+	aesmc	v1.16b,v1.16b
+	ld1	{v2.16b},[x0],x12
+	aese	v0.16b,v20.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v20.16b
+	aesmc	v1.16b,v1.16b
+	ld1	{v3.16b},[x0]
+	aese	v0.16b,v21.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v21.16b
+	aesmc	v1.16b,v1.16b
+	eor	v2.16b,v2.16b,v7.16b
+	aese	v0.16b,v22.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v22.16b
+	aesmc	v1.16b,v1.16b
+	eor	v3.16b,v3.16b,v7.16b
+	aese	v0.16b,v23.16b
+	aese	v1.16b,v23.16b
+
+	cmp	x2,#1
+	eor	v2.16b,v2.16b,v0.16b
+	eor	v3.16b,v3.16b,v1.16b
+	st1	{v2.16b},[x1],#16
+	b.eq	Lctr32_done
+	st1	{v3.16b},[x1]
+
+Lctr32_done:
+	ldr	x29,[sp],#16
+	ret
+
+#endif
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/bcm/aesv8-gcm-armv8-apple.S b/gen/bcm/aesv8-gcm-armv8-apple.S
new file mode 100644
index 0000000..13be797
--- /dev/null
+++ b/gen/bcm/aesv8-gcm-armv8-apple.S
@@ -0,0 +1,1555 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+#include <openssl/arm_arch.h>
+#if __ARM_MAX_ARCH__ >= 8
+
+
+.text
+.globl	_aes_gcm_enc_kernel
+.private_extern	_aes_gcm_enc_kernel
+
+.align	4
+_aes_gcm_enc_kernel:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29, x30, [sp, #-128]!
+	mov	x29, sp
+	stp	x19, x20, [sp, #16]
+	mov	x16, x4
+	mov	x8, x5
+	stp	x21, x22, [sp, #32]
+	stp	x23, x24, [sp, #48]
+	stp	d8, d9, [sp, #64]
+	stp	d10, d11, [sp, #80]
+	stp	d12, d13, [sp, #96]
+	stp	d14, d15, [sp, #112]
+	ldr	w17, [x8, #240]                               // number of AES rounds (10/12/14), stored at byte offset 240 of the key schedule
+	add	x19, x8, x17, lsl #4                   // borrow input_l1 for last key
+	ldp	x13, x14, [x19]                       // load round N keys
+	ldr	q31, [x19, #-16]                        // load round N-1 keys
+	add	x4, x0, x1, lsr #3   // end_input_ptr
+	lsr	x5, x1, #3              // byte_len
+	mov	x15, x5
+	ldp	x10, x11, [x16]              // ctr96_b64, ctr96_t32
+	ld1	{ v0.16b}, [x16]                             // special-case vector load of the initial counter so we can start the first AES block as quickly as possible
+	sub	x5, x5, #1      // byte_len - 1
+	ldr	q18, [x8, #0]                                  // load rk0
+	and	x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+	ldr	q25, [x8, #112]                                // load rk7
+	add	x5, x5, x0
+	lsr	x12, x11, #32
+	fmov	d2, x10                               // CTR block 2
+	orr	w11, w11, w11
+	rev	w12, w12                                // rev_ctr32
+	fmov	d1, x10                               // CTR block 1
+	aese	v0.16b, v18.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 0
+	add	w12, w12, #1                            // increment rev_ctr32
+	rev	w9, w12                                 // CTR block 1
+	fmov	d3, x10                               // CTR block 3
+	orr	x9, x11, x9, lsl #32            // CTR block 1
+	add	w12, w12, #1                            // CTR block 1
+	ldr	q19, [x8, #16]                                 // load rk1
+	fmov	v1.d[1], x9                               // CTR block 1
+	rev	w9, w12                                 // CTR block 2
+	add	w12, w12, #1                            // CTR block 2
+	orr	x9, x11, x9, lsl #32            // CTR block 2
+	ldr	q20, [x8, #32]                                 // load rk2
+	fmov	v2.d[1], x9                               // CTR block 2
+	rev	w9, w12                                 // CTR block 3
+	aese	v0.16b, v19.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 1
+	orr	x9, x11, x9, lsl #32            // CTR block 3
+	fmov	v3.d[1], x9                               // CTR block 3
+	aese	v1.16b, v18.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 0
+	ldr	q21, [x8, #48]                                 // load rk3
+	aese	v0.16b, v20.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 2
+	ldr	q24, [x8, #96]                                 // load rk6
+	aese	v2.16b, v18.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 0
+	ldr	q23, [x8, #80]                                 // load rk5
+	aese	v1.16b, v19.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 1
+	ldr	q14, [x6, #48]                              // load h3l | h3h
+	ext	v14.16b, v14.16b, v14.16b, #8
+	aese	v3.16b, v18.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 0
+	aese	v2.16b, v19.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 1
+	ldr	q22, [x8, #64]                                 // load rk4
+	aese	v1.16b, v20.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 2
+	ldr	q13, [x6, #32]                              // load h2l | h2h
+	ext	v13.16b, v13.16b, v13.16b, #8
+	aese	v3.16b, v19.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 1
+	ldr	q30, [x8, #192]                               // load rk12
+	aese	v2.16b, v20.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 2
+	ldr	q15, [x6, #80]                              // load h4l | h4h
+	ext	v15.16b, v15.16b, v15.16b, #8
+	aese	v1.16b, v21.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 3
+	ldr	q29, [x8, #176]                               // load rk11
+	aese	v3.16b, v20.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 2
+	ldr	q26, [x8, #128]                                // load rk8
+	aese	v2.16b, v21.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 3
+	add	w12, w12, #1                            // CTR block 3
+	aese	v0.16b, v21.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 3
+	aese	v3.16b, v21.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 3
+	ld1	{ v11.16b}, [x3]
+	ext	v11.16b, v11.16b, v11.16b, #8
+	rev64	v11.16b, v11.16b
+	aese	v2.16b, v22.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 4
+	aese	v0.16b, v22.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 4
+	aese	v1.16b, v22.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 4
+	aese	v3.16b, v22.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 4
+	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
+	aese	v0.16b, v23.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 5
+	aese	v1.16b, v23.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 5
+	aese	v3.16b, v23.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 5
+	aese	v2.16b, v23.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 5
+	aese	v1.16b, v24.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 6
+	trn2	v17.2d,  v14.2d,    v15.2d                      // h4l | h3l
+	aese	v3.16b, v24.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 6
+	ldr	q27, [x8, #144]                                // load rk9
+	aese	v0.16b, v24.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 6
+	ldr	q12, [x6]                                   // load h1l | h1h
+	ext	v12.16b, v12.16b, v12.16b, #8
+	aese	v2.16b, v24.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 6
+	ldr	q28, [x8, #160]                               // load rk10
+	aese	v1.16b, v25.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 7
+	trn1	v9.2d, v14.2d,    v15.2d                      // h4h | h3h
+	aese	v0.16b, v25.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 7
+	aese	v2.16b, v25.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 7
+	aese	v3.16b, v25.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 7
+	trn2	v16.2d,  v12.2d,    v13.2d                      // h2l | h1l
+	aese	v1.16b, v26.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 8
+	aese	v2.16b, v26.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 8
+	aese	v3.16b, v26.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 8
+	aese	v0.16b, v26.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 8
+	b.lt	Lenc_finish_first_blocks                         // branch if AES-128
+
+	aese	v1.16b, v27.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 9
+	aese	v2.16b, v27.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 9
+	aese	v3.16b, v27.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 9
+	aese	v0.16b, v27.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 9
+	aese	v1.16b, v28.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 10
+	aese	v2.16b, v28.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 10
+	aese	v3.16b, v28.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 10
+	aese	v0.16b, v28.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 10
+	b.eq	Lenc_finish_first_blocks                         // branch if AES-192
+
+	aese	v1.16b, v29.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 11
+	aese	v2.16b, v29.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 11
+	aese	v0.16b, v29.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 11
+	aese	v3.16b, v29.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 11
+	aese	v1.16b, v30.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 12
+	aese	v2.16b, v30.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 12
+	aese	v0.16b, v30.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 12
+	aese	v3.16b, v30.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 12
+
+Lenc_finish_first_blocks:
+	cmp	x0, x5                   // check if we have <= 4 blocks
+	eor	v17.16b, v17.16b, v9.16b                  // h4k | h3k
+	aese	v2.16b, v31.16b                                    // AES block 2 - round N-1
+	trn1	v8.2d,    v12.2d,    v13.2d                      // h2h | h1h
+	aese	v1.16b, v31.16b                                    // AES block 1 - round N-1
+	aese	v0.16b, v31.16b                                    // AES block 0 - round N-1
+	aese	v3.16b, v31.16b                                    // AES block 3 - round N-1
+	eor	v16.16b, v16.16b, v8.16b                     // h2k | h1k
+	b.ge	Lenc_tail                                        // handle tail
+
+	ldp	x19, x20, [x0, #16]           // AES block 1 - load plaintext
+	rev	w9, w12                                 // CTR block 4
+	ldp	x6, x7, [x0, #0]            // AES block 0 - load plaintext
+	ldp	x23, x24, [x0, #48]           // AES block 3 - load plaintext
+	ldp	x21, x22, [x0, #32]           // AES block 2 - load plaintext
+	add	x0, x0, #64                       // AES input_ptr update
+	eor	x19, x19, x13                      // AES block 1 - round N low
+	eor	x20, x20, x14                      // AES block 1 - round N high
+	fmov	d5, x19                               // AES block 1 - mov low
+	eor	x6, x6, x13                      // AES block 0 - round N low
+	eor	x7, x7, x14                      // AES block 0 - round N high
+	eor	x24, x24, x14                      // AES block 3 - round N high
+	fmov	d4, x6                               // AES block 0 - mov low
+	cmp	x0, x5                   // check if we have <= 8 blocks
+	fmov	v4.d[1], x7                           // AES block 0 - mov high
+	eor	x23, x23, x13                      // AES block 3 - round N low
+	eor	x21, x21, x13                      // AES block 2 - round N low
+	fmov	v5.d[1], x20                           // AES block 1 - mov high
+	fmov	d6, x21                               // AES block 2 - mov low
+	add	w12, w12, #1                            // CTR block 4
+	orr	x9, x11, x9, lsl #32            // CTR block 4
+	fmov	d7, x23                               // AES block 3 - mov low
+	eor	x22, x22, x14                      // AES block 2 - round N high
+	fmov	v6.d[1], x22                           // AES block 2 - mov high
+	eor	v4.16b, v4.16b, v0.16b                          // AES block 0 - result
+	fmov	d0, x10                               // CTR block 4
+	fmov	v0.d[1], x9                               // CTR block 4
+	rev	w9, w12                                 // CTR block 5
+	add	w12, w12, #1                            // CTR block 5
+	eor	v5.16b, v5.16b, v1.16b                          // AES block 1 - result
+	fmov	d1, x10                               // CTR block 5
+	orr	x9, x11, x9, lsl #32            // CTR block 5
+	fmov	v1.d[1], x9                               // CTR block 5
+	rev	w9, w12                                 // CTR block 6
+	st1	{ v4.16b}, [x2], #16                     // AES block 0 - store result
+	fmov	v7.d[1], x24                           // AES block 3 - mov high
+	orr	x9, x11, x9, lsl #32            // CTR block 6
+	eor	v6.16b, v6.16b, v2.16b                          // AES block 2 - result
+	st1	{ v5.16b}, [x2], #16                     // AES block 1 - store result
+	add	w12, w12, #1                            // CTR block 6
+	fmov	d2, x10                               // CTR block 6
+	fmov	v2.d[1], x9                               // CTR block 6
+	st1	{ v6.16b}, [x2], #16                     // AES block 2 - store result
+	rev	w9, w12                                 // CTR block 7
+	orr	x9, x11, x9, lsl #32            // CTR block 7
+	eor	v7.16b, v7.16b, v3.16b                          // AES block 3 - result
+	st1	{ v7.16b}, [x2], #16                     // AES block 3 - store result
+	b.ge	Lenc_prepretail                                  // do prepretail
+
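+	// Each iteration encrypts four counter blocks while GHASHing the four
+	// ciphertext blocks produced by the previous iteration.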
+Lenc_main_loop:	//	main loop start
+	aese	v0.16b, v18.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
+	rev64	v4.16b, v4.16b                                    // GHASH block 4k (only t0 is free)
+	aese	v1.16b, v18.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
+	fmov	d3, x10                               // CTR block 4k+3
+	aese	v2.16b, v18.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
+	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
+	aese	v0.16b, v19.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
+	fmov	v3.d[1], x9                               // CTR block 4k+3
+	aese	v1.16b, v19.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
+	ldp	x23, x24, [x0, #48]           // AES block 4k+7 - load plaintext
+	aese	v2.16b, v19.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
+	ldp	x21, x22, [x0, #32]           // AES block 4k+6 - load plaintext
+	aese	v0.16b, v20.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
+	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
+	aese	v1.16b, v20.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
+	aese	v3.16b, v18.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
+	eor	x23, x23, x13                      // AES block 4k+7 - round N low
+	aese	v0.16b, v21.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
+	mov	d10, v17.d[1]                               // GHASH block 4k - mid
+	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
+	eor	x22, x22, x14                      // AES block 4k+6 - round N high
+	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
+	aese	v3.16b, v19.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
+	rev64	v5.16b, v5.16b                                    // GHASH block 4k+1 (t0 and t1 free)
+	aese	v0.16b, v22.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
+	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
+	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
+	aese	v2.16b, v20.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
+	aese	v0.16b, v23.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
+	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3 (t0, t1, t2 and t3 free)
+	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
+	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
+	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2 (t0, t1, and t2 free)
+	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
+	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
+	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
+	aese	v1.16b, v21.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
+	aese	v3.16b, v20.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
+	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
+	aese	v2.16b, v21.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
+	aese	v1.16b, v22.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
+	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
+	aese	v3.16b, v21.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
+	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
+	aese	v2.16b, v22.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
+	aese	v0.16b, v24.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
+	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
+	aese	v3.16b, v22.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
+	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
+	aese	v0.16b, v25.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
+	aese	v3.16b, v23.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
+	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
+	aese	v1.16b, v23.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
+	aese	v0.16b, v26.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
+	aese	v2.16b, v23.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
+	aese	v1.16b, v24.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
+	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
+	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
+	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
+	aese	v1.16b, v25.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
+	pmull	v6.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
+	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
+	aese	v3.16b, v24.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
+	ldp	x19, x20, [x0, #16]           // AES block 4k+5 - load plaintext
+	aese	v1.16b, v26.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
+	mov	d4, v7.d[1]                                  // GHASH block 4k+3 - mid
+	aese	v2.16b, v24.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
+	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
+	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
+	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
+	eor	v4.8b, v4.8b, v7.8b                          // GHASH block 4k+3 - mid
+	aese	v2.16b, v25.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
+	eor	x19, x19, x13                      // AES block 4k+5 - round N low
+	aese	v2.16b, v26.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
+	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
+	aese	v3.16b, v25.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
+	eor	x21, x21, x13                      // AES block 4k+6 - round N low
+	aese	v3.16b, v26.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
+	movi	v8.8b, #0xc2
+	pmull	v4.1q, v4.1d, v16.1d                          // GHASH block 4k+3 - mid
+	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
+	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
+	fmov	d5, x19                               // AES block 4k+5 - mov low
+	ldp	x6, x7, [x0, #0]            // AES block 4k+4 - load plaintext
+	b.lt	Lenc_main_loop_continue                          // branch if AES-128
+
+	aese	v1.16b, v27.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
+	aese	v0.16b, v27.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
+	aese	v2.16b, v27.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
+	aese	v3.16b, v27.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
+	aese	v0.16b, v28.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
+	aese	v1.16b, v28.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
+	aese	v2.16b, v28.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
+	aese	v3.16b, v28.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
+	b.eq	Lenc_main_loop_continue                          // branch if AES-192
+
+	aese	v0.16b, v29.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
+	aese	v1.16b, v29.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
+	aese	v2.16b, v29.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
+	aese	v3.16b, v29.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
+	aese	v1.16b, v30.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
+	aese	v0.16b, v30.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
+	aese	v2.16b, v30.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12
+	aese	v3.16b, v30.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12
+
+Lenc_main_loop_continue:
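+	// MODULO: reduce the 256-bit Karatsuba product (v9:v10:v11) back into
+	// GF(2^128). The shl below forms the reduction constant 0xc2<<56, a
+	// bit-reflected encoding of the field polynomial x^128 + x^7 + x^2 + x + 1.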
+	shl	d8, d8, #56               // mod_constant
+	eor	v11.16b, v11.16b, v6.16b                         // GHASH block 4k+3 - low
+	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+3 - mid
+	add	w12, w12, #1                            // CTR block 4k+3
+	eor	v4.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
+	add	x0, x0, #64                       // AES input_ptr update
+	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
+	rev	w9, w12                                 // CTR block 4k+8
+	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
+	eor	x6, x6, x13                      // AES block 4k+4 - round N low
+	eor	v10.16b, v10.16b, v4.16b                         // MODULO - karatsuba tidy up
+	eor	x7, x7, x14                      // AES block 4k+4 - round N high
+	fmov	d4, x6                               // AES block 4k+4 - mov low
+	orr	x9, x11, x9, lsl #32            // CTR block 4k+8
+	eor	v7.16b, v9.16b, v7.16b                   // MODULO - fold into mid
+	eor	x20, x20, x14                      // AES block 4k+5 - round N high
+	eor	x24, x24, x14                      // AES block 4k+7 - round N high
+	add	w12, w12, #1                            // CTR block 4k+8
+	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
+	fmov	v4.d[1], x7                           // AES block 4k+4 - mov high
+	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
+	fmov	d7, x23                               // AES block 4k+7 - mov low
+	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
+	fmov	v5.d[1], x20                           // AES block 4k+5 - mov high
+	fmov	d6, x21                               // AES block 4k+6 - mov low
+	cmp	x0, x5                   // LOOP CONTROL
+	fmov	v6.d[1], x22                           // AES block 4k+6 - mov high
+	pmull	v9.1q, v10.1d, v8.1d            // MODULO - mid 64b align with low
+	eor	v4.16b, v4.16b, v0.16b                          // AES block 4k+4 - result
+	fmov	d0, x10                               // CTR block 4k+8
+	fmov	v0.d[1], x9                               // CTR block 4k+8
+	rev	w9, w12                                 // CTR block 4k+9
+	add	w12, w12, #1                            // CTR block 4k+9
+	eor	v5.16b, v5.16b, v1.16b                          // AES block 4k+5 - result
+	fmov	d1, x10                               // CTR block 4k+9
+	orr	x9, x11, x9, lsl #32            // CTR block 4k+9
+	fmov	v1.d[1], x9                               // CTR block 4k+9
+	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
+	rev	w9, w12                                 // CTR block 4k+10
+	st1	{ v4.16b}, [x2], #16                     // AES block 4k+4 - store result
+	orr	x9, x11, x9, lsl #32            // CTR block 4k+10
+	eor	v11.16b, v11.16b, v9.16b                         // MODULO - fold into low
+	fmov	v7.d[1], x24                           // AES block 4k+7 - mov high
+	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
+	st1	{ v5.16b}, [x2], #16                     // AES block 4k+5 - store result
+	add	w12, w12, #1                            // CTR block 4k+10
+	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
+	eor	v6.16b, v6.16b, v2.16b                          // AES block 4k+6 - result
+	fmov	d2, x10                               // CTR block 4k+10
+	st1	{ v6.16b}, [x2], #16                     // AES block 4k+6 - store result
+	fmov	v2.d[1], x9                               // CTR block 4k+10
+	rev	w9, w12                                 // CTR block 4k+11
+	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low
+	orr	x9, x11, x9, lsl #32            // CTR block 4k+11
+	eor	v7.16b, v7.16b, v3.16b                          // AES block 4k+7 - result
+	st1	{ v7.16b}, [x2], #16                     // AES block 4k+7 - store result
+	b.lt	Lenc_main_loop
+
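+	// Prepretail: GHASH the last four ciphertext blocks while running the
+	// AES rounds for the final four counter blocks; no output is stored yet.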
+Lenc_prepretail:	//	PREPRETAIL
+	aese	v1.16b, v18.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
+	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2 (t0, t1, and t2 free)
+	aese	v2.16b, v18.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
+	fmov	d3, x10                               // CTR block 4k+3
+	aese	v0.16b, v18.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
+	rev64	v4.16b, v4.16b                                    // GHASH block 4k (only t0 is free)
+	fmov	v3.d[1], x9                               // CTR block 4k+3
+	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
+	aese	v2.16b, v19.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
+	aese	v0.16b, v19.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
+	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
+	rev64	v5.16b, v5.16b                                    // GHASH block 4k+1 (t0 and t1 free)
+	aese	v2.16b, v20.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
+	aese	v3.16b, v18.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
+	mov	d10, v17.d[1]                               // GHASH block 4k - mid
+	aese	v1.16b, v19.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
+	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
+	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
+	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
+	aese	v2.16b, v21.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
+	aese	v1.16b, v20.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
+	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
+	aese	v0.16b, v20.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
+	aese	v3.16b, v19.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
+	aese	v1.16b, v21.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
+	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
+	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
+	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
+	aese	v3.16b, v20.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
+	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
+	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
+	aese	v0.16b, v21.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
+	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
+	aese	v3.16b, v21.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
+	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
+	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
+	aese	v0.16b, v22.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
+	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3 (t0, t1, t2 and t3 free)
+	aese	v3.16b, v22.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
+	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
+	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
+	add	w12, w12, #1                            // CTR block 4k+3
+	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
+	aese	v3.16b, v23.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
+	aese	v2.16b, v22.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
+	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
+	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
+	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
+	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
+	aese	v2.16b, v23.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
+	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
+	mov	d4, v7.d[1]                                  // GHASH block 4k+3 - mid
+	aese	v1.16b, v22.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
+	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
+	eor	v4.8b, v4.8b, v7.8b                          // GHASH block 4k+3 - mid
+	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
+	aese	v1.16b, v23.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
+	pmull	v4.1q, v4.1d, v16.1d                          // GHASH block 4k+3 - mid
+	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
+	aese	v0.16b, v23.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
+	aese	v1.16b, v24.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
+	aese	v2.16b, v24.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
+	aese	v0.16b, v24.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
+	movi	v8.8b, #0xc2
+	aese	v3.16b, v24.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
+	aese	v1.16b, v25.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
+	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
+	aese	v0.16b, v25.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
+	aese	v3.16b, v25.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
+	shl	d8, d8, #56               // mod_constant
+	aese	v1.16b, v26.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
+	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+3 - mid
+	pmull	v6.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
+	aese	v3.16b, v26.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
+	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
+	aese	v0.16b, v26.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
+	eor	v11.16b, v11.16b, v6.16b                         // GHASH block 4k+3 - low
+	aese	v2.16b, v25.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
+	eor	v10.16b, v10.16b, v9.16b                         // karatsuba tidy up
+	aese	v2.16b, v26.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
+	pmull	v4.1q, v9.1d, v8.1d
+	ext	v9.16b, v9.16b, v9.16b, #8
+	eor	v10.16b, v10.16b, v11.16b
+	b.lt	Lenc_finish_prepretail                           // branch if AES-128
+
+	aese	v1.16b, v27.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
+	aese	v3.16b, v27.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
+	aese	v0.16b, v27.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
+	aese	v2.16b, v27.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
+	aese	v3.16b, v28.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
+	aese	v1.16b, v28.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
+	aese	v0.16b, v28.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
+	aese	v2.16b, v28.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
+	b.eq	Lenc_finish_prepretail                           // branch if AES-192
+
+	aese	v1.16b, v29.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
+	aese	v0.16b, v29.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
+	aese	v3.16b, v29.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
+	aese	v2.16b, v29.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
+	aese	v1.16b, v30.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
+	aese	v0.16b, v30.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
+	aese	v3.16b, v30.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12
+	aese	v2.16b, v30.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12
+
+Lenc_finish_prepretail:
+	eor	v10.16b, v10.16b, v4.16b
+	eor	v10.16b, v10.16b, v9.16b
+	pmull	v4.1q, v10.1d, v8.1d
+	ext	v10.16b, v10.16b, v10.16b, #8
+	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
+	eor	v11.16b, v11.16b, v4.16b
+	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
+	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
+	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
+	eor	v11.16b, v11.16b, v10.16b
+
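+	// Tail: handle the remaining one to four blocks, GHASHing each one and
+	// feeding the accumulated partial tag (v8) forward between blocks.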
+Lenc_tail:	//	TAIL
+	ext	v8.16b, v11.16b, v11.16b, #8                     // prepare final partial tag
+	sub	x5, x4, x0   // main_end_input_ptr is number of bytes left to process
+	ldp	x6, x7, [x0], #16           // AES block 4k+4 - load plaintext
+	eor	x6, x6, x13                      // AES block 4k+4 - round N low
+	eor	x7, x7, x14                      // AES block 4k+4 - round N high
+	cmp	x5, #48
+	fmov	d4, x6                               // AES block 4k+4 - mov low
+	fmov	v4.d[1], x7                           // AES block 4k+4 - mov high
+	eor	v5.16b, v4.16b, v0.16b                          // AES block 4k+4 - result
+	b.gt	Lenc_blocks_more_than_3
+	cmp	x5, #32
+	mov	v3.16b, v2.16b
+	movi	v11.8b, #0
+	movi	v9.8b, #0
+	sub	w12, w12, #1
+	mov	v2.16b, v1.16b
+	movi	v10.8b, #0
+	b.gt	Lenc_blocks_more_than_2
+	mov	v3.16b, v1.16b
+	sub	w12, w12, #1
+	cmp	x5, #16
+	b.gt	Lenc_blocks_more_than_1
+	sub	w12, w12, #1
+	b	Lenc_blocks_less_than_1
+Lenc_blocks_more_than_3:	//	blocks left >  3
+	st1	{ v5.16b}, [x2], #16                    // AES final-3 block  - store result
+	ldp	x6, x7, [x0], #16          // AES final-2 block - load input low & high
+	rev64	v4.16b, v5.16b                                   // GHASH final-3 block
+	eor	x6, x6, x13                     // AES final-2 block - round N low
+	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
+	eor	x7, x7, x14                     // AES final-2 block - round N high
+	mov	d22, v4.d[1]                                // GHASH final-3 block - mid
+	fmov	d5, x6                                // AES final-2 block - mov low
+	fmov	v5.d[1], x7                            // AES final-2 block - mov high
+	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-3 block - mid
+	movi	v8.8b, #0                                       // suppress further partial tag feed in
+	mov	d10, v17.d[1]                              // GHASH final-3 block - mid
+	pmull	v11.1q, v4.1d, v15.1d                      // GHASH final-3 block - low
+	pmull2	v9.1q, v4.2d, v15.2d                      // GHASH final-3 block - high
+	pmull	v10.1q, v22.1d, v10.1d                   // GHASH final-3 block - mid
+	eor	v5.16b, v5.16b, v1.16b                           // AES final-2 block - result
+Lenc_blocks_more_than_2:	//	blocks left >  2
+	st1	{ v5.16b}, [x2], #16                    // AES final-2 block - store result
+	ldp	x6, x7, [x0], #16          // AES final-1 block - load input low & high
+	rev64	v4.16b, v5.16b                                   // GHASH final-2 block
+	eor	x6, x6, x13                     // AES final-1 block - round N low
+	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
+	fmov	d5, x6                                // AES final-1 block - mov low
+	eor	x7, x7, x14                     // AES final-1 block - round N high
+	fmov	v5.d[1], x7                            // AES final-1 block - mov high
+	movi	v8.8b, #0                                       // suppress further partial tag feed in
+	pmull2	v20.1q, v4.2d, v14.2d                         // GHASH final-2 block - high
+	mov	d22, v4.d[1]                                // GHASH final-2 block - mid
+	pmull	v21.1q, v4.1d, v14.1d                         // GHASH final-2 block - low
+	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-2 block - mid
+	eor	v5.16b, v5.16b, v2.16b                           // AES final-1 block - result
+	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-2 block - high
+	pmull	v22.1q, v22.1d, v17.1d                     // GHASH final-2 block - mid
+	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-2 block - low
+	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-2 block - mid
+Lenc_blocks_more_than_1:	//	blocks left >  1
+	st1	{ v5.16b}, [x2], #16                    // AES final-1 block - store result
+	rev64	v4.16b, v5.16b                                   // GHASH final-1 block
+	ldp	x6, x7, [x0], #16          // AES final block - load input low & high
+	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
+	movi	v8.8b, #0                                       // suppress further partial tag feed in
+	eor	x6, x6, x13                     // AES final block - round N low
+	mov	d22, v4.d[1]                                // GHASH final-1 block - mid
+	pmull2	v20.1q, v4.2d, v13.2d                         // GHASH final-1 block - high
+	eor	x7, x7, x14                     // AES final block - round N high
+	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-1 block - mid
+	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-1 block - high
+	ins	v22.d[1], v22.d[0]                           // GHASH final-1 block - mid
+	fmov	d5, x6                                // AES final block - mov low
+	fmov	v5.d[1], x7                            // AES final block - mov high
+	pmull2	v22.1q, v22.2d, v16.2d                     // GHASH final-1 block - mid
+	pmull	v21.1q, v4.1d, v13.1d                         // GHASH final-1 block - low
+	eor	v5.16b, v5.16b, v3.16b                           // AES final block - result
+	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-1 block - mid
+	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-1 block - low
+Lenc_blocks_less_than_1:	//	blocks left <= 1
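+	// Build a bit-length mask for the final, possibly partial block: zero
+	// the unused tail of the ciphertext and merge the existing output bytes
+	// back in (bif) before storing all 16 bytes.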
+	and	x1, x1, #127                   // bit_length %= 128
+	mvn	x13, xzr                                      // rkN_l = 0xffffffffffffffff
+	sub	x1, x1, #128                   // bit_length -= 128
+	neg	x1, x1                         // bit_length = 128 - #bits in input (in range [1,128])
+	ld1	{ v18.16b}, [x2]                           // load existing bytes where the possibly partial last block is to be stored
+	mvn	x14, xzr                                      // rkN_h = 0xffffffffffffffff
+	and	x1, x1, #127                   // bit_length %= 128
+	lsr	x14, x14, x1                      // rkN_h is mask for top 64b of last block
+	cmp	x1, #64
+	csel	x6, x13, x14, lt
+	csel	x7, x14, xzr, lt
+	fmov	d0, x6                                // ctr0b is mask for last block
+	fmov	v0.d[1], x7
+	and	v5.16b, v5.16b, v0.16b                           // possibly partial last block has zeroes in highest bits
+	rev64	v4.16b, v5.16b                                   // GHASH final block
+	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
+	bif	v5.16b, v18.16b, v0.16b                             // insert existing bytes in top end of result before storing
+	pmull2	v20.1q, v4.2d, v12.2d                         // GHASH final block - high
+	mov	d8, v4.d[1]                                 // GHASH final block - mid
+	rev	w9, w12
+	pmull	v21.1q, v4.1d, v12.1d                         // GHASH final block - low
+	eor	v9.16b, v9.16b, v20.16b                           // GHASH final block - high
+	eor	v8.8b, v8.8b, v4.8b                         // GHASH final block - mid
+	pmull	v8.1q, v8.1d, v16.1d                         // GHASH final block - mid
+	eor	v11.16b, v11.16b, v21.16b                           // GHASH final block - low
+	eor	v10.16b, v10.16b, v8.16b                        // GHASH final block - mid
+	movi	v8.8b, #0xc2
+	eor	v4.16b, v11.16b, v9.16b                        // MODULO - karatsuba tidy up
+	shl	d8, d8, #56              // mod_constant
+	eor	v10.16b, v10.16b, v4.16b                        // MODULO - karatsuba tidy up
+	pmull	v7.1q, v9.1d, v8.1d           // MODULO - top 64b align with mid
+	ext	v9.16b, v9.16b, v9.16b, #8                    // MODULO - other top alignment
+	eor	v10.16b, v10.16b, v7.16b                     // MODULO - fold into mid
+	eor	v10.16b, v10.16b, v9.16b                        // MODULO - fold into mid
+	pmull	v9.1q, v10.1d, v8.1d           // MODULO - mid 64b align with low
+	ext	v10.16b, v10.16b, v10.16b, #8                    // MODULO - other mid alignment
+	str	w9, [x16, #12]                         // store the updated counter
+	st1	{ v5.16b}, [x2]                         // store all 16B
+	eor	v11.16b, v11.16b, v9.16b                        // MODULO - fold into low
+	eor	v11.16b, v11.16b, v10.16b                        // MODULO - fold into low
+	ext	v11.16b, v11.16b, v11.16b, #8
+	rev64	v11.16b, v11.16b
+	mov	x0, x15
+	st1	{ v11.16b }, [x3]
+	ldp	x19, x20, [sp, #16]
+	ldp	x21, x22, [sp, #32]
+	ldp	x23, x24, [sp, #48]
+	ldp	d8, d9, [sp, #64]
+	ldp	d10, d11, [sp, #80]
+	ldp	d12, d13, [sp, #96]
+	ldp	d14, d15, [sp, #112]
+	ldp	x29, x30, [sp], #128
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.globl	_aes_gcm_dec_kernel
+.private_extern	_aes_gcm_dec_kernel
+
+.align	4
+_aes_gcm_dec_kernel:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29, x30, [sp, #-128]!
+	mov	x29, sp
+	stp	x19, x20, [sp, #16]
+	mov	x16, x4
+	mov	x8, x5
+	stp	x21, x22, [sp, #32]
+	stp	x23, x24, [sp, #48]
+	stp	d8, d9, [sp, #64]
+	stp	d10, d11, [sp, #80]
+	stp	d12, d13, [sp, #96]
+	stp	d14, d15, [sp, #112]
+	ldr	w17, [x8, #240]
+	add	x19, x8, x17, lsl #4                   // borrow input_l1 for last key
+	ldp	x13, x14, [x19]                       // load round N keys
+	ldr	q31, [x19, #-16]                        // load round N-1 keys
+	lsr	x5, x1, #3              // byte_len
+	mov	x15, x5
+	ldp	x10, x11, [x16]              // ctr96_b64, ctr96_t32
+	ldr	q26, [x8, #128]                                // load rk8
+	sub	x5, x5, #1      // byte_len - 1
+	ldr	q25, [x8, #112]                                // load rk7
+	and	x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+	add	x4, x0, x1, lsr #3   // end_input_ptr
+	ldr	q24, [x8, #96]                                 // load rk6
+	lsr	x12, x11, #32
+	ldr	q23, [x8, #80]                                 // load rk5
+	orr	w11, w11, w11
+	ldr	q21, [x8, #48]                                 // load rk3
+	add	x5, x5, x0
+	rev	w12, w12                                // rev_ctr32
+	add	w12, w12, #1                            // increment rev_ctr32
+	fmov	d3, x10                               // CTR block 3
+	rev	w9, w12                                 // CTR block 1
+	add	w12, w12, #1                            // CTR block 1
+	fmov	d1, x10                               // CTR block 1
+	orr	x9, x11, x9, lsl #32            // CTR block 1
+	ld1	{ v0.16b}, [x16]                             // special case vector load initial counter so we can start first AES block as quickly as possible
+	fmov	v1.d[1], x9                               // CTR block 1
+	rev	w9, w12                                 // CTR block 2
+	add	w12, w12, #1                            // CTR block 2
+	fmov	d2, x10                               // CTR block 2
+	orr	x9, x11, x9, lsl #32            // CTR block 2
+	fmov	v2.d[1], x9                               // CTR block 2
+	rev	w9, w12                                 // CTR block 3
+	orr	x9, x11, x9, lsl #32            // CTR block 3
+	ldr	q18, [x8, #0]                                  // load rk0
+	fmov	v3.d[1], x9                               // CTR block 3
+	add	w12, w12, #1                            // CTR block 3
+	ldr	q22, [x8, #64]                                 // load rk4
+	ldr	q19, [x8, #16]                                 // load rk1
+	aese	v0.16b, v18.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 0
+	ldr	q14, [x6, #48]                              // load h3l | h3h
+	ext	v14.16b, v14.16b, v14.16b, #8
+	aese	v3.16b, v18.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 0
+	ldr	q15, [x6, #80]                              // load h4l | h4h
+	ext	v15.16b, v15.16b, v15.16b, #8
+	aese	v1.16b, v18.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 0
+	ldr	q13, [x6, #32]                              // load h2l | h2h
+	ext	v13.16b, v13.16b, v13.16b, #8
+	aese	v2.16b, v18.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 0
+	ldr	q20, [x8, #32]                                 // load rk2
+	aese	v0.16b, v19.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 1
+	aese	v1.16b, v19.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 1
+	ld1	{ v11.16b}, [x3]
+	ext	v11.16b, v11.16b, v11.16b, #8
+	rev64	v11.16b, v11.16b
+	aese	v2.16b, v19.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 1
+	ldr	q27, [x8, #144]                                // load rk9
+	aese	v3.16b, v19.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 1
+	ldr	q30, [x8, #192]                               // load rk12
+	aese	v0.16b, v20.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 2
+	ldr	q12, [x6]                                   // load h1l | h1h
+	ext	v12.16b, v12.16b, v12.16b, #8
+	aese	v2.16b, v20.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 2
+	ldr	q28, [x8, #160]                               // load rk10
+	aese	v3.16b, v20.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 2
+	aese	v0.16b, v21.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 3
+	aese	v1.16b, v20.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 2
+	aese	v3.16b, v21.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 3
+	aese	v0.16b, v22.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 4
+	aese	v2.16b, v21.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 3
+	aese	v1.16b, v21.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 3
+	aese	v3.16b, v22.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 4
+	aese	v2.16b, v22.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 4
+	aese	v1.16b, v22.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 4
+	aese	v3.16b, v23.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 5
+	aese	v0.16b, v23.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 5
+	aese	v1.16b, v23.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 5
+	aese	v2.16b, v23.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 5
+	aese	v0.16b, v24.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 6
+	aese	v3.16b, v24.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 6
+	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
+	aese	v1.16b, v24.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 6
+	aese	v2.16b, v24.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 6
+	aese	v0.16b, v25.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 7
+	aese	v1.16b, v25.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 7
+	aese	v3.16b, v25.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 7
+	aese	v0.16b, v26.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 8
+	aese	v2.16b, v25.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 7
+	aese	v3.16b, v26.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 8
+	aese	v1.16b, v26.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 8
+	ldr	q29, [x8, #176]                               // load rk11
+	aese	v2.16b, v26.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 8
+	b.lt	Ldec_finish_first_blocks                         // branch if AES-128
+
+	aese	v0.16b, v27.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 9
+	aese	v1.16b, v27.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 9
+	aese	v3.16b, v27.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 9
+	aese	v2.16b, v27.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 9
+	aese	v0.16b, v28.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 10
+	aese	v1.16b, v28.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 10
+	aese	v3.16b, v28.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 10
+	aese	v2.16b, v28.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 10
+	b.eq	Ldec_finish_first_blocks                         // branch if AES-192
+
+	aese	v0.16b, v29.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 11
+	aese	v3.16b, v29.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 11
+	aese	v1.16b, v29.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 11
+	aese	v2.16b, v29.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 11
+	aese	v1.16b, v30.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 12
+	aese	v0.16b, v30.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 12
+	aese	v2.16b, v30.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 12
+	aese	v3.16b, v30.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 12
+
+Ldec_finish_first_blocks:
+	cmp	x0, x5                   // check if we have <= 4 blocks
+	trn1	v9.2d, v14.2d,    v15.2d                      // h4h | h3h
+	trn2	v17.2d,  v14.2d,    v15.2d                      // h4l | h3l
+	trn1	v8.2d,    v12.2d,    v13.2d                      // h2h | h1h
+	trn2	v16.2d,  v12.2d,    v13.2d                      // h2l | h1l
+	eor	v17.16b, v17.16b, v9.16b                  // h4k | h3k
+	aese	v1.16b, v31.16b                                    // AES block 1 - round N-1
+	aese	v2.16b, v31.16b                                    // AES block 2 - round N-1
+	eor	v16.16b, v16.16b, v8.16b                     // h2k | h1k
+	aese	v3.16b, v31.16b                                    // AES block 3 - round N-1
+	aese	v0.16b, v31.16b                                    // AES block 0 - round N-1
+	b.ge	Ldec_tail                                        // handle tail
+
+	ldr	q4, [x0, #0]                          // AES block 0 - load ciphertext
+	ldr	q5, [x0, #16]                         // AES block 1 - load ciphertext
+	rev	w9, w12                                 // CTR block 4
+	eor	v0.16b, v4.16b, v0.16b                            // AES block 0 - result
+	eor	v1.16b, v5.16b, v1.16b                            // AES block 1 - result
+	rev64	v5.16b, v5.16b                                    // GHASH block 1
+	ldr	q7, [x0, #48]                         // AES block 3 - load ciphertext
+	mov	x7, v0.d[1]                            // AES block 0 - mov high
+	mov	x6, v0.d[0]                            // AES block 0 - mov low
+	rev64	v4.16b, v4.16b                                    // GHASH block 0
+	add	w12, w12, #1                            // CTR block 4
+	fmov	d0, x10                               // CTR block 4
+	orr	x9, x11, x9, lsl #32            // CTR block 4
+	fmov	v0.d[1], x9                               // CTR block 4
+	rev	w9, w12                                 // CTR block 5
+	add	w12, w12, #1                            // CTR block 5
+	mov	x19, v1.d[0]                            // AES block 1 - mov low
+	orr	x9, x11, x9, lsl #32            // CTR block 5
+	mov	x20, v1.d[1]                            // AES block 1 - mov high
+	eor	x7, x7, x14                    // AES block 0 - round N high
+	eor	x6, x6, x13                    // AES block 0 - round N low
+	stp	x6, x7, [x2], #16        // AES block 0 - store result
+	fmov	d1, x10                               // CTR block 5
+	ldr	q6, [x0, #32]                         // AES block 2 - load ciphertext
+	add	x0, x0, #64                       // AES input_ptr update
+	fmov	v1.d[1], x9                               // CTR block 5
+	rev	w9, w12                                 // CTR block 6
+	add	w12, w12, #1                            // CTR block 6
+	eor	x19, x19, x13                    // AES block 1 - round N low
+	orr	x9, x11, x9, lsl #32            // CTR block 6
+	eor	x20, x20, x14                    // AES block 1 - round N high
+	stp	x19, x20, [x2], #16        // AES block 1 - store result
+	eor	v2.16b, v6.16b, v2.16b                            // AES block 2 - result
+	cmp	x0, x5                   // check if we have <= 8 blocks
+	b.ge	Ldec_prepretail                                  // do prepretail
+
+Ldec_main_loop:	//	main loop start
+	mov	x21, v2.d[0]                            // AES block 4k+2 - mov low
+	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
+	eor	v3.16b, v7.16b, v3.16b                            // AES block 4k+3 - result
+	aese	v0.16b, v18.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
+	mov	x22, v2.d[1]                            // AES block 4k+2 - mov high
+	aese	v1.16b, v18.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
+	fmov	d2, x10                               // CTR block 4k+6
+	fmov	v2.d[1], x9                               // CTR block 4k+6
+	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
+	rev	w9, w12                                 // CTR block 4k+7
+	aese	v0.16b, v19.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
+	mov	x24, v3.d[1]                            // AES block 4k+3 - mov high
+	aese	v1.16b, v19.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
+	mov	x23, v3.d[0]                            // AES block 4k+3 - mov low
+	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
+	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
+	fmov	d3, x10                               // CTR block 4k+7
+	aese	v0.16b, v20.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
+	orr	x9, x11, x9, lsl #32            // CTR block 4k+7
+	aese	v2.16b, v18.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
+	fmov	v3.d[1], x9                               // CTR block 4k+7
+	aese	v1.16b, v20.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
+	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
+	aese	v0.16b, v21.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
+	eor	x22, x22, x14                    // AES block 4k+2 - round N high
+	aese	v2.16b, v19.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
+	mov	d10, v17.d[1]                               // GHASH block 4k - mid
+	aese	v1.16b, v21.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
+	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2
+	aese	v3.16b, v18.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
+	eor	x21, x21, x13                    // AES block 4k+2 - round N low
+	aese	v2.16b, v20.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
+	stp	x21, x22, [x2], #16        // AES block 4k+2 - store result
+	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
+	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
+	aese	v2.16b, v21.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
+	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3
+	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
+	eor	x23, x23, x13                    // AES block 4k+3 - round N low
+	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
+	eor	x24, x24, x14                    // AES block 4k+3 - round N high
+	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
+	aese	v2.16b, v22.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
+	aese	v3.16b, v19.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
+	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
+	aese	v0.16b, v22.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
+	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
+	aese	v2.16b, v23.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
+	add	w12, w12, #1                            // CTR block 4k+7
+	aese	v3.16b, v20.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
+	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
+	aese	v1.16b, v22.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
+	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
+	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
+	aese	v3.16b, v21.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
+	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
+	aese	v1.16b, v23.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
+	aese	v0.16b, v23.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
+	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
+	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
+	rev	w9, w12                                 // CTR block 4k+8
+	aese	v1.16b, v24.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
+	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
+	aese	v0.16b, v24.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
+	add	w12, w12, #1                            // CTR block 4k+8
+	aese	v3.16b, v22.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
+	aese	v1.16b, v25.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
+	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
+	aese	v0.16b, v25.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
+	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
+	mov	d6, v7.d[1]                                  // GHASH block 4k+3 - mid
+	aese	v3.16b, v23.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
+	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
+	aese	v0.16b, v26.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
+	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
+	aese	v3.16b, v24.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
+	pmull	v4.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
+	orr	x9, x11, x9, lsl #32            // CTR block 4k+8
+	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
+	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
+	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
+	eor	v6.8b, v6.8b, v7.8b                          // GHASH block 4k+3 - mid
+	aese	v1.16b, v26.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
+	aese	v2.16b, v24.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
+	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
+	pmull	v6.1q, v6.1d, v16.1d                          // GHASH block 4k+3 - mid
+	movi	v8.8b, #0xc2
+	aese	v2.16b, v25.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
+	eor	v11.16b, v11.16b, v4.16b                         // GHASH block 4k+3 - low
+	aese	v3.16b, v25.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
+	shl	d8, d8, #56               // mod_constant
+	aese	v2.16b, v26.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
+	eor	v10.16b, v10.16b, v6.16b                         // GHASH block 4k+3 - mid
+	aese	v3.16b, v26.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
+	b.lt	Ldec_main_loop_continue                          // branch if AES-128
+
+	aese	v0.16b, v27.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
+	aese	v2.16b, v27.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
+	aese	v1.16b, v27.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
+	aese	v3.16b, v27.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
+	aese	v0.16b, v28.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
+	aese	v1.16b, v28.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
+	aese	v2.16b, v28.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
+	aese	v3.16b, v28.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
+	b.eq	Ldec_main_loop_continue                          // branch if AES-192
+
+	aese	v0.16b, v29.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
+	aese	v1.16b, v29.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
+	aese	v2.16b, v29.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
+	aese	v3.16b, v29.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
+	aese	v0.16b, v30.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
+	aese	v1.16b, v30.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
+	aese	v2.16b, v30.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12
+	aese	v3.16b, v30.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12
+
+Ldec_main_loop_continue:
+	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
+	eor	v6.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
+	ldr	q4, [x0, #0]                          // AES block 4k+4 - load ciphertext
+	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
+	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
+	eor	v10.16b, v10.16b, v6.16b                         // MODULO - karatsuba tidy up
+	ldr	q5, [x0, #16]                         // AES block 4k+5 - load ciphertext
+	eor	v0.16b, v4.16b, v0.16b                            // AES block 4k+4 - result
+	stp	x23, x24, [x2], #16        // AES block 4k+3 - store result
+	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
+	ldr	q7, [x0, #48]                         // AES block 4k+7 - load ciphertext
+	ldr	q6, [x0, #32]                         // AES block 4k+6 - load ciphertext
+	mov	x7, v0.d[1]                            // AES block 4k+4 - mov high
+	eor	v10.16b, v10.16b, v9.16b                         // MODULO - fold into mid
+	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
+	add	x0, x0, #64                       // AES input_ptr update
+	mov	x6, v0.d[0]                            // AES block 4k+4 - mov low
+	fmov	d0, x10                               // CTR block 4k+8
+	fmov	v0.d[1], x9                               // CTR block 4k+8
+	pmull	v8.1q, v10.1d, v8.1d     // MODULO - mid 64b align with low
+	eor	v1.16b, v5.16b, v1.16b                            // AES block 4k+5 - result
+	rev	w9, w12                                 // CTR block 4k+9
+	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
+	orr	x9, x11, x9, lsl #32            // CTR block 4k+9
+	cmp	x0, x5                   // LOOP CONTROL
+	add	w12, w12, #1                            // CTR block 4k+9
+	eor	x6, x6, x13                    // AES block 4k+4 - round N low
+	eor	x7, x7, x14                    // AES block 4k+4 - round N high
+	mov	x20, v1.d[1]                            // AES block 4k+5 - mov high
+	eor	v2.16b, v6.16b, v2.16b                            // AES block 4k+6 - result
+	eor	v11.16b, v11.16b, v8.16b               // MODULO - fold into low
+	mov	x19, v1.d[0]                            // AES block 4k+5 - mov low
+	fmov	d1, x10                               // CTR block 4k+9
+	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
+	fmov	v1.d[1], x9                               // CTR block 4k+9
+	rev	w9, w12                                 // CTR block 4k+10
+	add	w12, w12, #1                            // CTR block 4k+10
+	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
+	orr	x9, x11, x9, lsl #32            // CTR block 4k+10
+	rev64	v5.16b, v5.16b                                    // GHASH block 4k+5
+	eor	x20, x20, x14                    // AES block 4k+5 - round N high
+	stp	x6, x7, [x2], #16        // AES block 4k+4 - store result
+	eor	x19, x19, x13                    // AES block 4k+5 - round N low
+	stp	x19, x20, [x2], #16        // AES block 4k+5 - store result
+	rev64	v4.16b, v4.16b                                    // GHASH block 4k+4
+	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low
+	b.lt	Ldec_main_loop
+
+Ldec_prepretail:	//	PREPRETAIL
+	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
+	mov	x21, v2.d[0]                            // AES block 4k+2 - mov low
+	eor	v3.16b, v7.16b, v3.16b                            // AES block 4k+3 - result
+	aese	v0.16b, v18.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
+	mov	x22, v2.d[1]                            // AES block 4k+2 - mov high
+	aese	v1.16b, v18.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
+	fmov	d2, x10                               // CTR block 4k+6
+	fmov	v2.d[1], x9                               // CTR block 4k+6
+	rev	w9, w12                                 // CTR block 4k+7
+	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
+	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2
+	orr	x9, x11, x9, lsl #32            // CTR block 4k+7
+	mov	x23, v3.d[0]                            // AES block 4k+3 - mov low
+	aese	v1.16b, v19.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
+	mov	x24, v3.d[1]                            // AES block 4k+3 - mov high
+	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
+	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
+	fmov	d3, x10                               // CTR block 4k+7
+	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
+	fmov	v3.d[1], x9                               // CTR block 4k+7
+	aese	v2.16b, v18.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
+	mov	d10, v17.d[1]                               // GHASH block 4k - mid
+	aese	v0.16b, v19.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
+	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
+	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
+	aese	v2.16b, v19.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
+	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3
+	aese	v3.16b, v18.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
+	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
+	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
+	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
+	aese	v3.16b, v19.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
+	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
+	aese	v0.16b, v20.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
+	aese	v1.16b, v20.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
+	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
+	aese	v2.16b, v20.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
+	aese	v0.16b, v21.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
+	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
+	aese	v3.16b, v20.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
+	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
+	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
+	aese	v0.16b, v22.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
+	aese	v3.16b, v21.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
+	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
+	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
+	aese	v0.16b, v23.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
+	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
+	aese	v3.16b, v22.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
+	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
+	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
+	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
+	aese	v3.16b, v23.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
+	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
+	aese	v2.16b, v21.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
+	aese	v1.16b, v21.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
+	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
+	pmull	v4.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
+	aese	v2.16b, v22.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
+	mov	d6, v7.d[1]                                  // GHASH block 4k+3 - mid
+	aese	v1.16b, v22.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
+	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
+	aese	v2.16b, v23.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
+	eor	v6.8b, v6.8b, v7.8b                          // GHASH block 4k+3 - mid
+	aese	v1.16b, v23.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
+	aese	v3.16b, v24.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
+	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
+	aese	v2.16b, v24.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
+	aese	v0.16b, v24.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
+	movi	v8.8b, #0xc2
+	aese	v1.16b, v24.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
+	eor	v11.16b, v11.16b, v4.16b                         // GHASH block 4k+3 - low
+	pmull	v6.1q, v6.1d, v16.1d                          // GHASH block 4k+3 - mid
+	aese	v3.16b, v25.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
+	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
+	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
+	aese	v1.16b, v25.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
+	aese	v0.16b, v25.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
+	eor	v10.16b, v10.16b, v6.16b                         // GHASH block 4k+3 - mid
+	aese	v3.16b, v26.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
+	aese	v2.16b, v25.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
+	eor	v6.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
+	aese	v1.16b, v26.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
+	aese	v0.16b, v26.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
+	shl	d8, d8, #56               // mod_constant
+	aese	v2.16b, v26.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
+	b.lt	Ldec_finish_prepretail                           // branch if AES-128
+
+	aese	v1.16b, v27.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
+	aese	v2.16b, v27.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
+	aese	v3.16b, v27.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
+	aese	v0.16b, v27.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
+	aese	v2.16b, v28.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
+	aese	v3.16b, v28.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
+	aese	v0.16b, v28.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
+	aese	v1.16b, v28.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
+	b.eq	Ldec_finish_prepretail                           // branch if AES-192
+
+	aese	v2.16b, v29.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
+	aese	v0.16b, v29.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
+	aese	v1.16b, v29.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
+	aese	v2.16b, v30.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12
+	aese	v3.16b, v29.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
+	aese	v1.16b, v30.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
+	aese	v0.16b, v30.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
+	aese	v3.16b, v30.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12
+
+Ldec_finish_prepretail:
+	eor	v10.16b, v10.16b, v6.16b                         // MODULO - karatsuba tidy up
+	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
+	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
+	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
+	eor	x22, x22, x14                    // AES block 4k+2 - round N high
+	eor	x23, x23, x13                    // AES block 4k+3 - round N low
+	eor	v10.16b, v10.16b, v9.16b                         // MODULO - fold into mid
+	add	w12, w12, #1                            // CTR block 4k+7
+	eor	x21, x21, x13                    // AES block 4k+2 - round N low
+	pmull	v8.1q, v10.1d, v8.1d     // MODULO - mid 64b align with low
+	eor	x24, x24, x14                    // AES block 4k+3 - round N high
+	stp	x21, x22, [x2], #16        // AES block 4k+2 - store result
+	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
+	stp	x23, x24, [x2], #16        // AES block 4k+3 - store result
+
+	eor	v11.16b, v11.16b, v8.16b               // MODULO - fold into low
+	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
+	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
+	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
+	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
+	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low
+
+Ldec_tail:	//	TAIL
+	sub	x5, x4, x0   // main_end_input_ptr is number of bytes left to process
+	ld1	{ v5.16b}, [x0], #16                      // AES block 4k+4 - load ciphertext
+	eor	v0.16b, v5.16b, v0.16b                            // AES block 4k+4 - result
+	mov	x6, v0.d[0]                            // AES block 4k+4 - mov low
+	mov	x7, v0.d[1]                            // AES block 4k+4 - mov high
+	ext	v8.16b, v11.16b, v11.16b, #8                     // prepare final partial tag
+	cmp	x5, #48
+	eor	x6, x6, x13                    // AES block 4k+4 - round N low
+	eor	x7, x7, x14                    // AES block 4k+4 - round N high
+	b.gt	Ldec_blocks_more_than_3
+	sub	w12, w12, #1
+	mov	v3.16b, v2.16b
+	movi	v10.8b, #0
+	movi	v11.8b, #0
+	cmp	x5, #32
+	movi	v9.8b, #0
+	mov	v2.16b, v1.16b
+	b.gt	Ldec_blocks_more_than_2
+	sub	w12, w12, #1
+	mov	v3.16b, v1.16b
+	cmp	x5, #16
+	b.gt	Ldec_blocks_more_than_1
+	sub	w12, w12, #1
+	b	Ldec_blocks_less_than_1
+Ldec_blocks_more_than_3:	//	blocks left >  3
+	rev64	v4.16b, v5.16b                                   // GHASH final-3 block
+	ld1	{ v5.16b}, [x0], #16                     // AES final-2 block - load ciphertext
+	stp	x6, x7, [x2], #16       // AES final-3 block  - store result
+	mov	d10, v17.d[1]                              // GHASH final-3 block - mid
+	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
+	eor	v0.16b, v5.16b, v1.16b                           // AES final-2 block - result
+	mov	d22, v4.d[1]                                // GHASH final-3 block - mid
+	mov	x6, v0.d[0]                           // AES final-2 block - mov low
+	mov	x7, v0.d[1]                           // AES final-2 block - mov high
+	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-3 block - mid
+	movi	v8.8b, #0                                       // suppress further partial tag feed in
+	pmull2	v9.1q, v4.2d, v15.2d                      // GHASH final-3 block - high
+	pmull	v10.1q, v22.1d, v10.1d                   // GHASH final-3 block - mid
+	eor	x6, x6, x13                   // AES final-2 block - round N low
+	pmull	v11.1q, v4.1d, v15.1d                      // GHASH final-3 block - low
+	eor	x7, x7, x14                   // AES final-2 block - round N high
+Ldec_blocks_more_than_2:	//	blocks left >  2
+	rev64	v4.16b, v5.16b                                   // GHASH final-2 block
+	ld1	{ v5.16b}, [x0], #16                     // AES final-1 block - load ciphertext
+	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
+	stp	x6, x7, [x2], #16       // AES final-2 block  - store result
+	eor	v0.16b, v5.16b, v2.16b                           // AES final-1 block - result
+	mov	d22, v4.d[1]                                // GHASH final-2 block - mid
+	pmull	v21.1q, v4.1d, v14.1d                         // GHASH final-2 block - low
+	pmull2	v20.1q, v4.2d, v14.2d                         // GHASH final-2 block - high
+	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-2 block - mid
+	mov	x6, v0.d[0]                           // AES final-1 block - mov low
+	mov	x7, v0.d[1]                           // AES final-1 block - mov high
+	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-2 block - low
+	movi	v8.8b, #0                                       // suppress further partial tag feed in
+	pmull	v22.1q, v22.1d, v17.1d                     // GHASH final-2 block - mid
+	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-2 block - high
+	eor	x6, x6, x13                   // AES final-1 block - round N low
+	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-2 block - mid
+	eor	x7, x7, x14                   // AES final-1 block - round N high
+Ldec_blocks_more_than_1:	//	blocks left >  1
+	stp	x6, x7, [x2], #16       // AES final-1 block  - store result
+	rev64	v4.16b, v5.16b                                   // GHASH final-1 block
+	ld1	{ v5.16b}, [x0], #16                     // AES final block - load ciphertext
+	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
+	movi	v8.8b, #0                                       // suppress further partial tag feed in
+	mov	d22, v4.d[1]                                // GHASH final-1 block - mid
+	eor	v0.16b, v5.16b, v3.16b                           // AES final block - result
+	pmull2	v20.1q, v4.2d, v13.2d                         // GHASH final-1 block - high
+	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-1 block - mid
+	pmull	v21.1q, v4.1d, v13.1d                         // GHASH final-1 block - low
+	mov	x6, v0.d[0]                           // AES final block - mov low
+	ins	v22.d[1], v22.d[0]                           // GHASH final-1 block - mid
+	mov	x7, v0.d[1]                           // AES final block - mov high
+	pmull2	v22.1q, v22.2d, v16.2d                     // GHASH final-1 block - mid
+	eor	x6, x6, x13                   // AES final block - round N low
+	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-1 block - low
+	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-1 block - high
+	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-1 block - mid
+	eor	x7, x7, x14                   // AES final block - round N high
+Ldec_blocks_less_than_1:	//	blocks left <= 1
+	and	x1, x1, #127                   // bit_length %= 128
+	mvn	x14, xzr                                      // rkN_h = 0xffffffffffffffff
+	sub	x1, x1, #128                   // bit_length -= 128
+	mvn	x13, xzr                                      // rkN_l = 0xffffffffffffffff
+	ldp	x4, x5, [x2] // load existing bytes we need to not overwrite
+	neg	x1, x1                         // bit_length = 128 - #bits in input (in range [1,128])
+	and	x1, x1, #127                   // bit_length %= 128
+	lsr	x14, x14, x1                      // rkN_h is mask for top 64b of last block
+	cmp	x1, #64
+	csel	x9, x13, x14, lt
+	csel	x10, x14, xzr, lt
+	fmov	d0, x9                                  // ctr0b is mask for last block
+	and	x6, x6, x9
+	mov	v0.d[1], x10
+	bic	x4, x4, x9          // mask out low existing bytes
+	rev	w9, w12
+	bic	x5, x5, x10      // mask out high existing bytes
+	orr	x6, x6, x4
+	and	x7, x7, x10
+	orr	x7, x7, x5
+	and	v5.16b, v5.16b, v0.16b                            // possibly partial last block has zeroes in highest bits
+	rev64	v4.16b, v5.16b                                    // GHASH final block
+	eor	v4.16b, v4.16b, v8.16b                           // feed in partial tag
+	pmull	v21.1q, v4.1d, v12.1d                          // GHASH final block - low
+	mov	d8, v4.d[1]                                  // GHASH final block - mid
+	eor	v8.8b, v8.8b, v4.8b                          // GHASH final block - mid
+	pmull2	v20.1q, v4.2d, v12.2d                          // GHASH final block - high
+	pmull	v8.1q, v8.1d, v16.1d                          // GHASH final block - mid
+	eor	v9.16b, v9.16b, v20.16b                            // GHASH final block - high
+	eor	v11.16b, v11.16b, v21.16b                            // GHASH final block - low
+	eor	v10.16b, v10.16b, v8.16b                         // GHASH final block - mid
+	movi	v8.8b, #0xc2
+	eor	v6.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
+	shl	d8, d8, #56               // mod_constant
+	eor	v10.16b, v10.16b, v6.16b                         // MODULO - karatsuba tidy up
+	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
+	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
+	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
+	eor	v10.16b, v10.16b, v9.16b                         // MODULO - fold into mid
+	pmull	v8.1q, v10.1d, v8.1d     // MODULO - mid 64b align with low
+	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
+	eor	v11.16b, v11.16b, v8.16b               // MODULO - fold into low
+	stp	x6, x7, [x2]
+	str	w9, [x16, #12]                          // store the updated counter
+	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low
+	ext	v11.16b, v11.16b, v11.16b, #8
+	rev64	v11.16b, v11.16b
+	mov	x0, x15
+	st1	{ v11.16b }, [x3]
+	ldp	x19, x20, [sp, #16]
+	ldp	x21, x22, [sp, #32]
+	ldp	x23, x24, [sp, #48]
+	ldp	d8, d9, [sp, #64]
+	ldp	d10, d11, [sp, #80]
+	ldp	d12, d13, [sp, #96]
+	ldp	d14, d15, [sp, #112]
+	ldp	x29, x30, [sp], #128
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+#endif
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/gen/bcm/aesv8-gcm-armv8-linux.S b/gen/bcm/aesv8-gcm-armv8-linux.S
new file mode 100644
index 0000000..4283f93
--- /dev/null
+++ b/gen/bcm/aesv8-gcm-armv8-linux.S
@@ -0,0 +1,1555 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+#if __ARM_MAX_ARCH__ >= 8
+
+.arch	armv8-a+crypto
+.text
+.globl	aes_gcm_enc_kernel
+.hidden	aes_gcm_enc_kernel
+.type	aes_gcm_enc_kernel,%function
+.align	4
+aes_gcm_enc_kernel:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29, x30, [sp, #-128]!
+	mov	x29, sp
+	stp	x19, x20, [sp, #16]
+	mov	x16, x4
+	mov	x8, x5
+	stp	x21, x22, [sp, #32]
+	stp	x23, x24, [sp, #48]
+	stp	d8, d9, [sp, #64]
+	stp	d10, d11, [sp, #80]
+	stp	d12, d13, [sp, #96]
+	stp	d14, d15, [sp, #112]
+	ldr	w17, [x8, #240]
+	add	x19, x8, x17, lsl #4                   // borrow input_l1 for last key
+	ldp	x13, x14, [x19]                       // load round N keys
+	ldr	q31, [x19, #-16]                        // load round N-1 keys
+	add	x4, x0, x1, lsr #3   // end_input_ptr
+	lsr	x5, x1, #3              // byte_len
+	mov	x15, x5
+	ldp	x10, x11, [x16]              // ctr96_b64, ctr96_t32
+	ld1	{ v0.16b}, [x16]                             // special case vector load initial counter so we can start first AES block as quickly as possible
+	sub	x5, x5, #1      // byte_len - 1
+	ldr	q18, [x8, #0]                                  // load rk0
+	and	x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+	ldr	q25, [x8, #112]                                // load rk7
+	add	x5, x5, x0
+	lsr	x12, x11, #32
+	fmov	d2, x10                               // CTR block 2
+	orr	w11, w11, w11
+	rev	w12, w12                                // rev_ctr32
+	fmov	d1, x10                               // CTR block 1
+	aese	v0.16b, v18.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 0
+	add	w12, w12, #1                            // increment rev_ctr32
+	rev	w9, w12                                 // CTR block 1
+	fmov	d3, x10                               // CTR block 3
+	orr	x9, x11, x9, lsl #32            // CTR block 1
+	add	w12, w12, #1                            // CTR block 1
+	ldr	q19, [x8, #16]                                 // load rk1
+	fmov	v1.d[1], x9                               // CTR block 1
+	rev	w9, w12                                 // CTR block 2
+	add	w12, w12, #1                            // CTR block 2
+	orr	x9, x11, x9, lsl #32            // CTR block 2
+	ldr	q20, [x8, #32]                                 // load rk2
+	fmov	v2.d[1], x9                               // CTR block 2
+	rev	w9, w12                                 // CTR block 3
+	aese	v0.16b, v19.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 1
+	orr	x9, x11, x9, lsl #32            // CTR block 3
+	fmov	v3.d[1], x9                               // CTR block 3
+	aese	v1.16b, v18.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 0
+	ldr	q21, [x8, #48]                                 // load rk3
+	aese	v0.16b, v20.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 2
+	ldr	q24, [x8, #96]                                 // load rk6
+	aese	v2.16b, v18.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 0
+	ldr	q23, [x8, #80]                                 // load rk5
+	aese	v1.16b, v19.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 1
+	ldr	q14, [x6, #48]                              // load h3l | h3h
+	ext	v14.16b, v14.16b, v14.16b, #8
+	aese	v3.16b, v18.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 0
+	aese	v2.16b, v19.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 1
+	ldr	q22, [x8, #64]                                 // load rk4
+	aese	v1.16b, v20.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 2
+	ldr	q13, [x6, #32]                              // load h2l | h2h
+	ext	v13.16b, v13.16b, v13.16b, #8
+	aese	v3.16b, v19.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 1
+	ldr	q30, [x8, #192]                               // load rk12
+	aese	v2.16b, v20.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 2
+	ldr	q15, [x6, #80]                              // load h4l | h4h
+	ext	v15.16b, v15.16b, v15.16b, #8
+	aese	v1.16b, v21.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 3
+	ldr	q29, [x8, #176]                               // load rk11
+	aese	v3.16b, v20.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 2
+	ldr	q26, [x8, #128]                                // load rk8
+	aese	v2.16b, v21.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 3
+	add	w12, w12, #1                            // CTR block 3
+	aese	v0.16b, v21.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 3
+	aese	v3.16b, v21.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 3
+	ld1	{ v11.16b}, [x3]
+	ext	v11.16b, v11.16b, v11.16b, #8
+	rev64	v11.16b, v11.16b
+	aese	v2.16b, v22.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 4
+	aese	v0.16b, v22.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 4
+	aese	v1.16b, v22.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 4
+	aese	v3.16b, v22.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 4
+	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
+	aese	v0.16b, v23.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 5
+	aese	v1.16b, v23.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 5
+	aese	v3.16b, v23.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 5
+	aese	v2.16b, v23.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 5
+	aese	v1.16b, v24.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 6
+	trn2	v17.2d,  v14.2d,    v15.2d                      // h4l | h3l
+	aese	v3.16b, v24.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 6
+	ldr	q27, [x8, #144]                                // load rk9
+	aese	v0.16b, v24.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 6
+	ldr	q12, [x6]                                   // load h1l | h1h
+	ext	v12.16b, v12.16b, v12.16b, #8
+	aese	v2.16b, v24.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 6
+	ldr	q28, [x8, #160]                               // load rk10
+	aese	v1.16b, v25.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 7
+	trn1	v9.2d, v14.2d,    v15.2d                      // h4h | h3h
+	aese	v0.16b, v25.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 7
+	aese	v2.16b, v25.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 7
+	aese	v3.16b, v25.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 7
+	trn2	v16.2d,  v12.2d,    v13.2d                      // h2l | h1l
+	aese	v1.16b, v26.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 8
+	aese	v2.16b, v26.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 8
+	aese	v3.16b, v26.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 8
+	aese	v0.16b, v26.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 8
+	b.lt	.Lenc_finish_first_blocks                         // branch if AES-128
+
+	aese	v1.16b, v27.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 9
+	aese	v2.16b, v27.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 9
+	aese	v3.16b, v27.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 9
+	aese	v0.16b, v27.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 9
+	aese	v1.16b, v28.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 10
+	aese	v2.16b, v28.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 10
+	aese	v3.16b, v28.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 10
+	aese	v0.16b, v28.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 10
+	b.eq	.Lenc_finish_first_blocks                         // branch if AES-192
+
+	aese	v1.16b, v29.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 11
+	aese	v2.16b, v29.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 11
+	aese	v0.16b, v29.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 11
+	aese	v3.16b, v29.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 11
+	aese	v1.16b, v30.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 12
+	aese	v2.16b, v30.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 12
+	aese	v0.16b, v30.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 12
+	aese	v3.16b, v30.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 12
+
+.Lenc_finish_first_blocks:
+	cmp	x0, x5                   // check if we have <= 4 blocks
+	eor	v17.16b, v17.16b, v9.16b                  // h4k | h3k
+	aese	v2.16b, v31.16b                                    // AES block 2 - round N-1
+	trn1	v8.2d,    v12.2d,    v13.2d                      // h2h | h1h
+	aese	v1.16b, v31.16b                                    // AES block 1 - round N-1
+	aese	v0.16b, v31.16b                                    // AES block 0 - round N-1
+	aese	v3.16b, v31.16b                                    // AES block 3 - round N-1
+	eor	v16.16b, v16.16b, v8.16b                     // h2k | h1k
+	b.ge	.Lenc_tail                                        // handle tail
+
+	ldp	x19, x20, [x0, #16]           // AES block 1 - load plaintext
+	rev	w9, w12                                 // CTR block 4
+	ldp	x6, x7, [x0, #0]            // AES block 0 - load plaintext
+	ldp	x23, x24, [x0, #48]           // AES block 3 - load plaintext
+	ldp	x21, x22, [x0, #32]           // AES block 2 - load plaintext
+	add	x0, x0, #64                       // AES input_ptr update
+	eor	x19, x19, x13                      // AES block 1 - round N low
+	eor	x20, x20, x14                      // AES block 1 - round N high
+	fmov	d5, x19                               // AES block 1 - mov low
+	eor	x6, x6, x13                      // AES block 0 - round N low
+	eor	x7, x7, x14                      // AES block 0 - round N high
+	eor	x24, x24, x14                      // AES block 3 - round N high
+	fmov	d4, x6                               // AES block 0 - mov low
+	cmp	x0, x5                   // check if we have <= 8 blocks
+	fmov	v4.d[1], x7                           // AES block 0 - mov high
+	eor	x23, x23, x13                      // AES block 3 - round N low
+	eor	x21, x21, x13                      // AES block 2 - round N low
+	fmov	v5.d[1], x20                           // AES block 1 - mov high
+	fmov	d6, x21                               // AES block 2 - mov low
+	add	w12, w12, #1                            // CTR block 4
+	orr	x9, x11, x9, lsl #32            // CTR block 4
+	fmov	d7, x23                               // AES block 3 - mov low
+	eor	x22, x22, x14                      // AES block 2 - round N high
+	fmov	v6.d[1], x22                           // AES block 2 - mov high
+	eor	v4.16b, v4.16b, v0.16b                          // AES block 0 - result
+	fmov	d0, x10                               // CTR block 4
+	fmov	v0.d[1], x9                               // CTR block 4
+	rev	w9, w12                                 // CTR block 5
+	add	w12, w12, #1                            // CTR block 5
+	eor	v5.16b, v5.16b, v1.16b                          // AES block 1 - result
+	fmov	d1, x10                               // CTR block 5
+	orr	x9, x11, x9, lsl #32            // CTR block 5
+	fmov	v1.d[1], x9                               // CTR block 5
+	rev	w9, w12                                 // CTR block 6
+	st1	{ v4.16b}, [x2], #16                     // AES block 0 - store result
+	fmov	v7.d[1], x24                           // AES block 3 - mov high
+	orr	x9, x11, x9, lsl #32            // CTR block 6
+	eor	v6.16b, v6.16b, v2.16b                          // AES block 2 - result
+	st1	{ v5.16b}, [x2], #16                     // AES block 1 - store result
+	add	w12, w12, #1                            // CTR block 6
+	fmov	d2, x10                               // CTR block 6
+	fmov	v2.d[1], x9                               // CTR block 6
+	st1	{ v6.16b}, [x2], #16                     // AES block 2 - store result
+	rev	w9, w12                                 // CTR block 7
+	orr	x9, x11, x9, lsl #32            // CTR block 7
+	eor	v7.16b, v7.16b, v3.16b                          // AES block 3 - result
+	st1	{ v7.16b}, [x2], #16                     // AES block 3 - store result
+	b.ge	.Lenc_prepretail                                  // do prepretail
+
+.Lenc_main_loop:	//	main loop start
+	aese	v0.16b, v18.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
+	rev64	v4.16b, v4.16b                                    // GHASH block 4k (only t0 is free)
+	aese	v1.16b, v18.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
+	fmov	d3, x10                               // CTR block 4k+3
+	aese	v2.16b, v18.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
+	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
+	aese	v0.16b, v19.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
+	fmov	v3.d[1], x9                               // CTR block 4k+3
+	aese	v1.16b, v19.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
+	ldp	x23, x24, [x0, #48]           // AES block 4k+7 - load plaintext
+	aese	v2.16b, v19.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
+	ldp	x21, x22, [x0, #32]           // AES block 4k+6 - load plaintext
+	aese	v0.16b, v20.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
+	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
+	aese	v1.16b, v20.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
+	aese	v3.16b, v18.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
+	eor	x23, x23, x13                      // AES block 4k+7 - round N low
+	aese	v0.16b, v21.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
+	mov	d10, v17.d[1]                               // GHASH block 4k - mid
+	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
+	eor	x22, x22, x14                      // AES block 4k+6 - round N high
+	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
+	aese	v3.16b, v19.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
+	rev64	v5.16b, v5.16b                                    // GHASH block 4k+1 (t0 and t1 free)
+	aese	v0.16b, v22.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
+	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
+	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
+	aese	v2.16b, v20.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
+	aese	v0.16b, v23.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
+	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3 (t0, t1, t2 and t3 free)
+	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
+	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
+	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2 (t0, t1, and t2 free)
+	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
+	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
+	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
+	aese	v1.16b, v21.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
+	aese	v3.16b, v20.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
+	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
+	aese	v2.16b, v21.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
+	aese	v1.16b, v22.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
+	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
+	aese	v3.16b, v21.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
+	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
+	aese	v2.16b, v22.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
+	aese	v0.16b, v24.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
+	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
+	aese	v3.16b, v22.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
+	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
+	aese	v0.16b, v25.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
+	aese	v3.16b, v23.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
+	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
+	aese	v1.16b, v23.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
+	aese	v0.16b, v26.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
+	aese	v2.16b, v23.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
+	aese	v1.16b, v24.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
+	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
+	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
+	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
+	aese	v1.16b, v25.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
+	pmull	v6.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
+	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
+	aese	v3.16b, v24.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
+	ldp	x19, x20, [x0, #16]           // AES block 4k+5 - load plaintext
+	aese	v1.16b, v26.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
+	mov	d4, v7.d[1]                                  // GHASH block 4k+3 - mid
+	aese	v2.16b, v24.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
+	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
+	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
+	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
+	eor	v4.8b, v4.8b, v7.8b                          // GHASH block 4k+3 - mid
+	aese	v2.16b, v25.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
+	eor	x19, x19, x13                      // AES block 4k+5 - round N low
+	aese	v2.16b, v26.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
+	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
+	aese	v3.16b, v25.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
+	eor	x21, x21, x13                      // AES block 4k+6 - round N low
+	aese	v3.16b, v26.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
+	movi	v8.8b, #0xc2
+	pmull	v4.1q, v4.1d, v16.1d                          // GHASH block 4k+3 - mid
+	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
+	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
+	fmov	d5, x19                               // AES block 4k+5 - mov low
+	ldp	x6, x7, [x0, #0]            // AES block 4k+4 - load plaintext
+	b.lt	.Lenc_main_loop_continue                          // branch if AES-128
+
+	aese	v1.16b, v27.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
+	aese	v0.16b, v27.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
+	aese	v2.16b, v27.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
+	aese	v3.16b, v27.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
+	aese	v0.16b, v28.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
+	aese	v1.16b, v28.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
+	aese	v2.16b, v28.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
+	aese	v3.16b, v28.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
+	b.eq	.Lenc_main_loop_continue                          // branch if AES-192
+
+	aese	v0.16b, v29.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
+	aese	v1.16b, v29.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
+	aese	v2.16b, v29.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
+	aese	v3.16b, v29.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
+	aese	v1.16b, v30.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
+	aese	v0.16b, v30.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
+	aese	v2.16b, v30.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12
+	aese	v3.16b, v30.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12
+
+.Lenc_main_loop_continue:
+	shl	d8, d8, #56               // mod_constant
+	eor	v11.16b, v11.16b, v6.16b                         // GHASH block 4k+3 - low
+	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+3 - mid
+	add	w12, w12, #1                            // CTR block 4k+3
+	eor	v4.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
+	add	x0, x0, #64                       // AES input_ptr update
+	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
+	rev	w9, w12                                 // CTR block 4k+8
+	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
+	eor	x6, x6, x13                      // AES block 4k+4 - round N low
+	eor	v10.16b, v10.16b, v4.16b                         // MODULO - karatsuba tidy up
+	eor	x7, x7, x14                      // AES block 4k+4 - round N high
+	fmov	d4, x6                               // AES block 4k+4 - mov low
+	orr	x9, x11, x9, lsl #32            // CTR block 4k+8
+	eor	v7.16b, v9.16b, v7.16b                   // MODULO - fold into mid
+	eor	x20, x20, x14                      // AES block 4k+5 - round N high
+	eor	x24, x24, x14                      // AES block 4k+7 - round N high
+	add	w12, w12, #1                            // CTR block 4k+8
+	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
+	fmov	v4.d[1], x7                           // AES block 4k+4 - mov high
+	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
+	fmov	d7, x23                               // AES block 4k+7 - mov low
+	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
+	fmov	v5.d[1], x20                           // AES block 4k+5 - mov high
+	fmov	d6, x21                               // AES block 4k+6 - mov low
+	cmp	x0, x5                   // .LOOP CONTROL
+	fmov	v6.d[1], x22                           // AES block 4k+6 - mov high
+	pmull	v9.1q, v10.1d, v8.1d            // MODULO - mid 64b align with low
+	eor	v4.16b, v4.16b, v0.16b                          // AES block 4k+4 - result
+	fmov	d0, x10                               // CTR block 4k+8
+	fmov	v0.d[1], x9                               // CTR block 4k+8
+	rev	w9, w12                                 // CTR block 4k+9
+	add	w12, w12, #1                            // CTR block 4k+9
+	eor	v5.16b, v5.16b, v1.16b                          // AES block 4k+5 - result
+	fmov	d1, x10                               // CTR block 4k+9
+	orr	x9, x11, x9, lsl #32            // CTR block 4k+9
+	fmov	v1.d[1], x9                               // CTR block 4k+9
+	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
+	rev	w9, w12                                 // CTR block 4k+10
+	st1	{ v4.16b}, [x2], #16                     // AES block 4k+4 - store result
+	orr	x9, x11, x9, lsl #32            // CTR block 4k+10
+	eor	v11.16b, v11.16b, v9.16b                         // MODULO - fold into low
+	fmov	v7.d[1], x24                           // AES block 4k+7 - mov high
+	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
+	st1	{ v5.16b}, [x2], #16                     // AES block 4k+5 - store result
+	add	w12, w12, #1                            // CTR block 4k+10
+	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
+	eor	v6.16b, v6.16b, v2.16b                          // AES block 4k+6 - result
+	fmov	d2, x10                               // CTR block 4k+10
+	st1	{ v6.16b}, [x2], #16                     // AES block 4k+6 - store result
+	fmov	v2.d[1], x9                               // CTR block 4k+10
+	rev	w9, w12                                 // CTR block 4k+11
+	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low
+	orr	x9, x11, x9, lsl #32            // CTR block 4k+11
+	eor	v7.16b, v7.16b, v3.16b                          // AES block 4k+7 - result
+	st1	{ v7.16b}, [x2], #16                     // AES block 4k+7 - store result
+	b.lt	.Lenc_main_loop
+
+.Lenc_prepretail:	//	PREPRETAIL
+	aese	v1.16b, v18.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
+	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2 (t0, t1, and t2 free)
+	aese	v2.16b, v18.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
+	fmov	d3, x10                               // CTR block 4k+3
+	aese	v0.16b, v18.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
+	rev64	v4.16b, v4.16b                                    // GHASH block 4k (only t0 is free)
+	fmov	v3.d[1], x9                               // CTR block 4k+3
+	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
+	aese	v2.16b, v19.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
+	aese	v0.16b, v19.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
+	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
+	rev64	v5.16b, v5.16b                                    // GHASH block 4k+1 (t0 and t1 free)
+	aese	v2.16b, v20.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
+	aese	v3.16b, v18.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
+	mov	d10, v17.d[1]                               // GHASH block 4k - mid
+	aese	v1.16b, v19.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
+	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
+	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
+	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
+	aese	v2.16b, v21.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
+	aese	v1.16b, v20.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
+	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
+	aese	v0.16b, v20.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
+	aese	v3.16b, v19.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
+	aese	v1.16b, v21.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
+	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
+	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
+	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
+	aese	v3.16b, v20.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
+	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
+	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
+	aese	v0.16b, v21.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
+	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
+	aese	v3.16b, v21.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
+	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
+	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
+	aese	v0.16b, v22.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
+	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3 (t0, t1, t2 and t3 free)
+	aese	v3.16b, v22.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
+	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
+	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
+	add	w12, w12, #1                            // CTR block 4k+3
+	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
+	aese	v3.16b, v23.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
+	aese	v2.16b, v22.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
+	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
+	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
+	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
+	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
+	aese	v2.16b, v23.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
+	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
+	mov	d4, v7.d[1]                                  // GHASH block 4k+3 - mid
+	aese	v1.16b, v22.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
+	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
+	eor	v4.8b, v4.8b, v7.8b                          // GHASH block 4k+3 - mid
+	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
+	aese	v1.16b, v23.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
+	pmull	v4.1q, v4.1d, v16.1d                          // GHASH block 4k+3 - mid
+	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
+	aese	v0.16b, v23.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
+	aese	v1.16b, v24.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
+	aese	v2.16b, v24.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
+	aese	v0.16b, v24.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
+	movi	v8.8b, #0xc2
+	aese	v3.16b, v24.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
+	aese	v1.16b, v25.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
+	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
+	aese	v0.16b, v25.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
+	aese	v3.16b, v25.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
+	shl	d8, d8, #56               // mod_constant
+	aese	v1.16b, v26.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
+	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+3 - mid
+	pmull	v6.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
+	aese	v3.16b, v26.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
+	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
+	aese	v0.16b, v26.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
+	eor	v11.16b, v11.16b, v6.16b                         // GHASH block 4k+3 - low
+	aese	v2.16b, v25.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
+	eor	v10.16b, v10.16b, v9.16b                         // karatsuba tidy up
+	aese	v2.16b, v26.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
+	pmull	v4.1q, v9.1d, v8.1d
+	ext	v9.16b, v9.16b, v9.16b, #8
+	eor	v10.16b, v10.16b, v11.16b
+	b.lt	.Lenc_finish_prepretail                           // branch if AES-128
+
+	aese	v1.16b, v27.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
+	aese	v3.16b, v27.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
+	aese	v0.16b, v27.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
+	aese	v2.16b, v27.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
+	aese	v3.16b, v28.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
+	aese	v1.16b, v28.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
+	aese	v0.16b, v28.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
+	aese	v2.16b, v28.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
+	b.eq	.Lenc_finish_prepretail                           // branch if AES-192
+
+	aese	v1.16b, v29.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
+	aese	v0.16b, v29.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
+	aese	v3.16b, v29.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
+	aese	v2.16b, v29.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
+	aese	v1.16b, v30.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
+	aese	v0.16b, v30.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
+	aese	v3.16b, v30.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12
+	aese	v2.16b, v30.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12
+
+.Lenc_finish_prepretail:
+	eor	v10.16b, v10.16b, v4.16b
+	eor	v10.16b, v10.16b, v9.16b
+	pmull	v4.1q, v10.1d, v8.1d
+	ext	v10.16b, v10.16b, v10.16b, #8
+	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
+	eor	v11.16b, v11.16b, v4.16b
+	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
+	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
+	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
+	eor	v11.16b, v11.16b, v10.16b
+
+.Lenc_tail:	//	TAIL
+	ext	v8.16b, v11.16b, v11.16b, #8                     // prepare final partial tag
+	sub	x5, x4, x0   // main_end_input_ptr is number of bytes left to process
+	ldp	x6, x7, [x0], #16           // AES block 4k+4 - load plaintext
+	eor	x6, x6, x13                      // AES block 4k+4 - round N low
+	eor	x7, x7, x14                      // AES block 4k+4 - round N high
+	cmp	x5, #48
+	fmov	d4, x6                               // AES block 4k+4 - mov low
+	fmov	v4.d[1], x7                           // AES block 4k+4 - mov high
+	eor	v5.16b, v4.16b, v0.16b                          // AES block 4k+4 - result
+	b.gt	.Lenc_blocks_more_than_3
+	cmp	x5, #32
+	mov	v3.16b, v2.16b
+	movi	v11.8b, #0
+	movi	v9.8b, #0
+	sub	w12, w12, #1
+	mov	v2.16b, v1.16b
+	movi	v10.8b, #0
+	b.gt	.Lenc_blocks_more_than_2
+	mov	v3.16b, v1.16b
+	sub	w12, w12, #1
+	cmp	x5, #16
+	b.gt	.Lenc_blocks_more_than_1
+	sub	w12, w12, #1
+	b	.Lenc_blocks_less_than_1
+.Lenc_blocks_more_than_3:	//	blocks left >  3
+	st1	{ v5.16b}, [x2], #16                    // AES final-3 block  - store result
+	ldp	x6, x7, [x0], #16          // AES final-2 block - load input low & high
+	rev64	v4.16b, v5.16b                                   // GHASH final-3 block
+	eor	x6, x6, x13                     // AES final-2 block - round N low
+	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
+	eor	x7, x7, x14                     // AES final-2 block - round N high
+	mov	d22, v4.d[1]                                // GHASH final-3 block - mid
+	fmov	d5, x6                                // AES final-2 block - mov low
+	fmov	v5.d[1], x7                            // AES final-2 block - mov high
+	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-3 block - mid
+	movi	v8.8b, #0                                       // suppress further partial tag feed in
+	mov	d10, v17.d[1]                              // GHASH final-3 block - mid
+	pmull	v11.1q, v4.1d, v15.1d                      // GHASH final-3 block - low
+	pmull2	v9.1q, v4.2d, v15.2d                      // GHASH final-3 block - high
+	pmull	v10.1q, v22.1d, v10.1d                   // GHASH final-3 block - mid
+	eor	v5.16b, v5.16b, v1.16b                           // AES final-2 block - result
+.Lenc_blocks_more_than_2:	//	blocks left >  2
+	st1	{ v5.16b}, [x2], #16                    // AES final-2 block - store result
+	ldp	x6, x7, [x0], #16          // AES final-1 block - load input low & high
+	rev64	v4.16b, v5.16b                                   // GHASH final-2 block
+	eor	x6, x6, x13                     // AES final-1 block - round N low
+	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
+	fmov	d5, x6                                // AES final-1 block - mov low
+	eor	x7, x7, x14                     // AES final-1 block - round N high
+	fmov	v5.d[1], x7                            // AES final-1 block - mov high
+	movi	v8.8b, #0                                       // suppress further partial tag feed in
+	pmull2	v20.1q, v4.2d, v14.2d                         // GHASH final-2 block - high
+	mov	d22, v4.d[1]                                // GHASH final-2 block - mid
+	pmull	v21.1q, v4.1d, v14.1d                         // GHASH final-2 block - low
+	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-2 block - mid
+	eor	v5.16b, v5.16b, v2.16b                           // AES final-1 block - result
+	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-2 block - high
+	pmull	v22.1q, v22.1d, v17.1d                     // GHASH final-2 block - mid
+	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-2 block - low
+	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-2 block - mid
+.Lenc_blocks_more_than_1:	//	blocks left >  1
+	st1	{ v5.16b}, [x2], #16                    // AES final-1 block - store result
+	rev64	v4.16b, v5.16b                                   // GHASH final-1 block
+	ldp	x6, x7, [x0], #16          // AES final block - load input low & high
+	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
+	movi	v8.8b, #0                                       // suppress further partial tag feed in
+	eor	x6, x6, x13                     // AES final block - round N low
+	mov	d22, v4.d[1]                                // GHASH final-1 block - mid
+	pmull2	v20.1q, v4.2d, v13.2d                         // GHASH final-1 block - high
+	eor	x7, x7, x14                     // AES final block - round N high
+	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-1 block - mid
+	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-1 block - high
+	ins	v22.d[1], v22.d[0]                           // GHASH final-1 block - mid
+	fmov	d5, x6                                // AES final block - mov low
+	fmov	v5.d[1], x7                            // AES final block - mov high
+	pmull2	v22.1q, v22.2d, v16.2d                     // GHASH final-1 block - mid
+	pmull	v21.1q, v4.1d, v13.1d                         // GHASH final-1 block - low
+	eor	v5.16b, v5.16b, v3.16b                           // AES final block - result
+	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-1 block - mid
+	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-1 block - low
+.Lenc_blocks_less_than_1:	//	blocks left <= 1
+	and	x1, x1, #127                   // bit_length %= 128
+	mvn	x13, xzr                                      // rkN_l = 0xffffffffffffffff
+	sub	x1, x1, #128                   // bit_length -= 128
+	neg	x1, x1                         // bit_length = 128 - #bits in input (in range [1,128])
+	ld1	{ v18.16b}, [x2]                           // load existing bytes where the possibly partial last block is to be stored
+	mvn	x14, xzr                                      // rkN_h = 0xffffffffffffffff
+	and	x1, x1, #127                   // bit_length %= 128
+	lsr	x14, x14, x1                      // rkN_h is mask for top 64b of last block
+	cmp	x1, #64
+	csel	x6, x13, x14, lt
+	csel	x7, x14, xzr, lt
+	fmov	d0, x6                                // ctr0b is mask for last block
+	fmov	v0.d[1], x7
+	and	v5.16b, v5.16b, v0.16b                           // possibly partial last block has zeroes in highest bits
+	rev64	v4.16b, v5.16b                                   // GHASH final block
+	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
+	bif	v5.16b, v18.16b, v0.16b                             // insert existing bytes in top end of result before storing
+	pmull2	v20.1q, v4.2d, v12.2d                         // GHASH final block - high
+	mov	d8, v4.d[1]                                 // GHASH final block - mid
+	rev	w9, w12
+	pmull	v21.1q, v4.1d, v12.1d                         // GHASH final block - low
+	eor	v9.16b, v9.16b, v20.16b                           // GHASH final block - high
+	eor	v8.8b, v8.8b, v4.8b                         // GHASH final block - mid
+	pmull	v8.1q, v8.1d, v16.1d                         // GHASH final block - mid
+	eor	v11.16b, v11.16b, v21.16b                           // GHASH final block - low
+	eor	v10.16b, v10.16b, v8.16b                        // GHASH final block - mid
+	movi	v8.8b, #0xc2
+	eor	v4.16b, v11.16b, v9.16b                        // MODULO - karatsuba tidy up
+	shl	d8, d8, #56              // mod_constant
+	eor	v10.16b, v10.16b, v4.16b                        // MODULO - karatsuba tidy up
+	pmull	v7.1q, v9.1d, v8.1d           // MODULO - top 64b align with mid
+	ext	v9.16b, v9.16b, v9.16b, #8                    // MODULO - other top alignment
+	eor	v10.16b, v10.16b, v7.16b                     // MODULO - fold into mid
+	eor	v10.16b, v10.16b, v9.16b                        // MODULO - fold into mid
+	pmull	v9.1q, v10.1d, v8.1d           // MODULO - mid 64b align with low
+	ext	v10.16b, v10.16b, v10.16b, #8                    // MODULO - other mid alignment
+	str	w9, [x16, #12]                         // store the updated counter
+	st1	{ v5.16b}, [x2]                         // store all 16B
+	eor	v11.16b, v11.16b, v9.16b                        // MODULO - fold into low
+	eor	v11.16b, v11.16b, v10.16b                        // MODULO - fold into low
+	ext	v11.16b, v11.16b, v11.16b, #8
+	rev64	v11.16b, v11.16b
+	mov	x0, x15
+	st1	{ v11.16b }, [x3]
+	ldp	x19, x20, [sp, #16]
+	ldp	x21, x22, [sp, #32]
+	ldp	x23, x24, [sp, #48]
+	ldp	d8, d9, [sp, #64]
+	ldp	d10, d11, [sp, #80]
+	ldp	d12, d13, [sp, #96]
+	ldp	d14, d15, [sp, #112]
+	ldp	x29, x30, [sp], #128
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	aes_gcm_enc_kernel,.-aes_gcm_enc_kernel
+.globl	aes_gcm_dec_kernel
+.hidden	aes_gcm_dec_kernel
+.type	aes_gcm_dec_kernel,%function
+.align	4
+aes_gcm_dec_kernel:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29, x30, [sp, #-128]!
+	mov	x29, sp
+	stp	x19, x20, [sp, #16]
+	mov	x16, x4
+	mov	x8, x5
+	stp	x21, x22, [sp, #32]
+	stp	x23, x24, [sp, #48]
+	stp	d8, d9, [sp, #64]
+	stp	d10, d11, [sp, #80]
+	stp	d12, d13, [sp, #96]
+	stp	d14, d15, [sp, #112]
+	ldr	w17, [x8, #240]
+	add	x19, x8, x17, lsl #4                   // borrow input_l1 for last key
+	ldp	x13, x14, [x19]                       // load round N keys
+	ldr	q31, [x19, #-16]                        // load round N-1 keys
+	lsr	x5, x1, #3              // byte_len
+	mov	x15, x5
+	ldp	x10, x11, [x16]              // ctr96_b64, ctr96_t32
+	ldr	q26, [x8, #128]                                // load rk8
+	sub	x5, x5, #1      // byte_len - 1
+	ldr	q25, [x8, #112]                                // load rk7
+	and	x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+	add	x4, x0, x1, lsr #3   // end_input_ptr
+	ldr	q24, [x8, #96]                                 // load rk6
+	lsr	x12, x11, #32
+	ldr	q23, [x8, #80]                                 // load rk5
+	orr	w11, w11, w11
+	ldr	q21, [x8, #48]                                 // load rk3
+	add	x5, x5, x0
+	rev	w12, w12                                // rev_ctr32
+	add	w12, w12, #1                            // increment rev_ctr32
+	fmov	d3, x10                               // CTR block 3
+	rev	w9, w12                                 // CTR block 1
+	add	w12, w12, #1                            // CTR block 1
+	fmov	d1, x10                               // CTR block 1
+	orr	x9, x11, x9, lsl #32            // CTR block 1
+	ld1	{ v0.16b}, [x16]                             // special case vector load initial counter so we can start first AES block as quickly as possible
+	fmov	v1.d[1], x9                               // CTR block 1
+	rev	w9, w12                                 // CTR block 2
+	add	w12, w12, #1                            // CTR block 2
+	fmov	d2, x10                               // CTR block 2
+	orr	x9, x11, x9, lsl #32            // CTR block 2
+	fmov	v2.d[1], x9                               // CTR block 2
+	rev	w9, w12                                 // CTR block 3
+	orr	x9, x11, x9, lsl #32            // CTR block 3
+	ldr	q18, [x8, #0]                                  // load rk0
+	fmov	v3.d[1], x9                               // CTR block 3
+	add	w12, w12, #1                            // CTR block 3
+	ldr	q22, [x8, #64]                                 // load rk4
+	ldr	q19, [x8, #16]                                 // load rk1
+	aese	v0.16b, v18.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 0
+	ldr	q14, [x6, #48]                              // load h3l | h3h
+	ext	v14.16b, v14.16b, v14.16b, #8
+	aese	v3.16b, v18.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 0
+	ldr	q15, [x6, #80]                              // load h4l | h4h
+	ext	v15.16b, v15.16b, v15.16b, #8
+	aese	v1.16b, v18.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 0
+	ldr	q13, [x6, #32]                              // load h2l | h2h
+	ext	v13.16b, v13.16b, v13.16b, #8
+	aese	v2.16b, v18.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 0
+	ldr	q20, [x8, #32]                                 // load rk2
+	aese	v0.16b, v19.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 1
+	aese	v1.16b, v19.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 1
+	ld1	{ v11.16b}, [x3]
+	ext	v11.16b, v11.16b, v11.16b, #8
+	rev64	v11.16b, v11.16b
+	aese	v2.16b, v19.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 1
+	ldr	q27, [x8, #144]                                // load rk9
+	aese	v3.16b, v19.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 1
+	ldr	q30, [x8, #192]                               // load rk12
+	aese	v0.16b, v20.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 2
+	ldr	q12, [x6]                                   // load h1l | h1h
+	ext	v12.16b, v12.16b, v12.16b, #8
+	aese	v2.16b, v20.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 2
+	ldr	q28, [x8, #160]                               // load rk10
+	aese	v3.16b, v20.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 2
+	aese	v0.16b, v21.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 3
+	aese	v1.16b, v20.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 2
+	aese	v3.16b, v21.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 3
+	aese	v0.16b, v22.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 4
+	aese	v2.16b, v21.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 3
+	aese	v1.16b, v21.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 3
+	aese	v3.16b, v22.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 4
+	aese	v2.16b, v22.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 4
+	aese	v1.16b, v22.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 4
+	aese	v3.16b, v23.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 5
+	aese	v0.16b, v23.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 5
+	aese	v1.16b, v23.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 5
+	aese	v2.16b, v23.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 5
+	aese	v0.16b, v24.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 6
+	aese	v3.16b, v24.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 6
+	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
+	aese	v1.16b, v24.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 6
+	aese	v2.16b, v24.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 6
+	aese	v0.16b, v25.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 7
+	aese	v1.16b, v25.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 7
+	aese	v3.16b, v25.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 7
+	aese	v0.16b, v26.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 8
+	aese	v2.16b, v25.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 7
+	aese	v3.16b, v26.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 8
+	aese	v1.16b, v26.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 8
+	ldr	q29, [x8, #176]                               // load rk11
+	aese	v2.16b, v26.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 8
+	b.lt	.Ldec_finish_first_blocks                         // branch if AES-128
+
+	aese	v0.16b, v27.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 9
+	aese	v1.16b, v27.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 9
+	aese	v3.16b, v27.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 9
+	aese	v2.16b, v27.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 9
+	aese	v0.16b, v28.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 10
+	aese	v1.16b, v28.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 10
+	aese	v3.16b, v28.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 10
+	aese	v2.16b, v28.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 10
+	b.eq	.Ldec_finish_first_blocks                         // branch if AES-192
+
+	aese	v0.16b, v29.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 11
+	aese	v3.16b, v29.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 11
+	aese	v1.16b, v29.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 11
+	aese	v2.16b, v29.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 11
+	aese	v1.16b, v30.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 12
+	aese	v0.16b, v30.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 12
+	aese	v2.16b, v30.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 12
+	aese	v3.16b, v30.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 12
+
+.Ldec_finish_first_blocks:
+	cmp	x0, x5                   // check if we have <= 4 blocks
+	trn1	v9.2d, v14.2d,    v15.2d                      // h4h | h3h
+	trn2	v17.2d,  v14.2d,    v15.2d                      // h4l | h3l
+	trn1	v8.2d,    v12.2d,    v13.2d                      // h2h | h1h
+	trn2	v16.2d,  v12.2d,    v13.2d                      // h2l | h1l
+	eor	v17.16b, v17.16b, v9.16b                  // h4k | h3k
+	aese	v1.16b, v31.16b                                    // AES block 1 - round N-1
+	aese	v2.16b, v31.16b                                    // AES block 2 - round N-1
+	eor	v16.16b, v16.16b, v8.16b                     // h2k | h1k
+	aese	v3.16b, v31.16b                                    // AES block 3 - round N-1
+	aese	v0.16b, v31.16b                                    // AES block 0 - round N-1
+	b.ge	.Ldec_tail                                        // handle tail
+
+	ldr	q4, [x0, #0]                          // AES block 0 - load ciphertext
+	ldr	q5, [x0, #16]                         // AES block 1 - load ciphertext
+	rev	w9, w12                                 // CTR block 4
+	eor	v0.16b, v4.16b, v0.16b                            // AES block 0 - result
+	eor	v1.16b, v5.16b, v1.16b                            // AES block 1 - result
+	rev64	v5.16b, v5.16b                                    // GHASH block 1
+	ldr	q7, [x0, #48]                         // AES block 3 - load ciphertext
+	mov	x7, v0.d[1]                            // AES block 0 - mov high
+	mov	x6, v0.d[0]                            // AES block 0 - mov low
+	rev64	v4.16b, v4.16b                                    // GHASH block 0
+	add	w12, w12, #1                            // CTR block 4
+	fmov	d0, x10                               // CTR block 4
+	orr	x9, x11, x9, lsl #32            // CTR block 4
+	fmov	v0.d[1], x9                               // CTR block 4
+	rev	w9, w12                                 // CTR block 5
+	add	w12, w12, #1                            // CTR block 5
+	mov	x19, v1.d[0]                            // AES block 1 - mov low
+	orr	x9, x11, x9, lsl #32            // CTR block 5
+	mov	x20, v1.d[1]                            // AES block 1 - mov high
+	eor	x7, x7, x14                    // AES block 0 - round N high
+	eor	x6, x6, x13                    // AES block 0 - round N low
+	stp	x6, x7, [x2], #16        // AES block 0 - store result
+	fmov	d1, x10                               // CTR block 5
+	ldr	q6, [x0, #32]                         // AES block 2 - load ciphertext
+	add	x0, x0, #64                       // AES input_ptr update
+	fmov	v1.d[1], x9                               // CTR block 5
+	rev	w9, w12                                 // CTR block 6
+	add	w12, w12, #1                            // CTR block 6
+	eor	x19, x19, x13                    // AES block 1 - round N low
+	orr	x9, x11, x9, lsl #32            // CTR block 6
+	eor	x20, x20, x14                    // AES block 1 - round N high
+	stp	x19, x20, [x2], #16        // AES block 1 - store result
+	eor	v2.16b, v6.16b, v2.16b                            // AES block 2 - result
+	cmp	x0, x5                   // check if we have <= 8 blocks
+	b.ge	.Ldec_prepretail                                  // do prepretail
+
+.Ldec_main_loop:	//	main loop start
+	mov	x21, v2.d[0]                            // AES block 4k+2 - mov low
+	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
+	eor	v3.16b, v7.16b, v3.16b                            // AES block 4k+3 - result
+	aese	v0.16b, v18.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
+	mov	x22, v2.d[1]                            // AES block 4k+2 - mov high
+	aese	v1.16b, v18.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
+	fmov	d2, x10                               // CTR block 4k+6
+	fmov	v2.d[1], x9                               // CTR block 4k+6
+	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
+	rev	w9, w12                                 // CTR block 4k+7
+	aese	v0.16b, v19.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
+	mov	x24, v3.d[1]                            // AES block 4k+3 - mov high
+	aese	v1.16b, v19.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
+	mov	x23, v3.d[0]                            // AES block 4k+3 - mov low
+	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
+	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
+	fmov	d3, x10                               // CTR block 4k+7
+	aese	v0.16b, v20.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
+	orr	x9, x11, x9, lsl #32            // CTR block 4k+7
+	aese	v2.16b, v18.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
+	fmov	v3.d[1], x9                               // CTR block 4k+7
+	aese	v1.16b, v20.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
+	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
+	aese	v0.16b, v21.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
+	eor	x22, x22, x14                    // AES block 4k+2 - round N high
+	aese	v2.16b, v19.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
+	mov	d10, v17.d[1]                               // GHASH block 4k - mid
+	aese	v1.16b, v21.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
+	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2
+	aese	v3.16b, v18.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
+	eor	x21, x21, x13                    // AES block 4k+2 - round N low
+	aese	v2.16b, v20.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
+	stp	x21, x22, [x2], #16        // AES block 4k+2 - store result
+	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
+	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
+	aese	v2.16b, v21.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
+	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3
+	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
+	eor	x23, x23, x13                    // AES block 4k+3 - round N low
+	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
+	eor	x24, x24, x14                    // AES block 4k+3 - round N high
+	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
+	aese	v2.16b, v22.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
+	aese	v3.16b, v19.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
+	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
+	aese	v0.16b, v22.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
+	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
+	aese	v2.16b, v23.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
+	add	w12, w12, #1                            // CTR block 4k+7
+	aese	v3.16b, v20.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
+	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
+	aese	v1.16b, v22.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
+	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
+	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
+	aese	v3.16b, v21.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
+	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
+	aese	v1.16b, v23.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
+	aese	v0.16b, v23.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
+	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
+	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
+	rev	w9, w12                                 // CTR block 4k+8
+	aese	v1.16b, v24.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
+	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
+	aese	v0.16b, v24.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
+	add	w12, w12, #1                            // CTR block 4k+8
+	aese	v3.16b, v22.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
+	aese	v1.16b, v25.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
+	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
+	aese	v0.16b, v25.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
+	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
+	mov	d6, v7.d[1]                                  // GHASH block 4k+3 - mid
+	aese	v3.16b, v23.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
+	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
+	aese	v0.16b, v26.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
+	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
+	aese	v3.16b, v24.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
+	pmull	v4.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
+	orr	x9, x11, x9, lsl #32            // CTR block 4k+8
+	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
+	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
+	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
+	eor	v6.8b, v6.8b, v7.8b                          // GHASH block 4k+3 - mid
+	aese	v1.16b, v26.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
+	aese	v2.16b, v24.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
+	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
+	pmull	v6.1q, v6.1d, v16.1d                          // GHASH block 4k+3 - mid
+	movi	v8.8b, #0xc2
+	aese	v2.16b, v25.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
+	eor	v11.16b, v11.16b, v4.16b                         // GHASH block 4k+3 - low
+	aese	v3.16b, v25.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
+	shl	d8, d8, #56               // mod_constant
+	aese	v2.16b, v26.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
+	eor	v10.16b, v10.16b, v6.16b                         // GHASH block 4k+3 - mid
+	aese	v3.16b, v26.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
+	b.lt	.Ldec_main_loop_continue                          // branch if AES-128
+
+	aese	v0.16b, v27.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
+	aese	v2.16b, v27.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
+	aese	v1.16b, v27.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
+	aese	v3.16b, v27.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
+	aese	v0.16b, v28.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
+	aese	v1.16b, v28.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
+	aese	v2.16b, v28.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
+	aese	v3.16b, v28.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
+	b.eq	.Ldec_main_loop_continue                          // branch if AES-192
+
+	aese	v0.16b, v29.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
+	aese	v1.16b, v29.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
+	aese	v2.16b, v29.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
+	aese	v3.16b, v29.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
+	aese	v0.16b, v30.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
+	aese	v1.16b, v30.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
+	aese	v2.16b, v30.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12
+	aese	v3.16b, v30.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12
+
+.Ldec_main_loop_continue:
+	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
+	eor	v6.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
+	ldr	q4, [x0, #0]                          // AES block 4k+4 - load ciphertext
+	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
+	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
+	eor	v10.16b, v10.16b, v6.16b                         // MODULO - karatsuba tidy up
+	ldr	q5, [x0, #16]                         // AES block 4k+5 - load ciphertext
+	eor	v0.16b, v4.16b, v0.16b                            // AES block 4k+4 - result
+	stp	x23, x24, [x2], #16        // AES block 4k+3 - store result
+	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
+	ldr	q7, [x0, #48]                         // AES block 4k+7 - load ciphertext
+	ldr	q6, [x0, #32]                         // AES block 4k+6 - load ciphertext
+	mov	x7, v0.d[1]                            // AES block 4k+4 - mov high
+	eor	v10.16b, v10.16b, v9.16b                         // MODULO - fold into mid
+	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
+	add	x0, x0, #64                       // AES input_ptr update
+	mov	x6, v0.d[0]                            // AES block 4k+4 - mov low
+	fmov	d0, x10                               // CTR block 4k+8
+	fmov	v0.d[1], x9                               // CTR block 4k+8
+	pmull	v8.1q, v10.1d, v8.1d     // MODULO - mid 64b align with low
+	eor	v1.16b, v5.16b, v1.16b                            // AES block 4k+5 - result
+	rev	w9, w12                                 // CTR block 4k+9
+	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
+	orr	x9, x11, x9, lsl #32            // CTR block 4k+9
+	cmp	x0, x5                   // .LOOP CONTROL
+	add	w12, w12, #1                            // CTR block 4k+9
+	eor	x6, x6, x13                    // AES block 4k+4 - round N low
+	eor	x7, x7, x14                    // AES block 4k+4 - round N high
+	mov	x20, v1.d[1]                            // AES block 4k+5 - mov high
+	eor	v2.16b, v6.16b, v2.16b                            // AES block 4k+6 - result
+	eor	v11.16b, v11.16b, v8.16b               // MODULO - fold into low
+	mov	x19, v1.d[0]                            // AES block 4k+5 - mov low
+	fmov	d1, x10                               // CTR block 4k+9
+	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
+	fmov	v1.d[1], x9                               // CTR block 4k+9
+	rev	w9, w12                                 // CTR block 4k+10
+	add	w12, w12, #1                            // CTR block 4k+10
+	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
+	orr	x9, x11, x9, lsl #32            // CTR block 4k+10
+	rev64	v5.16b, v5.16b                                    // GHASH block 4k+5
+	eor	x20, x20, x14                    // AES block 4k+5 - round N high
+	stp	x6, x7, [x2], #16        // AES block 4k+4 - store result
+	eor	x19, x19, x13                    // AES block 4k+5 - round N low
+	stp	x19, x20, [x2], #16        // AES block 4k+5 - store result
+	rev64	v4.16b, v4.16b                                    // GHASH block 4k+4
+	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low
+	b.lt	.Ldec_main_loop
+
+.Ldec_prepretail:	//	PREPRETAIL
+	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
+	mov	x21, v2.d[0]                            // AES block 4k+2 - mov low
+	eor	v3.16b, v7.16b, v3.16b                            // AES block 4k+3 - result
+	aese	v0.16b, v18.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
+	mov	x22, v2.d[1]                            // AES block 4k+2 - mov high
+	aese	v1.16b, v18.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
+	fmov	d2, x10                               // CTR block 4k+6
+	fmov	v2.d[1], x9                               // CTR block 4k+6
+	rev	w9, w12                                 // CTR block 4k+7
+	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
+	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2
+	orr	x9, x11, x9, lsl #32            // CTR block 4k+7
+	mov	x23, v3.d[0]                            // AES block 4k+3 - mov low
+	aese	v1.16b, v19.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
+	mov	x24, v3.d[1]                            // AES block 4k+3 - mov high
+	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
+	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
+	fmov	d3, x10                               // CTR block 4k+7
+	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
+	fmov	v3.d[1], x9                               // CTR block 4k+7
+	aese	v2.16b, v18.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
+	mov	d10, v17.d[1]                               // GHASH block 4k - mid
+	aese	v0.16b, v19.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
+	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
+	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
+	aese	v2.16b, v19.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
+	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3
+	aese	v3.16b, v18.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
+	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
+	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
+	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
+	aese	v3.16b, v19.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
+	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
+	aese	v0.16b, v20.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
+	aese	v1.16b, v20.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
+	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
+	aese	v2.16b, v20.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
+	aese	v0.16b, v21.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
+	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
+	aese	v3.16b, v20.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
+	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
+	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
+	aese	v0.16b, v22.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
+	aese	v3.16b, v21.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
+	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
+	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
+	aese	v0.16b, v23.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
+	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
+	aese	v3.16b, v22.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
+	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
+	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
+	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
+	aese	v3.16b, v23.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
+	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
+	aese	v2.16b, v21.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
+	aese	v1.16b, v21.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
+	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
+	pmull	v4.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
+	aese	v2.16b, v22.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
+	mov	d6, v7.d[1]                                  // GHASH block 4k+3 - mid
+	aese	v1.16b, v22.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
+	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
+	aese	v2.16b, v23.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
+	eor	v6.8b, v6.8b, v7.8b                          // GHASH block 4k+3 - mid
+	aese	v1.16b, v23.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
+	aese	v3.16b, v24.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
+	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
+	aese	v2.16b, v24.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
+	aese	v0.16b, v24.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
+	movi	v8.8b, #0xc2
+	aese	v1.16b, v24.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
+	eor	v11.16b, v11.16b, v4.16b                         // GHASH block 4k+3 - low
+	pmull	v6.1q, v6.1d, v16.1d                          // GHASH block 4k+3 - mid
+	aese	v3.16b, v25.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
+	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
+	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
+	aese	v1.16b, v25.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
+	aese	v0.16b, v25.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
+	eor	v10.16b, v10.16b, v6.16b                         // GHASH block 4k+3 - mid
+	aese	v3.16b, v26.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
+	aese	v2.16b, v25.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
+	eor	v6.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
+	aese	v1.16b, v26.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
+	aese	v0.16b, v26.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
+	shl	d8, d8, #56               // mod_constant
+	aese	v2.16b, v26.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
+	b.lt	.Ldec_finish_prepretail                           // branch if AES-128
+
+	aese	v1.16b, v27.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
+	aese	v2.16b, v27.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
+	aese	v3.16b, v27.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
+	aese	v0.16b, v27.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
+	aese	v2.16b, v28.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
+	aese	v3.16b, v28.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
+	aese	v0.16b, v28.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
+	aese	v1.16b, v28.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
+	b.eq	.Ldec_finish_prepretail                           // branch if AES-192
+
+	aese	v2.16b, v29.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
+	aese	v0.16b, v29.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
+	aese	v1.16b, v29.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
+	aese	v2.16b, v30.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12
+	aese	v3.16b, v29.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
+	aese	v1.16b, v30.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
+	aese	v0.16b, v30.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
+	aese	v3.16b, v30.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12
+
+.Ldec_finish_prepretail:
+	eor	v10.16b, v10.16b, v6.16b                         // MODULO - karatsuba tidy up
+	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
+	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
+	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
+	eor	x22, x22, x14                    // AES block 4k+2 - round N high
+	eor	x23, x23, x13                    // AES block 4k+3 - round N low
+	eor	v10.16b, v10.16b, v9.16b                         // MODULO - fold into mid
+	add	w12, w12, #1                            // CTR block 4k+7
+	eor	x21, x21, x13                    // AES block 4k+2 - round N low
+	pmull	v8.1q, v10.1d, v8.1d     // MODULO - mid 64b align with low
+	eor	x24, x24, x14                    // AES block 4k+3 - round N high
+	stp	x21, x22, [x2], #16        // AES block 4k+2 - store result
+	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
+	stp	x23, x24, [x2], #16        // AES block 4k+3 - store result
+
+	eor	v11.16b, v11.16b, v8.16b               // MODULO - fold into low
+	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
+	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
+	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
+	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
+	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low
+
+.Ldec_tail:	//	TAIL
+	sub	x5, x4, x0   // main_end_input_ptr is number of bytes left to process
+	ld1	{ v5.16b}, [x0], #16                      // AES block 4k+4 - load ciphertext
+	eor	v0.16b, v5.16b, v0.16b                            // AES block 4k+4 - result
+	mov	x6, v0.d[0]                            // AES block 4k+4 - mov low
+	mov	x7, v0.d[1]                            // AES block 4k+4 - mov high
+	ext	v8.16b, v11.16b, v11.16b, #8                     // prepare final partial tag
+	cmp	x5, #48
+	eor	x6, x6, x13                    // AES block 4k+4 - round N low
+	eor	x7, x7, x14                    // AES block 4k+4 - round N high
+	b.gt	.Ldec_blocks_more_than_3
+	sub	w12, w12, #1
+	mov	v3.16b, v2.16b
+	movi	v10.8b, #0
+	movi	v11.8b, #0
+	cmp	x5, #32
+	movi	v9.8b, #0
+	mov	v2.16b, v1.16b
+	b.gt	.Ldec_blocks_more_than_2
+	sub	w12, w12, #1
+	mov	v3.16b, v1.16b
+	cmp	x5, #16
+	b.gt	.Ldec_blocks_more_than_1
+	sub	w12, w12, #1
+	b	.Ldec_blocks_less_than_1
+.Ldec_blocks_more_than_3:	//	blocks left >  3
+	rev64	v4.16b, v5.16b                                   // GHASH final-3 block
+	ld1	{ v5.16b}, [x0], #16                     // AES final-2 block - load ciphertext
+	stp	x6, x7, [x2], #16       // AES final-3 block  - store result
+	mov	d10, v17.d[1]                              // GHASH final-3 block - mid
+	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
+	eor	v0.16b, v5.16b, v1.16b                           // AES final-2 block - result
+	mov	d22, v4.d[1]                                // GHASH final-3 block - mid
+	mov	x6, v0.d[0]                           // AES final-2 block - mov low
+	mov	x7, v0.d[1]                           // AES final-2 block - mov high
+	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-3 block - mid
+	movi	v8.8b, #0                                       // suppress further partial tag feed in
+	pmull2	v9.1q, v4.2d, v15.2d                      // GHASH final-3 block - high
+	pmull	v10.1q, v22.1d, v10.1d                   // GHASH final-3 block - mid
+	eor	x6, x6, x13                   // AES final-2 block - round N low
+	pmull	v11.1q, v4.1d, v15.1d                      // GHASH final-3 block - low
+	eor	x7, x7, x14                   // AES final-2 block - round N high
+.Ldec_blocks_more_than_2:	//	blocks left >  2
+	rev64	v4.16b, v5.16b                                   // GHASH final-2 block
+	ld1	{ v5.16b}, [x0], #16                     // AES final-1 block - load ciphertext
+	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
+	stp	x6, x7, [x2], #16       // AES final-2 block  - store result
+	eor	v0.16b, v5.16b, v2.16b                           // AES final-1 block - result
+	mov	d22, v4.d[1]                                // GHASH final-2 block - mid
+	pmull	v21.1q, v4.1d, v14.1d                         // GHASH final-2 block - low
+	pmull2	v20.1q, v4.2d, v14.2d                         // GHASH final-2 block - high
+	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-2 block - mid
+	mov	x6, v0.d[0]                           // AES final-1 block - mov low
+	mov	x7, v0.d[1]                           // AES final-1 block - mov high
+	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-2 block - low
+	movi	v8.8b, #0                                       // suppress further partial tag feed in
+	pmull	v22.1q, v22.1d, v17.1d                     // GHASH final-2 block - mid
+	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-2 block - high
+	eor	x6, x6, x13                   // AES final-1 block - round N low
+	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-2 block - mid
+	eor	x7, x7, x14                   // AES final-1 block - round N high
+.Ldec_blocks_more_than_1:	//	blocks left >  1
+	stp	x6, x7, [x2], #16       // AES final-1 block  - store result
+	rev64	v4.16b, v5.16b                                   // GHASH final-1 block
+	ld1	{ v5.16b}, [x0], #16                     // AES final block - load ciphertext
+	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
+	movi	v8.8b, #0                                       // suppress further partial tag feed in
+	mov	d22, v4.d[1]                                // GHASH final-1 block - mid
+	eor	v0.16b, v5.16b, v3.16b                           // AES final block - result
+	pmull2	v20.1q, v4.2d, v13.2d                         // GHASH final-1 block - high
+	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-1 block - mid
+	pmull	v21.1q, v4.1d, v13.1d                         // GHASH final-1 block - low
+	mov	x6, v0.d[0]                           // AES final block - mov low
+	ins	v22.d[1], v22.d[0]                           // GHASH final-1 block - mid
+	mov	x7, v0.d[1]                           // AES final block - mov high
+	pmull2	v22.1q, v22.2d, v16.2d                     // GHASH final-1 block - mid
+	eor	x6, x6, x13                   // AES final block - round N low
+	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-1 block - low
+	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-1 block - high
+	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-1 block - mid
+	eor	x7, x7, x14                   // AES final block - round N high
+.Ldec_blocks_less_than_1:	//	blocks left <= 1
+	and	x1, x1, #127                   // bit_length %= 128
+	mvn	x14, xzr                                      // rkN_h = 0xffffffffffffffff
+	sub	x1, x1, #128                   // bit_length -= 128
+	mvn	x13, xzr                                      // rkN_l = 0xffffffffffffffff
+	ldp	x4, x5, [x2] // load existing bytes we need to not overwrite
+	neg	x1, x1                         // bit_length = 128 - #bits in input (in range [1,128])
+	and	x1, x1, #127                   // bit_length %= 128
+	lsr	x14, x14, x1                      // rkN_h is mask for top 64b of last block
+	cmp	x1, #64
+	csel	x9, x13, x14, lt
+	csel	x10, x14, xzr, lt
+	fmov	d0, x9                                  // ctr0b is mask for last block
+	and	x6, x6, x9
+	mov	v0.d[1], x10
+	bic	x4, x4, x9          // mask out low existing bytes
+	rev	w9, w12
+	bic	x5, x5, x10      // mask out high existing bytes
+	orr	x6, x6, x4
+	and	x7, x7, x10
+	orr	x7, x7, x5
+	and	v5.16b, v5.16b, v0.16b                            // possibly partial last block has zeroes in highest bits
+	rev64	v4.16b, v5.16b                                    // GHASH final block
+	eor	v4.16b, v4.16b, v8.16b                           // feed in partial tag
+	pmull	v21.1q, v4.1d, v12.1d                          // GHASH final block - low
+	mov	d8, v4.d[1]                                  // GHASH final block - mid
+	eor	v8.8b, v8.8b, v4.8b                          // GHASH final block - mid
+	pmull2	v20.1q, v4.2d, v12.2d                          // GHASH final block - high
+	pmull	v8.1q, v8.1d, v16.1d                          // GHASH final block - mid
+	eor	v9.16b, v9.16b, v20.16b                            // GHASH final block - high
+	eor	v11.16b, v11.16b, v21.16b                            // GHASH final block - low
+	eor	v10.16b, v10.16b, v8.16b                         // GHASH final block - mid
+	movi	v8.8b, #0xc2
+	eor	v6.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
+	shl	d8, d8, #56               // mod_constant
+	eor	v10.16b, v10.16b, v6.16b                         // MODULO - karatsuba tidy up
+	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
+	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
+	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
+	eor	v10.16b, v10.16b, v9.16b                         // MODULO - fold into mid
+	pmull	v8.1q, v10.1d, v8.1d     // MODULO - mid 64b align with low
+	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
+	eor	v11.16b, v11.16b, v8.16b               // MODULO - fold into low
+	stp	x6, x7, [x2]
+	str	w9, [x16, #12]                          // store the updated counter
+	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low
+	ext	v11.16b, v11.16b, v11.16b, #8
+	rev64	v11.16b, v11.16b
+	mov	x0, x15
+	st1	{ v11.16b }, [x3]
+	ldp	x19, x20, [sp, #16]
+	ldp	x21, x22, [sp, #32]
+	ldp	x23, x24, [sp, #48]
+	ldp	d8, d9, [sp, #64]
+	ldp	d10, d11, [sp, #80]
+	ldp	d12, d13, [sp, #96]
+	ldp	d14, d15, [sp, #112]
+	ldp	x29, x30, [sp], #128
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	aes_gcm_dec_kernel,.-aes_gcm_dec_kernel
+#endif
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/bcm/aesv8-gcm-armv8-win.S b/gen/bcm/aesv8-gcm-armv8-win.S
new file mode 100644
index 0000000..1233796
--- /dev/null
+++ b/gen/bcm/aesv8-gcm-armv8-win.S
@@ -0,0 +1,1559 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+#include <openssl/arm_arch.h>
+#if __ARM_MAX_ARCH__ >= 8
+
+.arch	armv8-a+crypto
+.text
+.globl	aes_gcm_enc_kernel
+
+.def aes_gcm_enc_kernel
+   .type 32
+.endef
+.align	4
+aes_gcm_enc_kernel:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29, x30, [sp, #-128]!
+	mov	x29, sp
+	stp	x19, x20, [sp, #16]
+	mov	x16, x4
+	mov	x8, x5
+	stp	x21, x22, [sp, #32]
+	stp	x23, x24, [sp, #48]
+	stp	d8, d9, [sp, #64]
+	stp	d10, d11, [sp, #80]
+	stp	d12, d13, [sp, #96]
+	stp	d14, d15, [sp, #112]
+	ldr	w17, [x8, #240]
+	add	x19, x8, x17, lsl #4                   // borrow input_l1 for last key
+	ldp	x13, x14, [x19]                       // load round N keys
+	ldr	q31, [x19, #-16]                        // load round N-1 keys
+	add	x4, x0, x1, lsr #3   // end_input_ptr
+	lsr	x5, x1, #3              // byte_len
+	mov	x15, x5
+	ldp	x10, x11, [x16]              // ctr96_b64, ctr96_t32
+	ld1	{ v0.16b}, [x16]                             // special case vector load initial counter so we can start first AES block as quickly as possible
+	sub	x5, x5, #1      // byte_len - 1
+	ldr	q18, [x8, #0]                                  // load rk0
+	and	x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+	ldr	q25, [x8, #112]                                // load rk7
+	add	x5, x5, x0
+	lsr	x12, x11, #32
+	fmov	d2, x10                               // CTR block 2
+	orr	w11, w11, w11
+	rev	w12, w12                                // rev_ctr32
+	fmov	d1, x10                               // CTR block 1
+	aese	v0.16b, v18.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 0
+	add	w12, w12, #1                            // increment rev_ctr32
+	rev	w9, w12                                 // CTR block 1
+	fmov	d3, x10                               // CTR block 3
+	orr	x9, x11, x9, lsl #32            // CTR block 1
+	add	w12, w12, #1                            // CTR block 1
+	ldr	q19, [x8, #16]                                 // load rk1
+	fmov	v1.d[1], x9                               // CTR block 1
+	rev	w9, w12                                 // CTR block 2
+	add	w12, w12, #1                            // CTR block 2
+	orr	x9, x11, x9, lsl #32            // CTR block 2
+	ldr	q20, [x8, #32]                                 // load rk2
+	fmov	v2.d[1], x9                               // CTR block 2
+	rev	w9, w12                                 // CTR block 3
+	aese	v0.16b, v19.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 1
+	orr	x9, x11, x9, lsl #32            // CTR block 3
+	fmov	v3.d[1], x9                               // CTR block 3
+	aese	v1.16b, v18.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 0
+	ldr	q21, [x8, #48]                                 // load rk3
+	aese	v0.16b, v20.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 2
+	ldr	q24, [x8, #96]                                 // load rk6
+	aese	v2.16b, v18.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 0
+	ldr	q23, [x8, #80]                                 // load rk5
+	aese	v1.16b, v19.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 1
+	ldr	q14, [x6, #48]                              // load h3l | h3h
+	ext	v14.16b, v14.16b, v14.16b, #8
+	aese	v3.16b, v18.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 0
+	aese	v2.16b, v19.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 1
+	ldr	q22, [x8, #64]                                 // load rk4
+	aese	v1.16b, v20.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 2
+	ldr	q13, [x6, #32]                              // load h2l | h2h
+	ext	v13.16b, v13.16b, v13.16b, #8
+	aese	v3.16b, v19.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 1
+	ldr	q30, [x8, #192]                               // load rk12
+	aese	v2.16b, v20.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 2
+	ldr	q15, [x6, #80]                              // load h4l | h4h
+	ext	v15.16b, v15.16b, v15.16b, #8
+	aese	v1.16b, v21.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 3
+	ldr	q29, [x8, #176]                               // load rk11
+	aese	v3.16b, v20.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 2
+	ldr	q26, [x8, #128]                                // load rk8
+	aese	v2.16b, v21.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 3
+	add	w12, w12, #1                            // CTR block 3
+	aese	v0.16b, v21.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 3
+	aese	v3.16b, v21.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 3
+	ld1	{ v11.16b}, [x3]
+	ext	v11.16b, v11.16b, v11.16b, #8
+	rev64	v11.16b, v11.16b
+	aese	v2.16b, v22.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 4
+	aese	v0.16b, v22.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 4
+	aese	v1.16b, v22.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 4
+	aese	v3.16b, v22.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 4
+	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
+	aese	v0.16b, v23.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 5
+	aese	v1.16b, v23.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 5
+	aese	v3.16b, v23.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 5
+	aese	v2.16b, v23.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 5
+	aese	v1.16b, v24.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 6
+	trn2	v17.2d,  v14.2d,    v15.2d                      // h4l | h3l
+	aese	v3.16b, v24.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 6
+	ldr	q27, [x8, #144]                                // load rk9
+	aese	v0.16b, v24.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 6
+	ldr	q12, [x6]                                   // load h1l | h1h
+	ext	v12.16b, v12.16b, v12.16b, #8
+	aese	v2.16b, v24.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 6
+	ldr	q28, [x8, #160]                               // load rk10
+	aese	v1.16b, v25.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 7
+	trn1	v9.2d, v14.2d,    v15.2d                      // h4h | h3h
+	aese	v0.16b, v25.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 7
+	aese	v2.16b, v25.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 7
+	aese	v3.16b, v25.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 7
+	trn2	v16.2d,  v12.2d,    v13.2d                      // h2l | h1l
+	aese	v1.16b, v26.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 8
+	aese	v2.16b, v26.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 8
+	aese	v3.16b, v26.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 8
+	aese	v0.16b, v26.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 8
+	b.lt	Lenc_finish_first_blocks                         // branch if AES-128
+
+	aese	v1.16b, v27.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 9
+	aese	v2.16b, v27.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 9
+	aese	v3.16b, v27.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 9
+	aese	v0.16b, v27.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 9
+	aese	v1.16b, v28.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 10
+	aese	v2.16b, v28.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 10
+	aese	v3.16b, v28.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 10
+	aese	v0.16b, v28.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 10
+	b.eq	Lenc_finish_first_blocks                         // branch if AES-192
+
+	aese	v1.16b, v29.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 11
+	aese	v2.16b, v29.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 11
+	aese	v0.16b, v29.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 11
+	aese	v3.16b, v29.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 11
+	aese	v1.16b, v30.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 12
+	aese	v2.16b, v30.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 12
+	aese	v0.16b, v30.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 12
+	aese	v3.16b, v30.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 12
+
+Lenc_finish_first_blocks:
+	cmp	x0, x5                   // check if we have <= 4 blocks
+	eor	v17.16b, v17.16b, v9.16b                  // h4k | h3k
+	aese	v2.16b, v31.16b                                    // AES block 2 - round N-1
+	trn1	v8.2d,    v12.2d,    v13.2d                      // h2h | h1h
+	aese	v1.16b, v31.16b                                    // AES block 1 - round N-1
+	aese	v0.16b, v31.16b                                    // AES block 0 - round N-1
+	aese	v3.16b, v31.16b                                    // AES block 3 - round N-1
+	eor	v16.16b, v16.16b, v8.16b                     // h2k | h1k
+	b.ge	Lenc_tail                                        // handle tail
+
+	ldp	x19, x20, [x0, #16]           // AES block 1 - load plaintext
+	rev	w9, w12                                 // CTR block 4
+	ldp	x6, x7, [x0, #0]            // AES block 0 - load plaintext
+	ldp	x23, x24, [x0, #48]           // AES block 3 - load plaintext
+	ldp	x21, x22, [x0, #32]           // AES block 2 - load plaintext
+	add	x0, x0, #64                       // AES input_ptr update
+	eor	x19, x19, x13                      // AES block 1 - round N low
+	eor	x20, x20, x14                      // AES block 1 - round N high
+	fmov	d5, x19                               // AES block 1 - mov low
+	eor	x6, x6, x13                      // AES block 0 - round N low
+	eor	x7, x7, x14                      // AES block 0 - round N high
+	eor	x24, x24, x14                      // AES block 3 - round N high
+	fmov	d4, x6                               // AES block 0 - mov low
+	cmp	x0, x5                   // check if we have <= 8 blocks
+	fmov	v4.d[1], x7                           // AES block 0 - mov high
+	eor	x23, x23, x13                      // AES block 3 - round N low
+	eor	x21, x21, x13                      // AES block 2 - round N low
+	fmov	v5.d[1], x20                           // AES block 1 - mov high
+	fmov	d6, x21                               // AES block 2 - mov low
+	add	w12, w12, #1                            // CTR block 4
+	orr	x9, x11, x9, lsl #32            // CTR block 4
+	fmov	d7, x23                               // AES block 3 - mov low
+	eor	x22, x22, x14                      // AES block 2 - round N high
+	fmov	v6.d[1], x22                           // AES block 2 - mov high
+	eor	v4.16b, v4.16b, v0.16b                          // AES block 0 - result
+	fmov	d0, x10                               // CTR block 4
+	fmov	v0.d[1], x9                               // CTR block 4
+	rev	w9, w12                                 // CTR block 5
+	add	w12, w12, #1                            // CTR block 5
+	eor	v5.16b, v5.16b, v1.16b                          // AES block 1 - result
+	fmov	d1, x10                               // CTR block 5
+	orr	x9, x11, x9, lsl #32            // CTR block 5
+	fmov	v1.d[1], x9                               // CTR block 5
+	rev	w9, w12                                 // CTR block 6
+	st1	{ v4.16b}, [x2], #16                     // AES block 0 - store result
+	fmov	v7.d[1], x24                           // AES block 3 - mov high
+	orr	x9, x11, x9, lsl #32            // CTR block 6
+	eor	v6.16b, v6.16b, v2.16b                          // AES block 2 - result
+	st1	{ v5.16b}, [x2], #16                     // AES block 1 - store result
+	add	w12, w12, #1                            // CTR block 6
+	fmov	d2, x10                               // CTR block 6
+	fmov	v2.d[1], x9                               // CTR block 6
+	st1	{ v6.16b}, [x2], #16                     // AES block 2 - store result
+	rev	w9, w12                                 // CTR block 7
+	orr	x9, x11, x9, lsl #32            // CTR block 7
+	eor	v7.16b, v7.16b, v3.16b                          // AES block 3 - result
+	st1	{ v7.16b}, [x2], #16                     // AES block 3 - store result
+	b.ge	Lenc_prepretail                                  // do prepretail
+
+Lenc_main_loop:	//	main loop start
+	aese	v0.16b, v18.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
+	rev64	v4.16b, v4.16b                                    // GHASH block 4k (only t0 is free)
+	aese	v1.16b, v18.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
+	fmov	d3, x10                               // CTR block 4k+3
+	aese	v2.16b, v18.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
+	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
+	aese	v0.16b, v19.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
+	fmov	v3.d[1], x9                               // CTR block 4k+3
+	aese	v1.16b, v19.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
+	ldp	x23, x24, [x0, #48]           // AES block 4k+7 - load plaintext
+	aese	v2.16b, v19.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
+	ldp	x21, x22, [x0, #32]           // AES block 4k+6 - load plaintext
+	aese	v0.16b, v20.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
+	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
+	aese	v1.16b, v20.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
+	aese	v3.16b, v18.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
+	eor	x23, x23, x13                      // AES block 4k+7 - round N low
+	aese	v0.16b, v21.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
+	mov	d10, v17.d[1]                               // GHASH block 4k - mid
+	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
+	eor	x22, x22, x14                      // AES block 4k+6 - round N high
+	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
+	aese	v3.16b, v19.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
+	rev64	v5.16b, v5.16b                                    // GHASH block 4k+1 (t0 and t1 free)
+	aese	v0.16b, v22.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
+	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
+	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
+	aese	v2.16b, v20.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
+	aese	v0.16b, v23.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
+	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3 (t0, t1, t2 and t3 free)
+	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
+	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
+	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2 (t0, t1, and t2 free)
+	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
+	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
+	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
+	aese	v1.16b, v21.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
+	aese	v3.16b, v20.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
+	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
+	aese	v2.16b, v21.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
+	aese	v1.16b, v22.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
+	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
+	aese	v3.16b, v21.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
+	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
+	aese	v2.16b, v22.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
+	aese	v0.16b, v24.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
+	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
+	aese	v3.16b, v22.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
+	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
+	aese	v0.16b, v25.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
+	aese	v3.16b, v23.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
+	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
+	aese	v1.16b, v23.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
+	aese	v0.16b, v26.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
+	aese	v2.16b, v23.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
+	aese	v1.16b, v24.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
+	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
+	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
+	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
+	aese	v1.16b, v25.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
+	pmull	v6.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
+	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
+	aese	v3.16b, v24.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
+	ldp	x19, x20, [x0, #16]           // AES block 4k+5 - load plaintext
+	aese	v1.16b, v26.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
+	mov	d4, v7.d[1]                                  // GHASH block 4k+3 - mid
+	aese	v2.16b, v24.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
+	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
+	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
+	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
+	eor	v4.8b, v4.8b, v7.8b                          // GHASH block 4k+3 - mid
+	aese	v2.16b, v25.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
+	eor	x19, x19, x13                      // AES block 4k+5 - round N low
+	aese	v2.16b, v26.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
+	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
+	aese	v3.16b, v25.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
+	eor	x21, x21, x13                      // AES block 4k+6 - round N low
+	aese	v3.16b, v26.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
+	movi	v8.8b, #0xc2
+	pmull	v4.1q, v4.1d, v16.1d                          // GHASH block 4k+3 - mid
+	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
+	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
+	fmov	d5, x19                               // AES block 4k+5 - mov low
+	ldp	x6, x7, [x0, #0]            // AES block 4k+4 - load plaintext
+	b.lt	Lenc_main_loop_continue                          // branch if AES-128
+
+	aese	v1.16b, v27.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
+	aese	v0.16b, v27.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
+	aese	v2.16b, v27.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
+	aese	v3.16b, v27.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
+	aese	v0.16b, v28.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
+	aese	v1.16b, v28.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
+	aese	v2.16b, v28.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
+	aese	v3.16b, v28.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
+	b.eq	Lenc_main_loop_continue                          // branch if AES-192
+
+	aese	v0.16b, v29.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
+	aese	v1.16b, v29.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
+	aese	v2.16b, v29.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
+	aese	v3.16b, v29.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
+	aese	v1.16b, v30.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
+	aese	v0.16b, v30.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
+	aese	v2.16b, v30.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12
+	aese	v3.16b, v30.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12
+
+Lenc_main_loop_continue:
+	shl	d8, d8, #56               // mod_constant
+	eor	v11.16b, v11.16b, v6.16b                         // GHASH block 4k+3 - low
+	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+3 - mid
+	add	w12, w12, #1                            // CTR block 4k+3
+	eor	v4.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
+	add	x0, x0, #64                       // AES input_ptr update
+	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
+	rev	w9, w12                                 // CTR block 4k+8
+	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
+	eor	x6, x6, x13                      // AES block 4k+4 - round N low
+	eor	v10.16b, v10.16b, v4.16b                         // MODULO - karatsuba tidy up
+	eor	x7, x7, x14                      // AES block 4k+4 - round N high
+	fmov	d4, x6                               // AES block 4k+4 - mov low
+	orr	x9, x11, x9, lsl #32            // CTR block 4k+8
+	eor	v7.16b, v9.16b, v7.16b                   // MODULO - fold into mid
+	eor	x20, x20, x14                      // AES block 4k+5 - round N high
+	eor	x24, x24, x14                      // AES block 4k+7 - round N high
+	add	w12, w12, #1                            // CTR block 4k+8
+	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
+	fmov	v4.d[1], x7                           // AES block 4k+4 - mov high
+	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
+	fmov	d7, x23                               // AES block 4k+7 - mov low
+	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
+	fmov	v5.d[1], x20                           // AES block 4k+5 - mov high
+	fmov	d6, x21                               // AES block 4k+6 - mov low
+	cmp	x0, x5                   // LOOP CONTROL
+	fmov	v6.d[1], x22                           // AES block 4k+6 - mov high
+	pmull	v9.1q, v10.1d, v8.1d            // MODULO - mid 64b align with low
+	eor	v4.16b, v4.16b, v0.16b                          // AES block 4k+4 - result
+	fmov	d0, x10                               // CTR block 4k+8
+	fmov	v0.d[1], x9                               // CTR block 4k+8
+	rev	w9, w12                                 // CTR block 4k+9
+	add	w12, w12, #1                            // CTR block 4k+9
+	eor	v5.16b, v5.16b, v1.16b                          // AES block 4k+5 - result
+	fmov	d1, x10                               // CTR block 4k+9
+	orr	x9, x11, x9, lsl #32            // CTR block 4k+9
+	fmov	v1.d[1], x9                               // CTR block 4k+9
+	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
+	rev	w9, w12                                 // CTR block 4k+10
+	st1	{ v4.16b}, [x2], #16                     // AES block 4k+4 - store result
+	orr	x9, x11, x9, lsl #32            // CTR block 4k+10
+	eor	v11.16b, v11.16b, v9.16b                         // MODULO - fold into low
+	fmov	v7.d[1], x24                           // AES block 4k+7 - mov high
+	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
+	st1	{ v5.16b}, [x2], #16                     // AES block 4k+5 - store result
+	add	w12, w12, #1                            // CTR block 4k+10
+	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
+	eor	v6.16b, v6.16b, v2.16b                          // AES block 4k+6 - result
+	fmov	d2, x10                               // CTR block 4k+10
+	st1	{ v6.16b}, [x2], #16                     // AES block 4k+6 - store result
+	fmov	v2.d[1], x9                               // CTR block 4k+10
+	rev	w9, w12                                 // CTR block 4k+11
+	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low
+	orr	x9, x11, x9, lsl #32            // CTR block 4k+11
+	eor	v7.16b, v7.16b, v3.16b                          // AES block 4k+7 - result
+	st1	{ v7.16b}, [x2], #16                     // AES block 4k+7 - store result
+	b.lt	Lenc_main_loop
+
+Lenc_prepretail:	//	PREPRETAIL
+	aese	v1.16b, v18.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
+	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2 (t0, t1, and t2 free)
+	aese	v2.16b, v18.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
+	fmov	d3, x10                               // CTR block 4k+3
+	aese	v0.16b, v18.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
+	rev64	v4.16b, v4.16b                                    // GHASH block 4k (only t0 is free)
+	fmov	v3.d[1], x9                               // CTR block 4k+3
+	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
+	aese	v2.16b, v19.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
+	aese	v0.16b, v19.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
+	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
+	rev64	v5.16b, v5.16b                                    // GHASH block 4k+1 (t0 and t1 free)
+	aese	v2.16b, v20.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
+	aese	v3.16b, v18.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
+	mov	d10, v17.d[1]                               // GHASH block 4k - mid
+	aese	v1.16b, v19.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
+	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
+	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
+	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
+	aese	v2.16b, v21.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
+	aese	v1.16b, v20.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
+	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
+	aese	v0.16b, v20.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
+	aese	v3.16b, v19.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
+	aese	v1.16b, v21.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
+	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
+	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
+	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
+	aese	v3.16b, v20.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
+	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
+	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
+	aese	v0.16b, v21.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
+	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
+	aese	v3.16b, v21.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
+	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
+	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
+	aese	v0.16b, v22.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
+	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3 (t0, t1, t2 and t3 free)
+	aese	v3.16b, v22.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
+	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
+	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
+	add	w12, w12, #1                            // CTR block 4k+3
+	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
+	aese	v3.16b, v23.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
+	aese	v2.16b, v22.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
+	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
+	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
+	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
+	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
+	aese	v2.16b, v23.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
+	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
+	mov	d4, v7.d[1]                                  // GHASH block 4k+3 - mid
+	aese	v1.16b, v22.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
+	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
+	eor	v4.8b, v4.8b, v7.8b                          // GHASH block 4k+3 - mid
+	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
+	aese	v1.16b, v23.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
+	pmull	v4.1q, v4.1d, v16.1d                          // GHASH block 4k+3 - mid
+	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
+	aese	v0.16b, v23.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
+	aese	v1.16b, v24.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
+	aese	v2.16b, v24.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
+	aese	v0.16b, v24.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
+	movi	v8.8b, #0xc2
+	aese	v3.16b, v24.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
+	aese	v1.16b, v25.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
+	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
+	aese	v0.16b, v25.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
+	aese	v3.16b, v25.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
+	shl	d8, d8, #56               // mod_constant
+	aese	v1.16b, v26.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
+	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+3 - mid
+	pmull	v6.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
+	aese	v3.16b, v26.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
+	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
+	aese	v0.16b, v26.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
+	eor	v11.16b, v11.16b, v6.16b                         // GHASH block 4k+3 - low
+	aese	v2.16b, v25.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
+	eor	v10.16b, v10.16b, v9.16b                         // karatsuba tidy up
+	aese	v2.16b, v26.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
+	pmull	v4.1q, v9.1d, v8.1d
+	ext	v9.16b, v9.16b, v9.16b, #8
+	eor	v10.16b, v10.16b, v11.16b
+	b.lt	Lenc_finish_prepretail                           // branch if AES-128
+
+	aese	v1.16b, v27.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
+	aese	v3.16b, v27.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
+	aese	v0.16b, v27.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
+	aese	v2.16b, v27.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
+	aese	v3.16b, v28.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
+	aese	v1.16b, v28.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
+	aese	v0.16b, v28.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
+	aese	v2.16b, v28.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
+	b.eq	Lenc_finish_prepretail                           // branch if AES-192
+
+	aese	v1.16b, v29.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
+	aese	v0.16b, v29.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
+	aese	v3.16b, v29.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
+	aese	v2.16b, v29.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
+	aese	v1.16b, v30.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
+	aese	v0.16b, v30.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
+	aese	v3.16b, v30.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12
+	aese	v2.16b, v30.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12
+
+Lenc_finish_prepretail:
+	eor	v10.16b, v10.16b, v4.16b
+	eor	v10.16b, v10.16b, v9.16b
+	pmull	v4.1q, v10.1d, v8.1d
+	ext	v10.16b, v10.16b, v10.16b, #8
+	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
+	eor	v11.16b, v11.16b, v4.16b
+	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
+	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
+	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
+	eor	v11.16b, v11.16b, v10.16b
+
+Lenc_tail:	//	TAIL
+	ext	v8.16b, v11.16b, v11.16b, #8                     // prepare final partial tag
+	sub	x5, x4, x0   // main_end_input_ptr is number of bytes left to process
+	ldp	x6, x7, [x0], #16           // AES block 4k+4 - load plaintext
+	eor	x6, x6, x13                      // AES block 4k+4 - round N low
+	eor	x7, x7, x14                      // AES block 4k+4 - round N high
+	cmp	x5, #48
+	fmov	d4, x6                               // AES block 4k+4 - mov low
+	fmov	v4.d[1], x7                           // AES block 4k+4 - mov high
+	eor	v5.16b, v4.16b, v0.16b                          // AES block 4k+4 - result
+	b.gt	Lenc_blocks_more_than_3
+	cmp	x5, #32
+	mov	v3.16b, v2.16b
+	movi	v11.8b, #0
+	movi	v9.8b, #0
+	sub	w12, w12, #1
+	mov	v2.16b, v1.16b
+	movi	v10.8b, #0
+	b.gt	Lenc_blocks_more_than_2
+	mov	v3.16b, v1.16b
+	sub	w12, w12, #1
+	cmp	x5, #16
+	b.gt	Lenc_blocks_more_than_1
+	sub	w12, w12, #1
+	b	Lenc_blocks_less_than_1
+Lenc_blocks_more_than_3:	//	blocks left >  3
+	st1	{ v5.16b}, [x2], #16                    // AES final-3 block  - store result
+	ldp	x6, x7, [x0], #16          // AES final-2 block - load input low & high
+	rev64	v4.16b, v5.16b                                   // GHASH final-3 block
+	eor	x6, x6, x13                     // AES final-2 block - round N low
+	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
+	eor	x7, x7, x14                     // AES final-2 block - round N high
+	mov	d22, v4.d[1]                                // GHASH final-3 block - mid
+	fmov	d5, x6                                // AES final-2 block - mov low
+	fmov	v5.d[1], x7                            // AES final-2 block - mov high
+	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-3 block - mid
+	movi	v8.8b, #0                                       // suppress further partial tag feed in
+	mov	d10, v17.d[1]                              // GHASH final-3 block - mid
+	pmull	v11.1q, v4.1d, v15.1d                      // GHASH final-3 block - low
+	pmull2	v9.1q, v4.2d, v15.2d                      // GHASH final-3 block - high
+	pmull	v10.1q, v22.1d, v10.1d                   // GHASH final-3 block - mid
+	eor	v5.16b, v5.16b, v1.16b                           // AES final-2 block - result
+Lenc_blocks_more_than_2:	//	blocks left >  2
+	st1	{ v5.16b}, [x2], #16                    // AES final-2 block - store result
+	ldp	x6, x7, [x0], #16          // AES final-1 block - load input low & high
+	rev64	v4.16b, v5.16b                                   // GHASH final-2 block
+	eor	x6, x6, x13                     // AES final-1 block - round N low
+	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
+	fmov	d5, x6                                // AES final-1 block - mov low
+	eor	x7, x7, x14                     // AES final-1 block - round N high
+	fmov	v5.d[1], x7                            // AES final-1 block - mov high
+	movi	v8.8b, #0                                       // suppress further partial tag feed in
+	pmull2	v20.1q, v4.2d, v14.2d                         // GHASH final-2 block - high
+	mov	d22, v4.d[1]                                // GHASH final-2 block - mid
+	pmull	v21.1q, v4.1d, v14.1d                         // GHASH final-2 block - low
+	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-2 block - mid
+	eor	v5.16b, v5.16b, v2.16b                           // AES final-1 block - result
+	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-2 block - high
+	pmull	v22.1q, v22.1d, v17.1d                     // GHASH final-2 block - mid
+	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-2 block - low
+	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-2 block - mid
+Lenc_blocks_more_than_1:	//	blocks left >  1
+	st1	{ v5.16b}, [x2], #16                    // AES final-1 block - store result
+	rev64	v4.16b, v5.16b                                   // GHASH final-1 block
+	ldp	x6, x7, [x0], #16          // AES final block - load input low & high
+	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
+	movi	v8.8b, #0                                       // suppress further partial tag feed in
+	eor	x6, x6, x13                     // AES final block - round N low
+	mov	d22, v4.d[1]                                // GHASH final-1 block - mid
+	pmull2	v20.1q, v4.2d, v13.2d                         // GHASH final-1 block - high
+	eor	x7, x7, x14                     // AES final block - round N high
+	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-1 block - mid
+	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-1 block - high
+	ins	v22.d[1], v22.d[0]                           // GHASH final-1 block - mid
+	fmov	d5, x6                                // AES final block - mov low
+	fmov	v5.d[1], x7                            // AES final block - mov high
+	pmull2	v22.1q, v22.2d, v16.2d                     // GHASH final-1 block - mid
+	pmull	v21.1q, v4.1d, v13.1d                         // GHASH final-1 block - low
+	eor	v5.16b, v5.16b, v3.16b                           // AES final block - result
+	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-1 block - mid
+	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-1 block - low
+Lenc_blocks_less_than_1:	//	blocks left <= 1
+	and	x1, x1, #127                   // bit_length %= 128
+	mvn	x13, xzr                                      // rkN_l = 0xffffffffffffffff
+	sub	x1, x1, #128                   // bit_length -= 128
+	neg	x1, x1                         // bit_length = 128 - #bits in input (in range [1,128])
+	ld1	{ v18.16b}, [x2]                           // load existing bytes where the possibly partial last block is to be stored
+	mvn	x14, xzr                                      // rkN_h = 0xffffffffffffffff
+	and	x1, x1, #127                   // bit_length %= 128
+	lsr	x14, x14, x1                      // rkN_h is mask for top 64b of last block
+	cmp	x1, #64
+	csel	x6, x13, x14, lt
+	csel	x7, x14, xzr, lt
+	fmov	d0, x6                                // ctr0b is mask for last block
+	fmov	v0.d[1], x7
+	and	v5.16b, v5.16b, v0.16b                           // possibly partial last block has zeroes in highest bits
+	rev64	v4.16b, v5.16b                                   // GHASH final block
+	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
+	bif	v5.16b, v18.16b, v0.16b                             // insert existing bytes in top end of result before storing
+	pmull2	v20.1q, v4.2d, v12.2d                         // GHASH final block - high
+	mov	d8, v4.d[1]                                 // GHASH final block - mid
+	rev	w9, w12
+	pmull	v21.1q, v4.1d, v12.1d                         // GHASH final block - low
+	eor	v9.16b, v9.16b, v20.16b                           // GHASH final block - high
+	eor	v8.8b, v8.8b, v4.8b                         // GHASH final block - mid
+	pmull	v8.1q, v8.1d, v16.1d                         // GHASH final block - mid
+	eor	v11.16b, v11.16b, v21.16b                           // GHASH final block - low
+	eor	v10.16b, v10.16b, v8.16b                        // GHASH final block - mid
+	movi	v8.8b, #0xc2
+	eor	v4.16b, v11.16b, v9.16b                        // MODULO - karatsuba tidy up
+	shl	d8, d8, #56              // mod_constant
+	eor	v10.16b, v10.16b, v4.16b                        // MODULO - karatsuba tidy up
+	pmull	v7.1q, v9.1d, v8.1d           // MODULO - top 64b align with mid
+	ext	v9.16b, v9.16b, v9.16b, #8                    // MODULO - other top alignment
+	eor	v10.16b, v10.16b, v7.16b                     // MODULO - fold into mid
+	eor	v10.16b, v10.16b, v9.16b                        // MODULO - fold into mid
+	pmull	v9.1q, v10.1d, v8.1d           // MODULO - mid 64b align with low
+	ext	v10.16b, v10.16b, v10.16b, #8                    // MODULO - other mid alignment
+	str	w9, [x16, #12]                         // store the updated counter
+	st1	{ v5.16b}, [x2]                         // store all 16B
+	eor	v11.16b, v11.16b, v9.16b                        // MODULO - fold into low
+	eor	v11.16b, v11.16b, v10.16b                        // MODULO - fold into low
+	ext	v11.16b, v11.16b, v11.16b, #8
+	rev64	v11.16b, v11.16b
+	mov	x0, x15
+	st1	{ v11.16b }, [x3]
+	ldp	x19, x20, [sp, #16]
+	ldp	x21, x22, [sp, #32]
+	ldp	x23, x24, [sp, #48]
+	ldp	d8, d9, [sp, #64]
+	ldp	d10, d11, [sp, #80]
+	ldp	d12, d13, [sp, #96]
+	ldp	d14, d15, [sp, #112]
+	ldp	x29, x30, [sp], #128
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.globl	aes_gcm_dec_kernel
+
+.def aes_gcm_dec_kernel
+   .type 32
+.endef
+.align	4
+aes_gcm_dec_kernel:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29, x30, [sp, #-128]!
+	mov	x29, sp
+	stp	x19, x20, [sp, #16]
+	mov	x16, x4
+	mov	x8, x5
+	stp	x21, x22, [sp, #32]
+	stp	x23, x24, [sp, #48]
+	stp	d8, d9, [sp, #64]
+	stp	d10, d11, [sp, #80]
+	stp	d12, d13, [sp, #96]
+	stp	d14, d15, [sp, #112]
+	ldr	w17, [x8, #240]
+	add	x19, x8, x17, lsl #4                   // borrow input_l1 for last key
+	ldp	x13, x14, [x19]                       // load round N keys
+	ldr	q31, [x19, #-16]                        // load round N-1 keys
+	lsr	x5, x1, #3              // byte_len
+	mov	x15, x5
+	ldp	x10, x11, [x16]              // ctr96_b64, ctr96_t32
+	ldr	q26, [x8, #128]                                // load rk8
+	sub	x5, x5, #1      // byte_len - 1
+	ldr	q25, [x8, #112]                                // load rk7
+	and	x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+	add	x4, x0, x1, lsr #3   // end_input_ptr
+	ldr	q24, [x8, #96]                                 // load rk6
+	lsr	x12, x11, #32
+	ldr	q23, [x8, #80]                                 // load rk5
+	orr	w11, w11, w11
+	ldr	q21, [x8, #48]                                 // load rk3
+	add	x5, x5, x0
+	rev	w12, w12                                // rev_ctr32
+	add	w12, w12, #1                            // increment rev_ctr32
+	fmov	d3, x10                               // CTR block 3
+	rev	w9, w12                                 // CTR block 1
+	add	w12, w12, #1                            // CTR block 1
+	fmov	d1, x10                               // CTR block 1
+	orr	x9, x11, x9, lsl #32            // CTR block 1
+	ld1	{ v0.16b}, [x16]                             // special case vector load initial counter so we can start first AES block as quickly as possible
+	fmov	v1.d[1], x9                               // CTR block 1
+	rev	w9, w12                                 // CTR block 2
+	add	w12, w12, #1                            // CTR block 2
+	fmov	d2, x10                               // CTR block 2
+	orr	x9, x11, x9, lsl #32            // CTR block 2
+	fmov	v2.d[1], x9                               // CTR block 2
+	rev	w9, w12                                 // CTR block 3
+	orr	x9, x11, x9, lsl #32            // CTR block 3
+	ldr	q18, [x8, #0]                                  // load rk0
+	fmov	v3.d[1], x9                               // CTR block 3
+	add	w12, w12, #1                            // CTR block 3
+	ldr	q22, [x8, #64]                                 // load rk4
+	ldr	q19, [x8, #16]                                 // load rk1
+	aese	v0.16b, v18.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 0
+	ldr	q14, [x6, #48]                              // load h3l | h3h
+	ext	v14.16b, v14.16b, v14.16b, #8
+	aese	v3.16b, v18.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 0
+	ldr	q15, [x6, #80]                              // load h4l | h4h
+	ext	v15.16b, v15.16b, v15.16b, #8
+	aese	v1.16b, v18.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 0
+	ldr	q13, [x6, #32]                              // load h2l | h2h
+	ext	v13.16b, v13.16b, v13.16b, #8
+	aese	v2.16b, v18.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 0
+	ldr	q20, [x8, #32]                                 // load rk2
+	aese	v0.16b, v19.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 1
+	aese	v1.16b, v19.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 1
+	ld1	{ v11.16b}, [x3]
+	ext	v11.16b, v11.16b, v11.16b, #8
+	rev64	v11.16b, v11.16b
+	aese	v2.16b, v19.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 1
+	ldr	q27, [x8, #144]                                // load rk9
+	aese	v3.16b, v19.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 1
+	ldr	q30, [x8, #192]                               // load rk12
+	aese	v0.16b, v20.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 2
+	ldr	q12, [x6]                                   // load h1l | h1h
+	ext	v12.16b, v12.16b, v12.16b, #8
+	aese	v2.16b, v20.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 2
+	ldr	q28, [x8, #160]                               // load rk10
+	aese	v3.16b, v20.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 2
+	aese	v0.16b, v21.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 3
+	aese	v1.16b, v20.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 2
+	aese	v3.16b, v21.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 3
+	aese	v0.16b, v22.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 4
+	aese	v2.16b, v21.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 3
+	aese	v1.16b, v21.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 3
+	aese	v3.16b, v22.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 4
+	aese	v2.16b, v22.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 4
+	aese	v1.16b, v22.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 4
+	aese	v3.16b, v23.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 5
+	aese	v0.16b, v23.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 5
+	aese	v1.16b, v23.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 5
+	aese	v2.16b, v23.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 5
+	aese	v0.16b, v24.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 6
+	aese	v3.16b, v24.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 6
+	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
+	aese	v1.16b, v24.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 6
+	aese	v2.16b, v24.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 6
+	aese	v0.16b, v25.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 7
+	aese	v1.16b, v25.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 7
+	aese	v3.16b, v25.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 7
+	aese	v0.16b, v26.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 8
+	aese	v2.16b, v25.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 7
+	aese	v3.16b, v26.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 8
+	aese	v1.16b, v26.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 8
+	ldr	q29, [x8, #176]                               // load rk11
+	aese	v2.16b, v26.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 8
+	b.lt	Ldec_finish_first_blocks                         // branch if AES-128
+
+	aese	v0.16b, v27.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 9
+	aese	v1.16b, v27.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 9
+	aese	v3.16b, v27.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 9
+	aese	v2.16b, v27.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 9
+	aese	v0.16b, v28.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 10
+	aese	v1.16b, v28.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 10
+	aese	v3.16b, v28.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 10
+	aese	v2.16b, v28.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 10
+	b.eq	Ldec_finish_first_blocks                         // branch if AES-192
+
+	aese	v0.16b, v29.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 11
+	aese	v3.16b, v29.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 11
+	aese	v1.16b, v29.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 11
+	aese	v2.16b, v29.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 11
+	aese	v1.16b, v30.16b
+	aesmc	v1.16b, v1.16b          // AES block 1 - round 12
+	aese	v0.16b, v30.16b
+	aesmc	v0.16b, v0.16b          // AES block 0 - round 12
+	aese	v2.16b, v30.16b
+	aesmc	v2.16b, v2.16b          // AES block 2 - round 12
+	aese	v3.16b, v30.16b
+	aesmc	v3.16b, v3.16b          // AES block 3 - round 12
+
+Ldec_finish_first_blocks:
+	cmp	x0, x5                   // check if we have <= 4 blocks
+	trn1	v9.2d, v14.2d,    v15.2d                      // h4h | h3h
+	trn2	v17.2d,  v14.2d,    v15.2d                      // h4l | h3l
+	trn1	v8.2d,    v12.2d,    v13.2d                      // h2h | h1h
+	trn2	v16.2d,  v12.2d,    v13.2d                      // h2l | h1l
+	eor	v17.16b, v17.16b, v9.16b                  // h4k | h3k
+	aese	v1.16b, v31.16b                                    // AES block 1 - round N-1
+	aese	v2.16b, v31.16b                                    // AES block 2 - round N-1
+	eor	v16.16b, v16.16b, v8.16b                     // h2k | h1k
+	aese	v3.16b, v31.16b                                    // AES block 3 - round N-1
+	aese	v0.16b, v31.16b                                    // AES block 0 - round N-1
+	b.ge	Ldec_tail                                        // handle tail
+
+	ldr	q4, [x0, #0]                          // AES block 0 - load ciphertext
+	ldr	q5, [x0, #16]                         // AES block 1 - load ciphertext
+	rev	w9, w12                                 // CTR block 4
+	eor	v0.16b, v4.16b, v0.16b                            // AES block 0 - result
+	eor	v1.16b, v5.16b, v1.16b                            // AES block 1 - result
+	rev64	v5.16b, v5.16b                                    // GHASH block 1
+	ldr	q7, [x0, #48]                         // AES block 3 - load ciphertext
+	mov	x7, v0.d[1]                            // AES block 0 - mov high
+	mov	x6, v0.d[0]                            // AES block 0 - mov low
+	rev64	v4.16b, v4.16b                                    // GHASH block 0
+	add	w12, w12, #1                            // CTR block 4
+	fmov	d0, x10                               // CTR block 4
+	orr	x9, x11, x9, lsl #32            // CTR block 4
+	fmov	v0.d[1], x9                               // CTR block 4
+	rev	w9, w12                                 // CTR block 5
+	add	w12, w12, #1                            // CTR block 5
+	mov	x19, v1.d[0]                            // AES block 1 - mov low
+	orr	x9, x11, x9, lsl #32            // CTR block 5
+	mov	x20, v1.d[1]                            // AES block 1 - mov high
+	eor	x7, x7, x14                    // AES block 0 - round N high
+	eor	x6, x6, x13                    // AES block 0 - round N low
+	stp	x6, x7, [x2], #16        // AES block 0 - store result
+	fmov	d1, x10                               // CTR block 5
+	ldr	q6, [x0, #32]                         // AES block 2 - load ciphertext
+	add	x0, x0, #64                       // AES input_ptr update
+	fmov	v1.d[1], x9                               // CTR block 5
+	rev	w9, w12                                 // CTR block 6
+	add	w12, w12, #1                            // CTR block 6
+	eor	x19, x19, x13                    // AES block 1 - round N low
+	orr	x9, x11, x9, lsl #32            // CTR block 6
+	eor	x20, x20, x14                    // AES block 1 - round N high
+	stp	x19, x20, [x2], #16        // AES block 1 - store result
+	eor	v2.16b, v6.16b, v2.16b                            // AES block 2 - result
+	cmp	x0, x5                   // check if we have <= 8 blocks
+	b.ge	Ldec_prepretail                                  // do prepretail
+
+Ldec_main_loop:	//	main loop start
+	mov	x21, v2.d[0]                            // AES block 4k+2 - mov low
+	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
+	eor	v3.16b, v7.16b, v3.16b                            // AES block 4k+3 - result
+	aese	v0.16b, v18.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
+	mov	x22, v2.d[1]                            // AES block 4k+2 - mov high
+	aese	v1.16b, v18.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
+	fmov	d2, x10                               // CTR block 4k+6
+	fmov	v2.d[1], x9                               // CTR block 4k+6
+	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
+	rev	w9, w12                                 // CTR block 4k+7
+	aese	v0.16b, v19.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
+	mov	x24, v3.d[1]                            // AES block 4k+3 - mov high
+	aese	v1.16b, v19.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
+	mov	x23, v3.d[0]                            // AES block 4k+3 - mov low
+	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
+	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
+	fmov	d3, x10                               // CTR block 4k+7
+	aese	v0.16b, v20.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
+	orr	x9, x11, x9, lsl #32            // CTR block 4k+7
+	aese	v2.16b, v18.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
+	fmov	v3.d[1], x9                               // CTR block 4k+7
+	aese	v1.16b, v20.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
+	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
+	aese	v0.16b, v21.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
+	eor	x22, x22, x14                    // AES block 4k+2 - round N high
+	aese	v2.16b, v19.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
+	mov	d10, v17.d[1]                               // GHASH block 4k - mid
+	aese	v1.16b, v21.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
+	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2
+	aese	v3.16b, v18.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
+	eor	x21, x21, x13                    // AES block 4k+2 - round N low
+	aese	v2.16b, v20.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
+	stp	x21, x22, [x2], #16        // AES block 4k+2 - store result
+	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
+	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
+	aese	v2.16b, v21.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
+	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3
+	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
+	eor	x23, x23, x13                    // AES block 4k+3 - round N low
+	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
+	eor	x24, x24, x14                    // AES block 4k+3 - round N high
+	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
+	aese	v2.16b, v22.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
+	aese	v3.16b, v19.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
+	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
+	aese	v0.16b, v22.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
+	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
+	aese	v2.16b, v23.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
+	add	w12, w12, #1                            // CTR block 4k+7
+	aese	v3.16b, v20.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
+	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
+	aese	v1.16b, v22.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
+	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
+	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
+	aese	v3.16b, v21.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
+	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
+	aese	v1.16b, v23.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
+	aese	v0.16b, v23.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
+	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
+	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
+	rev	w9, w12                                 // CTR block 4k+8
+	aese	v1.16b, v24.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
+	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
+	aese	v0.16b, v24.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
+	add	w12, w12, #1                            // CTR block 4k+8
+	aese	v3.16b, v22.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
+	aese	v1.16b, v25.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
+	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
+	aese	v0.16b, v25.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
+	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
+	mov	d6, v7.d[1]                                  // GHASH block 4k+3 - mid
+	aese	v3.16b, v23.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
+	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
+	aese	v0.16b, v26.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
+	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
+	aese	v3.16b, v24.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
+	pmull	v4.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
+	orr	x9, x11, x9, lsl #32            // CTR block 4k+8
+	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
+	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
+	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
+	eor	v6.8b, v6.8b, v7.8b                          // GHASH block 4k+3 - mid
+	aese	v1.16b, v26.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
+	aese	v2.16b, v24.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
+	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
+	pmull	v6.1q, v6.1d, v16.1d                          // GHASH block 4k+3 - mid
+	movi	v8.8b, #0xc2
+	aese	v2.16b, v25.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
+	eor	v11.16b, v11.16b, v4.16b                         // GHASH block 4k+3 - low
+	aese	v3.16b, v25.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
+	shl	d8, d8, #56               // mod_constant
+	aese	v2.16b, v26.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
+	eor	v10.16b, v10.16b, v6.16b                         // GHASH block 4k+3 - mid
+	aese	v3.16b, v26.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
+	b.lt	Ldec_main_loop_continue                          // branch if AES-128
+
+	aese	v0.16b, v27.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
+	aese	v2.16b, v27.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
+	aese	v1.16b, v27.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
+	aese	v3.16b, v27.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
+	aese	v0.16b, v28.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
+	aese	v1.16b, v28.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
+	aese	v2.16b, v28.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
+	aese	v3.16b, v28.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
+	b.eq	Ldec_main_loop_continue                          // branch if AES-192
+
+	aese	v0.16b, v29.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
+	aese	v1.16b, v29.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
+	aese	v2.16b, v29.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
+	aese	v3.16b, v29.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
+	aese	v0.16b, v30.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
+	aese	v1.16b, v30.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
+	aese	v2.16b, v30.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12
+	aese	v3.16b, v30.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12
+
+Ldec_main_loop_continue:
+	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
+	eor	v6.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
+	ldr	q4, [x0, #0]                          // AES block 4k+4 - load ciphertext
+	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
+	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
+	eor	v10.16b, v10.16b, v6.16b                         // MODULO - karatsuba tidy up
+	ldr	q5, [x0, #16]                         // AES block 4k+5 - load ciphertext
+	eor	v0.16b, v4.16b, v0.16b                            // AES block 4k+4 - result
+	stp	x23, x24, [x2], #16        // AES block 4k+3 - store result
+	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
+	ldr	q7, [x0, #48]                         // AES block 4k+7 - load ciphertext
+	ldr	q6, [x0, #32]                         // AES block 4k+6 - load ciphertext
+	mov	x7, v0.d[1]                            // AES block 4k+4 - mov high
+	eor	v10.16b, v10.16b, v9.16b                         // MODULO - fold into mid
+	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
+	add	x0, x0, #64                       // AES input_ptr update
+	mov	x6, v0.d[0]                            // AES block 4k+4 - mov low
+	fmov	d0, x10                               // CTR block 4k+8
+	fmov	v0.d[1], x9                               // CTR block 4k+8
+	pmull	v8.1q, v10.1d, v8.1d     // MODULO - mid 64b align with low
+	eor	v1.16b, v5.16b, v1.16b                            // AES block 4k+5 - result
+	rev	w9, w12                                 // CTR block 4k+9
+	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
+	orr	x9, x11, x9, lsl #32            // CTR block 4k+9
+	cmp	x0, x5                   // LOOP CONTROL
+	add	w12, w12, #1                            // CTR block 4k+9
+	eor	x6, x6, x13                    // AES block 4k+4 - round N low
+	eor	x7, x7, x14                    // AES block 4k+4 - round N high
+	mov	x20, v1.d[1]                            // AES block 4k+5 - mov high
+	eor	v2.16b, v6.16b, v2.16b                            // AES block 4k+6 - result
+	eor	v11.16b, v11.16b, v8.16b               // MODULO - fold into low
+	mov	x19, v1.d[0]                            // AES block 4k+5 - mov low
+	fmov	d1, x10                               // CTR block 4k+9
+	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
+	fmov	v1.d[1], x9                               // CTR block 4k+9
+	rev	w9, w12                                 // CTR block 4k+10
+	add	w12, w12, #1                            // CTR block 4k+10
+	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
+	orr	x9, x11, x9, lsl #32            // CTR block 4k+10
+	rev64	v5.16b, v5.16b                                    // GHASH block 4k+5
+	eor	x20, x20, x14                    // AES block 4k+5 - round N high
+	stp	x6, x7, [x2], #16        // AES block 4k+4 - store result
+	eor	x19, x19, x13                    // AES block 4k+5 - round N low
+	stp	x19, x20, [x2], #16        // AES block 4k+5 - store result
+	rev64	v4.16b, v4.16b                                    // GHASH block 4k+4
+	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low
+	b.lt	Ldec_main_loop
+
+Ldec_prepretail:	//	PREPRETAIL
+	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
+	mov	x21, v2.d[0]                            // AES block 4k+2 - mov low
+	eor	v3.16b, v7.16b, v3.16b                            // AES block 4k+3 - result
+	aese	v0.16b, v18.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
+	mov	x22, v2.d[1]                            // AES block 4k+2 - mov high
+	aese	v1.16b, v18.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
+	fmov	d2, x10                               // CTR block 4k+6
+	fmov	v2.d[1], x9                               // CTR block 4k+6
+	rev	w9, w12                                 // CTR block 4k+7
+	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
+	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2
+	orr	x9, x11, x9, lsl #32            // CTR block 4k+7
+	mov	x23, v3.d[0]                            // AES block 4k+3 - mov low
+	aese	v1.16b, v19.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
+	mov	x24, v3.d[1]                            // AES block 4k+3 - mov high
+	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
+	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
+	fmov	d3, x10                               // CTR block 4k+7
+	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
+	fmov	v3.d[1], x9                               // CTR block 4k+7
+	aese	v2.16b, v18.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
+	mov	d10, v17.d[1]                               // GHASH block 4k - mid
+	aese	v0.16b, v19.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
+	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
+	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
+	aese	v2.16b, v19.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
+	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3
+	aese	v3.16b, v18.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
+	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
+	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
+	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
+	aese	v3.16b, v19.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
+	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
+	aese	v0.16b, v20.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
+	aese	v1.16b, v20.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
+	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
+	aese	v2.16b, v20.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
+	aese	v0.16b, v21.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
+	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
+	aese	v3.16b, v20.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
+	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
+	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
+	aese	v0.16b, v22.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
+	aese	v3.16b, v21.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
+	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
+	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
+	aese	v0.16b, v23.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
+	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
+	aese	v3.16b, v22.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
+	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
+	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
+	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
+	aese	v3.16b, v23.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
+	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
+	aese	v2.16b, v21.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
+	aese	v1.16b, v21.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
+	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
+	pmull	v4.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
+	aese	v2.16b, v22.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
+	mov	d6, v7.d[1]                                  // GHASH block 4k+3 - mid
+	aese	v1.16b, v22.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
+	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
+	aese	v2.16b, v23.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
+	eor	v6.8b, v6.8b, v7.8b                          // GHASH block 4k+3 - mid
+	aese	v1.16b, v23.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
+	aese	v3.16b, v24.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
+	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
+	aese	v2.16b, v24.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
+	aese	v0.16b, v24.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
+	movi	v8.8b, #0xc2
+	aese	v1.16b, v24.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
+	eor	v11.16b, v11.16b, v4.16b                         // GHASH block 4k+3 - low
+	pmull	v6.1q, v6.1d, v16.1d                          // GHASH block 4k+3 - mid
+	aese	v3.16b, v25.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
+	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
+	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
+	aese	v1.16b, v25.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
+	aese	v0.16b, v25.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
+	eor	v10.16b, v10.16b, v6.16b                         // GHASH block 4k+3 - mid
+	aese	v3.16b, v26.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
+	aese	v2.16b, v25.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
+	eor	v6.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
+	aese	v1.16b, v26.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
+	aese	v0.16b, v26.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
+	shl	d8, d8, #56               // mod_constant
+	aese	v2.16b, v26.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
+	b.lt	Ldec_finish_prepretail                           // branch if AES-128
+
+	aese	v1.16b, v27.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
+	aese	v2.16b, v27.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
+	aese	v3.16b, v27.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
+	aese	v0.16b, v27.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
+	aese	v2.16b, v28.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
+	aese	v3.16b, v28.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
+	aese	v0.16b, v28.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
+	aese	v1.16b, v28.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
+	b.eq	Ldec_finish_prepretail                           // branch if AES-192
+
+	aese	v2.16b, v29.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
+	aese	v0.16b, v29.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
+	aese	v1.16b, v29.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
+	aese	v2.16b, v30.16b
+	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12
+	aese	v3.16b, v29.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
+	aese	v1.16b, v30.16b
+	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
+	aese	v0.16b, v30.16b
+	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
+	aese	v3.16b, v30.16b
+	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12
+
+Ldec_finish_prepretail:
+	eor	v10.16b, v10.16b, v6.16b                         // MODULO - karatsuba tidy up
+	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
+	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
+	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
+	eor	x22, x22, x14                    // AES block 4k+2 - round N high
+	eor	x23, x23, x13                    // AES block 4k+3 - round N low
+	eor	v10.16b, v10.16b, v9.16b                         // MODULO - fold into mid
+	add	w12, w12, #1                            // CTR block 4k+7
+	eor	x21, x21, x13                    // AES block 4k+2 - round N low
+	pmull	v8.1q, v10.1d, v8.1d     // MODULO - mid 64b align with low
+	eor	x24, x24, x14                    // AES block 4k+3 - round N high
+	stp	x21, x22, [x2], #16        // AES block 4k+2 - store result
+	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
+	stp	x23, x24, [x2], #16        // AES block 4k+3 - store result
+
+	eor	v11.16b, v11.16b, v8.16b               // MODULO - fold into low
+	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
+	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
+	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
+	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
+	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low
+
+Ldec_tail:	//	TAIL
+	sub	x5, x4, x0   // main_end_input_ptr is number of bytes left to process
+	ld1	{ v5.16b}, [x0], #16                      // AES block 4k+4 - load ciphertext
+	eor	v0.16b, v5.16b, v0.16b                            // AES block 4k+4 - result
+	mov	x6, v0.d[0]                            // AES block 4k+4 - mov low
+	mov	x7, v0.d[1]                            // AES block 4k+4 - mov high
+	ext	v8.16b, v11.16b, v11.16b, #8                     // prepare final partial tag
+	cmp	x5, #48
+	eor	x6, x6, x13                    // AES block 4k+4 - round N low
+	eor	x7, x7, x14                    // AES block 4k+4 - round N high
+	b.gt	Ldec_blocks_more_than_3
+	sub	w12, w12, #1
+	mov	v3.16b, v2.16b
+	movi	v10.8b, #0
+	movi	v11.8b, #0
+	cmp	x5, #32
+	movi	v9.8b, #0
+	mov	v2.16b, v1.16b
+	b.gt	Ldec_blocks_more_than_2
+	sub	w12, w12, #1
+	mov	v3.16b, v1.16b
+	cmp	x5, #16
+	b.gt	Ldec_blocks_more_than_1
+	sub	w12, w12, #1
+	b	Ldec_blocks_less_than_1
+Ldec_blocks_more_than_3:	//	blocks left >  3
+	rev64	v4.16b, v5.16b                                   // GHASH final-3 block
+	ld1	{ v5.16b}, [x0], #16                     // AES final-2 block - load ciphertext
+	stp	x6, x7, [x2], #16       // AES final-3 block  - store result
+	mov	d10, v17.d[1]                              // GHASH final-3 block - mid
+	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
+	eor	v0.16b, v5.16b, v1.16b                           // AES final-2 block - result
+	mov	d22, v4.d[1]                                // GHASH final-3 block - mid
+	mov	x6, v0.d[0]                           // AES final-2 block - mov low
+	mov	x7, v0.d[1]                           // AES final-2 block - mov high
+	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-3 block - mid
+	movi	v8.8b, #0                                       // suppress further partial tag feed in
+	pmull2	v9.1q, v4.2d, v15.2d                      // GHASH final-3 block - high
+	pmull	v10.1q, v22.1d, v10.1d                   // GHASH final-3 block - mid
+	eor	x6, x6, x13                   // AES final-2 block - round N low
+	pmull	v11.1q, v4.1d, v15.1d                      // GHASH final-3 block - low
+	eor	x7, x7, x14                   // AES final-2 block - round N high
+Ldec_blocks_more_than_2:	//	blocks left >  2
+	rev64	v4.16b, v5.16b                                   // GHASH final-2 block
+	ld1	{ v5.16b}, [x0], #16                     // AES final-1 block - load ciphertext
+	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
+	stp	x6, x7, [x2], #16       // AES final-2 block  - store result
+	eor	v0.16b, v5.16b, v2.16b                           // AES final-1 block - result
+	mov	d22, v4.d[1]                                // GHASH final-2 block - mid
+	pmull	v21.1q, v4.1d, v14.1d                         // GHASH final-2 block - low
+	pmull2	v20.1q, v4.2d, v14.2d                         // GHASH final-2 block - high
+	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-2 block - mid
+	mov	x6, v0.d[0]                           // AES final-1 block - mov low
+	mov	x7, v0.d[1]                           // AES final-1 block - mov high
+	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-2 block - low
+	movi	v8.8b, #0                                       // suppress further partial tag feed in
+	pmull	v22.1q, v22.1d, v17.1d                     // GHASH final-2 block - mid
+	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-2 block - high
+	eor	x6, x6, x13                   // AES final-1 block - round N low
+	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-2 block - mid
+	eor	x7, x7, x14                   // AES final-1 block - round N high
+Ldec_blocks_more_than_1:	//	blocks left >  1
+	stp	x6, x7, [x2], #16       // AES final-1 block  - store result
+	rev64	v4.16b, v5.16b                                   // GHASH final-1 block
+	ld1	{ v5.16b}, [x0], #16                     // AES final block - load ciphertext
+	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
+	movi	v8.8b, #0                                       // suppress further partial tag feed in
+	mov	d22, v4.d[1]                                // GHASH final-1 block - mid
+	eor	v0.16b, v5.16b, v3.16b                           // AES final block - result
+	pmull2	v20.1q, v4.2d, v13.2d                         // GHASH final-1 block - high
+	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-1 block - mid
+	pmull	v21.1q, v4.1d, v13.1d                         // GHASH final-1 block - low
+	mov	x6, v0.d[0]                           // AES final block - mov low
+	ins	v22.d[1], v22.d[0]                           // GHASH final-1 block - mid
+	mov	x7, v0.d[1]                           // AES final block - mov high
+	pmull2	v22.1q, v22.2d, v16.2d                     // GHASH final-1 block - mid
+	eor	x6, x6, x13                   // AES final block - round N low
+	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-1 block - low
+	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-1 block - high
+	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-1 block - mid
+	eor	x7, x7, x14                   // AES final block - round N high
+Ldec_blocks_less_than_1:	//	blocks left <= 1
+	and	x1, x1, #127                   // bit_length %= 128
+	mvn	x14, xzr                                      // rkN_h = 0xffffffffffffffff
+	sub	x1, x1, #128                   // bit_length -= 128
+	mvn	x13, xzr                                      // rkN_l = 0xffffffffffffffff
+	ldp	x4, x5, [x2] // load existing bytes we need to not overwrite
+	neg	x1, x1                         // bit_length = 128 - #bits in input (in range [1,128])
+	and	x1, x1, #127                   // bit_length %= 128
+	lsr	x14, x14, x1                      // rkN_h is mask for top 64b of last block
+	cmp	x1, #64
+	csel	x9, x13, x14, lt
+	csel	x10, x14, xzr, lt
+	fmov	d0, x9                                  // ctr0b is mask for last block
+	and	x6, x6, x9
+	mov	v0.d[1], x10
+	bic	x4, x4, x9          // mask out low existing bytes
+	rev	w9, w12
+	bic	x5, x5, x10      // mask out high existing bytes
+	orr	x6, x6, x4
+	and	x7, x7, x10
+	orr	x7, x7, x5
+	and	v5.16b, v5.16b, v0.16b                            // possibly partial last block has zeroes in highest bits
+	rev64	v4.16b, v5.16b                                    // GHASH final block
+	eor	v4.16b, v4.16b, v8.16b                           // feed in partial tag
+	pmull	v21.1q, v4.1d, v12.1d                          // GHASH final block - low
+	mov	d8, v4.d[1]                                  // GHASH final block - mid
+	eor	v8.8b, v8.8b, v4.8b                          // GHASH final block - mid
+	pmull2	v20.1q, v4.2d, v12.2d                          // GHASH final block - high
+	pmull	v8.1q, v8.1d, v16.1d                          // GHASH final block - mid
+	eor	v9.16b, v9.16b, v20.16b                            // GHASH final block - high
+	eor	v11.16b, v11.16b, v21.16b                            // GHASH final block - low
+	eor	v10.16b, v10.16b, v8.16b                         // GHASH final block - mid
+	movi	v8.8b, #0xc2
+	eor	v6.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
+	shl	d8, d8, #56               // mod_constant
+	eor	v10.16b, v10.16b, v6.16b                         // MODULO - karatsuba tidy up
+	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
+	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
+	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
+	eor	v10.16b, v10.16b, v9.16b                         // MODULO - fold into mid
+	pmull	v8.1q, v10.1d, v8.1d     // MODULO - mid 64b align with low
+	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
+	eor	v11.16b, v11.16b, v8.16b               // MODULO - fold into low
+	stp	x6, x7, [x2]
+	str	w9, [x16, #12]                          // store the updated counter
+	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low
+	ext	v11.16b, v11.16b, v11.16b, #8
+	rev64	v11.16b, v11.16b
+	mov	x0, x15
+	st1	{ v11.16b }, [x3]
+	ldp	x19, x20, [sp, #16]
+	ldp	x21, x22, [sp, #32]
+	ldp	x23, x24, [sp, #48]
+	ldp	d8, d9, [sp, #64]
+	ldp	d10, d11, [sp, #80]
+	ldp	d12, d13, [sp, #96]
+	ldp	d14, d15, [sp, #112]
+	ldp	x29, x30, [sp], #128
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+#endif
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/bcm/armv4-mont-linux.S b/gen/bcm/armv4-mont-linux.S
new file mode 100644
index 0000000..0b845b6
--- /dev/null
+++ b/gen/bcm/armv4-mont-linux.S
@@ -0,0 +1,939 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
+.arch	armv7-a
+
+.text
+#if defined(__thumb2__)
+.syntax	unified
+.thumb
+#else
+.code	32
+#endif
+
+.globl	bn_mul_mont_nohw
+.hidden	bn_mul_mont_nohw
+.type	bn_mul_mont_nohw,%function
+
+.align	5
+bn_mul_mont_nohw:
+	ldr	ip,[sp,#4]		@ load num
+	stmdb	sp!,{r0,r2}		@ sp points at argument block
+	cmp	ip,#2
+	mov	r0,ip			@ load num
+#ifdef	__thumb2__
+	ittt	lt
+#endif
+	movlt	r0,#0
+	addlt	sp,sp,#2*4
+	blt	.Labrt
+
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}		@ save 10 registers
+
+	mov	r0,r0,lsl#2		@ rescale r0 for byte count
+	sub	sp,sp,r0		@ alloca(4*num)
+	sub	sp,sp,#4		@ +extra dword
+	sub	r0,r0,#4		@ "num=num-1"
+	add	r4,r2,r0		@ &bp[num-1]
+
+	add	r0,sp,r0		@ r0 to point at &tp[num-1]
+	ldr	r8,[r0,#14*4]		@ &n0
+	ldr	r2,[r2]		@ bp[0]
+	ldr	r5,[r1],#4		@ ap[0],ap++
+	ldr	r6,[r3],#4		@ np[0],np++
+	ldr	r8,[r8]		@ *n0
+	str	r4,[r0,#15*4]		@ save &bp[num]
+
+	umull	r10,r11,r5,r2	@ ap[0]*bp[0]
+	str	r8,[r0,#14*4]		@ save n0 value
+	mul	r8,r10,r8		@ "tp[0]"*n0
+	mov	r12,#0
+	umlal	r10,r12,r6,r8	@ np[0]*n0+"t[0]"
+	mov	r4,sp
+
+.L1st:
+	ldr	r5,[r1],#4		@ ap[j],ap++
+	mov	r10,r11
+	ldr	r6,[r3],#4		@ np[j],np++
+	mov	r11,#0
+	umlal	r10,r11,r5,r2	@ ap[j]*bp[0]
+	mov	r14,#0
+	umlal	r12,r14,r6,r8	@ np[j]*n0
+	adds	r12,r12,r10
+	str	r12,[r4],#4		@ tp[j-1]=,tp++
+	adc	r12,r14,#0
+	cmp	r4,r0
+	bne	.L1st
+
+	adds	r12,r12,r11
+	ldr	r4,[r0,#13*4]		@ restore bp
+	mov	r14,#0
+	ldr	r8,[r0,#14*4]		@ restore n0
+	adc	r14,r14,#0
+	str	r12,[r0]		@ tp[num-1]=
+	mov	r7,sp
+	str	r14,[r0,#4]		@ tp[num]=
+
+.Louter:
+	sub	r7,r0,r7		@ "original" r0-1 value
+	sub	r1,r1,r7		@ "rewind" ap to &ap[1]
+	ldr	r2,[r4,#4]!		@ *(++bp)
+	sub	r3,r3,r7		@ "rewind" np to &np[1]
+	ldr	r5,[r1,#-4]		@ ap[0]
+	ldr	r10,[sp]		@ tp[0]
+	ldr	r6,[r3,#-4]		@ np[0]
+	ldr	r7,[sp,#4]		@ tp[1]
+
+	mov	r11,#0
+	umlal	r10,r11,r5,r2	@ ap[0]*bp[i]+tp[0]
+	str	r4,[r0,#13*4]		@ save bp
+	mul	r8,r10,r8
+	mov	r12,#0
+	umlal	r10,r12,r6,r8	@ np[0]*n0+"tp[0]"
+	mov	r4,sp
+
+.Linner:
+	ldr	r5,[r1],#4		@ ap[j],ap++
+	adds	r10,r11,r7		@ +=tp[j]
+	ldr	r6,[r3],#4		@ np[j],np++
+	mov	r11,#0
+	umlal	r10,r11,r5,r2	@ ap[j]*bp[i]
+	mov	r14,#0
+	umlal	r12,r14,r6,r8	@ np[j]*n0
+	adc	r11,r11,#0
+	ldr	r7,[r4,#8]		@ tp[j+1]
+	adds	r12,r12,r10
+	str	r12,[r4],#4		@ tp[j-1]=,tp++
+	adc	r12,r14,#0
+	cmp	r4,r0
+	bne	.Linner
+
+	adds	r12,r12,r11
+	mov	r14,#0
+	ldr	r4,[r0,#13*4]		@ restore bp
+	adc	r14,r14,#0
+	ldr	r8,[r0,#14*4]		@ restore n0
+	adds	r12,r12,r7
+	ldr	r7,[r0,#15*4]		@ restore &bp[num]
+	adc	r14,r14,#0
+	str	r12,[r0]		@ tp[num-1]=
+	str	r14,[r0,#4]		@ tp[num]=
+
+	cmp	r4,r7
+#ifdef	__thumb2__
+	itt	ne
+#endif
+	movne	r7,sp
+	bne	.Louter
+
+	ldr	r2,[r0,#12*4]		@ pull rp
+	mov	r5,sp
+	add	r0,r0,#4		@ r0 to point at &tp[num]
+	sub	r5,r0,r5		@ "original" num value
+	mov	r4,sp			@ "rewind" r4
+	mov	r1,r4			@ "borrow" r1
+	sub	r3,r3,r5		@ "rewind" r3 to &np[0]
+
+	subs	r7,r7,r7		@ "clear" carry flag
+.Lsub:	ldr	r7,[r4],#4
+	ldr	r6,[r3],#4
+	sbcs	r7,r7,r6		@ tp[j]-np[j]
+	str	r7,[r2],#4		@ rp[j]=
+	teq	r4,r0		@ preserve carry
+	bne	.Lsub
+	sbcs	r14,r14,#0		@ upmost carry
+	mov	r4,sp			@ "rewind" r4
+	sub	r2,r2,r5		@ "rewind" r2
+
+.Lcopy:	ldr	r7,[r4]		@ conditional copy
+	ldr	r5,[r2]
+	str	sp,[r4],#4		@ zap tp
+#ifdef	__thumb2__
+	it	cc
+#endif
+	movcc	r5,r7
+	str	r5,[r2],#4
+	teq	r4,r0		@ preserve carry
+	bne	.Lcopy
+
+	mov	sp,r0
+	add	sp,sp,#4		@ skip over tp[num+1]
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}		@ restore registers
+	add	sp,sp,#2*4		@ skip over {r0,r2}
+	mov	r0,#1
+.Labrt:
+#if __ARM_ARCH>=5
+	bx	lr				@ bx lr
+#else
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+.size	bn_mul_mont_nohw,.-bn_mul_mont_nohw
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.globl	bn_mul8x_mont_neon
+.hidden	bn_mul8x_mont_neon
+.type	bn_mul8x_mont_neon,%function
+.align	5
+bn_mul8x_mont_neon:
+	mov	ip,sp
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
+	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ ABI specification says so
+	ldmia	ip,{r4,r5}		@ load rest of parameter block
+	mov	ip,sp
+
+	cmp	r5,#8
+	bhi	.LNEON_8n
+
+	@ special case for r5==8, everything is in register bank...
+
+	vld1.32	{d28[0]}, [r2,:32]!
+	veor	d8,d8,d8
+	sub	r7,sp,r5,lsl#4
+	vld1.32	{d0,d1,d2,d3},  [r1]!		@ can't specify :32 :-(
+	and	r7,r7,#-64
+	vld1.32	{d30[0]}, [r4,:32]
+	mov	sp,r7			@ alloca
+	vzip.16	d28,d8
+
+	vmull.u32	q6,d28,d0[0]
+	vmull.u32	q7,d28,d0[1]
+	vmull.u32	q8,d28,d1[0]
+	vshl.i64	d29,d13,#16
+	vmull.u32	q9,d28,d1[1]
+
+	vadd.u64	d29,d29,d12
+	veor	d8,d8,d8
+	vmul.u32	d29,d29,d30
+
+	vmull.u32	q10,d28,d2[0]
+	vld1.32	{d4,d5,d6,d7}, [r3]!
+	vmull.u32	q11,d28,d2[1]
+	vmull.u32	q12,d28,d3[0]
+	vzip.16	d29,d8
+	vmull.u32	q13,d28,d3[1]
+
+	vmlal.u32	q6,d29,d4[0]
+	sub	r9,r5,#1
+	vmlal.u32	q7,d29,d4[1]
+	vmlal.u32	q8,d29,d5[0]
+	vmlal.u32	q9,d29,d5[1]
+
+	vmlal.u32	q10,d29,d6[0]
+	vmov	q5,q6
+	vmlal.u32	q11,d29,d6[1]
+	vmov	q6,q7
+	vmlal.u32	q12,d29,d7[0]
+	vmov	q7,q8
+	vmlal.u32	q13,d29,d7[1]
+	vmov	q8,q9
+	vmov	q9,q10
+	vshr.u64	d10,d10,#16
+	vmov	q10,q11
+	vmov	q11,q12
+	vadd.u64	d10,d10,d11
+	vmov	q12,q13
+	veor	q13,q13
+	vshr.u64	d10,d10,#16
+
+	b	.LNEON_outer8
+
+.align	4
+.LNEON_outer8:
+	vld1.32	{d28[0]}, [r2,:32]!
+	veor	d8,d8,d8
+	vzip.16	d28,d8
+	vadd.u64	d12,d12,d10
+
+	vmlal.u32	q6,d28,d0[0]
+	vmlal.u32	q7,d28,d0[1]
+	vmlal.u32	q8,d28,d1[0]
+	vshl.i64	d29,d13,#16
+	vmlal.u32	q9,d28,d1[1]
+
+	vadd.u64	d29,d29,d12
+	veor	d8,d8,d8
+	subs	r9,r9,#1
+	vmul.u32	d29,d29,d30
+
+	vmlal.u32	q10,d28,d2[0]
+	vmlal.u32	q11,d28,d2[1]
+	vmlal.u32	q12,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q13,d28,d3[1]
+
+	vmlal.u32	q6,d29,d4[0]
+	vmlal.u32	q7,d29,d4[1]
+	vmlal.u32	q8,d29,d5[0]
+	vmlal.u32	q9,d29,d5[1]
+
+	vmlal.u32	q10,d29,d6[0]
+	vmov	q5,q6
+	vmlal.u32	q11,d29,d6[1]
+	vmov	q6,q7
+	vmlal.u32	q12,d29,d7[0]
+	vmov	q7,q8
+	vmlal.u32	q13,d29,d7[1]
+	vmov	q8,q9
+	vmov	q9,q10
+	vshr.u64	d10,d10,#16
+	vmov	q10,q11
+	vmov	q11,q12
+	vadd.u64	d10,d10,d11
+	vmov	q12,q13
+	veor	q13,q13
+	vshr.u64	d10,d10,#16
+
+	bne	.LNEON_outer8
+
+	vadd.u64	d12,d12,d10
+	mov	r7,sp
+	vshr.u64	d10,d12,#16
+	mov	r8,r5
+	vadd.u64	d13,d13,d10
+	add	r6,sp,#96
+	vshr.u64	d10,d13,#16
+	vzip.16	d12,d13
+
+	b	.LNEON_tail_entry
+
+.align	4
+.LNEON_8n:
+	veor	q6,q6,q6
+	sub	r7,sp,#128
+	veor	q7,q7,q7
+	sub	r7,r7,r5,lsl#4
+	veor	q8,q8,q8
+	and	r7,r7,#-64
+	veor	q9,q9,q9
+	mov	sp,r7			@ alloca
+	veor	q10,q10,q10
+	add	r7,r7,#256
+	veor	q11,q11,q11
+	sub	r8,r5,#8
+	veor	q12,q12,q12
+	veor	q13,q13,q13
+
+.LNEON_8n_init:
+	vst1.64	{q6,q7},[r7,:256]!
+	subs	r8,r8,#8
+	vst1.64	{q8,q9},[r7,:256]!
+	vst1.64	{q10,q11},[r7,:256]!
+	vst1.64	{q12,q13},[r7,:256]!
+	bne	.LNEON_8n_init
+
+	add	r6,sp,#256
+	vld1.32	{d0,d1,d2,d3},[r1]!
+	add	r10,sp,#8
+	vld1.32	{d30[0]},[r4,:32]
+	mov	r9,r5
+	b	.LNEON_8n_outer
+
+.align	4
+.LNEON_8n_outer:
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	veor	d8,d8,d8
+	vzip.16	d28,d8
+	add	r7,sp,#128
+	vld1.32	{d4,d5,d6,d7},[r3]!
+
+	vmlal.u32	q6,d28,d0[0]
+	vmlal.u32	q7,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q8,d28,d1[0]
+	vshl.i64	d29,d13,#16
+	vmlal.u32	q9,d28,d1[1]
+	vadd.u64	d29,d29,d12
+	vmlal.u32	q10,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q11,d28,d2[1]
+	vst1.32	{d28},[sp,:64]		@ put aside smashed b[8*i+0]
+	vmlal.u32	q12,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q13,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q6,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q7,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q8,d29,d5[0]
+	vshr.u64	d12,d12,#16
+	vmlal.u32	q9,d29,d5[1]
+	vmlal.u32	q10,d29,d6[0]
+	vadd.u64	d12,d12,d13
+	vmlal.u32	q11,d29,d6[1]
+	vshr.u64	d12,d12,#16
+	vmlal.u32	q12,d29,d7[0]
+	vmlal.u32	q13,d29,d7[1]
+	vadd.u64	d14,d14,d12
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+0]
+	vmlal.u32	q7,d28,d0[0]
+	vld1.64	{q6},[r6,:128]!
+	vmlal.u32	q8,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q9,d28,d1[0]
+	vshl.i64	d29,d15,#16
+	vmlal.u32	q10,d28,d1[1]
+	vadd.u64	d29,d29,d14
+	vmlal.u32	q11,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q12,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+1]
+	vmlal.u32	q13,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q6,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q7,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q8,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q9,d29,d5[0]
+	vshr.u64	d14,d14,#16
+	vmlal.u32	q10,d29,d5[1]
+	vmlal.u32	q11,d29,d6[0]
+	vadd.u64	d14,d14,d15
+	vmlal.u32	q12,d29,d6[1]
+	vshr.u64	d14,d14,#16
+	vmlal.u32	q13,d29,d7[0]
+	vmlal.u32	q6,d29,d7[1]
+	vadd.u64	d16,d16,d14
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+1]
+	vmlal.u32	q8,d28,d0[0]
+	vld1.64	{q7},[r6,:128]!
+	vmlal.u32	q9,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q10,d28,d1[0]
+	vshl.i64	d29,d17,#16
+	vmlal.u32	q11,d28,d1[1]
+	vadd.u64	d29,d29,d16
+	vmlal.u32	q12,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q13,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+2]
+	vmlal.u32	q6,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q7,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q8,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q9,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q10,d29,d5[0]
+	vshr.u64	d16,d16,#16
+	vmlal.u32	q11,d29,d5[1]
+	vmlal.u32	q12,d29,d6[0]
+	vadd.u64	d16,d16,d17
+	vmlal.u32	q13,d29,d6[1]
+	vshr.u64	d16,d16,#16
+	vmlal.u32	q6,d29,d7[0]
+	vmlal.u32	q7,d29,d7[1]
+	vadd.u64	d18,d18,d16
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+2]
+	vmlal.u32	q9,d28,d0[0]
+	vld1.64	{q8},[r6,:128]!
+	vmlal.u32	q10,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q11,d28,d1[0]
+	vshl.i64	d29,d19,#16
+	vmlal.u32	q12,d28,d1[1]
+	vadd.u64	d29,d29,d18
+	vmlal.u32	q13,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q6,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+3]
+	vmlal.u32	q7,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q8,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q9,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q10,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q11,d29,d5[0]
+	vshr.u64	d18,d18,#16
+	vmlal.u32	q12,d29,d5[1]
+	vmlal.u32	q13,d29,d6[0]
+	vadd.u64	d18,d18,d19
+	vmlal.u32	q6,d29,d6[1]
+	vshr.u64	d18,d18,#16
+	vmlal.u32	q7,d29,d7[0]
+	vmlal.u32	q8,d29,d7[1]
+	vadd.u64	d20,d20,d18
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+3]
+	vmlal.u32	q10,d28,d0[0]
+	vld1.64	{q9},[r6,:128]!
+	vmlal.u32	q11,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q12,d28,d1[0]
+	vshl.i64	d29,d21,#16
+	vmlal.u32	q13,d28,d1[1]
+	vadd.u64	d29,d29,d20
+	vmlal.u32	q6,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q7,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+4]
+	vmlal.u32	q8,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q9,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q10,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q11,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q12,d29,d5[0]
+	vshr.u64	d20,d20,#16
+	vmlal.u32	q13,d29,d5[1]
+	vmlal.u32	q6,d29,d6[0]
+	vadd.u64	d20,d20,d21
+	vmlal.u32	q7,d29,d6[1]
+	vshr.u64	d20,d20,#16
+	vmlal.u32	q8,d29,d7[0]
+	vmlal.u32	q9,d29,d7[1]
+	vadd.u64	d22,d22,d20
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+4]
+	vmlal.u32	q11,d28,d0[0]
+	vld1.64	{q10},[r6,:128]!
+	vmlal.u32	q12,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q13,d28,d1[0]
+	vshl.i64	d29,d23,#16
+	vmlal.u32	q6,d28,d1[1]
+	vadd.u64	d29,d29,d22
+	vmlal.u32	q7,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q8,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+5]
+	vmlal.u32	q9,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q10,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q11,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q12,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q13,d29,d5[0]
+	vshr.u64	d22,d22,#16
+	vmlal.u32	q6,d29,d5[1]
+	vmlal.u32	q7,d29,d6[0]
+	vadd.u64	d22,d22,d23
+	vmlal.u32	q8,d29,d6[1]
+	vshr.u64	d22,d22,#16
+	vmlal.u32	q9,d29,d7[0]
+	vmlal.u32	q10,d29,d7[1]
+	vadd.u64	d24,d24,d22
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+5]
+	vmlal.u32	q12,d28,d0[0]
+	vld1.64	{q11},[r6,:128]!
+	vmlal.u32	q13,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q6,d28,d1[0]
+	vshl.i64	d29,d25,#16
+	vmlal.u32	q7,d28,d1[1]
+	vadd.u64	d29,d29,d24
+	vmlal.u32	q8,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q9,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+6]
+	vmlal.u32	q10,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q11,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q12,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q13,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q6,d29,d5[0]
+	vshr.u64	d24,d24,#16
+	vmlal.u32	q7,d29,d5[1]
+	vmlal.u32	q8,d29,d6[0]
+	vadd.u64	d24,d24,d25
+	vmlal.u32	q9,d29,d6[1]
+	vshr.u64	d24,d24,#16
+	vmlal.u32	q10,d29,d7[0]
+	vmlal.u32	q11,d29,d7[1]
+	vadd.u64	d26,d26,d24
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+6]
+	vmlal.u32	q13,d28,d0[0]
+	vld1.64	{q12},[r6,:128]!
+	vmlal.u32	q6,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q7,d28,d1[0]
+	vshl.i64	d29,d27,#16
+	vmlal.u32	q8,d28,d1[1]
+	vadd.u64	d29,d29,d26
+	vmlal.u32	q9,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q10,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+7]
+	vmlal.u32	q11,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q12,d28,d3[1]
+	vld1.32	{d28},[sp,:64]		@ pull smashed b[8*i+0]
+	vmlal.u32	q13,d29,d4[0]
+	vld1.32	{d0,d1,d2,d3},[r1]!
+	vmlal.u32	q6,d29,d4[1]
+	vmlal.u32	q7,d29,d5[0]
+	vshr.u64	d26,d26,#16
+	vmlal.u32	q8,d29,d5[1]
+	vmlal.u32	q9,d29,d6[0]
+	vadd.u64	d26,d26,d27
+	vmlal.u32	q10,d29,d6[1]
+	vshr.u64	d26,d26,#16
+	vmlal.u32	q11,d29,d7[0]
+	vmlal.u32	q12,d29,d7[1]
+	vadd.u64	d12,d12,d26
+	vst1.32	{d29},[r10,:64]	@ put aside smashed m[8*i+7]
+	add	r10,sp,#8		@ rewind
+	sub	r8,r5,#8
+	b	.LNEON_8n_inner
+
+.align	4
+.LNEON_8n_inner:
+	subs	r8,r8,#8
+	vmlal.u32	q6,d28,d0[0]
+	vld1.64	{q13},[r6,:128]
+	vmlal.u32	q7,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+0]
+	vmlal.u32	q8,d28,d1[0]
+	vld1.32	{d4,d5,d6,d7},[r3]!
+	vmlal.u32	q9,d28,d1[1]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q10,d28,d2[0]
+	vmlal.u32	q11,d28,d2[1]
+	vmlal.u32	q12,d28,d3[0]
+	vmlal.u32	q13,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+1]
+	vmlal.u32	q6,d29,d4[0]
+	vmlal.u32	q7,d29,d4[1]
+	vmlal.u32	q8,d29,d5[0]
+	vmlal.u32	q9,d29,d5[1]
+	vmlal.u32	q10,d29,d6[0]
+	vmlal.u32	q11,d29,d6[1]
+	vmlal.u32	q12,d29,d7[0]
+	vmlal.u32	q13,d29,d7[1]
+	vst1.64	{q6},[r7,:128]!
+	vmlal.u32	q7,d28,d0[0]
+	vld1.64	{q6},[r6,:128]
+	vmlal.u32	q8,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+1]
+	vmlal.u32	q9,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q10,d28,d1[1]
+	vmlal.u32	q11,d28,d2[0]
+	vmlal.u32	q12,d28,d2[1]
+	vmlal.u32	q13,d28,d3[0]
+	vmlal.u32	q6,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+2]
+	vmlal.u32	q7,d29,d4[0]
+	vmlal.u32	q8,d29,d4[1]
+	vmlal.u32	q9,d29,d5[0]
+	vmlal.u32	q10,d29,d5[1]
+	vmlal.u32	q11,d29,d6[0]
+	vmlal.u32	q12,d29,d6[1]
+	vmlal.u32	q13,d29,d7[0]
+	vmlal.u32	q6,d29,d7[1]
+	vst1.64	{q7},[r7,:128]!
+	vmlal.u32	q8,d28,d0[0]
+	vld1.64	{q7},[r6,:128]
+	vmlal.u32	q9,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+2]
+	vmlal.u32	q10,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q11,d28,d1[1]
+	vmlal.u32	q12,d28,d2[0]
+	vmlal.u32	q13,d28,d2[1]
+	vmlal.u32	q6,d28,d3[0]
+	vmlal.u32	q7,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+3]
+	vmlal.u32	q8,d29,d4[0]
+	vmlal.u32	q9,d29,d4[1]
+	vmlal.u32	q10,d29,d5[0]
+	vmlal.u32	q11,d29,d5[1]
+	vmlal.u32	q12,d29,d6[0]
+	vmlal.u32	q13,d29,d6[1]
+	vmlal.u32	q6,d29,d7[0]
+	vmlal.u32	q7,d29,d7[1]
+	vst1.64	{q8},[r7,:128]!
+	vmlal.u32	q9,d28,d0[0]
+	vld1.64	{q8},[r6,:128]
+	vmlal.u32	q10,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+3]
+	vmlal.u32	q11,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q12,d28,d1[1]
+	vmlal.u32	q13,d28,d2[0]
+	vmlal.u32	q6,d28,d2[1]
+	vmlal.u32	q7,d28,d3[0]
+	vmlal.u32	q8,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+4]
+	vmlal.u32	q9,d29,d4[0]
+	vmlal.u32	q10,d29,d4[1]
+	vmlal.u32	q11,d29,d5[0]
+	vmlal.u32	q12,d29,d5[1]
+	vmlal.u32	q13,d29,d6[0]
+	vmlal.u32	q6,d29,d6[1]
+	vmlal.u32	q7,d29,d7[0]
+	vmlal.u32	q8,d29,d7[1]
+	vst1.64	{q9},[r7,:128]!
+	vmlal.u32	q10,d28,d0[0]
+	vld1.64	{q9},[r6,:128]
+	vmlal.u32	q11,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+4]
+	vmlal.u32	q12,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q13,d28,d1[1]
+	vmlal.u32	q6,d28,d2[0]
+	vmlal.u32	q7,d28,d2[1]
+	vmlal.u32	q8,d28,d3[0]
+	vmlal.u32	q9,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+5]
+	vmlal.u32	q10,d29,d4[0]
+	vmlal.u32	q11,d29,d4[1]
+	vmlal.u32	q12,d29,d5[0]
+	vmlal.u32	q13,d29,d5[1]
+	vmlal.u32	q6,d29,d6[0]
+	vmlal.u32	q7,d29,d6[1]
+	vmlal.u32	q8,d29,d7[0]
+	vmlal.u32	q9,d29,d7[1]
+	vst1.64	{q10},[r7,:128]!
+	vmlal.u32	q11,d28,d0[0]
+	vld1.64	{q10},[r6,:128]
+	vmlal.u32	q12,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+5]
+	vmlal.u32	q13,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q6,d28,d1[1]
+	vmlal.u32	q7,d28,d2[0]
+	vmlal.u32	q8,d28,d2[1]
+	vmlal.u32	q9,d28,d3[0]
+	vmlal.u32	q10,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+6]
+	vmlal.u32	q11,d29,d4[0]
+	vmlal.u32	q12,d29,d4[1]
+	vmlal.u32	q13,d29,d5[0]
+	vmlal.u32	q6,d29,d5[1]
+	vmlal.u32	q7,d29,d6[0]
+	vmlal.u32	q8,d29,d6[1]
+	vmlal.u32	q9,d29,d7[0]
+	vmlal.u32	q10,d29,d7[1]
+	vst1.64	{q11},[r7,:128]!
+	vmlal.u32	q12,d28,d0[0]
+	vld1.64	{q11},[r6,:128]
+	vmlal.u32	q13,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+6]
+	vmlal.u32	q6,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q7,d28,d1[1]
+	vmlal.u32	q8,d28,d2[0]
+	vmlal.u32	q9,d28,d2[1]
+	vmlal.u32	q10,d28,d3[0]
+	vmlal.u32	q11,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+7]
+	vmlal.u32	q12,d29,d4[0]
+	vmlal.u32	q13,d29,d4[1]
+	vmlal.u32	q6,d29,d5[0]
+	vmlal.u32	q7,d29,d5[1]
+	vmlal.u32	q8,d29,d6[0]
+	vmlal.u32	q9,d29,d6[1]
+	vmlal.u32	q10,d29,d7[0]
+	vmlal.u32	q11,d29,d7[1]
+	vst1.64	{q12},[r7,:128]!
+	vmlal.u32	q13,d28,d0[0]
+	vld1.64	{q12},[r6,:128]
+	vmlal.u32	q6,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+7]
+	vmlal.u32	q7,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q8,d28,d1[1]
+	vmlal.u32	q9,d28,d2[0]
+	vmlal.u32	q10,d28,d2[1]
+	vmlal.u32	q11,d28,d3[0]
+	vmlal.u32	q12,d28,d3[1]
+	it	eq
+	subeq	r1,r1,r5,lsl#2	@ rewind
+	vmlal.u32	q13,d29,d4[0]
+	vld1.32	{d28},[sp,:64]		@ pull smashed b[8*i+0]
+	vmlal.u32	q6,d29,d4[1]
+	vld1.32	{d0,d1,d2,d3},[r1]!
+	vmlal.u32	q7,d29,d5[0]
+	add	r10,sp,#8		@ rewind
+	vmlal.u32	q8,d29,d5[1]
+	vmlal.u32	q9,d29,d6[0]
+	vmlal.u32	q10,d29,d6[1]
+	vmlal.u32	q11,d29,d7[0]
+	vst1.64	{q13},[r7,:128]!
+	vmlal.u32	q12,d29,d7[1]
+
+	bne	.LNEON_8n_inner
+	add	r6,sp,#128
+	vst1.64	{q6,q7},[r7,:256]!
+	veor	q2,q2,q2		@ d4-d5
+	vst1.64	{q8,q9},[r7,:256]!
+	veor	q3,q3,q3		@ d6-d7
+	vst1.64	{q10,q11},[r7,:256]!
+	vst1.64	{q12},[r7,:128]
+
+	subs	r9,r9,#8
+	vld1.64	{q6,q7},[r6,:256]!
+	vld1.64	{q8,q9},[r6,:256]!
+	vld1.64	{q10,q11},[r6,:256]!
+	vld1.64	{q12,q13},[r6,:256]!
+
+	itt	ne
+	subne	r3,r3,r5,lsl#2	@ rewind
+	bne	.LNEON_8n_outer
+
+	add	r7,sp,#128
+	vst1.64	{q2,q3}, [sp,:256]!	@ start wiping stack frame
+	vshr.u64	d10,d12,#16
+	vst1.64	{q2,q3},[sp,:256]!
+	vadd.u64	d13,d13,d10
+	vst1.64	{q2,q3}, [sp,:256]!
+	vshr.u64	d10,d13,#16
+	vst1.64	{q2,q3}, [sp,:256]!
+	vzip.16	d12,d13
+
+	mov	r8,r5
+	b	.LNEON_tail_entry
+
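+@ Roughly: the accumulators hold the result as digits spaced 16 bits
+@ apart; the tail loop below propagates each 16-bit carry with
+@ vshr.u64/vadd.u64 and repacks pairs of digits into 32-bit output
+@ words with vzip.16.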
+.align	4
+.LNEON_tail:
+	vadd.u64	d12,d12,d10
+	vshr.u64	d10,d12,#16
+	vld1.64	{q8,q9}, [r6, :256]!
+	vadd.u64	d13,d13,d10
+	vld1.64	{q10,q11}, [r6, :256]!
+	vshr.u64	d10,d13,#16
+	vld1.64	{q12,q13}, [r6, :256]!
+	vzip.16	d12,d13
+
+.LNEON_tail_entry:
+	vadd.u64	d14,d14,d10
+	vst1.32	{d12[0]}, [r7, :32]!
+	vshr.u64	d10,d14,#16
+	vadd.u64	d15,d15,d10
+	vshr.u64	d10,d15,#16
+	vzip.16	d14,d15
+	vadd.u64	d16,d16,d10
+	vst1.32	{d14[0]}, [r7, :32]!
+	vshr.u64	d10,d16,#16
+	vadd.u64	d17,d17,d10
+	vshr.u64	d10,d17,#16
+	vzip.16	d16,d17
+	vadd.u64	d18,d18,d10
+	vst1.32	{d16[0]}, [r7, :32]!
+	vshr.u64	d10,d18,#16
+	vadd.u64	d19,d19,d10
+	vshr.u64	d10,d19,#16
+	vzip.16	d18,d19
+	vadd.u64	d20,d20,d10
+	vst1.32	{d18[0]}, [r7, :32]!
+	vshr.u64	d10,d20,#16
+	vadd.u64	d21,d21,d10
+	vshr.u64	d10,d21,#16
+	vzip.16	d20,d21
+	vadd.u64	d22,d22,d10
+	vst1.32	{d20[0]}, [r7, :32]!
+	vshr.u64	d10,d22,#16
+	vadd.u64	d23,d23,d10
+	vshr.u64	d10,d23,#16
+	vzip.16	d22,d23
+	vadd.u64	d24,d24,d10
+	vst1.32	{d22[0]}, [r7, :32]!
+	vshr.u64	d10,d24,#16
+	vadd.u64	d25,d25,d10
+	vshr.u64	d10,d25,#16
+	vzip.16	d24,d25
+	vadd.u64	d26,d26,d10
+	vst1.32	{d24[0]}, [r7, :32]!
+	vshr.u64	d10,d26,#16
+	vadd.u64	d27,d27,d10
+	vshr.u64	d10,d27,#16
+	vzip.16	d26,d27
+	vld1.64	{q6,q7}, [r6, :256]!
+	subs	r8,r8,#8
+	vst1.32	{d26[0]},   [r7, :32]!
+	bne	.LNEON_tail
+
+	vst1.32	{d10[0]}, [r7, :32]		@ top-most bit
+	sub	r3,r3,r5,lsl#2			@ rewind r3
+	subs	r1,sp,#0				@ clear carry flag
+	add	r2,sp,r5,lsl#2
+
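+@ Conditional subtraction of the modulus: .LNEON_sub computes tp - np
+@ into rp, then .LNEON_copy_n_zap keeps either tp or the difference,
+@ depending on the final borrow, wiping the stack frame as it goes.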
+.LNEON_sub:
+	ldmia	r1!, {r4,r5,r6,r7}
+	ldmia	r3!, {r8,r9,r10,r11}
+	sbcs	r8, r4,r8
+	sbcs	r9, r5,r9
+	sbcs	r10,r6,r10
+	sbcs	r11,r7,r11
+	teq	r1,r2				@ preserves carry
+	stmia	r0!, {r8,r9,r10,r11}
+	bne	.LNEON_sub
+
+	ldr	r10, [r1]				@ load top-most bit
+	mov	r11,sp
+	veor	q0,q0,q0
+	sub	r11,r2,r11				@ this is num*4
+	veor	q1,q1,q1
+	mov	r1,sp
+	sub	r0,r0,r11				@ rewind r0
+	mov	r3,r2				@ second 3/4th of frame
+	sbcs	r10,r10,#0				@ result is carry flag
+
+.LNEON_copy_n_zap:
+	ldmia	r1!, {r4,r5,r6,r7}
+	ldmia	r0,  {r8,r9,r10,r11}
+	it	cc
+	movcc	r8, r4
+	vst1.64	{q0,q1}, [r3,:256]!			@ wipe
+	itt	cc
+	movcc	r9, r5
+	movcc	r10,r6
+	vst1.64	{q0,q1}, [r3,:256]!			@ wipe
+	it	cc
+	movcc	r11,r7
+	ldmia	r1, {r4,r5,r6,r7}
+	stmia	r0!, {r8,r9,r10,r11}
+	sub	r1,r1,#16
+	ldmia	r0, {r8,r9,r10,r11}
+	it	cc
+	movcc	r8, r4
+	vst1.64	{q0,q1}, [r1,:256]!			@ wipe
+	itt	cc
+	movcc	r9, r5
+	movcc	r10,r6
+	vst1.64	{q0,q1}, [r3,:256]!			@ wipe
+	it	cc
+	movcc	r11,r7
+	teq	r1,r2				@ preserves carry
+	stmia	r0!, {r8,r9,r10,r11}
+	bne	.LNEON_copy_n_zap
+
+	mov	sp,ip
+	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
+	bx	lr						@ bx lr
+.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
+#endif
+.byte	77,111,110,116,103,111,109,101,114,121,32,109,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__)
diff --git a/gen/bcm/armv8-mont-apple.S b/gen/bcm/armv8-mont-apple.S
new file mode 100644
index 0000000..cf798a3
--- /dev/null
+++ b/gen/bcm/armv8-mont-apple.S
@@ -0,0 +1,1425 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+#include <openssl/arm_arch.h>
+
+.text
+
+.globl	_bn_mul_mont
+.private_extern	_bn_mul_mont
+
+.align	5
+_bn_mul_mont:
+	AARCH64_SIGN_LINK_REGISTER
+	tst	x5,#7
+	b.eq	__bn_sqr8x_mont
+	tst	x5,#3
+	b.eq	__bn_mul4x_mont
+Lmul_mont:
+	stp	x29,x30,[sp,#-64]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+
+	ldr	x9,[x2],#8		// bp[0]
+	sub	x22,sp,x5,lsl#3
+	ldp	x7,x8,[x1],#16	// ap[0..1]
+	lsl	x5,x5,#3
+	ldr	x4,[x4]		// *n0
+	and	x22,x22,#-16		// ABI says so
+	ldp	x13,x14,[x3],#16	// np[0..1]
+
+	mul	x6,x7,x9		// ap[0]*bp[0]
+	sub	x21,x5,#16		// j=num-2
+	umulh	x7,x7,x9
+	mul	x10,x8,x9		// ap[1]*bp[0]
+	umulh	x11,x8,x9
+
+	mul	x15,x6,x4		// "tp[0]"*n0
+	mov	sp,x22			// alloca
+
+	// (*)	mul	x12,x13,x15	// np[0]*m1
+	umulh	x13,x13,x15
+	mul	x16,x14,x15		// np[1]*m1
+	// (*)	adds	x12,x12,x6	// discarded
+	// (*)	On the removal of the first multiplication and addition
+	//	instructions: the outcome of the first addition is
+	//	guaranteed to be zero, which leaves two computationally
+	//	significant outcomes: it either carries or it does not.
+	//	So when does it carry? Is there an alternative way to
+	//	deduce it? If you follow the operations, you can observe
+	//	that the condition for carry is quite simple: x6 being
+	//	non-zero. So the carry can be calculated by adding -1 to
+	//	x6. That's what the next instruction does.
+	subs	xzr,x6,#1		// (*)
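+	//	(Concretely: subs xzr,x6,#1 sets the AArch64 carry flag,
+	//	which means "no borrow", exactly when x6 >= 1, i.e. when
+	//	x6 is non-zero -- the same carry the discarded addition
+	//	would have produced.)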
+	umulh	x17,x14,x15
+	adc	x13,x13,xzr
+	cbz	x21,L1st_skip
+
+L1st:
+	ldr	x8,[x1],#8
+	adds	x6,x10,x7
+	sub	x21,x21,#8		// j--
+	adc	x7,x11,xzr
+
+	ldr	x14,[x3],#8
+	adds	x12,x16,x13
+	mul	x10,x8,x9		// ap[j]*bp[0]
+	adc	x13,x17,xzr
+	umulh	x11,x8,x9
+
+	adds	x12,x12,x6
+	mul	x16,x14,x15		// np[j]*m1
+	adc	x13,x13,xzr
+	umulh	x17,x14,x15
+	str	x12,[x22],#8		// tp[j-1]
+	cbnz	x21,L1st
+
+L1st_skip:
+	adds	x6,x10,x7
+	sub	x1,x1,x5		// rewind x1
+	adc	x7,x11,xzr
+
+	adds	x12,x16,x13
+	sub	x3,x3,x5		// rewind x3
+	adc	x13,x17,xzr
+
+	adds	x12,x12,x6
+	sub	x20,x5,#8		// i=num-1
+	adcs	x13,x13,x7
+
+	adc	x19,xzr,xzr		// upmost overflow bit
+	stp	x12,x13,[x22]
+
+Louter:
+	ldr	x9,[x2],#8		// bp[i]
+	ldp	x7,x8,[x1],#16
+	ldr	x23,[sp]		// tp[0]
+	add	x22,sp,#8
+
+	mul	x6,x7,x9		// ap[0]*bp[i]
+	sub	x21,x5,#16		// j=num-2
+	umulh	x7,x7,x9
+	ldp	x13,x14,[x3],#16
+	mul	x10,x8,x9		// ap[1]*bp[i]
+	adds	x6,x6,x23
+	umulh	x11,x8,x9
+	adc	x7,x7,xzr
+
+	mul	x15,x6,x4
+	sub	x20,x20,#8		// i--
+
+	// (*)	mul	x12,x13,x15	// np[0]*m1
+	umulh	x13,x13,x15
+	mul	x16,x14,x15		// np[1]*m1
+	// (*)	adds	x12,x12,x6
+	subs	xzr,x6,#1		// (*)
+	umulh	x17,x14,x15
+	cbz	x21,Linner_skip
+
+Linner:
+	ldr	x8,[x1],#8
+	adc	x13,x13,xzr
+	ldr	x23,[x22],#8		// tp[j]
+	adds	x6,x10,x7
+	sub	x21,x21,#8		// j--
+	adc	x7,x11,xzr
+
+	adds	x12,x16,x13
+	ldr	x14,[x3],#8
+	adc	x13,x17,xzr
+
+	mul	x10,x8,x9		// ap[j]*bp[i]
+	adds	x6,x6,x23
+	umulh	x11,x8,x9
+	adc	x7,x7,xzr
+
+	mul	x16,x14,x15		// np[j]*m1
+	adds	x12,x12,x6
+	umulh	x17,x14,x15
+	str	x12,[x22,#-16]		// tp[j-1]
+	cbnz	x21,Linner
+
+Linner_skip:
+	ldr	x23,[x22],#8		// tp[j]
+	adc	x13,x13,xzr
+	adds	x6,x10,x7
+	sub	x1,x1,x5		// rewind x1
+	adc	x7,x11,xzr
+
+	adds	x12,x16,x13
+	sub	x3,x3,x5		// rewind x3
+	adcs	x13,x17,x19
+	adc	x19,xzr,xzr
+
+	adds	x6,x6,x23
+	adc	x7,x7,xzr
+
+	adds	x12,x12,x6
+	adcs	x13,x13,x7
+	adc	x19,x19,xzr		// upmost overflow bit
+	stp	x12,x13,[x22,#-16]
+
+	cbnz	x20,Louter
+
+	// Final step. We see if the result is larger than the modulus,
+	// and if it is, subtract the modulus. But comparison implies
+	// subtraction, so we subtract the modulus, see if it borrowed,
+	// and conditionally copy the original value.
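+	// In rough C terms, with hypothetical names:
+	//	borrow = sub_words(tmp, tp, np, num);
+	//	rp[j] = borrow ? tp[j] : tmp[j];	// selected via csel below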
+	ldr	x23,[sp]		// tp[0]
+	add	x22,sp,#8
+	ldr	x14,[x3],#8		// np[0]
+	subs	x21,x5,#8		// j=num-1 and clear borrow
+	mov	x1,x0
+Lsub:
+	sbcs	x8,x23,x14		// tp[j]-np[j]
+	ldr	x23,[x22],#8
+	sub	x21,x21,#8		// j--
+	ldr	x14,[x3],#8
+	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
+	cbnz	x21,Lsub
+
+	sbcs	x8,x23,x14
+	sbcs	x19,x19,xzr		// did it borrow?
+	str	x8,[x1],#8		// rp[num-1]
+
+	ldr	x23,[sp]		// tp[0]
+	add	x22,sp,#8
+	ldr	x8,[x0],#8		// rp[0]
+	sub	x5,x5,#8		// num--
+	nop
+Lcond_copy:
+	sub	x5,x5,#8		// num--
+	csel	x14,x23,x8,lo		// did it borrow?
+	ldr	x23,[x22],#8
+	ldr	x8,[x0],#8
+	str	xzr,[x22,#-16]		// wipe tp
+	str	x14,[x0,#-16]
+	cbnz	x5,Lcond_copy
+
+	csel	x14,x23,x8,lo
+	str	xzr,[x22,#-8]		// wipe tp
+	str	x14,[x0,#-8]
+
+	ldp	x19,x20,[x29,#16]
+	mov	sp,x29
+	ldp	x21,x22,[x29,#32]
+	mov	x0,#1
+	ldp	x23,x24,[x29,#48]
+	ldr	x29,[sp],#64
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+.align	5
+__bn_sqr8x_mont:
+	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
+	// only from bn_mul_mont which has already signed the return address.
+	cmp	x1,x2
+	b.ne	__bn_mul4x_mont
+Lsqr8x_mont:
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	stp	x0,x3,[sp,#96]	// offload rp and np
+
+	ldp	x6,x7,[x1,#8*0]
+	ldp	x8,x9,[x1,#8*2]
+	ldp	x10,x11,[x1,#8*4]
+	ldp	x12,x13,[x1,#8*6]
+
+	sub	x2,sp,x5,lsl#4
+	lsl	x5,x5,#3
+	ldr	x4,[x4]		// *n0
+	mov	sp,x2			// alloca
+	sub	x27,x5,#8*8
+	b	Lsqr8x_zero_start
+
+Lsqr8x_zero:
+	sub	x27,x27,#8*8
+	stp	xzr,xzr,[x2,#8*0]
+	stp	xzr,xzr,[x2,#8*2]
+	stp	xzr,xzr,[x2,#8*4]
+	stp	xzr,xzr,[x2,#8*6]
+Lsqr8x_zero_start:
+	stp	xzr,xzr,[x2,#8*8]
+	stp	xzr,xzr,[x2,#8*10]
+	stp	xzr,xzr,[x2,#8*12]
+	stp	xzr,xzr,[x2,#8*14]
+	add	x2,x2,#8*16
+	cbnz	x27,Lsqr8x_zero
+
+	add	x3,x1,x5
+	add	x1,x1,#8*8
+	mov	x19,xzr
+	mov	x20,xzr
+	mov	x21,xzr
+	mov	x22,xzr
+	mov	x23,xzr
+	mov	x24,xzr
+	mov	x25,xzr
+	mov	x26,xzr
+	mov	x2,sp
+	str	x4,[x29,#112]		// offload n0
+
+	// Multiply everything but a[i]*a[i]
+.align	4
+Lsqr8x_outer_loop:
+        //                                                 a[1]a[0]	(i)
+        //                                             a[2]a[0]
+        //                                         a[3]a[0]
+        //                                     a[4]a[0]
+        //                                 a[5]a[0]
+        //                             a[6]a[0]
+        //                         a[7]a[0]
+        //                                         a[2]a[1]		(ii)
+        //                                     a[3]a[1]
+        //                                 a[4]a[1]
+        //                             a[5]a[1]
+        //                         a[6]a[1]
+        //                     a[7]a[1]
+        //                                 a[3]a[2]			(iii)
+        //                             a[4]a[2]
+        //                         a[5]a[2]
+        //                     a[6]a[2]
+        //                 a[7]a[2]
+        //                         a[4]a[3]				(iv)
+        //                     a[5]a[3]
+        //                 a[6]a[3]
+        //             a[7]a[3]
+        //                 a[5]a[4]					(v)
+        //             a[6]a[4]
+        //         a[7]a[4]
+        //         a[6]a[5]						(vi)
+        //     a[7]a[5]
+        // a[7]a[6]							(vii)
+
+	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
+	mul	x15,x8,x6
+	mul	x16,x9,x6
+	mul	x17,x10,x6
+	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
+	mul	x14,x11,x6
+	adcs	x21,x21,x15
+	mul	x15,x12,x6
+	adcs	x22,x22,x16
+	mul	x16,x13,x6
+	adcs	x23,x23,x17
+	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
+	adcs	x24,x24,x14
+	umulh	x14,x8,x6
+	adcs	x25,x25,x15
+	umulh	x15,x9,x6
+	adcs	x26,x26,x16
+	umulh	x16,x10,x6
+	stp	x19,x20,[x2],#8*2	// t[0..1]
+	adc	x19,xzr,xzr		// t[8]
+	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
+	umulh	x17,x11,x6
+	adcs	x22,x22,x14
+	umulh	x14,x12,x6
+	adcs	x23,x23,x15
+	umulh	x15,x13,x6
+	adcs	x24,x24,x16
+	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
+	adcs	x25,x25,x17
+	mul	x17,x9,x7
+	adcs	x26,x26,x14
+	mul	x14,x10,x7
+	adc	x19,x19,x15
+
+	mul	x15,x11,x7
+	adds	x22,x22,x16
+	mul	x16,x12,x7
+	adcs	x23,x23,x17
+	mul	x17,x13,x7
+	adcs	x24,x24,x14
+	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
+	adcs	x25,x25,x15
+	umulh	x15,x9,x7
+	adcs	x26,x26,x16
+	umulh	x16,x10,x7
+	adcs	x19,x19,x17
+	umulh	x17,x11,x7
+	stp	x21,x22,[x2],#8*2	// t[2..3]
+	adc	x20,xzr,xzr		// t[9]
+	adds	x23,x23,x14
+	umulh	x14,x12,x7
+	adcs	x24,x24,x15
+	umulh	x15,x13,x7
+	adcs	x25,x25,x16
+	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
+	adcs	x26,x26,x17
+	mul	x17,x10,x8
+	adcs	x19,x19,x14
+	mul	x14,x11,x8
+	adc	x20,x20,x15
+
+	mul	x15,x12,x8
+	adds	x24,x24,x16
+	mul	x16,x13,x8
+	adcs	x25,x25,x17
+	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
+	adcs	x26,x26,x14
+	umulh	x14,x10,x8
+	adcs	x19,x19,x15
+	umulh	x15,x11,x8
+	adcs	x20,x20,x16
+	umulh	x16,x12,x8
+	stp	x23,x24,[x2],#8*2	// t[4..5]
+	adc	x21,xzr,xzr		// t[10]
+	adds	x25,x25,x17
+	umulh	x17,x13,x8
+	adcs	x26,x26,x14
+	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
+	adcs	x19,x19,x15
+	mul	x15,x11,x9
+	adcs	x20,x20,x16
+	mul	x16,x12,x9
+	adc	x21,x21,x17
+
+	mul	x17,x13,x9
+	adds	x26,x26,x14
+	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
+	adcs	x19,x19,x15
+	umulh	x15,x11,x9
+	adcs	x20,x20,x16
+	umulh	x16,x12,x9
+	adcs	x21,x21,x17
+	umulh	x17,x13,x9
+	stp	x25,x26,[x2],#8*2	// t[6..7]
+	adc	x22,xzr,xzr		// t[11]
+	adds	x19,x19,x14
+	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
+	adcs	x20,x20,x15
+	mul	x15,x12,x10
+	adcs	x21,x21,x16
+	mul	x16,x13,x10
+	adc	x22,x22,x17
+
+	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
+	adds	x20,x20,x14
+	umulh	x14,x12,x10
+	adcs	x21,x21,x15
+	umulh	x15,x13,x10
+	adcs	x22,x22,x16
+	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
+	adc	x23,xzr,xzr		// t[12]
+	adds	x21,x21,x17
+	mul	x17,x13,x11
+	adcs	x22,x22,x14
+	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
+	adc	x23,x23,x15
+
+	umulh	x15,x13,x11
+	adds	x22,x22,x16
+	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
+	adcs	x23,x23,x17
+	umulh	x17,x13,x12		// hi(a[7]*a[6])
+	adc	x24,xzr,xzr		// t[13]
+	adds	x23,x23,x14
+	sub	x27,x3,x1	// done yet?
+	adc	x24,x24,x15
+
+	adds	x24,x24,x16
+	sub	x14,x3,x5	// rewound ap
+	adc	x25,xzr,xzr		// t[14]
+	add	x25,x25,x17
+
+	cbz	x27,Lsqr8x_outer_break
+
+	mov	x4,x6
+	ldp	x6,x7,[x2,#8*0]
+	ldp	x8,x9,[x2,#8*2]
+	ldp	x10,x11,[x2,#8*4]
+	ldp	x12,x13,[x2,#8*6]
+	adds	x19,x19,x6
+	adcs	x20,x20,x7
+	ldp	x6,x7,[x1,#8*0]
+	adcs	x21,x21,x8
+	adcs	x22,x22,x9
+	ldp	x8,x9,[x1,#8*2]
+	adcs	x23,x23,x10
+	adcs	x24,x24,x11
+	ldp	x10,x11,[x1,#8*4]
+	adcs	x25,x25,x12
+	mov	x0,x1
+	adcs	x26,xzr,x13
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+	//adc	x28,xzr,xzr		// moved below
+	mov	x27,#-8*8
+
+	//                                                         a[8]a[0]
+	//                                                     a[9]a[0]
+	//                                                 a[a]a[0]
+	//                                             a[b]a[0]
+	//                                         a[c]a[0]
+	//                                     a[d]a[0]
+	//                                 a[e]a[0]
+	//                             a[f]a[0]
+	//                                                     a[8]a[1]
+	//                         a[f]a[1]........................
+	//                                                 a[8]a[2]
+	//                     a[f]a[2]........................
+	//                                             a[8]a[3]
+	//                 a[f]a[3]........................
+	//                                         a[8]a[4]
+	//             a[f]a[4]........................
+	//                                     a[8]a[5]
+	//         a[f]a[5]........................
+	//                                 a[8]a[6]
+	//     a[f]a[6]........................
+	//                             a[8]a[7]
+	// a[f]a[7]........................
+Lsqr8x_mul:
+	mul	x14,x6,x4
+	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
+	mul	x15,x7,x4
+	add	x27,x27,#8
+	mul	x16,x8,x4
+	mul	x17,x9,x4
+	adds	x19,x19,x14
+	mul	x14,x10,x4
+	adcs	x20,x20,x15
+	mul	x15,x11,x4
+	adcs	x21,x21,x16
+	mul	x16,x12,x4
+	adcs	x22,x22,x17
+	mul	x17,x13,x4
+	adcs	x23,x23,x14
+	umulh	x14,x6,x4
+	adcs	x24,x24,x15
+	umulh	x15,x7,x4
+	adcs	x25,x25,x16
+	umulh	x16,x8,x4
+	adcs	x26,x26,x17
+	umulh	x17,x9,x4
+	adc	x28,x28,xzr
+	str	x19,[x2],#8
+	adds	x19,x20,x14
+	umulh	x14,x10,x4
+	adcs	x20,x21,x15
+	umulh	x15,x11,x4
+	adcs	x21,x22,x16
+	umulh	x16,x12,x4
+	adcs	x22,x23,x17
+	umulh	x17,x13,x4
+	ldr	x4,[x0,x27]
+	adcs	x23,x24,x14
+	adcs	x24,x25,x15
+	adcs	x25,x26,x16
+	adcs	x26,x28,x17
+	//adc	x28,xzr,xzr		// moved above
+	cbnz	x27,Lsqr8x_mul
+					// note that carry flag is guaranteed
+					// to be zero at this point
+	cmp	x1,x3		// done yet?
+	b.eq	Lsqr8x_break
+
+	ldp	x6,x7,[x2,#8*0]
+	ldp	x8,x9,[x2,#8*2]
+	ldp	x10,x11,[x2,#8*4]
+	ldp	x12,x13,[x2,#8*6]
+	adds	x19,x19,x6
+	ldr	x4,[x0,#-8*8]
+	adcs	x20,x20,x7
+	ldp	x6,x7,[x1,#8*0]
+	adcs	x21,x21,x8
+	adcs	x22,x22,x9
+	ldp	x8,x9,[x1,#8*2]
+	adcs	x23,x23,x10
+	adcs	x24,x24,x11
+	ldp	x10,x11,[x1,#8*4]
+	adcs	x25,x25,x12
+	mov	x27,#-8*8
+	adcs	x26,x26,x13
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+	//adc	x28,xzr,xzr		// moved above
+	b	Lsqr8x_mul
+
+.align	4
+Lsqr8x_break:
+	ldp	x6,x7,[x0,#8*0]
+	add	x1,x0,#8*8
+	ldp	x8,x9,[x0,#8*2]
+	sub	x14,x3,x1		// is it last iteration?
+	ldp	x10,x11,[x0,#8*4]
+	sub	x15,x2,x14
+	ldp	x12,x13,[x0,#8*6]
+	cbz	x14,Lsqr8x_outer_loop
+
+	stp	x19,x20,[x2,#8*0]
+	ldp	x19,x20,[x15,#8*0]
+	stp	x21,x22,[x2,#8*2]
+	ldp	x21,x22,[x15,#8*2]
+	stp	x23,x24,[x2,#8*4]
+	ldp	x23,x24,[x15,#8*4]
+	stp	x25,x26,[x2,#8*6]
+	mov	x2,x15
+	ldp	x25,x26,[x15,#8*6]
+	b	Lsqr8x_outer_loop
+
+.align	4
+Lsqr8x_outer_break:
+	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
+	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
+	ldp	x15,x16,[sp,#8*1]
+	ldp	x11,x13,[x14,#8*2]
+	add	x1,x14,#8*4
+	ldp	x17,x14,[sp,#8*3]
+
+	stp	x19,x20,[x2,#8*0]
+	mul	x19,x7,x7
+	stp	x21,x22,[x2,#8*2]
+	umulh	x7,x7,x7
+	stp	x23,x24,[x2,#8*4]
+	mul	x8,x9,x9
+	stp	x25,x26,[x2,#8*6]
+	mov	x2,sp
+	umulh	x9,x9,x9
+	adds	x20,x7,x15,lsl#1
+	extr	x15,x16,x15,#63
+	sub	x27,x5,#8*4
+
+Lsqr4x_shift_n_add:
+	adcs	x21,x8,x15
+	extr	x16,x17,x16,#63
+	sub	x27,x27,#8*4
+	adcs	x22,x9,x16
+	ldp	x15,x16,[x2,#8*5]
+	mul	x10,x11,x11
+	ldp	x7,x9,[x1],#8*2
+	umulh	x11,x11,x11
+	mul	x12,x13,x13
+	umulh	x13,x13,x13
+	extr	x17,x14,x17,#63
+	stp	x19,x20,[x2,#8*0]
+	adcs	x23,x10,x17
+	extr	x14,x15,x14,#63
+	stp	x21,x22,[x2,#8*2]
+	adcs	x24,x11,x14
+	ldp	x17,x14,[x2,#8*7]
+	extr	x15,x16,x15,#63
+	adcs	x25,x12,x15
+	extr	x16,x17,x16,#63
+	adcs	x26,x13,x16
+	ldp	x15,x16,[x2,#8*9]
+	mul	x6,x7,x7
+	ldp	x11,x13,[x1],#8*2
+	umulh	x7,x7,x7
+	mul	x8,x9,x9
+	umulh	x9,x9,x9
+	stp	x23,x24,[x2,#8*4]
+	extr	x17,x14,x17,#63
+	stp	x25,x26,[x2,#8*6]
+	add	x2,x2,#8*8
+	adcs	x19,x6,x17
+	extr	x14,x15,x14,#63
+	adcs	x20,x7,x14
+	ldp	x17,x14,[x2,#8*3]
+	extr	x15,x16,x15,#63
+	cbnz	x27,Lsqr4x_shift_n_add
+	ldp	x1,x4,[x29,#104]	// pull np and n0
+
+	adcs	x21,x8,x15
+	extr	x16,x17,x16,#63
+	adcs	x22,x9,x16
+	ldp	x15,x16,[x2,#8*5]
+	mul	x10,x11,x11
+	umulh	x11,x11,x11
+	stp	x19,x20,[x2,#8*0]
+	mul	x12,x13,x13
+	umulh	x13,x13,x13
+	stp	x21,x22,[x2,#8*2]
+	extr	x17,x14,x17,#63
+	adcs	x23,x10,x17
+	extr	x14,x15,x14,#63
+	ldp	x19,x20,[sp,#8*0]
+	adcs	x24,x11,x14
+	extr	x15,x16,x15,#63
+	ldp	x6,x7,[x1,#8*0]
+	adcs	x25,x12,x15
+	extr	x16,xzr,x16,#63
+	ldp	x8,x9,[x1,#8*2]
+	adc	x26,x13,x16
+	ldp	x10,x11,[x1,#8*4]
+
+	// Reduce by 512 bits per iteration
+	mul	x28,x4,x19		// t[0]*n0
+	ldp	x12,x13,[x1,#8*6]
+	add	x3,x1,x5
+	ldp	x21,x22,[sp,#8*2]
+	stp	x23,x24,[x2,#8*4]
+	ldp	x23,x24,[sp,#8*4]
+	stp	x25,x26,[x2,#8*6]
+	ldp	x25,x26,[sp,#8*6]
+	add	x1,x1,#8*8
+	mov	x30,xzr		// initial top-most carry
+	mov	x2,sp
+	mov	x27,#8
+
+Lsqr8x_reduction:
+	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
+	mul	x15,x7,x28
+	sub	x27,x27,#1
+	mul	x16,x8,x28
+	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
+	mul	x17,x9,x28
+	// (*)	adds	xzr,x19,x14
+	subs	xzr,x19,#1		// (*)
+	mul	x14,x10,x28
+	adcs	x19,x20,x15
+	mul	x15,x11,x28
+	adcs	x20,x21,x16
+	mul	x16,x12,x28
+	adcs	x21,x22,x17
+	mul	x17,x13,x28
+	adcs	x22,x23,x14
+	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
+	adcs	x23,x24,x15
+	umulh	x15,x7,x28
+	adcs	x24,x25,x16
+	umulh	x16,x8,x28
+	adcs	x25,x26,x17
+	umulh	x17,x9,x28
+	adc	x26,xzr,xzr
+	adds	x19,x19,x14
+	umulh	x14,x10,x28
+	adcs	x20,x20,x15
+	umulh	x15,x11,x28
+	adcs	x21,x21,x16
+	umulh	x16,x12,x28
+	adcs	x22,x22,x17
+	umulh	x17,x13,x28
+	mul	x28,x4,x19		// next t[0]*n0
+	adcs	x23,x23,x14
+	adcs	x24,x24,x15
+	adcs	x25,x25,x16
+	adc	x26,x26,x17
+	cbnz	x27,Lsqr8x_reduction
+
+	ldp	x14,x15,[x2,#8*0]
+	ldp	x16,x17,[x2,#8*2]
+	mov	x0,x2
+	sub	x27,x3,x1	// done yet?
+	adds	x19,x19,x14
+	adcs	x20,x20,x15
+	ldp	x14,x15,[x2,#8*4]
+	adcs	x21,x21,x16
+	adcs	x22,x22,x17
+	ldp	x16,x17,[x2,#8*6]
+	adcs	x23,x23,x14
+	adcs	x24,x24,x15
+	adcs	x25,x25,x16
+	adcs	x26,x26,x17
+	//adc	x28,xzr,xzr		// moved below
+	cbz	x27,Lsqr8x8_post_condition
+
+	ldr	x4,[x2,#-8*8]
+	ldp	x6,x7,[x1,#8*0]
+	ldp	x8,x9,[x1,#8*2]
+	ldp	x10,x11,[x1,#8*4]
+	mov	x27,#-8*8
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+
+Lsqr8x_tail:
+	mul	x14,x6,x4
+	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
+	mul	x15,x7,x4
+	add	x27,x27,#8
+	mul	x16,x8,x4
+	mul	x17,x9,x4
+	adds	x19,x19,x14
+	mul	x14,x10,x4
+	adcs	x20,x20,x15
+	mul	x15,x11,x4
+	adcs	x21,x21,x16
+	mul	x16,x12,x4
+	adcs	x22,x22,x17
+	mul	x17,x13,x4
+	adcs	x23,x23,x14
+	umulh	x14,x6,x4
+	adcs	x24,x24,x15
+	umulh	x15,x7,x4
+	adcs	x25,x25,x16
+	umulh	x16,x8,x4
+	adcs	x26,x26,x17
+	umulh	x17,x9,x4
+	adc	x28,x28,xzr
+	str	x19,[x2],#8
+	adds	x19,x20,x14
+	umulh	x14,x10,x4
+	adcs	x20,x21,x15
+	umulh	x15,x11,x4
+	adcs	x21,x22,x16
+	umulh	x16,x12,x4
+	adcs	x22,x23,x17
+	umulh	x17,x13,x4
+	ldr	x4,[x0,x27]
+	adcs	x23,x24,x14
+	adcs	x24,x25,x15
+	adcs	x25,x26,x16
+	adcs	x26,x28,x17
+	//adc	x28,xzr,xzr		// moved above
+	cbnz	x27,Lsqr8x_tail
+					// note that carry flag is guaranteed
+					// to be zero at this point
+	ldp	x6,x7,[x2,#8*0]
+	sub	x27,x3,x1	// done yet?
+	sub	x16,x3,x5	// rewound np
+	ldp	x8,x9,[x2,#8*2]
+	ldp	x10,x11,[x2,#8*4]
+	ldp	x12,x13,[x2,#8*6]
+	cbz	x27,Lsqr8x_tail_break
+
+	ldr	x4,[x0,#-8*8]
+	adds	x19,x19,x6
+	adcs	x20,x20,x7
+	ldp	x6,x7,[x1,#8*0]
+	adcs	x21,x21,x8
+	adcs	x22,x22,x9
+	ldp	x8,x9,[x1,#8*2]
+	adcs	x23,x23,x10
+	adcs	x24,x24,x11
+	ldp	x10,x11,[x1,#8*4]
+	adcs	x25,x25,x12
+	mov	x27,#-8*8
+	adcs	x26,x26,x13
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+	//adc	x28,xzr,xzr		// moved above
+	b	Lsqr8x_tail
+
+.align	4
+Lsqr8x_tail_break:
+	ldr	x4,[x29,#112]		// pull n0
+	add	x27,x2,#8*8		// end of current t[num] window
+
+	subs	xzr,x30,#1		// "move" top-most carry to carry bit
+	adcs	x14,x19,x6
+	adcs	x15,x20,x7
+	ldp	x19,x20,[x0,#8*0]
+	adcs	x21,x21,x8
+	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
+	adcs	x22,x22,x9
+	ldp	x8,x9,[x16,#8*2]
+	adcs	x23,x23,x10
+	adcs	x24,x24,x11
+	ldp	x10,x11,[x16,#8*4]
+	adcs	x25,x25,x12
+	adcs	x26,x26,x13
+	ldp	x12,x13,[x16,#8*6]
+	add	x1,x16,#8*8
+	adc	x30,xzr,xzr	// top-most carry
+	mul	x28,x4,x19
+	stp	x14,x15,[x2,#8*0]
+	stp	x21,x22,[x2,#8*2]
+	ldp	x21,x22,[x0,#8*2]
+	stp	x23,x24,[x2,#8*4]
+	ldp	x23,x24,[x0,#8*4]
+	cmp	x27,x29		// did we hit the bottom?
+	stp	x25,x26,[x2,#8*6]
+	mov	x2,x0			// slide the window
+	ldp	x25,x26,[x0,#8*6]
+	mov	x27,#8
+	b.ne	Lsqr8x_reduction
+
+	// Final step. We see if the result is larger than the modulus,
+	// and if it is, subtract the modulus. But comparison implies
+	// subtraction, so we subtract the modulus, see if it borrowed,
+	// and conditionally copy the original value.
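+	// (Same subtract-then-conditionally-copy pattern as in bn_mul_mont
+	// above, done eight words at a time.)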
+	ldr	x0,[x29,#96]		// pull rp
+	add	x2,x2,#8*8
+	subs	x14,x19,x6
+	sbcs	x15,x20,x7
+	sub	x27,x5,#8*8
+	mov	x3,x0		// x0 copy
+
+Lsqr8x_sub:
+	sbcs	x16,x21,x8
+	ldp	x6,x7,[x1,#8*0]
+	sbcs	x17,x22,x9
+	stp	x14,x15,[x0,#8*0]
+	sbcs	x14,x23,x10
+	ldp	x8,x9,[x1,#8*2]
+	sbcs	x15,x24,x11
+	stp	x16,x17,[x0,#8*2]
+	sbcs	x16,x25,x12
+	ldp	x10,x11,[x1,#8*4]
+	sbcs	x17,x26,x13
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+	ldp	x19,x20,[x2,#8*0]
+	sub	x27,x27,#8*8
+	ldp	x21,x22,[x2,#8*2]
+	ldp	x23,x24,[x2,#8*4]
+	ldp	x25,x26,[x2,#8*6]
+	add	x2,x2,#8*8
+	stp	x14,x15,[x0,#8*4]
+	sbcs	x14,x19,x6
+	stp	x16,x17,[x0,#8*6]
+	add	x0,x0,#8*8
+	sbcs	x15,x20,x7
+	cbnz	x27,Lsqr8x_sub
+
+	sbcs	x16,x21,x8
+	mov	x2,sp
+	add	x1,sp,x5
+	ldp	x6,x7,[x3,#8*0]
+	sbcs	x17,x22,x9
+	stp	x14,x15,[x0,#8*0]
+	sbcs	x14,x23,x10
+	ldp	x8,x9,[x3,#8*2]
+	sbcs	x15,x24,x11
+	stp	x16,x17,[x0,#8*2]
+	sbcs	x16,x25,x12
+	ldp	x19,x20,[x1,#8*0]
+	sbcs	x17,x26,x13
+	ldp	x21,x22,[x1,#8*2]
+	sbcs	xzr,x30,xzr	// did it borrow?
+	ldr	x30,[x29,#8]		// pull return address
+	stp	x14,x15,[x0,#8*4]
+	stp	x16,x17,[x0,#8*6]
+
+	sub	x27,x5,#8*4
+Lsqr4x_cond_copy:
+	sub	x27,x27,#8*4
+	csel	x14,x19,x6,lo
+	stp	xzr,xzr,[x2,#8*0]
+	csel	x15,x20,x7,lo
+	ldp	x6,x7,[x3,#8*4]
+	ldp	x19,x20,[x1,#8*4]
+	csel	x16,x21,x8,lo
+	stp	xzr,xzr,[x2,#8*2]
+	add	x2,x2,#8*4
+	csel	x17,x22,x9,lo
+	ldp	x8,x9,[x3,#8*6]
+	ldp	x21,x22,[x1,#8*6]
+	add	x1,x1,#8*4
+	stp	x14,x15,[x3,#8*0]
+	stp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+	stp	xzr,xzr,[x1,#8*0]
+	stp	xzr,xzr,[x1,#8*2]
+	cbnz	x27,Lsqr4x_cond_copy
+
+	csel	x14,x19,x6,lo
+	stp	xzr,xzr,[x2,#8*0]
+	csel	x15,x20,x7,lo
+	stp	xzr,xzr,[x2,#8*2]
+	csel	x16,x21,x8,lo
+	csel	x17,x22,x9,lo
+	stp	x14,x15,[x3,#8*0]
+	stp	x16,x17,[x3,#8*2]
+
+	b	Lsqr8x_done
+
+.align	4
+Lsqr8x8_post_condition:
+	adc	x28,xzr,xzr
+	ldr	x30,[x29,#8]		// pull return address
+	// x19-x26,x28 hold result, x6-x13 hold modulus
+	subs	x6,x19,x6
+	ldr	x1,[x29,#96]		// pull rp
+	sbcs	x7,x20,x7
+	stp	xzr,xzr,[sp,#8*0]
+	sbcs	x8,x21,x8
+	stp	xzr,xzr,[sp,#8*2]
+	sbcs	x9,x22,x9
+	stp	xzr,xzr,[sp,#8*4]
+	sbcs	x10,x23,x10
+	stp	xzr,xzr,[sp,#8*6]
+	sbcs	x11,x24,x11
+	stp	xzr,xzr,[sp,#8*8]
+	sbcs	x12,x25,x12
+	stp	xzr,xzr,[sp,#8*10]
+	sbcs	x13,x26,x13
+	stp	xzr,xzr,[sp,#8*12]
+	sbcs	x28,x28,xzr	// did it borrow?
+	stp	xzr,xzr,[sp,#8*14]
+
+	// x6-x13 hold result-modulus
+	csel	x6,x19,x6,lo
+	csel	x7,x20,x7,lo
+	csel	x8,x21,x8,lo
+	csel	x9,x22,x9,lo
+	stp	x6,x7,[x1,#8*0]
+	csel	x10,x23,x10,lo
+	csel	x11,x24,x11,lo
+	stp	x8,x9,[x1,#8*2]
+	csel	x12,x25,x12,lo
+	csel	x13,x26,x13,lo
+	stp	x10,x11,[x1,#8*4]
+	stp	x12,x13,[x1,#8*6]
+
+Lsqr8x_done:
+	ldp	x19,x20,[x29,#16]
+	mov	sp,x29
+	ldp	x21,x22,[x29,#32]
+	mov	x0,#1
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	// x30 is popped earlier
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+.align	5
+__bn_mul4x_mont:
+	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
+	// only from bn_mul_mont or __bn_sqr8x_mont which have already signed the
+	// return address.
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+	sub	x26,sp,x5,lsl#3
+	lsl	x5,x5,#3
+	ldr	x4,[x4]		// *n0
+	sub	sp,x26,#8*4		// alloca
+
+	add	x10,x2,x5
+	add	x27,x1,x5
+	stp	x0,x10,[x29,#96]	// offload rp and &b[num]
+
+	ldr	x24,[x2,#8*0]		// b[0]
+	ldp	x6,x7,[x1,#8*0]	// a[0..3]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	mov	x19,xzr
+	mov	x20,xzr
+	mov	x21,xzr
+	mov	x22,xzr
+	ldp	x14,x15,[x3,#8*0]	// n[0..3]
+	ldp	x16,x17,[x3,#8*2]
+	adds	x3,x3,#8*4		// clear carry bit
+	mov	x0,xzr
+	mov	x28,#0
+	mov	x26,sp
+
+Loop_mul4x_1st_reduction:
+	mul	x10,x6,x24		// lo(a[0..3]*b[0])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
+	adcs	x20,x20,x11
+	mul	x25,x19,x4		// t[0]*n0
+	adcs	x21,x21,x12
+	umulh	x11,x7,x24
+	adcs	x22,x22,x13
+	umulh	x12,x8,x24
+	adc	x23,xzr,xzr
+	umulh	x13,x9,x24
+	ldr	x24,[x2,x28]		// next b[i] (or b[0])
+	adds	x20,x20,x10
+	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
+	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
+	adcs	x21,x21,x11
+	mul	x11,x15,x25
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13		// can't overflow
+	mul	x13,x17,x25
+	// (*)	adds	xzr,x19,x10
+	subs	xzr,x19,#1		// (*)
+	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
+	adcs	x19,x20,x11
+	umulh	x11,x15,x25
+	adcs	x20,x21,x12
+	umulh	x12,x16,x25
+	adcs	x21,x22,x13
+	umulh	x13,x17,x25
+	adcs	x22,x23,x0
+	adc	x0,xzr,xzr
+	adds	x19,x19,x10
+	sub	x10,x27,x1
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,Loop_mul4x_1st_reduction
+
+	cbz	x10,Lmul4x4_post_condition
+
+	ldp	x6,x7,[x1,#8*0]	// a[4..7]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	ldr	x25,[sp]		// a[0]*n0
+	ldp	x14,x15,[x3,#8*0]	// n[4..7]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+
+Loop_mul4x_1st_tail:
+	mul	x10,x6,x24		// lo(a[4..7]*b[i])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
+	adcs	x20,x20,x11
+	umulh	x11,x7,x24
+	adcs	x21,x21,x12
+	umulh	x12,x8,x24
+	adcs	x22,x22,x13
+	umulh	x13,x9,x24
+	adc	x23,xzr,xzr
+	ldr	x24,[x2,x28]		// next b[i] (or b[0])
+	adds	x20,x20,x10
+	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
+	adcs	x21,x21,x11
+	mul	x11,x15,x25
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13		// can't overflow
+	mul	x13,x17,x25
+	adds	x19,x19,x10
+	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
+	adcs	x20,x20,x11
+	umulh	x11,x15,x25
+	adcs	x21,x21,x12
+	umulh	x12,x16,x25
+	adcs	x22,x22,x13
+	adcs	x23,x23,x0
+	umulh	x13,x17,x25
+	adc	x0,xzr,xzr
+	ldr	x25,[sp,x28]		// next t[0]*n0
+	str	x19,[x26],#8		// result!!!
+	adds	x19,x20,x10
+	sub	x10,x27,x1		// done yet?
+	adcs	x20,x21,x11
+	adcs	x21,x22,x12
+	adcs	x22,x23,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,Loop_mul4x_1st_tail
+
+	sub	x11,x27,x5	// rewound x1
+	cbz	x10,Lmul4x_proceed
+
+	ldp	x6,x7,[x1,#8*0]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	ldp	x14,x15,[x3,#8*0]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+	b	Loop_mul4x_1st_tail
+
+.align	5
+Lmul4x_proceed:
+	ldr	x24,[x2,#8*4]!		// *++b
+	adc	x30,x0,xzr
+	ldp	x6,x7,[x11,#8*0]	// a[0..3]
+	sub	x3,x3,x5		// rewind np
+	ldp	x8,x9,[x11,#8*2]
+	add	x1,x11,#8*4
+
+	stp	x19,x20,[x26,#8*0]	// result!!!
+	ldp	x19,x20,[sp,#8*4]	// t[0..3]
+	stp	x21,x22,[x26,#8*2]	// result!!!
+	ldp	x21,x22,[sp,#8*6]
+
+	ldp	x14,x15,[x3,#8*0]	// n[0..3]
+	mov	x26,sp
+	ldp	x16,x17,[x3,#8*2]
+	adds	x3,x3,#8*4		// clear carry bit
+	mov	x0,xzr
+
+.align	4
+Loop_mul4x_reduction:
+	mul	x10,x6,x24		// lo(a[0..3]*b[4])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
+	adcs	x20,x20,x11
+	mul	x25,x19,x4		// t[0]*n0
+	adcs	x21,x21,x12
+	umulh	x11,x7,x24
+	adcs	x22,x22,x13
+	umulh	x12,x8,x24
+	adc	x23,xzr,xzr
+	umulh	x13,x9,x24
+	ldr	x24,[x2,x28]		// next b[i]
+	adds	x20,x20,x10
+	// (*)	mul	x10,x14,x25
+	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
+	adcs	x21,x21,x11
+	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13		// can't overflow
+	mul	x13,x17,x25
+	// (*)	adds	xzr,x19,x10
+	subs	xzr,x19,#1		// (*)
+	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
+	adcs	x19,x20,x11
+	umulh	x11,x15,x25
+	adcs	x20,x21,x12
+	umulh	x12,x16,x25
+	adcs	x21,x22,x13
+	umulh	x13,x17,x25
+	adcs	x22,x23,x0
+	adc	x0,xzr,xzr
+	adds	x19,x19,x10
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,Loop_mul4x_reduction
+
+	adc	x0,x0,xzr
+	ldp	x10,x11,[x26,#8*4]	// t[4..7]
+	ldp	x12,x13,[x26,#8*6]
+	ldp	x6,x7,[x1,#8*0]	// a[4..7]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	adds	x19,x19,x10
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+
+	ldr	x25,[sp]		// t[0]*n0
+	ldp	x14,x15,[x3,#8*0]	// n[4..7]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+
+.align	4
+Loop_mul4x_tail:
+	mul	x10,x6,x24		// lo(a[4..7]*b[4])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
+	adcs	x20,x20,x11
+	umulh	x11,x7,x24
+	adcs	x21,x21,x12
+	umulh	x12,x8,x24
+	adcs	x22,x22,x13
+	umulh	x13,x9,x24
+	adc	x23,xzr,xzr
+	ldr	x24,[x2,x28]		// next b[i]
+	adds	x20,x20,x10
+	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
+	adcs	x21,x21,x11
+	mul	x11,x15,x25
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13		// can't overflow
+	mul	x13,x17,x25
+	adds	x19,x19,x10
+	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
+	adcs	x20,x20,x11
+	umulh	x11,x15,x25
+	adcs	x21,x21,x12
+	umulh	x12,x16,x25
+	adcs	x22,x22,x13
+	umulh	x13,x17,x25
+	adcs	x23,x23,x0
+	ldr	x25,[sp,x28]		// next a[0]*n0
+	adc	x0,xzr,xzr
+	str	x19,[x26],#8		// result!!!
+	adds	x19,x20,x10
+	sub	x10,x27,x1		// done yet?
+	adcs	x20,x21,x11
+	adcs	x21,x22,x12
+	adcs	x22,x23,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,Loop_mul4x_tail
+
+	sub	x11,x3,x5		// rewound np?
+	adc	x0,x0,xzr
+	cbz	x10,Loop_mul4x_break
+
+	ldp	x10,x11,[x26,#8*4]
+	ldp	x12,x13,[x26,#8*6]
+	ldp	x6,x7,[x1,#8*0]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	adds	x19,x19,x10
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+	ldp	x14,x15,[x3,#8*0]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+	b	Loop_mul4x_tail
+
+.align	4
+Loop_mul4x_break:
+	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
+	adds	x19,x19,x30
+	add	x2,x2,#8*4		// bp++
+	adcs	x20,x20,xzr
+	sub	x1,x1,x5		// rewind ap
+	adcs	x21,x21,xzr
+	stp	x19,x20,[x26,#8*0]	// result!!!
+	adcs	x22,x22,xzr
+	ldp	x19,x20,[sp,#8*4]	// t[0..3]
+	adc	x30,x0,xzr
+	stp	x21,x22,[x26,#8*2]	// result!!!
+	cmp	x2,x13			// done yet?
+	ldp	x21,x22,[sp,#8*6]
+	ldp	x14,x15,[x11,#8*0]	// n[0..3]
+	ldp	x16,x17,[x11,#8*2]
+	add	x3,x11,#8*4
+	b.eq	Lmul4x_post
+
+	ldr	x24,[x2]
+	ldp	x6,x7,[x1,#8*0]	// a[0..3]
+	ldp	x8,x9,[x1,#8*2]
+	adds	x1,x1,#8*4		// clear carry bit
+	mov	x0,xzr
+	mov	x26,sp
+	b	Loop_mul4x_reduction
+
+.align	4
+Lmul4x_post:
+	// Final step. We see if the result is larger than the modulus,
+	// and if it is, subtract the modulus. But comparison implies
+	// subtraction, so we subtract the modulus, see if it borrowed,
+	// and conditionally copy the original value.
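+	// (Same subtract-then-conditionally-copy pattern as above, four
+	// words at a time.)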
+	mov	x0,x12
+	mov	x27,x12		// x0 copy
+	subs	x10,x19,x14
+	add	x26,sp,#8*8
+	sbcs	x11,x20,x15
+	sub	x28,x5,#8*4
+
+Lmul4x_sub:
+	sbcs	x12,x21,x16
+	ldp	x14,x15,[x3,#8*0]
+	sub	x28,x28,#8*4
+	ldp	x19,x20,[x26,#8*0]
+	sbcs	x13,x22,x17
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+	ldp	x21,x22,[x26,#8*2]
+	add	x26,x26,#8*4
+	stp	x10,x11,[x0,#8*0]
+	sbcs	x10,x19,x14
+	stp	x12,x13,[x0,#8*2]
+	add	x0,x0,#8*4
+	sbcs	x11,x20,x15
+	cbnz	x28,Lmul4x_sub
+
+	sbcs	x12,x21,x16
+	mov	x26,sp
+	add	x1,sp,#8*4
+	ldp	x6,x7,[x27,#8*0]
+	sbcs	x13,x22,x17
+	stp	x10,x11,[x0,#8*0]
+	ldp	x8,x9,[x27,#8*2]
+	stp	x12,x13,[x0,#8*2]
+	ldp	x19,x20,[x1,#8*0]
+	ldp	x21,x22,[x1,#8*2]
+	sbcs	xzr,x30,xzr	// did it borrow?
+	ldr	x30,[x29,#8]		// pull return address
+
+	sub	x28,x5,#8*4
+Lmul4x_cond_copy:
+	sub	x28,x28,#8*4
+	csel	x10,x19,x6,lo
+	stp	xzr,xzr,[x26,#8*0]
+	csel	x11,x20,x7,lo
+	ldp	x6,x7,[x27,#8*4]
+	ldp	x19,x20,[x1,#8*4]
+	csel	x12,x21,x8,lo
+	stp	xzr,xzr,[x26,#8*2]
+	add	x26,x26,#8*4
+	csel	x13,x22,x9,lo
+	ldp	x8,x9,[x27,#8*6]
+	ldp	x21,x22,[x1,#8*6]
+	add	x1,x1,#8*4
+	stp	x10,x11,[x27,#8*0]
+	stp	x12,x13,[x27,#8*2]
+	add	x27,x27,#8*4
+	cbnz	x28,Lmul4x_cond_copy
+
+	csel	x10,x19,x6,lo
+	stp	xzr,xzr,[x26,#8*0]
+	csel	x11,x20,x7,lo
+	stp	xzr,xzr,[x26,#8*2]
+	csel	x12,x21,x8,lo
+	stp	xzr,xzr,[x26,#8*3]
+	csel	x13,x22,x9,lo
+	stp	xzr,xzr,[x26,#8*4]
+	stp	x10,x11,[x27,#8*0]
+	stp	x12,x13,[x27,#8*2]
+
+	b	Lmul4x_done
+
+.align	4
+Lmul4x4_post_condition:
+	adc	x0,x0,xzr
+	ldr	x1,[x29,#96]		// pull rp
+	// x19-x22,x0 hold result, x14-x17 hold modulus
+	subs	x6,x19,x14
+	ldr	x30,[x29,#8]		// pull return address
+	sbcs	x7,x20,x15
+	stp	xzr,xzr,[sp,#8*0]
+	sbcs	x8,x21,x16
+	stp	xzr,xzr,[sp,#8*2]
+	sbcs	x9,x22,x17
+	stp	xzr,xzr,[sp,#8*4]
+	sbcs	xzr,x0,xzr		// did it borrow?
+	stp	xzr,xzr,[sp,#8*6]
+
+	// x6-x9 hold result-modulus
+	csel	x6,x19,x6,lo
+	csel	x7,x20,x7,lo
+	csel	x8,x21,x8,lo
+	csel	x9,x22,x9,lo
+	stp	x6,x7,[x1,#8*0]
+	stp	x8,x9,[x1,#8*2]
+
+Lmul4x_done:
+	ldp	x19,x20,[x29,#16]
+	mov	sp,x29
+	ldp	x21,x22,[x29,#32]
+	mov	x0,#1
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	// x30 is popped earlier
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	4
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/gen/bcm/armv8-mont-linux.S b/gen/bcm/armv8-mont-linux.S
new file mode 100644
index 0000000..13f045c
--- /dev/null
+++ b/gen/bcm/armv8-mont-linux.S
@@ -0,0 +1,1425 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+.text
+
+.globl	bn_mul_mont
+.hidden	bn_mul_mont
+.type	bn_mul_mont,%function
+.align	5
+bn_mul_mont:
+	AARCH64_SIGN_LINK_REGISTER
+	tst	x5,#7
+	b.eq	__bn_sqr8x_mont
+	tst	x5,#3
+	b.eq	__bn_mul4x_mont
+.Lmul_mont:
+	stp	x29,x30,[sp,#-64]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+
+	ldr	x9,[x2],#8		// bp[0]
+	sub	x22,sp,x5,lsl#3
+	ldp	x7,x8,[x1],#16	// ap[0..1]
+	lsl	x5,x5,#3
+	ldr	x4,[x4]		// *n0
+	and	x22,x22,#-16		// ABI says so
+	ldp	x13,x14,[x3],#16	// np[0..1]
+
+	mul	x6,x7,x9		// ap[0]*bp[0]
+	sub	x21,x5,#16		// j=num-2
+	umulh	x7,x7,x9
+	mul	x10,x8,x9		// ap[1]*bp[0]
+	umulh	x11,x8,x9
+
+	mul	x15,x6,x4		// "tp[0]"*n0
+	mov	sp,x22			// alloca
+
+	// (*)	mul	x12,x13,x15	// np[0]*m1
+	umulh	x13,x13,x15
+	mul	x16,x14,x15		// np[1]*m1
+	// (*)	adds	x12,x12,x6	// discarded
+	// (*)	On the removal of the first multiplication and addition
+	//	instructions: the outcome of the first addition is
+	//	guaranteed to be zero, which leaves two computationally
+	//	significant outcomes: it either carries or it does not.
+	//	So when does it carry? Is there an alternative way to
+	//	deduce it? If you follow the operations, you can observe
+	//	that the condition for carry is quite simple: x6 being
+	//	non-zero. So the carry can be calculated by adding -1 to
+	//	x6. That's what the next instruction does.
+	subs	xzr,x6,#1		// (*)
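+	//	(Concretely: subs xzr,x6,#1 sets the AArch64 carry flag,
+	//	which means "no borrow", exactly when x6 >= 1, i.e. when
+	//	x6 is non-zero -- the same carry the discarded addition
+	//	would have produced.)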
+	umulh	x17,x14,x15
+	adc	x13,x13,xzr
+	cbz	x21,.L1st_skip
+
+.L1st:
+	ldr	x8,[x1],#8
+	adds	x6,x10,x7
+	sub	x21,x21,#8		// j--
+	adc	x7,x11,xzr
+
+	ldr	x14,[x3],#8
+	adds	x12,x16,x13
+	mul	x10,x8,x9		// ap[j]*bp[0]
+	adc	x13,x17,xzr
+	umulh	x11,x8,x9
+
+	adds	x12,x12,x6
+	mul	x16,x14,x15		// np[j]*m1
+	adc	x13,x13,xzr
+	umulh	x17,x14,x15
+	str	x12,[x22],#8		// tp[j-1]
+	cbnz	x21,.L1st
+
+.L1st_skip:
+	adds	x6,x10,x7
+	sub	x1,x1,x5		// rewind x1
+	adc	x7,x11,xzr
+
+	adds	x12,x16,x13
+	sub	x3,x3,x5		// rewind x3
+	adc	x13,x17,xzr
+
+	adds	x12,x12,x6
+	sub	x20,x5,#8		// i=num-1
+	adcs	x13,x13,x7
+
+	adc	x19,xzr,xzr		// upmost overflow bit
+	stp	x12,x13,[x22]
+
+.Louter:
+	ldr	x9,[x2],#8		// bp[i]
+	ldp	x7,x8,[x1],#16
+	ldr	x23,[sp]		// tp[0]
+	add	x22,sp,#8
+
+	mul	x6,x7,x9		// ap[0]*bp[i]
+	sub	x21,x5,#16		// j=num-2
+	umulh	x7,x7,x9
+	ldp	x13,x14,[x3],#16
+	mul	x10,x8,x9		// ap[1]*bp[i]
+	adds	x6,x6,x23
+	umulh	x11,x8,x9
+	adc	x7,x7,xzr
+
+	mul	x15,x6,x4
+	sub	x20,x20,#8		// i--
+
+	// (*)	mul	x12,x13,x15	// np[0]*m1
+	umulh	x13,x13,x15
+	mul	x16,x14,x15		// np[1]*m1
+	// (*)	adds	x12,x12,x6
+	subs	xzr,x6,#1		// (*)
+	umulh	x17,x14,x15
+	cbz	x21,.Linner_skip
+
+.Linner:
+	ldr	x8,[x1],#8
+	adc	x13,x13,xzr
+	ldr	x23,[x22],#8		// tp[j]
+	adds	x6,x10,x7
+	sub	x21,x21,#8		// j--
+	adc	x7,x11,xzr
+
+	adds	x12,x16,x13
+	ldr	x14,[x3],#8
+	adc	x13,x17,xzr
+
+	mul	x10,x8,x9		// ap[j]*bp[i]
+	adds	x6,x6,x23
+	umulh	x11,x8,x9
+	adc	x7,x7,xzr
+
+	mul	x16,x14,x15		// np[j]*m1
+	adds	x12,x12,x6
+	umulh	x17,x14,x15
+	str	x12,[x22,#-16]		// tp[j-1]
+	cbnz	x21,.Linner
+
+.Linner_skip:
+	ldr	x23,[x22],#8		// tp[j]
+	adc	x13,x13,xzr
+	adds	x6,x10,x7
+	sub	x1,x1,x5		// rewind x1
+	adc	x7,x11,xzr
+
+	adds	x12,x16,x13
+	sub	x3,x3,x5		// rewind x3
+	adcs	x13,x17,x19
+	adc	x19,xzr,xzr
+
+	adds	x6,x6,x23
+	adc	x7,x7,xzr
+
+	adds	x12,x12,x6
+	adcs	x13,x13,x7
+	adc	x19,x19,xzr		// upmost overflow bit
+	stp	x12,x13,[x22,#-16]
+
+	cbnz	x20,.Louter
+
+	// Final step. We see if the result is larger than the modulus,
+	// and if it is, subtract the modulus. But comparison implies
+	// subtraction, so we subtract the modulus, see if it borrowed,
+	// and conditionally copy the original value.
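+	// In rough C terms, with hypothetical names:
+	//	borrow = sub_words(tmp, tp, np, num);
+	//	rp[j] = borrow ? tp[j] : tmp[j];	// selected via csel below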
+	ldr	x23,[sp]		// tp[0]
+	add	x22,sp,#8
+	ldr	x14,[x3],#8		// np[0]
+	subs	x21,x5,#8		// j=num-1 and clear borrow
+	mov	x1,x0
+.Lsub:
+	sbcs	x8,x23,x14		// tp[j]-np[j]
+	ldr	x23,[x22],#8
+	sub	x21,x21,#8		// j--
+	ldr	x14,[x3],#8
+	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
+	cbnz	x21,.Lsub
+
+	sbcs	x8,x23,x14
+	sbcs	x19,x19,xzr		// did it borrow?
+	str	x8,[x1],#8		// rp[num-1]
+
+	ldr	x23,[sp]		// tp[0]
+	add	x22,sp,#8
+	ldr	x8,[x0],#8		// rp[0]
+	sub	x5,x5,#8		// num--
+	nop
+.Lcond_copy:
+	sub	x5,x5,#8		// num--
+	csel	x14,x23,x8,lo		// did it borrow?
+	ldr	x23,[x22],#8
+	ldr	x8,[x0],#8
+	str	xzr,[x22,#-16]		// wipe tp
+	str	x14,[x0,#-16]
+	cbnz	x5,.Lcond_copy
+
+	csel	x14,x23,x8,lo
+	str	xzr,[x22,#-8]		// wipe tp
+	str	x14,[x0,#-8]
+
+	ldp	x19,x20,[x29,#16]
+	mov	sp,x29
+	ldp	x21,x22,[x29,#32]
+	mov	x0,#1
+	ldp	x23,x24,[x29,#48]
+	ldr	x29,[sp],#64
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	bn_mul_mont,.-bn_mul_mont
+.type	__bn_sqr8x_mont,%function
+.align	5
+__bn_sqr8x_mont:
+	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
+	// only from bn_mul_mont which has already signed the return address.
+	cmp	x1,x2
+	b.ne	__bn_mul4x_mont
+.Lsqr8x_mont:
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	stp	x0,x3,[sp,#96]	// offload rp and np
+
+	ldp	x6,x7,[x1,#8*0]
+	ldp	x8,x9,[x1,#8*2]
+	ldp	x10,x11,[x1,#8*4]
+	ldp	x12,x13,[x1,#8*6]
+
+	sub	x2,sp,x5,lsl#4
+	lsl	x5,x5,#3
+	ldr	x4,[x4]		// *n0
+	mov	sp,x2			// alloca
+	sub	x27,x5,#8*8
+	b	.Lsqr8x_zero_start
+
+.Lsqr8x_zero:
+	sub	x27,x27,#8*8
+	stp	xzr,xzr,[x2,#8*0]
+	stp	xzr,xzr,[x2,#8*2]
+	stp	xzr,xzr,[x2,#8*4]
+	stp	xzr,xzr,[x2,#8*6]
+.Lsqr8x_zero_start:
+	stp	xzr,xzr,[x2,#8*8]
+	stp	xzr,xzr,[x2,#8*10]
+	stp	xzr,xzr,[x2,#8*12]
+	stp	xzr,xzr,[x2,#8*14]
+	add	x2,x2,#8*16
+	cbnz	x27,.Lsqr8x_zero
+
+	add	x3,x1,x5
+	add	x1,x1,#8*8
+	mov	x19,xzr
+	mov	x20,xzr
+	mov	x21,xzr
+	mov	x22,xzr
+	mov	x23,xzr
+	mov	x24,xzr
+	mov	x25,xzr
+	mov	x26,xzr
+	mov	x2,sp
+	str	x4,[x29,#112]		// offload n0
+
+	// Multiply everything but a[i]*a[i]
+.align	4
+.Lsqr8x_outer_loop:
+        //                                                 a[1]a[0]	(i)
+        //                                             a[2]a[0]
+        //                                         a[3]a[0]
+        //                                     a[4]a[0]
+        //                                 a[5]a[0]
+        //                             a[6]a[0]
+        //                         a[7]a[0]
+        //                                         a[2]a[1]		(ii)
+        //                                     a[3]a[1]
+        //                                 a[4]a[1]
+        //                             a[5]a[1]
+        //                         a[6]a[1]
+        //                     a[7]a[1]
+        //                                 a[3]a[2]			(iii)
+        //                             a[4]a[2]
+        //                         a[5]a[2]
+        //                     a[6]a[2]
+        //                 a[7]a[2]
+        //                         a[4]a[3]				(iv)
+        //                     a[5]a[3]
+        //                 a[6]a[3]
+        //             a[7]a[3]
+        //                 a[5]a[4]					(v)
+        //             a[6]a[4]
+        //         a[7]a[4]
+        //         a[6]a[5]						(vi)
+        //     a[7]a[5]
+        // a[7]a[6]							(vii)
+
+	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
+	mul	x15,x8,x6
+	mul	x16,x9,x6
+	mul	x17,x10,x6
+	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
+	mul	x14,x11,x6
+	adcs	x21,x21,x15
+	mul	x15,x12,x6
+	adcs	x22,x22,x16
+	mul	x16,x13,x6
+	adcs	x23,x23,x17
+	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
+	adcs	x24,x24,x14
+	umulh	x14,x8,x6
+	adcs	x25,x25,x15
+	umulh	x15,x9,x6
+	adcs	x26,x26,x16
+	umulh	x16,x10,x6
+	stp	x19,x20,[x2],#8*2	// t[0..1]
+	adc	x19,xzr,xzr		// t[8]
+	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
+	umulh	x17,x11,x6
+	adcs	x22,x22,x14
+	umulh	x14,x12,x6
+	adcs	x23,x23,x15
+	umulh	x15,x13,x6
+	adcs	x24,x24,x16
+	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
+	adcs	x25,x25,x17
+	mul	x17,x9,x7
+	adcs	x26,x26,x14
+	mul	x14,x10,x7
+	adc	x19,x19,x15
+
+	mul	x15,x11,x7
+	adds	x22,x22,x16
+	mul	x16,x12,x7
+	adcs	x23,x23,x17
+	mul	x17,x13,x7
+	adcs	x24,x24,x14
+	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
+	adcs	x25,x25,x15
+	umulh	x15,x9,x7
+	adcs	x26,x26,x16
+	umulh	x16,x10,x7
+	adcs	x19,x19,x17
+	umulh	x17,x11,x7
+	stp	x21,x22,[x2],#8*2	// t[2..3]
+	adc	x20,xzr,xzr		// t[9]
+	adds	x23,x23,x14
+	umulh	x14,x12,x7
+	adcs	x24,x24,x15
+	umulh	x15,x13,x7
+	adcs	x25,x25,x16
+	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
+	adcs	x26,x26,x17
+	mul	x17,x10,x8
+	adcs	x19,x19,x14
+	mul	x14,x11,x8
+	adc	x20,x20,x15
+
+	mul	x15,x12,x8
+	adds	x24,x24,x16
+	mul	x16,x13,x8
+	adcs	x25,x25,x17
+	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
+	adcs	x26,x26,x14
+	umulh	x14,x10,x8
+	adcs	x19,x19,x15
+	umulh	x15,x11,x8
+	adcs	x20,x20,x16
+	umulh	x16,x12,x8
+	stp	x23,x24,[x2],#8*2	// t[4..5]
+	adc	x21,xzr,xzr		// t[10]
+	adds	x25,x25,x17
+	umulh	x17,x13,x8
+	adcs	x26,x26,x14
+	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
+	adcs	x19,x19,x15
+	mul	x15,x11,x9
+	adcs	x20,x20,x16
+	mul	x16,x12,x9
+	adc	x21,x21,x17
+
+	mul	x17,x13,x9
+	adds	x26,x26,x14
+	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
+	adcs	x19,x19,x15
+	umulh	x15,x11,x9
+	adcs	x20,x20,x16
+	umulh	x16,x12,x9
+	adcs	x21,x21,x17
+	umulh	x17,x13,x9
+	stp	x25,x26,[x2],#8*2	// t[6..7]
+	adc	x22,xzr,xzr		// t[11]
+	adds	x19,x19,x14
+	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
+	adcs	x20,x20,x15
+	mul	x15,x12,x10
+	adcs	x21,x21,x16
+	mul	x16,x13,x10
+	adc	x22,x22,x17
+
+	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
+	adds	x20,x20,x14
+	umulh	x14,x12,x10
+	adcs	x21,x21,x15
+	umulh	x15,x13,x10
+	adcs	x22,x22,x16
+	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
+	adc	x23,xzr,xzr		// t[12]
+	adds	x21,x21,x17
+	mul	x17,x13,x11
+	adcs	x22,x22,x14
+	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
+	adc	x23,x23,x15
+
+	umulh	x15,x13,x11
+	adds	x22,x22,x16
+	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
+	adcs	x23,x23,x17
+	umulh	x17,x13,x12		// hi(a[7]*a[6])
+	adc	x24,xzr,xzr		// t[13]
+	adds	x23,x23,x14
+	sub	x27,x3,x1	// done yet?
+	adc	x24,x24,x15
+
+	adds	x24,x24,x16
+	sub	x14,x3,x5	// rewound ap
+	adc	x25,xzr,xzr		// t[14]
+	add	x25,x25,x17
+
+	cbz	x27,.Lsqr8x_outer_break
+
+	mov	x4,x6
+	ldp	x6,x7,[x2,#8*0]
+	ldp	x8,x9,[x2,#8*2]
+	ldp	x10,x11,[x2,#8*4]
+	ldp	x12,x13,[x2,#8*6]
+	adds	x19,x19,x6
+	adcs	x20,x20,x7
+	ldp	x6,x7,[x1,#8*0]
+	adcs	x21,x21,x8
+	adcs	x22,x22,x9
+	ldp	x8,x9,[x1,#8*2]
+	adcs	x23,x23,x10
+	adcs	x24,x24,x11
+	ldp	x10,x11,[x1,#8*4]
+	adcs	x25,x25,x12
+	mov	x0,x1
+	adcs	x26,xzr,x13
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+	//adc	x28,xzr,xzr		// moved below
+	mov	x27,#-8*8
+
+	//                                                         a[8]a[0]
+	//                                                     a[9]a[0]
+	//                                                 a[a]a[0]
+	//                                             a[b]a[0]
+	//                                         a[c]a[0]
+	//                                     a[d]a[0]
+	//                                 a[e]a[0]
+	//                             a[f]a[0]
+	//                                                     a[8]a[1]
+	//                         a[f]a[1]........................
+	//                                                 a[8]a[2]
+	//                     a[f]a[2]........................
+	//                                             a[8]a[3]
+	//                 a[f]a[3]........................
+	//                                         a[8]a[4]
+	//             a[f]a[4]........................
+	//                                     a[8]a[5]
+	//         a[f]a[5]........................
+	//                                 a[8]a[6]
+	//     a[f]a[6]........................
+	//                             a[8]a[7]
+	// a[f]a[7]........................
+.Lsqr8x_mul:
+	mul	x14,x6,x4
+	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
+	mul	x15,x7,x4
+	add	x27,x27,#8
+	mul	x16,x8,x4
+	mul	x17,x9,x4
+	adds	x19,x19,x14
+	mul	x14,x10,x4
+	adcs	x20,x20,x15
+	mul	x15,x11,x4
+	adcs	x21,x21,x16
+	mul	x16,x12,x4
+	adcs	x22,x22,x17
+	mul	x17,x13,x4
+	adcs	x23,x23,x14
+	umulh	x14,x6,x4
+	adcs	x24,x24,x15
+	umulh	x15,x7,x4
+	adcs	x25,x25,x16
+	umulh	x16,x8,x4
+	adcs	x26,x26,x17
+	umulh	x17,x9,x4
+	adc	x28,x28,xzr
+	str	x19,[x2],#8
+	adds	x19,x20,x14
+	umulh	x14,x10,x4
+	adcs	x20,x21,x15
+	umulh	x15,x11,x4
+	adcs	x21,x22,x16
+	umulh	x16,x12,x4
+	adcs	x22,x23,x17
+	umulh	x17,x13,x4
+	ldr	x4,[x0,x27]
+	adcs	x23,x24,x14
+	adcs	x24,x25,x15
+	adcs	x25,x26,x16
+	adcs	x26,x28,x17
+	//adc	x28,xzr,xzr		// moved above
+	cbnz	x27,.Lsqr8x_mul
+					// note that carry flag is guaranteed
+					// to be zero at this point
+	cmp	x1,x3		// done yet?
+	b.eq	.Lsqr8x_break
+
+	ldp	x6,x7,[x2,#8*0]
+	ldp	x8,x9,[x2,#8*2]
+	ldp	x10,x11,[x2,#8*4]
+	ldp	x12,x13,[x2,#8*6]
+	adds	x19,x19,x6
+	ldr	x4,[x0,#-8*8]
+	adcs	x20,x20,x7
+	ldp	x6,x7,[x1,#8*0]
+	adcs	x21,x21,x8
+	adcs	x22,x22,x9
+	ldp	x8,x9,[x1,#8*2]
+	adcs	x23,x23,x10
+	adcs	x24,x24,x11
+	ldp	x10,x11,[x1,#8*4]
+	adcs	x25,x25,x12
+	mov	x27,#-8*8
+	adcs	x26,x26,x13
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+	//adc	x28,xzr,xzr		// moved above
+	b	.Lsqr8x_mul
+
+.align	4
+.Lsqr8x_break:
+	ldp	x6,x7,[x0,#8*0]
+	add	x1,x0,#8*8
+	ldp	x8,x9,[x0,#8*2]
+	sub	x14,x3,x1		// is it last iteration?
+	ldp	x10,x11,[x0,#8*4]
+	sub	x15,x2,x14
+	ldp	x12,x13,[x0,#8*6]
+	cbz	x14,.Lsqr8x_outer_loop
+
+	stp	x19,x20,[x2,#8*0]
+	ldp	x19,x20,[x15,#8*0]
+	stp	x21,x22,[x2,#8*2]
+	ldp	x21,x22,[x15,#8*2]
+	stp	x23,x24,[x2,#8*4]
+	ldp	x23,x24,[x15,#8*4]
+	stp	x25,x26,[x2,#8*6]
+	mov	x2,x15
+	ldp	x25,x26,[x15,#8*6]
+	b	.Lsqr8x_outer_loop
+
+.align	4
+.Lsqr8x_outer_break:
+	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
+	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
+	ldp	x15,x16,[sp,#8*1]
+	ldp	x11,x13,[x14,#8*2]
+	add	x1,x14,#8*4
+	ldp	x17,x14,[sp,#8*3]
+
+	stp	x19,x20,[x2,#8*0]
+	mul	x19,x7,x7
+	stp	x21,x22,[x2,#8*2]
+	umulh	x7,x7,x7
+	stp	x23,x24,[x2,#8*4]
+	mul	x8,x9,x9
+	stp	x25,x26,[x2,#8*6]
+	mov	x2,sp
+	umulh	x9,x9,x9
+	adds	x20,x7,x15,lsl#1
+	extr	x15,x16,x15,#63
+	sub	x27,x5,#8*4
+
+.Lsqr4x_shift_n_add:
+	adcs	x21,x8,x15
+	extr	x16,x17,x16,#63
+	sub	x27,x27,#8*4
+	adcs	x22,x9,x16
+	ldp	x15,x16,[x2,#8*5]
+	mul	x10,x11,x11
+	ldp	x7,x9,[x1],#8*2
+	umulh	x11,x11,x11
+	mul	x12,x13,x13
+	umulh	x13,x13,x13
+	extr	x17,x14,x17,#63
+	stp	x19,x20,[x2,#8*0]
+	adcs	x23,x10,x17
+	extr	x14,x15,x14,#63
+	stp	x21,x22,[x2,#8*2]
+	adcs	x24,x11,x14
+	ldp	x17,x14,[x2,#8*7]
+	extr	x15,x16,x15,#63
+	adcs	x25,x12,x15
+	extr	x16,x17,x16,#63
+	adcs	x26,x13,x16
+	ldp	x15,x16,[x2,#8*9]
+	mul	x6,x7,x7
+	ldp	x11,x13,[x1],#8*2
+	umulh	x7,x7,x7
+	mul	x8,x9,x9
+	umulh	x9,x9,x9
+	stp	x23,x24,[x2,#8*4]
+	extr	x17,x14,x17,#63
+	stp	x25,x26,[x2,#8*6]
+	add	x2,x2,#8*8
+	adcs	x19,x6,x17
+	extr	x14,x15,x14,#63
+	adcs	x20,x7,x14
+	ldp	x17,x14,[x2,#8*3]
+	extr	x15,x16,x15,#63
+	cbnz	x27,.Lsqr4x_shift_n_add
+	ldp	x1,x4,[x29,#104]	// pull np and n0
+
+	adcs	x21,x8,x15
+	extr	x16,x17,x16,#63
+	adcs	x22,x9,x16
+	ldp	x15,x16,[x2,#8*5]
+	mul	x10,x11,x11
+	umulh	x11,x11,x11
+	stp	x19,x20,[x2,#8*0]
+	mul	x12,x13,x13
+	umulh	x13,x13,x13
+	stp	x21,x22,[x2,#8*2]
+	extr	x17,x14,x17,#63
+	adcs	x23,x10,x17
+	extr	x14,x15,x14,#63
+	ldp	x19,x20,[sp,#8*0]
+	adcs	x24,x11,x14
+	extr	x15,x16,x15,#63
+	ldp	x6,x7,[x1,#8*0]
+	adcs	x25,x12,x15
+	extr	x16,xzr,x16,#63
+	ldp	x8,x9,[x1,#8*2]
+	adc	x26,x13,x16
+	ldp	x10,x11,[x1,#8*4]
+
+	// Reduce by 512 bits per iteration
+	mul	x28,x4,x19		// t[0]*n0
+	ldp	x12,x13,[x1,#8*6]
+	add	x3,x1,x5
+	ldp	x21,x22,[sp,#8*2]
+	stp	x23,x24,[x2,#8*4]
+	ldp	x23,x24,[sp,#8*4]
+	stp	x25,x26,[x2,#8*6]
+	ldp	x25,x26,[sp,#8*6]
+	add	x1,x1,#8*8
+	mov	x30,xzr		// initial top-most carry
+	mov	x2,sp
+	mov	x27,#8
+
+.Lsqr8x_reduction:
+	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
+	mul	x15,x7,x28
+	sub	x27,x27,#1
+	mul	x16,x8,x28
+	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
+	mul	x17,x9,x28
+	// (*)	adds	xzr,x19,x14
+	subs	xzr,x19,#1		// (*)
+	mul	x14,x10,x28
+	adcs	x19,x20,x15
+	mul	x15,x11,x28
+	adcs	x20,x21,x16
+	mul	x16,x12,x28
+	adcs	x21,x22,x17
+	mul	x17,x13,x28
+	adcs	x22,x23,x14
+	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
+	adcs	x23,x24,x15
+	umulh	x15,x7,x28
+	adcs	x24,x25,x16
+	umulh	x16,x8,x28
+	adcs	x25,x26,x17
+	umulh	x17,x9,x28
+	adc	x26,xzr,xzr
+	adds	x19,x19,x14
+	umulh	x14,x10,x28
+	adcs	x20,x20,x15
+	umulh	x15,x11,x28
+	adcs	x21,x21,x16
+	umulh	x16,x12,x28
+	adcs	x22,x22,x17
+	umulh	x17,x13,x28
+	mul	x28,x4,x19		// next t[0]*n0
+	adcs	x23,x23,x14
+	adcs	x24,x24,x15
+	adcs	x25,x25,x16
+	adc	x26,x26,x17
+	cbnz	x27,.Lsqr8x_reduction
+
+	ldp	x14,x15,[x2,#8*0]
+	ldp	x16,x17,[x2,#8*2]
+	mov	x0,x2
+	sub	x27,x3,x1	// done yet?
+	adds	x19,x19,x14
+	adcs	x20,x20,x15
+	ldp	x14,x15,[x2,#8*4]
+	adcs	x21,x21,x16
+	adcs	x22,x22,x17
+	ldp	x16,x17,[x2,#8*6]
+	adcs	x23,x23,x14
+	adcs	x24,x24,x15
+	adcs	x25,x25,x16
+	adcs	x26,x26,x17
+	//adc	x28,xzr,xzr		// moved below
+	cbz	x27,.Lsqr8x8_post_condition
+
+	ldr	x4,[x2,#-8*8]
+	ldp	x6,x7,[x1,#8*0]
+	ldp	x8,x9,[x1,#8*2]
+	ldp	x10,x11,[x1,#8*4]
+	mov	x27,#-8*8
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+
+.Lsqr8x_tail:
+	mul	x14,x6,x4
+	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
+	mul	x15,x7,x4
+	add	x27,x27,#8
+	mul	x16,x8,x4
+	mul	x17,x9,x4
+	adds	x19,x19,x14
+	mul	x14,x10,x4
+	adcs	x20,x20,x15
+	mul	x15,x11,x4
+	adcs	x21,x21,x16
+	mul	x16,x12,x4
+	adcs	x22,x22,x17
+	mul	x17,x13,x4
+	adcs	x23,x23,x14
+	umulh	x14,x6,x4
+	adcs	x24,x24,x15
+	umulh	x15,x7,x4
+	adcs	x25,x25,x16
+	umulh	x16,x8,x4
+	adcs	x26,x26,x17
+	umulh	x17,x9,x4
+	adc	x28,x28,xzr
+	str	x19,[x2],#8
+	adds	x19,x20,x14
+	umulh	x14,x10,x4
+	adcs	x20,x21,x15
+	umulh	x15,x11,x4
+	adcs	x21,x22,x16
+	umulh	x16,x12,x4
+	adcs	x22,x23,x17
+	umulh	x17,x13,x4
+	ldr	x4,[x0,x27]
+	adcs	x23,x24,x14
+	adcs	x24,x25,x15
+	adcs	x25,x26,x16
+	adcs	x26,x28,x17
+	//adc	x28,xzr,xzr		// moved above
+	cbnz	x27,.Lsqr8x_tail
+					// note that carry flag is guaranteed
+					// to be zero at this point
+	ldp	x6,x7,[x2,#8*0]
+	sub	x27,x3,x1	// done yet?
+	sub	x16,x3,x5	// rewound np
+	ldp	x8,x9,[x2,#8*2]
+	ldp	x10,x11,[x2,#8*4]
+	ldp	x12,x13,[x2,#8*6]
+	cbz	x27,.Lsqr8x_tail_break
+
+	ldr	x4,[x0,#-8*8]
+	adds	x19,x19,x6
+	adcs	x20,x20,x7
+	ldp	x6,x7,[x1,#8*0]
+	adcs	x21,x21,x8
+	adcs	x22,x22,x9
+	ldp	x8,x9,[x1,#8*2]
+	adcs	x23,x23,x10
+	adcs	x24,x24,x11
+	ldp	x10,x11,[x1,#8*4]
+	adcs	x25,x25,x12
+	mov	x27,#-8*8
+	adcs	x26,x26,x13
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+	//adc	x28,xzr,xzr		// moved above
+	b	.Lsqr8x_tail
+
+.align	4
+.Lsqr8x_tail_break:
+	ldr	x4,[x29,#112]		// pull n0
+	add	x27,x2,#8*8		// end of current t[num] window
+
+	subs	xzr,x30,#1		// "move" top-most carry to carry bit
+	adcs	x14,x19,x6
+	adcs	x15,x20,x7
+	ldp	x19,x20,[x0,#8*0]
+	adcs	x21,x21,x8
+	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
+	adcs	x22,x22,x9
+	ldp	x8,x9,[x16,#8*2]
+	adcs	x23,x23,x10
+	adcs	x24,x24,x11
+	ldp	x10,x11,[x16,#8*4]
+	adcs	x25,x25,x12
+	adcs	x26,x26,x13
+	ldp	x12,x13,[x16,#8*6]
+	add	x1,x16,#8*8
+	adc	x30,xzr,xzr	// top-most carry
+	mul	x28,x4,x19
+	stp	x14,x15,[x2,#8*0]
+	stp	x21,x22,[x2,#8*2]
+	ldp	x21,x22,[x0,#8*2]
+	stp	x23,x24,[x2,#8*4]
+	ldp	x23,x24,[x0,#8*4]
+	cmp	x27,x29		// did we hit the bottom?
+	stp	x25,x26,[x2,#8*6]
+	mov	x2,x0			// slide the window
+	ldp	x25,x26,[x0,#8*6]
+	mov	x27,#8
+	b.ne	.Lsqr8x_reduction
+
+	// Final step. We see if the result is larger than the modulus, and
+	// if it is, subtract the modulus. But comparison implies
+	// subtraction, so we subtract the modulus, see if it borrowed,
+	// and conditionally copy the original value.
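+	// The subtraction below streams t - n into rp; the borrow out of
+	// the final sbcs then drives the csel's in .Lsqr4x_cond_copy
+	// ("lo" = borrowed, so keep t), while the stack scratch is wiped
+	// with xzr stores. There is no data-dependent branch.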
+	ldr	x0,[x29,#96]		// pull rp
+	add	x2,x2,#8*8
+	subs	x14,x19,x6
+	sbcs	x15,x20,x7
+	sub	x27,x5,#8*8
+	mov	x3,x0		// x0 copy
+
+.Lsqr8x_sub:
+	sbcs	x16,x21,x8
+	ldp	x6,x7,[x1,#8*0]
+	sbcs	x17,x22,x9
+	stp	x14,x15,[x0,#8*0]
+	sbcs	x14,x23,x10
+	ldp	x8,x9,[x1,#8*2]
+	sbcs	x15,x24,x11
+	stp	x16,x17,[x0,#8*2]
+	sbcs	x16,x25,x12
+	ldp	x10,x11,[x1,#8*4]
+	sbcs	x17,x26,x13
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+	ldp	x19,x20,[x2,#8*0]
+	sub	x27,x27,#8*8
+	ldp	x21,x22,[x2,#8*2]
+	ldp	x23,x24,[x2,#8*4]
+	ldp	x25,x26,[x2,#8*6]
+	add	x2,x2,#8*8
+	stp	x14,x15,[x0,#8*4]
+	sbcs	x14,x19,x6
+	stp	x16,x17,[x0,#8*6]
+	add	x0,x0,#8*8
+	sbcs	x15,x20,x7
+	cbnz	x27,.Lsqr8x_sub
+
+	sbcs	x16,x21,x8
+	mov	x2,sp
+	add	x1,sp,x5
+	ldp	x6,x7,[x3,#8*0]
+	sbcs	x17,x22,x9
+	stp	x14,x15,[x0,#8*0]
+	sbcs	x14,x23,x10
+	ldp	x8,x9,[x3,#8*2]
+	sbcs	x15,x24,x11
+	stp	x16,x17,[x0,#8*2]
+	sbcs	x16,x25,x12
+	ldp	x19,x20,[x1,#8*0]
+	sbcs	x17,x26,x13
+	ldp	x21,x22,[x1,#8*2]
+	sbcs	xzr,x30,xzr	// did it borrow?
+	ldr	x30,[x29,#8]		// pull return address
+	stp	x14,x15,[x0,#8*4]
+	stp	x16,x17,[x0,#8*6]
+
+	sub	x27,x5,#8*4
+.Lsqr4x_cond_copy:
+	sub	x27,x27,#8*4
+	csel	x14,x19,x6,lo
+	stp	xzr,xzr,[x2,#8*0]
+	csel	x15,x20,x7,lo
+	ldp	x6,x7,[x3,#8*4]
+	ldp	x19,x20,[x1,#8*4]
+	csel	x16,x21,x8,lo
+	stp	xzr,xzr,[x2,#8*2]
+	add	x2,x2,#8*4
+	csel	x17,x22,x9,lo
+	ldp	x8,x9,[x3,#8*6]
+	ldp	x21,x22,[x1,#8*6]
+	add	x1,x1,#8*4
+	stp	x14,x15,[x3,#8*0]
+	stp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+	stp	xzr,xzr,[x1,#8*0]
+	stp	xzr,xzr,[x1,#8*2]
+	cbnz	x27,.Lsqr4x_cond_copy
+
+	csel	x14,x19,x6,lo
+	stp	xzr,xzr,[x2,#8*0]
+	csel	x15,x20,x7,lo
+	stp	xzr,xzr,[x2,#8*2]
+	csel	x16,x21,x8,lo
+	csel	x17,x22,x9,lo
+	stp	x14,x15,[x3,#8*0]
+	stp	x16,x17,[x3,#8*2]
+
+	b	.Lsqr8x_done
+
+.align	4
+.Lsqr8x8_post_condition:
+	adc	x28,xzr,xzr
+	ldr	x30,[x29,#8]		// pull return address
+	// x19-x26,x28 hold result, x6-x13 hold modulus
+	subs	x6,x19,x6
+	ldr	x1,[x29,#96]		// pull rp
+	sbcs	x7,x20,x7
+	stp	xzr,xzr,[sp,#8*0]
+	sbcs	x8,x21,x8
+	stp	xzr,xzr,[sp,#8*2]
+	sbcs	x9,x22,x9
+	stp	xzr,xzr,[sp,#8*4]
+	sbcs	x10,x23,x10
+	stp	xzr,xzr,[sp,#8*6]
+	sbcs	x11,x24,x11
+	stp	xzr,xzr,[sp,#8*8]
+	sbcs	x12,x25,x12
+	stp	xzr,xzr,[sp,#8*10]
+	sbcs	x13,x26,x13
+	stp	xzr,xzr,[sp,#8*12]
+	sbcs	x28,x28,xzr	// did it borrow?
+	stp	xzr,xzr,[sp,#8*14]
+
+	// x6-x13 hold result-modulus
+	csel	x6,x19,x6,lo
+	csel	x7,x20,x7,lo
+	csel	x8,x21,x8,lo
+	csel	x9,x22,x9,lo
+	stp	x6,x7,[x1,#8*0]
+	csel	x10,x23,x10,lo
+	csel	x11,x24,x11,lo
+	stp	x8,x9,[x1,#8*2]
+	csel	x12,x25,x12,lo
+	csel	x13,x26,x13,lo
+	stp	x10,x11,[x1,#8*4]
+	stp	x12,x13,[x1,#8*6]
+
+.Lsqr8x_done:
+	ldp	x19,x20,[x29,#16]
+	mov	sp,x29
+	ldp	x21,x22,[x29,#32]
+	mov	x0,#1
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	// x30 is popped earlier
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
+.type	__bn_mul4x_mont,%function
+.align	5
+__bn_mul4x_mont:
+	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
+	// only from bn_mul_mont or __bn_sqr8x_mont which have already signed the
+	// return address.
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+	sub	x26,sp,x5,lsl#3
+	lsl	x5,x5,#3
+	ldr	x4,[x4]		// *n0
+	sub	sp,x26,#8*4		// alloca
+
+	add	x10,x2,x5
+	add	x27,x1,x5
+	stp	x0,x10,[x29,#96]	// offload rp and &b[num]
+
+	ldr	x24,[x2,#8*0]		// b[0]
+	ldp	x6,x7,[x1,#8*0]	// a[0..3]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	mov	x19,xzr
+	mov	x20,xzr
+	mov	x21,xzr
+	mov	x22,xzr
+	ldp	x14,x15,[x3,#8*0]	// n[0..3]
+	ldp	x16,x17,[x3,#8*2]
+	adds	x3,x3,#8*4		// clear carry bit
+	mov	x0,xzr
+	mov	x28,#0
+	mov	x26,sp
+
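+	// x28 steps 8,16,24,0 (masked with #31) and indexes both the
+	// current 4-word window of b[] and the t[0]*n0 values saved on
+	// the stack for the tail loops.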
+.Loop_mul4x_1st_reduction:
+	mul	x10,x6,x24		// lo(a[0..3]*b[0])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
+	adcs	x20,x20,x11
+	mul	x25,x19,x4		// t[0]*n0
+	adcs	x21,x21,x12
+	umulh	x11,x7,x24
+	adcs	x22,x22,x13
+	umulh	x12,x8,x24
+	adc	x23,xzr,xzr
+	umulh	x13,x9,x24
+	ldr	x24,[x2,x28]		// next b[i] (or b[0])
+	adds	x20,x20,x10
+	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
+	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
+	adcs	x21,x21,x11
+	mul	x11,x15,x25
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13		// can't overflow
+	mul	x13,x17,x25
+	// (*)	adds	xzr,x19,x10
+	subs	xzr,x19,#1		// (*)
+	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
+	adcs	x19,x20,x11
+	umulh	x11,x15,x25
+	adcs	x20,x21,x12
+	umulh	x12,x16,x25
+	adcs	x21,x22,x13
+	umulh	x13,x17,x25
+	adcs	x22,x23,x0
+	adc	x0,xzr,xzr
+	adds	x19,x19,x10
+	sub	x10,x27,x1
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,.Loop_mul4x_1st_reduction
+
+	cbz	x10,.Lmul4x4_post_condition
+
+	ldp	x6,x7,[x1,#8*0]	// a[4..7]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	ldr	x25,[sp]		// a[0]*n0
+	ldp	x14,x15,[x3,#8*0]	// n[4..7]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+
+.Loop_mul4x_1st_tail:
+	mul	x10,x6,x24		// lo(a[4..7]*b[i])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
+	adcs	x20,x20,x11
+	umulh	x11,x7,x24
+	adcs	x21,x21,x12
+	umulh	x12,x8,x24
+	adcs	x22,x22,x13
+	umulh	x13,x9,x24
+	adc	x23,xzr,xzr
+	ldr	x24,[x2,x28]		// next b[i] (or b[0])
+	adds	x20,x20,x10
+	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
+	adcs	x21,x21,x11
+	mul	x11,x15,x25
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13		// can't overflow
+	mul	x13,x17,x25
+	adds	x19,x19,x10
+	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
+	adcs	x20,x20,x11
+	umulh	x11,x15,x25
+	adcs	x21,x21,x12
+	umulh	x12,x16,x25
+	adcs	x22,x22,x13
+	adcs	x23,x23,x0
+	umulh	x13,x17,x25
+	adc	x0,xzr,xzr
+	ldr	x25,[sp,x28]		// next t[0]*n0
+	str	x19,[x26],#8		// result!!!
+	adds	x19,x20,x10
+	sub	x10,x27,x1		// done yet?
+	adcs	x20,x21,x11
+	adcs	x21,x22,x12
+	adcs	x22,x23,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,.Loop_mul4x_1st_tail
+
+	sub	x11,x27,x5	// rewound x1
+	cbz	x10,.Lmul4x_proceed
+
+	ldp	x6,x7,[x1,#8*0]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	ldp	x14,x15,[x3,#8*0]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+	b	.Loop_mul4x_1st_tail
+
+.align	5
+.Lmul4x_proceed:
+	ldr	x24,[x2,#8*4]!		// *++b
+	adc	x30,x0,xzr
+	ldp	x6,x7,[x11,#8*0]	// a[0..3]
+	sub	x3,x3,x5		// rewind np
+	ldp	x8,x9,[x11,#8*2]
+	add	x1,x11,#8*4
+
+	stp	x19,x20,[x26,#8*0]	// result!!!
+	ldp	x19,x20,[sp,#8*4]	// t[0..3]
+	stp	x21,x22,[x26,#8*2]	// result!!!
+	ldp	x21,x22,[sp,#8*6]
+
+	ldp	x14,x15,[x3,#8*0]	// n[0..3]
+	mov	x26,sp
+	ldp	x16,x17,[x3,#8*2]
+	adds	x3,x3,#8*4		// clear carry bit
+	mov	x0,xzr
+
+.align	4
+.Loop_mul4x_reduction:
+	mul	x10,x6,x24		// lo(a[0..3]*b[4])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
+	adcs	x20,x20,x11
+	mul	x25,x19,x4		// t[0]*n0
+	adcs	x21,x21,x12
+	umulh	x11,x7,x24
+	adcs	x22,x22,x13
+	umulh	x12,x8,x24
+	adc	x23,xzr,xzr
+	umulh	x13,x9,x24
+	ldr	x24,[x2,x28]		// next b[i]
+	adds	x20,x20,x10
+	// (*)	mul	x10,x14,x25
+	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
+	adcs	x21,x21,x11
+	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13		// can't overflow
+	mul	x13,x17,x25
+	// (*)	adds	xzr,x19,x10
+	subs	xzr,x19,#1		// (*)
+	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
+	adcs	x19,x20,x11
+	umulh	x11,x15,x25
+	adcs	x20,x21,x12
+	umulh	x12,x16,x25
+	adcs	x21,x22,x13
+	umulh	x13,x17,x25
+	adcs	x22,x23,x0
+	adc	x0,xzr,xzr
+	adds	x19,x19,x10
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,.Loop_mul4x_reduction
+
+	adc	x0,x0,xzr
+	ldp	x10,x11,[x26,#8*4]	// t[4..7]
+	ldp	x12,x13,[x26,#8*6]
+	ldp	x6,x7,[x1,#8*0]	// a[4..7]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	adds	x19,x19,x10
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+
+	ldr	x25,[sp]		// t[0]*n0
+	ldp	x14,x15,[x3,#8*0]	// n[4..7]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+
+.align	4
+.Loop_mul4x_tail:
+	mul	x10,x6,x24		// lo(a[4..7]*b[4])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
+	adcs	x20,x20,x11
+	umulh	x11,x7,x24
+	adcs	x21,x21,x12
+	umulh	x12,x8,x24
+	adcs	x22,x22,x13
+	umulh	x13,x9,x24
+	adc	x23,xzr,xzr
+	ldr	x24,[x2,x28]		// next b[i]
+	adds	x20,x20,x10
+	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
+	adcs	x21,x21,x11
+	mul	x11,x15,x25
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13		// can't overflow
+	mul	x13,x17,x25
+	adds	x19,x19,x10
+	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
+	adcs	x20,x20,x11
+	umulh	x11,x15,x25
+	adcs	x21,x21,x12
+	umulh	x12,x16,x25
+	adcs	x22,x22,x13
+	umulh	x13,x17,x25
+	adcs	x23,x23,x0
+	ldr	x25,[sp,x28]		// next a[0]*n0
+	adc	x0,xzr,xzr
+	str	x19,[x26],#8		// result!!!
+	adds	x19,x20,x10
+	sub	x10,x27,x1		// done yet?
+	adcs	x20,x21,x11
+	adcs	x21,x22,x12
+	adcs	x22,x23,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,.Loop_mul4x_tail
+
+	sub	x11,x3,x5		// rewound np?
+	adc	x0,x0,xzr
+	cbz	x10,.Loop_mul4x_break
+
+	ldp	x10,x11,[x26,#8*4]
+	ldp	x12,x13,[x26,#8*6]
+	ldp	x6,x7,[x1,#8*0]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	adds	x19,x19,x10
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+	ldp	x14,x15,[x3,#8*0]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+	b	.Loop_mul4x_tail
+
+.align	4
+.Loop_mul4x_break:
+	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
+	adds	x19,x19,x30
+	add	x2,x2,#8*4		// bp++
+	adcs	x20,x20,xzr
+	sub	x1,x1,x5		// rewind ap
+	adcs	x21,x21,xzr
+	stp	x19,x20,[x26,#8*0]	// result!!!
+	adcs	x22,x22,xzr
+	ldp	x19,x20,[sp,#8*4]	// t[0..3]
+	adc	x30,x0,xzr
+	stp	x21,x22,[x26,#8*2]	// result!!!
+	cmp	x2,x13			// done yet?
+	ldp	x21,x22,[sp,#8*6]
+	ldp	x14,x15,[x11,#8*0]	// n[0..3]
+	ldp	x16,x17,[x11,#8*2]
+	add	x3,x11,#8*4
+	b.eq	.Lmul4x_post
+
+	ldr	x24,[x2]
+	ldp	x6,x7,[x1,#8*0]	// a[0..3]
+	ldp	x8,x9,[x1,#8*2]
+	adds	x1,x1,#8*4		// clear carry bit
+	mov	x0,xzr
+	mov	x26,sp
+	b	.Loop_mul4x_reduction
+
+.align	4
+.Lmul4x_post:
+	// Final step. We see if the result is larger than the modulus, and
+	// if it is, subtract the modulus. But comparison implies
+	// subtraction, so we subtract the modulus, see if it borrowed,
+	// and conditionally copy the original value.
+	mov	x0,x12
+	mov	x27,x12		// x0 copy
+	subs	x10,x19,x14
+	add	x26,sp,#8*8
+	sbcs	x11,x20,x15
+	sub	x28,x5,#8*4
+
+.Lmul4x_sub:
+	sbcs	x12,x21,x16
+	ldp	x14,x15,[x3,#8*0]
+	sub	x28,x28,#8*4
+	ldp	x19,x20,[x26,#8*0]
+	sbcs	x13,x22,x17
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+	ldp	x21,x22,[x26,#8*2]
+	add	x26,x26,#8*4
+	stp	x10,x11,[x0,#8*0]
+	sbcs	x10,x19,x14
+	stp	x12,x13,[x0,#8*2]
+	add	x0,x0,#8*4
+	sbcs	x11,x20,x15
+	cbnz	x28,.Lmul4x_sub
+
+	sbcs	x12,x21,x16
+	mov	x26,sp
+	add	x1,sp,#8*4
+	ldp	x6,x7,[x27,#8*0]
+	sbcs	x13,x22,x17
+	stp	x10,x11,[x0,#8*0]
+	ldp	x8,x9,[x27,#8*2]
+	stp	x12,x13,[x0,#8*2]
+	ldp	x19,x20,[x1,#8*0]
+	ldp	x21,x22,[x1,#8*2]
+	sbcs	xzr,x30,xzr	// did it borrow?
+	ldr	x30,[x29,#8]		// pull return address
+
+	sub	x28,x5,#8*4
+.Lmul4x_cond_copy:
+	sub	x28,x28,#8*4
+	csel	x10,x19,x6,lo
+	stp	xzr,xzr,[x26,#8*0]
+	csel	x11,x20,x7,lo
+	ldp	x6,x7,[x27,#8*4]
+	ldp	x19,x20,[x1,#8*4]
+	csel	x12,x21,x8,lo
+	stp	xzr,xzr,[x26,#8*2]
+	add	x26,x26,#8*4
+	csel	x13,x22,x9,lo
+	ldp	x8,x9,[x27,#8*6]
+	ldp	x21,x22,[x1,#8*6]
+	add	x1,x1,#8*4
+	stp	x10,x11,[x27,#8*0]
+	stp	x12,x13,[x27,#8*2]
+	add	x27,x27,#8*4
+	cbnz	x28,.Lmul4x_cond_copy
+
+	csel	x10,x19,x6,lo
+	stp	xzr,xzr,[x26,#8*0]
+	csel	x11,x20,x7,lo
+	stp	xzr,xzr,[x26,#8*2]
+	csel	x12,x21,x8,lo
+	stp	xzr,xzr,[x26,#8*3]
+	csel	x13,x22,x9,lo
+	stp	xzr,xzr,[x26,#8*4]
+	stp	x10,x11,[x27,#8*0]
+	stp	x12,x13,[x27,#8*2]
+
+	b	.Lmul4x_done
+
+.align	4
+.Lmul4x4_post_condition:
+	adc	x0,x0,xzr
+	ldr	x1,[x29,#96]		// pull rp
+	// x19-x22,x0 hold result, x14-x17 hold modulus
+	subs	x6,x19,x14
+	ldr	x30,[x29,#8]		// pull return address
+	sbcs	x7,x20,x15
+	stp	xzr,xzr,[sp,#8*0]
+	sbcs	x8,x21,x16
+	stp	xzr,xzr,[sp,#8*2]
+	sbcs	x9,x22,x17
+	stp	xzr,xzr,[sp,#8*4]
+	sbcs	xzr,x0,xzr		// did it borrow?
+	stp	xzr,xzr,[sp,#8*6]
+
+	// x6-x9 hold result-modulus
+	csel	x6,x19,x6,lo
+	csel	x7,x20,x7,lo
+	csel	x8,x21,x8,lo
+	csel	x9,x22,x9,lo
+	stp	x6,x7,[x1,#8*0]
+	stp	x8,x9,[x1,#8*2]
+
+.Lmul4x_done:
+	ldp	x19,x20,[x29,#16]
+	mov	sp,x29
+	ldp	x21,x22,[x29,#32]
+	mov	x0,#1
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	// x30 is popped earlier
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	__bn_mul4x_mont,.-__bn_mul4x_mont
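+// ASCII for "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>"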
+.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	4
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/bcm/armv8-mont-win.S b/gen/bcm/armv8-mont-win.S
new file mode 100644
index 0000000..dcce02c
--- /dev/null
+++ b/gen/bcm/armv8-mont-win.S
@@ -0,0 +1,1431 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+#include <openssl/arm_arch.h>
+
+.text
+
+.globl	bn_mul_mont
+
+.def bn_mul_mont
+   .type 32
+.endef
+.align	5
+bn_mul_mont:
+	AARCH64_SIGN_LINK_REGISTER
+	tst	x5,#7
+	b.eq	__bn_sqr8x_mont
+	tst	x5,#3
+	b.eq	__bn_mul4x_mont
+Lmul_mont:
+	stp	x29,x30,[sp,#-64]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+
+	ldr	x9,[x2],#8		// bp[0]
+	sub	x22,sp,x5,lsl#3
+	ldp	x7,x8,[x1],#16	// ap[0..1]
+	lsl	x5,x5,#3
+	ldr	x4,[x4]		// *n0
+	and	x22,x22,#-16		// ABI says so
+	ldp	x13,x14,[x3],#16	// np[0..1]
+
+	mul	x6,x7,x9		// ap[0]*bp[0]
+	sub	x21,x5,#16		// j=num-2
+	umulh	x7,x7,x9
+	mul	x10,x8,x9		// ap[1]*bp[0]
+	umulh	x11,x8,x9
+
+	mul	x15,x6,x4		// "tp[0]"*n0
+	mov	sp,x22			// alloca
+
+	// (*)	mul	x12,x13,x15	// np[0]*m1
+	umulh	x13,x13,x15
+	mul	x16,x14,x15		// np[1]*m1
+	// (*)	adds	x12,x12,x6	// discarded
+	// (*)	As for the removal of the first multiplication and
+	//	addition instructions: the outcome of the first addition
+	//	is guaranteed to be zero, which leaves two computationally
+	//	significant outcomes: it either carries or it does not.
+	//	So when does it carry? Following the operations, the
+	//	condition for a carry is quite simple: x6 being non-zero.
+	//	The carry can therefore be calculated by adding -1 to x6,
+	//	which is what the next instruction does.
+	subs	xzr,x6,#1		// (*)
+	umulh	x17,x14,x15
+	adc	x13,x13,xzr
+	cbz	x21,L1st_skip
+
+L1st:
+	ldr	x8,[x1],#8
+	adds	x6,x10,x7
+	sub	x21,x21,#8		// j--
+	adc	x7,x11,xzr
+
+	ldr	x14,[x3],#8
+	adds	x12,x16,x13
+	mul	x10,x8,x9		// ap[j]*bp[0]
+	adc	x13,x17,xzr
+	umulh	x11,x8,x9
+
+	adds	x12,x12,x6
+	mul	x16,x14,x15		// np[j]*m1
+	adc	x13,x13,xzr
+	umulh	x17,x14,x15
+	str	x12,[x22],#8		// tp[j-1]
+	cbnz	x21,L1st
+
+L1st_skip:
+	adds	x6,x10,x7
+	sub	x1,x1,x5		// rewind x1
+	adc	x7,x11,xzr
+
+	adds	x12,x16,x13
+	sub	x3,x3,x5		// rewind x3
+	adc	x13,x17,xzr
+
+	adds	x12,x12,x6
+	sub	x20,x5,#8		// i=num-1
+	adcs	x13,x13,x7
+
+	adc	x19,xzr,xzr		// topmost overflow bit
+	stp	x12,x13,[x22]
+
+Louter:
+	ldr	x9,[x2],#8		// bp[i]
+	ldp	x7,x8,[x1],#16
+	ldr	x23,[sp]		// tp[0]
+	add	x22,sp,#8
+
+	mul	x6,x7,x9		// ap[0]*bp[i]
+	sub	x21,x5,#16		// j=num-2
+	umulh	x7,x7,x9
+	ldp	x13,x14,[x3],#16
+	mul	x10,x8,x9		// ap[1]*bp[i]
+	adds	x6,x6,x23
+	umulh	x11,x8,x9
+	adc	x7,x7,xzr
+
+	mul	x15,x6,x4
+	sub	x20,x20,#8		// i--
+
+	// (*)	mul	x12,x13,x15	// np[0]*m1
+	umulh	x13,x13,x15
+	mul	x16,x14,x15		// np[1]*m1
+	// (*)	adds	x12,x12,x6
+	subs	xzr,x6,#1		// (*)
+	umulh	x17,x14,x15
+	cbz	x21,Linner_skip
+
+Linner:
+	ldr	x8,[x1],#8
+	adc	x13,x13,xzr
+	ldr	x23,[x22],#8		// tp[j]
+	adds	x6,x10,x7
+	sub	x21,x21,#8		// j--
+	adc	x7,x11,xzr
+
+	adds	x12,x16,x13
+	ldr	x14,[x3],#8
+	adc	x13,x17,xzr
+
+	mul	x10,x8,x9		// ap[j]*bp[i]
+	adds	x6,x6,x23
+	umulh	x11,x8,x9
+	adc	x7,x7,xzr
+
+	mul	x16,x14,x15		// np[j]*m1
+	adds	x12,x12,x6
+	umulh	x17,x14,x15
+	str	x12,[x22,#-16]		// tp[j-1]
+	cbnz	x21,Linner
+
+Linner_skip:
+	ldr	x23,[x22],#8		// tp[j]
+	adc	x13,x13,xzr
+	adds	x6,x10,x7
+	sub	x1,x1,x5		// rewind x1
+	adc	x7,x11,xzr
+
+	adds	x12,x16,x13
+	sub	x3,x3,x5		// rewind x3
+	adcs	x13,x17,x19
+	adc	x19,xzr,xzr
+
+	adds	x6,x6,x23
+	adc	x7,x7,xzr
+
+	adds	x12,x12,x6
+	adcs	x13,x13,x7
+	adc	x19,x19,xzr		// topmost overflow bit
+	stp	x12,x13,[x22,#-16]
+
+	cbnz	x20,Louter
+
+	// Final step. We see if result is larger than modulus, and
+	// if it is, subtract the modulus. But comparison implies
+	// subtraction. So we subtract modulus, see if it borrowed,
+	// and conditionally copy original value.
+	ldr	x23,[sp]		// tp[0]
+	add	x22,sp,#8
+	ldr	x14,[x3],#8		// np[0]
+	subs	x21,x5,#8		// j=num-1 and clear borrow
+	mov	x1,x0
+Lsub:
+	sbcs	x8,x23,x14		// tp[j]-np[j]
+	ldr	x23,[x22],#8
+	sub	x21,x21,#8		// j--
+	ldr	x14,[x3],#8
+	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
+	cbnz	x21,Lsub
+
+	sbcs	x8,x23,x14
+	sbcs	x19,x19,xzr		// did it borrow?
+	str	x8,[x1],#8		// rp[num-1]
+
+	ldr	x23,[sp]		// tp[0]
+	add	x22,sp,#8
+	ldr	x8,[x0],#8		// rp[0]
+	sub	x5,x5,#8		// num--
+	nop
+Lcond_copy:
+	sub	x5,x5,#8		// num--
+	csel	x14,x23,x8,lo		// did it borrow?
+	ldr	x23,[x22],#8
+	ldr	x8,[x0],#8
+	str	xzr,[x22,#-16]		// wipe tp
+	str	x14,[x0,#-16]
+	cbnz	x5,Lcond_copy
+
+	csel	x14,x23,x8,lo
+	str	xzr,[x22,#-8]		// wipe tp
+	str	x14,[x0,#-8]
+
+	ldp	x19,x20,[x29,#16]
+	mov	sp,x29
+	ldp	x21,x22,[x29,#32]
+	mov	x0,#1
+	ldp	x23,x24,[x29,#48]
+	ldr	x29,[sp],#64
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.def __bn_sqr8x_mont
+   .type 32
+.endef
+.align	5
+__bn_sqr8x_mont:
+	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
+	// only from bn_mul_mont which has already signed the return address.
+	cmp	x1,x2
+	b.ne	__bn_mul4x_mont
+Lsqr8x_mont:
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	stp	x0,x3,[sp,#96]	// offload rp and np
+
+	ldp	x6,x7,[x1,#8*0]
+	ldp	x8,x9,[x1,#8*2]
+	ldp	x10,x11,[x1,#8*4]
+	ldp	x12,x13,[x1,#8*6]
+
+	sub	x2,sp,x5,lsl#4
+	lsl	x5,x5,#3
+	ldr	x4,[x4]		// *n0
+	mov	sp,x2			// alloca
+	sub	x27,x5,#8*8
+	b	Lsqr8x_zero_start
+
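+	// Zero the 2*num-word t[] area just allocated on the stack; its
+	// first eight words are skipped, as the squaring loop below writes
+	// t[0..7] directly.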
+Lsqr8x_zero:
+	sub	x27,x27,#8*8
+	stp	xzr,xzr,[x2,#8*0]
+	stp	xzr,xzr,[x2,#8*2]
+	stp	xzr,xzr,[x2,#8*4]
+	stp	xzr,xzr,[x2,#8*6]
+Lsqr8x_zero_start:
+	stp	xzr,xzr,[x2,#8*8]
+	stp	xzr,xzr,[x2,#8*10]
+	stp	xzr,xzr,[x2,#8*12]
+	stp	xzr,xzr,[x2,#8*14]
+	add	x2,x2,#8*16
+	cbnz	x27,Lsqr8x_zero
+
+	add	x3,x1,x5
+	add	x1,x1,#8*8
+	mov	x19,xzr
+	mov	x20,xzr
+	mov	x21,xzr
+	mov	x22,xzr
+	mov	x23,xzr
+	mov	x24,xzr
+	mov	x25,xzr
+	mov	x26,xzr
+	mov	x2,sp
+	str	x4,[x29,#112]		// offload n0
+
+	// Multiply everything but a[i]*a[i]
+.align	4
+Lsqr8x_outer_loop:
+        //                                                 a[1]a[0]	(i)
+        //                                             a[2]a[0]
+        //                                         a[3]a[0]
+        //                                     a[4]a[0]
+        //                                 a[5]a[0]
+        //                             a[6]a[0]
+        //                         a[7]a[0]
+        //                                         a[2]a[1]		(ii)
+        //                                     a[3]a[1]
+        //                                 a[4]a[1]
+        //                             a[5]a[1]
+        //                         a[6]a[1]
+        //                     a[7]a[1]
+        //                                 a[3]a[2]			(iii)
+        //                             a[4]a[2]
+        //                         a[5]a[2]
+        //                     a[6]a[2]
+        //                 a[7]a[2]
+        //                         a[4]a[3]				(iv)
+        //                     a[5]a[3]
+        //                 a[6]a[3]
+        //             a[7]a[3]
+        //                 a[5]a[4]					(v)
+        //             a[6]a[4]
+        //         a[7]a[4]
+        //         a[6]a[5]						(vi)
+        //     a[7]a[5]
+        // a[7]a[6]							(vii)
+
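+	// i.e. accumulate every cross product a[i]*a[j] with i<j into t[];
+	// the diagonal a[i]*a[i] terms are folded in after doubling, at
+	// Lsqr8x_outer_break below.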
+	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
+	mul	x15,x8,x6
+	mul	x16,x9,x6
+	mul	x17,x10,x6
+	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
+	mul	x14,x11,x6
+	adcs	x21,x21,x15
+	mul	x15,x12,x6
+	adcs	x22,x22,x16
+	mul	x16,x13,x6
+	adcs	x23,x23,x17
+	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
+	adcs	x24,x24,x14
+	umulh	x14,x8,x6
+	adcs	x25,x25,x15
+	umulh	x15,x9,x6
+	adcs	x26,x26,x16
+	umulh	x16,x10,x6
+	stp	x19,x20,[x2],#8*2	// t[0..1]
+	adc	x19,xzr,xzr		// t[8]
+	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
+	umulh	x17,x11,x6
+	adcs	x22,x22,x14
+	umulh	x14,x12,x6
+	adcs	x23,x23,x15
+	umulh	x15,x13,x6
+	adcs	x24,x24,x16
+	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
+	adcs	x25,x25,x17
+	mul	x17,x9,x7
+	adcs	x26,x26,x14
+	mul	x14,x10,x7
+	adc	x19,x19,x15
+
+	mul	x15,x11,x7
+	adds	x22,x22,x16
+	mul	x16,x12,x7
+	adcs	x23,x23,x17
+	mul	x17,x13,x7
+	adcs	x24,x24,x14
+	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
+	adcs	x25,x25,x15
+	umulh	x15,x9,x7
+	adcs	x26,x26,x16
+	umulh	x16,x10,x7
+	adcs	x19,x19,x17
+	umulh	x17,x11,x7
+	stp	x21,x22,[x2],#8*2	// t[2..3]
+	adc	x20,xzr,xzr		// t[9]
+	adds	x23,x23,x14
+	umulh	x14,x12,x7
+	adcs	x24,x24,x15
+	umulh	x15,x13,x7
+	adcs	x25,x25,x16
+	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
+	adcs	x26,x26,x17
+	mul	x17,x10,x8
+	adcs	x19,x19,x14
+	mul	x14,x11,x8
+	adc	x20,x20,x15
+
+	mul	x15,x12,x8
+	adds	x24,x24,x16
+	mul	x16,x13,x8
+	adcs	x25,x25,x17
+	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
+	adcs	x26,x26,x14
+	umulh	x14,x10,x8
+	adcs	x19,x19,x15
+	umulh	x15,x11,x8
+	adcs	x20,x20,x16
+	umulh	x16,x12,x8
+	stp	x23,x24,[x2],#8*2	// t[4..5]
+	adc	x21,xzr,xzr		// t[10]
+	adds	x25,x25,x17
+	umulh	x17,x13,x8
+	adcs	x26,x26,x14
+	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
+	adcs	x19,x19,x15
+	mul	x15,x11,x9
+	adcs	x20,x20,x16
+	mul	x16,x12,x9
+	adc	x21,x21,x17
+
+	mul	x17,x13,x9
+	adds	x26,x26,x14
+	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
+	adcs	x19,x19,x15
+	umulh	x15,x11,x9
+	adcs	x20,x20,x16
+	umulh	x16,x12,x9
+	adcs	x21,x21,x17
+	umulh	x17,x13,x9
+	stp	x25,x26,[x2],#8*2	// t[6..7]
+	adc	x22,xzr,xzr		// t[11]
+	adds	x19,x19,x14
+	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
+	adcs	x20,x20,x15
+	mul	x15,x12,x10
+	adcs	x21,x21,x16
+	mul	x16,x13,x10
+	adc	x22,x22,x17
+
+	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
+	adds	x20,x20,x14
+	umulh	x14,x12,x10
+	adcs	x21,x21,x15
+	umulh	x15,x13,x10
+	adcs	x22,x22,x16
+	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
+	adc	x23,xzr,xzr		// t[12]
+	adds	x21,x21,x17
+	mul	x17,x13,x11
+	adcs	x22,x22,x14
+	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
+	adc	x23,x23,x15
+
+	umulh	x15,x13,x11
+	adds	x22,x22,x16
+	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
+	adcs	x23,x23,x17
+	umulh	x17,x13,x12		// hi(a[7]*a[6])
+	adc	x24,xzr,xzr		// t[13]
+	adds	x23,x23,x14
+	sub	x27,x3,x1	// done yet?
+	adc	x24,x24,x15
+
+	adds	x24,x24,x16
+	sub	x14,x3,x5	// rewound ap
+	adc	x25,xzr,xzr		// t[14]
+	add	x25,x25,x17
+
+	cbz	x27,Lsqr8x_outer_break
+
+	mov	x4,x6
+	ldp	x6,x7,[x2,#8*0]
+	ldp	x8,x9,[x2,#8*2]
+	ldp	x10,x11,[x2,#8*4]
+	ldp	x12,x13,[x2,#8*6]
+	adds	x19,x19,x6
+	adcs	x20,x20,x7
+	ldp	x6,x7,[x1,#8*0]
+	adcs	x21,x21,x8
+	adcs	x22,x22,x9
+	ldp	x8,x9,[x1,#8*2]
+	adcs	x23,x23,x10
+	adcs	x24,x24,x11
+	ldp	x10,x11,[x1,#8*4]
+	adcs	x25,x25,x12
+	mov	x0,x1
+	adcs	x26,xzr,x13
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+	//adc	x28,xzr,xzr		// moved below
+	mov	x27,#-8*8
+
+	//                                                         a[8]a[0]
+	//                                                     a[9]a[0]
+	//                                                 a[a]a[0]
+	//                                             a[b]a[0]
+	//                                         a[c]a[0]
+	//                                     a[d]a[0]
+	//                                 a[e]a[0]
+	//                             a[f]a[0]
+	//                                                     a[8]a[1]
+	//                         a[f]a[1]........................
+	//                                                 a[8]a[2]
+	//                     a[f]a[2]........................
+	//                                             a[8]a[3]
+	//                 a[f]a[3]........................
+	//                                         a[8]a[4]
+	//             a[f]a[4]........................
+	//                                     a[8]a[5]
+	//         a[f]a[5]........................
+	//                                 a[8]a[6]
+	//     a[f]a[6]........................
+	//                             a[8]a[7]
+	// a[f]a[7]........................
+Lsqr8x_mul:
+	mul	x14,x6,x4
+	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
+	mul	x15,x7,x4
+	add	x27,x27,#8
+	mul	x16,x8,x4
+	mul	x17,x9,x4
+	adds	x19,x19,x14
+	mul	x14,x10,x4
+	adcs	x20,x20,x15
+	mul	x15,x11,x4
+	adcs	x21,x21,x16
+	mul	x16,x12,x4
+	adcs	x22,x22,x17
+	mul	x17,x13,x4
+	adcs	x23,x23,x14
+	umulh	x14,x6,x4
+	adcs	x24,x24,x15
+	umulh	x15,x7,x4
+	adcs	x25,x25,x16
+	umulh	x16,x8,x4
+	adcs	x26,x26,x17
+	umulh	x17,x9,x4
+	adc	x28,x28,xzr
+	str	x19,[x2],#8
+	adds	x19,x20,x14
+	umulh	x14,x10,x4
+	adcs	x20,x21,x15
+	umulh	x15,x11,x4
+	adcs	x21,x22,x16
+	umulh	x16,x12,x4
+	adcs	x22,x23,x17
+	umulh	x17,x13,x4
+	ldr	x4,[x0,x27]
+	adcs	x23,x24,x14
+	adcs	x24,x25,x15
+	adcs	x25,x26,x16
+	adcs	x26,x28,x17
+	//adc	x28,xzr,xzr		// moved above
+	cbnz	x27,Lsqr8x_mul
+					// note that carry flag is guaranteed
+					// to be zero at this point
+	cmp	x1,x3		// done yet?
+	b.eq	Lsqr8x_break
+
+	ldp	x6,x7,[x2,#8*0]
+	ldp	x8,x9,[x2,#8*2]
+	ldp	x10,x11,[x2,#8*4]
+	ldp	x12,x13,[x2,#8*6]
+	adds	x19,x19,x6
+	ldr	x4,[x0,#-8*8]
+	adcs	x20,x20,x7
+	ldp	x6,x7,[x1,#8*0]
+	adcs	x21,x21,x8
+	adcs	x22,x22,x9
+	ldp	x8,x9,[x1,#8*2]
+	adcs	x23,x23,x10
+	adcs	x24,x24,x11
+	ldp	x10,x11,[x1,#8*4]
+	adcs	x25,x25,x12
+	mov	x27,#-8*8
+	adcs	x26,x26,x13
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+	//adc	x28,xzr,xzr		// moved above
+	b	Lsqr8x_mul
+
+.align	4
+Lsqr8x_break:
+	ldp	x6,x7,[x0,#8*0]
+	add	x1,x0,#8*8
+	ldp	x8,x9,[x0,#8*2]
+	sub	x14,x3,x1		// is it last iteration?
+	ldp	x10,x11,[x0,#8*4]
+	sub	x15,x2,x14
+	ldp	x12,x13,[x0,#8*6]
+	cbz	x14,Lsqr8x_outer_loop
+
+	stp	x19,x20,[x2,#8*0]
+	ldp	x19,x20,[x15,#8*0]
+	stp	x21,x22,[x2,#8*2]
+	ldp	x21,x22,[x15,#8*2]
+	stp	x23,x24,[x2,#8*4]
+	ldp	x23,x24,[x15,#8*4]
+	stp	x25,x26,[x2,#8*6]
+	mov	x2,x15
+	ldp	x25,x26,[x15,#8*6]
+	b	Lsqr8x_outer_loop
+
+.align	4
+Lsqr8x_outer_break:
+	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
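+	// The doubling happens on the fly: each extr below shifts a saved
+	// t[] word left by one bit, pulling in the top bit of the word
+	// below it, while mul/umulh supply the a[i]*a[i] terms.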
+	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
+	ldp	x15,x16,[sp,#8*1]
+	ldp	x11,x13,[x14,#8*2]
+	add	x1,x14,#8*4
+	ldp	x17,x14,[sp,#8*3]
+
+	stp	x19,x20,[x2,#8*0]
+	mul	x19,x7,x7
+	stp	x21,x22,[x2,#8*2]
+	umulh	x7,x7,x7
+	stp	x23,x24,[x2,#8*4]
+	mul	x8,x9,x9
+	stp	x25,x26,[x2,#8*6]
+	mov	x2,sp
+	umulh	x9,x9,x9
+	adds	x20,x7,x15,lsl#1
+	extr	x15,x16,x15,#63
+	sub	x27,x5,#8*4
+
+Lsqr4x_shift_n_add:
+	adcs	x21,x8,x15
+	extr	x16,x17,x16,#63
+	sub	x27,x27,#8*4
+	adcs	x22,x9,x16
+	ldp	x15,x16,[x2,#8*5]
+	mul	x10,x11,x11
+	ldp	x7,x9,[x1],#8*2
+	umulh	x11,x11,x11
+	mul	x12,x13,x13
+	umulh	x13,x13,x13
+	extr	x17,x14,x17,#63
+	stp	x19,x20,[x2,#8*0]
+	adcs	x23,x10,x17
+	extr	x14,x15,x14,#63
+	stp	x21,x22,[x2,#8*2]
+	adcs	x24,x11,x14
+	ldp	x17,x14,[x2,#8*7]
+	extr	x15,x16,x15,#63
+	adcs	x25,x12,x15
+	extr	x16,x17,x16,#63
+	adcs	x26,x13,x16
+	ldp	x15,x16,[x2,#8*9]
+	mul	x6,x7,x7
+	ldp	x11,x13,[x1],#8*2
+	umulh	x7,x7,x7
+	mul	x8,x9,x9
+	umulh	x9,x9,x9
+	stp	x23,x24,[x2,#8*4]
+	extr	x17,x14,x17,#63
+	stp	x25,x26,[x2,#8*6]
+	add	x2,x2,#8*8
+	adcs	x19,x6,x17
+	extr	x14,x15,x14,#63
+	adcs	x20,x7,x14
+	ldp	x17,x14,[x2,#8*3]
+	extr	x15,x16,x15,#63
+	cbnz	x27,Lsqr4x_shift_n_add
+	ldp	x1,x4,[x29,#104]	// pull np and n0
+
+	adcs	x21,x8,x15
+	extr	x16,x17,x16,#63
+	adcs	x22,x9,x16
+	ldp	x15,x16,[x2,#8*5]
+	mul	x10,x11,x11
+	umulh	x11,x11,x11
+	stp	x19,x20,[x2,#8*0]
+	mul	x12,x13,x13
+	umulh	x13,x13,x13
+	stp	x21,x22,[x2,#8*2]
+	extr	x17,x14,x17,#63
+	adcs	x23,x10,x17
+	extr	x14,x15,x14,#63
+	ldp	x19,x20,[sp,#8*0]
+	adcs	x24,x11,x14
+	extr	x15,x16,x15,#63
+	ldp	x6,x7,[x1,#8*0]
+	adcs	x25,x12,x15
+	extr	x16,xzr,x16,#63
+	ldp	x8,x9,[x1,#8*2]
+	adc	x26,x13,x16
+	ldp	x10,x11,[x1,#8*4]
+
+	// Reduce by 512 bits per iteration
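+	// (each pass of Lsqr8x_reduction cancels one 64-bit word of t[]
+	// by adding (t[0]*n0 mod 2^64)*n; eight passes clear the window)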
+	mul	x28,x4,x19		// t[0]*n0
+	ldp	x12,x13,[x1,#8*6]
+	add	x3,x1,x5
+	ldp	x21,x22,[sp,#8*2]
+	stp	x23,x24,[x2,#8*4]
+	ldp	x23,x24,[sp,#8*4]
+	stp	x25,x26,[x2,#8*6]
+	ldp	x25,x26,[sp,#8*6]
+	add	x1,x1,#8*8
+	mov	x30,xzr		// initial top-most carry
+	mov	x2,sp
+	mov	x27,#8
+
+Lsqr8x_reduction:
+	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
+	mul	x15,x7,x28
+	sub	x27,x27,#1
+	mul	x16,x8,x28
+	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
+	mul	x17,x9,x28
+	// (*)	adds	xzr,x19,x14
+	subs	xzr,x19,#1		// (*)
+	mul	x14,x10,x28
+	adcs	x19,x20,x15
+	mul	x15,x11,x28
+	adcs	x20,x21,x16
+	mul	x16,x12,x28
+	adcs	x21,x22,x17
+	mul	x17,x13,x28
+	adcs	x22,x23,x14
+	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
+	adcs	x23,x24,x15
+	umulh	x15,x7,x28
+	adcs	x24,x25,x16
+	umulh	x16,x8,x28
+	adcs	x25,x26,x17
+	umulh	x17,x9,x28
+	adc	x26,xzr,xzr
+	adds	x19,x19,x14
+	umulh	x14,x10,x28
+	adcs	x20,x20,x15
+	umulh	x15,x11,x28
+	adcs	x21,x21,x16
+	umulh	x16,x12,x28
+	adcs	x22,x22,x17
+	umulh	x17,x13,x28
+	mul	x28,x4,x19		// next t[0]*n0
+	adcs	x23,x23,x14
+	adcs	x24,x24,x15
+	adcs	x25,x25,x16
+	adc	x26,x26,x17
+	cbnz	x27,Lsqr8x_reduction
+
+	ldp	x14,x15,[x2,#8*0]
+	ldp	x16,x17,[x2,#8*2]
+	mov	x0,x2
+	sub	x27,x3,x1	// done yet?
+	adds	x19,x19,x14
+	adcs	x20,x20,x15
+	ldp	x14,x15,[x2,#8*4]
+	adcs	x21,x21,x16
+	adcs	x22,x22,x17
+	ldp	x16,x17,[x2,#8*6]
+	adcs	x23,x23,x14
+	adcs	x24,x24,x15
+	adcs	x25,x25,x16
+	adcs	x26,x26,x17
+	//adc	x28,xzr,xzr		// moved below
+	cbz	x27,Lsqr8x8_post_condition
+
+	ldr	x4,[x2,#-8*8]
+	ldp	x6,x7,[x1,#8*0]
+	ldp	x8,x9,[x1,#8*2]
+	ldp	x10,x11,[x1,#8*4]
+	mov	x27,#-8*8
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+
+Lsqr8x_tail:
+	mul	x14,x6,x4
+	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
+	mul	x15,x7,x4
+	add	x27,x27,#8
+	mul	x16,x8,x4
+	mul	x17,x9,x4
+	adds	x19,x19,x14
+	mul	x14,x10,x4
+	adcs	x20,x20,x15
+	mul	x15,x11,x4
+	adcs	x21,x21,x16
+	mul	x16,x12,x4
+	adcs	x22,x22,x17
+	mul	x17,x13,x4
+	adcs	x23,x23,x14
+	umulh	x14,x6,x4
+	adcs	x24,x24,x15
+	umulh	x15,x7,x4
+	adcs	x25,x25,x16
+	umulh	x16,x8,x4
+	adcs	x26,x26,x17
+	umulh	x17,x9,x4
+	adc	x28,x28,xzr
+	str	x19,[x2],#8
+	adds	x19,x20,x14
+	umulh	x14,x10,x4
+	adcs	x20,x21,x15
+	umulh	x15,x11,x4
+	adcs	x21,x22,x16
+	umulh	x16,x12,x4
+	adcs	x22,x23,x17
+	umulh	x17,x13,x4
+	ldr	x4,[x0,x27]
+	adcs	x23,x24,x14
+	adcs	x24,x25,x15
+	adcs	x25,x26,x16
+	adcs	x26,x28,x17
+	//adc	x28,xzr,xzr		// moved above
+	cbnz	x27,Lsqr8x_tail
+					// note that carry flag is guaranteed
+					// to be zero at this point
+	ldp	x6,x7,[x2,#8*0]
+	sub	x27,x3,x1	// done yet?
+	sub	x16,x3,x5	// rewound np
+	ldp	x8,x9,[x2,#8*2]
+	ldp	x10,x11,[x2,#8*4]
+	ldp	x12,x13,[x2,#8*6]
+	cbz	x27,Lsqr8x_tail_break
+
+	ldr	x4,[x0,#-8*8]
+	adds	x19,x19,x6
+	adcs	x20,x20,x7
+	ldp	x6,x7,[x1,#8*0]
+	adcs	x21,x21,x8
+	adcs	x22,x22,x9
+	ldp	x8,x9,[x1,#8*2]
+	adcs	x23,x23,x10
+	adcs	x24,x24,x11
+	ldp	x10,x11,[x1,#8*4]
+	adcs	x25,x25,x12
+	mov	x27,#-8*8
+	adcs	x26,x26,x13
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+	//adc	x28,xzr,xzr		// moved above
+	b	Lsqr8x_tail
+
+.align	4
+Lsqr8x_tail_break:
+	ldr	x4,[x29,#112]		// pull n0
+	add	x27,x2,#8*8		// end of current t[num] window
+
+	subs	xzr,x30,#1		// "move" top-most carry to carry bit
+	adcs	x14,x19,x6
+	adcs	x15,x20,x7
+	ldp	x19,x20,[x0,#8*0]
+	adcs	x21,x21,x8
+	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
+	adcs	x22,x22,x9
+	ldp	x8,x9,[x16,#8*2]
+	adcs	x23,x23,x10
+	adcs	x24,x24,x11
+	ldp	x10,x11,[x16,#8*4]
+	adcs	x25,x25,x12
+	adcs	x26,x26,x13
+	ldp	x12,x13,[x16,#8*6]
+	add	x1,x16,#8*8
+	adc	x30,xzr,xzr	// top-most carry
+	mul	x28,x4,x19
+	stp	x14,x15,[x2,#8*0]
+	stp	x21,x22,[x2,#8*2]
+	ldp	x21,x22,[x0,#8*2]
+	stp	x23,x24,[x2,#8*4]
+	ldp	x23,x24,[x0,#8*4]
+	cmp	x27,x29		// did we hit the bottom?
+	stp	x25,x26,[x2,#8*6]
+	mov	x2,x0			// slide the window
+	ldp	x25,x26,[x0,#8*6]
+	mov	x27,#8
+	b.ne	Lsqr8x_reduction
+
+	// Final step. We see if the result is larger than the modulus, and
+	// if it is, subtract the modulus. But comparison implies
+	// subtraction, so we subtract the modulus, see if it borrowed,
+	// and conditionally copy the original value.
+	ldr	x0,[x29,#96]		// pull rp
+	add	x2,x2,#8*8
+	subs	x14,x19,x6
+	sbcs	x15,x20,x7
+	sub	x27,x5,#8*8
+	mov	x3,x0		// x0 copy
+
+Lsqr8x_sub:
+	sbcs	x16,x21,x8
+	ldp	x6,x7,[x1,#8*0]
+	sbcs	x17,x22,x9
+	stp	x14,x15,[x0,#8*0]
+	sbcs	x14,x23,x10
+	ldp	x8,x9,[x1,#8*2]
+	sbcs	x15,x24,x11
+	stp	x16,x17,[x0,#8*2]
+	sbcs	x16,x25,x12
+	ldp	x10,x11,[x1,#8*4]
+	sbcs	x17,x26,x13
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+	ldp	x19,x20,[x2,#8*0]
+	sub	x27,x27,#8*8
+	ldp	x21,x22,[x2,#8*2]
+	ldp	x23,x24,[x2,#8*4]
+	ldp	x25,x26,[x2,#8*6]
+	add	x2,x2,#8*8
+	stp	x14,x15,[x0,#8*4]
+	sbcs	x14,x19,x6
+	stp	x16,x17,[x0,#8*6]
+	add	x0,x0,#8*8
+	sbcs	x15,x20,x7
+	cbnz	x27,Lsqr8x_sub
+
+	sbcs	x16,x21,x8
+	mov	x2,sp
+	add	x1,sp,x5
+	ldp	x6,x7,[x3,#8*0]
+	sbcs	x17,x22,x9
+	stp	x14,x15,[x0,#8*0]
+	sbcs	x14,x23,x10
+	ldp	x8,x9,[x3,#8*2]
+	sbcs	x15,x24,x11
+	stp	x16,x17,[x0,#8*2]
+	sbcs	x16,x25,x12
+	ldp	x19,x20,[x1,#8*0]
+	sbcs	x17,x26,x13
+	ldp	x21,x22,[x1,#8*2]
+	sbcs	xzr,x30,xzr	// did it borrow?
+	ldr	x30,[x29,#8]		// pull return address
+	stp	x14,x15,[x0,#8*4]
+	stp	x16,x17,[x0,#8*6]
+
+	sub	x27,x5,#8*4
+Lsqr4x_cond_copy:
+	sub	x27,x27,#8*4
+	csel	x14,x19,x6,lo
+	stp	xzr,xzr,[x2,#8*0]
+	csel	x15,x20,x7,lo
+	ldp	x6,x7,[x3,#8*4]
+	ldp	x19,x20,[x1,#8*4]
+	csel	x16,x21,x8,lo
+	stp	xzr,xzr,[x2,#8*2]
+	add	x2,x2,#8*4
+	csel	x17,x22,x9,lo
+	ldp	x8,x9,[x3,#8*6]
+	ldp	x21,x22,[x1,#8*6]
+	add	x1,x1,#8*4
+	stp	x14,x15,[x3,#8*0]
+	stp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+	stp	xzr,xzr,[x1,#8*0]
+	stp	xzr,xzr,[x1,#8*2]
+	cbnz	x27,Lsqr4x_cond_copy
+
+	csel	x14,x19,x6,lo
+	stp	xzr,xzr,[x2,#8*0]
+	csel	x15,x20,x7,lo
+	stp	xzr,xzr,[x2,#8*2]
+	csel	x16,x21,x8,lo
+	csel	x17,x22,x9,lo
+	stp	x14,x15,[x3,#8*0]
+	stp	x16,x17,[x3,#8*2]
+
+	b	Lsqr8x_done
+
+.align	4
+Lsqr8x8_post_condition:
+	adc	x28,xzr,xzr
+	ldr	x30,[x29,#8]		// pull return address
+	// x19-x26,x28 hold result, x6-x13 hold modulus
+	subs	x6,x19,x6
+	ldr	x1,[x29,#96]		// pull rp
+	sbcs	x7,x20,x7
+	stp	xzr,xzr,[sp,#8*0]
+	sbcs	x8,x21,x8
+	stp	xzr,xzr,[sp,#8*2]
+	sbcs	x9,x22,x9
+	stp	xzr,xzr,[sp,#8*4]
+	sbcs	x10,x23,x10
+	stp	xzr,xzr,[sp,#8*6]
+	sbcs	x11,x24,x11
+	stp	xzr,xzr,[sp,#8*8]
+	sbcs	x12,x25,x12
+	stp	xzr,xzr,[sp,#8*10]
+	sbcs	x13,x26,x13
+	stp	xzr,xzr,[sp,#8*12]
+	sbcs	x28,x28,xzr	// did it borrow?
+	stp	xzr,xzr,[sp,#8*14]
+
+	// x6-x13 hold result-modulus
+	csel	x6,x19,x6,lo
+	csel	x7,x20,x7,lo
+	csel	x8,x21,x8,lo
+	csel	x9,x22,x9,lo
+	stp	x6,x7,[x1,#8*0]
+	csel	x10,x23,x10,lo
+	csel	x11,x24,x11,lo
+	stp	x8,x9,[x1,#8*2]
+	csel	x12,x25,x12,lo
+	csel	x13,x26,x13,lo
+	stp	x10,x11,[x1,#8*4]
+	stp	x12,x13,[x1,#8*6]
+
+Lsqr8x_done:
+	ldp	x19,x20,[x29,#16]
+	mov	sp,x29
+	ldp	x21,x22,[x29,#32]
+	mov	x0,#1
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	// x30 is popped earlier
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.def __bn_mul4x_mont
+   .type 32
+.endef
+.align	5
+__bn_mul4x_mont:
+	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
+	// only from bn_mul_mont or __bn_sqr8x_mont which have already signed the
+	// return address.
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+	sub	x26,sp,x5,lsl#3
+	lsl	x5,x5,#3
+	ldr	x4,[x4]		// *n0
+	sub	sp,x26,#8*4		// alloca
+
+	add	x10,x2,x5
+	add	x27,x1,x5
+	stp	x0,x10,[x29,#96]	// offload rp and &b[num]
+
+	ldr	x24,[x2,#8*0]		// b[0]
+	ldp	x6,x7,[x1,#8*0]	// a[0..3]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	mov	x19,xzr
+	mov	x20,xzr
+	mov	x21,xzr
+	mov	x22,xzr
+	ldp	x14,x15,[x3,#8*0]	// n[0..3]
+	ldp	x16,x17,[x3,#8*2]
+	adds	x3,x3,#8*4		// clear carry bit
+	mov	x0,xzr
+	mov	x28,#0
+	mov	x26,sp
+
+Loop_mul4x_1st_reduction:
+	mul	x10,x6,x24		// lo(a[0..3]*b[0])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
+	adcs	x20,x20,x11
+	mul	x25,x19,x4		// t[0]*n0
+	adcs	x21,x21,x12
+	umulh	x11,x7,x24
+	adcs	x22,x22,x13
+	umulh	x12,x8,x24
+	adc	x23,xzr,xzr
+	umulh	x13,x9,x24
+	ldr	x24,[x2,x28]		// next b[i] (or b[0])
+	adds	x20,x20,x10
+	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
+	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
+	adcs	x21,x21,x11
+	mul	x11,x15,x25
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13		// can't overflow
+	mul	x13,x17,x25
+	// (*)	adds	xzr,x19,x10
+	subs	xzr,x19,#1		// (*)
+	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
+	adcs	x19,x20,x11
+	umulh	x11,x15,x25
+	adcs	x20,x21,x12
+	umulh	x12,x16,x25
+	adcs	x21,x22,x13
+	umulh	x13,x17,x25
+	adcs	x22,x23,x0
+	adc	x0,xzr,xzr
+	adds	x19,x19,x10
+	sub	x10,x27,x1
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,Loop_mul4x_1st_reduction
+
+	cbz	x10,Lmul4x4_post_condition
+
+	ldp	x6,x7,[x1,#8*0]	// a[4..7]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	ldr	x25,[sp]		// a[0]*n0
+	ldp	x14,x15,[x3,#8*0]	// n[4..7]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+
+Loop_mul4x_1st_tail:
+	mul	x10,x6,x24		// lo(a[4..7]*b[i])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
+	adcs	x20,x20,x11
+	umulh	x11,x7,x24
+	adcs	x21,x21,x12
+	umulh	x12,x8,x24
+	adcs	x22,x22,x13
+	umulh	x13,x9,x24
+	adc	x23,xzr,xzr
+	ldr	x24,[x2,x28]		// next b[i] (or b[0])
+	adds	x20,x20,x10
+	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
+	adcs	x21,x21,x11
+	mul	x11,x15,x25
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13		// can't overflow
+	mul	x13,x17,x25
+	adds	x19,x19,x10
+	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
+	adcs	x20,x20,x11
+	umulh	x11,x15,x25
+	adcs	x21,x21,x12
+	umulh	x12,x16,x25
+	adcs	x22,x22,x13
+	adcs	x23,x23,x0
+	umulh	x13,x17,x25
+	adc	x0,xzr,xzr
+	ldr	x25,[sp,x28]		// next t[0]*n0
+	str	x19,[x26],#8		// result!!!
+	adds	x19,x20,x10
+	sub	x10,x27,x1		// done yet?
+	adcs	x20,x21,x11
+	adcs	x21,x22,x12
+	adcs	x22,x23,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,Loop_mul4x_1st_tail
+
+	sub	x11,x27,x5	// rewound x1
+	cbz	x10,Lmul4x_proceed
+
+	ldp	x6,x7,[x1,#8*0]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	ldp	x14,x15,[x3,#8*0]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+	b	Loop_mul4x_1st_tail
+
+.align	5
+Lmul4x_proceed:
+	ldr	x24,[x2,#8*4]!		// *++b
+	adc	x30,x0,xzr
+	ldp	x6,x7,[x11,#8*0]	// a[0..3]
+	sub	x3,x3,x5		// rewind np
+	ldp	x8,x9,[x11,#8*2]
+	add	x1,x11,#8*4
+
+	stp	x19,x20,[x26,#8*0]	// result!!!
+	ldp	x19,x20,[sp,#8*4]	// t[0..3]
+	stp	x21,x22,[x26,#8*2]	// result!!!
+	ldp	x21,x22,[sp,#8*6]
+
+	ldp	x14,x15,[x3,#8*0]	// n[0..3]
+	mov	x26,sp
+	ldp	x16,x17,[x3,#8*2]
+	adds	x3,x3,#8*4		// clear carry bit
+	mov	x0,xzr
+
+.align	4
+Loop_mul4x_reduction:
+	mul	x10,x6,x24		// lo(a[0..3]*b[4])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
+	adcs	x20,x20,x11
+	mul	x25,x19,x4		// t[0]*n0
+	adcs	x21,x21,x12
+	umulh	x11,x7,x24
+	adcs	x22,x22,x13
+	umulh	x12,x8,x24
+	adc	x23,xzr,xzr
+	umulh	x13,x9,x24
+	ldr	x24,[x2,x28]		// next b[i]
+	adds	x20,x20,x10
+	// (*)	mul	x10,x14,x25
+	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
+	adcs	x21,x21,x11
+	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13		// can't overflow
+	mul	x13,x17,x25
+	// (*)	adds	xzr,x19,x10
+	subs	xzr,x19,#1		// (*)
+	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
+	adcs	x19,x20,x11
+	umulh	x11,x15,x25
+	adcs	x20,x21,x12
+	umulh	x12,x16,x25
+	adcs	x21,x22,x13
+	umulh	x13,x17,x25
+	adcs	x22,x23,x0
+	adc	x0,xzr,xzr
+	adds	x19,x19,x10
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,Loop_mul4x_reduction
+
+	adc	x0,x0,xzr
+	ldp	x10,x11,[x26,#8*4]	// t[4..7]
+	ldp	x12,x13,[x26,#8*6]
+	ldp	x6,x7,[x1,#8*0]	// a[4..7]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	adds	x19,x19,x10
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+
+	ldr	x25,[sp]		// t[0]*n0
+	ldp	x14,x15,[x3,#8*0]	// n[4..7]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+
+.align	4
+Loop_mul4x_tail:
+	mul	x10,x6,x24		// lo(a[4..7]*b[4])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
+	adcs	x20,x20,x11
+	umulh	x11,x7,x24
+	adcs	x21,x21,x12
+	umulh	x12,x8,x24
+	adcs	x22,x22,x13
+	umulh	x13,x9,x24
+	adc	x23,xzr,xzr
+	ldr	x24,[x2,x28]		// next b[i]
+	adds	x20,x20,x10
+	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
+	adcs	x21,x21,x11
+	mul	x11,x15,x25
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13		// can't overflow
+	mul	x13,x17,x25
+	adds	x19,x19,x10
+	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
+	adcs	x20,x20,x11
+	umulh	x11,x15,x25
+	adcs	x21,x21,x12
+	umulh	x12,x16,x25
+	adcs	x22,x22,x13
+	umulh	x13,x17,x25
+	adcs	x23,x23,x0
+	ldr	x25,[sp,x28]		// next a[0]*n0
+	adc	x0,xzr,xzr
+	str	x19,[x26],#8		// result!!!
+	adds	x19,x20,x10
+	sub	x10,x27,x1		// done yet?
+	adcs	x20,x21,x11
+	adcs	x21,x22,x12
+	adcs	x22,x23,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,Loop_mul4x_tail
+
+	sub	x11,x3,x5		// rewound np?
+	adc	x0,x0,xzr
+	cbz	x10,Loop_mul4x_break
+
+	ldp	x10,x11,[x26,#8*4]
+	ldp	x12,x13,[x26,#8*6]
+	ldp	x6,x7,[x1,#8*0]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	adds	x19,x19,x10
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+	ldp	x14,x15,[x3,#8*0]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+	b	Loop_mul4x_tail
+
+.align	4
+Loop_mul4x_break:
+	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
+	adds	x19,x19,x30
+	add	x2,x2,#8*4		// bp++
+	adcs	x20,x20,xzr
+	sub	x1,x1,x5		// rewind ap
+	adcs	x21,x21,xzr
+	stp	x19,x20,[x26,#8*0]	// result!!!
+	adcs	x22,x22,xzr
+	ldp	x19,x20,[sp,#8*4]	// t[0..3]
+	adc	x30,x0,xzr
+	stp	x21,x22,[x26,#8*2]	// result!!!
+	cmp	x2,x13			// done yet?
+	ldp	x21,x22,[sp,#8*6]
+	ldp	x14,x15,[x11,#8*0]	// n[0..3]
+	ldp	x16,x17,[x11,#8*2]
+	add	x3,x11,#8*4
+	b.eq	Lmul4x_post
+
+	ldr	x24,[x2]
+	ldp	x6,x7,[x1,#8*0]	// a[0..3]
+	ldp	x8,x9,[x1,#8*2]
+	adds	x1,x1,#8*4		// clear carry bit
+	mov	x0,xzr
+	mov	x26,sp
+	b	Loop_mul4x_reduction
+
+.align	4
+Lmul4x_post:
+	// Final step. We see if the result is larger than the modulus, and
+	// if it is, subtract the modulus. But comparison implies
+	// subtraction, so we subtract the modulus, see if it borrowed,
+	// and conditionally copy the original value.
+	mov	x0,x12
+	mov	x27,x12		// x0 copy
+	subs	x10,x19,x14
+	add	x26,sp,#8*8
+	sbcs	x11,x20,x15
+	sub	x28,x5,#8*4
+
+Lmul4x_sub:
+	sbcs	x12,x21,x16
+	ldp	x14,x15,[x3,#8*0]
+	sub	x28,x28,#8*4
+	ldp	x19,x20,[x26,#8*0]
+	sbcs	x13,x22,x17
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+	ldp	x21,x22,[x26,#8*2]
+	add	x26,x26,#8*4
+	stp	x10,x11,[x0,#8*0]
+	sbcs	x10,x19,x14
+	stp	x12,x13,[x0,#8*2]
+	add	x0,x0,#8*4
+	sbcs	x11,x20,x15
+	cbnz	x28,Lmul4x_sub
+
+	sbcs	x12,x21,x16
+	mov	x26,sp
+	add	x1,sp,#8*4
+	ldp	x6,x7,[x27,#8*0]
+	sbcs	x13,x22,x17
+	stp	x10,x11,[x0,#8*0]
+	ldp	x8,x9,[x27,#8*2]
+	stp	x12,x13,[x0,#8*2]
+	ldp	x19,x20,[x1,#8*0]
+	ldp	x21,x22,[x1,#8*2]
+	sbcs	xzr,x30,xzr	// did it borrow?
+	ldr	x30,[x29,#8]		// pull return address
+
+	sub	x28,x5,#8*4
+Lmul4x_cond_copy:
+	sub	x28,x28,#8*4
+	csel	x10,x19,x6,lo
+	stp	xzr,xzr,[x26,#8*0]
+	csel	x11,x20,x7,lo
+	ldp	x6,x7,[x27,#8*4]
+	ldp	x19,x20,[x1,#8*4]
+	csel	x12,x21,x8,lo
+	stp	xzr,xzr,[x26,#8*2]
+	add	x26,x26,#8*4
+	csel	x13,x22,x9,lo
+	ldp	x8,x9,[x27,#8*6]
+	ldp	x21,x22,[x1,#8*6]
+	add	x1,x1,#8*4
+	stp	x10,x11,[x27,#8*0]
+	stp	x12,x13,[x27,#8*2]
+	add	x27,x27,#8*4
+	cbnz	x28,Lmul4x_cond_copy
+
+	csel	x10,x19,x6,lo
+	stp	xzr,xzr,[x26,#8*0]
+	csel	x11,x20,x7,lo
+	stp	xzr,xzr,[x26,#8*2]
+	csel	x12,x21,x8,lo
+	stp	xzr,xzr,[x26,#8*3]
+	csel	x13,x22,x9,lo
+	stp	xzr,xzr,[x26,#8*4]
+	stp	x10,x11,[x27,#8*0]
+	stp	x12,x13,[x27,#8*2]
+
+	b	Lmul4x_done
+
+.align	4
+Lmul4x4_post_condition:
+	adc	x0,x0,xzr
+	ldr	x1,[x29,#96]		// pull rp
+	// x19-x22,x0 hold result, x14-x17 hold modulus
+	subs	x6,x19,x14
+	ldr	x30,[x29,#8]		// pull return address
+	sbcs	x7,x20,x15
+	stp	xzr,xzr,[sp,#8*0]
+	sbcs	x8,x21,x16
+	stp	xzr,xzr,[sp,#8*2]
+	sbcs	x9,x22,x17
+	stp	xzr,xzr,[sp,#8*4]
+	sbcs	xzr,x0,xzr		// did it borrow?
+	stp	xzr,xzr,[sp,#8*6]
+
+	// x6-x9 hold result-modulus
+	csel	x6,x19,x6,lo
+	csel	x7,x20,x7,lo
+	csel	x8,x21,x8,lo
+	csel	x9,x22,x9,lo
+	stp	x6,x7,[x1,#8*0]
+	stp	x8,x9,[x1,#8*2]
+
+Lmul4x_done:
+	ldp	x19,x20,[x29,#16]
+	mov	sp,x29
+	ldp	x21,x22,[x29,#32]
+	mov	x0,#1
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	// x30 is popped earlier
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	4
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/bcm/bn-586-apple.S b/gen/bcm/bn-586-apple.S
new file mode 100644
index 0000000..93513d0
--- /dev/null
+++ b/gen/bcm/bn-586-apple.S
@@ -0,0 +1,987 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
+.text
+.globl	_bn_mul_add_words
+.private_extern	_bn_mul_add_words
+.align	4
+_bn_mul_add_words:
+L_bn_mul_add_words_begin:
+	call	L000PIC_me_up
+L000PIC_me_up:
+	popl	%eax
+	movl	L_OPENSSL_ia32cap_P$non_lazy_ptr-L000PIC_me_up(%eax),%eax
+	btl	$26,(%eax)
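+	# the call/pop above materializes EIP for PIC addressing; bit 26
+	# of OPENSSL_ia32cap_P is the SSE2 feature flag, selecting the
+	# pmuludq-based path below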
+	jnc	L001maw_non_sse2
+	movl	4(%esp),%eax
+	movl	8(%esp),%edx
+	movl	12(%esp),%ecx
+	movd	16(%esp),%mm0
+	pxor	%mm1,%mm1
+	jmp	L002maw_sse2_entry
+.align	4,0x90
+L003maw_sse2_unrolled:
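+	# eight words per pass: pmuludq forms 64-bit products w*a[j], and
+	# the running carry lives in the high half of %mm1, shifted down
+	# with psrlq $32 after each 32-bit store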
+	movd	(%eax),%mm3
+	paddq	%mm3,%mm1
+	movd	(%edx),%mm2
+	pmuludq	%mm0,%mm2
+	movd	4(%edx),%mm4
+	pmuludq	%mm0,%mm4
+	movd	8(%edx),%mm6
+	pmuludq	%mm0,%mm6
+	movd	12(%edx),%mm7
+	pmuludq	%mm0,%mm7
+	paddq	%mm2,%mm1
+	movd	4(%eax),%mm3
+	paddq	%mm4,%mm3
+	movd	8(%eax),%mm5
+	paddq	%mm6,%mm5
+	movd	12(%eax),%mm4
+	paddq	%mm4,%mm7
+	movd	%mm1,(%eax)
+	movd	16(%edx),%mm2
+	pmuludq	%mm0,%mm2
+	psrlq	$32,%mm1
+	movd	20(%edx),%mm4
+	pmuludq	%mm0,%mm4
+	paddq	%mm3,%mm1
+	movd	24(%edx),%mm6
+	pmuludq	%mm0,%mm6
+	movd	%mm1,4(%eax)
+	psrlq	$32,%mm1
+	movd	28(%edx),%mm3
+	addl	$32,%edx
+	pmuludq	%mm0,%mm3
+	paddq	%mm5,%mm1
+	movd	16(%eax),%mm5
+	paddq	%mm5,%mm2
+	movd	%mm1,8(%eax)
+	psrlq	$32,%mm1
+	paddq	%mm7,%mm1
+	movd	20(%eax),%mm5
+	paddq	%mm5,%mm4
+	movd	%mm1,12(%eax)
+	psrlq	$32,%mm1
+	paddq	%mm2,%mm1
+	movd	24(%eax),%mm5
+	paddq	%mm5,%mm6
+	movd	%mm1,16(%eax)
+	psrlq	$32,%mm1
+	paddq	%mm4,%mm1
+	movd	28(%eax),%mm5
+	paddq	%mm5,%mm3
+	movd	%mm1,20(%eax)
+	psrlq	$32,%mm1
+	paddq	%mm6,%mm1
+	movd	%mm1,24(%eax)
+	psrlq	$32,%mm1
+	paddq	%mm3,%mm1
+	movd	%mm1,28(%eax)
+	leal	32(%eax),%eax
+	psrlq	$32,%mm1
+	subl	$8,%ecx
+	jz	L004maw_sse2_exit
+L002maw_sse2_entry:
+	testl	$4294967288,%ecx
+	jnz	L003maw_sse2_unrolled
+.align	2,0x90
+L005maw_sse2_loop:
+	movd	(%edx),%mm2
+	movd	(%eax),%mm3
+	pmuludq	%mm0,%mm2
+	leal	4(%edx),%edx
+	paddq	%mm3,%mm1
+	paddq	%mm2,%mm1
+	movd	%mm1,(%eax)
+	subl	$1,%ecx
+	psrlq	$32,%mm1
+	leal	4(%eax),%eax
+	jnz	L005maw_sse2_loop
+L004maw_sse2_exit:
+	movd	%mm1,%eax
+	emms
+	ret
+.align	4,0x90
+L001maw_non_sse2:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+
+	xorl	%esi,%esi
+	movl	20(%esp),%edi
+	movl	28(%esp),%ecx
+	movl	24(%esp),%ebx
+	andl	$4294967288,%ecx
+	movl	32(%esp),%ebp
+	pushl	%ecx
+	jz	L006maw_finish
+.align	4,0x90
+L007maw_loop:
+	# Round 0 
+	movl	(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	(%edi),%eax
+	adcl	$0,%edx
+	movl	%eax,(%edi)
+	movl	%edx,%esi
+	# Round 4 
+	movl	4(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	4(%edi),%eax
+	adcl	$0,%edx
+	movl	%eax,4(%edi)
+	movl	%edx,%esi
+	# Round 8 
+	movl	8(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	8(%edi),%eax
+	adcl	$0,%edx
+	movl	%eax,8(%edi)
+	movl	%edx,%esi
+	# Round 12 
+	movl	12(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	12(%edi),%eax
+	adcl	$0,%edx
+	movl	%eax,12(%edi)
+	movl	%edx,%esi
+	# Round 16 
+	movl	16(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	16(%edi),%eax
+	adcl	$0,%edx
+	movl	%eax,16(%edi)
+	movl	%edx,%esi
+	# Round 20 
+	movl	20(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	20(%edi),%eax
+	adcl	$0,%edx
+	movl	%eax,20(%edi)
+	movl	%edx,%esi
+	# Round 24 
+	movl	24(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	24(%edi),%eax
+	adcl	$0,%edx
+	movl	%eax,24(%edi)
+	movl	%edx,%esi
+	# Round 28 
+	movl	28(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	28(%edi),%eax
+	adcl	$0,%edx
+	movl	%eax,28(%edi)
+	movl	%edx,%esi
+
+	subl	$8,%ecx
+	leal	32(%ebx),%ebx
+	leal	32(%edi),%edi
+	jnz	L007maw_loop
+L006maw_finish:
+	movl	32(%esp),%ecx
+	andl	$7,%ecx
+	jnz	L008maw_finish2
+	jmp	L009maw_end
+L008maw_finish2:
+	# Tail Round 0 
+	movl	(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	(%edi),%eax
+	adcl	$0,%edx
+	decl	%ecx
+	movl	%eax,(%edi)
+	movl	%edx,%esi
+	jz	L009maw_end
+	# Tail Round 1 
+	movl	4(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	4(%edi),%eax
+	adcl	$0,%edx
+	decl	%ecx
+	movl	%eax,4(%edi)
+	movl	%edx,%esi
+	jz	L009maw_end
+	# Tail Round 2 
+	movl	8(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	8(%edi),%eax
+	adcl	$0,%edx
+	decl	%ecx
+	movl	%eax,8(%edi)
+	movl	%edx,%esi
+	jz	L009maw_end
+	# Tail Round 3 
+	movl	12(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	12(%edi),%eax
+	adcl	$0,%edx
+	decl	%ecx
+	movl	%eax,12(%edi)
+	movl	%edx,%esi
+	jz	L009maw_end
+	# Tail Round 4 
+	movl	16(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	16(%edi),%eax
+	adcl	$0,%edx
+	decl	%ecx
+	movl	%eax,16(%edi)
+	movl	%edx,%esi
+	jz	L009maw_end
+	# Tail Round 5 
+	movl	20(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	20(%edi),%eax
+	adcl	$0,%edx
+	decl	%ecx
+	movl	%eax,20(%edi)
+	movl	%edx,%esi
+	jz	L009maw_end
+	# Tail Round 6 
+	movl	24(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	24(%edi),%eax
+	adcl	$0,%edx
+	movl	%eax,24(%edi)
+	movl	%edx,%esi
+L009maw_end:
+	movl	%esi,%eax
+	popl	%ecx
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.globl	_bn_mul_words
+.private_extern	_bn_mul_words
+.align	4
+_bn_mul_words:
+L_bn_mul_words_begin:
+	call	L010PIC_me_up
+L010PIC_me_up:
+	popl	%eax
+	movl	L_OPENSSL_ia32cap_P$non_lazy_ptr-L010PIC_me_up(%eax),%eax
+	btl	$26,(%eax)
+	jnc	L011mw_non_sse2
+	movl	4(%esp),%eax
+	movl	8(%esp),%edx
+	movl	12(%esp),%ecx
+	movd	16(%esp),%mm0
+	pxor	%mm1,%mm1
+.align	4,0x90
+L012mw_sse2_loop:
+	movd	(%edx),%mm2
+	pmuludq	%mm0,%mm2
+	leal	4(%edx),%edx
+	paddq	%mm2,%mm1
+	movd	%mm1,(%eax)
+	subl	$1,%ecx
+	psrlq	$32,%mm1
+	leal	4(%eax),%eax
+	jnz	L012mw_sse2_loop
+	movd	%mm1,%eax
+	emms
+	ret
+.align	4,0x90
+L011mw_non_sse2:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+
+	xorl	%esi,%esi
+	movl	20(%esp),%edi
+	movl	24(%esp),%ebx
+	movl	28(%esp),%ebp
+	movl	32(%esp),%ecx
+	andl	$4294967288,%ebp
+	jz	L013mw_finish
+L014mw_loop:
+	# Round 0 
+	movl	(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,(%edi)
+	movl	%edx,%esi
+	# Round 4 
+	movl	4(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,4(%edi)
+	movl	%edx,%esi
+	# Round 8 
+	movl	8(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,8(%edi)
+	movl	%edx,%esi
+	# Round 12 
+	movl	12(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,12(%edi)
+	movl	%edx,%esi
+	# Round 16 
+	movl	16(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,16(%edi)
+	movl	%edx,%esi
+	# Round 20 
+	movl	20(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,20(%edi)
+	movl	%edx,%esi
+	# Round 24 
+	movl	24(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,24(%edi)
+	movl	%edx,%esi
+	# Round 28 
+	movl	28(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,28(%edi)
+	movl	%edx,%esi
+
+	addl	$32,%ebx
+	addl	$32,%edi
+	subl	$8,%ebp
+	jz	L013mw_finish
+	jmp	L014mw_loop
+L013mw_finish:
+	movl	28(%esp),%ebp
+	andl	$7,%ebp
+	jnz	L015mw_finish2
+	jmp	L016mw_end
+L015mw_finish2:
+	# Tail Round 0 
+	movl	(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,(%edi)
+	movl	%edx,%esi
+	decl	%ebp
+	jz	L016mw_end
+	# Tail Round 1 
+	movl	4(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,4(%edi)
+	movl	%edx,%esi
+	decl	%ebp
+	jz	L016mw_end
+	# Tail Round 2 
+	movl	8(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,8(%edi)
+	movl	%edx,%esi
+	decl	%ebp
+	jz	L016mw_end
+	# Tail Round 3 
+	movl	12(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,12(%edi)
+	movl	%edx,%esi
+	decl	%ebp
+	jz	L016mw_end
+	# Tail Round 4 
+	movl	16(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,16(%edi)
+	movl	%edx,%esi
+	decl	%ebp
+	jz	L016mw_end
+	# Tail Round 5 
+	movl	20(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,20(%edi)
+	movl	%edx,%esi
+	decl	%ebp
+	jz	L016mw_end
+	# Tail Round 6 
+	movl	24(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,24(%edi)
+	movl	%edx,%esi
+L016mw_end:
+	movl	%esi,%eax
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.globl	_bn_sqr_words
+.private_extern	_bn_sqr_words
+.align	4
+_bn_sqr_words:
+L_bn_sqr_words_begin:
+	call	L017PIC_me_up
+L017PIC_me_up:
+	popl	%eax
+	movl	L_OPENSSL_ia32cap_P$non_lazy_ptr-L017PIC_me_up(%eax),%eax
+	btl	$26,(%eax)
+	jnc	L018sqr_non_sse2
+	movl	4(%esp),%eax
+	movl	8(%esp),%edx
+	movl	12(%esp),%ecx
+.align	4,0x90
+L019sqr_sse2_loop:
+	movd	(%edx),%mm0
+	pmuludq	%mm0,%mm0
+	leal	4(%edx),%edx
+	movq	%mm0,(%eax)
+	subl	$1,%ecx
+	leal	8(%eax),%eax
+	jnz	L019sqr_sse2_loop
+	emms
+	ret
+.align	4,0x90
+L018sqr_non_sse2:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%ebx
+	andl	$4294967288,%ebx
+	jz	L020sw_finish
+L021sw_loop:
+	# Round 0 
+	movl	(%edi),%eax
+	mull	%eax
+	movl	%eax,(%esi)
+	movl	%edx,4(%esi)
+	# Round 4 
+	movl	4(%edi),%eax
+	mull	%eax
+	movl	%eax,8(%esi)
+	movl	%edx,12(%esi)
+	# Round 8 
+	movl	8(%edi),%eax
+	mull	%eax
+	movl	%eax,16(%esi)
+	movl	%edx,20(%esi)
+	# Round 12 
+	movl	12(%edi),%eax
+	mull	%eax
+	movl	%eax,24(%esi)
+	movl	%edx,28(%esi)
+	# Round 16 
+	movl	16(%edi),%eax
+	mull	%eax
+	movl	%eax,32(%esi)
+	movl	%edx,36(%esi)
+	# Round 20 
+	movl	20(%edi),%eax
+	mull	%eax
+	movl	%eax,40(%esi)
+	movl	%edx,44(%esi)
+	# Round 24 
+	movl	24(%edi),%eax
+	mull	%eax
+	movl	%eax,48(%esi)
+	movl	%edx,52(%esi)
+	# Round 28 
+	movl	28(%edi),%eax
+	mull	%eax
+	movl	%eax,56(%esi)
+	movl	%edx,60(%esi)
+
+	addl	$32,%edi
+	addl	$64,%esi
+	subl	$8,%ebx
+	jnz	L021sw_loop
+L020sw_finish:
+	movl	28(%esp),%ebx
+	andl	$7,%ebx
+	jz	L022sw_end
+	# Tail Round 0 
+	movl	(%edi),%eax
+	mull	%eax
+	movl	%eax,(%esi)
+	decl	%ebx
+	movl	%edx,4(%esi)
+	jz	L022sw_end
+	# Tail Round 1 
+	movl	4(%edi),%eax
+	mull	%eax
+	movl	%eax,8(%esi)
+	decl	%ebx
+	movl	%edx,12(%esi)
+	jz	L022sw_end
+	# Tail Round 2 
+	movl	8(%edi),%eax
+	mull	%eax
+	movl	%eax,16(%esi)
+	decl	%ebx
+	movl	%edx,20(%esi)
+	jz	L022sw_end
+	# Tail Round 3 
+	movl	12(%edi),%eax
+	mull	%eax
+	movl	%eax,24(%esi)
+	decl	%ebx
+	movl	%edx,28(%esi)
+	jz	L022sw_end
+	# Tail Round 4 
+	movl	16(%edi),%eax
+	mull	%eax
+	movl	%eax,32(%esi)
+	decl	%ebx
+	movl	%edx,36(%esi)
+	jz	L022sw_end
+	# Tail Round 5 
+	movl	20(%edi),%eax
+	mull	%eax
+	movl	%eax,40(%esi)
+	decl	%ebx
+	movl	%edx,44(%esi)
+	jz	L022sw_end
+	# Tail Round 6 
+	movl	24(%edi),%eax
+	mull	%eax
+	movl	%eax,48(%esi)
+	movl	%edx,52(%esi)
+L022sw_end:
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.globl	_bn_div_words
+.private_extern	_bn_div_words
+.align	4
+_bn_div_words:
+L_bn_div_words_begin:
+	movl	4(%esp),%edx
+	movl	8(%esp),%eax
+	movl	12(%esp),%ecx
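+	# divl divides the 64-bit value edx:eax by ecx, leaving the quotient
+	# in eax and the remainder in edx.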
+	divl	%ecx
+	ret
+.globl	_bn_add_words
+.private_extern	_bn_add_words
+.align	4
+_bn_add_words:
+L_bn_add_words_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+
+	movl	20(%esp),%ebx
+	movl	24(%esp),%esi
+	movl	28(%esp),%edi
+	movl	32(%esp),%ebp
+	xorl	%eax,%eax
+	andl	$4294967288,%ebp
+	jz	L023aw_finish
+L024aw_loop:
+	# Round 0 
+	movl	(%esi),%ecx
+	movl	(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,(%ebx)
+	# Round 1 
+	movl	4(%esi),%ecx
+	movl	4(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,4(%ebx)
+	# Round 2 
+	movl	8(%esi),%ecx
+	movl	8(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,8(%ebx)
+	# Round 3 
+	movl	12(%esi),%ecx
+	movl	12(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,12(%ebx)
+	# Round 4 
+	movl	16(%esi),%ecx
+	movl	16(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,16(%ebx)
+	# Round 5 
+	movl	20(%esi),%ecx
+	movl	20(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,20(%ebx)
+	# Round 6 
+	movl	24(%esi),%ecx
+	movl	24(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,24(%ebx)
+	# Round 7 
+	movl	28(%esi),%ecx
+	movl	28(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,28(%ebx)
+
+	addl	$32,%esi
+	addl	$32,%edi
+	addl	$32,%ebx
+	subl	$8,%ebp
+	jnz	L024aw_loop
+L023aw_finish:
+	movl	32(%esp),%ebp
+	andl	$7,%ebp
+	jz	L025aw_end
+	# Tail Round 0 
+	movl	(%esi),%ecx
+	movl	(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,(%ebx)
+	jz	L025aw_end
+	# Tail Round 1 
+	movl	4(%esi),%ecx
+	movl	4(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,4(%ebx)
+	jz	L025aw_end
+	# Tail Round 2 
+	movl	8(%esi),%ecx
+	movl	8(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,8(%ebx)
+	jz	L025aw_end
+	# Tail Round 3 
+	movl	12(%esi),%ecx
+	movl	12(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,12(%ebx)
+	jz	L025aw_end
+	# Tail Round 4 
+	movl	16(%esi),%ecx
+	movl	16(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,16(%ebx)
+	jz	L025aw_end
+	# Tail Round 5 
+	movl	20(%esi),%ecx
+	movl	20(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,20(%ebx)
+	jz	L025aw_end
+	# Tail Round 6 
+	movl	24(%esi),%ecx
+	movl	24(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,24(%ebx)
+L025aw_end:
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.globl	_bn_sub_words
+.private_extern	_bn_sub_words
+.align	4
+_bn_sub_words:
+L_bn_sub_words_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+
+	movl	20(%esp),%ebx
+	movl	24(%esp),%esi
+	movl	28(%esp),%edi
+	movl	32(%esp),%ebp
+	xorl	%eax,%eax
+	andl	$4294967288,%ebp
+	jz	L026aw_finish
+L027aw_loop:
+	# Round 0 
+	movl	(%esi),%ecx
+	movl	(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,(%ebx)
+	# Round 1 
+	movl	4(%esi),%ecx
+	movl	4(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,4(%ebx)
+	# Round 2 
+	movl	8(%esi),%ecx
+	movl	8(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,8(%ebx)
+	# Round 3 
+	movl	12(%esi),%ecx
+	movl	12(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,12(%ebx)
+	# Round 4 
+	movl	16(%esi),%ecx
+	movl	16(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,16(%ebx)
+	# Round 5 
+	movl	20(%esi),%ecx
+	movl	20(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,20(%ebx)
+	# Round 6 
+	movl	24(%esi),%ecx
+	movl	24(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,24(%ebx)
+	# Round 7 
+	movl	28(%esi),%ecx
+	movl	28(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,28(%ebx)
+
+	addl	$32,%esi
+	addl	$32,%edi
+	addl	$32,%ebx
+	subl	$8,%ebp
+	jnz	L027aw_loop
+L026aw_finish:
+	movl	32(%esp),%ebp
+	andl	$7,%ebp
+	jz	L028aw_end
+	# Tail Round 0 
+	movl	(%esi),%ecx
+	movl	(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,(%ebx)
+	jz	L028aw_end
+	# Tail Round 1 
+	movl	4(%esi),%ecx
+	movl	4(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,4(%ebx)
+	jz	L028aw_end
+	# Tail Round 2 
+	movl	8(%esi),%ecx
+	movl	8(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,8(%ebx)
+	jz	L028aw_end
+	# Tail Round 3 
+	movl	12(%esi),%ecx
+	movl	12(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,12(%ebx)
+	jz	L028aw_end
+	# Tail Round 4 
+	movl	16(%esi),%ecx
+	movl	16(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,16(%ebx)
+	jz	L028aw_end
+	# Tail Round 5 
+	movl	20(%esi),%ecx
+	movl	20(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,20(%ebx)
+	jz	L028aw_end
+	# Tail Round 6 
+	movl	24(%esi),%ecx
+	movl	24(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,24(%ebx)
+L028aw_end:
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.section __IMPORT,__pointers,non_lazy_symbol_pointers
+L_OPENSSL_ia32cap_P$non_lazy_ptr:
+.indirect_symbol	_OPENSSL_ia32cap_P
+.long	0
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/bcm/bn-586-linux.S b/gen/bcm/bn-586-linux.S
new file mode 100644
index 0000000..311f22c
--- /dev/null
+++ b/gen/bcm/bn-586-linux.S
@@ -0,0 +1,995 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
+.text
+.globl	bn_mul_add_words
+.hidden	bn_mul_add_words
+.type	bn_mul_add_words,@function
+.align	16
+bn_mul_add_words:
+.L_bn_mul_add_words_begin:
+	call	.L000PIC_me_up
+.L000PIC_me_up:
+	popl	%eax
+	leal	OPENSSL_ia32cap_P-.L000PIC_me_up(%eax),%eax
+	btl	$26,(%eax)
+	jnc	.L001maw_non_sse2
+	movl	4(%esp),%eax
+	movl	8(%esp),%edx
+	movl	12(%esp),%ecx
+	movd	16(%esp),%mm0
+	pxor	%mm1,%mm1
+	jmp	.L002maw_sse2_entry
+.align	16
+.L003maw_sse2_unrolled:
+	movd	(%eax),%mm3
+	paddq	%mm3,%mm1
+	movd	(%edx),%mm2
+	pmuludq	%mm0,%mm2
+	movd	4(%edx),%mm4
+	pmuludq	%mm0,%mm4
+	movd	8(%edx),%mm6
+	pmuludq	%mm0,%mm6
+	movd	12(%edx),%mm7
+	pmuludq	%mm0,%mm7
+	paddq	%mm2,%mm1
+	movd	4(%eax),%mm3
+	paddq	%mm4,%mm3
+	movd	8(%eax),%mm5
+	paddq	%mm6,%mm5
+	movd	12(%eax),%mm4
+	paddq	%mm4,%mm7
+	movd	%mm1,(%eax)
+	movd	16(%edx),%mm2
+	pmuludq	%mm0,%mm2
+	psrlq	$32,%mm1
+	movd	20(%edx),%mm4
+	pmuludq	%mm0,%mm4
+	paddq	%mm3,%mm1
+	movd	24(%edx),%mm6
+	pmuludq	%mm0,%mm6
+	movd	%mm1,4(%eax)
+	psrlq	$32,%mm1
+	movd	28(%edx),%mm3
+	addl	$32,%edx
+	pmuludq	%mm0,%mm3
+	paddq	%mm5,%mm1
+	movd	16(%eax),%mm5
+	paddq	%mm5,%mm2
+	movd	%mm1,8(%eax)
+	psrlq	$32,%mm1
+	paddq	%mm7,%mm1
+	movd	20(%eax),%mm5
+	paddq	%mm5,%mm4
+	movd	%mm1,12(%eax)
+	psrlq	$32,%mm1
+	paddq	%mm2,%mm1
+	movd	24(%eax),%mm5
+	paddq	%mm5,%mm6
+	movd	%mm1,16(%eax)
+	psrlq	$32,%mm1
+	paddq	%mm4,%mm1
+	movd	28(%eax),%mm5
+	paddq	%mm5,%mm3
+	movd	%mm1,20(%eax)
+	psrlq	$32,%mm1
+	paddq	%mm6,%mm1
+	movd	%mm1,24(%eax)
+	psrlq	$32,%mm1
+	paddq	%mm3,%mm1
+	movd	%mm1,28(%eax)
+	leal	32(%eax),%eax
+	psrlq	$32,%mm1
+	subl	$8,%ecx
+	jz	.L004maw_sse2_exit
+.L002maw_sse2_entry:
+	testl	$4294967288,%ecx
+	jnz	.L003maw_sse2_unrolled
+.align	4
+.L005maw_sse2_loop:
+	movd	(%edx),%mm2
+	movd	(%eax),%mm3
+	pmuludq	%mm0,%mm2
+	leal	4(%edx),%edx
+	paddq	%mm3,%mm1
+	paddq	%mm2,%mm1
+	movd	%mm1,(%eax)
+	subl	$1,%ecx
+	psrlq	$32,%mm1
+	leal	4(%eax),%eax
+	jnz	.L005maw_sse2_loop
+.L004maw_sse2_exit:
+	movd	%mm1,%eax
+	emms
+	ret
+.align	16
+.L001maw_non_sse2:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+
+	xorl	%esi,%esi
+	movl	20(%esp),%edi
+	movl	28(%esp),%ecx
+	movl	24(%esp),%ebx
+	andl	$4294967288,%ecx
+	movl	32(%esp),%ebp
+	pushl	%ecx
+	jz	.L006maw_finish
+.align	16
+.L007maw_loop:
+
+	movl	(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	(%edi),%eax
+	adcl	$0,%edx
+	movl	%eax,(%edi)
+	movl	%edx,%esi
+
+	movl	4(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	4(%edi),%eax
+	adcl	$0,%edx
+	movl	%eax,4(%edi)
+	movl	%edx,%esi
+
+	movl	8(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	8(%edi),%eax
+	adcl	$0,%edx
+	movl	%eax,8(%edi)
+	movl	%edx,%esi
+
+	movl	12(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	12(%edi),%eax
+	adcl	$0,%edx
+	movl	%eax,12(%edi)
+	movl	%edx,%esi
+
+	movl	16(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	16(%edi),%eax
+	adcl	$0,%edx
+	movl	%eax,16(%edi)
+	movl	%edx,%esi
+
+	movl	20(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	20(%edi),%eax
+	adcl	$0,%edx
+	movl	%eax,20(%edi)
+	movl	%edx,%esi
+
+	movl	24(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	24(%edi),%eax
+	adcl	$0,%edx
+	movl	%eax,24(%edi)
+	movl	%edx,%esi
+
+	movl	28(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	28(%edi),%eax
+	adcl	$0,%edx
+	movl	%eax,28(%edi)
+	movl	%edx,%esi
+
+	subl	$8,%ecx
+	leal	32(%ebx),%ebx
+	leal	32(%edi),%edi
+	jnz	.L007maw_loop
+.L006maw_finish:
+	movl	32(%esp),%ecx
+	andl	$7,%ecx
+	jnz	.L008maw_finish2
+	jmp	.L009maw_end
+.L008maw_finish2:
+
+	movl	(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	(%edi),%eax
+	adcl	$0,%edx
+	decl	%ecx
+	movl	%eax,(%edi)
+	movl	%edx,%esi
+	jz	.L009maw_end
+
+	movl	4(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	4(%edi),%eax
+	adcl	$0,%edx
+	decl	%ecx
+	movl	%eax,4(%edi)
+	movl	%edx,%esi
+	jz	.L009maw_end
+
+	movl	8(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	8(%edi),%eax
+	adcl	$0,%edx
+	decl	%ecx
+	movl	%eax,8(%edi)
+	movl	%edx,%esi
+	jz	.L009maw_end
+
+	movl	12(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	12(%edi),%eax
+	adcl	$0,%edx
+	decl	%ecx
+	movl	%eax,12(%edi)
+	movl	%edx,%esi
+	jz	.L009maw_end
+
+	movl	16(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	16(%edi),%eax
+	adcl	$0,%edx
+	decl	%ecx
+	movl	%eax,16(%edi)
+	movl	%edx,%esi
+	jz	.L009maw_end
+
+	movl	20(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	20(%edi),%eax
+	adcl	$0,%edx
+	decl	%ecx
+	movl	%eax,20(%edi)
+	movl	%edx,%esi
+	jz	.L009maw_end
+
+	movl	24(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	24(%edi),%eax
+	adcl	$0,%edx
+	movl	%eax,24(%edi)
+	movl	%edx,%esi
+.L009maw_end:
+	movl	%esi,%eax
+	popl	%ecx
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	bn_mul_add_words,.-.L_bn_mul_add_words_begin
+.globl	bn_mul_words
+.hidden	bn_mul_words
+.type	bn_mul_words,@function
+.align	16
+bn_mul_words:
+.L_bn_mul_words_begin:
+	call	.L010PIC_me_up
+.L010PIC_me_up:
+	popl	%eax
+	leal	OPENSSL_ia32cap_P-.L010PIC_me_up(%eax),%eax
+	btl	$26,(%eax)
+	jnc	.L011mw_non_sse2
+	movl	4(%esp),%eax
+	movl	8(%esp),%edx
+	movl	12(%esp),%ecx
+	movd	16(%esp),%mm0
+	pxor	%mm1,%mm1
+.align	16
+.L012mw_sse2_loop:
+	movd	(%edx),%mm2
+	pmuludq	%mm0,%mm2
+	leal	4(%edx),%edx
+	paddq	%mm2,%mm1
+	movd	%mm1,(%eax)
+	subl	$1,%ecx
+	psrlq	$32,%mm1
+	leal	4(%eax),%eax
+	jnz	.L012mw_sse2_loop
+	movd	%mm1,%eax
+	emms
+	ret
+.align	16
+.L011mw_non_sse2:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+
+	xorl	%esi,%esi
+	movl	20(%esp),%edi
+	movl	24(%esp),%ebx
+	movl	28(%esp),%ebp
+	movl	32(%esp),%ecx
+	andl	$4294967288,%ebp
+	jz	.L013mw_finish
+.L014mw_loop:
+
+	movl	(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,(%edi)
+	movl	%edx,%esi
+
+	movl	4(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,4(%edi)
+	movl	%edx,%esi
+
+	movl	8(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,8(%edi)
+	movl	%edx,%esi
+
+	movl	12(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,12(%edi)
+	movl	%edx,%esi
+
+	movl	16(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,16(%edi)
+	movl	%edx,%esi
+
+	movl	20(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,20(%edi)
+	movl	%edx,%esi
+
+	movl	24(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,24(%edi)
+	movl	%edx,%esi
+
+	movl	28(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,28(%edi)
+	movl	%edx,%esi
+
+	addl	$32,%ebx
+	addl	$32,%edi
+	subl	$8,%ebp
+	jz	.L013mw_finish
+	jmp	.L014mw_loop
+.L013mw_finish:
+	movl	28(%esp),%ebp
+	andl	$7,%ebp
+	jnz	.L015mw_finish2
+	jmp	.L016mw_end
+.L015mw_finish2:
+
+	movl	(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,(%edi)
+	movl	%edx,%esi
+	decl	%ebp
+	jz	.L016mw_end
+
+	movl	4(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,4(%edi)
+	movl	%edx,%esi
+	decl	%ebp
+	jz	.L016mw_end
+
+	movl	8(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,8(%edi)
+	movl	%edx,%esi
+	decl	%ebp
+	jz	.L016mw_end
+
+	movl	12(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,12(%edi)
+	movl	%edx,%esi
+	decl	%ebp
+	jz	.L016mw_end
+
+	movl	16(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,16(%edi)
+	movl	%edx,%esi
+	decl	%ebp
+	jz	.L016mw_end
+
+	movl	20(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,20(%edi)
+	movl	%edx,%esi
+	decl	%ebp
+	jz	.L016mw_end
+
+	movl	24(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,24(%edi)
+	movl	%edx,%esi
+.L016mw_end:
+	movl	%esi,%eax
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	bn_mul_words,.-.L_bn_mul_words_begin
+.globl	bn_sqr_words
+.hidden	bn_sqr_words
+.type	bn_sqr_words,@function
+.align	16
+bn_sqr_words:
+.L_bn_sqr_words_begin:
+	call	.L017PIC_me_up
+.L017PIC_me_up:
+	popl	%eax
+	leal	OPENSSL_ia32cap_P-.L017PIC_me_up(%eax),%eax
+	btl	$26,(%eax)
+	jnc	.L018sqr_non_sse2
+	movl	4(%esp),%eax
+	movl	8(%esp),%edx
+	movl	12(%esp),%ecx
+.align	16
+.L019sqr_sse2_loop:
+	movd	(%edx),%mm0
+	pmuludq	%mm0,%mm0
+	leal	4(%edx),%edx
+	movq	%mm0,(%eax)
+	subl	$1,%ecx
+	leal	8(%eax),%eax
+	jnz	.L019sqr_sse2_loop
+	emms
+	ret
+.align	16
+.L018sqr_non_sse2:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%ebx
+	andl	$4294967288,%ebx
+	jz	.L020sw_finish
+.L021sw_loop:
+
+	movl	(%edi),%eax
+	mull	%eax
+	movl	%eax,(%esi)
+	movl	%edx,4(%esi)
+
+	movl	4(%edi),%eax
+	mull	%eax
+	movl	%eax,8(%esi)
+	movl	%edx,12(%esi)
+
+	movl	8(%edi),%eax
+	mull	%eax
+	movl	%eax,16(%esi)
+	movl	%edx,20(%esi)
+
+	movl	12(%edi),%eax
+	mull	%eax
+	movl	%eax,24(%esi)
+	movl	%edx,28(%esi)
+
+	movl	16(%edi),%eax
+	mull	%eax
+	movl	%eax,32(%esi)
+	movl	%edx,36(%esi)
+
+	movl	20(%edi),%eax
+	mull	%eax
+	movl	%eax,40(%esi)
+	movl	%edx,44(%esi)
+
+	movl	24(%edi),%eax
+	mull	%eax
+	movl	%eax,48(%esi)
+	movl	%edx,52(%esi)
+
+	movl	28(%edi),%eax
+	mull	%eax
+	movl	%eax,56(%esi)
+	movl	%edx,60(%esi)
+
+	addl	$32,%edi
+	addl	$64,%esi
+	subl	$8,%ebx
+	jnz	.L021sw_loop
+.L020sw_finish:
+	movl	28(%esp),%ebx
+	andl	$7,%ebx
+	jz	.L022sw_end
+
+	movl	(%edi),%eax
+	mull	%eax
+	movl	%eax,(%esi)
+	decl	%ebx
+	movl	%edx,4(%esi)
+	jz	.L022sw_end
+
+	movl	4(%edi),%eax
+	mull	%eax
+	movl	%eax,8(%esi)
+	decl	%ebx
+	movl	%edx,12(%esi)
+	jz	.L022sw_end
+
+	movl	8(%edi),%eax
+	mull	%eax
+	movl	%eax,16(%esi)
+	decl	%ebx
+	movl	%edx,20(%esi)
+	jz	.L022sw_end
+
+	movl	12(%edi),%eax
+	mull	%eax
+	movl	%eax,24(%esi)
+	decl	%ebx
+	movl	%edx,28(%esi)
+	jz	.L022sw_end
+
+	movl	16(%edi),%eax
+	mull	%eax
+	movl	%eax,32(%esi)
+	decl	%ebx
+	movl	%edx,36(%esi)
+	jz	.L022sw_end
+
+	movl	20(%edi),%eax
+	mull	%eax
+	movl	%eax,40(%esi)
+	decl	%ebx
+	movl	%edx,44(%esi)
+	jz	.L022sw_end
+
+	movl	24(%edi),%eax
+	mull	%eax
+	movl	%eax,48(%esi)
+	movl	%edx,52(%esi)
+.L022sw_end:
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	bn_sqr_words,.-.L_bn_sqr_words_begin
+.globl	bn_div_words
+.hidden	bn_div_words
+.type	bn_div_words,@function
+.align	16
+bn_div_words:
+.L_bn_div_words_begin:
+	movl	4(%esp),%edx
+	movl	8(%esp),%eax
+	movl	12(%esp),%ecx
+	divl	%ecx
+	ret
+.size	bn_div_words,.-.L_bn_div_words_begin
+.globl	bn_add_words
+.hidden	bn_add_words
+.type	bn_add_words,@function
+.align	16
+bn_add_words:
+.L_bn_add_words_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+
+	movl	20(%esp),%ebx
+	movl	24(%esp),%esi
+	movl	28(%esp),%edi
+	movl	32(%esp),%ebp
+	xorl	%eax,%eax
+	andl	$4294967288,%ebp
+	jz	.L023aw_finish
+.L024aw_loop:
+
+	movl	(%esi),%ecx
+	movl	(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,(%ebx)
+
+	movl	4(%esi),%ecx
+	movl	4(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,4(%ebx)
+
+	movl	8(%esi),%ecx
+	movl	8(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,8(%ebx)
+
+	movl	12(%esi),%ecx
+	movl	12(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,12(%ebx)
+
+	movl	16(%esi),%ecx
+	movl	16(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,16(%ebx)
+
+	movl	20(%esi),%ecx
+	movl	20(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,20(%ebx)
+
+	movl	24(%esi),%ecx
+	movl	24(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,24(%ebx)
+
+	movl	28(%esi),%ecx
+	movl	28(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,28(%ebx)
+
+	addl	$32,%esi
+	addl	$32,%edi
+	addl	$32,%ebx
+	subl	$8,%ebp
+	jnz	.L024aw_loop
+.L023aw_finish:
+	movl	32(%esp),%ebp
+	andl	$7,%ebp
+	jz	.L025aw_end
+
+	movl	(%esi),%ecx
+	movl	(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,(%ebx)
+	jz	.L025aw_end
+
+	movl	4(%esi),%ecx
+	movl	4(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,4(%ebx)
+	jz	.L025aw_end
+
+	movl	8(%esi),%ecx
+	movl	8(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,8(%ebx)
+	jz	.L025aw_end
+
+	movl	12(%esi),%ecx
+	movl	12(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,12(%ebx)
+	jz	.L025aw_end
+
+	movl	16(%esi),%ecx
+	movl	16(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,16(%ebx)
+	jz	.L025aw_end
+
+	movl	20(%esi),%ecx
+	movl	20(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,20(%ebx)
+	jz	.L025aw_end
+
+	movl	24(%esi),%ecx
+	movl	24(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,24(%ebx)
+.L025aw_end:
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	bn_add_words,.-.L_bn_add_words_begin
+.globl	bn_sub_words
+.hidden	bn_sub_words
+.type	bn_sub_words,@function
+.align	16
+bn_sub_words:
+.L_bn_sub_words_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+
+	movl	20(%esp),%ebx
+	movl	24(%esp),%esi
+	movl	28(%esp),%edi
+	movl	32(%esp),%ebp
+	xorl	%eax,%eax
+	andl	$4294967288,%ebp
+	jz	.L026aw_finish
+.L027aw_loop:
+
+	movl	(%esi),%ecx
+	movl	(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,(%ebx)
+
+	movl	4(%esi),%ecx
+	movl	4(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,4(%ebx)
+
+	movl	8(%esi),%ecx
+	movl	8(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,8(%ebx)
+
+	movl	12(%esi),%ecx
+	movl	12(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,12(%ebx)
+
+	movl	16(%esi),%ecx
+	movl	16(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,16(%ebx)
+
+	movl	20(%esi),%ecx
+	movl	20(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,20(%ebx)
+
+	movl	24(%esi),%ecx
+	movl	24(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,24(%ebx)
+
+	movl	28(%esi),%ecx
+	movl	28(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,28(%ebx)
+
+	addl	$32,%esi
+	addl	$32,%edi
+	addl	$32,%ebx
+	subl	$8,%ebp
+	jnz	.L027aw_loop
+.L026aw_finish:
+	movl	32(%esp),%ebp
+	andl	$7,%ebp
+	jz	.L028aw_end
+
+	movl	(%esi),%ecx
+	movl	(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,(%ebx)
+	jz	.L028aw_end
+
+	movl	4(%esi),%ecx
+	movl	4(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,4(%ebx)
+	jz	.L028aw_end
+
+	movl	8(%esi),%ecx
+	movl	8(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,8(%ebx)
+	jz	.L028aw_end
+
+	movl	12(%esi),%ecx
+	movl	12(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,12(%ebx)
+	jz	.L028aw_end
+
+	movl	16(%esi),%ecx
+	movl	16(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,16(%ebx)
+	jz	.L028aw_end
+
+	movl	20(%esi),%ecx
+	movl	20(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,20(%ebx)
+	jz	.L028aw_end
+
+	movl	24(%esi),%ecx
+	movl	24(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,24(%ebx)
+.L028aw_end:
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	bn_sub_words,.-.L_bn_sub_words_begin
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
diff --git a/gen/bcm/bn-586-win.asm b/gen/bcm/bn-586-win.asm
new file mode 100644
index 0000000..f7ddfa8
--- /dev/null
+++ b/gen/bcm/bn-586-win.asm
@@ -0,0 +1,982 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__, win32
+%ifidn __OUTPUT_FORMAT__,obj
+section	code	use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+$@feat.00 equ 1
+section	.text	code align=64
+%else
+section	.text	code
+%endif
+;extern	_OPENSSL_ia32cap_P
+global	_bn_mul_add_words
+align	16
+_bn_mul_add_words:
+L$_bn_mul_add_words_begin:
+	lea	eax,[_OPENSSL_ia32cap_P]
+	bt	DWORD [eax],26
+	jnc	NEAR L$000maw_non_sse2
+	mov	eax,DWORD [4+esp]
+	mov	edx,DWORD [8+esp]
+	mov	ecx,DWORD [12+esp]
+	movd	mm0,DWORD [16+esp]
+	pxor	mm1,mm1
+	jmp	NEAR L$001maw_sse2_entry
+align	16
+L$002maw_sse2_unrolled:
+	movd	mm3,DWORD [eax]
+	paddq	mm1,mm3
+	movd	mm2,DWORD [edx]
+	pmuludq	mm2,mm0
+	movd	mm4,DWORD [4+edx]
+	pmuludq	mm4,mm0
+	movd	mm6,DWORD [8+edx]
+	pmuludq	mm6,mm0
+	movd	mm7,DWORD [12+edx]
+	pmuludq	mm7,mm0
+	paddq	mm1,mm2
+	movd	mm3,DWORD [4+eax]
+	paddq	mm3,mm4
+	movd	mm5,DWORD [8+eax]
+	paddq	mm5,mm6
+	movd	mm4,DWORD [12+eax]
+	paddq	mm7,mm4
+	movd	DWORD [eax],mm1
+	movd	mm2,DWORD [16+edx]
+	pmuludq	mm2,mm0
+	psrlq	mm1,32
+	movd	mm4,DWORD [20+edx]
+	pmuludq	mm4,mm0
+	paddq	mm1,mm3
+	movd	mm6,DWORD [24+edx]
+	pmuludq	mm6,mm0
+	movd	DWORD [4+eax],mm1
+	psrlq	mm1,32
+	movd	mm3,DWORD [28+edx]
+	add	edx,32
+	pmuludq	mm3,mm0
+	paddq	mm1,mm5
+	movd	mm5,DWORD [16+eax]
+	paddq	mm2,mm5
+	movd	DWORD [8+eax],mm1
+	psrlq	mm1,32
+	paddq	mm1,mm7
+	movd	mm5,DWORD [20+eax]
+	paddq	mm4,mm5
+	movd	DWORD [12+eax],mm1
+	psrlq	mm1,32
+	paddq	mm1,mm2
+	movd	mm5,DWORD [24+eax]
+	paddq	mm6,mm5
+	movd	DWORD [16+eax],mm1
+	psrlq	mm1,32
+	paddq	mm1,mm4
+	movd	mm5,DWORD [28+eax]
+	paddq	mm3,mm5
+	movd	DWORD [20+eax],mm1
+	psrlq	mm1,32
+	paddq	mm1,mm6
+	movd	DWORD [24+eax],mm1
+	psrlq	mm1,32
+	paddq	mm1,mm3
+	movd	DWORD [28+eax],mm1
+	lea	eax,[32+eax]
+	psrlq	mm1,32
+	sub	ecx,8
+	jz	NEAR L$003maw_sse2_exit
+L$001maw_sse2_entry:
+	test	ecx,4294967288
+	jnz	NEAR L$002maw_sse2_unrolled
+align	4
+L$004maw_sse2_loop:
+	movd	mm2,DWORD [edx]
+	movd	mm3,DWORD [eax]
+	pmuludq	mm2,mm0
+	lea	edx,[4+edx]
+	paddq	mm1,mm3
+	paddq	mm1,mm2
+	movd	DWORD [eax],mm1
+	sub	ecx,1
+	psrlq	mm1,32
+	lea	eax,[4+eax]
+	jnz	NEAR L$004maw_sse2_loop
+L$003maw_sse2_exit:
+	movd	eax,mm1
+	emms
+	ret
+align	16
+L$000maw_non_sse2:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	; 
+	xor	esi,esi
+	mov	edi,DWORD [20+esp]
+	mov	ecx,DWORD [28+esp]
+	mov	ebx,DWORD [24+esp]
+	and	ecx,4294967288
+	mov	ebp,DWORD [32+esp]
+	push	ecx
+	jz	NEAR L$005maw_finish
+align	16
+L$006maw_loop:
+	; Round 0
+	mov	eax,DWORD [ebx]
+	mul	ebp
+	add	eax,esi
+	adc	edx,0
+	add	eax,DWORD [edi]
+	adc	edx,0
+	mov	DWORD [edi],eax
+	mov	esi,edx
+	; Round 4
+	mov	eax,DWORD [4+ebx]
+	mul	ebp
+	add	eax,esi
+	adc	edx,0
+	add	eax,DWORD [4+edi]
+	adc	edx,0
+	mov	DWORD [4+edi],eax
+	mov	esi,edx
+	; Round 8
+	mov	eax,DWORD [8+ebx]
+	mul	ebp
+	add	eax,esi
+	adc	edx,0
+	add	eax,DWORD [8+edi]
+	adc	edx,0
+	mov	DWORD [8+edi],eax
+	mov	esi,edx
+	; Round 12
+	mov	eax,DWORD [12+ebx]
+	mul	ebp
+	add	eax,esi
+	adc	edx,0
+	add	eax,DWORD [12+edi]
+	adc	edx,0
+	mov	DWORD [12+edi],eax
+	mov	esi,edx
+	; Round 16
+	mov	eax,DWORD [16+ebx]
+	mul	ebp
+	add	eax,esi
+	adc	edx,0
+	add	eax,DWORD [16+edi]
+	adc	edx,0
+	mov	DWORD [16+edi],eax
+	mov	esi,edx
+	; Round 20
+	mov	eax,DWORD [20+ebx]
+	mul	ebp
+	add	eax,esi
+	adc	edx,0
+	add	eax,DWORD [20+edi]
+	adc	edx,0
+	mov	DWORD [20+edi],eax
+	mov	esi,edx
+	; Round 24
+	mov	eax,DWORD [24+ebx]
+	mul	ebp
+	add	eax,esi
+	adc	edx,0
+	add	eax,DWORD [24+edi]
+	adc	edx,0
+	mov	DWORD [24+edi],eax
+	mov	esi,edx
+	; Round 28
+	mov	eax,DWORD [28+ebx]
+	mul	ebp
+	add	eax,esi
+	adc	edx,0
+	add	eax,DWORD [28+edi]
+	adc	edx,0
+	mov	DWORD [28+edi],eax
+	mov	esi,edx
+	; 
+	sub	ecx,8
+	lea	ebx,[32+ebx]
+	lea	edi,[32+edi]
+	jnz	NEAR L$006maw_loop
+L$005maw_finish:
+	mov	ecx,DWORD [32+esp]
+	and	ecx,7
+	jnz	NEAR L$007maw_finish2
+	jmp	NEAR L$008maw_end
+L$007maw_finish2:
+	; Tail Round 0
+	mov	eax,DWORD [ebx]
+	mul	ebp
+	add	eax,esi
+	adc	edx,0
+	add	eax,DWORD [edi]
+	adc	edx,0
+	dec	ecx
+	mov	DWORD [edi],eax
+	mov	esi,edx
+	jz	NEAR L$008maw_end
+	; Tail Round 1
+	mov	eax,DWORD [4+ebx]
+	mul	ebp
+	add	eax,esi
+	adc	edx,0
+	add	eax,DWORD [4+edi]
+	adc	edx,0
+	dec	ecx
+	mov	DWORD [4+edi],eax
+	mov	esi,edx
+	jz	NEAR L$008maw_end
+	; Tail Round 2
+	mov	eax,DWORD [8+ebx]
+	mul	ebp
+	add	eax,esi
+	adc	edx,0
+	add	eax,DWORD [8+edi]
+	adc	edx,0
+	dec	ecx
+	mov	DWORD [8+edi],eax
+	mov	esi,edx
+	jz	NEAR L$008maw_end
+	; Tail Round 3
+	mov	eax,DWORD [12+ebx]
+	mul	ebp
+	add	eax,esi
+	adc	edx,0
+	add	eax,DWORD [12+edi]
+	adc	edx,0
+	dec	ecx
+	mov	DWORD [12+edi],eax
+	mov	esi,edx
+	jz	NEAR L$008maw_end
+	; Tail Round 4
+	mov	eax,DWORD [16+ebx]
+	mul	ebp
+	add	eax,esi
+	adc	edx,0
+	add	eax,DWORD [16+edi]
+	adc	edx,0
+	dec	ecx
+	mov	DWORD [16+edi],eax
+	mov	esi,edx
+	jz	NEAR L$008maw_end
+	; Tail Round 5
+	mov	eax,DWORD [20+ebx]
+	mul	ebp
+	add	eax,esi
+	adc	edx,0
+	add	eax,DWORD [20+edi]
+	adc	edx,0
+	dec	ecx
+	mov	DWORD [20+edi],eax
+	mov	esi,edx
+	jz	NEAR L$008maw_end
+	; Tail Round 6
+	mov	eax,DWORD [24+ebx]
+	mul	ebp
+	add	eax,esi
+	adc	edx,0
+	add	eax,DWORD [24+edi]
+	adc	edx,0
+	mov	DWORD [24+edi],eax
+	mov	esi,edx
+L$008maw_end:
+	mov	eax,esi
+	pop	ecx
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+global	_bn_mul_words
+align	16
+_bn_mul_words:
+L$_bn_mul_words_begin:
+	lea	eax,[_OPENSSL_ia32cap_P]
+	bt	DWORD [eax],26
+	jnc	NEAR L$009mw_non_sse2
+	mov	eax,DWORD [4+esp]
+	mov	edx,DWORD [8+esp]
+	mov	ecx,DWORD [12+esp]
+	movd	mm0,DWORD [16+esp]
+	pxor	mm1,mm1
+align	16
+L$010mw_sse2_loop:
+	movd	mm2,DWORD [edx]
+	pmuludq	mm2,mm0
+	lea	edx,[4+edx]
+	paddq	mm1,mm2
+	movd	DWORD [eax],mm1
+	sub	ecx,1
+	psrlq	mm1,32
+	lea	eax,[4+eax]
+	jnz	NEAR L$010mw_sse2_loop
+	movd	eax,mm1
+	emms
+	ret
+align	16
+L$009mw_non_sse2:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	; 
+	xor	esi,esi
+	mov	edi,DWORD [20+esp]
+	mov	ebx,DWORD [24+esp]
+	mov	ebp,DWORD [28+esp]
+	mov	ecx,DWORD [32+esp]
+	and	ebp,4294967288
+	jz	NEAR L$011mw_finish
+L$012mw_loop:
+	; Round 0
+	mov	eax,DWORD [ebx]
+	mul	ecx
+	add	eax,esi
+	adc	edx,0
+	mov	DWORD [edi],eax
+	mov	esi,edx
+	; Round 4
+	mov	eax,DWORD [4+ebx]
+	mul	ecx
+	add	eax,esi
+	adc	edx,0
+	mov	DWORD [4+edi],eax
+	mov	esi,edx
+	; Round 8
+	mov	eax,DWORD [8+ebx]
+	mul	ecx
+	add	eax,esi
+	adc	edx,0
+	mov	DWORD [8+edi],eax
+	mov	esi,edx
+	; Round 12
+	mov	eax,DWORD [12+ebx]
+	mul	ecx
+	add	eax,esi
+	adc	edx,0
+	mov	DWORD [12+edi],eax
+	mov	esi,edx
+	; Round 16
+	mov	eax,DWORD [16+ebx]
+	mul	ecx
+	add	eax,esi
+	adc	edx,0
+	mov	DWORD [16+edi],eax
+	mov	esi,edx
+	; Round 20
+	mov	eax,DWORD [20+ebx]
+	mul	ecx
+	add	eax,esi
+	adc	edx,0
+	mov	DWORD [20+edi],eax
+	mov	esi,edx
+	; Round 24
+	mov	eax,DWORD [24+ebx]
+	mul	ecx
+	add	eax,esi
+	adc	edx,0
+	mov	DWORD [24+edi],eax
+	mov	esi,edx
+	; Round 28
+	mov	eax,DWORD [28+ebx]
+	mul	ecx
+	add	eax,esi
+	adc	edx,0
+	mov	DWORD [28+edi],eax
+	mov	esi,edx
+	; 
+	add	ebx,32
+	add	edi,32
+	sub	ebp,8
+	jz	NEAR L$011mw_finish
+	jmp	NEAR L$012mw_loop
+L$011mw_finish:
+	mov	ebp,DWORD [28+esp]
+	and	ebp,7
+	jnz	NEAR L$013mw_finish2
+	jmp	NEAR L$014mw_end
+L$013mw_finish2:
+	; Tail Round 0
+	mov	eax,DWORD [ebx]
+	mul	ecx
+	add	eax,esi
+	adc	edx,0
+	mov	DWORD [edi],eax
+	mov	esi,edx
+	dec	ebp
+	jz	NEAR L$014mw_end
+	; Tail Round 1
+	mov	eax,DWORD [4+ebx]
+	mul	ecx
+	add	eax,esi
+	adc	edx,0
+	mov	DWORD [4+edi],eax
+	mov	esi,edx
+	dec	ebp
+	jz	NEAR L$014mw_end
+	; Tail Round 2
+	mov	eax,DWORD [8+ebx]
+	mul	ecx
+	add	eax,esi
+	adc	edx,0
+	mov	DWORD [8+edi],eax
+	mov	esi,edx
+	dec	ebp
+	jz	NEAR L$014mw_end
+	; Tail Round 3
+	mov	eax,DWORD [12+ebx]
+	mul	ecx
+	add	eax,esi
+	adc	edx,0
+	mov	DWORD [12+edi],eax
+	mov	esi,edx
+	dec	ebp
+	jz	NEAR L$014mw_end
+	; Tail Round 4
+	mov	eax,DWORD [16+ebx]
+	mul	ecx
+	add	eax,esi
+	adc	edx,0
+	mov	DWORD [16+edi],eax
+	mov	esi,edx
+	dec	ebp
+	jz	NEAR L$014mw_end
+	; Tail Round 5
+	mov	eax,DWORD [20+ebx]
+	mul	ecx
+	add	eax,esi
+	adc	edx,0
+	mov	DWORD [20+edi],eax
+	mov	esi,edx
+	dec	ebp
+	jz	NEAR L$014mw_end
+	; Tail Round 6
+	mov	eax,DWORD [24+ebx]
+	mul	ecx
+	add	eax,esi
+	adc	edx,0
+	mov	DWORD [24+edi],eax
+	mov	esi,edx
+L$014mw_end:
+	mov	eax,esi
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+global	_bn_sqr_words
+align	16
+_bn_sqr_words:
+L$_bn_sqr_words_begin:
+	lea	eax,[_OPENSSL_ia32cap_P]
+	bt	DWORD [eax],26
+	jnc	NEAR L$015sqr_non_sse2
+	mov	eax,DWORD [4+esp]
+	mov	edx,DWORD [8+esp]
+	mov	ecx,DWORD [12+esp]
+align	16
+L$016sqr_sse2_loop:
+	movd	mm0,DWORD [edx]
+	pmuludq	mm0,mm0
+	lea	edx,[4+edx]
+	movq	[eax],mm0
+	sub	ecx,1
+	lea	eax,[8+eax]
+	jnz	NEAR L$016sqr_sse2_loop
+	emms
+	ret
+align	16
+L$015sqr_non_sse2:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	; 
+	mov	esi,DWORD [20+esp]
+	mov	edi,DWORD [24+esp]
+	mov	ebx,DWORD [28+esp]
+	and	ebx,4294967288
+	jz	NEAR L$017sw_finish
+L$018sw_loop:
+	; Round 0
+	mov	eax,DWORD [edi]
+	mul	eax
+	mov	DWORD [esi],eax
+	mov	DWORD [4+esi],edx
+	; Round 4
+	mov	eax,DWORD [4+edi]
+	mul	eax
+	mov	DWORD [8+esi],eax
+	mov	DWORD [12+esi],edx
+	; Round 8
+	mov	eax,DWORD [8+edi]
+	mul	eax
+	mov	DWORD [16+esi],eax
+	mov	DWORD [20+esi],edx
+	; Round 12
+	mov	eax,DWORD [12+edi]
+	mul	eax
+	mov	DWORD [24+esi],eax
+	mov	DWORD [28+esi],edx
+	; Round 16
+	mov	eax,DWORD [16+edi]
+	mul	eax
+	mov	DWORD [32+esi],eax
+	mov	DWORD [36+esi],edx
+	; Round 20
+	mov	eax,DWORD [20+edi]
+	mul	eax
+	mov	DWORD [40+esi],eax
+	mov	DWORD [44+esi],edx
+	; Round 24
+	mov	eax,DWORD [24+edi]
+	mul	eax
+	mov	DWORD [48+esi],eax
+	mov	DWORD [52+esi],edx
+	; Round 28
+	mov	eax,DWORD [28+edi]
+	mul	eax
+	mov	DWORD [56+esi],eax
+	mov	DWORD [60+esi],edx
+	; 
+	add	edi,32
+	add	esi,64
+	sub	ebx,8
+	jnz	NEAR L$018sw_loop
+L$017sw_finish:
+	mov	ebx,DWORD [28+esp]
+	and	ebx,7
+	jz	NEAR L$019sw_end
+	; Tail Round 0
+	mov	eax,DWORD [edi]
+	mul	eax
+	mov	DWORD [esi],eax
+	dec	ebx
+	mov	DWORD [4+esi],edx
+	jz	NEAR L$019sw_end
+	; Tail Round 1
+	mov	eax,DWORD [4+edi]
+	mul	eax
+	mov	DWORD [8+esi],eax
+	dec	ebx
+	mov	DWORD [12+esi],edx
+	jz	NEAR L$019sw_end
+	; Tail Round 2
+	mov	eax,DWORD [8+edi]
+	mul	eax
+	mov	DWORD [16+esi],eax
+	dec	ebx
+	mov	DWORD [20+esi],edx
+	jz	NEAR L$019sw_end
+	; Tail Round 3
+	mov	eax,DWORD [12+edi]
+	mul	eax
+	mov	DWORD [24+esi],eax
+	dec	ebx
+	mov	DWORD [28+esi],edx
+	jz	NEAR L$019sw_end
+	; Tail Round 4
+	mov	eax,DWORD [16+edi]
+	mul	eax
+	mov	DWORD [32+esi],eax
+	dec	ebx
+	mov	DWORD [36+esi],edx
+	jz	NEAR L$019sw_end
+	; Tail Round 5
+	mov	eax,DWORD [20+edi]
+	mul	eax
+	mov	DWORD [40+esi],eax
+	dec	ebx
+	mov	DWORD [44+esi],edx
+	jz	NEAR L$019sw_end
+	; Tail Round 6
+	mov	eax,DWORD [24+edi]
+	mul	eax
+	mov	DWORD [48+esi],eax
+	mov	DWORD [52+esi],edx
+L$019sw_end:
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+global	_bn_div_words
+align	16
+_bn_div_words:
+L$_bn_div_words_begin:
+	mov	edx,DWORD [4+esp]
+	mov	eax,DWORD [8+esp]
+	mov	ecx,DWORD [12+esp]
+	div	ecx
+	ret
+global	_bn_add_words
+align	16
+_bn_add_words:
+L$_bn_add_words_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	; 
+	mov	ebx,DWORD [20+esp]
+	mov	esi,DWORD [24+esp]
+	mov	edi,DWORD [28+esp]
+	mov	ebp,DWORD [32+esp]
+	xor	eax,eax
+	and	ebp,4294967288
+	jz	NEAR L$020aw_finish
+L$021aw_loop:
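+	; mov does not touch the flags, so the "mov eax,0 / adc eax,eax" pair
+	; below captures the incoming carry in eax without losing it.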
+	; Round 0
+	mov	ecx,DWORD [esi]
+	mov	edx,DWORD [edi]
+	add	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	add	ecx,edx
+	adc	eax,0
+	mov	DWORD [ebx],ecx
+	; Round 1
+	mov	ecx,DWORD [4+esi]
+	mov	edx,DWORD [4+edi]
+	add	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	add	ecx,edx
+	adc	eax,0
+	mov	DWORD [4+ebx],ecx
+	; Round 2
+	mov	ecx,DWORD [8+esi]
+	mov	edx,DWORD [8+edi]
+	add	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	add	ecx,edx
+	adc	eax,0
+	mov	DWORD [8+ebx],ecx
+	; Round 3
+	mov	ecx,DWORD [12+esi]
+	mov	edx,DWORD [12+edi]
+	add	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	add	ecx,edx
+	adc	eax,0
+	mov	DWORD [12+ebx],ecx
+	; Round 4
+	mov	ecx,DWORD [16+esi]
+	mov	edx,DWORD [16+edi]
+	add	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	add	ecx,edx
+	adc	eax,0
+	mov	DWORD [16+ebx],ecx
+	; Round 5
+	mov	ecx,DWORD [20+esi]
+	mov	edx,DWORD [20+edi]
+	add	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	add	ecx,edx
+	adc	eax,0
+	mov	DWORD [20+ebx],ecx
+	; Round 6
+	mov	ecx,DWORD [24+esi]
+	mov	edx,DWORD [24+edi]
+	add	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	add	ecx,edx
+	adc	eax,0
+	mov	DWORD [24+ebx],ecx
+	; Round 7
+	mov	ecx,DWORD [28+esi]
+	mov	edx,DWORD [28+edi]
+	add	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	add	ecx,edx
+	adc	eax,0
+	mov	DWORD [28+ebx],ecx
+	; 
+	add	esi,32
+	add	edi,32
+	add	ebx,32
+	sub	ebp,8
+	jnz	NEAR L$021aw_loop
+L$020aw_finish:
+	mov	ebp,DWORD [32+esp]
+	and	ebp,7
+	jz	NEAR L$022aw_end
+	; Tail Round 0
+	mov	ecx,DWORD [esi]
+	mov	edx,DWORD [edi]
+	add	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	add	ecx,edx
+	adc	eax,0
+	dec	ebp
+	mov	DWORD [ebx],ecx
+	jz	NEAR L$022aw_end
+	; Tail Round 1
+	mov	ecx,DWORD [4+esi]
+	mov	edx,DWORD [4+edi]
+	add	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	add	ecx,edx
+	adc	eax,0
+	dec	ebp
+	mov	DWORD [4+ebx],ecx
+	jz	NEAR L$022aw_end
+	; Tail Round 2
+	mov	ecx,DWORD [8+esi]
+	mov	edx,DWORD [8+edi]
+	add	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	add	ecx,edx
+	adc	eax,0
+	dec	ebp
+	mov	DWORD [8+ebx],ecx
+	jz	NEAR L$022aw_end
+	; Tail Round 3
+	mov	ecx,DWORD [12+esi]
+	mov	edx,DWORD [12+edi]
+	add	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	add	ecx,edx
+	adc	eax,0
+	dec	ebp
+	mov	DWORD [12+ebx],ecx
+	jz	NEAR L$022aw_end
+	; Tail Round 4
+	mov	ecx,DWORD [16+esi]
+	mov	edx,DWORD [16+edi]
+	add	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	add	ecx,edx
+	adc	eax,0
+	dec	ebp
+	mov	DWORD [16+ebx],ecx
+	jz	NEAR L$022aw_end
+	; Tail Round 5
+	mov	ecx,DWORD [20+esi]
+	mov	edx,DWORD [20+edi]
+	add	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	add	ecx,edx
+	adc	eax,0
+	dec	ebp
+	mov	DWORD [20+ebx],ecx
+	jz	NEAR L$022aw_end
+	; Tail Round 6
+	mov	ecx,DWORD [24+esi]
+	mov	edx,DWORD [24+edi]
+	add	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	add	ecx,edx
+	adc	eax,0
+	mov	DWORD [24+ebx],ecx
+L$022aw_end:
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+global	_bn_sub_words
+align	16
+_bn_sub_words:
+L$_bn_sub_words_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	; 
+	mov	ebx,DWORD [20+esp]
+	mov	esi,DWORD [24+esp]
+	mov	edi,DWORD [28+esp]
+	mov	ebp,DWORD [32+esp]
+	xor	eax,eax
+	and	ebp,4294967288
+	jz	NEAR L$023aw_finish
+L$024aw_loop:
+	; Round 0
+	mov	ecx,DWORD [esi]
+	mov	edx,DWORD [edi]
+	sub	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	sub	ecx,edx
+	adc	eax,0
+	mov	DWORD [ebx],ecx
+	; Round 1
+	mov	ecx,DWORD [4+esi]
+	mov	edx,DWORD [4+edi]
+	sub	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	sub	ecx,edx
+	adc	eax,0
+	mov	DWORD [4+ebx],ecx
+	; Round 2
+	mov	ecx,DWORD [8+esi]
+	mov	edx,DWORD [8+edi]
+	sub	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	sub	ecx,edx
+	adc	eax,0
+	mov	DWORD [8+ebx],ecx
+	; Round 3
+	mov	ecx,DWORD [12+esi]
+	mov	edx,DWORD [12+edi]
+	sub	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	sub	ecx,edx
+	adc	eax,0
+	mov	DWORD [12+ebx],ecx
+	; Round 4
+	mov	ecx,DWORD [16+esi]
+	mov	edx,DWORD [16+edi]
+	sub	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	sub	ecx,edx
+	adc	eax,0
+	mov	DWORD [16+ebx],ecx
+	; Round 5
+	mov	ecx,DWORD [20+esi]
+	mov	edx,DWORD [20+edi]
+	sub	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	sub	ecx,edx
+	adc	eax,0
+	mov	DWORD [20+ebx],ecx
+	; Round 6
+	mov	ecx,DWORD [24+esi]
+	mov	edx,DWORD [24+edi]
+	sub	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	sub	ecx,edx
+	adc	eax,0
+	mov	DWORD [24+ebx],ecx
+	; Round 7
+	mov	ecx,DWORD [28+esi]
+	mov	edx,DWORD [28+edi]
+	sub	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	sub	ecx,edx
+	adc	eax,0
+	mov	DWORD [28+ebx],ecx
+	; 
+	add	esi,32
+	add	edi,32
+	add	ebx,32
+	sub	ebp,8
+	jnz	NEAR L$024aw_loop
+L$023aw_finish:
+	mov	ebp,DWORD [32+esp]
+	and	ebp,7
+	jz	NEAR L$025aw_end
+	; Tail Round 0
+	mov	ecx,DWORD [esi]
+	mov	edx,DWORD [edi]
+	sub	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	sub	ecx,edx
+	adc	eax,0
+	dec	ebp
+	mov	DWORD [ebx],ecx
+	jz	NEAR L$025aw_end
+	; Tail Round 1
+	mov	ecx,DWORD [4+esi]
+	mov	edx,DWORD [4+edi]
+	sub	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	sub	ecx,edx
+	adc	eax,0
+	dec	ebp
+	mov	DWORD [4+ebx],ecx
+	jz	NEAR L$025aw_end
+	; Tail Round 2
+	mov	ecx,DWORD [8+esi]
+	mov	edx,DWORD [8+edi]
+	sub	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	sub	ecx,edx
+	adc	eax,0
+	dec	ebp
+	mov	DWORD [8+ebx],ecx
+	jz	NEAR L$025aw_end
+	; Tail Round 3
+	mov	ecx,DWORD [12+esi]
+	mov	edx,DWORD [12+edi]
+	sub	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	sub	ecx,edx
+	adc	eax,0
+	dec	ebp
+	mov	DWORD [12+ebx],ecx
+	jz	NEAR L$025aw_end
+	; Tail Round 4
+	mov	ecx,DWORD [16+esi]
+	mov	edx,DWORD [16+edi]
+	sub	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	sub	ecx,edx
+	adc	eax,0
+	dec	ebp
+	mov	DWORD [16+ebx],ecx
+	jz	NEAR L$025aw_end
+	; Tail Round 5
+	mov	ecx,DWORD [20+esi]
+	mov	edx,DWORD [20+edi]
+	sub	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	sub	ecx,edx
+	adc	eax,0
+	dec	ebp
+	mov	DWORD [20+ebx],ecx
+	jz	NEAR L$025aw_end
+	; Tail Round 6
+	mov	ecx,DWORD [24+esi]
+	mov	edx,DWORD [24+edi]
+	sub	ecx,eax
+	mov	eax,0
+	adc	eax,eax
+	sub	ecx,edx
+	adc	eax,0
+	mov	DWORD [24+ebx],ecx
+L$025aw_end:
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+segment	.bss
+common	_OPENSSL_ia32cap_P 16
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/bn-armv8-apple.S b/gen/bcm/bn-armv8-apple.S
new file mode 100644
index 0000000..5e3471a
--- /dev/null
+++ b/gen/bcm/bn-armv8-apple.S
@@ -0,0 +1,89 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+#include <openssl/arm_arch.h>
+
+.text
+
+// BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+//                       size_t num);
+
+.globl	_bn_add_words
+.private_extern	_bn_add_words
+.align	4
+_bn_add_words:
+	AARCH64_VALID_CALL_TARGET
+	# Clear the carry flag.
+	cmn	xzr, xzr
+
+	# aarch64 can load two registers at a time, so we do two loop iterations at
+	# a time. Split x3 = 2 * x8 + x3. This allows loop
+	# operations to use CBNZ without clobbering the carry flag.
+	lsr	x8, x3, #1
+	and	x3, x3, #1
+
+	cbz	x8, Ladd_tail
+Ladd_loop:
+	ldp	x4, x5, [x1], #16
+	ldp	x6, x7, [x2], #16
+	sub	x8, x8, #1
+	adcs	x4, x4, x6
+	adcs	x5, x5, x7
+	stp	x4, x5, [x0], #16
+	cbnz	x8, Ladd_loop
+
+Ladd_tail:
+	cbz	x3, Ladd_exit
+	ldr	x4, [x1], #8
+	ldr	x6, [x2], #8
+	adcs	x4, x4, x6
+	str	x4, [x0], #8
+
+Ladd_exit:
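+	# Materialize the final carry as the return value.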
+	cset	x0, cs
+	ret
+
+
+// BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+//                       size_t num);
+
+.globl	_bn_sub_words
+.private_extern	_bn_sub_words
+.align	4
+_bn_sub_words:
+	AARCH64_VALID_CALL_TARGET
+	# Set the carry flag. Arm's borrow bit is flipped from the carry flag,
+	# so we want C = 1 here.
+	cmp	xzr, xzr
+
+	# aarch64 can load two registers at a time, so we do two loop iterations at
+	# a time. Split x3 = 2 * x8 + x3. This allows loop
+	# operations to use CBNZ without clobbering the carry flag.
+	lsr	x8, x3, #1
+	and	x3, x3, #1
+
+	cbz	x8, Lsub_tail
+Lsub_loop:
+	ldp	x4, x5, [x1], #16
+	ldp	x6, x7, [x2], #16
+	sub	x8, x8, #1
+	sbcs	x4, x4, x6
+	sbcs	x5, x5, x7
+	stp	x4, x5, [x0], #16
+	cbnz	x8, Lsub_loop
+
+Lsub_tail:
+	cbz	x3, Lsub_exit
+	ldr	x4, [x1], #8
+	ldr	x6, [x2], #8
+	sbcs	x4, x4, x6
+	str	x4, [x0], #8
+
+Lsub_exit:
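+	# The borrow is the inverse of C, so this returns 1 when the
+	# subtraction borrowed.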
+	cset	x0, cc
+	ret
+
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/gen/bcm/bn-armv8-linux.S b/gen/bcm/bn-armv8-linux.S
new file mode 100644
index 0000000..2b8823a
--- /dev/null
+++ b/gen/bcm/bn-armv8-linux.S
@@ -0,0 +1,89 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+.text
+
+// BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+//                       size_t num);
+.type	bn_add_words, %function
+.globl	bn_add_words
+.hidden	bn_add_words
+.align	4
+bn_add_words:
+	AARCH64_VALID_CALL_TARGET
+	# Clear the carry flag.
+	cmn	xzr, xzr
+
+	# aarch64 can load two registers at a time, so we do two loop iterations at
+	# a time. Split x3 = 2 * x8 + x3. This allows loop
+	# operations to use CBNZ without clobbering the carry flag.
+	lsr	x8, x3, #1
+	and	x3, x3, #1
+
+	cbz	x8, .Ladd_tail
+.Ladd_loop:
+	ldp	x4, x5, [x1], #16
+	ldp	x6, x7, [x2], #16
+	sub	x8, x8, #1
+	adcs	x4, x4, x6
+	adcs	x5, x5, x7
+	stp	x4, x5, [x0], #16
+	cbnz	x8, .Ladd_loop
+
+.Ladd_tail:
+	cbz	x3, .Ladd_exit
+	ldr	x4, [x1], #8
+	ldr	x6, [x2], #8
+	adcs	x4, x4, x6
+	str	x4, [x0], #8
+
+.Ladd_exit:
+	cset	x0, cs
+	ret
+.size	bn_add_words,.-bn_add_words
+
+// BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+//                       size_t num);
+.type	bn_sub_words, %function
+.globl	bn_sub_words
+.hidden	bn_sub_words
+.align	4
+bn_sub_words:
+	AARCH64_VALID_CALL_TARGET
+	# Set the carry flag. Arm's borrow bit is flipped from the carry flag,
+	# so we want C = 1 here.
+	cmp	xzr, xzr
+
+	# aarch64 can load two registers at a time, so we do two loop iterations at
+	# a time. Split x3 = 2 * x8 + x3. This allows loop
+	# operations to use CBNZ without clobbering the carry flag.
+	lsr	x8, x3, #1
+	and	x3, x3, #1
+
+	cbz	x8, .Lsub_tail
+.Lsub_loop:
+	ldp	x4, x5, [x1], #16
+	ldp	x6, x7, [x2], #16
+	sub	x8, x8, #1
+	sbcs	x4, x4, x6
+	sbcs	x5, x5, x7
+	stp	x4, x5, [x0], #16
+	cbnz	x8, .Lsub_loop
+
+.Lsub_tail:
+	cbz	x3, .Lsub_exit
+	ldr	x4, [x1], #8
+	ldr	x6, [x2], #8
+	sbcs	x4, x4, x6
+	str	x4, [x0], #8
+
+.Lsub_exit:
+	cset	x0, cc
+	ret
+.size	bn_sub_words,.-bn_sub_words
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/bcm/bn-armv8-win.S b/gen/bcm/bn-armv8-win.S
new file mode 100644
index 0000000..af97080
--- /dev/null
+++ b/gen/bcm/bn-armv8-win.S
@@ -0,0 +1,89 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+#include <openssl/arm_arch.h>
+
+.text
+
+// BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+//                       size_t num);
+
+.globl	bn_add_words
+
+.align	4
+bn_add_words:
+	AARCH64_VALID_CALL_TARGET
+	# Clear the carry flag.
+	cmn	xzr, xzr
+
+	# aarch64 can load two registers at a time, so we do two loop iterations at
+	# a time. Split x3 = 2 * x8 + x3. This allows loop
+	# operations to use CBNZ without clobbering the carry flag.
+	lsr	x8, x3, #1
+	and	x3, x3, #1
+
+	cbz	x8, Ladd_tail
+Ladd_loop:
+	ldp	x4, x5, [x1], #16
+	ldp	x6, x7, [x2], #16
+	sub	x8, x8, #1
+	adcs	x4, x4, x6
+	adcs	x5, x5, x7
+	stp	x4, x5, [x0], #16
+	cbnz	x8, Ladd_loop
+
+Ladd_tail:
+	cbz	x3, Ladd_exit
+	ldr	x4, [x1], #8
+	ldr	x6, [x2], #8
+	adcs	x4, x4, x6
+	str	x4, [x0], #8
+
+Ladd_exit:
+	cset	x0, cs
+	ret
+
+
+// BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+//                       size_t num);
+
+.globl	bn_sub_words
+
+.align	4
+bn_sub_words:
+	AARCH64_VALID_CALL_TARGET
+	# Set the carry flag. Arm's borrow bit is flipped from the carry flag,
+	# so we want C = 1 here.
+	cmp	xzr, xzr
+
+	# aarch64 can load two registers at a time, so we do two loop iterations at
+	# a time. Split x3 = 2 * x8 + x3. This allows loop
+	# operations to use CBNZ without clobbering the carry flag.
+	lsr	x8, x3, #1
+	and	x3, x3, #1
+
+	cbz	x8, Lsub_tail
+Lsub_loop:
+	ldp	x4, x5, [x1], #16
+	ldp	x6, x7, [x2], #16
+	sub	x8, x8, #1
+	sbcs	x4, x4, x6
+	sbcs	x5, x5, x7
+	stp	x4, x5, [x0], #16
+	cbnz	x8, Lsub_loop
+
+Lsub_tail:
+	cbz	x3, Lsub_exit
+	ldr	x4, [x1], #8
+	ldr	x6, [x2], #8
+	sbcs	x4, x4, x6
+	str	x4, [x0], #8
+
+Lsub_exit:
+	cset	x0, cc
+	ret
+
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/bcm/bsaes-armv7-linux.S b/gen/bcm/bsaes-armv7-linux.S
new file mode 100644
index 0000000..01a9ead
--- /dev/null
+++ b/gen/bcm/bsaes-armv7-linux.S
@@ -0,0 +1,1517 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
+@ Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+@
+@ Licensed under the OpenSSL license (the "License").  You may not use
+@ this file except in compliance with the License.  You can obtain a copy
+@ in the file LICENSE in the source distribution or at
+@ https://www.openssl.org/source/license.html
+
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
+@ of Linaro. Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ Bit-sliced AES for ARM NEON
+@
+@ February 2012.
+@
+@ This implementation is direct adaptation of bsaes-x86_64 module for
+@ ARM NEON. Except that this module is endian-neutral [in sense that
+@ it can be compiled for either endianness] by courtesy of vld1.8's
+@ neutrality. Initial version doesn't implement interface to OpenSSL,
+@ only low-level primitives and unsupported entry points, just enough
+@ to collect performance results, which for Cortex-A8 core are:
+@
+@ encrypt	19.5 cycles per byte processed with 128-bit key
+@ decrypt	22.1 cycles per byte processed with 128-bit key
+@ key conv.	440  cycles per 128-bit key/0.18 of 8x block
+@
+@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
+@ which is [much] worse than anticipated (for further details see
+@ http://www.openssl.org/~appro/Snapdragon-S4.html).
+@
+@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
+@ manages in 20.0 cycles].
+@
+@ When comparing to x86_64 results keep in mind that NEON unit is
+@ [mostly] single-issue and thus can't [fully] benefit from
+@ instruction-level parallelism. And when comparing to aes-armv4
+@ results keep in mind key schedule conversion overhead (see
+@ bsaes-x86_64.pl for further details)...
+@
+@						<appro@openssl.org>
+
+@ April-August 2013
+@ Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard.
+
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+
+# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
+# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
+# define VFP_ABI_FRAME	0x40
+#else
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+# define VFP_ABI_FRAME	0
+# define BSAES_ASM_EXTENDED_KEY
+# define XTS_CHAIN_TWEAK
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+#ifdef __thumb__
+# define adrl adr
+#endif
+
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.text
+.syntax	unified 	@ ARMv7-capable assembler is expected to handle this
+#if defined(__thumb2__) && !defined(__APPLE__)
+.thumb
+#else
+.code	32
+# undef __thumb2__
+#endif
+
+.type	_bsaes_decrypt8,%function
+.align	4
+_bsaes_decrypt8:
+	adr	r6,.
+	vldmia	r4!, {q9}		@ round 0 key
+#if defined(__thumb2__) || defined(__APPLE__)
+	adr	r6,.LM0ISR
+#else
+	add	r6,r6,#.LM0ISR-_bsaes_decrypt8
+#endif
+
+	vldmia	r6!, {q8}		@ .LM0ISR
+	veor	q10, q0, q9	@ xor with round0 key
+	veor	q11, q1, q9
+	vtbl.8	d0, {q10}, d16
+	vtbl.8	d1, {q10}, d17
+	veor	q12, q2, q9
+	vtbl.8	d2, {q11}, d16
+	vtbl.8	d3, {q11}, d17
+	veor	q13, q3, q9
+	vtbl.8	d4, {q12}, d16
+	vtbl.8	d5, {q12}, d17
+	veor	q14, q4, q9
+	vtbl.8	d6, {q13}, d16
+	vtbl.8	d7, {q13}, d17
+	veor	q15, q5, q9
+	vtbl.8	d8, {q14}, d16
+	vtbl.8	d9, {q14}, d17
+	veor	q10, q6, q9
+	vtbl.8	d10, {q15}, d16
+	vtbl.8	d11, {q15}, d17
+	veor	q11, q7, q9
+	vtbl.8	d12, {q10}, d16
+	vtbl.8	d13, {q10}, d17
+	vtbl.8	d14, {q11}, d16
+	vtbl.8	d15, {q11}, d17
+	vmov.i8	q8,#0x55			@ compose .LBS0
+	vmov.i8	q9,#0x33			@ compose .LBS1
+	vshr.u64	q10, q6, #1
+	vshr.u64	q11, q4, #1
+	veor	q10, q10, q7
+	veor	q11, q11, q5
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #1
+	veor	q5, q5, q11
+	vshl.u64	q11, q11, #1
+	veor	q6, q6, q10
+	veor	q4, q4, q11
+	vshr.u64	q10, q2, #1
+	vshr.u64	q11, q0, #1
+	veor	q10, q10, q3
+	veor	q11, q11, q1
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q3, q3, q10
+	vshl.u64	q10, q10, #1
+	veor	q1, q1, q11
+	vshl.u64	q11, q11, #1
+	veor	q2, q2, q10
+	veor	q0, q0, q11
+	vmov.i8	q8,#0x0f			@ compose .LBS2
+	vshr.u64	q10, q5, #2
+	vshr.u64	q11, q4, #2
+	veor	q10, q10, q7
+	veor	q11, q11, q6
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #2
+	veor	q6, q6, q11
+	vshl.u64	q11, q11, #2
+	veor	q5, q5, q10
+	veor	q4, q4, q11
+	vshr.u64	q10, q1, #2
+	vshr.u64	q11, q0, #2
+	veor	q10, q10, q3
+	veor	q11, q11, q2
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q3, q3, q10
+	vshl.u64	q10, q10, #2
+	veor	q2, q2, q11
+	vshl.u64	q11, q11, #2
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	vshr.u64	q10, q3, #4
+	vshr.u64	q11, q2, #4
+	veor	q10, q10, q7
+	veor	q11, q11, q6
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #4
+	veor	q6, q6, q11
+	vshl.u64	q11, q11, #4
+	veor	q3, q3, q10
+	veor	q2, q2, q11
+	vshr.u64	q10, q1, #4
+	vshr.u64	q11, q0, #4
+	veor	q10, q10, q5
+	veor	q11, q11, q4
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #4
+	veor	q4, q4, q11
+	vshl.u64	q11, q11, #4
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	sub	r5,r5,#1
+	b	.Ldec_sbox
+.align	4
+.Ldec_loop:
+	vldmia	r4!, {q8,q9,q10,q11}
+	veor	q8, q8, q0
+	veor	q9, q9, q1
+	vtbl.8	d0, {q8}, d24
+	vtbl.8	d1, {q8}, d25
+	vldmia	r4!, {q8}
+	veor	q10, q10, q2
+	vtbl.8	d2, {q9}, d24
+	vtbl.8	d3, {q9}, d25
+	vldmia	r4!, {q9}
+	veor	q11, q11, q3
+	vtbl.8	d4, {q10}, d24
+	vtbl.8	d5, {q10}, d25
+	vldmia	r4!, {q10}
+	vtbl.8	d6, {q11}, d24
+	vtbl.8	d7, {q11}, d25
+	vldmia	r4!, {q11}
+	veor	q8, q8, q4
+	veor	q9, q9, q5
+	vtbl.8	d8, {q8}, d24
+	vtbl.8	d9, {q8}, d25
+	veor	q10, q10, q6
+	vtbl.8	d10, {q9}, d24
+	vtbl.8	d11, {q9}, d25
+	veor	q11, q11, q7
+	vtbl.8	d12, {q10}, d24
+	vtbl.8	d13, {q10}, d25
+	vtbl.8	d14, {q11}, d24
+	vtbl.8	d15, {q11}, d25
+.Ldec_sbox:
+	veor	q1, q1, q4
+	veor	q3, q3, q4
+
+	veor	q4, q4, q7
+	veor	q1, q1, q6
+	veor	q2, q2, q7
+	veor	q6, q6, q4
+
+	veor	q0, q0, q1
+	veor	q2, q2, q5
+	veor	q7, q7, q6
+	veor	q3, q3, q0
+	veor	q5, q5, q0
+	veor	q1, q1, q3
+	veor	q11, q3, q0
+	veor	q10, q7, q4
+	veor	q9, q1, q6
+	veor	q13, q4, q0
+	vmov	q8, q10
+	veor	q12, q5, q2
+
+	vorr	q10, q10, q9
+	veor	q15, q11, q8
+	vand	q14, q11, q12
+	vorr	q11, q11, q12
+	veor	q12, q12, q9
+	vand	q8, q8, q9
+	veor	q9, q6, q2
+	vand	q15, q15, q12
+	vand	q13, q13, q9
+	veor	q9, q3, q7
+	veor	q12, q1, q5
+	veor	q11, q11, q13
+	veor	q10, q10, q13
+	vand	q13, q9, q12
+	vorr	q9, q9, q12
+	veor	q11, q11, q15
+	veor	q8, q8, q13
+	veor	q10, q10, q14
+	veor	q9, q9, q15
+	veor	q8, q8, q14
+	vand	q12, q4, q6
+	veor	q9, q9, q14
+	vand	q13, q0, q2
+	vand	q14, q7, q1
+	vorr	q15, q3, q5
+	veor	q11, q11, q12
+	veor	q9, q9, q14
+	veor	q8, q8, q15
+	veor	q10, q10, q13
+
+	@ Inv_GF16 	0, 	1, 	2, 	3, s0, s1, s2, s3
+
+	@ new smaller inversion
+
+	vand	q14, q11, q9
+	vmov	q12, q8
+
+	veor	q13, q10, q14
+	veor	q15, q8, q14
+	veor	q14, q8, q14	@ q14=q15
+
+	vbsl	q13, q9, q8
+	vbsl	q15, q11, q10
+	veor	q11, q11, q10
+
+	vbsl	q12, q13, q14
+	vbsl	q8, q14, q13
+
+	vand	q14, q12, q15
+	veor	q9, q9, q8
+
+	veor	q14, q14, q11
+	veor	q12, q5, q2
+	veor	q8, q1, q6
+	veor	q10, q15, q14
+	vand	q10, q10, q5
+	veor	q5, q5, q1
+	vand	q11, q1, q15
+	vand	q5, q5, q14
+	veor	q1, q11, q10
+	veor	q5, q5, q11
+	veor	q15, q15, q13
+	veor	q14, q14, q9
+	veor	q11, q15, q14
+	veor	q10, q13, q9
+	vand	q11, q11, q12
+	vand	q10, q10, q2
+	veor	q12, q12, q8
+	veor	q2, q2, q6
+	vand	q8, q8, q15
+	vand	q6, q6, q13
+	vand	q12, q12, q14
+	vand	q2, q2, q9
+	veor	q8, q8, q12
+	veor	q2, q2, q6
+	veor	q12, q12, q11
+	veor	q6, q6, q10
+	veor	q5, q5, q12
+	veor	q2, q2, q12
+	veor	q1, q1, q8
+	veor	q6, q6, q8
+
+	veor	q12, q3, q0
+	veor	q8, q7, q4
+	veor	q11, q15, q14
+	veor	q10, q13, q9
+	vand	q11, q11, q12
+	vand	q10, q10, q0
+	veor	q12, q12, q8
+	veor	q0, q0, q4
+	vand	q8, q8, q15
+	vand	q4, q4, q13
+	vand	q12, q12, q14
+	vand	q0, q0, q9
+	veor	q8, q8, q12
+	veor	q0, q0, q4
+	veor	q12, q12, q11
+	veor	q4, q4, q10
+	veor	q15, q15, q13
+	veor	q14, q14, q9
+	veor	q10, q15, q14
+	vand	q10, q10, q3
+	veor	q3, q3, q7
+	vand	q11, q7, q15
+	vand	q3, q3, q14
+	veor	q7, q11, q10
+	veor	q3, q3, q11
+	veor	q3, q3, q12
+	veor	q0, q0, q12
+	veor	q7, q7, q8
+	veor	q4, q4, q8
+	veor	q1, q1, q7
+	veor	q6, q6, q5
+
+	veor	q4, q4, q1
+	veor	q2, q2, q7
+	veor	q5, q5, q7
+	veor	q4, q4, q2
+	veor	q7, q7, q0
+	veor	q4, q4, q5
+	veor	q3, q3, q6
+	veor	q6, q6, q1
+	veor	q3, q3, q4
+
+	veor	q4, q4, q0
+	veor	q7, q7, q3
+	subs	r5,r5,#1
+	bcc	.Ldec_done
+	@ multiplication by 0x05-0x00-0x04-0x00
+	vext.8	q8, q0, q0, #8
+	vext.8	q14, q3, q3, #8
+	vext.8	q15, q5, q5, #8
+	veor	q8, q8, q0
+	vext.8	q9, q1, q1, #8
+	veor	q14, q14, q3
+	vext.8	q10, q6, q6, #8
+	veor	q15, q15, q5
+	vext.8	q11, q4, q4, #8
+	veor	q9, q9, q1
+	vext.8	q12, q2, q2, #8
+	veor	q10, q10, q6
+	vext.8	q13, q7, q7, #8
+	veor	q11, q11, q4
+	veor	q12, q12, q2
+	veor	q13, q13, q7
+
+	veor	q0, q0, q14
+	veor	q1, q1, q14
+	veor	q6, q6, q8
+	veor	q2, q2, q10
+	veor	q4, q4, q9
+	veor	q1, q1, q15
+	veor	q6, q6, q15
+	veor	q2, q2, q14
+	veor	q7, q7, q11
+	veor	q4, q4, q14
+	veor	q3, q3, q12
+	veor	q2, q2, q15
+	veor	q7, q7, q15
+	veor	q5, q5, q13
+	vext.8	q8, q0, q0, #12	@ x0 <<< 32
+	vext.8	q9, q1, q1, #12
+	veor	q0, q0, q8		@ x0 ^ (x0 <<< 32)
+	vext.8	q10, q6, q6, #12
+	veor	q1, q1, q9
+	vext.8	q11, q4, q4, #12
+	veor	q6, q6, q10
+	vext.8	q12, q2, q2, #12
+	veor	q4, q4, q11
+	vext.8	q13, q7, q7, #12
+	veor	q2, q2, q12
+	vext.8	q14, q3, q3, #12
+	veor	q7, q7, q13
+	vext.8	q15, q5, q5, #12
+	veor	q3, q3, q14
+
+	veor	q9, q9, q0
+	veor	q5, q5, q15
+	vext.8	q0, q0, q0, #8		@ (x0 ^ (x0 <<< 32)) <<< 64)
+	veor	q10, q10, q1
+	veor	q8, q8, q5
+	veor	q9, q9, q5
+	vext.8	q1, q1, q1, #8
+	veor	q13, q13, q2
+	veor	q0, q0, q8
+	veor	q14, q14, q7
+	veor	q1, q1, q9
+	vext.8	q8, q2, q2, #8
+	veor	q12, q12, q4
+	vext.8	q9, q7, q7, #8
+	veor	q15, q15, q3
+	vext.8	q2, q4, q4, #8
+	veor	q11, q11, q6
+	vext.8	q7, q5, q5, #8
+	veor	q12, q12, q5
+	vext.8	q4, q3, q3, #8
+	veor	q11, q11, q5
+	vext.8	q3, q6, q6, #8
+	veor	q5, q9, q13
+	veor	q11, q11, q2
+	veor	q7, q7, q15
+	veor	q6, q4, q14
+	veor	q4, q8, q12
+	veor	q2, q3, q10
+	vmov	q3, q11
+	 @ vmov	q5, q9
+	vldmia	r6, {q12}		@ .LISR
+	ite	eq				@ Thumb2 thing, sanity check in ARM
+	addeq	r6,r6,#0x10
+	bne	.Ldec_loop
+	vldmia	r6, {q12}		@ .LISRM0
+	b	.Ldec_loop
+.align	4
+.Ldec_done:
+	vmov.i8	q8,#0x55			@ compose .LBS0
+	vmov.i8	q9,#0x33			@ compose .LBS1
+	vshr.u64	q10, q3, #1
+	vshr.u64	q11, q2, #1
+	veor	q10, q10, q5
+	veor	q11, q11, q7
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #1
+	veor	q7, q7, q11
+	vshl.u64	q11, q11, #1
+	veor	q3, q3, q10
+	veor	q2, q2, q11
+	vshr.u64	q10, q6, #1
+	vshr.u64	q11, q0, #1
+	veor	q10, q10, q4
+	veor	q11, q11, q1
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q4, q4, q10
+	vshl.u64	q10, q10, #1
+	veor	q1, q1, q11
+	vshl.u64	q11, q11, #1
+	veor	q6, q6, q10
+	veor	q0, q0, q11
+	vmov.i8	q8,#0x0f			@ compose .LBS2
+	vshr.u64	q10, q7, #2
+	vshr.u64	q11, q2, #2
+	veor	q10, q10, q5
+	veor	q11, q11, q3
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #2
+	veor	q3, q3, q11
+	vshl.u64	q11, q11, #2
+	veor	q7, q7, q10
+	veor	q2, q2, q11
+	vshr.u64	q10, q1, #2
+	vshr.u64	q11, q0, #2
+	veor	q10, q10, q4
+	veor	q11, q11, q6
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q4, q4, q10
+	vshl.u64	q10, q10, #2
+	veor	q6, q6, q11
+	vshl.u64	q11, q11, #2
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	vshr.u64	q10, q4, #4
+	vshr.u64	q11, q6, #4
+	veor	q10, q10, q5
+	veor	q11, q11, q3
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #4
+	veor	q3, q3, q11
+	vshl.u64	q11, q11, #4
+	veor	q4, q4, q10
+	veor	q6, q6, q11
+	vshr.u64	q10, q1, #4
+	vshr.u64	q11, q0, #4
+	veor	q10, q10, q7
+	veor	q11, q11, q2
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #4
+	veor	q2, q2, q11
+	vshl.u64	q11, q11, #4
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	vldmia	r4, {q8}			@ last round key
+	veor	q6, q6, q8
+	veor	q4, q4, q8
+	veor	q2, q2, q8
+	veor	q7, q7, q8
+	veor	q3, q3, q8
+	veor	q5, q5, q8
+	veor	q0, q0, q8
+	veor	q1, q1, q8
+	bx	lr
+.size	_bsaes_decrypt8,.-_bsaes_decrypt8
+
+.type	_bsaes_const,%object
+.align	6
+_bsaes_const:
+.LM0ISR:@ InvShiftRows constants
+.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISR:
+.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
+.LISRM0:
+.quad	0x01040b0e0205080f, 0x0306090c00070a0d
+.LM0SR:@ ShiftRows constants
+.quad	0x0a0e02060f03070b, 0x0004080c05090d01
+.LSR:
+.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+.quad	0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0:
+.quad	0x02060a0e03070b0f, 0x0004080c0105090d
+.LREVM0SR:
+.quad	0x090d01050c000408, 0x03070b0f060a0e02
+.byte	66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	6
+.size	_bsaes_const,.-_bsaes_const
+
+.type	_bsaes_encrypt8,%function
+.align	4
+_bsaes_encrypt8:
+	adr	r6,.
+	vldmia	r4!, {q9}		@ round 0 key
+#if defined(__thumb2__) || defined(__APPLE__)
+	adr	r6,.LM0SR
+#else
+	sub	r6,r6,#_bsaes_encrypt8-.LM0SR
+#endif
+
+	vldmia	r6!, {q8}		@ .LM0SR
+_bsaes_encrypt8_alt:
+	veor	q10, q0, q9	@ xor with round0 key
+	veor	q11, q1, q9
+	vtbl.8	d0, {q10}, d16
+	vtbl.8	d1, {q10}, d17
+	veor	q12, q2, q9
+	vtbl.8	d2, {q11}, d16
+	vtbl.8	d3, {q11}, d17
+	veor	q13, q3, q9
+	vtbl.8	d4, {q12}, d16
+	vtbl.8	d5, {q12}, d17
+	veor	q14, q4, q9
+	vtbl.8	d6, {q13}, d16
+	vtbl.8	d7, {q13}, d17
+	veor	q15, q5, q9
+	vtbl.8	d8, {q14}, d16
+	vtbl.8	d9, {q14}, d17
+	veor	q10, q6, q9
+	vtbl.8	d10, {q15}, d16
+	vtbl.8	d11, {q15}, d17
+	veor	q11, q7, q9
+	vtbl.8	d12, {q10}, d16
+	vtbl.8	d13, {q10}, d17
+	vtbl.8	d14, {q11}, d16
+	vtbl.8	d15, {q11}, d17
+_bsaes_encrypt8_bitslice:
+	vmov.i8	q8,#0x55			@ compose .LBS0
+	vmov.i8	q9,#0x33			@ compose .LBS1
+	vshr.u64	q10, q6, #1
+	vshr.u64	q11, q4, #1
+	veor	q10, q10, q7
+	veor	q11, q11, q5
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #1
+	veor	q5, q5, q11
+	vshl.u64	q11, q11, #1
+	veor	q6, q6, q10
+	veor	q4, q4, q11
+	vshr.u64	q10, q2, #1
+	vshr.u64	q11, q0, #1
+	veor	q10, q10, q3
+	veor	q11, q11, q1
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q3, q3, q10
+	vshl.u64	q10, q10, #1
+	veor	q1, q1, q11
+	vshl.u64	q11, q11, #1
+	veor	q2, q2, q10
+	veor	q0, q0, q11
+	vmov.i8	q8,#0x0f			@ compose .LBS2
+	vshr.u64	q10, q5, #2
+	vshr.u64	q11, q4, #2
+	veor	q10, q10, q7
+	veor	q11, q11, q6
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #2
+	veor	q6, q6, q11
+	vshl.u64	q11, q11, #2
+	veor	q5, q5, q10
+	veor	q4, q4, q11
+	vshr.u64	q10, q1, #2
+	vshr.u64	q11, q0, #2
+	veor	q10, q10, q3
+	veor	q11, q11, q2
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q3, q3, q10
+	vshl.u64	q10, q10, #2
+	veor	q2, q2, q11
+	vshl.u64	q11, q11, #2
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	vshr.u64	q10, q3, #4
+	vshr.u64	q11, q2, #4
+	veor	q10, q10, q7
+	veor	q11, q11, q6
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #4
+	veor	q6, q6, q11
+	vshl.u64	q11, q11, #4
+	veor	q3, q3, q10
+	veor	q2, q2, q11
+	vshr.u64	q10, q1, #4
+	vshr.u64	q11, q0, #4
+	veor	q10, q10, q5
+	veor	q11, q11, q4
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #4
+	veor	q4, q4, q11
+	vshl.u64	q11, q11, #4
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	sub	r5,r5,#1
+	b	.Lenc_sbox
+.align	4
+.Lenc_loop:
+	vldmia	r4!, {q8,q9,q10,q11}
+	veor	q8, q8, q0
+	veor	q9, q9, q1
+	vtbl.8	d0, {q8}, d24
+	vtbl.8	d1, {q8}, d25
+	vldmia	r4!, {q8}
+	veor	q10, q10, q2
+	vtbl.8	d2, {q9}, d24
+	vtbl.8	d3, {q9}, d25
+	vldmia	r4!, {q9}
+	veor	q11, q11, q3
+	vtbl.8	d4, {q10}, d24
+	vtbl.8	d5, {q10}, d25
+	vldmia	r4!, {q10}
+	vtbl.8	d6, {q11}, d24
+	vtbl.8	d7, {q11}, d25
+	vldmia	r4!, {q11}
+	veor	q8, q8, q4
+	veor	q9, q9, q5
+	vtbl.8	d8, {q8}, d24
+	vtbl.8	d9, {q8}, d25
+	veor	q10, q10, q6
+	vtbl.8	d10, {q9}, d24
+	vtbl.8	d11, {q9}, d25
+	veor	q11, q11, q7
+	vtbl.8	d12, {q10}, d24
+	vtbl.8	d13, {q10}, d25
+	vtbl.8	d14, {q11}, d24
+	vtbl.8	d15, {q11}, d25
+.Lenc_sbox:
+	veor	q2, q2, q1
+	veor	q5, q5, q6
+	veor	q3, q3, q0
+	veor	q6, q6, q2
+	veor	q5, q5, q0
+
+	veor	q6, q6, q3
+	veor	q3, q3, q7
+	veor	q7, q7, q5
+	veor	q3, q3, q4
+	veor	q4, q4, q5
+
+	veor	q2, q2, q7
+	veor	q3, q3, q1
+	veor	q1, q1, q5
+	veor	q11, q7, q4
+	veor	q10, q1, q2
+	veor	q9, q5, q3
+	veor	q13, q2, q4
+	vmov	q8, q10
+	veor	q12, q6, q0
+
+	vorr	q10, q10, q9
+	veor	q15, q11, q8
+	vand	q14, q11, q12
+	vorr	q11, q11, q12
+	veor	q12, q12, q9
+	vand	q8, q8, q9
+	veor	q9, q3, q0
+	vand	q15, q15, q12
+	vand	q13, q13, q9
+	veor	q9, q7, q1
+	veor	q12, q5, q6
+	veor	q11, q11, q13
+	veor	q10, q10, q13
+	vand	q13, q9, q12
+	vorr	q9, q9, q12
+	veor	q11, q11, q15
+	veor	q8, q8, q13
+	veor	q10, q10, q14
+	veor	q9, q9, q15
+	veor	q8, q8, q14
+	vand	q12, q2, q3
+	veor	q9, q9, q14
+	vand	q13, q4, q0
+	vand	q14, q1, q5
+	vorr	q15, q7, q6
+	veor	q11, q11, q12
+	veor	q9, q9, q14
+	veor	q8, q8, q15
+	veor	q10, q10, q13
+
+	@ Inv_GF16 	0, 	1, 	2, 	3, s0, s1, s2, s3
+
+	@ new smaller inversion
+
+	vand	q14, q11, q9
+	vmov	q12, q8
+
+	veor	q13, q10, q14
+	veor	q15, q8, q14
+	veor	q14, q8, q14	@ q14=q15
+
+	vbsl	q13, q9, q8
+	vbsl	q15, q11, q10
+	veor	q11, q11, q10
+
+	vbsl	q12, q13, q14
+	vbsl	q8, q14, q13
+
+	vand	q14, q12, q15
+	veor	q9, q9, q8
+
+	veor	q14, q14, q11
+	veor	q12, q6, q0
+	veor	q8, q5, q3
+	veor	q10, q15, q14
+	vand	q10, q10, q6
+	veor	q6, q6, q5
+	vand	q11, q5, q15
+	vand	q6, q6, q14
+	veor	q5, q11, q10
+	veor	q6, q6, q11
+	veor	q15, q15, q13
+	veor	q14, q14, q9
+	veor	q11, q15, q14
+	veor	q10, q13, q9
+	vand	q11, q11, q12
+	vand	q10, q10, q0
+	veor	q12, q12, q8
+	veor	q0, q0, q3
+	vand	q8, q8, q15
+	vand	q3, q3, q13
+	vand	q12, q12, q14
+	vand	q0, q0, q9
+	veor	q8, q8, q12
+	veor	q0, q0, q3
+	veor	q12, q12, q11
+	veor	q3, q3, q10
+	veor	q6, q6, q12
+	veor	q0, q0, q12
+	veor	q5, q5, q8
+	veor	q3, q3, q8
+
+	veor	q12, q7, q4
+	veor	q8, q1, q2
+	veor	q11, q15, q14
+	veor	q10, q13, q9
+	vand	q11, q11, q12
+	vand	q10, q10, q4
+	veor	q12, q12, q8
+	veor	q4, q4, q2
+	vand	q8, q8, q15
+	vand	q2, q2, q13
+	vand	q12, q12, q14
+	vand	q4, q4, q9
+	veor	q8, q8, q12
+	veor	q4, q4, q2
+	veor	q12, q12, q11
+	veor	q2, q2, q10
+	veor	q15, q15, q13
+	veor	q14, q14, q9
+	veor	q10, q15, q14
+	vand	q10, q10, q7
+	veor	q7, q7, q1
+	vand	q11, q1, q15
+	vand	q7, q7, q14
+	veor	q1, q11, q10
+	veor	q7, q7, q11
+	veor	q7, q7, q12
+	veor	q4, q4, q12
+	veor	q1, q1, q8
+	veor	q2, q2, q8
+	veor	q7, q7, q0
+	veor	q1, q1, q6
+	veor	q6, q6, q0
+	veor	q4, q4, q7
+	veor	q0, q0, q1
+
+	veor	q1, q1, q5
+	veor	q5, q5, q2
+	veor	q2, q2, q3
+	veor	q3, q3, q5
+	veor	q4, q4, q5
+
+	veor	q6, q6, q3
+	subs	r5,r5,#1
+	bcc	.Lenc_done
+	vext.8	q8, q0, q0, #12	@ x0 <<< 32
+	vext.8	q9, q1, q1, #12
+	veor	q0, q0, q8		@ x0 ^ (x0 <<< 32)
+	vext.8	q10, q4, q4, #12
+	veor	q1, q1, q9
+	vext.8	q11, q6, q6, #12
+	veor	q4, q4, q10
+	vext.8	q12, q3, q3, #12
+	veor	q6, q6, q11
+	vext.8	q13, q7, q7, #12
+	veor	q3, q3, q12
+	vext.8	q14, q2, q2, #12
+	veor	q7, q7, q13
+	vext.8	q15, q5, q5, #12
+	veor	q2, q2, q14
+
+	veor	q9, q9, q0
+	veor	q5, q5, q15
+	vext.8	q0, q0, q0, #8		@ (x0 ^ (x0 <<< 32)) <<< 64)
+	veor	q10, q10, q1
+	veor	q8, q8, q5
+	veor	q9, q9, q5
+	vext.8	q1, q1, q1, #8
+	veor	q13, q13, q3
+	veor	q0, q0, q8
+	veor	q14, q14, q7
+	veor	q1, q1, q9
+	vext.8	q8, q3, q3, #8
+	veor	q12, q12, q6
+	vext.8	q9, q7, q7, #8
+	veor	q15, q15, q2
+	vext.8	q3, q6, q6, #8
+	veor	q11, q11, q4
+	vext.8	q7, q5, q5, #8
+	veor	q12, q12, q5
+	vext.8	q6, q2, q2, #8
+	veor	q11, q11, q5
+	vext.8	q2, q4, q4, #8
+	veor	q5, q9, q13
+	veor	q4, q8, q12
+	veor	q3, q3, q11
+	veor	q7, q7, q15
+	veor	q6, q6, q14
+	 @ vmov	q4, q8
+	veor	q2, q2, q10
+	 @ vmov	q5, q9
+	vldmia	r6, {q12}		@ .LSR
+	ite	eq				@ Thumb2 thing, sanity check in ARM
+	addeq	r6,r6,#0x10
+	bne	.Lenc_loop
+	vldmia	r6, {q12}		@ .LSRM0
+	b	.Lenc_loop
+.align	4
+.Lenc_done:
+	vmov.i8	q8,#0x55			@ compose .LBS0
+	vmov.i8	q9,#0x33			@ compose .LBS1
+	vshr.u64	q10, q2, #1
+	vshr.u64	q11, q3, #1
+	veor	q10, q10, q5
+	veor	q11, q11, q7
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #1
+	veor	q7, q7, q11
+	vshl.u64	q11, q11, #1
+	veor	q2, q2, q10
+	veor	q3, q3, q11
+	vshr.u64	q10, q4, #1
+	vshr.u64	q11, q0, #1
+	veor	q10, q10, q6
+	veor	q11, q11, q1
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q6, q6, q10
+	vshl.u64	q10, q10, #1
+	veor	q1, q1, q11
+	vshl.u64	q11, q11, #1
+	veor	q4, q4, q10
+	veor	q0, q0, q11
+	vmov.i8	q8,#0x0f			@ compose .LBS2
+	vshr.u64	q10, q7, #2
+	vshr.u64	q11, q3, #2
+	veor	q10, q10, q5
+	veor	q11, q11, q2
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #2
+	veor	q2, q2, q11
+	vshl.u64	q11, q11, #2
+	veor	q7, q7, q10
+	veor	q3, q3, q11
+	vshr.u64	q10, q1, #2
+	vshr.u64	q11, q0, #2
+	veor	q10, q10, q6
+	veor	q11, q11, q4
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q6, q6, q10
+	vshl.u64	q10, q10, #2
+	veor	q4, q4, q11
+	vshl.u64	q11, q11, #2
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	vshr.u64	q10, q6, #4
+	vshr.u64	q11, q4, #4
+	veor	q10, q10, q5
+	veor	q11, q11, q2
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #4
+	veor	q2, q2, q11
+	vshl.u64	q11, q11, #4
+	veor	q6, q6, q10
+	veor	q4, q4, q11
+	vshr.u64	q10, q1, #4
+	vshr.u64	q11, q0, #4
+	veor	q10, q10, q7
+	veor	q11, q11, q3
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #4
+	veor	q3, q3, q11
+	vshl.u64	q11, q11, #4
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	vldmia	r4, {q8}			@ last round key
+	veor	q4, q4, q8
+	veor	q6, q6, q8
+	veor	q3, q3, q8
+	veor	q7, q7, q8
+	veor	q2, q2, q8
+	veor	q5, q5, q8
+	veor	q0, q0, q8
+	veor	q1, q1, q8
+	bx	lr
+.size	_bsaes_encrypt8,.-_bsaes_encrypt8
+.type	_bsaes_key_convert,%function
+.align	4
+_bsaes_key_convert:
+	adr	r6,.
+	vld1.8	{q7},  [r4]!		@ load round 0 key
+#if defined(__thumb2__) || defined(__APPLE__)
+	adr	r6,.LM0
+#else
+	sub	r6,r6,#_bsaes_key_convert-.LM0
+#endif
+	vld1.8	{q15}, [r4]!		@ load round 1 key
+
+	vmov.i8	q8,  #0x01			@ bit masks
+	vmov.i8	q9,  #0x02
+	vmov.i8	q10, #0x04
+	vmov.i8	q11, #0x08
+	vmov.i8	q12, #0x10
+	vmov.i8	q13, #0x20
+	vldmia	r6, {q14}		@ .LM0
+
+#ifdef __ARMEL__
+	vrev32.8	q7,  q7
+	vrev32.8	q15, q15
+#endif
+	sub	r5,r5,#1
+	vstmia	r12!, {q7}		@ save round 0 key
+	b	.Lkey_loop
+
+.align	4
+.Lkey_loop:
+	vtbl.8	d14,{q15},d28
+	vtbl.8	d15,{q15},d29
+	vmov.i8	q6,  #0x40
+	vmov.i8	q15, #0x80
+
+	vtst.8	q0, q7, q8
+	vtst.8	q1, q7, q9
+	vtst.8	q2, q7, q10
+	vtst.8	q3, q7, q11
+	vtst.8	q4, q7, q12
+	vtst.8	q5, q7, q13
+	vtst.8	q6, q7, q6
+	vtst.8	q7, q7, q15
+	vld1.8	{q15}, [r4]!		@ load next round key
+	vmvn	q0, q0		@ "pnot"
+	vmvn	q1, q1
+	vmvn	q5, q5
+	vmvn	q6, q6
+#ifdef __ARMEL__
+	vrev32.8	q15, q15
+#endif
+	subs	r5,r5,#1
+	vstmia	r12!,{q0,q1,q2,q3,q4,q5,q6,q7}		@ write bit-sliced round key
+	bne	.Lkey_loop
+
+	vmov.i8	q7,#0x63			@ compose .L63
+	@ don't save last round key
+	bx	lr
+.size	_bsaes_key_convert,.-_bsaes_key_convert
+.globl	bsaes_cbc_encrypt
+.hidden	bsaes_cbc_encrypt
+.type	bsaes_cbc_encrypt,%function
+.align	5
+bsaes_cbc_encrypt:
+	@ In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for
+	@ short inputs. We patch this out, using bsaes for all input sizes.
+
+	@ it is up to the caller to make sure we are called with enc == 0
+
+	mov	ip, sp
+	stmdb	sp!, {r4,r5,r6,r7,r8,r9,r10, lr}
+	VFP_ABI_PUSH
+	ldr	r8, [ip]			@ IV is 1st arg on the stack
+	mov	r2, r2, lsr#4		@ len in 16 byte blocks
+	sub	sp, #0x10			@ scratch space to carry over the IV
+	mov	r9, sp				@ save sp
+
+	ldr	r10, [r3, #240]		@ get # of rounds
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	@ allocate the key schedule on the stack
+	sub	r12, sp, r10, lsl#7		@ 128 bytes per inner round key
+	add	r12, #96			@ size of bit-sliced key schedule
+
+	@ populate the key schedule
+	mov	r4, r3			@ pass key
+	mov	r5, r10			@ pass # of rounds
+	mov	sp, r12				@ sp is sp
+	bl	_bsaes_key_convert
+	vldmia	sp, {q6}
+	vstmia	r12,  {q15}		@ save last round key
+	veor	q7, q7, q6	@ fix up round 0 key
+	vstmia	sp, {q7}
+#else
+	ldr	r12, [r3, #244]
+	eors	r12, #1
+	beq	0f
+
+	@ populate the key schedule
+	str	r12, [r3, #244]
+	mov	r4, r3			@ pass key
+	mov	r5, r10			@ pass # of rounds
+	add	r12, r3, #248			@ pass key schedule
+	bl	_bsaes_key_convert
+	add	r4, r3, #248
+	vldmia	r4, {q6}
+	vstmia	r12, {q15}			@ save last round key
+	veor	q7, q7, q6	@ fix up round 0 key
+	vstmia	r4, {q7}
+
+.align	2
+
+#endif
+
+	vld1.8	{q15}, [r8]		@ load IV
+	b	.Lcbc_dec_loop
+
+.align	4
+.Lcbc_dec_loop:
+	subs	r2, r2, #0x8
+	bmi	.Lcbc_dec_loop_finish
+
+	vld1.8	{q0,q1}, [r0]!	@ load input
+	vld1.8	{q2,q3}, [r0]!
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	mov	r4, sp			@ pass the key
+#else
+	add	r4, r3, #248
+#endif
+	vld1.8	{q4,q5}, [r0]!
+	mov	r5, r10
+	vld1.8	{q6,q7}, [r0]
+	sub	r0, r0, #0x60
+	vstmia	r9, {q15}			@ put aside IV
+
+	bl	_bsaes_decrypt8
+
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8,q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10,q11}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q12,q13}, [r0]!
+	veor	q4, q4, q10
+	veor	q2, q2, q11
+	vld1.8	{q14,q15}, [r0]!
+	veor	q7, q7, q12
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	veor	q3, q3, q13
+	vst1.8	{q6}, [r1]!
+	veor	q5, q5, q14
+	vst1.8	{q4}, [r1]!
+	vst1.8	{q2}, [r1]!
+	vst1.8	{q7}, [r1]!
+	vst1.8	{q3}, [r1]!
+	vst1.8	{q5}, [r1]!
+
+	b	.Lcbc_dec_loop
+
+.Lcbc_dec_loop_finish:
+	adds	r2, r2, #8
+	beq	.Lcbc_dec_done
+
+	@ Set up most parameters for the _bsaes_decrypt8 call.
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	mov	r4, sp			@ pass the key
+#else
+	add	r4, r3, #248
+#endif
+	mov	r5, r10
+	vstmia	r9, {q15}			@ put aside IV
+
+	vld1.8	{q0}, [r0]!		@ load input
+	cmp	r2, #2
+	blo	.Lcbc_dec_one
+	vld1.8	{q1}, [r0]!
+	beq	.Lcbc_dec_two
+	vld1.8	{q2}, [r0]!
+	cmp	r2, #4
+	blo	.Lcbc_dec_three
+	vld1.8	{q3}, [r0]!
+	beq	.Lcbc_dec_four
+	vld1.8	{q4}, [r0]!
+	cmp	r2, #6
+	blo	.Lcbc_dec_five
+	vld1.8	{q5}, [r0]!
+	beq	.Lcbc_dec_six
+	vld1.8	{q6}, [r0]!
+	sub	r0, r0, #0x70
+
+	bl	_bsaes_decrypt8
+
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8,q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10,q11}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q12,q13}, [r0]!
+	veor	q4, q4, q10
+	veor	q2, q2, q11
+	vld1.8	{q15}, [r0]!
+	veor	q7, q7, q12
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	veor	q3, q3, q13
+	vst1.8	{q6}, [r1]!
+	vst1.8	{q4}, [r1]!
+	vst1.8	{q2}, [r1]!
+	vst1.8	{q7}, [r1]!
+	vst1.8	{q3}, [r1]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_six:
+	sub	r0, r0, #0x60
+	bl	_bsaes_decrypt8
+	vldmia	r9,{q14}			@ reload IV
+	vld1.8	{q8,q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10,q11}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q12}, [r0]!
+	veor	q4, q4, q10
+	veor	q2, q2, q11
+	vld1.8	{q15}, [r0]!
+	veor	q7, q7, q12
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	vst1.8	{q6}, [r1]!
+	vst1.8	{q4}, [r1]!
+	vst1.8	{q2}, [r1]!
+	vst1.8	{q7}, [r1]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_five:
+	sub	r0, r0, #0x50
+	bl	_bsaes_decrypt8
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8,q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10,q11}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q15}, [r0]!
+	veor	q4, q4, q10
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	veor	q2, q2, q11
+	vst1.8	{q6}, [r1]!
+	vst1.8	{q4}, [r1]!
+	vst1.8	{q2}, [r1]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_four:
+	sub	r0, r0, #0x40
+	bl	_bsaes_decrypt8
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8,q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q15}, [r0]!
+	veor	q4, q4, q10
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	vst1.8	{q6}, [r1]!
+	vst1.8	{q4}, [r1]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_three:
+	sub	r0, r0, #0x30
+	bl	_bsaes_decrypt8
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8,q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q15}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	vst1.8	{q6}, [r1]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_two:
+	sub	r0, r0, #0x20
+	bl	_bsaes_decrypt8
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8}, [r0]!		@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q15}, [r0]!		@ reload input
+	veor	q1, q1, q8
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_one:
+	sub	r0, r0, #0x10
+	bl	_bsaes_decrypt8
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q15}, [r0]!		@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vst1.8	{q0}, [r1]!		@ write output
+
+.Lcbc_dec_done:
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+.Lcbc_dec_bzero:@ wipe key schedule [if any]
+	vstmia	sp!, {q0,q1}
+	cmp	sp, r9
+	bne	.Lcbc_dec_bzero
+#endif
+
+	mov	sp, r9
+	add	sp, #0x10			@ add sp,r9,#0x10 is no good for thumb
+	vst1.8	{q15}, [r8]		@ return IV
+	VFP_ABI_POP
+	ldmia	sp!, {r4,r5,r6,r7,r8,r9,r10, pc}
+.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+.globl	bsaes_ctr32_encrypt_blocks
+.hidden	bsaes_ctr32_encrypt_blocks
+.type	bsaes_ctr32_encrypt_blocks,%function
+.align	5
+bsaes_ctr32_encrypt_blocks:
+	@ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this
+	@ out to retain a constant-time implementation.
+	mov	ip, sp
+	stmdb	sp!, {r4,r5,r6,r7,r8,r9,r10, lr}
+	VFP_ABI_PUSH
+	ldr	r8, [ip]			@ ctr is 1st arg on the stack
+	sub	sp, sp, #0x10			@ scratch space to carry over the ctr
+	mov	r9, sp				@ save sp
+
+	ldr	r10, [r3, #240]		@ get # of rounds
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	@ allocate the key schedule on the stack
+	sub	r12, sp, r10, lsl#7		@ 128 bytes per inner round key
+	add	r12, #96			@ size of bit-sliced key schedule
+
+	@ populate the key schedule
+	mov	r4, r3			@ pass key
+	mov	r5, r10			@ pass # of rounds
+	mov	sp, r12				@ sp is sp
+	bl	_bsaes_key_convert
+	veor	q7,q7,q15	@ fix up last round key
+	vstmia	r12, {q7}			@ save last round key
+
+	vld1.8	{q0}, [r8]		@ load counter
+#ifdef	__APPLE__
+	mov	r8, #:lower16:(.LREVM0SR-.LM0)
+	add	r8, r6, r8
+#else
+	add	r8, r6, #.LREVM0SR-.LM0	@ borrow r8
+#endif
+	vldmia	sp, {q4}		@ load round0 key
+#else
+	ldr	r12, [r3, #244]
+	eors	r12, #1
+	beq	0f
+
+	@ populate the key schedule
+	str	r12, [r3, #244]
+	mov	r4, r3			@ pass key
+	mov	r5, r10			@ pass # of rounds
+	add	r12, r3, #248			@ pass key schedule
+	bl	_bsaes_key_convert
+	veor	q7,q7,q15	@ fix up last round key
+	vstmia	r12, {q7}			@ save last round key
+
+.align	2
+	add	r12, r3, #248
+	vld1.8	{q0}, [r8]		@ load counter
+	adrl	r8, .LREVM0SR			@ borrow r8
+	vldmia	r12, {q4}			@ load round0 key
+	sub	sp, #0x10			@ place for adjusted round0 key
+#endif
+
+	vmov.i32	q8,#1		@ compose 1<<96
+	veor	q9,q9,q9
+	vrev32.8	q0,q0
+	vext.8	q8,q9,q8,#4
+	vrev32.8	q4,q4
+	vadd.u32	q9,q8,q8	@ compose 2<<96
+	vstmia	sp, {q4}		@ save adjusted round0 key
+	b	.Lctr_enc_loop
+
+.align	4
+.Lctr_enc_loop:
+	vadd.u32	q10, q8, q9	@ compose 3<<96
+	vadd.u32	q1, q0, q8	@ +1
+	vadd.u32	q2, q0, q9	@ +2
+	vadd.u32	q3, q0, q10	@ +3
+	vadd.u32	q4, q1, q10
+	vadd.u32	q5, q2, q10
+	vadd.u32	q6, q3, q10
+	vadd.u32	q7, q4, q10
+	vadd.u32	q10, q5, q10	@ next counter
+
+	@ Borrow prologue from _bsaes_encrypt8 to use the opportunity
+	@ to flip byte order in 32-bit counter
+
+	vldmia	sp, {q9}		@ load round0 key
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add	r4, sp, #0x10		@ pass next round key
+#else
+	add	r4, r3, #264
+#endif
+	vldmia	r8, {q8}			@ .LREVM0SR
+	mov	r5, r10			@ pass rounds
+	vstmia	r9, {q10}			@ save next counter
+#ifdef	__APPLE__
+	mov	r6, #:lower16:(.LREVM0SR-.LSR)
+	sub	r6, r8, r6
+#else
+	sub	r6, r8, #.LREVM0SR-.LSR	@ pass constants
+#endif
+
+	bl	_bsaes_encrypt8_alt
+
+	subs	r2, r2, #8
+	blo	.Lctr_enc_loop_done
+
+	vld1.8	{q8,q9}, [r0]!	@ load input
+	vld1.8	{q10,q11}, [r0]!
+	veor	q0, q8
+	veor	q1, q9
+	vld1.8	{q12,q13}, [r0]!
+	veor	q4, q10
+	veor	q6, q11
+	vld1.8	{q14,q15}, [r0]!
+	veor	q3, q12
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	veor	q7, q13
+	veor	q2, q14
+	vst1.8	{q4}, [r1]!
+	veor	q5, q15
+	vst1.8	{q6}, [r1]!
+	vmov.i32	q8, #1			@ compose 1<<96
+	vst1.8	{q3}, [r1]!
+	veor	q9, q9, q9
+	vst1.8	{q7}, [r1]!
+	vext.8	q8, q9, q8, #4
+	vst1.8	{q2}, [r1]!
+	vadd.u32	q9,q8,q8		@ compose 2<<96
+	vst1.8	{q5}, [r1]!
+	vldmia	r9, {q0}			@ load counter
+
+	bne	.Lctr_enc_loop
+	b	.Lctr_enc_done
+
+.align	4
+.Lctr_enc_loop_done:
+	add	r2, r2, #8
+	vld1.8	{q8}, [r0]!	@ load input
+	veor	q0, q8
+	vst1.8	{q0}, [r1]!	@ write output
+	cmp	r2, #2
+	blo	.Lctr_enc_done
+	vld1.8	{q9}, [r0]!
+	veor	q1, q9
+	vst1.8	{q1}, [r1]!
+	beq	.Lctr_enc_done
+	vld1.8	{q10}, [r0]!
+	veor	q4, q10
+	vst1.8	{q4}, [r1]!
+	cmp	r2, #4
+	blo	.Lctr_enc_done
+	vld1.8	{q11}, [r0]!
+	veor	q6, q11
+	vst1.8	{q6}, [r1]!
+	beq	.Lctr_enc_done
+	vld1.8	{q12}, [r0]!
+	veor	q3, q12
+	vst1.8	{q3}, [r1]!
+	cmp	r2, #6
+	blo	.Lctr_enc_done
+	vld1.8	{q13}, [r0]!
+	veor	q7, q13
+	vst1.8	{q7}, [r1]!
+	beq	.Lctr_enc_done
+	vld1.8	{q14}, [r0]
+	veor	q2, q14
+	vst1.8	{q2}, [r1]!
+
+.Lctr_enc_done:
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+#ifndef	BSAES_ASM_EXTENDED_KEY
+.Lctr_enc_bzero:@ wipe key schedule [if any]
+	vstmia	sp!, {q0,q1}
+	cmp	sp, r9
+	bne	.Lctr_enc_bzero
+#else
+	vstmia	sp, {q0,q1}
+#endif
+
+	mov	sp, r9
+	add	sp, #0x10		@ add sp,r9,#0x10 is no good for thumb
+	VFP_ABI_POP
+	ldmia	sp!, {r4,r5,r6,r7,r8,r9,r10, pc}	@ return
+
+	@ OpenSSL contains aes_nohw_* fallback code here. We patch this
+	@ out to retain a constant-time implementation.
+.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+#endif
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__)
diff --git a/gen/bcm/co-586-apple.S b/gen/bcm/co-586-apple.S
new file mode 100644
index 0000000..ab985ee
--- /dev/null
+++ b/gen/bcm/co-586-apple.S
@@ -0,0 +1,1256 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
+.text
+.globl	_bn_mul_comba8
+.private_extern	_bn_mul_comba8
+.align	4
+_bn_mul_comba8:
+L_bn_mul_comba8_begin:
+	pushl	%esi
+	movl	12(%esp),%esi
+	pushl	%edi
+	movl	20(%esp),%edi
+	pushl	%ebp
+	pushl	%ebx
+	xorl	%ebx,%ebx
+	movl	(%esi),%eax
+	xorl	%ecx,%ecx
+	movl	(%edi),%edx
+	# ################## Calculate word 0 
+	xorl	%ebp,%ebp
+	# mul a[0]*b[0] 
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esp),%eax
+	adcl	%edx,%ecx
+	movl	(%edi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,(%eax)
+	movl	4(%esi),%eax
+	# saved r[0] 
+	# ################## Calculate word 1 
+	xorl	%ebx,%ebx
+	# mul a[1]*b[0] 
+	mull	%edx
+	addl	%eax,%ecx
+	movl	(%esi),%eax
+	adcl	%edx,%ebp
+	movl	4(%edi),%edx
+	adcl	$0,%ebx
+	# mul a[0]*b[1] 
+	mull	%edx
+	addl	%eax,%ecx
+	movl	20(%esp),%eax
+	adcl	%edx,%ebp
+	movl	(%edi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,4(%eax)
+	movl	8(%esi),%eax
+	# saved r[1] 
+	# ################## Calculate word 2 
+	xorl	%ecx,%ecx
+	# mul a[2]*b[0] 
+	mull	%edx
+	addl	%eax,%ebp
+	movl	4(%esi),%eax
+	adcl	%edx,%ebx
+	movl	4(%edi),%edx
+	adcl	$0,%ecx
+	# mul a[1]*b[1] 
+	mull	%edx
+	addl	%eax,%ebp
+	movl	(%esi),%eax
+	adcl	%edx,%ebx
+	movl	8(%edi),%edx
+	adcl	$0,%ecx
+	# mul a[0]*b[2] 
+	mull	%edx
+	addl	%eax,%ebp
+	movl	20(%esp),%eax
+	adcl	%edx,%ebx
+	movl	(%edi),%edx
+	adcl	$0,%ecx
+	movl	%ebp,8(%eax)
+	movl	12(%esi),%eax
+	# saved r[2] 
+	# ################## Calculate word 3 
+	xorl	%ebp,%ebp
+	# mul a[3]*b[0] 
+	mull	%edx
+	addl	%eax,%ebx
+	movl	8(%esi),%eax
+	adcl	%edx,%ecx
+	movl	4(%edi),%edx
+	adcl	$0,%ebp
+	# mul a[2]*b[1] 
+	mull	%edx
+	addl	%eax,%ebx
+	movl	4(%esi),%eax
+	adcl	%edx,%ecx
+	movl	8(%edi),%edx
+	adcl	$0,%ebp
+	# mul a[1]*b[2] 
+	mull	%edx
+	addl	%eax,%ebx
+	movl	(%esi),%eax
+	adcl	%edx,%ecx
+	movl	12(%edi),%edx
+	adcl	$0,%ebp
+	# mul a[0]*b[3] 
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esp),%eax
+	adcl	%edx,%ecx
+	movl	(%edi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,12(%eax)
+	movl	16(%esi),%eax
+	# saved r[3] 
+	# ################## Calculate word 4 
+	xorl	%ebx,%ebx
+	# mul a[4]*b[0] 
+	mull	%edx
+	addl	%eax,%ecx
+	movl	12(%esi),%eax
+	adcl	%edx,%ebp
+	movl	4(%edi),%edx
+	adcl	$0,%ebx
+	# mul a[3]*b[1] 
+	mull	%edx
+	addl	%eax,%ecx
+	movl	8(%esi),%eax
+	adcl	%edx,%ebp
+	movl	8(%edi),%edx
+	adcl	$0,%ebx
+	# mul a[2]*b[2] 
+	mull	%edx
+	addl	%eax,%ecx
+	movl	4(%esi),%eax
+	adcl	%edx,%ebp
+	movl	12(%edi),%edx
+	adcl	$0,%ebx
+	# mul a[1]*b[3] 
+	mull	%edx
+	addl	%eax,%ecx
+	movl	(%esi),%eax
+	adcl	%edx,%ebp
+	movl	16(%edi),%edx
+	adcl	$0,%ebx
+	# mul a[0]*b[4] 
+	mull	%edx
+	addl	%eax,%ecx
+	movl	20(%esp),%eax
+	adcl	%edx,%ebp
+	movl	(%edi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,16(%eax)
+	movl	20(%esi),%eax
+	# saved r[4] 
+	# ################## Calculate word 5 
+	xorl	%ecx,%ecx
+	# mul a[5]*b[0] 
+	mull	%edx
+	addl	%eax,%ebp
+	movl	16(%esi),%eax
+	adcl	%edx,%ebx
+	movl	4(%edi),%edx
+	adcl	$0,%ecx
+	# mul a[4]*b[1] 
+	mull	%edx
+	addl	%eax,%ebp
+	movl	12(%esi),%eax
+	adcl	%edx,%ebx
+	movl	8(%edi),%edx
+	adcl	$0,%ecx
+	# mul a[3]*b[2] 
+	mull	%edx
+	addl	%eax,%ebp
+	movl	8(%esi),%eax
+	adcl	%edx,%ebx
+	movl	12(%edi),%edx
+	adcl	$0,%ecx
+	# mul a[2]*b[3] 
+	mull	%edx
+	addl	%eax,%ebp
+	movl	4(%esi),%eax
+	adcl	%edx,%ebx
+	movl	16(%edi),%edx
+	adcl	$0,%ecx
+	# mul a[1]*b[4] 
+	mull	%edx
+	addl	%eax,%ebp
+	movl	(%esi),%eax
+	adcl	%edx,%ebx
+	movl	20(%edi),%edx
+	adcl	$0,%ecx
+	# mul a[0]*b[5] 
+	mull	%edx
+	addl	%eax,%ebp
+	movl	20(%esp),%eax
+	adcl	%edx,%ebx
+	movl	(%edi),%edx
+	adcl	$0,%ecx
+	movl	%ebp,20(%eax)
+	movl	24(%esi),%eax
+	# saved r[5] 
+	# ################## Calculate word 6 
+	xorl	%ebp,%ebp
+	# mul a[6]*b[0] 
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esi),%eax
+	adcl	%edx,%ecx
+	movl	4(%edi),%edx
+	adcl	$0,%ebp
+	# mul a[5]*b[1] 
+	mull	%edx
+	addl	%eax,%ebx
+	movl	16(%esi),%eax
+	adcl	%edx,%ecx
+	movl	8(%edi),%edx
+	adcl	$0,%ebp
+	# mul a[4]*b[2] 
+	mull	%edx
+	addl	%eax,%ebx
+	movl	12(%esi),%eax
+	adcl	%edx,%ecx
+	movl	12(%edi),%edx
+	adcl	$0,%ebp
+	# mul a[3]*b[3] 
+	mull	%edx
+	addl	%eax,%ebx
+	movl	8(%esi),%eax
+	adcl	%edx,%ecx
+	movl	16(%edi),%edx
+	adcl	$0,%ebp
+	# mul a[2]*b[4] 
+	mull	%edx
+	addl	%eax,%ebx
+	movl	4(%esi),%eax
+	adcl	%edx,%ecx
+	movl	20(%edi),%edx
+	adcl	$0,%ebp
+	# mul a[1]*b[5] 
+	mull	%edx
+	addl	%eax,%ebx
+	movl	(%esi),%eax
+	adcl	%edx,%ecx
+	movl	24(%edi),%edx
+	adcl	$0,%ebp
+	# mul a[0]*b[6] 
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esp),%eax
+	adcl	%edx,%ecx
+	movl	(%edi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,24(%eax)
+	movl	28(%esi),%eax
+	# saved r[6] 
+	# ################## Calculate word 7 
+	xorl	%ebx,%ebx
+	# mul a[7]*b[0] 
+	mull	%edx
+	addl	%eax,%ecx
+	movl	24(%esi),%eax
+	adcl	%edx,%ebp
+	movl	4(%edi),%edx
+	adcl	$0,%ebx
+	# mul a[6]*b[1] 
+	mull	%edx
+	addl	%eax,%ecx
+	movl	20(%esi),%eax
+	adcl	%edx,%ebp
+	movl	8(%edi),%edx
+	adcl	$0,%ebx
+	# mul a[5]*b[2] 
+	mull	%edx
+	addl	%eax,%ecx
+	movl	16(%esi),%eax
+	adcl	%edx,%ebp
+	movl	12(%edi),%edx
+	adcl	$0,%ebx
+	# mul a[4]*b[3] 
+	mull	%edx
+	addl	%eax,%ecx
+	movl	12(%esi),%eax
+	adcl	%edx,%ebp
+	movl	16(%edi),%edx
+	adcl	$0,%ebx
+	# mul a[3]*b[4] 
+	mull	%edx
+	addl	%eax,%ecx
+	movl	8(%esi),%eax
+	adcl	%edx,%ebp
+	movl	20(%edi),%edx
+	adcl	$0,%ebx
+	# mul a[2]*b[5] 
+	mull	%edx
+	addl	%eax,%ecx
+	movl	4(%esi),%eax
+	adcl	%edx,%ebp
+	movl	24(%edi),%edx
+	adcl	$0,%ebx
+	# mul a[1]*b[6] 
+	mull	%edx
+	addl	%eax,%ecx
+	movl	(%esi),%eax
+	adcl	%edx,%ebp
+	movl	28(%edi),%edx
+	adcl	$0,%ebx
+	# mul a[0]*b[7] 
+	mull	%edx
+	addl	%eax,%ecx
+	movl	20(%esp),%eax
+	adcl	%edx,%ebp
+	movl	4(%edi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,28(%eax)
+	movl	28(%esi),%eax
+	# saved r[7] 
+	# ################## Calculate word 8 
+	xorl	%ecx,%ecx
+	# mul a[7]*b[1] 
+	mull	%edx
+	addl	%eax,%ebp
+	movl	24(%esi),%eax
+	adcl	%edx,%ebx
+	movl	8(%edi),%edx
+	adcl	$0,%ecx
+	# mul a[6]*b[2] 
+	mull	%edx
+	addl	%eax,%ebp
+	movl	20(%esi),%eax
+	adcl	%edx,%ebx
+	movl	12(%edi),%edx
+	adcl	$0,%ecx
+	# mul a[5]*b[3] 
+	mull	%edx
+	addl	%eax,%ebp
+	movl	16(%esi),%eax
+	adcl	%edx,%ebx
+	movl	16(%edi),%edx
+	adcl	$0,%ecx
+	# mul a[4]*b[4] 
+	mull	%edx
+	addl	%eax,%ebp
+	movl	12(%esi),%eax
+	adcl	%edx,%ebx
+	movl	20(%edi),%edx
+	adcl	$0,%ecx
+	# mul a[3]*b[5] 
+	mull	%edx
+	addl	%eax,%ebp
+	movl	8(%esi),%eax
+	adcl	%edx,%ebx
+	movl	24(%edi),%edx
+	adcl	$0,%ecx
+	# mul a[2]*b[6] 
+	mull	%edx
+	addl	%eax,%ebp
+	movl	4(%esi),%eax
+	adcl	%edx,%ebx
+	movl	28(%edi),%edx
+	adcl	$0,%ecx
+	# mul a[1]*b[7] 
+	mull	%edx
+	addl	%eax,%ebp
+	movl	20(%esp),%eax
+	adcl	%edx,%ebx
+	movl	8(%edi),%edx
+	adcl	$0,%ecx
+	movl	%ebp,32(%eax)
+	movl	28(%esi),%eax
+	# saved r[8] 
+	# ################## Calculate word 9 
+	xorl	%ebp,%ebp
+	# mul a[7]*b[2] 
+	mull	%edx
+	addl	%eax,%ebx
+	movl	24(%esi),%eax
+	adcl	%edx,%ecx
+	movl	12(%edi),%edx
+	adcl	$0,%ebp
+	# mul a[6]*b[3] 
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esi),%eax
+	adcl	%edx,%ecx
+	movl	16(%edi),%edx
+	adcl	$0,%ebp
+	# mul a[5]*b[4] 
+	mull	%edx
+	addl	%eax,%ebx
+	movl	16(%esi),%eax
+	adcl	%edx,%ecx
+	movl	20(%edi),%edx
+	adcl	$0,%ebp
+	# mul a[4]*b[5] 
+	mull	%edx
+	addl	%eax,%ebx
+	movl	12(%esi),%eax
+	adcl	%edx,%ecx
+	movl	24(%edi),%edx
+	adcl	$0,%ebp
+	# mul a[3]*b[6] 
+	mull	%edx
+	addl	%eax,%ebx
+	movl	8(%esi),%eax
+	adcl	%edx,%ecx
+	movl	28(%edi),%edx
+	adcl	$0,%ebp
+	# mul a[2]*b[7] 
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esp),%eax
+	adcl	%edx,%ecx
+	movl	12(%edi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,36(%eax)
+	movl	28(%esi),%eax
+	# saved r[9] 
+	# ################## Calculate word 10 
+	xorl	%ebx,%ebx
+	# mul a[7]*b[3] 
+	mull	%edx
+	addl	%eax,%ecx
+	movl	24(%esi),%eax
+	adcl	%edx,%ebp
+	movl	16(%edi),%edx
+	adcl	$0,%ebx
+	# mul a[6]*b[4] 
+	mull	%edx
+	addl	%eax,%ecx
+	movl	20(%esi),%eax
+	adcl	%edx,%ebp
+	movl	20(%edi),%edx
+	adcl	$0,%ebx
+	# mul a[5]*b[5] 
+	mull	%edx
+	addl	%eax,%ecx
+	movl	16(%esi),%eax
+	adcl	%edx,%ebp
+	movl	24(%edi),%edx
+	adcl	$0,%ebx
+	# mul a[4]*b[6] 
+	mull	%edx
+	addl	%eax,%ecx
+	movl	12(%esi),%eax
+	adcl	%edx,%ebp
+	movl	28(%edi),%edx
+	adcl	$0,%ebx
+	# mul a[3]*b[7] 
+	mull	%edx
+	addl	%eax,%ecx
+	movl	20(%esp),%eax
+	adcl	%edx,%ebp
+	movl	16(%edi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,40(%eax)
+	movl	28(%esi),%eax
+	# saved r[10] 
+	# ################## Calculate word 11 
+	xorl	%ecx,%ecx
+	# mul a[7]*b[4] 
+	mull	%edx
+	addl	%eax,%ebp
+	movl	24(%esi),%eax
+	adcl	%edx,%ebx
+	movl	20(%edi),%edx
+	adcl	$0,%ecx
+	# mul a[6]*b[5] 
+	mull	%edx
+	addl	%eax,%ebp
+	movl	20(%esi),%eax
+	adcl	%edx,%ebx
+	movl	24(%edi),%edx
+	adcl	$0,%ecx
+	# mul a[5]*b[6] 
+	mull	%edx
+	addl	%eax,%ebp
+	movl	16(%esi),%eax
+	adcl	%edx,%ebx
+	movl	28(%edi),%edx
+	adcl	$0,%ecx
+	# mul a[4]*b[7] 
+	mull	%edx
+	addl	%eax,%ebp
+	movl	20(%esp),%eax
+	adcl	%edx,%ebx
+	movl	20(%edi),%edx
+	adcl	$0,%ecx
+	movl	%ebp,44(%eax)
+	movl	28(%esi),%eax
+	# saved r[11] 
+	# ################## Calculate word 12 
+	xorl	%ebp,%ebp
+	# mul a[7]*b[5] 
+	mull	%edx
+	addl	%eax,%ebx
+	movl	24(%esi),%eax
+	adcl	%edx,%ecx
+	movl	24(%edi),%edx
+	adcl	$0,%ebp
+	# mul a[6]*b[6] 
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esi),%eax
+	adcl	%edx,%ecx
+	movl	28(%edi),%edx
+	adcl	$0,%ebp
+	# mul a[5]*b[7] 
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esp),%eax
+	adcl	%edx,%ecx
+	movl	24(%edi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,48(%eax)
+	movl	28(%esi),%eax
+	# saved r[12] 
+	# ################## Calculate word 13 
+	xorl	%ebx,%ebx
+	# mul a[7]*b[6] 
+	mull	%edx
+	addl	%eax,%ecx
+	movl	24(%esi),%eax
+	adcl	%edx,%ebp
+	movl	28(%edi),%edx
+	adcl	$0,%ebx
+	# mul a[6]*b[7] 
+	mull	%edx
+	addl	%eax,%ecx
+	movl	20(%esp),%eax
+	adcl	%edx,%ebp
+	movl	28(%edi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,52(%eax)
+	movl	28(%esi),%eax
+	# saved r[13] 
+	# ################## Calculate word 14 
+	xorl	%ecx,%ecx
+	# mul a[7]*b[7] 
+	mull	%edx
+	addl	%eax,%ebp
+	movl	20(%esp),%eax
+	adcl	%edx,%ebx
+	adcl	$0,%ecx
+	movl	%ebp,56(%eax)
+	# saved r[14] 
+	# save r[15] 
+	movl	%ebx,60(%eax)
+	popl	%ebx
+	popl	%ebp
+	popl	%edi
+	popl	%esi
+	ret
+.globl	_bn_mul_comba4
+.private_extern	_bn_mul_comba4
+.align	4
+_bn_mul_comba4:
+L_bn_mul_comba4_begin:
+	pushl	%esi
+	movl	12(%esp),%esi
+	pushl	%edi
+	movl	20(%esp),%edi
+	pushl	%ebp
+	pushl	%ebx
+	xorl	%ebx,%ebx
+	movl	(%esi),%eax
+	xorl	%ecx,%ecx
+	movl	(%edi),%edx
+	# ################## Calculate word 0 
+	xorl	%ebp,%ebp
+	# mul a[0]*b[0] 
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esp),%eax
+	adcl	%edx,%ecx
+	movl	(%edi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,(%eax)
+	movl	4(%esi),%eax
+	# saved r[0] 
+	# ################## Calculate word 1 
+	xorl	%ebx,%ebx
+	# mul a[1]*b[0] 
+	mull	%edx
+	addl	%eax,%ecx
+	movl	(%esi),%eax
+	adcl	%edx,%ebp
+	movl	4(%edi),%edx
+	adcl	$0,%ebx
+	# mul a[0]*b[1] 
+	mull	%edx
+	addl	%eax,%ecx
+	movl	20(%esp),%eax
+	adcl	%edx,%ebp
+	movl	(%edi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,4(%eax)
+	movl	8(%esi),%eax
+	# saved r[1] 
+	# ################## Calculate word 2 
+	xorl	%ecx,%ecx
+	# mul a[2]*b[0] 
+	mull	%edx
+	addl	%eax,%ebp
+	movl	4(%esi),%eax
+	adcl	%edx,%ebx
+	movl	4(%edi),%edx
+	adcl	$0,%ecx
+	# mul a[1]*b[1] 
+	mull	%edx
+	addl	%eax,%ebp
+	movl	(%esi),%eax
+	adcl	%edx,%ebx
+	movl	8(%edi),%edx
+	adcl	$0,%ecx
+	# mul a[0]*b[2] 
+	mull	%edx
+	addl	%eax,%ebp
+	movl	20(%esp),%eax
+	adcl	%edx,%ebx
+	movl	(%edi),%edx
+	adcl	$0,%ecx
+	movl	%ebp,8(%eax)
+	movl	12(%esi),%eax
+	# saved r[2] 
+	# ################## Calculate word 3 
+	xorl	%ebp,%ebp
+	# mul a[3]*b[0] 
+	mull	%edx
+	addl	%eax,%ebx
+	movl	8(%esi),%eax
+	adcl	%edx,%ecx
+	movl	4(%edi),%edx
+	adcl	$0,%ebp
+	# mul a[2]*b[1] 
+	mull	%edx
+	addl	%eax,%ebx
+	movl	4(%esi),%eax
+	adcl	%edx,%ecx
+	movl	8(%edi),%edx
+	adcl	$0,%ebp
+	# mul a[1]*b[2] 
+	mull	%edx
+	addl	%eax,%ebx
+	movl	(%esi),%eax
+	adcl	%edx,%ecx
+	movl	12(%edi),%edx
+	adcl	$0,%ebp
+	# mul a[0]*b[3] 
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esp),%eax
+	adcl	%edx,%ecx
+	movl	4(%edi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,12(%eax)
+	movl	12(%esi),%eax
+	# saved r[3] 
+	# ################## Calculate word 4 
+	xorl	%ebx,%ebx
+	# mul a[3]*b[1] 
+	mull	%edx
+	addl	%eax,%ecx
+	movl	8(%esi),%eax
+	adcl	%edx,%ebp
+	movl	8(%edi),%edx
+	adcl	$0,%ebx
+	# mul a[2]*b[2] 
+	mull	%edx
+	addl	%eax,%ecx
+	movl	4(%esi),%eax
+	adcl	%edx,%ebp
+	movl	12(%edi),%edx
+	adcl	$0,%ebx
+	# mul a[1]*b[3] 
+	mull	%edx
+	addl	%eax,%ecx
+	movl	20(%esp),%eax
+	adcl	%edx,%ebp
+	movl	8(%edi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,16(%eax)
+	movl	12(%esi),%eax
+	# saved r[4] 
+	# ################## Calculate word 5 
+	xorl	%ecx,%ecx
+	# mul a[3]*b[2] 
+	mull	%edx
+	addl	%eax,%ebp
+	movl	8(%esi),%eax
+	adcl	%edx,%ebx
+	movl	12(%edi),%edx
+	adcl	$0,%ecx
+	# mul a[2]*b[3] 
+	mull	%edx
+	addl	%eax,%ebp
+	movl	20(%esp),%eax
+	adcl	%edx,%ebx
+	movl	12(%edi),%edx
+	adcl	$0,%ecx
+	movl	%ebp,20(%eax)
+	movl	12(%esi),%eax
+	# saved r[5] 
+	# ################## Calculate word 6 
+	xorl	%ebp,%ebp
+	# mul a[3]*b[3] 
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esp),%eax
+	adcl	%edx,%ecx
+	adcl	$0,%ebp
+	movl	%ebx,24(%eax)
+	# saved r[6] 
+	# save r[7] 
+	movl	%ecx,28(%eax)
+	popl	%ebx
+	popl	%ebp
+	popl	%edi
+	popl	%esi
+	ret
+.globl	_bn_sqr_comba8
+.private_extern	_bn_sqr_comba8
+.align	4
+_bn_sqr_comba8:
+L_bn_sqr_comba8_begin:
+	pushl	%esi
+	pushl	%edi
+	pushl	%ebp
+	pushl	%ebx
+	movl	20(%esp),%edi
+	movl	24(%esp),%esi
+	xorl	%ebx,%ebx
+	xorl	%ecx,%ecx
+	movl	(%esi),%eax
+	# ############### Calculate word 0 
+	xorl	%ebp,%ebp
+	# sqr a[0]*a[0] 
+	mull	%eax
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	(%esi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,(%edi)
+	movl	4(%esi),%eax
+	# saved r[0] 
+	# ############### Calculate word 1 
+	xorl	%ebx,%ebx
+	# sqr a[1]*a[0] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	8(%esi),%eax
+	adcl	$0,%ebx
+	movl	%ecx,4(%edi)
+	movl	(%esi),%edx
+	# saved r[1] 
+	# ############### Calculate word 2 
+	xorl	%ecx,%ecx
+	# sqr a[2]*a[0] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	4(%esi),%eax
+	adcl	$0,%ecx
+	# sqr a[1]*a[1] 
+	mull	%eax
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	(%esi),%edx
+	adcl	$0,%ecx
+	movl	%ebp,8(%edi)
+	movl	12(%esi),%eax
+	# saved r[2] 
+	# ############### Calculate word 3 
+	xorl	%ebp,%ebp
+	# sqr a[3]*a[0] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	8(%esi),%eax
+	adcl	$0,%ebp
+	movl	4(%esi),%edx
+	# sqr a[2]*a[1] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	16(%esi),%eax
+	adcl	$0,%ebp
+	movl	%ebx,12(%edi)
+	movl	(%esi),%edx
+	# saved r[3] 
+	# ############### Calculate word 4 
+	xorl	%ebx,%ebx
+	# sqr a[4]*a[0] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	12(%esi),%eax
+	adcl	$0,%ebx
+	movl	4(%esi),%edx
+	# sqr a[3]*a[1] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	8(%esi),%eax
+	adcl	$0,%ebx
+	# sqr a[2]*a[2] 
+	mull	%eax
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	(%esi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,16(%edi)
+	movl	20(%esi),%eax
+	# saved r[4] 
+	# ############### Calculate word 5 
+	xorl	%ecx,%ecx
+	# sqr a[5]*a[0] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	16(%esi),%eax
+	adcl	$0,%ecx
+	movl	4(%esi),%edx
+	# sqr a[4]*a[1] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	12(%esi),%eax
+	adcl	$0,%ecx
+	movl	8(%esi),%edx
+	# sqr a[3]*a[2] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	24(%esi),%eax
+	adcl	$0,%ecx
+	movl	%ebp,20(%edi)
+	movl	(%esi),%edx
+	# saved r[5] 
+	# ############### Calculate word 6 
+	xorl	%ebp,%ebp
+	# sqr a[6]*a[0] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	20(%esi),%eax
+	adcl	$0,%ebp
+	movl	4(%esi),%edx
+	# sqr a[5]*a[1] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	16(%esi),%eax
+	adcl	$0,%ebp
+	movl	8(%esi),%edx
+	# sqr a[4]*a[2] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	12(%esi),%eax
+	adcl	$0,%ebp
+	# sqr a[3]*a[3] 
+	mull	%eax
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	(%esi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,24(%edi)
+	movl	28(%esi),%eax
+	# saved r[6] 
+	# ############### Calculate word 7 
+	xorl	%ebx,%ebx
+	# sqr a[7]*a[0] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	24(%esi),%eax
+	adcl	$0,%ebx
+	movl	4(%esi),%edx
+	# sqr a[6]*a[1] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	20(%esi),%eax
+	adcl	$0,%ebx
+	movl	8(%esi),%edx
+	# sqr a[5]*a[2] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	16(%esi),%eax
+	adcl	$0,%ebx
+	movl	12(%esi),%edx
+	# sqr a[4]*a[3] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	28(%esi),%eax
+	adcl	$0,%ebx
+	movl	%ecx,28(%edi)
+	movl	4(%esi),%edx
+	# saved r[7] 
+	# ############### Calculate word 8 
+	xorl	%ecx,%ecx
+	# sqr a[7]*a[1] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	24(%esi),%eax
+	adcl	$0,%ecx
+	movl	8(%esi),%edx
+	# sqr a[6]*a[2] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	20(%esi),%eax
+	adcl	$0,%ecx
+	movl	12(%esi),%edx
+	# sqr a[5]*a[3] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	16(%esi),%eax
+	adcl	$0,%ecx
+	# sqr a[4]*a[4] 
+	mull	%eax
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	8(%esi),%edx
+	adcl	$0,%ecx
+	movl	%ebp,32(%edi)
+	movl	28(%esi),%eax
+	# saved r[8] 
+	# ############### Calculate word 9 
+	xorl	%ebp,%ebp
+	# sqr a[7]*a[2] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	24(%esi),%eax
+	adcl	$0,%ebp
+	movl	12(%esi),%edx
+	# sqr a[6]*a[3] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	20(%esi),%eax
+	adcl	$0,%ebp
+	movl	16(%esi),%edx
+	# sqr a[5]*a[4] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	28(%esi),%eax
+	adcl	$0,%ebp
+	movl	%ebx,36(%edi)
+	movl	12(%esi),%edx
+	# saved r[9] 
+	# ############### Calculate word 10 
+	xorl	%ebx,%ebx
+	# sqr a[7]*a[3] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	24(%esi),%eax
+	adcl	$0,%ebx
+	movl	16(%esi),%edx
+	# sqr a[6]*a[4] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	20(%esi),%eax
+	adcl	$0,%ebx
+	# sqr a[5]*a[5] 
+	mull	%eax
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	16(%esi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,40(%edi)
+	movl	28(%esi),%eax
+	# saved r[10] 
+	# ############### Calculate word 11 
+	xorl	%ecx,%ecx
+	# sqr a[7]*a[4] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	24(%esi),%eax
+	adcl	$0,%ecx
+	movl	20(%esi),%edx
+	# sqr a[6]*a[5] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	28(%esi),%eax
+	adcl	$0,%ecx
+	movl	%ebp,44(%edi)
+	movl	20(%esi),%edx
+	# saved r[11] 
+	# ############### Calculate word 12 
+	xorl	%ebp,%ebp
+	# sqr a[7]*a[5] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	24(%esi),%eax
+	adcl	$0,%ebp
+	# sqr a[6]*a[6] 
+	mull	%eax
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	24(%esi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,48(%edi)
+	movl	28(%esi),%eax
+	# saved r[12] 
+	# ############### Calculate word 13 
+	xorl	%ebx,%ebx
+	# sqr a[7]*a[6] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	28(%esi),%eax
+	adcl	$0,%ebx
+	movl	%ecx,52(%edi)
+	# saved r[13] 
+	# ############### Calculate word 14 
+	xorl	%ecx,%ecx
+	# sqr a[7]*a[7] 
+	mull	%eax
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	adcl	$0,%ecx
+	movl	%ebp,56(%edi)
+	# saved r[14] 
+	movl	%ebx,60(%edi)
+	popl	%ebx
+	popl	%ebp
+	popl	%edi
+	popl	%esi
+	ret
+.globl	_bn_sqr_comba4
+.private_extern	_bn_sqr_comba4
+.align	4
+_bn_sqr_comba4:
+L_bn_sqr_comba4_begin:
+	pushl	%esi
+	pushl	%edi
+	pushl	%ebp
+	pushl	%ebx
+	movl	20(%esp),%edi
+	movl	24(%esp),%esi
+	xorl	%ebx,%ebx
+	xorl	%ecx,%ecx
+	movl	(%esi),%eax
+	# ############### Calculate word 0 
+	xorl	%ebp,%ebp
+	# sqr a[0]*a[0] 
+	mull	%eax
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	(%esi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,(%edi)
+	movl	4(%esi),%eax
+	# saved r[0] 
+	# ############### Calculate word 1 
+	xorl	%ebx,%ebx
+	# sqr a[1]*a[0] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	8(%esi),%eax
+	adcl	$0,%ebx
+	movl	%ecx,4(%edi)
+	movl	(%esi),%edx
+	# saved r[1] 
+	# ############### Calculate word 2 
+	xorl	%ecx,%ecx
+	# sqr a[2]*a[0] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	4(%esi),%eax
+	adcl	$0,%ecx
+	# sqr a[1]*a[1] 
+	mull	%eax
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	(%esi),%edx
+	adcl	$0,%ecx
+	movl	%ebp,8(%edi)
+	movl	12(%esi),%eax
+	# saved r[2] 
+	# ############### Calculate word 3 
+	xorl	%ebp,%ebp
+	# sqr a[3]*a[0] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	8(%esi),%eax
+	adcl	$0,%ebp
+	movl	4(%esi),%edx
+	# sqr a[2]*a[1] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	12(%esi),%eax
+	adcl	$0,%ebp
+	movl	%ebx,12(%edi)
+	movl	4(%esi),%edx
+	# saved r[3] 
+	# ############### Calculate word 4 
+	xorl	%ebx,%ebx
+	# sqr a[3]*a[1] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	8(%esi),%eax
+	adcl	$0,%ebx
+	# sqr a[2]*a[2] 
+	mull	%eax
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	8(%esi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,16(%edi)
+	movl	12(%esi),%eax
+	# saved r[4] 
+	# ############### Calculate word 5 
+	xorl	%ecx,%ecx
+	# sqr a[3]*a[2] 
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	12(%esi),%eax
+	adcl	$0,%ecx
+	movl	%ebp,20(%edi)
+	# saved r[5] 
+	# ############### Calculate word 6 
+	xorl	%ebp,%ebp
+	# sqr a[3]*a[3] 
+	mull	%eax
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	adcl	$0,%ebp
+	movl	%ebx,24(%edi)
+	# saved r[6] 
+	movl	%ecx,28(%edi)
+	popl	%ebx
+	popl	%ebp
+	popl	%edi
+	popl	%esi
+	ret
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/bcm/co-586-linux.S b/gen/bcm/co-586-linux.S
new file mode 100644
index 0000000..b4812e3
--- /dev/null
+++ b/gen/bcm/co-586-linux.S
@@ -0,0 +1,1264 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
+.text
+.globl	bn_mul_comba8
+.hidden	bn_mul_comba8
+.type	bn_mul_comba8,@function
+.align	16
+bn_mul_comba8:
+.L_bn_mul_comba8_begin:
+	pushl	%esi
+	movl	12(%esp),%esi
+	pushl	%edi
+	movl	20(%esp),%edi
+	pushl	%ebp
+	pushl	%ebx
+	xorl	%ebx,%ebx
+	movl	(%esi),%eax
+	xorl	%ecx,%ecx
+	movl	(%edi),%edx
+
+	xorl	%ebp,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esp),%eax
+	adcl	%edx,%ecx
+	movl	(%edi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,(%eax)
+	movl	4(%esi),%eax
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	(%esi),%eax
+	adcl	%edx,%ebp
+	movl	4(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	20(%esp),%eax
+	adcl	%edx,%ebp
+	movl	(%edi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,4(%eax)
+	movl	8(%esi),%eax
+
+
+	xorl	%ecx,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	4(%esi),%eax
+	adcl	%edx,%ebx
+	movl	4(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	(%esi),%eax
+	adcl	%edx,%ebx
+	movl	8(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	20(%esp),%eax
+	adcl	%edx,%ebx
+	movl	(%edi),%edx
+	adcl	$0,%ecx
+	movl	%ebp,8(%eax)
+	movl	12(%esi),%eax
+
+
+	xorl	%ebp,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	8(%esi),%eax
+	adcl	%edx,%ecx
+	movl	4(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	4(%esi),%eax
+	adcl	%edx,%ecx
+	movl	8(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	(%esi),%eax
+	adcl	%edx,%ecx
+	movl	12(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esp),%eax
+	adcl	%edx,%ecx
+	movl	(%edi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,12(%eax)
+	movl	16(%esi),%eax
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	12(%esi),%eax
+	adcl	%edx,%ebp
+	movl	4(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	8(%esi),%eax
+	adcl	%edx,%ebp
+	movl	8(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	4(%esi),%eax
+	adcl	%edx,%ebp
+	movl	12(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	(%esi),%eax
+	adcl	%edx,%ebp
+	movl	16(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	20(%esp),%eax
+	adcl	%edx,%ebp
+	movl	(%edi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,16(%eax)
+	movl	20(%esi),%eax
+
+
+	xorl	%ecx,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	16(%esi),%eax
+	adcl	%edx,%ebx
+	movl	4(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	12(%esi),%eax
+	adcl	%edx,%ebx
+	movl	8(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	8(%esi),%eax
+	adcl	%edx,%ebx
+	movl	12(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	4(%esi),%eax
+	adcl	%edx,%ebx
+	movl	16(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	(%esi),%eax
+	adcl	%edx,%ebx
+	movl	20(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	20(%esp),%eax
+	adcl	%edx,%ebx
+	movl	(%edi),%edx
+	adcl	$0,%ecx
+	movl	%ebp,20(%eax)
+	movl	24(%esi),%eax
+
+
+	xorl	%ebp,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esi),%eax
+	adcl	%edx,%ecx
+	movl	4(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	16(%esi),%eax
+	adcl	%edx,%ecx
+	movl	8(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	12(%esi),%eax
+	adcl	%edx,%ecx
+	movl	12(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	8(%esi),%eax
+	adcl	%edx,%ecx
+	movl	16(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	4(%esi),%eax
+	adcl	%edx,%ecx
+	movl	20(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	(%esi),%eax
+	adcl	%edx,%ecx
+	movl	24(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esp),%eax
+	adcl	%edx,%ecx
+	movl	(%edi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,24(%eax)
+	movl	28(%esi),%eax
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	24(%esi),%eax
+	adcl	%edx,%ebp
+	movl	4(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	20(%esi),%eax
+	adcl	%edx,%ebp
+	movl	8(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	16(%esi),%eax
+	adcl	%edx,%ebp
+	movl	12(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	12(%esi),%eax
+	adcl	%edx,%ebp
+	movl	16(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	8(%esi),%eax
+	adcl	%edx,%ebp
+	movl	20(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	4(%esi),%eax
+	adcl	%edx,%ebp
+	movl	24(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	(%esi),%eax
+	adcl	%edx,%ebp
+	movl	28(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	20(%esp),%eax
+	adcl	%edx,%ebp
+	movl	4(%edi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,28(%eax)
+	movl	28(%esi),%eax
+
+
+	xorl	%ecx,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	24(%esi),%eax
+	adcl	%edx,%ebx
+	movl	8(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	20(%esi),%eax
+	adcl	%edx,%ebx
+	movl	12(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	16(%esi),%eax
+	adcl	%edx,%ebx
+	movl	16(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	12(%esi),%eax
+	adcl	%edx,%ebx
+	movl	20(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	8(%esi),%eax
+	adcl	%edx,%ebx
+	movl	24(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	4(%esi),%eax
+	adcl	%edx,%ebx
+	movl	28(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	20(%esp),%eax
+	adcl	%edx,%ebx
+	movl	8(%edi),%edx
+	adcl	$0,%ecx
+	movl	%ebp,32(%eax)
+	movl	28(%esi),%eax
+
+
+	xorl	%ebp,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	24(%esi),%eax
+	adcl	%edx,%ecx
+	movl	12(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esi),%eax
+	adcl	%edx,%ecx
+	movl	16(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	16(%esi),%eax
+	adcl	%edx,%ecx
+	movl	20(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	12(%esi),%eax
+	adcl	%edx,%ecx
+	movl	24(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	8(%esi),%eax
+	adcl	%edx,%ecx
+	movl	28(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esp),%eax
+	adcl	%edx,%ecx
+	movl	12(%edi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,36(%eax)
+	movl	28(%esi),%eax
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	24(%esi),%eax
+	adcl	%edx,%ebp
+	movl	16(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	20(%esi),%eax
+	adcl	%edx,%ebp
+	movl	20(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	16(%esi),%eax
+	adcl	%edx,%ebp
+	movl	24(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	12(%esi),%eax
+	adcl	%edx,%ebp
+	movl	28(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	20(%esp),%eax
+	adcl	%edx,%ebp
+	movl	16(%edi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,40(%eax)
+	movl	28(%esi),%eax
+
+
+	xorl	%ecx,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	24(%esi),%eax
+	adcl	%edx,%ebx
+	movl	20(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	20(%esi),%eax
+	adcl	%edx,%ebx
+	movl	24(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	16(%esi),%eax
+	adcl	%edx,%ebx
+	movl	28(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	20(%esp),%eax
+	adcl	%edx,%ebx
+	movl	20(%edi),%edx
+	adcl	$0,%ecx
+	movl	%ebp,44(%eax)
+	movl	28(%esi),%eax
+
+
+	xorl	%ebp,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	24(%esi),%eax
+	adcl	%edx,%ecx
+	movl	24(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esi),%eax
+	adcl	%edx,%ecx
+	movl	28(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esp),%eax
+	adcl	%edx,%ecx
+	movl	24(%edi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,48(%eax)
+	movl	28(%esi),%eax
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	24(%esi),%eax
+	adcl	%edx,%ebp
+	movl	28(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	20(%esp),%eax
+	adcl	%edx,%ebp
+	movl	28(%edi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,52(%eax)
+	movl	28(%esi),%eax
+
+
+	xorl	%ecx,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	20(%esp),%eax
+	adcl	%edx,%ebx
+	adcl	$0,%ecx
+	movl	%ebp,56(%eax)
+
+
+	movl	%ebx,60(%eax)
+	popl	%ebx
+	popl	%ebp
+	popl	%edi
+	popl	%esi
+	ret
+.size	bn_mul_comba8,.-.L_bn_mul_comba8_begin
+.globl	bn_mul_comba4
+.hidden	bn_mul_comba4
+.type	bn_mul_comba4,@function
+.align	16
+bn_mul_comba4:
+.L_bn_mul_comba4_begin:
+	pushl	%esi
+	movl	12(%esp),%esi
+	pushl	%edi
+	movl	20(%esp),%edi
+	pushl	%ebp
+	pushl	%ebx
+	xorl	%ebx,%ebx
+	movl	(%esi),%eax
+	xorl	%ecx,%ecx
+	movl	(%edi),%edx
+
+	xorl	%ebp,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esp),%eax
+	adcl	%edx,%ecx
+	movl	(%edi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,(%eax)
+	movl	4(%esi),%eax
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	(%esi),%eax
+	adcl	%edx,%ebp
+	movl	4(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	20(%esp),%eax
+	adcl	%edx,%ebp
+	movl	(%edi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,4(%eax)
+	movl	8(%esi),%eax
+
+
+	xorl	%ecx,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	4(%esi),%eax
+	adcl	%edx,%ebx
+	movl	4(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	(%esi),%eax
+	adcl	%edx,%ebx
+	movl	8(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	20(%esp),%eax
+	adcl	%edx,%ebx
+	movl	(%edi),%edx
+	adcl	$0,%ecx
+	movl	%ebp,8(%eax)
+	movl	12(%esi),%eax
+
+
+	xorl	%ebp,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	8(%esi),%eax
+	adcl	%edx,%ecx
+	movl	4(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	4(%esi),%eax
+	adcl	%edx,%ecx
+	movl	8(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	(%esi),%eax
+	adcl	%edx,%ecx
+	movl	12(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esp),%eax
+	adcl	%edx,%ecx
+	movl	4(%edi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,12(%eax)
+	movl	12(%esi),%eax
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	8(%esi),%eax
+	adcl	%edx,%ebp
+	movl	8(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	4(%esi),%eax
+	adcl	%edx,%ebp
+	movl	12(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	20(%esp),%eax
+	adcl	%edx,%ebp
+	movl	8(%edi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,16(%eax)
+	movl	12(%esi),%eax
+
+
+	xorl	%ecx,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	8(%esi),%eax
+	adcl	%edx,%ebx
+	movl	12(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	20(%esp),%eax
+	adcl	%edx,%ebx
+	movl	12(%edi),%edx
+	adcl	$0,%ecx
+	movl	%ebp,20(%eax)
+	movl	12(%esi),%eax
+
+
+	xorl	%ebp,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esp),%eax
+	adcl	%edx,%ecx
+	adcl	$0,%ebp
+	movl	%ebx,24(%eax)
+
+
+	movl	%ecx,28(%eax)
+	popl	%ebx
+	popl	%ebp
+	popl	%edi
+	popl	%esi
+	ret
+.size	bn_mul_comba4,.-.L_bn_mul_comba4_begin
+.globl	bn_sqr_comba8
+.hidden	bn_sqr_comba8
+.type	bn_sqr_comba8,@function
+.align	16
+bn_sqr_comba8:
+.L_bn_sqr_comba8_begin:
+	pushl	%esi
+	pushl	%edi
+	pushl	%ebp
+	pushl	%ebx
+	movl	20(%esp),%edi
+	movl	24(%esp),%esi
+	xorl	%ebx,%ebx
+	xorl	%ecx,%ecx
+	movl	(%esi),%eax
+
+	xorl	%ebp,%ebp
+
+	mull	%eax
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	(%esi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,(%edi)
+	movl	4(%esi),%eax
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	8(%esi),%eax
+	adcl	$0,%ebx
+	movl	%ecx,4(%edi)
+	movl	(%esi),%edx
+
+
+	xorl	%ecx,%ecx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	4(%esi),%eax
+	adcl	$0,%ecx
+
+	mull	%eax
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	(%esi),%edx
+	adcl	$0,%ecx
+	movl	%ebp,8(%edi)
+	movl	12(%esi),%eax
+
+
+	xorl	%ebp,%ebp
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	8(%esi),%eax
+	adcl	$0,%ebp
+	movl	4(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	16(%esi),%eax
+	adcl	$0,%ebp
+	movl	%ebx,12(%edi)
+	movl	(%esi),%edx
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	12(%esi),%eax
+	adcl	$0,%ebx
+	movl	4(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	8(%esi),%eax
+	adcl	$0,%ebx
+
+	mull	%eax
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	(%esi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,16(%edi)
+	movl	20(%esi),%eax
+
+
+	xorl	%ecx,%ecx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	16(%esi),%eax
+	adcl	$0,%ecx
+	movl	4(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	12(%esi),%eax
+	adcl	$0,%ecx
+	movl	8(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	24(%esi),%eax
+	adcl	$0,%ecx
+	movl	%ebp,20(%edi)
+	movl	(%esi),%edx
+
+
+	xorl	%ebp,%ebp
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	20(%esi),%eax
+	adcl	$0,%ebp
+	movl	4(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	16(%esi),%eax
+	adcl	$0,%ebp
+	movl	8(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	12(%esi),%eax
+	adcl	$0,%ebp
+
+	mull	%eax
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	(%esi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,24(%edi)
+	movl	28(%esi),%eax
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	24(%esi),%eax
+	adcl	$0,%ebx
+	movl	4(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	20(%esi),%eax
+	adcl	$0,%ebx
+	movl	8(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	16(%esi),%eax
+	adcl	$0,%ebx
+	movl	12(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	28(%esi),%eax
+	adcl	$0,%ebx
+	movl	%ecx,28(%edi)
+	movl	4(%esi),%edx
+
+
+	xorl	%ecx,%ecx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	24(%esi),%eax
+	adcl	$0,%ecx
+	movl	8(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	20(%esi),%eax
+	adcl	$0,%ecx
+	movl	12(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	16(%esi),%eax
+	adcl	$0,%ecx
+
+	mull	%eax
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	8(%esi),%edx
+	adcl	$0,%ecx
+	movl	%ebp,32(%edi)
+	movl	28(%esi),%eax
+
+
+	xorl	%ebp,%ebp
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	24(%esi),%eax
+	adcl	$0,%ebp
+	movl	12(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	20(%esi),%eax
+	adcl	$0,%ebp
+	movl	16(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	28(%esi),%eax
+	adcl	$0,%ebp
+	movl	%ebx,36(%edi)
+	movl	12(%esi),%edx
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	24(%esi),%eax
+	adcl	$0,%ebx
+	movl	16(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	20(%esi),%eax
+	adcl	$0,%ebx
+
+	mull	%eax
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	16(%esi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,40(%edi)
+	movl	28(%esi),%eax
+
+
+	xorl	%ecx,%ecx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	24(%esi),%eax
+	adcl	$0,%ecx
+	movl	20(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	28(%esi),%eax
+	adcl	$0,%ecx
+	movl	%ebp,44(%edi)
+	movl	20(%esi),%edx
+
+
+	xorl	%ebp,%ebp
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	24(%esi),%eax
+	adcl	$0,%ebp
+
+	mull	%eax
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	24(%esi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,48(%edi)
+	movl	28(%esi),%eax
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	28(%esi),%eax
+	adcl	$0,%ebx
+	movl	%ecx,52(%edi)
+
+
+	xorl	%ecx,%ecx
+
+	mull	%eax
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	adcl	$0,%ecx
+	movl	%ebp,56(%edi)
+
+	movl	%ebx,60(%edi)
+	popl	%ebx
+	popl	%ebp
+	popl	%edi
+	popl	%esi
+	ret
+.size	bn_sqr_comba8,.-.L_bn_sqr_comba8_begin
+.globl	bn_sqr_comba4
+.hidden	bn_sqr_comba4
+.type	bn_sqr_comba4,@function
+.align	16
+bn_sqr_comba4:
+.L_bn_sqr_comba4_begin:
+	pushl	%esi
+	pushl	%edi
+	pushl	%ebp
+	pushl	%ebx
+	movl	20(%esp),%edi
+	movl	24(%esp),%esi
+	xorl	%ebx,%ebx
+	xorl	%ecx,%ecx
+	movl	(%esi),%eax
+
+	xorl	%ebp,%ebp
+
+	mull	%eax
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	(%esi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,(%edi)
+	movl	4(%esi),%eax
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	8(%esi),%eax
+	adcl	$0,%ebx
+	movl	%ecx,4(%edi)
+	movl	(%esi),%edx
+
+
+	xorl	%ecx,%ecx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	4(%esi),%eax
+	adcl	$0,%ecx
+
+	mull	%eax
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	(%esi),%edx
+	adcl	$0,%ecx
+	movl	%ebp,8(%edi)
+	movl	12(%esi),%eax
+
+
+	xorl	%ebp,%ebp
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	8(%esi),%eax
+	adcl	$0,%ebp
+	movl	4(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	12(%esi),%eax
+	adcl	$0,%ebp
+	movl	%ebx,12(%edi)
+	movl	4(%esi),%edx
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	8(%esi),%eax
+	adcl	$0,%ebx
+
+	mull	%eax
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	8(%esi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,16(%edi)
+	movl	12(%esi),%eax
+
+
+	xorl	%ecx,%ecx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	12(%esi),%eax
+	adcl	$0,%ecx
+	movl	%ebp,20(%edi)
+
+
+	xorl	%ebp,%ebp
+
+	mull	%eax
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	adcl	$0,%ebp
+	movl	%ebx,24(%edi)
+
+	movl	%ecx,28(%edi)
+	popl	%ebx
+	popl	%ebp
+	popl	%edi
+	popl	%esi
+	ret
+.size	bn_sqr_comba4,.-.L_bn_sqr_comba4_begin
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
diff --git a/gen/bcm/co-586-win.asm b/gen/bcm/co-586-win.asm
new file mode 100644
index 0000000..6ad4696
--- /dev/null
+++ b/gen/bcm/co-586-win.asm
@@ -0,0 +1,1263 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__, win32
+%ifidn __OUTPUT_FORMAT__,obj
+section	code	use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+$@feat.00 equ 1
+section	.text	code align=64
+%else
+section	.text	code
+%endif
+global	_bn_mul_comba8
+align	16
+_bn_mul_comba8:
+L$_bn_mul_comba8_begin:
+	push	esi
+	mov	esi,DWORD [12+esp]
+	push	edi
+	mov	edi,DWORD [20+esp]
+	push	ebp
+	push	ebx
+	xor	ebx,ebx
+	mov	eax,DWORD [esi]
+	xor	ecx,ecx
+	mov	edx,DWORD [edi]
+	; ################## Calculate word 0
+	xor	ebp,ebp
+	; mul a[0]*b[0]
+	mul	edx
+	add	ebx,eax
+	mov	eax,DWORD [20+esp]
+	adc	ecx,edx
+	mov	edx,DWORD [edi]
+	adc	ebp,0
+	mov	DWORD [eax],ebx
+	mov	eax,DWORD [4+esi]
+	; saved r[0]
+	; ################## Calculate word 1
+	xor	ebx,ebx
+	; mul a[1]*b[0]
+	mul	edx
+	add	ecx,eax
+	mov	eax,DWORD [esi]
+	adc	ebp,edx
+	mov	edx,DWORD [4+edi]
+	adc	ebx,0
+	; mul a[0]*b[1]
+	mul	edx
+	add	ecx,eax
+	mov	eax,DWORD [20+esp]
+	adc	ebp,edx
+	mov	edx,DWORD [edi]
+	adc	ebx,0
+	mov	DWORD [4+eax],ecx
+	mov	eax,DWORD [8+esi]
+	; saved r[1]
+	; ################## Calculate word 2
+	xor	ecx,ecx
+	; mul a[2]*b[0]
+	mul	edx
+	add	ebp,eax
+	mov	eax,DWORD [4+esi]
+	adc	ebx,edx
+	mov	edx,DWORD [4+edi]
+	adc	ecx,0
+	; mul a[1]*b[1]
+	mul	edx
+	add	ebp,eax
+	mov	eax,DWORD [esi]
+	adc	ebx,edx
+	mov	edx,DWORD [8+edi]
+	adc	ecx,0
+	; mul a[0]*b[2]
+	mul	edx
+	add	ebp,eax
+	mov	eax,DWORD [20+esp]
+	adc	ebx,edx
+	mov	edx,DWORD [edi]
+	adc	ecx,0
+	mov	DWORD [8+eax],ebp
+	mov	eax,DWORD [12+esi]
+	; saved r[2]
+	; ################## Calculate word 3
+	xor	ebp,ebp
+	; mul a[3]*b[0]
+	mul	edx
+	add	ebx,eax
+	mov	eax,DWORD [8+esi]
+	adc	ecx,edx
+	mov	edx,DWORD [4+edi]
+	adc	ebp,0
+	; mul a[2]*b[1]
+	mul	edx
+	add	ebx,eax
+	mov	eax,DWORD [4+esi]
+	adc	ecx,edx
+	mov	edx,DWORD [8+edi]
+	adc	ebp,0
+	; mul a[1]*b[2]
+	mul	edx
+	add	ebx,eax
+	mov	eax,DWORD [esi]
+	adc	ecx,edx
+	mov	edx,DWORD [12+edi]
+	adc	ebp,0
+	; mul a[0]*b[3]
+	mul	edx
+	add	ebx,eax
+	mov	eax,DWORD [20+esp]
+	adc	ecx,edx
+	mov	edx,DWORD [edi]
+	adc	ebp,0
+	mov	DWORD [12+eax],ebx
+	mov	eax,DWORD [16+esi]
+	; saved r[3]
+	; ################## Calculate word 4
+	xor	ebx,ebx
+	; mul a[4]*b[0]
+	mul	edx
+	add	ecx,eax
+	mov	eax,DWORD [12+esi]
+	adc	ebp,edx
+	mov	edx,DWORD [4+edi]
+	adc	ebx,0
+	; mul a[3]*b[1]
+	mul	edx
+	add	ecx,eax
+	mov	eax,DWORD [8+esi]
+	adc	ebp,edx
+	mov	edx,DWORD [8+edi]
+	adc	ebx,0
+	; mul a[2]*b[2]
+	mul	edx
+	add	ecx,eax
+	mov	eax,DWORD [4+esi]
+	adc	ebp,edx
+	mov	edx,DWORD [12+edi]
+	adc	ebx,0
+	; mul a[1]*b[3]
+	mul	edx
+	add	ecx,eax
+	mov	eax,DWORD [esi]
+	adc	ebp,edx
+	mov	edx,DWORD [16+edi]
+	adc	ebx,0
+	; mul a[0]*b[4]
+	mul	edx
+	add	ecx,eax
+	mov	eax,DWORD [20+esp]
+	adc	ebp,edx
+	mov	edx,DWORD [edi]
+	adc	ebx,0
+	mov	DWORD [16+eax],ecx
+	mov	eax,DWORD [20+esi]
+	; saved r[4]
+	; ################## Calculate word 5
+	xor	ecx,ecx
+	; mul a[5]*b[0]
+	mul	edx
+	add	ebp,eax
+	mov	eax,DWORD [16+esi]
+	adc	ebx,edx
+	mov	edx,DWORD [4+edi]
+	adc	ecx,0
+	; mul a[4]*b[1]
+	mul	edx
+	add	ebp,eax
+	mov	eax,DWORD [12+esi]
+	adc	ebx,edx
+	mov	edx,DWORD [8+edi]
+	adc	ecx,0
+	; mul a[3]*b[2]
+	mul	edx
+	add	ebp,eax
+	mov	eax,DWORD [8+esi]
+	adc	ebx,edx
+	mov	edx,DWORD [12+edi]
+	adc	ecx,0
+	; mul a[2]*b[3]
+	mul	edx
+	add	ebp,eax
+	mov	eax,DWORD [4+esi]
+	adc	ebx,edx
+	mov	edx,DWORD [16+edi]
+	adc	ecx,0
+	; mul a[1]*b[4]
+	mul	edx
+	add	ebp,eax
+	mov	eax,DWORD [esi]
+	adc	ebx,edx
+	mov	edx,DWORD [20+edi]
+	adc	ecx,0
+	; mul a[0]*b[5]
+	mul	edx
+	add	ebp,eax
+	mov	eax,DWORD [20+esp]
+	adc	ebx,edx
+	mov	edx,DWORD [edi]
+	adc	ecx,0
+	mov	DWORD [20+eax],ebp
+	mov	eax,DWORD [24+esi]
+	; saved r[5]
+	; ################## Calculate word 6
+	xor	ebp,ebp
+	; mul a[6]*b[0]
+	mul	edx
+	add	ebx,eax
+	mov	eax,DWORD [20+esi]
+	adc	ecx,edx
+	mov	edx,DWORD [4+edi]
+	adc	ebp,0
+	; mul a[5]*b[1]
+	mul	edx
+	add	ebx,eax
+	mov	eax,DWORD [16+esi]
+	adc	ecx,edx
+	mov	edx,DWORD [8+edi]
+	adc	ebp,0
+	; mul a[4]*b[2]
+	mul	edx
+	add	ebx,eax
+	mov	eax,DWORD [12+esi]
+	adc	ecx,edx
+	mov	edx,DWORD [12+edi]
+	adc	ebp,0
+	; mul a[3]*b[3]
+	mul	edx
+	add	ebx,eax
+	mov	eax,DWORD [8+esi]
+	adc	ecx,edx
+	mov	edx,DWORD [16+edi]
+	adc	ebp,0
+	; mul a[2]*b[4]
+	mul	edx
+	add	ebx,eax
+	mov	eax,DWORD [4+esi]
+	adc	ecx,edx
+	mov	edx,DWORD [20+edi]
+	adc	ebp,0
+	; mul a[1]*b[5]
+	mul	edx
+	add	ebx,eax
+	mov	eax,DWORD [esi]
+	adc	ecx,edx
+	mov	edx,DWORD [24+edi]
+	adc	ebp,0
+	; mul a[0]*b[6]
+	mul	edx
+	add	ebx,eax
+	mov	eax,DWORD [20+esp]
+	adc	ecx,edx
+	mov	edx,DWORD [edi]
+	adc	ebp,0
+	mov	DWORD [24+eax],ebx
+	mov	eax,DWORD [28+esi]
+	; saved r[6]
+	; ################## Calculate word 7
+	xor	ebx,ebx
+	; mul a[7]*b[0]
+	mul	edx
+	add	ecx,eax
+	mov	eax,DWORD [24+esi]
+	adc	ebp,edx
+	mov	edx,DWORD [4+edi]
+	adc	ebx,0
+	; mul a[6]*b[1]
+	mul	edx
+	add	ecx,eax
+	mov	eax,DWORD [20+esi]
+	adc	ebp,edx
+	mov	edx,DWORD [8+edi]
+	adc	ebx,0
+	; mul a[5]*b[2]
+	mul	edx
+	add	ecx,eax
+	mov	eax,DWORD [16+esi]
+	adc	ebp,edx
+	mov	edx,DWORD [12+edi]
+	adc	ebx,0
+	; mul a[4]*b[3]
+	mul	edx
+	add	ecx,eax
+	mov	eax,DWORD [12+esi]
+	adc	ebp,edx
+	mov	edx,DWORD [16+edi]
+	adc	ebx,0
+	; mul a[3]*b[4]
+	mul	edx
+	add	ecx,eax
+	mov	eax,DWORD [8+esi]
+	adc	ebp,edx
+	mov	edx,DWORD [20+edi]
+	adc	ebx,0
+	; mul a[2]*b[5]
+	mul	edx
+	add	ecx,eax
+	mov	eax,DWORD [4+esi]
+	adc	ebp,edx
+	mov	edx,DWORD [24+edi]
+	adc	ebx,0
+	; mul a[1]*b[6]
+	mul	edx
+	add	ecx,eax
+	mov	eax,DWORD [esi]
+	adc	ebp,edx
+	mov	edx,DWORD [28+edi]
+	adc	ebx,0
+	; mul a[0]*b[7]
+	mul	edx
+	add	ecx,eax
+	mov	eax,DWORD [20+esp]
+	adc	ebp,edx
+	mov	edx,DWORD [4+edi]
+	adc	ebx,0
+	mov	DWORD [28+eax],ecx
+	mov	eax,DWORD [28+esi]
+	; saved r[7]
+	; ################## Calculate word 8
+	xor	ecx,ecx
+	; mul a[7]*b[1]
+	mul	edx
+	add	ebp,eax
+	mov	eax,DWORD [24+esi]
+	adc	ebx,edx
+	mov	edx,DWORD [8+edi]
+	adc	ecx,0
+	; mul a[6]*b[2]
+	mul	edx
+	add	ebp,eax
+	mov	eax,DWORD [20+esi]
+	adc	ebx,edx
+	mov	edx,DWORD [12+edi]
+	adc	ecx,0
+	; mul a[5]*b[3]
+	mul	edx
+	add	ebp,eax
+	mov	eax,DWORD [16+esi]
+	adc	ebx,edx
+	mov	edx,DWORD [16+edi]
+	adc	ecx,0
+	; mul a[4]*b[4]
+	mul	edx
+	add	ebp,eax
+	mov	eax,DWORD [12+esi]
+	adc	ebx,edx
+	mov	edx,DWORD [20+edi]
+	adc	ecx,0
+	; mul a[3]*b[5]
+	mul	edx
+	add	ebp,eax
+	mov	eax,DWORD [8+esi]
+	adc	ebx,edx
+	mov	edx,DWORD [24+edi]
+	adc	ecx,0
+	; mul a[2]*b[6]
+	mul	edx
+	add	ebp,eax
+	mov	eax,DWORD [4+esi]
+	adc	ebx,edx
+	mov	edx,DWORD [28+edi]
+	adc	ecx,0
+	; mul a[1]*b[7]
+	mul	edx
+	add	ebp,eax
+	mov	eax,DWORD [20+esp]
+	adc	ebx,edx
+	mov	edx,DWORD [8+edi]
+	adc	ecx,0
+	mov	DWORD [32+eax],ebp
+	mov	eax,DWORD [28+esi]
+	; saved r[8]
+	; ################## Calculate word 9
+	xor	ebp,ebp
+	; mul a[7]*b[2]
+	mul	edx
+	add	ebx,eax
+	mov	eax,DWORD [24+esi]
+	adc	ecx,edx
+	mov	edx,DWORD [12+edi]
+	adc	ebp,0
+	; mul a[6]*b[3]
+	mul	edx
+	add	ebx,eax
+	mov	eax,DWORD [20+esi]
+	adc	ecx,edx
+	mov	edx,DWORD [16+edi]
+	adc	ebp,0
+	; mul a[5]*b[4]
+	mul	edx
+	add	ebx,eax
+	mov	eax,DWORD [16+esi]
+	adc	ecx,edx
+	mov	edx,DWORD [20+edi]
+	adc	ebp,0
+	; mul a[4]*b[5]
+	mul	edx
+	add	ebx,eax
+	mov	eax,DWORD [12+esi]
+	adc	ecx,edx
+	mov	edx,DWORD [24+edi]
+	adc	ebp,0
+	; mul a[3]*b[6]
+	mul	edx
+	add	ebx,eax
+	mov	eax,DWORD [8+esi]
+	adc	ecx,edx
+	mov	edx,DWORD [28+edi]
+	adc	ebp,0
+	; mul a[2]*b[7]
+	mul	edx
+	add	ebx,eax
+	mov	eax,DWORD [20+esp]
+	adc	ecx,edx
+	mov	edx,DWORD [12+edi]
+	adc	ebp,0
+	mov	DWORD [36+eax],ebx
+	mov	eax,DWORD [28+esi]
+	; saved r[9]
+	; ################## Calculate word 10
+	xor	ebx,ebx
+	; mul a[7]*b[3]
+	mul	edx
+	add	ecx,eax
+	mov	eax,DWORD [24+esi]
+	adc	ebp,edx
+	mov	edx,DWORD [16+edi]
+	adc	ebx,0
+	; mul a[6]*b[4]
+	mul	edx
+	add	ecx,eax
+	mov	eax,DWORD [20+esi]
+	adc	ebp,edx
+	mov	edx,DWORD [20+edi]
+	adc	ebx,0
+	; mul a[5]*b[5]
+	mul	edx
+	add	ecx,eax
+	mov	eax,DWORD [16+esi]
+	adc	ebp,edx
+	mov	edx,DWORD [24+edi]
+	adc	ebx,0
+	; mul a[4]*b[6]
+	mul	edx
+	add	ecx,eax
+	mov	eax,DWORD [12+esi]
+	adc	ebp,edx
+	mov	edx,DWORD [28+edi]
+	adc	ebx,0
+	; mul a[3]*b[7]
+	mul	edx
+	add	ecx,eax
+	mov	eax,DWORD [20+esp]
+	adc	ebp,edx
+	mov	edx,DWORD [16+edi]
+	adc	ebx,0
+	mov	DWORD [40+eax],ecx
+	mov	eax,DWORD [28+esi]
+	; saved r[10]
+	; ################## Calculate word 11
+	xor	ecx,ecx
+	; mul a[7]*b[4]
+	mul	edx
+	add	ebp,eax
+	mov	eax,DWORD [24+esi]
+	adc	ebx,edx
+	mov	edx,DWORD [20+edi]
+	adc	ecx,0
+	; mul a[6]*b[5]
+	mul	edx
+	add	ebp,eax
+	mov	eax,DWORD [20+esi]
+	adc	ebx,edx
+	mov	edx,DWORD [24+edi]
+	adc	ecx,0
+	; mul a[5]*b[6]
+	mul	edx
+	add	ebp,eax
+	mov	eax,DWORD [16+esi]
+	adc	ebx,edx
+	mov	edx,DWORD [28+edi]
+	adc	ecx,0
+	; mul a[4]*b[7]
+	mul	edx
+	add	ebp,eax
+	mov	eax,DWORD [20+esp]
+	adc	ebx,edx
+	mov	edx,DWORD [20+edi]
+	adc	ecx,0
+	mov	DWORD [44+eax],ebp
+	mov	eax,DWORD [28+esi]
+	; saved r[11]
+	; ################## Calculate word 12
+	xor	ebp,ebp
+	; mul a[7]*b[5]
+	mul	edx
+	add	ebx,eax
+	mov	eax,DWORD [24+esi]
+	adc	ecx,edx
+	mov	edx,DWORD [24+edi]
+	adc	ebp,0
+	; mul a[6]*b[6]
+	mul	edx
+	add	ebx,eax
+	mov	eax,DWORD [20+esi]
+	adc	ecx,edx
+	mov	edx,DWORD [28+edi]
+	adc	ebp,0
+	; mul a[5]*b[7]
+	mul	edx
+	add	ebx,eax
+	mov	eax,DWORD [20+esp]
+	adc	ecx,edx
+	mov	edx,DWORD [24+edi]
+	adc	ebp,0
+	mov	DWORD [48+eax],ebx
+	mov	eax,DWORD [28+esi]
+	; saved r[12]
+	; ################## Calculate word 13
+	xor	ebx,ebx
+	; mul a[7]*b[6]
+	mul	edx
+	add	ecx,eax
+	mov	eax,DWORD [24+esi]
+	adc	ebp,edx
+	mov	edx,DWORD [28+edi]
+	adc	ebx,0
+	; mul a[6]*b[7]
+	mul	edx
+	add	ecx,eax
+	mov	eax,DWORD [20+esp]
+	adc	ebp,edx
+	mov	edx,DWORD [28+edi]
+	adc	ebx,0
+	mov	DWORD [52+eax],ecx
+	mov	eax,DWORD [28+esi]
+	; saved r[13]
+	; ################## Calculate word 14
+	xor	ecx,ecx
+	; mul a[7]*b[7]
+	mul	edx
+	add	ebp,eax
+	mov	eax,DWORD [20+esp]
+	adc	ebx,edx
+	adc	ecx,0
+	mov	DWORD [56+eax],ebp
+	; saved r[14]
+	; save r[15]
+	mov	DWORD [60+eax],ebx
+	pop	ebx
+	pop	ebp
+	pop	edi
+	pop	esi
+	ret
+global	_bn_mul_comba4
+align	16
+_bn_mul_comba4:
+L$_bn_mul_comba4_begin:
+	push	esi
+	mov	esi,DWORD [12+esp]
+	push	edi
+	mov	edi,DWORD [20+esp]
+	push	ebp
+	push	ebx
+	xor	ebx,ebx
+	mov	eax,DWORD [esi]
+	xor	ecx,ecx
+	mov	edx,DWORD [edi]
+	; ################## Calculate word 0
+	xor	ebp,ebp
+	; mul a[0]*b[0]
+	mul	edx
+	add	ebx,eax
+	mov	eax,DWORD [20+esp]
+	adc	ecx,edx
+	mov	edx,DWORD [edi]
+	adc	ebp,0
+	mov	DWORD [eax],ebx
+	mov	eax,DWORD [4+esi]
+	; saved r[0]
+	; ################## Calculate word 1
+	xor	ebx,ebx
+	; mul a[1]*b[0]
+	mul	edx
+	add	ecx,eax
+	mov	eax,DWORD [esi]
+	adc	ebp,edx
+	mov	edx,DWORD [4+edi]
+	adc	ebx,0
+	; mul a[0]*b[1]
+	mul	edx
+	add	ecx,eax
+	mov	eax,DWORD [20+esp]
+	adc	ebp,edx
+	mov	edx,DWORD [edi]
+	adc	ebx,0
+	mov	DWORD [4+eax],ecx
+	mov	eax,DWORD [8+esi]
+	; saved r[1]
+	; ################## Calculate word 2
+	xor	ecx,ecx
+	; mul a[2]*b[0]
+	mul	edx
+	add	ebp,eax
+	mov	eax,DWORD [4+esi]
+	adc	ebx,edx
+	mov	edx,DWORD [4+edi]
+	adc	ecx,0
+	; mul a[1]*b[1]
+	mul	edx
+	add	ebp,eax
+	mov	eax,DWORD [esi]
+	adc	ebx,edx
+	mov	edx,DWORD [8+edi]
+	adc	ecx,0
+	; mul a[0]*b[2]
+	mul	edx
+	add	ebp,eax
+	mov	eax,DWORD [20+esp]
+	adc	ebx,edx
+	mov	edx,DWORD [edi]
+	adc	ecx,0
+	mov	DWORD [8+eax],ebp
+	mov	eax,DWORD [12+esi]
+	; saved r[2]
+	; ################## Calculate word 3
+	xor	ebp,ebp
+	; mul a[3]*b[0]
+	mul	edx
+	add	ebx,eax
+	mov	eax,DWORD [8+esi]
+	adc	ecx,edx
+	mov	edx,DWORD [4+edi]
+	adc	ebp,0
+	; mul a[2]*b[1]
+	mul	edx
+	add	ebx,eax
+	mov	eax,DWORD [4+esi]
+	adc	ecx,edx
+	mov	edx,DWORD [8+edi]
+	adc	ebp,0
+	; mul a[1]*b[2]
+	mul	edx
+	add	ebx,eax
+	mov	eax,DWORD [esi]
+	adc	ecx,edx
+	mov	edx,DWORD [12+edi]
+	adc	ebp,0
+	; mul a[0]*b[3]
+	mul	edx
+	add	ebx,eax
+	mov	eax,DWORD [20+esp]
+	adc	ecx,edx
+	mov	edx,DWORD [4+edi]
+	adc	ebp,0
+	mov	DWORD [12+eax],ebx
+	mov	eax,DWORD [12+esi]
+	; saved r[3]
+	; ################## Calculate word 4
+	xor	ebx,ebx
+	; mul a[3]*b[1]
+	mul	edx
+	add	ecx,eax
+	mov	eax,DWORD [8+esi]
+	adc	ebp,edx
+	mov	edx,DWORD [8+edi]
+	adc	ebx,0
+	; mul a[2]*b[2]
+	mul	edx
+	add	ecx,eax
+	mov	eax,DWORD [4+esi]
+	adc	ebp,edx
+	mov	edx,DWORD [12+edi]
+	adc	ebx,0
+	; mul a[1]*b[3]
+	mul	edx
+	add	ecx,eax
+	mov	eax,DWORD [20+esp]
+	adc	ebp,edx
+	mov	edx,DWORD [8+edi]
+	adc	ebx,0
+	mov	DWORD [16+eax],ecx
+	mov	eax,DWORD [12+esi]
+	; saved r[4]
+	; ################## Calculate word 5
+	xor	ecx,ecx
+	; mul a[3]*b[2]
+	mul	edx
+	add	ebp,eax
+	mov	eax,DWORD [8+esi]
+	adc	ebx,edx
+	mov	edx,DWORD [12+edi]
+	adc	ecx,0
+	; mul a[2]*b[3]
+	mul	edx
+	add	ebp,eax
+	mov	eax,DWORD [20+esp]
+	adc	ebx,edx
+	mov	edx,DWORD [12+edi]
+	adc	ecx,0
+	mov	DWORD [20+eax],ebp
+	mov	eax,DWORD [12+esi]
+	; saved r[5]
+	; ################## Calculate word 6
+	xor	ebp,ebp
+	; mul a[3]*b[3]
+	mul	edx
+	add	ebx,eax
+	mov	eax,DWORD [20+esp]
+	adc	ecx,edx
+	adc	ebp,0
+	mov	DWORD [24+eax],ebx
+	; saved r[6]
+	; save r[7]
+	mov	DWORD [28+eax],ecx
+	pop	ebx
+	pop	ebp
+	pop	edi
+	pop	esi
+	ret
+global	_bn_sqr_comba8
+align	16
+_bn_sqr_comba8:
+L$_bn_sqr_comba8_begin:
+	push	esi
+	push	edi
+	push	ebp
+	push	ebx
+	mov	edi,DWORD [20+esp]
+	mov	esi,DWORD [24+esp]
+	xor	ebx,ebx
+	xor	ecx,ecx
+	mov	eax,DWORD [esi]
+	; ############### Calculate word 0
+	xor	ebp,ebp
+	; sqr a[0]*a[0]
+	mul	eax
+	add	ebx,eax
+	adc	ecx,edx
+	mov	edx,DWORD [esi]
+	adc	ebp,0
+	mov	DWORD [edi],ebx
+	mov	eax,DWORD [4+esi]
+	; saved r[0]
+	; ############### Calculate word 1
+	xor	ebx,ebx
+	; sqr a[1]*a[0]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ebx,0
+	add	ecx,eax
+	adc	ebp,edx
+	mov	eax,DWORD [8+esi]
+	adc	ebx,0
+	mov	DWORD [4+edi],ecx
+	mov	edx,DWORD [esi]
+	; saved r[1]
+	; ############### Calculate word 2
+	xor	ecx,ecx
+	; sqr a[2]*a[0]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ecx,0
+	add	ebp,eax
+	adc	ebx,edx
+	mov	eax,DWORD [4+esi]
+	adc	ecx,0
+	; sqr a[1]*a[1]
+	mul	eax
+	add	ebp,eax
+	adc	ebx,edx
+	mov	edx,DWORD [esi]
+	adc	ecx,0
+	mov	DWORD [8+edi],ebp
+	mov	eax,DWORD [12+esi]
+	; saved r[2]
+	; ############### Calculate word 3
+	xor	ebp,ebp
+	; sqr a[3]*a[0]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ebp,0
+	add	ebx,eax
+	adc	ecx,edx
+	mov	eax,DWORD [8+esi]
+	adc	ebp,0
+	mov	edx,DWORD [4+esi]
+	; sqr a[2]*a[1]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ebp,0
+	add	ebx,eax
+	adc	ecx,edx
+	mov	eax,DWORD [16+esi]
+	adc	ebp,0
+	mov	DWORD [12+edi],ebx
+	mov	edx,DWORD [esi]
+	; saved r[3]
+	; ############### Calculate word 4
+	xor	ebx,ebx
+	; sqr a[4]*a[0]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ebx,0
+	add	ecx,eax
+	adc	ebp,edx
+	mov	eax,DWORD [12+esi]
+	adc	ebx,0
+	mov	edx,DWORD [4+esi]
+	; sqr a[3]*a[1]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ebx,0
+	add	ecx,eax
+	adc	ebp,edx
+	mov	eax,DWORD [8+esi]
+	adc	ebx,0
+	; sqr a[2]*a[2]
+	mul	eax
+	add	ecx,eax
+	adc	ebp,edx
+	mov	edx,DWORD [esi]
+	adc	ebx,0
+	mov	DWORD [16+edi],ecx
+	mov	eax,DWORD [20+esi]
+	; saved r[4]
+	; ############### Calculate word 5
+	xor	ecx,ecx
+	; sqr a[5]*a[0]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ecx,0
+	add	ebp,eax
+	adc	ebx,edx
+	mov	eax,DWORD [16+esi]
+	adc	ecx,0
+	mov	edx,DWORD [4+esi]
+	; sqr a[4]*a[1]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ecx,0
+	add	ebp,eax
+	adc	ebx,edx
+	mov	eax,DWORD [12+esi]
+	adc	ecx,0
+	mov	edx,DWORD [8+esi]
+	; sqr a[3]*a[2]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ecx,0
+	add	ebp,eax
+	adc	ebx,edx
+	mov	eax,DWORD [24+esi]
+	adc	ecx,0
+	mov	DWORD [20+edi],ebp
+	mov	edx,DWORD [esi]
+	; saved r[5]
+	; ############### Calculate word 6
+	xor	ebp,ebp
+	; sqr a[6]*a[0]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ebp,0
+	add	ebx,eax
+	adc	ecx,edx
+	mov	eax,DWORD [20+esi]
+	adc	ebp,0
+	mov	edx,DWORD [4+esi]
+	; sqr a[5]*a[1]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ebp,0
+	add	ebx,eax
+	adc	ecx,edx
+	mov	eax,DWORD [16+esi]
+	adc	ebp,0
+	mov	edx,DWORD [8+esi]
+	; sqr a[4]*a[2]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ebp,0
+	add	ebx,eax
+	adc	ecx,edx
+	mov	eax,DWORD [12+esi]
+	adc	ebp,0
+	; sqr a[3]*a[3]
+	mul	eax
+	add	ebx,eax
+	adc	ecx,edx
+	mov	edx,DWORD [esi]
+	adc	ebp,0
+	mov	DWORD [24+edi],ebx
+	mov	eax,DWORD [28+esi]
+	; saved r[6]
+	; ############### Calculate word 7
+	xor	ebx,ebx
+	; sqr a[7]*a[0]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ebx,0
+	add	ecx,eax
+	adc	ebp,edx
+	mov	eax,DWORD [24+esi]
+	adc	ebx,0
+	mov	edx,DWORD [4+esi]
+	; sqr a[6]*a[1]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ebx,0
+	add	ecx,eax
+	adc	ebp,edx
+	mov	eax,DWORD [20+esi]
+	adc	ebx,0
+	mov	edx,DWORD [8+esi]
+	; sqr a[5]*a[2]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ebx,0
+	add	ecx,eax
+	adc	ebp,edx
+	mov	eax,DWORD [16+esi]
+	adc	ebx,0
+	mov	edx,DWORD [12+esi]
+	; sqr a[4]*a[3]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ebx,0
+	add	ecx,eax
+	adc	ebp,edx
+	mov	eax,DWORD [28+esi]
+	adc	ebx,0
+	mov	DWORD [28+edi],ecx
+	mov	edx,DWORD [4+esi]
+	; saved r[7]
+	; ############### Calculate word 8
+	xor	ecx,ecx
+	; sqr a[7]*a[1]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ecx,0
+	add	ebp,eax
+	adc	ebx,edx
+	mov	eax,DWORD [24+esi]
+	adc	ecx,0
+	mov	edx,DWORD [8+esi]
+	; sqr a[6]*a[2]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ecx,0
+	add	ebp,eax
+	adc	ebx,edx
+	mov	eax,DWORD [20+esi]
+	adc	ecx,0
+	mov	edx,DWORD [12+esi]
+	; sqr a[5]*a[3]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ecx,0
+	add	ebp,eax
+	adc	ebx,edx
+	mov	eax,DWORD [16+esi]
+	adc	ecx,0
+	; sqr a[4]*a[4]
+	mul	eax
+	add	ebp,eax
+	adc	ebx,edx
+	mov	edx,DWORD [8+esi]
+	adc	ecx,0
+	mov	DWORD [32+edi],ebp
+	mov	eax,DWORD [28+esi]
+	; saved r[8]
+	; ############### Calculate word 9
+	xor	ebp,ebp
+	; sqr a[7]*a[2]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ebp,0
+	add	ebx,eax
+	adc	ecx,edx
+	mov	eax,DWORD [24+esi]
+	adc	ebp,0
+	mov	edx,DWORD [12+esi]
+	; sqr a[6]*a[3]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ebp,0
+	add	ebx,eax
+	adc	ecx,edx
+	mov	eax,DWORD [20+esi]
+	adc	ebp,0
+	mov	edx,DWORD [16+esi]
+	; sqr a[5]*a[4]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ebp,0
+	add	ebx,eax
+	adc	ecx,edx
+	mov	eax,DWORD [28+esi]
+	adc	ebp,0
+	mov	DWORD [36+edi],ebx
+	mov	edx,DWORD [12+esi]
+	; saved r[9]
+	; ############### Calculate word 10
+	xor	ebx,ebx
+	; sqr a[7]*a[3]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ebx,0
+	add	ecx,eax
+	adc	ebp,edx
+	mov	eax,DWORD [24+esi]
+	adc	ebx,0
+	mov	edx,DWORD [16+esi]
+	; sqr a[6]*a[4]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ebx,0
+	add	ecx,eax
+	adc	ebp,edx
+	mov	eax,DWORD [20+esi]
+	adc	ebx,0
+	; sqr a[5]*a[5]
+	mul	eax
+	add	ecx,eax
+	adc	ebp,edx
+	mov	edx,DWORD [16+esi]
+	adc	ebx,0
+	mov	DWORD [40+edi],ecx
+	mov	eax,DWORD [28+esi]
+	; saved r[10]
+	; ############### Calculate word 11
+	xor	ecx,ecx
+	; sqr a[7]*a[4]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ecx,0
+	add	ebp,eax
+	adc	ebx,edx
+	mov	eax,DWORD [24+esi]
+	adc	ecx,0
+	mov	edx,DWORD [20+esi]
+	; sqr a[6]*a[5]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ecx,0
+	add	ebp,eax
+	adc	ebx,edx
+	mov	eax,DWORD [28+esi]
+	adc	ecx,0
+	mov	DWORD [44+edi],ebp
+	mov	edx,DWORD [20+esi]
+	; saved r[11]
+	; ############### Calculate word 12
+	xor	ebp,ebp
+	; sqr a[7]*a[5]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ebp,0
+	add	ebx,eax
+	adc	ecx,edx
+	mov	eax,DWORD [24+esi]
+	adc	ebp,0
+	; sqr a[6]*a[6]
+	mul	eax
+	add	ebx,eax
+	adc	ecx,edx
+	mov	edx,DWORD [24+esi]
+	adc	ebp,0
+	mov	DWORD [48+edi],ebx
+	mov	eax,DWORD [28+esi]
+	; saved r[12]
+	; ############### Calculate word 13
+	xor	ebx,ebx
+	; sqr a[7]*a[6]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ebx,0
+	add	ecx,eax
+	adc	ebp,edx
+	mov	eax,DWORD [28+esi]
+	adc	ebx,0
+	mov	DWORD [52+edi],ecx
+	; saved r[13]
+	; ############### Calculate word 14
+	xor	ecx,ecx
+	; sqr a[7]*a[7]
+	mul	eax
+	add	ebp,eax
+	adc	ebx,edx
+	adc	ecx,0
+	mov	DWORD [56+edi],ebp
+	; saved r[14]
+	mov	DWORD [60+edi],ebx
+	pop	ebx
+	pop	ebp
+	pop	edi
+	pop	esi
+	ret
+global	_bn_sqr_comba4
+align	16
+_bn_sqr_comba4:
+L$_bn_sqr_comba4_begin:
+	push	esi
+	push	edi
+	push	ebp
+	push	ebx
+	mov	edi,DWORD [20+esp]
+	mov	esi,DWORD [24+esp]
+	xor	ebx,ebx
+	xor	ecx,ecx
+	mov	eax,DWORD [esi]
+	; ############### Calculate word 0
+	xor	ebp,ebp
+	; sqr a[0]*a[0]
+	mul	eax
+	add	ebx,eax
+	adc	ecx,edx
+	mov	edx,DWORD [esi]
+	adc	ebp,0
+	mov	DWORD [edi],ebx
+	mov	eax,DWORD [4+esi]
+	; saved r[0]
+	; ############### Calculate word 1
+	xor	ebx,ebx
+	; sqr a[1]*a[0]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ebx,0
+	add	ecx,eax
+	adc	ebp,edx
+	mov	eax,DWORD [8+esi]
+	adc	ebx,0
+	mov	DWORD [4+edi],ecx
+	mov	edx,DWORD [esi]
+	; saved r[1]
+	; ############### Calculate word 2
+	xor	ecx,ecx
+	; sqr a[2]*a[0]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ecx,0
+	add	ebp,eax
+	adc	ebx,edx
+	mov	eax,DWORD [4+esi]
+	adc	ecx,0
+	; sqr a[1]*a[1]
+	mul	eax
+	add	ebp,eax
+	adc	ebx,edx
+	mov	edx,DWORD [esi]
+	adc	ecx,0
+	mov	DWORD [8+edi],ebp
+	mov	eax,DWORD [12+esi]
+	; saved r[2]
+	; ############### Calculate word 3
+	xor	ebp,ebp
+	; sqr a[3]*a[0]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ebp,0
+	add	ebx,eax
+	adc	ecx,edx
+	mov	eax,DWORD [8+esi]
+	adc	ebp,0
+	mov	edx,DWORD [4+esi]
+	; sqr a[2]*a[1]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ebp,0
+	add	ebx,eax
+	adc	ecx,edx
+	mov	eax,DWORD [12+esi]
+	adc	ebp,0
+	mov	DWORD [12+edi],ebx
+	mov	edx,DWORD [4+esi]
+	; saved r[3]
+	; ############### Calculate word 4
+	xor	ebx,ebx
+	; sqr a[3]*a[1]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ebx,0
+	add	ecx,eax
+	adc	ebp,edx
+	mov	eax,DWORD [8+esi]
+	adc	ebx,0
+	; sqr a[2]*a[2]
+	mul	eax
+	add	ecx,eax
+	adc	ebp,edx
+	mov	edx,DWORD [8+esi]
+	adc	ebx,0
+	mov	DWORD [16+edi],ecx
+	mov	eax,DWORD [12+esi]
+	; saved r[4]
+	; ############### Calculate word 5
+	xor	ecx,ecx
+	; sqr a[3]*a[2]
+	mul	edx
+	add	eax,eax
+	adc	edx,edx
+	adc	ecx,0
+	add	ebp,eax
+	adc	ebx,edx
+	mov	eax,DWORD [12+esi]
+	adc	ecx,0
+	mov	DWORD [20+edi],ebp
+	; saved r[5]
+	; ############### Calculate word 6
+	xor	ebp,ebp
+	; sqr a[3]*a[3]
+	mul	eax
+	add	ebx,eax
+	adc	ecx,edx
+	adc	ebp,0
+	mov	DWORD [24+edi],ebx
+	; saved r[6]
+	mov	DWORD [28+edi],ecx
+	pop	ebx
+	pop	ebp
+	pop	edi
+	pop	esi
+	ret
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/ghash-armv4-linux.S b/gen/bcm/ghash-armv4-linux.S
new file mode 100644
index 0000000..7c04f89
--- /dev/null
+++ b/gen/bcm/ghash-armv4-linux.S
@@ -0,0 +1,244 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL
+@ instructions are in aesv8-armx.pl.)
+.arch	armv7-a
+
+.text
+#if defined(__thumb2__) || defined(__clang__)
+.syntax	unified
+#define ldrplb  ldrbpl
+#define ldrneb  ldrbne
+#endif
+#if defined(__thumb2__)
+.thumb
+#else
+.code	32
+#endif
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.globl	gcm_init_neon
+.hidden	gcm_init_neon
+.type	gcm_init_neon,%function
+.align	4
+gcm_init_neon:
+	vld1.64	d7,[r1]!		@ load H
+	vmov.i8	q8,#0xe1
+	vld1.64	d6,[r1]
+	vshl.i64	d17,#57
+	vshr.u64	d16,#63		@ t0=0xc2....01
+	vdup.8	q9,d7[7]
+	vshr.u64	d26,d6,#63
+	vshr.s8	q9,#7			@ broadcast carry bit
+	vshl.i64	q3,q3,#1
+	vand	q8,q8,q9
+	vorr	d7,d26		@ H<<<=1
+	veor	q3,q3,q8		@ twisted H
+	vstmia	r0,{q3}
+
+	bx	lr					@ bx lr
+.size	gcm_init_neon,.-gcm_init_neon
+
+.globl	gcm_gmult_neon
+.hidden	gcm_gmult_neon
+.type	gcm_gmult_neon,%function
+.align	4
+gcm_gmult_neon:
+	vld1.64	d7,[r0]!		@ load Xi
+	vld1.64	d6,[r0]!
+	vmov.i64	d29,#0x0000ffffffffffff
+	vldmia	r1,{d26,d27}	@ load twisted H
+	vmov.i64	d30,#0x00000000ffffffff
+#ifdef __ARMEL__
+	vrev64.8	q3,q3
+#endif
+	vmov.i64	d31,#0x000000000000ffff
+	veor	d28,d26,d27		@ Karatsuba pre-processing
+	mov	r3,#16
+	b	.Lgmult_neon
+.size	gcm_gmult_neon,.-gcm_gmult_neon
+
+.globl	gcm_ghash_neon
+.hidden	gcm_ghash_neon
+.type	gcm_ghash_neon,%function
+.align	4
+gcm_ghash_neon:
+	vld1.64	d1,[r0]!		@ load Xi
+	vld1.64	d0,[r0]!
+	vmov.i64	d29,#0x0000ffffffffffff
+	vldmia	r1,{d26,d27}	@ load twisted H
+	vmov.i64	d30,#0x00000000ffffffff
+#ifdef __ARMEL__
+	vrev64.8	q0,q0
+#endif
+	vmov.i64	d31,#0x000000000000ffff
+	veor	d28,d26,d27		@ Karatsuba pre-processing
+
+.Loop_neon:
+	vld1.64	d7,[r2]!		@ load inp
+	vld1.64	d6,[r2]!
+#ifdef __ARMEL__
+	vrev64.8	q3,q3
+#endif
+	veor	q3,q0			@ inp^=Xi
+.Lgmult_neon:
+	vext.8	d16, d26, d26, #1	@ A1
+	vmull.p8	q8, d16, d6		@ F = A1*B
+	vext.8	d0, d6, d6, #1	@ B1
+	vmull.p8	q0, d26, d0		@ E = A*B1
+	vext.8	d18, d26, d26, #2	@ A2
+	vmull.p8	q9, d18, d6		@ H = A2*B
+	vext.8	d22, d6, d6, #2	@ B2
+	vmull.p8	q11, d26, d22		@ G = A*B2
+	vext.8	d20, d26, d26, #3	@ A3
+	veor	q8, q8, q0		@ L = E + F
+	vmull.p8	q10, d20, d6		@ J = A3*B
+	vext.8	d0, d6, d6, #3	@ B3
+	veor	q9, q9, q11		@ M = G + H
+	vmull.p8	q0, d26, d0		@ I = A*B3
+	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
+	vand	d17, d17, d29
+	vext.8	d22, d6, d6, #4	@ B4
+	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
+	vand	d19, d19, d30
+	vmull.p8	q11, d26, d22		@ K = A*B4
+	veor	q10, q10, q0		@ N = I + J
+	veor	d16, d16, d17
+	veor	d18, d18, d19
+	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
+	vand	d21, d21, d31
+	vext.8	q8, q8, q8, #15
+	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
+	vmov.i64	d23, #0
+	vext.8	q9, q9, q9, #14
+	veor	d20, d20, d21
+	vmull.p8	q0, d26, d6		@ D = A*B
+	vext.8	q11, q11, q11, #12
+	vext.8	q10, q10, q10, #13
+	veor	q8, q8, q9
+	veor	q10, q10, q11
+	veor	q0, q0, q8
+	veor	q0, q0, q10
+	veor	d6,d6,d7	@ Karatsuba pre-processing
+	vext.8	d16, d28, d28, #1	@ A1
+	vmull.p8	q8, d16, d6		@ F = A1*B
+	vext.8	d2, d6, d6, #1	@ B1
+	vmull.p8	q1, d28, d2		@ E = A*B1
+	vext.8	d18, d28, d28, #2	@ A2
+	vmull.p8	q9, d18, d6		@ H = A2*B
+	vext.8	d22, d6, d6, #2	@ B2
+	vmull.p8	q11, d28, d22		@ G = A*B2
+	vext.8	d20, d28, d28, #3	@ A3
+	veor	q8, q8, q1		@ L = E + F
+	vmull.p8	q10, d20, d6		@ J = A3*B
+	vext.8	d2, d6, d6, #3	@ B3
+	veor	q9, q9, q11		@ M = G + H
+	vmull.p8	q1, d28, d2		@ I = A*B3
+	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
+	vand	d17, d17, d29
+	vext.8	d22, d6, d6, #4	@ B4
+	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
+	vand	d19, d19, d30
+	vmull.p8	q11, d28, d22		@ K = A*B4
+	veor	q10, q10, q1		@ N = I + J
+	veor	d16, d16, d17
+	veor	d18, d18, d19
+	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
+	vand	d21, d21, d31
+	vext.8	q8, q8, q8, #15
+	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
+	vmov.i64	d23, #0
+	vext.8	q9, q9, q9, #14
+	veor	d20, d20, d21
+	vmull.p8	q1, d28, d6		@ D = A*B
+	vext.8	q11, q11, q11, #12
+	vext.8	q10, q10, q10, #13
+	veor	q8, q8, q9
+	veor	q10, q10, q11
+	veor	q1, q1, q8
+	veor	q1, q1, q10
+	vext.8	d16, d27, d27, #1	@ A1
+	vmull.p8	q8, d16, d7		@ F = A1*B
+	vext.8	d4, d7, d7, #1	@ B1
+	vmull.p8	q2, d27, d4		@ E = A*B1
+	vext.8	d18, d27, d27, #2	@ A2
+	vmull.p8	q9, d18, d7		@ H = A2*B
+	vext.8	d22, d7, d7, #2	@ B2
+	vmull.p8	q11, d27, d22		@ G = A*B2
+	vext.8	d20, d27, d27, #3	@ A3
+	veor	q8, q8, q2		@ L = E + F
+	vmull.p8	q10, d20, d7		@ J = A3*B
+	vext.8	d4, d7, d7, #3	@ B3
+	veor	q9, q9, q11		@ M = G + H
+	vmull.p8	q2, d27, d4		@ I = A*B3
+	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
+	vand	d17, d17, d29
+	vext.8	d22, d7, d7, #4	@ B4
+	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
+	vand	d19, d19, d30
+	vmull.p8	q11, d27, d22		@ K = A*B4
+	veor	q10, q10, q2		@ N = I + J
+	veor	d16, d16, d17
+	veor	d18, d18, d19
+	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
+	vand	d21, d21, d31
+	vext.8	q8, q8, q8, #15
+	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
+	vmov.i64	d23, #0
+	vext.8	q9, q9, q9, #14
+	veor	d20, d20, d21
+	vmull.p8	q2, d27, d7		@ D = A*B
+	vext.8	q11, q11, q11, #12
+	vext.8	q10, q10, q10, #13
+	veor	q8, q8, q9
+	veor	q10, q10, q11
+	veor	q2, q2, q8
+	veor	q2, q2, q10
+	veor	q1,q1,q0		@ Karatsuba post-processing
+	veor	q1,q1,q2
+	veor	d1,d1,d2
+	veor	d4,d4,d3	@ Xh|Xl - 256-bit result
+
+	@ equivalent of reduction_avx from ghash-x86_64.pl
+	vshl.i64	q9,q0,#57		@ 1st phase
+	vshl.i64	q10,q0,#62
+	veor	q10,q10,q9		@
+	vshl.i64	q9,q0,#63
+	veor	q10, q10, q9		@
+	veor	d1,d1,d20	@
+	veor	d4,d4,d21
+
+	vshr.u64	q10,q0,#1		@ 2nd phase
+	veor	q2,q2,q0
+	veor	q0,q0,q10		@
+	vshr.u64	q10,q10,#6
+	vshr.u64	q0,q0,#1		@
+	veor	q0,q0,q2		@
+	veor	q0,q0,q10		@
+
+	subs	r3,#16
+	bne	.Loop_neon
+
+#ifdef __ARMEL__
+	vrev64.8	q0,q0
+#endif
+	sub	r0,#16
+	vst1.64	d1,[r0]!		@ write out Xi
+	vst1.64	d0,[r0]
+
+	bx	lr					@ bx lr
+.size	gcm_ghash_neon,.-gcm_ghash_neon
+#endif
+.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__)
diff --git a/gen/bcm/ghash-neon-armv8-apple.S b/gen/bcm/ghash-neon-armv8-apple.S
new file mode 100644
index 0000000..a76b8d1
--- /dev/null
+++ b/gen/bcm/ghash-neon-armv8-apple.S
@@ -0,0 +1,335 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+#include <openssl/arm_arch.h>
+
+.text
+
+.globl	_gcm_init_neon
+.private_extern	_gcm_init_neon
+
+.align	4
+_gcm_init_neon:
+	AARCH64_VALID_CALL_TARGET
+	// This function is adapted from gcm_init_v8. xC2 is t3.
+	ld1	{v17.2d}, [x1]			// load H
+	movi	v19.16b, #0xe1
+	shl	v19.2d, v19.2d, #57		// 0xc2.0
+	ext	v3.16b, v17.16b, v17.16b, #8
+	ushr	v18.2d, v19.2d, #63
+	dup	v17.4s, v17.s[1]
+	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
+	ushr	v18.2d, v3.2d, #63
+	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
+	and	v18.16b, v18.16b, v16.16b
+	shl	v3.2d, v3.2d, #1
+	ext	v18.16b, v18.16b, v18.16b, #8
+	and	v16.16b, v16.16b, v17.16b
+	orr	v3.16b, v3.16b, v18.16b	// H<<<=1
+	eor	v5.16b, v3.16b, v16.16b	// twisted H
+	st1	{v5.2d}, [x0]			// store Htable[0]
+	ret
+
+
+.globl	_gcm_gmult_neon
+.private_extern	_gcm_gmult_neon
+
+.align	4
+_gcm_gmult_neon:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{v3.16b}, [x0]		// load Xi
+	ld1	{v5.1d}, [x1], #8		// load twisted H
+	ld1	{v6.1d}, [x1]
+	adrp	x9, Lmasks@PAGE		// load constants
+	add	x9, x9, Lmasks@PAGEOFF
+	ld1	{v24.2d, v25.2d}, [x9]
+	rev64	v3.16b, v3.16b		// byteswap Xi
+	ext	v3.16b, v3.16b, v3.16b, #8
+	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing
+
+	mov	x3, #16
+	b	Lgmult_neon
+
+
+.globl	_gcm_ghash_neon
+.private_extern	_gcm_ghash_neon
+
+.align	4
+_gcm_ghash_neon:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{v0.16b}, [x0]		// load Xi
+	ld1	{v5.1d}, [x1], #8		// load twisted H
+	ld1	{v6.1d}, [x1]
+	adrp	x9, Lmasks@PAGE		// load constants
+	add	x9, x9, Lmasks@PAGEOFF
+	ld1	{v24.2d, v25.2d}, [x9]
+	rev64	v0.16b, v0.16b		// byteswap Xi
+	ext	v0.16b, v0.16b, v0.16b, #8
+	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing
+
+Loop_neon:
+	ld1	{v3.16b}, [x2], #16	// load inp
+	rev64	v3.16b, v3.16b		// byteswap inp
+	ext	v3.16b, v3.16b, v3.16b, #8
+	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi
+
+Lgmult_neon:
+	// Split the input into v3 and v4. (The upper halves are unused,
+	// so it is okay to leave them alone.)
+	ins	v4.d[0], v3.d[1]
+	ext	v16.8b, v5.8b, v5.8b, #1	// A1
+	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
+	ext	v0.8b, v3.8b, v3.8b, #1		// B1
+	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
+	ext	v17.8b, v5.8b, v5.8b, #2	// A2
+	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
+	ext	v19.8b, v3.8b, v3.8b, #2	// B2
+	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
+	ext	v18.8b, v5.8b, v5.8b, #3	// A3
+	eor	v16.16b, v16.16b, v0.16b	// L = E + F
+	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
+	ext	v0.8b, v3.8b, v3.8b, #3		// B3
+	eor	v17.16b, v17.16b, v19.16b	// M = G + H
+	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3
+
+	// Here we diverge from the 32-bit version. It computes the following
+	// (instructions reordered for clarity):
+	//
+	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
+	//     vand	$t0#hi, $t0#hi, $k48
+	//     veor	$t0#lo, $t0#lo, $t0#hi
+	//
+	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
+	//     vand	$t1#hi, $t1#hi, $k32
+	//     veor	$t1#lo, $t1#lo, $t1#hi
+	//
+	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
+	//     vand	$t2#hi, $t2#hi, $k16
+	//     veor	$t2#lo, $t2#lo, $t2#hi
+	//
+	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
+	//     vmov.i64	$t3#hi, #0
+	//
+	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+	// upper halves of SIMD registers, so we must split each half into
+	// separate registers. To compensate, we pair computations up and
+	// parallelize.
+
+	ext	v19.8b, v3.8b, v3.8b, #4	// B4
+	eor	v18.16b, v18.16b, v0.16b	// N = I + J
+	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4
+
+	// This can probably be scheduled more efficiently. For now, we just
+	// pair up independent instructions.
+	zip1	v20.2d, v16.2d, v17.2d
+	zip1	v22.2d, v18.2d, v19.2d
+	zip2	v21.2d, v16.2d, v17.2d
+	zip2	v23.2d, v18.2d, v19.2d
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	and	v21.16b, v21.16b, v24.16b
+	and	v23.16b, v23.16b, v25.16b
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	zip1	v16.2d, v20.2d, v21.2d
+	zip1	v18.2d, v22.2d, v23.2d
+	zip2	v17.2d, v20.2d, v21.2d
+	zip2	v19.2d, v22.2d, v23.2d
+
+	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
+	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
+	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
+	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
+	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
+	eor	v16.16b, v16.16b, v17.16b
+	eor	v18.16b, v18.16b, v19.16b
+	eor	v0.16b, v0.16b, v16.16b
+	eor	v0.16b, v0.16b, v18.16b
+	eor	v3.8b, v3.8b, v4.8b	// Karatsuba pre-processing
+	ext	v16.8b, v7.8b, v7.8b, #1	// A1
+	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
+	ext	v1.8b, v3.8b, v3.8b, #1		// B1
+	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
+	ext	v17.8b, v7.8b, v7.8b, #2	// A2
+	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
+	ext	v19.8b, v3.8b, v3.8b, #2	// B2
+	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
+	ext	v18.8b, v7.8b, v7.8b, #3	// A3
+	eor	v16.16b, v16.16b, v1.16b	// L = E + F
+	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
+	ext	v1.8b, v3.8b, v3.8b, #3		// B3
+	eor	v17.16b, v17.16b, v19.16b	// M = G + H
+	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3
+
+	// Here we diverge from the 32-bit version. It computes the following
+	// (instructions reordered for clarity):
+	//
+	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
+	//     vand	$t0#hi, $t0#hi, $k48
+	//     veor	$t0#lo, $t0#lo, $t0#hi
+	//
+	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
+	//     vand	$t1#hi, $t1#hi, $k32
+	//     veor	$t1#lo, $t1#lo, $t1#hi
+	//
+	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
+	//     vand	$t2#hi, $t2#hi, $k16
+	//     veor	$t2#lo, $t2#lo, $t2#hi
+	//
+	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
+	//     vmov.i64	$t3#hi, #0
+	//
+	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+	// upper halves of SIMD registers, so we must split each half into
+	// separate registers. To compensate, we pair computations up and
+	// parallelize.
+
+	ext	v19.8b, v3.8b, v3.8b, #4	// B4
+	eor	v18.16b, v18.16b, v1.16b	// N = I + J
+	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4
+
+	// This can probably be scheduled more efficiently. For now, we just
+	// pair up independent instructions.
+	zip1	v20.2d, v16.2d, v17.2d
+	zip1	v22.2d, v18.2d, v19.2d
+	zip2	v21.2d, v16.2d, v17.2d
+	zip2	v23.2d, v18.2d, v19.2d
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	and	v21.16b, v21.16b, v24.16b
+	and	v23.16b, v23.16b, v25.16b
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	zip1	v16.2d, v20.2d, v21.2d
+	zip1	v18.2d, v22.2d, v23.2d
+	zip2	v17.2d, v20.2d, v21.2d
+	zip2	v19.2d, v22.2d, v23.2d
+
+	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
+	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
+	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
+	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
+	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
+	eor	v16.16b, v16.16b, v17.16b
+	eor	v18.16b, v18.16b, v19.16b
+	eor	v1.16b, v1.16b, v16.16b
+	eor	v1.16b, v1.16b, v18.16b
+	ext	v16.8b, v6.8b, v6.8b, #1	// A1
+	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
+	ext	v2.8b, v4.8b, v4.8b, #1		// B1
+	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
+	ext	v17.8b, v6.8b, v6.8b, #2	// A2
+	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
+	ext	v19.8b, v4.8b, v4.8b, #2	// B2
+	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
+	ext	v18.8b, v6.8b, v6.8b, #3	// A3
+	eor	v16.16b, v16.16b, v2.16b	// L = E + F
+	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
+	ext	v2.8b, v4.8b, v4.8b, #3		// B3
+	eor	v17.16b, v17.16b, v19.16b	// M = G + H
+	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3
+
+	// Here we diverge from the 32-bit version. It computes the following
+	// (instructions reordered for clarity):
+	//
+	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
+	//     vand	$t0#hi, $t0#hi, $k48
+	//     veor	$t0#lo, $t0#lo, $t0#hi
+	//
+	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
+	//     vand	$t1#hi, $t1#hi, $k32
+	//     veor	$t1#lo, $t1#lo, $t1#hi
+	//
+	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
+	//     vand	$t2#hi, $t2#hi, $k16
+	//     veor	$t2#lo, $t2#lo, $t2#hi
+	//
+	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
+	//     vmov.i64	$t3#hi, #0
+	//
+	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+	// upper halves of SIMD registers, so we must split each half into
+	// separate registers. To compensate, we pair computations up and
+	// parallelize.
+
+	ext	v19.8b, v4.8b, v4.8b, #4	// B4
+	eor	v18.16b, v18.16b, v2.16b	// N = I + J
+	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4
+
+	// This can probably be scheduled more efficiently. For now, we just
+	// pair up independent instructions.
+	zip1	v20.2d, v16.2d, v17.2d
+	zip1	v22.2d, v18.2d, v19.2d
+	zip2	v21.2d, v16.2d, v17.2d
+	zip2	v23.2d, v18.2d, v19.2d
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	and	v21.16b, v21.16b, v24.16b
+	and	v23.16b, v23.16b, v25.16b
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	zip1	v16.2d, v20.2d, v21.2d
+	zip1	v18.2d, v22.2d, v23.2d
+	zip2	v17.2d, v20.2d, v21.2d
+	zip2	v19.2d, v22.2d, v23.2d
+
+	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
+	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
+	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
+	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
+	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
+	eor	v16.16b, v16.16b, v17.16b
+	eor	v18.16b, v18.16b, v19.16b
+	eor	v2.16b, v2.16b, v16.16b
+	eor	v2.16b, v2.16b, v18.16b
+	ext	v16.16b, v0.16b, v2.16b, #8
+	eor	v1.16b, v1.16b, v0.16b	// Karatsuba post-processing
+	eor	v1.16b, v1.16b, v2.16b
+	eor	v1.16b, v1.16b, v16.16b	// Xm overlaps Xh.lo and Xl.hi
+	ins	v0.d[1], v1.d[0]		// Xh|Xl - 256-bit result
+	// This is a no-op due to the ins instruction below.
+	// ins	v2.d[0], v1.d[1]
+
+	// equivalent of reduction_avx from ghash-x86_64.pl
+	shl	v17.2d, v0.2d, #57		// 1st phase
+	shl	v18.2d, v0.2d, #62
+	eor	v18.16b, v18.16b, v17.16b	//
+	shl	v17.2d, v0.2d, #63
+	eor	v18.16b, v18.16b, v17.16b	//
+	// Note Xm contains {Xl.d[1], Xh.d[0]}.
+	eor	v18.16b, v18.16b, v1.16b
+	ins	v0.d[1], v18.d[0]		// Xl.d[1] ^= t2.d[0]
+	ins	v2.d[0], v18.d[1]		// Xh.d[0] ^= t2.d[1]
+
+	ushr	v18.2d, v0.2d, #1		// 2nd phase
+	eor	v2.16b, v2.16b,v0.16b
+	eor	v0.16b, v0.16b,v18.16b	//
+	ushr	v18.2d, v18.2d, #6
+	ushr	v0.2d, v0.2d, #1		//
+	eor	v0.16b, v0.16b, v2.16b	//
+	eor	v0.16b, v0.16b, v18.16b	//
+
+	subs	x3, x3, #16
+	bne	Loop_neon
+
+	rev64	v0.16b, v0.16b		// byteswap Xi and write
+	ext	v0.16b, v0.16b, v0.16b, #8
+	st1	{v0.16b}, [x0]
+
+	ret
+
+
+.section	__TEXT,__const
+.align	4
+Lmasks:
+.quad	0x0000ffffffffffff	// k48
+.quad	0x00000000ffffffff	// k32
+.quad	0x000000000000ffff	// k16
+.quad	0x0000000000000000	// k0
+.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/gen/bcm/ghash-neon-armv8-linux.S b/gen/bcm/ghash-neon-armv8-linux.S
new file mode 100644
index 0000000..6203bc6
--- /dev/null
+++ b/gen/bcm/ghash-neon-armv8-linux.S
@@ -0,0 +1,335 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+.text
+
+.globl	gcm_init_neon
+.hidden	gcm_init_neon
+.type	gcm_init_neon,%function
+.align	4
+gcm_init_neon:
+	AARCH64_VALID_CALL_TARGET
+	// This function is adapted from gcm_init_v8. xC2 is t3.
+	ld1	{v17.2d}, [x1]			// load H
+	movi	v19.16b, #0xe1
+	shl	v19.2d, v19.2d, #57		// 0xc2.0
+	ext	v3.16b, v17.16b, v17.16b, #8
+	ushr	v18.2d, v19.2d, #63
+	dup	v17.4s, v17.s[1]
+	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
+	ushr	v18.2d, v3.2d, #63
+	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
+	and	v18.16b, v18.16b, v16.16b
+	shl	v3.2d, v3.2d, #1
+	ext	v18.16b, v18.16b, v18.16b, #8
+	and	v16.16b, v16.16b, v17.16b
+	orr	v3.16b, v3.16b, v18.16b	// H<<<=1
+	eor	v5.16b, v3.16b, v16.16b	// twisted H
+	st1	{v5.2d}, [x0]			// store Htable[0]
+	ret
+.size	gcm_init_neon,.-gcm_init_neon
+
+.globl	gcm_gmult_neon
+.hidden	gcm_gmult_neon
+.type	gcm_gmult_neon,%function
+.align	4
+gcm_gmult_neon:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{v3.16b}, [x0]		// load Xi
+	ld1	{v5.1d}, [x1], #8		// load twisted H
+	ld1	{v6.1d}, [x1]
+	adrp	x9, .Lmasks		// load constants
+	add	x9, x9, :lo12:.Lmasks
+	ld1	{v24.2d, v25.2d}, [x9]
+	rev64	v3.16b, v3.16b		// byteswap Xi
+	ext	v3.16b, v3.16b, v3.16b, #8
+	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing
+
+	mov	x3, #16
+	b	.Lgmult_neon
+.size	gcm_gmult_neon,.-gcm_gmult_neon
+
+.globl	gcm_ghash_neon
+.hidden	gcm_ghash_neon
+.type	gcm_ghash_neon,%function
+.align	4
+gcm_ghash_neon:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{v0.16b}, [x0]		// load Xi
+	ld1	{v5.1d}, [x1], #8		// load twisted H
+	ld1	{v6.1d}, [x1]
+	adrp	x9, .Lmasks		// load constants
+	add	x9, x9, :lo12:.Lmasks
+	ld1	{v24.2d, v25.2d}, [x9]
+	rev64	v0.16b, v0.16b		// byteswap Xi
+	ext	v0.16b, v0.16b, v0.16b, #8
+	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing
+
+.Loop_neon:
+	ld1	{v3.16b}, [x2], #16	// load inp
+	rev64	v3.16b, v3.16b		// byteswap inp
+	ext	v3.16b, v3.16b, v3.16b, #8
+	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi
+
+.Lgmult_neon:
+	// Split the input into v3 and v4. (The upper halves are unused,
+	// so it is okay to leave them alone.)
+	ins	v4.d[0], v3.d[1]
+	ext	v16.8b, v5.8b, v5.8b, #1	// A1
+	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
+	ext	v0.8b, v3.8b, v3.8b, #1		// B1
+	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
+	ext	v17.8b, v5.8b, v5.8b, #2	// A2
+	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
+	ext	v19.8b, v3.8b, v3.8b, #2	// B2
+	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
+	ext	v18.8b, v5.8b, v5.8b, #3	// A3
+	eor	v16.16b, v16.16b, v0.16b	// L = E + F
+	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
+	ext	v0.8b, v3.8b, v3.8b, #3		// B3
+	eor	v17.16b, v17.16b, v19.16b	// M = G + H
+	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3
+
+	// Here we diverge from the 32-bit version. It computes the following
+	// (instructions reordered for clarity):
+	//
+	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
+	//     vand	$t0#hi, $t0#hi, $k48
+	//     veor	$t0#lo, $t0#lo, $t0#hi
+	//
+	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
+	//     vand	$t1#hi, $t1#hi, $k32
+	//     veor	$t1#lo, $t1#lo, $t1#hi
+	//
+	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
+	//     vand	$t2#hi, $t2#hi, $k16
+	//     veor	$t2#lo, $t2#lo, $t2#hi
+	//
+	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
+	//     vmov.i64	$t3#hi, #0
+	//
+	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+	// upper halves of SIMD registers, so we must split each half into
+	// separate registers. To compensate, we pair computations up and
+	// parallelize.
+
+	ext	v19.8b, v3.8b, v3.8b, #4	// B4
+	eor	v18.16b, v18.16b, v0.16b	// N = I + J
+	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4
+
+	// This can probably be scheduled more efficiently. For now, we just
+	// pair up independent instructions.
+	zip1	v20.2d, v16.2d, v17.2d
+	zip1	v22.2d, v18.2d, v19.2d
+	zip2	v21.2d, v16.2d, v17.2d
+	zip2	v23.2d, v18.2d, v19.2d
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	and	v21.16b, v21.16b, v24.16b
+	and	v23.16b, v23.16b, v25.16b
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	zip1	v16.2d, v20.2d, v21.2d
+	zip1	v18.2d, v22.2d, v23.2d
+	zip2	v17.2d, v20.2d, v21.2d
+	zip2	v19.2d, v22.2d, v23.2d
+
+	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
+	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
+	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
+	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
+	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
+	eor	v16.16b, v16.16b, v17.16b
+	eor	v18.16b, v18.16b, v19.16b
+	eor	v0.16b, v0.16b, v16.16b
+	eor	v0.16b, v0.16b, v18.16b
+	eor	v3.8b, v3.8b, v4.8b	// Karatsuba pre-processing
+	ext	v16.8b, v7.8b, v7.8b, #1	// A1
+	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
+	ext	v1.8b, v3.8b, v3.8b, #1		// B1
+	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
+	ext	v17.8b, v7.8b, v7.8b, #2	// A2
+	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
+	ext	v19.8b, v3.8b, v3.8b, #2	// B2
+	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
+	ext	v18.8b, v7.8b, v7.8b, #3	// A3
+	eor	v16.16b, v16.16b, v1.16b	// L = E + F
+	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
+	ext	v1.8b, v3.8b, v3.8b, #3		// B3
+	eor	v17.16b, v17.16b, v19.16b	// M = G + H
+	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3
+
+	// Here we diverge from the 32-bit version. It computes the following
+	// (instructions reordered for clarity):
+	//
+	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
+	//     vand	$t0#hi, $t0#hi, $k48
+	//     veor	$t0#lo, $t0#lo, $t0#hi
+	//
+	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
+	//     vand	$t1#hi, $t1#hi, $k32
+	//     veor	$t1#lo, $t1#lo, $t1#hi
+	//
+	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
+	//     vand	$t2#hi, $t2#hi, $k16
+	//     veor	$t2#lo, $t2#lo, $t2#hi
+	//
+	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
+	//     vmov.i64	$t3#hi, #0
+	//
+	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+	// upper halves of SIMD registers, so we must split each half into
+	// separate registers. To compensate, we pair computations up and
+	// parallelize.
+
+	ext	v19.8b, v3.8b, v3.8b, #4	// B4
+	eor	v18.16b, v18.16b, v1.16b	// N = I + J
+	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4
+
+	// This can probably be scheduled more efficiently. For now, we just
+	// pair up independent instructions.
+	zip1	v20.2d, v16.2d, v17.2d
+	zip1	v22.2d, v18.2d, v19.2d
+	zip2	v21.2d, v16.2d, v17.2d
+	zip2	v23.2d, v18.2d, v19.2d
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	and	v21.16b, v21.16b, v24.16b
+	and	v23.16b, v23.16b, v25.16b
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	zip1	v16.2d, v20.2d, v21.2d
+	zip1	v18.2d, v22.2d, v23.2d
+	zip2	v17.2d, v20.2d, v21.2d
+	zip2	v19.2d, v22.2d, v23.2d
+
+	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
+	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
+	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
+	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
+	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
+	eor	v16.16b, v16.16b, v17.16b
+	eor	v18.16b, v18.16b, v19.16b
+	eor	v1.16b, v1.16b, v16.16b
+	eor	v1.16b, v1.16b, v18.16b
+	ext	v16.8b, v6.8b, v6.8b, #1	// A1
+	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
+	ext	v2.8b, v4.8b, v4.8b, #1		// B1
+	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
+	ext	v17.8b, v6.8b, v6.8b, #2	// A2
+	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
+	ext	v19.8b, v4.8b, v4.8b, #2	// B2
+	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
+	ext	v18.8b, v6.8b, v6.8b, #3	// A3
+	eor	v16.16b, v16.16b, v2.16b	// L = E + F
+	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
+	ext	v2.8b, v4.8b, v4.8b, #3		// B3
+	eor	v17.16b, v17.16b, v19.16b	// M = G + H
+	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3
+
+	// Here we diverge from the 32-bit version. It computes the following
+	// (instructions reordered for clarity):
+	//
+	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
+	//     vand	$t0#hi, $t0#hi, $k48
+	//     veor	$t0#lo, $t0#lo, $t0#hi
+	//
+	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
+	//     vand	$t1#hi, $t1#hi, $k32
+	//     veor	$t1#lo, $t1#lo, $t1#hi
+	//
+	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
+	//     vand	$t2#hi, $t2#hi, $k16
+	//     veor	$t2#lo, $t2#lo, $t2#hi
+	//
+	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
+	//     vmov.i64	$t3#hi, #0
+	//
+	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+	// upper halves of SIMD registers, so we must split each half into
+	// separate registers. To compensate, we pair computations up and
+	// parallelize.
+
+	ext	v19.8b, v4.8b, v4.8b, #4	// B4
+	eor	v18.16b, v18.16b, v2.16b	// N = I + J
+	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4
+
+	// This can probably be scheduled more efficiently. For now, we just
+	// pair up independent instructions.
+	zip1	v20.2d, v16.2d, v17.2d
+	zip1	v22.2d, v18.2d, v19.2d
+	zip2	v21.2d, v16.2d, v17.2d
+	zip2	v23.2d, v18.2d, v19.2d
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	and	v21.16b, v21.16b, v24.16b
+	and	v23.16b, v23.16b, v25.16b
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	zip1	v16.2d, v20.2d, v21.2d
+	zip1	v18.2d, v22.2d, v23.2d
+	zip2	v17.2d, v20.2d, v21.2d
+	zip2	v19.2d, v22.2d, v23.2d
+
+	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
+	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
+	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
+	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
+	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
+	eor	v16.16b, v16.16b, v17.16b
+	eor	v18.16b, v18.16b, v19.16b
+	eor	v2.16b, v2.16b, v16.16b
+	eor	v2.16b, v2.16b, v18.16b
+	ext	v16.16b, v0.16b, v2.16b, #8
+	eor	v1.16b, v1.16b, v0.16b	// Karatsuba post-processing
+	eor	v1.16b, v1.16b, v2.16b
+	eor	v1.16b, v1.16b, v16.16b	// Xm overlaps Xh.lo and Xl.hi
+	ins	v0.d[1], v1.d[0]		// Xh|Xl - 256-bit result
+	// This is a no-op due to the ins instruction below.
+	// ins	v2.d[0], v1.d[1]
+
+	// equivalent of reduction_avx from ghash-x86_64.pl
+	shl	v17.2d, v0.2d, #57		// 1st phase
+	shl	v18.2d, v0.2d, #62
+	eor	v18.16b, v18.16b, v17.16b	//
+	shl	v17.2d, v0.2d, #63
+	eor	v18.16b, v18.16b, v17.16b	//
+	// Note Xm contains {Xl.d[1], Xh.d[0]}.
+	eor	v18.16b, v18.16b, v1.16b
+	ins	v0.d[1], v18.d[0]		// Xl.d[1] ^= t2.d[0]
+	ins	v2.d[0], v18.d[1]		// Xh.d[0] ^= t2.d[1]
+
+	ushr	v18.2d, v0.2d, #1		// 2nd phase
+	eor	v2.16b, v2.16b,v0.16b
+	eor	v0.16b, v0.16b,v18.16b	//
+	ushr	v18.2d, v18.2d, #6
+	ushr	v0.2d, v0.2d, #1		//
+	eor	v0.16b, v0.16b, v2.16b	//
+	eor	v0.16b, v0.16b, v18.16b	//
+
+	subs	x3, x3, #16
+	bne	.Loop_neon
+
+	rev64	v0.16b, v0.16b		// byteswap Xi and write
+	ext	v0.16b, v0.16b, v0.16b, #8
+	st1	{v0.16b}, [x0]
+
+	ret
+.size	gcm_ghash_neon,.-gcm_ghash_neon
+
+.section	.rodata
+.align	4
+.Lmasks:
+.quad	0x0000ffffffffffff	// k48
+.quad	0x00000000ffffffff	// k32
+.quad	0x000000000000ffff	// k16
+.quad	0x0000000000000000	// k0
+.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/bcm/ghash-neon-armv8-win.S b/gen/bcm/ghash-neon-armv8-win.S
new file mode 100644
index 0000000..d968893
--- /dev/null
+++ b/gen/bcm/ghash-neon-armv8-win.S
@@ -0,0 +1,341 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+#include <openssl/arm_arch.h>
+
+.text
+
+.globl	gcm_init_neon
+
+.def gcm_init_neon
+   .type 32
+.endef
+.align	4
+gcm_init_neon:
+	AARCH64_VALID_CALL_TARGET
+	// This function is adapted from gcm_init_v8. xC2 is t3.
+	ld1	{v17.2d}, [x1]			// load H
+	movi	v19.16b, #0xe1
+	shl	v19.2d, v19.2d, #57		// 0xc2.0
+	ext	v3.16b, v17.16b, v17.16b, #8
+	ushr	v18.2d, v19.2d, #63
+	dup	v17.4s, v17.s[1]
+	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
+	ushr	v18.2d, v3.2d, #63
+	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
+	and	v18.16b, v18.16b, v16.16b
+	shl	v3.2d, v3.2d, #1
+	ext	v18.16b, v18.16b, v18.16b, #8
+	and	v16.16b, v16.16b, v17.16b
+	orr	v3.16b, v3.16b, v18.16b	// H<<<=1
+	eor	v5.16b, v3.16b, v16.16b	// twisted H
+	st1	{v5.2d}, [x0]			// store Htable[0]
+	ret
+
+
+.globl	gcm_gmult_neon
+
+.def gcm_gmult_neon
+   .type 32
+.endef
+.align	4
+gcm_gmult_neon:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{v3.16b}, [x0]		// load Xi
+	ld1	{v5.1d}, [x1], #8		// load twisted H
+	ld1	{v6.1d}, [x1]
+	adrp	x9, Lmasks		// load constants
+	add	x9, x9, :lo12:Lmasks
+	ld1	{v24.2d, v25.2d}, [x9]
+	rev64	v3.16b, v3.16b		// byteswap Xi
+	ext	v3.16b, v3.16b, v3.16b, #8
+	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing
+
+	mov	x3, #16
+	b	Lgmult_neon
+
+
+.globl	gcm_ghash_neon
+
+.def gcm_ghash_neon
+   .type 32
+.endef
+.align	4
+gcm_ghash_neon:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{v0.16b}, [x0]		// load Xi
+	ld1	{v5.1d}, [x1], #8		// load twisted H
+	ld1	{v6.1d}, [x1]
+	adrp	x9, Lmasks		// load constants
+	add	x9, x9, :lo12:Lmasks
+	ld1	{v24.2d, v25.2d}, [x9]
+	rev64	v0.16b, v0.16b		// byteswap Xi
+	ext	v0.16b, v0.16b, v0.16b, #8
+	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing
+
+Loop_neon:
+	ld1	{v3.16b}, [x2], #16	// load inp
+	rev64	v3.16b, v3.16b		// byteswap inp
+	ext	v3.16b, v3.16b, v3.16b, #8
+	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi
+
+Lgmult_neon:
+	// Split the input into v3 and v4. (The upper halves are unused,
+	// so it is okay to leave them alone.)
+	ins	v4.d[0], v3.d[1]
+	ext	v16.8b, v5.8b, v5.8b, #1	// A1
+	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
+	ext	v0.8b, v3.8b, v3.8b, #1		// B1
+	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
+	ext	v17.8b, v5.8b, v5.8b, #2	// A2
+	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
+	ext	v19.8b, v3.8b, v3.8b, #2	// B2
+	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
+	ext	v18.8b, v5.8b, v5.8b, #3	// A3
+	eor	v16.16b, v16.16b, v0.16b	// L = E + F
+	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
+	ext	v0.8b, v3.8b, v3.8b, #3		// B3
+	eor	v17.16b, v17.16b, v19.16b	// M = G + H
+	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3
+
+	// Here we diverge from the 32-bit version. It computes the following
+	// (instructions reordered for clarity):
+	//
+	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
+	//     vand	$t0#hi, $t0#hi, $k48
+	//     veor	$t0#lo, $t0#lo, $t0#hi
+	//
+	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
+	//     vand	$t1#hi, $t1#hi, $k32
+	//     veor	$t1#lo, $t1#lo, $t1#hi
+	//
+	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
+	//     vand	$t2#hi, $t2#hi, $k16
+	//     veor	$t2#lo, $t2#lo, $t2#hi
+	//
+	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
+	//     vmov.i64	$t3#hi, #0
+	//
+	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+	// upper halves of SIMD registers, so we must split each half into
+	// separate registers. To compensate, we pair computations up and
+	// parallelize.
+
+	ext	v19.8b, v3.8b, v3.8b, #4	// B4
+	eor	v18.16b, v18.16b, v0.16b	// N = I + J
+	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4
+
+	// This can probably be scheduled more efficiently. For now, we just
+	// pair up independent instructions.
+	zip1	v20.2d, v16.2d, v17.2d
+	zip1	v22.2d, v18.2d, v19.2d
+	zip2	v21.2d, v16.2d, v17.2d
+	zip2	v23.2d, v18.2d, v19.2d
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	and	v21.16b, v21.16b, v24.16b
+	and	v23.16b, v23.16b, v25.16b
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	zip1	v16.2d, v20.2d, v21.2d
+	zip1	v18.2d, v22.2d, v23.2d
+	zip2	v17.2d, v20.2d, v21.2d
+	zip2	v19.2d, v22.2d, v23.2d
+
+	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
+	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
+	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
+	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
+	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
+	eor	v16.16b, v16.16b, v17.16b
+	eor	v18.16b, v18.16b, v19.16b
+	eor	v0.16b, v0.16b, v16.16b
+	eor	v0.16b, v0.16b, v18.16b
+	eor	v3.8b, v3.8b, v4.8b	// Karatsuba pre-processing
+	ext	v16.8b, v7.8b, v7.8b, #1	// A1
+	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
+	ext	v1.8b, v3.8b, v3.8b, #1		// B1
+	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
+	ext	v17.8b, v7.8b, v7.8b, #2	// A2
+	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
+	ext	v19.8b, v3.8b, v3.8b, #2	// B2
+	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
+	ext	v18.8b, v7.8b, v7.8b, #3	// A3
+	eor	v16.16b, v16.16b, v1.16b	// L = E + F
+	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
+	ext	v1.8b, v3.8b, v3.8b, #3		// B3
+	eor	v17.16b, v17.16b, v19.16b	// M = G + H
+	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3
+
+	// Here we diverge from the 32-bit version. It computes the following
+	// (instructions reordered for clarity):
+	//
+	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
+	//     vand	$t0#hi, $t0#hi, $k48
+	//     veor	$t0#lo, $t0#lo, $t0#hi
+	//
+	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
+	//     vand	$t1#hi, $t1#hi, $k32
+	//     veor	$t1#lo, $t1#lo, $t1#hi
+	//
+	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
+	//     vand	$t2#hi, $t2#hi, $k16
+	//     veor	$t2#lo, $t2#lo, $t2#hi
+	//
+	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
+	//     vmov.i64	$t3#hi, #0
+	//
+	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+	// upper halves of SIMD registers, so we must split each half into
+	// separate registers. To compensate, we pair computations up and
+	// parallelize.
+
+	ext	v19.8b, v3.8b, v3.8b, #4	// B4
+	eor	v18.16b, v18.16b, v1.16b	// N = I + J
+	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4
+
+	// This can probably be scheduled more efficiently. For now, we just
+	// pair up independent instructions.
+	zip1	v20.2d, v16.2d, v17.2d
+	zip1	v22.2d, v18.2d, v19.2d
+	zip2	v21.2d, v16.2d, v17.2d
+	zip2	v23.2d, v18.2d, v19.2d
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	and	v21.16b, v21.16b, v24.16b
+	and	v23.16b, v23.16b, v25.16b
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	zip1	v16.2d, v20.2d, v21.2d
+	zip1	v18.2d, v22.2d, v23.2d
+	zip2	v17.2d, v20.2d, v21.2d
+	zip2	v19.2d, v22.2d, v23.2d
+
+	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
+	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
+	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
+	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
+	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
+	eor	v16.16b, v16.16b, v17.16b
+	eor	v18.16b, v18.16b, v19.16b
+	eor	v1.16b, v1.16b, v16.16b
+	eor	v1.16b, v1.16b, v18.16b
+	ext	v16.8b, v6.8b, v6.8b, #1	// A1
+	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
+	ext	v2.8b, v4.8b, v4.8b, #1		// B1
+	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
+	ext	v17.8b, v6.8b, v6.8b, #2	// A2
+	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
+	ext	v19.8b, v4.8b, v4.8b, #2	// B2
+	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
+	ext	v18.8b, v6.8b, v6.8b, #3	// A3
+	eor	v16.16b, v16.16b, v2.16b	// L = E + F
+	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
+	ext	v2.8b, v4.8b, v4.8b, #3		// B3
+	eor	v17.16b, v17.16b, v19.16b	// M = G + H
+	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3
+
+	// Here we diverge from the 32-bit version. It computes the following
+	// (instructions reordered for clarity):
+	//
+	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
+	//     vand	$t0#hi, $t0#hi, $k48
+	//     veor	$t0#lo, $t0#lo, $t0#hi
+	//
+	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
+	//     vand	$t1#hi, $t1#hi, $k32
+	//     veor	$t1#lo, $t1#lo, $t1#hi
+	//
+	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
+	//     vand	$t2#hi, $t2#hi, $k16
+	//     veor	$t2#lo, $t2#lo, $t2#hi
+	//
+	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
+	//     vmov.i64	$t3#hi, #0
+	//
+	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+	// upper halves of SIMD registers, so we must split each half into
+	// separate registers. To compensate, we pair computations up and
+	// parallelize.
+
+	ext	v19.8b, v4.8b, v4.8b, #4	// B4
+	eor	v18.16b, v18.16b, v2.16b	// N = I + J
+	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4
+
+	// This can probably be scheduled more efficiently. For now, we just
+	// pair up independent instructions.
+	zip1	v20.2d, v16.2d, v17.2d
+	zip1	v22.2d, v18.2d, v19.2d
+	zip2	v21.2d, v16.2d, v17.2d
+	zip2	v23.2d, v18.2d, v19.2d
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	and	v21.16b, v21.16b, v24.16b
+	and	v23.16b, v23.16b, v25.16b
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	zip1	v16.2d, v20.2d, v21.2d
+	zip1	v18.2d, v22.2d, v23.2d
+	zip2	v17.2d, v20.2d, v21.2d
+	zip2	v19.2d, v22.2d, v23.2d
+
+	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
+	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
+	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
+	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
+	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
+	eor	v16.16b, v16.16b, v17.16b
+	eor	v18.16b, v18.16b, v19.16b
+	eor	v2.16b, v2.16b, v16.16b
+	eor	v2.16b, v2.16b, v18.16b
+	ext	v16.16b, v0.16b, v2.16b, #8
+	eor	v1.16b, v1.16b, v0.16b	// Karatsuba post-processing
+	eor	v1.16b, v1.16b, v2.16b
+	eor	v1.16b, v1.16b, v16.16b	// Xm overlaps Xh.lo and Xl.hi
+	ins	v0.d[1], v1.d[0]		// Xh|Xl - 256-bit result
+	// This is a no-op due to the ins instruction below.
+	// ins	v2.d[0], v1.d[1]
+
+	// equivalent of reduction_avx from ghash-x86_64.pl
+	shl	v17.2d, v0.2d, #57		// 1st phase
+	shl	v18.2d, v0.2d, #62
+	eor	v18.16b, v18.16b, v17.16b	//
+	shl	v17.2d, v0.2d, #63
+	eor	v18.16b, v18.16b, v17.16b	//
+	// Note Xm contains {Xl.d[1], Xh.d[0]}.
+	eor	v18.16b, v18.16b, v1.16b
+	ins	v0.d[1], v18.d[0]		// Xl.d[1] ^= t2.d[0]
+	ins	v2.d[0], v18.d[1]		// Xh.d[0] ^= t2.d[1]
+
+	ushr	v18.2d, v0.2d, #1		// 2nd phase
+	eor	v2.16b, v2.16b,v0.16b
+	eor	v0.16b, v0.16b,v18.16b	//
+	ushr	v18.2d, v18.2d, #6
+	ushr	v0.2d, v0.2d, #1		//
+	eor	v0.16b, v0.16b, v2.16b	//
+	eor	v0.16b, v0.16b, v18.16b	//
+
+	subs	x3, x3, #16
+	bne	Loop_neon
+
+	rev64	v0.16b, v0.16b		// byteswap Xi and write
+	ext	v0.16b, v0.16b, v0.16b, #8
+	st1	{v0.16b}, [x0]
+
+	ret
+
+
+.section	.rodata
+.align	4
+Lmasks:
+.quad	0x0000ffffffffffff	// k48
+.quad	0x00000000ffffffff	// k32
+.quad	0x000000000000ffff	// k16
+.quad	0x0000000000000000	// k0
+.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/bcm/ghash-ssse3-x86-apple.S b/gen/bcm/ghash-ssse3-x86-apple.S
new file mode 100644
index 0000000..24b1f2f
--- /dev/null
+++ b/gen/bcm/ghash-ssse3-x86-apple.S
@@ -0,0 +1,288 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
+.text
+.globl	_gcm_gmult_ssse3
+.private_extern	_gcm_gmult_ssse3
+.align	4
+_gcm_gmult_ssse3:
+L_gcm_gmult_ssse3_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%edi
+	movl	24(%esp),%esi
+	movdqu	(%edi),%xmm0
+	call	L000pic_point
+L000pic_point:
+	popl	%eax
+	movdqa	Lreverse_bytes-L000pic_point(%eax),%xmm7
+	movdqa	Llow4_mask-L000pic_point(%eax),%xmm2
+.byte	102,15,56,0,199
+	movdqa	%xmm2,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm2,%xmm0
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	movl	$5,%eax
+L001loop_row_1:
+	movdqa	(%esi),%xmm4
+	leal	16(%esi),%esi
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+	pxor	%xmm5,%xmm2
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+	subl	$1,%eax
+	jnz	L001loop_row_1
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movl	$5,%eax
+L002loop_row_2:
+	movdqa	(%esi),%xmm4
+	leal	16(%esi),%esi
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+	pxor	%xmm5,%xmm2
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+	subl	$1,%eax
+	jnz	L002loop_row_2
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movl	$6,%eax
+L003loop_row_3:
+	movdqa	(%esi),%xmm4
+	leal	16(%esi),%esi
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+	pxor	%xmm5,%xmm2
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+	subl	$1,%eax
+	jnz	L003loop_row_3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+.byte	102,15,56,0,215
+	movdqu	%xmm2,(%edi)
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.globl	_gcm_ghash_ssse3
+.private_extern	_gcm_ghash_ssse3
+.align	4
+_gcm_ghash_ssse3:
+L_gcm_ghash_ssse3_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%edi
+	movl	24(%esp),%esi
+	movl	28(%esp),%edx
+	movl	32(%esp),%ecx
+	movdqu	(%edi),%xmm0
+	call	L004pic_point
+L004pic_point:
+	popl	%ebx
+	movdqa	Lreverse_bytes-L004pic_point(%ebx),%xmm7
+	andl	$-16,%ecx
+.byte	102,15,56,0,199
+	pxor	%xmm3,%xmm3
+L005loop_ghash:
+	movdqa	Llow4_mask-L004pic_point(%ebx),%xmm2
+	movdqu	(%edx),%xmm1
+.byte	102,15,56,0,207
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm2,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm2,%xmm0
+	pxor	%xmm2,%xmm2
+	movl	$5,%eax
+L006loop_row_4:
+	movdqa	(%esi),%xmm4
+	leal	16(%esi),%esi
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+	pxor	%xmm5,%xmm2
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+	subl	$1,%eax
+	jnz	L006loop_row_4
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movl	$5,%eax
+L007loop_row_5:
+	movdqa	(%esi),%xmm4
+	leal	16(%esi),%esi
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+	pxor	%xmm5,%xmm2
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+	subl	$1,%eax
+	jnz	L007loop_row_5
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movl	$6,%eax
+L008loop_row_6:
+	movdqa	(%esi),%xmm4
+	leal	16(%esi),%esi
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+	pxor	%xmm5,%xmm2
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+	subl	$1,%eax
+	jnz	L008loop_row_6
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movdqa	%xmm2,%xmm0
+	leal	-256(%esi),%esi
+	leal	16(%edx),%edx
+	subl	$16,%ecx
+	jnz	L005loop_ghash
+.byte	102,15,56,0,199
+	movdqu	%xmm0,(%edi)
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	4,0x90
+Lreverse_bytes:
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.align	4,0x90
+Llow4_mask:
+.long	252645135,252645135,252645135,252645135
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/bcm/ghash-ssse3-x86-linux.S b/gen/bcm/ghash-ssse3-x86-linux.S
new file mode 100644
index 0000000..445db3b
--- /dev/null
+++ b/gen/bcm/ghash-ssse3-x86-linux.S
@@ -0,0 +1,292 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
+.text
+.globl	gcm_gmult_ssse3
+.hidden	gcm_gmult_ssse3
+.type	gcm_gmult_ssse3,@function
+.align	16
+gcm_gmult_ssse3:
+.L_gcm_gmult_ssse3_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%edi
+	movl	24(%esp),%esi
+	movdqu	(%edi),%xmm0
+	call	.L000pic_point
+.L000pic_point:
+	popl	%eax
+	movdqa	.Lreverse_bytes-.L000pic_point(%eax),%xmm7
+	movdqa	.Llow4_mask-.L000pic_point(%eax),%xmm2
+.byte	102,15,56,0,199
+	movdqa	%xmm2,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm2,%xmm0
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	movl	$5,%eax
+.L001loop_row_1:
+	movdqa	(%esi),%xmm4
+	leal	16(%esi),%esi
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+	pxor	%xmm5,%xmm2
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+	subl	$1,%eax
+	jnz	.L001loop_row_1
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movl	$5,%eax
+.L002loop_row_2:
+	movdqa	(%esi),%xmm4
+	leal	16(%esi),%esi
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+	pxor	%xmm5,%xmm2
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+	subl	$1,%eax
+	jnz	.L002loop_row_2
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movl	$6,%eax
+.L003loop_row_3:
+	movdqa	(%esi),%xmm4
+	leal	16(%esi),%esi
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+	pxor	%xmm5,%xmm2
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+	subl	$1,%eax
+	jnz	.L003loop_row_3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+.byte	102,15,56,0,215
+	movdqu	%xmm2,(%edi)
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	gcm_gmult_ssse3,.-.L_gcm_gmult_ssse3_begin
+.globl	gcm_ghash_ssse3
+.hidden	gcm_ghash_ssse3
+.type	gcm_ghash_ssse3,@function
+.align	16
+gcm_ghash_ssse3:
+.L_gcm_ghash_ssse3_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%edi
+	movl	24(%esp),%esi
+	movl	28(%esp),%edx
+	movl	32(%esp),%ecx
+	movdqu	(%edi),%xmm0
+	call	.L004pic_point
+.L004pic_point:
+	popl	%ebx
+	movdqa	.Lreverse_bytes-.L004pic_point(%ebx),%xmm7
+	andl	$-16,%ecx
+.byte	102,15,56,0,199
+	pxor	%xmm3,%xmm3
+.L005loop_ghash:
+	movdqa	.Llow4_mask-.L004pic_point(%ebx),%xmm2
+	movdqu	(%edx),%xmm1
+.byte	102,15,56,0,207
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm2,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm2,%xmm0
+	pxor	%xmm2,%xmm2
+	movl	$5,%eax
+.L006loop_row_4:
+	movdqa	(%esi),%xmm4
+	leal	16(%esi),%esi
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+	pxor	%xmm5,%xmm2
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+	subl	$1,%eax
+	jnz	.L006loop_row_4
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movl	$5,%eax
+.L007loop_row_5:
+	movdqa	(%esi),%xmm4
+	leal	16(%esi),%esi
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+	pxor	%xmm5,%xmm2
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+	subl	$1,%eax
+	jnz	.L007loop_row_5
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movl	$6,%eax
+.L008loop_row_6:
+	movdqa	(%esi),%xmm4
+	leal	16(%esi),%esi
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+	pxor	%xmm5,%xmm2
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+	subl	$1,%eax
+	jnz	.L008loop_row_6
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movdqa	%xmm2,%xmm0
+	leal	-256(%esi),%esi
+	leal	16(%edx),%edx
+	subl	$16,%ecx
+	jnz	.L005loop_ghash
+.byte	102,15,56,0,199
+	movdqu	%xmm0,(%edi)
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	gcm_ghash_ssse3,.-.L_gcm_ghash_ssse3_begin
+.align	16
+.Lreverse_bytes:
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.align	16
+.Llow4_mask:
+.long	252645135,252645135,252645135,252645135
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
diff --git a/gen/bcm/ghash-ssse3-x86-win.asm b/gen/bcm/ghash-ssse3-x86-win.asm
new file mode 100644
index 0000000..52108aa
--- /dev/null
+++ b/gen/bcm/ghash-ssse3-x86-win.asm
@@ -0,0 +1,297 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__, win32
+%ifidn __OUTPUT_FORMAT__,obj
+section	code	use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+$@feat.00 equ 1
+section	.text	code align=64
+%else
+section	.text	code
+%endif
+global	_gcm_gmult_ssse3
+align	16
+_gcm_gmult_ssse3:
+L$_gcm_gmult_ssse3_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	mov	edi,DWORD [20+esp]
+	mov	esi,DWORD [24+esp]
+	movdqu	xmm0,[edi]
+	call	L$000pic_point
+L$000pic_point:
+	pop	eax
+	movdqa	xmm7,[(L$reverse_bytes-L$000pic_point)+eax]
+	movdqa	xmm2,[(L$low4_mask-L$000pic_point)+eax]
+db	102,15,56,0,199
+	movdqa	xmm1,xmm2
+	pandn	xmm1,xmm0
+	psrld	xmm1,4
+	pand	xmm0,xmm2
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	mov	eax,5
+L$001loop_row_1:
+	movdqa	xmm4,[esi]
+	lea	esi,[16+esi]
+	movdqa	xmm6,xmm2
+db	102,15,58,15,243,1
+	movdqa	xmm3,xmm6
+	psrldq	xmm2,1
+	movdqa	xmm5,xmm4
+db	102,15,56,0,224
+db	102,15,56,0,233
+	pxor	xmm2,xmm5
+	movdqa	xmm5,xmm4
+	psllq	xmm5,60
+	movdqa	xmm6,xmm5
+	pslldq	xmm6,8
+	pxor	xmm3,xmm6
+	psrldq	xmm5,8
+	pxor	xmm2,xmm5
+	psrlq	xmm4,4
+	pxor	xmm2,xmm4
+	sub	eax,1
+	jnz	NEAR L$001loop_row_1
+	pxor	xmm2,xmm3
+	psrlq	xmm3,1
+	pxor	xmm2,xmm3
+	psrlq	xmm3,1
+	pxor	xmm2,xmm3
+	psrlq	xmm3,5
+	pxor	xmm2,xmm3
+	pxor	xmm3,xmm3
+	mov	eax,5
+L$002loop_row_2:
+	movdqa	xmm4,[esi]
+	lea	esi,[16+esi]
+	movdqa	xmm6,xmm2
+db	102,15,58,15,243,1
+	movdqa	xmm3,xmm6
+	psrldq	xmm2,1
+	movdqa	xmm5,xmm4
+db	102,15,56,0,224
+db	102,15,56,0,233
+	pxor	xmm2,xmm5
+	movdqa	xmm5,xmm4
+	psllq	xmm5,60
+	movdqa	xmm6,xmm5
+	pslldq	xmm6,8
+	pxor	xmm3,xmm6
+	psrldq	xmm5,8
+	pxor	xmm2,xmm5
+	psrlq	xmm4,4
+	pxor	xmm2,xmm4
+	sub	eax,1
+	jnz	NEAR L$002loop_row_2
+	pxor	xmm2,xmm3
+	psrlq	xmm3,1
+	pxor	xmm2,xmm3
+	psrlq	xmm3,1
+	pxor	xmm2,xmm3
+	psrlq	xmm3,5
+	pxor	xmm2,xmm3
+	pxor	xmm3,xmm3
+	mov	eax,6
+L$003loop_row_3:
+	movdqa	xmm4,[esi]
+	lea	esi,[16+esi]
+	movdqa	xmm6,xmm2
+db	102,15,58,15,243,1
+	movdqa	xmm3,xmm6
+	psrldq	xmm2,1
+	movdqa	xmm5,xmm4
+db	102,15,56,0,224
+db	102,15,56,0,233
+	pxor	xmm2,xmm5
+	movdqa	xmm5,xmm4
+	psllq	xmm5,60
+	movdqa	xmm6,xmm5
+	pslldq	xmm6,8
+	pxor	xmm3,xmm6
+	psrldq	xmm5,8
+	pxor	xmm2,xmm5
+	psrlq	xmm4,4
+	pxor	xmm2,xmm4
+	sub	eax,1
+	jnz	NEAR L$003loop_row_3
+	pxor	xmm2,xmm3
+	psrlq	xmm3,1
+	pxor	xmm2,xmm3
+	psrlq	xmm3,1
+	pxor	xmm2,xmm3
+	psrlq	xmm3,5
+	pxor	xmm2,xmm3
+	pxor	xmm3,xmm3
+db	102,15,56,0,215
+	movdqu	[edi],xmm2
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	pxor	xmm6,xmm6
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+global	_gcm_ghash_ssse3
+align	16
+_gcm_ghash_ssse3:
+L$_gcm_ghash_ssse3_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	mov	edi,DWORD [20+esp]
+	mov	esi,DWORD [24+esp]
+	mov	edx,DWORD [28+esp]
+	mov	ecx,DWORD [32+esp]
+	movdqu	xmm0,[edi]
+	call	L$004pic_point
+L$004pic_point:
+	pop	ebx
+	movdqa	xmm7,[(L$reverse_bytes-L$004pic_point)+ebx]
+	and	ecx,-16
+db	102,15,56,0,199
+	pxor	xmm3,xmm3
+L$005loop_ghash:
+	movdqa	xmm2,[(L$low4_mask-L$004pic_point)+ebx]
+	movdqu	xmm1,[edx]
+db	102,15,56,0,207
+	pxor	xmm0,xmm1
+	movdqa	xmm1,xmm2
+	pandn	xmm1,xmm0
+	psrld	xmm1,4
+	pand	xmm0,xmm2
+	pxor	xmm2,xmm2
+	mov	eax,5
+L$006loop_row_4:
+	movdqa	xmm4,[esi]
+	lea	esi,[16+esi]
+	movdqa	xmm6,xmm2
+db	102,15,58,15,243,1
+	movdqa	xmm3,xmm6
+	psrldq	xmm2,1
+	movdqa	xmm5,xmm4
+db	102,15,56,0,224
+db	102,15,56,0,233
+	pxor	xmm2,xmm5
+	movdqa	xmm5,xmm4
+	psllq	xmm5,60
+	movdqa	xmm6,xmm5
+	pslldq	xmm6,8
+	pxor	xmm3,xmm6
+	psrldq	xmm5,8
+	pxor	xmm2,xmm5
+	psrlq	xmm4,4
+	pxor	xmm2,xmm4
+	sub	eax,1
+	jnz	NEAR L$006loop_row_4
+	pxor	xmm2,xmm3
+	psrlq	xmm3,1
+	pxor	xmm2,xmm3
+	psrlq	xmm3,1
+	pxor	xmm2,xmm3
+	psrlq	xmm3,5
+	pxor	xmm2,xmm3
+	pxor	xmm3,xmm3
+	mov	eax,5
+L$007loop_row_5:
+	movdqa	xmm4,[esi]
+	lea	esi,[16+esi]
+	movdqa	xmm6,xmm2
+db	102,15,58,15,243,1
+	movdqa	xmm3,xmm6
+	psrldq	xmm2,1
+	movdqa	xmm5,xmm4
+db	102,15,56,0,224
+db	102,15,56,0,233
+	pxor	xmm2,xmm5
+	movdqa	xmm5,xmm4
+	psllq	xmm5,60
+	movdqa	xmm6,xmm5
+	pslldq	xmm6,8
+	pxor	xmm3,xmm6
+	psrldq	xmm5,8
+	pxor	xmm2,xmm5
+	psrlq	xmm4,4
+	pxor	xmm2,xmm4
+	sub	eax,1
+	jnz	NEAR L$007loop_row_5
+	pxor	xmm2,xmm3
+	psrlq	xmm3,1
+	pxor	xmm2,xmm3
+	psrlq	xmm3,1
+	pxor	xmm2,xmm3
+	psrlq	xmm3,5
+	pxor	xmm2,xmm3
+	pxor	xmm3,xmm3
+	mov	eax,6
+L$008loop_row_6:
+	movdqa	xmm4,[esi]
+	lea	esi,[16+esi]
+	movdqa	xmm6,xmm2
+db	102,15,58,15,243,1
+	movdqa	xmm3,xmm6
+	psrldq	xmm2,1
+	movdqa	xmm5,xmm4
+db	102,15,56,0,224
+db	102,15,56,0,233
+	pxor	xmm2,xmm5
+	movdqa	xmm5,xmm4
+	psllq	xmm5,60
+	movdqa	xmm6,xmm5
+	pslldq	xmm6,8
+	pxor	xmm3,xmm6
+	psrldq	xmm5,8
+	pxor	xmm2,xmm5
+	psrlq	xmm4,4
+	pxor	xmm2,xmm4
+	sub	eax,1
+	jnz	NEAR L$008loop_row_6
+	pxor	xmm2,xmm3
+	psrlq	xmm3,1
+	pxor	xmm2,xmm3
+	psrlq	xmm3,1
+	pxor	xmm2,xmm3
+	psrlq	xmm3,5
+	pxor	xmm2,xmm3
+	pxor	xmm3,xmm3
+	movdqa	xmm0,xmm2
+	lea	esi,[esi-256]
+	lea	edx,[16+edx]
+	sub	ecx,16
+	jnz	NEAR L$005loop_ghash
+db	102,15,56,0,199
+	movdqu	[edi],xmm0
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	pxor	xmm6,xmm6
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+align	16
+L$reverse_bytes:
+db	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+align	16
+L$low4_mask:
+dd	252645135,252645135,252645135,252645135
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/ghash-ssse3-x86_64-apple.S b/gen/bcm/ghash-ssse3-x86_64-apple.S
new file mode 100644
index 0000000..bcbf824
--- /dev/null
+++ b/gen/bcm/ghash-ssse3-x86_64-apple.S
@@ -0,0 +1,423 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text	
+
+
+
+
+
+
+.globl	_gcm_gmult_ssse3
+.private_extern _gcm_gmult_ssse3
+.p2align	4
+_gcm_gmult_ssse3:
+
+
+_CET_ENDBR
+	movdqu	(%rdi),%xmm0
+	movdqa	L$reverse_bytes(%rip),%xmm10
+	movdqa	L$low4_mask(%rip),%xmm2
+
+
+.byte	102,65,15,56,0,194
+
+
+	movdqa	%xmm2,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm2,%xmm0
+
+
+
+
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	movq	$5,%rax
+L$oop_row_1:
+	movdqa	(%rsi),%xmm4
+	leaq	16(%rsi),%rsi
+
+
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+
+
+
+
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+
+
+	pxor	%xmm5,%xmm2
+
+
+
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+
+
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+
+	subq	$1,%rax
+	jnz	L$oop_row_1
+
+
+
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movq	$5,%rax
+L$oop_row_2:
+	movdqa	(%rsi),%xmm4
+	leaq	16(%rsi),%rsi
+
+
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+
+
+
+
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+
+
+	pxor	%xmm5,%xmm2
+
+
+
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+
+
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+
+	subq	$1,%rax
+	jnz	L$oop_row_2
+
+
+
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movq	$6,%rax
+L$oop_row_3:
+	movdqa	(%rsi),%xmm4
+	leaq	16(%rsi),%rsi
+
+
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+
+
+
+
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+
+
+	pxor	%xmm5,%xmm2
+
+
+
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+
+
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+
+	subq	$1,%rax
+	jnz	L$oop_row_3
+
+
+
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+
+.byte	102,65,15,56,0,210
+	movdqu	%xmm2,(%rdi)
+
+
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	ret
+
+
+
+
+
+
+
+
+
+.globl	_gcm_ghash_ssse3
+.private_extern _gcm_ghash_ssse3
+.p2align	4
+_gcm_ghash_ssse3:
+
+
+_CET_ENDBR
+	movdqu	(%rdi),%xmm0
+	movdqa	L$reverse_bytes(%rip),%xmm10
+	movdqa	L$low4_mask(%rip),%xmm11
+
+
+	andq	$-16,%rcx
+
+
+
+.byte	102,65,15,56,0,194
+
+
+	pxor	%xmm3,%xmm3
+L$oop_ghash:
+
+	movdqu	(%rdx),%xmm1
+.byte	102,65,15,56,0,202
+	pxor	%xmm1,%xmm0
+
+
+	movdqa	%xmm11,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm11,%xmm0
+
+
+
+
+	pxor	%xmm2,%xmm2
+
+	movq	$5,%rax
+L$oop_row_4:
+	movdqa	(%rsi),%xmm4
+	leaq	16(%rsi),%rsi
+
+
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+
+
+
+
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+
+
+	pxor	%xmm5,%xmm2
+
+
+
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+
+
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+
+	subq	$1,%rax
+	jnz	L$oop_row_4
+
+
+
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movq	$5,%rax
+L$oop_row_5:
+	movdqa	(%rsi),%xmm4
+	leaq	16(%rsi),%rsi
+
+
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+
+
+
+
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+
+
+	pxor	%xmm5,%xmm2
+
+
+
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+
+
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+
+	subq	$1,%rax
+	jnz	L$oop_row_5
+
+
+
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movq	$6,%rax
+L$oop_row_6:
+	movdqa	(%rsi),%xmm4
+	leaq	16(%rsi),%rsi
+
+
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+
+
+
+
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+
+
+	pxor	%xmm5,%xmm2
+
+
+
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+
+
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+
+	subq	$1,%rax
+	jnz	L$oop_row_6
+
+
+
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movdqa	%xmm2,%xmm0
+
+
+	leaq	-256(%rsi),%rsi
+
+
+	leaq	16(%rdx),%rdx
+	subq	$16,%rcx
+	jnz	L$oop_ghash
+
+
+.byte	102,65,15,56,0,194
+	movdqu	%xmm0,(%rdi)
+
+
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	ret
+
+
+
+
+.section	__DATA,__const
+.p2align	4
+
+
+L$reverse_bytes:
+.byte	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+L$low4_mask:
+.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
+.text	
+#endif
diff --git a/gen/bcm/ghash-ssse3-x86_64-linux.S b/gen/bcm/ghash-ssse3-x86_64-linux.S
new file mode 100644
index 0000000..2acb448
--- /dev/null
+++ b/gen/bcm/ghash-ssse3-x86_64-linux.S
@@ -0,0 +1,423 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text	
+
+
+
+
+
+.type	gcm_gmult_ssse3, @function
+.globl	gcm_gmult_ssse3
+.hidden gcm_gmult_ssse3
+.align	16
+gcm_gmult_ssse3:
+.cfi_startproc	
+
+_CET_ENDBR
+	movdqu	(%rdi),%xmm0
+	movdqa	.Lreverse_bytes(%rip),%xmm10
+	movdqa	.Llow4_mask(%rip),%xmm2
+
+
+.byte	102,65,15,56,0,194
+
+
+	movdqa	%xmm2,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm2,%xmm0
+
+
+
+
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	movq	$5,%rax
+.Loop_row_1:
+	movdqa	(%rsi),%xmm4
+	leaq	16(%rsi),%rsi
+
+
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+
+
+
+
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+
+
+	pxor	%xmm5,%xmm2
+
+
+
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+
+
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+
+	subq	$1,%rax
+	jnz	.Loop_row_1
+
+
+
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movq	$5,%rax
+.Loop_row_2:
+	movdqa	(%rsi),%xmm4
+	leaq	16(%rsi),%rsi
+
+
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+
+
+
+
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+
+
+	pxor	%xmm5,%xmm2
+
+
+
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+
+
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+
+	subq	$1,%rax
+	jnz	.Loop_row_2
+
+
+
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movq	$6,%rax
+.Loop_row_3:
+	movdqa	(%rsi),%xmm4
+	leaq	16(%rsi),%rsi
+
+
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+
+
+
+
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+
+
+	pxor	%xmm5,%xmm2
+
+
+
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+
+
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+
+	subq	$1,%rax
+	jnz	.Loop_row_3
+
+
+
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+
+.byte	102,65,15,56,0,210
+	movdqu	%xmm2,(%rdi)
+
+
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	ret
+.cfi_endproc	
+
+.size	gcm_gmult_ssse3,.-gcm_gmult_ssse3
+
+
+
+
+
+.type	gcm_ghash_ssse3, @function
+.globl	gcm_ghash_ssse3
+.hidden gcm_ghash_ssse3
+.align	16
+gcm_ghash_ssse3:
+.cfi_startproc	
+
+_CET_ENDBR
+	movdqu	(%rdi),%xmm0
+	movdqa	.Lreverse_bytes(%rip),%xmm10
+	movdqa	.Llow4_mask(%rip),%xmm11
+
+
+	andq	$-16,%rcx
+
+
+
+.byte	102,65,15,56,0,194
+
+
+	pxor	%xmm3,%xmm3
+.Loop_ghash:
+
+	movdqu	(%rdx),%xmm1
+.byte	102,65,15,56,0,202
+	pxor	%xmm1,%xmm0
+
+
+	movdqa	%xmm11,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm11,%xmm0
+
+
+
+
+	pxor	%xmm2,%xmm2
+
+	movq	$5,%rax
+.Loop_row_4:
+	movdqa	(%rsi),%xmm4
+	leaq	16(%rsi),%rsi
+
+
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+
+
+
+
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+
+
+	pxor	%xmm5,%xmm2
+
+
+
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+
+
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+
+	subq	$1,%rax
+	jnz	.Loop_row_4
+
+
+
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movq	$5,%rax
+.Loop_row_5:
+	movdqa	(%rsi),%xmm4
+	leaq	16(%rsi),%rsi
+
+
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+
+
+
+
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+
+
+	pxor	%xmm5,%xmm2
+
+
+
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+
+
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+
+	subq	$1,%rax
+	jnz	.Loop_row_5
+
+
+
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movq	$6,%rax
+.Loop_row_6:
+	movdqa	(%rsi),%xmm4
+	leaq	16(%rsi),%rsi
+
+
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+
+
+
+
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+
+
+	pxor	%xmm5,%xmm2
+
+
+
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+
+
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+
+	subq	$1,%rax
+	jnz	.Loop_row_6
+
+
+
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movdqa	%xmm2,%xmm0
+
+
+	leaq	-256(%rsi),%rsi
+
+
+	leaq	16(%rdx),%rdx
+	subq	$16,%rcx
+	jnz	.Loop_ghash
+
+
+.byte	102,65,15,56,0,194
+	movdqu	%xmm0,(%rdi)
+
+
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	ret
+.cfi_endproc	
+
+.size	gcm_ghash_ssse3,.-gcm_ghash_ssse3
+
+.section	.rodata
+.align	16
+
+
+.Lreverse_bytes:
+.byte	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+.Llow4_mask:
+.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
+.text	
+#endif
diff --git a/gen/bcm/ghash-ssse3-x86_64-win.asm b/gen/bcm/ghash-ssse3-x86_64-win.asm
new file mode 100644
index 0000000..84c5d40
--- /dev/null
+++ b/gen/bcm/ghash-ssse3-x86_64-win.asm
@@ -0,0 +1,497 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section	.text code align=64
+
+
+
+
+
+
+
+global	gcm_gmult_ssse3
+ALIGN	16
+gcm_gmult_ssse3:
+
+$L$SEH_begin_gcm_gmult_ssse3_1:
+_CET_ENDBR
+	sub	rsp,40
+$L$SEH_prolog_gcm_gmult_ssse3_2:
+	movdqa	XMMWORD[rsp],xmm6
+$L$SEH_prolog_gcm_gmult_ssse3_3:
+	movdqa	XMMWORD[16+rsp],xmm10
+$L$SEH_prolog_gcm_gmult_ssse3_4:
+	movdqu	xmm0,XMMWORD[rcx]
+	movdqa	xmm10,XMMWORD[$L$reverse_bytes]
+	movdqa	xmm2,XMMWORD[$L$low4_mask]
+
+
+DB	102,65,15,56,0,194
+
+
+	movdqa	xmm1,xmm2
+	pandn	xmm1,xmm0
+	psrld	xmm1,4
+	pand	xmm0,xmm2
+
+
+
+
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	mov	rax,5
+$L$oop_row_1:
+	movdqa	xmm4,XMMWORD[rdx]
+	lea	rdx,[16+rdx]
+
+
+	movdqa	xmm6,xmm2
+DB	102,15,58,15,243,1
+	movdqa	xmm3,xmm6
+	psrldq	xmm2,1
+
+
+
+
+	movdqa	xmm5,xmm4
+DB	102,15,56,0,224
+DB	102,15,56,0,233
+
+
+	pxor	xmm2,xmm5
+
+
+
+	movdqa	xmm5,xmm4
+	psllq	xmm5,60
+	movdqa	xmm6,xmm5
+	pslldq	xmm6,8
+	pxor	xmm3,xmm6
+
+
+	psrldq	xmm5,8
+	pxor	xmm2,xmm5
+	psrlq	xmm4,4
+	pxor	xmm2,xmm4
+
+	sub	rax,1
+	jnz	NEAR $L$oop_row_1
+
+
+
+	pxor	xmm2,xmm3
+	psrlq	xmm3,1
+	pxor	xmm2,xmm3
+	psrlq	xmm3,1
+	pxor	xmm2,xmm3
+	psrlq	xmm3,5
+	pxor	xmm2,xmm3
+	pxor	xmm3,xmm3
+	mov	rax,5
+$L$oop_row_2:
+	movdqa	xmm4,XMMWORD[rdx]
+	lea	rdx,[16+rdx]
+
+
+	movdqa	xmm6,xmm2
+DB	102,15,58,15,243,1
+	movdqa	xmm3,xmm6
+	psrldq	xmm2,1
+
+
+
+
+	movdqa	xmm5,xmm4
+DB	102,15,56,0,224
+DB	102,15,56,0,233
+
+
+	pxor	xmm2,xmm5
+
+
+
+	movdqa	xmm5,xmm4
+	psllq	xmm5,60
+	movdqa	xmm6,xmm5
+	pslldq	xmm6,8
+	pxor	xmm3,xmm6
+
+
+	psrldq	xmm5,8
+	pxor	xmm2,xmm5
+	psrlq	xmm4,4
+	pxor	xmm2,xmm4
+
+	sub	rax,1
+	jnz	NEAR $L$oop_row_2
+
+
+
+	pxor	xmm2,xmm3
+	psrlq	xmm3,1
+	pxor	xmm2,xmm3
+	psrlq	xmm3,1
+	pxor	xmm2,xmm3
+	psrlq	xmm3,5
+	pxor	xmm2,xmm3
+	pxor	xmm3,xmm3
+	mov	rax,6
+$L$oop_row_3:
+	movdqa	xmm4,XMMWORD[rdx]
+	lea	rdx,[16+rdx]
+
+
+	movdqa	xmm6,xmm2
+DB	102,15,58,15,243,1
+	movdqa	xmm3,xmm6
+	psrldq	xmm2,1
+
+
+
+
+	movdqa	xmm5,xmm4
+DB	102,15,56,0,224
+DB	102,15,56,0,233
+
+
+	pxor	xmm2,xmm5
+
+
+
+	movdqa	xmm5,xmm4
+	psllq	xmm5,60
+	movdqa	xmm6,xmm5
+	pslldq	xmm6,8
+	pxor	xmm3,xmm6
+
+
+	psrldq	xmm5,8
+	pxor	xmm2,xmm5
+	psrlq	xmm4,4
+	pxor	xmm2,xmm4
+
+	sub	rax,1
+	jnz	NEAR $L$oop_row_3
+
+
+
+	pxor	xmm2,xmm3
+	psrlq	xmm3,1
+	pxor	xmm2,xmm3
+	psrlq	xmm3,1
+	pxor	xmm2,xmm3
+	psrlq	xmm3,5
+	pxor	xmm2,xmm3
+	pxor	xmm3,xmm3
+
+DB	102,65,15,56,0,210
+	movdqu	XMMWORD[rcx],xmm2
+
+
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	pxor	xmm6,xmm6
+	movdqa	xmm6,XMMWORD[rsp]
+	movdqa	xmm10,XMMWORD[16+rsp]
+	add	rsp,40
+	ret
+
+$L$SEH_end_gcm_gmult_ssse3_5:
+
+
+
+
+
+
+
+global	gcm_ghash_ssse3
+ALIGN	16
+gcm_ghash_ssse3:
+
+$L$SEH_begin_gcm_ghash_ssse3_1:
+_CET_ENDBR
+	sub	rsp,56
+$L$SEH_prolog_gcm_ghash_ssse3_2:
+	movdqa	XMMWORD[rsp],xmm6
+$L$SEH_prolog_gcm_ghash_ssse3_3:
+	movdqa	XMMWORD[16+rsp],xmm10
+$L$SEH_prolog_gcm_ghash_ssse3_4:
+	movdqa	XMMWORD[32+rsp],xmm11
+$L$SEH_prolog_gcm_ghash_ssse3_5:
+	movdqu	xmm0,XMMWORD[rcx]
+	movdqa	xmm10,XMMWORD[$L$reverse_bytes]
+	movdqa	xmm11,XMMWORD[$L$low4_mask]
+
+
+	and	r9,-16
+
+
+
+DB	102,65,15,56,0,194
+
+
+	pxor	xmm3,xmm3
+$L$oop_ghash:
+
+	movdqu	xmm1,XMMWORD[r8]
+DB	102,65,15,56,0,202
+	pxor	xmm0,xmm1
+
+
+	movdqa	xmm1,xmm11
+	pandn	xmm1,xmm0
+	psrld	xmm1,4
+	pand	xmm0,xmm11
+
+
+
+
+	pxor	xmm2,xmm2
+
+	mov	rax,5
+$L$oop_row_4:
+	movdqa	xmm4,XMMWORD[rdx]
+	lea	rdx,[16+rdx]
+
+
+	movdqa	xmm6,xmm2
+DB	102,15,58,15,243,1
+	movdqa	xmm3,xmm6
+	psrldq	xmm2,1
+
+
+
+
+	movdqa	xmm5,xmm4
+DB	102,15,56,0,224
+DB	102,15,56,0,233
+
+
+	pxor	xmm2,xmm5
+
+
+
+	movdqa	xmm5,xmm4
+	psllq	xmm5,60
+	movdqa	xmm6,xmm5
+	pslldq	xmm6,8
+	pxor	xmm3,xmm6
+
+
+	psrldq	xmm5,8
+	pxor	xmm2,xmm5
+	psrlq	xmm4,4
+	pxor	xmm2,xmm4
+
+	sub	rax,1
+	jnz	NEAR $L$oop_row_4
+
+
+
+	pxor	xmm2,xmm3
+	psrlq	xmm3,1
+	pxor	xmm2,xmm3
+	psrlq	xmm3,1
+	pxor	xmm2,xmm3
+	psrlq	xmm3,5
+	pxor	xmm2,xmm3
+	pxor	xmm3,xmm3
+	mov	rax,5
+$L$oop_row_5:
+	movdqa	xmm4,XMMWORD[rdx]
+	lea	rdx,[16+rdx]
+
+
+	movdqa	xmm6,xmm2
+DB	102,15,58,15,243,1
+	movdqa	xmm3,xmm6
+	psrldq	xmm2,1
+
+
+
+
+	movdqa	xmm5,xmm4
+DB	102,15,56,0,224
+DB	102,15,56,0,233
+
+
+	pxor	xmm2,xmm5
+
+
+
+	movdqa	xmm5,xmm4
+	psllq	xmm5,60
+	movdqa	xmm6,xmm5
+	pslldq	xmm6,8
+	pxor	xmm3,xmm6
+
+
+	psrldq	xmm5,8
+	pxor	xmm2,xmm5
+	psrlq	xmm4,4
+	pxor	xmm2,xmm4
+
+	sub	rax,1
+	jnz	NEAR $L$oop_row_5
+
+
+
+	pxor	xmm2,xmm3
+	psrlq	xmm3,1
+	pxor	xmm2,xmm3
+	psrlq	xmm3,1
+	pxor	xmm2,xmm3
+	psrlq	xmm3,5
+	pxor	xmm2,xmm3
+	pxor	xmm3,xmm3
+	mov	rax,6
+$L$oop_row_6:
+	movdqa	xmm4,XMMWORD[rdx]
+	lea	rdx,[16+rdx]
+
+
+	movdqa	xmm6,xmm2
+DB	102,15,58,15,243,1
+	movdqa	xmm3,xmm6
+	psrldq	xmm2,1
+
+
+
+
+	movdqa	xmm5,xmm4
+DB	102,15,56,0,224
+DB	102,15,56,0,233
+
+
+	pxor	xmm2,xmm5
+
+
+
+	movdqa	xmm5,xmm4
+	psllq	xmm5,60
+	movdqa	xmm6,xmm5
+	pslldq	xmm6,8
+	pxor	xmm3,xmm6
+
+
+	psrldq	xmm5,8
+	pxor	xmm2,xmm5
+	psrlq	xmm4,4
+	pxor	xmm2,xmm4
+
+	sub	rax,1
+	jnz	NEAR $L$oop_row_6
+
+
+
+	pxor	xmm2,xmm3
+	psrlq	xmm3,1
+	pxor	xmm2,xmm3
+	psrlq	xmm3,1
+	pxor	xmm2,xmm3
+	psrlq	xmm3,5
+	pxor	xmm2,xmm3
+	pxor	xmm3,xmm3
+	movdqa	xmm0,xmm2
+
+
+	lea	rdx,[((-256))+rdx]
+
+
+	lea	r8,[16+r8]
+	sub	r9,16
+	jnz	NEAR $L$oop_ghash
+
+
+DB	102,65,15,56,0,194
+	movdqu	XMMWORD[rcx],xmm0
+
+
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	pxor	xmm6,xmm6
+	movdqa	xmm6,XMMWORD[rsp]
+	movdqa	xmm10,XMMWORD[16+rsp]
+	movdqa	xmm11,XMMWORD[32+rsp]
+	add	rsp,56
+	ret
+
+$L$SEH_end_gcm_ghash_ssse3_6:
+
+
+section	.rdata rdata align=8
+ALIGN	16
+
+
+$L$reverse_bytes:
+	DB	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+
+$L$low4_mask:
+	DQ	0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f
+section	.text
+
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_gcm_gmult_ssse3_1 wrt ..imagebase
+	DD	$L$SEH_end_gcm_gmult_ssse3_5 wrt ..imagebase
+	DD	$L$SEH_info_gcm_gmult_ssse3_0 wrt ..imagebase
+
+	DD	$L$SEH_begin_gcm_ghash_ssse3_1 wrt ..imagebase
+	DD	$L$SEH_end_gcm_ghash_ssse3_6 wrt ..imagebase
+	DD	$L$SEH_info_gcm_ghash_ssse3_0 wrt ..imagebase
+
+
+section	.xdata rdata align=8
+ALIGN	4
+$L$SEH_info_gcm_gmult_ssse3_0:
+	DB	1
+	DB	$L$SEH_prolog_gcm_gmult_ssse3_4-$L$SEH_begin_gcm_gmult_ssse3_1
+	DB	5
+	DB	0
+	DB	$L$SEH_prolog_gcm_gmult_ssse3_4-$L$SEH_begin_gcm_gmult_ssse3_1
+	DB	168
+	DW	1
+	DB	$L$SEH_prolog_gcm_gmult_ssse3_3-$L$SEH_begin_gcm_gmult_ssse3_1
+	DB	104
+	DW	0
+	DB	$L$SEH_prolog_gcm_gmult_ssse3_2-$L$SEH_begin_gcm_gmult_ssse3_1
+	DB	66
+
+$L$SEH_info_gcm_ghash_ssse3_0:
+	DB	1
+	DB	$L$SEH_prolog_gcm_ghash_ssse3_5-$L$SEH_begin_gcm_ghash_ssse3_1
+	DB	7
+	DB	0
+	DB	$L$SEH_prolog_gcm_ghash_ssse3_5-$L$SEH_begin_gcm_ghash_ssse3_1
+	DB	184
+	DW	2
+	DB	$L$SEH_prolog_gcm_ghash_ssse3_4-$L$SEH_begin_gcm_ghash_ssse3_1
+	DB	168
+	DW	1
+	DB	$L$SEH_prolog_gcm_ghash_ssse3_3-$L$SEH_begin_gcm_ghash_ssse3_1
+	DB	104
+	DW	0
+	DB	$L$SEH_prolog_gcm_ghash_ssse3_2-$L$SEH_begin_gcm_ghash_ssse3_1
+	DB	98
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/ghash-x86-apple.S b/gen/bcm/ghash-x86-apple.S
new file mode 100644
index 0000000..a178b74
--- /dev/null
+++ b/gen/bcm/ghash-x86-apple.S
@@ -0,0 +1,322 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
+.text
+.globl	_gcm_init_clmul
+.private_extern	_gcm_init_clmul
+.align	4
+_gcm_init_clmul:
+L_gcm_init_clmul_begin:
+	movl	4(%esp),%edx
+	movl	8(%esp),%eax
+	call	L000pic
+L000pic:
+	popl	%ecx
+	leal	Lbswap-L000pic(%ecx),%ecx
+	movdqu	(%eax),%xmm2
+	pshufd	$78,%xmm2,%xmm2
+	pshufd	$255,%xmm2,%xmm4
+	movdqa	%xmm2,%xmm3
+	psllq	$1,%xmm2
+	pxor	%xmm5,%xmm5
+	psrlq	$63,%xmm3
+	pcmpgtd	%xmm4,%xmm5
+	pslldq	$8,%xmm3
+	por	%xmm3,%xmm2
+	pand	16(%ecx),%xmm5
+	pxor	%xmm5,%xmm2
+	movdqa	%xmm2,%xmm0
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pshufd	$78,%xmm2,%xmm4
+	pxor	%xmm0,%xmm3
+	pxor	%xmm2,%xmm4
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,220,0
+	xorps	%xmm0,%xmm3
+	xorps	%xmm1,%xmm3
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	pshufd	$78,%xmm2,%xmm3
+	pshufd	$78,%xmm0,%xmm4
+	pxor	%xmm2,%xmm3
+	movdqu	%xmm2,(%edx)
+	pxor	%xmm0,%xmm4
+	movdqu	%xmm0,16(%edx)
+.byte	102,15,58,15,227,8
+	movdqu	%xmm4,32(%edx)
+	ret
+.globl	_gcm_gmult_clmul
+.private_extern	_gcm_gmult_clmul
+.align	4
+_gcm_gmult_clmul:
+L_gcm_gmult_clmul_begin:
+	movl	4(%esp),%eax
+	movl	8(%esp),%edx
+	call	L001pic
+L001pic:
+	popl	%ecx
+	leal	Lbswap-L001pic(%ecx),%ecx
+	movdqu	(%eax),%xmm0
+	movdqa	(%ecx),%xmm5
+	movups	(%edx),%xmm2
+.byte	102,15,56,0,197
+	movups	32(%edx),%xmm4
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pxor	%xmm0,%xmm3
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,220,0
+	xorps	%xmm0,%xmm3
+	xorps	%xmm1,%xmm3
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+.byte	102,15,56,0,197
+	movdqu	%xmm0,(%eax)
+	ret
+.globl	_gcm_ghash_clmul
+.private_extern	_gcm_ghash_clmul
+.align	4
+_gcm_ghash_clmul:
+L_gcm_ghash_clmul_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%eax
+	movl	24(%esp),%edx
+	movl	28(%esp),%esi
+	movl	32(%esp),%ebx
+	call	L002pic
+L002pic:
+	popl	%ecx
+	leal	Lbswap-L002pic(%ecx),%ecx
+	movdqu	(%eax),%xmm0
+	movdqa	(%ecx),%xmm5
+	movdqu	(%edx),%xmm2
+.byte	102,15,56,0,197
+	subl	$16,%ebx
+	jz	L003odd_tail
+	movdqu	(%esi),%xmm3
+	movdqu	16(%esi),%xmm6
+.byte	102,15,56,0,221
+.byte	102,15,56,0,245
+	movdqu	32(%edx),%xmm5
+	pxor	%xmm3,%xmm0
+	pshufd	$78,%xmm6,%xmm3
+	movdqa	%xmm6,%xmm7
+	pxor	%xmm6,%xmm3
+	leal	32(%esi),%esi
+.byte	102,15,58,68,242,0
+.byte	102,15,58,68,250,17
+.byte	102,15,58,68,221,0
+	movups	16(%edx),%xmm2
+	nop
+	subl	$32,%ebx
+	jbe	L004even_tail
+	jmp	L005mod_loop
+.align	5,0x90
+L005mod_loop:
+	pshufd	$78,%xmm0,%xmm4
+	movdqa	%xmm0,%xmm1
+	pxor	%xmm0,%xmm4
+	nop
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,229,16
+	movups	(%edx),%xmm2
+	xorps	%xmm6,%xmm0
+	movdqa	(%ecx),%xmm5
+	xorps	%xmm7,%xmm1
+	movdqu	(%esi),%xmm7
+	pxor	%xmm0,%xmm3
+	movdqu	16(%esi),%xmm6
+	pxor	%xmm1,%xmm3
+.byte	102,15,56,0,253
+	pxor	%xmm3,%xmm4
+	movdqa	%xmm4,%xmm3
+	psrldq	$8,%xmm4
+	pslldq	$8,%xmm3
+	pxor	%xmm4,%xmm1
+	pxor	%xmm3,%xmm0
+.byte	102,15,56,0,245
+	pxor	%xmm7,%xmm1
+	movdqa	%xmm6,%xmm7
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+.byte	102,15,58,68,242,0
+	movups	32(%edx),%xmm5
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+	pshufd	$78,%xmm7,%xmm3
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm7,%xmm3
+	pxor	%xmm4,%xmm1
+.byte	102,15,58,68,250,17
+	movups	16(%edx),%xmm2
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+.byte	102,15,58,68,221,0
+	leal	32(%esi),%esi
+	subl	$32,%ebx
+	ja	L005mod_loop
+L004even_tail:
+	pshufd	$78,%xmm0,%xmm4
+	movdqa	%xmm0,%xmm1
+	pxor	%xmm0,%xmm4
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,229,16
+	movdqa	(%ecx),%xmm5
+	xorps	%xmm6,%xmm0
+	xorps	%xmm7,%xmm1
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+	pxor	%xmm3,%xmm4
+	movdqa	%xmm4,%xmm3
+	psrldq	$8,%xmm4
+	pslldq	$8,%xmm3
+	pxor	%xmm4,%xmm1
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	testl	%ebx,%ebx
+	jnz	L006done
+	movups	(%edx),%xmm2
+L003odd_tail:
+	movdqu	(%esi),%xmm3
+.byte	102,15,56,0,221
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pshufd	$78,%xmm2,%xmm4
+	pxor	%xmm0,%xmm3
+	pxor	%xmm2,%xmm4
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,220,0
+	xorps	%xmm0,%xmm3
+	xorps	%xmm1,%xmm3
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+L006done:
+.byte	102,15,56,0,197
+	movdqu	%xmm0,(%eax)
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	6,0x90
+Lbswap:
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194
+.byte	71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67
+.byte	82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112
+.byte	112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62
+.byte	0
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/bcm/ghash-x86-linux.S b/gen/bcm/ghash-x86-linux.S
new file mode 100644
index 0000000..c897efc
--- /dev/null
+++ b/gen/bcm/ghash-x86-linux.S
@@ -0,0 +1,328 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
+.text
+.globl	gcm_init_clmul
+.hidden	gcm_init_clmul
+.type	gcm_init_clmul,@function
+.align	16
+gcm_init_clmul:
+.L_gcm_init_clmul_begin:
+	movl	4(%esp),%edx
+	movl	8(%esp),%eax
+	call	.L000pic
+.L000pic:
+	popl	%ecx
+	leal	.Lbswap-.L000pic(%ecx),%ecx
+	movdqu	(%eax),%xmm2
+	pshufd	$78,%xmm2,%xmm2
+	pshufd	$255,%xmm2,%xmm4
+	movdqa	%xmm2,%xmm3
+	psllq	$1,%xmm2
+	pxor	%xmm5,%xmm5
+	psrlq	$63,%xmm3
+	pcmpgtd	%xmm4,%xmm5
+	pslldq	$8,%xmm3
+	por	%xmm3,%xmm2
+	pand	16(%ecx),%xmm5
+	pxor	%xmm5,%xmm2
+	movdqa	%xmm2,%xmm0
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pshufd	$78,%xmm2,%xmm4
+	pxor	%xmm0,%xmm3
+	pxor	%xmm2,%xmm4
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,220,0
+	xorps	%xmm0,%xmm3
+	xorps	%xmm1,%xmm3
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	pshufd	$78,%xmm2,%xmm3
+	pshufd	$78,%xmm0,%xmm4
+	pxor	%xmm2,%xmm3
+	movdqu	%xmm2,(%edx)
+	pxor	%xmm0,%xmm4
+	movdqu	%xmm0,16(%edx)
+.byte	102,15,58,15,227,8
+	movdqu	%xmm4,32(%edx)
+	ret
+.size	gcm_init_clmul,.-.L_gcm_init_clmul_begin
+.globl	gcm_gmult_clmul
+.hidden	gcm_gmult_clmul
+.type	gcm_gmult_clmul,@function
+.align	16
+gcm_gmult_clmul:
+.L_gcm_gmult_clmul_begin:
+	movl	4(%esp),%eax
+	movl	8(%esp),%edx
+	call	.L001pic
+.L001pic:
+	popl	%ecx
+	leal	.Lbswap-.L001pic(%ecx),%ecx
+	movdqu	(%eax),%xmm0
+	movdqa	(%ecx),%xmm5
+	movups	(%edx),%xmm2
+.byte	102,15,56,0,197
+	movups	32(%edx),%xmm4
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pxor	%xmm0,%xmm3
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,220,0
+	xorps	%xmm0,%xmm3
+	xorps	%xmm1,%xmm3
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+.byte	102,15,56,0,197
+	movdqu	%xmm0,(%eax)
+	ret
+.size	gcm_gmult_clmul,.-.L_gcm_gmult_clmul_begin
+.globl	gcm_ghash_clmul
+.hidden	gcm_ghash_clmul
+.type	gcm_ghash_clmul,@function
+.align	16
+gcm_ghash_clmul:
+.L_gcm_ghash_clmul_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%eax
+	movl	24(%esp),%edx
+	movl	28(%esp),%esi
+	movl	32(%esp),%ebx
+	call	.L002pic
+.L002pic:
+	popl	%ecx
+	leal	.Lbswap-.L002pic(%ecx),%ecx
+	movdqu	(%eax),%xmm0
+	movdqa	(%ecx),%xmm5
+	movdqu	(%edx),%xmm2
+.byte	102,15,56,0,197
+	subl	$16,%ebx
+	jz	.L003odd_tail
+	movdqu	(%esi),%xmm3
+	movdqu	16(%esi),%xmm6
+.byte	102,15,56,0,221
+.byte	102,15,56,0,245
+	movdqu	32(%edx),%xmm5
+	pxor	%xmm3,%xmm0
+	pshufd	$78,%xmm6,%xmm3
+	movdqa	%xmm6,%xmm7
+	pxor	%xmm6,%xmm3
+	leal	32(%esi),%esi
+.byte	102,15,58,68,242,0
+.byte	102,15,58,68,250,17
+.byte	102,15,58,68,221,0
+	movups	16(%edx),%xmm2
+	nop
+	subl	$32,%ebx
+	jbe	.L004even_tail
+	jmp	.L005mod_loop
+.align	32
+.L005mod_loop:
+	pshufd	$78,%xmm0,%xmm4
+	movdqa	%xmm0,%xmm1
+	pxor	%xmm0,%xmm4
+	nop
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,229,16
+	movups	(%edx),%xmm2
+	xorps	%xmm6,%xmm0
+	movdqa	(%ecx),%xmm5
+	xorps	%xmm7,%xmm1
+	movdqu	(%esi),%xmm7
+	pxor	%xmm0,%xmm3
+	movdqu	16(%esi),%xmm6
+	pxor	%xmm1,%xmm3
+.byte	102,15,56,0,253
+	pxor	%xmm3,%xmm4
+	movdqa	%xmm4,%xmm3
+	psrldq	$8,%xmm4
+	pslldq	$8,%xmm3
+	pxor	%xmm4,%xmm1
+	pxor	%xmm3,%xmm0
+.byte	102,15,56,0,245
+	pxor	%xmm7,%xmm1
+	movdqa	%xmm6,%xmm7
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+.byte	102,15,58,68,242,0
+	movups	32(%edx),%xmm5
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+	pshufd	$78,%xmm7,%xmm3
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm7,%xmm3
+	pxor	%xmm4,%xmm1
+.byte	102,15,58,68,250,17
+	movups	16(%edx),%xmm2
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+.byte	102,15,58,68,221,0
+	leal	32(%esi),%esi
+	subl	$32,%ebx
+	ja	.L005mod_loop
+.L004even_tail:
+	pshufd	$78,%xmm0,%xmm4
+	movdqa	%xmm0,%xmm1
+	pxor	%xmm0,%xmm4
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,229,16
+	movdqa	(%ecx),%xmm5
+	xorps	%xmm6,%xmm0
+	xorps	%xmm7,%xmm1
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+	pxor	%xmm3,%xmm4
+	movdqa	%xmm4,%xmm3
+	psrldq	$8,%xmm4
+	pslldq	$8,%xmm3
+	pxor	%xmm4,%xmm1
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	testl	%ebx,%ebx
+	jnz	.L006done
+	movups	(%edx),%xmm2
+.L003odd_tail:
+	movdqu	(%esi),%xmm3
+.byte	102,15,56,0,221
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pshufd	$78,%xmm2,%xmm4
+	pxor	%xmm0,%xmm3
+	pxor	%xmm2,%xmm4
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,220,0
+	xorps	%xmm0,%xmm3
+	xorps	%xmm1,%xmm3
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+.L006done:
+.byte	102,15,56,0,197
+	movdqu	%xmm0,(%eax)
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	gcm_ghash_clmul,.-.L_gcm_ghash_clmul_begin
+.align	64
+.Lbswap:
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194
+.byte	71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67
+.byte	82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112
+.byte	112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62
+.byte	0
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
diff --git a/gen/bcm/ghash-x86-win.asm b/gen/bcm/ghash-x86-win.asm
new file mode 100644
index 0000000..3f6c707
--- /dev/null
+++ b/gen/bcm/ghash-x86-win.asm
@@ -0,0 +1,330 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__, win32
+%ifidn __OUTPUT_FORMAT__,obj
+section	code	use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+$@feat.00 equ 1
+section	.text	code align=64
+%else
+section	.text	code
+%endif
+global	_gcm_init_clmul
+align	16
+_gcm_init_clmul:
+L$_gcm_init_clmul_begin:
+	mov	edx,DWORD [4+esp]
+	mov	eax,DWORD [8+esp]
+	call	L$000pic
+L$000pic:
+	pop	ecx
+	lea	ecx,[(L$bswap-L$000pic)+ecx]
+	movdqu	xmm2,[eax]
+	pshufd	xmm2,xmm2,78
+	pshufd	xmm4,xmm2,255
+	movdqa	xmm3,xmm2
+	psllq	xmm2,1
+	pxor	xmm5,xmm5
+	psrlq	xmm3,63
+	pcmpgtd	xmm5,xmm4
+	pslldq	xmm3,8
+	por	xmm2,xmm3
+	pand	xmm5,[16+ecx]
+	pxor	xmm2,xmm5
+	movdqa	xmm0,xmm2
+	movdqa	xmm1,xmm0
+	pshufd	xmm3,xmm0,78
+	pshufd	xmm4,xmm2,78
+	pxor	xmm3,xmm0
+	pxor	xmm4,xmm2
+db	102,15,58,68,194,0
+db	102,15,58,68,202,17
+db	102,15,58,68,220,0
+	xorps	xmm3,xmm0
+	xorps	xmm3,xmm1
+	movdqa	xmm4,xmm3
+	psrldq	xmm3,8
+	pslldq	xmm4,8
+	pxor	xmm1,xmm3
+	pxor	xmm0,xmm4
+	movdqa	xmm4,xmm0
+	movdqa	xmm3,xmm0
+	psllq	xmm0,5
+	pxor	xmm3,xmm0
+	psllq	xmm0,1
+	pxor	xmm0,xmm3
+	psllq	xmm0,57
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,8
+	psrldq	xmm3,8
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm3
+	movdqa	xmm4,xmm0
+	psrlq	xmm0,1
+	pxor	xmm1,xmm4
+	pxor	xmm4,xmm0
+	psrlq	xmm0,5
+	pxor	xmm0,xmm4
+	psrlq	xmm0,1
+	pxor	xmm0,xmm1
+	pshufd	xmm3,xmm2,78
+	pshufd	xmm4,xmm0,78
+	pxor	xmm3,xmm2
+	movdqu	[edx],xmm2
+	pxor	xmm4,xmm0
+	movdqu	[16+edx],xmm0
+db	102,15,58,15,227,8
+	movdqu	[32+edx],xmm4
+	ret
+global	_gcm_gmult_clmul
+align	16
+_gcm_gmult_clmul:
+L$_gcm_gmult_clmul_begin:
+	mov	eax,DWORD [4+esp]
+	mov	edx,DWORD [8+esp]
+	call	L$001pic
+L$001pic:
+	pop	ecx
+	lea	ecx,[(L$bswap-L$001pic)+ecx]
+	movdqu	xmm0,[eax]
+	movdqa	xmm5,[ecx]
+	movups	xmm2,[edx]
+db	102,15,56,0,197
+	movups	xmm4,[32+edx]
+	movdqa	xmm1,xmm0
+	pshufd	xmm3,xmm0,78
+	pxor	xmm3,xmm0
+db	102,15,58,68,194,0
+db	102,15,58,68,202,17
+db	102,15,58,68,220,0
+	xorps	xmm3,xmm0
+	xorps	xmm3,xmm1
+	movdqa	xmm4,xmm3
+	psrldq	xmm3,8
+	pslldq	xmm4,8
+	pxor	xmm1,xmm3
+	pxor	xmm0,xmm4
+	movdqa	xmm4,xmm0
+	movdqa	xmm3,xmm0
+	psllq	xmm0,5
+	pxor	xmm3,xmm0
+	psllq	xmm0,1
+	pxor	xmm0,xmm3
+	psllq	xmm0,57
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,8
+	psrldq	xmm3,8
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm3
+	movdqa	xmm4,xmm0
+	psrlq	xmm0,1
+	pxor	xmm1,xmm4
+	pxor	xmm4,xmm0
+	psrlq	xmm0,5
+	pxor	xmm0,xmm4
+	psrlq	xmm0,1
+	pxor	xmm0,xmm1
+db	102,15,56,0,197
+	movdqu	[eax],xmm0
+	ret
+global	_gcm_ghash_clmul
+align	16
+_gcm_ghash_clmul:
+L$_gcm_ghash_clmul_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	mov	eax,DWORD [20+esp]
+	mov	edx,DWORD [24+esp]
+	mov	esi,DWORD [28+esp]
+	mov	ebx,DWORD [32+esp]
+	call	L$002pic
+L$002pic:
+	pop	ecx
+	lea	ecx,[(L$bswap-L$002pic)+ecx]
+	movdqu	xmm0,[eax]
+	movdqa	xmm5,[ecx]
+	movdqu	xmm2,[edx]
+db	102,15,56,0,197
+	sub	ebx,16
+	jz	NEAR L$003odd_tail
+	movdqu	xmm3,[esi]
+	movdqu	xmm6,[16+esi]
+db	102,15,56,0,221
+db	102,15,56,0,245
+	movdqu	xmm5,[32+edx]
+	pxor	xmm0,xmm3
+	pshufd	xmm3,xmm6,78
+	movdqa	xmm7,xmm6
+	pxor	xmm3,xmm6
+	lea	esi,[32+esi]
+db	102,15,58,68,242,0
+db	102,15,58,68,250,17
+db	102,15,58,68,221,0
+	movups	xmm2,[16+edx]
+	nop
+	sub	ebx,32
+	jbe	NEAR L$004even_tail
+	jmp	NEAR L$005mod_loop
+align	32
+L$005mod_loop:
+	pshufd	xmm4,xmm0,78
+	movdqa	xmm1,xmm0
+	pxor	xmm4,xmm0
+	nop
+db	102,15,58,68,194,0
+db	102,15,58,68,202,17
+db	102,15,58,68,229,16
+	movups	xmm2,[edx]
+	xorps	xmm0,xmm6
+	movdqa	xmm5,[ecx]
+	xorps	xmm1,xmm7
+	movdqu	xmm7,[esi]
+	pxor	xmm3,xmm0
+	movdqu	xmm6,[16+esi]
+	pxor	xmm3,xmm1
+db	102,15,56,0,253
+	pxor	xmm4,xmm3
+	movdqa	xmm3,xmm4
+	psrldq	xmm4,8
+	pslldq	xmm3,8
+	pxor	xmm1,xmm4
+	pxor	xmm0,xmm3
+db	102,15,56,0,245
+	pxor	xmm1,xmm7
+	movdqa	xmm7,xmm6
+	movdqa	xmm4,xmm0
+	movdqa	xmm3,xmm0
+	psllq	xmm0,5
+	pxor	xmm3,xmm0
+	psllq	xmm0,1
+	pxor	xmm0,xmm3
+db	102,15,58,68,242,0
+	movups	xmm5,[32+edx]
+	psllq	xmm0,57
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,8
+	psrldq	xmm3,8
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm3
+	pshufd	xmm3,xmm7,78
+	movdqa	xmm4,xmm0
+	psrlq	xmm0,1
+	pxor	xmm3,xmm7
+	pxor	xmm1,xmm4
+db	102,15,58,68,250,17
+	movups	xmm2,[16+edx]
+	pxor	xmm4,xmm0
+	psrlq	xmm0,5
+	pxor	xmm0,xmm4
+	psrlq	xmm0,1
+	pxor	xmm0,xmm1
+db	102,15,58,68,221,0
+	lea	esi,[32+esi]
+	sub	ebx,32
+	ja	NEAR L$005mod_loop
+L$004even_tail:
+	pshufd	xmm4,xmm0,78
+	movdqa	xmm1,xmm0
+	pxor	xmm4,xmm0
+db	102,15,58,68,194,0
+db	102,15,58,68,202,17
+db	102,15,58,68,229,16
+	movdqa	xmm5,[ecx]
+	xorps	xmm0,xmm6
+	xorps	xmm1,xmm7
+	pxor	xmm3,xmm0
+	pxor	xmm3,xmm1
+	pxor	xmm4,xmm3
+	movdqa	xmm3,xmm4
+	psrldq	xmm4,8
+	pslldq	xmm3,8
+	pxor	xmm1,xmm4
+	pxor	xmm0,xmm3
+	movdqa	xmm4,xmm0
+	movdqa	xmm3,xmm0
+	psllq	xmm0,5
+	pxor	xmm3,xmm0
+	psllq	xmm0,1
+	pxor	xmm0,xmm3
+	psllq	xmm0,57
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,8
+	psrldq	xmm3,8
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm3
+	movdqa	xmm4,xmm0
+	psrlq	xmm0,1
+	pxor	xmm1,xmm4
+	pxor	xmm4,xmm0
+	psrlq	xmm0,5
+	pxor	xmm0,xmm4
+	psrlq	xmm0,1
+	pxor	xmm0,xmm1
+	test	ebx,ebx
+	jnz	NEAR L$006done
+	movups	xmm2,[edx]
+L$003odd_tail:
+	movdqu	xmm3,[esi]
+db	102,15,56,0,221
+	pxor	xmm0,xmm3
+	movdqa	xmm1,xmm0
+	pshufd	xmm3,xmm0,78
+	pshufd	xmm4,xmm2,78
+	pxor	xmm3,xmm0
+	pxor	xmm4,xmm2
+db	102,15,58,68,194,0
+db	102,15,58,68,202,17
+db	102,15,58,68,220,0
+	xorps	xmm3,xmm0
+	xorps	xmm3,xmm1
+	movdqa	xmm4,xmm3
+	psrldq	xmm3,8
+	pslldq	xmm4,8
+	pxor	xmm1,xmm3
+	pxor	xmm0,xmm4
+	movdqa	xmm4,xmm0
+	movdqa	xmm3,xmm0
+	psllq	xmm0,5
+	pxor	xmm3,xmm0
+	psllq	xmm0,1
+	pxor	xmm0,xmm3
+	psllq	xmm0,57
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,8
+	psrldq	xmm3,8
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm3
+	movdqa	xmm4,xmm0
+	psrlq	xmm0,1
+	pxor	xmm1,xmm4
+	pxor	xmm4,xmm0
+	psrlq	xmm0,5
+	pxor	xmm0,xmm4
+	psrlq	xmm0,1
+	pxor	xmm0,xmm1
+L$006done:
+db	102,15,56,0,197
+	movdqu	[eax],xmm0
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+align	64
+L$bswap:
+db	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+db	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194
+db	71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67
+db	82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112
+db	112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62
+db	0
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/ghash-x86_64-apple.S b/gen/bcm/ghash-x86_64-apple.S
new file mode 100644
index 0000000..909d659
--- /dev/null
+++ b/gen/bcm/ghash-x86_64-apple.S
@@ -0,0 +1,1125 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text	
+.globl	_gcm_init_clmul
+.private_extern _gcm_init_clmul
+
+.p2align	4
+_gcm_init_clmul:
+
+
+_CET_ENDBR
+L$_init_clmul:
+	movdqu	(%rsi),%xmm2
+	pshufd	$78,%xmm2,%xmm2
+
+
+	pshufd	$255,%xmm2,%xmm4
+	movdqa	%xmm2,%xmm3
+	psllq	$1,%xmm2
+	pxor	%xmm5,%xmm5
+	psrlq	$63,%xmm3
+	pcmpgtd	%xmm4,%xmm5
+	pslldq	$8,%xmm3
+	por	%xmm3,%xmm2
+
+
+	pand	L$0x1c2_polynomial(%rip),%xmm5
+	pxor	%xmm5,%xmm2
+
+
+	pshufd	$78,%xmm2,%xmm6
+	movdqa	%xmm2,%xmm0
+	pxor	%xmm2,%xmm6
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pxor	%xmm0,%xmm3
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,222,0
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+
+
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	pshufd	$78,%xmm2,%xmm3
+	pshufd	$78,%xmm0,%xmm4
+	pxor	%xmm2,%xmm3
+	movdqu	%xmm2,0(%rdi)
+	pxor	%xmm0,%xmm4
+	movdqu	%xmm0,16(%rdi)
+.byte	102,15,58,15,227,8
+	movdqu	%xmm4,32(%rdi)
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pxor	%xmm0,%xmm3
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,222,0
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+
+
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm0,%xmm5
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pxor	%xmm0,%xmm3
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,222,0
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+
+
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	pshufd	$78,%xmm5,%xmm3
+	pshufd	$78,%xmm0,%xmm4
+	pxor	%xmm5,%xmm3
+	movdqu	%xmm5,48(%rdi)
+	pxor	%xmm0,%xmm4
+	movdqu	%xmm0,64(%rdi)
+.byte	102,15,58,15,227,8
+	movdqu	%xmm4,80(%rdi)
+	ret
+
+
+
+.globl	_gcm_gmult_clmul
+.private_extern _gcm_gmult_clmul
+
+.p2align	4
+_gcm_gmult_clmul:
+
+_CET_ENDBR
+L$_gmult_clmul:
+	movdqu	(%rdi),%xmm0
+	movdqa	L$bswap_mask(%rip),%xmm5
+	movdqu	(%rsi),%xmm2
+	movdqu	32(%rsi),%xmm4
+.byte	102,15,56,0,197
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pxor	%xmm0,%xmm3
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,220,0
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+
+
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+.byte	102,15,56,0,197
+	movdqu	%xmm0,(%rdi)
+	ret
+
+
+.globl	_gcm_ghash_clmul
+.private_extern _gcm_ghash_clmul
+
+.p2align	5
+_gcm_ghash_clmul:
+
+
+_CET_ENDBR
+L$_ghash_clmul:
+	movdqa	L$bswap_mask(%rip),%xmm10
+
+	movdqu	(%rdi),%xmm0
+	movdqu	(%rsi),%xmm2
+	movdqu	32(%rsi),%xmm7
+.byte	102,65,15,56,0,194
+
+	subq	$0x10,%rcx
+	jz	L$odd_tail
+
+	movdqu	16(%rsi),%xmm6
+	cmpq	$0x30,%rcx
+	jb	L$skip4x
+
+	subq	$0x30,%rcx
+	movq	$0xA040608020C0E000,%rax
+	movdqu	48(%rsi),%xmm14
+	movdqu	64(%rsi),%xmm15
+
+
+
+
+	movdqu	48(%rdx),%xmm3
+	movdqu	32(%rdx),%xmm11
+.byte	102,65,15,56,0,218
+.byte	102,69,15,56,0,218
+	movdqa	%xmm3,%xmm5
+	pshufd	$78,%xmm3,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,68,218,0
+.byte	102,15,58,68,234,17
+.byte	102,15,58,68,231,0
+
+	movdqa	%xmm11,%xmm13
+	pshufd	$78,%xmm11,%xmm12
+	pxor	%xmm11,%xmm12
+.byte	102,68,15,58,68,222,0
+.byte	102,68,15,58,68,238,17
+.byte	102,68,15,58,68,231,16
+	xorps	%xmm11,%xmm3
+	xorps	%xmm13,%xmm5
+	movups	80(%rsi),%xmm7
+	xorps	%xmm12,%xmm4
+
+	movdqu	16(%rdx),%xmm11
+	movdqu	0(%rdx),%xmm8
+.byte	102,69,15,56,0,218
+.byte	102,69,15,56,0,194
+	movdqa	%xmm11,%xmm13
+	pshufd	$78,%xmm11,%xmm12
+	pxor	%xmm8,%xmm0
+	pxor	%xmm11,%xmm12
+.byte	102,69,15,58,68,222,0
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm8
+	pxor	%xmm0,%xmm8
+.byte	102,69,15,58,68,238,17
+.byte	102,68,15,58,68,231,0
+	xorps	%xmm11,%xmm3
+	xorps	%xmm13,%xmm5
+
+	leaq	64(%rdx),%rdx
+	subq	$0x40,%rcx
+	jc	L$tail4x
+
+	jmp	L$mod4_loop
+.p2align	5
+L$mod4_loop:
+.byte	102,65,15,58,68,199,0
+	xorps	%xmm12,%xmm4
+	movdqu	48(%rdx),%xmm11
+.byte	102,69,15,56,0,218
+.byte	102,65,15,58,68,207,17
+	xorps	%xmm3,%xmm0
+	movdqu	32(%rdx),%xmm3
+	movdqa	%xmm11,%xmm13
+.byte	102,68,15,58,68,199,16
+	pshufd	$78,%xmm11,%xmm12
+	xorps	%xmm5,%xmm1
+	pxor	%xmm11,%xmm12
+.byte	102,65,15,56,0,218
+	movups	32(%rsi),%xmm7
+	xorps	%xmm4,%xmm8
+.byte	102,68,15,58,68,218,0
+	pshufd	$78,%xmm3,%xmm4
+
+	pxor	%xmm0,%xmm8
+	movdqa	%xmm3,%xmm5
+	pxor	%xmm1,%xmm8
+	pxor	%xmm3,%xmm4
+	movdqa	%xmm8,%xmm9
+.byte	102,68,15,58,68,234,17
+	pslldq	$8,%xmm8
+	psrldq	$8,%xmm9
+	pxor	%xmm8,%xmm0
+	movdqa	L$7_mask(%rip),%xmm8
+	pxor	%xmm9,%xmm1
+.byte	102,76,15,110,200
+
+	pand	%xmm0,%xmm8
+.byte	102,69,15,56,0,200
+	pxor	%xmm0,%xmm9
+.byte	102,68,15,58,68,231,0
+	psllq	$57,%xmm9
+	movdqa	%xmm9,%xmm8
+	pslldq	$8,%xmm9
+.byte	102,15,58,68,222,0
+	psrldq	$8,%xmm8
+	pxor	%xmm9,%xmm0
+	pxor	%xmm8,%xmm1
+	movdqu	0(%rdx),%xmm8
+
+	movdqa	%xmm0,%xmm9
+	psrlq	$1,%xmm0
+.byte	102,15,58,68,238,17
+	xorps	%xmm11,%xmm3
+	movdqu	16(%rdx),%xmm11
+.byte	102,69,15,56,0,218
+.byte	102,15,58,68,231,16
+	xorps	%xmm13,%xmm5
+	movups	80(%rsi),%xmm7
+.byte	102,69,15,56,0,194
+	pxor	%xmm9,%xmm1
+	pxor	%xmm0,%xmm9
+	psrlq	$5,%xmm0
+
+	movdqa	%xmm11,%xmm13
+	pxor	%xmm12,%xmm4
+	pshufd	$78,%xmm11,%xmm12
+	pxor	%xmm9,%xmm0
+	pxor	%xmm8,%xmm1
+	pxor	%xmm11,%xmm12
+.byte	102,69,15,58,68,222,0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm0,%xmm1
+.byte	102,69,15,58,68,238,17
+	xorps	%xmm11,%xmm3
+	pshufd	$78,%xmm0,%xmm8
+	pxor	%xmm0,%xmm8
+
+.byte	102,68,15,58,68,231,0
+	xorps	%xmm13,%xmm5
+
+	leaq	64(%rdx),%rdx
+	subq	$0x40,%rcx
+	jnc	L$mod4_loop
+
+L$tail4x:
+.byte	102,65,15,58,68,199,0
+.byte	102,65,15,58,68,207,17
+.byte	102,68,15,58,68,199,16
+	xorps	%xmm12,%xmm4
+	xorps	%xmm3,%xmm0
+	xorps	%xmm5,%xmm1
+	pxor	%xmm0,%xmm1
+	pxor	%xmm4,%xmm8
+
+	pxor	%xmm1,%xmm8
+	pxor	%xmm0,%xmm1
+
+	movdqa	%xmm8,%xmm9
+	psrldq	$8,%xmm8
+	pslldq	$8,%xmm9
+	pxor	%xmm8,%xmm1
+	pxor	%xmm9,%xmm0
+
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+
+
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	addq	$0x40,%rcx
+	jz	L$done
+	movdqu	32(%rsi),%xmm7
+	subq	$0x10,%rcx
+	jz	L$odd_tail
+L$skip4x:
+
+
+
+
+
+	movdqu	(%rdx),%xmm8
+	movdqu	16(%rdx),%xmm3
+.byte	102,69,15,56,0,194
+.byte	102,65,15,56,0,218
+	pxor	%xmm8,%xmm0
+
+	movdqa	%xmm3,%xmm5
+	pshufd	$78,%xmm3,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,68,218,0
+.byte	102,15,58,68,234,17
+.byte	102,15,58,68,231,0
+
+	leaq	32(%rdx),%rdx
+	nop
+	subq	$0x20,%rcx
+	jbe	L$even_tail
+	nop
+	jmp	L$mod_loop
+
+.p2align	5
+L$mod_loop:
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm8
+	pshufd	$78,%xmm0,%xmm4
+	pxor	%xmm0,%xmm4
+
+.byte	102,15,58,68,198,0
+.byte	102,15,58,68,206,17
+.byte	102,15,58,68,231,16
+
+	pxor	%xmm3,%xmm0
+	pxor	%xmm5,%xmm1
+	movdqu	(%rdx),%xmm9
+	pxor	%xmm0,%xmm8
+.byte	102,69,15,56,0,202
+	movdqu	16(%rdx),%xmm3
+
+	pxor	%xmm1,%xmm8
+	pxor	%xmm9,%xmm1
+	pxor	%xmm8,%xmm4
+.byte	102,65,15,56,0,218
+	movdqa	%xmm4,%xmm8
+	psrldq	$8,%xmm8
+	pslldq	$8,%xmm4
+	pxor	%xmm8,%xmm1
+	pxor	%xmm4,%xmm0
+
+	movdqa	%xmm3,%xmm5
+
+	movdqa	%xmm0,%xmm9
+	movdqa	%xmm0,%xmm8
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm8
+.byte	102,15,58,68,218,0
+	psllq	$1,%xmm0
+	pxor	%xmm8,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm8
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm8
+	pxor	%xmm9,%xmm0
+	pshufd	$78,%xmm5,%xmm4
+	pxor	%xmm8,%xmm1
+	pxor	%xmm5,%xmm4
+
+	movdqa	%xmm0,%xmm9
+	psrlq	$1,%xmm0
+.byte	102,15,58,68,234,17
+	pxor	%xmm9,%xmm1
+	pxor	%xmm0,%xmm9
+	psrlq	$5,%xmm0
+	pxor	%xmm9,%xmm0
+	leaq	32(%rdx),%rdx
+	psrlq	$1,%xmm0
+.byte	102,15,58,68,231,0
+	pxor	%xmm1,%xmm0
+
+	subq	$0x20,%rcx
+	ja	L$mod_loop
+
+L$even_tail:
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm8
+	pshufd	$78,%xmm0,%xmm4
+	pxor	%xmm0,%xmm4
+
+.byte	102,15,58,68,198,0
+.byte	102,15,58,68,206,17
+.byte	102,15,58,68,231,16
+
+	pxor	%xmm3,%xmm0
+	pxor	%xmm5,%xmm1
+	pxor	%xmm0,%xmm8
+	pxor	%xmm1,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm8
+	psrldq	$8,%xmm8
+	pslldq	$8,%xmm4
+	pxor	%xmm8,%xmm1
+	pxor	%xmm4,%xmm0
+
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+
+
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	testq	%rcx,%rcx
+	jnz	L$done
+
+L$odd_tail:
+	movdqu	(%rdx),%xmm8
+.byte	102,69,15,56,0,194
+	pxor	%xmm8,%xmm0
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pxor	%xmm0,%xmm3
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,223,0
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+
+
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+L$done:
+.byte	102,65,15,56,0,194
+	movdqu	%xmm0,(%rdi)
+	ret
+
+
+
+.globl	_gcm_init_avx
+.private_extern _gcm_init_avx
+
+.p2align	5
+_gcm_init_avx:
+
+_CET_ENDBR
+	vzeroupper
+
+	vmovdqu	(%rsi),%xmm2
+	vpshufd	$78,%xmm2,%xmm2
+
+
+	vpshufd	$255,%xmm2,%xmm4
+	vpsrlq	$63,%xmm2,%xmm3
+	vpsllq	$1,%xmm2,%xmm2
+	vpxor	%xmm5,%xmm5,%xmm5
+	vpcmpgtd	%xmm4,%xmm5,%xmm5
+	vpslldq	$8,%xmm3,%xmm3
+	vpor	%xmm3,%xmm2,%xmm2
+
+
+	vpand	L$0x1c2_polynomial(%rip),%xmm5,%xmm5
+	vpxor	%xmm5,%xmm2,%xmm2
+
+	vpunpckhqdq	%xmm2,%xmm2,%xmm6
+	vmovdqa	%xmm2,%xmm0
+	vpxor	%xmm2,%xmm6,%xmm6
+	movq	$4,%r10
+	jmp	L$init_start_avx
+.p2align	5
+L$init_loop_avx:
+	vpalignr	$8,%xmm3,%xmm4,%xmm5
+	vmovdqu	%xmm5,-16(%rdi)
+	vpunpckhqdq	%xmm0,%xmm0,%xmm3
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
+	vpxor	%xmm0,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+
+	vpslldq	$8,%xmm3,%xmm4
+	vpsrldq	$8,%xmm3,%xmm3
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpsllq	$57,%xmm0,%xmm3
+	vpsllq	$62,%xmm0,%xmm4
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpsllq	$63,%xmm0,%xmm3
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpslldq	$8,%xmm4,%xmm3
+	vpsrldq	$8,%xmm4,%xmm4
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vpsrlq	$1,%xmm0,%xmm4
+	vpxor	%xmm0,%xmm1,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$5,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$1,%xmm0,%xmm0
+	vpxor	%xmm1,%xmm0,%xmm0
+L$init_start_avx:
+	vmovdqa	%xmm0,%xmm5
+	vpunpckhqdq	%xmm0,%xmm0,%xmm3
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
+	vpxor	%xmm0,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+
+	vpslldq	$8,%xmm3,%xmm4
+	vpsrldq	$8,%xmm3,%xmm3
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpsllq	$57,%xmm0,%xmm3
+	vpsllq	$62,%xmm0,%xmm4
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpsllq	$63,%xmm0,%xmm3
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpslldq	$8,%xmm4,%xmm3
+	vpsrldq	$8,%xmm4,%xmm4
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vpsrlq	$1,%xmm0,%xmm4
+	vpxor	%xmm0,%xmm1,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$5,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$1,%xmm0,%xmm0
+	vpxor	%xmm1,%xmm0,%xmm0
+	vpshufd	$78,%xmm5,%xmm3
+	vpshufd	$78,%xmm0,%xmm4
+	vpxor	%xmm5,%xmm3,%xmm3
+	vmovdqu	%xmm5,0(%rdi)
+	vpxor	%xmm0,%xmm4,%xmm4
+	vmovdqu	%xmm0,16(%rdi)
+	leaq	48(%rdi),%rdi
+	subq	$1,%r10
+	jnz	L$init_loop_avx
+
+	vpalignr	$8,%xmm4,%xmm3,%xmm5
+	vmovdqu	%xmm5,-16(%rdi)
+
+	vzeroupper
+	ret
+
+
+
+.globl	_gcm_gmult_avx
+.private_extern _gcm_gmult_avx
+
+.p2align	5
+_gcm_gmult_avx:
+
+_CET_ENDBR
+	jmp	L$_gmult_clmul
+
+
+.globl	_gcm_ghash_avx
+.private_extern _gcm_ghash_avx
+
+.p2align	5
+_gcm_ghash_avx:
+
+_CET_ENDBR
+	vzeroupper
+
+	vmovdqu	(%rdi),%xmm10
+	leaq	L$0x1c2_polynomial(%rip),%r10
+	leaq	64(%rsi),%rsi
+	vmovdqu	L$bswap_mask(%rip),%xmm13
+	vpshufb	%xmm13,%xmm10,%xmm10
+	cmpq	$0x80,%rcx
+	jb	L$short_avx
+	subq	$0x80,%rcx
+
+	vmovdqu	112(%rdx),%xmm14
+	vmovdqu	0-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vmovdqu	32-64(%rsi),%xmm7
+
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vmovdqu	96(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	16-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vmovdqu	80(%rdx),%xmm14
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	48-64(%rsi),%xmm6
+	vpxor	%xmm14,%xmm9,%xmm9
+	vmovdqu	64(%rdx),%xmm15
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	80-64(%rsi),%xmm7
+
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	64-64(%rsi),%xmm6
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	48(%rdx),%xmm14
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	96-64(%rsi),%xmm6
+	vpxor	%xmm5,%xmm2,%xmm2
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	128-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+
+	vmovdqu	32(%rdx),%xmm15
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	112-64(%rsi),%xmm6
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	16(%rdx),%xmm14
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	144-64(%rsi),%xmm6
+	vpxor	%xmm5,%xmm2,%xmm2
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	176-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+
+	vmovdqu	(%rdx),%xmm15
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	160-64(%rsi),%xmm6
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
+
+	leaq	128(%rdx),%rdx
+	cmpq	$0x80,%rcx
+	jb	L$tail_avx
+
+	vpxor	%xmm10,%xmm15,%xmm15
+	subq	$0x80,%rcx
+	jmp	L$oop8x_avx
+
+.p2align	5
+L$oop8x_avx:
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vmovdqu	112(%rdx),%xmm14
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpxor	%xmm15,%xmm8,%xmm8
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm10
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm11
+	vmovdqu	0-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm12
+	vmovdqu	32-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+
+	vmovdqu	96(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm3,%xmm10,%xmm10
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vxorps	%xmm4,%xmm11,%xmm11
+	vmovdqu	16-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm5,%xmm12,%xmm12
+	vxorps	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	80(%rdx),%xmm14
+	vpxor	%xmm10,%xmm12,%xmm12
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpxor	%xmm11,%xmm12,%xmm12
+	vpslldq	$8,%xmm12,%xmm9
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vpsrldq	$8,%xmm12,%xmm12
+	vpxor	%xmm9,%xmm10,%xmm10
+	vmovdqu	48-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vxorps	%xmm12,%xmm11,%xmm11
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	80-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	64(%rdx),%xmm15
+	vpalignr	$8,%xmm10,%xmm10,%xmm12
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	64-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vxorps	%xmm15,%xmm8,%xmm8
+	vpxor	%xmm5,%xmm2,%xmm2
+
+	vmovdqu	48(%rdx),%xmm14
+	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	96-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	128-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	32(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	112-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+	vpxor	%xmm5,%xmm2,%xmm2
+	vxorps	%xmm12,%xmm10,%xmm10
+
+	vmovdqu	16(%rdx),%xmm14
+	vpalignr	$8,%xmm10,%xmm10,%xmm12
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	144-64(%rsi),%xmm6
+	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
+	vxorps	%xmm11,%xmm12,%xmm12
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	176-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	160-64(%rsi),%xmm6
+	vpxor	%xmm12,%xmm15,%xmm15
+	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm10,%xmm15,%xmm15
+
+	leaq	128(%rdx),%rdx
+	subq	$0x80,%rcx
+	jnc	L$oop8x_avx
+
+	addq	$0x80,%rcx
+	jmp	L$tail_no_xor_avx
+
+.p2align	5
+L$short_avx:
+	vmovdqu	-16(%rdx,%rcx,1),%xmm14
+	leaq	(%rdx,%rcx,1),%rdx
+	vmovdqu	0-64(%rsi),%xmm6
+	vmovdqu	32-64(%rsi),%xmm7
+	vpshufb	%xmm13,%xmm14,%xmm15
+
+	vmovdqa	%xmm0,%xmm3
+	vmovdqa	%xmm1,%xmm4
+	vmovdqa	%xmm2,%xmm5
+	subq	$0x10,%rcx
+	jz	L$tail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-32(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	16-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vpsrldq	$8,%xmm7,%xmm7
+	subq	$0x10,%rcx
+	jz	L$tail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-48(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	48-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vmovdqu	80-64(%rsi),%xmm7
+	subq	$0x10,%rcx
+	jz	L$tail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-64(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	64-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vpsrldq	$8,%xmm7,%xmm7
+	subq	$0x10,%rcx
+	jz	L$tail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-80(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	96-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vmovdqu	128-64(%rsi),%xmm7
+	subq	$0x10,%rcx
+	jz	L$tail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-96(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	112-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vpsrldq	$8,%xmm7,%xmm7
+	subq	$0x10,%rcx
+	jz	L$tail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-112(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	144-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vmovq	184-64(%rsi),%xmm7
+	subq	$0x10,%rcx
+	jmp	L$tail_avx
+
+.p2align	5
+L$tail_avx:
+	vpxor	%xmm10,%xmm15,%xmm15
+L$tail_no_xor_avx:
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+
+	vmovdqu	(%r10),%xmm12
+
+	vpxor	%xmm0,%xmm3,%xmm10
+	vpxor	%xmm1,%xmm4,%xmm11
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vpxor	%xmm10,%xmm5,%xmm5
+	vpxor	%xmm11,%xmm5,%xmm5
+	vpslldq	$8,%xmm5,%xmm9
+	vpsrldq	$8,%xmm5,%xmm5
+	vpxor	%xmm9,%xmm10,%xmm10
+	vpxor	%xmm5,%xmm11,%xmm11
+
+	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
+	vpalignr	$8,%xmm10,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm10,%xmm10
+
+	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
+	vpalignr	$8,%xmm10,%xmm10,%xmm10
+	vpxor	%xmm11,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm10,%xmm10
+
+	cmpq	$0,%rcx
+	jne	L$short_avx
+
+	vpshufb	%xmm13,%xmm10,%xmm10
+	vmovdqu	%xmm10,(%rdi)
+	vzeroupper
+	ret
+
+
+
+.section	__DATA,__const
+.p2align	6
+L$bswap_mask:
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+L$0x1c2_polynomial:
+.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+L$7_mask:
+.long	7,0,7,0
+.p2align	6
+
+.byte	71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.p2align	6
+.text	
+#endif
diff --git a/gen/bcm/ghash-x86_64-linux.S b/gen/bcm/ghash-x86_64-linux.S
new file mode 100644
index 0000000..22429a6
--- /dev/null
+++ b/gen/bcm/ghash-x86_64-linux.S
@@ -0,0 +1,1125 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text	
+.globl	gcm_init_clmul
+.hidden gcm_init_clmul
+.type	gcm_init_clmul,@function
+.align	16
+gcm_init_clmul:
+.cfi_startproc	
+
+_CET_ENDBR
+.L_init_clmul:
+	movdqu	(%rsi),%xmm2
+	pshufd	$78,%xmm2,%xmm2
+
+
+	pshufd	$255,%xmm2,%xmm4
+	movdqa	%xmm2,%xmm3
+	psllq	$1,%xmm2
+	pxor	%xmm5,%xmm5
+	psrlq	$63,%xmm3
+	pcmpgtd	%xmm4,%xmm5
+	pslldq	$8,%xmm3
+	por	%xmm3,%xmm2
+
+
+	pand	.L0x1c2_polynomial(%rip),%xmm5
+	pxor	%xmm5,%xmm2
+
+
+	pshufd	$78,%xmm2,%xmm6
+	movdqa	%xmm2,%xmm0
+	pxor	%xmm2,%xmm6
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pxor	%xmm0,%xmm3
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,222,0
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+
+
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	pshufd	$78,%xmm2,%xmm3
+	pshufd	$78,%xmm0,%xmm4
+	pxor	%xmm2,%xmm3
+	movdqu	%xmm2,0(%rdi)
+	pxor	%xmm0,%xmm4
+	movdqu	%xmm0,16(%rdi)
+.byte	102,15,58,15,227,8
+	movdqu	%xmm4,32(%rdi)
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pxor	%xmm0,%xmm3
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,222,0
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+
+
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm0,%xmm5
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pxor	%xmm0,%xmm3
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,222,0
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+
+
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	pshufd	$78,%xmm5,%xmm3
+	pshufd	$78,%xmm0,%xmm4
+	pxor	%xmm5,%xmm3
+	movdqu	%xmm5,48(%rdi)
+	pxor	%xmm0,%xmm4
+	movdqu	%xmm0,64(%rdi)
+.byte	102,15,58,15,227,8
+	movdqu	%xmm4,80(%rdi)
+	ret
+.cfi_endproc	
+
+.size	gcm_init_clmul,.-gcm_init_clmul
+.globl	gcm_gmult_clmul
+.hidden gcm_gmult_clmul
+.type	gcm_gmult_clmul,@function
+.align	16
+gcm_gmult_clmul:
+.cfi_startproc	
+_CET_ENDBR
+.L_gmult_clmul:
+	movdqu	(%rdi),%xmm0
+	movdqa	.Lbswap_mask(%rip),%xmm5
+	movdqu	(%rsi),%xmm2
+	movdqu	32(%rsi),%xmm4
+.byte	102,15,56,0,197
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pxor	%xmm0,%xmm3
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,220,0
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+
+
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+.byte	102,15,56,0,197
+	movdqu	%xmm0,(%rdi)
+	ret
+.cfi_endproc	
+.size	gcm_gmult_clmul,.-gcm_gmult_clmul
+.globl	gcm_ghash_clmul
+.hidden gcm_ghash_clmul
+.type	gcm_ghash_clmul,@function
+.align	32
+gcm_ghash_clmul:
+.cfi_startproc	
+
+_CET_ENDBR
+.L_ghash_clmul:
+	movdqa	.Lbswap_mask(%rip),%xmm10
+
+	movdqu	(%rdi),%xmm0
+	movdqu	(%rsi),%xmm2
+	movdqu	32(%rsi),%xmm7
+.byte	102,65,15,56,0,194
+
+	subq	$0x10,%rcx
+	jz	.Lodd_tail
+
+	movdqu	16(%rsi),%xmm6
+	cmpq	$0x30,%rcx
+	jb	.Lskip4x
+
+	subq	$0x30,%rcx
+	movq	$0xA040608020C0E000,%rax
+	movdqu	48(%rsi),%xmm14
+	movdqu	64(%rsi),%xmm15
+
+
+
+
+	movdqu	48(%rdx),%xmm3
+	movdqu	32(%rdx),%xmm11
+.byte	102,65,15,56,0,218
+.byte	102,69,15,56,0,218
+	movdqa	%xmm3,%xmm5
+	pshufd	$78,%xmm3,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,68,218,0
+.byte	102,15,58,68,234,17
+.byte	102,15,58,68,231,0
+
+	movdqa	%xmm11,%xmm13
+	pshufd	$78,%xmm11,%xmm12
+	pxor	%xmm11,%xmm12
+.byte	102,68,15,58,68,222,0
+.byte	102,68,15,58,68,238,17
+.byte	102,68,15,58,68,231,16
+	xorps	%xmm11,%xmm3
+	xorps	%xmm13,%xmm5
+	movups	80(%rsi),%xmm7
+	xorps	%xmm12,%xmm4
+
+	movdqu	16(%rdx),%xmm11
+	movdqu	0(%rdx),%xmm8
+.byte	102,69,15,56,0,218
+.byte	102,69,15,56,0,194
+	movdqa	%xmm11,%xmm13
+	pshufd	$78,%xmm11,%xmm12
+	pxor	%xmm8,%xmm0
+	pxor	%xmm11,%xmm12
+.byte	102,69,15,58,68,222,0
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm8
+	pxor	%xmm0,%xmm8
+.byte	102,69,15,58,68,238,17
+.byte	102,68,15,58,68,231,0
+	xorps	%xmm11,%xmm3
+	xorps	%xmm13,%xmm5
+
+	leaq	64(%rdx),%rdx
+	subq	$0x40,%rcx
+	jc	.Ltail4x
+
+	jmp	.Lmod4_loop
+.align	32
+.Lmod4_loop:
+.byte	102,65,15,58,68,199,0
+	xorps	%xmm12,%xmm4
+	movdqu	48(%rdx),%xmm11
+.byte	102,69,15,56,0,218
+.byte	102,65,15,58,68,207,17
+	xorps	%xmm3,%xmm0
+	movdqu	32(%rdx),%xmm3
+	movdqa	%xmm11,%xmm13
+.byte	102,68,15,58,68,199,16
+	pshufd	$78,%xmm11,%xmm12
+	xorps	%xmm5,%xmm1
+	pxor	%xmm11,%xmm12
+.byte	102,65,15,56,0,218
+	movups	32(%rsi),%xmm7
+	xorps	%xmm4,%xmm8
+.byte	102,68,15,58,68,218,0
+	pshufd	$78,%xmm3,%xmm4
+
+	pxor	%xmm0,%xmm8
+	movdqa	%xmm3,%xmm5
+	pxor	%xmm1,%xmm8
+	pxor	%xmm3,%xmm4
+	movdqa	%xmm8,%xmm9
+.byte	102,68,15,58,68,234,17
+	pslldq	$8,%xmm8
+	psrldq	$8,%xmm9
+	pxor	%xmm8,%xmm0
+	movdqa	.L7_mask(%rip),%xmm8
+	pxor	%xmm9,%xmm1
+.byte	102,76,15,110,200
+
+	pand	%xmm0,%xmm8
+.byte	102,69,15,56,0,200
+	pxor	%xmm0,%xmm9
+.byte	102,68,15,58,68,231,0
+	psllq	$57,%xmm9
+	movdqa	%xmm9,%xmm8
+	pslldq	$8,%xmm9
+.byte	102,15,58,68,222,0
+	psrldq	$8,%xmm8
+	pxor	%xmm9,%xmm0
+	pxor	%xmm8,%xmm1
+	movdqu	0(%rdx),%xmm8
+
+	movdqa	%xmm0,%xmm9
+	psrlq	$1,%xmm0
+.byte	102,15,58,68,238,17
+	xorps	%xmm11,%xmm3
+	movdqu	16(%rdx),%xmm11
+.byte	102,69,15,56,0,218
+.byte	102,15,58,68,231,16
+	xorps	%xmm13,%xmm5
+	movups	80(%rsi),%xmm7
+.byte	102,69,15,56,0,194
+	pxor	%xmm9,%xmm1
+	pxor	%xmm0,%xmm9
+	psrlq	$5,%xmm0
+
+	movdqa	%xmm11,%xmm13
+	pxor	%xmm12,%xmm4
+	pshufd	$78,%xmm11,%xmm12
+	pxor	%xmm9,%xmm0
+	pxor	%xmm8,%xmm1
+	pxor	%xmm11,%xmm12
+.byte	102,69,15,58,68,222,0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm0,%xmm1
+.byte	102,69,15,58,68,238,17
+	xorps	%xmm11,%xmm3
+	pshufd	$78,%xmm0,%xmm8
+	pxor	%xmm0,%xmm8
+
+.byte	102,68,15,58,68,231,0
+	xorps	%xmm13,%xmm5
+
+	leaq	64(%rdx),%rdx
+	subq	$0x40,%rcx
+	jnc	.Lmod4_loop
+
+.Ltail4x:
+.byte	102,65,15,58,68,199,0
+.byte	102,65,15,58,68,207,17
+.byte	102,68,15,58,68,199,16
+	xorps	%xmm12,%xmm4
+	xorps	%xmm3,%xmm0
+	xorps	%xmm5,%xmm1
+	pxor	%xmm0,%xmm1
+	pxor	%xmm4,%xmm8
+
+	pxor	%xmm1,%xmm8
+	pxor	%xmm0,%xmm1
+
+	movdqa	%xmm8,%xmm9
+	psrldq	$8,%xmm8
+	pslldq	$8,%xmm9
+	pxor	%xmm8,%xmm1
+	pxor	%xmm9,%xmm0
+
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+
+
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	addq	$0x40,%rcx
+	jz	.Ldone
+	movdqu	32(%rsi),%xmm7
+	subq	$0x10,%rcx
+	jz	.Lodd_tail
+.Lskip4x:
+
+
+
+
+
+	movdqu	(%rdx),%xmm8
+	movdqu	16(%rdx),%xmm3
+.byte	102,69,15,56,0,194
+.byte	102,65,15,56,0,218
+	pxor	%xmm8,%xmm0
+
+	movdqa	%xmm3,%xmm5
+	pshufd	$78,%xmm3,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,68,218,0
+.byte	102,15,58,68,234,17
+.byte	102,15,58,68,231,0
+
+	leaq	32(%rdx),%rdx
+	nop
+	subq	$0x20,%rcx
+	jbe	.Leven_tail
+	nop
+	jmp	.Lmod_loop
+
+.align	32
+.Lmod_loop:
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm8
+	pshufd	$78,%xmm0,%xmm4
+	pxor	%xmm0,%xmm4
+
+.byte	102,15,58,68,198,0
+.byte	102,15,58,68,206,17
+.byte	102,15,58,68,231,16
+
+	pxor	%xmm3,%xmm0
+	pxor	%xmm5,%xmm1
+	movdqu	(%rdx),%xmm9
+	pxor	%xmm0,%xmm8
+.byte	102,69,15,56,0,202
+	movdqu	16(%rdx),%xmm3
+
+	pxor	%xmm1,%xmm8
+	pxor	%xmm9,%xmm1
+	pxor	%xmm8,%xmm4
+.byte	102,65,15,56,0,218
+	movdqa	%xmm4,%xmm8
+	psrldq	$8,%xmm8
+	pslldq	$8,%xmm4
+	pxor	%xmm8,%xmm1
+	pxor	%xmm4,%xmm0
+
+	movdqa	%xmm3,%xmm5
+
+	movdqa	%xmm0,%xmm9
+	movdqa	%xmm0,%xmm8
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm8
+.byte	102,15,58,68,218,0
+	psllq	$1,%xmm0
+	pxor	%xmm8,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm8
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm8
+	pxor	%xmm9,%xmm0
+	pshufd	$78,%xmm5,%xmm4
+	pxor	%xmm8,%xmm1
+	pxor	%xmm5,%xmm4
+
+	movdqa	%xmm0,%xmm9
+	psrlq	$1,%xmm0
+.byte	102,15,58,68,234,17
+	pxor	%xmm9,%xmm1
+	pxor	%xmm0,%xmm9
+	psrlq	$5,%xmm0
+	pxor	%xmm9,%xmm0
+	leaq	32(%rdx),%rdx
+	psrlq	$1,%xmm0
+.byte	102,15,58,68,231,0
+	pxor	%xmm1,%xmm0
+
+	subq	$0x20,%rcx
+	ja	.Lmod_loop
+
+.Leven_tail:
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm8
+	pshufd	$78,%xmm0,%xmm4
+	pxor	%xmm0,%xmm4
+
+.byte	102,15,58,68,198,0
+.byte	102,15,58,68,206,17
+.byte	102,15,58,68,231,16
+
+	pxor	%xmm3,%xmm0
+	pxor	%xmm5,%xmm1
+	pxor	%xmm0,%xmm8
+	pxor	%xmm1,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm8
+	psrldq	$8,%xmm8
+	pslldq	$8,%xmm4
+	pxor	%xmm8,%xmm1
+	pxor	%xmm4,%xmm0
+
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+
+
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	testq	%rcx,%rcx
+	jnz	.Ldone
+
+.Lodd_tail:
+	movdqu	(%rdx),%xmm8
+.byte	102,69,15,56,0,194
+	pxor	%xmm8,%xmm0
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pxor	%xmm0,%xmm3
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,223,0
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+
+
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+.Ldone:
+.byte	102,65,15,56,0,194
+	movdqu	%xmm0,(%rdi)
+	ret
+.cfi_endproc	
+
+.size	gcm_ghash_clmul,.-gcm_ghash_clmul
+.globl	gcm_init_avx
+.hidden gcm_init_avx
+.type	gcm_init_avx,@function
+.align	32
+gcm_init_avx:
+.cfi_startproc	
+_CET_ENDBR
+	vzeroupper
+
+	vmovdqu	(%rsi),%xmm2
+	vpshufd	$78,%xmm2,%xmm2
+
+
+	vpshufd	$255,%xmm2,%xmm4
+	vpsrlq	$63,%xmm2,%xmm3
+	vpsllq	$1,%xmm2,%xmm2
+	vpxor	%xmm5,%xmm5,%xmm5
+	vpcmpgtd	%xmm4,%xmm5,%xmm5
+	vpslldq	$8,%xmm3,%xmm3
+	vpor	%xmm3,%xmm2,%xmm2
+
+
+	vpand	.L0x1c2_polynomial(%rip),%xmm5,%xmm5
+	vpxor	%xmm5,%xmm2,%xmm2
+
+	vpunpckhqdq	%xmm2,%xmm2,%xmm6
+	vmovdqa	%xmm2,%xmm0
+	vpxor	%xmm2,%xmm6,%xmm6
+	movq	$4,%r10
+	jmp	.Linit_start_avx
+.align	32
+.Linit_loop_avx:
+	vpalignr	$8,%xmm3,%xmm4,%xmm5
+	vmovdqu	%xmm5,-16(%rdi)
+	vpunpckhqdq	%xmm0,%xmm0,%xmm3
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
+	vpxor	%xmm0,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+
+	vpslldq	$8,%xmm3,%xmm4
+	vpsrldq	$8,%xmm3,%xmm3
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpsllq	$57,%xmm0,%xmm3
+	vpsllq	$62,%xmm0,%xmm4
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpsllq	$63,%xmm0,%xmm3
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpslldq	$8,%xmm4,%xmm3
+	vpsrldq	$8,%xmm4,%xmm4
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vpsrlq	$1,%xmm0,%xmm4
+	vpxor	%xmm0,%xmm1,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$5,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$1,%xmm0,%xmm0
+	vpxor	%xmm1,%xmm0,%xmm0
+.Linit_start_avx:
+	vmovdqa	%xmm0,%xmm5
+	vpunpckhqdq	%xmm0,%xmm0,%xmm3
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
+	vpxor	%xmm0,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+
+	vpslldq	$8,%xmm3,%xmm4
+	vpsrldq	$8,%xmm3,%xmm3
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpsllq	$57,%xmm0,%xmm3
+	vpsllq	$62,%xmm0,%xmm4
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpsllq	$63,%xmm0,%xmm3
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpslldq	$8,%xmm4,%xmm3
+	vpsrldq	$8,%xmm4,%xmm4
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vpsrlq	$1,%xmm0,%xmm4
+	vpxor	%xmm0,%xmm1,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$5,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$1,%xmm0,%xmm0
+	vpxor	%xmm1,%xmm0,%xmm0
+	vpshufd	$78,%xmm5,%xmm3
+	vpshufd	$78,%xmm0,%xmm4
+	vpxor	%xmm5,%xmm3,%xmm3
+	vmovdqu	%xmm5,0(%rdi)
+	vpxor	%xmm0,%xmm4,%xmm4
+	vmovdqu	%xmm0,16(%rdi)
+	leaq	48(%rdi),%rdi
+	subq	$1,%r10
+	jnz	.Linit_loop_avx
+
+	vpalignr	$8,%xmm4,%xmm3,%xmm5
+	vmovdqu	%xmm5,-16(%rdi)
+
+	vzeroupper
+	ret
+
+.cfi_endproc	
+.size	gcm_init_avx,.-gcm_init_avx
+.globl	gcm_gmult_avx
+.hidden gcm_gmult_avx
+.type	gcm_gmult_avx,@function
+.align	32
+gcm_gmult_avx:
+.cfi_startproc	
+_CET_ENDBR
+	jmp	.L_gmult_clmul
+.cfi_endproc	
+.size	gcm_gmult_avx,.-gcm_gmult_avx
+.globl	gcm_ghash_avx
+.hidden gcm_ghash_avx
+.type	gcm_ghash_avx,@function
+.align	32
+gcm_ghash_avx:
+.cfi_startproc	
+_CET_ENDBR
+	vzeroupper
+
+	vmovdqu	(%rdi),%xmm10
+	leaq	.L0x1c2_polynomial(%rip),%r10
+	leaq	64(%rsi),%rsi
+	vmovdqu	.Lbswap_mask(%rip),%xmm13
+	vpshufb	%xmm13,%xmm10,%xmm10
+	cmpq	$0x80,%rcx
+	jb	.Lshort_avx
+	subq	$0x80,%rcx
+
+	vmovdqu	112(%rdx),%xmm14
+	vmovdqu	0-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vmovdqu	32-64(%rsi),%xmm7
+
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vmovdqu	96(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	16-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vmovdqu	80(%rdx),%xmm14
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	48-64(%rsi),%xmm6
+	vpxor	%xmm14,%xmm9,%xmm9
+	vmovdqu	64(%rdx),%xmm15
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	80-64(%rsi),%xmm7
+
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	64-64(%rsi),%xmm6
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	48(%rdx),%xmm14
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	96-64(%rsi),%xmm6
+	vpxor	%xmm5,%xmm2,%xmm2
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	128-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+
+	vmovdqu	32(%rdx),%xmm15
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	112-64(%rsi),%xmm6
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	16(%rdx),%xmm14
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	144-64(%rsi),%xmm6
+	vpxor	%xmm5,%xmm2,%xmm2
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	176-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+
+	vmovdqu	(%rdx),%xmm15
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	160-64(%rsi),%xmm6
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
+
+	leaq	128(%rdx),%rdx
+	cmpq	$0x80,%rcx
+	jb	.Ltail_avx
+
+	vpxor	%xmm10,%xmm15,%xmm15
+	subq	$0x80,%rcx
+	jmp	.Loop8x_avx
+
+.align	32
+.Loop8x_avx:
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vmovdqu	112(%rdx),%xmm14
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpxor	%xmm15,%xmm8,%xmm8
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm10
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm11
+	vmovdqu	0-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm12
+	vmovdqu	32-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+
+	vmovdqu	96(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm3,%xmm10,%xmm10
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vxorps	%xmm4,%xmm11,%xmm11
+	vmovdqu	16-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm5,%xmm12,%xmm12
+	vxorps	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	80(%rdx),%xmm14
+	vpxor	%xmm10,%xmm12,%xmm12
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpxor	%xmm11,%xmm12,%xmm12
+	vpslldq	$8,%xmm12,%xmm9
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vpsrldq	$8,%xmm12,%xmm12
+	vpxor	%xmm9,%xmm10,%xmm10
+	vmovdqu	48-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vxorps	%xmm12,%xmm11,%xmm11
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	80-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	64(%rdx),%xmm15
+	vpalignr	$8,%xmm10,%xmm10,%xmm12
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	64-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vxorps	%xmm15,%xmm8,%xmm8
+	vpxor	%xmm5,%xmm2,%xmm2
+
+	vmovdqu	48(%rdx),%xmm14
+	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	96-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	128-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	32(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	112-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+	vpxor	%xmm5,%xmm2,%xmm2
+	vxorps	%xmm12,%xmm10,%xmm10
+
+	vmovdqu	16(%rdx),%xmm14
+	vpalignr	$8,%xmm10,%xmm10,%xmm12
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	144-64(%rsi),%xmm6
+	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
+	vxorps	%xmm11,%xmm12,%xmm12
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	176-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	160-64(%rsi),%xmm6
+	vpxor	%xmm12,%xmm15,%xmm15
+	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm10,%xmm15,%xmm15
+
+	leaq	128(%rdx),%rdx
+	subq	$0x80,%rcx
+	jnc	.Loop8x_avx
+
+	addq	$0x80,%rcx
+	jmp	.Ltail_no_xor_avx
+
+.align	32
+.Lshort_avx:
+	vmovdqu	-16(%rdx,%rcx,1),%xmm14
+	leaq	(%rdx,%rcx,1),%rdx
+	vmovdqu	0-64(%rsi),%xmm6
+	vmovdqu	32-64(%rsi),%xmm7
+	vpshufb	%xmm13,%xmm14,%xmm15
+
+	vmovdqa	%xmm0,%xmm3
+	vmovdqa	%xmm1,%xmm4
+	vmovdqa	%xmm2,%xmm5
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-32(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	16-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vpsrldq	$8,%xmm7,%xmm7
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-48(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	48-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vmovdqu	80-64(%rsi),%xmm7
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-64(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	64-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vpsrldq	$8,%xmm7,%xmm7
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-80(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	96-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vmovdqu	128-64(%rsi),%xmm7
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-96(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	112-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vpsrldq	$8,%xmm7,%xmm7
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-112(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	144-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vmovq	184-64(%rsi),%xmm7
+	subq	$0x10,%rcx
+	jmp	.Ltail_avx
+
+.align	32
+.Ltail_avx:
+	vpxor	%xmm10,%xmm15,%xmm15
+.Ltail_no_xor_avx:
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+
+	vmovdqu	(%r10),%xmm12
+
+	vpxor	%xmm0,%xmm3,%xmm10
+	vpxor	%xmm1,%xmm4,%xmm11
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vpxor	%xmm10,%xmm5,%xmm5
+	vpxor	%xmm11,%xmm5,%xmm5
+	vpslldq	$8,%xmm5,%xmm9
+	vpsrldq	$8,%xmm5,%xmm5
+	vpxor	%xmm9,%xmm10,%xmm10
+	vpxor	%xmm5,%xmm11,%xmm11
+
+	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
+	vpalignr	$8,%xmm10,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm10,%xmm10
+
+	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
+	vpalignr	$8,%xmm10,%xmm10,%xmm10
+	vpxor	%xmm11,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm10,%xmm10
+
+	cmpq	$0,%rcx
+	jne	.Lshort_avx
+
+	vpshufb	%xmm13,%xmm10,%xmm10
+	vmovdqu	%xmm10,(%rdi)
+	vzeroupper
+	ret
+.cfi_endproc	
+
+.size	gcm_ghash_avx,.-gcm_ghash_avx
+.section	.rodata
+.align	64
+.Lbswap_mask:
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.L0x1c2_polynomial:
+.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.L7_mask:
+.long	7,0,7,0
+.align	64
+
+.byte	71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	64
+.text	
+#endif
diff --git a/gen/bcm/ghash-x86_64-win.asm b/gen/bcm/ghash-x86_64-win.asm
new file mode 100644
index 0000000..41b189a
--- /dev/null
+++ b/gen/bcm/ghash-x86_64-win.asm
@@ -0,0 +1,1336 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section	.text code align=64
+
+global	gcm_init_clmul
+
+ALIGN	16
+gcm_init_clmul:
+
+$L$SEH_begin_gcm_init_clmul_1:
+_CET_ENDBR
+$L$_init_clmul:
+	sub	rsp,0x18
+$L$SEH_prolog_gcm_init_clmul_2:
+	movaps	XMMWORD[rsp],xmm6
+$L$SEH_prolog_gcm_init_clmul_3:
+	movdqu	xmm2,XMMWORD[rdx]
+	pshufd	xmm2,xmm2,78
+
+
+	pshufd	xmm4,xmm2,255
+	movdqa	xmm3,xmm2
+	psllq	xmm2,1
+	pxor	xmm5,xmm5
+	psrlq	xmm3,63
+	pcmpgtd	xmm5,xmm4
+	pslldq	xmm3,8
+	por	xmm2,xmm3
+
+
+	pand	xmm5,XMMWORD[$L$0x1c2_polynomial]
+	pxor	xmm2,xmm5
+
+
+	pshufd	xmm6,xmm2,78
+	movdqa	xmm0,xmm2
+	pxor	xmm6,xmm2
+	movdqa	xmm1,xmm0
+	pshufd	xmm3,xmm0,78
+	pxor	xmm3,xmm0
+DB	102,15,58,68,194,0
+DB	102,15,58,68,202,17
+DB	102,15,58,68,222,0
+	pxor	xmm3,xmm0
+	pxor	xmm3,xmm1
+
+	movdqa	xmm4,xmm3
+	psrldq	xmm3,8
+	pslldq	xmm4,8
+	pxor	xmm1,xmm3
+	pxor	xmm0,xmm4
+
+	movdqa	xmm4,xmm0
+	movdqa	xmm3,xmm0
+	psllq	xmm0,5
+	pxor	xmm3,xmm0
+	psllq	xmm0,1
+	pxor	xmm0,xmm3
+	psllq	xmm0,57
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,8
+	psrldq	xmm3,8
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm3
+
+
+	movdqa	xmm4,xmm0
+	psrlq	xmm0,1
+	pxor	xmm1,xmm4
+	pxor	xmm4,xmm0
+	psrlq	xmm0,5
+	pxor	xmm0,xmm4
+	psrlq	xmm0,1
+	pxor	xmm0,xmm1
+	pshufd	xmm3,xmm2,78
+	pshufd	xmm4,xmm0,78
+	pxor	xmm3,xmm2
+	movdqu	XMMWORD[rcx],xmm2
+	pxor	xmm4,xmm0
+	movdqu	XMMWORD[16+rcx],xmm0
+DB	102,15,58,15,227,8
+	movdqu	XMMWORD[32+rcx],xmm4
+	movdqa	xmm1,xmm0
+	pshufd	xmm3,xmm0,78
+	pxor	xmm3,xmm0
+DB	102,15,58,68,194,0
+DB	102,15,58,68,202,17
+DB	102,15,58,68,222,0
+	pxor	xmm3,xmm0
+	pxor	xmm3,xmm1
+
+	movdqa	xmm4,xmm3
+	psrldq	xmm3,8
+	pslldq	xmm4,8
+	pxor	xmm1,xmm3
+	pxor	xmm0,xmm4
+
+	movdqa	xmm4,xmm0
+	movdqa	xmm3,xmm0
+	psllq	xmm0,5
+	pxor	xmm3,xmm0
+	psllq	xmm0,1
+	pxor	xmm0,xmm3
+	psllq	xmm0,57
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,8
+	psrldq	xmm3,8
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm3
+
+
+	movdqa	xmm4,xmm0
+	psrlq	xmm0,1
+	pxor	xmm1,xmm4
+	pxor	xmm4,xmm0
+	psrlq	xmm0,5
+	pxor	xmm0,xmm4
+	psrlq	xmm0,1
+	pxor	xmm0,xmm1
+	movdqa	xmm5,xmm0
+	movdqa	xmm1,xmm0
+	pshufd	xmm3,xmm0,78
+	pxor	xmm3,xmm0
+DB	102,15,58,68,194,0
+DB	102,15,58,68,202,17
+DB	102,15,58,68,222,0
+	pxor	xmm3,xmm0
+	pxor	xmm3,xmm1
+
+	movdqa	xmm4,xmm3
+	psrldq	xmm3,8
+	pslldq	xmm4,8
+	pxor	xmm1,xmm3
+	pxor	xmm0,xmm4
+
+	movdqa	xmm4,xmm0
+	movdqa	xmm3,xmm0
+	psllq	xmm0,5
+	pxor	xmm3,xmm0
+	psllq	xmm0,1
+	pxor	xmm0,xmm3
+	psllq	xmm0,57
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,8
+	psrldq	xmm3,8
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm3
+
+
+	movdqa	xmm4,xmm0
+	psrlq	xmm0,1
+	pxor	xmm1,xmm4
+	pxor	xmm4,xmm0
+	psrlq	xmm0,5
+	pxor	xmm0,xmm4
+	psrlq	xmm0,1
+	pxor	xmm0,xmm1
+	pshufd	xmm3,xmm5,78
+	pshufd	xmm4,xmm0,78
+	pxor	xmm3,xmm5
+	movdqu	XMMWORD[48+rcx],xmm5
+	pxor	xmm4,xmm0
+	movdqu	XMMWORD[64+rcx],xmm0
+DB	102,15,58,15,227,8
+	movdqu	XMMWORD[80+rcx],xmm4
+	movaps	xmm6,XMMWORD[rsp]
+	lea	rsp,[24+rsp]
+	ret
+
+$L$SEH_end_gcm_init_clmul_4:
+
+global	gcm_gmult_clmul
+
+ALIGN	16
+gcm_gmult_clmul:
+
+_CET_ENDBR
+$L$_gmult_clmul:
+	movdqu	xmm0,XMMWORD[rcx]
+	movdqa	xmm5,XMMWORD[$L$bswap_mask]
+	movdqu	xmm2,XMMWORD[rdx]
+	movdqu	xmm4,XMMWORD[32+rdx]
+DB	102,15,56,0,197
+	movdqa	xmm1,xmm0
+	pshufd	xmm3,xmm0,78
+	pxor	xmm3,xmm0
+DB	102,15,58,68,194,0
+DB	102,15,58,68,202,17
+DB	102,15,58,68,220,0
+	pxor	xmm3,xmm0
+	pxor	xmm3,xmm1
+
+	movdqa	xmm4,xmm3
+	psrldq	xmm3,8
+	pslldq	xmm4,8
+	pxor	xmm1,xmm3
+	pxor	xmm0,xmm4
+
+	movdqa	xmm4,xmm0
+	movdqa	xmm3,xmm0
+	psllq	xmm0,5
+	pxor	xmm3,xmm0
+	psllq	xmm0,1
+	pxor	xmm0,xmm3
+	psllq	xmm0,57
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,8
+	psrldq	xmm3,8
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm3
+
+
+	movdqa	xmm4,xmm0
+	psrlq	xmm0,1
+	pxor	xmm1,xmm4
+	pxor	xmm4,xmm0
+	psrlq	xmm0,5
+	pxor	xmm0,xmm4
+	psrlq	xmm0,1
+	pxor	xmm0,xmm1
+DB	102,15,56,0,197
+	movdqu	XMMWORD[rcx],xmm0
+	ret
+
+
+global	gcm_ghash_clmul
+
+ALIGN	32
+gcm_ghash_clmul:
+
+$L$SEH_begin_gcm_ghash_clmul_1:
+_CET_ENDBR
+$L$_ghash_clmul:
+	lea	rax,[((-136))+rsp]
+	lea	rsp,[((-32))+rax]
+$L$SEH_prolog_gcm_ghash_clmul_2:
+	movaps	XMMWORD[(-32)+rax],xmm6
+$L$SEH_prolog_gcm_ghash_clmul_3:
+	movaps	XMMWORD[(-16)+rax],xmm7
+$L$SEH_prolog_gcm_ghash_clmul_4:
+	movaps	XMMWORD[rax],xmm8
+$L$SEH_prolog_gcm_ghash_clmul_5:
+	movaps	XMMWORD[16+rax],xmm9
+$L$SEH_prolog_gcm_ghash_clmul_6:
+	movaps	XMMWORD[32+rax],xmm10
+$L$SEH_prolog_gcm_ghash_clmul_7:
+	movaps	XMMWORD[48+rax],xmm11
+$L$SEH_prolog_gcm_ghash_clmul_8:
+	movaps	XMMWORD[64+rax],xmm12
+$L$SEH_prolog_gcm_ghash_clmul_9:
+	movaps	XMMWORD[80+rax],xmm13
+$L$SEH_prolog_gcm_ghash_clmul_10:
+	movaps	XMMWORD[96+rax],xmm14
+$L$SEH_prolog_gcm_ghash_clmul_11:
+	movaps	XMMWORD[112+rax],xmm15
+$L$SEH_prolog_gcm_ghash_clmul_12:
+	movdqa	xmm10,XMMWORD[$L$bswap_mask]
+
+	movdqu	xmm0,XMMWORD[rcx]
+	movdqu	xmm2,XMMWORD[rdx]
+	movdqu	xmm7,XMMWORD[32+rdx]
+DB	102,65,15,56,0,194
+
+	sub	r9,0x10
+	jz	NEAR $L$odd_tail
+
+	movdqu	xmm6,XMMWORD[16+rdx]
+	cmp	r9,0x30
+	jb	NEAR $L$skip4x
+
+	sub	r9,0x30
+	mov	rax,0xA040608020C0E000
+	movdqu	xmm14,XMMWORD[48+rdx]
+	movdqu	xmm15,XMMWORD[64+rdx]
+
+
+
+
+	movdqu	xmm3,XMMWORD[48+r8]
+	movdqu	xmm11,XMMWORD[32+r8]
+DB	102,65,15,56,0,218
+DB	102,69,15,56,0,218
+	movdqa	xmm5,xmm3
+	pshufd	xmm4,xmm3,78
+	pxor	xmm4,xmm3
+DB	102,15,58,68,218,0
+DB	102,15,58,68,234,17
+DB	102,15,58,68,231,0
+
+	movdqa	xmm13,xmm11
+	pshufd	xmm12,xmm11,78
+	pxor	xmm12,xmm11
+DB	102,68,15,58,68,222,0
+DB	102,68,15,58,68,238,17
+DB	102,68,15,58,68,231,16
+	xorps	xmm3,xmm11
+	xorps	xmm5,xmm13
+	movups	xmm7,XMMWORD[80+rdx]
+	xorps	xmm4,xmm12
+
+	movdqu	xmm11,XMMWORD[16+r8]
+	movdqu	xmm8,XMMWORD[r8]
+DB	102,69,15,56,0,218
+DB	102,69,15,56,0,194
+	movdqa	xmm13,xmm11
+	pshufd	xmm12,xmm11,78
+	pxor	xmm0,xmm8
+	pxor	xmm12,xmm11
+DB	102,69,15,58,68,222,0
+	movdqa	xmm1,xmm0
+	pshufd	xmm8,xmm0,78
+	pxor	xmm8,xmm0
+DB	102,69,15,58,68,238,17
+DB	102,68,15,58,68,231,0
+	xorps	xmm3,xmm11
+	xorps	xmm5,xmm13
+
+	lea	r8,[64+r8]
+	sub	r9,0x40
+	jc	NEAR $L$tail4x
+
+	jmp	NEAR $L$mod4_loop
+ALIGN	32
+$L$mod4_loop:
+DB	102,65,15,58,68,199,0
+	xorps	xmm4,xmm12
+	movdqu	xmm11,XMMWORD[48+r8]
+DB	102,69,15,56,0,218
+DB	102,65,15,58,68,207,17
+	xorps	xmm0,xmm3
+	movdqu	xmm3,XMMWORD[32+r8]
+	movdqa	xmm13,xmm11
+DB	102,68,15,58,68,199,16
+	pshufd	xmm12,xmm11,78
+	xorps	xmm1,xmm5
+	pxor	xmm12,xmm11
+DB	102,65,15,56,0,218
+	movups	xmm7,XMMWORD[32+rdx]
+	xorps	xmm8,xmm4
+DB	102,68,15,58,68,218,0
+	pshufd	xmm4,xmm3,78
+
+	pxor	xmm8,xmm0
+	movdqa	xmm5,xmm3
+	pxor	xmm8,xmm1
+	pxor	xmm4,xmm3
+	movdqa	xmm9,xmm8
+DB	102,68,15,58,68,234,17
+	pslldq	xmm8,8
+	psrldq	xmm9,8
+	pxor	xmm0,xmm8
+	movdqa	xmm8,XMMWORD[$L$7_mask]
+	pxor	xmm1,xmm9
+DB	102,76,15,110,200
+
+	pand	xmm8,xmm0
+DB	102,69,15,56,0,200
+	pxor	xmm9,xmm0
+DB	102,68,15,58,68,231,0
+	psllq	xmm9,57
+	movdqa	xmm8,xmm9
+	pslldq	xmm9,8
+DB	102,15,58,68,222,0
+	psrldq	xmm8,8
+	pxor	xmm0,xmm9
+	pxor	xmm1,xmm8
+	movdqu	xmm8,XMMWORD[r8]
+
+	movdqa	xmm9,xmm0
+	psrlq	xmm0,1
+DB	102,15,58,68,238,17
+	xorps	xmm3,xmm11
+	movdqu	xmm11,XMMWORD[16+r8]
+DB	102,69,15,56,0,218
+DB	102,15,58,68,231,16
+	xorps	xmm5,xmm13
+	movups	xmm7,XMMWORD[80+rdx]
+DB	102,69,15,56,0,194
+	pxor	xmm1,xmm9
+	pxor	xmm9,xmm0
+	psrlq	xmm0,5
+
+	movdqa	xmm13,xmm11
+	pxor	xmm4,xmm12
+	pshufd	xmm12,xmm11,78
+	pxor	xmm0,xmm9
+	pxor	xmm1,xmm8
+	pxor	xmm12,xmm11
+DB	102,69,15,58,68,222,0
+	psrlq	xmm0,1
+	pxor	xmm0,xmm1
+	movdqa	xmm1,xmm0
+DB	102,69,15,58,68,238,17
+	xorps	xmm3,xmm11
+	pshufd	xmm8,xmm0,78
+	pxor	xmm8,xmm0
+
+DB	102,68,15,58,68,231,0
+	xorps	xmm5,xmm13
+
+	lea	r8,[64+r8]
+	sub	r9,0x40
+	jnc	NEAR $L$mod4_loop
+
+$L$tail4x:
+DB	102,65,15,58,68,199,0
+DB	102,65,15,58,68,207,17
+DB	102,68,15,58,68,199,16
+	xorps	xmm4,xmm12
+	xorps	xmm0,xmm3
+	xorps	xmm1,xmm5
+	pxor	xmm1,xmm0
+	pxor	xmm8,xmm4
+
+	pxor	xmm8,xmm1
+	pxor	xmm1,xmm0
+
+	movdqa	xmm9,xmm8
+	psrldq	xmm8,8
+	pslldq	xmm9,8
+	pxor	xmm1,xmm8
+	pxor	xmm0,xmm9
+
+	movdqa	xmm4,xmm0
+	movdqa	xmm3,xmm0
+	psllq	xmm0,5
+	pxor	xmm3,xmm0
+	psllq	xmm0,1
+	pxor	xmm0,xmm3
+	psllq	xmm0,57
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,8
+	psrldq	xmm3,8
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm3
+
+
+	movdqa	xmm4,xmm0
+	psrlq	xmm0,1
+	pxor	xmm1,xmm4
+	pxor	xmm4,xmm0
+	psrlq	xmm0,5
+	pxor	xmm0,xmm4
+	psrlq	xmm0,1
+	pxor	xmm0,xmm1
+	add	r9,0x40
+	jz	NEAR $L$done
+	movdqu	xmm7,XMMWORD[32+rdx]
+	sub	r9,0x10
+	jz	NEAR $L$odd_tail
+$L$skip4x:
+
+
+
+
+
+	movdqu	xmm8,XMMWORD[r8]
+	movdqu	xmm3,XMMWORD[16+r8]
+DB	102,69,15,56,0,194
+DB	102,65,15,56,0,218
+	pxor	xmm0,xmm8
+
+	movdqa	xmm5,xmm3
+	pshufd	xmm4,xmm3,78
+	pxor	xmm4,xmm3
+DB	102,15,58,68,218,0
+DB	102,15,58,68,234,17
+DB	102,15,58,68,231,0
+
+	lea	r8,[32+r8]
+	nop
+	sub	r9,0x20
+	jbe	NEAR $L$even_tail
+	nop
+	jmp	NEAR $L$mod_loop
+
+ALIGN	32
+$L$mod_loop:
+	movdqa	xmm1,xmm0
+	movdqa	xmm8,xmm4
+	pshufd	xmm4,xmm0,78
+	pxor	xmm4,xmm0
+
+DB	102,15,58,68,198,0
+DB	102,15,58,68,206,17
+DB	102,15,58,68,231,16
+
+	pxor	xmm0,xmm3
+	pxor	xmm1,xmm5
+	movdqu	xmm9,XMMWORD[r8]
+	pxor	xmm8,xmm0
+DB	102,69,15,56,0,202
+	movdqu	xmm3,XMMWORD[16+r8]
+
+	pxor	xmm8,xmm1
+	pxor	xmm1,xmm9
+	pxor	xmm4,xmm8
+DB	102,65,15,56,0,218
+	movdqa	xmm8,xmm4
+	psrldq	xmm8,8
+	pslldq	xmm4,8
+	pxor	xmm1,xmm8
+	pxor	xmm0,xmm4
+
+	movdqa	xmm5,xmm3
+
+	movdqa	xmm9,xmm0
+	movdqa	xmm8,xmm0
+	psllq	xmm0,5
+	pxor	xmm8,xmm0
+DB	102,15,58,68,218,0
+	psllq	xmm0,1
+	pxor	xmm0,xmm8
+	psllq	xmm0,57
+	movdqa	xmm8,xmm0
+	pslldq	xmm0,8
+	psrldq	xmm8,8
+	pxor	xmm0,xmm9
+	pshufd	xmm4,xmm5,78
+	pxor	xmm1,xmm8
+	pxor	xmm4,xmm5
+
+	movdqa	xmm9,xmm0
+	psrlq	xmm0,1
+DB	102,15,58,68,234,17
+	pxor	xmm1,xmm9
+	pxor	xmm9,xmm0
+	psrlq	xmm0,5
+	pxor	xmm0,xmm9
+	lea	r8,[32+r8]
+	psrlq	xmm0,1
+DB	102,15,58,68,231,0
+	pxor	xmm0,xmm1
+
+	sub	r9,0x20
+	ja	NEAR $L$mod_loop
+
+$L$even_tail:
+	movdqa	xmm1,xmm0
+	movdqa	xmm8,xmm4
+	pshufd	xmm4,xmm0,78
+	pxor	xmm4,xmm0
+
+DB	102,15,58,68,198,0
+DB	102,15,58,68,206,17
+DB	102,15,58,68,231,16
+
+	pxor	xmm0,xmm3
+	pxor	xmm1,xmm5
+	pxor	xmm8,xmm0
+	pxor	xmm8,xmm1
+	pxor	xmm4,xmm8
+	movdqa	xmm8,xmm4
+	psrldq	xmm8,8
+	pslldq	xmm4,8
+	pxor	xmm1,xmm8
+	pxor	xmm0,xmm4
+
+	movdqa	xmm4,xmm0
+	movdqa	xmm3,xmm0
+	psllq	xmm0,5
+	pxor	xmm3,xmm0
+	psllq	xmm0,1
+	pxor	xmm0,xmm3
+	psllq	xmm0,57
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,8
+	psrldq	xmm3,8
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm3
+
+
+	movdqa	xmm4,xmm0
+	psrlq	xmm0,1
+	pxor	xmm1,xmm4
+	pxor	xmm4,xmm0
+	psrlq	xmm0,5
+	pxor	xmm0,xmm4
+	psrlq	xmm0,1
+	pxor	xmm0,xmm1
+	test	r9,r9
+	jnz	NEAR $L$done
+
+$L$odd_tail:
+	movdqu	xmm8,XMMWORD[r8]
+DB	102,69,15,56,0,194
+	pxor	xmm0,xmm8
+	movdqa	xmm1,xmm0
+	pshufd	xmm3,xmm0,78
+	pxor	xmm3,xmm0
+DB	102,15,58,68,194,0
+DB	102,15,58,68,202,17
+DB	102,15,58,68,223,0
+	pxor	xmm3,xmm0
+	pxor	xmm3,xmm1
+
+	movdqa	xmm4,xmm3
+	psrldq	xmm3,8
+	pslldq	xmm4,8
+	pxor	xmm1,xmm3
+	pxor	xmm0,xmm4
+
+	movdqa	xmm4,xmm0
+	movdqa	xmm3,xmm0
+	psllq	xmm0,5
+	pxor	xmm3,xmm0
+	psllq	xmm0,1
+	pxor	xmm0,xmm3
+	psllq	xmm0,57
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,8
+	psrldq	xmm3,8
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm3
+
+
+	movdqa	xmm4,xmm0
+	psrlq	xmm0,1
+	pxor	xmm1,xmm4
+	pxor	xmm4,xmm0
+	psrlq	xmm0,5
+	pxor	xmm0,xmm4
+	psrlq	xmm0,1
+	pxor	xmm0,xmm1
+$L$done:
+DB	102,65,15,56,0,194
+	movdqu	XMMWORD[rcx],xmm0
+	movaps	xmm6,XMMWORD[rsp]
+	movaps	xmm7,XMMWORD[16+rsp]
+	movaps	xmm8,XMMWORD[32+rsp]
+	movaps	xmm9,XMMWORD[48+rsp]
+	movaps	xmm10,XMMWORD[64+rsp]
+	movaps	xmm11,XMMWORD[80+rsp]
+	movaps	xmm12,XMMWORD[96+rsp]
+	movaps	xmm13,XMMWORD[112+rsp]
+	movaps	xmm14,XMMWORD[128+rsp]
+	movaps	xmm15,XMMWORD[144+rsp]
+	lea	rsp,[168+rsp]
+	ret
+
+$L$SEH_end_gcm_ghash_clmul_13:
+
+global	gcm_init_avx
+
+ALIGN	32
+gcm_init_avx:
+
+_CET_ENDBR
+$L$SEH_begin_gcm_init_avx_1:
+	sub	rsp,0x18
+$L$SEH_prolog_gcm_init_avx_2:
+	movaps	XMMWORD[rsp],xmm6
+$L$SEH_prolog_gcm_init_avx_3:
+	vzeroupper
+
+	vmovdqu	xmm2,XMMWORD[rdx]
+	vpshufd	xmm2,xmm2,78
+
+
+	vpshufd	xmm4,xmm2,255
+	vpsrlq	xmm3,xmm2,63
+	vpsllq	xmm2,xmm2,1
+	vpxor	xmm5,xmm5,xmm5
+	vpcmpgtd	xmm5,xmm5,xmm4
+	vpslldq	xmm3,xmm3,8
+	vpor	xmm2,xmm2,xmm3
+
+
+	vpand	xmm5,xmm5,XMMWORD[$L$0x1c2_polynomial]
+	vpxor	xmm2,xmm2,xmm5
+
+	vpunpckhqdq	xmm6,xmm2,xmm2
+	vmovdqa	xmm0,xmm2
+	vpxor	xmm6,xmm6,xmm2
+	mov	r10,4
+	jmp	NEAR $L$init_start_avx
+ALIGN	32
+$L$init_loop_avx:
+	vpalignr	xmm5,xmm4,xmm3,8
+	vmovdqu	XMMWORD[(-16)+rcx],xmm5
+	vpunpckhqdq	xmm3,xmm0,xmm0
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm1,xmm0,xmm2,0x11
+	vpclmulqdq	xmm0,xmm0,xmm2,0x00
+	vpclmulqdq	xmm3,xmm3,xmm6,0x00
+	vpxor	xmm4,xmm1,xmm0
+	vpxor	xmm3,xmm3,xmm4
+
+	vpslldq	xmm4,xmm3,8
+	vpsrldq	xmm3,xmm3,8
+	vpxor	xmm0,xmm0,xmm4
+	vpxor	xmm1,xmm1,xmm3
+	vpsllq	xmm3,xmm0,57
+	vpsllq	xmm4,xmm0,62
+	vpxor	xmm4,xmm4,xmm3
+	vpsllq	xmm3,xmm0,63
+	vpxor	xmm4,xmm4,xmm3
+	vpslldq	xmm3,xmm4,8
+	vpsrldq	xmm4,xmm4,8
+	vpxor	xmm0,xmm0,xmm3
+	vpxor	xmm1,xmm1,xmm4
+
+	vpsrlq	xmm4,xmm0,1
+	vpxor	xmm1,xmm1,xmm0
+	vpxor	xmm0,xmm0,xmm4
+	vpsrlq	xmm4,xmm4,5
+	vpxor	xmm0,xmm0,xmm4
+	vpsrlq	xmm0,xmm0,1
+	vpxor	xmm0,xmm0,xmm1
+$L$init_start_avx:
+	vmovdqa	xmm5,xmm0
+	vpunpckhqdq	xmm3,xmm0,xmm0
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm1,xmm0,xmm2,0x11
+	vpclmulqdq	xmm0,xmm0,xmm2,0x00
+	vpclmulqdq	xmm3,xmm3,xmm6,0x00
+	vpxor	xmm4,xmm1,xmm0
+	vpxor	xmm3,xmm3,xmm4
+
+	vpslldq	xmm4,xmm3,8
+	vpsrldq	xmm3,xmm3,8
+	vpxor	xmm0,xmm0,xmm4
+	vpxor	xmm1,xmm1,xmm3
+	vpsllq	xmm3,xmm0,57
+	vpsllq	xmm4,xmm0,62
+	vpxor	xmm4,xmm4,xmm3
+	vpsllq	xmm3,xmm0,63
+	vpxor	xmm4,xmm4,xmm3
+	vpslldq	xmm3,xmm4,8
+	vpsrldq	xmm4,xmm4,8
+	vpxor	xmm0,xmm0,xmm3
+	vpxor	xmm1,xmm1,xmm4
+
+	vpsrlq	xmm4,xmm0,1
+	vpxor	xmm1,xmm1,xmm0
+	vpxor	xmm0,xmm0,xmm4
+	vpsrlq	xmm4,xmm4,5
+	vpxor	xmm0,xmm0,xmm4
+	vpsrlq	xmm0,xmm0,1
+	vpxor	xmm0,xmm0,xmm1
+	vpshufd	xmm3,xmm5,78
+	vpshufd	xmm4,xmm0,78
+	vpxor	xmm3,xmm3,xmm5
+	vmovdqu	XMMWORD[rcx],xmm5
+	vpxor	xmm4,xmm4,xmm0
+	vmovdqu	XMMWORD[16+rcx],xmm0
+	lea	rcx,[48+rcx]
+	sub	r10,1
+	jnz	NEAR $L$init_loop_avx
+
+	vpalignr	xmm5,xmm3,xmm4,8
+	vmovdqu	XMMWORD[(-16)+rcx],xmm5
+
+	vzeroupper
+	movaps	xmm6,XMMWORD[rsp]
+	lea	rsp,[24+rsp]
+	ret
+$L$SEH_end_gcm_init_avx_4:
+
+
+global	gcm_gmult_avx
+
+ALIGN	32
+gcm_gmult_avx:
+
+_CET_ENDBR
+	jmp	NEAR $L$_gmult_clmul
+
+
+global	gcm_ghash_avx
+
+ALIGN	32
+gcm_ghash_avx:
+
+_CET_ENDBR
+$L$SEH_begin_gcm_ghash_avx_1:
+	lea	rax,[((-136))+rsp]
+	lea	rsp,[((-32))+rax]
+$L$SEH_prolog_gcm_ghash_avx_2:
+	movaps	XMMWORD[(-32)+rax],xmm6
+$L$SEH_prolog_gcm_ghash_avx_3:
+	movaps	XMMWORD[(-16)+rax],xmm7
+$L$SEH_prolog_gcm_ghash_avx_4:
+	movaps	XMMWORD[rax],xmm8
+$L$SEH_prolog_gcm_ghash_avx_5:
+	movaps	XMMWORD[16+rax],xmm9
+$L$SEH_prolog_gcm_ghash_avx_6:
+	movaps	XMMWORD[32+rax],xmm10
+$L$SEH_prolog_gcm_ghash_avx_7:
+	movaps	XMMWORD[48+rax],xmm11
+$L$SEH_prolog_gcm_ghash_avx_8:
+	movaps	XMMWORD[64+rax],xmm12
+$L$SEH_prolog_gcm_ghash_avx_9:
+	movaps	XMMWORD[80+rax],xmm13
+$L$SEH_prolog_gcm_ghash_avx_10:
+	movaps	XMMWORD[96+rax],xmm14
+$L$SEH_prolog_gcm_ghash_avx_11:
+	movaps	XMMWORD[112+rax],xmm15
+$L$SEH_prolog_gcm_ghash_avx_12:
+	vzeroupper
+
+	vmovdqu	xmm10,XMMWORD[rcx]
+	lea	r10,[$L$0x1c2_polynomial]
+	lea	rdx,[64+rdx]
+	vmovdqu	xmm13,XMMWORD[$L$bswap_mask]
+	vpshufb	xmm10,xmm10,xmm13
+	cmp	r9,0x80
+	jb	NEAR $L$short_avx
+	sub	r9,0x80
+
+	vmovdqu	xmm14,XMMWORD[112+r8]
+	vmovdqu	xmm6,XMMWORD[((0-64))+rdx]
+	vpshufb	xmm14,xmm14,xmm13
+	vmovdqu	xmm7,XMMWORD[((32-64))+rdx]
+
+	vpunpckhqdq	xmm9,xmm14,xmm14
+	vmovdqu	xmm15,XMMWORD[96+r8]
+	vpclmulqdq	xmm0,xmm14,xmm6,0x00
+	vpxor	xmm9,xmm9,xmm14
+	vpshufb	xmm15,xmm15,xmm13
+	vpclmulqdq	xmm1,xmm14,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((16-64))+rdx]
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vmovdqu	xmm14,XMMWORD[80+r8]
+	vpclmulqdq	xmm2,xmm9,xmm7,0x00
+	vpxor	xmm8,xmm8,xmm15
+
+	vpshufb	xmm14,xmm14,xmm13
+	vpclmulqdq	xmm3,xmm15,xmm6,0x00
+	vpunpckhqdq	xmm9,xmm14,xmm14
+	vpclmulqdq	xmm4,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((48-64))+rdx]
+	vpxor	xmm9,xmm9,xmm14
+	vmovdqu	xmm15,XMMWORD[64+r8]
+	vpclmulqdq	xmm5,xmm8,xmm7,0x10
+	vmovdqu	xmm7,XMMWORD[((80-64))+rdx]
+
+	vpshufb	xmm15,xmm15,xmm13
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm14,xmm6,0x00
+	vpxor	xmm4,xmm4,xmm1
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpclmulqdq	xmm1,xmm14,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((64-64))+rdx]
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm2,xmm9,xmm7,0x00
+	vpxor	xmm8,xmm8,xmm15
+
+	vmovdqu	xmm14,XMMWORD[48+r8]
+	vpxor	xmm0,xmm0,xmm3
+	vpclmulqdq	xmm3,xmm15,xmm6,0x00
+	vpxor	xmm1,xmm1,xmm4
+	vpshufb	xmm14,xmm14,xmm13
+	vpclmulqdq	xmm4,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((96-64))+rdx]
+	vpxor	xmm2,xmm2,xmm5
+	vpunpckhqdq	xmm9,xmm14,xmm14
+	vpclmulqdq	xmm5,xmm8,xmm7,0x10
+	vmovdqu	xmm7,XMMWORD[((128-64))+rdx]
+	vpxor	xmm9,xmm9,xmm14
+
+	vmovdqu	xmm15,XMMWORD[32+r8]
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm14,xmm6,0x00
+	vpxor	xmm4,xmm4,xmm1
+	vpshufb	xmm15,xmm15,xmm13
+	vpclmulqdq	xmm1,xmm14,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((112-64))+rdx]
+	vpxor	xmm5,xmm5,xmm2
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpclmulqdq	xmm2,xmm9,xmm7,0x00
+	vpxor	xmm8,xmm8,xmm15
+
+	vmovdqu	xmm14,XMMWORD[16+r8]
+	vpxor	xmm0,xmm0,xmm3
+	vpclmulqdq	xmm3,xmm15,xmm6,0x00
+	vpxor	xmm1,xmm1,xmm4
+	vpshufb	xmm14,xmm14,xmm13
+	vpclmulqdq	xmm4,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((144-64))+rdx]
+	vpxor	xmm2,xmm2,xmm5
+	vpunpckhqdq	xmm9,xmm14,xmm14
+	vpclmulqdq	xmm5,xmm8,xmm7,0x10
+	vmovdqu	xmm7,XMMWORD[((176-64))+rdx]
+	vpxor	xmm9,xmm9,xmm14
+
+	vmovdqu	xmm15,XMMWORD[r8]
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm14,xmm6,0x00
+	vpxor	xmm4,xmm4,xmm1
+	vpshufb	xmm15,xmm15,xmm13
+	vpclmulqdq	xmm1,xmm14,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((160-64))+rdx]
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm2,xmm9,xmm7,0x10
+
+	lea	r8,[128+r8]
+	cmp	r9,0x80
+	jb	NEAR $L$tail_avx
+
+	vpxor	xmm15,xmm15,xmm10
+	sub	r9,0x80
+	jmp	NEAR $L$oop8x_avx
+
+ALIGN	32
+$L$oop8x_avx:
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vmovdqu	xmm14,XMMWORD[112+r8]
+	vpxor	xmm3,xmm3,xmm0
+	vpxor	xmm8,xmm8,xmm15
+	vpclmulqdq	xmm10,xmm15,xmm6,0x00
+	vpshufb	xmm14,xmm14,xmm13
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm11,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((0-64))+rdx]
+	vpunpckhqdq	xmm9,xmm14,xmm14
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm12,xmm8,xmm7,0x00
+	vmovdqu	xmm7,XMMWORD[((32-64))+rdx]
+	vpxor	xmm9,xmm9,xmm14
+
+	vmovdqu	xmm15,XMMWORD[96+r8]
+	vpclmulqdq	xmm0,xmm14,xmm6,0x00
+	vpxor	xmm10,xmm10,xmm3
+	vpshufb	xmm15,xmm15,xmm13
+	vpclmulqdq	xmm1,xmm14,xmm6,0x11
+	vxorps	xmm11,xmm11,xmm4
+	vmovdqu	xmm6,XMMWORD[((16-64))+rdx]
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpclmulqdq	xmm2,xmm9,xmm7,0x00
+	vpxor	xmm12,xmm12,xmm5
+	vxorps	xmm8,xmm8,xmm15
+
+	vmovdqu	xmm14,XMMWORD[80+r8]
+	vpxor	xmm12,xmm12,xmm10
+	vpclmulqdq	xmm3,xmm15,xmm6,0x00
+	vpxor	xmm12,xmm12,xmm11
+	vpslldq	xmm9,xmm12,8
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm4,xmm15,xmm6,0x11
+	vpsrldq	xmm12,xmm12,8
+	vpxor	xmm10,xmm10,xmm9
+	vmovdqu	xmm6,XMMWORD[((48-64))+rdx]
+	vpshufb	xmm14,xmm14,xmm13
+	vxorps	xmm11,xmm11,xmm12
+	vpxor	xmm4,xmm4,xmm1
+	vpunpckhqdq	xmm9,xmm14,xmm14
+	vpclmulqdq	xmm5,xmm8,xmm7,0x10
+	vmovdqu	xmm7,XMMWORD[((80-64))+rdx]
+	vpxor	xmm9,xmm9,xmm14
+	vpxor	xmm5,xmm5,xmm2
+
+	vmovdqu	xmm15,XMMWORD[64+r8]
+	vpalignr	xmm12,xmm10,xmm10,8
+	vpclmulqdq	xmm0,xmm14,xmm6,0x00
+	vpshufb	xmm15,xmm15,xmm13
+	vpxor	xmm0,xmm0,xmm3
+	vpclmulqdq	xmm1,xmm14,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((64-64))+rdx]
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpxor	xmm1,xmm1,xmm4
+	vpclmulqdq	xmm2,xmm9,xmm7,0x00
+	vxorps	xmm8,xmm8,xmm15
+	vpxor	xmm2,xmm2,xmm5
+
+	vmovdqu	xmm14,XMMWORD[48+r8]
+	vpclmulqdq	xmm10,xmm10,XMMWORD[r10],0x10
+	vpclmulqdq	xmm3,xmm15,xmm6,0x00
+	vpshufb	xmm14,xmm14,xmm13
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm4,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((96-64))+rdx]
+	vpunpckhqdq	xmm9,xmm14,xmm14
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm5,xmm8,xmm7,0x10
+	vmovdqu	xmm7,XMMWORD[((128-64))+rdx]
+	vpxor	xmm9,xmm9,xmm14
+	vpxor	xmm5,xmm5,xmm2
+
+	vmovdqu	xmm15,XMMWORD[32+r8]
+	vpclmulqdq	xmm0,xmm14,xmm6,0x00
+	vpshufb	xmm15,xmm15,xmm13
+	vpxor	xmm0,xmm0,xmm3
+	vpclmulqdq	xmm1,xmm14,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((112-64))+rdx]
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpxor	xmm1,xmm1,xmm4
+	vpclmulqdq	xmm2,xmm9,xmm7,0x00
+	vpxor	xmm8,xmm8,xmm15
+	vpxor	xmm2,xmm2,xmm5
+	vxorps	xmm10,xmm10,xmm12
+
+	vmovdqu	xmm14,XMMWORD[16+r8]
+	vpalignr	xmm12,xmm10,xmm10,8
+	vpclmulqdq	xmm3,xmm15,xmm6,0x00
+	vpshufb	xmm14,xmm14,xmm13
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm4,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((144-64))+rdx]
+	vpclmulqdq	xmm10,xmm10,XMMWORD[r10],0x10
+	vxorps	xmm12,xmm12,xmm11
+	vpunpckhqdq	xmm9,xmm14,xmm14
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm5,xmm8,xmm7,0x10
+	vmovdqu	xmm7,XMMWORD[((176-64))+rdx]
+	vpxor	xmm9,xmm9,xmm14
+	vpxor	xmm5,xmm5,xmm2
+
+	vmovdqu	xmm15,XMMWORD[r8]
+	vpclmulqdq	xmm0,xmm14,xmm6,0x00
+	vpshufb	xmm15,xmm15,xmm13
+	vpclmulqdq	xmm1,xmm14,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((160-64))+rdx]
+	vpxor	xmm15,xmm15,xmm12
+	vpclmulqdq	xmm2,xmm9,xmm7,0x10
+	vpxor	xmm15,xmm15,xmm10
+
+	lea	r8,[128+r8]
+	sub	r9,0x80
+	jnc	NEAR $L$oop8x_avx
+
+	add	r9,0x80
+	jmp	NEAR $L$tail_no_xor_avx
+
+ALIGN	32
+$L$short_avx:
+	vmovdqu	xmm14,XMMWORD[((-16))+r9*1+r8]
+	lea	r8,[r9*1+r8]
+	vmovdqu	xmm6,XMMWORD[((0-64))+rdx]
+	vmovdqu	xmm7,XMMWORD[((32-64))+rdx]
+	vpshufb	xmm15,xmm14,xmm13
+
+	vmovdqa	xmm3,xmm0
+	vmovdqa	xmm4,xmm1
+	vmovdqa	xmm5,xmm2
+	sub	r9,0x10
+	jz	NEAR $L$tail_avx
+
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm15,xmm6,0x00
+	vpxor	xmm8,xmm8,xmm15
+	vmovdqu	xmm14,XMMWORD[((-32))+r8]
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm1,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((16-64))+rdx]
+	vpshufb	xmm15,xmm14,xmm13
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm2,xmm8,xmm7,0x00
+	vpsrldq	xmm7,xmm7,8
+	sub	r9,0x10
+	jz	NEAR $L$tail_avx
+
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm15,xmm6,0x00
+	vpxor	xmm8,xmm8,xmm15
+	vmovdqu	xmm14,XMMWORD[((-48))+r8]
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm1,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((48-64))+rdx]
+	vpshufb	xmm15,xmm14,xmm13
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm2,xmm8,xmm7,0x00
+	vmovdqu	xmm7,XMMWORD[((80-64))+rdx]
+	sub	r9,0x10
+	jz	NEAR $L$tail_avx
+
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm15,xmm6,0x00
+	vpxor	xmm8,xmm8,xmm15
+	vmovdqu	xmm14,XMMWORD[((-64))+r8]
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm1,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((64-64))+rdx]
+	vpshufb	xmm15,xmm14,xmm13
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm2,xmm8,xmm7,0x00
+	vpsrldq	xmm7,xmm7,8
+	sub	r9,0x10
+	jz	NEAR $L$tail_avx
+
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm15,xmm6,0x00
+	vpxor	xmm8,xmm8,xmm15
+	vmovdqu	xmm14,XMMWORD[((-80))+r8]
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm1,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((96-64))+rdx]
+	vpshufb	xmm15,xmm14,xmm13
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm2,xmm8,xmm7,0x00
+	vmovdqu	xmm7,XMMWORD[((128-64))+rdx]
+	sub	r9,0x10
+	jz	NEAR $L$tail_avx
+
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm15,xmm6,0x00
+	vpxor	xmm8,xmm8,xmm15
+	vmovdqu	xmm14,XMMWORD[((-96))+r8]
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm1,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((112-64))+rdx]
+	vpshufb	xmm15,xmm14,xmm13
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm2,xmm8,xmm7,0x00
+	vpsrldq	xmm7,xmm7,8
+	sub	r9,0x10
+	jz	NEAR $L$tail_avx
+
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm15,xmm6,0x00
+	vpxor	xmm8,xmm8,xmm15
+	vmovdqu	xmm14,XMMWORD[((-112))+r8]
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm1,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((144-64))+rdx]
+	vpshufb	xmm15,xmm14,xmm13
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm2,xmm8,xmm7,0x00
+	vmovq	xmm7,QWORD[((184-64))+rdx]
+	sub	r9,0x10
+	jmp	NEAR $L$tail_avx
+
+ALIGN	32
+$L$tail_avx:
+	vpxor	xmm15,xmm15,xmm10
+$L$tail_no_xor_avx:
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm15,xmm6,0x00
+	vpxor	xmm8,xmm8,xmm15
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm1,xmm15,xmm6,0x11
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm2,xmm8,xmm7,0x00
+
+	vmovdqu	xmm12,XMMWORD[r10]
+
+	vpxor	xmm10,xmm3,xmm0
+	vpxor	xmm11,xmm4,xmm1
+	vpxor	xmm5,xmm5,xmm2
+
+	vpxor	xmm5,xmm5,xmm10
+	vpxor	xmm5,xmm5,xmm11
+	vpslldq	xmm9,xmm5,8
+	vpsrldq	xmm5,xmm5,8
+	vpxor	xmm10,xmm10,xmm9
+	vpxor	xmm11,xmm11,xmm5
+
+	vpclmulqdq	xmm9,xmm10,xmm12,0x10
+	vpalignr	xmm10,xmm10,xmm10,8
+	vpxor	xmm10,xmm10,xmm9
+
+	vpclmulqdq	xmm9,xmm10,xmm12,0x10
+	vpalignr	xmm10,xmm10,xmm10,8
+	vpxor	xmm10,xmm10,xmm11
+	vpxor	xmm10,xmm10,xmm9
+
+	cmp	r9,0
+	jne	NEAR $L$short_avx
+
+	vpshufb	xmm10,xmm10,xmm13
+	vmovdqu	XMMWORD[rcx],xmm10
+	vzeroupper
+	movaps	xmm6,XMMWORD[rsp]
+	movaps	xmm7,XMMWORD[16+rsp]
+	movaps	xmm8,XMMWORD[32+rsp]
+	movaps	xmm9,XMMWORD[48+rsp]
+	movaps	xmm10,XMMWORD[64+rsp]
+	movaps	xmm11,XMMWORD[80+rsp]
+	movaps	xmm12,XMMWORD[96+rsp]
+	movaps	xmm13,XMMWORD[112+rsp]
+	movaps	xmm14,XMMWORD[128+rsp]
+	movaps	xmm15,XMMWORD[144+rsp]
+	lea	rsp,[168+rsp]
+	ret
+
+$L$SEH_end_gcm_ghash_avx_13:
+
+section	.rdata rdata align=8
+ALIGN	64
+$L$bswap_mask:
+	DB	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+$L$0x1c2_polynomial:
+	DB	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+$L$7_mask:
+	DD	7,0,7,0
+ALIGN	64
+
+	DB	71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52
+	DB	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
+	DB	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
+	DB	114,103,62,0
+ALIGN	64
+section	.text
+
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_gcm_init_clmul_1 wrt ..imagebase
+	DD	$L$SEH_end_gcm_init_clmul_4 wrt ..imagebase
+	DD	$L$SEH_info_gcm_init_clmul_0 wrt ..imagebase
+
+	DD	$L$SEH_begin_gcm_ghash_clmul_1 wrt ..imagebase
+	DD	$L$SEH_end_gcm_ghash_clmul_13 wrt ..imagebase
+	DD	$L$SEH_info_gcm_ghash_clmul_0 wrt ..imagebase
+
+	DD	$L$SEH_begin_gcm_init_avx_1 wrt ..imagebase
+	DD	$L$SEH_end_gcm_init_avx_4 wrt ..imagebase
+	DD	$L$SEH_info_gcm_init_avx_0 wrt ..imagebase
+
+	DD	$L$SEH_begin_gcm_ghash_avx_1 wrt ..imagebase
+	DD	$L$SEH_end_gcm_ghash_avx_13 wrt ..imagebase
+	DD	$L$SEH_info_gcm_ghash_avx_0 wrt ..imagebase
+
+
+section	.xdata rdata align=8
+ALIGN	4
+$L$SEH_info_gcm_init_clmul_0:
+	DB	1
+	DB	$L$SEH_prolog_gcm_init_clmul_3-$L$SEH_begin_gcm_init_clmul_1
+	DB	3
+	DB	0
+	DB	$L$SEH_prolog_gcm_init_clmul_3-$L$SEH_begin_gcm_init_clmul_1
+	DB	104
+	DW	0
+	DB	$L$SEH_prolog_gcm_init_clmul_2-$L$SEH_begin_gcm_init_clmul_1
+	DB	34
+
+$L$SEH_info_gcm_ghash_clmul_0:
+	DB	1
+	DB	$L$SEH_prolog_gcm_ghash_clmul_12-$L$SEH_begin_gcm_ghash_clmul_1
+	DB	22
+	DB	0
+	DB	$L$SEH_prolog_gcm_ghash_clmul_12-$L$SEH_begin_gcm_ghash_clmul_1
+	DB	248
+	DW	9
+	DB	$L$SEH_prolog_gcm_ghash_clmul_11-$L$SEH_begin_gcm_ghash_clmul_1
+	DB	232
+	DW	8
+	DB	$L$SEH_prolog_gcm_ghash_clmul_10-$L$SEH_begin_gcm_ghash_clmul_1
+	DB	216
+	DW	7
+	DB	$L$SEH_prolog_gcm_ghash_clmul_9-$L$SEH_begin_gcm_ghash_clmul_1
+	DB	200
+	DW	6
+	DB	$L$SEH_prolog_gcm_ghash_clmul_8-$L$SEH_begin_gcm_ghash_clmul_1
+	DB	184
+	DW	5
+	DB	$L$SEH_prolog_gcm_ghash_clmul_7-$L$SEH_begin_gcm_ghash_clmul_1
+	DB	168
+	DW	4
+	DB	$L$SEH_prolog_gcm_ghash_clmul_6-$L$SEH_begin_gcm_ghash_clmul_1
+	DB	152
+	DW	3
+	DB	$L$SEH_prolog_gcm_ghash_clmul_5-$L$SEH_begin_gcm_ghash_clmul_1
+	DB	136
+	DW	2
+	DB	$L$SEH_prolog_gcm_ghash_clmul_4-$L$SEH_begin_gcm_ghash_clmul_1
+	DB	120
+	DW	1
+	DB	$L$SEH_prolog_gcm_ghash_clmul_3-$L$SEH_begin_gcm_ghash_clmul_1
+	DB	104
+	DW	0
+	DB	$L$SEH_prolog_gcm_ghash_clmul_2-$L$SEH_begin_gcm_ghash_clmul_1
+	DB	1
+	DW	21
+
+$L$SEH_info_gcm_init_avx_0:
+	DB	1
+	DB	$L$SEH_prolog_gcm_init_avx_3-$L$SEH_begin_gcm_init_avx_1
+	DB	3
+	DB	0
+	DB	$L$SEH_prolog_gcm_init_avx_3-$L$SEH_begin_gcm_init_avx_1
+	DB	104
+	DW	0
+	DB	$L$SEH_prolog_gcm_init_avx_2-$L$SEH_begin_gcm_init_avx_1
+	DB	34
+
+$L$SEH_info_gcm_ghash_avx_0:
+	DB	1
+	DB	$L$SEH_prolog_gcm_ghash_avx_12-$L$SEH_begin_gcm_ghash_avx_1
+	DB	22
+	DB	0
+	DB	$L$SEH_prolog_gcm_ghash_avx_12-$L$SEH_begin_gcm_ghash_avx_1
+	DB	248
+	DW	9
+	DB	$L$SEH_prolog_gcm_ghash_avx_11-$L$SEH_begin_gcm_ghash_avx_1
+	DB	232
+	DW	8
+	DB	$L$SEH_prolog_gcm_ghash_avx_10-$L$SEH_begin_gcm_ghash_avx_1
+	DB	216
+	DW	7
+	DB	$L$SEH_prolog_gcm_ghash_avx_9-$L$SEH_begin_gcm_ghash_avx_1
+	DB	200
+	DW	6
+	DB	$L$SEH_prolog_gcm_ghash_avx_8-$L$SEH_begin_gcm_ghash_avx_1
+	DB	184
+	DW	5
+	DB	$L$SEH_prolog_gcm_ghash_avx_7-$L$SEH_begin_gcm_ghash_avx_1
+	DB	168
+	DW	4
+	DB	$L$SEH_prolog_gcm_ghash_avx_6-$L$SEH_begin_gcm_ghash_avx_1
+	DB	152
+	DW	3
+	DB	$L$SEH_prolog_gcm_ghash_avx_5-$L$SEH_begin_gcm_ghash_avx_1
+	DB	136
+	DW	2
+	DB	$L$SEH_prolog_gcm_ghash_avx_4-$L$SEH_begin_gcm_ghash_avx_1
+	DB	120
+	DW	1
+	DB	$L$SEH_prolog_gcm_ghash_avx_3-$L$SEH_begin_gcm_ghash_avx_1
+	DB	104
+	DW	0
+	DB	$L$SEH_prolog_gcm_ghash_avx_2-$L$SEH_begin_gcm_ghash_avx_1
+	DB	1
+	DW	21
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/ghashv8-armv7-linux.S b/gen/bcm/ghashv8-armv7-linux.S
new file mode 100644
index 0000000..fab4c12
--- /dev/null
+++ b/gen/bcm/ghashv8-armv7-linux.S
@@ -0,0 +1,246 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+.fpu	neon
+.code	32
+#undef	__thumb2__
+.globl	gcm_init_v8
+.hidden	gcm_init_v8
+.type	gcm_init_v8,%function
+.align	4
+gcm_init_v8:
+	AARCH64_VALID_CALL_TARGET
+	vld1.64	{q9},[r1]		@ load input H
+	vmov.i8	q11,#0xe1
+	vshl.i64	q11,q11,#57		@ 0xc2.0
+	vext.8	q3,q9,q9,#8
+	vshr.u64	q10,q11,#63
+	vdup.32	q9,d18[1]
+	vext.8	q8,q10,q11,#8		@ t0=0xc2....01
+	vshr.u64	q10,q3,#63
+	vshr.s32	q9,q9,#31		@ broadcast carry bit
+	vand	q10,q10,q8
+	vshl.i64	q3,q3,#1
+	vext.8	q10,q10,q10,#8
+	vand	q8,q8,q9
+	vorr	q3,q3,q10		@ H<<<=1
+	veor	q12,q3,q8		@ twisted H
+	vst1.64	{q12},[r0]!		@ store Htable[0]
+
+	@ calculate H^2
+	vext.8	q8,q12,q12,#8		@ Karatsuba pre-processing
+.byte	0xa8,0x0e,0xa8,0xf2	@ pmull q0,q12,q12
+	veor	q8,q8,q12
+.byte	0xa9,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q12
+.byte	0xa0,0x2e,0xa0,0xf2	@ pmull q1,q8,q8
+
+	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
+	veor	q10,q0,q2
+	veor	q1,q1,q9
+	veor	q1,q1,q10
+.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase
+
+	vmov	d4,d3		@ Xh|Xm - 256-bit result
+	vmov	d3,d0		@ Xm is rotated Xl
+	veor	q0,q1,q10
+
+	vext.8	q10,q0,q0,#8		@ 2nd phase
+.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
+	veor	q10,q10,q2
+	veor	q14,q0,q10
+
+	vext.8	q9,q14,q14,#8		@ Karatsuba pre-processing
+	veor	q9,q9,q14
+	vext.8	q13,q8,q9,#8		@ pack Karatsuba pre-processed
+	vst1.64	{q13,q14},[r0]!	@ store Htable[1..2]
+	bx	lr
+.size	gcm_init_v8,.-gcm_init_v8
+.globl	gcm_gmult_v8
+.hidden	gcm_gmult_v8
+.type	gcm_gmult_v8,%function
+.align	4
+gcm_gmult_v8:
+	AARCH64_VALID_CALL_TARGET
+	vld1.64	{q9},[r0]		@ load Xi
+	vmov.i8	q11,#0xe1
+	vld1.64	{q12,q13},[r1]	@ load twisted H, ...
+	vshl.u64	q11,q11,#57
+#ifndef __ARMEB__
+	vrev64.8	q9,q9
+#endif
+	vext.8	q3,q9,q9,#8
+
+.byte	0x86,0x0e,0xa8,0xf2	@ pmull q0,q12,q3		@ H.lo·Xi.lo
+	veor	q9,q9,q3		@ Karatsuba pre-processing
+.byte	0x87,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q3		@ H.hi·Xi.hi
+.byte	0xa2,0x2e,0xaa,0xf2	@ pmull q1,q13,q9		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
+	veor	q10,q0,q2
+	veor	q1,q1,q9
+	veor	q1,q1,q10
+.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase of reduction
+
+	vmov	d4,d3		@ Xh|Xm - 256-bit result
+	vmov	d3,d0		@ Xm is rotated Xl
+	veor	q0,q1,q10
+
+	vext.8	q10,q0,q0,#8		@ 2nd phase of reduction
+.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
+	veor	q10,q10,q2
+	veor	q0,q0,q10
+
+#ifndef __ARMEB__
+	vrev64.8	q0,q0
+#endif
+	vext.8	q0,q0,q0,#8
+	vst1.64	{q0},[r0]		@ write out Xi
+
+	bx	lr
+.size	gcm_gmult_v8,.-gcm_gmult_v8
+.globl	gcm_ghash_v8
+.hidden	gcm_ghash_v8
+.type	gcm_ghash_v8,%function
+.align	4
+gcm_ghash_v8:
+	AARCH64_VALID_CALL_TARGET
+	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ 32-bit ABI says so
+	vld1.64	{q0},[r0]		@ load [rotated] Xi
+						@ "[rotated]" means that
+						@ loaded value would have
+						@ to be rotated in order to
+						@ make it appear as in
+						@ algorithm specification
+	subs	r3,r3,#32		@ see if r3 is 32 or larger
+	mov	r12,#16		@ r12 is used as post-
+						@ increment for input pointer;
+						@ as loop is modulo-scheduled
+						@ r12 is zeroed just in time
+						@ to preclude overstepping
+						@ inp[len], which means that
+						@ last block[s] are actually
+						@ loaded twice, but last
+						@ copy is not processed
+	vld1.64	{q12,q13},[r1]!	@ load twisted H, ..., H^2
+	vmov.i8	q11,#0xe1
+	vld1.64	{q14},[r1]
+	moveq	r12,#0			@ is it time to zero r12?
+	vext.8	q0,q0,q0,#8		@ rotate Xi
+	vld1.64	{q8},[r2]!	@ load [rotated] I[0]
+	vshl.u64	q11,q11,#57		@ compose 0xc2.0 constant
+#ifndef __ARMEB__
+	vrev64.8	q8,q8
+	vrev64.8	q0,q0
+#endif
+	vext.8	q3,q8,q8,#8		@ rotate I[0]
+	blo	.Lodd_tail_v8		@ r3 was less than 32
+	vld1.64	{q9},[r2],r12	@ load [rotated] I[1]
+#ifndef __ARMEB__
+	vrev64.8	q9,q9
+#endif
+	vext.8	q7,q9,q9,#8
+	veor	q3,q3,q0		@ I[i]^=Xi
+.byte	0x8e,0x8e,0xa8,0xf2	@ pmull q4,q12,q7		@ H·Ii+1
+	veor	q9,q9,q7		@ Karatsuba pre-processing
+.byte	0x8f,0xce,0xa9,0xf2	@ pmull2 q6,q12,q7
+	b	.Loop_mod2x_v8
+
+.align	4
+.Loop_mod2x_v8:
+	vext.8	q10,q3,q3,#8
+	subs	r3,r3,#32		@ is there more data?
+.byte	0x86,0x0e,0xac,0xf2	@ pmull q0,q14,q3		@ H^2.lo·Xi.lo
+	movlo	r12,#0			@ is it time to zero r12?
+
+.byte	0xa2,0xae,0xaa,0xf2	@ pmull q5,q13,q9
+	veor	q10,q10,q3		@ Karatsuba pre-processing
+.byte	0x87,0x4e,0xad,0xf2	@ pmull2 q2,q14,q3		@ H^2.hi·Xi.hi
+	veor	q0,q0,q4		@ accumulate
+.byte	0xa5,0x2e,0xab,0xf2	@ pmull2 q1,q13,q10		@ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+	vld1.64	{q8},[r2],r12	@ load [rotated] I[i+2]
+
+	veor	q2,q2,q6
+	moveq	r12,#0			@ is it time to zero r12?
+	veor	q1,q1,q5
+
+	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
+	veor	q10,q0,q2
+	veor	q1,q1,q9
+	vld1.64	{q9},[r2],r12	@ load [rotated] I[i+3]
+#ifndef __ARMEB__
+	vrev64.8	q8,q8
+#endif
+	veor	q1,q1,q10
+.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase of reduction
+
+#ifndef __ARMEB__
+	vrev64.8	q9,q9
+#endif
+	vmov	d4,d3		@ Xh|Xm - 256-bit result
+	vmov	d3,d0		@ Xm is rotated Xl
+	vext.8	q7,q9,q9,#8
+	vext.8	q3,q8,q8,#8
+	veor	q0,q1,q10
+.byte	0x8e,0x8e,0xa8,0xf2	@ pmull q4,q12,q7		@ H·Ii+1
+	veor	q3,q3,q2		@ accumulate q3 early
+
+	vext.8	q10,q0,q0,#8		@ 2nd phase of reduction
+.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
+	veor	q3,q3,q10
+	veor	q9,q9,q7		@ Karatsuba pre-processing
+	veor	q3,q3,q0
+.byte	0x8f,0xce,0xa9,0xf2	@ pmull2 q6,q12,q7
+	bhs	.Loop_mod2x_v8		@ there was at least 32 more bytes
+
+	veor	q2,q2,q10
+	vext.8	q3,q8,q8,#8		@ re-construct q3
+	adds	r3,r3,#32		@ re-construct r3
+	veor	q0,q0,q2		@ re-construct q0
+	beq	.Ldone_v8		@ is r3 zero?
+.Lodd_tail_v8:
+	vext.8	q10,q0,q0,#8
+	veor	q3,q3,q0		@ inp^=Xi
+	veor	q9,q8,q10		@ q9 is rotated inp^Xi
+
+.byte	0x86,0x0e,0xa8,0xf2	@ pmull q0,q12,q3		@ H.lo·Xi.lo
+	veor	q9,q9,q3		@ Karatsuba pre-processing
+.byte	0x87,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q3		@ H.hi·Xi.hi
+.byte	0xa2,0x2e,0xaa,0xf2	@ pmull q1,q13,q9		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
+	veor	q10,q0,q2
+	veor	q1,q1,q9
+	veor	q1,q1,q10
+.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase of reduction
+
+	vmov	d4,d3		@ Xh|Xm - 256-bit result
+	vmov	d3,d0		@ Xm is rotated Xl
+	veor	q0,q1,q10
+
+	vext.8	q10,q0,q0,#8		@ 2nd phase of reduction
+.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
+	veor	q10,q10,q2
+	veor	q0,q0,q10
+
+.Ldone_v8:
+#ifndef __ARMEB__
+	vrev64.8	q0,q0
+#endif
+	vext.8	q0,q0,q0,#8
+	vst1.64	{q0},[r0]		@ write out Xi
+
+	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ 32-bit ABI says so
+	bx	lr
+.size	gcm_ghash_v8,.-gcm_ghash_v8
+.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#endif
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__)
diff --git a/gen/bcm/ghashv8-armv8-apple.S b/gen/bcm/ghashv8-armv8-apple.S
new file mode 100644
index 0000000..6bc8a4f
--- /dev/null
+++ b/gen/bcm/ghashv8-armv8-apple.S
@@ -0,0 +1,565 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+
+.globl	_gcm_init_v8
+.private_extern	_gcm_init_v8
+
+.align	4
+_gcm_init_v8:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{v17.2d},[x1]		//load input H
+	movi	v19.16b,#0xe1
+	shl	v19.2d,v19.2d,#57		//0xc2.0
+	ext	v3.16b,v17.16b,v17.16b,#8
+	ushr	v18.2d,v19.2d,#63
+	dup	v17.4s,v17.s[1]
+	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
+	ushr	v18.2d,v3.2d,#63
+	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
+	and	v18.16b,v18.16b,v16.16b
+	shl	v3.2d,v3.2d,#1
+	ext	v18.16b,v18.16b,v18.16b,#8
+	and	v16.16b,v16.16b,v17.16b
+	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
+	eor	v20.16b,v3.16b,v16.16b		//twisted H
+	st1	{v20.2d},[x0],#16		//store Htable[0]
+
+	//calculate H^2
+	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
+	pmull	v0.1q,v20.1d,v20.1d
+	eor	v16.16b,v16.16b,v20.16b
+	pmull2	v2.1q,v20.2d,v20.2d
+	pmull	v1.1q,v16.1d,v16.1d
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase
+
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v22.16b,v0.16b,v18.16b
+
+	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
+	eor	v17.16b,v17.16b,v22.16b
+	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
+	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
+	//calculate H^3 and H^4
+	pmull	v0.1q,v20.1d, v22.1d
+	pmull	v5.1q,v22.1d,v22.1d
+	pmull2	v2.1q,v20.2d, v22.2d
+	pmull2	v7.1q,v22.2d,v22.2d
+	pmull	v1.1q,v16.1d,v17.1d
+	pmull	v6.1q,v17.1d,v17.1d
+
+	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	ext	v17.16b,v5.16b,v7.16b,#8
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v16.16b
+	eor	v4.16b,v5.16b,v7.16b
+	eor	v6.16b,v6.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase
+	eor	v6.16b,v6.16b,v4.16b
+	pmull	v4.1q,v5.1d,v19.1d
+
+	ins	v2.d[0],v1.d[1]
+	ins	v7.d[0],v6.d[1]
+	ins	v1.d[1],v0.d[0]
+	ins	v6.d[1],v5.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+	eor	v5.16b,v6.16b,v4.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
+	ext	v4.16b,v5.16b,v5.16b,#8
+	pmull	v0.1q,v0.1d,v19.1d
+	pmull	v5.1q,v5.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v4.16b,v4.16b,v7.16b
+	eor	v20.16b, v0.16b,v18.16b		//H^3
+	eor	v22.16b,v5.16b,v4.16b		//H^4
+
+	ext	v16.16b,v20.16b, v20.16b,#8		//Karatsuba pre-processing
+	ext	v17.16b,v22.16b,v22.16b,#8
+	eor	v16.16b,v16.16b,v20.16b
+	eor	v17.16b,v17.16b,v22.16b
+	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
+	st1	{v20.2d,v21.2d,v22.2d},[x0]		//store Htable[3..5]
+	ret
+
+.globl	_gcm_gmult_v8
+.private_extern	_gcm_gmult_v8
+
+.align	4
+_gcm_gmult_v8:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{v17.2d},[x0]		//load Xi
+	movi	v19.16b,#0xe1
+	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
+	shl	v19.2d,v19.2d,#57
+#ifndef __AARCH64EB__
+	rev64	v17.16b,v17.16b
+#endif
+	ext	v3.16b,v17.16b,v17.16b,#8
+
+	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
+	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
+	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
+	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+
+#ifndef __AARCH64EB__
+	rev64	v0.16b,v0.16b
+#endif
+	ext	v0.16b,v0.16b,v0.16b,#8
+	st1	{v0.2d},[x0]		//write out Xi
+
+	ret
+
+.globl	_gcm_ghash_v8
+.private_extern	_gcm_ghash_v8
+
+.align	4
+_gcm_ghash_v8:
+	AARCH64_VALID_CALL_TARGET
+	cmp	x3,#64
+	b.hs	Lgcm_ghash_v8_4x
+	ld1	{v0.2d},[x0]		//load [rotated] Xi
+						//"[rotated]" means that
+						//loaded value would have
+						//to be rotated in order to
+						//make it appear as in
+						//algorithm specification
+	subs	x3,x3,#32		//see if x3 is 32 or larger
+	mov	x12,#16		//x12 is used as post-
+						//increment for input pointer;
+						//as loop is modulo-scheduled
+						//x12 is zeroed just in time
+						//to preclude overstepping
+						//inp[len], which means that
+						//last block[s] are actually
+						//loaded twice, but last
+						//copy is not processed
+	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
+	movi	v19.16b,#0xe1
+	ld1	{v22.2d},[x1]
+	csel	x12,xzr,x12,eq			//is it time to zero x12?
+	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
+	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
+	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
+#ifndef __AARCH64EB__
+	rev64	v16.16b,v16.16b
+	rev64	v0.16b,v0.16b
+#endif
+	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
+	b.lo	Lodd_tail_v8		//x3 was less than 32
+	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
+#ifndef __AARCH64EB__
+	rev64	v17.16b,v17.16b
+#endif
+	ext	v7.16b,v17.16b,v17.16b,#8
+	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
+	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
+	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
+	pmull2	v6.1q,v20.2d,v7.2d
+	b	Loop_mod2x_v8
+
+.align	4
+Loop_mod2x_v8:
+	ext	v18.16b,v3.16b,v3.16b,#8
+	subs	x3,x3,#32		//is there more data?
+	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
+	csel	x12,xzr,x12,lo			//is it time to zero x12?
+
+	pmull	v5.1q,v21.1d,v17.1d
+	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
+	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
+	eor	v0.16b,v0.16b,v4.16b		//accumulate
+	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]
+
+	eor	v2.16b,v2.16b,v6.16b
+	csel	x12,xzr,x12,eq			//is it time to zero x12?
+	eor	v1.16b,v1.16b,v5.16b
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
+#ifndef __AARCH64EB__
+	rev64	v16.16b,v16.16b
+#endif
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+
+#ifndef __AARCH64EB__
+	rev64	v17.16b,v17.16b
+#endif
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	ext	v7.16b,v17.16b,v17.16b,#8
+	ext	v3.16b,v16.16b,v16.16b,#8
+	eor	v0.16b,v1.16b,v18.16b
+	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
+	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v3.16b,v3.16b,v18.16b
+	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
+	eor	v3.16b,v3.16b,v0.16b
+	pmull2	v6.1q,v20.2d,v7.2d
+	b.hs	Loop_mod2x_v8		//there was at least 32 more bytes
+
+	eor	v2.16b,v2.16b,v18.16b
+	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
+	adds	x3,x3,#32		//re-construct x3
+	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
+	b.eq	Ldone_v8		//is x3 zero?
+Lodd_tail_v8:
+	ext	v18.16b,v0.16b,v0.16b,#8
+	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
+	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi
+
+	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
+	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
+	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
+	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+
+Ldone_v8:
+#ifndef __AARCH64EB__
+	rev64	v0.16b,v0.16b
+#endif
+	ext	v0.16b,v0.16b,v0.16b,#8
+	st1	{v0.2d},[x0]		//write out Xi
+
+	ret
+
+
+.align	4
+gcm_ghash_v8_4x:
+Lgcm_ghash_v8_4x:
+	ld1	{v0.2d},[x0]		//load [rotated] Xi
+	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
+	movi	v19.16b,#0xe1
+	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
+	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
+
+	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
+#ifndef __AARCH64EB__
+	rev64	v0.16b,v0.16b
+	rev64	v5.16b,v5.16b
+	rev64	v6.16b,v6.16b
+	rev64	v7.16b,v7.16b
+	rev64	v4.16b,v4.16b
+#endif
+	ext	v25.16b,v7.16b,v7.16b,#8
+	ext	v24.16b,v6.16b,v6.16b,#8
+	ext	v23.16b,v5.16b,v5.16b,#8
+
+	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
+	eor	v7.16b,v7.16b,v25.16b
+	pmull2	v31.1q,v20.2d,v25.2d
+	pmull	v30.1q,v21.1d,v7.1d
+
+	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
+	eor	v6.16b,v6.16b,v24.16b
+	pmull2	v24.1q,v22.2d,v24.2d
+	pmull2	v6.1q,v21.2d,v6.2d
+
+	eor	v29.16b,v29.16b,v16.16b
+	eor	v31.16b,v31.16b,v24.16b
+	eor	v30.16b,v30.16b,v6.16b
+
+	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+	pmull2	v23.1q,v26.2d,v23.2d
+	pmull	v5.1q,v27.1d,v5.1d
+
+	eor	v29.16b,v29.16b,v7.16b
+	eor	v31.16b,v31.16b,v23.16b
+	eor	v30.16b,v30.16b,v5.16b
+
+	subs	x3,x3,#128
+	b.lo	Ltail4x
+
+	b	Loop4x
+
+.align	4
+Loop4x:
+	eor	v16.16b,v4.16b,v0.16b
+	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
+	ext	v3.16b,v16.16b,v16.16b,#8
+#ifndef __AARCH64EB__
+	rev64	v5.16b,v5.16b
+	rev64	v6.16b,v6.16b
+	rev64	v7.16b,v7.16b
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v28.2d,v3.2d
+	ext	v25.16b,v7.16b,v7.16b,#8
+	pmull2	v1.1q,v27.2d,v16.2d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	ext	v24.16b,v6.16b,v6.16b,#8
+	eor	v1.16b,v1.16b,v30.16b
+	ext	v23.16b,v5.16b,v5.16b,#8
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
+	eor	v7.16b,v7.16b,v25.16b
+	eor	v1.16b,v1.16b,v17.16b
+	pmull2	v31.1q,v20.2d,v25.2d
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v30.1q,v21.1d,v7.1d
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
+	eor	v6.16b,v6.16b,v24.16b
+	pmull2	v24.1q,v22.2d,v24.2d
+	eor	v0.16b,v1.16b,v18.16b
+	pmull2	v6.1q,v21.2d,v6.2d
+
+	eor	v29.16b,v29.16b,v16.16b
+	eor	v31.16b,v31.16b,v24.16b
+	eor	v30.16b,v30.16b,v6.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+	eor	v18.16b,v18.16b,v2.16b
+	pmull2	v23.1q,v26.2d,v23.2d
+	pmull	v5.1q,v27.1d,v5.1d
+
+	eor	v0.16b,v0.16b,v18.16b
+	eor	v29.16b,v29.16b,v7.16b
+	eor	v31.16b,v31.16b,v23.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+	eor	v30.16b,v30.16b,v5.16b
+
+	subs	x3,x3,#64
+	b.hs	Loop4x
+
+Ltail4x:
+	eor	v16.16b,v4.16b,v0.16b
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v28.2d,v3.2d
+	pmull2	v1.1q,v27.2d,v16.2d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	eor	v1.16b,v1.16b,v30.16b
+
+	adds	x3,x3,#64
+	b.eq	Ldone4x
+
+	cmp	x3,#32
+	b.lo	Lone
+	b.eq	Ltwo
+Lthree:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v4.2d,v5.2d,v6.2d},[x2]
+	eor	v1.16b,v1.16b,v18.16b
+#ifndef	__AARCH64EB__
+	rev64	v5.16b,v5.16b
+	rev64	v6.16b,v6.16b
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	ext	v24.16b,v6.16b,v6.16b,#8
+	ext	v23.16b,v5.16b,v5.16b,#8
+	eor	v0.16b,v1.16b,v18.16b
+
+	pmull	v29.1q,v20.1d,v24.1d		//H·Ii+2
+	eor	v6.16b,v6.16b,v24.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	pmull2	v31.1q,v20.2d,v24.2d
+	pmull	v30.1q,v21.1d,v6.1d
+	eor	v0.16b,v0.16b,v18.16b
+	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+	pmull2	v23.1q,v22.2d,v23.2d
+	eor	v16.16b,v4.16b,v0.16b
+	pmull2	v5.1q,v21.2d,v5.2d
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	eor	v29.16b,v29.16b,v7.16b
+	eor	v31.16b,v31.16b,v23.16b
+	eor	v30.16b,v30.16b,v5.16b
+
+	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v26.2d,v3.2d
+	pmull	v1.1q,v27.1d,v16.1d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	eor	v1.16b,v1.16b,v30.16b
+	b	Ldone4x
+
+.align	4
+Ltwo:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v4.2d,v5.2d},[x2]
+	eor	v1.16b,v1.16b,v18.16b
+#ifndef	__AARCH64EB__
+	rev64	v5.16b,v5.16b
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	ext	v23.16b,v5.16b,v5.16b,#8
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+	pmull	v29.1q,v20.1d,v23.1d		//H·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+
+	eor	v16.16b,v4.16b,v0.16b
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	pmull2	v31.1q,v20.2d,v23.2d
+	pmull	v30.1q,v21.1d,v5.1d
+
+	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v22.2d,v3.2d
+	pmull2	v1.1q,v21.2d,v16.2d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	eor	v1.16b,v1.16b,v30.16b
+	b	Ldone4x
+
+.align	4
+Lone:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v4.2d},[x2]
+	eor	v1.16b,v1.16b,v18.16b
+#ifndef	__AARCH64EB__
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+	eor	v16.16b,v4.16b,v0.16b
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	pmull	v0.1q,v20.1d,v3.1d
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v20.2d,v3.2d
+	pmull	v1.1q,v21.1d,v16.1d
+
+Ldone4x:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+#ifndef __AARCH64EB__
+	rev64	v0.16b,v0.16b
+#endif
+	st1	{v0.2d},[x0]		//write out Xi
+
+	ret
+
+.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#endif
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/gen/bcm/ghashv8-armv8-linux.S b/gen/bcm/ghashv8-armv8-linux.S
new file mode 100644
index 0000000..de6f712
--- /dev/null
+++ b/gen/bcm/ghashv8-armv8-linux.S
@@ -0,0 +1,565 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+.arch	armv8-a+crypto
+.globl	gcm_init_v8
+.hidden	gcm_init_v8
+.type	gcm_init_v8,%function
+.align	4
+gcm_init_v8:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{v17.2d},[x1]		//load input H
+	movi	v19.16b,#0xe1
+	shl	v19.2d,v19.2d,#57		//0xc2.0
+	ext	v3.16b,v17.16b,v17.16b,#8
+	ushr	v18.2d,v19.2d,#63
+	dup	v17.4s,v17.s[1]
+	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
+	ushr	v18.2d,v3.2d,#63
+	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
+	and	v18.16b,v18.16b,v16.16b
+	shl	v3.2d,v3.2d,#1
+	ext	v18.16b,v18.16b,v18.16b,#8
+	and	v16.16b,v16.16b,v17.16b
+	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
+	eor	v20.16b,v3.16b,v16.16b		//twisted H
+	st1	{v20.2d},[x0],#16		//store Htable[0]
+
+	//calculate H^2
+	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
+	pmull	v0.1q,v20.1d,v20.1d
+	eor	v16.16b,v16.16b,v20.16b
+	pmull2	v2.1q,v20.2d,v20.2d
+	pmull	v1.1q,v16.1d,v16.1d
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase
+
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v22.16b,v0.16b,v18.16b
+
+	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
+	eor	v17.16b,v17.16b,v22.16b
+	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
+	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
+	//calculate H^3 and H^4
+	pmull	v0.1q,v20.1d, v22.1d
+	pmull	v5.1q,v22.1d,v22.1d
+	pmull2	v2.1q,v20.2d, v22.2d
+	pmull2	v7.1q,v22.2d,v22.2d
+	pmull	v1.1q,v16.1d,v17.1d
+	pmull	v6.1q,v17.1d,v17.1d
+
+	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	ext	v17.16b,v5.16b,v7.16b,#8
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v16.16b
+	eor	v4.16b,v5.16b,v7.16b
+	eor	v6.16b,v6.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase
+	eor	v6.16b,v6.16b,v4.16b
+	pmull	v4.1q,v5.1d,v19.1d
+
+	ins	v2.d[0],v1.d[1]
+	ins	v7.d[0],v6.d[1]
+	ins	v1.d[1],v0.d[0]
+	ins	v6.d[1],v5.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+	eor	v5.16b,v6.16b,v4.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
+	ext	v4.16b,v5.16b,v5.16b,#8
+	pmull	v0.1q,v0.1d,v19.1d
+	pmull	v5.1q,v5.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v4.16b,v4.16b,v7.16b
+	eor	v20.16b, v0.16b,v18.16b		//H^3
+	eor	v22.16b,v5.16b,v4.16b		//H^4
+
+	ext	v16.16b,v20.16b, v20.16b,#8		//Karatsuba pre-processing
+	ext	v17.16b,v22.16b,v22.16b,#8
+	eor	v16.16b,v16.16b,v20.16b
+	eor	v17.16b,v17.16b,v22.16b
+	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
+	st1	{v20.2d,v21.2d,v22.2d},[x0]		//store Htable[3..5]
+	ret
+.size	gcm_init_v8,.-gcm_init_v8
+.globl	gcm_gmult_v8
+.hidden	gcm_gmult_v8
+.type	gcm_gmult_v8,%function
+.align	4
+gcm_gmult_v8:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{v17.2d},[x0]		//load Xi
+	movi	v19.16b,#0xe1
+	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
+	shl	v19.2d,v19.2d,#57
+#ifndef __AARCH64EB__
+	rev64	v17.16b,v17.16b
+#endif
+	ext	v3.16b,v17.16b,v17.16b,#8
+
+	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
+	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
+	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
+	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+
+#ifndef __AARCH64EB__
+	rev64	v0.16b,v0.16b
+#endif
+	ext	v0.16b,v0.16b,v0.16b,#8
+	st1	{v0.2d},[x0]		//write out Xi
+
+	ret
+.size	gcm_gmult_v8,.-gcm_gmult_v8
+.globl	gcm_ghash_v8
+.hidden	gcm_ghash_v8
+.type	gcm_ghash_v8,%function
+.align	4
+gcm_ghash_v8:
+	AARCH64_VALID_CALL_TARGET
+	cmp	x3,#64
+	b.hs	.Lgcm_ghash_v8_4x
+	ld1	{v0.2d},[x0]		//load [rotated] Xi
+						//"[rotated]" means that
+						//loaded value would have
+						//to be rotated in order to
+						//make it appear as in
+						//algorithm specification
+	subs	x3,x3,#32		//see if x3 is 32 or larger
+	mov	x12,#16		//x12 is used as post-
+						//increment for input pointer;
+						//as loop is modulo-scheduled
+						//x12 is zeroed just in time
+						//to preclude overstepping
+						//inp[len], which means that
+						//last block[s] are actually
+						//loaded twice, but last
+						//copy is not processed
+	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
+	movi	v19.16b,#0xe1
+	ld1	{v22.2d},[x1]
+	csel	x12,xzr,x12,eq			//is it time to zero x12?
+	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
+	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
+	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
+#ifndef __AARCH64EB__
+	rev64	v16.16b,v16.16b
+	rev64	v0.16b,v0.16b
+#endif
+	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
+	b.lo	.Lodd_tail_v8		//x3 was less than 32
+	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
+#ifndef __AARCH64EB__
+	rev64	v17.16b,v17.16b
+#endif
+	ext	v7.16b,v17.16b,v17.16b,#8
+	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
+	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
+	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
+	pmull2	v6.1q,v20.2d,v7.2d
+	b	.Loop_mod2x_v8
+
+.align	4
+.Loop_mod2x_v8:
+	ext	v18.16b,v3.16b,v3.16b,#8
+	subs	x3,x3,#32		//is there more data?
+	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
+	csel	x12,xzr,x12,lo			//is it time to zero x12?
+
+	pmull	v5.1q,v21.1d,v17.1d
+	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
+	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
+	eor	v0.16b,v0.16b,v4.16b		//accumulate
+	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]
+
+	eor	v2.16b,v2.16b,v6.16b
+	csel	x12,xzr,x12,eq			//is it time to zero x12?
+	eor	v1.16b,v1.16b,v5.16b
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
+#ifndef __AARCH64EB__
+	rev64	v16.16b,v16.16b
+#endif
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+
+#ifndef __AARCH64EB__
+	rev64	v17.16b,v17.16b
+#endif
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	ext	v7.16b,v17.16b,v17.16b,#8
+	ext	v3.16b,v16.16b,v16.16b,#8
+	eor	v0.16b,v1.16b,v18.16b
+	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
+	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v3.16b,v3.16b,v18.16b
+	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
+	eor	v3.16b,v3.16b,v0.16b
+	pmull2	v6.1q,v20.2d,v7.2d
+	b.hs	.Loop_mod2x_v8		//there was at least 32 more bytes
+
+	eor	v2.16b,v2.16b,v18.16b
+	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
+	adds	x3,x3,#32		//re-construct x3
+	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
+	b.eq	.Ldone_v8		//is x3 zero?
+.Lodd_tail_v8:
+	ext	v18.16b,v0.16b,v0.16b,#8
+	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
+	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi
+
+	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
+	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
+	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
+	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+
+.Ldone_v8:
+#ifndef __AARCH64EB__
+	rev64	v0.16b,v0.16b
+#endif
+	ext	v0.16b,v0.16b,v0.16b,#8
+	st1	{v0.2d},[x0]		//write out Xi
+
+	ret
+.size	gcm_ghash_v8,.-gcm_ghash_v8
+.type	gcm_ghash_v8_4x,%function
+.align	4
+gcm_ghash_v8_4x:
+.Lgcm_ghash_v8_4x:
+	ld1	{v0.2d},[x0]		//load [rotated] Xi
+	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
+	movi	v19.16b,#0xe1
+	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
+	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
+
+	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
+#ifndef __AARCH64EB__
+	rev64	v0.16b,v0.16b
+	rev64	v5.16b,v5.16b
+	rev64	v6.16b,v6.16b
+	rev64	v7.16b,v7.16b
+	rev64	v4.16b,v4.16b
+#endif
+	ext	v25.16b,v7.16b,v7.16b,#8
+	ext	v24.16b,v6.16b,v6.16b,#8
+	ext	v23.16b,v5.16b,v5.16b,#8
+
+	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
+	eor	v7.16b,v7.16b,v25.16b
+	pmull2	v31.1q,v20.2d,v25.2d
+	pmull	v30.1q,v21.1d,v7.1d
+
+	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
+	eor	v6.16b,v6.16b,v24.16b
+	pmull2	v24.1q,v22.2d,v24.2d
+	pmull2	v6.1q,v21.2d,v6.2d
+
+	eor	v29.16b,v29.16b,v16.16b
+	eor	v31.16b,v31.16b,v24.16b
+	eor	v30.16b,v30.16b,v6.16b
+
+	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+	pmull2	v23.1q,v26.2d,v23.2d
+	pmull	v5.1q,v27.1d,v5.1d
+
+	eor	v29.16b,v29.16b,v7.16b
+	eor	v31.16b,v31.16b,v23.16b
+	eor	v30.16b,v30.16b,v5.16b
+
+	subs	x3,x3,#128
+	b.lo	.Ltail4x
+
+	b	.Loop4x
+
+.align	4
+.Loop4x:
+	eor	v16.16b,v4.16b,v0.16b
+	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
+	ext	v3.16b,v16.16b,v16.16b,#8
+#ifndef __AARCH64EB__
+	rev64	v5.16b,v5.16b
+	rev64	v6.16b,v6.16b
+	rev64	v7.16b,v7.16b
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v28.2d,v3.2d
+	ext	v25.16b,v7.16b,v7.16b,#8
+	pmull2	v1.1q,v27.2d,v16.2d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	ext	v24.16b,v6.16b,v6.16b,#8
+	eor	v1.16b,v1.16b,v30.16b
+	ext	v23.16b,v5.16b,v5.16b,#8
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
+	eor	v7.16b,v7.16b,v25.16b
+	eor	v1.16b,v1.16b,v17.16b
+	pmull2	v31.1q,v20.2d,v25.2d
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v30.1q,v21.1d,v7.1d
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
+	eor	v6.16b,v6.16b,v24.16b
+	pmull2	v24.1q,v22.2d,v24.2d
+	eor	v0.16b,v1.16b,v18.16b
+	pmull2	v6.1q,v21.2d,v6.2d
+
+	eor	v29.16b,v29.16b,v16.16b
+	eor	v31.16b,v31.16b,v24.16b
+	eor	v30.16b,v30.16b,v6.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+	eor	v18.16b,v18.16b,v2.16b
+	pmull2	v23.1q,v26.2d,v23.2d
+	pmull	v5.1q,v27.1d,v5.1d
+
+	eor	v0.16b,v0.16b,v18.16b
+	eor	v29.16b,v29.16b,v7.16b
+	eor	v31.16b,v31.16b,v23.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+	eor	v30.16b,v30.16b,v5.16b
+
+	subs	x3,x3,#64
+	b.hs	.Loop4x
+
+.Ltail4x:
+	eor	v16.16b,v4.16b,v0.16b
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v28.2d,v3.2d
+	pmull2	v1.1q,v27.2d,v16.2d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	eor	v1.16b,v1.16b,v30.16b
+
+	adds	x3,x3,#64
+	b.eq	.Ldone4x
+
+	cmp	x3,#32
+	b.lo	.Lone
+	b.eq	.Ltwo
+.Lthree:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v4.2d,v5.2d,v6.2d},[x2]
+	eor	v1.16b,v1.16b,v18.16b
+#ifndef	__AARCH64EB__
+	rev64	v5.16b,v5.16b
+	rev64	v6.16b,v6.16b
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	ext	v24.16b,v6.16b,v6.16b,#8
+	ext	v23.16b,v5.16b,v5.16b,#8
+	eor	v0.16b,v1.16b,v18.16b
+
+	pmull	v29.1q,v20.1d,v24.1d		//H·Ii+2
+	eor	v6.16b,v6.16b,v24.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	pmull2	v31.1q,v20.2d,v24.2d
+	pmull	v30.1q,v21.1d,v6.1d
+	eor	v0.16b,v0.16b,v18.16b
+	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+	pmull2	v23.1q,v22.2d,v23.2d
+	eor	v16.16b,v4.16b,v0.16b
+	pmull2	v5.1q,v21.2d,v5.2d
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	eor	v29.16b,v29.16b,v7.16b
+	eor	v31.16b,v31.16b,v23.16b
+	eor	v30.16b,v30.16b,v5.16b
+
+	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v26.2d,v3.2d
+	pmull	v1.1q,v27.1d,v16.1d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	eor	v1.16b,v1.16b,v30.16b
+	b	.Ldone4x
+
+.align	4
+.Ltwo:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v4.2d,v5.2d},[x2]
+	eor	v1.16b,v1.16b,v18.16b
+#ifndef	__AARCH64EB__
+	rev64	v5.16b,v5.16b
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	ext	v23.16b,v5.16b,v5.16b,#8
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+	pmull	v29.1q,v20.1d,v23.1d		//H·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+
+	eor	v16.16b,v4.16b,v0.16b
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	pmull2	v31.1q,v20.2d,v23.2d
+	pmull	v30.1q,v21.1d,v5.1d
+
+	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v22.2d,v3.2d
+	pmull2	v1.1q,v21.2d,v16.2d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	eor	v1.16b,v1.16b,v30.16b
+	b	.Ldone4x
+
+.align	4
+.Lone:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v4.2d},[x2]
+	eor	v1.16b,v1.16b,v18.16b
+#ifndef	__AARCH64EB__
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+	eor	v16.16b,v4.16b,v0.16b
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	pmull	v0.1q,v20.1d,v3.1d
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v20.2d,v3.2d
+	pmull	v1.1q,v21.1d,v16.1d
+
+.Ldone4x:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+#ifndef __AARCH64EB__
+	rev64	v0.16b,v0.16b
+#endif
+	st1	{v0.2d},[x0]		//write out Xi
+
+	ret
+.size	gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
+.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#endif
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/bcm/ghashv8-armv8-win.S b/gen/bcm/ghashv8-armv8-win.S
new file mode 100644
index 0000000..0be9ac6
--- /dev/null
+++ b/gen/bcm/ghashv8-armv8-win.S
@@ -0,0 +1,573 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+.arch	armv8-a+crypto
+.globl	gcm_init_v8
+
+.def gcm_init_v8
+   .type 32
+.endef
+.align	4
+gcm_init_v8:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{v17.2d},[x1]		//load input H
+	movi	v19.16b,#0xe1
+	shl	v19.2d,v19.2d,#57		//0xc2.0
+	ext	v3.16b,v17.16b,v17.16b,#8
+	ushr	v18.2d,v19.2d,#63
+	dup	v17.4s,v17.s[1]
+	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
+	ushr	v18.2d,v3.2d,#63
+	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
+	and	v18.16b,v18.16b,v16.16b
+	shl	v3.2d,v3.2d,#1
+	ext	v18.16b,v18.16b,v18.16b,#8
+	and	v16.16b,v16.16b,v17.16b
+	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
+	eor	v20.16b,v3.16b,v16.16b		//twisted H
+	st1	{v20.2d},[x0],#16		//store Htable[0]
+
+	//calculate H^2
+	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
+	pmull	v0.1q,v20.1d,v20.1d
+	eor	v16.16b,v16.16b,v20.16b
+	pmull2	v2.1q,v20.2d,v20.2d
+	pmull	v1.1q,v16.1d,v16.1d
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase
+
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v22.16b,v0.16b,v18.16b
+
+	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
+	eor	v17.16b,v17.16b,v22.16b
+	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
+	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
+	//calculate H^3 and H^4
+	pmull	v0.1q,v20.1d, v22.1d
+	pmull	v5.1q,v22.1d,v22.1d
+	pmull2	v2.1q,v20.2d, v22.2d
+	pmull2	v7.1q,v22.2d,v22.2d
+	pmull	v1.1q,v16.1d,v17.1d
+	pmull	v6.1q,v17.1d,v17.1d
+
+	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	ext	v17.16b,v5.16b,v7.16b,#8
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v16.16b
+	eor	v4.16b,v5.16b,v7.16b
+	eor	v6.16b,v6.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase
+	eor	v6.16b,v6.16b,v4.16b
+	pmull	v4.1q,v5.1d,v19.1d
+
+	ins	v2.d[0],v1.d[1]
+	ins	v7.d[0],v6.d[1]
+	ins	v1.d[1],v0.d[0]
+	ins	v6.d[1],v5.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+	eor	v5.16b,v6.16b,v4.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
+	ext	v4.16b,v5.16b,v5.16b,#8
+	pmull	v0.1q,v0.1d,v19.1d
+	pmull	v5.1q,v5.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v4.16b,v4.16b,v7.16b
+	eor	v20.16b, v0.16b,v18.16b		//H^3
+	eor	v22.16b,v5.16b,v4.16b		//H^4
+
+	ext	v16.16b,v20.16b, v20.16b,#8		//Karatsuba pre-processing
+	ext	v17.16b,v22.16b,v22.16b,#8
+	eor	v16.16b,v16.16b,v20.16b
+	eor	v17.16b,v17.16b,v22.16b
+	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
+	st1	{v20.2d,v21.2d,v22.2d},[x0]		//store Htable[3..5]
+	ret
+
+.globl	gcm_gmult_v8
+
+.def gcm_gmult_v8
+   .type 32
+.endef
+.align	4
+gcm_gmult_v8:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{v17.2d},[x0]		//load Xi
+	movi	v19.16b,#0xe1
+	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
+	shl	v19.2d,v19.2d,#57
+#ifndef __AARCH64EB__
+	rev64	v17.16b,v17.16b
+#endif
+	ext	v3.16b,v17.16b,v17.16b,#8
+
+	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
+	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
+	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
+	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+
+#ifndef __AARCH64EB__
+	rev64	v0.16b,v0.16b
+#endif
+	ext	v0.16b,v0.16b,v0.16b,#8
+	st1	{v0.2d},[x0]		//write out Xi
+
+	ret
+
+.globl	gcm_ghash_v8
+
+.def gcm_ghash_v8
+   .type 32
+.endef
+.align	4
+gcm_ghash_v8:
+	AARCH64_VALID_CALL_TARGET
+	cmp	x3,#64
+	b.hs	Lgcm_ghash_v8_4x
+	ld1	{v0.2d},[x0]		//load [rotated] Xi
+						//"[rotated]" means that
+						//loaded value would have
+						//to be rotated in order to
+						//make it appear as in
+						//algorithm specification
+	subs	x3,x3,#32		//see if x3 is 32 or larger
+	mov	x12,#16		//x12 is used as post-
+						//increment for input pointer;
+						//as loop is modulo-scheduled
+						//x12 is zeroed just in time
+						//to preclude overstepping
+						//inp[len], which means that
+						//last block[s] are actually
+						//loaded twice, but last
+						//copy is not processed
+	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
+	movi	v19.16b,#0xe1
+	ld1	{v22.2d},[x1]
+	csel	x12,xzr,x12,eq			//is it time to zero x12?
+	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
+	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
+	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
+#ifndef __AARCH64EB__
+	rev64	v16.16b,v16.16b
+	rev64	v0.16b,v0.16b
+#endif
+	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
+	b.lo	Lodd_tail_v8		//x3 was less than 32
+	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
+#ifndef __AARCH64EB__
+	rev64	v17.16b,v17.16b
+#endif
+	ext	v7.16b,v17.16b,v17.16b,#8
+	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
+	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
+	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
+	pmull2	v6.1q,v20.2d,v7.2d
+	b	Loop_mod2x_v8
+
+.align	4
+Loop_mod2x_v8:
+	ext	v18.16b,v3.16b,v3.16b,#8
+	subs	x3,x3,#32		//is there more data?
+	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
+	csel	x12,xzr,x12,lo			//is it time to zero x12?
+
+	pmull	v5.1q,v21.1d,v17.1d
+	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
+	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
+	eor	v0.16b,v0.16b,v4.16b		//accumulate
+	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]
+
+	eor	v2.16b,v2.16b,v6.16b
+	csel	x12,xzr,x12,eq			//is it time to zero x12?
+	eor	v1.16b,v1.16b,v5.16b
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
+#ifndef __AARCH64EB__
+	rev64	v16.16b,v16.16b
+#endif
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+
+#ifndef __AARCH64EB__
+	rev64	v17.16b,v17.16b
+#endif
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	ext	v7.16b,v17.16b,v17.16b,#8
+	ext	v3.16b,v16.16b,v16.16b,#8
+	eor	v0.16b,v1.16b,v18.16b
+	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
+	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v3.16b,v3.16b,v18.16b
+	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
+	eor	v3.16b,v3.16b,v0.16b
+	pmull2	v6.1q,v20.2d,v7.2d
+	b.hs	Loop_mod2x_v8		//there was at least 32 more bytes
+
+	eor	v2.16b,v2.16b,v18.16b
+	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
+	adds	x3,x3,#32		//re-construct x3
+	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
+	b.eq	Ldone_v8		//is x3 zero?
+Lodd_tail_v8:
+	ext	v18.16b,v0.16b,v0.16b,#8
+	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
+	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi
+
+	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
+	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
+	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
+	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+
+Ldone_v8:
+#ifndef __AARCH64EB__
+	rev64	v0.16b,v0.16b
+#endif
+	ext	v0.16b,v0.16b,v0.16b,#8
+	st1	{v0.2d},[x0]		//write out Xi
+
+	ret
+
+.def gcm_ghash_v8_4x
+   .type 32
+.endef
+.align	4
+gcm_ghash_v8_4x:
+Lgcm_ghash_v8_4x:
+	ld1	{v0.2d},[x0]		//load [rotated] Xi
+	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
+	movi	v19.16b,#0xe1
+	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
+	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
+
+	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
+#ifndef __AARCH64EB__
+	rev64	v0.16b,v0.16b
+	rev64	v5.16b,v5.16b
+	rev64	v6.16b,v6.16b
+	rev64	v7.16b,v7.16b
+	rev64	v4.16b,v4.16b
+#endif
+	ext	v25.16b,v7.16b,v7.16b,#8
+	ext	v24.16b,v6.16b,v6.16b,#8
+	ext	v23.16b,v5.16b,v5.16b,#8
+
+	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
+	eor	v7.16b,v7.16b,v25.16b
+	pmull2	v31.1q,v20.2d,v25.2d
+	pmull	v30.1q,v21.1d,v7.1d
+
+	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
+	eor	v6.16b,v6.16b,v24.16b
+	pmull2	v24.1q,v22.2d,v24.2d
+	pmull2	v6.1q,v21.2d,v6.2d
+
+	eor	v29.16b,v29.16b,v16.16b
+	eor	v31.16b,v31.16b,v24.16b
+	eor	v30.16b,v30.16b,v6.16b
+
+	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+	pmull2	v23.1q,v26.2d,v23.2d
+	pmull	v5.1q,v27.1d,v5.1d
+
+	eor	v29.16b,v29.16b,v7.16b
+	eor	v31.16b,v31.16b,v23.16b
+	eor	v30.16b,v30.16b,v5.16b
+
+	subs	x3,x3,#128
+	b.lo	Ltail4x
+
+	b	Loop4x
+
+.align	4
+Loop4x:
+	eor	v16.16b,v4.16b,v0.16b
+	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
+	ext	v3.16b,v16.16b,v16.16b,#8
+#ifndef __AARCH64EB__
+	rev64	v5.16b,v5.16b
+	rev64	v6.16b,v6.16b
+	rev64	v7.16b,v7.16b
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v28.2d,v3.2d
+	ext	v25.16b,v7.16b,v7.16b,#8
+	pmull2	v1.1q,v27.2d,v16.2d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	ext	v24.16b,v6.16b,v6.16b,#8
+	eor	v1.16b,v1.16b,v30.16b
+	ext	v23.16b,v5.16b,v5.16b,#8
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
+	eor	v7.16b,v7.16b,v25.16b
+	eor	v1.16b,v1.16b,v17.16b
+	pmull2	v31.1q,v20.2d,v25.2d
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v30.1q,v21.1d,v7.1d
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
+	eor	v6.16b,v6.16b,v24.16b
+	pmull2	v24.1q,v22.2d,v24.2d
+	eor	v0.16b,v1.16b,v18.16b
+	pmull2	v6.1q,v21.2d,v6.2d
+
+	eor	v29.16b,v29.16b,v16.16b
+	eor	v31.16b,v31.16b,v24.16b
+	eor	v30.16b,v30.16b,v6.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+	eor	v18.16b,v18.16b,v2.16b
+	pmull2	v23.1q,v26.2d,v23.2d
+	pmull	v5.1q,v27.1d,v5.1d
+
+	eor	v0.16b,v0.16b,v18.16b
+	eor	v29.16b,v29.16b,v7.16b
+	eor	v31.16b,v31.16b,v23.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+	eor	v30.16b,v30.16b,v5.16b
+
+	subs	x3,x3,#64
+	b.hs	Loop4x
+
+Ltail4x:
+	eor	v16.16b,v4.16b,v0.16b
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v28.2d,v3.2d
+	pmull2	v1.1q,v27.2d,v16.2d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	eor	v1.16b,v1.16b,v30.16b
+
+	adds	x3,x3,#64
+	b.eq	Ldone4x
+
+	cmp	x3,#32
+	b.lo	Lone
+	b.eq	Ltwo
+Lthree:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v4.2d,v5.2d,v6.2d},[x2]
+	eor	v1.16b,v1.16b,v18.16b
+#ifndef	__AARCH64EB__
+	rev64	v5.16b,v5.16b
+	rev64	v6.16b,v6.16b
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	ext	v24.16b,v6.16b,v6.16b,#8
+	ext	v23.16b,v5.16b,v5.16b,#8
+	eor	v0.16b,v1.16b,v18.16b
+
+	pmull	v29.1q,v20.1d,v24.1d		//H·Ii+2
+	eor	v6.16b,v6.16b,v24.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	pmull2	v31.1q,v20.2d,v24.2d
+	pmull	v30.1q,v21.1d,v6.1d
+	eor	v0.16b,v0.16b,v18.16b
+	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+	pmull2	v23.1q,v22.2d,v23.2d
+	eor	v16.16b,v4.16b,v0.16b
+	pmull2	v5.1q,v21.2d,v5.2d
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	eor	v29.16b,v29.16b,v7.16b
+	eor	v31.16b,v31.16b,v23.16b
+	eor	v30.16b,v30.16b,v5.16b
+
+	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v26.2d,v3.2d
+	pmull	v1.1q,v27.1d,v16.1d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	eor	v1.16b,v1.16b,v30.16b
+	b	Ldone4x
+
+.align	4
+Ltwo:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v4.2d,v5.2d},[x2]
+	eor	v1.16b,v1.16b,v18.16b
+#ifndef	__AARCH64EB__
+	rev64	v5.16b,v5.16b
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	ext	v23.16b,v5.16b,v5.16b,#8
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+	pmull	v29.1q,v20.1d,v23.1d		//H·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+
+	eor	v16.16b,v4.16b,v0.16b
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	pmull2	v31.1q,v20.2d,v23.2d
+	pmull	v30.1q,v21.1d,v5.1d
+
+	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v22.2d,v3.2d
+	pmull2	v1.1q,v21.2d,v16.2d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	eor	v1.16b,v1.16b,v30.16b
+	b	Ldone4x
+
+.align	4
+Lone:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v4.2d},[x2]
+	eor	v1.16b,v1.16b,v18.16b
+#ifndef	__AARCH64EB__
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+	eor	v16.16b,v4.16b,v0.16b
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	pmull	v0.1q,v20.1d,v3.1d
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v20.2d,v3.2d
+	pmull	v1.1q,v21.1d,v16.1d
+
+Ldone4x:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+#ifndef __AARCH64EB__
+	rev64	v0.16b,v0.16b
+#endif
+	st1	{v0.2d},[x0]		//write out Xi
+
+	ret
+
+.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#endif
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/bcm/md5-586-apple.S b/gen/bcm/md5-586-apple.S
new file mode 100644
index 0000000..986d590
--- /dev/null
+++ b/gen/bcm/md5-586-apple.S
@@ -0,0 +1,684 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
+.text
+.globl	_md5_block_asm_data_order
+.private_extern	_md5_block_asm_data_order
+.align	4
+_md5_block_asm_data_order:
+L_md5_block_asm_data_order_begin:
+	pushl	%esi
+	pushl	%edi
+	movl	12(%esp),%edi
+	movl	16(%esp),%esi
+	movl	20(%esp),%ecx
+	pushl	%ebp
+	shll	$6,%ecx
+	pushl	%ebx
+	addl	%esi,%ecx
+	subl	$64,%ecx
+	movl	(%edi),%eax
+	pushl	%ecx
+	movl	4(%edi),%ebx
+	movl	8(%edi),%ecx
+	movl	12(%edi),%edx
+L000start:
+
+	# R0 section 
+	movl	%ecx,%edi
+	movl	(%esi),%ebp
+	# R0 0 
+	xorl	%edx,%edi
+	andl	%ebx,%edi
+	leal	3614090360(%eax,%ebp,1),%eax
+	xorl	%edx,%edi
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	roll	$7,%eax
+	movl	4(%esi),%ebp
+	addl	%ebx,%eax
+	# R0 1 
+	xorl	%ecx,%edi
+	andl	%eax,%edi
+	leal	3905402710(%edx,%ebp,1),%edx
+	xorl	%ecx,%edi
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$12,%edx
+	movl	8(%esi),%ebp
+	addl	%eax,%edx
+	# R0 2 
+	xorl	%ebx,%edi
+	andl	%edx,%edi
+	leal	606105819(%ecx,%ebp,1),%ecx
+	xorl	%ebx,%edi
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	roll	$17,%ecx
+	movl	12(%esi),%ebp
+	addl	%edx,%ecx
+	# R0 3 
+	xorl	%eax,%edi
+	andl	%ecx,%edi
+	leal	3250441966(%ebx,%ebp,1),%ebx
+	xorl	%eax,%edi
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$22,%ebx
+	movl	16(%esi),%ebp
+	addl	%ecx,%ebx
+	# R0 4 
+	xorl	%edx,%edi
+	andl	%ebx,%edi
+	leal	4118548399(%eax,%ebp,1),%eax
+	xorl	%edx,%edi
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	roll	$7,%eax
+	movl	20(%esi),%ebp
+	addl	%ebx,%eax
+	# R0 5 
+	xorl	%ecx,%edi
+	andl	%eax,%edi
+	leal	1200080426(%edx,%ebp,1),%edx
+	xorl	%ecx,%edi
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$12,%edx
+	movl	24(%esi),%ebp
+	addl	%eax,%edx
+	# R0 6 
+	xorl	%ebx,%edi
+	andl	%edx,%edi
+	leal	2821735955(%ecx,%ebp,1),%ecx
+	xorl	%ebx,%edi
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	roll	$17,%ecx
+	movl	28(%esi),%ebp
+	addl	%edx,%ecx
+	# R0 7 
+	xorl	%eax,%edi
+	andl	%ecx,%edi
+	leal	4249261313(%ebx,%ebp,1),%ebx
+	xorl	%eax,%edi
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$22,%ebx
+	movl	32(%esi),%ebp
+	addl	%ecx,%ebx
+	# R0 8 
+	xorl	%edx,%edi
+	andl	%ebx,%edi
+	leal	1770035416(%eax,%ebp,1),%eax
+	xorl	%edx,%edi
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	roll	$7,%eax
+	movl	36(%esi),%ebp
+	addl	%ebx,%eax
+	# R0 9 
+	xorl	%ecx,%edi
+	andl	%eax,%edi
+	leal	2336552879(%edx,%ebp,1),%edx
+	xorl	%ecx,%edi
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$12,%edx
+	movl	40(%esi),%ebp
+	addl	%eax,%edx
+	# R0 10 
+	xorl	%ebx,%edi
+	andl	%edx,%edi
+	leal	4294925233(%ecx,%ebp,1),%ecx
+	xorl	%ebx,%edi
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	roll	$17,%ecx
+	movl	44(%esi),%ebp
+	addl	%edx,%ecx
+	# R0 11 
+	xorl	%eax,%edi
+	andl	%ecx,%edi
+	leal	2304563134(%ebx,%ebp,1),%ebx
+	xorl	%eax,%edi
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$22,%ebx
+	movl	48(%esi),%ebp
+	addl	%ecx,%ebx
+	# R0 12 
+	xorl	%edx,%edi
+	andl	%ebx,%edi
+	leal	1804603682(%eax,%ebp,1),%eax
+	xorl	%edx,%edi
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	roll	$7,%eax
+	movl	52(%esi),%ebp
+	addl	%ebx,%eax
+	# R0 13 
+	xorl	%ecx,%edi
+	andl	%eax,%edi
+	leal	4254626195(%edx,%ebp,1),%edx
+	xorl	%ecx,%edi
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$12,%edx
+	movl	56(%esi),%ebp
+	addl	%eax,%edx
+	# R0 14 
+	xorl	%ebx,%edi
+	andl	%edx,%edi
+	leal	2792965006(%ecx,%ebp,1),%ecx
+	xorl	%ebx,%edi
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	roll	$17,%ecx
+	movl	60(%esi),%ebp
+	addl	%edx,%ecx
+	# R0 15 
+	xorl	%eax,%edi
+	andl	%ecx,%edi
+	leal	1236535329(%ebx,%ebp,1),%ebx
+	xorl	%eax,%edi
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$22,%ebx
+	movl	4(%esi),%ebp
+	addl	%ecx,%ebx
+
+	# R1 section 
+	# R1 16 
+	leal	4129170786(%eax,%ebp,1),%eax
+	xorl	%ebx,%edi
+	andl	%edx,%edi
+	movl	24(%esi),%ebp
+	xorl	%ecx,%edi
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	roll	$5,%eax
+	addl	%ebx,%eax
+	# R1 17 
+	leal	3225465664(%edx,%ebp,1),%edx
+	xorl	%eax,%edi
+	andl	%ecx,%edi
+	movl	44(%esi),%ebp
+	xorl	%ebx,%edi
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$9,%edx
+	addl	%eax,%edx
+	# R1 18 
+	leal	643717713(%ecx,%ebp,1),%ecx
+	xorl	%edx,%edi
+	andl	%ebx,%edi
+	movl	(%esi),%ebp
+	xorl	%eax,%edi
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	roll	$14,%ecx
+	addl	%edx,%ecx
+	# R1 19 
+	leal	3921069994(%ebx,%ebp,1),%ebx
+	xorl	%ecx,%edi
+	andl	%eax,%edi
+	movl	20(%esi),%ebp
+	xorl	%edx,%edi
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+	# R1 20 
+	leal	3593408605(%eax,%ebp,1),%eax
+	xorl	%ebx,%edi
+	andl	%edx,%edi
+	movl	40(%esi),%ebp
+	xorl	%ecx,%edi
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	roll	$5,%eax
+	addl	%ebx,%eax
+	# R1 21 
+	leal	38016083(%edx,%ebp,1),%edx
+	xorl	%eax,%edi
+	andl	%ecx,%edi
+	movl	60(%esi),%ebp
+	xorl	%ebx,%edi
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$9,%edx
+	addl	%eax,%edx
+	# R1 22 
+	leal	3634488961(%ecx,%ebp,1),%ecx
+	xorl	%edx,%edi
+	andl	%ebx,%edi
+	movl	16(%esi),%ebp
+	xorl	%eax,%edi
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	roll	$14,%ecx
+	addl	%edx,%ecx
+	# R1 23 
+	leal	3889429448(%ebx,%ebp,1),%ebx
+	xorl	%ecx,%edi
+	andl	%eax,%edi
+	movl	36(%esi),%ebp
+	xorl	%edx,%edi
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+	# R1 24 
+	leal	568446438(%eax,%ebp,1),%eax
+	xorl	%ebx,%edi
+	andl	%edx,%edi
+	movl	56(%esi),%ebp
+	xorl	%ecx,%edi
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	roll	$5,%eax
+	addl	%ebx,%eax
+	# R1 25 
+	leal	3275163606(%edx,%ebp,1),%edx
+	xorl	%eax,%edi
+	andl	%ecx,%edi
+	movl	12(%esi),%ebp
+	xorl	%ebx,%edi
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$9,%edx
+	addl	%eax,%edx
+	# R1 26 
+	leal	4107603335(%ecx,%ebp,1),%ecx
+	xorl	%edx,%edi
+	andl	%ebx,%edi
+	movl	32(%esi),%ebp
+	xorl	%eax,%edi
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	roll	$14,%ecx
+	addl	%edx,%ecx
+	# R1 27 
+	leal	1163531501(%ebx,%ebp,1),%ebx
+	xorl	%ecx,%edi
+	andl	%eax,%edi
+	movl	52(%esi),%ebp
+	xorl	%edx,%edi
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+	# R1 28 
+	leal	2850285829(%eax,%ebp,1),%eax
+	xorl	%ebx,%edi
+	andl	%edx,%edi
+	movl	8(%esi),%ebp
+	xorl	%ecx,%edi
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	roll	$5,%eax
+	addl	%ebx,%eax
+	# R1 29 
+	leal	4243563512(%edx,%ebp,1),%edx
+	xorl	%eax,%edi
+	andl	%ecx,%edi
+	movl	28(%esi),%ebp
+	xorl	%ebx,%edi
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$9,%edx
+	addl	%eax,%edx
+	# R1 30 
+	leal	1735328473(%ecx,%ebp,1),%ecx
+	xorl	%edx,%edi
+	andl	%ebx,%edi
+	movl	48(%esi),%ebp
+	xorl	%eax,%edi
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	roll	$14,%ecx
+	addl	%edx,%ecx
+	# R1 31 
+	leal	2368359562(%ebx,%ebp,1),%ebx
+	xorl	%ecx,%edi
+	andl	%eax,%edi
+	movl	20(%esi),%ebp
+	xorl	%edx,%edi
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+
+	# R2 section 
+	# R2 32 
+	xorl	%edx,%edi
+	xorl	%ebx,%edi
+	leal	4294588738(%eax,%ebp,1),%eax
+	addl	%edi,%eax
+	roll	$4,%eax
+	movl	32(%esi),%ebp
+	movl	%ebx,%edi
+	# R2 33 
+	leal	2272392833(%edx,%ebp,1),%edx
+	addl	%ebx,%eax
+	xorl	%ecx,%edi
+	xorl	%eax,%edi
+	movl	44(%esi),%ebp
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$11,%edx
+	addl	%eax,%edx
+	# R2 34 
+	xorl	%ebx,%edi
+	xorl	%edx,%edi
+	leal	1839030562(%ecx,%ebp,1),%ecx
+	addl	%edi,%ecx
+	roll	$16,%ecx
+	movl	56(%esi),%ebp
+	movl	%edx,%edi
+	# R2 35 
+	leal	4259657740(%ebx,%ebp,1),%ebx
+	addl	%edx,%ecx
+	xorl	%eax,%edi
+	xorl	%ecx,%edi
+	movl	4(%esi),%ebp
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$23,%ebx
+	addl	%ecx,%ebx
+	# R2 36 
+	xorl	%edx,%edi
+	xorl	%ebx,%edi
+	leal	2763975236(%eax,%ebp,1),%eax
+	addl	%edi,%eax
+	roll	$4,%eax
+	movl	16(%esi),%ebp
+	movl	%ebx,%edi
+	# R2 37 
+	leal	1272893353(%edx,%ebp,1),%edx
+	addl	%ebx,%eax
+	xorl	%ecx,%edi
+	xorl	%eax,%edi
+	movl	28(%esi),%ebp
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$11,%edx
+	addl	%eax,%edx
+	# R2 38 
+	xorl	%ebx,%edi
+	xorl	%edx,%edi
+	leal	4139469664(%ecx,%ebp,1),%ecx
+	addl	%edi,%ecx
+	roll	$16,%ecx
+	movl	40(%esi),%ebp
+	movl	%edx,%edi
+	# R2 39 
+	leal	3200236656(%ebx,%ebp,1),%ebx
+	addl	%edx,%ecx
+	xorl	%eax,%edi
+	xorl	%ecx,%edi
+	movl	52(%esi),%ebp
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$23,%ebx
+	addl	%ecx,%ebx
+	# R2 40 
+	xorl	%edx,%edi
+	xorl	%ebx,%edi
+	leal	681279174(%eax,%ebp,1),%eax
+	addl	%edi,%eax
+	roll	$4,%eax
+	movl	(%esi),%ebp
+	movl	%ebx,%edi
+	# R2 41 
+	leal	3936430074(%edx,%ebp,1),%edx
+	addl	%ebx,%eax
+	xorl	%ecx,%edi
+	xorl	%eax,%edi
+	movl	12(%esi),%ebp
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$11,%edx
+	addl	%eax,%edx
+	# R2 42 
+	xorl	%ebx,%edi
+	xorl	%edx,%edi
+	leal	3572445317(%ecx,%ebp,1),%ecx
+	addl	%edi,%ecx
+	roll	$16,%ecx
+	movl	24(%esi),%ebp
+	movl	%edx,%edi
+	# R2 43 
+	leal	76029189(%ebx,%ebp,1),%ebx
+	addl	%edx,%ecx
+	xorl	%eax,%edi
+	xorl	%ecx,%edi
+	movl	36(%esi),%ebp
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$23,%ebx
+	addl	%ecx,%ebx
+	# R2 44 
+	xorl	%edx,%edi
+	xorl	%ebx,%edi
+	leal	3654602809(%eax,%ebp,1),%eax
+	addl	%edi,%eax
+	roll	$4,%eax
+	movl	48(%esi),%ebp
+	movl	%ebx,%edi
+	# R2 45 
+	leal	3873151461(%edx,%ebp,1),%edx
+	addl	%ebx,%eax
+	xorl	%ecx,%edi
+	xorl	%eax,%edi
+	movl	60(%esi),%ebp
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$11,%edx
+	addl	%eax,%edx
+	# R2 46 
+	xorl	%ebx,%edi
+	xorl	%edx,%edi
+	leal	530742520(%ecx,%ebp,1),%ecx
+	addl	%edi,%ecx
+	roll	$16,%ecx
+	movl	8(%esi),%ebp
+	movl	%edx,%edi
+	# R2 47 
+	leal	3299628645(%ebx,%ebp,1),%ebx
+	addl	%edx,%ecx
+	xorl	%eax,%edi
+	xorl	%ecx,%edi
+	movl	(%esi),%ebp
+	addl	%edi,%ebx
+	movl	$-1,%edi
+	roll	$23,%ebx
+	addl	%ecx,%ebx
+
+	# R3 section 
+	# R3 48 
+	xorl	%edx,%edi
+	orl	%ebx,%edi
+	leal	4096336452(%eax,%ebp,1),%eax
+	xorl	%ecx,%edi
+	movl	28(%esi),%ebp
+	addl	%edi,%eax
+	movl	$-1,%edi
+	roll	$6,%eax
+	xorl	%ecx,%edi
+	addl	%ebx,%eax
+	# R3 49 
+	orl	%eax,%edi
+	leal	1126891415(%edx,%ebp,1),%edx
+	xorl	%ebx,%edi
+	movl	56(%esi),%ebp
+	addl	%edi,%edx
+	movl	$-1,%edi
+	roll	$10,%edx
+	xorl	%ebx,%edi
+	addl	%eax,%edx
+	# R3 50 
+	orl	%edx,%edi
+	leal	2878612391(%ecx,%ebp,1),%ecx
+	xorl	%eax,%edi
+	movl	20(%esi),%ebp
+	addl	%edi,%ecx
+	movl	$-1,%edi
+	roll	$15,%ecx
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	# R3 51 
+	orl	%ecx,%edi
+	leal	4237533241(%ebx,%ebp,1),%ebx
+	xorl	%edx,%edi
+	movl	48(%esi),%ebp
+	addl	%edi,%ebx
+	movl	$-1,%edi
+	roll	$21,%ebx
+	xorl	%edx,%edi
+	addl	%ecx,%ebx
+	# R3 52 
+	orl	%ebx,%edi
+	leal	1700485571(%eax,%ebp,1),%eax
+	xorl	%ecx,%edi
+	movl	12(%esi),%ebp
+	addl	%edi,%eax
+	movl	$-1,%edi
+	roll	$6,%eax
+	xorl	%ecx,%edi
+	addl	%ebx,%eax
+	# R3 53 
+	orl	%eax,%edi
+	leal	2399980690(%edx,%ebp,1),%edx
+	xorl	%ebx,%edi
+	movl	40(%esi),%ebp
+	addl	%edi,%edx
+	movl	$-1,%edi
+	roll	$10,%edx
+	xorl	%ebx,%edi
+	addl	%eax,%edx
+	# R3 54 
+	orl	%edx,%edi
+	leal	4293915773(%ecx,%ebp,1),%ecx
+	xorl	%eax,%edi
+	movl	4(%esi),%ebp
+	addl	%edi,%ecx
+	movl	$-1,%edi
+	roll	$15,%ecx
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	# R3 55 
+	orl	%ecx,%edi
+	leal	2240044497(%ebx,%ebp,1),%ebx
+	xorl	%edx,%edi
+	movl	32(%esi),%ebp
+	addl	%edi,%ebx
+	movl	$-1,%edi
+	roll	$21,%ebx
+	xorl	%edx,%edi
+	addl	%ecx,%ebx
+	# R3 56 
+	orl	%ebx,%edi
+	leal	1873313359(%eax,%ebp,1),%eax
+	xorl	%ecx,%edi
+	movl	60(%esi),%ebp
+	addl	%edi,%eax
+	movl	$-1,%edi
+	roll	$6,%eax
+	xorl	%ecx,%edi
+	addl	%ebx,%eax
+	# R3 57 
+	orl	%eax,%edi
+	leal	4264355552(%edx,%ebp,1),%edx
+	xorl	%ebx,%edi
+	movl	24(%esi),%ebp
+	addl	%edi,%edx
+	movl	$-1,%edi
+	roll	$10,%edx
+	xorl	%ebx,%edi
+	addl	%eax,%edx
+	# R3 58 
+	orl	%edx,%edi
+	leal	2734768916(%ecx,%ebp,1),%ecx
+	xorl	%eax,%edi
+	movl	52(%esi),%ebp
+	addl	%edi,%ecx
+	movl	$-1,%edi
+	roll	$15,%ecx
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	# R3 59 
+	orl	%ecx,%edi
+	leal	1309151649(%ebx,%ebp,1),%ebx
+	xorl	%edx,%edi
+	movl	16(%esi),%ebp
+	addl	%edi,%ebx
+	movl	$-1,%edi
+	roll	$21,%ebx
+	xorl	%edx,%edi
+	addl	%ecx,%ebx
+	# R3 60 
+	orl	%ebx,%edi
+	leal	4149444226(%eax,%ebp,1),%eax
+	xorl	%ecx,%edi
+	movl	44(%esi),%ebp
+	addl	%edi,%eax
+	movl	$-1,%edi
+	roll	$6,%eax
+	xorl	%ecx,%edi
+	addl	%ebx,%eax
+	# R3 61 
+	orl	%eax,%edi
+	leal	3174756917(%edx,%ebp,1),%edx
+	xorl	%ebx,%edi
+	movl	8(%esi),%ebp
+	addl	%edi,%edx
+	movl	$-1,%edi
+	roll	$10,%edx
+	xorl	%ebx,%edi
+	addl	%eax,%edx
+	# R3 62 
+	orl	%edx,%edi
+	leal	718787259(%ecx,%ebp,1),%ecx
+	xorl	%eax,%edi
+	movl	36(%esi),%ebp
+	addl	%edi,%ecx
+	movl	$-1,%edi
+	roll	$15,%ecx
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	# R3 63 
+	orl	%ecx,%edi
+	leal	3951481745(%ebx,%ebp,1),%ebx
+	xorl	%edx,%edi
+	movl	24(%esp),%ebp
+	addl	%edi,%ebx
+	addl	$64,%esi
+	roll	$21,%ebx
+	movl	(%ebp),%edi
+	addl	%ecx,%ebx
+	addl	%edi,%eax
+	movl	4(%ebp),%edi
+	addl	%edi,%ebx
+	movl	8(%ebp),%edi
+	addl	%edi,%ecx
+	movl	12(%ebp),%edi
+	addl	%edi,%edx
+	movl	%eax,(%ebp)
+	movl	%ebx,4(%ebp)
+	movl	(%esp),%edi
+	movl	%ecx,8(%ebp)
+	movl	%edx,12(%ebp)
+	cmpl	%esi,%edi
+	jae	L000start
+	popl	%eax
+	popl	%ebx
+	popl	%ebp
+	popl	%edi
+	popl	%esi
+	ret
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/bcm/md5-586-linux.S b/gen/bcm/md5-586-linux.S
new file mode 100644
index 0000000..a297f2b
--- /dev/null
+++ b/gen/bcm/md5-586-linux.S
@@ -0,0 +1,686 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
+.text
+.globl	md5_block_asm_data_order
+.hidden	md5_block_asm_data_order
+.type	md5_block_asm_data_order,@function
+.align	16
+md5_block_asm_data_order:
+.L_md5_block_asm_data_order_begin:
+	pushl	%esi
+	pushl	%edi
+	movl	12(%esp),%edi
+	movl	16(%esp),%esi
+	movl	20(%esp),%ecx
+	pushl	%ebp
+	shll	$6,%ecx
+	pushl	%ebx
+	addl	%esi,%ecx
+	subl	$64,%ecx
+	movl	(%edi),%eax
+	pushl	%ecx
+	movl	4(%edi),%ebx
+	movl	8(%edi),%ecx
+	movl	12(%edi),%edx
+.L000start:
+
+
+	movl	%ecx,%edi
+	movl	(%esi),%ebp
+
+	xorl	%edx,%edi
+	andl	%ebx,%edi
+	leal	3614090360(%eax,%ebp,1),%eax
+	xorl	%edx,%edi
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	roll	$7,%eax
+	movl	4(%esi),%ebp
+	addl	%ebx,%eax
+
+	xorl	%ecx,%edi
+	andl	%eax,%edi
+	leal	3905402710(%edx,%ebp,1),%edx
+	xorl	%ecx,%edi
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$12,%edx
+	movl	8(%esi),%ebp
+	addl	%eax,%edx
+
+	xorl	%ebx,%edi
+	andl	%edx,%edi
+	leal	606105819(%ecx,%ebp,1),%ecx
+	xorl	%ebx,%edi
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	roll	$17,%ecx
+	movl	12(%esi),%ebp
+	addl	%edx,%ecx
+
+	xorl	%eax,%edi
+	andl	%ecx,%edi
+	leal	3250441966(%ebx,%ebp,1),%ebx
+	xorl	%eax,%edi
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$22,%ebx
+	movl	16(%esi),%ebp
+	addl	%ecx,%ebx
+
+	xorl	%edx,%edi
+	andl	%ebx,%edi
+	leal	4118548399(%eax,%ebp,1),%eax
+	xorl	%edx,%edi
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	roll	$7,%eax
+	movl	20(%esi),%ebp
+	addl	%ebx,%eax
+
+	xorl	%ecx,%edi
+	andl	%eax,%edi
+	leal	1200080426(%edx,%ebp,1),%edx
+	xorl	%ecx,%edi
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$12,%edx
+	movl	24(%esi),%ebp
+	addl	%eax,%edx
+
+	xorl	%ebx,%edi
+	andl	%edx,%edi
+	leal	2821735955(%ecx,%ebp,1),%ecx
+	xorl	%ebx,%edi
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	roll	$17,%ecx
+	movl	28(%esi),%ebp
+	addl	%edx,%ecx
+
+	xorl	%eax,%edi
+	andl	%ecx,%edi
+	leal	4249261313(%ebx,%ebp,1),%ebx
+	xorl	%eax,%edi
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$22,%ebx
+	movl	32(%esi),%ebp
+	addl	%ecx,%ebx
+
+	xorl	%edx,%edi
+	andl	%ebx,%edi
+	leal	1770035416(%eax,%ebp,1),%eax
+	xorl	%edx,%edi
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	roll	$7,%eax
+	movl	36(%esi),%ebp
+	addl	%ebx,%eax
+
+	xorl	%ecx,%edi
+	andl	%eax,%edi
+	leal	2336552879(%edx,%ebp,1),%edx
+	xorl	%ecx,%edi
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$12,%edx
+	movl	40(%esi),%ebp
+	addl	%eax,%edx
+
+	xorl	%ebx,%edi
+	andl	%edx,%edi
+	leal	4294925233(%ecx,%ebp,1),%ecx
+	xorl	%ebx,%edi
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	roll	$17,%ecx
+	movl	44(%esi),%ebp
+	addl	%edx,%ecx
+
+	xorl	%eax,%edi
+	andl	%ecx,%edi
+	leal	2304563134(%ebx,%ebp,1),%ebx
+	xorl	%eax,%edi
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$22,%ebx
+	movl	48(%esi),%ebp
+	addl	%ecx,%ebx
+
+	xorl	%edx,%edi
+	andl	%ebx,%edi
+	leal	1804603682(%eax,%ebp,1),%eax
+	xorl	%edx,%edi
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	roll	$7,%eax
+	movl	52(%esi),%ebp
+	addl	%ebx,%eax
+
+	xorl	%ecx,%edi
+	andl	%eax,%edi
+	leal	4254626195(%edx,%ebp,1),%edx
+	xorl	%ecx,%edi
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$12,%edx
+	movl	56(%esi),%ebp
+	addl	%eax,%edx
+
+	xorl	%ebx,%edi
+	andl	%edx,%edi
+	leal	2792965006(%ecx,%ebp,1),%ecx
+	xorl	%ebx,%edi
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	roll	$17,%ecx
+	movl	60(%esi),%ebp
+	addl	%edx,%ecx
+
+	xorl	%eax,%edi
+	andl	%ecx,%edi
+	leal	1236535329(%ebx,%ebp,1),%ebx
+	xorl	%eax,%edi
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$22,%ebx
+	movl	4(%esi),%ebp
+	addl	%ecx,%ebx
+
+
+
+	leal	4129170786(%eax,%ebp,1),%eax
+	xorl	%ebx,%edi
+	andl	%edx,%edi
+	movl	24(%esi),%ebp
+	xorl	%ecx,%edi
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	roll	$5,%eax
+	addl	%ebx,%eax
+
+	leal	3225465664(%edx,%ebp,1),%edx
+	xorl	%eax,%edi
+	andl	%ecx,%edi
+	movl	44(%esi),%ebp
+	xorl	%ebx,%edi
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$9,%edx
+	addl	%eax,%edx
+
+	leal	643717713(%ecx,%ebp,1),%ecx
+	xorl	%edx,%edi
+	andl	%ebx,%edi
+	movl	(%esi),%ebp
+	xorl	%eax,%edi
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	roll	$14,%ecx
+	addl	%edx,%ecx
+
+	leal	3921069994(%ebx,%ebp,1),%ebx
+	xorl	%ecx,%edi
+	andl	%eax,%edi
+	movl	20(%esi),%ebp
+	xorl	%edx,%edi
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+
+	leal	3593408605(%eax,%ebp,1),%eax
+	xorl	%ebx,%edi
+	andl	%edx,%edi
+	movl	40(%esi),%ebp
+	xorl	%ecx,%edi
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	roll	$5,%eax
+	addl	%ebx,%eax
+
+	leal	38016083(%edx,%ebp,1),%edx
+	xorl	%eax,%edi
+	andl	%ecx,%edi
+	movl	60(%esi),%ebp
+	xorl	%ebx,%edi
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$9,%edx
+	addl	%eax,%edx
+
+	leal	3634488961(%ecx,%ebp,1),%ecx
+	xorl	%edx,%edi
+	andl	%ebx,%edi
+	movl	16(%esi),%ebp
+	xorl	%eax,%edi
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	roll	$14,%ecx
+	addl	%edx,%ecx
+
+	leal	3889429448(%ebx,%ebp,1),%ebx
+	xorl	%ecx,%edi
+	andl	%eax,%edi
+	movl	36(%esi),%ebp
+	xorl	%edx,%edi
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+
+	leal	568446438(%eax,%ebp,1),%eax
+	xorl	%ebx,%edi
+	andl	%edx,%edi
+	movl	56(%esi),%ebp
+	xorl	%ecx,%edi
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	roll	$5,%eax
+	addl	%ebx,%eax
+
+	leal	3275163606(%edx,%ebp,1),%edx
+	xorl	%eax,%edi
+	andl	%ecx,%edi
+	movl	12(%esi),%ebp
+	xorl	%ebx,%edi
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$9,%edx
+	addl	%eax,%edx
+
+	leal	4107603335(%ecx,%ebp,1),%ecx
+	xorl	%edx,%edi
+	andl	%ebx,%edi
+	movl	32(%esi),%ebp
+	xorl	%eax,%edi
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	roll	$14,%ecx
+	addl	%edx,%ecx
+
+	leal	1163531501(%ebx,%ebp,1),%ebx
+	xorl	%ecx,%edi
+	andl	%eax,%edi
+	movl	52(%esi),%ebp
+	xorl	%edx,%edi
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+
+	leal	2850285829(%eax,%ebp,1),%eax
+	xorl	%ebx,%edi
+	andl	%edx,%edi
+	movl	8(%esi),%ebp
+	xorl	%ecx,%edi
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	roll	$5,%eax
+	addl	%ebx,%eax
+
+	leal	4243563512(%edx,%ebp,1),%edx
+	xorl	%eax,%edi
+	andl	%ecx,%edi
+	movl	28(%esi),%ebp
+	xorl	%ebx,%edi
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$9,%edx
+	addl	%eax,%edx
+
+	leal	1735328473(%ecx,%ebp,1),%ecx
+	xorl	%edx,%edi
+	andl	%ebx,%edi
+	movl	48(%esi),%ebp
+	xorl	%eax,%edi
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	roll	$14,%ecx
+	addl	%edx,%ecx
+
+	leal	2368359562(%ebx,%ebp,1),%ebx
+	xorl	%ecx,%edi
+	andl	%eax,%edi
+	movl	20(%esi),%ebp
+	xorl	%edx,%edi
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+
+
+
+	xorl	%edx,%edi
+	xorl	%ebx,%edi
+	leal	4294588738(%eax,%ebp,1),%eax
+	addl	%edi,%eax
+	roll	$4,%eax
+	movl	32(%esi),%ebp
+	movl	%ebx,%edi
+
+	leal	2272392833(%edx,%ebp,1),%edx
+	addl	%ebx,%eax
+	xorl	%ecx,%edi
+	xorl	%eax,%edi
+	movl	44(%esi),%ebp
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$11,%edx
+	addl	%eax,%edx
+
+	xorl	%ebx,%edi
+	xorl	%edx,%edi
+	leal	1839030562(%ecx,%ebp,1),%ecx
+	addl	%edi,%ecx
+	roll	$16,%ecx
+	movl	56(%esi),%ebp
+	movl	%edx,%edi
+
+	leal	4259657740(%ebx,%ebp,1),%ebx
+	addl	%edx,%ecx
+	xorl	%eax,%edi
+	xorl	%ecx,%edi
+	movl	4(%esi),%ebp
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$23,%ebx
+	addl	%ecx,%ebx
+
+	xorl	%edx,%edi
+	xorl	%ebx,%edi
+	leal	2763975236(%eax,%ebp,1),%eax
+	addl	%edi,%eax
+	roll	$4,%eax
+	movl	16(%esi),%ebp
+	movl	%ebx,%edi
+
+	leal	1272893353(%edx,%ebp,1),%edx
+	addl	%ebx,%eax
+	xorl	%ecx,%edi
+	xorl	%eax,%edi
+	movl	28(%esi),%ebp
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$11,%edx
+	addl	%eax,%edx
+
+	xorl	%ebx,%edi
+	xorl	%edx,%edi
+	leal	4139469664(%ecx,%ebp,1),%ecx
+	addl	%edi,%ecx
+	roll	$16,%ecx
+	movl	40(%esi),%ebp
+	movl	%edx,%edi
+
+	leal	3200236656(%ebx,%ebp,1),%ebx
+	addl	%edx,%ecx
+	xorl	%eax,%edi
+	xorl	%ecx,%edi
+	movl	52(%esi),%ebp
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$23,%ebx
+	addl	%ecx,%ebx
+
+	xorl	%edx,%edi
+	xorl	%ebx,%edi
+	leal	681279174(%eax,%ebp,1),%eax
+	addl	%edi,%eax
+	roll	$4,%eax
+	movl	(%esi),%ebp
+	movl	%ebx,%edi
+
+	leal	3936430074(%edx,%ebp,1),%edx
+	addl	%ebx,%eax
+	xorl	%ecx,%edi
+	xorl	%eax,%edi
+	movl	12(%esi),%ebp
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$11,%edx
+	addl	%eax,%edx
+
+	xorl	%ebx,%edi
+	xorl	%edx,%edi
+	leal	3572445317(%ecx,%ebp,1),%ecx
+	addl	%edi,%ecx
+	roll	$16,%ecx
+	movl	24(%esi),%ebp
+	movl	%edx,%edi
+
+	leal	76029189(%ebx,%ebp,1),%ebx
+	addl	%edx,%ecx
+	xorl	%eax,%edi
+	xorl	%ecx,%edi
+	movl	36(%esi),%ebp
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$23,%ebx
+	addl	%ecx,%ebx
+
+	xorl	%edx,%edi
+	xorl	%ebx,%edi
+	leal	3654602809(%eax,%ebp,1),%eax
+	addl	%edi,%eax
+	roll	$4,%eax
+	movl	48(%esi),%ebp
+	movl	%ebx,%edi
+
+	leal	3873151461(%edx,%ebp,1),%edx
+	addl	%ebx,%eax
+	xorl	%ecx,%edi
+	xorl	%eax,%edi
+	movl	60(%esi),%ebp
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$11,%edx
+	addl	%eax,%edx
+
+	xorl	%ebx,%edi
+	xorl	%edx,%edi
+	leal	530742520(%ecx,%ebp,1),%ecx
+	addl	%edi,%ecx
+	roll	$16,%ecx
+	movl	8(%esi),%ebp
+	movl	%edx,%edi
+
+	leal	3299628645(%ebx,%ebp,1),%ebx
+	addl	%edx,%ecx
+	xorl	%eax,%edi
+	xorl	%ecx,%edi
+	movl	(%esi),%ebp
+	addl	%edi,%ebx
+	movl	$-1,%edi
+	roll	$23,%ebx
+	addl	%ecx,%ebx
+
+
+
+	xorl	%edx,%edi
+	orl	%ebx,%edi
+	leal	4096336452(%eax,%ebp,1),%eax
+	xorl	%ecx,%edi
+	movl	28(%esi),%ebp
+	addl	%edi,%eax
+	movl	$-1,%edi
+	roll	$6,%eax
+	xorl	%ecx,%edi
+	addl	%ebx,%eax
+
+	orl	%eax,%edi
+	leal	1126891415(%edx,%ebp,1),%edx
+	xorl	%ebx,%edi
+	movl	56(%esi),%ebp
+	addl	%edi,%edx
+	movl	$-1,%edi
+	roll	$10,%edx
+	xorl	%ebx,%edi
+	addl	%eax,%edx
+
+	orl	%edx,%edi
+	leal	2878612391(%ecx,%ebp,1),%ecx
+	xorl	%eax,%edi
+	movl	20(%esi),%ebp
+	addl	%edi,%ecx
+	movl	$-1,%edi
+	roll	$15,%ecx
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+
+	orl	%ecx,%edi
+	leal	4237533241(%ebx,%ebp,1),%ebx
+	xorl	%edx,%edi
+	movl	48(%esi),%ebp
+	addl	%edi,%ebx
+	movl	$-1,%edi
+	roll	$21,%ebx
+	xorl	%edx,%edi
+	addl	%ecx,%ebx
+
+	orl	%ebx,%edi
+	leal	1700485571(%eax,%ebp,1),%eax
+	xorl	%ecx,%edi
+	movl	12(%esi),%ebp
+	addl	%edi,%eax
+	movl	$-1,%edi
+	roll	$6,%eax
+	xorl	%ecx,%edi
+	addl	%ebx,%eax
+
+	orl	%eax,%edi
+	leal	2399980690(%edx,%ebp,1),%edx
+	xorl	%ebx,%edi
+	movl	40(%esi),%ebp
+	addl	%edi,%edx
+	movl	$-1,%edi
+	roll	$10,%edx
+	xorl	%ebx,%edi
+	addl	%eax,%edx
+
+	orl	%edx,%edi
+	leal	4293915773(%ecx,%ebp,1),%ecx
+	xorl	%eax,%edi
+	movl	4(%esi),%ebp
+	addl	%edi,%ecx
+	movl	$-1,%edi
+	roll	$15,%ecx
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+
+	orl	%ecx,%edi
+	leal	2240044497(%ebx,%ebp,1),%ebx
+	xorl	%edx,%edi
+	movl	32(%esi),%ebp
+	addl	%edi,%ebx
+	movl	$-1,%edi
+	roll	$21,%ebx
+	xorl	%edx,%edi
+	addl	%ecx,%ebx
+
+	orl	%ebx,%edi
+	leal	1873313359(%eax,%ebp,1),%eax
+	xorl	%ecx,%edi
+	movl	60(%esi),%ebp
+	addl	%edi,%eax
+	movl	$-1,%edi
+	roll	$6,%eax
+	xorl	%ecx,%edi
+	addl	%ebx,%eax
+
+	orl	%eax,%edi
+	leal	4264355552(%edx,%ebp,1),%edx
+	xorl	%ebx,%edi
+	movl	24(%esi),%ebp
+	addl	%edi,%edx
+	movl	$-1,%edi
+	roll	$10,%edx
+	xorl	%ebx,%edi
+	addl	%eax,%edx
+
+	orl	%edx,%edi
+	leal	2734768916(%ecx,%ebp,1),%ecx
+	xorl	%eax,%edi
+	movl	52(%esi),%ebp
+	addl	%edi,%ecx
+	movl	$-1,%edi
+	roll	$15,%ecx
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+
+	orl	%ecx,%edi
+	leal	1309151649(%ebx,%ebp,1),%ebx
+	xorl	%edx,%edi
+	movl	16(%esi),%ebp
+	addl	%edi,%ebx
+	movl	$-1,%edi
+	roll	$21,%ebx
+	xorl	%edx,%edi
+	addl	%ecx,%ebx
+
+	orl	%ebx,%edi
+	leal	4149444226(%eax,%ebp,1),%eax
+	xorl	%ecx,%edi
+	movl	44(%esi),%ebp
+	addl	%edi,%eax
+	movl	$-1,%edi
+	roll	$6,%eax
+	xorl	%ecx,%edi
+	addl	%ebx,%eax
+
+	orl	%eax,%edi
+	leal	3174756917(%edx,%ebp,1),%edx
+	xorl	%ebx,%edi
+	movl	8(%esi),%ebp
+	addl	%edi,%edx
+	movl	$-1,%edi
+	roll	$10,%edx
+	xorl	%ebx,%edi
+	addl	%eax,%edx
+
+	orl	%edx,%edi
+	leal	718787259(%ecx,%ebp,1),%ecx
+	xorl	%eax,%edi
+	movl	36(%esi),%ebp
+	addl	%edi,%ecx
+	movl	$-1,%edi
+	roll	$15,%ecx
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+
+	orl	%ecx,%edi
+	leal	3951481745(%ebx,%ebp,1),%ebx
+	xorl	%edx,%edi
+	movl	24(%esp),%ebp
+	addl	%edi,%ebx
+	addl	$64,%esi
+	roll	$21,%ebx
+	movl	(%ebp),%edi
+	addl	%ecx,%ebx
+	addl	%edi,%eax
+	movl	4(%ebp),%edi
+	addl	%edi,%ebx
+	movl	8(%ebp),%edi
+	addl	%edi,%ecx
+	movl	12(%ebp),%edi
+	addl	%edi,%edx
+	movl	%eax,(%ebp)
+	movl	%ebx,4(%ebp)
+	movl	(%esp),%edi
+	movl	%ecx,8(%ebp)
+	movl	%edx,12(%ebp)
+	cmpl	%esi,%edi
+	jae	.L000start
+	popl	%eax
+	popl	%ebx
+	popl	%ebp
+	popl	%edi
+	popl	%esi
+	ret
+.size	md5_block_asm_data_order,.-.L_md5_block_asm_data_order_begin
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
diff --git a/gen/bcm/md5-586-win.asm b/gen/bcm/md5-586-win.asm
new file mode 100644
index 0000000..25592b8
--- /dev/null
+++ b/gen/bcm/md5-586-win.asm
@@ -0,0 +1,694 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__, win32
+%ifidn __OUTPUT_FORMAT__,obj
+section	code	use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+$@feat.00 equ 1
+section	.text	code align=64
+%else
+section	.text	code
+%endif
+global	_md5_block_asm_data_order
+align	16
+_md5_block_asm_data_order:
+L$_md5_block_asm_data_order_begin:
+	push	esi
+	push	edi
+	mov	edi,DWORD [12+esp]
+	mov	esi,DWORD [16+esp]
+	mov	ecx,DWORD [20+esp]
+	push	ebp
+	shl	ecx,6
+	push	ebx
+	add	ecx,esi
+	sub	ecx,64
+	mov	eax,DWORD [edi]
+	push	ecx
+	mov	ebx,DWORD [4+edi]
+	mov	ecx,DWORD [8+edi]
+	mov	edx,DWORD [12+edi]
+L$000start:
+	; 
+	; R0 section
+	mov	edi,ecx
+	mov	ebp,DWORD [esi]
+	; R0 0
+	xor	edi,edx
+	and	edi,ebx
+	lea	eax,[3614090360+ebp*1+eax]
+	xor	edi,edx
+	add	eax,edi
+	mov	edi,ebx
+	rol	eax,7
+	mov	ebp,DWORD [4+esi]
+	add	eax,ebx
+	; R0 1
+	xor	edi,ecx
+	and	edi,eax
+	lea	edx,[3905402710+ebp*1+edx]
+	xor	edi,ecx
+	add	edx,edi
+	mov	edi,eax
+	rol	edx,12
+	mov	ebp,DWORD [8+esi]
+	add	edx,eax
+	; R0 2
+	xor	edi,ebx
+	and	edi,edx
+	lea	ecx,[606105819+ebp*1+ecx]
+	xor	edi,ebx
+	add	ecx,edi
+	mov	edi,edx
+	rol	ecx,17
+	mov	ebp,DWORD [12+esi]
+	add	ecx,edx
+	; R0 3
+	xor	edi,eax
+	and	edi,ecx
+	lea	ebx,[3250441966+ebp*1+ebx]
+	xor	edi,eax
+	add	ebx,edi
+	mov	edi,ecx
+	rol	ebx,22
+	mov	ebp,DWORD [16+esi]
+	add	ebx,ecx
+	; R0 4
+	xor	edi,edx
+	and	edi,ebx
+	lea	eax,[4118548399+ebp*1+eax]
+	xor	edi,edx
+	add	eax,edi
+	mov	edi,ebx
+	rol	eax,7
+	mov	ebp,DWORD [20+esi]
+	add	eax,ebx
+	; R0 5
+	xor	edi,ecx
+	and	edi,eax
+	lea	edx,[1200080426+ebp*1+edx]
+	xor	edi,ecx
+	add	edx,edi
+	mov	edi,eax
+	rol	edx,12
+	mov	ebp,DWORD [24+esi]
+	add	edx,eax
+	; R0 6
+	xor	edi,ebx
+	and	edi,edx
+	lea	ecx,[2821735955+ebp*1+ecx]
+	xor	edi,ebx
+	add	ecx,edi
+	mov	edi,edx
+	rol	ecx,17
+	mov	ebp,DWORD [28+esi]
+	add	ecx,edx
+	; R0 7
+	xor	edi,eax
+	and	edi,ecx
+	lea	ebx,[4249261313+ebp*1+ebx]
+	xor	edi,eax
+	add	ebx,edi
+	mov	edi,ecx
+	rol	ebx,22
+	mov	ebp,DWORD [32+esi]
+	add	ebx,ecx
+	; R0 8
+	xor	edi,edx
+	and	edi,ebx
+	lea	eax,[1770035416+ebp*1+eax]
+	xor	edi,edx
+	add	eax,edi
+	mov	edi,ebx
+	rol	eax,7
+	mov	ebp,DWORD [36+esi]
+	add	eax,ebx
+	; R0 9
+	xor	edi,ecx
+	and	edi,eax
+	lea	edx,[2336552879+ebp*1+edx]
+	xor	edi,ecx
+	add	edx,edi
+	mov	edi,eax
+	rol	edx,12
+	mov	ebp,DWORD [40+esi]
+	add	edx,eax
+	; R0 10
+	xor	edi,ebx
+	and	edi,edx
+	lea	ecx,[4294925233+ebp*1+ecx]
+	xor	edi,ebx
+	add	ecx,edi
+	mov	edi,edx
+	rol	ecx,17
+	mov	ebp,DWORD [44+esi]
+	add	ecx,edx
+	; R0 11
+	xor	edi,eax
+	and	edi,ecx
+	lea	ebx,[2304563134+ebp*1+ebx]
+	xor	edi,eax
+	add	ebx,edi
+	mov	edi,ecx
+	rol	ebx,22
+	mov	ebp,DWORD [48+esi]
+	add	ebx,ecx
+	; R0 12
+	xor	edi,edx
+	and	edi,ebx
+	lea	eax,[1804603682+ebp*1+eax]
+	xor	edi,edx
+	add	eax,edi
+	mov	edi,ebx
+	rol	eax,7
+	mov	ebp,DWORD [52+esi]
+	add	eax,ebx
+	; R0 13
+	xor	edi,ecx
+	and	edi,eax
+	lea	edx,[4254626195+ebp*1+edx]
+	xor	edi,ecx
+	add	edx,edi
+	mov	edi,eax
+	rol	edx,12
+	mov	ebp,DWORD [56+esi]
+	add	edx,eax
+	; R0 14
+	xor	edi,ebx
+	and	edi,edx
+	lea	ecx,[2792965006+ebp*1+ecx]
+	xor	edi,ebx
+	add	ecx,edi
+	mov	edi,edx
+	rol	ecx,17
+	mov	ebp,DWORD [60+esi]
+	add	ecx,edx
+	; R0 15
+	xor	edi,eax
+	and	edi,ecx
+	lea	ebx,[1236535329+ebp*1+ebx]
+	xor	edi,eax
+	add	ebx,edi
+	mov	edi,ecx
+	rol	ebx,22
+	mov	ebp,DWORD [4+esi]
+	add	ebx,ecx
+	; 
+	; R1 section
+	; R1 16
+	lea	eax,[4129170786+ebp*1+eax]
+	xor	edi,ebx
+	and	edi,edx
+	mov	ebp,DWORD [24+esi]
+	xor	edi,ecx
+	add	eax,edi
+	mov	edi,ebx
+	rol	eax,5
+	add	eax,ebx
+	; R1 17
+	lea	edx,[3225465664+ebp*1+edx]
+	xor	edi,eax
+	and	edi,ecx
+	mov	ebp,DWORD [44+esi]
+	xor	edi,ebx
+	add	edx,edi
+	mov	edi,eax
+	rol	edx,9
+	add	edx,eax
+	; R1 18
+	lea	ecx,[643717713+ebp*1+ecx]
+	xor	edi,edx
+	and	edi,ebx
+	mov	ebp,DWORD [esi]
+	xor	edi,eax
+	add	ecx,edi
+	mov	edi,edx
+	rol	ecx,14
+	add	ecx,edx
+	; R1 19
+	lea	ebx,[3921069994+ebp*1+ebx]
+	xor	edi,ecx
+	and	edi,eax
+	mov	ebp,DWORD [20+esi]
+	xor	edi,edx
+	add	ebx,edi
+	mov	edi,ecx
+	rol	ebx,20
+	add	ebx,ecx
+	; R1 20
+	lea	eax,[3593408605+ebp*1+eax]
+	xor	edi,ebx
+	and	edi,edx
+	mov	ebp,DWORD [40+esi]
+	xor	edi,ecx
+	add	eax,edi
+	mov	edi,ebx
+	rol	eax,5
+	add	eax,ebx
+	; R1 21
+	lea	edx,[38016083+ebp*1+edx]
+	xor	edi,eax
+	and	edi,ecx
+	mov	ebp,DWORD [60+esi]
+	xor	edi,ebx
+	add	edx,edi
+	mov	edi,eax
+	rol	edx,9
+	add	edx,eax
+	; R1 22
+	lea	ecx,[3634488961+ebp*1+ecx]
+	xor	edi,edx
+	and	edi,ebx
+	mov	ebp,DWORD [16+esi]
+	xor	edi,eax
+	add	ecx,edi
+	mov	edi,edx
+	rol	ecx,14
+	add	ecx,edx
+	; R1 23
+	lea	ebx,[3889429448+ebp*1+ebx]
+	xor	edi,ecx
+	and	edi,eax
+	mov	ebp,DWORD [36+esi]
+	xor	edi,edx
+	add	ebx,edi
+	mov	edi,ecx
+	rol	ebx,20
+	add	ebx,ecx
+	; R1 24
+	lea	eax,[568446438+ebp*1+eax]
+	xor	edi,ebx
+	and	edi,edx
+	mov	ebp,DWORD [56+esi]
+	xor	edi,ecx
+	add	eax,edi
+	mov	edi,ebx
+	rol	eax,5
+	add	eax,ebx
+	; R1 25
+	lea	edx,[3275163606+ebp*1+edx]
+	xor	edi,eax
+	and	edi,ecx
+	mov	ebp,DWORD [12+esi]
+	xor	edi,ebx
+	add	edx,edi
+	mov	edi,eax
+	rol	edx,9
+	add	edx,eax
+	; R1 26
+	lea	ecx,[4107603335+ebp*1+ecx]
+	xor	edi,edx
+	and	edi,ebx
+	mov	ebp,DWORD [32+esi]
+	xor	edi,eax
+	add	ecx,edi
+	mov	edi,edx
+	rol	ecx,14
+	add	ecx,edx
+	; R1 27
+	lea	ebx,[1163531501+ebp*1+ebx]
+	xor	edi,ecx
+	and	edi,eax
+	mov	ebp,DWORD [52+esi]
+	xor	edi,edx
+	add	ebx,edi
+	mov	edi,ecx
+	rol	ebx,20
+	add	ebx,ecx
+	; R1 28
+	lea	eax,[2850285829+ebp*1+eax]
+	xor	edi,ebx
+	and	edi,edx
+	mov	ebp,DWORD [8+esi]
+	xor	edi,ecx
+	add	eax,edi
+	mov	edi,ebx
+	rol	eax,5
+	add	eax,ebx
+	; R1 29
+	lea	edx,[4243563512+ebp*1+edx]
+	xor	edi,eax
+	and	edi,ecx
+	mov	ebp,DWORD [28+esi]
+	xor	edi,ebx
+	add	edx,edi
+	mov	edi,eax
+	rol	edx,9
+	add	edx,eax
+	; R1 30
+	lea	ecx,[1735328473+ebp*1+ecx]
+	xor	edi,edx
+	and	edi,ebx
+	mov	ebp,DWORD [48+esi]
+	xor	edi,eax
+	add	ecx,edi
+	mov	edi,edx
+	rol	ecx,14
+	add	ecx,edx
+	; R1 31
+	lea	ebx,[2368359562+ebp*1+ebx]
+	xor	edi,ecx
+	and	edi,eax
+	mov	ebp,DWORD [20+esi]
+	xor	edi,edx
+	add	ebx,edi
+	mov	edi,ecx
+	rol	ebx,20
+	add	ebx,ecx
+	; 
+	; R2 section
+	; R2 32
+	xor	edi,edx
+	xor	edi,ebx
+	lea	eax,[4294588738+ebp*1+eax]
+	add	eax,edi
+	rol	eax,4
+	mov	ebp,DWORD [32+esi]
+	mov	edi,ebx
+	; R2 33
+	lea	edx,[2272392833+ebp*1+edx]
+	add	eax,ebx
+	xor	edi,ecx
+	xor	edi,eax
+	mov	ebp,DWORD [44+esi]
+	add	edx,edi
+	mov	edi,eax
+	rol	edx,11
+	add	edx,eax
+	; R2 34
+	xor	edi,ebx
+	xor	edi,edx
+	lea	ecx,[1839030562+ebp*1+ecx]
+	add	ecx,edi
+	rol	ecx,16
+	mov	ebp,DWORD [56+esi]
+	mov	edi,edx
+	; R2 35
+	lea	ebx,[4259657740+ebp*1+ebx]
+	add	ecx,edx
+	xor	edi,eax
+	xor	edi,ecx
+	mov	ebp,DWORD [4+esi]
+	add	ebx,edi
+	mov	edi,ecx
+	rol	ebx,23
+	add	ebx,ecx
+	; R2 36
+	xor	edi,edx
+	xor	edi,ebx
+	lea	eax,[2763975236+ebp*1+eax]
+	add	eax,edi
+	rol	eax,4
+	mov	ebp,DWORD [16+esi]
+	mov	edi,ebx
+	; R2 37
+	lea	edx,[1272893353+ebp*1+edx]
+	add	eax,ebx
+	xor	edi,ecx
+	xor	edi,eax
+	mov	ebp,DWORD [28+esi]
+	add	edx,edi
+	mov	edi,eax
+	rol	edx,11
+	add	edx,eax
+	; R2 38
+	xor	edi,ebx
+	xor	edi,edx
+	lea	ecx,[4139469664+ebp*1+ecx]
+	add	ecx,edi
+	rol	ecx,16
+	mov	ebp,DWORD [40+esi]
+	mov	edi,edx
+	; R2 39
+	lea	ebx,[3200236656+ebp*1+ebx]
+	add	ecx,edx
+	xor	edi,eax
+	xor	edi,ecx
+	mov	ebp,DWORD [52+esi]
+	add	ebx,edi
+	mov	edi,ecx
+	rol	ebx,23
+	add	ebx,ecx
+	; R2 40
+	xor	edi,edx
+	xor	edi,ebx
+	lea	eax,[681279174+ebp*1+eax]
+	add	eax,edi
+	rol	eax,4
+	mov	ebp,DWORD [esi]
+	mov	edi,ebx
+	; R2 41
+	lea	edx,[3936430074+ebp*1+edx]
+	add	eax,ebx
+	xor	edi,ecx
+	xor	edi,eax
+	mov	ebp,DWORD [12+esi]
+	add	edx,edi
+	mov	edi,eax
+	rol	edx,11
+	add	edx,eax
+	; R2 42
+	xor	edi,ebx
+	xor	edi,edx
+	lea	ecx,[3572445317+ebp*1+ecx]
+	add	ecx,edi
+	rol	ecx,16
+	mov	ebp,DWORD [24+esi]
+	mov	edi,edx
+	; R2 43
+	lea	ebx,[76029189+ebp*1+ebx]
+	add	ecx,edx
+	xor	edi,eax
+	xor	edi,ecx
+	mov	ebp,DWORD [36+esi]
+	add	ebx,edi
+	mov	edi,ecx
+	rol	ebx,23
+	add	ebx,ecx
+	; R2 44
+	xor	edi,edx
+	xor	edi,ebx
+	lea	eax,[3654602809+ebp*1+eax]
+	add	eax,edi
+	rol	eax,4
+	mov	ebp,DWORD [48+esi]
+	mov	edi,ebx
+	; R2 45
+	lea	edx,[3873151461+ebp*1+edx]
+	add	eax,ebx
+	xor	edi,ecx
+	xor	edi,eax
+	mov	ebp,DWORD [60+esi]
+	add	edx,edi
+	mov	edi,eax
+	rol	edx,11
+	add	edx,eax
+	; R2 46
+	xor	edi,ebx
+	xor	edi,edx
+	lea	ecx,[530742520+ebp*1+ecx]
+	add	ecx,edi
+	rol	ecx,16
+	mov	ebp,DWORD [8+esi]
+	mov	edi,edx
+	; R2 47
+	lea	ebx,[3299628645+ebp*1+ebx]
+	add	ecx,edx
+	xor	edi,eax
+	xor	edi,ecx
+	mov	ebp,DWORD [esi]
+	add	ebx,edi
+	mov	edi,-1
+	rol	ebx,23
+	add	ebx,ecx
+	; 
+	; R3 section
+	; R3 48
+	xor	edi,edx
+	or	edi,ebx
+	lea	eax,[4096336452+ebp*1+eax]
+	xor	edi,ecx
+	mov	ebp,DWORD [28+esi]
+	add	eax,edi
+	mov	edi,-1
+	rol	eax,6
+	xor	edi,ecx
+	add	eax,ebx
+	; R3 49
+	or	edi,eax
+	lea	edx,[1126891415+ebp*1+edx]
+	xor	edi,ebx
+	mov	ebp,DWORD [56+esi]
+	add	edx,edi
+	mov	edi,-1
+	rol	edx,10
+	xor	edi,ebx
+	add	edx,eax
+	; R3 50
+	or	edi,edx
+	lea	ecx,[2878612391+ebp*1+ecx]
+	xor	edi,eax
+	mov	ebp,DWORD [20+esi]
+	add	ecx,edi
+	mov	edi,-1
+	rol	ecx,15
+	xor	edi,eax
+	add	ecx,edx
+	; R3 51
+	or	edi,ecx
+	lea	ebx,[4237533241+ebp*1+ebx]
+	xor	edi,edx
+	mov	ebp,DWORD [48+esi]
+	add	ebx,edi
+	mov	edi,-1
+	rol	ebx,21
+	xor	edi,edx
+	add	ebx,ecx
+	; R3 52
+	or	edi,ebx
+	lea	eax,[1700485571+ebp*1+eax]
+	xor	edi,ecx
+	mov	ebp,DWORD [12+esi]
+	add	eax,edi
+	mov	edi,-1
+	rol	eax,6
+	xor	edi,ecx
+	add	eax,ebx
+	; R3 53
+	or	edi,eax
+	lea	edx,[2399980690+ebp*1+edx]
+	xor	edi,ebx
+	mov	ebp,DWORD [40+esi]
+	add	edx,edi
+	mov	edi,-1
+	rol	edx,10
+	xor	edi,ebx
+	add	edx,eax
+	; R3 54
+	or	edi,edx
+	lea	ecx,[4293915773+ebp*1+ecx]
+	xor	edi,eax
+	mov	ebp,DWORD [4+esi]
+	add	ecx,edi
+	mov	edi,-1
+	rol	ecx,15
+	xor	edi,eax
+	add	ecx,edx
+	; R3 55
+	or	edi,ecx
+	lea	ebx,[2240044497+ebp*1+ebx]
+	xor	edi,edx
+	mov	ebp,DWORD [32+esi]
+	add	ebx,edi
+	mov	edi,-1
+	rol	ebx,21
+	xor	edi,edx
+	add	ebx,ecx
+	; R3 56
+	or	edi,ebx
+	lea	eax,[1873313359+ebp*1+eax]
+	xor	edi,ecx
+	mov	ebp,DWORD [60+esi]
+	add	eax,edi
+	mov	edi,-1
+	rol	eax,6
+	xor	edi,ecx
+	add	eax,ebx
+	; R3 57
+	or	edi,eax
+	lea	edx,[4264355552+ebp*1+edx]
+	xor	edi,ebx
+	mov	ebp,DWORD [24+esi]
+	add	edx,edi
+	mov	edi,-1
+	rol	edx,10
+	xor	edi,ebx
+	add	edx,eax
+	; R3 58
+	or	edi,edx
+	lea	ecx,[2734768916+ebp*1+ecx]
+	xor	edi,eax
+	mov	ebp,DWORD [52+esi]
+	add	ecx,edi
+	mov	edi,-1
+	rol	ecx,15
+	xor	edi,eax
+	add	ecx,edx
+	; R3 59
+	or	edi,ecx
+	lea	ebx,[1309151649+ebp*1+ebx]
+	xor	edi,edx
+	mov	ebp,DWORD [16+esi]
+	add	ebx,edi
+	mov	edi,-1
+	rol	ebx,21
+	xor	edi,edx
+	add	ebx,ecx
+	; R3 60
+	or	edi,ebx
+	lea	eax,[4149444226+ebp*1+eax]
+	xor	edi,ecx
+	mov	ebp,DWORD [44+esi]
+	add	eax,edi
+	mov	edi,-1
+	rol	eax,6
+	xor	edi,ecx
+	add	eax,ebx
+	; R3 61
+	or	edi,eax
+	lea	edx,[3174756917+ebp*1+edx]
+	xor	edi,ebx
+	mov	ebp,DWORD [8+esi]
+	add	edx,edi
+	mov	edi,-1
+	rol	edx,10
+	xor	edi,ebx
+	add	edx,eax
+	; R3 62
+	or	edi,edx
+	lea	ecx,[718787259+ebp*1+ecx]
+	xor	edi,eax
+	mov	ebp,DWORD [36+esi]
+	add	ecx,edi
+	mov	edi,-1
+	rol	ecx,15
+	xor	edi,eax
+	add	ecx,edx
+	; R3 63
+	or	edi,ecx
+	lea	ebx,[3951481745+ebp*1+ebx]
+	xor	edi,edx
+	mov	ebp,DWORD [24+esp]
+	add	ebx,edi
+	add	esi,64
+	rol	ebx,21
+	mov	edi,DWORD [ebp]
+	add	ebx,ecx
+	add	eax,edi
+	mov	edi,DWORD [4+ebp]
+	add	ebx,edi
+	mov	edi,DWORD [8+ebp]
+	add	ecx,edi
+	mov	edi,DWORD [12+ebp]
+	add	edx,edi
+	mov	DWORD [ebp],eax
+	mov	DWORD [4+ebp],ebx
+	mov	edi,DWORD [esp]
+	mov	DWORD [8+ebp],ecx
+	mov	DWORD [12+ebp],edx
+	cmp	edi,esi
+	jae	NEAR L$000start
+	pop	eax
+	pop	ebx
+	pop	ebp
+	pop	edi
+	pop	esi
+	ret
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/md5-x86_64-apple.S b/gen/bcm/md5-x86_64-apple.S
new file mode 100644
index 0000000..e4c0241
--- /dev/null
+++ b/gen/bcm/md5-x86_64-apple.S
@@ -0,0 +1,690 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text	
+.p2align	4
+
+.globl	_md5_block_asm_data_order
+.private_extern _md5_block_asm_data_order
+
+_md5_block_asm_data_order:
+
+_CET_ENDBR
+	pushq	%rbp
+
+	pushq	%rbx
+
+	pushq	%r12
+
+	pushq	%r14
+
+	pushq	%r15
+
+L$prologue:
+
+
+
+
+	movq	%rdi,%rbp
+	shlq	$6,%rdx
+	leaq	(%rsi,%rdx,1),%rdi
+	movl	0(%rbp),%eax
+	movl	4(%rbp),%ebx
+	movl	8(%rbp),%ecx
+	movl	12(%rbp),%edx
+
+
+
+
+
+
+
+	cmpq	%rdi,%rsi
+	je	L$end
+
+
+L$loop:
+	movl	%eax,%r8d
+	movl	%ebx,%r9d
+	movl	%ecx,%r14d
+	movl	%edx,%r15d
+	movl	0(%rsi),%r10d
+	movl	%edx,%r11d
+	xorl	%ecx,%r11d
+	leal	-680876936(%rax,%r10,1),%eax
+	andl	%ebx,%r11d
+	xorl	%edx,%r11d
+	movl	4(%rsi),%r10d
+	addl	%r11d,%eax
+	roll	$7,%eax
+	movl	%ecx,%r11d
+	addl	%ebx,%eax
+	xorl	%ebx,%r11d
+	leal	-389564586(%rdx,%r10,1),%edx
+	andl	%eax,%r11d
+	xorl	%ecx,%r11d
+	movl	8(%rsi),%r10d
+	addl	%r11d,%edx
+	roll	$12,%edx
+	movl	%ebx,%r11d
+	addl	%eax,%edx
+	xorl	%eax,%r11d
+	leal	606105819(%rcx,%r10,1),%ecx
+	andl	%edx,%r11d
+	xorl	%ebx,%r11d
+	movl	12(%rsi),%r10d
+	addl	%r11d,%ecx
+	roll	$17,%ecx
+	movl	%eax,%r11d
+	addl	%edx,%ecx
+	xorl	%edx,%r11d
+	leal	-1044525330(%rbx,%r10,1),%ebx
+	andl	%ecx,%r11d
+	xorl	%eax,%r11d
+	movl	16(%rsi),%r10d
+	addl	%r11d,%ebx
+	roll	$22,%ebx
+	movl	%edx,%r11d
+	addl	%ecx,%ebx
+	xorl	%ecx,%r11d
+	leal	-176418897(%rax,%r10,1),%eax
+	andl	%ebx,%r11d
+	xorl	%edx,%r11d
+	movl	20(%rsi),%r10d
+	addl	%r11d,%eax
+	roll	$7,%eax
+	movl	%ecx,%r11d
+	addl	%ebx,%eax
+	xorl	%ebx,%r11d
+	leal	1200080426(%rdx,%r10,1),%edx
+	andl	%eax,%r11d
+	xorl	%ecx,%r11d
+	movl	24(%rsi),%r10d
+	addl	%r11d,%edx
+	roll	$12,%edx
+	movl	%ebx,%r11d
+	addl	%eax,%edx
+	xorl	%eax,%r11d
+	leal	-1473231341(%rcx,%r10,1),%ecx
+	andl	%edx,%r11d
+	xorl	%ebx,%r11d
+	movl	28(%rsi),%r10d
+	addl	%r11d,%ecx
+	roll	$17,%ecx
+	movl	%eax,%r11d
+	addl	%edx,%ecx
+	xorl	%edx,%r11d
+	leal	-45705983(%rbx,%r10,1),%ebx
+	andl	%ecx,%r11d
+	xorl	%eax,%r11d
+	movl	32(%rsi),%r10d
+	addl	%r11d,%ebx
+	roll	$22,%ebx
+	movl	%edx,%r11d
+	addl	%ecx,%ebx
+	xorl	%ecx,%r11d
+	leal	1770035416(%rax,%r10,1),%eax
+	andl	%ebx,%r11d
+	xorl	%edx,%r11d
+	movl	36(%rsi),%r10d
+	addl	%r11d,%eax
+	roll	$7,%eax
+	movl	%ecx,%r11d
+	addl	%ebx,%eax
+	xorl	%ebx,%r11d
+	leal	-1958414417(%rdx,%r10,1),%edx
+	andl	%eax,%r11d
+	xorl	%ecx,%r11d
+	movl	40(%rsi),%r10d
+	addl	%r11d,%edx
+	roll	$12,%edx
+	movl	%ebx,%r11d
+	addl	%eax,%edx
+	xorl	%eax,%r11d
+	leal	-42063(%rcx,%r10,1),%ecx
+	andl	%edx,%r11d
+	xorl	%ebx,%r11d
+	movl	44(%rsi),%r10d
+	addl	%r11d,%ecx
+	roll	$17,%ecx
+	movl	%eax,%r11d
+	addl	%edx,%ecx
+	xorl	%edx,%r11d
+	leal	-1990404162(%rbx,%r10,1),%ebx
+	andl	%ecx,%r11d
+	xorl	%eax,%r11d
+	movl	48(%rsi),%r10d
+	addl	%r11d,%ebx
+	roll	$22,%ebx
+	movl	%edx,%r11d
+	addl	%ecx,%ebx
+	xorl	%ecx,%r11d
+	leal	1804603682(%rax,%r10,1),%eax
+	andl	%ebx,%r11d
+	xorl	%edx,%r11d
+	movl	52(%rsi),%r10d
+	addl	%r11d,%eax
+	roll	$7,%eax
+	movl	%ecx,%r11d
+	addl	%ebx,%eax
+	xorl	%ebx,%r11d
+	leal	-40341101(%rdx,%r10,1),%edx
+	andl	%eax,%r11d
+	xorl	%ecx,%r11d
+	movl	56(%rsi),%r10d
+	addl	%r11d,%edx
+	roll	$12,%edx
+	movl	%ebx,%r11d
+	addl	%eax,%edx
+	xorl	%eax,%r11d
+	leal	-1502002290(%rcx,%r10,1),%ecx
+	andl	%edx,%r11d
+	xorl	%ebx,%r11d
+	movl	60(%rsi),%r10d
+	addl	%r11d,%ecx
+	roll	$17,%ecx
+	movl	%eax,%r11d
+	addl	%edx,%ecx
+	xorl	%edx,%r11d
+	leal	1236535329(%rbx,%r10,1),%ebx
+	andl	%ecx,%r11d
+	xorl	%eax,%r11d
+	movl	0(%rsi),%r10d
+	addl	%r11d,%ebx
+	roll	$22,%ebx
+	movl	%edx,%r11d
+	addl	%ecx,%ebx
+	movl	4(%rsi),%r10d
+	movl	%edx,%r11d
+	movl	%edx,%r12d
+	notl	%r11d
+	leal	-165796510(%rax,%r10,1),%eax
+	andl	%ebx,%r12d
+	andl	%ecx,%r11d
+	movl	24(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ecx,%r11d
+	addl	%r12d,%eax
+	movl	%ecx,%r12d
+	roll	$5,%eax
+	addl	%ebx,%eax
+	notl	%r11d
+	leal	-1069501632(%rdx,%r10,1),%edx
+	andl	%eax,%r12d
+	andl	%ebx,%r11d
+	movl	44(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ebx,%r11d
+	addl	%r12d,%edx
+	movl	%ebx,%r12d
+	roll	$9,%edx
+	addl	%eax,%edx
+	notl	%r11d
+	leal	643717713(%rcx,%r10,1),%ecx
+	andl	%edx,%r12d
+	andl	%eax,%r11d
+	movl	0(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%eax,%r11d
+	addl	%r12d,%ecx
+	movl	%eax,%r12d
+	roll	$14,%ecx
+	addl	%edx,%ecx
+	notl	%r11d
+	leal	-373897302(%rbx,%r10,1),%ebx
+	andl	%ecx,%r12d
+	andl	%edx,%r11d
+	movl	20(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%edx,%r11d
+	addl	%r12d,%ebx
+	movl	%edx,%r12d
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+	notl	%r11d
+	leal	-701558691(%rax,%r10,1),%eax
+	andl	%ebx,%r12d
+	andl	%ecx,%r11d
+	movl	40(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ecx,%r11d
+	addl	%r12d,%eax
+	movl	%ecx,%r12d
+	roll	$5,%eax
+	addl	%ebx,%eax
+	notl	%r11d
+	leal	38016083(%rdx,%r10,1),%edx
+	andl	%eax,%r12d
+	andl	%ebx,%r11d
+	movl	60(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ebx,%r11d
+	addl	%r12d,%edx
+	movl	%ebx,%r12d
+	roll	$9,%edx
+	addl	%eax,%edx
+	notl	%r11d
+	leal	-660478335(%rcx,%r10,1),%ecx
+	andl	%edx,%r12d
+	andl	%eax,%r11d
+	movl	16(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%eax,%r11d
+	addl	%r12d,%ecx
+	movl	%eax,%r12d
+	roll	$14,%ecx
+	addl	%edx,%ecx
+	notl	%r11d
+	leal	-405537848(%rbx,%r10,1),%ebx
+	andl	%ecx,%r12d
+	andl	%edx,%r11d
+	movl	36(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%edx,%r11d
+	addl	%r12d,%ebx
+	movl	%edx,%r12d
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+	notl	%r11d
+	leal	568446438(%rax,%r10,1),%eax
+	andl	%ebx,%r12d
+	andl	%ecx,%r11d
+	movl	56(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ecx,%r11d
+	addl	%r12d,%eax
+	movl	%ecx,%r12d
+	roll	$5,%eax
+	addl	%ebx,%eax
+	notl	%r11d
+	leal	-1019803690(%rdx,%r10,1),%edx
+	andl	%eax,%r12d
+	andl	%ebx,%r11d
+	movl	12(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ebx,%r11d
+	addl	%r12d,%edx
+	movl	%ebx,%r12d
+	roll	$9,%edx
+	addl	%eax,%edx
+	notl	%r11d
+	leal	-187363961(%rcx,%r10,1),%ecx
+	andl	%edx,%r12d
+	andl	%eax,%r11d
+	movl	32(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%eax,%r11d
+	addl	%r12d,%ecx
+	movl	%eax,%r12d
+	roll	$14,%ecx
+	addl	%edx,%ecx
+	notl	%r11d
+	leal	1163531501(%rbx,%r10,1),%ebx
+	andl	%ecx,%r12d
+	andl	%edx,%r11d
+	movl	52(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%edx,%r11d
+	addl	%r12d,%ebx
+	movl	%edx,%r12d
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+	notl	%r11d
+	leal	-1444681467(%rax,%r10,1),%eax
+	andl	%ebx,%r12d
+	andl	%ecx,%r11d
+	movl	8(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ecx,%r11d
+	addl	%r12d,%eax
+	movl	%ecx,%r12d
+	roll	$5,%eax
+	addl	%ebx,%eax
+	notl	%r11d
+	leal	-51403784(%rdx,%r10,1),%edx
+	andl	%eax,%r12d
+	andl	%ebx,%r11d
+	movl	28(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ebx,%r11d
+	addl	%r12d,%edx
+	movl	%ebx,%r12d
+	roll	$9,%edx
+	addl	%eax,%edx
+	notl	%r11d
+	leal	1735328473(%rcx,%r10,1),%ecx
+	andl	%edx,%r12d
+	andl	%eax,%r11d
+	movl	48(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%eax,%r11d
+	addl	%r12d,%ecx
+	movl	%eax,%r12d
+	roll	$14,%ecx
+	addl	%edx,%ecx
+	notl	%r11d
+	leal	-1926607734(%rbx,%r10,1),%ebx
+	andl	%ecx,%r12d
+	andl	%edx,%r11d
+	movl	0(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%edx,%r11d
+	addl	%r12d,%ebx
+	movl	%edx,%r12d
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+	movl	20(%rsi),%r10d
+	movl	%ecx,%r11d
+	leal	-378558(%rax,%r10,1),%eax
+	movl	32(%rsi),%r10d
+	xorl	%edx,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%eax
+	roll	$4,%eax
+	movl	%ebx,%r11d
+	addl	%ebx,%eax
+	leal	-2022574463(%rdx,%r10,1),%edx
+	movl	44(%rsi),%r10d
+	xorl	%ecx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%edx
+	roll	$11,%edx
+	movl	%eax,%r11d
+	addl	%eax,%edx
+	leal	1839030562(%rcx,%r10,1),%ecx
+	movl	56(%rsi),%r10d
+	xorl	%ebx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ecx
+	roll	$16,%ecx
+	movl	%edx,%r11d
+	addl	%edx,%ecx
+	leal	-35309556(%rbx,%r10,1),%ebx
+	movl	4(%rsi),%r10d
+	xorl	%eax,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%ebx
+	roll	$23,%ebx
+	movl	%ecx,%r11d
+	addl	%ecx,%ebx
+	leal	-1530992060(%rax,%r10,1),%eax
+	movl	16(%rsi),%r10d
+	xorl	%edx,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%eax
+	roll	$4,%eax
+	movl	%ebx,%r11d
+	addl	%ebx,%eax
+	leal	1272893353(%rdx,%r10,1),%edx
+	movl	28(%rsi),%r10d
+	xorl	%ecx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%edx
+	roll	$11,%edx
+	movl	%eax,%r11d
+	addl	%eax,%edx
+	leal	-155497632(%rcx,%r10,1),%ecx
+	movl	40(%rsi),%r10d
+	xorl	%ebx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ecx
+	roll	$16,%ecx
+	movl	%edx,%r11d
+	addl	%edx,%ecx
+	leal	-1094730640(%rbx,%r10,1),%ebx
+	movl	52(%rsi),%r10d
+	xorl	%eax,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%ebx
+	roll	$23,%ebx
+	movl	%ecx,%r11d
+	addl	%ecx,%ebx
+	leal	681279174(%rax,%r10,1),%eax
+	movl	0(%rsi),%r10d
+	xorl	%edx,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%eax
+	roll	$4,%eax
+	movl	%ebx,%r11d
+	addl	%ebx,%eax
+	leal	-358537222(%rdx,%r10,1),%edx
+	movl	12(%rsi),%r10d
+	xorl	%ecx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%edx
+	roll	$11,%edx
+	movl	%eax,%r11d
+	addl	%eax,%edx
+	leal	-722521979(%rcx,%r10,1),%ecx
+	movl	24(%rsi),%r10d
+	xorl	%ebx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ecx
+	roll	$16,%ecx
+	movl	%edx,%r11d
+	addl	%edx,%ecx
+	leal	76029189(%rbx,%r10,1),%ebx
+	movl	36(%rsi),%r10d
+	xorl	%eax,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%ebx
+	roll	$23,%ebx
+	movl	%ecx,%r11d
+	addl	%ecx,%ebx
+	leal	-640364487(%rax,%r10,1),%eax
+	movl	48(%rsi),%r10d
+	xorl	%edx,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%eax
+	roll	$4,%eax
+	movl	%ebx,%r11d
+	addl	%ebx,%eax
+	leal	-421815835(%rdx,%r10,1),%edx
+	movl	60(%rsi),%r10d
+	xorl	%ecx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%edx
+	roll	$11,%edx
+	movl	%eax,%r11d
+	addl	%eax,%edx
+	leal	530742520(%rcx,%r10,1),%ecx
+	movl	8(%rsi),%r10d
+	xorl	%ebx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ecx
+	roll	$16,%ecx
+	movl	%edx,%r11d
+	addl	%edx,%ecx
+	leal	-995338651(%rbx,%r10,1),%ebx
+	movl	0(%rsi),%r10d
+	xorl	%eax,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%ebx
+	roll	$23,%ebx
+	movl	%ecx,%r11d
+	addl	%ecx,%ebx
+	movl	0(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	xorl	%edx,%r11d
+	leal	-198630844(%rax,%r10,1),%eax
+	orl	%ebx,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%eax
+	movl	28(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$6,%eax
+	xorl	%ecx,%r11d
+	addl	%ebx,%eax
+	leal	1126891415(%rdx,%r10,1),%edx
+	orl	%eax,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%edx
+	movl	56(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$10,%edx
+	xorl	%ebx,%r11d
+	addl	%eax,%edx
+	leal	-1416354905(%rcx,%r10,1),%ecx
+	orl	%edx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%ecx
+	movl	20(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$15,%ecx
+	xorl	%eax,%r11d
+	addl	%edx,%ecx
+	leal	-57434055(%rbx,%r10,1),%ebx
+	orl	%ecx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ebx
+	movl	48(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$21,%ebx
+	xorl	%edx,%r11d
+	addl	%ecx,%ebx
+	leal	1700485571(%rax,%r10,1),%eax
+	orl	%ebx,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%eax
+	movl	12(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$6,%eax
+	xorl	%ecx,%r11d
+	addl	%ebx,%eax
+	leal	-1894986606(%rdx,%r10,1),%edx
+	orl	%eax,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%edx
+	movl	40(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$10,%edx
+	xorl	%ebx,%r11d
+	addl	%eax,%edx
+	leal	-1051523(%rcx,%r10,1),%ecx
+	orl	%edx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%ecx
+	movl	4(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$15,%ecx
+	xorl	%eax,%r11d
+	addl	%edx,%ecx
+	leal	-2054922799(%rbx,%r10,1),%ebx
+	orl	%ecx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ebx
+	movl	32(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$21,%ebx
+	xorl	%edx,%r11d
+	addl	%ecx,%ebx
+	leal	1873313359(%rax,%r10,1),%eax
+	orl	%ebx,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%eax
+	movl	60(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$6,%eax
+	xorl	%ecx,%r11d
+	addl	%ebx,%eax
+	leal	-30611744(%rdx,%r10,1),%edx
+	orl	%eax,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%edx
+	movl	24(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$10,%edx
+	xorl	%ebx,%r11d
+	addl	%eax,%edx
+	leal	-1560198380(%rcx,%r10,1),%ecx
+	orl	%edx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%ecx
+	movl	52(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$15,%ecx
+	xorl	%eax,%r11d
+	addl	%edx,%ecx
+	leal	1309151649(%rbx,%r10,1),%ebx
+	orl	%ecx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ebx
+	movl	16(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$21,%ebx
+	xorl	%edx,%r11d
+	addl	%ecx,%ebx
+	leal	-145523070(%rax,%r10,1),%eax
+	orl	%ebx,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%eax
+	movl	44(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$6,%eax
+	xorl	%ecx,%r11d
+	addl	%ebx,%eax
+	leal	-1120210379(%rdx,%r10,1),%edx
+	orl	%eax,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%edx
+	movl	8(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$10,%edx
+	xorl	%ebx,%r11d
+	addl	%eax,%edx
+	leal	718787259(%rcx,%r10,1),%ecx
+	orl	%edx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%ecx
+	movl	36(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$15,%ecx
+	xorl	%eax,%r11d
+	addl	%edx,%ecx
+	leal	-343485551(%rbx,%r10,1),%ebx
+	orl	%ecx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ebx
+	movl	0(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$21,%ebx
+	xorl	%edx,%r11d
+	addl	%ecx,%ebx
+
+	addl	%r8d,%eax
+	addl	%r9d,%ebx
+	addl	%r14d,%ecx
+	addl	%r15d,%edx
+
+
+	addq	$64,%rsi
+	cmpq	%rdi,%rsi
+	jb	L$loop
+
+
+L$end:
+	movl	%eax,0(%rbp)
+	movl	%ebx,4(%rbp)
+	movl	%ecx,8(%rbp)
+	movl	%edx,12(%rbp)
+
+	movq	(%rsp),%r15
+
+	movq	8(%rsp),%r14
+
+	movq	16(%rsp),%r12
+
+	movq	24(%rsp),%rbx
+
+	movq	32(%rsp),%rbp
+
+	addq	$40,%rsp
+
+L$epilogue:
+	ret
+
+
+#endif
diff --git a/gen/bcm/md5-x86_64-linux.S b/gen/bcm/md5-x86_64-linux.S
new file mode 100644
index 0000000..7b93662
--- /dev/null
+++ b/gen/bcm/md5-x86_64-linux.S
@@ -0,0 +1,695 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text	
+.align	16
+
+.globl	md5_block_asm_data_order
+.hidden md5_block_asm_data_order
+.type	md5_block_asm_data_order,@function
+md5_block_asm_data_order:
+.cfi_startproc	
+_CET_ENDBR
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	r12,-32
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	r14,-40
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	r15,-48
+.Lprologue:
+
+
+
+
+	movq	%rdi,%rbp
+	shlq	$6,%rdx
+	leaq	(%rsi,%rdx,1),%rdi
+	movl	0(%rbp),%eax
+	movl	4(%rbp),%ebx
+	movl	8(%rbp),%ecx
+	movl	12(%rbp),%edx
+
+
+
+
+
+
+
+	cmpq	%rdi,%rsi
+	je	.Lend
+
+
+.Lloop:
+	movl	%eax,%r8d
+	movl	%ebx,%r9d
+	movl	%ecx,%r14d
+	movl	%edx,%r15d
+	movl	0(%rsi),%r10d
+	movl	%edx,%r11d
+	xorl	%ecx,%r11d
+	leal	-680876936(%rax,%r10,1),%eax
+	andl	%ebx,%r11d
+	xorl	%edx,%r11d
+	movl	4(%rsi),%r10d
+	addl	%r11d,%eax
+	roll	$7,%eax
+	movl	%ecx,%r11d
+	addl	%ebx,%eax
+	xorl	%ebx,%r11d
+	leal	-389564586(%rdx,%r10,1),%edx
+	andl	%eax,%r11d
+	xorl	%ecx,%r11d
+	movl	8(%rsi),%r10d
+	addl	%r11d,%edx
+	roll	$12,%edx
+	movl	%ebx,%r11d
+	addl	%eax,%edx
+	xorl	%eax,%r11d
+	leal	606105819(%rcx,%r10,1),%ecx
+	andl	%edx,%r11d
+	xorl	%ebx,%r11d
+	movl	12(%rsi),%r10d
+	addl	%r11d,%ecx
+	roll	$17,%ecx
+	movl	%eax,%r11d
+	addl	%edx,%ecx
+	xorl	%edx,%r11d
+	leal	-1044525330(%rbx,%r10,1),%ebx
+	andl	%ecx,%r11d
+	xorl	%eax,%r11d
+	movl	16(%rsi),%r10d
+	addl	%r11d,%ebx
+	roll	$22,%ebx
+	movl	%edx,%r11d
+	addl	%ecx,%ebx
+	xorl	%ecx,%r11d
+	leal	-176418897(%rax,%r10,1),%eax
+	andl	%ebx,%r11d
+	xorl	%edx,%r11d
+	movl	20(%rsi),%r10d
+	addl	%r11d,%eax
+	roll	$7,%eax
+	movl	%ecx,%r11d
+	addl	%ebx,%eax
+	xorl	%ebx,%r11d
+	leal	1200080426(%rdx,%r10,1),%edx
+	andl	%eax,%r11d
+	xorl	%ecx,%r11d
+	movl	24(%rsi),%r10d
+	addl	%r11d,%edx
+	roll	$12,%edx
+	movl	%ebx,%r11d
+	addl	%eax,%edx
+	xorl	%eax,%r11d
+	leal	-1473231341(%rcx,%r10,1),%ecx
+	andl	%edx,%r11d
+	xorl	%ebx,%r11d
+	movl	28(%rsi),%r10d
+	addl	%r11d,%ecx
+	roll	$17,%ecx
+	movl	%eax,%r11d
+	addl	%edx,%ecx
+	xorl	%edx,%r11d
+	leal	-45705983(%rbx,%r10,1),%ebx
+	andl	%ecx,%r11d
+	xorl	%eax,%r11d
+	movl	32(%rsi),%r10d
+	addl	%r11d,%ebx
+	roll	$22,%ebx
+	movl	%edx,%r11d
+	addl	%ecx,%ebx
+	xorl	%ecx,%r11d
+	leal	1770035416(%rax,%r10,1),%eax
+	andl	%ebx,%r11d
+	xorl	%edx,%r11d
+	movl	36(%rsi),%r10d
+	addl	%r11d,%eax
+	roll	$7,%eax
+	movl	%ecx,%r11d
+	addl	%ebx,%eax
+	xorl	%ebx,%r11d
+	leal	-1958414417(%rdx,%r10,1),%edx
+	andl	%eax,%r11d
+	xorl	%ecx,%r11d
+	movl	40(%rsi),%r10d
+	addl	%r11d,%edx
+	roll	$12,%edx
+	movl	%ebx,%r11d
+	addl	%eax,%edx
+	xorl	%eax,%r11d
+	leal	-42063(%rcx,%r10,1),%ecx
+	andl	%edx,%r11d
+	xorl	%ebx,%r11d
+	movl	44(%rsi),%r10d
+	addl	%r11d,%ecx
+	roll	$17,%ecx
+	movl	%eax,%r11d
+	addl	%edx,%ecx
+	xorl	%edx,%r11d
+	leal	-1990404162(%rbx,%r10,1),%ebx
+	andl	%ecx,%r11d
+	xorl	%eax,%r11d
+	movl	48(%rsi),%r10d
+	addl	%r11d,%ebx
+	roll	$22,%ebx
+	movl	%edx,%r11d
+	addl	%ecx,%ebx
+	xorl	%ecx,%r11d
+	leal	1804603682(%rax,%r10,1),%eax
+	andl	%ebx,%r11d
+	xorl	%edx,%r11d
+	movl	52(%rsi),%r10d
+	addl	%r11d,%eax
+	roll	$7,%eax
+	movl	%ecx,%r11d
+	addl	%ebx,%eax
+	xorl	%ebx,%r11d
+	leal	-40341101(%rdx,%r10,1),%edx
+	andl	%eax,%r11d
+	xorl	%ecx,%r11d
+	movl	56(%rsi),%r10d
+	addl	%r11d,%edx
+	roll	$12,%edx
+	movl	%ebx,%r11d
+	addl	%eax,%edx
+	xorl	%eax,%r11d
+	leal	-1502002290(%rcx,%r10,1),%ecx
+	andl	%edx,%r11d
+	xorl	%ebx,%r11d
+	movl	60(%rsi),%r10d
+	addl	%r11d,%ecx
+	roll	$17,%ecx
+	movl	%eax,%r11d
+	addl	%edx,%ecx
+	xorl	%edx,%r11d
+	leal	1236535329(%rbx,%r10,1),%ebx
+	andl	%ecx,%r11d
+	xorl	%eax,%r11d
+	movl	0(%rsi),%r10d
+	addl	%r11d,%ebx
+	roll	$22,%ebx
+	movl	%edx,%r11d
+	addl	%ecx,%ebx
+	movl	4(%rsi),%r10d
+	movl	%edx,%r11d
+	movl	%edx,%r12d
+	notl	%r11d
+	leal	-165796510(%rax,%r10,1),%eax
+	andl	%ebx,%r12d
+	andl	%ecx,%r11d
+	movl	24(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ecx,%r11d
+	addl	%r12d,%eax
+	movl	%ecx,%r12d
+	roll	$5,%eax
+	addl	%ebx,%eax
+	notl	%r11d
+	leal	-1069501632(%rdx,%r10,1),%edx
+	andl	%eax,%r12d
+	andl	%ebx,%r11d
+	movl	44(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ebx,%r11d
+	addl	%r12d,%edx
+	movl	%ebx,%r12d
+	roll	$9,%edx
+	addl	%eax,%edx
+	notl	%r11d
+	leal	643717713(%rcx,%r10,1),%ecx
+	andl	%edx,%r12d
+	andl	%eax,%r11d
+	movl	0(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%eax,%r11d
+	addl	%r12d,%ecx
+	movl	%eax,%r12d
+	roll	$14,%ecx
+	addl	%edx,%ecx
+	notl	%r11d
+	leal	-373897302(%rbx,%r10,1),%ebx
+	andl	%ecx,%r12d
+	andl	%edx,%r11d
+	movl	20(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%edx,%r11d
+	addl	%r12d,%ebx
+	movl	%edx,%r12d
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+	notl	%r11d
+	leal	-701558691(%rax,%r10,1),%eax
+	andl	%ebx,%r12d
+	andl	%ecx,%r11d
+	movl	40(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ecx,%r11d
+	addl	%r12d,%eax
+	movl	%ecx,%r12d
+	roll	$5,%eax
+	addl	%ebx,%eax
+	notl	%r11d
+	leal	38016083(%rdx,%r10,1),%edx
+	andl	%eax,%r12d
+	andl	%ebx,%r11d
+	movl	60(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ebx,%r11d
+	addl	%r12d,%edx
+	movl	%ebx,%r12d
+	roll	$9,%edx
+	addl	%eax,%edx
+	notl	%r11d
+	leal	-660478335(%rcx,%r10,1),%ecx
+	andl	%edx,%r12d
+	andl	%eax,%r11d
+	movl	16(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%eax,%r11d
+	addl	%r12d,%ecx
+	movl	%eax,%r12d
+	roll	$14,%ecx
+	addl	%edx,%ecx
+	notl	%r11d
+	leal	-405537848(%rbx,%r10,1),%ebx
+	andl	%ecx,%r12d
+	andl	%edx,%r11d
+	movl	36(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%edx,%r11d
+	addl	%r12d,%ebx
+	movl	%edx,%r12d
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+	notl	%r11d
+	leal	568446438(%rax,%r10,1),%eax
+	andl	%ebx,%r12d
+	andl	%ecx,%r11d
+	movl	56(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ecx,%r11d
+	addl	%r12d,%eax
+	movl	%ecx,%r12d
+	roll	$5,%eax
+	addl	%ebx,%eax
+	notl	%r11d
+	leal	-1019803690(%rdx,%r10,1),%edx
+	andl	%eax,%r12d
+	andl	%ebx,%r11d
+	movl	12(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ebx,%r11d
+	addl	%r12d,%edx
+	movl	%ebx,%r12d
+	roll	$9,%edx
+	addl	%eax,%edx
+	notl	%r11d
+	leal	-187363961(%rcx,%r10,1),%ecx
+	andl	%edx,%r12d
+	andl	%eax,%r11d
+	movl	32(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%eax,%r11d
+	addl	%r12d,%ecx
+	movl	%eax,%r12d
+	roll	$14,%ecx
+	addl	%edx,%ecx
+	notl	%r11d
+	leal	1163531501(%rbx,%r10,1),%ebx
+	andl	%ecx,%r12d
+	andl	%edx,%r11d
+	movl	52(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%edx,%r11d
+	addl	%r12d,%ebx
+	movl	%edx,%r12d
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+	notl	%r11d
+	leal	-1444681467(%rax,%r10,1),%eax
+	andl	%ebx,%r12d
+	andl	%ecx,%r11d
+	movl	8(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ecx,%r11d
+	addl	%r12d,%eax
+	movl	%ecx,%r12d
+	roll	$5,%eax
+	addl	%ebx,%eax
+	notl	%r11d
+	leal	-51403784(%rdx,%r10,1),%edx
+	andl	%eax,%r12d
+	andl	%ebx,%r11d
+	movl	28(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ebx,%r11d
+	addl	%r12d,%edx
+	movl	%ebx,%r12d
+	roll	$9,%edx
+	addl	%eax,%edx
+	notl	%r11d
+	leal	1735328473(%rcx,%r10,1),%ecx
+	andl	%edx,%r12d
+	andl	%eax,%r11d
+	movl	48(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%eax,%r11d
+	addl	%r12d,%ecx
+	movl	%eax,%r12d
+	roll	$14,%ecx
+	addl	%edx,%ecx
+	notl	%r11d
+	leal	-1926607734(%rbx,%r10,1),%ebx
+	andl	%ecx,%r12d
+	andl	%edx,%r11d
+	movl	0(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%edx,%r11d
+	addl	%r12d,%ebx
+	movl	%edx,%r12d
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+	movl	20(%rsi),%r10d
+	movl	%ecx,%r11d
+	leal	-378558(%rax,%r10,1),%eax
+	movl	32(%rsi),%r10d
+	xorl	%edx,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%eax
+	roll	$4,%eax
+	movl	%ebx,%r11d
+	addl	%ebx,%eax
+	leal	-2022574463(%rdx,%r10,1),%edx
+	movl	44(%rsi),%r10d
+	xorl	%ecx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%edx
+	roll	$11,%edx
+	movl	%eax,%r11d
+	addl	%eax,%edx
+	leal	1839030562(%rcx,%r10,1),%ecx
+	movl	56(%rsi),%r10d
+	xorl	%ebx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ecx
+	roll	$16,%ecx
+	movl	%edx,%r11d
+	addl	%edx,%ecx
+	leal	-35309556(%rbx,%r10,1),%ebx
+	movl	4(%rsi),%r10d
+	xorl	%eax,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%ebx
+	roll	$23,%ebx
+	movl	%ecx,%r11d
+	addl	%ecx,%ebx
+	leal	-1530992060(%rax,%r10,1),%eax
+	movl	16(%rsi),%r10d
+	xorl	%edx,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%eax
+	roll	$4,%eax
+	movl	%ebx,%r11d
+	addl	%ebx,%eax
+	leal	1272893353(%rdx,%r10,1),%edx
+	movl	28(%rsi),%r10d
+	xorl	%ecx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%edx
+	roll	$11,%edx
+	movl	%eax,%r11d
+	addl	%eax,%edx
+	leal	-155497632(%rcx,%r10,1),%ecx
+	movl	40(%rsi),%r10d
+	xorl	%ebx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ecx
+	roll	$16,%ecx
+	movl	%edx,%r11d
+	addl	%edx,%ecx
+	leal	-1094730640(%rbx,%r10,1),%ebx
+	movl	52(%rsi),%r10d
+	xorl	%eax,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%ebx
+	roll	$23,%ebx
+	movl	%ecx,%r11d
+	addl	%ecx,%ebx
+	leal	681279174(%rax,%r10,1),%eax
+	movl	0(%rsi),%r10d
+	xorl	%edx,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%eax
+	roll	$4,%eax
+	movl	%ebx,%r11d
+	addl	%ebx,%eax
+	leal	-358537222(%rdx,%r10,1),%edx
+	movl	12(%rsi),%r10d
+	xorl	%ecx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%edx
+	roll	$11,%edx
+	movl	%eax,%r11d
+	addl	%eax,%edx
+	leal	-722521979(%rcx,%r10,1),%ecx
+	movl	24(%rsi),%r10d
+	xorl	%ebx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ecx
+	roll	$16,%ecx
+	movl	%edx,%r11d
+	addl	%edx,%ecx
+	leal	76029189(%rbx,%r10,1),%ebx
+	movl	36(%rsi),%r10d
+	xorl	%eax,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%ebx
+	roll	$23,%ebx
+	movl	%ecx,%r11d
+	addl	%ecx,%ebx
+	leal	-640364487(%rax,%r10,1),%eax
+	movl	48(%rsi),%r10d
+	xorl	%edx,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%eax
+	roll	$4,%eax
+	movl	%ebx,%r11d
+	addl	%ebx,%eax
+	leal	-421815835(%rdx,%r10,1),%edx
+	movl	60(%rsi),%r10d
+	xorl	%ecx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%edx
+	roll	$11,%edx
+	movl	%eax,%r11d
+	addl	%eax,%edx
+	leal	530742520(%rcx,%r10,1),%ecx
+	movl	8(%rsi),%r10d
+	xorl	%ebx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ecx
+	roll	$16,%ecx
+	movl	%edx,%r11d
+	addl	%edx,%ecx
+	leal	-995338651(%rbx,%r10,1),%ebx
+	movl	0(%rsi),%r10d
+	xorl	%eax,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%ebx
+	roll	$23,%ebx
+	movl	%ecx,%r11d
+	addl	%ecx,%ebx
+	movl	0(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	xorl	%edx,%r11d
+	leal	-198630844(%rax,%r10,1),%eax
+	orl	%ebx,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%eax
+	movl	28(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$6,%eax
+	xorl	%ecx,%r11d
+	addl	%ebx,%eax
+	leal	1126891415(%rdx,%r10,1),%edx
+	orl	%eax,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%edx
+	movl	56(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$10,%edx
+	xorl	%ebx,%r11d
+	addl	%eax,%edx
+	leal	-1416354905(%rcx,%r10,1),%ecx
+	orl	%edx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%ecx
+	movl	20(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$15,%ecx
+	xorl	%eax,%r11d
+	addl	%edx,%ecx
+	leal	-57434055(%rbx,%r10,1),%ebx
+	orl	%ecx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ebx
+	movl	48(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$21,%ebx
+	xorl	%edx,%r11d
+	addl	%ecx,%ebx
+	leal	1700485571(%rax,%r10,1),%eax
+	orl	%ebx,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%eax
+	movl	12(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$6,%eax
+	xorl	%ecx,%r11d
+	addl	%ebx,%eax
+	leal	-1894986606(%rdx,%r10,1),%edx
+	orl	%eax,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%edx
+	movl	40(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$10,%edx
+	xorl	%ebx,%r11d
+	addl	%eax,%edx
+	leal	-1051523(%rcx,%r10,1),%ecx
+	orl	%edx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%ecx
+	movl	4(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$15,%ecx
+	xorl	%eax,%r11d
+	addl	%edx,%ecx
+	leal	-2054922799(%rbx,%r10,1),%ebx
+	orl	%ecx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ebx
+	movl	32(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$21,%ebx
+	xorl	%edx,%r11d
+	addl	%ecx,%ebx
+	leal	1873313359(%rax,%r10,1),%eax
+	orl	%ebx,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%eax
+	movl	60(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$6,%eax
+	xorl	%ecx,%r11d
+	addl	%ebx,%eax
+	leal	-30611744(%rdx,%r10,1),%edx
+	orl	%eax,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%edx
+	movl	24(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$10,%edx
+	xorl	%ebx,%r11d
+	addl	%eax,%edx
+	leal	-1560198380(%rcx,%r10,1),%ecx
+	orl	%edx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%ecx
+	movl	52(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$15,%ecx
+	xorl	%eax,%r11d
+	addl	%edx,%ecx
+	leal	1309151649(%rbx,%r10,1),%ebx
+	orl	%ecx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ebx
+	movl	16(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$21,%ebx
+	xorl	%edx,%r11d
+	addl	%ecx,%ebx
+	leal	-145523070(%rax,%r10,1),%eax
+	orl	%ebx,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%eax
+	movl	44(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$6,%eax
+	xorl	%ecx,%r11d
+	addl	%ebx,%eax
+	leal	-1120210379(%rdx,%r10,1),%edx
+	orl	%eax,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%edx
+	movl	8(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$10,%edx
+	xorl	%ebx,%r11d
+	addl	%eax,%edx
+	leal	718787259(%rcx,%r10,1),%ecx
+	orl	%edx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%ecx
+	movl	36(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$15,%ecx
+	xorl	%eax,%r11d
+	addl	%edx,%ecx
+	leal	-343485551(%rbx,%r10,1),%ebx
+	orl	%ecx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ebx
+	movl	0(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$21,%ebx
+	xorl	%edx,%r11d
+	addl	%ecx,%ebx
+
+	addl	%r8d,%eax
+	addl	%r9d,%ebx
+	addl	%r14d,%ecx
+	addl	%r15d,%edx
+
+
+	addq	$64,%rsi
+	cmpq	%rdi,%rsi
+	jb	.Lloop
+
+
+.Lend:
+	movl	%eax,0(%rbp)
+	movl	%ebx,4(%rbp)
+	movl	%ecx,8(%rbp)
+	movl	%edx,12(%rbp)
+
+	movq	(%rsp),%r15
+.cfi_restore	r15
+	movq	8(%rsp),%r14
+.cfi_restore	r14
+	movq	16(%rsp),%r12
+.cfi_restore	r12
+	movq	24(%rsp),%rbx
+.cfi_restore	rbx
+	movq	32(%rsp),%rbp
+.cfi_restore	rbp
+	addq	$40,%rsp
+.cfi_adjust_cfa_offset	-40
+.Lepilogue:
+	ret
+.cfi_endproc	
+.size	md5_block_asm_data_order,.-md5_block_asm_data_order
+#endif
diff --git a/gen/bcm/md5-x86_64-win.asm b/gen/bcm/md5-x86_64-win.asm
new file mode 100644
index 0000000..f6c5b62
--- /dev/null
+++ b/gen/bcm/md5-x86_64-win.asm
@@ -0,0 +1,803 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section	.text code align=64
+
+ALIGN	16
+
+global	md5_block_asm_data_order
+
+md5_block_asm_data_order:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_md5_block_asm_data_order:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+_CET_ENDBR
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r14
+
+	push	r15
+
+$L$prologue:
+
+
+
+
+	mov	rbp,rdi
+	shl	rdx,6
+	lea	rdi,[rdx*1+rsi]
+	mov	eax,DWORD[rbp]
+	mov	ebx,DWORD[4+rbp]
+	mov	ecx,DWORD[8+rbp]
+	mov	edx,DWORD[12+rbp]
+
+
+
+
+
+
+
+	cmp	rsi,rdi
+	je	NEAR $L$end
+
+
+$L$loop:
+	mov	r8d,eax
+	mov	r9d,ebx
+	mov	r14d,ecx
+	mov	r15d,edx
+	mov	r10d,DWORD[rsi]
+	mov	r11d,edx
+	xor	r11d,ecx
+	lea	eax,[((-680876936))+r10*1+rax]
+	and	r11d,ebx
+	xor	r11d,edx
+	mov	r10d,DWORD[4+rsi]
+	add	eax,r11d
+	rol	eax,7
+	mov	r11d,ecx
+	add	eax,ebx
+	xor	r11d,ebx
+	lea	edx,[((-389564586))+r10*1+rdx]
+	and	r11d,eax
+	xor	r11d,ecx
+	mov	r10d,DWORD[8+rsi]
+	add	edx,r11d
+	rol	edx,12
+	mov	r11d,ebx
+	add	edx,eax
+	xor	r11d,eax
+	lea	ecx,[606105819+r10*1+rcx]
+	and	r11d,edx
+	xor	r11d,ebx
+	mov	r10d,DWORD[12+rsi]
+	add	ecx,r11d
+	rol	ecx,17
+	mov	r11d,eax
+	add	ecx,edx
+	xor	r11d,edx
+	lea	ebx,[((-1044525330))+r10*1+rbx]
+	and	r11d,ecx
+	xor	r11d,eax
+	mov	r10d,DWORD[16+rsi]
+	add	ebx,r11d
+	rol	ebx,22
+	mov	r11d,edx
+	add	ebx,ecx
+	xor	r11d,ecx
+	lea	eax,[((-176418897))+r10*1+rax]
+	and	r11d,ebx
+	xor	r11d,edx
+	mov	r10d,DWORD[20+rsi]
+	add	eax,r11d
+	rol	eax,7
+	mov	r11d,ecx
+	add	eax,ebx
+	xor	r11d,ebx
+	lea	edx,[1200080426+r10*1+rdx]
+	and	r11d,eax
+	xor	r11d,ecx
+	mov	r10d,DWORD[24+rsi]
+	add	edx,r11d
+	rol	edx,12
+	mov	r11d,ebx
+	add	edx,eax
+	xor	r11d,eax
+	lea	ecx,[((-1473231341))+r10*1+rcx]
+	and	r11d,edx
+	xor	r11d,ebx
+	mov	r10d,DWORD[28+rsi]
+	add	ecx,r11d
+	rol	ecx,17
+	mov	r11d,eax
+	add	ecx,edx
+	xor	r11d,edx
+	lea	ebx,[((-45705983))+r10*1+rbx]
+	and	r11d,ecx
+	xor	r11d,eax
+	mov	r10d,DWORD[32+rsi]
+	add	ebx,r11d
+	rol	ebx,22
+	mov	r11d,edx
+	add	ebx,ecx
+	xor	r11d,ecx
+	lea	eax,[1770035416+r10*1+rax]
+	and	r11d,ebx
+	xor	r11d,edx
+	mov	r10d,DWORD[36+rsi]
+	add	eax,r11d
+	rol	eax,7
+	mov	r11d,ecx
+	add	eax,ebx
+	xor	r11d,ebx
+	lea	edx,[((-1958414417))+r10*1+rdx]
+	and	r11d,eax
+	xor	r11d,ecx
+	mov	r10d,DWORD[40+rsi]
+	add	edx,r11d
+	rol	edx,12
+	mov	r11d,ebx
+	add	edx,eax
+	xor	r11d,eax
+	lea	ecx,[((-42063))+r10*1+rcx]
+	and	r11d,edx
+	xor	r11d,ebx
+	mov	r10d,DWORD[44+rsi]
+	add	ecx,r11d
+	rol	ecx,17
+	mov	r11d,eax
+	add	ecx,edx
+	xor	r11d,edx
+	lea	ebx,[((-1990404162))+r10*1+rbx]
+	and	r11d,ecx
+	xor	r11d,eax
+	mov	r10d,DWORD[48+rsi]
+	add	ebx,r11d
+	rol	ebx,22
+	mov	r11d,edx
+	add	ebx,ecx
+	xor	r11d,ecx
+	lea	eax,[1804603682+r10*1+rax]
+	and	r11d,ebx
+	xor	r11d,edx
+	mov	r10d,DWORD[52+rsi]
+	add	eax,r11d
+	rol	eax,7
+	mov	r11d,ecx
+	add	eax,ebx
+	xor	r11d,ebx
+	lea	edx,[((-40341101))+r10*1+rdx]
+	and	r11d,eax
+	xor	r11d,ecx
+	mov	r10d,DWORD[56+rsi]
+	add	edx,r11d
+	rol	edx,12
+	mov	r11d,ebx
+	add	edx,eax
+	xor	r11d,eax
+	lea	ecx,[((-1502002290))+r10*1+rcx]
+	and	r11d,edx
+	xor	r11d,ebx
+	mov	r10d,DWORD[60+rsi]
+	add	ecx,r11d
+	rol	ecx,17
+	mov	r11d,eax
+	add	ecx,edx
+	xor	r11d,edx
+	lea	ebx,[1236535329+r10*1+rbx]
+	and	r11d,ecx
+	xor	r11d,eax
+	mov	r10d,DWORD[rsi]
+	add	ebx,r11d
+	rol	ebx,22
+	mov	r11d,edx
+	add	ebx,ecx
+	mov	r10d,DWORD[4+rsi]
+	mov	r11d,edx
+	mov	r12d,edx
+	not	r11d
+	lea	eax,[((-165796510))+r10*1+rax]
+	and	r12d,ebx
+	and	r11d,ecx
+	mov	r10d,DWORD[24+rsi]
+	or	r12d,r11d
+	mov	r11d,ecx
+	add	eax,r12d
+	mov	r12d,ecx
+	rol	eax,5
+	add	eax,ebx
+	not	r11d
+	lea	edx,[((-1069501632))+r10*1+rdx]
+	and	r12d,eax
+	and	r11d,ebx
+	mov	r10d,DWORD[44+rsi]
+	or	r12d,r11d
+	mov	r11d,ebx
+	add	edx,r12d
+	mov	r12d,ebx
+	rol	edx,9
+	add	edx,eax
+	not	r11d
+	lea	ecx,[643717713+r10*1+rcx]
+	and	r12d,edx
+	and	r11d,eax
+	mov	r10d,DWORD[rsi]
+	or	r12d,r11d
+	mov	r11d,eax
+	add	ecx,r12d
+	mov	r12d,eax
+	rol	ecx,14
+	add	ecx,edx
+	not	r11d
+	lea	ebx,[((-373897302))+r10*1+rbx]
+	and	r12d,ecx
+	and	r11d,edx
+	mov	r10d,DWORD[20+rsi]
+	or	r12d,r11d
+	mov	r11d,edx
+	add	ebx,r12d
+	mov	r12d,edx
+	rol	ebx,20
+	add	ebx,ecx
+	not	r11d
+	lea	eax,[((-701558691))+r10*1+rax]
+	and	r12d,ebx
+	and	r11d,ecx
+	mov	r10d,DWORD[40+rsi]
+	or	r12d,r11d
+	mov	r11d,ecx
+	add	eax,r12d
+	mov	r12d,ecx
+	rol	eax,5
+	add	eax,ebx
+	not	r11d
+	lea	edx,[38016083+r10*1+rdx]
+	and	r12d,eax
+	and	r11d,ebx
+	mov	r10d,DWORD[60+rsi]
+	or	r12d,r11d
+	mov	r11d,ebx
+	add	edx,r12d
+	mov	r12d,ebx
+	rol	edx,9
+	add	edx,eax
+	not	r11d
+	lea	ecx,[((-660478335))+r10*1+rcx]
+	and	r12d,edx
+	and	r11d,eax
+	mov	r10d,DWORD[16+rsi]
+	or	r12d,r11d
+	mov	r11d,eax
+	add	ecx,r12d
+	mov	r12d,eax
+	rol	ecx,14
+	add	ecx,edx
+	not	r11d
+	lea	ebx,[((-405537848))+r10*1+rbx]
+	and	r12d,ecx
+	and	r11d,edx
+	mov	r10d,DWORD[36+rsi]
+	or	r12d,r11d
+	mov	r11d,edx
+	add	ebx,r12d
+	mov	r12d,edx
+	rol	ebx,20
+	add	ebx,ecx
+	not	r11d
+	lea	eax,[568446438+r10*1+rax]
+	and	r12d,ebx
+	and	r11d,ecx
+	mov	r10d,DWORD[56+rsi]
+	or	r12d,r11d
+	mov	r11d,ecx
+	add	eax,r12d
+	mov	r12d,ecx
+	rol	eax,5
+	add	eax,ebx
+	not	r11d
+	lea	edx,[((-1019803690))+r10*1+rdx]
+	and	r12d,eax
+	and	r11d,ebx
+	mov	r10d,DWORD[12+rsi]
+	or	r12d,r11d
+	mov	r11d,ebx
+	add	edx,r12d
+	mov	r12d,ebx
+	rol	edx,9
+	add	edx,eax
+	not	r11d
+	lea	ecx,[((-187363961))+r10*1+rcx]
+	and	r12d,edx
+	and	r11d,eax
+	mov	r10d,DWORD[32+rsi]
+	or	r12d,r11d
+	mov	r11d,eax
+	add	ecx,r12d
+	mov	r12d,eax
+	rol	ecx,14
+	add	ecx,edx
+	not	r11d
+	lea	ebx,[1163531501+r10*1+rbx]
+	and	r12d,ecx
+	and	r11d,edx
+	mov	r10d,DWORD[52+rsi]
+	or	r12d,r11d
+	mov	r11d,edx
+	add	ebx,r12d
+	mov	r12d,edx
+	rol	ebx,20
+	add	ebx,ecx
+	not	r11d
+	lea	eax,[((-1444681467))+r10*1+rax]
+	and	r12d,ebx
+	and	r11d,ecx
+	mov	r10d,DWORD[8+rsi]
+	or	r12d,r11d
+	mov	r11d,ecx
+	add	eax,r12d
+	mov	r12d,ecx
+	rol	eax,5
+	add	eax,ebx
+	not	r11d
+	lea	edx,[((-51403784))+r10*1+rdx]
+	and	r12d,eax
+	and	r11d,ebx
+	mov	r10d,DWORD[28+rsi]
+	or	r12d,r11d
+	mov	r11d,ebx
+	add	edx,r12d
+	mov	r12d,ebx
+	rol	edx,9
+	add	edx,eax
+	not	r11d
+	lea	ecx,[1735328473+r10*1+rcx]
+	and	r12d,edx
+	and	r11d,eax
+	mov	r10d,DWORD[48+rsi]
+	or	r12d,r11d
+	mov	r11d,eax
+	add	ecx,r12d
+	mov	r12d,eax
+	rol	ecx,14
+	add	ecx,edx
+	not	r11d
+	lea	ebx,[((-1926607734))+r10*1+rbx]
+	and	r12d,ecx
+	and	r11d,edx
+	mov	r10d,DWORD[rsi]
+	or	r12d,r11d
+	mov	r11d,edx
+	add	ebx,r12d
+	mov	r12d,edx
+	rol	ebx,20
+	add	ebx,ecx
+	mov	r10d,DWORD[20+rsi]
+	mov	r11d,ecx
+	lea	eax,[((-378558))+r10*1+rax]
+	mov	r10d,DWORD[32+rsi]
+	xor	r11d,edx
+	xor	r11d,ebx
+	add	eax,r11d
+	rol	eax,4
+	mov	r11d,ebx
+	add	eax,ebx
+	lea	edx,[((-2022574463))+r10*1+rdx]
+	mov	r10d,DWORD[44+rsi]
+	xor	r11d,ecx
+	xor	r11d,eax
+	add	edx,r11d
+	rol	edx,11
+	mov	r11d,eax
+	add	edx,eax
+	lea	ecx,[1839030562+r10*1+rcx]
+	mov	r10d,DWORD[56+rsi]
+	xor	r11d,ebx
+	xor	r11d,edx
+	add	ecx,r11d
+	rol	ecx,16
+	mov	r11d,edx
+	add	ecx,edx
+	lea	ebx,[((-35309556))+r10*1+rbx]
+	mov	r10d,DWORD[4+rsi]
+	xor	r11d,eax
+	xor	r11d,ecx
+	add	ebx,r11d
+	rol	ebx,23
+	mov	r11d,ecx
+	add	ebx,ecx
+	lea	eax,[((-1530992060))+r10*1+rax]
+	mov	r10d,DWORD[16+rsi]
+	xor	r11d,edx
+	xor	r11d,ebx
+	add	eax,r11d
+	rol	eax,4
+	mov	r11d,ebx
+	add	eax,ebx
+	lea	edx,[1272893353+r10*1+rdx]
+	mov	r10d,DWORD[28+rsi]
+	xor	r11d,ecx
+	xor	r11d,eax
+	add	edx,r11d
+	rol	edx,11
+	mov	r11d,eax
+	add	edx,eax
+	lea	ecx,[((-155497632))+r10*1+rcx]
+	mov	r10d,DWORD[40+rsi]
+	xor	r11d,ebx
+	xor	r11d,edx
+	add	ecx,r11d
+	rol	ecx,16
+	mov	r11d,edx
+	add	ecx,edx
+	lea	ebx,[((-1094730640))+r10*1+rbx]
+	mov	r10d,DWORD[52+rsi]
+	xor	r11d,eax
+	xor	r11d,ecx
+	add	ebx,r11d
+	rol	ebx,23
+	mov	r11d,ecx
+	add	ebx,ecx
+	lea	eax,[681279174+r10*1+rax]
+	mov	r10d,DWORD[rsi]
+	xor	r11d,edx
+	xor	r11d,ebx
+	add	eax,r11d
+	rol	eax,4
+	mov	r11d,ebx
+	add	eax,ebx
+	lea	edx,[((-358537222))+r10*1+rdx]
+	mov	r10d,DWORD[12+rsi]
+	xor	r11d,ecx
+	xor	r11d,eax
+	add	edx,r11d
+	rol	edx,11
+	mov	r11d,eax
+	add	edx,eax
+	lea	ecx,[((-722521979))+r10*1+rcx]
+	mov	r10d,DWORD[24+rsi]
+	xor	r11d,ebx
+	xor	r11d,edx
+	add	ecx,r11d
+	rol	ecx,16
+	mov	r11d,edx
+	add	ecx,edx
+	lea	ebx,[76029189+r10*1+rbx]
+	mov	r10d,DWORD[36+rsi]
+	xor	r11d,eax
+	xor	r11d,ecx
+	add	ebx,r11d
+	rol	ebx,23
+	mov	r11d,ecx
+	add	ebx,ecx
+	lea	eax,[((-640364487))+r10*1+rax]
+	mov	r10d,DWORD[48+rsi]
+	xor	r11d,edx
+	xor	r11d,ebx
+	add	eax,r11d
+	rol	eax,4
+	mov	r11d,ebx
+	add	eax,ebx
+	lea	edx,[((-421815835))+r10*1+rdx]
+	mov	r10d,DWORD[60+rsi]
+	xor	r11d,ecx
+	xor	r11d,eax
+	add	edx,r11d
+	rol	edx,11
+	mov	r11d,eax
+	add	edx,eax
+	lea	ecx,[530742520+r10*1+rcx]
+	mov	r10d,DWORD[8+rsi]
+	xor	r11d,ebx
+	xor	r11d,edx
+	add	ecx,r11d
+	rol	ecx,16
+	mov	r11d,edx
+	add	ecx,edx
+	lea	ebx,[((-995338651))+r10*1+rbx]
+	mov	r10d,DWORD[rsi]
+	xor	r11d,eax
+	xor	r11d,ecx
+	add	ebx,r11d
+	rol	ebx,23
+	mov	r11d,ecx
+	add	ebx,ecx
+	mov	r10d,DWORD[rsi]
+	mov	r11d,0xffffffff
+	xor	r11d,edx
+	lea	eax,[((-198630844))+r10*1+rax]
+	or	r11d,ebx
+	xor	r11d,ecx
+	add	eax,r11d
+	mov	r10d,DWORD[28+rsi]
+	mov	r11d,0xffffffff
+	rol	eax,6
+	xor	r11d,ecx
+	add	eax,ebx
+	lea	edx,[1126891415+r10*1+rdx]
+	or	r11d,eax
+	xor	r11d,ebx
+	add	edx,r11d
+	mov	r10d,DWORD[56+rsi]
+	mov	r11d,0xffffffff
+	rol	edx,10
+	xor	r11d,ebx
+	add	edx,eax
+	lea	ecx,[((-1416354905))+r10*1+rcx]
+	or	r11d,edx
+	xor	r11d,eax
+	add	ecx,r11d
+	mov	r10d,DWORD[20+rsi]
+	mov	r11d,0xffffffff
+	rol	ecx,15
+	xor	r11d,eax
+	add	ecx,edx
+	lea	ebx,[((-57434055))+r10*1+rbx]
+	or	r11d,ecx
+	xor	r11d,edx
+	add	ebx,r11d
+	mov	r10d,DWORD[48+rsi]
+	mov	r11d,0xffffffff
+	rol	ebx,21
+	xor	r11d,edx
+	add	ebx,ecx
+	lea	eax,[1700485571+r10*1+rax]
+	or	r11d,ebx
+	xor	r11d,ecx
+	add	eax,r11d
+	mov	r10d,DWORD[12+rsi]
+	mov	r11d,0xffffffff
+	rol	eax,6
+	xor	r11d,ecx
+	add	eax,ebx
+	lea	edx,[((-1894986606))+r10*1+rdx]
+	or	r11d,eax
+	xor	r11d,ebx
+	add	edx,r11d
+	mov	r10d,DWORD[40+rsi]
+	mov	r11d,0xffffffff
+	rol	edx,10
+	xor	r11d,ebx
+	add	edx,eax
+	lea	ecx,[((-1051523))+r10*1+rcx]
+	or	r11d,edx
+	xor	r11d,eax
+	add	ecx,r11d
+	mov	r10d,DWORD[4+rsi]
+	mov	r11d,0xffffffff
+	rol	ecx,15
+	xor	r11d,eax
+	add	ecx,edx
+	lea	ebx,[((-2054922799))+r10*1+rbx]
+	or	r11d,ecx
+	xor	r11d,edx
+	add	ebx,r11d
+	mov	r10d,DWORD[32+rsi]
+	mov	r11d,0xffffffff
+	rol	ebx,21
+	xor	r11d,edx
+	add	ebx,ecx
+	lea	eax,[1873313359+r10*1+rax]
+	or	r11d,ebx
+	xor	r11d,ecx
+	add	eax,r11d
+	mov	r10d,DWORD[60+rsi]
+	mov	r11d,0xffffffff
+	rol	eax,6
+	xor	r11d,ecx
+	add	eax,ebx
+	lea	edx,[((-30611744))+r10*1+rdx]
+	or	r11d,eax
+	xor	r11d,ebx
+	add	edx,r11d
+	mov	r10d,DWORD[24+rsi]
+	mov	r11d,0xffffffff
+	rol	edx,10
+	xor	r11d,ebx
+	add	edx,eax
+	lea	ecx,[((-1560198380))+r10*1+rcx]
+	or	r11d,edx
+	xor	r11d,eax
+	add	ecx,r11d
+	mov	r10d,DWORD[52+rsi]
+	mov	r11d,0xffffffff
+	rol	ecx,15
+	xor	r11d,eax
+	add	ecx,edx
+	lea	ebx,[1309151649+r10*1+rbx]
+	or	r11d,ecx
+	xor	r11d,edx
+	add	ebx,r11d
+	mov	r10d,DWORD[16+rsi]
+	mov	r11d,0xffffffff
+	rol	ebx,21
+	xor	r11d,edx
+	add	ebx,ecx
+	lea	eax,[((-145523070))+r10*1+rax]
+	or	r11d,ebx
+	xor	r11d,ecx
+	add	eax,r11d
+	mov	r10d,DWORD[44+rsi]
+	mov	r11d,0xffffffff
+	rol	eax,6
+	xor	r11d,ecx
+	add	eax,ebx
+	lea	edx,[((-1120210379))+r10*1+rdx]
+	or	r11d,eax
+	xor	r11d,ebx
+	add	edx,r11d
+	mov	r10d,DWORD[8+rsi]
+	mov	r11d,0xffffffff
+	rol	edx,10
+	xor	r11d,ebx
+	add	edx,eax
+	lea	ecx,[718787259+r10*1+rcx]
+	or	r11d,edx
+	xor	r11d,eax
+	add	ecx,r11d
+	mov	r10d,DWORD[36+rsi]
+	mov	r11d,0xffffffff
+	rol	ecx,15
+	xor	r11d,eax
+	add	ecx,edx
+	lea	ebx,[((-343485551))+r10*1+rbx]
+	or	r11d,ecx
+	xor	r11d,edx
+	add	ebx,r11d
+	mov	r10d,DWORD[rsi]
+	mov	r11d,0xffffffff
+	rol	ebx,21
+	xor	r11d,edx
+	add	ebx,ecx
+
+	add	eax,r8d
+	add	ebx,r9d
+	add	ecx,r14d
+	add	edx,r15d
+
+
+	add	rsi,64
+	cmp	rsi,rdi
+	jb	NEAR $L$loop
+
+
+$L$end:
+	mov	DWORD[rbp],eax
+	mov	DWORD[4+rbp],ebx
+	mov	DWORD[8+rbp],ecx
+	mov	DWORD[12+rbp],edx
+
+	mov	r15,QWORD[rsp]
+
+	mov	r14,QWORD[8+rsp]
+
+	mov	r12,QWORD[16+rsp]
+
+	mov	rbx,QWORD[24+rsp]
+
+	mov	rbp,QWORD[32+rsp]
+
+	add	rsp,40
+
+$L$epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_md5_block_asm_data_order:
+EXTERN	__imp_RtlVirtualUnwind
+
+ALIGN	16
+se_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	lea	r10,[$L$prologue]
+	cmp	rbx,r10
+	jb	NEAR $L$in_prologue
+
+	mov	rax,QWORD[152+r8]
+
+	lea	r10,[$L$epilogue]
+	cmp	rbx,r10
+	jae	NEAR $L$in_prologue
+
+	lea	rax,[40+rax]
+
+	mov	rbp,QWORD[((-8))+rax]
+	mov	rbx,QWORD[((-16))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	r14,QWORD[((-32))+rax]
+	mov	r15,QWORD[((-40))+rax]
+	mov	QWORD[144+r8],rbx
+	mov	QWORD[160+r8],rbp
+	mov	QWORD[216+r8],r12
+	mov	QWORD[232+r8],r14
+	mov	QWORD[240+r8],r15
+
+$L$in_prologue:
+	mov	rdi,QWORD[8+rax]
+	mov	rsi,QWORD[16+rax]
+	mov	QWORD[152+r8],rax
+	mov	QWORD[168+r8],rsi
+	mov	QWORD[176+r8],rdi
+
+	mov	rdi,QWORD[40+r9]
+	mov	rsi,r8
+	mov	ecx,154
+	DD	0xa548f3fc
+
+	mov	rsi,r9
+	xor	rcx,rcx
+	mov	rdx,QWORD[8+rsi]
+	mov	r8,QWORD[rsi]
+	mov	r9,QWORD[16+rsi]
+	mov	r10,QWORD[40+rsi]
+	lea	r11,[56+rsi]
+	lea	r12,[24+rsi]
+	mov	QWORD[32+rsp],r10
+	mov	QWORD[40+rsp],r11
+	mov	QWORD[48+rsp],r12
+	mov	QWORD[56+rsp],rcx
+	call	QWORD[__imp_RtlVirtualUnwind]
+
+	mov	eax,1
+	add	rsp,64
+	popfq
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rbp
+	pop	rbx
+	pop	rdi
+	pop	rsi
+	ret
+
+
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_md5_block_asm_data_order wrt ..imagebase
+	DD	$L$SEH_end_md5_block_asm_data_order wrt ..imagebase
+	DD	$L$SEH_info_md5_block_asm_data_order wrt ..imagebase
+
+section	.xdata rdata align=8
+ALIGN	8
+$L$SEH_info_md5_block_asm_data_order:
+	DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/p256-armv8-asm-apple.S b/gen/bcm/p256-armv8-asm-apple.S
new file mode 100644
index 0000000..c8469e6
--- /dev/null
+++ b/gen/bcm/p256-armv8-asm-apple.S
@@ -0,0 +1,1726 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+#include "openssl/arm_arch.h"
+
+.section	__TEXT,__const
+.align	5
+Lpoly:
+.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
+LRR:	//	2^512 mod P precomputed for NIST P256 polynomial
+.quad	0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
+Lone_mont:
+.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
+Lone:
+.quad	1,0,0,0
+Lord:
+.quad	0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
+LordK:
+.quad	0xccd1c8aaee00bc4f
+.byte	69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.text
+
+// void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
+//					     const BN_ULONG x2[4]);
+.globl	_ecp_nistz256_mul_mont
+.private_extern	_ecp_nistz256_mul_mont
+
+.align	4
+_ecp_nistz256_mul_mont:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-32]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+
+	ldr	x3,[x2]		// bp[0]
+	ldp	x4,x5,[x1]
+	ldp	x6,x7,[x1,#16]
+	adrp	x13,Lpoly@PAGE
+	add	x13,x13,Lpoly@PAGEOFF
+	ldr	x12,[x13,#8]
+	ldr	x13,[x13,#24]
+
+	bl	__ecp_nistz256_mul_mont
+
+	ldp	x19,x20,[sp,#16]
+	ldp	x29,x30,[sp],#32
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+// void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl	_ecp_nistz256_sqr_mont
+.private_extern	_ecp_nistz256_sqr_mont
+
+.align	4
+_ecp_nistz256_sqr_mont:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-32]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+
+	ldp	x4,x5,[x1]
+	ldp	x6,x7,[x1,#16]
+	adrp	x13,Lpoly@PAGE
+	add	x13,x13,Lpoly@PAGEOFF
+	ldr	x12,[x13,#8]
+	ldr	x13,[x13,#24]
+
+	bl	__ecp_nistz256_sqr_mont
+
+	ldp	x19,x20,[sp,#16]
+	ldp	x29,x30,[sp],#32
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+// void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl	_ecp_nistz256_div_by_2
+.private_extern	_ecp_nistz256_div_by_2
+
+.align	4
+_ecp_nistz256_div_by_2:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ldp	x14,x15,[x1]
+	ldp	x16,x17,[x1,#16]
+	adrp	x13,Lpoly@PAGE
+	add	x13,x13,Lpoly@PAGEOFF
+	ldr	x12,[x13,#8]
+	ldr	x13,[x13,#24]
+
+	bl	__ecp_nistz256_div_by_2
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+// void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl	_ecp_nistz256_mul_by_2
+.private_extern	_ecp_nistz256_mul_by_2
+
+.align	4
+_ecp_nistz256_mul_by_2:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ldp	x14,x15,[x1]
+	ldp	x16,x17,[x1,#16]
+	adrp	x13,Lpoly@PAGE
+	add	x13,x13,Lpoly@PAGEOFF
+	ldr	x12,[x13,#8]
+	ldr	x13,[x13,#24]
+	mov	x8,x14
+	mov	x9,x15
+	mov	x10,x16
+	mov	x11,x17
+
+	bl	__ecp_nistz256_add_to	// ret = a+a	// 2*a
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+// void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl	_ecp_nistz256_mul_by_3
+.private_extern	_ecp_nistz256_mul_by_3
+
+.align	4
+_ecp_nistz256_mul_by_3:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ldp	x14,x15,[x1]
+	ldp	x16,x17,[x1,#16]
+	adrp	x13,Lpoly@PAGE
+	add	x13,x13,Lpoly@PAGEOFF
+	ldr	x12,[x13,#8]
+	ldr	x13,[x13,#24]
+	mov	x8,x14
+	mov	x9,x15
+	mov	x10,x16
+	mov	x11,x17
+	mov	x4,x14
+	mov	x5,x15
+	mov	x6,x16
+	mov	x7,x17
+
+	bl	__ecp_nistz256_add_to	// ret = a+a	// 2*a
+
+	mov	x8,x4
+	mov	x9,x5
+	mov	x10,x6
+	mov	x11,x7
+
+	bl	__ecp_nistz256_add_to	// ret += a	// 2*a+a=3*a
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+// void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
+//				        const BN_ULONG x2[4]);
+.globl	_ecp_nistz256_sub
+.private_extern	_ecp_nistz256_sub
+
+.align	4
+_ecp_nistz256_sub:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ldp	x14,x15,[x1]
+	ldp	x16,x17,[x1,#16]
+	adrp	x13,Lpoly@PAGE
+	add	x13,x13,Lpoly@PAGEOFF
+	ldr	x12,[x13,#8]
+	ldr	x13,[x13,#24]
+
+	bl	__ecp_nistz256_sub_from
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+// void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl	_ecp_nistz256_neg
+.private_extern	_ecp_nistz256_neg
+
+.align	4
+_ecp_nistz256_neg:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	mov	x2,x1
+	mov	x14,xzr		// a = 0
+	mov	x15,xzr
+	mov	x16,xzr
+	mov	x17,xzr
+	adrp	x13,Lpoly@PAGE
+	add	x13,x13,Lpoly@PAGEOFF
+	ldr	x12,[x13,#8]
+	ldr	x13,[x13,#24]
+
+	bl	__ecp_nistz256_sub_from
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+// Note that __ecp_nistz256_mul_mont expects a[0-3] pre-loaded into x4-x7
+// and b[0] into x3.
+
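// A reading of the reduction below: since p[0] == 2^64-1, -p^-1 mod 2^64 == 1,
// so each Montgomery step can use acc[0] itself as the reduction factor, and
// acc[0]*p is assembled from 32-bit shifts and subtractions (the
// "*0xffff0001" steps) instead of real multiplications.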
+.align	4
+__ecp_nistz256_mul_mont:
+	mul	x14,x4,x3		// a[0]*b[0]
+	umulh	x8,x4,x3
+
+	mul	x15,x5,x3		// a[1]*b[0]
+	umulh	x9,x5,x3
+
+	mul	x16,x6,x3		// a[2]*b[0]
+	umulh	x10,x6,x3
+
+	mul	x17,x7,x3		// a[3]*b[0]
+	umulh	x11,x7,x3
+	ldr	x3,[x2,#8]		// b[1]
+
+	adds	x15,x15,x8		// accumulate high parts of multiplication
+	lsl	x8,x14,#32
+	adcs	x16,x16,x9
+	lsr	x9,x14,#32
+	adcs	x17,x17,x10
+	adc	x19,xzr,x11
+	mov	x20,xzr
+	subs	x10,x14,x8		// "*0xffff0001"
+	sbc	x11,x14,x9
+	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
+	mul	x8,x4,x3		// lo(a[0]*b[i])
+	adcs	x15,x16,x9
+	mul	x9,x5,x3		// lo(a[1]*b[i])
+	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
+	mul	x10,x6,x3		// lo(a[2]*b[i])
+	adcs	x17,x19,x11
+	mul	x11,x7,x3		// lo(a[3]*b[i])
+	adc	x19,x20,xzr
+
+	adds	x14,x14,x8		// accumulate low parts of multiplication
+	umulh	x8,x4,x3		// hi(a[0]*b[i])
+	adcs	x15,x15,x9
+	umulh	x9,x5,x3		// hi(a[1]*b[i])
+	adcs	x16,x16,x10
+	umulh	x10,x6,x3		// hi(a[2]*b[i])
+	adcs	x17,x17,x11
+	umulh	x11,x7,x3		// hi(a[3]*b[i])
+	adc	x19,x19,xzr
+	ldr	x3,[x2,#8*(1+1)]	// b[1+1]
+	adds	x15,x15,x8		// accumulate high parts of multiplication
+	lsl	x8,x14,#32
+	adcs	x16,x16,x9
+	lsr	x9,x14,#32
+	adcs	x17,x17,x10
+	adcs	x19,x19,x11
+	adc	x20,xzr,xzr
+	subs	x10,x14,x8		// "*0xffff0001"
+	sbc	x11,x14,x9
+	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
+	mul	x8,x4,x3		// lo(a[0]*b[i])
+	adcs	x15,x16,x9
+	mul	x9,x5,x3		// lo(a[1]*b[i])
+	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
+	mul	x10,x6,x3		// lo(a[2]*b[i])
+	adcs	x17,x19,x11
+	mul	x11,x7,x3		// lo(a[3]*b[i])
+	adc	x19,x20,xzr
+
+	adds	x14,x14,x8		// accumulate low parts of multiplication
+	umulh	x8,x4,x3		// hi(a[0]*b[i])
+	adcs	x15,x15,x9
+	umulh	x9,x5,x3		// hi(a[1]*b[i])
+	adcs	x16,x16,x10
+	umulh	x10,x6,x3		// hi(a[2]*b[i])
+	adcs	x17,x17,x11
+	umulh	x11,x7,x3		// hi(a[3]*b[i])
+	adc	x19,x19,xzr
+	ldr	x3,[x2,#8*(2+1)]	// b[2+1]
+	adds	x15,x15,x8		// accumulate high parts of multiplication
+	lsl	x8,x14,#32
+	adcs	x16,x16,x9
+	lsr	x9,x14,#32
+	adcs	x17,x17,x10
+	adcs	x19,x19,x11
+	adc	x20,xzr,xzr
+	subs	x10,x14,x8		// "*0xffff0001"
+	sbc	x11,x14,x9
+	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
+	mul	x8,x4,x3		// lo(a[0]*b[i])
+	adcs	x15,x16,x9
+	mul	x9,x5,x3		// lo(a[1]*b[i])
+	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
+	mul	x10,x6,x3		// lo(a[2]*b[i])
+	adcs	x17,x19,x11
+	mul	x11,x7,x3		// lo(a[3]*b[i])
+	adc	x19,x20,xzr
+
+	adds	x14,x14,x8		// accumulate low parts of multiplication
+	umulh	x8,x4,x3		// hi(a[0]*b[i])
+	adcs	x15,x15,x9
+	umulh	x9,x5,x3		// hi(a[1]*b[i])
+	adcs	x16,x16,x10
+	umulh	x10,x6,x3		// hi(a[2]*b[i])
+	adcs	x17,x17,x11
+	umulh	x11,x7,x3		// hi(a[3]*b[i])
+	adc	x19,x19,xzr
+	adds	x15,x15,x8		// accumulate high parts of multiplication
+	lsl	x8,x14,#32
+	adcs	x16,x16,x9
+	lsr	x9,x14,#32
+	adcs	x17,x17,x10
+	adcs	x19,x19,x11
+	adc	x20,xzr,xzr
+	// last reduction
+	subs	x10,x14,x8		// "*0xffff0001"
+	sbc	x11,x14,x9
+	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
+	adcs	x15,x16,x9
+	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
+	adcs	x17,x19,x11
+	adc	x19,x20,xzr
+
+	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
+	sbcs	x9,x15,x12
+	sbcs	x10,x16,xzr
+	sbcs	x11,x17,x13
+	sbcs	xzr,x19,xzr		// did it borrow?
+
+	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
+	csel	x15,x15,x9,lo
+	csel	x16,x16,x10,lo
+	stp	x14,x15,[x0]
+	csel	x17,x17,x11,lo
+	stp	x16,x17,[x0,#16]
+
+	ret
+
+
+// Note that __ecp_nistz256_sqr_mont expects a[0-3] pre-loaded into x4-x7.
+
+.align	4
+__ecp_nistz256_sqr_mont:
+	//  |  |  |  |  |  |a1*a0|  |
+	//  |  |  |  |  |a2*a0|  |  |
+	//  |  |a3*a2|a3*a0|  |  |  |
+	//  |  |  |  |a2*a1|  |  |  |
+	//  |  |  |a3*a1|  |  |  |  |
+	// *|  |  |  |  |  |  |  | 2|
+	// +|a3*a3|a2*a2|a1*a1|a0*a0|
+	//  |--+--+--+--+--+--+--+--|
+	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax denotes 64-bit word x of the
+	//  512-bit result.
+	//
+	//  "can't overflow" below marks a carry into the high part of a
+	//  multiplication result, which can't overflow because that part
+	//  can never be all ones.
+
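	//  The 512-bit square below is reduced with four rounds of the same
	//  shift-based Montgomery reduction used in __ecp_nistz256_mul_mont,
	//  after which the upper half is folded in and one conditional
	//  subtraction of p produces the final result.
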
+	mul	x15,x5,x4		// a[1]*a[0]
+	umulh	x9,x5,x4
+	mul	x16,x6,x4		// a[2]*a[0]
+	umulh	x10,x6,x4
+	mul	x17,x7,x4		// a[3]*a[0]
+	umulh	x19,x7,x4
+
+	adds	x16,x16,x9		// accumulate high parts of multiplication
+	mul	x8,x6,x5		// a[2]*a[1]
+	umulh	x9,x6,x5
+	adcs	x17,x17,x10
+	mul	x10,x7,x5		// a[3]*a[1]
+	umulh	x11,x7,x5
+	adc	x19,x19,xzr		// can't overflow
+
+	mul	x20,x7,x6		// a[3]*a[2]
+	umulh	x1,x7,x6
+
+	adds	x9,x9,x10		// accumulate high parts of multiplication
+	mul	x14,x4,x4		// a[0]*a[0]
+	adc	x10,x11,xzr		// can't overflow
+
+	adds	x17,x17,x8		// accumulate low parts of multiplication
+	umulh	x4,x4,x4
+	adcs	x19,x19,x9
+	mul	x9,x5,x5		// a[1]*a[1]
+	adcs	x20,x20,x10
+	umulh	x5,x5,x5
+	adc	x1,x1,xzr		// can't overflow
+
+	adds	x15,x15,x15	// acc[1-6]*=2
+	mul	x10,x6,x6		// a[2]*a[2]
+	adcs	x16,x16,x16
+	umulh	x6,x6,x6
+	adcs	x17,x17,x17
+	mul	x11,x7,x7		// a[3]*a[3]
+	adcs	x19,x19,x19
+	umulh	x7,x7,x7
+	adcs	x20,x20,x20
+	adcs	x1,x1,x1
+	adc	x2,xzr,xzr
+
+	adds	x15,x15,x4		// +a[i]*a[i]
+	adcs	x16,x16,x9
+	adcs	x17,x17,x5
+	adcs	x19,x19,x10
+	adcs	x20,x20,x6
+	lsl	x8,x14,#32
+	adcs	x1,x1,x11
+	lsr	x9,x14,#32
+	adc	x2,x2,x7
+	subs	x10,x14,x8		// "*0xffff0001"
+	sbc	x11,x14,x9
+	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
+	adcs	x15,x16,x9
+	lsl	x8,x14,#32
+	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
+	lsr	x9,x14,#32
+	adc	x17,x11,xzr		// can't overflow
+	subs	x10,x14,x8		// "*0xffff0001"
+	sbc	x11,x14,x9
+	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
+	adcs	x15,x16,x9
+	lsl	x8,x14,#32
+	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
+	lsr	x9,x14,#32
+	adc	x17,x11,xzr		// can't overflow
+	subs	x10,x14,x8		// "*0xffff0001"
+	sbc	x11,x14,x9
+	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
+	adcs	x15,x16,x9
+	lsl	x8,x14,#32
+	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
+	lsr	x9,x14,#32
+	adc	x17,x11,xzr		// can't overflow
+	subs	x10,x14,x8		// "*0xffff0001"
+	sbc	x11,x14,x9
+	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
+	adcs	x15,x16,x9
+	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
+	adc	x17,x11,xzr		// can't overflow
+
+	adds	x14,x14,x19	// accumulate upper half
+	adcs	x15,x15,x20
+	adcs	x16,x16,x1
+	adcs	x17,x17,x2
+	adc	x19,xzr,xzr
+
+	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
+	sbcs	x9,x15,x12
+	sbcs	x10,x16,xzr
+	sbcs	x11,x17,x13
+	sbcs	xzr,x19,xzr		// did it borrow?
+
+	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
+	csel	x15,x15,x9,lo
+	csel	x16,x16,x10,lo
+	stp	x14,x15,[x0]
+	csel	x17,x17,x11,lo
+	stp	x16,x17,[x0,#16]
+
+	ret
+
+
+// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded into
+// x14-x17 and x8-x11. This is done because it's used in multiple contexts,
+// e.g. in multiplication by 2 and 3.
+
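// The conditional subtraction of the modulus below exploits p's limbs:
// p[0] == 2^64-1 and p[2] == 0, so ret-p is computed limb-wise as
// +1, -x12 (p[1]), -0, -x13 (p[3]).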
+.align	4
+__ecp_nistz256_add_to:
+	adds	x14,x14,x8		// ret = a+b
+	adcs	x15,x15,x9
+	adcs	x16,x16,x10
+	adcs	x17,x17,x11
+	adc	x1,xzr,xzr		// x1 := carry
+
+	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
+	sbcs	x9,x15,x12
+	sbcs	x10,x16,xzr
+	sbcs	x11,x17,x13
+	sbcs	xzr,x1,xzr		// did subtraction borrow?
+
+	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
+	csel	x15,x15,x9,lo
+	csel	x16,x16,x10,lo
+	stp	x14,x15,[x0]
+	csel	x17,x17,x11,lo
+	stp	x16,x17,[x0,#16]
+
+	ret
+
+
+
+.align	4
+__ecp_nistz256_sub_from:
+	ldp	x8,x9,[x2]
+	ldp	x10,x11,[x2,#16]
+	subs	x14,x14,x8		// ret = a-b
+	sbcs	x15,x15,x9
+	sbcs	x16,x16,x10
+	sbcs	x17,x17,x11
+	sbc	x1,xzr,xzr		// x1 := borrow ? -1 : 0
+
+	subs	x8,x14,#1		// adds	x8,x14,#-1 // tmp = ret+modulus
+	adcs	x9,x15,x12
+	adcs	x10,x16,xzr
+	adc	x11,x17,x13
+	cmp	x1,xzr			// did subtraction borrow?
+
+	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
+	csel	x15,x15,x9,eq
+	csel	x16,x16,x10,eq
+	stp	x14,x15,[x0]
+	csel	x17,x17,x11,eq
+	stp	x16,x17,[x0,#16]
+
+	ret
+
+
+
+.align	4
+__ecp_nistz256_sub_morf:
+	ldp	x8,x9,[x2]
+	ldp	x10,x11,[x2,#16]
+	subs	x14,x8,x14		// ret = b-a
+	sbcs	x15,x9,x15
+	sbcs	x16,x10,x16
+	sbcs	x17,x11,x17
+	sbc	x1,xzr,xzr		// x1 := borrow ? -1 : 0
+
+	subs	x8,x14,#1		// adds	x8,x14,#-1 // tmp = ret+modulus
+	adcs	x9,x15,x12
+	adcs	x10,x16,xzr
+	adc	x11,x17,x13
+	cmp	x1,xzr			// did subtraction borrow?
+
+	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
+	csel	x15,x15,x9,eq
+	csel	x16,x16,x10,eq
+	stp	x14,x15,[x0]
+	csel	x17,x17,x11,eq
+	stp	x16,x17,[x0,#16]
+
+	ret
+
+
+
+.align	4
+__ecp_nistz256_div_by_2:
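	// a/2 mod p: if a is odd, first add p (the sum is then even, with
	// its top bit kept in x1), then shift the whole value right by one,
	// pulling each limb's low bit into the limb below.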
+	subs	x8,x14,#1		// adds	x8,x14,#-1 // tmp = a+modulus
+	adcs	x9,x15,x12
+	adcs	x10,x16,xzr
+	adcs	x11,x17,x13
+	adc	x1,xzr,xzr		// x1 := carry
+	tst	x14,#1		// is a even?
+
+	csel	x14,x14,x8,eq	// ret = even ? a : a+modulus
+	csel	x15,x15,x9,eq
+	csel	x16,x16,x10,eq
+	csel	x17,x17,x11,eq
+	csel	x1,xzr,x1,eq
+
+	lsr	x14,x14,#1		// ret >>= 1
+	orr	x14,x14,x15,lsl#63
+	lsr	x15,x15,#1
+	orr	x15,x15,x16,lsl#63
+	lsr	x16,x16,#1
+	orr	x16,x16,x17,lsl#63
+	lsr	x17,x17,#1
+	stp	x14,x15,[x0]
+	orr	x17,x17,x1,lsl#63
+	stp	x16,x17,[x0,#16]
+
+	ret
+
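// Jacobian point doubling; see the p256_* comments inline. The formulas
// computed are M = 3*(in_x + Z^2)*(in_x - Z^2), S = 4*in_x*in_y^2,
// res_x = M^2 - 2*S, res_y = M*(S - res_x) - 8*in_y^4, res_z = 2*in_y*in_z.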
+.globl	_ecp_nistz256_point_double
+.private_extern	_ecp_nistz256_point_double
+
+.align	5
+_ecp_nistz256_point_double:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	sub	sp,sp,#32*4
+
+Ldouble_shortcut:
+	ldp	x14,x15,[x1,#32]
+	mov	x21,x0
+	ldp	x16,x17,[x1,#48]
+	mov	x22,x1
+	adrp	x13,Lpoly@PAGE
+	add	x13,x13,Lpoly@PAGEOFF
+	ldr	x12,[x13,#8]
+	mov	x8,x14
+	ldr	x13,[x13,#24]
+	mov	x9,x15
+	ldp	x4,x5,[x22,#64]	// forward load for p256_sqr_mont
+	mov	x10,x16
+	mov	x11,x17
+	ldp	x6,x7,[x22,#64+16]
+	add	x0,sp,#0
+	bl	__ecp_nistz256_add_to	// p256_mul_by_2(S, in_y);
+
+	add	x0,sp,#64
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);
+
+	ldp	x8,x9,[x22]
+	ldp	x10,x11,[x22,#16]
+	mov	x4,x14		// put Zsqr aside for p256_sub
+	mov	x5,x15
+	mov	x6,x16
+	mov	x7,x17
+	add	x0,sp,#32
+	bl	__ecp_nistz256_add_to	// p256_add(M, Zsqr, in_x);
+
+	add	x2,x22,#0
+	mov	x14,x4		// restore Zsqr
+	mov	x15,x5
+	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
+	mov	x16,x6
+	mov	x17,x7
+	ldp	x6,x7,[sp,#0+16]
+	add	x0,sp,#64
+	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);
+
+	add	x0,sp,#0
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);
+
+	ldr	x3,[x22,#32]
+	ldp	x4,x5,[x22,#64]
+	ldp	x6,x7,[x22,#64+16]
+	add	x2,x22,#32
+	add	x0,sp,#96
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);
+
+	mov	x8,x14
+	mov	x9,x15
+	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
+	mov	x10,x16
+	mov	x11,x17
+	ldp	x6,x7,[sp,#0+16]
+	add	x0,x21,#64
+	bl	__ecp_nistz256_add_to	// p256_mul_by_2(res_z, tmp0);
+
+	add	x0,sp,#96
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);
+
+	ldr	x3,[sp,#64]		// forward load for p256_mul_mont
+	ldp	x4,x5,[sp,#32]
+	ldp	x6,x7,[sp,#32+16]
+	add	x0,x21,#32
+	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);
+
+	add	x2,sp,#64
+	add	x0,sp,#32
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);
+
+	mov	x8,x14		// duplicate M
+	mov	x9,x15
+	mov	x10,x16
+	mov	x11,x17
+	mov	x4,x14		// put M aside
+	mov	x5,x15
+	mov	x6,x16
+	mov	x7,x17
+	add	x0,sp,#32
+	bl	__ecp_nistz256_add_to
+	mov	x8,x4			// restore M
+	mov	x9,x5
+	ldr	x3,[x22]		// forward load for p256_mul_mont
+	mov	x10,x6
+	ldp	x4,x5,[sp,#0]
+	mov	x11,x7
+	ldp	x6,x7,[sp,#0+16]
+	bl	__ecp_nistz256_add_to	// p256_mul_by_3(M, M);
+
+	add	x2,x22,#0
+	add	x0,sp,#0
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);
+
+	mov	x8,x14
+	mov	x9,x15
+	ldp	x4,x5,[sp,#32]	// forward load for p256_sqr_mont
+	mov	x10,x16
+	mov	x11,x17
+	ldp	x6,x7,[sp,#32+16]
+	add	x0,sp,#96
+	bl	__ecp_nistz256_add_to	// p256_mul_by_2(tmp0, S);
+
+	add	x0,x21,#0
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);
+
+	add	x2,sp,#96
+	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);
+
+	add	x2,sp,#0
+	add	x0,sp,#0
+	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);
+
+	ldr	x3,[sp,#32]
+	mov	x4,x14		// copy S
+	mov	x5,x15
+	mov	x6,x16
+	mov	x7,x17
+	add	x2,sp,#32
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);
+
+	add	x2,x21,#32
+	add	x0,x21,#32
+	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);
+
+	add	sp,x29,#0		// destroy frame
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
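// Jacobian point addition. When U1 == U2 and S1 == S2 and neither input is
// the point at infinity, the inputs are equal and the code branches to the
// doubling path via Ldouble_shortcut; otherwise the conditional moves at the
// end substitute in1 or in2 for the result when the other input is infinity.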
+.globl	_ecp_nistz256_point_add
+.private_extern	_ecp_nistz256_point_add
+
+.align	5
+_ecp_nistz256_point_add:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#32*12
+
+	ldp	x4,x5,[x2,#64]	// in2_z
+	ldp	x6,x7,[x2,#64+16]
+	mov	x21,x0
+	mov	x22,x1
+	mov	x23,x2
+	adrp	x13,Lpoly@PAGE
+	add	x13,x13,Lpoly@PAGEOFF
+	ldr	x12,[x13,#8]
+	ldr	x13,[x13,#24]
+	orr	x8,x4,x5
+	orr	x10,x6,x7
+	orr	x25,x8,x10
+	cmp	x25,#0
+	csetm	x25,ne		// ~in2infty
+	add	x0,sp,#192
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z2sqr, in2_z);
+
+	ldp	x4,x5,[x22,#64]	// in1_z
+	ldp	x6,x7,[x22,#64+16]
+	orr	x8,x4,x5
+	orr	x10,x6,x7
+	orr	x24,x8,x10
+	cmp	x24,#0
+	csetm	x24,ne		// ~in1infty
+	add	x0,sp,#128
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);
+
+	ldr	x3,[x23,#64]
+	ldp	x4,x5,[sp,#192]
+	ldp	x6,x7,[sp,#192+16]
+	add	x2,x23,#64
+	add	x0,sp,#320
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, Z2sqr, in2_z);
+
+	ldr	x3,[x22,#64]
+	ldp	x4,x5,[sp,#128]
+	ldp	x6,x7,[sp,#128+16]
+	add	x2,x22,#64
+	add	x0,sp,#352
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);
+
+	ldr	x3,[x22,#32]
+	ldp	x4,x5,[sp,#320]
+	ldp	x6,x7,[sp,#320+16]
+	add	x2,x22,#32
+	add	x0,sp,#320
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, S1, in1_y);
+
+	ldr	x3,[x23,#32]
+	ldp	x4,x5,[sp,#352]
+	ldp	x6,x7,[sp,#352+16]
+	add	x2,x23,#32
+	add	x0,sp,#352
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);
+
+	add	x2,sp,#320
+	ldr	x3,[sp,#192]	// forward load for p256_mul_mont
+	ldp	x4,x5,[x22]
+	ldp	x6,x7,[x22,#16]
+	add	x0,sp,#160
+	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, S1);
+
+	orr	x14,x14,x15	// see if result is zero
+	orr	x16,x16,x17
+	orr	x26,x14,x16	// ~is_equal(S1,S2)
+
+	add	x2,sp,#192
+	add	x0,sp,#256
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U1, in1_x, Z2sqr);
+
+	ldr	x3,[sp,#128]
+	ldp	x4,x5,[x23]
+	ldp	x6,x7,[x23,#16]
+	add	x2,sp,#128
+	add	x0,sp,#288
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in2_x, Z1sqr);
+
+	add	x2,sp,#256
+	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
+	ldp	x6,x7,[sp,#160+16]
+	add	x0,sp,#96
+	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, U1);
+
+	orr	x14,x14,x15	// see if result is zero
+	orr	x16,x16,x17
+	orr	x14,x14,x16	// ~is_equal(U1,U2)
+
+	mvn	x27,x24	// -1/0 -> 0/-1
+	mvn	x28,x25	// -1/0 -> 0/-1
+	orr	x14,x14,x27
+	orr	x14,x14,x28
+	orr	x14,x14,x26
+	cbnz	x14,Ladd_proceed	// if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))
+
+Ladd_double:
+	mov	x1,x22
+	mov	x0,x21
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	add	sp,sp,#256	// #256 = #32*(12-4), the difference between the two stack frames
+	b	Ldouble_shortcut
+
+.align	4
+Ladd_proceed:
+	add	x0,sp,#192
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);
+
+	ldr	x3,[x22,#64]
+	ldp	x4,x5,[sp,#96]
+	ldp	x6,x7,[sp,#96+16]
+	add	x2,x22,#64
+	add	x0,sp,#64
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);
+
+	ldp	x4,x5,[sp,#96]
+	ldp	x6,x7,[sp,#96+16]
+	add	x0,sp,#128
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);
+
+	ldr	x3,[x23,#64]
+	ldp	x4,x5,[sp,#64]
+	ldp	x6,x7,[sp,#64+16]
+	add	x2,x23,#64
+	add	x0,sp,#64
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, res_z, in2_z);
+
+	ldr	x3,[sp,#96]
+	ldp	x4,x5,[sp,#128]
+	ldp	x6,x7,[sp,#128+16]
+	add	x2,sp,#96
+	add	x0,sp,#224
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);
+
+	ldr	x3,[sp,#128]
+	ldp	x4,x5,[sp,#256]
+	ldp	x6,x7,[sp,#256+16]
+	add	x2,sp,#128
+	add	x0,sp,#288
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, U1, Hsqr);
+
+	mov	x8,x14
+	mov	x9,x15
+	mov	x10,x16
+	mov	x11,x17
+	add	x0,sp,#128
+	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);
+
+	add	x2,sp,#192
+	add	x0,sp,#0
+	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);
+
+	add	x2,sp,#224
+	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);
+
+	add	x2,sp,#288
+	ldr	x3,[sp,#224]		// forward load for p256_mul_mont
+	ldp	x4,x5,[sp,#320]
+	ldp	x6,x7,[sp,#320+16]
+	add	x0,sp,#32
+	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);
+
+	add	x2,sp,#224
+	add	x0,sp,#352
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S1, Hcub);
+
+	ldr	x3,[sp,#160]
+	ldp	x4,x5,[sp,#32]
+	ldp	x6,x7,[sp,#32+16]
+	add	x2,sp,#160
+	add	x0,sp,#32
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);
+
+	add	x2,sp,#352
+	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);
+
+	ldp	x4,x5,[sp,#0]		// res
+	ldp	x6,x7,[sp,#0+16]
+	ldp	x8,x9,[x23]		// in2
+	ldp	x10,x11,[x23,#16]
+	ldp	x14,x15,[x22,#0]	// in1
+	cmp	x24,#0			// ~, remember?
+	ldp	x16,x17,[x22,#0+16]
+	csel	x8,x4,x8,ne
+	csel	x9,x5,x9,ne
+	ldp	x4,x5,[sp,#0+0+32]	// res
+	csel	x10,x6,x10,ne
+	csel	x11,x7,x11,ne
+	cmp	x25,#0			// ~, remember?
+	ldp	x6,x7,[sp,#0+0+48]
+	csel	x14,x8,x14,ne
+	csel	x15,x9,x15,ne
+	ldp	x8,x9,[x23,#0+32]	// in2
+	csel	x16,x10,x16,ne
+	csel	x17,x11,x17,ne
+	ldp	x10,x11,[x23,#0+48]
+	stp	x14,x15,[x21,#0]
+	stp	x16,x17,[x21,#0+16]
+	ldp	x14,x15,[x22,#32]	// in1
+	cmp	x24,#0			// ~, remember?
+	ldp	x16,x17,[x22,#32+16]
+	csel	x8,x4,x8,ne
+	csel	x9,x5,x9,ne
+	ldp	x4,x5,[sp,#0+32+32]	// res
+	csel	x10,x6,x10,ne
+	csel	x11,x7,x11,ne
+	cmp	x25,#0			// ~, remember?
+	ldp	x6,x7,[sp,#0+32+48]
+	csel	x14,x8,x14,ne
+	csel	x15,x9,x15,ne
+	ldp	x8,x9,[x23,#32+32]	// in2
+	csel	x16,x10,x16,ne
+	csel	x17,x11,x17,ne
+	ldp	x10,x11,[x23,#32+48]
+	stp	x14,x15,[x21,#32]
+	stp	x16,x17,[x21,#32+16]
+	ldp	x14,x15,[x22,#64]	// in1
+	cmp	x24,#0			// ~, remember?
+	ldp	x16,x17,[x22,#64+16]
+	csel	x8,x4,x8,ne
+	csel	x9,x5,x9,ne
+	csel	x10,x6,x10,ne
+	csel	x11,x7,x11,ne
+	cmp	x25,#0			// ~, remember?
+	csel	x14,x8,x14,ne
+	csel	x15,x9,x15,ne
+	csel	x16,x10,x16,ne
+	csel	x17,x11,x17,ne
+	stp	x14,x15,[x21,#64]
+	stp	x16,x17,[x21,#64+16]
+
+Ladd_done:
+	add	sp,x29,#0		// destroy frame
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.globl	_ecp_nistz256_point_add_affine
+.private_extern	_ecp_nistz256_point_add_affine
+
+.align	5
+_ecp_nistz256_point_add_affine:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-80]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	sub	sp,sp,#32*10
+
+	mov	x21,x0
+	mov	x22,x1
+	mov	x23,x2
+	adrp	x13,Lpoly@PAGE
+	add	x13,x13,Lpoly@PAGEOFF
+	ldr	x12,[x13,#8]
+	ldr	x13,[x13,#24]
+
+	ldp	x4,x5,[x1,#64]	// in1_z
+	ldp	x6,x7,[x1,#64+16]
+	orr	x8,x4,x5
+	orr	x10,x6,x7
+	orr	x24,x8,x10
+	cmp	x24,#0
+	csetm	x24,ne		// ~in1infty
+
+	ldp	x14,x15,[x2]	// in2_x
+	ldp	x16,x17,[x2,#16]
+	ldp	x8,x9,[x2,#32]	// in2_y
+	ldp	x10,x11,[x2,#48]
+	orr	x14,x14,x15
+	orr	x16,x16,x17
+	orr	x8,x8,x9
+	orr	x10,x10,x11
+	orr	x14,x14,x16
+	orr	x8,x8,x10
+	orr	x25,x14,x8
+	cmp	x25,#0
+	csetm	x25,ne		// ~in2infty
+
+	add	x0,sp,#128
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);
+
+	mov	x4,x14
+	mov	x5,x15
+	mov	x6,x16
+	mov	x7,x17
+	ldr	x3,[x23]
+	add	x2,x23,#0
+	add	x0,sp,#96
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);
+
+	add	x2,x22,#0
+	ldr	x3,[x22,#64]	// forward load for p256_mul_mont
+	ldp	x4,x5,[sp,#128]
+	ldp	x6,x7,[sp,#128+16]
+	add	x0,sp,#160
+	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);
+
+	add	x2,x22,#64
+	add	x0,sp,#128
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);
+
+	ldr	x3,[x22,#64]
+	ldp	x4,x5,[sp,#160]
+	ldp	x6,x7,[sp,#160+16]
+	add	x2,x22,#64
+	add	x0,sp,#64
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);
+
+	ldr	x3,[x23,#32]
+	ldp	x4,x5,[sp,#128]
+	ldp	x6,x7,[sp,#128+16]
+	add	x2,x23,#32
+	add	x0,sp,#128
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);
+
+	add	x2,x22,#32
+	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
+	ldp	x6,x7,[sp,#160+16]
+	add	x0,sp,#192
+	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);
+
+	add	x0,sp,#224
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);
+
+	ldp	x4,x5,[sp,#192]
+	ldp	x6,x7,[sp,#192+16]
+	add	x0,sp,#288
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);
+
+	ldr	x3,[sp,#160]
+	ldp	x4,x5,[sp,#224]
+	ldp	x6,x7,[sp,#224+16]
+	add	x2,sp,#160
+	add	x0,sp,#256
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);
+
+	ldr	x3,[x22]
+	ldp	x4,x5,[sp,#224]
+	ldp	x6,x7,[sp,#224+16]
+	add	x2,x22,#0
+	add	x0,sp,#96
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);
+
+	mov	x8,x14
+	mov	x9,x15
+	mov	x10,x16
+	mov	x11,x17
+	add	x0,sp,#224
+	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);
+
+	add	x2,sp,#288
+	add	x0,sp,#0
+	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);
+
+	add	x2,sp,#256
+	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);
+
+	add	x2,sp,#96
+	ldr	x3,[x22,#32]	// forward load for p256_mul_mont
+	ldp	x4,x5,[sp,#256]
+	ldp	x6,x7,[sp,#256+16]
+	add	x0,sp,#32
+	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);
+
+	add	x2,x22,#32
+	add	x0,sp,#128
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);
+
+	ldr	x3,[sp,#192]
+	ldp	x4,x5,[sp,#32]
+	ldp	x6,x7,[sp,#32+16]
+	add	x2,sp,#192
+	add	x0,sp,#32
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);
+
+	add	x2,sp,#128
+	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);
+
+	ldp	x4,x5,[sp,#0]		// res
+	ldp	x6,x7,[sp,#0+16]
+	ldp	x8,x9,[x23]		// in2
+	ldp	x10,x11,[x23,#16]
+	ldp	x14,x15,[x22,#0]	// in1
+	cmp	x24,#0			// ~, remember?
+	ldp	x16,x17,[x22,#0+16]
+	csel	x8,x4,x8,ne
+	csel	x9,x5,x9,ne
+	ldp	x4,x5,[sp,#0+0+32]	// res
+	csel	x10,x6,x10,ne
+	csel	x11,x7,x11,ne
+	cmp	x25,#0			// ~, remember?
+	ldp	x6,x7,[sp,#0+0+48]
+	csel	x14,x8,x14,ne
+	csel	x15,x9,x15,ne
+	ldp	x8,x9,[x23,#0+32]	// in2
+	csel	x16,x10,x16,ne
+	csel	x17,x11,x17,ne
+	ldp	x10,x11,[x23,#0+48]
+	stp	x14,x15,[x21,#0]
+	stp	x16,x17,[x21,#0+16]
+	adrp	x23,Lone_mont@PAGE-64
+	add	x23,x23,Lone_mont@PAGEOFF-64
+	ldp	x14,x15,[x22,#32]	// in1
+	cmp	x24,#0			// ~, remember?
+	ldp	x16,x17,[x22,#32+16]
+	csel	x8,x4,x8,ne
+	csel	x9,x5,x9,ne
+	ldp	x4,x5,[sp,#0+32+32]	// res
+	csel	x10,x6,x10,ne
+	csel	x11,x7,x11,ne
+	cmp	x25,#0			// ~, remember?
+	ldp	x6,x7,[sp,#0+32+48]
+	csel	x14,x8,x14,ne
+	csel	x15,x9,x15,ne
+	ldp	x8,x9,[x23,#32+32]	// in2
+	csel	x16,x10,x16,ne
+	csel	x17,x11,x17,ne
+	ldp	x10,x11,[x23,#32+48]
+	stp	x14,x15,[x21,#32]
+	stp	x16,x17,[x21,#32+16]
+	ldp	x14,x15,[x22,#64]	// in1
+	cmp	x24,#0			// ~, remember?
+	ldp	x16,x17,[x22,#64+16]
+	csel	x8,x4,x8,ne
+	csel	x9,x5,x9,ne
+	csel	x10,x6,x10,ne
+	csel	x11,x7,x11,ne
+	cmp	x25,#0			// ~, remember?
+	csel	x14,x8,x14,ne
+	csel	x15,x9,x15,ne
+	csel	x16,x10,x16,ne
+	csel	x17,x11,x17,ne
+	stp	x14,x15,[x21,#64]
+	stp	x16,x17,[x21,#64+16]
+
+	add	sp,x29,#0		// destroy frame
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x29,x30,[sp],#80
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
+//                                uint64_t b[4]);
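+//
+// Assumed reading: Montgomery multiplication modulo the P-256 group order
+// (Lord). The per-limb reduction factor is m = acc[0]*LordK mod 2^64, where
+// LordK is -ord^-1 mod 2^64; the two special-form top limbs of the order are
+// reduced with shifts and subtractions, the low limbs with genuine
+// multiplications.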
+.globl	_ecp_nistz256_ord_mul_mont
+.private_extern	_ecp_nistz256_ord_mul_mont
+
+.align	4
+_ecp_nistz256_ord_mul_mont:
+	AARCH64_VALID_CALL_TARGET
+	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
+	stp	x29,x30,[sp,#-64]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+
+	adrp	x23,Lord@PAGE
+	add	x23,x23,Lord@PAGEOFF
+	ldr	x3,[x2]		// bp[0]
+	ldp	x4,x5,[x1]
+	ldp	x6,x7,[x1,#16]
+
+	ldp	x12,x13,[x23,#0]
+	ldp	x21,x22,[x23,#16]
+	ldr	x23,[x23,#32]
+
+	mul	x14,x4,x3		// a[0]*b[0]
+	umulh	x8,x4,x3
+
+	mul	x15,x5,x3		// a[1]*b[0]
+	umulh	x9,x5,x3
+
+	mul	x16,x6,x3		// a[2]*b[0]
+	umulh	x10,x6,x3
+
+	mul	x17,x7,x3		// a[3]*b[0]
+	umulh	x19,x7,x3
+
+	mul	x24,x14,x23
+
+	adds	x15,x15,x8		// accumulate high parts of multiplication
+	adcs	x16,x16,x9
+	adcs	x17,x17,x10
+	adc	x19,x19,xzr
+	mov	x20,xzr
+	ldr	x3,[x2,#8*1]		// b[i]
+
+	lsl	x8,x24,#32
+	subs	x16,x16,x24
+	lsr	x9,x24,#32
+	sbcs	x17,x17,x8
+	sbcs	x19,x19,x9
+	sbc	x20,x20,xzr
+
+	subs	xzr,x14,#1
+	umulh	x9,x12,x24
+	mul	x10,x13,x24
+	umulh	x11,x13,x24
+
+	adcs	x10,x10,x9
+	mul	x8,x4,x3
+	adc	x11,x11,xzr
+	mul	x9,x5,x3
+
+	adds	x14,x15,x10
+	mul	x10,x6,x3
+	adcs	x15,x16,x11
+	mul	x11,x7,x3
+	adcs	x16,x17,x24
+	adcs	x17,x19,x24
+	adc	x19,x20,xzr
+
+	adds	x14,x14,x8		// accumulate low parts
+	umulh	x8,x4,x3
+	adcs	x15,x15,x9
+	umulh	x9,x5,x3
+	adcs	x16,x16,x10
+	umulh	x10,x6,x3
+	adcs	x17,x17,x11
+	umulh	x11,x7,x3
+	adc	x19,x19,xzr
+	mul	x24,x14,x23
+	adds	x15,x15,x8		// accumulate high parts
+	adcs	x16,x16,x9
+	adcs	x17,x17,x10
+	adcs	x19,x19,x11
+	adc	x20,xzr,xzr
+	ldr	x3,[x2,#8*2]		// b[i]
+
+	lsl	x8,x24,#32
+	subs	x16,x16,x24
+	lsr	x9,x24,#32
+	sbcs	x17,x17,x8
+	sbcs	x19,x19,x9
+	sbc	x20,x20,xzr
+
+	subs	xzr,x14,#1
+	umulh	x9,x12,x24
+	mul	x10,x13,x24
+	umulh	x11,x13,x24
+
+	adcs	x10,x10,x9
+	mul	x8,x4,x3
+	adc	x11,x11,xzr
+	mul	x9,x5,x3
+
+	adds	x14,x15,x10
+	mul	x10,x6,x3
+	adcs	x15,x16,x11
+	mul	x11,x7,x3
+	adcs	x16,x17,x24
+	adcs	x17,x19,x24
+	adc	x19,x20,xzr
+
+	adds	x14,x14,x8		// accumulate low parts
+	umulh	x8,x4,x3
+	adcs	x15,x15,x9
+	umulh	x9,x5,x3
+	adcs	x16,x16,x10
+	umulh	x10,x6,x3
+	adcs	x17,x17,x11
+	umulh	x11,x7,x3
+	adc	x19,x19,xzr
+	mul	x24,x14,x23
+	adds	x15,x15,x8		// accumulate high parts
+	adcs	x16,x16,x9
+	adcs	x17,x17,x10
+	adcs	x19,x19,x11
+	adc	x20,xzr,xzr
+	ldr	x3,[x2,#8*3]		// b[i]
+
+	lsl	x8,x24,#32
+	subs	x16,x16,x24
+	lsr	x9,x24,#32
+	sbcs	x17,x17,x8
+	sbcs	x19,x19,x9
+	sbc	x20,x20,xzr
+
+	subs	xzr,x14,#1
+	umulh	x9,x12,x24
+	mul	x10,x13,x24
+	umulh	x11,x13,x24
+
+	adcs	x10,x10,x9
+	mul	x8,x4,x3
+	adc	x11,x11,xzr
+	mul	x9,x5,x3
+
+	adds	x14,x15,x10
+	mul	x10,x6,x3
+	adcs	x15,x16,x11
+	mul	x11,x7,x3
+	adcs	x16,x17,x24
+	adcs	x17,x19,x24
+	adc	x19,x20,xzr
+
+	adds	x14,x14,x8		// accumulate low parts
+	umulh	x8,x4,x3
+	adcs	x15,x15,x9
+	umulh	x9,x5,x3
+	adcs	x16,x16,x10
+	umulh	x10,x6,x3
+	adcs	x17,x17,x11
+	umulh	x11,x7,x3
+	adc	x19,x19,xzr
+	mul	x24,x14,x23
+	adds	x15,x15,x8		// accumulate high parts
+	adcs	x16,x16,x9
+	adcs	x17,x17,x10
+	adcs	x19,x19,x11
+	adc	x20,xzr,xzr
+	lsl	x8,x24,#32		// last reduction
+	subs	x16,x16,x24
+	lsr	x9,x24,#32
+	sbcs	x17,x17,x8
+	sbcs	x19,x19,x9
+	sbc	x20,x20,xzr
+
+	subs	xzr,x14,#1
+	umulh	x9,x12,x24
+	mul	x10,x13,x24
+	umulh	x11,x13,x24
+
+	adcs	x10,x10,x9
+	adc	x11,x11,xzr
+
+	adds	x14,x15,x10
+	adcs	x15,x16,x11
+	adcs	x16,x17,x24
+	adcs	x17,x19,x24
+	adc	x19,x20,xzr
+
+	subs	x8,x14,x12		// ret -= modulus
+	sbcs	x9,x15,x13
+	sbcs	x10,x16,x21
+	sbcs	x11,x17,x22
+	sbcs	xzr,x19,xzr
+
+	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
+	csel	x15,x15,x9,lo
+	csel	x16,x16,x10,lo
+	stp	x14,x15,[x0]
+	csel	x17,x17,x11,lo
+	stp	x16,x17,[x0,#16]
+
+	ldp	x19,x20,[sp,#16]
+	ldp	x21,x22,[sp,#32]
+	ldp	x23,x24,[sp,#48]
+	ldr	x29,[sp],#64
+	ret
+
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
+//                                uint64_t rep);
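+//
+// Squares the input rep times in the Montgomery domain of the group order,
+// keeping the running value in x4-x7 between iterations so that only the
+// final result is written out.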
+.globl	_ecp_nistz256_ord_sqr_mont
+.private_extern	_ecp_nistz256_ord_sqr_mont
+
+.align	4
+_ecp_nistz256_ord_sqr_mont:
+	AARCH64_VALID_CALL_TARGET
+	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
+	stp	x29,x30,[sp,#-64]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+
+	adrp	x23,Lord@PAGE
+	add	x23,x23,Lord@PAGEOFF
+	ldp	x4,x5,[x1]
+	ldp	x6,x7,[x1,#16]
+
+	ldp	x12,x13,[x23,#0]
+	ldp	x21,x22,[x23,#16]
+	ldr	x23,[x23,#32]
+	b	Loop_ord_sqr
+
+.align	4
+Loop_ord_sqr:
+	sub	x2,x2,#1
+	////////////////////////////////////////////////////////////////
+	//  |  |  |  |  |  |a1*a0|  |
+	//  |  |  |  |  |a2*a0|  |  |
+	//  |  |a3*a2|a3*a0|  |  |  |
+	//  |  |  |  |a2*a1|  |  |  |
+	//  |  |  |a3*a1|  |  |  |  |
+	// *|  |  |  |  |  |  |  | 2|
+	// +|a3*a3|a2*a2|a1*a1|a0*a0|
+	//  |--+--+--+--+--+--+--+--|
+	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax denotes 64-bit word x of the
+	//  512-bit result.
+	//
+	//  "can't overflow" below marks a carry into the high part of a
+	//  multiplication result, which can't overflow because that part
+	//  can never be all ones.
+
+	mul	x15,x5,x4		// a[1]*a[0]
+	umulh	x9,x5,x4
+	mul	x16,x6,x4		// a[2]*a[0]
+	umulh	x10,x6,x4
+	mul	x17,x7,x4		// a[3]*a[0]
+	umulh	x19,x7,x4
+
+	adds	x16,x16,x9		// accumulate high parts of multiplication
+	mul	x8,x6,x5		// a[2]*a[1]
+	umulh	x9,x6,x5
+	adcs	x17,x17,x10
+	mul	x10,x7,x5		// a[3]*a[1]
+	umulh	x11,x7,x5
+	adc	x19,x19,xzr		// can't overflow
+
+	mul	x20,x7,x6		// a[3]*a[2]
+	umulh	x1,x7,x6
+
+	adds	x9,x9,x10		// accumulate high parts of multiplication
+	mul	x14,x4,x4		// a[0]*a[0]
+	adc	x10,x11,xzr		// can't overflow
+
+	adds	x17,x17,x8		// accumulate low parts of multiplication
+	umulh	x4,x4,x4
+	adcs	x19,x19,x9
+	mul	x9,x5,x5		// a[1]*a[1]
+	adcs	x20,x20,x10
+	umulh	x5,x5,x5
+	adc	x1,x1,xzr		// can't overflow
+
+	adds	x15,x15,x15	// acc[1-6]*=2
+	mul	x10,x6,x6		// a[2]*a[2]
+	adcs	x16,x16,x16
+	umulh	x6,x6,x6
+	adcs	x17,x17,x17
+	mul	x11,x7,x7		// a[3]*a[3]
+	adcs	x19,x19,x19
+	umulh	x7,x7,x7
+	adcs	x20,x20,x20
+	adcs	x1,x1,x1
+	adc	x3,xzr,xzr
+
+	adds	x15,x15,x4		// +a[i]*a[i]
+	mul	x24,x14,x23
+	adcs	x16,x16,x9
+	adcs	x17,x17,x5
+	adcs	x19,x19,x10
+	adcs	x20,x20,x6
+	adcs	x1,x1,x11
+	adc	x3,x3,x7
+	subs	xzr,x14,#1
+	umulh	x9,x12,x24
+	mul	x10,x13,x24
+	umulh	x11,x13,x24
+
+	adcs	x10,x10,x9
+	adc	x11,x11,xzr
+
+	adds	x14,x15,x10
+	adcs	x15,x16,x11
+	adcs	x16,x17,x24
+	adc	x17,xzr,x24		// can't overflow
+	mul	x11,x14,x23
+	lsl	x8,x24,#32
+	subs	x15,x15,x24
+	lsr	x9,x24,#32
+	sbcs	x16,x16,x8
+	sbc	x17,x17,x9		// can't borrow
+	subs	xzr,x14,#1
+	umulh	x9,x12,x11
+	mul	x10,x13,x11
+	umulh	x24,x13,x11
+
+	adcs	x10,x10,x9
+	adc	x24,x24,xzr
+
+	adds	x14,x15,x10
+	adcs	x15,x16,x24
+	adcs	x16,x17,x11
+	adc	x17,xzr,x11		// can't overflow
+	mul	x24,x14,x23
+	lsl	x8,x11,#32
+	subs	x15,x15,x11
+	lsr	x9,x11,#32
+	sbcs	x16,x16,x8
+	sbc	x17,x17,x9		// can't borrow
+	subs	xzr,x14,#1
+	umulh	x9,x12,x24
+	mul	x10,x13,x24
+	umulh	x11,x13,x24
+
+	adcs	x10,x10,x9
+	adc	x11,x11,xzr
+
+	adds	x14,x15,x10
+	adcs	x15,x16,x11
+	adcs	x16,x17,x24
+	adc	x17,xzr,x24		// can't overflow
+	mul	x11,x14,x23
+	lsl	x8,x24,#32
+	subs	x15,x15,x24
+	lsr	x9,x24,#32
+	sbcs	x16,x16,x8
+	sbc	x17,x17,x9		// can't borrow
+	subs	xzr,x14,#1
+	umulh	x9,x12,x11
+	mul	x10,x13,x11
+	umulh	x24,x13,x11
+
+	adcs	x10,x10,x9
+	adc	x24,x24,xzr
+
+	adds	x14,x15,x10
+	adcs	x15,x16,x24
+	adcs	x16,x17,x11
+	adc	x17,xzr,x11		// can't overflow
+	lsl	x8,x11,#32
+	subs	x15,x15,x11
+	lsr	x9,x11,#32
+	sbcs	x16,x16,x8
+	sbc	x17,x17,x9		// can't borrow
+	adds	x14,x14,x19	// accumulate upper half
+	adcs	x15,x15,x20
+	adcs	x16,x16,x1
+	adcs	x17,x17,x3
+	adc	x19,xzr,xzr
+
+	subs	x8,x14,x12		// ret -= modulus
+	sbcs	x9,x15,x13
+	sbcs	x10,x16,x21
+	sbcs	x11,x17,x22
+	sbcs	xzr,x19,xzr
+
+	csel	x4,x14,x8,lo	// ret = borrow ? ret : ret-modulus
+	csel	x5,x15,x9,lo
+	csel	x6,x16,x10,lo
+	csel	x7,x17,x11,lo
+
+	cbnz	x2,Loop_ord_sqr
+
+	stp	x4,x5,[x0]
+	stp	x6,x7,[x0,#16]
+
+	ldp	x19,x20,[sp,#16]
+	ldp	x21,x22,[sp,#32]
+	ldp	x23,x24,[sp,#48]
+	ldr	x29,[sp],#64
+	ret
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
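+//
+// Constant-time lookup: every one of the 16 table entries is loaded and
+// conditionally accumulated under a mask, so the memory access pattern does
+// not depend on the secret index in w2.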
+.globl	_ecp_nistz256_select_w5
+.private_extern	_ecp_nistz256_select_w5
+
+.align	4
+_ecp_nistz256_select_w5:
+	AARCH64_VALID_CALL_TARGET
+
+    // x10 := x0
+    // w9 := 0; loop counter and incremented internal index
+	mov	x10, x0
+	mov	w9, #0
+
+    // [v16-v21] := 0
+	movi	v16.16b, #0
+	movi	v17.16b, #0
+	movi	v18.16b, #0
+	movi	v19.16b, #0
+	movi	v20.16b, #0
+	movi	v21.16b, #0
+
+Lselect_w5_loop:
+    // Loop 16 times.
+
+    // Increment index (loop counter); tested at the end of the loop
+	add	w9, w9, #1
+
+    // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1
+    //  and advance x1 to point to the next entry
+	ld1	{v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64
+
+    // x11 := (w9 == w2)? All 1s : All 0s
+	cmp	w9, w2
+	csetm	x11, eq
+
+    // continue loading ...
+	ld1	{v26.2d, v27.2d}, [x1],#32
+
+    // duplicate mask_64 into Mask (all 0s or all 1s)
+	dup	v3.2d, x11
+
+    // [v16-v21] := (Mask == all 1s)? [v22-v27] : [v16-v21]
+    // i.e., values in the output registers remain unchanged if w9 != w2
+	bit	v16.16b, v22.16b, v3.16b
+	bit	v17.16b, v23.16b, v3.16b
+
+	bit	v18.16b, v24.16b, v3.16b
+	bit	v19.16b, v25.16b, v3.16b
+
+	bit	v20.16b, v26.16b, v3.16b
+	bit	v21.16b, v27.16b, v3.16b
+
+    // If bit #4 is 0 (i.e. idx_ctr < 16), loop back
+	tbz	w9, #4, Lselect_w5_loop
+
+    // Write [v16-v21] to memory at the output pointer
+	st1	{v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64
+	st1	{v20.2d, v21.2d}, [x10]
+
+	ret
+
+
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
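+//
+// The same constant-time scan as _ecp_nistz256_select_w5, but over 64
+// entries of 2*256 bits each.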
+.globl	_ecp_nistz256_select_w7
+.private_extern	_ecp_nistz256_select_w7
+
+.align	4
+_ecp_nistz256_select_w7:
+	AARCH64_VALID_CALL_TARGET
+
+    // w9 := 0; loop counter and incremented internal index
+	mov	w9, #0
+
+    // [v16-v19] := 0
+	movi	v16.16b, #0
+	movi	v17.16b, #0
+	movi	v18.16b, #0
+	movi	v19.16b, #0
+
+Lselect_w7_loop:
+    // Loop 64 times.
+
+    // Increment index (loop counter); tested at the end of the loop
+	add	w9, w9, #1
+
+    // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1
+    //  and advance x1 to point to the next entry
+	ld1	{v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64
+
+    // x11 := (w9 == w2)? All 1s : All 0s
+	cmp	w9, w2
+	csetm	x11, eq
+
+    // duplicate mask_64 into Mask (all 0s or all 1s)
+	dup	v3.2d, x11
+
+    // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
+    // i.e., values in output registers will remain the same if w9 != w2
+	bit	v16.16b, v22.16b, v3.16b
+	bit	v17.16b, v23.16b, v3.16b
+
+	bit	v18.16b, v24.16b, v3.16b
+	bit	v19.16b, v25.16b, v3.16b
+
+    // If bit #6 is 0 (i.e. idx_ctr < 64), loop back
+	tbz	w9, #6, Lselect_w7_loop
+
+    // Write [v16-v19] to memory at the output pointer
+	st1	{v16.2d, v17.2d, v18.2d, v19.2d}, [x0]
+
+	ret
+
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/gen/bcm/p256-armv8-asm-linux.S b/gen/bcm/p256-armv8-asm-linux.S
new file mode 100644
index 0000000..28d9ac9
--- /dev/null
+++ b/gen/bcm/p256-armv8-asm-linux.S
@@ -0,0 +1,1726 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+#include "openssl/arm_arch.h"
+
+.section	.rodata
+.align	5
+.Lpoly:
+.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
+.LRR:	//	2^512 mod P precomputed for the NIST P-256 prime
+.quad	0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
+.Lone_mont:
+.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
+.Lone:
+.quad	1,0,0,0
+.Lord:
+.quad	0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
+.LordK:
+.quad	0xccd1c8aaee00bc4f
+.byte	69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.text
+
+// void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
+//					     const BN_ULONG x2[4]);
+.globl	ecp_nistz256_mul_mont
+.hidden	ecp_nistz256_mul_mont
+.type	ecp_nistz256_mul_mont,%function
+.align	4
+ecp_nistz256_mul_mont:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-32]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+
+	ldr	x3,[x2]		// bp[0]
+	ldp	x4,x5,[x1]
+	ldp	x6,x7,[x1,#16]
+	adrp	x13,.Lpoly
+	add	x13,x13,:lo12:.Lpoly
+	ldr	x12,[x13,#8]
+	ldr	x13,[x13,#24]
+
+	bl	__ecp_nistz256_mul_mont
+
+	ldp	x19,x20,[sp,#16]
+	ldp	x29,x30,[sp],#32
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
+
+// void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl	ecp_nistz256_sqr_mont
+.hidden	ecp_nistz256_sqr_mont
+.type	ecp_nistz256_sqr_mont,%function
+.align	4
+ecp_nistz256_sqr_mont:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-32]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+
+	ldp	x4,x5,[x1]
+	ldp	x6,x7,[x1,#16]
+	adrp	x13,.Lpoly
+	add	x13,x13,:lo12:.Lpoly
+	ldr	x12,[x13,#8]
+	ldr	x13,[x13,#24]
+
+	bl	__ecp_nistz256_sqr_mont
+
+	ldp	x19,x20,[sp,#16]
+	ldp	x29,x30,[sp],#32
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
+
+// void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl	ecp_nistz256_div_by_2
+.hidden	ecp_nistz256_div_by_2
+.type	ecp_nistz256_div_by_2,%function
+.align	4
+ecp_nistz256_div_by_2:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ldp	x14,x15,[x1]
+	ldp	x16,x17,[x1,#16]
+	adrp	x13,.Lpoly
+	add	x13,x13,:lo12:.Lpoly
+	ldr	x12,[x13,#8]
+	ldr	x13,[x13,#24]
+
+	bl	__ecp_nistz256_div_by_2
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
+
+// void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl	ecp_nistz256_mul_by_2
+.hidden	ecp_nistz256_mul_by_2
+.type	ecp_nistz256_mul_by_2,%function
+.align	4
+ecp_nistz256_mul_by_2:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ldp	x14,x15,[x1]
+	ldp	x16,x17,[x1,#16]
+	adrp	x13,.Lpoly
+	add	x13,x13,:lo12:.Lpoly
+	ldr	x12,[x13,#8]
+	ldr	x13,[x13,#24]
+	mov	x8,x14
+	mov	x9,x15
+	mov	x10,x16
+	mov	x11,x17
+
+	bl	__ecp_nistz256_add_to	// ret = a+a	// 2*a
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
+
+// void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl	ecp_nistz256_mul_by_3
+.hidden	ecp_nistz256_mul_by_3
+.type	ecp_nistz256_mul_by_3,%function
+.align	4
+ecp_nistz256_mul_by_3:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ldp	x14,x15,[x1]
+	ldp	x16,x17,[x1,#16]
+	adrp	x13,.Lpoly
+	add	x13,x13,:lo12:.Lpoly
+	ldr	x12,[x13,#8]
+	ldr	x13,[x13,#24]
+	mov	x8,x14
+	mov	x9,x15
+	mov	x10,x16
+	mov	x11,x17
+	mov	x4,x14
+	mov	x5,x15
+	mov	x6,x16
+	mov	x7,x17
+
+	bl	__ecp_nistz256_add_to	// ret = a+a	// 2*a
+
+	mov	x8,x4
+	mov	x9,x5
+	mov	x10,x6
+	mov	x11,x7
+
+	bl	__ecp_nistz256_add_to	// ret += a	// 2*a+a=3*a
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
+
+// void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
+//				        const BN_ULONG x2[4]);
+.globl	ecp_nistz256_sub
+.hidden	ecp_nistz256_sub
+.type	ecp_nistz256_sub,%function
+.align	4
+ecp_nistz256_sub:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ldp	x14,x15,[x1]
+	ldp	x16,x17,[x1,#16]
+	adrp	x13,.Lpoly
+	add	x13,x13,:lo12:.Lpoly
+	ldr	x12,[x13,#8]
+	ldr	x13,[x13,#24]
+
+	bl	__ecp_nistz256_sub_from
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ecp_nistz256_sub,.-ecp_nistz256_sub
+
+// void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl	ecp_nistz256_neg
+.hidden	ecp_nistz256_neg
+.type	ecp_nistz256_neg,%function
+.align	4
+ecp_nistz256_neg:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	mov	x2,x1
+	mov	x14,xzr		// a = 0
+	mov	x15,xzr
+	mov	x16,xzr
+	mov	x17,xzr
+	adrp	x13,.Lpoly
+	add	x13,x13,:lo12:.Lpoly
+	ldr	x12,[x13,#8]
+	ldr	x13,[x13,#24]
+
+	bl	__ecp_nistz256_sub_from
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ecp_nistz256_neg,.-ecp_nistz256_neg
+
+// Note that __ecp_nistz256_mul_mont expects a[0-3] pre-loaded into x4-x7
+// and b[0] into x3.
+.type	__ecp_nistz256_mul_mont,%function
+.align	4
+__ecp_nistz256_mul_mont:
+	mul	x14,x4,x3		// a[0]*b[0]
+	umulh	x8,x4,x3
+
+	mul	x15,x5,x3		// a[1]*b[0]
+	umulh	x9,x5,x3
+
+	mul	x16,x6,x3		// a[2]*b[0]
+	umulh	x10,x6,x3
+
+	mul	x17,x7,x3		// a[3]*b[0]
+	umulh	x11,x7,x3
+	ldr	x3,[x2,#8]		// b[1]
+
+	adds	x15,x15,x8		// accumulate high parts of multiplication
+	lsl	x8,x14,#32
+	adcs	x16,x16,x9
+	lsr	x9,x14,#32
+	adcs	x17,x17,x10
+	adc	x19,xzr,x11
+	mov	x20,xzr
+	subs	x10,x14,x8		// "*0xffff0001"
+	sbc	x11,x14,x9
+	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
+	mul	x8,x4,x3		// lo(a[0]*b[i])
+	adcs	x15,x16,x9
+	mul	x9,x5,x3		// lo(a[1]*b[i])
+	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
+	mul	x10,x6,x3		// lo(a[2]*b[i])
+	adcs	x17,x19,x11
+	mul	x11,x7,x3		// lo(a[3]*b[i])
+	adc	x19,x20,xzr
+
+	adds	x14,x14,x8		// accumulate low parts of multiplication
+	umulh	x8,x4,x3		// hi(a[0]*b[i])
+	adcs	x15,x15,x9
+	umulh	x9,x5,x3		// hi(a[1]*b[i])
+	adcs	x16,x16,x10
+	umulh	x10,x6,x3		// hi(a[2]*b[i])
+	adcs	x17,x17,x11
+	umulh	x11,x7,x3		// hi(a[3]*b[i])
+	adc	x19,x19,xzr
+	ldr	x3,[x2,#8*(1+1)]	// b[1+1]
+	adds	x15,x15,x8		// accumulate high parts of multiplication
+	lsl	x8,x14,#32
+	adcs	x16,x16,x9
+	lsr	x9,x14,#32
+	adcs	x17,x17,x10
+	adcs	x19,x19,x11
+	adc	x20,xzr,xzr
+	subs	x10,x14,x8		// "*0xffff0001"
+	sbc	x11,x14,x9
+	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
+	mul	x8,x4,x3		// lo(a[0]*b[i])
+	adcs	x15,x16,x9
+	mul	x9,x5,x3		// lo(a[1]*b[i])
+	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
+	mul	x10,x6,x3		// lo(a[2]*b[i])
+	adcs	x17,x19,x11
+	mul	x11,x7,x3		// lo(a[3]*b[i])
+	adc	x19,x20,xzr
+
+	adds	x14,x14,x8		// accumulate low parts of multiplication
+	umulh	x8,x4,x3		// hi(a[0]*b[i])
+	adcs	x15,x15,x9
+	umulh	x9,x5,x3		// hi(a[1]*b[i])
+	adcs	x16,x16,x10
+	umulh	x10,x6,x3		// hi(a[2]*b[i])
+	adcs	x17,x17,x11
+	umulh	x11,x7,x3		// hi(a[3]*b[i])
+	adc	x19,x19,xzr
+	ldr	x3,[x2,#8*(2+1)]	// b[2+1]
+	adds	x15,x15,x8		// accumulate high parts of multiplication
+	lsl	x8,x14,#32
+	adcs	x16,x16,x9
+	lsr	x9,x14,#32
+	adcs	x17,x17,x10
+	adcs	x19,x19,x11
+	adc	x20,xzr,xzr
+	subs	x10,x14,x8		// "*0xffff0001"
+	sbc	x11,x14,x9
+	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
+	mul	x8,x4,x3		// lo(a[0]*b[i])
+	adcs	x15,x16,x9
+	mul	x9,x5,x3		// lo(a[1]*b[i])
+	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
+	mul	x10,x6,x3		// lo(a[2]*b[i])
+	adcs	x17,x19,x11
+	mul	x11,x7,x3		// lo(a[3]*b[i])
+	adc	x19,x20,xzr
+
+	adds	x14,x14,x8		// accumulate low parts of multiplication
+	umulh	x8,x4,x3		// hi(a[0]*b[i])
+	adcs	x15,x15,x9
+	umulh	x9,x5,x3		// hi(a[1]*b[i])
+	adcs	x16,x16,x10
+	umulh	x10,x6,x3		// hi(a[2]*b[i])
+	adcs	x17,x17,x11
+	umulh	x11,x7,x3		// hi(a[3]*b[i])
+	adc	x19,x19,xzr
+	adds	x15,x15,x8		// accumulate high parts of multiplication
+	lsl	x8,x14,#32
+	adcs	x16,x16,x9
+	lsr	x9,x14,#32
+	adcs	x17,x17,x10
+	adcs	x19,x19,x11
+	adc	x20,xzr,xzr
+	// last reduction
+	subs	x10,x14,x8		// "*0xffff0001"
+	sbc	x11,x14,x9
+	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
+	adcs	x15,x16,x9
+	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
+	adcs	x17,x19,x11
+	adc	x19,x20,xzr
+
+	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
+	sbcs	x9,x15,x12
+	sbcs	x10,x16,xzr
+	sbcs	x11,x17,x13
+	sbcs	xzr,x19,xzr		// did it borrow?
+
+	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
+	csel	x15,x15,x9,lo
+	csel	x16,x16,x10,lo
+	stp	x14,x15,[x0]
+	csel	x17,x17,x11,lo
+	stp	x16,x17,[x0,#16]
+
+	ret
+.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
+
+// Note that __ecp_nistz256_sqr_mont expects a[0-3] pre-loaded into x4-x7.
+.type	__ecp_nistz256_sqr_mont,%function
+.align	4
+__ecp_nistz256_sqr_mont:
+	//  |  |  |  |  |  |a1*a0|  |
+	//  |  |  |  |  |a2*a0|  |  |
+	//  |  |a3*a2|a3*a0|  |  |  |
+	//  |  |  |  |a2*a1|  |  |  |
+	//  |  |  |a3*a1|  |  |  |  |
+	// *|  |  |  |  |  |  |  | 2|
+	// +|a3*a3|a2*a2|a1*a1|a0*a0|
+	//  |--+--+--+--+--+--+--+--|
+	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax denotes 64-bit word x of the
+	//  512-bit result.
+	//
+	//  "can't overflow" below marks a carry into the high part of a
+	//  multiplication result, which can't overflow because that part
+	//  can never be all ones.
+
+	mul	x15,x5,x4		// a[1]*a[0]
+	umulh	x9,x5,x4
+	mul	x16,x6,x4		// a[2]*a[0]
+	umulh	x10,x6,x4
+	mul	x17,x7,x4		// a[3]*a[0]
+	umulh	x19,x7,x4
+
+	adds	x16,x16,x9		// accumulate high parts of multiplication
+	mul	x8,x6,x5		// a[2]*a[1]
+	umulh	x9,x6,x5
+	adcs	x17,x17,x10
+	mul	x10,x7,x5		// a[3]*a[1]
+	umulh	x11,x7,x5
+	adc	x19,x19,xzr		// can't overflow
+
+	mul	x20,x7,x6		// a[3]*a[2]
+	umulh	x1,x7,x6
+
+	adds	x9,x9,x10		// accumulate high parts of multiplication
+	mul	x14,x4,x4		// a[0]*a[0]
+	adc	x10,x11,xzr		// can't overflow
+
+	adds	x17,x17,x8		// accumulate low parts of multiplication
+	umulh	x4,x4,x4
+	adcs	x19,x19,x9
+	mul	x9,x5,x5		// a[1]*a[1]
+	adcs	x20,x20,x10
+	umulh	x5,x5,x5
+	adc	x1,x1,xzr		// can't overflow
+
+	adds	x15,x15,x15	// acc[1-6]*=2
+	mul	x10,x6,x6		// a[2]*a[2]
+	adcs	x16,x16,x16
+	umulh	x6,x6,x6
+	adcs	x17,x17,x17
+	mul	x11,x7,x7		// a[3]*a[3]
+	adcs	x19,x19,x19
+	umulh	x7,x7,x7
+	adcs	x20,x20,x20
+	adcs	x1,x1,x1
+	adc	x2,xzr,xzr
+
+	adds	x15,x15,x4		// +a[i]*a[i]
+	adcs	x16,x16,x9
+	adcs	x17,x17,x5
+	adcs	x19,x19,x10
+	adcs	x20,x20,x6
+	lsl	x8,x14,#32
+	adcs	x1,x1,x11
+	lsr	x9,x14,#32
+	adc	x2,x2,x7
+	subs	x10,x14,x8		// "*0xffff0001"
+	sbc	x11,x14,x9
+	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
+	adcs	x15,x16,x9
+	lsl	x8,x14,#32
+	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
+	lsr	x9,x14,#32
+	adc	x17,x11,xzr		// can't overflow
+	subs	x10,x14,x8		// "*0xffff0001"
+	sbc	x11,x14,x9
+	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
+	adcs	x15,x16,x9
+	lsl	x8,x14,#32
+	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
+	lsr	x9,x14,#32
+	adc	x17,x11,xzr		// can't overflow
+	subs	x10,x14,x8		// "*0xffff0001"
+	sbc	x11,x14,x9
+	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
+	adcs	x15,x16,x9
+	lsl	x8,x14,#32
+	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
+	lsr	x9,x14,#32
+	adc	x17,x11,xzr		// can't overflow
+	subs	x10,x14,x8		// "*0xffff0001"
+	sbc	x11,x14,x9
+	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
+	adcs	x15,x16,x9
+	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
+	adc	x17,x11,xzr		// can't overflow
+
+	adds	x14,x14,x19	// accumulate upper half
+	adcs	x15,x15,x20
+	adcs	x16,x16,x1
+	adcs	x17,x17,x2
+	adc	x19,xzr,xzr
+
+	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
+	sbcs	x9,x15,x12
+	sbcs	x10,x16,xzr
+	sbcs	x11,x17,x13
+	sbcs	xzr,x19,xzr		// did it borrow?
+
+	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
+	csel	x15,x15,x9,lo
+	csel	x16,x16,x10,lo
+	stp	x14,x15,[x0]
+	csel	x17,x17,x11,lo
+	stp	x16,x17,[x0,#16]
+
+	ret
+.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont
+
+// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded into
+// x14-x17 and x8-x11. This is done because it's used in multiple contexts,
+// e.g. in multiplication by 2 and 3.
+.type	__ecp_nistz256_add_to,%function
+.align	4
+__ecp_nistz256_add_to:
+	adds	x14,x14,x8		// ret = a+b
+	adcs	x15,x15,x9
+	adcs	x16,x16,x10
+	adcs	x17,x17,x11
+	adc	x1,xzr,xzr		// x1 := carry
+
+	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
+	sbcs	x9,x15,x12
+	sbcs	x10,x16,xzr
+	sbcs	x11,x17,x13
+	sbcs	xzr,x1,xzr		// did subtraction borrow?
+
+	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
+	csel	x15,x15,x9,lo
+	csel	x16,x16,x10,lo
+	stp	x14,x15,[x0]
+	csel	x17,x17,x11,lo
+	stp	x16,x17,[x0,#16]
+
+	ret
+.size	__ecp_nistz256_add_to,.-__ecp_nistz256_add_to
+
+.type	__ecp_nistz256_sub_from,%function
+.align	4
+__ecp_nistz256_sub_from:
+	ldp	x8,x9,[x2]
+	ldp	x10,x11,[x2,#16]
+	subs	x14,x14,x8		// ret = a-b
+	sbcs	x15,x15,x9
+	sbcs	x16,x16,x10
+	sbcs	x17,x17,x11
+	sbc	x1,xzr,xzr		// x1 := borrow ? -1 : 0
+
+	subs	x8,x14,#1		// adds	x8,x14,#-1 // tmp = ret+modulus
+	adcs	x9,x15,x12
+	adcs	x10,x16,xzr
+	adc	x11,x17,x13
+	cmp	x1,xzr			// did subtraction borrow?
+
+	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
+	csel	x15,x15,x9,eq
+	csel	x16,x16,x10,eq
+	stp	x14,x15,[x0]
+	csel	x17,x17,x11,eq
+	stp	x16,x17,[x0,#16]
+
+	ret
+.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
+
+.type	__ecp_nistz256_sub_morf,%function
+.align	4
+__ecp_nistz256_sub_morf:
+	ldp	x8,x9,[x2]
+	ldp	x10,x11,[x2,#16]
+	subs	x14,x8,x14		// ret = b-a
+	sbcs	x15,x9,x15
+	sbcs	x16,x10,x16
+	sbcs	x17,x11,x17
+	sbc	x1,xzr,xzr		// x1 := borrow ? -1 : 0
+
+	subs	x8,x14,#1		// adds	x8,x14,#-1 // tmp = ret+modulus
+	adcs	x9,x15,x12
+	adcs	x10,x16,xzr
+	adc	x11,x17,x13
+	cmp	x1,xzr			// did subtraction borrow?
+
+	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
+	csel	x15,x15,x9,eq
+	csel	x16,x16,x10,eq
+	stp	x14,x15,[x0]
+	csel	x17,x17,x11,eq
+	stp	x16,x17,[x0,#16]
+
+	ret
+.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
+
+.type	__ecp_nistz256_div_by_2,%function
+.align	4
+__ecp_nistz256_div_by_2:
+	subs	x8,x14,#1		// adds	x8,x14,#-1 // tmp = a+modulus
+	adcs	x9,x15,x12
+	adcs	x10,x16,xzr
+	adcs	x11,x17,x13
+	adc	x1,xzr,xzr		// x1 := carry
+	tst	x14,#1		// is a even?
+
+	csel	x14,x14,x8,eq	// ret = even ? a : a+modulus
+	csel	x15,x15,x9,eq
+	csel	x16,x16,x10,eq
+	csel	x17,x17,x11,eq
+	csel	x1,xzr,x1,eq
+
+	lsr	x14,x14,#1		// ret >>= 1
+	orr	x14,x14,x15,lsl#63
+	lsr	x15,x15,#1
+	orr	x15,x15,x16,lsl#63
+	lsr	x16,x16,#1
+	orr	x16,x16,x17,lsl#63
+	lsr	x17,x17,#1
+	stp	x14,x15,[x0]
+	orr	x17,x17,x1,lsl#63
+	stp	x16,x17,[x0,#16]
+
+	ret
+.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
+.globl	ecp_nistz256_point_double
+.hidden	ecp_nistz256_point_double
+.type	ecp_nistz256_point_double,%function
+.align	5
+ecp_nistz256_point_double:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	sub	sp,sp,#32*4
+
+.Ldouble_shortcut:
+	ldp	x14,x15,[x1,#32]
+	mov	x21,x0
+	ldp	x16,x17,[x1,#48]
+	mov	x22,x1
+	adrp	x13,.Lpoly
+	add	x13,x13,:lo12:.Lpoly
+	ldr	x12,[x13,#8]
+	mov	x8,x14
+	ldr	x13,[x13,#24]
+	mov	x9,x15
+	ldp	x4,x5,[x22,#64]	// forward load for p256_sqr_mont
+	mov	x10,x16
+	mov	x11,x17
+	ldp	x6,x7,[x22,#64+16]
+	add	x0,sp,#0
+	bl	__ecp_nistz256_add_to	// p256_mul_by_2(S, in_y);
+
+	add	x0,sp,#64
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);
+
+	ldp	x8,x9,[x22]
+	ldp	x10,x11,[x22,#16]
+	mov	x4,x14		// put Zsqr aside for p256_sub
+	mov	x5,x15
+	mov	x6,x16
+	mov	x7,x17
+	add	x0,sp,#32
+	bl	__ecp_nistz256_add_to	// p256_add(M, Zsqr, in_x);
+
+	add	x2,x22,#0
+	mov	x14,x4		// restore Zsqr
+	mov	x15,x5
+	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
+	mov	x16,x6
+	mov	x17,x7
+	ldp	x6,x7,[sp,#0+16]
+	add	x0,sp,#64
+	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);
+
+	add	x0,sp,#0
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);
+
+	ldr	x3,[x22,#32]
+	ldp	x4,x5,[x22,#64]
+	ldp	x6,x7,[x22,#64+16]
+	add	x2,x22,#32
+	add	x0,sp,#96
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);
+
+	mov	x8,x14
+	mov	x9,x15
+	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
+	mov	x10,x16
+	mov	x11,x17
+	ldp	x6,x7,[sp,#0+16]
+	add	x0,x21,#64
+	bl	__ecp_nistz256_add_to	// p256_mul_by_2(res_z, tmp0);
+
+	add	x0,sp,#96
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);
+
+	ldr	x3,[sp,#64]		// forward load for p256_mul_mont
+	ldp	x4,x5,[sp,#32]
+	ldp	x6,x7,[sp,#32+16]
+	add	x0,x21,#32
+	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);
+
+	add	x2,sp,#64
+	add	x0,sp,#32
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);
+
+	mov	x8,x14		// duplicate M
+	mov	x9,x15
+	mov	x10,x16
+	mov	x11,x17
+	mov	x4,x14		// put M aside
+	mov	x5,x15
+	mov	x6,x16
+	mov	x7,x17
+	add	x0,sp,#32
+	bl	__ecp_nistz256_add_to
+	mov	x8,x4			// restore M
+	mov	x9,x5
+	ldr	x3,[x22]		// forward load for p256_mul_mont
+	mov	x10,x6
+	ldp	x4,x5,[sp,#0]
+	mov	x11,x7
+	ldp	x6,x7,[sp,#0+16]
+	bl	__ecp_nistz256_add_to	// p256_mul_by_3(M, M);
+
+	add	x2,x22,#0
+	add	x0,sp,#0
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);
+
+	mov	x8,x14
+	mov	x9,x15
+	ldp	x4,x5,[sp,#32]	// forward load for p256_sqr_mont
+	mov	x10,x16
+	mov	x11,x17
+	ldp	x6,x7,[sp,#32+16]
+	add	x0,sp,#96
+	bl	__ecp_nistz256_add_to	// p256_mul_by_2(tmp0, S);
+
+	add	x0,x21,#0
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);
+
+	add	x2,sp,#96
+	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);
+
+	add	x2,sp,#0
+	add	x0,sp,#0
+	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);
+
+	ldr	x3,[sp,#32]
+	mov	x4,x14		// copy S
+	mov	x5,x15
+	mov	x6,x16
+	mov	x7,x17
+	add	x2,sp,#32
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);
+
+	add	x2,x21,#32
+	add	x0,x21,#32
+	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);
+
+	add	sp,x29,#0		// destroy frame
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
+.globl	ecp_nistz256_point_add
+.hidden	ecp_nistz256_point_add
+.type	ecp_nistz256_point_add,%function
+.align	5
+ecp_nistz256_point_add:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#32*12
+
+	ldp	x4,x5,[x2,#64]	// in2_z
+	ldp	x6,x7,[x2,#64+16]
+	mov	x21,x0
+	mov	x22,x1
+	mov	x23,x2
+	adrp	x13,.Lpoly
+	add	x13,x13,:lo12:.Lpoly
+	ldr	x12,[x13,#8]
+	ldr	x13,[x13,#24]
+	orr	x8,x4,x5
+	orr	x10,x6,x7
+	orr	x25,x8,x10
+	cmp	x25,#0
+	csetm	x25,ne		// ~in2infty
+	add	x0,sp,#192
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z2sqr, in2_z);
+
+	ldp	x4,x5,[x22,#64]	// in1_z
+	ldp	x6,x7,[x22,#64+16]
+	orr	x8,x4,x5
+	orr	x10,x6,x7
+	orr	x24,x8,x10
+	cmp	x24,#0
+	csetm	x24,ne		// ~in1infty
+	add	x0,sp,#128
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);
+
+	ldr	x3,[x23,#64]
+	ldp	x4,x5,[sp,#192]
+	ldp	x6,x7,[sp,#192+16]
+	add	x2,x23,#64
+	add	x0,sp,#320
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, Z2sqr, in2_z);
+
+	ldr	x3,[x22,#64]
+	ldp	x4,x5,[sp,#128]
+	ldp	x6,x7,[sp,#128+16]
+	add	x2,x22,#64
+	add	x0,sp,#352
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);
+
+	ldr	x3,[x22,#32]
+	ldp	x4,x5,[sp,#320]
+	ldp	x6,x7,[sp,#320+16]
+	add	x2,x22,#32
+	add	x0,sp,#320
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, S1, in1_y);
+
+	ldr	x3,[x23,#32]
+	ldp	x4,x5,[sp,#352]
+	ldp	x6,x7,[sp,#352+16]
+	add	x2,x23,#32
+	add	x0,sp,#352
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);
+
+	add	x2,sp,#320
+	ldr	x3,[sp,#192]	// forward load for p256_mul_mont
+	ldp	x4,x5,[x22]
+	ldp	x6,x7,[x22,#16]
+	add	x0,sp,#160
+	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, S1);
+
+	orr	x14,x14,x15	// see if result is zero
+	orr	x16,x16,x17
+	orr	x26,x14,x16	// ~is_equal(S1,S2)
+
+	add	x2,sp,#192
+	add	x0,sp,#256
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U1, in1_x, Z2sqr);
+
+	ldr	x3,[sp,#128]
+	ldp	x4,x5,[x23]
+	ldp	x6,x7,[x23,#16]
+	add	x2,sp,#128
+	add	x0,sp,#288
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in2_x, Z1sqr);
+
+	add	x2,sp,#256
+	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
+	ldp	x6,x7,[sp,#160+16]
+	add	x0,sp,#96
+	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, U1);
+
+	orr	x14,x14,x15	// see if result is zero
+	orr	x16,x16,x17
+	orr	x14,x14,x16	// ~is_equal(U1,U2)
+
+	mvn	x27,x24	// -1/0 -> 0/-1
+	mvn	x28,x25	// -1/0 -> 0/-1
+	orr	x14,x14,x27
+	orr	x14,x14,x28
+	orr	x14,x14,x26
+	cbnz	x14,.Ladd_proceed	// if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))
+
+.Ladd_double:
+	mov	x1,x22
+	mov	x0,x21
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	add	sp,sp,#256	// #256 = #32*(12-4), the difference between the two stack frames
+	b	.Ldouble_shortcut
+
+.align	4
+.Ladd_proceed:
+	add	x0,sp,#192
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);
+
+	ldr	x3,[x22,#64]
+	ldp	x4,x5,[sp,#96]
+	ldp	x6,x7,[sp,#96+16]
+	add	x2,x22,#64
+	add	x0,sp,#64
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);
+
+	ldp	x4,x5,[sp,#96]
+	ldp	x6,x7,[sp,#96+16]
+	add	x0,sp,#128
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);
+
+	ldr	x3,[x23,#64]
+	ldp	x4,x5,[sp,#64]
+	ldp	x6,x7,[sp,#64+16]
+	add	x2,x23,#64
+	add	x0,sp,#64
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, res_z, in2_z);
+
+	ldr	x3,[sp,#96]
+	ldp	x4,x5,[sp,#128]
+	ldp	x6,x7,[sp,#128+16]
+	add	x2,sp,#96
+	add	x0,sp,#224
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);
+
+	ldr	x3,[sp,#128]
+	ldp	x4,x5,[sp,#256]
+	ldp	x6,x7,[sp,#256+16]
+	add	x2,sp,#128
+	add	x0,sp,#288
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, U1, Hsqr);
+
+	mov	x8,x14
+	mov	x9,x15
+	mov	x10,x16
+	mov	x11,x17
+	add	x0,sp,#128
+	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);
+
+	add	x2,sp,#192
+	add	x0,sp,#0
+	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);
+
+	add	x2,sp,#224
+	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);
+
+	add	x2,sp,#288
+	ldr	x3,[sp,#224]		// forward load for p256_mul_mont
+	ldp	x4,x5,[sp,#320]
+	ldp	x6,x7,[sp,#320+16]
+	add	x0,sp,#32
+	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);
+
+	add	x2,sp,#224
+	add	x0,sp,#352
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S1, Hcub);
+
+	ldr	x3,[sp,#160]
+	ldp	x4,x5,[sp,#32]
+	ldp	x6,x7,[sp,#32+16]
+	add	x2,sp,#160
+	add	x0,sp,#32
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);
+
+	add	x2,sp,#352
+	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);
+
+	ldp	x4,x5,[sp,#0]		// res
+	ldp	x6,x7,[sp,#0+16]
+	ldp	x8,x9,[x23]		// in2
+	ldp	x10,x11,[x23,#16]
+	ldp	x14,x15,[x22,#0]	// in1
+	cmp	x24,#0			// ~, remember?
+	ldp	x16,x17,[x22,#0+16]
+	csel	x8,x4,x8,ne
+	csel	x9,x5,x9,ne
+	ldp	x4,x5,[sp,#0+0+32]	// res
+	csel	x10,x6,x10,ne
+	csel	x11,x7,x11,ne
+	cmp	x25,#0			// ~, remember?
+	ldp	x6,x7,[sp,#0+0+48]
+	csel	x14,x8,x14,ne
+	csel	x15,x9,x15,ne
+	ldp	x8,x9,[x23,#0+32]	// in2
+	csel	x16,x10,x16,ne
+	csel	x17,x11,x17,ne
+	ldp	x10,x11,[x23,#0+48]
+	stp	x14,x15,[x21,#0]
+	stp	x16,x17,[x21,#0+16]
+	ldp	x14,x15,[x22,#32]	// in1
+	cmp	x24,#0			// ~, remember?
+	ldp	x16,x17,[x22,#32+16]
+	csel	x8,x4,x8,ne
+	csel	x9,x5,x9,ne
+	ldp	x4,x5,[sp,#0+32+32]	// res
+	csel	x10,x6,x10,ne
+	csel	x11,x7,x11,ne
+	cmp	x25,#0			// ~, remember?
+	ldp	x6,x7,[sp,#0+32+48]
+	csel	x14,x8,x14,ne
+	csel	x15,x9,x15,ne
+	ldp	x8,x9,[x23,#32+32]	// in2
+	csel	x16,x10,x16,ne
+	csel	x17,x11,x17,ne
+	ldp	x10,x11,[x23,#32+48]
+	stp	x14,x15,[x21,#32]
+	stp	x16,x17,[x21,#32+16]
+	ldp	x14,x15,[x22,#64]	// in1
+	cmp	x24,#0			// ~, remember?
+	ldp	x16,x17,[x22,#64+16]
+	csel	x8,x4,x8,ne
+	csel	x9,x5,x9,ne
+	csel	x10,x6,x10,ne
+	csel	x11,x7,x11,ne
+	cmp	x25,#0			// ~, remember?
+	csel	x14,x8,x14,ne
+	csel	x15,x9,x15,ne
+	csel	x16,x10,x16,ne
+	csel	x17,x11,x17,ne
+	stp	x14,x15,[x21,#64]
+	stp	x16,x17,[x21,#64+16]
+
+.Ladd_done:
+	add	sp,x29,#0		// destroy frame
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
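+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_point_add_affine(P256_POINT *r, const P256_POINT *a,
+//                                    const P256_POINT_AFFINE *b);
+// Mixed addition: the second input is affine (Z treated as one, in
+// Montgomery form), so the Z2-dependent field operations drop out.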
+.globl	ecp_nistz256_point_add_affine
+.hidden	ecp_nistz256_point_add_affine
+.type	ecp_nistz256_point_add_affine,%function
+.align	5
+ecp_nistz256_point_add_affine:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-80]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	sub	sp,sp,#32*10
+
+	mov	x21,x0
+	mov	x22,x1
+	mov	x23,x2
+	adrp	x13,.Lpoly
+	add	x13,x13,:lo12:.Lpoly
+	ldr	x12,[x13,#8]
+	ldr	x13,[x13,#24]
+
+	ldp	x4,x5,[x1,#64]	// in1_z
+	ldp	x6,x7,[x1,#64+16]
+	orr	x8,x4,x5
+	orr	x10,x6,x7
+	orr	x24,x8,x10
+	cmp	x24,#0
+	csetm	x24,ne		// ~in1infty
+
+	ldp	x14,x15,[x2]	// in2_x
+	ldp	x16,x17,[x2,#16]
+	ldp	x8,x9,[x2,#32]	// in2_y
+	ldp	x10,x11,[x2,#48]
+	orr	x14,x14,x15
+	orr	x16,x16,x17
+	orr	x8,x8,x9
+	orr	x10,x10,x11
+	orr	x14,x14,x16
+	orr	x8,x8,x10
+	orr	x25,x14,x8
+	cmp	x25,#0
+	csetm	x25,ne		// ~in2infty
+
+	add	x0,sp,#128
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);
+
+	mov	x4,x14
+	mov	x5,x15
+	mov	x6,x16
+	mov	x7,x17
+	ldr	x3,[x23]
+	add	x2,x23,#0
+	add	x0,sp,#96
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);
+
+	add	x2,x22,#0
+	ldr	x3,[x22,#64]	// forward load for p256_mul_mont
+	ldp	x4,x5,[sp,#128]
+	ldp	x6,x7,[sp,#128+16]
+	add	x0,sp,#160
+	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);
+
+	add	x2,x22,#64
+	add	x0,sp,#128
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);
+
+	ldr	x3,[x22,#64]
+	ldp	x4,x5,[sp,#160]
+	ldp	x6,x7,[sp,#160+16]
+	add	x2,x22,#64
+	add	x0,sp,#64
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);
+
+	ldr	x3,[x23,#32]
+	ldp	x4,x5,[sp,#128]
+	ldp	x6,x7,[sp,#128+16]
+	add	x2,x23,#32
+	add	x0,sp,#128
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);
+
+	add	x2,x22,#32
+	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
+	ldp	x6,x7,[sp,#160+16]
+	add	x0,sp,#192
+	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);
+
+	add	x0,sp,#224
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);
+
+	ldp	x4,x5,[sp,#192]
+	ldp	x6,x7,[sp,#192+16]
+	add	x0,sp,#288
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);
+
+	ldr	x3,[sp,#160]
+	ldp	x4,x5,[sp,#224]
+	ldp	x6,x7,[sp,#224+16]
+	add	x2,sp,#160
+	add	x0,sp,#256
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);
+
+	ldr	x3,[x22]
+	ldp	x4,x5,[sp,#224]
+	ldp	x6,x7,[sp,#224+16]
+	add	x2,x22,#0
+	add	x0,sp,#96
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);
+
+	mov	x8,x14
+	mov	x9,x15
+	mov	x10,x16
+	mov	x11,x17
+	add	x0,sp,#224
+	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);
+
+	add	x2,sp,#288
+	add	x0,sp,#0
+	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);
+
+	add	x2,sp,#256
+	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);
+
+	add	x2,sp,#96
+	ldr	x3,[x22,#32]	// forward load for p256_mul_mont
+	ldp	x4,x5,[sp,#256]
+	ldp	x6,x7,[sp,#256+16]
+	add	x0,sp,#32
+	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);
+
+	add	x2,x22,#32
+	add	x0,sp,#128
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);
+
+	ldr	x3,[sp,#192]
+	ldp	x4,x5,[sp,#32]
+	ldp	x6,x7,[sp,#32+16]
+	add	x2,sp,#192
+	add	x0,sp,#32
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);
+
+	add	x2,sp,#128
+	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);
+
+	ldp	x4,x5,[sp,#0]		// res
+	ldp	x6,x7,[sp,#0+16]
+	ldp	x8,x9,[x23]		// in2
+	ldp	x10,x11,[x23,#16]
+	ldp	x14,x15,[x22,#0]	// in1
+	cmp	x24,#0			// ~, remember?
+	ldp	x16,x17,[x22,#0+16]
+	csel	x8,x4,x8,ne
+	csel	x9,x5,x9,ne
+	ldp	x4,x5,[sp,#0+0+32]	// res
+	csel	x10,x6,x10,ne
+	csel	x11,x7,x11,ne
+	cmp	x25,#0			// ~, remember?
+	ldp	x6,x7,[sp,#0+0+48]
+	csel	x14,x8,x14,ne
+	csel	x15,x9,x15,ne
+	ldp	x8,x9,[x23,#0+32]	// in2
+	csel	x16,x10,x16,ne
+	csel	x17,x11,x17,ne
+	ldp	x10,x11,[x23,#0+48]
+	stp	x14,x15,[x21,#0]
+	stp	x16,x17,[x21,#0+16]
+	adrp	x23,.Lone_mont-64
+	add	x23,x23,:lo12:.Lone_mont-64
+	ldp	x14,x15,[x22,#32]	// in1
+	cmp	x24,#0			// ~, remember?
+	ldp	x16,x17,[x22,#32+16]
+	csel	x8,x4,x8,ne
+	csel	x9,x5,x9,ne
+	ldp	x4,x5,[sp,#0+32+32]	// res
+	csel	x10,x6,x10,ne
+	csel	x11,x7,x11,ne
+	cmp	x25,#0			// ~, remember?
+	ldp	x6,x7,[sp,#0+32+48]
+	csel	x14,x8,x14,ne
+	csel	x15,x9,x15,ne
+	ldp	x8,x9,[x23,#32+32]	// in2
+	csel	x16,x10,x16,ne
+	csel	x17,x11,x17,ne
+	ldp	x10,x11,[x23,#32+48]
+	stp	x14,x15,[x21,#32]
+	stp	x16,x17,[x21,#32+16]
+	ldp	x14,x15,[x22,#64]	// in1
+	cmp	x24,#0			// ~, remember?
+	ldp	x16,x17,[x22,#64+16]
+	csel	x8,x4,x8,ne
+	csel	x9,x5,x9,ne
+	csel	x10,x6,x10,ne
+	csel	x11,x7,x11,ne
+	cmp	x25,#0			// ~, remember?
+	csel	x14,x8,x14,ne
+	csel	x15,x9,x15,ne
+	csel	x16,x10,x16,ne
+	csel	x17,x11,x17,ne
+	stp	x14,x15,[x21,#64]
+	stp	x16,x17,[x21,#64+16]
+
+	add	sp,x29,#0		// destroy frame
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x29,x30,[sp],#80
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
+//                                uint64_t b[4]);
+.globl	ecp_nistz256_ord_mul_mont
+.hidden	ecp_nistz256_ord_mul_mont
+.type	ecp_nistz256_ord_mul_mont,%function
+.align	4
+ecp_nistz256_ord_mul_mont:
+	AARCH64_VALID_CALL_TARGET
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	stp	x29,x30,[sp,#-64]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+
+	adrp	x23,.Lord
+	add	x23,x23,:lo12:.Lord
+	ldr	x3,[x2]		// bp[0]
+	ldp	x4,x5,[x1]
+	ldp	x6,x7,[x1,#16]
+
+	ldp	x12,x13,[x23,#0]
+	ldp	x21,x22,[x23,#16]
+	ldr	x23,[x23,#32]
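+	// x12,x13 and x21,x22 now hold the four words of the group order;
+	// x23 holds .LordK, the Montgomery constant -ord^-1 mod 2^64.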
+
+	mul	x14,x4,x3		// a[0]*b[0]
+	umulh	x8,x4,x3
+
+	mul	x15,x5,x3		// a[1]*b[0]
+	umulh	x9,x5,x3
+
+	mul	x16,x6,x3		// a[2]*b[0]
+	umulh	x10,x6,x3
+
+	mul	x17,x7,x3		// a[3]*b[0]
+	umulh	x19,x7,x3
+
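+	// First Montgomery reduction factor: x24 = acc[0] * n0 mod 2^64.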
+	mul	x24,x14,x23
+
+	adds	x15,x15,x8		// accumulate high parts of multiplication
+	adcs	x16,x16,x9
+	adcs	x17,x17,x10
+	adc	x19,x19,xzr
+	mov	x20,xzr
+	ldr	x3,[x2,#8*1]		// b[i]
+
+	lsl	x8,x24,#32
+	subs	x16,x16,x24
+	lsr	x9,x24,#32
+	sbcs	x17,x17,x8
+	sbcs	x19,x19,x9
+	sbc	x20,x20,xzr
+
+	subs	xzr,x14,#1
+	umulh	x9,x12,x24
+	mul	x10,x13,x24
+	umulh	x11,x13,x24
+
+	adcs	x10,x10,x9
+	mul	x8,x4,x3
+	adc	x11,x11,xzr
+	mul	x9,x5,x3
+
+	adds	x14,x15,x10
+	mul	x10,x6,x3
+	adcs	x15,x16,x11
+	mul	x11,x7,x3
+	adcs	x16,x17,x24
+	adcs	x17,x19,x24
+	adc	x19,x20,xzr
+
+	adds	x14,x14,x8		// accumulate low parts
+	umulh	x8,x4,x3
+	adcs	x15,x15,x9
+	umulh	x9,x5,x3
+	adcs	x16,x16,x10
+	umulh	x10,x6,x3
+	adcs	x17,x17,x11
+	umulh	x11,x7,x3
+	adc	x19,x19,xzr
+	mul	x24,x14,x23
+	adds	x15,x15,x8		// accumulate high parts
+	adcs	x16,x16,x9
+	adcs	x17,x17,x10
+	adcs	x19,x19,x11
+	adc	x20,xzr,xzr
+	ldr	x3,[x2,#8*2]		// b[i]
+
+	lsl	x8,x24,#32
+	subs	x16,x16,x24
+	lsr	x9,x24,#32
+	sbcs	x17,x17,x8
+	sbcs	x19,x19,x9
+	sbc	x20,x20,xzr
+
+	subs	xzr,x14,#1
+	umulh	x9,x12,x24
+	mul	x10,x13,x24
+	umulh	x11,x13,x24
+
+	adcs	x10,x10,x9
+	mul	x8,x4,x3
+	adc	x11,x11,xzr
+	mul	x9,x5,x3
+
+	adds	x14,x15,x10
+	mul	x10,x6,x3
+	adcs	x15,x16,x11
+	mul	x11,x7,x3
+	adcs	x16,x17,x24
+	adcs	x17,x19,x24
+	adc	x19,x20,xzr
+
+	adds	x14,x14,x8		// accumulate low parts
+	umulh	x8,x4,x3
+	adcs	x15,x15,x9
+	umulh	x9,x5,x3
+	adcs	x16,x16,x10
+	umulh	x10,x6,x3
+	adcs	x17,x17,x11
+	umulh	x11,x7,x3
+	adc	x19,x19,xzr
+	mul	x24,x14,x23
+	adds	x15,x15,x8		// accumulate high parts
+	adcs	x16,x16,x9
+	adcs	x17,x17,x10
+	adcs	x19,x19,x11
+	adc	x20,xzr,xzr
+	ldr	x3,[x2,#8*3]		// b[i]
+
+	lsl	x8,x24,#32
+	subs	x16,x16,x24
+	lsr	x9,x24,#32
+	sbcs	x17,x17,x8
+	sbcs	x19,x19,x9
+	sbc	x20,x20,xzr
+
+	subs	xzr,x14,#1
+	umulh	x9,x12,x24
+	mul	x10,x13,x24
+	umulh	x11,x13,x24
+
+	adcs	x10,x10,x9
+	mul	x8,x4,x3
+	adc	x11,x11,xzr
+	mul	x9,x5,x3
+
+	adds	x14,x15,x10
+	mul	x10,x6,x3
+	adcs	x15,x16,x11
+	mul	x11,x7,x3
+	adcs	x16,x17,x24
+	adcs	x17,x19,x24
+	adc	x19,x20,xzr
+
+	adds	x14,x14,x8		// accumulate low parts
+	umulh	x8,x4,x3
+	adcs	x15,x15,x9
+	umulh	x9,x5,x3
+	adcs	x16,x16,x10
+	umulh	x10,x6,x3
+	adcs	x17,x17,x11
+	umulh	x11,x7,x3
+	adc	x19,x19,xzr
+	mul	x24,x14,x23
+	adds	x15,x15,x8		// accumulate high parts
+	adcs	x16,x16,x9
+	adcs	x17,x17,x10
+	adcs	x19,x19,x11
+	adc	x20,xzr,xzr
+	lsl	x8,x24,#32		// last reduction
+	subs	x16,x16,x24
+	lsr	x9,x24,#32
+	sbcs	x17,x17,x8
+	sbcs	x19,x19,x9
+	sbc	x20,x20,xzr
+
+	subs	xzr,x14,#1
+	umulh	x9,x12,x24
+	mul	x10,x13,x24
+	umulh	x11,x13,x24
+
+	adcs	x10,x10,x9
+	adc	x11,x11,xzr
+
+	adds	x14,x15,x10
+	adcs	x15,x16,x11
+	adcs	x16,x17,x24
+	adcs	x17,x19,x24
+	adc	x19,x20,xzr
+
+	subs	x8,x14,x12		// ret -= modulus
+	sbcs	x9,x15,x13
+	sbcs	x10,x16,x21
+	sbcs	x11,x17,x22
+	sbcs	xzr,x19,xzr
+
+	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
+	csel	x15,x15,x9,lo
+	csel	x16,x16,x10,lo
+	stp	x14,x15,[x0]
+	csel	x17,x17,x11,lo
+	stp	x16,x17,[x0,#16]
+
+	ldp	x19,x20,[sp,#16]
+	ldp	x21,x22,[sp,#32]
+	ldp	x23,x24,[sp,#48]
+	ldr	x29,[sp],#64
+	ret
+.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
+//                                uint64_t rep);
+.globl	ecp_nistz256_ord_sqr_mont
+.hidden	ecp_nistz256_ord_sqr_mont
+.type	ecp_nistz256_ord_sqr_mont,%function
+.align	4
+ecp_nistz256_ord_sqr_mont:
+	AARCH64_VALID_CALL_TARGET
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	stp	x29,x30,[sp,#-64]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+
+	adrp	x23,.Lord
+	add	x23,x23,:lo12:.Lord
+	ldp	x4,x5,[x1]
+	ldp	x6,x7,[x1,#16]
+
+	ldp	x12,x13,[x23,#0]
+	ldp	x21,x22,[x23,#16]
+	ldr	x23,[x23,#32]
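+	// x2 holds the repetition count; each loop iteration squares the
+	// value once in Montgomery form, using n0 (x23) for the reduction.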
+	b	.Loop_ord_sqr
+
+.align	4
+.Loop_ord_sqr:
+	sub	x2,x2,#1
+	////////////////////////////////////////////////////////////////
+	//  |  |  |  |  |  |a1*a0|  |
+	//  |  |  |  |  |a2*a0|  |  |
+	//  |  |a3*a2|a3*a0|  |  |  |
+	//  |  |  |  |a2*a1|  |  |  |
+	//  |  |  |a3*a1|  |  |  |  |
+	// *|  |  |  |  |  |  |  | 2|
+	// +|a3*a3|a2*a2|a1*a1|a0*a0|
+	//  |--+--+--+--+--+--+--+--|
+	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where each Ax is one 64-bit word of
+	//  the accumulator.
+	//
+	//  The "can't overflow" notes below mark carries into the high part
+	//  of a multiplication result, which can't overflow because a high
+	//  part can never be all ones.
+
+	mul	x15,x5,x4		// a[1]*a[0]
+	umulh	x9,x5,x4
+	mul	x16,x6,x4		// a[2]*a[0]
+	umulh	x10,x6,x4
+	mul	x17,x7,x4		// a[3]*a[0]
+	umulh	x19,x7,x4
+
+	adds	x16,x16,x9		// accumulate high parts of multiplication
+	mul	x8,x6,x5		// a[2]*a[1]
+	umulh	x9,x6,x5
+	adcs	x17,x17,x10
+	mul	x10,x7,x5		// a[3]*a[1]
+	umulh	x11,x7,x5
+	adc	x19,x19,xzr		// can't overflow
+
+	mul	x20,x7,x6		// a[3]*a[2]
+	umulh	x1,x7,x6
+
+	adds	x9,x9,x10		// accumulate high parts of multiplication
+	mul	x14,x4,x4		// a[0]*a[0]
+	adc	x10,x11,xzr		// can't overflow
+
+	adds	x17,x17,x8		// accumulate low parts of multiplication
+	umulh	x4,x4,x4
+	adcs	x19,x19,x9
+	mul	x9,x5,x5		// a[1]*a[1]
+	adcs	x20,x20,x10
+	umulh	x5,x5,x5
+	adc	x1,x1,xzr		// can't overflow
+
+	adds	x15,x15,x15	// acc[1-6]*=2
+	mul	x10,x6,x6		// a[2]*a[2]
+	adcs	x16,x16,x16
+	umulh	x6,x6,x6
+	adcs	x17,x17,x17
+	mul	x11,x7,x7		// a[3]*a[3]
+	adcs	x19,x19,x19
+	umulh	x7,x7,x7
+	adcs	x20,x20,x20
+	adcs	x1,x1,x1
+	adc	x3,xzr,xzr
+
+	adds	x15,x15,x4		// +a[i]*a[i]
+	mul	x24,x14,x23
+	adcs	x16,x16,x9
+	adcs	x17,x17,x5
+	adcs	x19,x19,x10
+	adcs	x20,x20,x6
+	adcs	x1,x1,x11
+	adc	x3,x3,x7
+	subs	xzr,x14,#1
+	umulh	x9,x12,x24
+	mul	x10,x13,x24
+	umulh	x11,x13,x24
+
+	adcs	x10,x10,x9
+	adc	x11,x11,xzr
+
+	adds	x14,x15,x10
+	adcs	x15,x16,x11
+	adcs	x16,x17,x24
+	adc	x17,xzr,x24		// can't overflow
+	mul	x11,x14,x23
+	lsl	x8,x24,#32
+	subs	x15,x15,x24
+	lsr	x9,x24,#32
+	sbcs	x16,x16,x8
+	sbc	x17,x17,x9		// can't borrow
+	subs	xzr,x14,#1
+	umulh	x9,x12,x11
+	mul	x10,x13,x11
+	umulh	x24,x13,x11
+
+	adcs	x10,x10,x9
+	adc	x24,x24,xzr
+
+	adds	x14,x15,x10
+	adcs	x15,x16,x24
+	adcs	x16,x17,x11
+	adc	x17,xzr,x11		// can't overflow
+	mul	x24,x14,x23
+	lsl	x8,x11,#32
+	subs	x15,x15,x11
+	lsr	x9,x11,#32
+	sbcs	x16,x16,x8
+	sbc	x17,x17,x9		// can't borrow
+	subs	xzr,x14,#1
+	umulh	x9,x12,x24
+	mul	x10,x13,x24
+	umulh	x11,x13,x24
+
+	adcs	x10,x10,x9
+	adc	x11,x11,xzr
+
+	adds	x14,x15,x10
+	adcs	x15,x16,x11
+	adcs	x16,x17,x24
+	adc	x17,xzr,x24		// can't overflow
+	mul	x11,x14,x23
+	lsl	x8,x24,#32
+	subs	x15,x15,x24
+	lsr	x9,x24,#32
+	sbcs	x16,x16,x8
+	sbc	x17,x17,x9		// can't borrow
+	subs	xzr,x14,#1
+	umulh	x9,x12,x11
+	mul	x10,x13,x11
+	umulh	x24,x13,x11
+
+	adcs	x10,x10,x9
+	adc	x24,x24,xzr
+
+	adds	x14,x15,x10
+	adcs	x15,x16,x24
+	adcs	x16,x17,x11
+	adc	x17,xzr,x11		// can't overflow
+	lsl	x8,x11,#32
+	subs	x15,x15,x11
+	lsr	x9,x11,#32
+	sbcs	x16,x16,x8
+	sbc	x17,x17,x9		// can't borrow
+	adds	x14,x14,x19	// accumulate upper half
+	adcs	x15,x15,x20
+	adcs	x16,x16,x1
+	adcs	x17,x17,x3
+	adc	x19,xzr,xzr
+
+	subs	x8,x14,x12		// ret -= modulus
+	sbcs	x9,x15,x13
+	sbcs	x10,x16,x21
+	sbcs	x11,x17,x22
+	sbcs	xzr,x19,xzr
+
+	csel	x4,x14,x8,lo	// ret = borrow ? ret : ret-modulus
+	csel	x5,x15,x9,lo
+	csel	x6,x16,x10,lo
+	csel	x7,x17,x11,lo
+
+	cbnz	x2,.Loop_ord_sqr
+
+	stp	x4,x5,[x0]
+	stp	x6,x7,[x0,#16]
+
+	ldp	x19,x20,[sp,#16]
+	ldp	x21,x22,[sp,#32]
+	ldp	x23,x24,[sp,#48]
+	ldr	x29,[sp],#64
+	ret
+.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
+.globl	ecp_nistz256_select_w5
+.hidden	ecp_nistz256_select_w5
+.type	ecp_nistz256_select_w5,%function
+.align	4
+ecp_nistz256_select_w5:
+	AARCH64_VALID_CALL_TARGET
+
+    // x10 := x0
+    // w9 := 0; loop counter and incremented internal index
+	mov	x10, x0
+	mov	w9, #0
+
+    // [v16-v21] := 0
+	movi	v16.16b, #0
+	movi	v17.16b, #0
+	movi	v18.16b, #0
+	movi	v19.16b, #0
+	movi	v20.16b, #0
+	movi	v21.16b, #0
+
+.Lselect_w5_loop:
+    // Loop 16 times.
+
+    // Increment index (loop counter); tested at the end of the loop
+	add	w9, w9, #1
+
+    // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1
+    //  and advance x1 to point to the next entry
+	ld1	{v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64
+
+    // x11 := (w9 == w2)? All 1s : All 0s
+	cmp	w9, w2
+	csetm	x11, eq
+
+    // continue loading ...
+	ld1	{v26.2d, v27.2d}, [x1],#32
+
+    // duplicate mask_64 into Mask (all 0s or all 1s)
+	dup	v3.2d, x11
+
+    // [v16-v21] := (Mask == all 1s)? [v22-v27] : [v16-v21]
+    // i.e., values in the output registers remain unchanged if w9 != w2
+	bit	v16.16b, v22.16b, v3.16b
+	bit	v17.16b, v23.16b, v3.16b
+
+	bit	v18.16b, v24.16b, v3.16b
+	bit	v19.16b, v25.16b, v3.16b
+
+	bit	v20.16b, v26.16b, v3.16b
+	bit	v21.16b, v27.16b, v3.16b
+
+    // If bit #4 is zero (i.e. idx_ctr < 16), loop back
+	tbz	w9, #4, .Lselect_w5_loop
+
+    // Write [v16-v21] to memory at the output pointer
+	st1	{v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64
+	st1	{v20.2d, v21.2d}, [x10]
+
+	ret
+.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
+
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
+.globl	ecp_nistz256_select_w7
+.hidden	ecp_nistz256_select_w7
+.type	ecp_nistz256_select_w7,%function
+.align	4
+ecp_nistz256_select_w7:
+	AARCH64_VALID_CALL_TARGET
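+
+    // As in ecp_nistz256_select_w5, all 64 entries are scanned and
+    // masked, keeping the access pattern independent of the index in w2.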
+
+    // w9 := 0; loop counter and incremented internal index
+	mov	w9, #0
+
+    // [v16-v19] := 0
+	movi	v16.16b, #0
+	movi	v17.16b, #0
+	movi	v18.16b, #0
+	movi	v19.16b, #0
+
+.Lselect_w7_loop:
+    // Loop 64 times.
+
+    // Increment index (loop counter); tested at the end of the loop
+	add	w9, w9, #1
+
+    // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1
+    //  and advance x1 to point to the next entry
+	ld1	{v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64
+
+    // x11 := (w9 == w2)? All 1s : All 0s
+	cmp	w9, w2
+	csetm	x11, eq
+
+    // duplicate mask_64 into Mask (all 0s or all 1s)
+	dup	v3.2d, x11
+
+    // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
+    // i.e., values in output registers will remain the same if w9 != w2
+	bit	v16.16b, v22.16b, v3.16b
+	bit	v17.16b, v23.16b, v3.16b
+
+	bit	v18.16b, v24.16b, v3.16b
+	bit	v19.16b, v25.16b, v3.16b
+
+    // If bit #6 is zero (i.e. idx_ctr < 64), loop back
+	tbz	w9, #6, .Lselect_w7_loop
+
+    // Write [v16-v19] to memory at the output pointer
+	st1	{v16.2d, v17.2d, v18.2d, v19.2d}, [x0]
+
+	ret
+.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/bcm/p256-armv8-asm-win.S b/gen/bcm/p256-armv8-asm-win.S
new file mode 100644
index 0000000..a55d20d
--- /dev/null
+++ b/gen/bcm/p256-armv8-asm-win.S
@@ -0,0 +1,1766 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+#include "openssl/arm_arch.h"
+
+.section	.rodata
+.align	5
+Lpoly:
+.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
+LRR:	//	2^512 mod P precomputed for NIST P256 polynomial
+.quad	0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
+Lone_mont:
+.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
+Lone:
+.quad	1,0,0,0
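+// Lord is the order of the P-256 group; LordK is the corresponding
+// Montgomery reduction constant, -Lord^-1 mod 2^64.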
+Lord:
+.quad	0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
+LordK:
+.quad	0xccd1c8aaee00bc4f
+.byte	69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.text
+
+// void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
+//					     const BN_ULONG x2[4]);
+.globl	ecp_nistz256_mul_mont
+
+.def ecp_nistz256_mul_mont
+   .type 32
+.endef
+.align	4
+ecp_nistz256_mul_mont:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-32]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+
+	ldr	x3,[x2]		// bp[0]
+	ldp	x4,x5,[x1]
+	ldp	x6,x7,[x1,#16]
+	adrp	x13,Lpoly
+	add	x13,x13,:lo12:Lpoly
+	ldr	x12,[x13,#8]
+	ldr	x13,[x13,#24]
+
+	bl	__ecp_nistz256_mul_mont
+
+	ldp	x19,x20,[sp,#16]
+	ldp	x29,x30,[sp],#32
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+// void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl	ecp_nistz256_sqr_mont
+
+.def ecp_nistz256_sqr_mont
+   .type 32
+.endef
+.align	4
+ecp_nistz256_sqr_mont:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-32]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+
+	ldp	x4,x5,[x1]
+	ldp	x6,x7,[x1,#16]
+	adrp	x13,Lpoly
+	add	x13,x13,:lo12:Lpoly
+	ldr	x12,[x13,#8]
+	ldr	x13,[x13,#24]
+
+	bl	__ecp_nistz256_sqr_mont
+
+	ldp	x19,x20,[sp,#16]
+	ldp	x29,x30,[sp],#32
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+// void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl	ecp_nistz256_div_by_2
+
+.def ecp_nistz256_div_by_2
+   .type 32
+.endef
+.align	4
+ecp_nistz256_div_by_2:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ldp	x14,x15,[x1]
+	ldp	x16,x17,[x1,#16]
+	adrp	x13,Lpoly
+	add	x13,x13,:lo12:Lpoly
+	ldr	x12,[x13,#8]
+	ldr	x13,[x13,#24]
+
+	bl	__ecp_nistz256_div_by_2
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+// void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl	ecp_nistz256_mul_by_2
+
+.def ecp_nistz256_mul_by_2
+   .type 32
+.endef
+.align	4
+ecp_nistz256_mul_by_2:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ldp	x14,x15,[x1]
+	ldp	x16,x17,[x1,#16]
+	adrp	x13,Lpoly
+	add	x13,x13,:lo12:Lpoly
+	ldr	x12,[x13,#8]
+	ldr	x13,[x13,#24]
+	mov	x8,x14
+	mov	x9,x15
+	mov	x10,x16
+	mov	x11,x17
+
+	bl	__ecp_nistz256_add_to	// ret = a+a	// 2*a
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+// void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl	ecp_nistz256_mul_by_3
+
+.def ecp_nistz256_mul_by_3
+   .type 32
+.endef
+.align	4
+ecp_nistz256_mul_by_3:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ldp	x14,x15,[x1]
+	ldp	x16,x17,[x1,#16]
+	adrp	x13,Lpoly
+	add	x13,x13,:lo12:Lpoly
+	ldr	x12,[x13,#8]
+	ldr	x13,[x13,#24]
+	mov	x8,x14
+	mov	x9,x15
+	mov	x10,x16
+	mov	x11,x17
+	mov	x4,x14
+	mov	x5,x15
+	mov	x6,x16
+	mov	x7,x17
+
+	bl	__ecp_nistz256_add_to	// ret = a+a	// 2*a
+
+	mov	x8,x4
+	mov	x9,x5
+	mov	x10,x6
+	mov	x11,x7
+
+	bl	__ecp_nistz256_add_to	// ret += a	// 2*a+a=3*a
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+// void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
+//				        const BN_ULONG x2[4]);
+.globl	ecp_nistz256_sub
+
+.def ecp_nistz256_sub
+   .type 32
+.endef
+.align	4
+ecp_nistz256_sub:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ldp	x14,x15,[x1]
+	ldp	x16,x17,[x1,#16]
+	adrp	x13,Lpoly
+	add	x13,x13,:lo12:Lpoly
+	ldr	x12,[x13,#8]
+	ldr	x13,[x13,#24]
+
+	bl	__ecp_nistz256_sub_from
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+// void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl	ecp_nistz256_neg
+
+.def ecp_nistz256_neg
+   .type 32
+.endef
+.align	4
+ecp_nistz256_neg:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	mov	x2,x1
+	mov	x14,xzr		// a = 0
+	mov	x15,xzr
+	mov	x16,xzr
+	mov	x17,xzr
+	adrp	x13,Lpoly
+	add	x13,x13,:lo12:Lpoly
+	ldr	x12,[x13,#8]
+	ldr	x13,[x13,#24]
+
+	bl	__ecp_nistz256_sub_from
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
+// to x4-x7 and b[0] - to x3
+.def __ecp_nistz256_mul_mont
+   .type 32
+.endef
+.align	4
+__ecp_nistz256_mul_mont:
+	mul	x14,x4,x3		// a[0]*b[0]
+	umulh	x8,x4,x3
+
+	mul	x15,x5,x3		// a[1]*b[0]
+	umulh	x9,x5,x3
+
+	mul	x16,x6,x3		// a[2]*b[0]
+	umulh	x10,x6,x3
+
+	mul	x17,x7,x3		// a[3]*b[0]
+	umulh	x11,x7,x3
+	ldr	x3,[x2,#8]		// b[1]
+
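+	// The Montgomery reduction is interleaved with the next
+	// multiply-accumulate. Because the P-256 modulus has a sparse word
+	// pattern, acc[0]*modulus is assembled from 32-bit shifts of acc[0]
+	// (the "*0xffff0001" lines) rather than with extra multiplications.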
+	adds	x15,x15,x8		// accumulate high parts of multiplication
+	lsl	x8,x14,#32
+	adcs	x16,x16,x9
+	lsr	x9,x14,#32
+	adcs	x17,x17,x10
+	adc	x19,xzr,x11
+	mov	x20,xzr
+	subs	x10,x14,x8		// "*0xffff0001"
+	sbc	x11,x14,x9
+	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
+	mul	x8,x4,x3		// lo(a[0]*b[i])
+	adcs	x15,x16,x9
+	mul	x9,x5,x3		// lo(a[1]*b[i])
+	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
+	mul	x10,x6,x3		// lo(a[2]*b[i])
+	adcs	x17,x19,x11
+	mul	x11,x7,x3		// lo(a[3]*b[i])
+	adc	x19,x20,xzr
+
+	adds	x14,x14,x8		// accumulate low parts of multiplication
+	umulh	x8,x4,x3		// hi(a[0]*b[i])
+	adcs	x15,x15,x9
+	umulh	x9,x5,x3		// hi(a[1]*b[i])
+	adcs	x16,x16,x10
+	umulh	x10,x6,x3		// hi(a[2]*b[i])
+	adcs	x17,x17,x11
+	umulh	x11,x7,x3		// hi(a[3]*b[i])
+	adc	x19,x19,xzr
+	ldr	x3,[x2,#8*(1+1)]	// b[1+1]
+	adds	x15,x15,x8		// accumulate high parts of multiplication
+	lsl	x8,x14,#32
+	adcs	x16,x16,x9
+	lsr	x9,x14,#32
+	adcs	x17,x17,x10
+	adcs	x19,x19,x11
+	adc	x20,xzr,xzr
+	subs	x10,x14,x8		// "*0xffff0001"
+	sbc	x11,x14,x9
+	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
+	mul	x8,x4,x3		// lo(a[0]*b[i])
+	adcs	x15,x16,x9
+	mul	x9,x5,x3		// lo(a[1]*b[i])
+	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
+	mul	x10,x6,x3		// lo(a[2]*b[i])
+	adcs	x17,x19,x11
+	mul	x11,x7,x3		// lo(a[3]*b[i])
+	adc	x19,x20,xzr
+
+	adds	x14,x14,x8		// accumulate low parts of multiplication
+	umulh	x8,x4,x3		// hi(a[0]*b[i])
+	adcs	x15,x15,x9
+	umulh	x9,x5,x3		// hi(a[1]*b[i])
+	adcs	x16,x16,x10
+	umulh	x10,x6,x3		// hi(a[2]*b[i])
+	adcs	x17,x17,x11
+	umulh	x11,x7,x3		// hi(a[3]*b[i])
+	adc	x19,x19,xzr
+	ldr	x3,[x2,#8*(2+1)]	// b[2+1]
+	adds	x15,x15,x8		// accumulate high parts of multiplication
+	lsl	x8,x14,#32
+	adcs	x16,x16,x9
+	lsr	x9,x14,#32
+	adcs	x17,x17,x10
+	adcs	x19,x19,x11
+	adc	x20,xzr,xzr
+	subs	x10,x14,x8		// "*0xffff0001"
+	sbc	x11,x14,x9
+	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
+	mul	x8,x4,x3		// lo(a[0]*b[i])
+	adcs	x15,x16,x9
+	mul	x9,x5,x3		// lo(a[1]*b[i])
+	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
+	mul	x10,x6,x3		// lo(a[2]*b[i])
+	adcs	x17,x19,x11
+	mul	x11,x7,x3		// lo(a[3]*b[i])
+	adc	x19,x20,xzr
+
+	adds	x14,x14,x8		// accumulate low parts of multiplication
+	umulh	x8,x4,x3		// hi(a[0]*b[i])
+	adcs	x15,x15,x9
+	umulh	x9,x5,x3		// hi(a[1]*b[i])
+	adcs	x16,x16,x10
+	umulh	x10,x6,x3		// hi(a[2]*b[i])
+	adcs	x17,x17,x11
+	umulh	x11,x7,x3		// hi(a[3]*b[i])
+	adc	x19,x19,xzr
+	adds	x15,x15,x8		// accumulate high parts of multiplication
+	lsl	x8,x14,#32
+	adcs	x16,x16,x9
+	lsr	x9,x14,#32
+	adcs	x17,x17,x10
+	adcs	x19,x19,x11
+	adc	x20,xzr,xzr
+	// last reduction
+	subs	x10,x14,x8		// "*0xffff0001"
+	sbc	x11,x14,x9
+	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
+	adcs	x15,x16,x9
+	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
+	adcs	x17,x19,x11
+	adc	x19,x20,xzr
+
+	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
+	sbcs	x9,x15,x12
+	sbcs	x10,x16,xzr
+	sbcs	x11,x17,x13
+	sbcs	xzr,x19,xzr		// did it borrow?
+
+	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
+	csel	x15,x15,x9,lo
+	csel	x16,x16,x10,lo
+	stp	x14,x15,[x0]
+	csel	x17,x17,x11,lo
+	stp	x16,x17,[x0,#16]
+
+	ret
+
+
+// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
+// to x4-x7
+.def __ecp_nistz256_sqr_mont
+   .type 32
+.endef
+.align	4
+__ecp_nistz256_sqr_mont:
+	//  |  |  |  |  |  |a1*a0|  |
+	//  |  |  |  |  |a2*a0|  |  |
+	//  |  |a3*a2|a3*a0|  |  |  |
+	//  |  |  |  |a2*a1|  |  |  |
+	//  |  |  |a3*a1|  |  |  |  |
+	// *|  |  |  |  |  |  |  | 2|
+	// +|a3*a3|a2*a2|a1*a1|a0*a0|
+	//  |--+--+--+--+--+--+--+--|
+	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where each Ax is one 64-bit word of
+	//  the accumulator.
+	//
+	//  The "can't overflow" notes below mark carries into the high part
+	//  of a multiplication result, which can't overflow because a high
+	//  part can never be all ones.
+
+	mul	x15,x5,x4		// a[1]*a[0]
+	umulh	x9,x5,x4
+	mul	x16,x6,x4		// a[2]*a[0]
+	umulh	x10,x6,x4
+	mul	x17,x7,x4		// a[3]*a[0]
+	umulh	x19,x7,x4
+
+	adds	x16,x16,x9		// accumulate high parts of multiplication
+	mul	x8,x6,x5		// a[2]*a[1]
+	umulh	x9,x6,x5
+	adcs	x17,x17,x10
+	mul	x10,x7,x5		// a[3]*a[1]
+	umulh	x11,x7,x5
+	adc	x19,x19,xzr		// can't overflow
+
+	mul	x20,x7,x6		// a[3]*a[2]
+	umulh	x1,x7,x6
+
+	adds	x9,x9,x10		// accumulate high parts of multiplication
+	mul	x14,x4,x4		// a[0]*a[0]
+	adc	x10,x11,xzr		// can't overflow
+
+	adds	x17,x17,x8		// accumulate low parts of multiplication
+	umulh	x4,x4,x4
+	adcs	x19,x19,x9
+	mul	x9,x5,x5		// a[1]*a[1]
+	adcs	x20,x20,x10
+	umulh	x5,x5,x5
+	adc	x1,x1,xzr		// can't overflow
+
+	adds	x15,x15,x15	// acc[1-6]*=2
+	mul	x10,x6,x6		// a[2]*a[2]
+	adcs	x16,x16,x16
+	umulh	x6,x6,x6
+	adcs	x17,x17,x17
+	mul	x11,x7,x7		// a[3]*a[3]
+	adcs	x19,x19,x19
+	umulh	x7,x7,x7
+	adcs	x20,x20,x20
+	adcs	x1,x1,x1
+	adc	x2,xzr,xzr
+
+	adds	x15,x15,x4		// +a[i]*a[i]
+	adcs	x16,x16,x9
+	adcs	x17,x17,x5
+	adcs	x19,x19,x10
+	adcs	x20,x20,x6
+	lsl	x8,x14,#32
+	adcs	x1,x1,x11
+	lsr	x9,x14,#32
+	adc	x2,x2,x7
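+	// Four reduction passes fold acc[0..3] into the result, each using
+	// the same shift-based trick as in __ecp_nistz256_mul_mont.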
+	subs	x10,x14,x8		// "*0xffff0001"
+	sbc	x11,x14,x9
+	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
+	adcs	x15,x16,x9
+	lsl	x8,x14,#32
+	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
+	lsr	x9,x14,#32
+	adc	x17,x11,xzr		// can't overflow
+	subs	x10,x14,x8		// "*0xffff0001"
+	sbc	x11,x14,x9
+	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
+	adcs	x15,x16,x9
+	lsl	x8,x14,#32
+	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
+	lsr	x9,x14,#32
+	adc	x17,x11,xzr		// can't overflow
+	subs	x10,x14,x8		// "*0xffff0001"
+	sbc	x11,x14,x9
+	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
+	adcs	x15,x16,x9
+	lsl	x8,x14,#32
+	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
+	lsr	x9,x14,#32
+	adc	x17,x11,xzr		// can't overflow
+	subs	x10,x14,x8		// "*0xffff0001"
+	sbc	x11,x14,x9
+	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
+	adcs	x15,x16,x9
+	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
+	adc	x17,x11,xzr		// can't overflow
+
+	adds	x14,x14,x19	// accumulate upper half
+	adcs	x15,x15,x20
+	adcs	x16,x16,x1
+	adcs	x17,x17,x2
+	adc	x19,xzr,xzr
+
+	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
+	sbcs	x9,x15,x12
+	sbcs	x10,x16,xzr
+	sbcs	x11,x17,x13
+	sbcs	xzr,x19,xzr		// did it borrow?
+
+	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
+	csel	x15,x15,x9,lo
+	csel	x16,x16,x10,lo
+	stp	x14,x15,[x0]
+	csel	x17,x17,x11,lo
+	stp	x16,x17,[x0,#16]
+
+	ret
+
+
+// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to
+// x4-x7 and x8-x11. This is done because it's used in multiple
+// contexts, e.g. in multiplication by 2 and 3...
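+// The conditional subtraction of the modulus at the end is performed
+// with csel rather than a branch, so the routine runs in constant time.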
+.def __ecp_nistz256_add_to
+   .type 32
+.endef
+.align	4
+__ecp_nistz256_add_to:
+	adds	x14,x14,x8		// ret = a+b
+	adcs	x15,x15,x9
+	adcs	x16,x16,x10
+	adcs	x17,x17,x11
+	adc	x1,xzr,xzr		// zap x1
+
+	adds	x8,x14,#1		// subs	x8,x4,#-1 // tmp = ret-modulus
+	sbcs	x9,x15,x12
+	sbcs	x10,x16,xzr
+	sbcs	x11,x17,x13
+	sbcs	xzr,x1,xzr		// did subtraction borrow?
+
+	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
+	csel	x15,x15,x9,lo
+	csel	x16,x16,x10,lo
+	stp	x14,x15,[x0]
+	csel	x17,x17,x11,lo
+	stp	x16,x17,[x0,#16]
+
+	ret
+
+
+.def __ecp_nistz256_sub_from
+   .type 32
+.endef
+.align	4
+__ecp_nistz256_sub_from:
+	ldp	x8,x9,[x2]
+	ldp	x10,x11,[x2,#16]
+	subs	x14,x14,x8		// ret = a-b
+	sbcs	x15,x15,x9
+	sbcs	x16,x16,x10
+	sbcs	x17,x17,x11
+	sbc	x1,xzr,xzr		// zap x1
+
+	subs	x8,x14,#1		// adds	x8,x4,#-1 // tmp = ret+modulus
+	adcs	x9,x15,x12
+	adcs	x10,x16,xzr
+	adc	x11,x17,x13
+	cmp	x1,xzr			// did subtraction borrow?
+
+	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
+	csel	x15,x15,x9,eq
+	csel	x16,x16,x10,eq
+	stp	x14,x15,[x0]
+	csel	x17,x17,x11,eq
+	stp	x16,x17,[x0,#16]
+
+	ret
+
+
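+// __ecp_nistz256_sub_morf is __ecp_nistz256_sub_from with the operand
+// order reversed ("morf" is "from" backwards): it computes ret = b - a.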
+.def __ecp_nistz256_sub_morf
+   .type 32
+.endef
+.align	4
+__ecp_nistz256_sub_morf:
+	ldp	x8,x9,[x2]
+	ldp	x10,x11,[x2,#16]
+	subs	x14,x8,x14		// ret = b-a
+	sbcs	x15,x9,x15
+	sbcs	x16,x10,x16
+	sbcs	x17,x11,x17
+	sbc	x1,xzr,xzr		// zap x1
+
+	subs	x8,x14,#1		// adds	x8,x4,#-1 // tmp = ret+modulus
+	adcs	x9,x15,x12
+	adcs	x10,x16,xzr
+	adc	x11,x17,x13
+	cmp	x1,xzr			// did subtraction borrow?
+
+	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
+	csel	x15,x15,x9,eq
+	csel	x16,x16,x10,eq
+	stp	x14,x15,[x0]
+	csel	x17,x17,x11,eq
+	stp	x16,x17,[x0,#16]
+
+	ret
+
+
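+// Halves a field element: if the input is odd, the modulus is first
+// added (selected branch-free with csel) to make it even, then the
+// 256-bit value is shifted right by one bit.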
+.def __ecp_nistz256_div_by_2
+   .type 32
+.endef
+.align	4
+__ecp_nistz256_div_by_2:
+	subs	x8,x14,#1		// adds	x8,x4,#-1 // tmp = a+modulus
+	adcs	x9,x15,x12
+	adcs	x10,x16,xzr
+	adcs	x11,x17,x13
+	adc	x1,xzr,xzr		// zap x1
+	tst	x14,#1		// is a even?
+
+	csel	x14,x14,x8,eq	// ret = even ? a : a+modulus
+	csel	x15,x15,x9,eq
+	csel	x16,x16,x10,eq
+	csel	x17,x17,x11,eq
+	csel	x1,xzr,x1,eq
+
+	lsr	x14,x14,#1		// ret >>= 1
+	orr	x14,x14,x15,lsl#63
+	lsr	x15,x15,#1
+	orr	x15,x15,x16,lsl#63
+	lsr	x16,x16,#1
+	orr	x16,x16,x17,lsl#63
+	lsr	x17,x17,#1
+	stp	x14,x15,[x0]
+	orr	x17,x17,x1,lsl#63
+	stp	x16,x17,[x0,#16]
+
+	ret
+
+.globl	ecp_nistz256_point_double
+
+.def ecp_nistz256_point_double
+   .type 32
+.endef
+.align	5
+ecp_nistz256_point_double:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	sub	sp,sp,#32*4
+
+Ldouble_shortcut:
+	ldp	x14,x15,[x1,#32]
+	mov	x21,x0
+	ldp	x16,x17,[x1,#48]
+	mov	x22,x1
+	adrp	x13,Lpoly
+	add	x13,x13,:lo12:Lpoly
+	ldr	x12,[x13,#8]
+	mov	x8,x14
+	ldr	x13,[x13,#24]
+	mov	x9,x15
+	ldp	x4,x5,[x22,#64]	// forward load for p256_sqr_mont
+	mov	x10,x16
+	mov	x11,x17
+	ldp	x6,x7,[x22,#64+16]
+	add	x0,sp,#0
+	bl	__ecp_nistz256_add_to	// p256_mul_by_2(S, in_y);
+
+	add	x0,sp,#64
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);
+
+	ldp	x8,x9,[x22]
+	ldp	x10,x11,[x22,#16]
+	mov	x4,x14		// put Zsqr aside for p256_sub
+	mov	x5,x15
+	mov	x6,x16
+	mov	x7,x17
+	add	x0,sp,#32
+	bl	__ecp_nistz256_add_to	// p256_add(M, Zsqr, in_x);
+
+	add	x2,x22,#0
+	mov	x14,x4		// restore Zsqr
+	mov	x15,x5
+	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
+	mov	x16,x6
+	mov	x17,x7
+	ldp	x6,x7,[sp,#0+16]
+	add	x0,sp,#64
+	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);
+
+	add	x0,sp,#0
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);
+
+	ldr	x3,[x22,#32]
+	ldp	x4,x5,[x22,#64]
+	ldp	x6,x7,[x22,#64+16]
+	add	x2,x22,#32
+	add	x0,sp,#96
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);
+
+	mov	x8,x14
+	mov	x9,x15
+	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
+	mov	x10,x16
+	mov	x11,x17
+	ldp	x6,x7,[sp,#0+16]
+	add	x0,x21,#64
+	bl	__ecp_nistz256_add_to	// p256_mul_by_2(res_z, tmp0);
+
+	add	x0,sp,#96
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);
+
+	ldr	x3,[sp,#64]		// forward load for p256_mul_mont
+	ldp	x4,x5,[sp,#32]
+	ldp	x6,x7,[sp,#32+16]
+	add	x0,x21,#32
+	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);
+
+	add	x2,sp,#64
+	add	x0,sp,#32
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);
+
+	mov	x8,x14		// duplicate M
+	mov	x9,x15
+	mov	x10,x16
+	mov	x11,x17
+	mov	x4,x14		// put M aside
+	mov	x5,x15
+	mov	x6,x16
+	mov	x7,x17
+	add	x0,sp,#32
+	bl	__ecp_nistz256_add_to
+	mov	x8,x4			// restore M
+	mov	x9,x5
+	ldr	x3,[x22]		// forward load for p256_mul_mont
+	mov	x10,x6
+	ldp	x4,x5,[sp,#0]
+	mov	x11,x7
+	ldp	x6,x7,[sp,#0+16]
+	bl	__ecp_nistz256_add_to	// p256_mul_by_3(M, M);
+
+	add	x2,x22,#0
+	add	x0,sp,#0
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);
+
+	mov	x8,x14
+	mov	x9,x15
+	ldp	x4,x5,[sp,#32]	// forward load for p256_sqr_mont
+	mov	x10,x16
+	mov	x11,x17
+	ldp	x6,x7,[sp,#32+16]
+	add	x0,sp,#96
+	bl	__ecp_nistz256_add_to	// p256_mul_by_2(tmp0, S);
+
+	add	x0,x21,#0
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);
+
+	add	x2,sp,#96
+	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);
+
+	add	x2,sp,#0
+	add	x0,sp,#0
+	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);
+
+	ldr	x3,[sp,#32]
+	mov	x4,x14		// copy S
+	mov	x5,x15
+	mov	x6,x16
+	mov	x7,x17
+	add	x2,sp,#32
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);
+
+	add	x2,x21,#32
+	add	x0,x21,#32
+	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);
+
+	add	sp,x29,#0		// destroy frame
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.globl	ecp_nistz256_point_add
+
+.def ecp_nistz256_point_add
+   .type 32
+.endef
+.align	5
+ecp_nistz256_point_add:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#32*12
+
+	ldp	x4,x5,[x2,#64]	// in2_z
+	ldp	x6,x7,[x2,#64+16]
+	mov	x21,x0
+	mov	x22,x1
+	mov	x23,x2
+	adrp	x13,Lpoly
+	add	x13,x13,:lo12:Lpoly
+	ldr	x12,[x13,#8]
+	ldr	x13,[x13,#24]
+	orr	x8,x4,x5
+	orr	x10,x6,x7
+	orr	x25,x8,x10
+	cmp	x25,#0
+	csetm	x25,ne		// ~in2infty
+	add	x0,sp,#192
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z2sqr, in2_z);
+
+	ldp	x4,x5,[x22,#64]	// in1_z
+	ldp	x6,x7,[x22,#64+16]
+	orr	x8,x4,x5
+	orr	x10,x6,x7
+	orr	x24,x8,x10
+	cmp	x24,#0
+	csetm	x24,ne		// ~in1infty
+	add	x0,sp,#128
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);
+
+	ldr	x3,[x23,#64]
+	ldp	x4,x5,[sp,#192]
+	ldp	x6,x7,[sp,#192+16]
+	add	x2,x23,#64
+	add	x0,sp,#320
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, Z2sqr, in2_z);
+
+	ldr	x3,[x22,#64]
+	ldp	x4,x5,[sp,#128]
+	ldp	x6,x7,[sp,#128+16]
+	add	x2,x22,#64
+	add	x0,sp,#352
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);
+
+	ldr	x3,[x22,#32]
+	ldp	x4,x5,[sp,#320]
+	ldp	x6,x7,[sp,#320+16]
+	add	x2,x22,#32
+	add	x0,sp,#320
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, S1, in1_y);
+
+	ldr	x3,[x23,#32]
+	ldp	x4,x5,[sp,#352]
+	ldp	x6,x7,[sp,#352+16]
+	add	x2,x23,#32
+	add	x0,sp,#352
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);
+
+	add	x2,sp,#320
+	ldr	x3,[sp,#192]	// forward load for p256_mul_mont
+	ldp	x4,x5,[x22]
+	ldp	x6,x7,[x22,#16]
+	add	x0,sp,#160
+	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, S1);
+
+	orr	x14,x14,x15	// see if result is zero
+	orr	x16,x16,x17
+	orr	x26,x14,x16	// ~is_equal(S1,S2)
+
+	add	x2,sp,#192
+	add	x0,sp,#256
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U1, in1_x, Z2sqr);
+
+	ldr	x3,[sp,#128]
+	ldp	x4,x5,[x23]
+	ldp	x6,x7,[x23,#16]
+	add	x2,sp,#128
+	add	x0,sp,#288
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in2_x, Z1sqr);
+
+	add	x2,sp,#256
+	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
+	ldp	x6,x7,[sp,#160+16]
+	add	x0,sp,#96
+	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, U1);
+
+	orr	x14,x14,x15	// see if result is zero
+	orr	x16,x16,x17
+	orr	x14,x14,x16	// ~is_equal(U1,U2)
+
+	mvn	x27,x24	// -1/0 -> 0/-1
+	mvn	x28,x25	// -1/0 -> 0/-1
+	orr	x14,x14,x27
+	orr	x14,x14,x28
+	orr	x14,x14,x26
+	cbnz	x14,Ladd_proceed	// if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))
+
+Ladd_double:
+	mov	x1,x22
+	mov	x0,x21
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	add	sp,sp,#256	// #256 = #32*(12-4), the difference between the two stack frames
+	b	Ldouble_shortcut
+
+.align	4
+Ladd_proceed:
+	add	x0,sp,#192
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);
+
+	ldr	x3,[x22,#64]
+	ldp	x4,x5,[sp,#96]
+	ldp	x6,x7,[sp,#96+16]
+	add	x2,x22,#64
+	add	x0,sp,#64
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);
+
+	ldp	x4,x5,[sp,#96]
+	ldp	x6,x7,[sp,#96+16]
+	add	x0,sp,#128
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);
+
+	ldr	x3,[x23,#64]
+	ldp	x4,x5,[sp,#64]
+	ldp	x6,x7,[sp,#64+16]
+	add	x2,x23,#64
+	add	x0,sp,#64
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, res_z, in2_z);
+
+	ldr	x3,[sp,#96]
+	ldp	x4,x5,[sp,#128]
+	ldp	x6,x7,[sp,#128+16]
+	add	x2,sp,#96
+	add	x0,sp,#224
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);
+
+	ldr	x3,[sp,#128]
+	ldp	x4,x5,[sp,#256]
+	ldp	x6,x7,[sp,#256+16]
+	add	x2,sp,#128
+	add	x0,sp,#288
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, U1, Hsqr);
+
+	mov	x8,x14
+	mov	x9,x15
+	mov	x10,x16
+	mov	x11,x17
+	add	x0,sp,#128
+	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);
+
+	add	x2,sp,#192
+	add	x0,sp,#0
+	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);
+
+	add	x2,sp,#224
+	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);
+
+	add	x2,sp,#288
+	ldr	x3,[sp,#224]		// forward load for p256_mul_mont
+	ldp	x4,x5,[sp,#320]
+	ldp	x6,x7,[sp,#320+16]
+	add	x0,sp,#32
+	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);
+
+	add	x2,sp,#224
+	add	x0,sp,#352
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S1, Hcub);
+
+	ldr	x3,[sp,#160]
+	ldp	x4,x5,[sp,#32]
+	ldp	x6,x7,[sp,#32+16]
+	add	x2,sp,#160
+	add	x0,sp,#32
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);
+
+	add	x2,sp,#352
+	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);
+
+	ldp	x4,x5,[sp,#0]		// res
+	ldp	x6,x7,[sp,#0+16]
+	ldp	x8,x9,[x23]		// in2
+	ldp	x10,x11,[x23,#16]
+	ldp	x14,x15,[x22,#0]	// in1
+	cmp	x24,#0			// ~, remember?
+	ldp	x16,x17,[x22,#0+16]
+	csel	x8,x4,x8,ne
+	csel	x9,x5,x9,ne
+	ldp	x4,x5,[sp,#0+0+32]	// res
+	csel	x10,x6,x10,ne
+	csel	x11,x7,x11,ne
+	cmp	x25,#0			// ~, remember?
+	ldp	x6,x7,[sp,#0+0+48]
+	csel	x14,x8,x14,ne
+	csel	x15,x9,x15,ne
+	ldp	x8,x9,[x23,#0+32]	// in2
+	csel	x16,x10,x16,ne
+	csel	x17,x11,x17,ne
+	ldp	x10,x11,[x23,#0+48]
+	stp	x14,x15,[x21,#0]
+	stp	x16,x17,[x21,#0+16]
+	ldp	x14,x15,[x22,#32]	// in1
+	cmp	x24,#0			// ~, remember?
+	ldp	x16,x17,[x22,#32+16]
+	csel	x8,x4,x8,ne
+	csel	x9,x5,x9,ne
+	ldp	x4,x5,[sp,#0+32+32]	// res
+	csel	x10,x6,x10,ne
+	csel	x11,x7,x11,ne
+	cmp	x25,#0			// ~, remember?
+	ldp	x6,x7,[sp,#0+32+48]
+	csel	x14,x8,x14,ne
+	csel	x15,x9,x15,ne
+	ldp	x8,x9,[x23,#32+32]	// in2
+	csel	x16,x10,x16,ne
+	csel	x17,x11,x17,ne
+	ldp	x10,x11,[x23,#32+48]
+	stp	x14,x15,[x21,#32]
+	stp	x16,x17,[x21,#32+16]
+	ldp	x14,x15,[x22,#64]	// in1
+	cmp	x24,#0			// ~, remember?
+	ldp	x16,x17,[x22,#64+16]
+	csel	x8,x4,x8,ne
+	csel	x9,x5,x9,ne
+	csel	x10,x6,x10,ne
+	csel	x11,x7,x11,ne
+	cmp	x25,#0			// ~, remember?
+	csel	x14,x8,x14,ne
+	csel	x15,x9,x15,ne
+	csel	x16,x10,x16,ne
+	csel	x17,x11,x17,ne
+	stp	x14,x15,[x21,#64]
+	stp	x16,x17,[x21,#64+16]
+
+Ladd_done:
+	add	sp,x29,#0		// destroy frame
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.globl	ecp_nistz256_point_add_affine
+
+.def ecp_nistz256_point_add_affine
+   .type 32
+.endef
+.align	5
+ecp_nistz256_point_add_affine:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-80]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	sub	sp,sp,#32*10
+
+	mov	x21,x0
+	mov	x22,x1
+	mov	x23,x2
+	adrp	x13,Lpoly
+	add	x13,x13,:lo12:Lpoly
+	ldr	x12,[x13,#8]
+	ldr	x13,[x13,#24]
+
+	ldp	x4,x5,[x1,#64]	// in1_z
+	ldp	x6,x7,[x1,#64+16]
+	orr	x8,x4,x5
+	orr	x10,x6,x7
+	orr	x24,x8,x10
+	cmp	x24,#0
+	csetm	x24,ne		// ~in1infty
+
+	ldp	x14,x15,[x2]	// in2_x
+	ldp	x16,x17,[x2,#16]
+	ldp	x8,x9,[x2,#32]	// in2_y
+	ldp	x10,x11,[x2,#48]
+	orr	x14,x14,x15
+	orr	x16,x16,x17
+	orr	x8,x8,x9
+	orr	x10,x10,x11
+	orr	x14,x14,x16
+	orr	x8,x8,x10
+	orr	x25,x14,x8
+	cmp	x25,#0
+	csetm	x25,ne		// ~in2infty
+
+	add	x0,sp,#128
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);
+
+	mov	x4,x14
+	mov	x5,x15
+	mov	x6,x16
+	mov	x7,x17
+	ldr	x3,[x23]
+	add	x2,x23,#0
+	add	x0,sp,#96
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);
+
+	add	x2,x22,#0
+	ldr	x3,[x22,#64]	// forward load for p256_mul_mont
+	ldp	x4,x5,[sp,#128]
+	ldp	x6,x7,[sp,#128+16]
+	add	x0,sp,#160
+	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);
+
+	add	x2,x22,#64
+	add	x0,sp,#128
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);
+
+	ldr	x3,[x22,#64]
+	ldp	x4,x5,[sp,#160]
+	ldp	x6,x7,[sp,#160+16]
+	add	x2,x22,#64
+	add	x0,sp,#64
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);
+
+	ldr	x3,[x23,#32]
+	ldp	x4,x5,[sp,#128]
+	ldp	x6,x7,[sp,#128+16]
+	add	x2,x23,#32
+	add	x0,sp,#128
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);
+
+	add	x2,x22,#32
+	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
+	ldp	x6,x7,[sp,#160+16]
+	add	x0,sp,#192
+	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);
+
+	add	x0,sp,#224
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);
+
+	ldp	x4,x5,[sp,#192]
+	ldp	x6,x7,[sp,#192+16]
+	add	x0,sp,#288
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);
+
+	ldr	x3,[sp,#160]
+	ldp	x4,x5,[sp,#224]
+	ldp	x6,x7,[sp,#224+16]
+	add	x2,sp,#160
+	add	x0,sp,#256
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);
+
+	ldr	x3,[x22]
+	ldp	x4,x5,[sp,#224]
+	ldp	x6,x7,[sp,#224+16]
+	add	x2,x22,#0
+	add	x0,sp,#96
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);
+
+	mov	x8,x14
+	mov	x9,x15
+	mov	x10,x16
+	mov	x11,x17
+	add	x0,sp,#224
+	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);
+
+	add	x2,sp,#288
+	add	x0,sp,#0
+	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);
+
+	add	x2,sp,#256
+	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);
+
+	add	x2,sp,#96
+	ldr	x3,[x22,#32]	// forward load for p256_mul_mont
+	ldp	x4,x5,[sp,#256]
+	ldp	x6,x7,[sp,#256+16]
+	add	x0,sp,#32
+	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);
+
+	add	x2,x22,#32
+	add	x0,sp,#128
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);
+
+	ldr	x3,[sp,#192]
+	ldp	x4,x5,[sp,#32]
+	ldp	x6,x7,[sp,#32+16]
+	add	x2,sp,#192
+	add	x0,sp,#32
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);
+
+	add	x2,sp,#128
+	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);
+
+	ldp	x4,x5,[sp,#0]		// res
+	ldp	x6,x7,[sp,#0+16]
+	ldp	x8,x9,[x23]		// in2
+	ldp	x10,x11,[x23,#16]
+	ldp	x14,x15,[x22,#0]	// in1
+	cmp	x24,#0			// ~, remember?
+	ldp	x16,x17,[x22,#0+16]
+	csel	x8,x4,x8,ne
+	csel	x9,x5,x9,ne
+	ldp	x4,x5,[sp,#0+0+32]	// res
+	csel	x10,x6,x10,ne
+	csel	x11,x7,x11,ne
+	cmp	x25,#0			// ~, remember?
+	ldp	x6,x7,[sp,#0+0+48]
+	csel	x14,x8,x14,ne
+	csel	x15,x9,x15,ne
+	ldp	x8,x9,[x23,#0+32]	// in2
+	csel	x16,x10,x16,ne
+	csel	x17,x11,x17,ne
+	ldp	x10,x11,[x23,#0+48]
+	stp	x14,x15,[x21,#0]
+	stp	x16,x17,[x21,#0+16]
+	adrp	x23,Lone_mont-64
+	add	x23,x23,:lo12:Lone_mont-64
+	ldp	x14,x15,[x22,#32]	// in1
+	cmp	x24,#0			// ~, remember?
+	ldp	x16,x17,[x22,#32+16]
+	csel	x8,x4,x8,ne
+	csel	x9,x5,x9,ne
+	ldp	x4,x5,[sp,#0+32+32]	// res
+	csel	x10,x6,x10,ne
+	csel	x11,x7,x11,ne
+	cmp	x25,#0			// ~, remember?
+	ldp	x6,x7,[sp,#0+32+48]
+	csel	x14,x8,x14,ne
+	csel	x15,x9,x15,ne
+	ldp	x8,x9,[x23,#32+32]	// in2
+	csel	x16,x10,x16,ne
+	csel	x17,x11,x17,ne
+	ldp	x10,x11,[x23,#32+48]
+	stp	x14,x15,[x21,#32]
+	stp	x16,x17,[x21,#32+16]
+	ldp	x14,x15,[x22,#64]	// in1
+	cmp	x24,#0			// ~, remember?
+	ldp	x16,x17,[x22,#64+16]
+	csel	x8,x4,x8,ne
+	csel	x9,x5,x9,ne
+	csel	x10,x6,x10,ne
+	csel	x11,x7,x11,ne
+	cmp	x25,#0			// ~, remember?
+	csel	x14,x8,x14,ne
+	csel	x15,x9,x15,ne
+	csel	x16,x10,x16,ne
+	csel	x17,x11,x17,ne
+	stp	x14,x15,[x21,#64]
+	stp	x16,x17,[x21,#64+16]
+
+	add	sp,x29,#0		// destroy frame
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x29,x30,[sp],#80
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
+//                                uint64_t b[4]);
+.globl	ecp_nistz256_ord_mul_mont
+
+.def ecp_nistz256_ord_mul_mont
+   .type 32
+.endef
+.align	4
+ecp_nistz256_ord_mul_mont:
+	AARCH64_VALID_CALL_TARGET
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	stp	x29,x30,[sp,#-64]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+
+	adrp	x23,Lord
+	add	x23,x23,:lo12:Lord
+	ldr	x3,[x2]		// bp[0]
+	ldp	x4,x5,[x1]
+	ldp	x6,x7,[x1,#16]
+
+	ldp	x12,x13,[x23,#0]
+	ldp	x21,x22,[x23,#16]
+	ldr	x23,[x23,#32]
+
+	mul	x14,x4,x3		// a[0]*b[0]
+	umulh	x8,x4,x3
+
+	mul	x15,x5,x3		// a[1]*b[0]
+	umulh	x9,x5,x3
+
+	mul	x16,x6,x3		// a[2]*b[0]
+	umulh	x10,x6,x3
+
+	mul	x17,x7,x3		// a[3]*b[0]
+	umulh	x19,x7,x3
+
+	mul	x24,x14,x23
+
+	adds	x15,x15,x8		// accumulate high parts of multiplication
+	adcs	x16,x16,x9
+	adcs	x17,x17,x10
+	adc	x19,x19,xzr
+	mov	x20,xzr
+	ldr	x3,[x2,#8*1]		// b[i]
+
+	lsl	x8,x24,#32
+	subs	x16,x16,x24
+	lsr	x9,x24,#32
+	sbcs	x17,x17,x8
+	sbcs	x19,x19,x9
+	sbc	x20,x20,xzr
+
+	subs	xzr,x14,#1
+	umulh	x9,x12,x24
+	mul	x10,x13,x24
+	umulh	x11,x13,x24
+
+	adcs	x10,x10,x9
+	mul	x8,x4,x3
+	adc	x11,x11,xzr
+	mul	x9,x5,x3
+
+	adds	x14,x15,x10
+	mul	x10,x6,x3
+	adcs	x15,x16,x11
+	mul	x11,x7,x3
+	adcs	x16,x17,x24
+	adcs	x17,x19,x24
+	adc	x19,x20,xzr
+
+	adds	x14,x14,x8		// accumulate low parts
+	umulh	x8,x4,x3
+	adcs	x15,x15,x9
+	umulh	x9,x5,x3
+	adcs	x16,x16,x10
+	umulh	x10,x6,x3
+	adcs	x17,x17,x11
+	umulh	x11,x7,x3
+	adc	x19,x19,xzr
+	mul	x24,x14,x23
+	adds	x15,x15,x8		// accumulate high parts
+	adcs	x16,x16,x9
+	adcs	x17,x17,x10
+	adcs	x19,x19,x11
+	adc	x20,xzr,xzr
+	ldr	x3,[x2,#8*2]		// b[i]
+
+	lsl	x8,x24,#32
+	subs	x16,x16,x24
+	lsr	x9,x24,#32
+	sbcs	x17,x17,x8
+	sbcs	x19,x19,x9
+	sbc	x20,x20,xzr
+
+	subs	xzr,x14,#1
+	umulh	x9,x12,x24
+	mul	x10,x13,x24
+	umulh	x11,x13,x24
+
+	adcs	x10,x10,x9
+	mul	x8,x4,x3
+	adc	x11,x11,xzr
+	mul	x9,x5,x3
+
+	adds	x14,x15,x10
+	mul	x10,x6,x3
+	adcs	x15,x16,x11
+	mul	x11,x7,x3
+	adcs	x16,x17,x24
+	adcs	x17,x19,x24
+	adc	x19,x20,xzr
+
+	adds	x14,x14,x8		// accumulate low parts
+	umulh	x8,x4,x3
+	adcs	x15,x15,x9
+	umulh	x9,x5,x3
+	adcs	x16,x16,x10
+	umulh	x10,x6,x3
+	adcs	x17,x17,x11
+	umulh	x11,x7,x3
+	adc	x19,x19,xzr
+	mul	x24,x14,x23
+	adds	x15,x15,x8		// accumulate high parts
+	adcs	x16,x16,x9
+	adcs	x17,x17,x10
+	adcs	x19,x19,x11
+	adc	x20,xzr,xzr
+	ldr	x3,[x2,#8*3]		// b[i]
+
+	lsl	x8,x24,#32
+	subs	x16,x16,x24
+	lsr	x9,x24,#32
+	sbcs	x17,x17,x8
+	sbcs	x19,x19,x9
+	sbc	x20,x20,xzr
+
+	subs	xzr,x14,#1
+	umulh	x9,x12,x24
+	mul	x10,x13,x24
+	umulh	x11,x13,x24
+
+	adcs	x10,x10,x9
+	mul	x8,x4,x3
+	adc	x11,x11,xzr
+	mul	x9,x5,x3
+
+	adds	x14,x15,x10
+	mul	x10,x6,x3
+	adcs	x15,x16,x11
+	mul	x11,x7,x3
+	adcs	x16,x17,x24
+	adcs	x17,x19,x24
+	adc	x19,x20,xzr
+
+	adds	x14,x14,x8		// accumulate low parts
+	umulh	x8,x4,x3
+	adcs	x15,x15,x9
+	umulh	x9,x5,x3
+	adcs	x16,x16,x10
+	umulh	x10,x6,x3
+	adcs	x17,x17,x11
+	umulh	x11,x7,x3
+	adc	x19,x19,xzr
+	mul	x24,x14,x23
+	adds	x15,x15,x8		// accumulate high parts
+	adcs	x16,x16,x9
+	adcs	x17,x17,x10
+	adcs	x19,x19,x11
+	adc	x20,xzr,xzr
+	lsl	x8,x24,#32		// last reduction
+	subs	x16,x16,x24
+	lsr	x9,x24,#32
+	sbcs	x17,x17,x8
+	sbcs	x19,x19,x9
+	sbc	x20,x20,xzr
+
+	subs	xzr,x14,#1
+	umulh	x9,x12,x24
+	mul	x10,x13,x24
+	umulh	x11,x13,x24
+
+	adcs	x10,x10,x9
+	adc	x11,x11,xzr
+
+	adds	x14,x15,x10
+	adcs	x15,x16,x11
+	adcs	x16,x17,x24
+	adcs	x17,x19,x24
+	adc	x19,x20,xzr
+
+	subs	x8,x14,x12		// ret -= modulus
+	sbcs	x9,x15,x13
+	sbcs	x10,x16,x21
+	sbcs	x11,x17,x22
+	sbcs	xzr,x19,xzr
+
+	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
+	csel	x15,x15,x9,lo
+	csel	x16,x16,x10,lo
+	stp	x14,x15,[x0]
+	csel	x17,x17,x11,lo
+	stp	x16,x17,[x0,#16]
+
+	ldp	x19,x20,[sp,#16]
+	ldp	x21,x22,[sp,#32]
+	ldp	x23,x24,[sp,#48]
+	ldr	x29,[sp],#64
+	ret
+
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
+//                                uint64_t rep);
+.globl	ecp_nistz256_ord_sqr_mont
+
+.def ecp_nistz256_ord_sqr_mont
+   .type 32
+.endef
+.align	4
+ecp_nistz256_ord_sqr_mont:
+	AARCH64_VALID_CALL_TARGET
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	stp	x29,x30,[sp,#-64]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+
+	adrp	x23,Lord
+	add	x23,x23,:lo12:Lord
+	ldp	x4,x5,[x1]
+	ldp	x6,x7,[x1,#16]
+
+	ldp	x12,x13,[x23,#0]
+	ldp	x21,x22,[x23,#16]
+	ldr	x23,[x23,#32]
+	b	Loop_ord_sqr
+
+.align	4
+Loop_ord_sqr:
+	sub	x2,x2,#1
+	////////////////////////////////////////////////////////////////
+	//  |  |  |  |  |  |a1*a0|  |
+	//  |  |  |  |  |a2*a0|  |  |
+	//  |  |a3*a2|a3*a0|  |  |  |
+	//  |  |  |  |a2*a1|  |  |  |
+	//  |  |  |a3*a1|  |  |  |  |
+	// *|  |  |  |  |  |  |  | 2|
+	// +|a3*a3|a2*a2|a1*a1|a0*a0|
+	//  |--+--+--+--+--+--+--+--|
+	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where each Ax is one 64-bit word of
+	//  the accumulator.
+	//
+	//  The "can't overflow" notes below mark carries into the high part
+	//  of a multiplication result, which can't overflow because a high
+	//  part can never be all ones.
+
+	mul	x15,x5,x4		// a[1]*a[0]
+	umulh	x9,x5,x4
+	mul	x16,x6,x4		// a[2]*a[0]
+	umulh	x10,x6,x4
+	mul	x17,x7,x4		// a[3]*a[0]
+	umulh	x19,x7,x4
+
+	adds	x16,x16,x9		// accumulate high parts of multiplication
+	mul	x8,x6,x5		// a[2]*a[1]
+	umulh	x9,x6,x5
+	adcs	x17,x17,x10
+	mul	x10,x7,x5		// a[3]*a[1]
+	umulh	x11,x7,x5
+	adc	x19,x19,xzr		// can't overflow
+
+	mul	x20,x7,x6		// a[3]*a[2]
+	umulh	x1,x7,x6
+
+	adds	x9,x9,x10		// accumulate high parts of multiplication
+	mul	x14,x4,x4		// a[0]*a[0]
+	adc	x10,x11,xzr		// can't overflow
+
+	adds	x17,x17,x8		// accumulate low parts of multiplication
+	umulh	x4,x4,x4
+	adcs	x19,x19,x9
+	mul	x9,x5,x5		// a[1]*a[1]
+	adcs	x20,x20,x10
+	umulh	x5,x5,x5
+	adc	x1,x1,xzr		// can't overflow
+
+	adds	x15,x15,x15	// acc[1-6]*=2
+	mul	x10,x6,x6		// a[2]*a[2]
+	adcs	x16,x16,x16
+	umulh	x6,x6,x6
+	adcs	x17,x17,x17
+	mul	x11,x7,x7		// a[3]*a[3]
+	adcs	x19,x19,x19
+	umulh	x7,x7,x7
+	adcs	x20,x20,x20
+	adcs	x1,x1,x1
+	adc	x3,xzr,xzr
+
+	adds	x15,x15,x4		// +a[i]*a[i]
+	mul	x24,x14,x23
+	adcs	x16,x16,x9
+	adcs	x17,x17,x5
+	adcs	x19,x19,x10
+	adcs	x20,x20,x6
+	adcs	x1,x1,x11
+	adc	x3,x3,x7
+	subs	xzr,x14,#1
+	umulh	x9,x12,x24
+	mul	x10,x13,x24
+	umulh	x11,x13,x24
+
+	adcs	x10,x10,x9
+	adc	x11,x11,xzr
+
+	adds	x14,x15,x10
+	adcs	x15,x16,x11
+	adcs	x16,x17,x24
+	adc	x17,xzr,x24		// can't overflow
+	mul	x11,x14,x23
+	lsl	x8,x24,#32
+	subs	x15,x15,x24
+	lsr	x9,x24,#32
+	sbcs	x16,x16,x8
+	sbc	x17,x17,x9		// can't borrow
+	subs	xzr,x14,#1
+	umulh	x9,x12,x11
+	mul	x10,x13,x11
+	umulh	x24,x13,x11
+
+	adcs	x10,x10,x9
+	adc	x24,x24,xzr
+
+	adds	x14,x15,x10
+	adcs	x15,x16,x24
+	adcs	x16,x17,x11
+	adc	x17,xzr,x11		// can't overflow
+	mul	x24,x14,x23
+	lsl	x8,x11,#32
+	subs	x15,x15,x11
+	lsr	x9,x11,#32
+	sbcs	x16,x16,x8
+	sbc	x17,x17,x9		// can't borrow
+	subs	xzr,x14,#1
+	umulh	x9,x12,x24
+	mul	x10,x13,x24
+	umulh	x11,x13,x24
+
+	adcs	x10,x10,x9
+	adc	x11,x11,xzr
+
+	adds	x14,x15,x10
+	adcs	x15,x16,x11
+	adcs	x16,x17,x24
+	adc	x17,xzr,x24		// can't overflow
+	mul	x11,x14,x23
+	lsl	x8,x24,#32
+	subs	x15,x15,x24
+	lsr	x9,x24,#32
+	sbcs	x16,x16,x8
+	sbc	x17,x17,x9		// can't borrow
+	subs	xzr,x14,#1
+	umulh	x9,x12,x11
+	mul	x10,x13,x11
+	umulh	x24,x13,x11
+
+	adcs	x10,x10,x9
+	adc	x24,x24,xzr
+
+	adds	x14,x15,x10
+	adcs	x15,x16,x24
+	adcs	x16,x17,x11
+	adc	x17,xzr,x11		// can't overflow
+	lsl	x8,x11,#32
+	subs	x15,x15,x11
+	lsr	x9,x11,#32
+	sbcs	x16,x16,x8
+	sbc	x17,x17,x9		// can't borrow
+	adds	x14,x14,x19	// accumulate upper half
+	adcs	x15,x15,x20
+	adcs	x16,x16,x1
+	adcs	x17,x17,x3
+	adc	x19,xzr,xzr
+
+	subs	x8,x14,x12		// ret -= modulus
+	sbcs	x9,x15,x13
+	sbcs	x10,x16,x21
+	sbcs	x11,x17,x22
+	sbcs	xzr,x19,xzr
+
+	csel	x4,x14,x8,lo	// ret = borrow ? ret : ret-modulus
+	csel	x5,x15,x9,lo
+	csel	x6,x16,x10,lo
+	csel	x7,x17,x11,lo
+
+	cbnz	x2,Loop_ord_sqr
+
+	stp	x4,x5,[x0]
+	stp	x6,x7,[x0,#16]
+
+	ldp	x19,x20,[sp,#16]
+	ldp	x21,x22,[sp,#32]
+	ldp	x23,x24,[sp,#48]
+	ldr	x29,[sp],#64
+	ret
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
+.globl	ecp_nistz256_select_w5
+
+.def ecp_nistz256_select_w5
+   .type 32
+.endef
+.align	4
+ecp_nistz256_select_w5:
+	AARCH64_VALID_CALL_TARGET
+
+    // x10 := x0
+    // w9 := 0; loop counter and incremented internal index
+	mov	x10, x0
+	mov	w9, #0
+
+    // [v16-v21] := 0
+	movi	v16.16b, #0
+	movi	v17.16b, #0
+	movi	v18.16b, #0
+	movi	v19.16b, #0
+	movi	v20.16b, #0
+	movi	v21.16b, #0
+
+Lselect_w5_loop:
+    // Loop 16 times.
+
+    // Increment index (loop counter); tested at the end of the loop
+	add	w9, w9, #1
+
+    // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1
+    //  and advance x1 to point to the next entry
+	ld1	{v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64
+
+    // x11 := (w9 == w2)? All 1s : All 0s
+	cmp	w9, w2
+	csetm	x11, eq
+
+    // continue loading ...
+	ld1	{v26.2d, v27.2d}, [x1],#32
+
+    // duplicate mask_64 into Mask (all 0s or all 1s)
+	dup	v3.2d, x11
+
+    // [v16-v21] := (Mask == all 1s)? [v22-v27] : [v16-v21]
+    // i.e., values in output registers will remain the same if w9 != w2
+	bit	v16.16b, v22.16b, v3.16b
+	bit	v17.16b, v23.16b, v3.16b
+
+	bit	v18.16b, v24.16b, v3.16b
+	bit	v19.16b, v25.16b, v3.16b
+
+	bit	v20.16b, v26.16b, v3.16b
+	bit	v21.16b, v27.16b, v3.16b
+
+    // If bit #4 is 0 (i.e. idx_ctr < 16), loop back
+	tbz	w9, #4, Lselect_w5_loop
+
+    // Write [v16-v21] to memory at the output pointer
+	st1	{v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64
+	st1	{v20.2d, v21.2d}, [x10]
+
+	ret
+
+
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
+.globl	ecp_nistz256_select_w7
+
+.def ecp_nistz256_select_w7
+   .type 32
+.endef
+.align	4
+ecp_nistz256_select_w7:
+	AARCH64_VALID_CALL_TARGET
+
+    // w9 := 0; loop counter and incremented internal index
+	mov	w9, #0
+
+    // [v16-v19] := 0
+	movi	v16.16b, #0
+	movi	v17.16b, #0
+	movi	v18.16b, #0
+	movi	v19.16b, #0
+
+Lselect_w7_loop:
+    // Loop 64 times.
+
+    // Increment index (loop counter); tested at the end of the loop
+	add	w9, w9, #1
+
+    // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1
+    //  and advance x1 to point to the next entry
+	ld1	{v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64
+
+    // x11 := (w9 == w2)? All 1s : All 0s
+	cmp	w9, w2
+	csetm	x11, eq
+
+    // duplicate mask_64 into Mask (all 0s or all 1s)
+	dup	v3.2d, x11
+
+    // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
+    // i.e., values in output registers will remain the same if w9 != w2
+	bit	v16.16b, v22.16b, v3.16b
+	bit	v17.16b, v23.16b, v3.16b
+
+	bit	v18.16b, v24.16b, v3.16b
+	bit	v19.16b, v25.16b, v3.16b
+
+    // If bit #6 is 0 (i.e. idx_ctr < 64), loop back
+	tbz	w9, #6, Lselect_w7_loop
+
+    // Write [v16-v19] to memory at the output pointer
+	st1	{v16.2d, v17.2d, v18.2d, v19.2d}, [x0]
+
+	ret
+
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/bcm/p256-x86_64-asm-apple.S b/gen/bcm/p256-x86_64-asm-apple.S
new file mode 100644
index 0000000..81cb582
--- /dev/null
+++ b/gen/bcm/p256-x86_64-asm-apple.S
@@ -0,0 +1,4473 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text	
+
+
+
+.section	__DATA,__const
+.p2align	6
+L$poly:
+.quad	0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
+
+L$One:
+.long	1,1,1,1,1,1,1,1
+L$Two:
+.long	2,2,2,2,2,2,2,2
+L$Three:
+.long	3,3,3,3,3,3,3,3
+L$ONE_mont:
+.quad	0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
+
+
+L$ord:
+.quad	0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
+L$ordK:
+.quad	0xccd1c8aaee00bc4f
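+// L$poly is the field prime p, L$ONE_mont is 1 in Montgomery form
+// (2^256 mod p), L$ord is the group order n, and L$ordK is the Montgomery
+// constant -n^-1 mod 2^64. L$One/L$Two/L$Three are vector constants used
+// by the select routines.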
+.text	
+
+
+
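+// void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
+// res := -a mod p, computed in constant time.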
+.globl	_ecp_nistz256_neg
+.private_extern _ecp_nistz256_neg
+
+.p2align	5
+_ecp_nistz256_neg:
+
+_CET_ENDBR
+	pushq	%r12
+
+	pushq	%r13
+
+L$neg_body:
+
+	xorq	%r8,%r8
+	xorq	%r9,%r9
+	xorq	%r10,%r10
+	xorq	%r11,%r11
+	xorq	%r13,%r13
+
+	subq	0(%rsi),%r8
+	sbbq	8(%rsi),%r9
+	sbbq	16(%rsi),%r10
+	movq	%r8,%rax
+	sbbq	24(%rsi),%r11
+	leaq	L$poly(%rip),%rsi
+	movq	%r9,%rdx
+	sbbq	$0,%r13
+
+	addq	0(%rsi),%r8
+	movq	%r10,%rcx
+	adcq	8(%rsi),%r9
+	adcq	16(%rsi),%r10
+	movq	%r11,%r12
+	adcq	24(%rsi),%r11
+	testq	%r13,%r13
+
+	cmovzq	%rax,%r8
+	cmovzq	%rdx,%r9
+	movq	%r8,0(%rdi)
+	cmovzq	%rcx,%r10
+	movq	%r9,8(%rdi)
+	cmovzq	%r12,%r11
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+
+	movq	0(%rsp),%r13
+
+	movq	8(%rsp),%r12
+
+	leaq	16(%rsp),%rsp
+
+L$neg_epilogue:
+	ret
+
+
+
+
+
+
+
+
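+// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
+// res := a*b*2^-256 mod ord, the Montgomery product modulo the group order.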
+.globl	_ecp_nistz256_ord_mul_mont
+.private_extern _ecp_nistz256_ord_mul_mont
+
+.p2align	5
+_ecp_nistz256_ord_mul_mont:
+
+_CET_ENDBR
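+	// Take the mulx/adcx/adox path when both BMI2 (bit 8) and ADX (bit 19)
+	// are set in OPENSSL_ia32cap_P[2].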
+	leaq	_OPENSSL_ia32cap_P(%rip),%rcx
+	movq	8(%rcx),%rcx
+	andl	$0x80100,%ecx
+	cmpl	$0x80100,%ecx
+	je	L$ecp_nistz256_ord_mul_montx
+	pushq	%rbp
+
+	pushq	%rbx
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+L$ord_mul_body:
+
+	movq	0(%rdx),%rax
+	movq	%rdx,%rbx
+	leaq	L$ord(%rip),%r14
+	movq	L$ordK(%rip),%r15
+
+
+	movq	%rax,%rcx
+	mulq	0(%rsi)
+	movq	%rax,%r8
+	movq	%rcx,%rax
+	movq	%rdx,%r9
+
+	mulq	8(%rsi)
+	addq	%rax,%r9
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	16(%rsi)
+	addq	%rax,%r10
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+
+	movq	%r8,%r13
+	imulq	%r15,%r8
+
+	movq	%rdx,%r11
+	mulq	24(%rsi)
+	addq	%rax,%r11
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+
+
+	mulq	0(%r14)
+	movq	%r8,%rbp
+	addq	%rax,%r13
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	subq	%r8,%r10
+	sbbq	$0,%r8
+
+	mulq	8(%r14)
+	addq	%rcx,%r9
+	adcq	$0,%rdx
+	addq	%rax,%r9
+	movq	%rbp,%rax
+	adcq	%rdx,%r10
+	movq	%rbp,%rdx
+	adcq	$0,%r8
+
+	shlq	$32,%rax
+	shrq	$32,%rdx
+	subq	%rax,%r11
+	movq	8(%rbx),%rax
+	sbbq	%rdx,%rbp
+
+	addq	%r8,%r11
+	adcq	%rbp,%r12
+	adcq	$0,%r13
+
+
+	movq	%rax,%rcx
+	mulq	0(%rsi)
+	addq	%rax,%r9
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	8(%rsi)
+	addq	%rbp,%r10
+	adcq	$0,%rdx
+	addq	%rax,%r10
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	16(%rsi)
+	addq	%rbp,%r11
+	adcq	$0,%rdx
+	addq	%rax,%r11
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+
+	movq	%r9,%rcx
+	imulq	%r15,%r9
+
+	movq	%rdx,%rbp
+	mulq	24(%rsi)
+	addq	%rbp,%r12
+	adcq	$0,%rdx
+	xorq	%r8,%r8
+	addq	%rax,%r12
+	movq	%r9,%rax
+	adcq	%rdx,%r13
+	adcq	$0,%r8
+
+
+	mulq	0(%r14)
+	movq	%r9,%rbp
+	addq	%rax,%rcx
+	movq	%r9,%rax
+	adcq	%rdx,%rcx
+
+	subq	%r9,%r11
+	sbbq	$0,%r9
+
+	mulq	8(%r14)
+	addq	%rcx,%r10
+	adcq	$0,%rdx
+	addq	%rax,%r10
+	movq	%rbp,%rax
+	adcq	%rdx,%r11
+	movq	%rbp,%rdx
+	adcq	$0,%r9
+
+	shlq	$32,%rax
+	shrq	$32,%rdx
+	subq	%rax,%r12
+	movq	16(%rbx),%rax
+	sbbq	%rdx,%rbp
+
+	addq	%r9,%r12
+	adcq	%rbp,%r13
+	adcq	$0,%r8
+
+
+	movq	%rax,%rcx
+	mulq	0(%rsi)
+	addq	%rax,%r10
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	8(%rsi)
+	addq	%rbp,%r11
+	adcq	$0,%rdx
+	addq	%rax,%r11
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	16(%rsi)
+	addq	%rbp,%r12
+	adcq	$0,%rdx
+	addq	%rax,%r12
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+
+	movq	%r10,%rcx
+	imulq	%r15,%r10
+
+	movq	%rdx,%rbp
+	mulq	24(%rsi)
+	addq	%rbp,%r13
+	adcq	$0,%rdx
+	xorq	%r9,%r9
+	addq	%rax,%r13
+	movq	%r10,%rax
+	adcq	%rdx,%r8
+	adcq	$0,%r9
+
+
+	mulq	0(%r14)
+	movq	%r10,%rbp
+	addq	%rax,%rcx
+	movq	%r10,%rax
+	adcq	%rdx,%rcx
+
+	subq	%r10,%r12
+	sbbq	$0,%r10
+
+	mulq	8(%r14)
+	addq	%rcx,%r11
+	adcq	$0,%rdx
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	adcq	%rdx,%r12
+	movq	%rbp,%rdx
+	adcq	$0,%r10
+
+	shlq	$32,%rax
+	shrq	$32,%rdx
+	subq	%rax,%r13
+	movq	24(%rbx),%rax
+	sbbq	%rdx,%rbp
+
+	addq	%r10,%r13
+	adcq	%rbp,%r8
+	adcq	$0,%r9
+
+
+	movq	%rax,%rcx
+	mulq	0(%rsi)
+	addq	%rax,%r11
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	8(%rsi)
+	addq	%rbp,%r12
+	adcq	$0,%rdx
+	addq	%rax,%r12
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	16(%rsi)
+	addq	%rbp,%r13
+	adcq	$0,%rdx
+	addq	%rax,%r13
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+
+	movq	%r11,%rcx
+	imulq	%r15,%r11
+
+	movq	%rdx,%rbp
+	mulq	24(%rsi)
+	addq	%rbp,%r8
+	adcq	$0,%rdx
+	xorq	%r10,%r10
+	addq	%rax,%r8
+	movq	%r11,%rax
+	adcq	%rdx,%r9
+	adcq	$0,%r10
+
+
+	mulq	0(%r14)
+	movq	%r11,%rbp
+	addq	%rax,%rcx
+	movq	%r11,%rax
+	adcq	%rdx,%rcx
+
+	subq	%r11,%r13
+	sbbq	$0,%r11
+
+	mulq	8(%r14)
+	addq	%rcx,%r12
+	adcq	$0,%rdx
+	addq	%rax,%r12
+	movq	%rbp,%rax
+	adcq	%rdx,%r13
+	movq	%rbp,%rdx
+	adcq	$0,%r11
+
+	shlq	$32,%rax
+	shrq	$32,%rdx
+	subq	%rax,%r8
+	sbbq	%rdx,%rbp
+
+	addq	%r11,%r8
+	adcq	%rbp,%r9
+	adcq	$0,%r10
+
+
+	movq	%r12,%rsi
+	subq	0(%r14),%r12
+	movq	%r13,%r11
+	sbbq	8(%r14),%r13
+	movq	%r8,%rcx
+	sbbq	16(%r14),%r8
+	movq	%r9,%rbp
+	sbbq	24(%r14),%r9
+	sbbq	$0,%r10
+
+	cmovcq	%rsi,%r12
+	cmovcq	%r11,%r13
+	cmovcq	%rcx,%r8
+	cmovcq	%rbp,%r9
+
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	movq	0(%rsp),%r15
+
+	movq	8(%rsp),%r14
+
+	movq	16(%rsp),%r13
+
+	movq	24(%rsp),%r12
+
+	movq	32(%rsp),%rbx
+
+	movq	40(%rsp),%rbp
+
+	leaq	48(%rsp),%rsp
+
+L$ord_mul_epilogue:
+	ret
+
+
+
+
+
+
+
+
+
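+// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], uint64_t rep);
+// res := a squared rep times in the Montgomery domain, modulo the group order.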
+.globl	_ecp_nistz256_ord_sqr_mont
+.private_extern _ecp_nistz256_ord_sqr_mont
+
+.p2align	5
+_ecp_nistz256_ord_sqr_mont:
+
+_CET_ENDBR
+	leaq	_OPENSSL_ia32cap_P(%rip),%rcx
+	movq	8(%rcx),%rcx
+	andl	$0x80100,%ecx
+	cmpl	$0x80100,%ecx
+	je	L$ecp_nistz256_ord_sqr_montx
+	pushq	%rbp
+
+	pushq	%rbx
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+L$ord_sqr_body:
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%rax
+	movq	16(%rsi),%r14
+	movq	24(%rsi),%r15
+	leaq	L$ord(%rip),%rsi
+	movq	%rdx,%rbx
+	jmp	L$oop_ord_sqr
+
+.p2align	5
+L$oop_ord_sqr:
+
+	movq	%rax,%rbp
+	mulq	%r8
+	movq	%rax,%r9
+.byte	102,72,15,110,205
+	movq	%r14,%rax
+	movq	%rdx,%r10
+
+	mulq	%r8
+	addq	%rax,%r10
+	movq	%r15,%rax
+.byte	102,73,15,110,214
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%r8
+	addq	%rax,%r11
+	movq	%r15,%rax
+.byte	102,73,15,110,223
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+
+
+	mulq	%r14
+	movq	%rax,%r13
+	movq	%r14,%rax
+	movq	%rdx,%r14
+
+
+	mulq	%rbp
+	addq	%rax,%r11
+	movq	%r15,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+
+	mulq	%rbp
+	addq	%rax,%r12
+	adcq	$0,%rdx
+
+	addq	%r15,%r12
+	adcq	%rdx,%r13
+	adcq	$0,%r14
+
+
+	xorq	%r15,%r15
+	movq	%r8,%rax
+	addq	%r9,%r9
+	adcq	%r10,%r10
+	adcq	%r11,%r11
+	adcq	%r12,%r12
+	adcq	%r13,%r13
+	adcq	%r14,%r14
+	adcq	$0,%r15
+
+
+	mulq	%rax
+	movq	%rax,%r8
+.byte	102,72,15,126,200
+	movq	%rdx,%rbp
+
+	mulq	%rax
+	addq	%rbp,%r9
+	adcq	%rax,%r10
+.byte	102,72,15,126,208
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	%rax
+	addq	%rbp,%r11
+	adcq	%rax,%r12
+.byte	102,72,15,126,216
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	movq	%r8,%rcx
+	imulq	32(%rsi),%r8
+
+	mulq	%rax
+	addq	%rbp,%r13
+	adcq	%rax,%r14
+	movq	0(%rsi),%rax
+	adcq	%rdx,%r15
+
+
+	mulq	%r8
+	movq	%r8,%rbp
+	addq	%rax,%rcx
+	movq	8(%rsi),%rax
+	adcq	%rdx,%rcx
+
+	subq	%r8,%r10
+	sbbq	$0,%rbp
+
+	mulq	%r8
+	addq	%rcx,%r9
+	adcq	$0,%rdx
+	addq	%rax,%r9
+	movq	%r8,%rax
+	adcq	%rdx,%r10
+	movq	%r8,%rdx
+	adcq	$0,%rbp
+
+	movq	%r9,%rcx
+	imulq	32(%rsi),%r9
+
+	shlq	$32,%rax
+	shrq	$32,%rdx
+	subq	%rax,%r11
+	movq	0(%rsi),%rax
+	sbbq	%rdx,%r8
+
+	addq	%rbp,%r11
+	adcq	$0,%r8
+
+
+	mulq	%r9
+	movq	%r9,%rbp
+	addq	%rax,%rcx
+	movq	8(%rsi),%rax
+	adcq	%rdx,%rcx
+
+	subq	%r9,%r11
+	sbbq	$0,%rbp
+
+	mulq	%r9
+	addq	%rcx,%r10
+	adcq	$0,%rdx
+	addq	%rax,%r10
+	movq	%r9,%rax
+	adcq	%rdx,%r11
+	movq	%r9,%rdx
+	adcq	$0,%rbp
+
+	movq	%r10,%rcx
+	imulq	32(%rsi),%r10
+
+	shlq	$32,%rax
+	shrq	$32,%rdx
+	subq	%rax,%r8
+	movq	0(%rsi),%rax
+	sbbq	%rdx,%r9
+
+	addq	%rbp,%r8
+	adcq	$0,%r9
+
+
+	mulq	%r10
+	movq	%r10,%rbp
+	addq	%rax,%rcx
+	movq	8(%rsi),%rax
+	adcq	%rdx,%rcx
+
+	subq	%r10,%r8
+	sbbq	$0,%rbp
+
+	mulq	%r10
+	addq	%rcx,%r11
+	adcq	$0,%rdx
+	addq	%rax,%r11
+	movq	%r10,%rax
+	adcq	%rdx,%r8
+	movq	%r10,%rdx
+	adcq	$0,%rbp
+
+	movq	%r11,%rcx
+	imulq	32(%rsi),%r11
+
+	shlq	$32,%rax
+	shrq	$32,%rdx
+	subq	%rax,%r9
+	movq	0(%rsi),%rax
+	sbbq	%rdx,%r10
+
+	addq	%rbp,%r9
+	adcq	$0,%r10
+
+
+	mulq	%r11
+	movq	%r11,%rbp
+	addq	%rax,%rcx
+	movq	8(%rsi),%rax
+	adcq	%rdx,%rcx
+
+	subq	%r11,%r9
+	sbbq	$0,%rbp
+
+	mulq	%r11
+	addq	%rcx,%r8
+	adcq	$0,%rdx
+	addq	%rax,%r8
+	movq	%r11,%rax
+	adcq	%rdx,%r9
+	movq	%r11,%rdx
+	adcq	$0,%rbp
+
+	shlq	$32,%rax
+	shrq	$32,%rdx
+	subq	%rax,%r10
+	sbbq	%rdx,%r11
+
+	addq	%rbp,%r10
+	adcq	$0,%r11
+
+
+	xorq	%rdx,%rdx
+	addq	%r12,%r8
+	adcq	%r13,%r9
+	movq	%r8,%r12
+	adcq	%r14,%r10
+	adcq	%r15,%r11
+	movq	%r9,%rax
+	adcq	$0,%rdx
+
+
+	subq	0(%rsi),%r8
+	movq	%r10,%r14
+	sbbq	8(%rsi),%r9
+	sbbq	16(%rsi),%r10
+	movq	%r11,%r15
+	sbbq	24(%rsi),%r11
+	sbbq	$0,%rdx
+
+	cmovcq	%r12,%r8
+	cmovncq	%r9,%rax
+	cmovncq	%r10,%r14
+	cmovncq	%r11,%r15
+
+	decq	%rbx
+	jnz	L$oop_ord_sqr
+
+	movq	%r8,0(%rdi)
+	movq	%rax,8(%rdi)
+	pxor	%xmm1,%xmm1
+	movq	%r14,16(%rdi)
+	pxor	%xmm2,%xmm2
+	movq	%r15,24(%rdi)
+	pxor	%xmm3,%xmm3
+
+	movq	0(%rsp),%r15
+
+	movq	8(%rsp),%r14
+
+	movq	16(%rsp),%r13
+
+	movq	24(%rsp),%r12
+
+	movq	32(%rsp),%rbx
+
+	movq	40(%rsp),%rbp
+
+	leaq	48(%rsp),%rsp
+
+L$ord_sqr_epilogue:
+	ret
+
+
+
+
+.p2align	5
+ecp_nistz256_ord_mul_montx:
+
+L$ecp_nistz256_ord_mul_montx:
+	pushq	%rbp
+
+	pushq	%rbx
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+L$ord_mulx_body:
+
+	movq	%rdx,%rbx
+	movq	0(%rdx),%rdx
+	movq	0(%rsi),%r9
+	movq	8(%rsi),%r10
+	movq	16(%rsi),%r11
+	movq	24(%rsi),%r12
+	leaq	-128(%rsi),%rsi
+	leaq	L$ord-128(%rip),%r14
+	movq	L$ordK(%rip),%r15
+
+
+	mulxq	%r9,%r8,%r9
+	mulxq	%r10,%rcx,%r10
+	mulxq	%r11,%rbp,%r11
+	addq	%rcx,%r9
+	mulxq	%r12,%rcx,%r12
+	movq	%r8,%rdx
+	mulxq	%r15,%rdx,%rax
+	adcq	%rbp,%r10
+	adcq	%rcx,%r11
+	adcq	$0,%r12
+
+
+	xorq	%r13,%r13
+	mulxq	0+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r8
+	adoxq	%rbp,%r9
+
+	mulxq	8+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r9
+	adoxq	%rbp,%r10
+
+	mulxq	16+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	24+128(%r14),%rcx,%rbp
+	movq	8(%rbx),%rdx
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+	adcxq	%r8,%r12
+	adoxq	%r8,%r13
+	adcq	$0,%r13
+
+
+	mulxq	0+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r9
+	adoxq	%rbp,%r10
+
+	mulxq	8+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	16+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	24+128(%rsi),%rcx,%rbp
+	movq	%r9,%rdx
+	mulxq	%r15,%rdx,%rax
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+
+	adcxq	%r8,%r13
+	adoxq	%r8,%r8
+	adcq	$0,%r8
+
+
+	mulxq	0+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r9
+	adoxq	%rbp,%r10
+
+	mulxq	8+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	16+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	24+128(%r14),%rcx,%rbp
+	movq	16(%rbx),%rdx
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+	adcxq	%r9,%r13
+	adoxq	%r9,%r8
+	adcq	$0,%r8
+
+
+	mulxq	0+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	8+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	16+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	24+128(%rsi),%rcx,%rbp
+	movq	%r10,%rdx
+	mulxq	%r15,%rdx,%rax
+	adcxq	%rcx,%r13
+	adoxq	%rbp,%r8
+
+	adcxq	%r9,%r8
+	adoxq	%r9,%r9
+	adcq	$0,%r9
+
+
+	mulxq	0+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	8+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	16+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	24+128(%r14),%rcx,%rbp
+	movq	24(%rbx),%rdx
+	adcxq	%rcx,%r13
+	adoxq	%rbp,%r8
+	adcxq	%r10,%r8
+	adoxq	%r10,%r9
+	adcq	$0,%r9
+
+
+	mulxq	0+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	8+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	16+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r13
+	adoxq	%rbp,%r8
+
+	mulxq	24+128(%rsi),%rcx,%rbp
+	movq	%r11,%rdx
+	mulxq	%r15,%rdx,%rax
+	adcxq	%rcx,%r8
+	adoxq	%rbp,%r9
+
+	adcxq	%r10,%r9
+	adoxq	%r10,%r10
+	adcq	$0,%r10
+
+
+	mulxq	0+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	8+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	16+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r13
+	adoxq	%rbp,%r8
+
+	mulxq	24+128(%r14),%rcx,%rbp
+	leaq	128(%r14),%r14
+	movq	%r12,%rbx
+	adcxq	%rcx,%r8
+	adoxq	%rbp,%r9
+	movq	%r13,%rdx
+	adcxq	%r11,%r9
+	adoxq	%r11,%r10
+	adcq	$0,%r10
+
+
+
+	movq	%r8,%rcx
+	subq	0(%r14),%r12
+	sbbq	8(%r14),%r13
+	sbbq	16(%r14),%r8
+	movq	%r9,%rbp
+	sbbq	24(%r14),%r9
+	sbbq	$0,%r10
+
+	cmovcq	%rbx,%r12
+	cmovcq	%rdx,%r13
+	cmovcq	%rcx,%r8
+	cmovcq	%rbp,%r9
+
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	movq	0(%rsp),%r15
+
+	movq	8(%rsp),%r14
+
+	movq	16(%rsp),%r13
+
+	movq	24(%rsp),%r12
+
+	movq	32(%rsp),%rbx
+
+	movq	40(%rsp),%rbp
+
+	leaq	48(%rsp),%rsp
+
+L$ord_mulx_epilogue:
+	ret
+
+
+
+
+.p2align	5
+ecp_nistz256_ord_sqr_montx:
+
+L$ecp_nistz256_ord_sqr_montx:
+	pushq	%rbp
+
+	pushq	%rbx
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+L$ord_sqrx_body:
+
+	movq	%rdx,%rbx
+	movq	0(%rsi),%rdx
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r15
+	movq	24(%rsi),%r8
+	leaq	L$ord(%rip),%rsi
+	jmp	L$oop_ord_sqrx
+
+.p2align	5
+L$oop_ord_sqrx:
+	mulxq	%r14,%r9,%r10
+	mulxq	%r15,%rcx,%r11
+	movq	%rdx,%rax
+.byte	102,73,15,110,206
+	mulxq	%r8,%rbp,%r12
+	movq	%r14,%rdx
+	addq	%rcx,%r10
+.byte	102,73,15,110,215
+	adcq	%rbp,%r11
+	adcq	$0,%r12
+	xorq	%r13,%r13
+
+	mulxq	%r15,%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	%r8,%rcx,%rbp
+	movq	%r15,%rdx
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+	adcq	$0,%r13
+
+	mulxq	%r8,%rcx,%r14
+	movq	%rax,%rdx
+.byte	102,73,15,110,216
+	xorq	%r15,%r15
+	adcxq	%r9,%r9
+	adoxq	%rcx,%r13
+	adcxq	%r10,%r10
+	adoxq	%r15,%r14
+
+
+	mulxq	%rdx,%r8,%rbp
+.byte	102,72,15,126,202
+	adcxq	%r11,%r11
+	adoxq	%rbp,%r9
+	adcxq	%r12,%r12
+	mulxq	%rdx,%rcx,%rax
+.byte	102,72,15,126,210
+	adcxq	%r13,%r13
+	adoxq	%rcx,%r10
+	adcxq	%r14,%r14
+	mulxq	%rdx,%rcx,%rbp
+.byte	0x67
+.byte	102,72,15,126,218
+	adoxq	%rax,%r11
+	adcxq	%r15,%r15
+	adoxq	%rcx,%r12
+	adoxq	%rbp,%r13
+	mulxq	%rdx,%rcx,%rax
+	adoxq	%rcx,%r14
+	adoxq	%rax,%r15
+
+
+	movq	%r8,%rdx
+	mulxq	32(%rsi),%rdx,%rcx
+
+	xorq	%rax,%rax
+	mulxq	0(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r8
+	adoxq	%rbp,%r9
+	mulxq	8(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r9
+	adoxq	%rbp,%r10
+	mulxq	16(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+	mulxq	24(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r8
+	adcxq	%rax,%r8
+
+
+	movq	%r9,%rdx
+	mulxq	32(%rsi),%rdx,%rcx
+
+	mulxq	0(%rsi),%rcx,%rbp
+	adoxq	%rcx,%r9
+	adcxq	%rbp,%r10
+	mulxq	8(%rsi),%rcx,%rbp
+	adoxq	%rcx,%r10
+	adcxq	%rbp,%r11
+	mulxq	16(%rsi),%rcx,%rbp
+	adoxq	%rcx,%r11
+	adcxq	%rbp,%r8
+	mulxq	24(%rsi),%rcx,%rbp
+	adoxq	%rcx,%r8
+	adcxq	%rbp,%r9
+	adoxq	%rax,%r9
+
+
+	movq	%r10,%rdx
+	mulxq	32(%rsi),%rdx,%rcx
+
+	mulxq	0(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+	mulxq	8(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r8
+	mulxq	16(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r8
+	adoxq	%rbp,%r9
+	mulxq	24(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r9
+	adoxq	%rbp,%r10
+	adcxq	%rax,%r10
+
+
+	movq	%r11,%rdx
+	mulxq	32(%rsi),%rdx,%rcx
+
+	mulxq	0(%rsi),%rcx,%rbp
+	adoxq	%rcx,%r11
+	adcxq	%rbp,%r8
+	mulxq	8(%rsi),%rcx,%rbp
+	adoxq	%rcx,%r8
+	adcxq	%rbp,%r9
+	mulxq	16(%rsi),%rcx,%rbp
+	adoxq	%rcx,%r9
+	adcxq	%rbp,%r10
+	mulxq	24(%rsi),%rcx,%rbp
+	adoxq	%rcx,%r10
+	adcxq	%rbp,%r11
+	adoxq	%rax,%r11
+
+
+	addq	%r8,%r12
+	adcq	%r13,%r9
+	movq	%r12,%rdx
+	adcq	%r14,%r10
+	adcq	%r15,%r11
+	movq	%r9,%r14
+	adcq	$0,%rax
+
+
+	subq	0(%rsi),%r12
+	movq	%r10,%r15
+	sbbq	8(%rsi),%r9
+	sbbq	16(%rsi),%r10
+	movq	%r11,%r8
+	sbbq	24(%rsi),%r11
+	sbbq	$0,%rax
+
+	cmovncq	%r12,%rdx
+	cmovncq	%r9,%r14
+	cmovncq	%r10,%r15
+	cmovncq	%r11,%r8
+
+	decq	%rbx
+	jnz	L$oop_ord_sqrx
+
+	movq	%rdx,0(%rdi)
+	movq	%r14,8(%rdi)
+	pxor	%xmm1,%xmm1
+	movq	%r15,16(%rdi)
+	pxor	%xmm2,%xmm2
+	movq	%r8,24(%rdi)
+	pxor	%xmm3,%xmm3
+
+	movq	0(%rsp),%r15
+
+	movq	8(%rsp),%r14
+
+	movq	16(%rsp),%r13
+
+	movq	24(%rsp),%r12
+
+	movq	32(%rsp),%rbx
+
+	movq	40(%rsp),%rbp
+
+	leaq	48(%rsp),%rsp
+
+L$ord_sqrx_epilogue:
+	ret
+
+
+
+
+
+
+
+
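+// void ecp_nistz256_mul_mont(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
+// res := a*b*2^-256 mod p, the Montgomery product in the field.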
+.globl	_ecp_nistz256_mul_mont
+.private_extern _ecp_nistz256_mul_mont
+
+.p2align	5
+_ecp_nistz256_mul_mont:
+
+_CET_ENDBR
+	leaq	_OPENSSL_ia32cap_P(%rip),%rcx
+	movq	8(%rcx),%rcx
+	andl	$0x80100,%ecx
+L$mul_mont:
+	pushq	%rbp
+
+	pushq	%rbx
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+L$mul_body:
+	cmpl	$0x80100,%ecx
+	je	L$mul_montx
+	movq	%rdx,%rbx
+	movq	0(%rdx),%rax
+	movq	0(%rsi),%r9
+	movq	8(%rsi),%r10
+	movq	16(%rsi),%r11
+	movq	24(%rsi),%r12
+
+	call	__ecp_nistz256_mul_montq
+	jmp	L$mul_mont_done
+
+.p2align	5
+L$mul_montx:
+	movq	%rdx,%rbx
+	movq	0(%rdx),%rdx
+	movq	0(%rsi),%r9
+	movq	8(%rsi),%r10
+	movq	16(%rsi),%r11
+	movq	24(%rsi),%r12
+	leaq	-128(%rsi),%rsi
+
+	call	__ecp_nistz256_mul_montx
+L$mul_mont_done:
+	movq	0(%rsp),%r15
+
+	movq	8(%rsp),%r14
+
+	movq	16(%rsp),%r13
+
+	movq	24(%rsp),%r12
+
+	movq	32(%rsp),%rbx
+
+	movq	40(%rsp),%rbp
+
+	leaq	48(%rsp),%rsp
+
+L$mul_epilogue:
+	ret
+
+
+
+
+.p2align	5
+__ecp_nistz256_mul_montq:
+
+
+
+	movq	%rax,%rbp
+	mulq	%r9
+	movq	L$poly+8(%rip),%r14
+	movq	%rax,%r8
+	movq	%rbp,%rax
+	movq	%rdx,%r9
+
+	mulq	%r10
+	movq	L$poly+24(%rip),%r15
+	addq	%rax,%r9
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%r11
+	addq	%rax,%r10
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%r12
+	addq	%rax,%r11
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	xorq	%r13,%r13
+	movq	%rdx,%r12
+
+
+
+
+
+
+
+
+
+
+	movq	%r8,%rbp
+	shlq	$32,%r8
+	mulq	%r15
+	shrq	$32,%rbp
+	addq	%r8,%r9
+	adcq	%rbp,%r10
+	adcq	%rax,%r11
+	movq	8(%rbx),%rax
+	adcq	%rdx,%r12
+	adcq	$0,%r13
+	xorq	%r8,%r8
+
+
+
+	movq	%rax,%rbp
+	mulq	0(%rsi)
+	addq	%rax,%r9
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	8(%rsi)
+	addq	%rcx,%r10
+	adcq	$0,%rdx
+	addq	%rax,%r10
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	16(%rsi)
+	addq	%rcx,%r11
+	adcq	$0,%rdx
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	24(%rsi)
+	addq	%rcx,%r12
+	adcq	$0,%rdx
+	addq	%rax,%r12
+	movq	%r9,%rax
+	adcq	%rdx,%r13
+	adcq	$0,%r8
+
+
+
+	movq	%r9,%rbp
+	shlq	$32,%r9
+	mulq	%r15
+	shrq	$32,%rbp
+	addq	%r9,%r10
+	adcq	%rbp,%r11
+	adcq	%rax,%r12
+	movq	16(%rbx),%rax
+	adcq	%rdx,%r13
+	adcq	$0,%r8
+	xorq	%r9,%r9
+
+
+
+	movq	%rax,%rbp
+	mulq	0(%rsi)
+	addq	%rax,%r10
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	8(%rsi)
+	addq	%rcx,%r11
+	adcq	$0,%rdx
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	16(%rsi)
+	addq	%rcx,%r12
+	adcq	$0,%rdx
+	addq	%rax,%r12
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	24(%rsi)
+	addq	%rcx,%r13
+	adcq	$0,%rdx
+	addq	%rax,%r13
+	movq	%r10,%rax
+	adcq	%rdx,%r8
+	adcq	$0,%r9
+
+
+
+	movq	%r10,%rbp
+	shlq	$32,%r10
+	mulq	%r15
+	shrq	$32,%rbp
+	addq	%r10,%r11
+	adcq	%rbp,%r12
+	adcq	%rax,%r13
+	movq	24(%rbx),%rax
+	adcq	%rdx,%r8
+	adcq	$0,%r9
+	xorq	%r10,%r10
+
+
+
+	movq	%rax,%rbp
+	mulq	0(%rsi)
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	8(%rsi)
+	addq	%rcx,%r12
+	adcq	$0,%rdx
+	addq	%rax,%r12
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	16(%rsi)
+	addq	%rcx,%r13
+	adcq	$0,%rdx
+	addq	%rax,%r13
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	24(%rsi)
+	addq	%rcx,%r8
+	adcq	$0,%rdx
+	addq	%rax,%r8
+	movq	%r11,%rax
+	adcq	%rdx,%r9
+	adcq	$0,%r10
+
+
+
+	movq	%r11,%rbp
+	shlq	$32,%r11
+	mulq	%r15
+	shrq	$32,%rbp
+	addq	%r11,%r12
+	adcq	%rbp,%r13
+	movq	%r12,%rcx
+	adcq	%rax,%r8
+	adcq	%rdx,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r10
+
+
+
+	subq	$-1,%r12
+	movq	%r8,%rbx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%rdx
+	sbbq	%r15,%r9
+	sbbq	$0,%r10
+
+	cmovcq	%rcx,%r12
+	cmovcq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%rbx,%r8
+	movq	%r13,8(%rdi)
+	cmovcq	%rdx,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	ret
+
+
+
+
+
+
+
+
+
+
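+// void ecp_nistz256_sqr_mont(uint64_t res[4], uint64_t a[4]);
+// res := a^2*2^-256 mod p, the Montgomery square in the field.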
+.globl	_ecp_nistz256_sqr_mont
+.private_extern _ecp_nistz256_sqr_mont
+
+.p2align	5
+_ecp_nistz256_sqr_mont:
+
+_CET_ENDBR
+	leaq	_OPENSSL_ia32cap_P(%rip),%rcx
+	movq	8(%rcx),%rcx
+	andl	$0x80100,%ecx
+	pushq	%rbp
+
+	pushq	%rbx
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+L$sqr_body:
+	cmpl	$0x80100,%ecx
+	je	L$sqr_montx
+	movq	0(%rsi),%rax
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r15
+	movq	24(%rsi),%r8
+
+	call	__ecp_nistz256_sqr_montq
+	jmp	L$sqr_mont_done
+
+.p2align	5
+L$sqr_montx:
+	movq	0(%rsi),%rdx
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r15
+	movq	24(%rsi),%r8
+	leaq	-128(%rsi),%rsi
+
+	call	__ecp_nistz256_sqr_montx
+L$sqr_mont_done:
+	movq	0(%rsp),%r15
+
+	movq	8(%rsp),%r14
+
+	movq	16(%rsp),%r13
+
+	movq	24(%rsp),%r12
+
+	movq	32(%rsp),%rbx
+
+	movq	40(%rsp),%rbp
+
+	leaq	48(%rsp),%rsp
+
+L$sqr_epilogue:
+	ret
+
+
+
+
+.p2align	5
+__ecp_nistz256_sqr_montq:
+
+	movq	%rax,%r13
+	mulq	%r14
+	movq	%rax,%r9
+	movq	%r15,%rax
+	movq	%rdx,%r10
+
+	mulq	%r13
+	addq	%rax,%r10
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%r13
+	addq	%rax,%r11
+	movq	%r15,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+
+
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	%r14
+	addq	%rax,%r12
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r12
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+
+	mulq	%r15
+	xorq	%r15,%r15
+	addq	%rax,%r13
+	movq	0(%rsi),%rax
+	movq	%rdx,%r14
+	adcq	$0,%r14
+
+	addq	%r9,%r9
+	adcq	%r10,%r10
+	adcq	%r11,%r11
+	adcq	%r12,%r12
+	adcq	%r13,%r13
+	adcq	%r14,%r14
+	adcq	$0,%r15
+
+	mulq	%rax
+	movq	%rax,%r8
+	movq	8(%rsi),%rax
+	movq	%rdx,%rcx
+
+	mulq	%rax
+	addq	%rcx,%r9
+	adcq	%rax,%r10
+	movq	16(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	%rax
+	addq	%rcx,%r11
+	adcq	%rax,%r12
+	movq	24(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	%rax
+	addq	%rcx,%r13
+	adcq	%rax,%r14
+	movq	%r8,%rax
+	adcq	%rdx,%r15
+
+	movq	L$poly+8(%rip),%rsi
+	movq	L$poly+24(%rip),%rbp
+
+
+
+
+	movq	%r8,%rcx
+	shlq	$32,%r8
+	mulq	%rbp
+	shrq	$32,%rcx
+	addq	%r8,%r9
+	adcq	%rcx,%r10
+	adcq	%rax,%r11
+	movq	%r9,%rax
+	adcq	$0,%rdx
+
+
+
+	movq	%r9,%rcx
+	shlq	$32,%r9
+	movq	%rdx,%r8
+	mulq	%rbp
+	shrq	$32,%rcx
+	addq	%r9,%r10
+	adcq	%rcx,%r11
+	adcq	%rax,%r8
+	movq	%r10,%rax
+	adcq	$0,%rdx
+
+
+
+	movq	%r10,%rcx
+	shlq	$32,%r10
+	movq	%rdx,%r9
+	mulq	%rbp
+	shrq	$32,%rcx
+	addq	%r10,%r11
+	adcq	%rcx,%r8
+	adcq	%rax,%r9
+	movq	%r11,%rax
+	adcq	$0,%rdx
+
+
+
+	movq	%r11,%rcx
+	shlq	$32,%r11
+	movq	%rdx,%r10
+	mulq	%rbp
+	shrq	$32,%rcx
+	addq	%r11,%r8
+	adcq	%rcx,%r9
+	adcq	%rax,%r10
+	adcq	$0,%rdx
+	xorq	%r11,%r11
+
+
+
+	addq	%r8,%r12
+	adcq	%r9,%r13
+	movq	%r12,%r8
+	adcq	%r10,%r14
+	adcq	%rdx,%r15
+	movq	%r13,%r9
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r14,%r10
+	sbbq	%rsi,%r13
+	sbbq	$0,%r14
+	movq	%r15,%rcx
+	sbbq	%rbp,%r15
+	sbbq	$0,%r11
+
+	cmovcq	%r8,%r12
+	cmovcq	%r9,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%r10,%r14
+	movq	%r13,8(%rdi)
+	cmovcq	%rcx,%r15
+	movq	%r14,16(%rdi)
+	movq	%r15,24(%rdi)
+
+	ret
+
+
+
+.p2align	5
+__ecp_nistz256_mul_montx:
+
+
+
+	mulxq	%r9,%r8,%r9
+	mulxq	%r10,%rcx,%r10
+	movq	$32,%r14
+	xorq	%r13,%r13
+	mulxq	%r11,%rbp,%r11
+	movq	L$poly+24(%rip),%r15
+	adcq	%rcx,%r9
+	mulxq	%r12,%rcx,%r12
+	movq	%r8,%rdx
+	adcq	%rbp,%r10
+	shlxq	%r14,%r8,%rbp
+	adcq	%rcx,%r11
+	shrxq	%r14,%r8,%rcx
+	adcq	$0,%r12
+
+
+
+	addq	%rbp,%r9
+	adcq	%rcx,%r10
+
+	mulxq	%r15,%rcx,%rbp
+	movq	8(%rbx),%rdx
+	adcq	%rcx,%r11
+	adcq	%rbp,%r12
+	adcq	$0,%r13
+	xorq	%r8,%r8
+
+
+
+	mulxq	0+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r9
+	adoxq	%rbp,%r10
+
+	mulxq	8+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	16+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	24+128(%rsi),%rcx,%rbp
+	movq	%r9,%rdx
+	adcxq	%rcx,%r12
+	shlxq	%r14,%r9,%rcx
+	adoxq	%rbp,%r13
+	shrxq	%r14,%r9,%rbp
+
+	adcxq	%r8,%r13
+	adoxq	%r8,%r8
+	adcq	$0,%r8
+
+
+
+	addq	%rcx,%r10
+	adcq	%rbp,%r11
+
+	mulxq	%r15,%rcx,%rbp
+	movq	16(%rbx),%rdx
+	adcq	%rcx,%r12
+	adcq	%rbp,%r13
+	adcq	$0,%r8
+	xorq	%r9,%r9
+
+
+
+	mulxq	0+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	8+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	16+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	24+128(%rsi),%rcx,%rbp
+	movq	%r10,%rdx
+	adcxq	%rcx,%r13
+	shlxq	%r14,%r10,%rcx
+	adoxq	%rbp,%r8
+	shrxq	%r14,%r10,%rbp
+
+	adcxq	%r9,%r8
+	adoxq	%r9,%r9
+	adcq	$0,%r9
+
+
+
+	addq	%rcx,%r11
+	adcq	%rbp,%r12
+
+	mulxq	%r15,%rcx,%rbp
+	movq	24(%rbx),%rdx
+	adcq	%rcx,%r13
+	adcq	%rbp,%r8
+	adcq	$0,%r9
+	xorq	%r10,%r10
+
+
+
+	mulxq	0+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	8+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	16+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r13
+	adoxq	%rbp,%r8
+
+	mulxq	24+128(%rsi),%rcx,%rbp
+	movq	%r11,%rdx
+	adcxq	%rcx,%r8
+	shlxq	%r14,%r11,%rcx
+	adoxq	%rbp,%r9
+	shrxq	%r14,%r11,%rbp
+
+	adcxq	%r10,%r9
+	adoxq	%r10,%r10
+	adcq	$0,%r10
+
+
+
+	addq	%rcx,%r12
+	adcq	%rbp,%r13
+
+	mulxq	%r15,%rcx,%rbp
+	movq	%r12,%rbx
+	movq	L$poly+8(%rip),%r14
+	adcq	%rcx,%r8
+	movq	%r13,%rdx
+	adcq	%rbp,%r9
+	adcq	$0,%r10
+
+
+
+	xorl	%eax,%eax
+	movq	%r8,%rcx
+	sbbq	$-1,%r12
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%rbp
+	sbbq	%r15,%r9
+	sbbq	$0,%r10
+
+	cmovcq	%rbx,%r12
+	cmovcq	%rdx,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovcq	%rbp,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	ret
+
+
+
+
+.p2align	5
+__ecp_nistz256_sqr_montx:
+
+	mulxq	%r14,%r9,%r10
+	mulxq	%r15,%rcx,%r11
+	xorl	%eax,%eax
+	adcq	%rcx,%r10
+	mulxq	%r8,%rbp,%r12
+	movq	%r14,%rdx
+	adcq	%rbp,%r11
+	adcq	$0,%r12
+	xorq	%r13,%r13
+
+
+	mulxq	%r15,%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	%r8,%rcx,%rbp
+	movq	%r15,%rdx
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+	adcq	$0,%r13
+
+
+	mulxq	%r8,%rcx,%r14
+	movq	0+128(%rsi),%rdx
+	xorq	%r15,%r15
+	adcxq	%r9,%r9
+	adoxq	%rcx,%r13
+	adcxq	%r10,%r10
+	adoxq	%r15,%r14
+
+	mulxq	%rdx,%r8,%rbp
+	movq	8+128(%rsi),%rdx
+	adcxq	%r11,%r11
+	adoxq	%rbp,%r9
+	adcxq	%r12,%r12
+	mulxq	%rdx,%rcx,%rax
+	movq	16+128(%rsi),%rdx
+	adcxq	%r13,%r13
+	adoxq	%rcx,%r10
+	adcxq	%r14,%r14
+.byte	0x67
+	mulxq	%rdx,%rcx,%rbp
+	movq	24+128(%rsi),%rdx
+	adoxq	%rax,%r11
+	adcxq	%r15,%r15
+	adoxq	%rcx,%r12
+	movq	$32,%rsi
+	adoxq	%rbp,%r13
+.byte	0x67,0x67
+	mulxq	%rdx,%rcx,%rax
+	movq	L$poly+24(%rip),%rdx
+	adoxq	%rcx,%r14
+	shlxq	%rsi,%r8,%rcx
+	adoxq	%rax,%r15
+	shrxq	%rsi,%r8,%rax
+	movq	%rdx,%rbp
+
+
+	addq	%rcx,%r9
+	adcq	%rax,%r10
+
+	mulxq	%r8,%rcx,%r8
+	adcq	%rcx,%r11
+	shlxq	%rsi,%r9,%rcx
+	adcq	$0,%r8
+	shrxq	%rsi,%r9,%rax
+
+
+	addq	%rcx,%r10
+	adcq	%rax,%r11
+
+	mulxq	%r9,%rcx,%r9
+	adcq	%rcx,%r8
+	shlxq	%rsi,%r10,%rcx
+	adcq	$0,%r9
+	shrxq	%rsi,%r10,%rax
+
+
+	addq	%rcx,%r11
+	adcq	%rax,%r8
+
+	mulxq	%r10,%rcx,%r10
+	adcq	%rcx,%r9
+	shlxq	%rsi,%r11,%rcx
+	adcq	$0,%r10
+	shrxq	%rsi,%r11,%rax
+
+
+	addq	%rcx,%r8
+	adcq	%rax,%r9
+
+	mulxq	%r11,%rcx,%r11
+	adcq	%rcx,%r10
+	adcq	$0,%r11
+
+	xorq	%rdx,%rdx
+	addq	%r8,%r12
+	movq	L$poly+8(%rip),%rsi
+	adcq	%r9,%r13
+	movq	%r12,%r8
+	adcq	%r10,%r14
+	adcq	%r11,%r15
+	movq	%r13,%r9
+	adcq	$0,%rdx
+
+	subq	$-1,%r12
+	movq	%r14,%r10
+	sbbq	%rsi,%r13
+	sbbq	$0,%r14
+	movq	%r15,%r11
+	sbbq	%rbp,%r15
+	sbbq	$0,%rdx
+
+	cmovcq	%r8,%r12
+	cmovcq	%r9,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%r10,%r14
+	movq	%r13,8(%rdi)
+	cmovcq	%r11,%r15
+	movq	%r14,16(%rdi)
+	movq	%r15,24(%rdi)
+
+	ret
+
+
+
+
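+// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
+// Constant-time selection: scans all 16 table entries and copies the one
+// whose position matches index (an index of 0 matches none, leaving zeros).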
+.globl	_ecp_nistz256_select_w5
+.private_extern _ecp_nistz256_select_w5
+
+.p2align	5
+_ecp_nistz256_select_w5:
+
+_CET_ENDBR
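+	// Take the AVX2 path when bit 5 of OPENSSL_ia32cap_P[2] is set.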
+	leaq	_OPENSSL_ia32cap_P(%rip),%rax
+	movq	8(%rax),%rax
+	testl	$32,%eax
+	jnz	L$avx2_select_w5
+	movdqa	L$One(%rip),%xmm0
+	movd	%edx,%xmm1
+
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+
+	movdqa	%xmm0,%xmm8
+	pshufd	$0,%xmm1,%xmm1
+
+	movq	$16,%rax
+L$select_loop_sse_w5:
+
+	movdqa	%xmm8,%xmm15
+	paddd	%xmm0,%xmm8
+	pcmpeqd	%xmm1,%xmm15
+
+	movdqa	0(%rsi),%xmm9
+	movdqa	16(%rsi),%xmm10
+	movdqa	32(%rsi),%xmm11
+	movdqa	48(%rsi),%xmm12
+	movdqa	64(%rsi),%xmm13
+	movdqa	80(%rsi),%xmm14
+	leaq	96(%rsi),%rsi
+
+	pand	%xmm15,%xmm9
+	pand	%xmm15,%xmm10
+	por	%xmm9,%xmm2
+	pand	%xmm15,%xmm11
+	por	%xmm10,%xmm3
+	pand	%xmm15,%xmm12
+	por	%xmm11,%xmm4
+	pand	%xmm15,%xmm13
+	por	%xmm12,%xmm5
+	pand	%xmm15,%xmm14
+	por	%xmm13,%xmm6
+	por	%xmm14,%xmm7
+
+	decq	%rax
+	jnz	L$select_loop_sse_w5
+
+	movdqu	%xmm2,0(%rdi)
+	movdqu	%xmm3,16(%rdi)
+	movdqu	%xmm4,32(%rdi)
+	movdqu	%xmm5,48(%rdi)
+	movdqu	%xmm6,64(%rdi)
+	movdqu	%xmm7,80(%rdi)
+	ret
+
+L$SEH_end_ecp_nistz256_select_w5:
+
+
+
+
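+// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
+// As select_w5, but over a table of 64 affine entries.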
+.globl	_ecp_nistz256_select_w7
+.private_extern _ecp_nistz256_select_w7
+
+.p2align	5
+_ecp_nistz256_select_w7:
+
+_CET_ENDBR
+	leaq	_OPENSSL_ia32cap_P(%rip),%rax
+	movq	8(%rax),%rax
+	testl	$32,%eax
+	jnz	L$avx2_select_w7
+	movdqa	L$One(%rip),%xmm8
+	movd	%edx,%xmm1
+
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+
+	movdqa	%xmm8,%xmm0
+	pshufd	$0,%xmm1,%xmm1
+	movq	$64,%rax
+
+L$select_loop_sse_w7:
+	movdqa	%xmm8,%xmm15
+	paddd	%xmm0,%xmm8
+	movdqa	0(%rsi),%xmm9
+	movdqa	16(%rsi),%xmm10
+	pcmpeqd	%xmm1,%xmm15
+	movdqa	32(%rsi),%xmm11
+	movdqa	48(%rsi),%xmm12
+	leaq	64(%rsi),%rsi
+
+	pand	%xmm15,%xmm9
+	pand	%xmm15,%xmm10
+	por	%xmm9,%xmm2
+	pand	%xmm15,%xmm11
+	por	%xmm10,%xmm3
+	pand	%xmm15,%xmm12
+	por	%xmm11,%xmm4
+	prefetcht0	255(%rsi)
+	por	%xmm12,%xmm5
+
+	decq	%rax
+	jnz	L$select_loop_sse_w7
+
+	movdqu	%xmm2,0(%rdi)
+	movdqu	%xmm3,16(%rdi)
+	movdqu	%xmm4,32(%rdi)
+	movdqu	%xmm5,48(%rdi)
+	ret
+
+L$SEH_end_ecp_nistz256_select_w7:
+
+
+
+
+.p2align	5
+ecp_nistz256_avx2_select_w5:
+
+L$avx2_select_w5:
+	vzeroupper
+	vmovdqa	L$Two(%rip),%ymm0
+
+	vpxor	%ymm2,%ymm2,%ymm2
+	vpxor	%ymm3,%ymm3,%ymm3
+	vpxor	%ymm4,%ymm4,%ymm4
+
+	vmovdqa	L$One(%rip),%ymm5
+	vmovdqa	L$Two(%rip),%ymm10
+
+	vmovd	%edx,%xmm1
+	vpermd	%ymm1,%ymm2,%ymm1
+
+	movq	$8,%rax
+L$select_loop_avx2_w5:
+
+	vmovdqa	0(%rsi),%ymm6
+	vmovdqa	32(%rsi),%ymm7
+	vmovdqa	64(%rsi),%ymm8
+
+	vmovdqa	96(%rsi),%ymm11
+	vmovdqa	128(%rsi),%ymm12
+	vmovdqa	160(%rsi),%ymm13
+
+	vpcmpeqd	%ymm1,%ymm5,%ymm9
+	vpcmpeqd	%ymm1,%ymm10,%ymm14
+
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpaddd	%ymm0,%ymm10,%ymm10
+	leaq	192(%rsi),%rsi
+
+	vpand	%ymm9,%ymm6,%ymm6
+	vpand	%ymm9,%ymm7,%ymm7
+	vpand	%ymm9,%ymm8,%ymm8
+	vpand	%ymm14,%ymm11,%ymm11
+	vpand	%ymm14,%ymm12,%ymm12
+	vpand	%ymm14,%ymm13,%ymm13
+
+	vpxor	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm7,%ymm3,%ymm3
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpxor	%ymm11,%ymm2,%ymm2
+	vpxor	%ymm12,%ymm3,%ymm3
+	vpxor	%ymm13,%ymm4,%ymm4
+
+	decq	%rax
+	jnz	L$select_loop_avx2_w5
+
+	vmovdqu	%ymm2,0(%rdi)
+	vmovdqu	%ymm3,32(%rdi)
+	vmovdqu	%ymm4,64(%rdi)
+	vzeroupper
+	ret
+
+L$SEH_end_ecp_nistz256_avx2_select_w5:
+
+
+
+
+.globl	_ecp_nistz256_avx2_select_w7
+.private_extern _ecp_nistz256_avx2_select_w7
+
+.p2align	5
+_ecp_nistz256_avx2_select_w7:
+
+L$avx2_select_w7:
+_CET_ENDBR
+	vzeroupper
+	vmovdqa	L$Three(%rip),%ymm0
+
+	vpxor	%ymm2,%ymm2,%ymm2
+	vpxor	%ymm3,%ymm3,%ymm3
+
+	vmovdqa	L$One(%rip),%ymm4
+	vmovdqa	L$Two(%rip),%ymm8
+	vmovdqa	L$Three(%rip),%ymm12
+
+	vmovd	%edx,%xmm1
+	vpermd	%ymm1,%ymm2,%ymm1
+
+
+	movq	$21,%rax
+L$select_loop_avx2_w7:
+
+	vmovdqa	0(%rsi),%ymm5
+	vmovdqa	32(%rsi),%ymm6
+
+	vmovdqa	64(%rsi),%ymm9
+	vmovdqa	96(%rsi),%ymm10
+
+	vmovdqa	128(%rsi),%ymm13
+	vmovdqa	160(%rsi),%ymm14
+
+	vpcmpeqd	%ymm1,%ymm4,%ymm7
+	vpcmpeqd	%ymm1,%ymm8,%ymm11
+	vpcmpeqd	%ymm1,%ymm12,%ymm15
+
+	vpaddd	%ymm0,%ymm4,%ymm4
+	vpaddd	%ymm0,%ymm8,%ymm8
+	vpaddd	%ymm0,%ymm12,%ymm12
+	leaq	192(%rsi),%rsi
+
+	vpand	%ymm7,%ymm5,%ymm5
+	vpand	%ymm7,%ymm6,%ymm6
+	vpand	%ymm11,%ymm9,%ymm9
+	vpand	%ymm11,%ymm10,%ymm10
+	vpand	%ymm15,%ymm13,%ymm13
+	vpand	%ymm15,%ymm14,%ymm14
+
+	vpxor	%ymm5,%ymm2,%ymm2
+	vpxor	%ymm6,%ymm3,%ymm3
+	vpxor	%ymm9,%ymm2,%ymm2
+	vpxor	%ymm10,%ymm3,%ymm3
+	vpxor	%ymm13,%ymm2,%ymm2
+	vpxor	%ymm14,%ymm3,%ymm3
+
+	decq	%rax
+	jnz	L$select_loop_avx2_w7
+
+
+	vmovdqa	0(%rsi),%ymm5
+	vmovdqa	32(%rsi),%ymm6
+
+	vpcmpeqd	%ymm1,%ymm4,%ymm7
+
+	vpand	%ymm7,%ymm5,%ymm5
+	vpand	%ymm7,%ymm6,%ymm6
+
+	vpxor	%ymm5,%ymm2,%ymm2
+	vpxor	%ymm6,%ymm3,%ymm3
+
+	vmovdqu	%ymm2,0(%rdi)
+	vmovdqu	%ymm3,32(%rdi)
+	vzeroupper
+	ret
+
+L$SEH_end_ecp_nistz256_avx2_select_w7:
+
+
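+// Local helpers: __ecp_nistz256_add_toq, _sub_fromq, _subq and _mul_by_2q
+// perform four-limb add/subtract/double with a conditional reduction
+// modulo p, using the register conventions of their callers.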
+.p2align	5
+__ecp_nistz256_add_toq:
+
+	xorq	%r11,%r11
+	addq	0(%rbx),%r12
+	adcq	8(%rbx),%r13
+	movq	%r12,%rax
+	adcq	16(%rbx),%r8
+	adcq	24(%rbx),%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	cmovcq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovcq	%r10,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	ret
+
+
+
+
+.p2align	5
+__ecp_nistz256_sub_fromq:
+
+	subq	0(%rbx),%r12
+	sbbq	8(%rbx),%r13
+	movq	%r12,%rax
+	sbbq	16(%rbx),%r8
+	sbbq	24(%rbx),%r9
+	movq	%r13,%rbp
+	sbbq	%r11,%r11
+
+	addq	$-1,%r12
+	movq	%r8,%rcx
+	adcq	%r14,%r13
+	adcq	$0,%r8
+	movq	%r9,%r10
+	adcq	%r15,%r9
+	testq	%r11,%r11
+
+	cmovzq	%rax,%r12
+	cmovzq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovzq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovzq	%r10,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	ret
+
+
+
+
+.p2align	5
+__ecp_nistz256_subq:
+
+	subq	%r12,%rax
+	sbbq	%r13,%rbp
+	movq	%rax,%r12
+	sbbq	%r8,%rcx
+	sbbq	%r9,%r10
+	movq	%rbp,%r13
+	sbbq	%r11,%r11
+
+	addq	$-1,%rax
+	movq	%rcx,%r8
+	adcq	%r14,%rbp
+	adcq	$0,%rcx
+	movq	%r10,%r9
+	adcq	%r15,%r10
+	testq	%r11,%r11
+
+	cmovnzq	%rax,%r12
+	cmovnzq	%rbp,%r13
+	cmovnzq	%rcx,%r8
+	cmovnzq	%r10,%r9
+
+	ret
+
+
+
+
+.p2align	5
+__ecp_nistz256_mul_by_2q:
+
+	xorq	%r11,%r11
+	addq	%r12,%r12
+	adcq	%r13,%r13
+	movq	%r12,%rax
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	cmovcq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovcq	%r10,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	ret
+
+
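+// Point doubling in Jacobian coordinates: computes 2*a, where the point is
+// stored as (X, Y, Z), each four 64-bit limbs in Montgomery form.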
+.globl	_ecp_nistz256_point_double
+.private_extern _ecp_nistz256_point_double
+
+.p2align	5
+_ecp_nistz256_point_double:
+
+_CET_ENDBR
+	leaq	_OPENSSL_ia32cap_P(%rip),%rcx
+	movq	8(%rcx),%rcx
+	andl	$0x80100,%ecx
+	cmpl	$0x80100,%ecx
+	je	L$point_doublex
+	pushq	%rbp
+
+	pushq	%rbx
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+	subq	$160+8,%rsp
+
+L$point_doubleq_body:
+
+L$point_double_shortcutq:
+	movdqu	0(%rsi),%xmm0
+	movq	%rsi,%rbx
+	movdqu	16(%rsi),%xmm1
+	movq	32+0(%rsi),%r12
+	movq	32+8(%rsi),%r13
+	movq	32+16(%rsi),%r8
+	movq	32+24(%rsi),%r9
+	movq	L$poly+8(%rip),%r14
+	movq	L$poly+24(%rip),%r15
+	movdqa	%xmm0,96(%rsp)
+	movdqa	%xmm1,96+16(%rsp)
+	leaq	32(%rdi),%r10
+	leaq	64(%rdi),%r11
+.byte	102,72,15,110,199
+.byte	102,73,15,110,202
+.byte	102,73,15,110,211
+
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_by_2q
+
+	movq	64+0(%rsi),%rax
+	movq	64+8(%rsi),%r14
+	movq	64+16(%rsi),%r15
+	movq	64+24(%rsi),%r8
+	leaq	64-0(%rsi),%rsi
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	0+0(%rsp),%rax
+	movq	8+0(%rsp),%r14
+	leaq	0+0(%rsp),%rsi
+	movq	16+0(%rsp),%r15
+	movq	24+0(%rsp),%r8
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	32(%rbx),%rax
+	movq	64+0(%rbx),%r9
+	movq	64+8(%rbx),%r10
+	movq	64+16(%rbx),%r11
+	movq	64+24(%rbx),%r12
+	leaq	64-0(%rbx),%rsi
+	leaq	32(%rbx),%rbx
+.byte	102,72,15,126,215
+	call	__ecp_nistz256_mul_montq
+	call	__ecp_nistz256_mul_by_2q
+
+	movq	96+0(%rsp),%r12
+	movq	96+8(%rsp),%r13
+	leaq	64(%rsp),%rbx
+	movq	96+16(%rsp),%r8
+	movq	96+24(%rsp),%r9
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_add_toq
+
+	movq	96+0(%rsp),%r12
+	movq	96+8(%rsp),%r13
+	leaq	64(%rsp),%rbx
+	movq	96+16(%rsp),%r8
+	movq	96+24(%rsp),%r9
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	movq	0+0(%rsp),%rax
+	movq	8+0(%rsp),%r14
+	leaq	0+0(%rsp),%rsi
+	movq	16+0(%rsp),%r15
+	movq	24+0(%rsp),%r8
+.byte	102,72,15,126,207
+	call	__ecp_nistz256_sqr_montq
+	xorq	%r9,%r9
+	movq	%r12,%rax
+	addq	$-1,%r12
+	movq	%r13,%r10
+	adcq	%rsi,%r13
+	movq	%r14,%rcx
+	adcq	$0,%r14
+	movq	%r15,%r8
+	adcq	%rbp,%r15
+	adcq	$0,%r9
+	xorq	%rsi,%rsi
+	testq	$1,%rax
+
+	cmovzq	%rax,%r12
+	cmovzq	%r10,%r13
+	cmovzq	%rcx,%r14
+	cmovzq	%r8,%r15
+	cmovzq	%rsi,%r9
+
+	movq	%r13,%rax
+	shrq	$1,%r12
+	shlq	$63,%rax
+	movq	%r14,%r10
+	shrq	$1,%r13
+	orq	%rax,%r12
+	shlq	$63,%r10
+	movq	%r15,%rcx
+	shrq	$1,%r14
+	orq	%r10,%r13
+	shlq	$63,%rcx
+	movq	%r12,0(%rdi)
+	shrq	$1,%r15
+	movq	%r13,8(%rdi)
+	shlq	$63,%r9
+	orq	%rcx,%r14
+	orq	%r9,%r15
+	movq	%r14,16(%rdi)
+	movq	%r15,24(%rdi)
+	movq	64(%rsp),%rax
+	leaq	64(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_mul_by_2q
+
+	leaq	32(%rsp),%rbx
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_add_toq
+
+	movq	96(%rsp),%rax
+	leaq	96(%rsp),%rbx
+	movq	0+0(%rsp),%r9
+	movq	8+0(%rsp),%r10
+	leaq	0+0(%rsp),%rsi
+	movq	16+0(%rsp),%r11
+	movq	24+0(%rsp),%r12
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_mul_by_2q
+
+	movq	0+32(%rsp),%rax
+	movq	8+32(%rsp),%r14
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r15
+	movq	24+32(%rsp),%r8
+.byte	102,72,15,126,199
+	call	__ecp_nistz256_sqr_montq
+
+	leaq	128(%rsp),%rbx
+	movq	%r14,%r8
+	movq	%r15,%r9
+	movq	%rsi,%r14
+	movq	%rbp,%r15
+	call	__ecp_nistz256_sub_fromq
+
+	movq	0+0(%rsp),%rax
+	movq	0+8(%rsp),%rbp
+	movq	0+16(%rsp),%rcx
+	movq	0+24(%rsp),%r10
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_subq
+
+	movq	32(%rsp),%rax
+	leaq	32(%rsp),%rbx
+	movq	%r12,%r14
+	xorl	%ecx,%ecx
+	movq	%r12,0+0(%rsp)
+	movq	%r13,%r10
+	movq	%r13,0+8(%rsp)
+	cmovzq	%r8,%r11
+	movq	%r8,0+16(%rsp)
+	leaq	0-0(%rsp),%rsi
+	cmovzq	%r9,%r12
+	movq	%r9,0+24(%rsp)
+	movq	%r14,%r9
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+.byte	102,72,15,126,203
+.byte	102,72,15,126,207
+	call	__ecp_nistz256_sub_fromq
+
+	leaq	160+56(%rsp),%rsi
+
+	movq	-48(%rsi),%r15
+
+	movq	-40(%rsi),%r14
+
+	movq	-32(%rsi),%r13
+
+	movq	-24(%rsi),%r12
+
+	movq	-16(%rsi),%rbx
+
+	movq	-8(%rsi),%rbp
+
+	leaq	(%rsi),%rsp
+
+L$point_doubleq_epilogue:
+	ret
+
+
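+// Full Jacobian point addition; when the inputs are equal it tail-calls the
+// doubling path via L$point_double_shortcutq.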
+.globl	_ecp_nistz256_point_add
+.private_extern _ecp_nistz256_point_add
+
+.p2align	5
+_ecp_nistz256_point_add:
+
+_CET_ENDBR
+	leaq	_OPENSSL_ia32cap_P(%rip),%rcx
+	movq	8(%rcx),%rcx
+	andl	$0x80100,%ecx
+	cmpl	$0x80100,%ecx
+	je	L$point_addx
+	pushq	%rbp
+
+	pushq	%rbx
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+	subq	$576+8,%rsp
+
+L$point_addq_body:
+
+	movdqu	0(%rsi),%xmm0
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm3
+	movdqu	64(%rsi),%xmm4
+	movdqu	80(%rsi),%xmm5
+	movq	%rsi,%rbx
+	movq	%rdx,%rsi
+	movdqa	%xmm0,384(%rsp)
+	movdqa	%xmm1,384+16(%rsp)
+	movdqa	%xmm2,416(%rsp)
+	movdqa	%xmm3,416+16(%rsp)
+	movdqa	%xmm4,448(%rsp)
+	movdqa	%xmm5,448+16(%rsp)
+	por	%xmm4,%xmm5
+
+	movdqu	0(%rsi),%xmm0
+	pshufd	$0xb1,%xmm5,%xmm3
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+	por	%xmm3,%xmm5
+	movdqu	48(%rsi),%xmm3
+	movq	64+0(%rsi),%rax
+	movq	64+8(%rsi),%r14
+	movq	64+16(%rsi),%r15
+	movq	64+24(%rsi),%r8
+	movdqa	%xmm0,480(%rsp)
+	pshufd	$0x1e,%xmm5,%xmm4
+	movdqa	%xmm1,480+16(%rsp)
+	movdqu	64(%rsi),%xmm0
+	movdqu	80(%rsi),%xmm1
+	movdqa	%xmm2,512(%rsp)
+	movdqa	%xmm3,512+16(%rsp)
+	por	%xmm4,%xmm5
+	pxor	%xmm4,%xmm4
+	por	%xmm0,%xmm1
+.byte	102,72,15,110,199
+
+	leaq	64-0(%rsi),%rsi
+	movq	%rax,544+0(%rsp)
+	movq	%r14,544+8(%rsp)
+	movq	%r15,544+16(%rsp)
+	movq	%r8,544+24(%rsp)
+	leaq	96(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	pcmpeqd	%xmm4,%xmm5
+	pshufd	$0xb1,%xmm1,%xmm4
+	por	%xmm1,%xmm4
+	pshufd	$0,%xmm5,%xmm5
+	pshufd	$0x1e,%xmm4,%xmm3
+	por	%xmm3,%xmm4
+	pxor	%xmm3,%xmm3
+	pcmpeqd	%xmm3,%xmm4
+	pshufd	$0,%xmm4,%xmm4
+	movq	64+0(%rbx),%rax
+	movq	64+8(%rbx),%r14
+	movq	64+16(%rbx),%r15
+	movq	64+24(%rbx),%r8
+.byte	102,72,15,110,203
+
+	leaq	64-0(%rbx),%rsi
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	544(%rsp),%rax
+	leaq	544(%rsp),%rbx
+	movq	0+96(%rsp),%r9
+	movq	8+96(%rsp),%r10
+	leaq	0+96(%rsp),%rsi
+	movq	16+96(%rsp),%r11
+	movq	24+96(%rsp),%r12
+	leaq	224(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	448(%rsp),%rax
+	leaq	448(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	416(%rsp),%rax
+	leaq	416(%rsp),%rbx
+	movq	0+224(%rsp),%r9
+	movq	8+224(%rsp),%r10
+	leaq	0+224(%rsp),%rsi
+	movq	16+224(%rsp),%r11
+	movq	24+224(%rsp),%r12
+	leaq	224(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	512(%rsp),%rax
+	leaq	512(%rsp),%rbx
+	movq	0+256(%rsp),%r9
+	movq	8+256(%rsp),%r10
+	leaq	0+256(%rsp),%rsi
+	movq	16+256(%rsp),%r11
+	movq	24+256(%rsp),%r12
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	224(%rsp),%rbx
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	orq	%r13,%r12
+	movdqa	%xmm4,%xmm2
+	orq	%r8,%r12
+	orq	%r9,%r12
+	por	%xmm5,%xmm2
+.byte	102,73,15,110,220
+
+	movq	384(%rsp),%rax
+	leaq	384(%rsp),%rbx
+	movq	0+96(%rsp),%r9
+	movq	8+96(%rsp),%r10
+	leaq	0+96(%rsp),%rsi
+	movq	16+96(%rsp),%r11
+	movq	24+96(%rsp),%r12
+	leaq	160(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	480(%rsp),%rax
+	leaq	480(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	192(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	160(%rsp),%rbx
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	orq	%r13,%r12
+	orq	%r8,%r12
+	orq	%r9,%r12
+
+.byte	102,73,15,126,208
+.byte	102,73,15,126,217
+	orq	%r8,%r12
+.byte	0x3e
+	jnz	L$add_proceedq
+
+
+
+	testq	%r9,%r9
+	jz	L$add_doubleq
+
+
+
+
+
+
+.byte	102,72,15,126,199
+	pxor	%xmm0,%xmm0
+	movdqu	%xmm0,0(%rdi)
+	movdqu	%xmm0,16(%rdi)
+	movdqu	%xmm0,32(%rdi)
+	movdqu	%xmm0,48(%rdi)
+	movdqu	%xmm0,64(%rdi)
+	movdqu	%xmm0,80(%rdi)
+	jmp	L$add_doneq
+
+.p2align	5
+L$add_doubleq:
+.byte	102,72,15,126,206
+.byte	102,72,15,126,199
+	addq	$416,%rsp
+
+	jmp	L$point_double_shortcutq
+
+
+.p2align	5
+L$add_proceedq:
+	movq	0+64(%rsp),%rax
+	movq	8+64(%rsp),%r14
+	leaq	0+64(%rsp),%rsi
+	movq	16+64(%rsp),%r15
+	movq	24+64(%rsp),%r8
+	leaq	96(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	448(%rsp),%rax
+	leaq	448(%rsp),%rbx
+	movq	0+0(%rsp),%r9
+	movq	8+0(%rsp),%r10
+	leaq	0+0(%rsp),%rsi
+	movq	16+0(%rsp),%r11
+	movq	24+0(%rsp),%r12
+	leaq	352(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	0+0(%rsp),%rax
+	movq	8+0(%rsp),%r14
+	leaq	0+0(%rsp),%rsi
+	movq	16+0(%rsp),%r15
+	movq	24+0(%rsp),%r8
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	544(%rsp),%rax
+	leaq	544(%rsp),%rbx
+	movq	0+352(%rsp),%r9
+	movq	8+352(%rsp),%r10
+	leaq	0+352(%rsp),%rsi
+	movq	16+352(%rsp),%r11
+	movq	24+352(%rsp),%r12
+	leaq	352(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	0(%rsp),%rax
+	leaq	0(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	160(%rsp),%rax
+	leaq	160(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	192(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+
+
+
+	xorq	%r11,%r11
+	addq	%r12,%r12
+	leaq	96(%rsp),%rsi
+	adcq	%r13,%r13
+	movq	%r12,%rax
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	movq	0(%rsi),%rax
+	cmovcq	%rbp,%r13
+	movq	8(%rsi),%rbp
+	cmovcq	%rcx,%r8
+	movq	16(%rsi),%rcx
+	cmovcq	%r10,%r9
+	movq	24(%rsi),%r10
+
+	call	__ecp_nistz256_subq
+
+	leaq	128(%rsp),%rbx
+	leaq	288(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	movq	192+0(%rsp),%rax
+	movq	192+8(%rsp),%rbp
+	movq	192+16(%rsp),%rcx
+	movq	192+24(%rsp),%r10
+	leaq	320(%rsp),%rdi
+
+	call	__ecp_nistz256_subq
+
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+	movq	128(%rsp),%rax
+	leaq	128(%rsp),%rbx
+	movq	0+224(%rsp),%r9
+	movq	8+224(%rsp),%r10
+	leaq	0+224(%rsp),%rsi
+	movq	16+224(%rsp),%r11
+	movq	24+224(%rsp),%r12
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	320(%rsp),%rax
+	leaq	320(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	0+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	320(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	256(%rsp),%rbx
+	leaq	320(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+.byte	102,72,15,126,199
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	352(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	352+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	544(%rsp),%xmm2
+	pand	544+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	448(%rsp),%xmm2
+	pand	448+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,64(%rdi)
+	movdqu	%xmm3,80(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	288(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	288+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	480(%rsp),%xmm2
+	pand	480+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	384(%rsp),%xmm2
+	pand	384+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,0(%rdi)
+	movdqu	%xmm3,16(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	320(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	320+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	512(%rsp),%xmm2
+	pand	512+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	416(%rsp),%xmm2
+	pand	416+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm3,48(%rdi)
+
+L$add_doneq:
+	leaq	576+56(%rsp),%rsi
+
+	movq	-48(%rsi),%r15
+
+	movq	-40(%rsi),%r14
+
+	movq	-32(%rsi),%r13
+
+	movq	-24(%rsi),%r12
+
+	movq	-16(%rsi),%rbx
+
+	movq	-8(%rsi),%rbp
+
+	leaq	(%rsi),%rsp
+
+L$point_addq_epilogue:
+	ret
+
+
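+// Mixed point addition: adds an affine point (implicit Z = 1 in Montgomery
+// form) to a Jacobian point.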
+.globl	_ecp_nistz256_point_add_affine
+.private_extern _ecp_nistz256_point_add_affine
+
+.p2align	5
+_ecp_nistz256_point_add_affine:
+
+_CET_ENDBR
+	leaq	_OPENSSL_ia32cap_P(%rip),%rcx
+	movq	8(%rcx),%rcx
+	andl	$0x80100,%ecx
+	cmpl	$0x80100,%ecx
+	je	L$point_add_affinex
+	pushq	%rbp
+
+	pushq	%rbx
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+	subq	$480+8,%rsp
+
+L$add_affineq_body:
+
+	movdqu	0(%rsi),%xmm0
+	movq	%rdx,%rbx
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm3
+	movdqu	64(%rsi),%xmm4
+	movdqu	80(%rsi),%xmm5
+	movq	64+0(%rsi),%rax
+	movq	64+8(%rsi),%r14
+	movq	64+16(%rsi),%r15
+	movq	64+24(%rsi),%r8
+	movdqa	%xmm0,320(%rsp)
+	movdqa	%xmm1,320+16(%rsp)
+	movdqa	%xmm2,352(%rsp)
+	movdqa	%xmm3,352+16(%rsp)
+	movdqa	%xmm4,384(%rsp)
+	movdqa	%xmm5,384+16(%rsp)
+	por	%xmm4,%xmm5
+
+	movdqu	0(%rbx),%xmm0
+	pshufd	$0xb1,%xmm5,%xmm3
+	movdqu	16(%rbx),%xmm1
+	movdqu	32(%rbx),%xmm2
+	por	%xmm3,%xmm5
+	movdqu	48(%rbx),%xmm3
+	movdqa	%xmm0,416(%rsp)
+	pshufd	$0x1e,%xmm5,%xmm4
+	movdqa	%xmm1,416+16(%rsp)
+	por	%xmm0,%xmm1
+.byte	102,72,15,110,199
+	movdqa	%xmm2,448(%rsp)
+	movdqa	%xmm3,448+16(%rsp)
+	por	%xmm2,%xmm3
+	por	%xmm4,%xmm5
+	pxor	%xmm4,%xmm4
+	por	%xmm1,%xmm3
+
+	leaq	64-0(%rsi),%rsi
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	pcmpeqd	%xmm4,%xmm5
+	pshufd	$0xb1,%xmm3,%xmm4
+	movq	0(%rbx),%rax
+
+	movq	%r12,%r9
+	por	%xmm3,%xmm4
+	pshufd	$0,%xmm5,%xmm5
+	pshufd	$0x1e,%xmm4,%xmm3
+	movq	%r13,%r10
+	por	%xmm3,%xmm4
+	pxor	%xmm3,%xmm3
+	movq	%r14,%r11
+	pcmpeqd	%xmm3,%xmm4
+	pshufd	$0,%xmm4,%xmm4
+
+	leaq	32-0(%rsp),%rsi
+	movq	%r15,%r12
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	320(%rsp),%rbx
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	movq	384(%rsp),%rax
+	leaq	384(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	384(%rsp),%rax
+	leaq	384(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	0+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	288(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	448(%rsp),%rax
+	leaq	448(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	352(%rsp),%rbx
+	leaq	96(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	movq	0+64(%rsp),%rax
+	movq	8+64(%rsp),%r14
+	leaq	0+64(%rsp),%rsi
+	movq	16+64(%rsp),%r15
+	movq	24+64(%rsp),%r8
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	0+96(%rsp),%rax
+	movq	8+96(%rsp),%r14
+	leaq	0+96(%rsp),%rsi
+	movq	16+96(%rsp),%r15
+	movq	24+96(%rsp),%r8
+	leaq	192(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	128(%rsp),%rax
+	leaq	128(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	0+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	160(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	320(%rsp),%rax
+	leaq	320(%rsp),%rbx
+	movq	0+128(%rsp),%r9
+	movq	8+128(%rsp),%r10
+	leaq	0+128(%rsp),%rsi
+	movq	16+128(%rsp),%r11
+	movq	24+128(%rsp),%r12
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+
+
+
+	xorq	%r11,%r11
+	addq	%r12,%r12
+	leaq	192(%rsp),%rsi
+	adcq	%r13,%r13
+	movq	%r12,%rax
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	movq	0(%rsi),%rax
+	cmovcq	%rbp,%r13
+	movq	8(%rsi),%rbp
+	cmovcq	%rcx,%r8
+	movq	16(%rsi),%rcx
+	cmovcq	%r10,%r9
+	movq	24(%rsi),%r10
+
+	call	__ecp_nistz256_subq
+
+	leaq	160(%rsp),%rbx
+	leaq	224(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	movq	0+0(%rsp),%rax
+	movq	0+8(%rsp),%rbp
+	movq	0+16(%rsp),%rcx
+	movq	0+24(%rsp),%r10
+	leaq	64(%rsp),%rdi
+
+	call	__ecp_nistz256_subq
+
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+	movq	352(%rsp),%rax
+	leaq	352(%rsp),%rbx
+	movq	0+160(%rsp),%r9
+	movq	8+160(%rsp),%r10
+	leaq	0+160(%rsp),%rsi
+	movq	16+160(%rsp),%r11
+	movq	24+160(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	96(%rsp),%rax
+	leaq	96(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	0+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	32(%rsp),%rbx
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+.byte	102,72,15,126,199
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	288(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	288+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	L$ONE_mont(%rip),%xmm2
+	pand	L$ONE_mont+16(%rip),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	384(%rsp),%xmm2
+	pand	384+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,64(%rdi)
+	movdqu	%xmm3,80(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	224(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	224+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	416(%rsp),%xmm2
+	pand	416+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	320(%rsp),%xmm2
+	pand	320+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,0(%rdi)
+	movdqu	%xmm3,16(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	256(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	256+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	448(%rsp),%xmm2
+	pand	448+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	352(%rsp),%xmm2
+	pand	352+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm3,48(%rdi)
+
+	leaq	480+56(%rsp),%rsi
+
+	movq	-48(%rsi),%r15
+
+	movq	-40(%rsi),%r14
+
+	movq	-32(%rsi),%r13
+
+	movq	-24(%rsi),%r12
+
+	movq	-16(%rsi),%rbx
+
+	movq	-8(%rsi),%rbp
+
+	leaq	(%rsi),%rsp
+
+L$add_affineq_epilogue:
+	ret
+
+
+
+.p2align	5
+__ecp_nistz256_add_tox:
+
+	xorq	%r11,%r11
+	adcq	0(%rbx),%r12
+	adcq	8(%rbx),%r13
+	movq	%r12,%rax
+	adcq	16(%rbx),%r8
+	adcq	24(%rbx),%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	xorq	%r10,%r10
+	sbbq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	cmovcq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovcq	%r10,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	ret
+
+
+
+
+.p2align	5
+__ecp_nistz256_sub_fromx:
+
+	xorq	%r11,%r11
+	sbbq	0(%rbx),%r12
+	sbbq	8(%rbx),%r13
+	movq	%r12,%rax
+	sbbq	16(%rbx),%r8
+	sbbq	24(%rbx),%r9
+	movq	%r13,%rbp
+	sbbq	$0,%r11
+
+	xorq	%r10,%r10
+	adcq	$-1,%r12
+	movq	%r8,%rcx
+	adcq	%r14,%r13
+	adcq	$0,%r8
+	movq	%r9,%r10
+	adcq	%r15,%r9
+
+	btq	$0,%r11
+	cmovncq	%rax,%r12
+	cmovncq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovncq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovncq	%r10,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	ret
+
+
+
+
+.p2align	5
+__ecp_nistz256_subx:
+
+	xorq	%r11,%r11
+	sbbq	%r12,%rax
+	sbbq	%r13,%rbp
+	movq	%rax,%r12
+	sbbq	%r8,%rcx
+	sbbq	%r9,%r10
+	movq	%rbp,%r13
+	sbbq	$0,%r11
+
+	xorq	%r9,%r9
+	adcq	$-1,%rax
+	movq	%rcx,%r8
+	adcq	%r14,%rbp
+	adcq	$0,%rcx
+	movq	%r10,%r9
+	adcq	%r15,%r10
+
+	btq	$0,%r11
+	cmovcq	%rax,%r12
+	cmovcq	%rbp,%r13
+	cmovcq	%rcx,%r8
+	cmovcq	%r10,%r9
+
+	ret
+
+
+
+
+.p2align	5
+__ecp_nistz256_mul_by_2x:
+
+	xorq	%r11,%r11
+	adcq	%r12,%r12
+	adcq	%r13,%r13
+	movq	%r12,%rax
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	xorq	%r10,%r10
+	sbbq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	cmovcq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovcq	%r10,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	ret
+
+
+
+.p2align	5
+ecp_nistz256_point_doublex:
+
+L$point_doublex:
+	pushq	%rbp
+
+	pushq	%rbx
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+	subq	$160+8,%rsp
+
+L$point_doublex_body:
+
+L$point_double_shortcutx:
+	movdqu	0(%rsi),%xmm0
+	movq	%rsi,%rbx
+	movdqu	16(%rsi),%xmm1
+	movq	32+0(%rsi),%r12
+	movq	32+8(%rsi),%r13
+	movq	32+16(%rsi),%r8
+	movq	32+24(%rsi),%r9
+	movq	L$poly+8(%rip),%r14
+	movq	L$poly+24(%rip),%r15
+	movdqa	%xmm0,96(%rsp)
+	movdqa	%xmm1,96+16(%rsp)
+	leaq	32(%rdi),%r10
+	leaq	64(%rdi),%r11
+.byte	102,72,15,110,199
+.byte	102,73,15,110,202
+.byte	102,73,15,110,211
+
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_by_2x
+
+	movq	64+0(%rsi),%rdx
+	movq	64+8(%rsi),%r14
+	movq	64+16(%rsi),%r15
+	movq	64+24(%rsi),%r8
+	leaq	64-128(%rsi),%rsi
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	0+0(%rsp),%rdx
+	movq	8+0(%rsp),%r14
+	leaq	-128+0(%rsp),%rsi
+	movq	16+0(%rsp),%r15
+	movq	24+0(%rsp),%r8
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	32(%rbx),%rdx
+	movq	64+0(%rbx),%r9
+	movq	64+8(%rbx),%r10
+	movq	64+16(%rbx),%r11
+	movq	64+24(%rbx),%r12
+	leaq	64-128(%rbx),%rsi
+	leaq	32(%rbx),%rbx
+.byte	102,72,15,126,215
+	call	__ecp_nistz256_mul_montx
+	call	__ecp_nistz256_mul_by_2x
+
+	movq	96+0(%rsp),%r12
+	movq	96+8(%rsp),%r13
+	leaq	64(%rsp),%rbx
+	movq	96+16(%rsp),%r8
+	movq	96+24(%rsp),%r9
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_add_tox
+
+	movq	96+0(%rsp),%r12
+	movq	96+8(%rsp),%r13
+	leaq	64(%rsp),%rbx
+	movq	96+16(%rsp),%r8
+	movq	96+24(%rsp),%r9
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	movq	0+0(%rsp),%rdx
+	movq	8+0(%rsp),%r14
+	leaq	-128+0(%rsp),%rsi
+	movq	16+0(%rsp),%r15
+	movq	24+0(%rsp),%r8
+.byte	102,72,15,126,207
+	call	__ecp_nistz256_sqr_montx
+	xorq	%r9,%r9
+	movq	%r12,%rax
+	addq	$-1,%r12
+	movq	%r13,%r10
+	adcq	%rsi,%r13
+	movq	%r14,%rcx
+	adcq	$0,%r14
+	movq	%r15,%r8
+	adcq	%rbp,%r15
+	adcq	$0,%r9
+	xorq	%rsi,%rsi
+	testq	$1,%rax
+
+	cmovzq	%rax,%r12
+	cmovzq	%r10,%r13
+	cmovzq	%rcx,%r14
+	cmovzq	%r8,%r15
+	cmovzq	%rsi,%r9
+
+	movq	%r13,%rax
+	shrq	$1,%r12
+	shlq	$63,%rax
+	movq	%r14,%r10
+	shrq	$1,%r13
+	orq	%rax,%r12
+	shlq	$63,%r10
+	movq	%r15,%rcx
+	shrq	$1,%r14
+	orq	%r10,%r13
+	shlq	$63,%rcx
+	movq	%r12,0(%rdi)
+	shrq	$1,%r15
+	movq	%r13,8(%rdi)
+	shlq	$63,%r9
+	orq	%rcx,%r14
+	orq	%r9,%r15
+	movq	%r14,16(%rdi)
+	movq	%r15,24(%rdi)
+	movq	64(%rsp),%rdx
+	leaq	64(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_mul_by_2x
+
+	leaq	32(%rsp),%rbx
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_add_tox
+
+	movq	96(%rsp),%rdx
+	leaq	96(%rsp),%rbx
+	movq	0+0(%rsp),%r9
+	movq	8+0(%rsp),%r10
+	leaq	-128+0(%rsp),%rsi
+	movq	16+0(%rsp),%r11
+	movq	24+0(%rsp),%r12
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_mul_by_2x
+
+	movq	0+32(%rsp),%rdx
+	movq	8+32(%rsp),%r14
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r15
+	movq	24+32(%rsp),%r8
+.byte	102,72,15,126,199
+	call	__ecp_nistz256_sqr_montx
+
+	leaq	128(%rsp),%rbx
+	movq	%r14,%r8
+	movq	%r15,%r9
+	movq	%rsi,%r14
+	movq	%rbp,%r15
+	call	__ecp_nistz256_sub_fromx
+
+	movq	0+0(%rsp),%rax
+	movq	0+8(%rsp),%rbp
+	movq	0+16(%rsp),%rcx
+	movq	0+24(%rsp),%r10
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_subx
+
+	movq	32(%rsp),%rdx
+	leaq	32(%rsp),%rbx
+	movq	%r12,%r14
+	xorl	%ecx,%ecx
+	movq	%r12,0+0(%rsp)
+	movq	%r13,%r10
+	movq	%r13,0+8(%rsp)
+	cmovzq	%r8,%r11
+	movq	%r8,0+16(%rsp)
+	leaq	0-128(%rsp),%rsi
+	cmovzq	%r9,%r12
+	movq	%r9,0+24(%rsp)
+	movq	%r14,%r9
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+.byte	102,72,15,126,203
+.byte	102,72,15,126,207
+	call	__ecp_nistz256_sub_fromx
+
+	leaq	160+56(%rsp),%rsi
+
+	movq	-48(%rsi),%r15
+
+	movq	-40(%rsi),%r14
+
+	movq	-32(%rsi),%r13
+
+	movq	-24(%rsi),%r12
+
+	movq	-16(%rsi),%rbx
+
+	movq	-8(%rsi),%rbp
+
+	leaq	(%rsi),%rsp
+
+L$point_doublex_epilogue:
+	ret
+
+
+
+.p2align	5
+ecp_nistz256_point_addx:
+
+L$point_addx:
+	pushq	%rbp
+
+	pushq	%rbx
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+	subq	$576+8,%rsp
+
+L$point_addx_body:
+
+	movdqu	0(%rsi),%xmm0
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm3
+	movdqu	64(%rsi),%xmm4
+	movdqu	80(%rsi),%xmm5
+	movq	%rsi,%rbx
+	movq	%rdx,%rsi
+	movdqa	%xmm0,384(%rsp)
+	movdqa	%xmm1,384+16(%rsp)
+	movdqa	%xmm2,416(%rsp)
+	movdqa	%xmm3,416+16(%rsp)
+	movdqa	%xmm4,448(%rsp)
+	movdqa	%xmm5,448+16(%rsp)
+	por	%xmm4,%xmm5
+
+	movdqu	0(%rsi),%xmm0
+	pshufd	$0xb1,%xmm5,%xmm3
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+	por	%xmm3,%xmm5
+	movdqu	48(%rsi),%xmm3
+	movq	64+0(%rsi),%rdx
+	movq	64+8(%rsi),%r14
+	movq	64+16(%rsi),%r15
+	movq	64+24(%rsi),%r8
+	movdqa	%xmm0,480(%rsp)
+	pshufd	$0x1e,%xmm5,%xmm4
+	movdqa	%xmm1,480+16(%rsp)
+	movdqu	64(%rsi),%xmm0
+	movdqu	80(%rsi),%xmm1
+	movdqa	%xmm2,512(%rsp)
+	movdqa	%xmm3,512+16(%rsp)
+	por	%xmm4,%xmm5
+	pxor	%xmm4,%xmm4
+	por	%xmm0,%xmm1
+.byte	102,72,15,110,199
+
+	leaq	64-128(%rsi),%rsi
+	movq	%rdx,544+0(%rsp)
+	movq	%r14,544+8(%rsp)
+	movq	%r15,544+16(%rsp)
+	movq	%r8,544+24(%rsp)
+	leaq	96(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	pcmpeqd	%xmm4,%xmm5
+	pshufd	$0xb1,%xmm1,%xmm4
+	por	%xmm1,%xmm4
+	pshufd	$0,%xmm5,%xmm5
+	pshufd	$0x1e,%xmm4,%xmm3
+	por	%xmm3,%xmm4
+	pxor	%xmm3,%xmm3
+	pcmpeqd	%xmm3,%xmm4
+	pshufd	$0,%xmm4,%xmm4
+	movq	64+0(%rbx),%rdx
+	movq	64+8(%rbx),%r14
+	movq	64+16(%rbx),%r15
+	movq	64+24(%rbx),%r8
+.byte	102,72,15,110,203
+
+	leaq	64-128(%rbx),%rsi
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	544(%rsp),%rdx
+	leaq	544(%rsp),%rbx
+	movq	0+96(%rsp),%r9
+	movq	8+96(%rsp),%r10
+	leaq	-128+96(%rsp),%rsi
+	movq	16+96(%rsp),%r11
+	movq	24+96(%rsp),%r12
+	leaq	224(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	448(%rsp),%rdx
+	leaq	448(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	416(%rsp),%rdx
+	leaq	416(%rsp),%rbx
+	movq	0+224(%rsp),%r9
+	movq	8+224(%rsp),%r10
+	leaq	-128+224(%rsp),%rsi
+	movq	16+224(%rsp),%r11
+	movq	24+224(%rsp),%r12
+	leaq	224(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	512(%rsp),%rdx
+	leaq	512(%rsp),%rbx
+	movq	0+256(%rsp),%r9
+	movq	8+256(%rsp),%r10
+	leaq	-128+256(%rsp),%rsi
+	movq	16+256(%rsp),%r11
+	movq	24+256(%rsp),%r12
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	224(%rsp),%rbx
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	orq	%r13,%r12
+	movdqa	%xmm4,%xmm2
+	orq	%r8,%r12
+	orq	%r9,%r12
+	por	%xmm5,%xmm2
+.byte	102,73,15,110,220
+
+	movq	384(%rsp),%rdx
+	leaq	384(%rsp),%rbx
+	movq	0+96(%rsp),%r9
+	movq	8+96(%rsp),%r10
+	leaq	-128+96(%rsp),%rsi
+	movq	16+96(%rsp),%r11
+	movq	24+96(%rsp),%r12
+	leaq	160(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	480(%rsp),%rdx
+	leaq	480(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	192(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	160(%rsp),%rbx
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	orq	%r13,%r12
+	orq	%r8,%r12
+	orq	%r9,%r12
+
+.byte	102,73,15,126,208
+.byte	102,73,15,126,217
+	orq	%r8,%r12
+.byte	0x3e
+	jnz	L$add_proceedx
+
+
+
+	testq	%r9,%r9
+	jz	L$add_doublex
+
+
+
+
+
+
+.byte	102,72,15,126,199
+	pxor	%xmm0,%xmm0
+	movdqu	%xmm0,0(%rdi)
+	movdqu	%xmm0,16(%rdi)
+	movdqu	%xmm0,32(%rdi)
+	movdqu	%xmm0,48(%rdi)
+	movdqu	%xmm0,64(%rdi)
+	movdqu	%xmm0,80(%rdi)
+	jmp	L$add_donex
+
+.p2align	5
+L$add_doublex:
+.byte	102,72,15,126,206
+.byte	102,72,15,126,199
+	addq	$416,%rsp
+
+	jmp	L$point_double_shortcutx
+
+
+.p2align	5
+L$add_proceedx:
+	movq	0+64(%rsp),%rdx
+	movq	8+64(%rsp),%r14
+	leaq	-128+64(%rsp),%rsi
+	movq	16+64(%rsp),%r15
+	movq	24+64(%rsp),%r8
+	leaq	96(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	448(%rsp),%rdx
+	leaq	448(%rsp),%rbx
+	movq	0+0(%rsp),%r9
+	movq	8+0(%rsp),%r10
+	leaq	-128+0(%rsp),%rsi
+	movq	16+0(%rsp),%r11
+	movq	24+0(%rsp),%r12
+	leaq	352(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	0+0(%rsp),%rdx
+	movq	8+0(%rsp),%r14
+	leaq	-128+0(%rsp),%rsi
+	movq	16+0(%rsp),%r15
+	movq	24+0(%rsp),%r8
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	544(%rsp),%rdx
+	leaq	544(%rsp),%rbx
+	movq	0+352(%rsp),%r9
+	movq	8+352(%rsp),%r10
+	leaq	-128+352(%rsp),%rsi
+	movq	16+352(%rsp),%r11
+	movq	24+352(%rsp),%r12
+	leaq	352(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	0(%rsp),%rdx
+	leaq	0(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	160(%rsp),%rdx
+	leaq	160(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	192(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+
+
+
+	xorq	%r11,%r11
+	addq	%r12,%r12
+	leaq	96(%rsp),%rsi
+	adcq	%r13,%r13
+	movq	%r12,%rax
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	movq	0(%rsi),%rax
+	cmovcq	%rbp,%r13
+	movq	8(%rsi),%rbp
+	cmovcq	%rcx,%r8
+	movq	16(%rsi),%rcx
+	cmovcq	%r10,%r9
+	movq	24(%rsi),%r10
+
+	call	__ecp_nistz256_subx
+
+	leaq	128(%rsp),%rbx
+	leaq	288(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	movq	192+0(%rsp),%rax
+	movq	192+8(%rsp),%rbp
+	movq	192+16(%rsp),%rcx
+	movq	192+24(%rsp),%r10
+	leaq	320(%rsp),%rdi
+
+	call	__ecp_nistz256_subx
+
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+	movq	128(%rsp),%rdx
+	leaq	128(%rsp),%rbx
+	movq	0+224(%rsp),%r9
+	movq	8+224(%rsp),%r10
+	leaq	-128+224(%rsp),%rsi
+	movq	16+224(%rsp),%r11
+	movq	24+224(%rsp),%r12
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	320(%rsp),%rdx
+	leaq	320(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	-128+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	320(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	256(%rsp),%rbx
+	leaq	320(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+.byte	102,72,15,126,199
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	352(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	352+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	544(%rsp),%xmm2
+	pand	544+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	448(%rsp),%xmm2
+	pand	448+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,64(%rdi)
+	movdqu	%xmm3,80(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	288(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	288+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	480(%rsp),%xmm2
+	pand	480+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	384(%rsp),%xmm2
+	pand	384+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,0(%rdi)
+	movdqu	%xmm3,16(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	320(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	320+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	512(%rsp),%xmm2
+	pand	512+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	416(%rsp),%xmm2
+	pand	416+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm3,48(%rdi)
+
+L$add_donex:
+	leaq	576+56(%rsp),%rsi
+
+	movq	-48(%rsi),%r15
+
+	movq	-40(%rsi),%r14
+
+	movq	-32(%rsi),%r13
+
+	movq	-24(%rsi),%r12
+
+	movq	-16(%rsi),%rbx
+
+	movq	-8(%rsi),%rbp
+
+	leaq	(%rsi),%rsp
+
+L$point_addx_epilogue:
+	ret
+
+
+
+.p2align	5
+ecp_nistz256_point_add_affinex:
+
+L$point_add_affinex:
+	pushq	%rbp
+
+	pushq	%rbx
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+	subq	$480+8,%rsp
+
+L$add_affinex_body:
+
+	movdqu	0(%rsi),%xmm0
+	movq	%rdx,%rbx
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm3
+	movdqu	64(%rsi),%xmm4
+	movdqu	80(%rsi),%xmm5
+	movq	64+0(%rsi),%rdx
+	movq	64+8(%rsi),%r14
+	movq	64+16(%rsi),%r15
+	movq	64+24(%rsi),%r8
+	movdqa	%xmm0,320(%rsp)
+	movdqa	%xmm1,320+16(%rsp)
+	movdqa	%xmm2,352(%rsp)
+	movdqa	%xmm3,352+16(%rsp)
+	movdqa	%xmm4,384(%rsp)
+	movdqa	%xmm5,384+16(%rsp)
+	por	%xmm4,%xmm5
+
+	movdqu	0(%rbx),%xmm0
+	pshufd	$0xb1,%xmm5,%xmm3
+	movdqu	16(%rbx),%xmm1
+	movdqu	32(%rbx),%xmm2
+	por	%xmm3,%xmm5
+	movdqu	48(%rbx),%xmm3
+	movdqa	%xmm0,416(%rsp)
+	pshufd	$0x1e,%xmm5,%xmm4
+	movdqa	%xmm1,416+16(%rsp)
+	por	%xmm0,%xmm1
+.byte	102,72,15,110,199
+	movdqa	%xmm2,448(%rsp)
+	movdqa	%xmm3,448+16(%rsp)
+	por	%xmm2,%xmm3
+	por	%xmm4,%xmm5
+	pxor	%xmm4,%xmm4
+	por	%xmm1,%xmm3
+
+	leaq	64-128(%rsi),%rsi
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	pcmpeqd	%xmm4,%xmm5
+	pshufd	$0xb1,%xmm3,%xmm4
+	movq	0(%rbx),%rdx
+
+	movq	%r12,%r9
+	por	%xmm3,%xmm4
+	pshufd	$0,%xmm5,%xmm5
+	pshufd	$0x1e,%xmm4,%xmm3
+	movq	%r13,%r10
+	por	%xmm3,%xmm4
+	pxor	%xmm3,%xmm3
+	movq	%r14,%r11
+	pcmpeqd	%xmm3,%xmm4
+	pshufd	$0,%xmm4,%xmm4
+
+	leaq	32-128(%rsp),%rsi
+	movq	%r15,%r12
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	320(%rsp),%rbx
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	movq	384(%rsp),%rdx
+	leaq	384(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	384(%rsp),%rdx
+	leaq	384(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	-128+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	288(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	448(%rsp),%rdx
+	leaq	448(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	352(%rsp),%rbx
+	leaq	96(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	movq	0+64(%rsp),%rdx
+	movq	8+64(%rsp),%r14
+	leaq	-128+64(%rsp),%rsi
+	movq	16+64(%rsp),%r15
+	movq	24+64(%rsp),%r8
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	0+96(%rsp),%rdx
+	movq	8+96(%rsp),%r14
+	leaq	-128+96(%rsp),%rsi
+	movq	16+96(%rsp),%r15
+	movq	24+96(%rsp),%r8
+	leaq	192(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	128(%rsp),%rdx
+	leaq	128(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	-128+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	160(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	320(%rsp),%rdx
+	leaq	320(%rsp),%rbx
+	movq	0+128(%rsp),%r9
+	movq	8+128(%rsp),%r10
+	leaq	-128+128(%rsp),%rsi
+	movq	16+128(%rsp),%r11
+	movq	24+128(%rsp),%r12
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+
+
+
+	xorq	%r11,%r11
+	addq	%r12,%r12
+	leaq	192(%rsp),%rsi
+	adcq	%r13,%r13
+	movq	%r12,%rax
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	movq	0(%rsi),%rax
+	cmovcq	%rbp,%r13
+	movq	8(%rsi),%rbp
+	cmovcq	%rcx,%r8
+	movq	16(%rsi),%rcx
+	cmovcq	%r10,%r9
+	movq	24(%rsi),%r10
+
+	call	__ecp_nistz256_subx
+
+	leaq	160(%rsp),%rbx
+	leaq	224(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	movq	0+0(%rsp),%rax
+	movq	0+8(%rsp),%rbp
+	movq	0+16(%rsp),%rcx
+	movq	0+24(%rsp),%r10
+	leaq	64(%rsp),%rdi
+
+	call	__ecp_nistz256_subx
+
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+	movq	352(%rsp),%rdx
+	leaq	352(%rsp),%rbx
+	movq	0+160(%rsp),%r9
+	movq	8+160(%rsp),%r10
+	leaq	-128+160(%rsp),%rsi
+	movq	16+160(%rsp),%r11
+	movq	24+160(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	96(%rsp),%rdx
+	leaq	96(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	-128+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	32(%rsp),%rbx
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+.byte	102,72,15,126,199
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	288(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	288+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	L$ONE_mont(%rip),%xmm2
+	pand	L$ONE_mont+16(%rip),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	384(%rsp),%xmm2
+	pand	384+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,64(%rdi)
+	movdqu	%xmm3,80(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	224(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	224+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	416(%rsp),%xmm2
+	pand	416+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	320(%rsp),%xmm2
+	pand	320+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,0(%rdi)
+	movdqu	%xmm3,16(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	256(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	256+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	448(%rsp),%xmm2
+	pand	448+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	352(%rsp),%xmm2
+	pand	352+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm3,48(%rdi)
+
+	leaq	480+56(%rsp),%rsi
+
+	movq	-48(%rsi),%r15
+
+	movq	-40(%rsi),%r14
+
+	movq	-32(%rsi),%r13
+
+	movq	-24(%rsi),%r12
+
+	movq	-16(%rsi),%rbx
+
+	movq	-8(%rsi),%rbp
+
+	leaq	(%rsi),%rsp
+
+L$add_affinex_epilogue:
+	ret
+
+
+#endif
diff --git a/gen/bcm/p256-x86_64-asm-linux.S b/gen/bcm/p256-x86_64-asm-linux.S
new file mode 100644
index 0000000..b285543
--- /dev/null
+++ b/gen/bcm/p256-x86_64-asm-linux.S
@@ -0,0 +1,4548 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text	
+.extern	OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+
+
+.section	.rodata
+.align	64
+.Lpoly:
+.quad	0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
+
+.LOne:
+.long	1,1,1,1,1,1,1,1
+.LTwo:
+.long	2,2,2,2,2,2,2,2
+.LThree:
+.long	3,3,3,3,3,3,3,3
+.LONE_mont:
+.quad	0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
+
+
+.Lord:
+.quad	0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
+.LordK:
+.quad	0xccd1c8aaee00bc4f
+.text	
+
+
+
+.globl	ecp_nistz256_neg
+.hidden ecp_nistz256_neg
+.type	ecp_nistz256_neg,@function
+.align	32
+ecp_nistz256_neg:
+.cfi_startproc	
+_CET_ENDBR
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-16
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-24
+.Lneg_body:
+
+	xorq	%r8,%r8
+	xorq	%r9,%r9
+	xorq	%r10,%r10
+	xorq	%r11,%r11
+	xorq	%r13,%r13
+
+	subq	0(%rsi),%r8
+	sbbq	8(%rsi),%r9
+	sbbq	16(%rsi),%r10
+	movq	%r8,%rax
+	sbbq	24(%rsi),%r11
+	leaq	.Lpoly(%rip),%rsi
+	movq	%r9,%rdx
+	sbbq	$0,%r13
+
+	addq	0(%rsi),%r8
+	movq	%r10,%rcx
+	adcq	8(%rsi),%r9
+	adcq	16(%rsi),%r10
+	movq	%r11,%r12
+	adcq	24(%rsi),%r11
+	testq	%r13,%r13
+
+	cmovzq	%rax,%r8
+	cmovzq	%rdx,%r9
+	movq	%r8,0(%rdi)
+	cmovzq	%rcx,%r10
+	movq	%r9,8(%rdi)
+	cmovzq	%r12,%r11
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+
+	movq	0(%rsp),%r13
+.cfi_restore	%r13
+	movq	8(%rsp),%r12
+.cfi_restore	%r12
+	leaq	16(%rsp),%rsp
+.cfi_adjust_cfa_offset	-16
+.Lneg_epilogue:
+	ret
+.cfi_endproc	
+.size	ecp_nistz256_neg,.-ecp_nistz256_neg
+
+
+
+
+
+
+.globl	ecp_nistz256_ord_mul_mont
+.hidden ecp_nistz256_ord_mul_mont
+.type	ecp_nistz256_ord_mul_mont,@function
+.align	32
+ecp_nistz256_ord_mul_mont:
+.cfi_startproc	
+_CET_ENDBR
+	leaq	OPENSSL_ia32cap_P(%rip),%rcx
+	movq	8(%rcx),%rcx
+	andl	$0x80100,%ecx
+	cmpl	$0x80100,%ecx
+	je	.Lecp_nistz256_ord_mul_montx
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+.Lord_mul_body:
+
+	movq	0(%rdx),%rax
+	movq	%rdx,%rbx
+	leaq	.Lord(%rip),%r14
+	movq	.LordK(%rip),%r15
+
+
+	movq	%rax,%rcx
+	mulq	0(%rsi)
+	movq	%rax,%r8
+	movq	%rcx,%rax
+	movq	%rdx,%r9
+
+	mulq	8(%rsi)
+	addq	%rax,%r9
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	16(%rsi)
+	addq	%rax,%r10
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+
+	movq	%r8,%r13
+	imulq	%r15,%r8
+
+	movq	%rdx,%r11
+	mulq	24(%rsi)
+	addq	%rax,%r11
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+
+
+	mulq	0(%r14)
+	movq	%r8,%rbp
+	addq	%rax,%r13
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	subq	%r8,%r10
+	sbbq	$0,%r8
+
+	mulq	8(%r14)
+	addq	%rcx,%r9
+	adcq	$0,%rdx
+	addq	%rax,%r9
+	movq	%rbp,%rax
+	adcq	%rdx,%r10
+	movq	%rbp,%rdx
+	adcq	$0,%r8
+
+	shlq	$32,%rax
+	shrq	$32,%rdx
+	subq	%rax,%r11
+	movq	8(%rbx),%rax
+	sbbq	%rdx,%rbp
+
+	addq	%r8,%r11
+	adcq	%rbp,%r12
+	adcq	$0,%r13
+
+
+	movq	%rax,%rcx
+	mulq	0(%rsi)
+	addq	%rax,%r9
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	8(%rsi)
+	addq	%rbp,%r10
+	adcq	$0,%rdx
+	addq	%rax,%r10
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	16(%rsi)
+	addq	%rbp,%r11
+	adcq	$0,%rdx
+	addq	%rax,%r11
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+
+	movq	%r9,%rcx
+	imulq	%r15,%r9
+
+	movq	%rdx,%rbp
+	mulq	24(%rsi)
+	addq	%rbp,%r12
+	adcq	$0,%rdx
+	xorq	%r8,%r8
+	addq	%rax,%r12
+	movq	%r9,%rax
+	adcq	%rdx,%r13
+	adcq	$0,%r8
+
+
+	mulq	0(%r14)
+	movq	%r9,%rbp
+	addq	%rax,%rcx
+	movq	%r9,%rax
+	adcq	%rdx,%rcx
+
+	subq	%r9,%r11
+	sbbq	$0,%r9
+
+	mulq	8(%r14)
+	addq	%rcx,%r10
+	adcq	$0,%rdx
+	addq	%rax,%r10
+	movq	%rbp,%rax
+	adcq	%rdx,%r11
+	movq	%rbp,%rdx
+	adcq	$0,%r9
+
+	shlq	$32,%rax
+	shrq	$32,%rdx
+	subq	%rax,%r12
+	movq	16(%rbx),%rax
+	sbbq	%rdx,%rbp
+
+	addq	%r9,%r12
+	adcq	%rbp,%r13
+	adcq	$0,%r8
+
+
+	movq	%rax,%rcx
+	mulq	0(%rsi)
+	addq	%rax,%r10
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	8(%rsi)
+	addq	%rbp,%r11
+	adcq	$0,%rdx
+	addq	%rax,%r11
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	16(%rsi)
+	addq	%rbp,%r12
+	adcq	$0,%rdx
+	addq	%rax,%r12
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+
+	movq	%r10,%rcx
+	imulq	%r15,%r10
+
+	movq	%rdx,%rbp
+	mulq	24(%rsi)
+	addq	%rbp,%r13
+	adcq	$0,%rdx
+	xorq	%r9,%r9
+	addq	%rax,%r13
+	movq	%r10,%rax
+	adcq	%rdx,%r8
+	adcq	$0,%r9
+
+
+	mulq	0(%r14)
+	movq	%r10,%rbp
+	addq	%rax,%rcx
+	movq	%r10,%rax
+	adcq	%rdx,%rcx
+
+	subq	%r10,%r12
+	sbbq	$0,%r10
+
+	mulq	8(%r14)
+	addq	%rcx,%r11
+	adcq	$0,%rdx
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	adcq	%rdx,%r12
+	movq	%rbp,%rdx
+	adcq	$0,%r10
+
+	shlq	$32,%rax
+	shrq	$32,%rdx
+	subq	%rax,%r13
+	movq	24(%rbx),%rax
+	sbbq	%rdx,%rbp
+
+	addq	%r10,%r13
+	adcq	%rbp,%r8
+	adcq	$0,%r9
+
+
+	movq	%rax,%rcx
+	mulq	0(%rsi)
+	addq	%rax,%r11
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	8(%rsi)
+	addq	%rbp,%r12
+	adcq	$0,%rdx
+	addq	%rax,%r12
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	16(%rsi)
+	addq	%rbp,%r13
+	adcq	$0,%rdx
+	addq	%rax,%r13
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+
+	movq	%r11,%rcx
+	imulq	%r15,%r11
+
+	movq	%rdx,%rbp
+	mulq	24(%rsi)
+	addq	%rbp,%r8
+	adcq	$0,%rdx
+	xorq	%r10,%r10
+	addq	%rax,%r8
+	movq	%r11,%rax
+	adcq	%rdx,%r9
+	adcq	$0,%r10
+
+
+	mulq	0(%r14)
+	movq	%r11,%rbp
+	addq	%rax,%rcx
+	movq	%r11,%rax
+	adcq	%rdx,%rcx
+
+	subq	%r11,%r13
+	sbbq	$0,%r11
+
+	mulq	8(%r14)
+	addq	%rcx,%r12
+	adcq	$0,%rdx
+	addq	%rax,%r12
+	movq	%rbp,%rax
+	adcq	%rdx,%r13
+	movq	%rbp,%rdx
+	adcq	$0,%r11
+
+	shlq	$32,%rax
+	shrq	$32,%rdx
+	subq	%rax,%r8
+	sbbq	%rdx,%rbp
+
+	addq	%r11,%r8
+	adcq	%rbp,%r9
+	adcq	$0,%r10
+
+
+	movq	%r12,%rsi
+	subq	0(%r14),%r12
+	movq	%r13,%r11
+	sbbq	8(%r14),%r13
+	movq	%r8,%rcx
+	sbbq	16(%r14),%r8
+	movq	%r9,%rbp
+	sbbq	24(%r14),%r9
+	sbbq	$0,%r10
+
+	cmovcq	%rsi,%r12
+	cmovcq	%r11,%r13
+	cmovcq	%rcx,%r8
+	cmovcq	%rbp,%r9
+
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	movq	0(%rsp),%r15
+.cfi_restore	%r15
+	movq	8(%rsp),%r14
+.cfi_restore	%r14
+	movq	16(%rsp),%r13
+.cfi_restore	%r13
+	movq	24(%rsp),%r12
+.cfi_restore	%r12
+	movq	32(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	40(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lord_mul_epilogue:
+	ret
+.cfi_endproc	
+.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
+
+
+
+
+
+
+
+.globl	ecp_nistz256_ord_sqr_mont
+.hidden ecp_nistz256_ord_sqr_mont
+.type	ecp_nistz256_ord_sqr_mont,@function
+.align	32
+ecp_nistz256_ord_sqr_mont:
+.cfi_startproc	
+_CET_ENDBR
+	leaq	OPENSSL_ia32cap_P(%rip),%rcx
+	movq	8(%rcx),%rcx
+	andl	$0x80100,%ecx
+	cmpl	$0x80100,%ecx
+	je	.Lecp_nistz256_ord_sqr_montx
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+.Lord_sqr_body:
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%rax
+	movq	16(%rsi),%r14
+	movq	24(%rsi),%r15
+	leaq	.Lord(%rip),%rsi
+	movq	%rdx,%rbx
+	jmp	.Loop_ord_sqr
+
+.align	32
+.Loop_ord_sqr:
+
+	movq	%rax,%rbp
+	mulq	%r8
+	movq	%rax,%r9
+.byte	102,72,15,110,205
+	movq	%r14,%rax
+	movq	%rdx,%r10
+
+	mulq	%r8
+	addq	%rax,%r10
+	movq	%r15,%rax
+.byte	102,73,15,110,214
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%r8
+	addq	%rax,%r11
+	movq	%r15,%rax
+.byte	102,73,15,110,223
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+
+
+	mulq	%r14
+	movq	%rax,%r13
+	movq	%r14,%rax
+	movq	%rdx,%r14
+
+
+	mulq	%rbp
+	addq	%rax,%r11
+	movq	%r15,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+
+	mulq	%rbp
+	addq	%rax,%r12
+	adcq	$0,%rdx
+
+	addq	%r15,%r12
+	adcq	%rdx,%r13
+	adcq	$0,%r14
+
+
+	xorq	%r15,%r15
+	movq	%r8,%rax
+	addq	%r9,%r9
+	adcq	%r10,%r10
+	adcq	%r11,%r11
+	adcq	%r12,%r12
+	adcq	%r13,%r13
+	adcq	%r14,%r14
+	adcq	$0,%r15
+
+
+	mulq	%rax
+	movq	%rax,%r8
+.byte	102,72,15,126,200
+	movq	%rdx,%rbp
+
+	mulq	%rax
+	addq	%rbp,%r9
+	adcq	%rax,%r10
+.byte	102,72,15,126,208
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	%rax
+	addq	%rbp,%r11
+	adcq	%rax,%r12
+.byte	102,72,15,126,216
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	movq	%r8,%rcx
+	imulq	32(%rsi),%r8
+
+	mulq	%rax
+	addq	%rbp,%r13
+	adcq	%rax,%r14
+	movq	0(%rsi),%rax
+	adcq	%rdx,%r15
+
+
+	mulq	%r8
+	movq	%r8,%rbp
+	addq	%rax,%rcx
+	movq	8(%rsi),%rax
+	adcq	%rdx,%rcx
+
+	subq	%r8,%r10
+	sbbq	$0,%rbp
+
+	mulq	%r8
+	addq	%rcx,%r9
+	adcq	$0,%rdx
+	addq	%rax,%r9
+	movq	%r8,%rax
+	adcq	%rdx,%r10
+	movq	%r8,%rdx
+	adcq	$0,%rbp
+
+	movq	%r9,%rcx
+	imulq	32(%rsi),%r9
+
+	shlq	$32,%rax
+	shrq	$32,%rdx
+	subq	%rax,%r11
+	movq	0(%rsi),%rax
+	sbbq	%rdx,%r8
+
+	addq	%rbp,%r11
+	adcq	$0,%r8
+
+
+	mulq	%r9
+	movq	%r9,%rbp
+	addq	%rax,%rcx
+	movq	8(%rsi),%rax
+	adcq	%rdx,%rcx
+
+	subq	%r9,%r11
+	sbbq	$0,%rbp
+
+	mulq	%r9
+	addq	%rcx,%r10
+	adcq	$0,%rdx
+	addq	%rax,%r10
+	movq	%r9,%rax
+	adcq	%rdx,%r11
+	movq	%r9,%rdx
+	adcq	$0,%rbp
+
+	movq	%r10,%rcx
+	imulq	32(%rsi),%r10
+
+	shlq	$32,%rax
+	shrq	$32,%rdx
+	subq	%rax,%r8
+	movq	0(%rsi),%rax
+	sbbq	%rdx,%r9
+
+	addq	%rbp,%r8
+	adcq	$0,%r9
+
+
+	mulq	%r10
+	movq	%r10,%rbp
+	addq	%rax,%rcx
+	movq	8(%rsi),%rax
+	adcq	%rdx,%rcx
+
+	subq	%r10,%r8
+	sbbq	$0,%rbp
+
+	mulq	%r10
+	addq	%rcx,%r11
+	adcq	$0,%rdx
+	addq	%rax,%r11
+	movq	%r10,%rax
+	adcq	%rdx,%r8
+	movq	%r10,%rdx
+	adcq	$0,%rbp
+
+	movq	%r11,%rcx
+	imulq	32(%rsi),%r11
+
+	shlq	$32,%rax
+	shrq	$32,%rdx
+	subq	%rax,%r9
+	movq	0(%rsi),%rax
+	sbbq	%rdx,%r10
+
+	addq	%rbp,%r9
+	adcq	$0,%r10
+
+
+	mulq	%r11
+	movq	%r11,%rbp
+	addq	%rax,%rcx
+	movq	8(%rsi),%rax
+	adcq	%rdx,%rcx
+
+	subq	%r11,%r9
+	sbbq	$0,%rbp
+
+	mulq	%r11
+	addq	%rcx,%r8
+	adcq	$0,%rdx
+	addq	%rax,%r8
+	movq	%r11,%rax
+	adcq	%rdx,%r9
+	movq	%r11,%rdx
+	adcq	$0,%rbp
+
+	shlq	$32,%rax
+	shrq	$32,%rdx
+	subq	%rax,%r10
+	sbbq	%rdx,%r11
+
+	addq	%rbp,%r10
+	adcq	$0,%r11
+
+
+	xorq	%rdx,%rdx
+	addq	%r12,%r8
+	adcq	%r13,%r9
+	movq	%r8,%r12
+	adcq	%r14,%r10
+	adcq	%r15,%r11
+	movq	%r9,%rax
+	adcq	$0,%rdx
+
+
+	subq	0(%rsi),%r8
+	movq	%r10,%r14
+	sbbq	8(%rsi),%r9
+	sbbq	16(%rsi),%r10
+	movq	%r11,%r15
+	sbbq	24(%rsi),%r11
+	sbbq	$0,%rdx
+
+	cmovcq	%r12,%r8
+	cmovncq	%r9,%rax
+	cmovncq	%r10,%r14
+	cmovncq	%r11,%r15
+
+	decq	%rbx
+	jnz	.Loop_ord_sqr
+
+	movq	%r8,0(%rdi)
+	movq	%rax,8(%rdi)
+	pxor	%xmm1,%xmm1
+	movq	%r14,16(%rdi)
+	pxor	%xmm2,%xmm2
+	movq	%r15,24(%rdi)
+	pxor	%xmm3,%xmm3
+
+	movq	0(%rsp),%r15
+.cfi_restore	%r15
+	movq	8(%rsp),%r14
+.cfi_restore	%r14
+	movq	16(%rsp),%r13
+.cfi_restore	%r13
+	movq	24(%rsp),%r12
+.cfi_restore	%r12
+	movq	32(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	40(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lord_sqr_epilogue:
+	ret
+.cfi_endproc	
+.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
+
+.type	ecp_nistz256_ord_mul_montx,@function
+.align	32
+ecp_nistz256_ord_mul_montx:
+.cfi_startproc	
+.Lecp_nistz256_ord_mul_montx:
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+.Lord_mulx_body:
+
+	movq	%rdx,%rbx
+	movq	0(%rdx),%rdx
+	movq	0(%rsi),%r9
+	movq	8(%rsi),%r10
+	movq	16(%rsi),%r11
+	movq	24(%rsi),%r12
+	leaq	-128(%rsi),%rsi
+	leaq	.Lord-128(%rip),%r14
+	movq	.LordK(%rip),%r15
+
+
+	mulxq	%r9,%r8,%r9
+	mulxq	%r10,%rcx,%r10
+	mulxq	%r11,%rbp,%r11
+	addq	%rcx,%r9
+	mulxq	%r12,%rcx,%r12
+	movq	%r8,%rdx
+	mulxq	%r15,%rdx,%rax
+	adcq	%rbp,%r10
+	adcq	%rcx,%r11
+	adcq	$0,%r12
+
+
+	xorq	%r13,%r13
+	mulxq	0+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r8
+	adoxq	%rbp,%r9
+
+	mulxq	8+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r9
+	adoxq	%rbp,%r10
+
+	mulxq	16+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	24+128(%r14),%rcx,%rbp
+	movq	8(%rbx),%rdx
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+	adcxq	%r8,%r12
+	adoxq	%r8,%r13
+	adcq	$0,%r13
+
+
+	mulxq	0+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r9
+	adoxq	%rbp,%r10
+
+	mulxq	8+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	16+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	24+128(%rsi),%rcx,%rbp
+	movq	%r9,%rdx
+	mulxq	%r15,%rdx,%rax
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+
+	adcxq	%r8,%r13
+	adoxq	%r8,%r8
+	adcq	$0,%r8
+
+
+	mulxq	0+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r9
+	adoxq	%rbp,%r10
+
+	mulxq	8+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	16+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	24+128(%r14),%rcx,%rbp
+	movq	16(%rbx),%rdx
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+	adcxq	%r9,%r13
+	adoxq	%r9,%r8
+	adcq	$0,%r8
+
+
+	mulxq	0+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	8+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	16+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	24+128(%rsi),%rcx,%rbp
+	movq	%r10,%rdx
+	mulxq	%r15,%rdx,%rax
+	adcxq	%rcx,%r13
+	adoxq	%rbp,%r8
+
+	adcxq	%r9,%r8
+	adoxq	%r9,%r9
+	adcq	$0,%r9
+
+
+	mulxq	0+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	8+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	16+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	24+128(%r14),%rcx,%rbp
+	movq	24(%rbx),%rdx
+	adcxq	%rcx,%r13
+	adoxq	%rbp,%r8
+	adcxq	%r10,%r8
+	adoxq	%r10,%r9
+	adcq	$0,%r9
+
+
+	mulxq	0+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	8+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	16+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r13
+	adoxq	%rbp,%r8
+
+	mulxq	24+128(%rsi),%rcx,%rbp
+	movq	%r11,%rdx
+	mulxq	%r15,%rdx,%rax
+	adcxq	%rcx,%r8
+	adoxq	%rbp,%r9
+
+	adcxq	%r10,%r9
+	adoxq	%r10,%r10
+	adcq	$0,%r10
+
+
+	mulxq	0+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	8+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	16+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r13
+	adoxq	%rbp,%r8
+
+	mulxq	24+128(%r14),%rcx,%rbp
+	leaq	128(%r14),%r14
+	movq	%r12,%rbx
+	adcxq	%rcx,%r8
+	adoxq	%rbp,%r9
+	movq	%r13,%rdx
+	adcxq	%r11,%r9
+	adoxq	%r11,%r10
+	adcq	$0,%r10
+
+
+
+	movq	%r8,%rcx
+	subq	0(%r14),%r12
+	sbbq	8(%r14),%r13
+	sbbq	16(%r14),%r8
+	movq	%r9,%rbp
+	sbbq	24(%r14),%r9
+	sbbq	$0,%r10
+
+	cmovcq	%rbx,%r12
+	cmovcq	%rdx,%r13
+	cmovcq	%rcx,%r8
+	cmovcq	%rbp,%r9
+
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	movq	0(%rsp),%r15
+.cfi_restore	%r15
+	movq	8(%rsp),%r14
+.cfi_restore	%r14
+	movq	16(%rsp),%r13
+.cfi_restore	%r13
+	movq	24(%rsp),%r12
+.cfi_restore	%r12
+	movq	32(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	40(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lord_mulx_epilogue:
+	ret
+.cfi_endproc	
+.size	ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
+
+.type	ecp_nistz256_ord_sqr_montx,@function
+.align	32
+ecp_nistz256_ord_sqr_montx:
+.cfi_startproc	
+.Lecp_nistz256_ord_sqr_montx:
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+.Lord_sqrx_body:
+
+	movq	%rdx,%rbx
+	movq	0(%rsi),%rdx
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r15
+	movq	24(%rsi),%r8
+	leaq	.Lord(%rip),%rsi
+	jmp	.Loop_ord_sqrx
+
+.align	32
+.Loop_ord_sqrx:
+	mulxq	%r14,%r9,%r10
+	mulxq	%r15,%rcx,%r11
+	movq	%rdx,%rax
+.byte	102,73,15,110,206
+	mulxq	%r8,%rbp,%r12
+	movq	%r14,%rdx
+	addq	%rcx,%r10
+.byte	102,73,15,110,215
+	adcq	%rbp,%r11
+	adcq	$0,%r12
+	xorq	%r13,%r13
+
+	mulxq	%r15,%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	%r8,%rcx,%rbp
+	movq	%r15,%rdx
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+	adcq	$0,%r13
+
+	mulxq	%r8,%rcx,%r14
+	movq	%rax,%rdx
+.byte	102,73,15,110,216
+	xorq	%r15,%r15
+	adcxq	%r9,%r9
+	adoxq	%rcx,%r13
+	adcxq	%r10,%r10
+	adoxq	%r15,%r14
+
+
+	mulxq	%rdx,%r8,%rbp
+.byte	102,72,15,126,202
+	adcxq	%r11,%r11
+	adoxq	%rbp,%r9
+	adcxq	%r12,%r12
+	mulxq	%rdx,%rcx,%rax
+.byte	102,72,15,126,210
+	adcxq	%r13,%r13
+	adoxq	%rcx,%r10
+	adcxq	%r14,%r14
+	mulxq	%rdx,%rcx,%rbp
+.byte	0x67
+.byte	102,72,15,126,218
+	adoxq	%rax,%r11
+	adcxq	%r15,%r15
+	adoxq	%rcx,%r12
+	adoxq	%rbp,%r13
+	mulxq	%rdx,%rcx,%rax
+	adoxq	%rcx,%r14
+	adoxq	%rax,%r15
+
+
+	movq	%r8,%rdx
+	mulxq	32(%rsi),%rdx,%rcx
+
+	xorq	%rax,%rax
+	mulxq	0(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r8
+	adoxq	%rbp,%r9
+	mulxq	8(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r9
+	adoxq	%rbp,%r10
+	mulxq	16(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+	mulxq	24(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r8
+	adcxq	%rax,%r8
+
+
+	movq	%r9,%rdx
+	mulxq	32(%rsi),%rdx,%rcx
+
+	mulxq	0(%rsi),%rcx,%rbp
+	adoxq	%rcx,%r9
+	adcxq	%rbp,%r10
+	mulxq	8(%rsi),%rcx,%rbp
+	adoxq	%rcx,%r10
+	adcxq	%rbp,%r11
+	mulxq	16(%rsi),%rcx,%rbp
+	adoxq	%rcx,%r11
+	adcxq	%rbp,%r8
+	mulxq	24(%rsi),%rcx,%rbp
+	adoxq	%rcx,%r8
+	adcxq	%rbp,%r9
+	adoxq	%rax,%r9
+
+
+	movq	%r10,%rdx
+	mulxq	32(%rsi),%rdx,%rcx
+
+	mulxq	0(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+	mulxq	8(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r8
+	mulxq	16(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r8
+	adoxq	%rbp,%r9
+	mulxq	24(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r9
+	adoxq	%rbp,%r10
+	adcxq	%rax,%r10
+
+
+	movq	%r11,%rdx
+	mulxq	32(%rsi),%rdx,%rcx
+
+	mulxq	0(%rsi),%rcx,%rbp
+	adoxq	%rcx,%r11
+	adcxq	%rbp,%r8
+	mulxq	8(%rsi),%rcx,%rbp
+	adoxq	%rcx,%r8
+	adcxq	%rbp,%r9
+	mulxq	16(%rsi),%rcx,%rbp
+	adoxq	%rcx,%r9
+	adcxq	%rbp,%r10
+	mulxq	24(%rsi),%rcx,%rbp
+	adoxq	%rcx,%r10
+	adcxq	%rbp,%r11
+	adoxq	%rax,%r11
+
+
+	addq	%r8,%r12
+	adcq	%r13,%r9
+	movq	%r12,%rdx
+	adcq	%r14,%r10
+	adcq	%r15,%r11
+	movq	%r9,%r14
+	adcq	$0,%rax
+
+
+	subq	0(%rsi),%r12
+	movq	%r10,%r15
+	sbbq	8(%rsi),%r9
+	sbbq	16(%rsi),%r10
+	movq	%r11,%r8
+	sbbq	24(%rsi),%r11
+	sbbq	$0,%rax
+
+	cmovncq	%r12,%rdx
+	cmovncq	%r9,%r14
+	cmovncq	%r10,%r15
+	cmovncq	%r11,%r8
+
+	decq	%rbx
+	jnz	.Loop_ord_sqrx
+
+	movq	%rdx,0(%rdi)
+	movq	%r14,8(%rdi)
+	pxor	%xmm1,%xmm1
+	movq	%r15,16(%rdi)
+	pxor	%xmm2,%xmm2
+	movq	%r8,24(%rdi)
+	pxor	%xmm3,%xmm3
+
+	movq	0(%rsp),%r15
+.cfi_restore	%r15
+	movq	8(%rsp),%r14
+.cfi_restore	%r14
+	movq	16(%rsp),%r13
+.cfi_restore	%r13
+	movq	24(%rsp),%r12
+.cfi_restore	%r12
+	movq	32(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	40(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lord_sqrx_epilogue:
+	ret
+.cfi_endproc	
+.size	ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
+
+
+
+
+
+
+.globl	ecp_nistz256_mul_mont
+.hidden ecp_nistz256_mul_mont
+.type	ecp_nistz256_mul_mont,@function
+.align	32
+ecp_nistz256_mul_mont:
+.cfi_startproc	
+_CET_ENDBR
+	leaq	OPENSSL_ia32cap_P(%rip),%rcx
+	movq	8(%rcx),%rcx
+	andl	$0x80100,%ecx
+.Lmul_mont:
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+.Lmul_body:
+	cmpl	$0x80100,%ecx
+	je	.Lmul_montx
+	movq	%rdx,%rbx
+	movq	0(%rdx),%rax
+	movq	0(%rsi),%r9
+	movq	8(%rsi),%r10
+	movq	16(%rsi),%r11
+	movq	24(%rsi),%r12
+
+	call	__ecp_nistz256_mul_montq
+	jmp	.Lmul_mont_done
+
+.align	32
+.Lmul_montx:
+	movq	%rdx,%rbx
+	movq	0(%rdx),%rdx
+	movq	0(%rsi),%r9
+	movq	8(%rsi),%r10
+	movq	16(%rsi),%r11
+	movq	24(%rsi),%r12
+	leaq	-128(%rsi),%rsi
+
+	call	__ecp_nistz256_mul_montx
+.Lmul_mont_done:
+	movq	0(%rsp),%r15
+.cfi_restore	%r15
+	movq	8(%rsp),%r14
+.cfi_restore	%r14
+	movq	16(%rsp),%r13
+.cfi_restore	%r13
+	movq	24(%rsp),%r12
+.cfi_restore	%r12
+	movq	32(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	40(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lmul_epilogue:
+	ret
+.cfi_endproc	
+.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
+
+.type	__ecp_nistz256_mul_montq,@function
+.align	32
+__ecp_nistz256_mul_montq:
+.cfi_startproc	
+
+
+	movq	%rax,%rbp
+	mulq	%r9
+	movq	.Lpoly+8(%rip),%r14
+	movq	%rax,%r8
+	movq	%rbp,%rax
+	movq	%rdx,%r9
+
+	mulq	%r10
+	movq	.Lpoly+24(%rip),%r15
+	addq	%rax,%r9
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%r11
+	addq	%rax,%r10
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%r12
+	addq	%rax,%r11
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	xorq	%r13,%r13
+	movq	%rdx,%r12
+
+
+
+
+
+
+
+
+
+
+	movq	%r8,%rbp
+	shlq	$32,%r8
+	mulq	%r15
+	shrq	$32,%rbp
+	addq	%r8,%r9
+	adcq	%rbp,%r10
+	adcq	%rax,%r11
+	movq	8(%rbx),%rax
+	adcq	%rdx,%r12
+	adcq	$0,%r13
+	xorq	%r8,%r8
+
+
+
+	movq	%rax,%rbp
+	mulq	0(%rsi)
+	addq	%rax,%r9
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	8(%rsi)
+	addq	%rcx,%r10
+	adcq	$0,%rdx
+	addq	%rax,%r10
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	16(%rsi)
+	addq	%rcx,%r11
+	adcq	$0,%rdx
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	24(%rsi)
+	addq	%rcx,%r12
+	adcq	$0,%rdx
+	addq	%rax,%r12
+	movq	%r9,%rax
+	adcq	%rdx,%r13
+	adcq	$0,%r8
+
+
+
+	movq	%r9,%rbp
+	shlq	$32,%r9
+	mulq	%r15
+	shrq	$32,%rbp
+	addq	%r9,%r10
+	adcq	%rbp,%r11
+	adcq	%rax,%r12
+	movq	16(%rbx),%rax
+	adcq	%rdx,%r13
+	adcq	$0,%r8
+	xorq	%r9,%r9
+
+
+
+	movq	%rax,%rbp
+	mulq	0(%rsi)
+	addq	%rax,%r10
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	8(%rsi)
+	addq	%rcx,%r11
+	adcq	$0,%rdx
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	16(%rsi)
+	addq	%rcx,%r12
+	adcq	$0,%rdx
+	addq	%rax,%r12
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	24(%rsi)
+	addq	%rcx,%r13
+	adcq	$0,%rdx
+	addq	%rax,%r13
+	movq	%r10,%rax
+	adcq	%rdx,%r8
+	adcq	$0,%r9
+
+
+
+	movq	%r10,%rbp
+	shlq	$32,%r10
+	mulq	%r15
+	shrq	$32,%rbp
+	addq	%r10,%r11
+	adcq	%rbp,%r12
+	adcq	%rax,%r13
+	movq	24(%rbx),%rax
+	adcq	%rdx,%r8
+	adcq	$0,%r9
+	xorq	%r10,%r10
+
+
+
+	movq	%rax,%rbp
+	mulq	0(%rsi)
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	8(%rsi)
+	addq	%rcx,%r12
+	adcq	$0,%rdx
+	addq	%rax,%r12
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	16(%rsi)
+	addq	%rcx,%r13
+	adcq	$0,%rdx
+	addq	%rax,%r13
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	24(%rsi)
+	addq	%rcx,%r8
+	adcq	$0,%rdx
+	addq	%rax,%r8
+	movq	%r11,%rax
+	adcq	%rdx,%r9
+	adcq	$0,%r10
+
+
+
+	movq	%r11,%rbp
+	shlq	$32,%r11
+	mulq	%r15
+	shrq	$32,%rbp
+	addq	%r11,%r12
+	adcq	%rbp,%r13
+	movq	%r12,%rcx
+	adcq	%rax,%r8
+	adcq	%rdx,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r10
+
+
+
+	subq	$-1,%r12
+	movq	%r8,%rbx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%rdx
+	sbbq	%r15,%r9
+	sbbq	$0,%r10
+
+	cmovcq	%rcx,%r12
+	cmovcq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%rbx,%r8
+	movq	%r13,8(%rdi)
+	cmovcq	%rdx,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	ret
+.cfi_endproc	
+.size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
+
+
+
+
+
+
+
+
+.globl	ecp_nistz256_sqr_mont
+.hidden ecp_nistz256_sqr_mont
+.type	ecp_nistz256_sqr_mont,@function
+.align	32
+ecp_nistz256_sqr_mont:
+.cfi_startproc	
+_CET_ENDBR
+	leaq	OPENSSL_ia32cap_P(%rip),%rcx
+	movq	8(%rcx),%rcx
+	andl	$0x80100,%ecx
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+.Lsqr_body:
+	cmpl	$0x80100,%ecx
+	je	.Lsqr_montx
+	movq	0(%rsi),%rax
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r15
+	movq	24(%rsi),%r8
+
+	call	__ecp_nistz256_sqr_montq
+	jmp	.Lsqr_mont_done
+
+.align	32
+.Lsqr_montx:
+	movq	0(%rsi),%rdx
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r15
+	movq	24(%rsi),%r8
+	leaq	-128(%rsi),%rsi
+
+	call	__ecp_nistz256_sqr_montx
+.Lsqr_mont_done:
+	movq	0(%rsp),%r15
+.cfi_restore	%r15
+	movq	8(%rsp),%r14
+.cfi_restore	%r14
+	movq	16(%rsp),%r13
+.cfi_restore	%r13
+	movq	24(%rsp),%r12
+.cfi_restore	%r12
+	movq	32(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	40(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lsqr_epilogue:
+	ret
+.cfi_endproc	
+.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
+
+.type	__ecp_nistz256_sqr_montq,@function
+.align	32
+__ecp_nistz256_sqr_montq:
+.cfi_startproc	
+	movq	%rax,%r13
+	mulq	%r14
+	movq	%rax,%r9
+	movq	%r15,%rax
+	movq	%rdx,%r10
+
+	mulq	%r13
+	addq	%rax,%r10
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%r13
+	addq	%rax,%r11
+	movq	%r15,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+
+
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	%r14
+	addq	%rax,%r12
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r12
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+
+	mulq	%r15
+	xorq	%r15,%r15
+	addq	%rax,%r13
+	movq	0(%rsi),%rax
+	movq	%rdx,%r14
+	adcq	$0,%r14
+
+	addq	%r9,%r9
+	adcq	%r10,%r10
+	adcq	%r11,%r11
+	adcq	%r12,%r12
+	adcq	%r13,%r13
+	adcq	%r14,%r14
+	adcq	$0,%r15
+
+	mulq	%rax
+	movq	%rax,%r8
+	movq	8(%rsi),%rax
+	movq	%rdx,%rcx
+
+	mulq	%rax
+	addq	%rcx,%r9
+	adcq	%rax,%r10
+	movq	16(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	%rax
+	addq	%rcx,%r11
+	adcq	%rax,%r12
+	movq	24(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	%rax
+	addq	%rcx,%r13
+	adcq	%rax,%r14
+	movq	%r8,%rax
+	adcq	%rdx,%r15
+
+	movq	.Lpoly+8(%rip),%rsi
+	movq	.Lpoly+24(%rip),%rbp
+
+
+
+
+	movq	%r8,%rcx
+	shlq	$32,%r8
+	mulq	%rbp
+	shrq	$32,%rcx
+	addq	%r8,%r9
+	adcq	%rcx,%r10
+	adcq	%rax,%r11
+	movq	%r9,%rax
+	adcq	$0,%rdx
+
+
+
+	movq	%r9,%rcx
+	shlq	$32,%r9
+	movq	%rdx,%r8
+	mulq	%rbp
+	shrq	$32,%rcx
+	addq	%r9,%r10
+	adcq	%rcx,%r11
+	adcq	%rax,%r8
+	movq	%r10,%rax
+	adcq	$0,%rdx
+
+
+
+	movq	%r10,%rcx
+	shlq	$32,%r10
+	movq	%rdx,%r9
+	mulq	%rbp
+	shrq	$32,%rcx
+	addq	%r10,%r11
+	adcq	%rcx,%r8
+	adcq	%rax,%r9
+	movq	%r11,%rax
+	adcq	$0,%rdx
+
+
+
+	movq	%r11,%rcx
+	shlq	$32,%r11
+	movq	%rdx,%r10
+	mulq	%rbp
+	shrq	$32,%rcx
+	addq	%r11,%r8
+	adcq	%rcx,%r9
+	adcq	%rax,%r10
+	adcq	$0,%rdx
+	xorq	%r11,%r11
+
+
+
+	addq	%r8,%r12
+	adcq	%r9,%r13
+	movq	%r12,%r8
+	adcq	%r10,%r14
+	adcq	%rdx,%r15
+	movq	%r13,%r9
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r14,%r10
+	sbbq	%rsi,%r13
+	sbbq	$0,%r14
+	movq	%r15,%rcx
+	sbbq	%rbp,%r15
+	sbbq	$0,%r11
+
+	cmovcq	%r8,%r12
+	cmovcq	%r9,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%r10,%r14
+	movq	%r13,8(%rdi)
+	cmovcq	%rcx,%r15
+	movq	%r14,16(%rdi)
+	movq	%r15,24(%rdi)
+
+	ret
+.cfi_endproc	
+.size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
+.type	__ecp_nistz256_mul_montx,@function
+.align	32
+__ecp_nistz256_mul_montx:
+.cfi_startproc	
+
+
+	mulxq	%r9,%r8,%r9
+	mulxq	%r10,%rcx,%r10
+	movq	$32,%r14
+	xorq	%r13,%r13
+	mulxq	%r11,%rbp,%r11
+	movq	.Lpoly+24(%rip),%r15
+	adcq	%rcx,%r9
+	mulxq	%r12,%rcx,%r12
+	movq	%r8,%rdx
+	adcq	%rbp,%r10
+	shlxq	%r14,%r8,%rbp
+	adcq	%rcx,%r11
+	shrxq	%r14,%r8,%rcx
+	adcq	$0,%r12
+
+
+
+	addq	%rbp,%r9
+	adcq	%rcx,%r10
+
+	mulxq	%r15,%rcx,%rbp
+	movq	8(%rbx),%rdx
+	adcq	%rcx,%r11
+	adcq	%rbp,%r12
+	adcq	$0,%r13
+	xorq	%r8,%r8
+
+
+
+	mulxq	0+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r9
+	adoxq	%rbp,%r10
+
+	mulxq	8+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	16+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	24+128(%rsi),%rcx,%rbp
+	movq	%r9,%rdx
+	adcxq	%rcx,%r12
+	shlxq	%r14,%r9,%rcx
+	adoxq	%rbp,%r13
+	shrxq	%r14,%r9,%rbp
+
+	adcxq	%r8,%r13
+	adoxq	%r8,%r8
+	adcq	$0,%r8
+
+
+
+	addq	%rcx,%r10
+	adcq	%rbp,%r11
+
+	mulxq	%r15,%rcx,%rbp
+	movq	16(%rbx),%rdx
+	adcq	%rcx,%r12
+	adcq	%rbp,%r13
+	adcq	$0,%r8
+	xorq	%r9,%r9
+
+
+
+	mulxq	0+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	8+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	16+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	24+128(%rsi),%rcx,%rbp
+	movq	%r10,%rdx
+	adcxq	%rcx,%r13
+	shlxq	%r14,%r10,%rcx
+	adoxq	%rbp,%r8
+	shrxq	%r14,%r10,%rbp
+
+	adcxq	%r9,%r8
+	adoxq	%r9,%r9
+	adcq	$0,%r9
+
+
+
+	addq	%rcx,%r11
+	adcq	%rbp,%r12
+
+	mulxq	%r15,%rcx,%rbp
+	movq	24(%rbx),%rdx
+	adcq	%rcx,%r13
+	adcq	%rbp,%r8
+	adcq	$0,%r9
+	xorq	%r10,%r10
+
+
+
+	mulxq	0+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	8+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	16+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r13
+	adoxq	%rbp,%r8
+
+	mulxq	24+128(%rsi),%rcx,%rbp
+	movq	%r11,%rdx
+	adcxq	%rcx,%r8
+	shlxq	%r14,%r11,%rcx
+	adoxq	%rbp,%r9
+	shrxq	%r14,%r11,%rbp
+
+	adcxq	%r10,%r9
+	adoxq	%r10,%r10
+	adcq	$0,%r10
+
+
+
+	addq	%rcx,%r12
+	adcq	%rbp,%r13
+
+	mulxq	%r15,%rcx,%rbp
+	movq	%r12,%rbx
+	movq	.Lpoly+8(%rip),%r14
+	adcq	%rcx,%r8
+	movq	%r13,%rdx
+	adcq	%rbp,%r9
+	adcq	$0,%r10
+
+
+
+	xorl	%eax,%eax
+	movq	%r8,%rcx
+	sbbq	$-1,%r12
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%rbp
+	sbbq	%r15,%r9
+	sbbq	$0,%r10
+
+	cmovcq	%rbx,%r12
+	cmovcq	%rdx,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovcq	%rbp,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	ret
+.cfi_endproc	
+.size	__ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
+
+.type	__ecp_nistz256_sqr_montx,@function
+.align	32
+__ecp_nistz256_sqr_montx:
+.cfi_startproc	
+	mulxq	%r14,%r9,%r10
+	mulxq	%r15,%rcx,%r11
+	xorl	%eax,%eax
+	adcq	%rcx,%r10
+	mulxq	%r8,%rbp,%r12
+	movq	%r14,%rdx
+	adcq	%rbp,%r11
+	adcq	$0,%r12
+	xorq	%r13,%r13
+
+
+	mulxq	%r15,%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	%r8,%rcx,%rbp
+	movq	%r15,%rdx
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+	adcq	$0,%r13
+
+
+	mulxq	%r8,%rcx,%r14
+	movq	0+128(%rsi),%rdx
+	xorq	%r15,%r15
+	adcxq	%r9,%r9
+	adoxq	%rcx,%r13
+	adcxq	%r10,%r10
+	adoxq	%r15,%r14
+
+	mulxq	%rdx,%r8,%rbp
+	movq	8+128(%rsi),%rdx
+	adcxq	%r11,%r11
+	adoxq	%rbp,%r9
+	adcxq	%r12,%r12
+	mulxq	%rdx,%rcx,%rax
+	movq	16+128(%rsi),%rdx
+	adcxq	%r13,%r13
+	adoxq	%rcx,%r10
+	adcxq	%r14,%r14
+.byte	0x67
+	mulxq	%rdx,%rcx,%rbp
+	movq	24+128(%rsi),%rdx
+	adoxq	%rax,%r11
+	adcxq	%r15,%r15
+	adoxq	%rcx,%r12
+	movq	$32,%rsi
+	adoxq	%rbp,%r13
+.byte	0x67,0x67
+	mulxq	%rdx,%rcx,%rax
+	movq	.Lpoly+24(%rip),%rdx
+	adoxq	%rcx,%r14
+	shlxq	%rsi,%r8,%rcx
+	adoxq	%rax,%r15
+	shrxq	%rsi,%r8,%rax
+	movq	%rdx,%rbp
+
+
+	addq	%rcx,%r9
+	adcq	%rax,%r10
+
+	mulxq	%r8,%rcx,%r8
+	adcq	%rcx,%r11
+	shlxq	%rsi,%r9,%rcx
+	adcq	$0,%r8
+	shrxq	%rsi,%r9,%rax
+
+
+	addq	%rcx,%r10
+	adcq	%rax,%r11
+
+	mulxq	%r9,%rcx,%r9
+	adcq	%rcx,%r8
+	shlxq	%rsi,%r10,%rcx
+	adcq	$0,%r9
+	shrxq	%rsi,%r10,%rax
+
+
+	addq	%rcx,%r11
+	adcq	%rax,%r8
+
+	mulxq	%r10,%rcx,%r10
+	adcq	%rcx,%r9
+	shlxq	%rsi,%r11,%rcx
+	adcq	$0,%r10
+	shrxq	%rsi,%r11,%rax
+
+
+	addq	%rcx,%r8
+	adcq	%rax,%r9
+
+	mulxq	%r11,%rcx,%r11
+	adcq	%rcx,%r10
+	adcq	$0,%r11
+
+	xorq	%rdx,%rdx
+	addq	%r8,%r12
+	movq	.Lpoly+8(%rip),%rsi
+	adcq	%r9,%r13
+	movq	%r12,%r8
+	adcq	%r10,%r14
+	adcq	%r11,%r15
+	movq	%r13,%r9
+	adcq	$0,%rdx
+
+	subq	$-1,%r12
+	movq	%r14,%r10
+	sbbq	%rsi,%r13
+	sbbq	$0,%r14
+	movq	%r15,%r11
+	sbbq	%rbp,%r15
+	sbbq	$0,%rdx
+
+	cmovcq	%r8,%r12
+	cmovcq	%r9,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%r10,%r14
+	movq	%r13,8(%rdi)
+	cmovcq	%r11,%r15
+	movq	%r14,16(%rdi)
+	movq	%r15,24(%rdi)
+
+	ret
+.cfi_endproc	
+.size	__ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
+
+
+.globl	ecp_nistz256_select_w5
+.hidden ecp_nistz256_select_w5
+.type	ecp_nistz256_select_w5,@function
+.align	32
+ecp_nistz256_select_w5:
+.cfi_startproc	
+_CET_ENDBR
+	leaq	OPENSSL_ia32cap_P(%rip),%rax
+	movq	8(%rax),%rax
+	testl	$32,%eax
+	jnz	.Lavx2_select_w5
+	movdqa	.LOne(%rip),%xmm0
+	movd	%edx,%xmm1
+
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+
+	movdqa	%xmm0,%xmm8
+	pshufd	$0,%xmm1,%xmm1
+
+	movq	$16,%rax
+.Lselect_loop_sse_w5:
+
+	movdqa	%xmm8,%xmm15
+	paddd	%xmm0,%xmm8
+	pcmpeqd	%xmm1,%xmm15
+
+	movdqa	0(%rsi),%xmm9
+	movdqa	16(%rsi),%xmm10
+	movdqa	32(%rsi),%xmm11
+	movdqa	48(%rsi),%xmm12
+	movdqa	64(%rsi),%xmm13
+	movdqa	80(%rsi),%xmm14
+	leaq	96(%rsi),%rsi
+
+	pand	%xmm15,%xmm9
+	pand	%xmm15,%xmm10
+	por	%xmm9,%xmm2
+	pand	%xmm15,%xmm11
+	por	%xmm10,%xmm3
+	pand	%xmm15,%xmm12
+	por	%xmm11,%xmm4
+	pand	%xmm15,%xmm13
+	por	%xmm12,%xmm5
+	pand	%xmm15,%xmm14
+	por	%xmm13,%xmm6
+	por	%xmm14,%xmm7
+
+	decq	%rax
+	jnz	.Lselect_loop_sse_w5
+
+	movdqu	%xmm2,0(%rdi)
+	movdqu	%xmm3,16(%rdi)
+	movdqu	%xmm4,32(%rdi)
+	movdqu	%xmm5,48(%rdi)
+	movdqu	%xmm6,64(%rdi)
+	movdqu	%xmm7,80(%rdi)
+	ret
+.cfi_endproc	
+.LSEH_end_ecp_nistz256_select_w5:
+.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
+
+
+
+.globl	ecp_nistz256_select_w7
+.hidden ecp_nistz256_select_w7
+.type	ecp_nistz256_select_w7,@function
+.align	32
+ecp_nistz256_select_w7:
+.cfi_startproc	
+_CET_ENDBR
+	leaq	OPENSSL_ia32cap_P(%rip),%rax
+	movq	8(%rax),%rax
+	testl	$32,%eax
+	jnz	.Lavx2_select_w7
+	movdqa	.LOne(%rip),%xmm8
+	movd	%edx,%xmm1
+
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+
+	movdqa	%xmm8,%xmm0
+	pshufd	$0,%xmm1,%xmm1
+	movq	$64,%rax
+
+.Lselect_loop_sse_w7:
+	movdqa	%xmm8,%xmm15
+	paddd	%xmm0,%xmm8
+	movdqa	0(%rsi),%xmm9
+	movdqa	16(%rsi),%xmm10
+	pcmpeqd	%xmm1,%xmm15
+	movdqa	32(%rsi),%xmm11
+	movdqa	48(%rsi),%xmm12
+	leaq	64(%rsi),%rsi
+
+	pand	%xmm15,%xmm9
+	pand	%xmm15,%xmm10
+	por	%xmm9,%xmm2
+	pand	%xmm15,%xmm11
+	por	%xmm10,%xmm3
+	pand	%xmm15,%xmm12
+	por	%xmm11,%xmm4
+	prefetcht0	255(%rsi)
+	por	%xmm12,%xmm5
+
+	decq	%rax
+	jnz	.Lselect_loop_sse_w7
+
+	movdqu	%xmm2,0(%rdi)
+	movdqu	%xmm3,16(%rdi)
+	movdqu	%xmm4,32(%rdi)
+	movdqu	%xmm5,48(%rdi)
+	ret
+.cfi_endproc	
+.LSEH_end_ecp_nistz256_select_w7:
+.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
+
+
+.type	ecp_nistz256_avx2_select_w5,@function
+.align	32
+ecp_nistz256_avx2_select_w5:
+.cfi_startproc	
+.Lavx2_select_w5:
+	vzeroupper
+	vmovdqa	.LTwo(%rip),%ymm0
+
+	vpxor	%ymm2,%ymm2,%ymm2
+	vpxor	%ymm3,%ymm3,%ymm3
+	vpxor	%ymm4,%ymm4,%ymm4
+
+	vmovdqa	.LOne(%rip),%ymm5
+	vmovdqa	.LTwo(%rip),%ymm10
+
+	vmovd	%edx,%xmm1
+	vpermd	%ymm1,%ymm2,%ymm1
+
+	movq	$8,%rax
+.Lselect_loop_avx2_w5:
+
+	vmovdqa	0(%rsi),%ymm6
+	vmovdqa	32(%rsi),%ymm7
+	vmovdqa	64(%rsi),%ymm8
+
+	vmovdqa	96(%rsi),%ymm11
+	vmovdqa	128(%rsi),%ymm12
+	vmovdqa	160(%rsi),%ymm13
+
+	vpcmpeqd	%ymm1,%ymm5,%ymm9
+	vpcmpeqd	%ymm1,%ymm10,%ymm14
+
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpaddd	%ymm0,%ymm10,%ymm10
+	leaq	192(%rsi),%rsi
+
+	vpand	%ymm9,%ymm6,%ymm6
+	vpand	%ymm9,%ymm7,%ymm7
+	vpand	%ymm9,%ymm8,%ymm8
+	vpand	%ymm14,%ymm11,%ymm11
+	vpand	%ymm14,%ymm12,%ymm12
+	vpand	%ymm14,%ymm13,%ymm13
+
+	vpxor	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm7,%ymm3,%ymm3
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpxor	%ymm11,%ymm2,%ymm2
+	vpxor	%ymm12,%ymm3,%ymm3
+	vpxor	%ymm13,%ymm4,%ymm4
+
+	decq	%rax
+	jnz	.Lselect_loop_avx2_w5
+
+	vmovdqu	%ymm2,0(%rdi)
+	vmovdqu	%ymm3,32(%rdi)
+	vmovdqu	%ymm4,64(%rdi)
+	vzeroupper
+	ret
+.cfi_endproc	
+.LSEH_end_ecp_nistz256_avx2_select_w5:
+.size	ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
+
+
+
+.globl	ecp_nistz256_avx2_select_w7
+.hidden ecp_nistz256_avx2_select_w7
+.type	ecp_nistz256_avx2_select_w7,@function
+.align	32
+ecp_nistz256_avx2_select_w7:
+.cfi_startproc	
+.Lavx2_select_w7:
+_CET_ENDBR
+	vzeroupper
+	vmovdqa	.LThree(%rip),%ymm0
+
+	vpxor	%ymm2,%ymm2,%ymm2
+	vpxor	%ymm3,%ymm3,%ymm3
+
+	vmovdqa	.LOne(%rip),%ymm4
+	vmovdqa	.LTwo(%rip),%ymm8
+	vmovdqa	.LThree(%rip),%ymm12
+
+	vmovd	%edx,%xmm1
+	vpermd	%ymm1,%ymm2,%ymm1
+
+
+	movq	$21,%rax
+.Lselect_loop_avx2_w7:
+
+	vmovdqa	0(%rsi),%ymm5
+	vmovdqa	32(%rsi),%ymm6
+
+	vmovdqa	64(%rsi),%ymm9
+	vmovdqa	96(%rsi),%ymm10
+
+	vmovdqa	128(%rsi),%ymm13
+	vmovdqa	160(%rsi),%ymm14
+
+	vpcmpeqd	%ymm1,%ymm4,%ymm7
+	vpcmpeqd	%ymm1,%ymm8,%ymm11
+	vpcmpeqd	%ymm1,%ymm12,%ymm15
+
+	vpaddd	%ymm0,%ymm4,%ymm4
+	vpaddd	%ymm0,%ymm8,%ymm8
+	vpaddd	%ymm0,%ymm12,%ymm12
+	leaq	192(%rsi),%rsi
+
+	vpand	%ymm7,%ymm5,%ymm5
+	vpand	%ymm7,%ymm6,%ymm6
+	vpand	%ymm11,%ymm9,%ymm9
+	vpand	%ymm11,%ymm10,%ymm10
+	vpand	%ymm15,%ymm13,%ymm13
+	vpand	%ymm15,%ymm14,%ymm14
+
+	vpxor	%ymm5,%ymm2,%ymm2
+	vpxor	%ymm6,%ymm3,%ymm3
+	vpxor	%ymm9,%ymm2,%ymm2
+	vpxor	%ymm10,%ymm3,%ymm3
+	vpxor	%ymm13,%ymm2,%ymm2
+	vpxor	%ymm14,%ymm3,%ymm3
+
+	decq	%rax
+	jnz	.Lselect_loop_avx2_w7
+
+
+	vmovdqa	0(%rsi),%ymm5
+	vmovdqa	32(%rsi),%ymm6
+
+	vpcmpeqd	%ymm1,%ymm4,%ymm7
+
+	vpand	%ymm7,%ymm5,%ymm5
+	vpand	%ymm7,%ymm6,%ymm6
+
+	vpxor	%ymm5,%ymm2,%ymm2
+	vpxor	%ymm6,%ymm3,%ymm3
+
+	vmovdqu	%ymm2,0(%rdi)
+	vmovdqu	%ymm3,32(%rdi)
+	vzeroupper
+	ret
+.cfi_endproc	
+.LSEH_end_ecp_nistz256_avx2_select_w7:
+.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
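+
+# The AVX2 w7 variant walks its 64-entry table of 64-byte affine points
+# with three counters stepped by .LThree: 21 passes cover entries 1..63,
+# and the tail after the loop picks up entry 64, which the counter in
+# %ymm4 has reached by then.
+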
+.type	__ecp_nistz256_add_toq,@function
+.align	32
+__ecp_nistz256_add_toq:
+.cfi_startproc	
+	xorq	%r11,%r11
+	addq	0(%rbx),%r12
+	adcq	8(%rbx),%r13
+	movq	%r12,%rax
+	adcq	16(%rbx),%r8
+	adcq	24(%rbx),%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	cmovcq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovcq	%r10,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	ret
+.cfi_endproc	
+.size	__ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
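+
+# This helper adds two field elements and reduces once modulo
+# p = 2^256 - 2^224 + 2^192 + 2^96 - 1: the carry out is caught in
+# %r11, p is subtracted (its limbs being ~0, %r14 from .Lpoly+8, 0, and
+# %r15 from .Lpoly+24), and cmovc restores the pre-subtraction value if
+# that borrowed, i.e. if the sum was already below p.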
+
+.type	__ecp_nistz256_sub_fromq,@function
+.align	32
+__ecp_nistz256_sub_fromq:
+.cfi_startproc	
+	subq	0(%rbx),%r12
+	sbbq	8(%rbx),%r13
+	movq	%r12,%rax
+	sbbq	16(%rbx),%r8
+	sbbq	24(%rbx),%r9
+	movq	%r13,%rbp
+	sbbq	%r11,%r11
+
+	addq	$-1,%r12
+	movq	%r8,%rcx
+	adcq	%r14,%r13
+	adcq	$0,%r8
+	movq	%r9,%r10
+	adcq	%r15,%r9
+	testq	%r11,%r11
+
+	cmovzq	%rax,%r12
+	cmovzq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovzq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovzq	%r10,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	ret
+.cfi_endproc	
+.size	__ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
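+
+# Subtraction reduces the other way around: sbbq %r11,%r11 smears the
+# borrow into an all-zero or all-one value, p is then added back
+# unconditionally, and cmovz keeps the pre-addition result whenever no
+# borrow occurred, so exactly one of the two candidates is picked
+# without a branch.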
+
+.type	__ecp_nistz256_subq,@function
+.align	32
+__ecp_nistz256_subq:
+.cfi_startproc	
+	subq	%r12,%rax
+	sbbq	%r13,%rbp
+	movq	%rax,%r12
+	sbbq	%r8,%rcx
+	sbbq	%r9,%r10
+	movq	%rbp,%r13
+	sbbq	%r11,%r11
+
+	addq	$-1,%rax
+	movq	%rcx,%r8
+	adcq	%r14,%rbp
+	adcq	$0,%rcx
+	movq	%r10,%r9
+	adcq	%r15,%r10
+	testq	%r11,%r11
+
+	cmovnzq	%rax,%r12
+	cmovnzq	%rbp,%r13
+	cmovnzq	%rcx,%r8
+	cmovnzq	%r10,%r9
+
+	ret
+.cfi_endproc	
+.size	__ecp_nistz256_subq,.-__ecp_nistz256_subq
+
+.type	__ecp_nistz256_mul_by_2q,@function
+.align	32
+__ecp_nistz256_mul_by_2q:
+.cfi_startproc	
+	xorq	%r11,%r11
+	addq	%r12,%r12
+	adcq	%r13,%r13
+	movq	%r12,%rax
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	cmovcq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovcq	%r10,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	ret
+.cfi_endproc	
+.size	__ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
+.globl	ecp_nistz256_point_double
+.hidden ecp_nistz256_point_double
+.type	ecp_nistz256_point_double,@function
+.align	32
+ecp_nistz256_point_double:
+.cfi_startproc	
+_CET_ENDBR
+	leaq	OPENSSL_ia32cap_P(%rip),%rcx
+	movq	8(%rcx),%rcx
+	andl	$0x80100,%ecx
+	cmpl	$0x80100,%ecx
+	je	.Lpoint_doublex
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$160+8,%rsp
+.cfi_adjust_cfa_offset	32*5+8
+.Lpoint_doubleq_body:
+
+.Lpoint_double_shortcutq:
+	movdqu	0(%rsi),%xmm0
+	movq	%rsi,%rbx
+	movdqu	16(%rsi),%xmm1
+	movq	32+0(%rsi),%r12
+	movq	32+8(%rsi),%r13
+	movq	32+16(%rsi),%r8
+	movq	32+24(%rsi),%r9
+	movq	.Lpoly+8(%rip),%r14
+	movq	.Lpoly+24(%rip),%r15
+	movdqa	%xmm0,96(%rsp)
+	movdqa	%xmm1,96+16(%rsp)
+	leaq	32(%rdi),%r10
+	leaq	64(%rdi),%r11
+.byte	102,72,15,110,199
+.byte	102,73,15,110,202
+.byte	102,73,15,110,211
+
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_by_2q
+
+	movq	64+0(%rsi),%rax
+	movq	64+8(%rsi),%r14
+	movq	64+16(%rsi),%r15
+	movq	64+24(%rsi),%r8
+	leaq	64-0(%rsi),%rsi
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	0+0(%rsp),%rax
+	movq	8+0(%rsp),%r14
+	leaq	0+0(%rsp),%rsi
+	movq	16+0(%rsp),%r15
+	movq	24+0(%rsp),%r8
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	32(%rbx),%rax
+	movq	64+0(%rbx),%r9
+	movq	64+8(%rbx),%r10
+	movq	64+16(%rbx),%r11
+	movq	64+24(%rbx),%r12
+	leaq	64-0(%rbx),%rsi
+	leaq	32(%rbx),%rbx
+.byte	102,72,15,126,215
+	call	__ecp_nistz256_mul_montq
+	call	__ecp_nistz256_mul_by_2q
+
+	movq	96+0(%rsp),%r12
+	movq	96+8(%rsp),%r13
+	leaq	64(%rsp),%rbx
+	movq	96+16(%rsp),%r8
+	movq	96+24(%rsp),%r9
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_add_toq
+
+	movq	96+0(%rsp),%r12
+	movq	96+8(%rsp),%r13
+	leaq	64(%rsp),%rbx
+	movq	96+16(%rsp),%r8
+	movq	96+24(%rsp),%r9
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	movq	0+0(%rsp),%rax
+	movq	8+0(%rsp),%r14
+	leaq	0+0(%rsp),%rsi
+	movq	16+0(%rsp),%r15
+	movq	24+0(%rsp),%r8
+.byte	102,72,15,126,207
+	call	__ecp_nistz256_sqr_montq
+	xorq	%r9,%r9
+	movq	%r12,%rax
+	addq	$-1,%r12
+	movq	%r13,%r10
+	adcq	%rsi,%r13
+	movq	%r14,%rcx
+	adcq	$0,%r14
+	movq	%r15,%r8
+	adcq	%rbp,%r15
+	adcq	$0,%r9
+	xorq	%rsi,%rsi
+	testq	$1,%rax
+
+	cmovzq	%rax,%r12
+	cmovzq	%r10,%r13
+	cmovzq	%rcx,%r14
+	cmovzq	%r8,%r15
+	cmovzq	%rsi,%r9
+
+	movq	%r13,%rax
+	shrq	$1,%r12
+	shlq	$63,%rax
+	movq	%r14,%r10
+	shrq	$1,%r13
+	orq	%rax,%r12
+	shlq	$63,%r10
+	movq	%r15,%rcx
+	shrq	$1,%r14
+	orq	%r10,%r13
+	shlq	$63,%rcx
+	movq	%r12,0(%rdi)
+	shrq	$1,%r15
+	movq	%r13,8(%rdi)
+	shlq	$63,%r9
+	orq	%rcx,%r14
+	orq	%r9,%r15
+	movq	%r14,16(%rdi)
+	movq	%r15,24(%rdi)
+	movq	64(%rsp),%rax
+	leaq	64(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_mul_by_2q
+
+	leaq	32(%rsp),%rbx
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_add_toq
+
+	movq	96(%rsp),%rax
+	leaq	96(%rsp),%rbx
+	movq	0+0(%rsp),%r9
+	movq	8+0(%rsp),%r10
+	leaq	0+0(%rsp),%rsi
+	movq	16+0(%rsp),%r11
+	movq	24+0(%rsp),%r12
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_mul_by_2q
+
+	movq	0+32(%rsp),%rax
+	movq	8+32(%rsp),%r14
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r15
+	movq	24+32(%rsp),%r8
+.byte	102,72,15,126,199
+	call	__ecp_nistz256_sqr_montq
+
+	leaq	128(%rsp),%rbx
+	movq	%r14,%r8
+	movq	%r15,%r9
+	movq	%rsi,%r14
+	movq	%rbp,%r15
+	call	__ecp_nistz256_sub_fromq
+
+	movq	0+0(%rsp),%rax
+	movq	0+8(%rsp),%rbp
+	movq	0+16(%rsp),%rcx
+	movq	0+24(%rsp),%r10
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_subq
+
+	movq	32(%rsp),%rax
+	leaq	32(%rsp),%rbx
+	movq	%r12,%r14
+	xorl	%ecx,%ecx
+	movq	%r12,0+0(%rsp)
+	movq	%r13,%r10
+	movq	%r13,0+8(%rsp)
+	cmovzq	%r8,%r11
+	movq	%r8,0+16(%rsp)
+	leaq	0-0(%rsp),%rsi
+	cmovzq	%r9,%r12
+	movq	%r9,0+24(%rsp)
+	movq	%r14,%r9
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+.byte	102,72,15,126,203
+.byte	102,72,15,126,207
+	call	__ecp_nistz256_sub_fromq
+
+	leaq	160+56(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbx
+.cfi_restore	%rbx
+	movq	-8(%rsi),%rbp
+.cfi_restore	%rbp
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lpoint_doubleq_epilogue:
+	ret
+.cfi_endproc	
+.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
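+
+# ecp_nistz256_point_double first dispatches on OPENSSL_ia32cap_P: the
+# 0x80100 mask evidently tests the BMI2 and ADX feature bits and routes
+# to the mulx/adcx-based .Lpoint_doublex when both are set. The body is
+# the usual Jacobian doubling over five 32-byte stack temporaries; the
+# inline sequence after the third __ecp_nistz256_sqr_montq call halves a
+# field element mod p by conditionally adding p to make the value even
+# before the 256-bit right shift.
+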
+.globl	ecp_nistz256_point_add
+.hidden ecp_nistz256_point_add
+.type	ecp_nistz256_point_add,@function
+.align	32
+ecp_nistz256_point_add:
+.cfi_startproc	
+_CET_ENDBR
+	leaq	OPENSSL_ia32cap_P(%rip),%rcx
+	movq	8(%rcx),%rcx
+	andl	$0x80100,%ecx
+	cmpl	$0x80100,%ecx
+	je	.Lpoint_addx
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$576+8,%rsp
+.cfi_adjust_cfa_offset	32*18+8
+.Lpoint_addq_body:
+
+	movdqu	0(%rsi),%xmm0
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm3
+	movdqu	64(%rsi),%xmm4
+	movdqu	80(%rsi),%xmm5
+	movq	%rsi,%rbx
+	movq	%rdx,%rsi
+	movdqa	%xmm0,384(%rsp)
+	movdqa	%xmm1,384+16(%rsp)
+	movdqa	%xmm2,416(%rsp)
+	movdqa	%xmm3,416+16(%rsp)
+	movdqa	%xmm4,448(%rsp)
+	movdqa	%xmm5,448+16(%rsp)
+	por	%xmm4,%xmm5
+
+	movdqu	0(%rsi),%xmm0
+	pshufd	$0xb1,%xmm5,%xmm3
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+	por	%xmm3,%xmm5
+	movdqu	48(%rsi),%xmm3
+	movq	64+0(%rsi),%rax
+	movq	64+8(%rsi),%r14
+	movq	64+16(%rsi),%r15
+	movq	64+24(%rsi),%r8
+	movdqa	%xmm0,480(%rsp)
+	pshufd	$0x1e,%xmm5,%xmm4
+	movdqa	%xmm1,480+16(%rsp)
+	movdqu	64(%rsi),%xmm0
+	movdqu	80(%rsi),%xmm1
+	movdqa	%xmm2,512(%rsp)
+	movdqa	%xmm3,512+16(%rsp)
+	por	%xmm4,%xmm5
+	pxor	%xmm4,%xmm4
+	por	%xmm0,%xmm1
+.byte	102,72,15,110,199
+
+	leaq	64-0(%rsi),%rsi
+	movq	%rax,544+0(%rsp)
+	movq	%r14,544+8(%rsp)
+	movq	%r15,544+16(%rsp)
+	movq	%r8,544+24(%rsp)
+	leaq	96(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	pcmpeqd	%xmm4,%xmm5
+	pshufd	$0xb1,%xmm1,%xmm4
+	por	%xmm1,%xmm4
+	pshufd	$0,%xmm5,%xmm5
+	pshufd	$0x1e,%xmm4,%xmm3
+	por	%xmm3,%xmm4
+	pxor	%xmm3,%xmm3
+	pcmpeqd	%xmm3,%xmm4
+	pshufd	$0,%xmm4,%xmm4
+	movq	64+0(%rbx),%rax
+	movq	64+8(%rbx),%r14
+	movq	64+16(%rbx),%r15
+	movq	64+24(%rbx),%r8
+.byte	102,72,15,110,203
+
+	leaq	64-0(%rbx),%rsi
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	544(%rsp),%rax
+	leaq	544(%rsp),%rbx
+	movq	0+96(%rsp),%r9
+	movq	8+96(%rsp),%r10
+	leaq	0+96(%rsp),%rsi
+	movq	16+96(%rsp),%r11
+	movq	24+96(%rsp),%r12
+	leaq	224(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	448(%rsp),%rax
+	leaq	448(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	416(%rsp),%rax
+	leaq	416(%rsp),%rbx
+	movq	0+224(%rsp),%r9
+	movq	8+224(%rsp),%r10
+	leaq	0+224(%rsp),%rsi
+	movq	16+224(%rsp),%r11
+	movq	24+224(%rsp),%r12
+	leaq	224(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	512(%rsp),%rax
+	leaq	512(%rsp),%rbx
+	movq	0+256(%rsp),%r9
+	movq	8+256(%rsp),%r10
+	leaq	0+256(%rsp),%rsi
+	movq	16+256(%rsp),%r11
+	movq	24+256(%rsp),%r12
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	224(%rsp),%rbx
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	orq	%r13,%r12
+	movdqa	%xmm4,%xmm2
+	orq	%r8,%r12
+	orq	%r9,%r12
+	por	%xmm5,%xmm2
+.byte	102,73,15,110,220
+
+	movq	384(%rsp),%rax
+	leaq	384(%rsp),%rbx
+	movq	0+96(%rsp),%r9
+	movq	8+96(%rsp),%r10
+	leaq	0+96(%rsp),%rsi
+	movq	16+96(%rsp),%r11
+	movq	24+96(%rsp),%r12
+	leaq	160(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	480(%rsp),%rax
+	leaq	480(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	192(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	160(%rsp),%rbx
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	orq	%r13,%r12
+	orq	%r8,%r12
+	orq	%r9,%r12
+
+.byte	102,73,15,126,208
+.byte	102,73,15,126,217
+	orq	%r8,%r12
+.byte	0x3e
+	jnz	.Ladd_proceedq
+
+
+
+	testq	%r9,%r9
+	jz	.Ladd_doubleq
+
+
+
+
+
+
+.byte	102,72,15,126,199
+	pxor	%xmm0,%xmm0
+	movdqu	%xmm0,0(%rdi)
+	movdqu	%xmm0,16(%rdi)
+	movdqu	%xmm0,32(%rdi)
+	movdqu	%xmm0,48(%rdi)
+	movdqu	%xmm0,64(%rdi)
+	movdqu	%xmm0,80(%rdi)
+	jmp	.Ladd_doneq
+
+.align	32
+.Ladd_doubleq:
+.byte	102,72,15,126,206
+.byte	102,72,15,126,199
+	addq	$416,%rsp
+.cfi_adjust_cfa_offset	-416
+	jmp	.Lpoint_double_shortcutq
+.cfi_adjust_cfa_offset	416
+
+.align	32
+.Ladd_proceedq:
+	movq	0+64(%rsp),%rax
+	movq	8+64(%rsp),%r14
+	leaq	0+64(%rsp),%rsi
+	movq	16+64(%rsp),%r15
+	movq	24+64(%rsp),%r8
+	leaq	96(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	448(%rsp),%rax
+	leaq	448(%rsp),%rbx
+	movq	0+0(%rsp),%r9
+	movq	8+0(%rsp),%r10
+	leaq	0+0(%rsp),%rsi
+	movq	16+0(%rsp),%r11
+	movq	24+0(%rsp),%r12
+	leaq	352(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	0+0(%rsp),%rax
+	movq	8+0(%rsp),%r14
+	leaq	0+0(%rsp),%rsi
+	movq	16+0(%rsp),%r15
+	movq	24+0(%rsp),%r8
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	544(%rsp),%rax
+	leaq	544(%rsp),%rbx
+	movq	0+352(%rsp),%r9
+	movq	8+352(%rsp),%r10
+	leaq	0+352(%rsp),%rsi
+	movq	16+352(%rsp),%r11
+	movq	24+352(%rsp),%r12
+	leaq	352(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	0(%rsp),%rax
+	leaq	0(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	160(%rsp),%rax
+	leaq	160(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	192(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+
+
+
+	xorq	%r11,%r11
+	addq	%r12,%r12
+	leaq	96(%rsp),%rsi
+	adcq	%r13,%r13
+	movq	%r12,%rax
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	movq	0(%rsi),%rax
+	cmovcq	%rbp,%r13
+	movq	8(%rsi),%rbp
+	cmovcq	%rcx,%r8
+	movq	16(%rsi),%rcx
+	cmovcq	%r10,%r9
+	movq	24(%rsi),%r10
+
+	call	__ecp_nistz256_subq
+
+	leaq	128(%rsp),%rbx
+	leaq	288(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	movq	192+0(%rsp),%rax
+	movq	192+8(%rsp),%rbp
+	movq	192+16(%rsp),%rcx
+	movq	192+24(%rsp),%r10
+	leaq	320(%rsp),%rdi
+
+	call	__ecp_nistz256_subq
+
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+	movq	128(%rsp),%rax
+	leaq	128(%rsp),%rbx
+	movq	0+224(%rsp),%r9
+	movq	8+224(%rsp),%r10
+	leaq	0+224(%rsp),%rsi
+	movq	16+224(%rsp),%r11
+	movq	24+224(%rsp),%r12
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	320(%rsp),%rax
+	leaq	320(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	0+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	320(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	256(%rsp),%rbx
+	leaq	320(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+.byte	102,72,15,126,199
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	352(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	352+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	544(%rsp),%xmm2
+	pand	544+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	448(%rsp),%xmm2
+	pand	448+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,64(%rdi)
+	movdqu	%xmm3,80(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	288(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	288+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	480(%rsp),%xmm2
+	pand	480+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	384(%rsp),%xmm2
+	pand	384+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,0(%rdi)
+	movdqu	%xmm3,16(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	320(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	320+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	512(%rsp),%xmm2
+	pand	512+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	416(%rsp),%xmm2
+	pand	416+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm3,48(%rdi)
+
+.Ladd_doneq:
+	leaq	576+56(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbx
+.cfi_restore	%rbx
+	movq	-8(%rsi),%rbp
+.cfi_restore	%rbp
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lpoint_addq_epilogue:
+	ret
+.cfi_endproc	
+.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
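+
+# The addition formulas are not complete, so the routine deals with the
+# special cases itself: the pcmpeqd/pshufd folds build all-ones masks in
+# %xmm5 and %xmm4 recording whether either input's Z coordinate is zero
+# (the point at infinity), and the pand/pandn ladder at the end selects
+# branchlessly between the computed sum and whichever input survives.
+# When both difference values are zero the inputs are equal and control
+# jumps to .Ladd_doubleq, reusing the doubling code; same x but
+# different y gives the point at infinity, written out as all zeros.
+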
+.globl	ecp_nistz256_point_add_affine
+.hidden ecp_nistz256_point_add_affine
+.type	ecp_nistz256_point_add_affine,@function
+.align	32
+ecp_nistz256_point_add_affine:
+.cfi_startproc	
+_CET_ENDBR
+	leaq	OPENSSL_ia32cap_P(%rip),%rcx
+	movq	8(%rcx),%rcx
+	andl	$0x80100,%ecx
+	cmpl	$0x80100,%ecx
+	je	.Lpoint_add_affinex
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$480+8,%rsp
+.cfi_adjust_cfa_offset	32*15+8
+.Ladd_affineq_body:
+
+	movdqu	0(%rsi),%xmm0
+	movq	%rdx,%rbx
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm3
+	movdqu	64(%rsi),%xmm4
+	movdqu	80(%rsi),%xmm5
+	movq	64+0(%rsi),%rax
+	movq	64+8(%rsi),%r14
+	movq	64+16(%rsi),%r15
+	movq	64+24(%rsi),%r8
+	movdqa	%xmm0,320(%rsp)
+	movdqa	%xmm1,320+16(%rsp)
+	movdqa	%xmm2,352(%rsp)
+	movdqa	%xmm3,352+16(%rsp)
+	movdqa	%xmm4,384(%rsp)
+	movdqa	%xmm5,384+16(%rsp)
+	por	%xmm4,%xmm5
+
+	movdqu	0(%rbx),%xmm0
+	pshufd	$0xb1,%xmm5,%xmm3
+	movdqu	16(%rbx),%xmm1
+	movdqu	32(%rbx),%xmm2
+	por	%xmm3,%xmm5
+	movdqu	48(%rbx),%xmm3
+	movdqa	%xmm0,416(%rsp)
+	pshufd	$0x1e,%xmm5,%xmm4
+	movdqa	%xmm1,416+16(%rsp)
+	por	%xmm0,%xmm1
+.byte	102,72,15,110,199
+	movdqa	%xmm2,448(%rsp)
+	movdqa	%xmm3,448+16(%rsp)
+	por	%xmm2,%xmm3
+	por	%xmm4,%xmm5
+	pxor	%xmm4,%xmm4
+	por	%xmm1,%xmm3
+
+	leaq	64-0(%rsi),%rsi
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	pcmpeqd	%xmm4,%xmm5
+	pshufd	$0xb1,%xmm3,%xmm4
+	movq	0(%rbx),%rax
+
+	movq	%r12,%r9
+	por	%xmm3,%xmm4
+	pshufd	$0,%xmm5,%xmm5
+	pshufd	$0x1e,%xmm4,%xmm3
+	movq	%r13,%r10
+	por	%xmm3,%xmm4
+	pxor	%xmm3,%xmm3
+	movq	%r14,%r11
+	pcmpeqd	%xmm3,%xmm4
+	pshufd	$0,%xmm4,%xmm4
+
+	leaq	32-0(%rsp),%rsi
+	movq	%r15,%r12
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	320(%rsp),%rbx
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	movq	384(%rsp),%rax
+	leaq	384(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	384(%rsp),%rax
+	leaq	384(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	0+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	288(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	448(%rsp),%rax
+	leaq	448(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	352(%rsp),%rbx
+	leaq	96(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	movq	0+64(%rsp),%rax
+	movq	8+64(%rsp),%r14
+	leaq	0+64(%rsp),%rsi
+	movq	16+64(%rsp),%r15
+	movq	24+64(%rsp),%r8
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	0+96(%rsp),%rax
+	movq	8+96(%rsp),%r14
+	leaq	0+96(%rsp),%rsi
+	movq	16+96(%rsp),%r15
+	movq	24+96(%rsp),%r8
+	leaq	192(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	128(%rsp),%rax
+	leaq	128(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	0+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	160(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	320(%rsp),%rax
+	leaq	320(%rsp),%rbx
+	movq	0+128(%rsp),%r9
+	movq	8+128(%rsp),%r10
+	leaq	0+128(%rsp),%rsi
+	movq	16+128(%rsp),%r11
+	movq	24+128(%rsp),%r12
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+
+
+
+	xorq	%r11,%r11
+	addq	%r12,%r12
+	leaq	192(%rsp),%rsi
+	adcq	%r13,%r13
+	movq	%r12,%rax
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	movq	0(%rsi),%rax
+	cmovcq	%rbp,%r13
+	movq	8(%rsi),%rbp
+	cmovcq	%rcx,%r8
+	movq	16(%rsi),%rcx
+	cmovcq	%r10,%r9
+	movq	24(%rsi),%r10
+
+	call	__ecp_nistz256_subq
+
+	leaq	160(%rsp),%rbx
+	leaq	224(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	movq	0+0(%rsp),%rax
+	movq	0+8(%rsp),%rbp
+	movq	0+16(%rsp),%rcx
+	movq	0+24(%rsp),%r10
+	leaq	64(%rsp),%rdi
+
+	call	__ecp_nistz256_subq
+
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+	movq	352(%rsp),%rax
+	leaq	352(%rsp),%rbx
+	movq	0+160(%rsp),%r9
+	movq	8+160(%rsp),%r10
+	leaq	0+160(%rsp),%rsi
+	movq	16+160(%rsp),%r11
+	movq	24+160(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	96(%rsp),%rax
+	leaq	96(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	0+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	32(%rsp),%rbx
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+.byte	102,72,15,126,199
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	288(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	288+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	.LONE_mont(%rip),%xmm2
+	pand	.LONE_mont+16(%rip),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	384(%rsp),%xmm2
+	pand	384+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,64(%rdi)
+	movdqu	%xmm3,80(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	224(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	224+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	416(%rsp),%xmm2
+	pand	416+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	320(%rsp),%xmm2
+	pand	320+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,0(%rdi)
+	movdqu	%xmm3,16(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	256(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	256+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	448(%rsp),%xmm2
+	pand	448+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	352(%rsp),%xmm2
+	pand	352+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm3,48(%rdi)
+
+	leaq	480+56(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbx
+.cfi_restore	%rbx
+	movq	-8(%rsi),%rbp
+.cfi_restore	%rbp
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Ladd_affineq_epilogue:
+	ret
+.cfi_endproc	
+.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
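+
+# The affine variant is cheaper because the second input's Z coordinate
+# is implicitly 1: note the final selection masks in .LONE_mont, one in
+# Montgomery form, exactly where the projective routine would have read
+# that point's Z.
+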
+.type	__ecp_nistz256_add_tox,@function
+.align	32
+__ecp_nistz256_add_tox:
+.cfi_startproc	
+	xorq	%r11,%r11
+	adcq	0(%rbx),%r12
+	adcq	8(%rbx),%r13
+	movq	%r12,%rax
+	adcq	16(%rbx),%r8
+	adcq	24(%rbx),%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	xorq	%r10,%r10
+	sbbq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	cmovcq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovcq	%r10,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	ret
+.cfi_endproc	
+.size	__ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
+
+.type	__ecp_nistz256_sub_fromx,@function
+.align	32
+__ecp_nistz256_sub_fromx:
+.cfi_startproc	
+	xorq	%r11,%r11
+	sbbq	0(%rbx),%r12
+	sbbq	8(%rbx),%r13
+	movq	%r12,%rax
+	sbbq	16(%rbx),%r8
+	sbbq	24(%rbx),%r9
+	movq	%r13,%rbp
+	sbbq	$0,%r11
+
+	xorq	%r10,%r10
+	adcq	$-1,%r12
+	movq	%r8,%rcx
+	adcq	%r14,%r13
+	adcq	$0,%r8
+	movq	%r9,%r10
+	adcq	%r15,%r9
+
+	btq	$0,%r11
+	cmovncq	%rax,%r12
+	cmovncq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovncq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovncq	%r10,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	ret
+.cfi_endproc	
+.size	__ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
+
+.type	__ecp_nistz256_subx,@function
+.align	32
+__ecp_nistz256_subx:
+.cfi_startproc	
+	xorq	%r11,%r11
+	sbbq	%r12,%rax
+	sbbq	%r13,%rbp
+	movq	%rax,%r12
+	sbbq	%r8,%rcx
+	sbbq	%r9,%r10
+	movq	%rbp,%r13
+	sbbq	$0,%r11
+
+	xorq	%r9,%r9
+	adcq	$-1,%rax
+	movq	%rcx,%r8
+	adcq	%r14,%rbp
+	adcq	$0,%rcx
+	movq	%r10,%r9
+	adcq	%r15,%r10
+
+	btq	$0,%r11
+	cmovcq	%rax,%r12
+	cmovcq	%rbp,%r13
+	cmovcq	%rcx,%r8
+	cmovcq	%r10,%r9
+
+	ret
+.cfi_endproc	
+.size	__ecp_nistz256_subx,.-__ecp_nistz256_subx
+
+.type	__ecp_nistz256_mul_by_2x,@function
+.align	32
+__ecp_nistz256_mul_by_2x:
+.cfi_startproc	
+	xorq	%r11,%r11
+	adcq	%r12,%r12
+	adcq	%r13,%r13
+	movq	%r12,%rax
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	xorq	%r10,%r10
+	sbbq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	cmovcq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovcq	%r10,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	ret
+.cfi_endproc	
+.size	__ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
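+
+# The x-suffixed helpers above mirror their q counterparts for CPUs with
+# BMI2 and ADX. Two details differ: the add paths enter through adcq
+# after an explicit xorq has cleared the carry flag, and the point
+# routines below bias %rsi by -128 (leaq 64-128(%rsi) and friends)
+# before calling __ecp_nistz256_mul_montx/__ecp_nistz256_sqr_montx,
+# apparently so that the limb offsets encode as one-byte displacements.
+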
+.type	ecp_nistz256_point_doublex,@function
+.align	32
+ecp_nistz256_point_doublex:
+.cfi_startproc	
+.Lpoint_doublex:
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$160+8,%rsp
+.cfi_adjust_cfa_offset	32*5+8
+.Lpoint_doublex_body:
+
+.Lpoint_double_shortcutx:
+	movdqu	0(%rsi),%xmm0
+	movq	%rsi,%rbx
+	movdqu	16(%rsi),%xmm1
+	movq	32+0(%rsi),%r12
+	movq	32+8(%rsi),%r13
+	movq	32+16(%rsi),%r8
+	movq	32+24(%rsi),%r9
+	movq	.Lpoly+8(%rip),%r14
+	movq	.Lpoly+24(%rip),%r15
+	movdqa	%xmm0,96(%rsp)
+	movdqa	%xmm1,96+16(%rsp)
+	leaq	32(%rdi),%r10
+	leaq	64(%rdi),%r11
+.byte	102,72,15,110,199
+.byte	102,73,15,110,202
+.byte	102,73,15,110,211
+
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_by_2x
+
+	movq	64+0(%rsi),%rdx
+	movq	64+8(%rsi),%r14
+	movq	64+16(%rsi),%r15
+	movq	64+24(%rsi),%r8
+	leaq	64-128(%rsi),%rsi
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	0+0(%rsp),%rdx
+	movq	8+0(%rsp),%r14
+	leaq	-128+0(%rsp),%rsi
+	movq	16+0(%rsp),%r15
+	movq	24+0(%rsp),%r8
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	32(%rbx),%rdx
+	movq	64+0(%rbx),%r9
+	movq	64+8(%rbx),%r10
+	movq	64+16(%rbx),%r11
+	movq	64+24(%rbx),%r12
+	leaq	64-128(%rbx),%rsi
+	leaq	32(%rbx),%rbx
+.byte	102,72,15,126,215
+	call	__ecp_nistz256_mul_montx
+	call	__ecp_nistz256_mul_by_2x
+
+	movq	96+0(%rsp),%r12
+	movq	96+8(%rsp),%r13
+	leaq	64(%rsp),%rbx
+	movq	96+16(%rsp),%r8
+	movq	96+24(%rsp),%r9
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_add_tox
+
+	movq	96+0(%rsp),%r12
+	movq	96+8(%rsp),%r13
+	leaq	64(%rsp),%rbx
+	movq	96+16(%rsp),%r8
+	movq	96+24(%rsp),%r9
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	movq	0+0(%rsp),%rdx
+	movq	8+0(%rsp),%r14
+	leaq	-128+0(%rsp),%rsi
+	movq	16+0(%rsp),%r15
+	movq	24+0(%rsp),%r8
+.byte	102,72,15,126,207
+	call	__ecp_nistz256_sqr_montx
+	xorq	%r9,%r9
+	movq	%r12,%rax
+	addq	$-1,%r12
+	movq	%r13,%r10
+	adcq	%rsi,%r13
+	movq	%r14,%rcx
+	adcq	$0,%r14
+	movq	%r15,%r8
+	adcq	%rbp,%r15
+	adcq	$0,%r9
+	xorq	%rsi,%rsi
+	testq	$1,%rax
+
+	cmovzq	%rax,%r12
+	cmovzq	%r10,%r13
+	cmovzq	%rcx,%r14
+	cmovzq	%r8,%r15
+	cmovzq	%rsi,%r9
+
+	movq	%r13,%rax
+	shrq	$1,%r12
+	shlq	$63,%rax
+	movq	%r14,%r10
+	shrq	$1,%r13
+	orq	%rax,%r12
+	shlq	$63,%r10
+	movq	%r15,%rcx
+	shrq	$1,%r14
+	orq	%r10,%r13
+	shlq	$63,%rcx
+	movq	%r12,0(%rdi)
+	shrq	$1,%r15
+	movq	%r13,8(%rdi)
+	shlq	$63,%r9
+	orq	%rcx,%r14
+	orq	%r9,%r15
+	movq	%r14,16(%rdi)
+	movq	%r15,24(%rdi)
+	movq	64(%rsp),%rdx
+	leaq	64(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_mul_by_2x
+
+	leaq	32(%rsp),%rbx
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_add_tox
+
+	movq	96(%rsp),%rdx
+	leaq	96(%rsp),%rbx
+	movq	0+0(%rsp),%r9
+	movq	8+0(%rsp),%r10
+	leaq	-128+0(%rsp),%rsi
+	movq	16+0(%rsp),%r11
+	movq	24+0(%rsp),%r12
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_mul_by_2x
+
+	movq	0+32(%rsp),%rdx
+	movq	8+32(%rsp),%r14
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r15
+	movq	24+32(%rsp),%r8
+.byte	102,72,15,126,199
+	call	__ecp_nistz256_sqr_montx
+
+	leaq	128(%rsp),%rbx
+	movq	%r14,%r8
+	movq	%r15,%r9
+	movq	%rsi,%r14
+	movq	%rbp,%r15
+	call	__ecp_nistz256_sub_fromx
+
+	movq	0+0(%rsp),%rax
+	movq	0+8(%rsp),%rbp
+	movq	0+16(%rsp),%rcx
+	movq	0+24(%rsp),%r10
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_subx
+
+	movq	32(%rsp),%rdx
+	leaq	32(%rsp),%rbx
+	movq	%r12,%r14
+	xorl	%ecx,%ecx
+	movq	%r12,0+0(%rsp)
+	movq	%r13,%r10
+	movq	%r13,0+8(%rsp)
+	cmovzq	%r8,%r11
+	movq	%r8,0+16(%rsp)
+	leaq	0-128(%rsp),%rsi
+	cmovzq	%r9,%r12
+	movq	%r9,0+24(%rsp)
+	movq	%r14,%r9
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+.byte	102,72,15,126,203
+.byte	102,72,15,126,207
+	call	__ecp_nistz256_sub_fromx
+
+	leaq	160+56(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbx
+.cfi_restore	%rbx
+	movq	-8(%rsi),%rbp
+.cfi_restore	%rbp
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lpoint_doublex_epilogue:
+	ret
+.cfi_endproc	
+.size	ecp_nistz256_point_doublex,.-ecp_nistz256_point_doublex
+.type	ecp_nistz256_point_addx,@function
+.align	32
+ecp_nistz256_point_addx:
+.cfi_startproc	
+.Lpoint_addx:
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$576+8,%rsp
+.cfi_adjust_cfa_offset	32*18+8
+.Lpoint_addx_body:
+
+	movdqu	0(%rsi),%xmm0
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm3
+	movdqu	64(%rsi),%xmm4
+	movdqu	80(%rsi),%xmm5
+	movq	%rsi,%rbx
+	movq	%rdx,%rsi
+	movdqa	%xmm0,384(%rsp)
+	movdqa	%xmm1,384+16(%rsp)
+	movdqa	%xmm2,416(%rsp)
+	movdqa	%xmm3,416+16(%rsp)
+	movdqa	%xmm4,448(%rsp)
+	movdqa	%xmm5,448+16(%rsp)
+	por	%xmm4,%xmm5
+
+	movdqu	0(%rsi),%xmm0
+	pshufd	$0xb1,%xmm5,%xmm3
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+	por	%xmm3,%xmm5
+	movdqu	48(%rsi),%xmm3
+	movq	64+0(%rsi),%rdx
+	movq	64+8(%rsi),%r14
+	movq	64+16(%rsi),%r15
+	movq	64+24(%rsi),%r8
+	movdqa	%xmm0,480(%rsp)
+	pshufd	$0x1e,%xmm5,%xmm4
+	movdqa	%xmm1,480+16(%rsp)
+	movdqu	64(%rsi),%xmm0
+	movdqu	80(%rsi),%xmm1
+	movdqa	%xmm2,512(%rsp)
+	movdqa	%xmm3,512+16(%rsp)
+	por	%xmm4,%xmm5
+	pxor	%xmm4,%xmm4
+	por	%xmm0,%xmm1
+.byte	102,72,15,110,199
+
+	leaq	64-128(%rsi),%rsi
+	movq	%rdx,544+0(%rsp)
+	movq	%r14,544+8(%rsp)
+	movq	%r15,544+16(%rsp)
+	movq	%r8,544+24(%rsp)
+	leaq	96(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	pcmpeqd	%xmm4,%xmm5
+	pshufd	$0xb1,%xmm1,%xmm4
+	por	%xmm1,%xmm4
+	pshufd	$0,%xmm5,%xmm5
+	pshufd	$0x1e,%xmm4,%xmm3
+	por	%xmm3,%xmm4
+	pxor	%xmm3,%xmm3
+	pcmpeqd	%xmm3,%xmm4
+	pshufd	$0,%xmm4,%xmm4
+	movq	64+0(%rbx),%rdx
+	movq	64+8(%rbx),%r14
+	movq	64+16(%rbx),%r15
+	movq	64+24(%rbx),%r8
+.byte	102,72,15,110,203
+
+	leaq	64-128(%rbx),%rsi
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	544(%rsp),%rdx
+	leaq	544(%rsp),%rbx
+	movq	0+96(%rsp),%r9
+	movq	8+96(%rsp),%r10
+	leaq	-128+96(%rsp),%rsi
+	movq	16+96(%rsp),%r11
+	movq	24+96(%rsp),%r12
+	leaq	224(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	448(%rsp),%rdx
+	leaq	448(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	416(%rsp),%rdx
+	leaq	416(%rsp),%rbx
+	movq	0+224(%rsp),%r9
+	movq	8+224(%rsp),%r10
+	leaq	-128+224(%rsp),%rsi
+	movq	16+224(%rsp),%r11
+	movq	24+224(%rsp),%r12
+	leaq	224(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	512(%rsp),%rdx
+	leaq	512(%rsp),%rbx
+	movq	0+256(%rsp),%r9
+	movq	8+256(%rsp),%r10
+	leaq	-128+256(%rsp),%rsi
+	movq	16+256(%rsp),%r11
+	movq	24+256(%rsp),%r12
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	224(%rsp),%rbx
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	orq	%r13,%r12
+	movdqa	%xmm4,%xmm2
+	orq	%r8,%r12
+	orq	%r9,%r12
+	por	%xmm5,%xmm2
+.byte	102,73,15,110,220
+
+	movq	384(%rsp),%rdx
+	leaq	384(%rsp),%rbx
+	movq	0+96(%rsp),%r9
+	movq	8+96(%rsp),%r10
+	leaq	-128+96(%rsp),%rsi
+	movq	16+96(%rsp),%r11
+	movq	24+96(%rsp),%r12
+	leaq	160(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	480(%rsp),%rdx
+	leaq	480(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	192(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	160(%rsp),%rbx
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	orq	%r13,%r12
+	orq	%r8,%r12
+	orq	%r9,%r12
+
+.byte	102,73,15,126,208
+.byte	102,73,15,126,217
+	orq	%r8,%r12
+.byte	0x3e
+	jnz	.Ladd_proceedx
+
+
+
+	testq	%r9,%r9
+	jz	.Ladd_doublex
+
+
+
+
+
+
+.byte	102,72,15,126,199
+	pxor	%xmm0,%xmm0
+	movdqu	%xmm0,0(%rdi)
+	movdqu	%xmm0,16(%rdi)
+	movdqu	%xmm0,32(%rdi)
+	movdqu	%xmm0,48(%rdi)
+	movdqu	%xmm0,64(%rdi)
+	movdqu	%xmm0,80(%rdi)
+	jmp	.Ladd_donex
+
+.align	32
+.Ladd_doublex:
+.byte	102,72,15,126,206
+.byte	102,72,15,126,199
+	addq	$416,%rsp
+.cfi_adjust_cfa_offset	-416
+	jmp	.Lpoint_double_shortcutx
+.cfi_adjust_cfa_offset	416
+
+.align	32
+.Ladd_proceedx:
+	movq	0+64(%rsp),%rdx
+	movq	8+64(%rsp),%r14
+	leaq	-128+64(%rsp),%rsi
+	movq	16+64(%rsp),%r15
+	movq	24+64(%rsp),%r8
+	leaq	96(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	448(%rsp),%rdx
+	leaq	448(%rsp),%rbx
+	movq	0+0(%rsp),%r9
+	movq	8+0(%rsp),%r10
+	leaq	-128+0(%rsp),%rsi
+	movq	16+0(%rsp),%r11
+	movq	24+0(%rsp),%r12
+	leaq	352(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	0+0(%rsp),%rdx
+	movq	8+0(%rsp),%r14
+	leaq	-128+0(%rsp),%rsi
+	movq	16+0(%rsp),%r15
+	movq	24+0(%rsp),%r8
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	544(%rsp),%rdx
+	leaq	544(%rsp),%rbx
+	movq	0+352(%rsp),%r9
+	movq	8+352(%rsp),%r10
+	leaq	-128+352(%rsp),%rsi
+	movq	16+352(%rsp),%r11
+	movq	24+352(%rsp),%r12
+	leaq	352(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	0(%rsp),%rdx
+	leaq	0(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	160(%rsp),%rdx
+	leaq	160(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	192(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+
+
+
+	xorq	%r11,%r11
+	addq	%r12,%r12
+	leaq	96(%rsp),%rsi
+	adcq	%r13,%r13
+	movq	%r12,%rax
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	movq	0(%rsi),%rax
+	cmovcq	%rbp,%r13
+	movq	8(%rsi),%rbp
+	cmovcq	%rcx,%r8
+	movq	16(%rsi),%rcx
+	cmovcq	%r10,%r9
+	movq	24(%rsi),%r10
+
+	call	__ecp_nistz256_subx
+
+	leaq	128(%rsp),%rbx
+	leaq	288(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	movq	192+0(%rsp),%rax
+	movq	192+8(%rsp),%rbp
+	movq	192+16(%rsp),%rcx
+	movq	192+24(%rsp),%r10
+	leaq	320(%rsp),%rdi
+
+	call	__ecp_nistz256_subx
+
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+	movq	128(%rsp),%rdx
+	leaq	128(%rsp),%rbx
+	movq	0+224(%rsp),%r9
+	movq	8+224(%rsp),%r10
+	leaq	-128+224(%rsp),%rsi
+	movq	16+224(%rsp),%r11
+	movq	24+224(%rsp),%r12
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	320(%rsp),%rdx
+	leaq	320(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	-128+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	320(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	256(%rsp),%rbx
+	leaq	320(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+.byte	102,72,15,126,199
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	352(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	352+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	544(%rsp),%xmm2
+	pand	544+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	448(%rsp),%xmm2
+	pand	448+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,64(%rdi)
+	movdqu	%xmm3,80(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	288(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	288+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	480(%rsp),%xmm2
+	pand	480+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	384(%rsp),%xmm2
+	pand	384+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,0(%rdi)
+	movdqu	%xmm3,16(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	320(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	320+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	512(%rsp),%xmm2
+	pand	512+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	416(%rsp),%xmm2
+	pand	416+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm3,48(%rdi)
+
+.Ladd_donex:
+	leaq	576+56(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbx
+.cfi_restore	%rbx
+	movq	-8(%rsi),%rbp
+.cfi_restore	%rbp
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lpoint_addx_epilogue:
+	ret
+.cfi_endproc	
+.size	ecp_nistz256_point_addx,.-ecp_nistz256_point_addx
+.type	ecp_nistz256_point_add_affinex,@function
+.align	32
+ecp_nistz256_point_add_affinex:
+.cfi_startproc	
+.Lpoint_add_affinex:
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$480+8,%rsp
+.cfi_adjust_cfa_offset	32*15+8
+.Ladd_affinex_body:
+
+	movdqu	0(%rsi),%xmm0
+	movq	%rdx,%rbx
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm3
+	movdqu	64(%rsi),%xmm4
+	movdqu	80(%rsi),%xmm5
+	movq	64+0(%rsi),%rdx
+	movq	64+8(%rsi),%r14
+	movq	64+16(%rsi),%r15
+	movq	64+24(%rsi),%r8
+	movdqa	%xmm0,320(%rsp)
+	movdqa	%xmm1,320+16(%rsp)
+	movdqa	%xmm2,352(%rsp)
+	movdqa	%xmm3,352+16(%rsp)
+	movdqa	%xmm4,384(%rsp)
+	movdqa	%xmm5,384+16(%rsp)
+	por	%xmm4,%xmm5
+
+	movdqu	0(%rbx),%xmm0
+	pshufd	$0xb1,%xmm5,%xmm3
+	movdqu	16(%rbx),%xmm1
+	movdqu	32(%rbx),%xmm2
+	por	%xmm3,%xmm5
+	movdqu	48(%rbx),%xmm3
+	movdqa	%xmm0,416(%rsp)
+	pshufd	$0x1e,%xmm5,%xmm4
+	movdqa	%xmm1,416+16(%rsp)
+	por	%xmm0,%xmm1
+.byte	102,72,15,110,199
+	movdqa	%xmm2,448(%rsp)
+	movdqa	%xmm3,448+16(%rsp)
+	por	%xmm2,%xmm3
+	por	%xmm4,%xmm5
+	pxor	%xmm4,%xmm4
+	por	%xmm1,%xmm3
+
+	leaq	64-128(%rsi),%rsi
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	pcmpeqd	%xmm4,%xmm5
+	pshufd	$0xb1,%xmm3,%xmm4
+	movq	0(%rbx),%rdx
+
+	movq	%r12,%r9
+	por	%xmm3,%xmm4
+	pshufd	$0,%xmm5,%xmm5
+	pshufd	$0x1e,%xmm4,%xmm3
+	movq	%r13,%r10
+	por	%xmm3,%xmm4
+	pxor	%xmm3,%xmm3
+	movq	%r14,%r11
+	pcmpeqd	%xmm3,%xmm4
+	pshufd	$0,%xmm4,%xmm4
+
+	leaq	32-128(%rsp),%rsi
+	movq	%r15,%r12
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	320(%rsp),%rbx
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	movq	384(%rsp),%rdx
+	leaq	384(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	384(%rsp),%rdx
+	leaq	384(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	-128+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	288(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	448(%rsp),%rdx
+	leaq	448(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	352(%rsp),%rbx
+	leaq	96(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	movq	0+64(%rsp),%rdx
+	movq	8+64(%rsp),%r14
+	leaq	-128+64(%rsp),%rsi
+	movq	16+64(%rsp),%r15
+	movq	24+64(%rsp),%r8
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	0+96(%rsp),%rdx
+	movq	8+96(%rsp),%r14
+	leaq	-128+96(%rsp),%rsi
+	movq	16+96(%rsp),%r15
+	movq	24+96(%rsp),%r8
+	leaq	192(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	128(%rsp),%rdx
+	leaq	128(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	-128+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	160(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	320(%rsp),%rdx
+	leaq	320(%rsp),%rbx
+	movq	0+128(%rsp),%r9
+	movq	8+128(%rsp),%r10
+	leaq	-128+128(%rsp),%rsi
+	movq	16+128(%rsp),%r11
+	movq	24+128(%rsp),%r12
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+
+
+
+	xorq	%r11,%r11
+	addq	%r12,%r12
+	leaq	192(%rsp),%rsi
+	adcq	%r13,%r13
+	movq	%r12,%rax
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	movq	0(%rsi),%rax
+	cmovcq	%rbp,%r13
+	movq	8(%rsi),%rbp
+	cmovcq	%rcx,%r8
+	movq	16(%rsi),%rcx
+	cmovcq	%r10,%r9
+	movq	24(%rsi),%r10
+
+	call	__ecp_nistz256_subx
+
+	leaq	160(%rsp),%rbx
+	leaq	224(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	movq	0+0(%rsp),%rax
+	movq	0+8(%rsp),%rbp
+	movq	0+16(%rsp),%rcx
+	movq	0+24(%rsp),%r10
+	leaq	64(%rsp),%rdi
+
+	call	__ecp_nistz256_subx
+
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+	movq	352(%rsp),%rdx
+	leaq	352(%rsp),%rbx
+	movq	0+160(%rsp),%r9
+	movq	8+160(%rsp),%r10
+	leaq	-128+160(%rsp),%rsi
+	movq	16+160(%rsp),%r11
+	movq	24+160(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	96(%rsp),%rdx
+	leaq	96(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	-128+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	32(%rsp),%rbx
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+.byte	102,72,15,126,199
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	288(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	288+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	.LONE_mont(%rip),%xmm2
+	pand	.LONE_mont+16(%rip),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	384(%rsp),%xmm2
+	pand	384+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,64(%rdi)
+	movdqu	%xmm3,80(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	224(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	224+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	416(%rsp),%xmm2
+	pand	416+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	320(%rsp),%xmm2
+	pand	320+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,0(%rdi)
+	movdqu	%xmm3,16(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	256(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	256+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	448(%rsp),%xmm2
+	pand	448+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	352(%rsp),%xmm2
+	pand	352+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm3,48(%rdi)
+
+	leaq	480+56(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbx
+.cfi_restore	%rbx
+	movq	-8(%rsi),%rbp
+.cfi_restore	%rbp
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Ladd_affinex_epilogue:
+	ret
+.cfi_endproc	
+.size	ecp_nistz256_point_add_affinex,.-ecp_nistz256_point_add_affinex
+#endif
diff --git a/gen/bcm/p256-x86_64-asm-win.asm b/gen/bcm/p256-x86_64-asm-win.asm
new file mode 100644
index 0000000..c25cac3
--- /dev/null
+++ b/gen/bcm/p256-x86_64-asm-win.asm
@@ -0,0 +1,5004 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section	.text code align=64
+
+EXTERN	OPENSSL_ia32cap_P
+
+
+section	.rdata rdata align=8
+ALIGN	64
+$L$poly:
+	DQ	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
+
+$L$One:
+	DD	1,1,1,1,1,1,1,1
+$L$Two:
+	DD	2,2,2,2,2,2,2,2
+$L$Three:
+	DD	3,3,3,3,3,3,3,3
+$L$ONE_mont:
+	DQ	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
+
+
+$L$ord:
+	DQ	0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
+$L$ordK:
+	DQ	0xccd1c8aaee00bc4f
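+
+; The data above pins down the moduli used below: $L$poly is the field
+; prime p256, $L$ONE_mont is 1 in Montgomery form (R mod p), $L$ord is
+; the group order n, and $L$ordK looks like the per-limb Montgomery
+; constant for n (-n^-1 mod 2^64) consumed by the ord_* routines.
+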
+section	.text
+
+
+
+
+global	ecp_nistz256_neg
+
+ALIGN	32
+ecp_nistz256_neg:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_ecp_nistz256_neg:
+	mov	rdi,rcx
+	mov	rsi,rdx
+
+
+
+_CET_ENDBR
+	push	r12
+
+	push	r13
+
+$L$neg_body:
+
+	xor	r8,r8
+	xor	r9,r9
+	xor	r10,r10
+	xor	r11,r11
+	xor	r13,r13
+
+	sub	r8,QWORD[rsi]
+	sbb	r9,QWORD[8+rsi]
+	sbb	r10,QWORD[16+rsi]
+	mov	rax,r8
+	sbb	r11,QWORD[24+rsi]
+	lea	rsi,[$L$poly]
+	mov	rdx,r9
+	sbb	r13,0
+
+	add	r8,QWORD[rsi]
+	mov	rcx,r10
+	adc	r9,QWORD[8+rsi]
+	adc	r10,QWORD[16+rsi]
+	mov	r12,r11
+	adc	r11,QWORD[24+rsi]
+	test	r13,r13
+
+	cmovz	r8,rax
+	cmovz	r9,rdx
+	mov	QWORD[rdi],r8
+	cmovz	r10,rcx
+	mov	QWORD[8+rdi],r9
+	cmovz	r11,r12
+	mov	QWORD[16+rdi],r10
+	mov	QWORD[24+rdi],r11
+
+	mov	r13,QWORD[rsp]
+
+	mov	r12,QWORD[8+rsp]
+
+	lea	rsp,[16+rsp]
+
+$L$neg_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_ecp_nistz256_neg:
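+
+; The Win64 wrappers all follow the same shape: the prologue parks rdi
+; and rsi in the caller's shadow space and copies the Microsoft-ABI
+; argument registers (rcx, rdx, r8) into their System V slots (rdi,
+; rsi, rdx) so the body can stay identical to the ELF build, and the
+; $L$SEH_begin/$L$SEH_end labels bracket each function for the SEH
+; unwind tables.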
+
+
+
+
+
+
+global	ecp_nistz256_ord_mul_mont
+
+ALIGN	32
+ecp_nistz256_ord_mul_mont:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_ecp_nistz256_ord_mul_mont:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+_CET_ENDBR
+	lea	rcx,[OPENSSL_ia32cap_P]
+	mov	rcx,QWORD[8+rcx]
+	and	ecx,0x80100
+	cmp	ecx,0x80100
+	je	NEAR $L$ecp_nistz256_ord_mul_montx
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+$L$ord_mul_body:
+
+	mov	rax,QWORD[rdx]
+	mov	rbx,rdx
+	lea	r14,[$L$ord]
+	mov	r15,QWORD[$L$ordK]
+
+
+	mov	rcx,rax
+	mul	QWORD[rsi]
+	mov	r8,rax
+	mov	rax,rcx
+	mov	r9,rdx
+
+	mul	QWORD[8+rsi]
+	add	r9,rax
+	mov	rax,rcx
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	QWORD[16+rsi]
+	add	r10,rax
+	mov	rax,rcx
+	adc	rdx,0
+
+	mov	r13,r8
+	imul	r8,r15
+
+	mov	r11,rdx
+	mul	QWORD[24+rsi]
+	add	r11,rax
+	mov	rax,r8
+	adc	rdx,0
+	mov	r12,rdx
+
+
+	mul	QWORD[r14]
+	mov	rbp,r8
+	add	r13,rax
+	mov	rax,r8
+	adc	rdx,0
+	mov	rcx,rdx
+
+	sub	r10,r8
+	sbb	r8,0
+
+	mul	QWORD[8+r14]
+	add	r9,rcx
+	adc	rdx,0
+	add	r9,rax
+	mov	rax,rbp
+	adc	r10,rdx
+	mov	rdx,rbp
+	adc	r8,0
+
+	shl	rax,32
+	shr	rdx,32
+	sub	r11,rax
+	mov	rax,QWORD[8+rbx]
+	sbb	rbp,rdx
+
+	add	r11,r8
+	adc	r12,rbp
+	adc	r13,0
+
+
+	mov	rcx,rax
+	mul	QWORD[rsi]
+	add	r9,rax
+	mov	rax,rcx
+	adc	rdx,0
+	mov	rbp,rdx
+
+	mul	QWORD[8+rsi]
+	add	r10,rbp
+	adc	rdx,0
+	add	r10,rax
+	mov	rax,rcx
+	adc	rdx,0
+	mov	rbp,rdx
+
+	mul	QWORD[16+rsi]
+	add	r11,rbp
+	adc	rdx,0
+	add	r11,rax
+	mov	rax,rcx
+	adc	rdx,0
+
+	mov	rcx,r9
+	imul	r9,r15
+
+	mov	rbp,rdx
+	mul	QWORD[24+rsi]
+	add	r12,rbp
+	adc	rdx,0
+	xor	r8,r8
+	add	r12,rax
+	mov	rax,r9
+	adc	r13,rdx
+	adc	r8,0
+
+
+	mul	QWORD[r14]
+	mov	rbp,r9
+	add	rcx,rax
+	mov	rax,r9
+	adc	rcx,rdx
+
+	sub	r11,r9
+	sbb	r9,0
+
+	mul	QWORD[8+r14]
+	add	r10,rcx
+	adc	rdx,0
+	add	r10,rax
+	mov	rax,rbp
+	adc	r11,rdx
+	mov	rdx,rbp
+	adc	r9,0
+
+	shl	rax,32
+	shr	rdx,32
+	sub	r12,rax
+	mov	rax,QWORD[16+rbx]
+	sbb	rbp,rdx
+
+	add	r12,r9
+	adc	r13,rbp
+	adc	r8,0
+
+
+	mov	rcx,rax
+	mul	QWORD[rsi]
+	add	r10,rax
+	mov	rax,rcx
+	adc	rdx,0
+	mov	rbp,rdx
+
+	mul	QWORD[8+rsi]
+	add	r11,rbp
+	adc	rdx,0
+	add	r11,rax
+	mov	rax,rcx
+	adc	rdx,0
+	mov	rbp,rdx
+
+	mul	QWORD[16+rsi]
+	add	r12,rbp
+	adc	rdx,0
+	add	r12,rax
+	mov	rax,rcx
+	adc	rdx,0
+
+	mov	rcx,r10
+	imul	r10,r15
+
+	mov	rbp,rdx
+	mul	QWORD[24+rsi]
+	add	r13,rbp
+	adc	rdx,0
+	xor	r9,r9
+	add	r13,rax
+	mov	rax,r10
+	adc	r8,rdx
+	adc	r9,0
+
+
+	mul	QWORD[r14]
+	mov	rbp,r10
+	add	rcx,rax
+	mov	rax,r10
+	adc	rcx,rdx
+
+	sub	r12,r10
+	sbb	r10,0
+
+	mul	QWORD[8+r14]
+	add	r11,rcx
+	adc	rdx,0
+	add	r11,rax
+	mov	rax,rbp
+	adc	r12,rdx
+	mov	rdx,rbp
+	adc	r10,0
+
+	shl	rax,32
+	shr	rdx,32
+	sub	r13,rax
+	mov	rax,QWORD[24+rbx]
+	sbb	rbp,rdx
+
+	add	r13,r10
+	adc	r8,rbp
+	adc	r9,0
+
+
+	mov	rcx,rax
+	mul	QWORD[rsi]
+	add	r11,rax
+	mov	rax,rcx
+	adc	rdx,0
+	mov	rbp,rdx
+
+	mul	QWORD[8+rsi]
+	add	r12,rbp
+	adc	rdx,0
+	add	r12,rax
+	mov	rax,rcx
+	adc	rdx,0
+	mov	rbp,rdx
+
+	mul	QWORD[16+rsi]
+	add	r13,rbp
+	adc	rdx,0
+	add	r13,rax
+	mov	rax,rcx
+	adc	rdx,0
+
+	mov	rcx,r11
+	imul	r11,r15
+
+	mov	rbp,rdx
+	mul	QWORD[24+rsi]
+	add	r8,rbp
+	adc	rdx,0
+	xor	r10,r10
+	add	r8,rax
+	mov	rax,r11
+	adc	r9,rdx
+	adc	r10,0
+
+
+	mul	QWORD[r14]
+	mov	rbp,r11
+	add	rcx,rax
+	mov	rax,r11
+	adc	rcx,rdx
+
+	sub	r13,r11
+	sbb	r11,0
+
+	mul	QWORD[8+r14]
+	add	r12,rcx
+	adc	rdx,0
+	add	r12,rax
+	mov	rax,rbp
+	adc	r13,rdx
+	mov	rdx,rbp
+	adc	r11,0
+
+	shl	rax,32
+	shr	rdx,32
+	sub	r8,rax
+	sbb	rbp,rdx
+
+	add	r8,r11
+	adc	r9,rbp
+	adc	r10,0
+
+
+	mov	rsi,r12
+	sub	r12,QWORD[r14]
+	mov	r11,r13
+	sbb	r13,QWORD[8+r14]
+	mov	rcx,r8
+	sbb	r8,QWORD[16+r14]
+	mov	rbp,r9
+	sbb	r9,QWORD[24+r14]
+	sbb	r10,0
+
+	cmovc	r12,rsi
+	cmovc	r13,r11
+	cmovc	r8,rcx
+	cmovc	r9,rbp
+
+	mov	QWORD[rdi],r12
+	mov	QWORD[8+rdi],r13
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+
+	mov	r15,QWORD[rsp]
+
+	mov	r14,QWORD[8+rsp]
+
+	mov	r13,QWORD[16+rsp]
+
+	mov	r12,QWORD[24+rsp]
+
+	mov	rbx,QWORD[32+rsp]
+
+	mov	rbp,QWORD[40+rsp]
+
+	lea	rsp,[48+rsp]
+
+$L$ord_mul_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_ecp_nistz256_ord_mul_mont:
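+
+; ord_mul_mont is word-by-word Montgomery multiplication modulo the
+; group order n: for each limb of b it accumulates a*b[i], derives
+; m = t0*$L$ordK, and folds in m*n so that the low limb cancels. The
+; shl/shr-by-32 pairs appear to exploit the shape of n's top limbs
+; (2^64-1 and 2^64-2^32) to trade two of the four per-limb
+; multiplications for shifts and subtractions; one final conditional
+; subtract of n brings the result into range.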
+
+
+
+
+
+
+
+global	ecp_nistz256_ord_sqr_mont
+
+ALIGN	32
+ecp_nistz256_ord_sqr_mont:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_ecp_nistz256_ord_sqr_mont:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+_CET_ENDBR
+	lea	rcx,[OPENSSL_ia32cap_P]
+	mov	rcx,QWORD[8+rcx]
+	and	ecx,0x80100
+	cmp	ecx,0x80100
+	je	NEAR $L$ecp_nistz256_ord_sqr_montx
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+$L$ord_sqr_body:
+
+	mov	r8,QWORD[rsi]
+	mov	rax,QWORD[8+rsi]
+	mov	r14,QWORD[16+rsi]
+	mov	r15,QWORD[24+rsi]
+	lea	rsi,[$L$ord]
+	mov	rbx,rdx
+	jmp	NEAR $L$oop_ord_sqr
+
+ALIGN	32
+$L$oop_ord_sqr:
+
+	mov	rbp,rax
+	mul	r8
+	mov	r9,rax
+DB	102,72,15,110,205
+	mov	rax,r14
+	mov	r10,rdx
+
+	mul	r8
+	add	r10,rax
+	mov	rax,r15
+DB	102,73,15,110,214
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	r8
+	add	r11,rax
+	mov	rax,r15
+DB	102,73,15,110,223
+	adc	rdx,0
+	mov	r12,rdx
+
+
+	mul	r14
+	mov	r13,rax
+	mov	rax,r14
+	mov	r14,rdx
+
+
+	mul	rbp
+	add	r11,rax
+	mov	rax,r15
+	adc	rdx,0
+	mov	r15,rdx
+
+	mul	rbp
+	add	r12,rax
+	adc	rdx,0
+
+	add	r12,r15
+	adc	r13,rdx
+	adc	r14,0
+
+
+	xor	r15,r15
+	mov	rax,r8
+	add	r9,r9
+	adc	r10,r10
+	adc	r11,r11
+	adc	r12,r12
+	adc	r13,r13
+	adc	r14,r14
+	adc	r15,0
+
+
+	mul	rax
+	mov	r8,rax
+DB	102,72,15,126,200
+	mov	rbp,rdx
+
+	mul	rax
+	add	r9,rbp
+	adc	r10,rax
+DB	102,72,15,126,208
+	adc	rdx,0
+	mov	rbp,rdx
+
+	mul	rax
+	add	r11,rbp
+	adc	r12,rax
+DB	102,72,15,126,216
+	adc	rdx,0
+	mov	rbp,rdx
+
+	mov	rcx,r8
+	imul	r8,QWORD[32+rsi]
+
+	mul	rax
+	add	r13,rbp
+	adc	r14,rax
+	mov	rax,QWORD[rsi]
+	adc	r15,rdx
+
+
+	mul	r8
+	mov	rbp,r8
+	add	rcx,rax
+	mov	rax,QWORD[8+rsi]
+	adc	rcx,rdx
+
+	sub	r10,r8
+	sbb	rbp,0
+
+	mul	r8
+	add	r9,rcx
+	adc	rdx,0
+	add	r9,rax
+	mov	rax,r8
+	adc	r10,rdx
+	mov	rdx,r8
+	adc	rbp,0
+
+	mov	rcx,r9
+	imul	r9,QWORD[32+rsi]
+
+	shl	rax,32
+	shr	rdx,32
+	sub	r11,rax
+	mov	rax,QWORD[rsi]
+	sbb	r8,rdx
+
+	add	r11,rbp
+	adc	r8,0
+
+
+	mul	r9
+	mov	rbp,r9
+	add	rcx,rax
+	mov	rax,QWORD[8+rsi]
+	adc	rcx,rdx
+
+	sub	r11,r9
+	sbb	rbp,0
+
+	mul	r9
+	add	r10,rcx
+	adc	rdx,0
+	add	r10,rax
+	mov	rax,r9
+	adc	r11,rdx
+	mov	rdx,r9
+	adc	rbp,0
+
+	mov	rcx,r10
+	imul	r10,QWORD[32+rsi]
+
+	shl	rax,32
+	shr	rdx,32
+	sub	r8,rax
+	mov	rax,QWORD[rsi]
+	sbb	r9,rdx
+
+	add	r8,rbp
+	adc	r9,0
+
+
+	mul	r10
+	mov	rbp,r10
+	add	rcx,rax
+	mov	rax,QWORD[8+rsi]
+	adc	rcx,rdx
+
+	sub	r8,r10
+	sbb	rbp,0
+
+	mul	r10
+	add	r11,rcx
+	adc	rdx,0
+	add	r11,rax
+	mov	rax,r10
+	adc	r8,rdx
+	mov	rdx,r10
+	adc	rbp,0
+
+	mov	rcx,r11
+	imul	r11,QWORD[32+rsi]
+
+	shl	rax,32
+	shr	rdx,32
+	sub	r9,rax
+	mov	rax,QWORD[rsi]
+	sbb	r10,rdx
+
+	add	r9,rbp
+	adc	r10,0
+
+
+	mul	r11
+	mov	rbp,r11
+	add	rcx,rax
+	mov	rax,QWORD[8+rsi]
+	adc	rcx,rdx
+
+	sub	r9,r11
+	sbb	rbp,0
+
+	mul	r11
+	add	r8,rcx
+	adc	rdx,0
+	add	r8,rax
+	mov	rax,r11
+	adc	r9,rdx
+	mov	rdx,r11
+	adc	rbp,0
+
+	shl	rax,32
+	shr	rdx,32
+	sub	r10,rax
+	sbb	r11,rdx
+
+	add	r10,rbp
+	adc	r11,0
+
+
+	xor	rdx,rdx
+	add	r8,r12
+	adc	r9,r13
+	mov	r12,r8
+	adc	r10,r14
+	adc	r11,r15
+	mov	rax,r9
+	adc	rdx,0
+
+
+	sub	r8,QWORD[rsi]
+	mov	r14,r10
+	sbb	r9,QWORD[8+rsi]
+	sbb	r10,QWORD[16+rsi]
+	mov	r15,r11
+	sbb	r11,QWORD[24+rsi]
+	sbb	rdx,0
+
+	cmovc	r8,r12
+	cmovnc	rax,r9
+	cmovnc	r14,r10
+	cmovnc	r15,r11
+
+	dec	rbx
+	jnz	NEAR $L$oop_ord_sqr
+
+	mov	QWORD[rdi],r8
+	mov	QWORD[8+rdi],rax
+	pxor	xmm1,xmm1
+	mov	QWORD[16+rdi],r14
+	pxor	xmm2,xmm2
+	mov	QWORD[24+rdi],r15
+	pxor	xmm3,xmm3
+
+	mov	r15,QWORD[rsp]
+
+	mov	r14,QWORD[8+rsp]
+
+	mov	r13,QWORD[16+rsp]
+
+	mov	r12,QWORD[24+rsp]
+
+	mov	rbx,QWORD[32+rsp]
+
+	mov	rbp,QWORD[40+rsp]
+
+	lea	rsp,[48+rsp]
+
+$L$ord_sqr_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_ecp_nistz256_ord_sqr_mont:
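+
+; ord_sqr_mont takes a repetition count in its third argument (the
+; dec rbx / jnz $L$oop_ord_sqr at the bottom) and squares in place that
+; many times, presumably for the fixed square chains of an inversion
+; ladder. The pxor of xmm1..xmm3 on the way out clears the registers
+; that cached the input limbs.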
+
+
+ALIGN	32
+ecp_nistz256_ord_mul_montx:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_ecp_nistz256_ord_mul_montx:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+$L$ecp_nistz256_ord_mul_montx:
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+$L$ord_mulx_body:
+
+	mov	rbx,rdx
+	mov	rdx,QWORD[rdx]
+	mov	r9,QWORD[rsi]
+	mov	r10,QWORD[8+rsi]
+	mov	r11,QWORD[16+rsi]
+	mov	r12,QWORD[24+rsi]
+	lea	rsi,[((-128))+rsi]
+	lea	r14,[(($L$ord-128))]
+	mov	r15,QWORD[$L$ordK]
+
+
+	mulx	r9,r8,r9
+	mulx	r10,rcx,r10
+	mulx	r11,rbp,r11
+	add	r9,rcx
+	mulx	r12,rcx,r12
+	mov	rdx,r8
+	mulx	rax,rdx,r15
+	adc	r10,rbp
+	adc	r11,rcx
+	adc	r12,0
+
+
+	xor	r13,r13
+	mulx	rbp,rcx,QWORD[((0+128))+r14]
+	adcx	r8,rcx
+	adox	r9,rbp
+
+	mulx	rbp,rcx,QWORD[((8+128))+r14]
+	adcx	r9,rcx
+	adox	r10,rbp
+
+	mulx	rbp,rcx,QWORD[((16+128))+r14]
+	adcx	r10,rcx
+	adox	r11,rbp
+
+	mulx	rbp,rcx,QWORD[((24+128))+r14]
+	mov	rdx,QWORD[8+rbx]
+	adcx	r11,rcx
+	adox	r12,rbp
+	adcx	r12,r8
+	adox	r13,r8
+	adc	r13,0
+
+
+	mulx	rbp,rcx,QWORD[((0+128))+rsi]
+	adcx	r9,rcx
+	adox	r10,rbp
+
+	mulx	rbp,rcx,QWORD[((8+128))+rsi]
+	adcx	r10,rcx
+	adox	r11,rbp
+
+	mulx	rbp,rcx,QWORD[((16+128))+rsi]
+	adcx	r11,rcx
+	adox	r12,rbp
+
+	mulx	rbp,rcx,QWORD[((24+128))+rsi]
+	mov	rdx,r9
+	mulx	rax,rdx,r15
+	adcx	r12,rcx
+	adox	r13,rbp
+
+	adcx	r13,r8
+	adox	r8,r8
+	adc	r8,0
+
+
+	mulx	rbp,rcx,QWORD[((0+128))+r14]
+	adcx	r9,rcx
+	adox	r10,rbp
+
+	mulx	rbp,rcx,QWORD[((8+128))+r14]
+	adcx	r10,rcx
+	adox	r11,rbp
+
+	mulx	rbp,rcx,QWORD[((16+128))+r14]
+	adcx	r11,rcx
+	adox	r12,rbp
+
+	mulx	rbp,rcx,QWORD[((24+128))+r14]
+	mov	rdx,QWORD[16+rbx]
+	adcx	r12,rcx
+	adox	r13,rbp
+	adcx	r13,r9
+	adox	r8,r9
+	adc	r8,0
+
+
+	mulx	rbp,rcx,QWORD[((0+128))+rsi]
+	adcx	r10,rcx
+	adox	r11,rbp
+
+	mulx	rbp,rcx,QWORD[((8+128))+rsi]
+	adcx	r11,rcx
+	adox	r12,rbp
+
+	mulx	rbp,rcx,QWORD[((16+128))+rsi]
+	adcx	r12,rcx
+	adox	r13,rbp
+
+	mulx	rbp,rcx,QWORD[((24+128))+rsi]
+	mov	rdx,r10
+	mulx	rax,rdx,r15
+	adcx	r13,rcx
+	adox	r8,rbp
+
+	adcx	r8,r9
+	adox	r9,r9
+	adc	r9,0
+
+
+	mulx	rbp,rcx,QWORD[((0+128))+r14]
+	adcx	r10,rcx
+	adox	r11,rbp
+
+	mulx	rbp,rcx,QWORD[((8+128))+r14]
+	adcx	r11,rcx
+	adox	r12,rbp
+
+	mulx	rbp,rcx,QWORD[((16+128))+r14]
+	adcx	r12,rcx
+	adox	r13,rbp
+
+	mulx	rbp,rcx,QWORD[((24+128))+r14]
+	mov	rdx,QWORD[24+rbx]
+	adcx	r13,rcx
+	adox	r8,rbp
+	adcx	r8,r10
+	adox	r9,r10
+	adc	r9,0
+
+
+	mulx	rbp,rcx,QWORD[((0+128))+rsi]
+	adcx	r11,rcx
+	adox	r12,rbp
+
+	mulx	rbp,rcx,QWORD[((8+128))+rsi]
+	adcx	r12,rcx
+	adox	r13,rbp
+
+	mulx	rbp,rcx,QWORD[((16+128))+rsi]
+	adcx	r13,rcx
+	adox	r8,rbp
+
+	mulx	rbp,rcx,QWORD[((24+128))+rsi]
+	mov	rdx,r11
+	mulx	rax,rdx,r15
+	adcx	r8,rcx
+	adox	r9,rbp
+
+	adcx	r9,r10
+	adox	r10,r10
+	adc	r10,0
+
+
+	mulx	rbp,rcx,QWORD[((0+128))+r14]
+	adcx	r11,rcx
+	adox	r12,rbp
+
+	mulx	rbp,rcx,QWORD[((8+128))+r14]
+	adcx	r12,rcx
+	adox	r13,rbp
+
+	mulx	rbp,rcx,QWORD[((16+128))+r14]
+	adcx	r13,rcx
+	adox	r8,rbp
+
+	mulx	rbp,rcx,QWORD[((24+128))+r14]
+	lea	r14,[128+r14]
+	mov	rbx,r12
+	adcx	r8,rcx
+	adox	r9,rbp
+	mov	rdx,r13
+	adcx	r9,r11
+	adox	r10,r11
+	adc	r10,0
+
+
+
+	mov	rcx,r8
+	sub	r12,QWORD[r14]
+	sbb	r13,QWORD[8+r14]
+	sbb	r8,QWORD[16+r14]
+	mov	rbp,r9
+	sbb	r9,QWORD[24+r14]
+	sbb	r10,0
+
+	cmovc	r12,rbx
+	cmovc	r13,rdx
+	cmovc	r8,rcx
+	cmovc	r9,rbp
+
+	mov	QWORD[rdi],r12
+	mov	QWORD[8+rdi],r13
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+
+	mov	r15,QWORD[rsp]
+
+	mov	r14,QWORD[8+rsp]
+
+	mov	r13,QWORD[16+rsp]
+
+	mov	r12,QWORD[24+rsp]
+
+	mov	rbx,QWORD[32+rsp]
+
+	mov	rbp,QWORD[40+rsp]
+
+	lea	rsp,[48+rsp]
+
+$L$ord_mulx_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_ecp_nistz256_ord_mul_montx:
+
+
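+; MULX/ADX path for repeated Montgomery squaring modulo the P-256 group
+; order; the repetition count (kept in rbx) drives $L$oop_ord_sqrx.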
+ALIGN	32
+ecp_nistz256_ord_sqr_montx:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_ecp_nistz256_ord_sqr_montx:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+$L$ecp_nistz256_ord_sqr_montx:
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+$L$ord_sqrx_body:
+
+	mov	rbx,rdx
+	mov	rdx,QWORD[rsi]
+	mov	r14,QWORD[8+rsi]
+	mov	r15,QWORD[16+rsi]
+	mov	r8,QWORD[24+rsi]
+	lea	rsi,[$L$ord]
+	jmp	NEAR $L$oop_ord_sqrx
+
+ALIGN	32
+$L$oop_ord_sqrx:
+	mulx	r10,r9,r14
+	mulx	r11,rcx,r15
+	mov	rax,rdx
+DB	102,73,15,110,206
+	mulx	r12,rbp,r8
+	mov	rdx,r14
+	add	r10,rcx
+DB	102,73,15,110,215
+	adc	r11,rbp
+	adc	r12,0
+	xor	r13,r13
+
+	mulx	rbp,rcx,r15
+	adcx	r11,rcx
+	adox	r12,rbp
+
+	mulx	rbp,rcx,r8
+	mov	rdx,r15
+	adcx	r12,rcx
+	adox	r13,rbp
+	adc	r13,0
+
+	mulx	r14,rcx,r8
+	mov	rdx,rax
+DB	102,73,15,110,216
+	xor	r15,r15
+	adcx	r9,r9
+	adox	r13,rcx
+	adcx	r10,r10
+	adox	r14,r15
+
+
+	mulx	rbp,r8,rdx
+DB	102,72,15,126,202
+	adcx	r11,r11
+	adox	r9,rbp
+	adcx	r12,r12
+	mulx	rax,rcx,rdx
+DB	102,72,15,126,210
+	adcx	r13,r13
+	adox	r10,rcx
+	adcx	r14,r14
+	mulx	rbp,rcx,rdx
+	DB	0x67
+DB	102,72,15,126,218
+	adox	r11,rax
+	adcx	r15,r15
+	adox	r12,rcx
+	adox	r13,rbp
+	mulx	rax,rcx,rdx
+	adox	r14,rcx
+	adox	r15,rax
+
+
+	mov	rdx,r8
+	mulx	rcx,rdx,QWORD[32+rsi]
+
+	xor	rax,rax
+	mulx	rbp,rcx,QWORD[rsi]
+	adcx	r8,rcx
+	adox	r9,rbp
+	mulx	rbp,rcx,QWORD[8+rsi]
+	adcx	r9,rcx
+	adox	r10,rbp
+	mulx	rbp,rcx,QWORD[16+rsi]
+	adcx	r10,rcx
+	adox	r11,rbp
+	mulx	rbp,rcx,QWORD[24+rsi]
+	adcx	r11,rcx
+	adox	r8,rbp
+	adcx	r8,rax
+
+
+	mov	rdx,r9
+	mulx	rcx,rdx,QWORD[32+rsi]
+
+	mulx	rbp,rcx,QWORD[rsi]
+	adox	r9,rcx
+	adcx	r10,rbp
+	mulx	rbp,rcx,QWORD[8+rsi]
+	adox	r10,rcx
+	adcx	r11,rbp
+	mulx	rbp,rcx,QWORD[16+rsi]
+	adox	r11,rcx
+	adcx	r8,rbp
+	mulx	rbp,rcx,QWORD[24+rsi]
+	adox	r8,rcx
+	adcx	r9,rbp
+	adox	r9,rax
+
+
+	mov	rdx,r10
+	mulx	rcx,rdx,QWORD[32+rsi]
+
+	mulx	rbp,rcx,QWORD[rsi]
+	adcx	r10,rcx
+	adox	r11,rbp
+	mulx	rbp,rcx,QWORD[8+rsi]
+	adcx	r11,rcx
+	adox	r8,rbp
+	mulx	rbp,rcx,QWORD[16+rsi]
+	adcx	r8,rcx
+	adox	r9,rbp
+	mulx	rbp,rcx,QWORD[24+rsi]
+	adcx	r9,rcx
+	adox	r10,rbp
+	adcx	r10,rax
+
+
+	mov	rdx,r11
+	mulx	rcx,rdx,QWORD[32+rsi]
+
+	mulx	rbp,rcx,QWORD[rsi]
+	adox	r11,rcx
+	adcx	r8,rbp
+	mulx	rbp,rcx,QWORD[8+rsi]
+	adox	r8,rcx
+	adcx	r9,rbp
+	mulx	rbp,rcx,QWORD[16+rsi]
+	adox	r9,rcx
+	adcx	r10,rbp
+	mulx	rbp,rcx,QWORD[24+rsi]
+	adox	r10,rcx
+	adcx	r11,rbp
+	adox	r11,rax
+
+
+	add	r12,r8
+	adc	r9,r13
+	mov	rdx,r12
+	adc	r10,r14
+	adc	r11,r15
+	mov	r14,r9
+	adc	rax,0
+
+
+	sub	r12,QWORD[rsi]
+	mov	r15,r10
+	sbb	r9,QWORD[8+rsi]
+	sbb	r10,QWORD[16+rsi]
+	mov	r8,r11
+	sbb	r11,QWORD[24+rsi]
+	sbb	rax,0
+
+	cmovnc	rdx,r12
+	cmovnc	r14,r9
+	cmovnc	r15,r10
+	cmovnc	r8,r11
+
+	dec	rbx
+	jnz	NEAR $L$oop_ord_sqrx
+
+	mov	QWORD[rdi],rdx
+	mov	QWORD[8+rdi],r14
+	pxor	xmm1,xmm1
+	mov	QWORD[16+rdi],r15
+	pxor	xmm2,xmm2
+	mov	QWORD[24+rdi],r8
+	pxor	xmm3,xmm3
+
+	mov	r15,QWORD[rsp]
+
+	mov	r14,QWORD[8+rsp]
+
+	mov	r13,QWORD[16+rsp]
+
+	mov	r12,QWORD[24+rsp]
+
+	mov	rbx,QWORD[32+rsp]
+
+	mov	rbp,QWORD[40+rsp]
+
+	lea	rsp,[48+rsp]
+
+$L$ord_sqrx_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_ecp_nistz256_ord_sqr_montx:
+
+
+
+
+
+
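+; ecp_nistz256_mul_mont(res, a, b): Montgomery multiplication modulo the
+; P-256 field prime, res = a*b*2^-256 mod p; takes the MULX/ADX path when
+; the CPU capability bits (0x80100) are all set.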
+global	ecp_nistz256_mul_mont
+
+ALIGN	32
+ecp_nistz256_mul_mont:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_ecp_nistz256_mul_mont:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+_CET_ENDBR
+	lea	rcx,[OPENSSL_ia32cap_P]
+	mov	rcx,QWORD[8+rcx]
+	and	ecx,0x80100
+$L$mul_mont:
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+$L$mul_body:
+	cmp	ecx,0x80100
+	je	NEAR $L$mul_montx
+	mov	rbx,rdx
+	mov	rax,QWORD[rdx]
+	mov	r9,QWORD[rsi]
+	mov	r10,QWORD[8+rsi]
+	mov	r11,QWORD[16+rsi]
+	mov	r12,QWORD[24+rsi]
+
+	call	__ecp_nistz256_mul_montq
+	jmp	NEAR $L$mul_mont_done
+
+ALIGN	32
+$L$mul_montx:
+	mov	rbx,rdx
+	mov	rdx,QWORD[rdx]
+	mov	r9,QWORD[rsi]
+	mov	r10,QWORD[8+rsi]
+	mov	r11,QWORD[16+rsi]
+	mov	r12,QWORD[24+rsi]
+	lea	rsi,[((-128))+rsi]
+
+	call	__ecp_nistz256_mul_montx
+$L$mul_mont_done:
+	mov	r15,QWORD[rsp]
+
+	mov	r14,QWORD[8+rsp]
+
+	mov	r13,QWORD[16+rsp]
+
+	mov	r12,QWORD[24+rsp]
+
+	mov	rbx,QWORD[32+rsp]
+
+	mov	rbp,QWORD[40+rsp]
+
+	lea	rsp,[48+rsp]
+
+$L$mul_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_ecp_nistz256_mul_mont:
+
+
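+; Scalar (MUL-based) inner Montgomery multiplication modulo the P-256 prime;
+; b[0] arrives in rax, a[0..3] in r9-r12, and the result is stored via rdi.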
+ALIGN	32
+__ecp_nistz256_mul_montq:
+
+
+
+	mov	rbp,rax
+	mul	r9
+	mov	r14,QWORD[(($L$poly+8))]
+	mov	r8,rax
+	mov	rax,rbp
+	mov	r9,rdx
+
+	mul	r10
+	mov	r15,QWORD[(($L$poly+24))]
+	add	r9,rax
+	mov	rax,rbp
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	r11
+	add	r10,rax
+	mov	rax,rbp
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	r12
+	add	r11,rax
+	mov	rax,r8
+	adc	rdx,0
+	xor	r13,r13
+	mov	r12,rdx
+
+
+
+
+
+
+
+
+
+
+	mov	rbp,r8
+	shl	r8,32
+	mul	r15
+	shr	rbp,32
+	add	r9,r8
+	adc	r10,rbp
+	adc	r11,rax
+	mov	rax,QWORD[8+rbx]
+	adc	r12,rdx
+	adc	r13,0
+	xor	r8,r8
+
+
+
+	mov	rbp,rax
+	mul	QWORD[rsi]
+	add	r9,rax
+	mov	rax,rbp
+	adc	rdx,0
+	mov	rcx,rdx
+
+	mul	QWORD[8+rsi]
+	add	r10,rcx
+	adc	rdx,0
+	add	r10,rax
+	mov	rax,rbp
+	adc	rdx,0
+	mov	rcx,rdx
+
+	mul	QWORD[16+rsi]
+	add	r11,rcx
+	adc	rdx,0
+	add	r11,rax
+	mov	rax,rbp
+	adc	rdx,0
+	mov	rcx,rdx
+
+	mul	QWORD[24+rsi]
+	add	r12,rcx
+	adc	rdx,0
+	add	r12,rax
+	mov	rax,r9
+	adc	r13,rdx
+	adc	r8,0
+
+
+
+	mov	rbp,r9
+	shl	r9,32
+	mul	r15
+	shr	rbp,32
+	add	r10,r9
+	adc	r11,rbp
+	adc	r12,rax
+	mov	rax,QWORD[16+rbx]
+	adc	r13,rdx
+	adc	r8,0
+	xor	r9,r9
+
+
+
+	mov	rbp,rax
+	mul	QWORD[rsi]
+	add	r10,rax
+	mov	rax,rbp
+	adc	rdx,0
+	mov	rcx,rdx
+
+	mul	QWORD[8+rsi]
+	add	r11,rcx
+	adc	rdx,0
+	add	r11,rax
+	mov	rax,rbp
+	adc	rdx,0
+	mov	rcx,rdx
+
+	mul	QWORD[16+rsi]
+	add	r12,rcx
+	adc	rdx,0
+	add	r12,rax
+	mov	rax,rbp
+	adc	rdx,0
+	mov	rcx,rdx
+
+	mul	QWORD[24+rsi]
+	add	r13,rcx
+	adc	rdx,0
+	add	r13,rax
+	mov	rax,r10
+	adc	r8,rdx
+	adc	r9,0
+
+
+
+	mov	rbp,r10
+	shl	r10,32
+	mul	r15
+	shr	rbp,32
+	add	r11,r10
+	adc	r12,rbp
+	adc	r13,rax
+	mov	rax,QWORD[24+rbx]
+	adc	r8,rdx
+	adc	r9,0
+	xor	r10,r10
+
+
+
+	mov	rbp,rax
+	mul	QWORD[rsi]
+	add	r11,rax
+	mov	rax,rbp
+	adc	rdx,0
+	mov	rcx,rdx
+
+	mul	QWORD[8+rsi]
+	add	r12,rcx
+	adc	rdx,0
+	add	r12,rax
+	mov	rax,rbp
+	adc	rdx,0
+	mov	rcx,rdx
+
+	mul	QWORD[16+rsi]
+	add	r13,rcx
+	adc	rdx,0
+	add	r13,rax
+	mov	rax,rbp
+	adc	rdx,0
+	mov	rcx,rdx
+
+	mul	QWORD[24+rsi]
+	add	r8,rcx
+	adc	rdx,0
+	add	r8,rax
+	mov	rax,r11
+	adc	r9,rdx
+	adc	r10,0
+
+
+
+	mov	rbp,r11
+	shl	r11,32
+	mul	r15
+	shr	rbp,32
+	add	r12,r11
+	adc	r13,rbp
+	mov	rcx,r12
+	adc	r8,rax
+	adc	r9,rdx
+	mov	rbp,r13
+	adc	r10,0
+
+
+
+	sub	r12,-1
+	mov	rbx,r8
+	sbb	r13,r14
+	sbb	r8,0
+	mov	rdx,r9
+	sbb	r9,r15
+	sbb	r10,0
+
+	cmovc	r12,rcx
+	cmovc	r13,rbp
+	mov	QWORD[rdi],r12
+	cmovc	r8,rbx
+	mov	QWORD[8+rdi],r13
+	cmovc	r9,rdx
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+
+	ret
+
+
+
+
+
+
+
+
+
+
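+; ecp_nistz256_sqr_mont(res, a): Montgomery squaring modulo the P-256 field
+; prime, with the same CPU-capability dispatch as ecp_nistz256_mul_mont.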
+global	ecp_nistz256_sqr_mont
+
+ALIGN	32
+ecp_nistz256_sqr_mont:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_ecp_nistz256_sqr_mont:
+	mov	rdi,rcx
+	mov	rsi,rdx
+
+
+
+_CET_ENDBR
+	lea	rcx,[OPENSSL_ia32cap_P]
+	mov	rcx,QWORD[8+rcx]
+	and	ecx,0x80100
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+$L$sqr_body:
+	cmp	ecx,0x80100
+	je	NEAR $L$sqr_montx
+	mov	rax,QWORD[rsi]
+	mov	r14,QWORD[8+rsi]
+	mov	r15,QWORD[16+rsi]
+	mov	r8,QWORD[24+rsi]
+
+	call	__ecp_nistz256_sqr_montq
+	jmp	NEAR $L$sqr_mont_done
+
+ALIGN	32
+$L$sqr_montx:
+	mov	rdx,QWORD[rsi]
+	mov	r14,QWORD[8+rsi]
+	mov	r15,QWORD[16+rsi]
+	mov	r8,QWORD[24+rsi]
+	lea	rsi,[((-128))+rsi]
+
+	call	__ecp_nistz256_sqr_montx
+$L$sqr_mont_done:
+	mov	r15,QWORD[rsp]
+
+	mov	r14,QWORD[8+rsp]
+
+	mov	r13,QWORD[16+rsp]
+
+	mov	r12,QWORD[24+rsp]
+
+	mov	rbx,QWORD[32+rsp]
+
+	mov	rbp,QWORD[40+rsp]
+
+	lea	rsp,[48+rsp]
+
+$L$sqr_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_ecp_nistz256_sqr_mont:
+
+
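+; Scalar (MUL-based) inner Montgomery squaring modulo the P-256 prime.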
+ALIGN	32
+__ecp_nistz256_sqr_montq:
+
+	mov	r13,rax
+	mul	r14
+	mov	r9,rax
+	mov	rax,r15
+	mov	r10,rdx
+
+	mul	r13
+	add	r10,rax
+	mov	rax,r8
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	r13
+	add	r11,rax
+	mov	rax,r15
+	adc	rdx,0
+	mov	r12,rdx
+
+
+	mul	r14
+	add	r11,rax
+	mov	rax,r8
+	adc	rdx,0
+	mov	rbp,rdx
+
+	mul	r14
+	add	r12,rax
+	mov	rax,r8
+	adc	rdx,0
+	add	r12,rbp
+	mov	r13,rdx
+	adc	r13,0
+
+
+	mul	r15
+	xor	r15,r15
+	add	r13,rax
+	mov	rax,QWORD[rsi]
+	mov	r14,rdx
+	adc	r14,0
+
+	add	r9,r9
+	adc	r10,r10
+	adc	r11,r11
+	adc	r12,r12
+	adc	r13,r13
+	adc	r14,r14
+	adc	r15,0
+
+	mul	rax
+	mov	r8,rax
+	mov	rax,QWORD[8+rsi]
+	mov	rcx,rdx
+
+	mul	rax
+	add	r9,rcx
+	adc	r10,rax
+	mov	rax,QWORD[16+rsi]
+	adc	rdx,0
+	mov	rcx,rdx
+
+	mul	rax
+	add	r11,rcx
+	adc	r12,rax
+	mov	rax,QWORD[24+rsi]
+	adc	rdx,0
+	mov	rcx,rdx
+
+	mul	rax
+	add	r13,rcx
+	adc	r14,rax
+	mov	rax,r8
+	adc	r15,rdx
+
+	mov	rsi,QWORD[(($L$poly+8))]
+	mov	rbp,QWORD[(($L$poly+24))]
+
+
+
+
+	mov	rcx,r8
+	shl	r8,32
+	mul	rbp
+	shr	rcx,32
+	add	r9,r8
+	adc	r10,rcx
+	adc	r11,rax
+	mov	rax,r9
+	adc	rdx,0
+
+
+
+	mov	rcx,r9
+	shl	r9,32
+	mov	r8,rdx
+	mul	rbp
+	shr	rcx,32
+	add	r10,r9
+	adc	r11,rcx
+	adc	r8,rax
+	mov	rax,r10
+	adc	rdx,0
+
+
+
+	mov	rcx,r10
+	shl	r10,32
+	mov	r9,rdx
+	mul	rbp
+	shr	rcx,32
+	add	r11,r10
+	adc	r8,rcx
+	adc	r9,rax
+	mov	rax,r11
+	adc	rdx,0
+
+
+
+	mov	rcx,r11
+	shl	r11,32
+	mov	r10,rdx
+	mul	rbp
+	shr	rcx,32
+	add	r8,r11
+	adc	r9,rcx
+	adc	r10,rax
+	adc	rdx,0
+	xor	r11,r11
+
+
+
+	add	r12,r8
+	adc	r13,r9
+	mov	r8,r12
+	adc	r14,r10
+	adc	r15,rdx
+	mov	r9,r13
+	adc	r11,0
+
+	sub	r12,-1
+	mov	r10,r14
+	sbb	r13,rsi
+	sbb	r14,0
+	mov	rcx,r15
+	sbb	r15,rbp
+	sbb	r11,0
+
+	cmovc	r12,r8
+	cmovc	r13,r9
+	mov	QWORD[rdi],r12
+	cmovc	r14,r10
+	mov	QWORD[8+rdi],r13
+	cmovc	r15,rcx
+	mov	QWORD[16+rdi],r14
+	mov	QWORD[24+rdi],r15
+
+	ret
+
+
+
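+; Inner Montgomery multiplication using MULX/ADCX/ADOX; the caller biases
+; rsi by -128 so the 0..24(+128) displacements fit in a signed byte.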
+ALIGN	32
+__ecp_nistz256_mul_montx:
+
+
+
+	mulx	r9,r8,r9
+	mulx	r10,rcx,r10
+	mov	r14,32
+	xor	r13,r13
+	mulx	r11,rbp,r11
+	mov	r15,QWORD[(($L$poly+24))]
+	adc	r9,rcx
+	mulx	r12,rcx,r12
+	mov	rdx,r8
+	adc	r10,rbp
+	shlx	rbp,r8,r14
+	adc	r11,rcx
+	shrx	rcx,r8,r14
+	adc	r12,0
+
+
+
+	add	r9,rbp
+	adc	r10,rcx
+
+	mulx	rbp,rcx,r15
+	mov	rdx,QWORD[8+rbx]
+	adc	r11,rcx
+	adc	r12,rbp
+	adc	r13,0
+	xor	r8,r8
+
+
+
+	mulx	rbp,rcx,QWORD[((0+128))+rsi]
+	adcx	r9,rcx
+	adox	r10,rbp
+
+	mulx	rbp,rcx,QWORD[((8+128))+rsi]
+	adcx	r10,rcx
+	adox	r11,rbp
+
+	mulx	rbp,rcx,QWORD[((16+128))+rsi]
+	adcx	r11,rcx
+	adox	r12,rbp
+
+	mulx	rbp,rcx,QWORD[((24+128))+rsi]
+	mov	rdx,r9
+	adcx	r12,rcx
+	shlx	rcx,r9,r14
+	adox	r13,rbp
+	shrx	rbp,r9,r14
+
+	adcx	r13,r8
+	adox	r8,r8
+	adc	r8,0
+
+
+
+	add	r10,rcx
+	adc	r11,rbp
+
+	mulx	rbp,rcx,r15
+	mov	rdx,QWORD[16+rbx]
+	adc	r12,rcx
+	adc	r13,rbp
+	adc	r8,0
+	xor	r9,r9
+
+
+
+	mulx	rbp,rcx,QWORD[((0+128))+rsi]
+	adcx	r10,rcx
+	adox	r11,rbp
+
+	mulx	rbp,rcx,QWORD[((8+128))+rsi]
+	adcx	r11,rcx
+	adox	r12,rbp
+
+	mulx	rbp,rcx,QWORD[((16+128))+rsi]
+	adcx	r12,rcx
+	adox	r13,rbp
+
+	mulx	rbp,rcx,QWORD[((24+128))+rsi]
+	mov	rdx,r10
+	adcx	r13,rcx
+	shlx	rcx,r10,r14
+	adox	r8,rbp
+	shrx	rbp,r10,r14
+
+	adcx	r8,r9
+	adox	r9,r9
+	adc	r9,0
+
+
+
+	add	r11,rcx
+	adc	r12,rbp
+
+	mulx	rbp,rcx,r15
+	mov	rdx,QWORD[24+rbx]
+	adc	r13,rcx
+	adc	r8,rbp
+	adc	r9,0
+	xor	r10,r10
+
+
+
+	mulx	rbp,rcx,QWORD[((0+128))+rsi]
+	adcx	r11,rcx
+	adox	r12,rbp
+
+	mulx	rbp,rcx,QWORD[((8+128))+rsi]
+	adcx	r12,rcx
+	adox	r13,rbp
+
+	mulx	rbp,rcx,QWORD[((16+128))+rsi]
+	adcx	r13,rcx
+	adox	r8,rbp
+
+	mulx	rbp,rcx,QWORD[((24+128))+rsi]
+	mov	rdx,r11
+	adcx	r8,rcx
+	shlx	rcx,r11,r14
+	adox	r9,rbp
+	shrx	rbp,r11,r14
+
+	adcx	r9,r10
+	adox	r10,r10
+	adc	r10,0
+
+
+
+	add	r12,rcx
+	adc	r13,rbp
+
+	mulx	rbp,rcx,r15
+	mov	rbx,r12
+	mov	r14,QWORD[(($L$poly+8))]
+	adc	r8,rcx
+	mov	rdx,r13
+	adc	r9,rbp
+	adc	r10,0
+
+
+
+	xor	eax,eax
+	mov	rcx,r8
+	sbb	r12,-1
+	sbb	r13,r14
+	sbb	r8,0
+	mov	rbp,r9
+	sbb	r9,r15
+	sbb	r10,0
+
+	cmovc	r12,rbx
+	cmovc	r13,rdx
+	mov	QWORD[rdi],r12
+	cmovc	r8,rcx
+	mov	QWORD[8+rdi],r13
+	cmovc	r9,rbp
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+
+	ret
+
+
+
+
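+; Inner MULX/ADCX/ADOX Montgomery squaring (rsi pre-biased by -128, as above).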
+ALIGN	32
+__ecp_nistz256_sqr_montx:
+
+	mulx	r10,r9,r14
+	mulx	r11,rcx,r15
+	xor	eax,eax
+	adc	r10,rcx
+	mulx	r12,rbp,r8
+	mov	rdx,r14
+	adc	r11,rbp
+	adc	r12,0
+	xor	r13,r13
+
+
+	mulx	rbp,rcx,r15
+	adcx	r11,rcx
+	adox	r12,rbp
+
+	mulx	rbp,rcx,r8
+	mov	rdx,r15
+	adcx	r12,rcx
+	adox	r13,rbp
+	adc	r13,0
+
+
+	mulx	r14,rcx,r8
+	mov	rdx,QWORD[((0+128))+rsi]
+	xor	r15,r15
+	adcx	r9,r9
+	adox	r13,rcx
+	adcx	r10,r10
+	adox	r14,r15
+
+	mulx	rbp,r8,rdx
+	mov	rdx,QWORD[((8+128))+rsi]
+	adcx	r11,r11
+	adox	r9,rbp
+	adcx	r12,r12
+	mulx	rax,rcx,rdx
+	mov	rdx,QWORD[((16+128))+rsi]
+	adcx	r13,r13
+	adox	r10,rcx
+	adcx	r14,r14
+	DB	0x67
+	mulx	rbp,rcx,rdx
+	mov	rdx,QWORD[((24+128))+rsi]
+	adox	r11,rax
+	adcx	r15,r15
+	adox	r12,rcx
+	mov	rsi,32
+	adox	r13,rbp
+	DB	0x67,0x67
+	mulx	rax,rcx,rdx
+	mov	rdx,QWORD[(($L$poly+24))]
+	adox	r14,rcx
+	shlx	rcx,r8,rsi
+	adox	r15,rax
+	shrx	rax,r8,rsi
+	mov	rbp,rdx
+
+
+	add	r9,rcx
+	adc	r10,rax
+
+	mulx	r8,rcx,r8
+	adc	r11,rcx
+	shlx	rcx,r9,rsi
+	adc	r8,0
+	shrx	rax,r9,rsi
+
+
+	add	r10,rcx
+	adc	r11,rax
+
+	mulx	r9,rcx,r9
+	adc	r8,rcx
+	shlx	rcx,r10,rsi
+	adc	r9,0
+	shrx	rax,r10,rsi
+
+
+	add	r11,rcx
+	adc	r8,rax
+
+	mulx	r10,rcx,r10
+	adc	r9,rcx
+	shlx	rcx,r11,rsi
+	adc	r10,0
+	shrx	rax,r11,rsi
+
+
+	add	r8,rcx
+	adc	r9,rax
+
+	mulx	r11,rcx,r11
+	adc	r10,rcx
+	adc	r11,0
+
+	xor	rdx,rdx
+	add	r12,r8
+	mov	rsi,QWORD[(($L$poly+8))]
+	adc	r13,r9
+	mov	r8,r12
+	adc	r14,r10
+	adc	r15,r11
+	mov	r9,r13
+	adc	rdx,0
+
+	sub	r12,-1
+	mov	r10,r14
+	sbb	r13,rsi
+	sbb	r14,0
+	mov	r11,r15
+	sbb	r15,rbp
+	sbb	rdx,0
+
+	cmovc	r12,r8
+	cmovc	r13,r9
+	mov	QWORD[rdi],r12
+	cmovc	r14,r10
+	mov	QWORD[8+rdi],r13
+	cmovc	r15,r11
+	mov	QWORD[16+rdi],r14
+	mov	QWORD[24+rdi],r15
+
+	ret
+
+
+
+
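+; ecp_nistz256_select_w5(out, table, idx): constant-time copy of the idx-th
+; of 16 Jacobian points (96 bytes each) using SSE2 masking, so the memory
+; access pattern does not depend on the secret index.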
+global	ecp_nistz256_select_w5
+
+ALIGN	32
+ecp_nistz256_select_w5:
+
+_CET_ENDBR
+	lea	rax,[OPENSSL_ia32cap_P]
+	mov	rax,QWORD[8+rax]
+	test	eax,32
+	jnz	NEAR $L$avx2_select_w5
+	lea	rax,[((-136))+rsp]
+$L$SEH_begin_ecp_nistz256_select_w5:
+	DB	0x48,0x8d,0x60,0xe0
+	DB	0x0f,0x29,0x70,0xe0
+	DB	0x0f,0x29,0x78,0xf0
+	DB	0x44,0x0f,0x29,0x00
+	DB	0x44,0x0f,0x29,0x48,0x10
+	DB	0x44,0x0f,0x29,0x50,0x20
+	DB	0x44,0x0f,0x29,0x58,0x30
+	DB	0x44,0x0f,0x29,0x60,0x40
+	DB	0x44,0x0f,0x29,0x68,0x50
+	DB	0x44,0x0f,0x29,0x70,0x60
+	DB	0x44,0x0f,0x29,0x78,0x70
+	movdqa	xmm0,XMMWORD[$L$One]
+	movd	xmm1,r8d
+
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	pxor	xmm6,xmm6
+	pxor	xmm7,xmm7
+
+	movdqa	xmm8,xmm0
+	pshufd	xmm1,xmm1,0
+
+	mov	rax,16
+$L$select_loop_sse_w5:
+
+	movdqa	xmm15,xmm8
+	paddd	xmm8,xmm0
+	pcmpeqd	xmm15,xmm1
+
+	movdqa	xmm9,XMMWORD[rdx]
+	movdqa	xmm10,XMMWORD[16+rdx]
+	movdqa	xmm11,XMMWORD[32+rdx]
+	movdqa	xmm12,XMMWORD[48+rdx]
+	movdqa	xmm13,XMMWORD[64+rdx]
+	movdqa	xmm14,XMMWORD[80+rdx]
+	lea	rdx,[96+rdx]
+
+	pand	xmm9,xmm15
+	pand	xmm10,xmm15
+	por	xmm2,xmm9
+	pand	xmm11,xmm15
+	por	xmm3,xmm10
+	pand	xmm12,xmm15
+	por	xmm4,xmm11
+	pand	xmm13,xmm15
+	por	xmm5,xmm12
+	pand	xmm14,xmm15
+	por	xmm6,xmm13
+	por	xmm7,xmm14
+
+	dec	rax
+	jnz	NEAR $L$select_loop_sse_w5
+
+	movdqu	XMMWORD[rcx],xmm2
+	movdqu	XMMWORD[16+rcx],xmm3
+	movdqu	XMMWORD[32+rcx],xmm4
+	movdqu	XMMWORD[48+rcx],xmm5
+	movdqu	XMMWORD[64+rcx],xmm6
+	movdqu	XMMWORD[80+rcx],xmm7
+	movaps	xmm6,XMMWORD[rsp]
+	movaps	xmm7,XMMWORD[16+rsp]
+	movaps	xmm8,XMMWORD[32+rsp]
+	movaps	xmm9,XMMWORD[48+rsp]
+	movaps	xmm10,XMMWORD[64+rsp]
+	movaps	xmm11,XMMWORD[80+rsp]
+	movaps	xmm12,XMMWORD[96+rsp]
+	movaps	xmm13,XMMWORD[112+rsp]
+	movaps	xmm14,XMMWORD[128+rsp]
+	movaps	xmm15,XMMWORD[144+rsp]
+	lea	rsp,[168+rsp]
+	ret
+
+$L$SEH_end_ecp_nistz256_select_w5:
+
+
+
+
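+; ecp_nistz256_select_w7(out, table, idx): constant-time copy of the idx-th
+; of 64 affine points (64 bytes each).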
+global	ecp_nistz256_select_w7
+
+ALIGN	32
+ecp_nistz256_select_w7:
+
+_CET_ENDBR
+	lea	rax,[OPENSSL_ia32cap_P]
+	mov	rax,QWORD[8+rax]
+	test	eax,32
+	jnz	NEAR $L$avx2_select_w7
+	lea	rax,[((-136))+rsp]
+$L$SEH_begin_ecp_nistz256_select_w7:
+	DB	0x48,0x8d,0x60,0xe0
+	DB	0x0f,0x29,0x70,0xe0
+	DB	0x0f,0x29,0x78,0xf0
+	DB	0x44,0x0f,0x29,0x00
+	DB	0x44,0x0f,0x29,0x48,0x10
+	DB	0x44,0x0f,0x29,0x50,0x20
+	DB	0x44,0x0f,0x29,0x58,0x30
+	DB	0x44,0x0f,0x29,0x60,0x40
+	DB	0x44,0x0f,0x29,0x68,0x50
+	DB	0x44,0x0f,0x29,0x70,0x60
+	DB	0x44,0x0f,0x29,0x78,0x70
+	movdqa	xmm8,XMMWORD[$L$One]
+	movd	xmm1,r8d
+
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+
+	movdqa	xmm0,xmm8
+	pshufd	xmm1,xmm1,0
+	mov	rax,64
+
+$L$select_loop_sse_w7:
+	movdqa	xmm15,xmm8
+	paddd	xmm8,xmm0
+	movdqa	xmm9,XMMWORD[rdx]
+	movdqa	xmm10,XMMWORD[16+rdx]
+	pcmpeqd	xmm15,xmm1
+	movdqa	xmm11,XMMWORD[32+rdx]
+	movdqa	xmm12,XMMWORD[48+rdx]
+	lea	rdx,[64+rdx]
+
+	pand	xmm9,xmm15
+	pand	xmm10,xmm15
+	por	xmm2,xmm9
+	pand	xmm11,xmm15
+	por	xmm3,xmm10
+	pand	xmm12,xmm15
+	por	xmm4,xmm11
+	prefetcht0	[255+rdx]
+	por	xmm5,xmm12
+
+	dec	rax
+	jnz	NEAR $L$select_loop_sse_w7
+
+	movdqu	XMMWORD[rcx],xmm2
+	movdqu	XMMWORD[16+rcx],xmm3
+	movdqu	XMMWORD[32+rcx],xmm4
+	movdqu	XMMWORD[48+rcx],xmm5
+	movaps	xmm6,XMMWORD[rsp]
+	movaps	xmm7,XMMWORD[16+rsp]
+	movaps	xmm8,XMMWORD[32+rsp]
+	movaps	xmm9,XMMWORD[48+rsp]
+	movaps	xmm10,XMMWORD[64+rsp]
+	movaps	xmm11,XMMWORD[80+rsp]
+	movaps	xmm12,XMMWORD[96+rsp]
+	movaps	xmm13,XMMWORD[112+rsp]
+	movaps	xmm14,XMMWORD[128+rsp]
+	movaps	xmm15,XMMWORD[144+rsp]
+	lea	rsp,[168+rsp]
+	ret
+
+$L$SEH_end_ecp_nistz256_select_w7:
+
+
+
+
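+; AVX2 variant of the w5 select: masks two 96-byte entries per iteration,
+; 8 iterations over the 16-entry table.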
+ALIGN	32
+ecp_nistz256_avx2_select_w5:
+
+$L$avx2_select_w5:
+	vzeroupper
+	lea	rax,[((-136))+rsp]
+	mov	r11,rsp
+$L$SEH_begin_ecp_nistz256_avx2_select_w5:
+	DB	0x48,0x8d,0x60,0xe0
+	DB	0xc5,0xf8,0x29,0x70,0xe0
+	DB	0xc5,0xf8,0x29,0x78,0xf0
+	DB	0xc5,0x78,0x29,0x40,0x00
+	DB	0xc5,0x78,0x29,0x48,0x10
+	DB	0xc5,0x78,0x29,0x50,0x20
+	DB	0xc5,0x78,0x29,0x58,0x30
+	DB	0xc5,0x78,0x29,0x60,0x40
+	DB	0xc5,0x78,0x29,0x68,0x50
+	DB	0xc5,0x78,0x29,0x70,0x60
+	DB	0xc5,0x78,0x29,0x78,0x70
+	vmovdqa	ymm0,YMMWORD[$L$Two]
+
+	vpxor	ymm2,ymm2,ymm2
+	vpxor	ymm3,ymm3,ymm3
+	vpxor	ymm4,ymm4,ymm4
+
+	vmovdqa	ymm5,YMMWORD[$L$One]
+	vmovdqa	ymm10,YMMWORD[$L$Two]
+
+	vmovd	xmm1,r8d
+	vpermd	ymm1,ymm2,ymm1
+
+	mov	rax,8
+$L$select_loop_avx2_w5:
+
+	vmovdqa	ymm6,YMMWORD[rdx]
+	vmovdqa	ymm7,YMMWORD[32+rdx]
+	vmovdqa	ymm8,YMMWORD[64+rdx]
+
+	vmovdqa	ymm11,YMMWORD[96+rdx]
+	vmovdqa	ymm12,YMMWORD[128+rdx]
+	vmovdqa	ymm13,YMMWORD[160+rdx]
+
+	vpcmpeqd	ymm9,ymm5,ymm1
+	vpcmpeqd	ymm14,ymm10,ymm1
+
+	vpaddd	ymm5,ymm5,ymm0
+	vpaddd	ymm10,ymm10,ymm0
+	lea	rdx,[192+rdx]
+
+	vpand	ymm6,ymm6,ymm9
+	vpand	ymm7,ymm7,ymm9
+	vpand	ymm8,ymm8,ymm9
+	vpand	ymm11,ymm11,ymm14
+	vpand	ymm12,ymm12,ymm14
+	vpand	ymm13,ymm13,ymm14
+
+	vpxor	ymm2,ymm2,ymm6
+	vpxor	ymm3,ymm3,ymm7
+	vpxor	ymm4,ymm4,ymm8
+	vpxor	ymm2,ymm2,ymm11
+	vpxor	ymm3,ymm3,ymm12
+	vpxor	ymm4,ymm4,ymm13
+
+	dec	rax
+	jnz	NEAR $L$select_loop_avx2_w5
+
+	vmovdqu	YMMWORD[rcx],ymm2
+	vmovdqu	YMMWORD[32+rcx],ymm3
+	vmovdqu	YMMWORD[64+rcx],ymm4
+	vzeroupper
+	movaps	xmm6,XMMWORD[rsp]
+	movaps	xmm7,XMMWORD[16+rsp]
+	movaps	xmm8,XMMWORD[32+rsp]
+	movaps	xmm9,XMMWORD[48+rsp]
+	movaps	xmm10,XMMWORD[64+rsp]
+	movaps	xmm11,XMMWORD[80+rsp]
+	movaps	xmm12,XMMWORD[96+rsp]
+	movaps	xmm13,XMMWORD[112+rsp]
+	movaps	xmm14,XMMWORD[128+rsp]
+	movaps	xmm15,XMMWORD[144+rsp]
+	lea	rsp,[r11]
+	ret
+
+$L$SEH_end_ecp_nistz256_avx2_select_w5:
+
+
+
+
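+; AVX2 variant of the w7 select: three 64-byte entries per iteration for 21
+; iterations, then the final 64th entry after the loop.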
+global	ecp_nistz256_avx2_select_w7
+
+ALIGN	32
+ecp_nistz256_avx2_select_w7:
+
+$L$avx2_select_w7:
+_CET_ENDBR
+	vzeroupper
+	mov	r11,rsp
+	lea	rax,[((-136))+rsp]
+$L$SEH_begin_ecp_nistz256_avx2_select_w7:
+	DB	0x48,0x8d,0x60,0xe0
+	DB	0xc5,0xf8,0x29,0x70,0xe0
+	DB	0xc5,0xf8,0x29,0x78,0xf0
+	DB	0xc5,0x78,0x29,0x40,0x00
+	DB	0xc5,0x78,0x29,0x48,0x10
+	DB	0xc5,0x78,0x29,0x50,0x20
+	DB	0xc5,0x78,0x29,0x58,0x30
+	DB	0xc5,0x78,0x29,0x60,0x40
+	DB	0xc5,0x78,0x29,0x68,0x50
+	DB	0xc5,0x78,0x29,0x70,0x60
+	DB	0xc5,0x78,0x29,0x78,0x70
+	vmovdqa	ymm0,YMMWORD[$L$Three]
+
+	vpxor	ymm2,ymm2,ymm2
+	vpxor	ymm3,ymm3,ymm3
+
+	vmovdqa	ymm4,YMMWORD[$L$One]
+	vmovdqa	ymm8,YMMWORD[$L$Two]
+	vmovdqa	ymm12,YMMWORD[$L$Three]
+
+	vmovd	xmm1,r8d
+	vpermd	ymm1,ymm2,ymm1
+
+
+	mov	rax,21
+$L$select_loop_avx2_w7:
+
+	vmovdqa	ymm5,YMMWORD[rdx]
+	vmovdqa	ymm6,YMMWORD[32+rdx]
+
+	vmovdqa	ymm9,YMMWORD[64+rdx]
+	vmovdqa	ymm10,YMMWORD[96+rdx]
+
+	vmovdqa	ymm13,YMMWORD[128+rdx]
+	vmovdqa	ymm14,YMMWORD[160+rdx]
+
+	vpcmpeqd	ymm7,ymm4,ymm1
+	vpcmpeqd	ymm11,ymm8,ymm1
+	vpcmpeqd	ymm15,ymm12,ymm1
+
+	vpaddd	ymm4,ymm4,ymm0
+	vpaddd	ymm8,ymm8,ymm0
+	vpaddd	ymm12,ymm12,ymm0
+	lea	rdx,[192+rdx]
+
+	vpand	ymm5,ymm5,ymm7
+	vpand	ymm6,ymm6,ymm7
+	vpand	ymm9,ymm9,ymm11
+	vpand	ymm10,ymm10,ymm11
+	vpand	ymm13,ymm13,ymm15
+	vpand	ymm14,ymm14,ymm15
+
+	vpxor	ymm2,ymm2,ymm5
+	vpxor	ymm3,ymm3,ymm6
+	vpxor	ymm2,ymm2,ymm9
+	vpxor	ymm3,ymm3,ymm10
+	vpxor	ymm2,ymm2,ymm13
+	vpxor	ymm3,ymm3,ymm14
+
+	dec	rax
+	jnz	NEAR $L$select_loop_avx2_w7
+
+
+	vmovdqa	ymm5,YMMWORD[rdx]
+	vmovdqa	ymm6,YMMWORD[32+rdx]
+
+	vpcmpeqd	ymm7,ymm4,ymm1
+
+	vpand	ymm5,ymm5,ymm7
+	vpand	ymm6,ymm6,ymm7
+
+	vpxor	ymm2,ymm2,ymm5
+	vpxor	ymm3,ymm3,ymm6
+
+	vmovdqu	YMMWORD[rcx],ymm2
+	vmovdqu	YMMWORD[32+rcx],ymm3
+	vzeroupper
+	movaps	xmm6,XMMWORD[rsp]
+	movaps	xmm7,XMMWORD[16+rsp]
+	movaps	xmm8,XMMWORD[32+rsp]
+	movaps	xmm9,XMMWORD[48+rsp]
+	movaps	xmm10,XMMWORD[64+rsp]
+	movaps	xmm11,XMMWORD[80+rsp]
+	movaps	xmm12,XMMWORD[96+rsp]
+	movaps	xmm13,XMMWORD[112+rsp]
+	movaps	xmm14,XMMWORD[128+rsp]
+	movaps	xmm15,XMMWORD[144+rsp]
+	lea	rsp,[r11]
+	ret
+
+$L$SEH_end_ecp_nistz256_avx2_select_w7:
+
+
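+; res = a + b mod p: add [rbx] into r12,r13,r8,r9, then subtract p and keep
+; whichever result did not borrow (branch-free via cmovc).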
+ALIGN	32
+__ecp_nistz256_add_toq:
+
+	xor	r11,r11
+	add	r12,QWORD[rbx]
+	adc	r13,QWORD[8+rbx]
+	mov	rax,r12
+	adc	r8,QWORD[16+rbx]
+	adc	r9,QWORD[24+rbx]
+	mov	rbp,r13
+	adc	r11,0
+
+	sub	r12,-1
+	mov	rcx,r8
+	sbb	r13,r14
+	sbb	r8,0
+	mov	r10,r9
+	sbb	r9,r15
+	sbb	r11,0
+
+	cmovc	r12,rax
+	cmovc	r13,rbp
+	mov	QWORD[rdi],r12
+	cmovc	r8,rcx
+	mov	QWORD[8+rdi],r13
+	cmovc	r9,r10
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+
+	ret
+
+
+
+
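+; res = a - b mod p: subtract [rbx], then add p back if the subtraction
+; borrowed.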
+ALIGN	32
+__ecp_nistz256_sub_fromq:
+
+	sub	r12,QWORD[rbx]
+	sbb	r13,QWORD[8+rbx]
+	mov	rax,r12
+	sbb	r8,QWORD[16+rbx]
+	sbb	r9,QWORD[24+rbx]
+	mov	rbp,r13
+	sbb	r11,r11
+
+	add	r12,-1
+	mov	rcx,r8
+	adc	r13,r14
+	adc	r8,0
+	mov	r10,r9
+	adc	r9,r15
+	test	r11,r11
+
+	cmovz	r12,rax
+	cmovz	r13,rbp
+	mov	QWORD[rdi],r12
+	cmovz	r8,rcx
+	mov	QWORD[8+rdi],r13
+	cmovz	r9,r10
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+
+	ret
+
+
+
+
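+; Register-only a - b mod p (a in rax,rbp,rcx,r10; b in r12,r13,r8,r9);
+; the reduced result is returned in r12,r13,r8,r9 without a store.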
+ALIGN	32
+__ecp_nistz256_subq:
+
+	sub	rax,r12
+	sbb	rbp,r13
+	mov	r12,rax
+	sbb	rcx,r8
+	sbb	r10,r9
+	mov	r13,rbp
+	sbb	r11,r11
+
+	add	rax,-1
+	mov	r8,rcx
+	adc	rbp,r14
+	adc	rcx,0
+	mov	r9,r10
+	adc	r10,r15
+	test	r11,r11
+
+	cmovnz	r12,rax
+	cmovnz	r13,rbp
+	cmovnz	r8,rcx
+	cmovnz	r9,r10
+
+	ret
+
+
+
+
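+; res = 2*a mod p: double in place, then conditionally subtract p.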
+ALIGN	32
+__ecp_nistz256_mul_by_2q:
+
+	xor	r11,r11
+	add	r12,r12
+	adc	r13,r13
+	mov	rax,r12
+	adc	r8,r8
+	adc	r9,r9
+	mov	rbp,r13
+	adc	r11,0
+
+	sub	r12,-1
+	mov	rcx,r8
+	sbb	r13,r14
+	sbb	r8,0
+	mov	r10,r9
+	sbb	r9,r15
+	sbb	r11,0
+
+	cmovc	r12,rax
+	cmovc	r13,rbp
+	mov	QWORD[rdi],r12
+	cmovc	r8,rcx
+	mov	QWORD[8+rdi],r13
+	cmovc	r9,r10
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+
+	ret
+
+
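+; ecp_nistz256_point_double(r, a): point doubling in Jacobian coordinates;
+; dispatches to the MULX/ADX version ($L$point_doublex) when available.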
+global	ecp_nistz256_point_double
+
+ALIGN	32
+ecp_nistz256_point_double:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_ecp_nistz256_point_double:
+	mov	rdi,rcx
+	mov	rsi,rdx
+
+
+
+_CET_ENDBR
+	lea	rcx,[OPENSSL_ia32cap_P]
+	mov	rcx,QWORD[8+rcx]
+	and	ecx,0x80100
+	cmp	ecx,0x80100
+	je	NEAR $L$point_doublex
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	sub	rsp,32*5+8
+
+$L$point_doubleq_body:
+
+$L$point_double_shortcutq:
+	movdqu	xmm0,XMMWORD[rsi]
+	mov	rbx,rsi
+	movdqu	xmm1,XMMWORD[16+rsi]
+	mov	r12,QWORD[((32+0))+rsi]
+	mov	r13,QWORD[((32+8))+rsi]
+	mov	r8,QWORD[((32+16))+rsi]
+	mov	r9,QWORD[((32+24))+rsi]
+	mov	r14,QWORD[(($L$poly+8))]
+	mov	r15,QWORD[(($L$poly+24))]
+	movdqa	XMMWORD[96+rsp],xmm0
+	movdqa	XMMWORD[(96+16)+rsp],xmm1
+	lea	r10,[32+rdi]
+	lea	r11,[64+rdi]
+DB	102,72,15,110,199
+DB	102,73,15,110,202
+DB	102,73,15,110,211
+
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_mul_by_2q
+
+	mov	rax,QWORD[((64+0))+rsi]
+	mov	r14,QWORD[((64+8))+rsi]
+	mov	r15,QWORD[((64+16))+rsi]
+	mov	r8,QWORD[((64+24))+rsi]
+	lea	rsi,[((64-0))+rsi]
+	lea	rdi,[64+rsp]
+	call	__ecp_nistz256_sqr_montq
+
+	mov	rax,QWORD[((0+0))+rsp]
+	mov	r14,QWORD[((8+0))+rsp]
+	lea	rsi,[((0+0))+rsp]
+	mov	r15,QWORD[((16+0))+rsp]
+	mov	r8,QWORD[((24+0))+rsp]
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_sqr_montq
+
+	mov	rax,QWORD[32+rbx]
+	mov	r9,QWORD[((64+0))+rbx]
+	mov	r10,QWORD[((64+8))+rbx]
+	mov	r11,QWORD[((64+16))+rbx]
+	mov	r12,QWORD[((64+24))+rbx]
+	lea	rsi,[((64-0))+rbx]
+	lea	rbx,[32+rbx]
+DB	102,72,15,126,215
+	call	__ecp_nistz256_mul_montq
+	call	__ecp_nistz256_mul_by_2q
+
+	mov	r12,QWORD[((96+0))+rsp]
+	mov	r13,QWORD[((96+8))+rsp]
+	lea	rbx,[64+rsp]
+	mov	r8,QWORD[((96+16))+rsp]
+	mov	r9,QWORD[((96+24))+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_add_toq
+
+	mov	r12,QWORD[((96+0))+rsp]
+	mov	r13,QWORD[((96+8))+rsp]
+	lea	rbx,[64+rsp]
+	mov	r8,QWORD[((96+16))+rsp]
+	mov	r9,QWORD[((96+24))+rsp]
+	lea	rdi,[64+rsp]
+	call	__ecp_nistz256_sub_fromq
+
+	mov	rax,QWORD[((0+0))+rsp]
+	mov	r14,QWORD[((8+0))+rsp]
+	lea	rsi,[((0+0))+rsp]
+	mov	r15,QWORD[((16+0))+rsp]
+	mov	r8,QWORD[((24+0))+rsp]
+DB	102,72,15,126,207
+	call	__ecp_nistz256_sqr_montq
+	xor	r9,r9
+	mov	rax,r12
+	add	r12,-1
+	mov	r10,r13
+	adc	r13,rsi
+	mov	rcx,r14
+	adc	r14,0
+	mov	r8,r15
+	adc	r15,rbp
+	adc	r9,0
+	xor	rsi,rsi
+	test	rax,1
+
+	cmovz	r12,rax
+	cmovz	r13,r10
+	cmovz	r14,rcx
+	cmovz	r15,r8
+	cmovz	r9,rsi
+
+	mov	rax,r13
+	shr	r12,1
+	shl	rax,63
+	mov	r10,r14
+	shr	r13,1
+	or	r12,rax
+	shl	r10,63
+	mov	rcx,r15
+	shr	r14,1
+	or	r13,r10
+	shl	rcx,63
+	mov	QWORD[rdi],r12
+	shr	r15,1
+	mov	QWORD[8+rdi],r13
+	shl	r9,63
+	or	r14,rcx
+	or	r15,r9
+	mov	QWORD[16+rdi],r14
+	mov	QWORD[24+rdi],r15
+	mov	rax,QWORD[64+rsp]
+	lea	rbx,[64+rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((0+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	lea	rdi,[128+rsp]
+	call	__ecp_nistz256_mul_by_2q
+
+	lea	rbx,[32+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_add_toq
+
+	mov	rax,QWORD[96+rsp]
+	lea	rbx,[96+rsp]
+	mov	r9,QWORD[((0+0))+rsp]
+	mov	r10,QWORD[((8+0))+rsp]
+	lea	rsi,[((0+0))+rsp]
+	mov	r11,QWORD[((16+0))+rsp]
+	mov	r12,QWORD[((24+0))+rsp]
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_mul_montq
+
+	lea	rdi,[128+rsp]
+	call	__ecp_nistz256_mul_by_2q
+
+	mov	rax,QWORD[((0+32))+rsp]
+	mov	r14,QWORD[((8+32))+rsp]
+	lea	rsi,[((0+32))+rsp]
+	mov	r15,QWORD[((16+32))+rsp]
+	mov	r8,QWORD[((24+32))+rsp]
+DB	102,72,15,126,199
+	call	__ecp_nistz256_sqr_montq
+
+	lea	rbx,[128+rsp]
+	mov	r8,r14
+	mov	r9,r15
+	mov	r14,rsi
+	mov	r15,rbp
+	call	__ecp_nistz256_sub_fromq
+
+	mov	rax,QWORD[((0+0))+rsp]
+	mov	rbp,QWORD[((0+8))+rsp]
+	mov	rcx,QWORD[((0+16))+rsp]
+	mov	r10,QWORD[((0+24))+rsp]
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_subq
+
+	mov	rax,QWORD[32+rsp]
+	lea	rbx,[32+rsp]
+	mov	r14,r12
+	xor	ecx,ecx
+	mov	QWORD[((0+0))+rsp],r12
+	mov	r10,r13
+	mov	QWORD[((0+8))+rsp],r13
+	cmovz	r11,r8
+	mov	QWORD[((0+16))+rsp],r8
+	lea	rsi,[((0-0))+rsp]
+	cmovz	r12,r9
+	mov	QWORD[((0+24))+rsp],r9
+	mov	r9,r14
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_mul_montq
+
+DB	102,72,15,126,203
+DB	102,72,15,126,207
+	call	__ecp_nistz256_sub_fromq
+
+	lea	rsi,[((160+56))+rsp]
+
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbx,QWORD[((-16))+rsi]
+
+	mov	rbp,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$point_doubleq_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_ecp_nistz256_point_double:
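+; ecp_nistz256_point_add(r, a, b): full Jacobian point addition; branches to
+; the doubling shortcut when a == b and writes the all-zero point at infinity
+; when a == -b.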
+global	ecp_nistz256_point_add
+
+ALIGN	32
+ecp_nistz256_point_add:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_ecp_nistz256_point_add:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+_CET_ENDBR
+	lea	rcx,[OPENSSL_ia32cap_P]
+	mov	rcx,QWORD[8+rcx]
+	and	ecx,0x80100
+	cmp	ecx,0x80100
+	je	NEAR $L$point_addx
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	sub	rsp,32*18+8
+
+$L$point_addq_body:
+
+	movdqu	xmm0,XMMWORD[rsi]
+	movdqu	xmm1,XMMWORD[16+rsi]
+	movdqu	xmm2,XMMWORD[32+rsi]
+	movdqu	xmm3,XMMWORD[48+rsi]
+	movdqu	xmm4,XMMWORD[64+rsi]
+	movdqu	xmm5,XMMWORD[80+rsi]
+	mov	rbx,rsi
+	mov	rsi,rdx
+	movdqa	XMMWORD[384+rsp],xmm0
+	movdqa	XMMWORD[(384+16)+rsp],xmm1
+	movdqa	XMMWORD[416+rsp],xmm2
+	movdqa	XMMWORD[(416+16)+rsp],xmm3
+	movdqa	XMMWORD[448+rsp],xmm4
+	movdqa	XMMWORD[(448+16)+rsp],xmm5
+	por	xmm5,xmm4
+
+	movdqu	xmm0,XMMWORD[rsi]
+	pshufd	xmm3,xmm5,0xb1
+	movdqu	xmm1,XMMWORD[16+rsi]
+	movdqu	xmm2,XMMWORD[32+rsi]
+	por	xmm5,xmm3
+	movdqu	xmm3,XMMWORD[48+rsi]
+	mov	rax,QWORD[((64+0))+rsi]
+	mov	r14,QWORD[((64+8))+rsi]
+	mov	r15,QWORD[((64+16))+rsi]
+	mov	r8,QWORD[((64+24))+rsi]
+	movdqa	XMMWORD[480+rsp],xmm0
+	pshufd	xmm4,xmm5,0x1e
+	movdqa	XMMWORD[(480+16)+rsp],xmm1
+	movdqu	xmm0,XMMWORD[64+rsi]
+	movdqu	xmm1,XMMWORD[80+rsi]
+	movdqa	XMMWORD[512+rsp],xmm2
+	movdqa	XMMWORD[(512+16)+rsp],xmm3
+	por	xmm5,xmm4
+	pxor	xmm4,xmm4
+	por	xmm1,xmm0
+DB	102,72,15,110,199
+
+	lea	rsi,[((64-0))+rsi]
+	mov	QWORD[((544+0))+rsp],rax
+	mov	QWORD[((544+8))+rsp],r14
+	mov	QWORD[((544+16))+rsp],r15
+	mov	QWORD[((544+24))+rsp],r8
+	lea	rdi,[96+rsp]
+	call	__ecp_nistz256_sqr_montq
+
+	pcmpeqd	xmm5,xmm4
+	pshufd	xmm4,xmm1,0xb1
+	por	xmm4,xmm1
+	pshufd	xmm5,xmm5,0
+	pshufd	xmm3,xmm4,0x1e
+	por	xmm4,xmm3
+	pxor	xmm3,xmm3
+	pcmpeqd	xmm4,xmm3
+	pshufd	xmm4,xmm4,0
+	mov	rax,QWORD[((64+0))+rbx]
+	mov	r14,QWORD[((64+8))+rbx]
+	mov	r15,QWORD[((64+16))+rbx]
+	mov	r8,QWORD[((64+24))+rbx]
+DB	102,72,15,110,203
+
+	lea	rsi,[((64-0))+rbx]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_sqr_montq
+
+	mov	rax,QWORD[544+rsp]
+	lea	rbx,[544+rsp]
+	mov	r9,QWORD[((0+96))+rsp]
+	mov	r10,QWORD[((8+96))+rsp]
+	lea	rsi,[((0+96))+rsp]
+	mov	r11,QWORD[((16+96))+rsp]
+	mov	r12,QWORD[((24+96))+rsp]
+	lea	rdi,[224+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	mov	rax,QWORD[448+rsp]
+	lea	rbx,[448+rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((0+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[256+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	mov	rax,QWORD[416+rsp]
+	lea	rbx,[416+rsp]
+	mov	r9,QWORD[((0+224))+rsp]
+	mov	r10,QWORD[((8+224))+rsp]
+	lea	rsi,[((0+224))+rsp]
+	mov	r11,QWORD[((16+224))+rsp]
+	mov	r12,QWORD[((24+224))+rsp]
+	lea	rdi,[224+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	mov	rax,QWORD[512+rsp]
+	lea	rbx,[512+rsp]
+	mov	r9,QWORD[((0+256))+rsp]
+	mov	r10,QWORD[((8+256))+rsp]
+	lea	rsi,[((0+256))+rsp]
+	mov	r11,QWORD[((16+256))+rsp]
+	mov	r12,QWORD[((24+256))+rsp]
+	lea	rdi,[256+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	lea	rbx,[224+rsp]
+	lea	rdi,[64+rsp]
+	call	__ecp_nistz256_sub_fromq
+
+	or	r12,r13
+	movdqa	xmm2,xmm4
+	or	r12,r8
+	or	r12,r9
+	por	xmm2,xmm5
+DB	102,73,15,110,220
+
+	mov	rax,QWORD[384+rsp]
+	lea	rbx,[384+rsp]
+	mov	r9,QWORD[((0+96))+rsp]
+	mov	r10,QWORD[((8+96))+rsp]
+	lea	rsi,[((0+96))+rsp]
+	mov	r11,QWORD[((16+96))+rsp]
+	mov	r12,QWORD[((24+96))+rsp]
+	lea	rdi,[160+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	mov	rax,QWORD[480+rsp]
+	lea	rbx,[480+rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((0+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[192+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	lea	rbx,[160+rsp]
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_sub_fromq
+
+	or	r12,r13
+	or	r12,r8
+	or	r12,r9
+
+DB	102,73,15,126,208
+DB	102,73,15,126,217
+	or	r12,r8
+	DB	0x3e
+	jnz	NEAR $L$add_proceedq
+
+
+
+	test	r9,r9
+	jz	NEAR $L$add_doubleq
+
+
+
+
+
+
+DB	102,72,15,126,199
+	pxor	xmm0,xmm0
+	movdqu	XMMWORD[rdi],xmm0
+	movdqu	XMMWORD[16+rdi],xmm0
+	movdqu	XMMWORD[32+rdi],xmm0
+	movdqu	XMMWORD[48+rdi],xmm0
+	movdqu	XMMWORD[64+rdi],xmm0
+	movdqu	XMMWORD[80+rdi],xmm0
+	jmp	NEAR $L$add_doneq
+
+ALIGN	32
+$L$add_doubleq:
+DB	102,72,15,126,206
+DB	102,72,15,126,199
+	add	rsp,416
+
+	jmp	NEAR $L$point_double_shortcutq
+
+
+ALIGN	32
+$L$add_proceedq:
+	mov	rax,QWORD[((0+64))+rsp]
+	mov	r14,QWORD[((8+64))+rsp]
+	lea	rsi,[((0+64))+rsp]
+	mov	r15,QWORD[((16+64))+rsp]
+	mov	r8,QWORD[((24+64))+rsp]
+	lea	rdi,[96+rsp]
+	call	__ecp_nistz256_sqr_montq
+
+	mov	rax,QWORD[448+rsp]
+	lea	rbx,[448+rsp]
+	mov	r9,QWORD[((0+0))+rsp]
+	mov	r10,QWORD[((8+0))+rsp]
+	lea	rsi,[((0+0))+rsp]
+	mov	r11,QWORD[((16+0))+rsp]
+	mov	r12,QWORD[((24+0))+rsp]
+	lea	rdi,[352+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	mov	rax,QWORD[((0+0))+rsp]
+	mov	r14,QWORD[((8+0))+rsp]
+	lea	rsi,[((0+0))+rsp]
+	mov	r15,QWORD[((16+0))+rsp]
+	mov	r8,QWORD[((24+0))+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_sqr_montq
+
+	mov	rax,QWORD[544+rsp]
+	lea	rbx,[544+rsp]
+	mov	r9,QWORD[((0+352))+rsp]
+	mov	r10,QWORD[((8+352))+rsp]
+	lea	rsi,[((0+352))+rsp]
+	mov	r11,QWORD[((16+352))+rsp]
+	mov	r12,QWORD[((24+352))+rsp]
+	lea	rdi,[352+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	mov	rax,QWORD[rsp]
+	lea	rbx,[rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((0+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[128+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	mov	rax,QWORD[160+rsp]
+	lea	rbx,[160+rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((0+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[192+rsp]
+	call	__ecp_nistz256_mul_montq
+
+
+
+
+	xor	r11,r11
+	add	r12,r12
+	lea	rsi,[96+rsp]
+	adc	r13,r13
+	mov	rax,r12
+	adc	r8,r8
+	adc	r9,r9
+	mov	rbp,r13
+	adc	r11,0
+
+	sub	r12,-1
+	mov	rcx,r8
+	sbb	r13,r14
+	sbb	r8,0
+	mov	r10,r9
+	sbb	r9,r15
+	sbb	r11,0
+
+	cmovc	r12,rax
+	mov	rax,QWORD[rsi]
+	cmovc	r13,rbp
+	mov	rbp,QWORD[8+rsi]
+	cmovc	r8,rcx
+	mov	rcx,QWORD[16+rsi]
+	cmovc	r9,r10
+	mov	r10,QWORD[24+rsi]
+
+	call	__ecp_nistz256_subq
+
+	lea	rbx,[128+rsp]
+	lea	rdi,[288+rsp]
+	call	__ecp_nistz256_sub_fromq
+
+	mov	rax,QWORD[((192+0))+rsp]
+	mov	rbp,QWORD[((192+8))+rsp]
+	mov	rcx,QWORD[((192+16))+rsp]
+	mov	r10,QWORD[((192+24))+rsp]
+	lea	rdi,[320+rsp]
+
+	call	__ecp_nistz256_subq
+
+	mov	QWORD[rdi],r12
+	mov	QWORD[8+rdi],r13
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+	mov	rax,QWORD[128+rsp]
+	lea	rbx,[128+rsp]
+	mov	r9,QWORD[((0+224))+rsp]
+	mov	r10,QWORD[((8+224))+rsp]
+	lea	rsi,[((0+224))+rsp]
+	mov	r11,QWORD[((16+224))+rsp]
+	mov	r12,QWORD[((24+224))+rsp]
+	lea	rdi,[256+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	mov	rax,QWORD[320+rsp]
+	lea	rbx,[320+rsp]
+	mov	r9,QWORD[((0+64))+rsp]
+	mov	r10,QWORD[((8+64))+rsp]
+	lea	rsi,[((0+64))+rsp]
+	mov	r11,QWORD[((16+64))+rsp]
+	mov	r12,QWORD[((24+64))+rsp]
+	lea	rdi,[320+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	lea	rbx,[256+rsp]
+	lea	rdi,[320+rsp]
+	call	__ecp_nistz256_sub_fromq
+
+DB	102,72,15,126,199
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm1,xmm5
+	pandn	xmm0,XMMWORD[352+rsp]
+	movdqa	xmm2,xmm5
+	pandn	xmm1,XMMWORD[((352+16))+rsp]
+	movdqa	xmm3,xmm5
+	pand	xmm2,XMMWORD[544+rsp]
+	pand	xmm3,XMMWORD[((544+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm4
+	pandn	xmm0,xmm2
+	movdqa	xmm2,xmm4
+	pandn	xmm1,xmm3
+	movdqa	xmm3,xmm4
+	pand	xmm2,XMMWORD[448+rsp]
+	pand	xmm3,XMMWORD[((448+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+	movdqu	XMMWORD[64+rdi],xmm2
+	movdqu	XMMWORD[80+rdi],xmm3
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm1,xmm5
+	pandn	xmm0,XMMWORD[288+rsp]
+	movdqa	xmm2,xmm5
+	pandn	xmm1,XMMWORD[((288+16))+rsp]
+	movdqa	xmm3,xmm5
+	pand	xmm2,XMMWORD[480+rsp]
+	pand	xmm3,XMMWORD[((480+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm4
+	pandn	xmm0,xmm2
+	movdqa	xmm2,xmm4
+	pandn	xmm1,xmm3
+	movdqa	xmm3,xmm4
+	pand	xmm2,XMMWORD[384+rsp]
+	pand	xmm3,XMMWORD[((384+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+	movdqu	XMMWORD[rdi],xmm2
+	movdqu	XMMWORD[16+rdi],xmm3
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm1,xmm5
+	pandn	xmm0,XMMWORD[320+rsp]
+	movdqa	xmm2,xmm5
+	pandn	xmm1,XMMWORD[((320+16))+rsp]
+	movdqa	xmm3,xmm5
+	pand	xmm2,XMMWORD[512+rsp]
+	pand	xmm3,XMMWORD[((512+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm4
+	pandn	xmm0,xmm2
+	movdqa	xmm2,xmm4
+	pandn	xmm1,xmm3
+	movdqa	xmm3,xmm4
+	pand	xmm2,XMMWORD[416+rsp]
+	pand	xmm3,XMMWORD[((416+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+	movdqu	XMMWORD[32+rdi],xmm2
+	movdqu	XMMWORD[48+rdi],xmm3
+
+$L$add_doneq:
+	lea	rsi,[((576+56))+rsp]
+
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbx,QWORD[((-16))+rsi]
+
+	mov	rbp,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$point_addq_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_ecp_nistz256_point_add:
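+; ecp_nistz256_point_add_affine(r, a, b): mixed addition of a Jacobian point
+; and an affine point; $L$ONE_mont supplies the implicit Z = 1 in Montgomery
+; form.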
+global	ecp_nistz256_point_add_affine
+
+ALIGN	32
+ecp_nistz256_point_add_affine:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_ecp_nistz256_point_add_affine:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+_CET_ENDBR
+	lea	rcx,[OPENSSL_ia32cap_P]
+	mov	rcx,QWORD[8+rcx]
+	and	ecx,0x80100
+	cmp	ecx,0x80100
+	je	NEAR $L$point_add_affinex
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	sub	rsp,32*15+8
+
+$L$add_affineq_body:
+
+	movdqu	xmm0,XMMWORD[rsi]
+	mov	rbx,rdx
+	movdqu	xmm1,XMMWORD[16+rsi]
+	movdqu	xmm2,XMMWORD[32+rsi]
+	movdqu	xmm3,XMMWORD[48+rsi]
+	movdqu	xmm4,XMMWORD[64+rsi]
+	movdqu	xmm5,XMMWORD[80+rsi]
+	mov	rax,QWORD[((64+0))+rsi]
+	mov	r14,QWORD[((64+8))+rsi]
+	mov	r15,QWORD[((64+16))+rsi]
+	mov	r8,QWORD[((64+24))+rsi]
+	movdqa	XMMWORD[320+rsp],xmm0
+	movdqa	XMMWORD[(320+16)+rsp],xmm1
+	movdqa	XMMWORD[352+rsp],xmm2
+	movdqa	XMMWORD[(352+16)+rsp],xmm3
+	movdqa	XMMWORD[384+rsp],xmm4
+	movdqa	XMMWORD[(384+16)+rsp],xmm5
+	por	xmm5,xmm4
+
+	movdqu	xmm0,XMMWORD[rbx]
+	pshufd	xmm3,xmm5,0xb1
+	movdqu	xmm1,XMMWORD[16+rbx]
+	movdqu	xmm2,XMMWORD[32+rbx]
+	por	xmm5,xmm3
+	movdqu	xmm3,XMMWORD[48+rbx]
+	movdqa	XMMWORD[416+rsp],xmm0
+	pshufd	xmm4,xmm5,0x1e
+	movdqa	XMMWORD[(416+16)+rsp],xmm1
+	por	xmm1,xmm0
+DB	102,72,15,110,199
+	movdqa	XMMWORD[448+rsp],xmm2
+	movdqa	XMMWORD[(448+16)+rsp],xmm3
+	por	xmm3,xmm2
+	por	xmm5,xmm4
+	pxor	xmm4,xmm4
+	por	xmm3,xmm1
+
+	lea	rsi,[((64-0))+rsi]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_sqr_montq
+
+	pcmpeqd	xmm5,xmm4
+	pshufd	xmm4,xmm3,0xb1
+	mov	rax,QWORD[rbx]
+
+	mov	r9,r12
+	por	xmm4,xmm3
+	pshufd	xmm5,xmm5,0
+	pshufd	xmm3,xmm4,0x1e
+	mov	r10,r13
+	por	xmm4,xmm3
+	pxor	xmm3,xmm3
+	mov	r11,r14
+	pcmpeqd	xmm4,xmm3
+	pshufd	xmm4,xmm4,0
+
+	lea	rsi,[((32-0))+rsp]
+	mov	r12,r15
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_mul_montq
+
+	lea	rbx,[320+rsp]
+	lea	rdi,[64+rsp]
+	call	__ecp_nistz256_sub_fromq
+
+	mov	rax,QWORD[384+rsp]
+	lea	rbx,[384+rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((0+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	mov	rax,QWORD[384+rsp]
+	lea	rbx,[384+rsp]
+	mov	r9,QWORD[((0+64))+rsp]
+	mov	r10,QWORD[((8+64))+rsp]
+	lea	rsi,[((0+64))+rsp]
+	mov	r11,QWORD[((16+64))+rsp]
+	mov	r12,QWORD[((24+64))+rsp]
+	lea	rdi,[288+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	mov	rax,QWORD[448+rsp]
+	lea	rbx,[448+rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((0+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	lea	rbx,[352+rsp]
+	lea	rdi,[96+rsp]
+	call	__ecp_nistz256_sub_fromq
+
+	mov	rax,QWORD[((0+64))+rsp]
+	mov	r14,QWORD[((8+64))+rsp]
+	lea	rsi,[((0+64))+rsp]
+	mov	r15,QWORD[((16+64))+rsp]
+	mov	r8,QWORD[((24+64))+rsp]
+	lea	rdi,[128+rsp]
+	call	__ecp_nistz256_sqr_montq
+
+	mov	rax,QWORD[((0+96))+rsp]
+	mov	r14,QWORD[((8+96))+rsp]
+	lea	rsi,[((0+96))+rsp]
+	mov	r15,QWORD[((16+96))+rsp]
+	mov	r8,QWORD[((24+96))+rsp]
+	lea	rdi,[192+rsp]
+	call	__ecp_nistz256_sqr_montq
+
+	mov	rax,QWORD[128+rsp]
+	lea	rbx,[128+rsp]
+	mov	r9,QWORD[((0+64))+rsp]
+	mov	r10,QWORD[((8+64))+rsp]
+	lea	rsi,[((0+64))+rsp]
+	mov	r11,QWORD[((16+64))+rsp]
+	mov	r12,QWORD[((24+64))+rsp]
+	lea	rdi,[160+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	mov	rax,QWORD[320+rsp]
+	lea	rbx,[320+rsp]
+	mov	r9,QWORD[((0+128))+rsp]
+	mov	r10,QWORD[((8+128))+rsp]
+	lea	rsi,[((0+128))+rsp]
+	mov	r11,QWORD[((16+128))+rsp]
+	mov	r12,QWORD[((24+128))+rsp]
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_mul_montq
+
+
+
+
+	xor	r11,r11
+	add	r12,r12
+	lea	rsi,[192+rsp]
+	adc	r13,r13
+	mov	rax,r12
+	adc	r8,r8
+	adc	r9,r9
+	mov	rbp,r13
+	adc	r11,0
+
+	sub	r12,-1
+	mov	rcx,r8
+	sbb	r13,r14
+	sbb	r8,0
+	mov	r10,r9
+	sbb	r9,r15
+	sbb	r11,0
+
+	cmovc	r12,rax
+	mov	rax,QWORD[rsi]
+	cmovc	r13,rbp
+	mov	rbp,QWORD[8+rsi]
+	cmovc	r8,rcx
+	mov	rcx,QWORD[16+rsi]
+	cmovc	r9,r10
+	mov	r10,QWORD[24+rsi]
+
+	call	__ecp_nistz256_subq
+
+	lea	rbx,[160+rsp]
+	lea	rdi,[224+rsp]
+	call	__ecp_nistz256_sub_fromq
+
+	mov	rax,QWORD[((0+0))+rsp]
+	mov	rbp,QWORD[((0+8))+rsp]
+	mov	rcx,QWORD[((0+16))+rsp]
+	mov	r10,QWORD[((0+24))+rsp]
+	lea	rdi,[64+rsp]
+
+	call	__ecp_nistz256_subq
+
+	mov	QWORD[rdi],r12
+	mov	QWORD[8+rdi],r13
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+	mov	rax,QWORD[352+rsp]
+	lea	rbx,[352+rsp]
+	mov	r9,QWORD[((0+160))+rsp]
+	mov	r10,QWORD[((8+160))+rsp]
+	lea	rsi,[((0+160))+rsp]
+	mov	r11,QWORD[((16+160))+rsp]
+	mov	r12,QWORD[((24+160))+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	mov	rax,QWORD[96+rsp]
+	lea	rbx,[96+rsp]
+	mov	r9,QWORD[((0+64))+rsp]
+	mov	r10,QWORD[((8+64))+rsp]
+	lea	rsi,[((0+64))+rsp]
+	mov	r11,QWORD[((16+64))+rsp]
+	mov	r12,QWORD[((24+64))+rsp]
+	lea	rdi,[64+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	lea	rbx,[32+rsp]
+	lea	rdi,[256+rsp]
+	call	__ecp_nistz256_sub_fromq
+
+DB	102,72,15,126,199
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm1,xmm5
+	pandn	xmm0,XMMWORD[288+rsp]
+	movdqa	xmm2,xmm5
+	pandn	xmm1,XMMWORD[((288+16))+rsp]
+	movdqa	xmm3,xmm5
+	pand	xmm2,XMMWORD[$L$ONE_mont]
+	pand	xmm3,XMMWORD[(($L$ONE_mont+16))]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm4
+	pandn	xmm0,xmm2
+	movdqa	xmm2,xmm4
+	pandn	xmm1,xmm3
+	movdqa	xmm3,xmm4
+	pand	xmm2,XMMWORD[384+rsp]
+	pand	xmm3,XMMWORD[((384+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+	movdqu	XMMWORD[64+rdi],xmm2
+	movdqu	XMMWORD[80+rdi],xmm3
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm1,xmm5
+	pandn	xmm0,XMMWORD[224+rsp]
+	movdqa	xmm2,xmm5
+	pandn	xmm1,XMMWORD[((224+16))+rsp]
+	movdqa	xmm3,xmm5
+	pand	xmm2,XMMWORD[416+rsp]
+	pand	xmm3,XMMWORD[((416+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm4
+	pandn	xmm0,xmm2
+	movdqa	xmm2,xmm4
+	pandn	xmm1,xmm3
+	movdqa	xmm3,xmm4
+	pand	xmm2,XMMWORD[320+rsp]
+	pand	xmm3,XMMWORD[((320+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+	movdqu	XMMWORD[rdi],xmm2
+	movdqu	XMMWORD[16+rdi],xmm3
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm1,xmm5
+	pandn	xmm0,XMMWORD[256+rsp]
+	movdqa	xmm2,xmm5
+	pandn	xmm1,XMMWORD[((256+16))+rsp]
+	movdqa	xmm3,xmm5
+	pand	xmm2,XMMWORD[448+rsp]
+	pand	xmm3,XMMWORD[((448+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm4
+	pandn	xmm0,xmm2
+	movdqa	xmm2,xmm4
+	pandn	xmm1,xmm3
+	movdqa	xmm3,xmm4
+	pand	xmm2,XMMWORD[352+rsp]
+	pand	xmm3,XMMWORD[((352+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+	movdqu	XMMWORD[32+rdi],xmm2
+	movdqu	XMMWORD[48+rdi],xmm3
+
+	lea	rsi,[((480+56))+rsp]
+
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbx,QWORD[((-16))+rsi]
+
+	mov	rbp,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$add_affineq_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_ecp_nistz256_point_add_affine:
+
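+; ADX-path twin of __ecp_nistz256_add_toq (the leading xor clears CF, so the
+; adc chain behaves as a plain add).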
+ALIGN	32
+__ecp_nistz256_add_tox:
+
+	xor	r11,r11
+	adc	r12,QWORD[rbx]
+	adc	r13,QWORD[8+rbx]
+	mov	rax,r12
+	adc	r8,QWORD[16+rbx]
+	adc	r9,QWORD[24+rbx]
+	mov	rbp,r13
+	adc	r11,0
+
+	xor	r10,r10
+	sbb	r12,-1
+	mov	rcx,r8
+	sbb	r13,r14
+	sbb	r8,0
+	mov	r10,r9
+	sbb	r9,r15
+	sbb	r11,0
+
+	cmovc	r12,rax
+	cmovc	r13,rbp
+	mov	QWORD[rdi],r12
+	cmovc	r8,rcx
+	mov	QWORD[8+rdi],r13
+	cmovc	r9,r10
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+
+	ret
+
+
+
+
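+; ADX-path twin of __ecp_nistz256_sub_fromq.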
+ALIGN	32
+__ecp_nistz256_sub_fromx:
+
+	xor	r11,r11
+	sbb	r12,QWORD[rbx]
+	sbb	r13,QWORD[8+rbx]
+	mov	rax,r12
+	sbb	r8,QWORD[16+rbx]
+	sbb	r9,QWORD[24+rbx]
+	mov	rbp,r13
+	sbb	r11,0
+
+	xor	r10,r10
+	adc	r12,-1
+	mov	rcx,r8
+	adc	r13,r14
+	adc	r8,0
+	mov	r10,r9
+	adc	r9,r15
+
+	bt	r11,0
+	cmovnc	r12,rax
+	cmovnc	r13,rbp
+	mov	QWORD[rdi],r12
+	cmovnc	r8,rcx
+	mov	QWORD[8+rdi],r13
+	cmovnc	r9,r10
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+
+	ret
+
+
+
+
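+; ADX-path twin of __ecp_nistz256_subq.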
+ALIGN	32
+__ecp_nistz256_subx:
+
+	xor	r11,r11
+	sbb	rax,r12
+	sbb	rbp,r13
+	mov	r12,rax
+	sbb	rcx,r8
+	sbb	r10,r9
+	mov	r13,rbp
+	sbb	r11,0
+
+	xor	r9,r9
+	adc	rax,-1
+	mov	r8,rcx
+	adc	rbp,r14
+	adc	rcx,0
+	mov	r9,r10
+	adc	r10,r15
+
+	bt	r11,0
+	cmovc	r12,rax
+	cmovc	r13,rbp
+	cmovc	r8,rcx
+	cmovc	r9,r10
+
+	ret
+
+
+
+
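+; ADX-path twin of __ecp_nistz256_mul_by_2q.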
+ALIGN	32
+__ecp_nistz256_mul_by_2x:
+
+	xor	r11,r11
+	adc	r12,r12
+	adc	r13,r13
+	mov	rax,r12
+	adc	r8,r8
+	adc	r9,r9
+	mov	rbp,r13
+	adc	r11,0
+
+	xor	r10,r10
+	sbb	r12,-1
+	mov	rcx,r8
+	sbb	r13,r14
+	sbb	r8,0
+	mov	r10,r9
+	sbb	r9,r15
+	sbb	r11,0
+
+	cmovc	r12,rax
+	cmovc	r13,rbp
+	mov	QWORD[rdi],r12
+	cmovc	r8,rcx
+	mov	QWORD[8+rdi],r13
+	cmovc	r9,r10
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+
+	ret
+
+
+
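+; MULX/ADX variant of ecp_nistz256_point_double.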
+ALIGN	32
+ecp_nistz256_point_doublex:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_ecp_nistz256_point_doublex:
+	mov	rdi,rcx
+	mov	rsi,rdx
+
+
+
+$L$point_doublex:
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	sub	rsp,32*5+8
+
+$L$point_doublex_body:
+
+$L$point_double_shortcutx:
+	movdqu	xmm0,XMMWORD[rsi]
+	mov	rbx,rsi
+	movdqu	xmm1,XMMWORD[16+rsi]
+	mov	r12,QWORD[((32+0))+rsi]
+	mov	r13,QWORD[((32+8))+rsi]
+	mov	r8,QWORD[((32+16))+rsi]
+	mov	r9,QWORD[((32+24))+rsi]
+	mov	r14,QWORD[(($L$poly+8))]
+	mov	r15,QWORD[(($L$poly+24))]
+	movdqa	XMMWORD[96+rsp],xmm0
+	movdqa	XMMWORD[(96+16)+rsp],xmm1
+	lea	r10,[32+rdi]
+	lea	r11,[64+rdi]
+DB	102,72,15,110,199
+DB	102,73,15,110,202
+DB	102,73,15,110,211
+
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_mul_by_2x
+
+	mov	rdx,QWORD[((64+0))+rsi]
+	mov	r14,QWORD[((64+8))+rsi]
+	mov	r15,QWORD[((64+16))+rsi]
+	mov	r8,QWORD[((64+24))+rsi]
+	lea	rsi,[((64-128))+rsi]
+	lea	rdi,[64+rsp]
+	call	__ecp_nistz256_sqr_montx
+
+	mov	rdx,QWORD[((0+0))+rsp]
+	mov	r14,QWORD[((8+0))+rsp]
+	lea	rsi,[((-128+0))+rsp]
+	mov	r15,QWORD[((16+0))+rsp]
+	mov	r8,QWORD[((24+0))+rsp]
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_sqr_montx
+
+	mov	rdx,QWORD[32+rbx]
+	mov	r9,QWORD[((64+0))+rbx]
+	mov	r10,QWORD[((64+8))+rbx]
+	mov	r11,QWORD[((64+16))+rbx]
+	mov	r12,QWORD[((64+24))+rbx]
+	lea	rsi,[((64-128))+rbx]
+	lea	rbx,[32+rbx]
+DB	102,72,15,126,215
+	call	__ecp_nistz256_mul_montx
+	call	__ecp_nistz256_mul_by_2x
+
+	mov	r12,QWORD[((96+0))+rsp]
+	mov	r13,QWORD[((96+8))+rsp]
+	lea	rbx,[64+rsp]
+	mov	r8,QWORD[((96+16))+rsp]
+	mov	r9,QWORD[((96+24))+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_add_tox
+
+	mov	r12,QWORD[((96+0))+rsp]
+	mov	r13,QWORD[((96+8))+rsp]
+	lea	rbx,[64+rsp]
+	mov	r8,QWORD[((96+16))+rsp]
+	mov	r9,QWORD[((96+24))+rsp]
+	lea	rdi,[64+rsp]
+	call	__ecp_nistz256_sub_fromx
+
+	mov	rdx,QWORD[((0+0))+rsp]
+	mov	r14,QWORD[((8+0))+rsp]
+	lea	rsi,[((-128+0))+rsp]
+	mov	r15,QWORD[((16+0))+rsp]
+	mov	r8,QWORD[((24+0))+rsp]
+DB	102,72,15,126,207
+	call	__ecp_nistz256_sqr_montx
+	xor	r9,r9
+	mov	rax,r12
+	add	r12,-1
+	mov	r10,r13
+	adc	r13,rsi
+	mov	rcx,r14
+	adc	r14,0
+	mov	r8,r15
+	adc	r15,rbp
+	adc	r9,0
+	xor	rsi,rsi
+	test	rax,1
+
+	cmovz	r12,rax
+	cmovz	r13,r10
+	cmovz	r14,rcx
+	cmovz	r15,r8
+	cmovz	r9,rsi
+
+	mov	rax,r13
+	shr	r12,1
+	shl	rax,63
+	mov	r10,r14
+	shr	r13,1
+	or	r12,rax
+	shl	r10,63
+	mov	rcx,r15
+	shr	r14,1
+	or	r13,r10
+	shl	rcx,63
+	mov	QWORD[rdi],r12
+	shr	r15,1
+	mov	QWORD[8+rdi],r13
+	shl	r9,63
+	or	r14,rcx
+	or	r15,r9
+	mov	QWORD[16+rdi],r14
+	mov	QWORD[24+rdi],r15
+	mov	rdx,QWORD[64+rsp]
+	lea	rbx,[64+rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((-128+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	lea	rdi,[128+rsp]
+	call	__ecp_nistz256_mul_by_2x
+
+	lea	rbx,[32+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_add_tox
+
+	mov	rdx,QWORD[96+rsp]
+	lea	rbx,[96+rsp]
+	mov	r9,QWORD[((0+0))+rsp]
+	mov	r10,QWORD[((8+0))+rsp]
+	lea	rsi,[((-128+0))+rsp]
+	mov	r11,QWORD[((16+0))+rsp]
+	mov	r12,QWORD[((24+0))+rsp]
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_mul_montx
+
+	lea	rdi,[128+rsp]
+	call	__ecp_nistz256_mul_by_2x
+
+	mov	rdx,QWORD[((0+32))+rsp]
+	mov	r14,QWORD[((8+32))+rsp]
+	lea	rsi,[((-128+32))+rsp]
+	mov	r15,QWORD[((16+32))+rsp]
+	mov	r8,QWORD[((24+32))+rsp]
+DB	102,72,15,126,199
+	call	__ecp_nistz256_sqr_montx
+
+	lea	rbx,[128+rsp]
+	mov	r8,r14
+	mov	r9,r15
+	mov	r14,rsi
+	mov	r15,rbp
+	call	__ecp_nistz256_sub_fromx
+
+	mov	rax,QWORD[((0+0))+rsp]
+	mov	rbp,QWORD[((0+8))+rsp]
+	mov	rcx,QWORD[((0+16))+rsp]
+	mov	r10,QWORD[((0+24))+rsp]
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_subx
+
+	mov	rdx,QWORD[32+rsp]
+	lea	rbx,[32+rsp]
+	mov	r14,r12
+	xor	ecx,ecx
+	mov	QWORD[((0+0))+rsp],r12
+	mov	r10,r13
+	mov	QWORD[((0+8))+rsp],r13
+	cmovz	r11,r8
+	mov	QWORD[((0+16))+rsp],r8
+	lea	rsi,[((0-128))+rsp]
+	cmovz	r12,r9
+	mov	QWORD[((0+24))+rsp],r9
+	mov	r9,r14
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_mul_montx
+
+DB	102,72,15,126,203
+DB	102,72,15,126,207
+	call	__ecp_nistz256_sub_fromx
+
+	lea	rsi,[((160+56))+rsp]
+
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbx,QWORD[((-16))+rsi]
+
+	mov	rbp,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$point_doublex_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_ecp_nistz256_point_doublex:
+
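+; MULX/ADX variant of ecp_nistz256_point_add.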
+ALIGN	32
+ecp_nistz256_point_addx:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_ecp_nistz256_point_addx:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+$L$point_addx:
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	sub	rsp,32*18+8
+
+$L$point_addx_body:
+
+	movdqu	xmm0,XMMWORD[rsi]
+	movdqu	xmm1,XMMWORD[16+rsi]
+	movdqu	xmm2,XMMWORD[32+rsi]
+	movdqu	xmm3,XMMWORD[48+rsi]
+	movdqu	xmm4,XMMWORD[64+rsi]
+	movdqu	xmm5,XMMWORD[80+rsi]
+	mov	rbx,rsi
+	mov	rsi,rdx
+	movdqa	XMMWORD[384+rsp],xmm0
+	movdqa	XMMWORD[(384+16)+rsp],xmm1
+	movdqa	XMMWORD[416+rsp],xmm2
+	movdqa	XMMWORD[(416+16)+rsp],xmm3
+	movdqa	XMMWORD[448+rsp],xmm4
+	movdqa	XMMWORD[(448+16)+rsp],xmm5
+	por	xmm5,xmm4
+
+	movdqu	xmm0,XMMWORD[rsi]
+	pshufd	xmm3,xmm5,0xb1
+	movdqu	xmm1,XMMWORD[16+rsi]
+	movdqu	xmm2,XMMWORD[32+rsi]
+	por	xmm5,xmm3
+	movdqu	xmm3,XMMWORD[48+rsi]
+	mov	rdx,QWORD[((64+0))+rsi]
+	mov	r14,QWORD[((64+8))+rsi]
+	mov	r15,QWORD[((64+16))+rsi]
+	mov	r8,QWORD[((64+24))+rsi]
+	movdqa	XMMWORD[480+rsp],xmm0
+	pshufd	xmm4,xmm5,0x1e
+	movdqa	XMMWORD[(480+16)+rsp],xmm1
+	movdqu	xmm0,XMMWORD[64+rsi]
+	movdqu	xmm1,XMMWORD[80+rsi]
+	movdqa	XMMWORD[512+rsp],xmm2
+	movdqa	XMMWORD[(512+16)+rsp],xmm3
+	por	xmm5,xmm4
+	pxor	xmm4,xmm4
+	por	xmm1,xmm0
+DB	102,72,15,110,199
+
+	lea	rsi,[((64-128))+rsi]
+	mov	QWORD[((544+0))+rsp],rdx
+	mov	QWORD[((544+8))+rsp],r14
+	mov	QWORD[((544+16))+rsp],r15
+	mov	QWORD[((544+24))+rsp],r8
+	lea	rdi,[96+rsp]
+	call	__ecp_nistz256_sqr_montx
+
+	pcmpeqd	xmm5,xmm4
+	pshufd	xmm4,xmm1,0xb1
+	por	xmm4,xmm1
+	pshufd	xmm5,xmm5,0
+	pshufd	xmm3,xmm4,0x1e
+	por	xmm4,xmm3
+	pxor	xmm3,xmm3
+	pcmpeqd	xmm4,xmm3
+	pshufd	xmm4,xmm4,0
+	mov	rdx,QWORD[((64+0))+rbx]
+	mov	r14,QWORD[((64+8))+rbx]
+	mov	r15,QWORD[((64+16))+rbx]
+	mov	r8,QWORD[((64+24))+rbx]
+DB	102,72,15,110,203
+
+	lea	rsi,[((64-128))+rbx]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_sqr_montx
+
+	mov	rdx,QWORD[544+rsp]
+	lea	rbx,[544+rsp]
+	mov	r9,QWORD[((0+96))+rsp]
+	mov	r10,QWORD[((8+96))+rsp]
+	lea	rsi,[((-128+96))+rsp]
+	mov	r11,QWORD[((16+96))+rsp]
+	mov	r12,QWORD[((24+96))+rsp]
+	lea	rdi,[224+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	mov	rdx,QWORD[448+rsp]
+	lea	rbx,[448+rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((-128+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[256+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	mov	rdx,QWORD[416+rsp]
+	lea	rbx,[416+rsp]
+	mov	r9,QWORD[((0+224))+rsp]
+	mov	r10,QWORD[((8+224))+rsp]
+	lea	rsi,[((-128+224))+rsp]
+	mov	r11,QWORD[((16+224))+rsp]
+	mov	r12,QWORD[((24+224))+rsp]
+	lea	rdi,[224+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	mov	rdx,QWORD[512+rsp]
+	lea	rbx,[512+rsp]
+	mov	r9,QWORD[((0+256))+rsp]
+	mov	r10,QWORD[((8+256))+rsp]
+	lea	rsi,[((-128+256))+rsp]
+	mov	r11,QWORD[((16+256))+rsp]
+	mov	r12,QWORD[((24+256))+rsp]
+	lea	rdi,[256+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	lea	rbx,[224+rsp]
+	lea	rdi,[64+rsp]
+	call	__ecp_nistz256_sub_fromx
+
+	or	r12,r13
+	movdqa	xmm2,xmm4
+	or	r12,r8
+	or	r12,r9
+	por	xmm2,xmm5
+DB	102,73,15,110,220
+
+	mov	rdx,QWORD[384+rsp]
+	lea	rbx,[384+rsp]
+	mov	r9,QWORD[((0+96))+rsp]
+	mov	r10,QWORD[((8+96))+rsp]
+	lea	rsi,[((-128+96))+rsp]
+	mov	r11,QWORD[((16+96))+rsp]
+	mov	r12,QWORD[((24+96))+rsp]
+	lea	rdi,[160+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	mov	rdx,QWORD[480+rsp]
+	lea	rbx,[480+rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((-128+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[192+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	lea	rbx,[160+rsp]
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_sub_fromx
+
+	or	r12,r13
+	or	r12,r8
+	or	r12,r9
+
+DB	102,73,15,126,208
+DB	102,73,15,126,217
+	or	r12,r8
+	DB	0x3e
+	jnz	NEAR $L$add_proceedx
+
+
+
+	test	r9,r9
+	jz	NEAR $L$add_doublex
+
+
+
+
+
+
+DB	102,72,15,126,199
+	pxor	xmm0,xmm0
+	movdqu	XMMWORD[rdi],xmm0
+	movdqu	XMMWORD[16+rdi],xmm0
+	movdqu	XMMWORD[32+rdi],xmm0
+	movdqu	XMMWORD[48+rdi],xmm0
+	movdqu	XMMWORD[64+rdi],xmm0
+	movdqu	XMMWORD[80+rdi],xmm0
+	jmp	NEAR $L$add_donex
+
+ALIGN	32
+$L$add_doublex:
+DB	102,72,15,126,206
+DB	102,72,15,126,199
+	add	rsp,416
+
+	jmp	NEAR $L$point_double_shortcutx
+
+
+ALIGN	32
+$L$add_proceedx:
+	mov	rdx,QWORD[((0+64))+rsp]
+	mov	r14,QWORD[((8+64))+rsp]
+	lea	rsi,[((-128+64))+rsp]
+	mov	r15,QWORD[((16+64))+rsp]
+	mov	r8,QWORD[((24+64))+rsp]
+	lea	rdi,[96+rsp]
+	call	__ecp_nistz256_sqr_montx
+
+	mov	rdx,QWORD[448+rsp]
+	lea	rbx,[448+rsp]
+	mov	r9,QWORD[((0+0))+rsp]
+	mov	r10,QWORD[((8+0))+rsp]
+	lea	rsi,[((-128+0))+rsp]
+	mov	r11,QWORD[((16+0))+rsp]
+	mov	r12,QWORD[((24+0))+rsp]
+	lea	rdi,[352+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	mov	rdx,QWORD[((0+0))+rsp]
+	mov	r14,QWORD[((8+0))+rsp]
+	lea	rsi,[((-128+0))+rsp]
+	mov	r15,QWORD[((16+0))+rsp]
+	mov	r8,QWORD[((24+0))+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_sqr_montx
+
+	mov	rdx,QWORD[544+rsp]
+	lea	rbx,[544+rsp]
+	mov	r9,QWORD[((0+352))+rsp]
+	mov	r10,QWORD[((8+352))+rsp]
+	lea	rsi,[((-128+352))+rsp]
+	mov	r11,QWORD[((16+352))+rsp]
+	mov	r12,QWORD[((24+352))+rsp]
+	lea	rdi,[352+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	mov	rdx,QWORD[rsp]
+	lea	rbx,[rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((-128+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[128+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	mov	rdx,QWORD[160+rsp]
+	lea	rbx,[160+rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((-128+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[192+rsp]
+	call	__ecp_nistz256_mul_montx
+
+
+
+
+	xor	r11,r11
+	add	r12,r12
+	lea	rsi,[96+rsp]
+	adc	r13,r13
+	mov	rax,r12
+	adc	r8,r8
+	adc	r9,r9
+	mov	rbp,r13
+	adc	r11,0
+
+	sub	r12,-1
+	mov	rcx,r8
+	sbb	r13,r14
+	sbb	r8,0
+	mov	r10,r9
+	sbb	r9,r15
+	sbb	r11,0
+
+	cmovc	r12,rax
+	mov	rax,QWORD[rsi]
+	cmovc	r13,rbp
+	mov	rbp,QWORD[8+rsi]
+	cmovc	r8,rcx
+	mov	rcx,QWORD[16+rsi]
+	cmovc	r9,r10
+	mov	r10,QWORD[24+rsi]
+
+	call	__ecp_nistz256_subx
+
+	lea	rbx,[128+rsp]
+	lea	rdi,[288+rsp]
+	call	__ecp_nistz256_sub_fromx
+
+	mov	rax,QWORD[((192+0))+rsp]
+	mov	rbp,QWORD[((192+8))+rsp]
+	mov	rcx,QWORD[((192+16))+rsp]
+	mov	r10,QWORD[((192+24))+rsp]
+	lea	rdi,[320+rsp]
+
+	call	__ecp_nistz256_subx
+
+	mov	QWORD[rdi],r12
+	mov	QWORD[8+rdi],r13
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+	mov	rdx,QWORD[128+rsp]
+	lea	rbx,[128+rsp]
+	mov	r9,QWORD[((0+224))+rsp]
+	mov	r10,QWORD[((8+224))+rsp]
+	lea	rsi,[((-128+224))+rsp]
+	mov	r11,QWORD[((16+224))+rsp]
+	mov	r12,QWORD[((24+224))+rsp]
+	lea	rdi,[256+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	mov	rdx,QWORD[320+rsp]
+	lea	rbx,[320+rsp]
+	mov	r9,QWORD[((0+64))+rsp]
+	mov	r10,QWORD[((8+64))+rsp]
+	lea	rsi,[((-128+64))+rsp]
+	mov	r11,QWORD[((16+64))+rsp]
+	mov	r12,QWORD[((24+64))+rsp]
+	lea	rdi,[320+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	lea	rbx,[256+rsp]
+	lea	rdi,[320+rsp]
+	call	__ecp_nistz256_sub_fromx
+
+DB	102,72,15,126,199
+
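+; Constant-time select of the result coordinates: xmm4 and xmm5 were set to
+; all-ones masks when an input point is at infinity, so the pandn/pand/por
+; chains below blend each computed coordinate with the matching input value.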
+	movdqa	xmm0,xmm5
+	movdqa	xmm1,xmm5
+	pandn	xmm0,XMMWORD[352+rsp]
+	movdqa	xmm2,xmm5
+	pandn	xmm1,XMMWORD[((352+16))+rsp]
+	movdqa	xmm3,xmm5
+	pand	xmm2,XMMWORD[544+rsp]
+	pand	xmm3,XMMWORD[((544+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm4
+	pandn	xmm0,xmm2
+	movdqa	xmm2,xmm4
+	pandn	xmm1,xmm3
+	movdqa	xmm3,xmm4
+	pand	xmm2,XMMWORD[448+rsp]
+	pand	xmm3,XMMWORD[((448+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+	movdqu	XMMWORD[64+rdi],xmm2
+	movdqu	XMMWORD[80+rdi],xmm3
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm1,xmm5
+	pandn	xmm0,XMMWORD[288+rsp]
+	movdqa	xmm2,xmm5
+	pandn	xmm1,XMMWORD[((288+16))+rsp]
+	movdqa	xmm3,xmm5
+	pand	xmm2,XMMWORD[480+rsp]
+	pand	xmm3,XMMWORD[((480+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm4
+	pandn	xmm0,xmm2
+	movdqa	xmm2,xmm4
+	pandn	xmm1,xmm3
+	movdqa	xmm3,xmm4
+	pand	xmm2,XMMWORD[384+rsp]
+	pand	xmm3,XMMWORD[((384+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+	movdqu	XMMWORD[rdi],xmm2
+	movdqu	XMMWORD[16+rdi],xmm3
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm1,xmm5
+	pandn	xmm0,XMMWORD[320+rsp]
+	movdqa	xmm2,xmm5
+	pandn	xmm1,XMMWORD[((320+16))+rsp]
+	movdqa	xmm3,xmm5
+	pand	xmm2,XMMWORD[512+rsp]
+	pand	xmm3,XMMWORD[((512+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm4
+	pandn	xmm0,xmm2
+	movdqa	xmm2,xmm4
+	pandn	xmm1,xmm3
+	movdqa	xmm3,xmm4
+	pand	xmm2,XMMWORD[416+rsp]
+	pand	xmm3,XMMWORD[((416+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+	movdqu	XMMWORD[32+rdi],xmm2
+	movdqu	XMMWORD[48+rdi],xmm3
+
+$L$add_donex:
+	lea	rsi,[((576+56))+rsp]
+
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbx,QWORD[((-16))+rsi]
+
+	mov	rbp,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$point_addx_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_ecp_nistz256_point_addx:
+
+ALIGN	32
+ecp_nistz256_point_add_affinex:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_ecp_nistz256_point_add_affinex:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+$L$point_add_affinex:
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	sub	rsp,32*15+8
+
+$L$add_affinex_body:
+
+	movdqu	xmm0,XMMWORD[rsi]
+	mov	rbx,rdx
+	movdqu	xmm1,XMMWORD[16+rsi]
+	movdqu	xmm2,XMMWORD[32+rsi]
+	movdqu	xmm3,XMMWORD[48+rsi]
+	movdqu	xmm4,XMMWORD[64+rsi]
+	movdqu	xmm5,XMMWORD[80+rsi]
+	mov	rdx,QWORD[((64+0))+rsi]
+	mov	r14,QWORD[((64+8))+rsi]
+	mov	r15,QWORD[((64+16))+rsi]
+	mov	r8,QWORD[((64+24))+rsi]
+	movdqa	XMMWORD[320+rsp],xmm0
+	movdqa	XMMWORD[(320+16)+rsp],xmm1
+	movdqa	XMMWORD[352+rsp],xmm2
+	movdqa	XMMWORD[(352+16)+rsp],xmm3
+	movdqa	XMMWORD[384+rsp],xmm4
+	movdqa	XMMWORD[(384+16)+rsp],xmm5
+	por	xmm5,xmm4
+
+	movdqu	xmm0,XMMWORD[rbx]
+	pshufd	xmm3,xmm5,0xb1
+	movdqu	xmm1,XMMWORD[16+rbx]
+	movdqu	xmm2,XMMWORD[32+rbx]
+	por	xmm5,xmm3
+	movdqu	xmm3,XMMWORD[48+rbx]
+	movdqa	XMMWORD[416+rsp],xmm0
+	pshufd	xmm4,xmm5,0x1e
+	movdqa	XMMWORD[(416+16)+rsp],xmm1
+	por	xmm1,xmm0
+DB	102,72,15,110,199
+	movdqa	XMMWORD[448+rsp],xmm2
+	movdqa	XMMWORD[(448+16)+rsp],xmm3
+	por	xmm3,xmm2
+	por	xmm5,xmm4
+	pxor	xmm4,xmm4
+	por	xmm3,xmm1
+
+	lea	rsi,[((64-128))+rsi]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_sqr_montx
+
+	pcmpeqd	xmm5,xmm4
+	pshufd	xmm4,xmm3,0xb1
+	mov	rdx,QWORD[rbx]
+
+	mov	r9,r12
+	por	xmm4,xmm3
+	pshufd	xmm5,xmm5,0
+	pshufd	xmm3,xmm4,0x1e
+	mov	r10,r13
+	por	xmm4,xmm3
+	pxor	xmm3,xmm3
+	mov	r11,r14
+	pcmpeqd	xmm4,xmm3
+	pshufd	xmm4,xmm4,0
+
+	lea	rsi,[((32-128))+rsp]
+	mov	r12,r15
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_mul_montx
+
+	lea	rbx,[320+rsp]
+	lea	rdi,[64+rsp]
+	call	__ecp_nistz256_sub_fromx
+
+	mov	rdx,QWORD[384+rsp]
+	lea	rbx,[384+rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((-128+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	mov	rdx,QWORD[384+rsp]
+	lea	rbx,[384+rsp]
+	mov	r9,QWORD[((0+64))+rsp]
+	mov	r10,QWORD[((8+64))+rsp]
+	lea	rsi,[((-128+64))+rsp]
+	mov	r11,QWORD[((16+64))+rsp]
+	mov	r12,QWORD[((24+64))+rsp]
+	lea	rdi,[288+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	mov	rdx,QWORD[448+rsp]
+	lea	rbx,[448+rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((-128+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	lea	rbx,[352+rsp]
+	lea	rdi,[96+rsp]
+	call	__ecp_nistz256_sub_fromx
+
+	mov	rdx,QWORD[((0+64))+rsp]
+	mov	r14,QWORD[((8+64))+rsp]
+	lea	rsi,[((-128+64))+rsp]
+	mov	r15,QWORD[((16+64))+rsp]
+	mov	r8,QWORD[((24+64))+rsp]
+	lea	rdi,[128+rsp]
+	call	__ecp_nistz256_sqr_montx
+
+	mov	rdx,QWORD[((0+96))+rsp]
+	mov	r14,QWORD[((8+96))+rsp]
+	lea	rsi,[((-128+96))+rsp]
+	mov	r15,QWORD[((16+96))+rsp]
+	mov	r8,QWORD[((24+96))+rsp]
+	lea	rdi,[192+rsp]
+	call	__ecp_nistz256_sqr_montx
+
+	mov	rdx,QWORD[128+rsp]
+	lea	rbx,[128+rsp]
+	mov	r9,QWORD[((0+64))+rsp]
+	mov	r10,QWORD[((8+64))+rsp]
+	lea	rsi,[((-128+64))+rsp]
+	mov	r11,QWORD[((16+64))+rsp]
+	mov	r12,QWORD[((24+64))+rsp]
+	lea	rdi,[160+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	mov	rdx,QWORD[320+rsp]
+	lea	rbx,[320+rsp]
+	mov	r9,QWORD[((0+128))+rsp]
+	mov	r10,QWORD[((8+128))+rsp]
+	lea	rsi,[((-128+128))+rsp]
+	mov	r11,QWORD[((16+128))+rsp]
+	mov	r12,QWORD[((24+128))+rsp]
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_mul_montx
+
+
+
+
+	xor	r11,r11
+	add	r12,r12
+	lea	rsi,[192+rsp]
+	adc	r13,r13
+	mov	rax,r12
+	adc	r8,r8
+	adc	r9,r9
+	mov	rbp,r13
+	adc	r11,0
+
+	sub	r12,-1
+	mov	rcx,r8
+	sbb	r13,r14
+	sbb	r8,0
+	mov	r10,r9
+	sbb	r9,r15
+	sbb	r11,0
+
+	cmovc	r12,rax
+	mov	rax,QWORD[rsi]
+	cmovc	r13,rbp
+	mov	rbp,QWORD[8+rsi]
+	cmovc	r8,rcx
+	mov	rcx,QWORD[16+rsi]
+	cmovc	r9,r10
+	mov	r10,QWORD[24+rsi]
+
+	call	__ecp_nistz256_subx
+
+	lea	rbx,[160+rsp]
+	lea	rdi,[224+rsp]
+	call	__ecp_nistz256_sub_fromx
+
+	mov	rax,QWORD[((0+0))+rsp]
+	mov	rbp,QWORD[((0+8))+rsp]
+	mov	rcx,QWORD[((0+16))+rsp]
+	mov	r10,QWORD[((0+24))+rsp]
+	lea	rdi,[64+rsp]
+
+	call	__ecp_nistz256_subx
+
+	mov	QWORD[rdi],r12
+	mov	QWORD[8+rdi],r13
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+	mov	rdx,QWORD[352+rsp]
+	lea	rbx,[352+rsp]
+	mov	r9,QWORD[((0+160))+rsp]
+	mov	r10,QWORD[((8+160))+rsp]
+	lea	rsi,[((-128+160))+rsp]
+	mov	r11,QWORD[((16+160))+rsp]
+	mov	r12,QWORD[((24+160))+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	mov	rdx,QWORD[96+rsp]
+	lea	rbx,[96+rsp]
+	mov	r9,QWORD[((0+64))+rsp]
+	mov	r10,QWORD[((8+64))+rsp]
+	lea	rsi,[((-128+64))+rsp]
+	mov	r11,QWORD[((16+64))+rsp]
+	mov	r12,QWORD[((24+64))+rsp]
+	lea	rdi,[64+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	lea	rbx,[32+rsp]
+	lea	rdi,[256+rsp]
+	call	__ecp_nistz256_sub_fromx
+
+DB	102,72,15,126,199
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm1,xmm5
+	pandn	xmm0,XMMWORD[288+rsp]
+	movdqa	xmm2,xmm5
+	pandn	xmm1,XMMWORD[((288+16))+rsp]
+	movdqa	xmm3,xmm5
+	pand	xmm2,XMMWORD[$L$ONE_mont]
+	pand	xmm3,XMMWORD[(($L$ONE_mont+16))]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm4
+	pandn	xmm0,xmm2
+	movdqa	xmm2,xmm4
+	pandn	xmm1,xmm3
+	movdqa	xmm3,xmm4
+	pand	xmm2,XMMWORD[384+rsp]
+	pand	xmm3,XMMWORD[((384+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+	movdqu	XMMWORD[64+rdi],xmm2
+	movdqu	XMMWORD[80+rdi],xmm3
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm1,xmm5
+	pandn	xmm0,XMMWORD[224+rsp]
+	movdqa	xmm2,xmm5
+	pandn	xmm1,XMMWORD[((224+16))+rsp]
+	movdqa	xmm3,xmm5
+	pand	xmm2,XMMWORD[416+rsp]
+	pand	xmm3,XMMWORD[((416+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm4
+	pandn	xmm0,xmm2
+	movdqa	xmm2,xmm4
+	pandn	xmm1,xmm3
+	movdqa	xmm3,xmm4
+	pand	xmm2,XMMWORD[320+rsp]
+	pand	xmm3,XMMWORD[((320+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+	movdqu	XMMWORD[rdi],xmm2
+	movdqu	XMMWORD[16+rdi],xmm3
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm1,xmm5
+	pandn	xmm0,XMMWORD[256+rsp]
+	movdqa	xmm2,xmm5
+	pandn	xmm1,XMMWORD[((256+16))+rsp]
+	movdqa	xmm3,xmm5
+	pand	xmm2,XMMWORD[448+rsp]
+	pand	xmm3,XMMWORD[((448+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm4
+	pandn	xmm0,xmm2
+	movdqa	xmm2,xmm4
+	pandn	xmm1,xmm3
+	movdqa	xmm3,xmm4
+	pand	xmm2,XMMWORD[352+rsp]
+	pand	xmm3,XMMWORD[((352+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+	movdqu	XMMWORD[32+rdi],xmm2
+	movdqu	XMMWORD[48+rdi],xmm3
+
+	lea	rsi,[((480+56))+rsp]
+
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbx,QWORD[((-16))+rsi]
+
+	mov	rbp,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$add_affinex_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_ecp_nistz256_point_add_affinex:
+EXTERN	__imp_RtlVirtualUnwind
+
+
+ALIGN	16
+short_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[152+r8]
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	lea	rax,[16+rax]
+
+	mov	r12,QWORD[((-8))+rax]
+	mov	r13,QWORD[((-16))+rax]
+	mov	QWORD[216+r8],r12
+	mov	QWORD[224+r8],r13
+
+	jmp	NEAR $L$common_seh_tail
+
+
+
+ALIGN	16
+full_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[152+r8]
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	mov	r10d,DWORD[8+r11]
+	lea	rax,[r10*1+rax]
+
+	mov	rbp,QWORD[((-8))+rax]
+	mov	rbx,QWORD[((-16))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r14,QWORD[((-40))+rax]
+	mov	r15,QWORD[((-48))+rax]
+	mov	QWORD[144+r8],rbx
+	mov	QWORD[160+r8],rbp
+	mov	QWORD[216+r8],r12
+	mov	QWORD[224+r8],r13
+	mov	QWORD[232+r8],r14
+	mov	QWORD[240+r8],r15
+
+$L$common_seh_tail:
+	mov	rdi,QWORD[8+rax]
+	mov	rsi,QWORD[16+rax]
+	mov	QWORD[152+r8],rax
+	mov	QWORD[168+r8],rsi
+	mov	QWORD[176+r8],rdi
+
+	mov	rdi,QWORD[40+r9]
+	mov	rsi,r8
+	mov	ecx,154
+	DD	0xa548f3fc
+
+	mov	rsi,r9
+	xor	rcx,rcx
+	mov	rdx,QWORD[8+rsi]
+	mov	r8,QWORD[rsi]
+	mov	r9,QWORD[16+rsi]
+	mov	r10,QWORD[40+rsi]
+	lea	r11,[56+rsi]
+	lea	r12,[24+rsi]
+	mov	QWORD[32+rsp],r10
+	mov	QWORD[40+rsp],r11
+	mov	QWORD[48+rsp],r12
+	mov	QWORD[56+rsp],rcx
+	call	QWORD[__imp_RtlVirtualUnwind]
+
+	mov	eax,1
+	add	rsp,64
+	popfq
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rbp
+	pop	rbx
+	pop	rdi
+	pop	rsi
+	ret
+
+
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_ecp_nistz256_neg wrt ..imagebase
+	DD	$L$SEH_end_ecp_nistz256_neg wrt ..imagebase
+	DD	$L$SEH_info_ecp_nistz256_neg wrt ..imagebase
+
+	DD	$L$SEH_begin_ecp_nistz256_ord_mul_mont wrt ..imagebase
+	DD	$L$SEH_end_ecp_nistz256_ord_mul_mont wrt ..imagebase
+	DD	$L$SEH_info_ecp_nistz256_ord_mul_mont wrt ..imagebase
+
+	DD	$L$SEH_begin_ecp_nistz256_ord_sqr_mont wrt ..imagebase
+	DD	$L$SEH_end_ecp_nistz256_ord_sqr_mont wrt ..imagebase
+	DD	$L$SEH_info_ecp_nistz256_ord_sqr_mont wrt ..imagebase
+	DD	$L$SEH_begin_ecp_nistz256_ord_mul_montx wrt ..imagebase
+	DD	$L$SEH_end_ecp_nistz256_ord_mul_montx wrt ..imagebase
+	DD	$L$SEH_info_ecp_nistz256_ord_mul_montx wrt ..imagebase
+
+	DD	$L$SEH_begin_ecp_nistz256_ord_sqr_montx wrt ..imagebase
+	DD	$L$SEH_end_ecp_nistz256_ord_sqr_montx wrt ..imagebase
+	DD	$L$SEH_info_ecp_nistz256_ord_sqr_montx wrt ..imagebase
+	DD	$L$SEH_begin_ecp_nistz256_mul_mont wrt ..imagebase
+	DD	$L$SEH_end_ecp_nistz256_mul_mont wrt ..imagebase
+	DD	$L$SEH_info_ecp_nistz256_mul_mont wrt ..imagebase
+
+	DD	$L$SEH_begin_ecp_nistz256_sqr_mont wrt ..imagebase
+	DD	$L$SEH_end_ecp_nistz256_sqr_mont wrt ..imagebase
+	DD	$L$SEH_info_ecp_nistz256_sqr_mont wrt ..imagebase
+
+	DD	$L$SEH_begin_ecp_nistz256_select_w5 wrt ..imagebase
+	DD	$L$SEH_end_ecp_nistz256_select_w5 wrt ..imagebase
+	DD	$L$SEH_info_ecp_nistz256_select_wX wrt ..imagebase
+
+	DD	$L$SEH_begin_ecp_nistz256_select_w7 wrt ..imagebase
+	DD	$L$SEH_end_ecp_nistz256_select_w7 wrt ..imagebase
+	DD	$L$SEH_info_ecp_nistz256_select_wX wrt ..imagebase
+	DD	$L$SEH_begin_ecp_nistz256_avx2_select_w5 wrt ..imagebase
+	DD	$L$SEH_end_ecp_nistz256_avx2_select_w5 wrt ..imagebase
+	DD	$L$SEH_info_ecp_nistz256_avx2_select_wX wrt ..imagebase
+
+	DD	$L$SEH_begin_ecp_nistz256_avx2_select_w7 wrt ..imagebase
+	DD	$L$SEH_end_ecp_nistz256_avx2_select_w7 wrt ..imagebase
+	DD	$L$SEH_info_ecp_nistz256_avx2_select_wX wrt ..imagebase
+	DD	$L$SEH_begin_ecp_nistz256_point_double wrt ..imagebase
+	DD	$L$SEH_end_ecp_nistz256_point_double wrt ..imagebase
+	DD	$L$SEH_info_ecp_nistz256_point_double wrt ..imagebase
+
+	DD	$L$SEH_begin_ecp_nistz256_point_add wrt ..imagebase
+	DD	$L$SEH_end_ecp_nistz256_point_add wrt ..imagebase
+	DD	$L$SEH_info_ecp_nistz256_point_add wrt ..imagebase
+
+	DD	$L$SEH_begin_ecp_nistz256_point_add_affine wrt ..imagebase
+	DD	$L$SEH_end_ecp_nistz256_point_add_affine wrt ..imagebase
+	DD	$L$SEH_info_ecp_nistz256_point_add_affine wrt ..imagebase
+	DD	$L$SEH_begin_ecp_nistz256_point_doublex wrt ..imagebase
+	DD	$L$SEH_end_ecp_nistz256_point_doublex wrt ..imagebase
+	DD	$L$SEH_info_ecp_nistz256_point_doublex wrt ..imagebase
+
+	DD	$L$SEH_begin_ecp_nistz256_point_addx wrt ..imagebase
+	DD	$L$SEH_end_ecp_nistz256_point_addx wrt ..imagebase
+	DD	$L$SEH_info_ecp_nistz256_point_addx wrt ..imagebase
+
+	DD	$L$SEH_begin_ecp_nistz256_point_add_affinex wrt ..imagebase
+	DD	$L$SEH_end_ecp_nistz256_point_add_affinex wrt ..imagebase
+	DD	$L$SEH_info_ecp_nistz256_point_add_affinex wrt ..imagebase
+
+section	.xdata rdata align=8
+ALIGN	8
+$L$SEH_info_ecp_nistz256_neg:
+	DB	9,0,0,0
+	DD	short_handler wrt ..imagebase
+	DD	$L$neg_body wrt ..imagebase,$L$neg_epilogue wrt ..imagebase
+$L$SEH_info_ecp_nistz256_ord_mul_mont:
+	DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$ord_mul_body wrt ..imagebase,$L$ord_mul_epilogue wrt ..imagebase
+	DD	48,0
+$L$SEH_info_ecp_nistz256_ord_sqr_mont:
+	DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$ord_sqr_body wrt ..imagebase,$L$ord_sqr_epilogue wrt ..imagebase
+	DD	48,0
+$L$SEH_info_ecp_nistz256_ord_mul_montx:
+	DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$ord_mulx_body wrt ..imagebase,$L$ord_mulx_epilogue wrt ..imagebase
+	DD	48,0
+$L$SEH_info_ecp_nistz256_ord_sqr_montx:
+	DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$ord_sqrx_body wrt ..imagebase,$L$ord_sqrx_epilogue wrt ..imagebase
+	DD	48,0
+$L$SEH_info_ecp_nistz256_mul_mont:
+	DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase
+	DD	48,0
+$L$SEH_info_ecp_nistz256_sqr_mont:
+	DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$sqr_body wrt ..imagebase,$L$sqr_epilogue wrt ..imagebase
+	DD	48,0
+$L$SEH_info_ecp_nistz256_select_wX:
+	DB	0x01,0x33,0x16,0x00
+	DB	0x33,0xf8,0x09,0x00
+	DB	0x2e,0xe8,0x08,0x00
+	DB	0x29,0xd8,0x07,0x00
+	DB	0x24,0xc8,0x06,0x00
+	DB	0x1f,0xb8,0x05,0x00
+	DB	0x1a,0xa8,0x04,0x00
+	DB	0x15,0x98,0x03,0x00
+	DB	0x10,0x88,0x02,0x00
+	DB	0x0c,0x78,0x01,0x00
+	DB	0x08,0x68,0x00,0x00
+	DB	0x04,0x01,0x15,0x00
+ALIGN	8
+$L$SEH_info_ecp_nistz256_avx2_select_wX:
+	DB	0x01,0x36,0x17,0x0b
+	DB	0x36,0xf8,0x09,0x00
+	DB	0x31,0xe8,0x08,0x00
+	DB	0x2c,0xd8,0x07,0x00
+	DB	0x27,0xc8,0x06,0x00
+	DB	0x22,0xb8,0x05,0x00
+	DB	0x1d,0xa8,0x04,0x00
+	DB	0x18,0x98,0x03,0x00
+	DB	0x13,0x88,0x02,0x00
+	DB	0x0e,0x78,0x01,0x00
+	DB	0x09,0x68,0x00,0x00
+	DB	0x04,0x01,0x15,0x00
+	DB	0x00,0xb3,0x00,0x00
+ALIGN	8
+$L$SEH_info_ecp_nistz256_point_double:
+	DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$point_doubleq_body wrt ..imagebase,$L$point_doubleq_epilogue wrt ..imagebase
+	DD	32*5+56,0
+$L$SEH_info_ecp_nistz256_point_add:
+	DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$point_addq_body wrt ..imagebase,$L$point_addq_epilogue wrt ..imagebase
+	DD	32*18+56,0
+$L$SEH_info_ecp_nistz256_point_add_affine:
+	DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$add_affineq_body wrt ..imagebase,$L$add_affineq_epilogue wrt ..imagebase
+	DD	32*15+56,0
+ALIGN	8
+$L$SEH_info_ecp_nistz256_point_doublex:
+	DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$point_doublex_body wrt ..imagebase,$L$point_doublex_epilogue wrt ..imagebase
+	DD	32*5+56,0
+$L$SEH_info_ecp_nistz256_point_addx:
+	DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$point_addx_body wrt ..imagebase,$L$point_addx_epilogue wrt ..imagebase
+	DD	32*18+56,0
+$L$SEH_info_ecp_nistz256_point_add_affinex:
+	DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$add_affinex_body wrt ..imagebase,$L$add_affinex_epilogue wrt ..imagebase
+	DD	32*15+56,0
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/p256_beeu-armv8-asm-apple.S b/gen/bcm/p256_beeu-armv8-asm-apple.S
new file mode 100644
index 0000000..49ea9b8
--- /dev/null
+++ b/gen/bcm/p256_beeu-armv8-asm-apple.S
@@ -0,0 +1,309 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+#include "openssl/arm_arch.h"
+
+.text
+.globl	_beeu_mod_inverse_vartime
+.private_extern	_beeu_mod_inverse_vartime
+
+.align	4
+_beeu_mod_inverse_vartime:
+    // Reserve stack space for 14 8-byte registers in the first stp
+    // (which also stores x29 and x30), then store the remaining
+    // callee-saved registers.
+    //
+    //    | x29 | x30 | x19 | x20 | ... | x27 | x28 |  x0 |  x2 |
+    //    ^                                                     ^
+    //    sp  <------------------- 112 bytes ----------------> old sp
+    //   x29 (FP)
+    //
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-112]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	stp	x0,x2,[sp,#96]
+
+    // B = b3..b0 := a
+	ldp	x25,x26,[x1]
+	ldp	x27,x28,[x1,#16]
+
+    // n3..n0 := n
+    // Note: the values of the input parameters are changed below.
+	ldp	x0,x1,[x2]
+	ldp	x2,x30,[x2,#16]
+
+    // A = a3..a0 := n
+	mov	x21, x0
+	mov	x22, x1
+	mov	x23, x2
+	mov	x24, x30
+
+    // X = x4..x0 := 1
+	mov	x3, #1
+	eor	x4, x4, x4
+	eor	x5, x5, x5
+	eor	x6, x6, x6
+	eor	x7, x7, x7
+
+    // Y = y4..y0 := 0
+	eor	x8, x8, x8
+	eor	x9, x9, x9
+	eor	x10, x10, x10
+	eor	x11, x11, x11
+	eor	x12, x12, x12
+
+Lbeeu_loop:
+    // if B == 0, jump to Lbeeu_loop_end
+	orr	x14, x25, x26
+	orr	x14, x14, x27
+
+    // reverse the bit order of x25. This is needed for clz after this macro
+	rbit	x15, x25
+
+	orr	x14, x14, x28
+	cbz	x14,Lbeeu_loop_end
+
+
+    // 0 < B < |n|,
+    // 0 < A <= |n|,
+    // (1)      X*a  ==  B   (mod |n|),
+    // (2) (-1)*Y*a  ==  A   (mod |n|)
+
+    // Now divide B by the maximum possible power of two in the
+    // integers, and divide X by the same value mod |n|.
+    // When we're done, (1) still holds.
+
+    // shift := number of trailing 0s in x25
+    // (      = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO)
+	clz	x13, x15
+
+    // If there is no shift, goto shift_A_Y
+	cbz	x13, Lbeeu_shift_A_Y
+
+    // Shift B right by "x13" bits
+	neg	x14, x13
+	lsr	x25, x25, x13
+	lsl	x15, x26, x14
+
+	lsr	x26, x26, x13
+	lsl	x19, x27, x14
+
+	orr	x25, x25, x15
+
+	lsr	x27, x27, x13
+	lsl	x20, x28, x14
+
+	orr	x26, x26, x19
+
+	lsr	x28, x28, x13
+
+	orr	x27, x27, x20
+
+
+    // Shift X right by "x13" bits, adding n whenever X becomes odd.
+    // x13--;
+    // x14 := 0; needed in the addition to the most significant word in SHIFT1
+	eor	x14, x14, x14
+Lbeeu_shift_loop_X:
+	tbz	x3, #0, Lshift1_0
+	adds	x3, x3, x0
+	adcs	x4, x4, x1
+	adcs	x5, x5, x2
+	adcs	x6, x6, x30
+	adc	x7, x7, x14
+Lshift1_0:
+    // var0 := [var1|var0]<64..1>;
+    // i.e. concatenate var1 and var0,
+    //      extract bits <64..1> from the resulting 128-bit value
+    //      and put them in var0
+	extr	x3, x4, x3, #1
+	extr	x4, x5, x4, #1
+	extr	x5, x6, x5, #1
+	extr	x6, x7, x6, #1
+	lsr	x7, x7, #1
+
+	subs	x13, x13, #1
+	bne	Lbeeu_shift_loop_X
+
+    // Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl
+    // with the following differences:
+    // - "x13" is set directly to the number of trailing 0s in B
+    //   (using rbit and clz instructions)
+    // - The loop is only used to call SHIFT1(X)
+    //   and x13 is decreased while executing the X loop.
+    // - SHIFT256(B, x13) is performed before right-shifting X; they are independent
+
+Lbeeu_shift_A_Y:
+    // Same for A and Y.
+    // Afterwards, (2) still holds.
+    // Reverse the bit order of x21
+    // x13 := number of trailing 0s in x21 (= number of leading 0s in x15)
+	rbit	x15, x21
+	clz	x13, x15
+
+    // If there is no shift, goto |B-A|, X+Y update
+	cbz	x13, Lbeeu_update_B_X_or_A_Y
+
+    // Shift A right by "x13" bits
+	neg	x14, x13
+	lsr	x21, x21, x13
+	lsl	x15, x22, x14
+
+	lsr	x22, x22, x13
+	lsl	x19, x23, x14
+
+	orr	x21, x21, x15
+
+	lsr	x23, x23, x13
+	lsl	x20, x24, x14
+
+	orr	x22, x22, x19
+
+	lsr	x24, x24, x13
+
+	orr	x23, x23, x20
+
+
+    // Shift Y right by "x13" bits, adding n whenever Y becomes odd.
+    // x13--;
+    // x14 := 0; needed in the addition to the most significant word in SHIFT1
+	eor	x14, x14, x14
+Lbeeu_shift_loop_Y:
+	tbz	x8, #0, Lshift1_1
+	adds	x8, x8, x0
+	adcs	x9, x9, x1
+	adcs	x10, x10, x2
+	adcs	x11, x11, x30
+	adc	x12, x12, x14
+Lshift1_1:
+    // var0 := [var1|var0]<64..1>;
+    // i.e. concatenate var1 and var0,
+    //      extract bits <64..1> from the resulting 128-bit value
+    //      and put them in var0
+	extr	x8, x9, x8, #1
+	extr	x9, x10, x9, #1
+	extr	x10, x11, x10, #1
+	extr	x11, x12, x11, #1
+	lsr	x12, x12, #1
+
+	subs	x13, x13, #1
+	bne	Lbeeu_shift_loop_Y
+
+Lbeeu_update_B_X_or_A_Y:
+    // Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow)
+    // Note: this is unsigned arithmetic; T fits in 4 64-bit words with no
+    //       room for a sign bit, so the absence of a carry (i.e. a borrow)
+    //       indicates a negative result. See, for example,
+    //       https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes
+	subs	x14, x25, x21
+	sbcs	x15, x26, x22
+	sbcs	x19, x27, x23
+	sbcs	x20, x28, x24
+	bcs	Lbeeu_B_greater_than_A
+
+    // Else A > B =>
+    // A := A - B; Y := Y + X; goto beginning of the loop
+	subs	x21, x21, x25
+	sbcs	x22, x22, x26
+	sbcs	x23, x23, x27
+	sbcs	x24, x24, x28
+
+	adds	x8, x8, x3
+	adcs	x9, x9, x4
+	adcs	x10, x10, x5
+	adcs	x11, x11, x6
+	adc	x12, x12, x7
+	b	Lbeeu_loop
+
+Lbeeu_B_greater_than_A:
+    // Continue with B > A =>
+    // B := B - A; X := X + Y; goto beginning of the loop
+	mov	x25, x14
+	mov	x26, x15
+	mov	x27, x19
+	mov	x28, x20
+
+	adds	x3, x3, x8
+	adcs	x4, x4, x9
+	adcs	x5, x5, x10
+	adcs	x6, x6, x11
+	adc	x7, x7, x12
+	b	Lbeeu_loop
+
+Lbeeu_loop_end:
+    // The Euclidean algorithm loop ends when A == gcd(a,n);
+    // this is 1 when a and n are co-prime (i.e. share no common factor).
+    // Since (-1)*Y*a == A (mod |n|) and Y > 0,
+    // the output is out = -Y mod n.
+
+    // Verify that A = 1 ==> (-1)*Y*a = A = 1  (mod |n|)
+    // Is A-1 == 0?
+    // If not, fail.
+	sub	x14, x21, #1
+	orr	x14, x14, x22
+	orr	x14, x14, x23
+	orr	x14, x14, x24
+	cbnz	x14, Lbeeu_err
+
+    // If Y>n ==> Y:=Y-n
+Lbeeu_reduction_loop:
+    // x_i := y_i - n_i (X is no longer needed, use it as temp)
+    // (x14 = 0 from above)
+	subs	x3, x8, x0
+	sbcs	x4, x9, x1
+	sbcs	x5, x10, x2
+	sbcs	x6, x11, x30
+	sbcs	x7, x12, x14
+
+    // If result is non-negative (i.e., cs = carry set = no borrow),
+    // y_i := x_i; goto reduce again
+    // else
+    // y_i := y_i; continue
+	csel	x8, x3, x8, cs
+	csel	x9, x4, x9, cs
+	csel	x10, x5, x10, cs
+	csel	x11, x6, x11, cs
+	csel	x12, x7, x12, cs
+	bcs	Lbeeu_reduction_loop
+
+    // Now Y < n (Y cannot be equal to n, since the inverse cannot be 0)
+    // out = -Y = n-Y
+	subs	x8, x0, x8
+	sbcs	x9, x1, x9
+	sbcs	x10, x2, x10
+	sbcs	x11, x30, x11
+
+    // Save Y in output (out (x0) was saved on the stack)
+	ldr	x3, [sp,#96]
+	stp	x8, x9, [x3]
+	stp	x10, x11, [x3,#16]
+    // return 1 (success)
+	mov	x0, #1
+	b	Lbeeu_finish
+
+Lbeeu_err:
+    // return 0 (error)
+	eor	x0, x0, x0
+
+Lbeeu_finish:
+    // Restore callee-saved registers, except x0, x2
+	add	sp,x29,#0
+	ldp	x19,x20,[sp,#16]
+	ldp	x21,x22,[sp,#32]
+	ldp	x23,x24,[sp,#48]
+	ldp	x25,x26,[sp,#64]
+	ldp	x27,x28,[sp,#80]
+	ldp	x29,x30,[sp],#112
+
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
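
The comment blocks in the file above describe the binary extended Euclidean
algorithm that all of the p256_beeu variants implement. As a register-free
reference, here is a minimal Python sketch of the same computation. This is an
illustration only, not a file in the tree; Python's arbitrary-precision
integers stand in for the 4- and 5-limb values the assembly keeps in
registers.

```python
# Illustrative sketch of beeu_mod_inverse_vartime (not from the tree).
# Computes a^-1 mod n for odd n, or None when gcd(a, n) != 1.
def beeu_mod_inverse_vartime(a, n):
    # Loop invariants, as in the assembly comments:
    #   (1)      X*a ==  B (mod |n|)
    #   (2) (-1)*Y*a ==  A (mod |n|)
    A, B, X, Y = n, a, 1, 0
    while B != 0:
        # Divide B by its maximal power of two and halve X mod n the same
        # number of times, adding n whenever X is odd, so (1) keeps holding.
        while B & 1 == 0:
            B >>= 1
            if X & 1:
                X += n
            X >>= 1
        # Same for A and Y, preserving (2).
        while A & 1 == 0:
            A >>= 1
            if Y & 1:
                Y += n
            Y >>= 1
        if B >= A:
            B, X = B - A, X + Y
        else:
            A, Y = A - B, Y + X
    if A != 1:
        return None  # a and n are not co-prime; no inverse exists.
    return -Y % n  # (-1)*Y*a == A == 1 (mod n), so a^-1 == -Y mod n.
```

For example, `beeu_mod_inverse_vartime(2, 7)` returns `4`, and indeed
`2 * 4 == 8 == 1 (mod 7)`.
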
diff --git a/gen/bcm/p256_beeu-armv8-asm-linux.S b/gen/bcm/p256_beeu-armv8-asm-linux.S
new file mode 100644
index 0000000..8e09b61
--- /dev/null
+++ b/gen/bcm/p256_beeu-armv8-asm-linux.S
@@ -0,0 +1,309 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+#include "openssl/arm_arch.h"
+
+.text
+.globl	beeu_mod_inverse_vartime
+.hidden	beeu_mod_inverse_vartime
+.type	beeu_mod_inverse_vartime, %function
+.align	4
+beeu_mod_inverse_vartime:
+    // Reserve stack space for 14 8-byte registers in the first stp
+    // (which also stores x29 and x30), then store the remaining
+    // callee-saved registers.
+    //
+    //    | x29 | x30 | x19 | x20 | ... | x27 | x28 |  x0 |  x2 |
+    //    ^                                                     ^
+    //    sp  <------------------- 112 bytes ----------------> old sp
+    //   x29 (FP)
+    //
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-112]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	stp	x0,x2,[sp,#96]
+
+    // B = b3..b0 := a
+	ldp	x25,x26,[x1]
+	ldp	x27,x28,[x1,#16]
+
+    // n3..n0 := n
+    // Note: the values of the input parameters are changed below.
+	ldp	x0,x1,[x2]
+	ldp	x2,x30,[x2,#16]
+
+    // A = a3..a0 := n
+	mov	x21, x0
+	mov	x22, x1
+	mov	x23, x2
+	mov	x24, x30
+
+    // X = x4..x0 := 1
+	mov	x3, #1
+	eor	x4, x4, x4
+	eor	x5, x5, x5
+	eor	x6, x6, x6
+	eor	x7, x7, x7
+
+    // Y = y4..y0 := 0
+	eor	x8, x8, x8
+	eor	x9, x9, x9
+	eor	x10, x10, x10
+	eor	x11, x11, x11
+	eor	x12, x12, x12
+
+.Lbeeu_loop:
+    // if B == 0, jump to .Lbeeu_loop_end
+	orr	x14, x25, x26
+	orr	x14, x14, x27
+
+    // reverse the bit order of x25. This is needed for clz after this macro
+	rbit	x15, x25
+
+	orr	x14, x14, x28
+	cbz	x14,.Lbeeu_loop_end
+
+
+    // 0 < B < |n|,
+    // 0 < A <= |n|,
+    // (1)      X*a  ==  B   (mod |n|),
+    // (2) (-1)*Y*a  ==  A   (mod |n|)
+
+    // Now divide B by the maximum possible power of two in the
+    // integers, and divide X by the same value mod |n|.
+    // When we're done, (1) still holds.
+
+    // shift := number of trailing 0s in x25
+    // (      = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO)
+	clz	x13, x15
+
+    // If there is no shift, goto shift_A_Y
+	cbz	x13, .Lbeeu_shift_A_Y
+
+    // Shift B right by "x13" bits
+	neg	x14, x13
+	lsr	x25, x25, x13
+	lsl	x15, x26, x14
+
+	lsr	x26, x26, x13
+	lsl	x19, x27, x14
+
+	orr	x25, x25, x15
+
+	lsr	x27, x27, x13
+	lsl	x20, x28, x14
+
+	orr	x26, x26, x19
+
+	lsr	x28, x28, x13
+
+	orr	x27, x27, x20
+
+
+    // Shift X right by "x13" bits, adding n whenever X becomes odd.
+    // x13--;
+    // x14 := 0; needed in the addition to the most significant word in SHIFT1
+	eor	x14, x14, x14
+.Lbeeu_shift_loop_X:
+	tbz	x3, #0, .Lshift1_0
+	adds	x3, x3, x0
+	adcs	x4, x4, x1
+	adcs	x5, x5, x2
+	adcs	x6, x6, x30
+	adc	x7, x7, x14
+.Lshift1_0:
+    // var0 := [var1|var0]<64..1>;
+    // i.e. concatenate var1 and var0,
+    //      extract bits <64..1> from the resulting 128-bit value
+    //      and put them in var0
+	extr	x3, x4, x3, #1
+	extr	x4, x5, x4, #1
+	extr	x5, x6, x5, #1
+	extr	x6, x7, x6, #1
+	lsr	x7, x7, #1
+
+	subs	x13, x13, #1
+	bne	.Lbeeu_shift_loop_X
+
+    // Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl
+    // with the following differences:
+    // - "x13" is set directly to the number of trailing 0s in B
+    //   (using rbit and clz instructions)
+    // - The loop is only used to call SHIFT1(X)
+    //   and x13 is decreased while executing the X loop.
+    // - SHIFT256(B, x13) is performed before right-shifting X; they are independent
+
+.Lbeeu_shift_A_Y:
+    // Same for A and Y.
+    // Afterwards, (2) still holds.
+    // Reverse the bit order of x21
+    // x13 := number of trailing 0s in x21 (= number of leading 0s in x15)
+	rbit	x15, x21
+	clz	x13, x15
+
+    // If there is no shift, goto |B-A|, X+Y update
+	cbz	x13, .Lbeeu_update_B_X_or_A_Y
+
+    // Shift A right by "x13" bits
+	neg	x14, x13
+	lsr	x21, x21, x13
+	lsl	x15, x22, x14
+
+	lsr	x22, x22, x13
+	lsl	x19, x23, x14
+
+	orr	x21, x21, x15
+
+	lsr	x23, x23, x13
+	lsl	x20, x24, x14
+
+	orr	x22, x22, x19
+
+	lsr	x24, x24, x13
+
+	orr	x23, x23, x20
+
+
+    // Shift Y right by "x13" bits, adding n whenever Y becomes odd.
+    // x13--;
+    // x14 := 0; needed in the addition to the most significant word in SHIFT1
+	eor	x14, x14, x14
+.Lbeeu_shift_loop_Y:
+	tbz	x8, #0, .Lshift1_1
+	adds	x8, x8, x0
+	adcs	x9, x9, x1
+	adcs	x10, x10, x2
+	adcs	x11, x11, x30
+	adc	x12, x12, x14
+.Lshift1_1:
+    // var0 := [var1|var0]<64..1>;
+    // i.e. concatenate var1 and var0,
+    //      extract bits <64..1> from the resulting 128-bit value
+    //      and put them in var0
+	extr	x8, x9, x8, #1
+	extr	x9, x10, x9, #1
+	extr	x10, x11, x10, #1
+	extr	x11, x12, x11, #1
+	lsr	x12, x12, #1
+
+	subs	x13, x13, #1
+	bne	.Lbeeu_shift_loop_Y
+
+.Lbeeu_update_B_X_or_A_Y:
+    // Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow)
+    // Note: this is unsigned arithmetic; T fits in 4 64-bit words with no
+    //       room for a sign bit, so the absence of a carry (i.e. a borrow)
+    //       indicates a negative result. See, for example,
+    //       https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes
+	subs	x14, x25, x21
+	sbcs	x15, x26, x22
+	sbcs	x19, x27, x23
+	sbcs	x20, x28, x24
+	bcs	.Lbeeu_B_greater_than_A
+
+    // Else A > B =>
+    // A := A - B; Y := Y + X; goto beginning of the loop
+	subs	x21, x21, x25
+	sbcs	x22, x22, x26
+	sbcs	x23, x23, x27
+	sbcs	x24, x24, x28
+
+	adds	x8, x8, x3
+	adcs	x9, x9, x4
+	adcs	x10, x10, x5
+	adcs	x11, x11, x6
+	adc	x12, x12, x7
+	b	.Lbeeu_loop
+
+.Lbeeu_B_greater_than_A:
+    // Continue with B > A =>
+    // B := B - A; X := X + Y; goto beginning of the loop
+	mov	x25, x14
+	mov	x26, x15
+	mov	x27, x19
+	mov	x28, x20
+
+	adds	x3, x3, x8
+	adcs	x4, x4, x9
+	adcs	x5, x5, x10
+	adcs	x6, x6, x11
+	adc	x7, x7, x12
+	b	.Lbeeu_loop
+
+.Lbeeu_loop_end:
+    // The Euclidean algorithm loop ends when A == gcd(a,n);
+    // this is 1 when a and n are co-prime (i.e. share no common factor).
+    // Since (-1)*Y*a == A (mod |n|) and Y > 0,
+    // the output is out = -Y mod n.
+
+    // Verify that A = 1 ==> (-1)*Y*a = A = 1  (mod |n|)
+    // Is A-1 == 0?
+    // If not, fail.
+	sub	x14, x21, #1
+	orr	x14, x14, x22
+	orr	x14, x14, x23
+	orr	x14, x14, x24
+	cbnz	x14, .Lbeeu_err
+
+    // If Y>n ==> Y:=Y-n
+.Lbeeu_reduction_loop:
+    // x_i := y_i - n_i (X is no longer needed, use it as temp)
+    // (x14 = 0 from above)
+	subs	x3, x8, x0
+	sbcs	x4, x9, x1
+	sbcs	x5, x10, x2
+	sbcs	x6, x11, x30
+	sbcs	x7, x12, x14
+
+    // If result is non-negative (i.e., cs = carry set = no borrow),
+    // y_i := x_i; goto reduce again
+    // else
+    // y_i := y_i; continue
+	csel	x8, x3, x8, cs
+	csel	x9, x4, x9, cs
+	csel	x10, x5, x10, cs
+	csel	x11, x6, x11, cs
+	csel	x12, x7, x12, cs
+	bcs	.Lbeeu_reduction_loop
+
+    // Now Y < n (Y cannot be equal to n, since the inverse cannot be 0)
+    // out = -Y = n-Y
+	subs	x8, x0, x8
+	sbcs	x9, x1, x9
+	sbcs	x10, x2, x10
+	sbcs	x11, x30, x11
+
+    // Save Y in output (out (x0) was saved on the stack)
+	ldr	x3, [sp,#96]
+	stp	x8, x9, [x3]
+	stp	x10, x11, [x3,#16]
+    // return 1 (success)
+	mov	x0, #1
+	b	.Lbeeu_finish
+
+.Lbeeu_err:
+    // return 0 (error)
+	eor	x0, x0, x0
+
+.Lbeeu_finish:
+    // Restore callee-saved registers, except x0, x2
+	add	sp,x29,#0
+	ldp	x19,x20,[sp,#16]
+	ldp	x21,x22,[sp,#32]
+	ldp	x23,x24,[sp,#48]
+	ldp	x25,x26,[sp,#64]
+	ldp	x27,x28,[sp,#80]
+	ldp	x29,x30,[sp],#112
+
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	beeu_mod_inverse_vartime,.-beeu_mod_inverse_vartime
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/bcm/p256_beeu-armv8-asm-win.S b/gen/bcm/p256_beeu-armv8-asm-win.S
new file mode 100644
index 0000000..ac6eb17
--- /dev/null
+++ b/gen/bcm/p256_beeu-armv8-asm-win.S
@@ -0,0 +1,309 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+#include "openssl/arm_arch.h"
+
+.text
+.globl	beeu_mod_inverse_vartime
+
+
+.align	4
+beeu_mod_inverse_vartime:
+    // Reserve stack space for 14 8-byte registers in the first stp
+    // (which also stores x29 and x30), then store the remaining
+    // callee-saved registers.
+    //
+    //    | x29 | x30 | x19 | x20 | ... | x27 | x28 |  x0 |  x2 |
+    //    ^                                                     ^
+    //    sp  <------------------- 112 bytes ----------------> old sp
+    //   x29 (FP)
+    //
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-112]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	stp	x0,x2,[sp,#96]
+
+    // B = b3..b0 := a
+	ldp	x25,x26,[x1]
+	ldp	x27,x28,[x1,#16]
+
+    // n3..n0 := n
+    // Note: the values of the input parameters are changed below.
+	ldp	x0,x1,[x2]
+	ldp	x2,x30,[x2,#16]
+
+    // A = a3..a0 := n
+	mov	x21, x0
+	mov	x22, x1
+	mov	x23, x2
+	mov	x24, x30
+
+    // X = x4..x0 := 1
+	mov	x3, #1
+	eor	x4, x4, x4
+	eor	x5, x5, x5
+	eor	x6, x6, x6
+	eor	x7, x7, x7
+
+    // Y = y4..y0 := 0
+	eor	x8, x8, x8
+	eor	x9, x9, x9
+	eor	x10, x10, x10
+	eor	x11, x11, x11
+	eor	x12, x12, x12
+
+Lbeeu_loop:
+    // if B == 0, jump to Lbeeu_loop_end
+	orr	x14, x25, x26
+	orr	x14, x14, x27
+
+    // reverse the bit order of x25. This is needed for clz after this macro
+	rbit	x15, x25
+
+	orr	x14, x14, x28
+	cbz	x14,Lbeeu_loop_end
+
+
+    // 0 < B < |n|,
+    // 0 < A <= |n|,
+    // (1)      X*a  ==  B   (mod |n|),
+    // (2) (-1)*Y*a  ==  A   (mod |n|)
+
+    // Now divide B by the maximum possible power of two in the
+    // integers, and divide X by the same value mod |n|.
+    // When we're done, (1) still holds.
+
+    // shift := number of trailing 0s in x25
+    // (      = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO)
+	clz	x13, x15
+
+    // If there is no shift, goto shift_A_Y
+	cbz	x13, Lbeeu_shift_A_Y
+
+    // Shift B right by "x13" bits
+	neg	x14, x13
+	lsr	x25, x25, x13
+	lsl	x15, x26, x14
+
+	lsr	x26, x26, x13
+	lsl	x19, x27, x14
+
+	orr	x25, x25, x15
+
+	lsr	x27, x27, x13
+	lsl	x20, x28, x14
+
+	orr	x26, x26, x19
+
+	lsr	x28, x28, x13
+
+	orr	x27, x27, x20
+
+
+    // Shift X right by "x13" bits, adding n whenever X becomes odd.
+    // x13--;
+    // x14 := 0; needed in the addition to the most significant word in SHIFT1
+	eor	x14, x14, x14
+Lbeeu_shift_loop_X:
+	tbz	x3, #0, Lshift1_0
+	adds	x3, x3, x0
+	adcs	x4, x4, x1
+	adcs	x5, x5, x2
+	adcs	x6, x6, x30
+	adc	x7, x7, x14
+Lshift1_0:
+    // var0 := [var1|var0]<64..1>;
+    // i.e. concatenate var1 and var0,
+    //      extract bits <64..1> from the resulting 128-bit value
+    //      and put them in var0
+	extr	x3, x4, x3, #1
+	extr	x4, x5, x4, #1
+	extr	x5, x6, x5, #1
+	extr	x6, x7, x6, #1
+	lsr	x7, x7, #1
+
+	subs	x13, x13, #1
+	bne	Lbeeu_shift_loop_X
+
+    // Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl
+    // with the following differences:
+    // - "x13" is set directly to the number of trailing 0s in B
+    //   (using rbit and clz instructions)
+    // - The loop is only used to call SHIFT1(X)
+    //   and x13 is decreased while executing the X loop.
+    // - SHIFT256(B, x13) is performed before right-shifting X; they are independent
+
+Lbeeu_shift_A_Y:
+    // Same for A and Y.
+    // Afterwards, (2) still holds.
+    // Reverse the bit order of x21
+    // x13 := number of trailing 0s in x21 (= number of leading 0s in x15)
+	rbit	x15, x21
+	clz	x13, x15
+
+    // If there is no shift, goto |B-A|, X+Y update
+	cbz	x13, Lbeeu_update_B_X_or_A_Y
+
+    // Shift A right by "x13" bits
+	neg	x14, x13
+	lsr	x21, x21, x13
+	lsl	x15, x22, x14
+
+	lsr	x22, x22, x13
+	lsl	x19, x23, x14
+
+	orr	x21, x21, x15
+
+	lsr	x23, x23, x13
+	lsl	x20, x24, x14
+
+	orr	x22, x22, x19
+
+	lsr	x24, x24, x13
+
+	orr	x23, x23, x20
+
+
+    // Shift Y right by "x13" bits, adding n whenever Y becomes odd.
+    // x13--;
+    // x14 := 0; needed in the addition to the most significant word in SHIFT1
+	eor	x14, x14, x14
+Lbeeu_shift_loop_Y:
+	tbz	x8, #0, Lshift1_1
+	adds	x8, x8, x0
+	adcs	x9, x9, x1
+	adcs	x10, x10, x2
+	adcs	x11, x11, x30
+	adc	x12, x12, x14
+Lshift1_1:
+    // var0 := [var1|var0]<64..1>;
+    // i.e. concatenate var1 and var0,
+    //      extract bits <64..1> from the resulting 128-bit value
+    //      and put them in var0
+	extr	x8, x9, x8, #1
+	extr	x9, x10, x9, #1
+	extr	x10, x11, x10, #1
+	extr	x11, x12, x11, #1
+	lsr	x12, x12, #1
+
+	subs	x13, x13, #1
+	bne	Lbeeu_shift_loop_Y
+
+Lbeeu_update_B_X_or_A_Y:
+    // Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow)
+    // Note: this is unsigned arithmetic; T fits in 4 64-bit words with no
+    //       room for a sign bit, so the absence of a carry (i.e. a borrow)
+    //       indicates a negative result. See, for example,
+    //       https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes
+	subs	x14, x25, x21
+	sbcs	x15, x26, x22
+	sbcs	x19, x27, x23
+	sbcs	x20, x28, x24
+	bcs	Lbeeu_B_greater_than_A
+
+    // Else A > B =>
+    // A := A - B; Y := Y + X; goto beginning of the loop
+	subs	x21, x21, x25
+	sbcs	x22, x22, x26
+	sbcs	x23, x23, x27
+	sbcs	x24, x24, x28
+
+	adds	x8, x8, x3
+	adcs	x9, x9, x4
+	adcs	x10, x10, x5
+	adcs	x11, x11, x6
+	adc	x12, x12, x7
+	b	Lbeeu_loop
+
+Lbeeu_B_greater_than_A:
+    // Continue with B > A =>
+    // B := B - A; X := X + Y; goto beginning of the loop
+	mov	x25, x14
+	mov	x26, x15
+	mov	x27, x19
+	mov	x28, x20
+
+	adds	x3, x3, x8
+	adcs	x4, x4, x9
+	adcs	x5, x5, x10
+	adcs	x6, x6, x11
+	adc	x7, x7, x12
+	b	Lbeeu_loop
+
+Lbeeu_loop_end:
+    // The Euclidean algorithm loop ends when A == gcd(a,n);
+    // this is 1 when a and n are co-prime (i.e. share no common factor).
+    // Since (-1)*Y*a == A (mod |n|) and Y > 0,
+    // the output is out = -Y mod n.
+
+    // Verify that A = 1 ==> (-1)*Y*a = A = 1  (mod |n|)
+    // Is A-1 == 0?
+    // If not, fail.
+	sub	x14, x21, #1
+	orr	x14, x14, x22
+	orr	x14, x14, x23
+	orr	x14, x14, x24
+	cbnz	x14, Lbeeu_err
+
+    // If Y>n ==> Y:=Y-n
+Lbeeu_reduction_loop:
+    // x_i := y_i - n_i (X is no longer needed, use it as temp)
+    // (x14 = 0 from above)
+	subs	x3, x8, x0
+	sbcs	x4, x9, x1
+	sbcs	x5, x10, x2
+	sbcs	x6, x11, x30
+	sbcs	x7, x12, x14
+
+    // If result is non-negative (i.e., cs = carry set = no borrow),
+    // y_i := x_i; goto reduce again
+    // else
+    // y_i := y_i; continue
+	csel	x8, x3, x8, cs
+	csel	x9, x4, x9, cs
+	csel	x10, x5, x10, cs
+	csel	x11, x6, x11, cs
+	csel	x12, x7, x12, cs
+	bcs	Lbeeu_reduction_loop
+
+    // Now Y < n (Y cannot be equal to n, since the inverse cannot be 0)
+    // out = -Y = n-Y
+	subs	x8, x0, x8
+	sbcs	x9, x1, x9
+	sbcs	x10, x2, x10
+	sbcs	x11, x30, x11
+
+    // Save Y in output (out (x0) was saved on the stack)
+	ldr	x3, [sp,#96]
+	stp	x8, x9, [x3]
+	stp	x10, x11, [x3,#16]
+    // return 1 (success)
+	mov	x0, #1
+	b	Lbeeu_finish
+
+Lbeeu_err:
+    // return 0 (error)
+	eor	x0, x0, x0
+
+Lbeeu_finish:
+    // Restore callee-saved registers, except x0, x2
+	add	sp,x29,#0
+	ldp	x19,x20,[sp,#16]
+	ldp	x21,x22,[sp,#32]
+	ldp	x23,x24,[sp,#48]
+	ldp	x25,x26,[sp,#64]
+	ldp	x27,x28,[sp,#80]
+	ldp	x29,x30,[sp],#112
+
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/bcm/p256_beeu-x86_64-asm-apple.S b/gen/bcm/p256_beeu-x86_64-asm-apple.S
new file mode 100644
index 0000000..fc6552c
--- /dev/null
+++ b/gen/bcm/p256_beeu-x86_64-asm-apple.S
@@ -0,0 +1,322 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text	
+
+
+.globl	_beeu_mod_inverse_vartime
+.private_extern _beeu_mod_inverse_vartime
+.p2align	5
+_beeu_mod_inverse_vartime:
+
+_CET_ENDBR
+	pushq	%rbp
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+	pushq	%rbx
+
+	pushq	%rsi
+
+
+	subq	$80,%rsp
+
+	movq	%rdi,0(%rsp)
+
+
+	movq	$1,%r8
+	xorq	%r9,%r9
+	xorq	%r10,%r10
+	xorq	%r11,%r11
+	xorq	%rdi,%rdi
+
+	xorq	%r12,%r12
+	xorq	%r13,%r13
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+	xorq	%rbp,%rbp
+
+
+	vmovdqu	0(%rsi),%xmm0
+	vmovdqu	16(%rsi),%xmm1
+	vmovdqu	%xmm0,48(%rsp)
+	vmovdqu	%xmm1,64(%rsp)
+
+	vmovdqu	0(%rdx),%xmm0
+	vmovdqu	16(%rdx),%xmm1
+	vmovdqu	%xmm0,16(%rsp)
+	vmovdqu	%xmm1,32(%rsp)
+
+L$beeu_loop:
+	xorq	%rbx,%rbx
+	orq	48(%rsp),%rbx
+	orq	56(%rsp),%rbx
+	orq	64(%rsp),%rbx
+	orq	72(%rsp),%rbx
+	jz	L$beeu_loop_end
+
+
+
+
+
+
+
+
+
+
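+// Find the number of trailing zero bits in B (at most 27 per pass) by
+// doubling a one-bit mask in rcx until it meets a set bit of B. X is
+// halved mod n once per cleared bit, preserving X*a == B (mod |n|).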
+	movq	$1,%rcx
+
+
+L$beeu_shift_loop_XB:
+	movq	%rcx,%rbx
+	andq	48(%rsp),%rbx
+	jnz	L$beeu_shift_loop_end_XB
+
+
+	movq	$1,%rbx
+	andq	%r8,%rbx
+	jz	L$shift1_0
+	addq	0(%rdx),%r8
+	adcq	8(%rdx),%r9
+	adcq	16(%rdx),%r10
+	adcq	24(%rdx),%r11
+	adcq	$0,%rdi
+
+L$shift1_0:
+	shrdq	$1,%r9,%r8
+	shrdq	$1,%r10,%r9
+	shrdq	$1,%r11,%r10
+	shrdq	$1,%rdi,%r11
+	shrq	$1,%rdi
+
+	shlq	$1,%rcx
+
+
+
+
+
+	cmpq	$0x8000000,%rcx
+	jne	L$beeu_shift_loop_XB
+
+L$beeu_shift_loop_end_XB:
+	bsfq	%rcx,%rcx
+	testq	%rcx,%rcx
+	jz	L$beeu_no_shift_XB
+
+
+
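+// Shift B right by cl bits across its four 64-bit limbs: each shrd refills
+// a limb's vacated top bits from the next-higher limb, and the top limb
+// shifts in zeros.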
+	movq	8+48(%rsp),%rax
+	movq	16+48(%rsp),%rbx
+	movq	24+48(%rsp),%rsi
+
+	shrdq	%cl,%rax,0+48(%rsp)
+	shrdq	%cl,%rbx,8+48(%rsp)
+	shrdq	%cl,%rsi,16+48(%rsp)
+
+	shrq	%cl,%rsi
+	movq	%rsi,24+48(%rsp)
+
+
+L$beeu_no_shift_XB:
+
+	movq	$1,%rcx
+
+
+L$beeu_shift_loop_YA:
+	movq	%rcx,%rbx
+	andq	16(%rsp),%rbx
+	jnz	L$beeu_shift_loop_end_YA
+
+
+	movq	$1,%rbx
+	andq	%r12,%rbx
+	jz	L$shift1_1
+	addq	0(%rdx),%r12
+	adcq	8(%rdx),%r13
+	adcq	16(%rdx),%r14
+	adcq	24(%rdx),%r15
+	adcq	$0,%rbp
+
+L$shift1_1:
+	shrdq	$1,%r13,%r12
+	shrdq	$1,%r14,%r13
+	shrdq	$1,%r15,%r14
+	shrdq	$1,%rbp,%r15
+	shrq	$1,%rbp
+
+	shlq	$1,%rcx
+
+
+
+
+
+	cmpq	$0x8000000,%rcx
+	jne	L$beeu_shift_loop_YA
+
+L$beeu_shift_loop_end_YA:
+	bsfq	%rcx,%rcx
+	testq	%rcx,%rcx
+	jz	L$beeu_no_shift_YA
+
+
+
+	movq	8+16(%rsp),%rax
+	movq	16+16(%rsp),%rbx
+	movq	24+16(%rsp),%rsi
+
+	shrdq	%cl,%rax,0+16(%rsp)
+	shrdq	%cl,%rbx,8+16(%rsp)
+	shrdq	%cl,%rsi,16+16(%rsp)
+
+	shrq	%cl,%rsi
+	movq	%rsi,24+16(%rsp)
+
+
+L$beeu_no_shift_YA:
+
+	movq	48(%rsp),%rax
+	movq	56(%rsp),%rbx
+	movq	64(%rsp),%rsi
+	movq	72(%rsp),%rcx
+	subq	16(%rsp),%rax
+	sbbq	24(%rsp),%rbx
+	sbbq	32(%rsp),%rsi
+	sbbq	40(%rsp),%rcx
+	jnc	L$beeu_B_bigger_than_A
+
+
+	movq	16(%rsp),%rax
+	movq	24(%rsp),%rbx
+	movq	32(%rsp),%rsi
+	movq	40(%rsp),%rcx
+	subq	48(%rsp),%rax
+	sbbq	56(%rsp),%rbx
+	sbbq	64(%rsp),%rsi
+	sbbq	72(%rsp),%rcx
+	movq	%rax,16(%rsp)
+	movq	%rbx,24(%rsp)
+	movq	%rsi,32(%rsp)
+	movq	%rcx,40(%rsp)
+
+
+	addq	%r8,%r12
+	adcq	%r9,%r13
+	adcq	%r10,%r14
+	adcq	%r11,%r15
+	adcq	%rdi,%rbp
+	jmp	L$beeu_loop
+
+L$beeu_B_bigger_than_A:
+
+	movq	%rax,48(%rsp)
+	movq	%rbx,56(%rsp)
+	movq	%rsi,64(%rsp)
+	movq	%rcx,72(%rsp)
+
+
+	addq	%r12,%r8
+	adcq	%r13,%r9
+	adcq	%r14,%r10
+	adcq	%r15,%r11
+	adcq	%rbp,%rdi
+
+	jmp	L$beeu_loop
+
+L$beeu_loop_end:
+
+
+
+
+	movq	16(%rsp),%rbx
+	subq	$1,%rbx
+	orq	24(%rsp),%rbx
+	orq	32(%rsp),%rbx
+	orq	40(%rsp),%rbx
+
+	jnz	L$beeu_err
+
+
+
+
+	movq	0(%rdx),%r8
+	movq	8(%rdx),%r9
+	movq	16(%rdx),%r10
+	movq	24(%rdx),%r11
+	xorq	%rdi,%rdi
+
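+// Reduce Y mod n: keep subtracting n until the subtraction borrows,
+// retaining the last non-negative value; the final subtraction after the
+// loop then gives out = n - Y = -Y mod n.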
+L$beeu_reduction_loop:
+	movq	%r12,16(%rsp)
+	movq	%r13,24(%rsp)
+	movq	%r14,32(%rsp)
+	movq	%r15,40(%rsp)
+	movq	%rbp,48(%rsp)
+
+
+	subq	%r8,%r12
+	sbbq	%r9,%r13
+	sbbq	%r10,%r14
+	sbbq	%r11,%r15
+	sbbq	$0,%rbp
+
+
+	cmovcq	16(%rsp),%r12
+	cmovcq	24(%rsp),%r13
+	cmovcq	32(%rsp),%r14
+	cmovcq	40(%rsp),%r15
+	jnc	L$beeu_reduction_loop
+
+
+	subq	%r12,%r8
+	sbbq	%r13,%r9
+	sbbq	%r14,%r10
+	sbbq	%r15,%r11
+
+L$beeu_save:
+
+	movq	0(%rsp),%rdi
+
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+
+
+	movq	$1,%rax
+	jmp	L$beeu_finish
+
+L$beeu_err:
+
+	xorq	%rax,%rax
+
+L$beeu_finish:
+	addq	$80,%rsp
+
+	popq	%rsi
+
+	popq	%rbx
+
+	popq	%r15
+
+	popq	%r14
+
+	popq	%r13
+
+	popq	%r12
+
+	popq	%rbp
+
+	ret
+
+
+
+#endif
diff --git a/gen/bcm/p256_beeu-x86_64-asm-linux.S b/gen/bcm/p256_beeu-x86_64-asm-linux.S
new file mode 100644
index 0000000..40ae58b
--- /dev/null
+++ b/gen/bcm/p256_beeu-x86_64-asm-linux.S
@@ -0,0 +1,336 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text	
+
+.type	beeu_mod_inverse_vartime,@function
+.globl	beeu_mod_inverse_vartime
+.hidden beeu_mod_inverse_vartime
+.align	32
+beeu_mod_inverse_vartime:
+.cfi_startproc	
+_CET_ENDBR
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	rbp,-16
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	r12,-24
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	r13,-32
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	r14,-40
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	r15,-48
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	rbx,-56
+	pushq	%rsi
+.cfi_adjust_cfa_offset	8
+.cfi_offset	rsi,-64
+
+	subq	$80,%rsp
+.cfi_adjust_cfa_offset	80
+	movq	%rdi,0(%rsp)
+
+
+	movq	$1,%r8
+	xorq	%r9,%r9
+	xorq	%r10,%r10
+	xorq	%r11,%r11
+	xorq	%rdi,%rdi
+
+	xorq	%r12,%r12
+	xorq	%r13,%r13
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+	xorq	%rbp,%rbp
+
+
+	vmovdqu	0(%rsi),%xmm0
+	vmovdqu	16(%rsi),%xmm1
+	vmovdqu	%xmm0,48(%rsp)
+	vmovdqu	%xmm1,64(%rsp)
+
+	vmovdqu	0(%rdx),%xmm0
+	vmovdqu	16(%rdx),%xmm1
+	vmovdqu	%xmm0,16(%rsp)
+	vmovdqu	%xmm1,32(%rsp)
+
+.Lbeeu_loop:
+	xorq	%rbx,%rbx
+	orq	48(%rsp),%rbx
+	orq	56(%rsp),%rbx
+	orq	64(%rsp),%rbx
+	orq	72(%rsp),%rbx
+	jz	.Lbeeu_loop_end
+
+
+
+
+
+
+
+
+
+
+	movq	$1,%rcx
+
+
+.Lbeeu_shift_loop_XB:
+	movq	%rcx,%rbx
+	andq	48(%rsp),%rbx
+	jnz	.Lbeeu_shift_loop_end_XB
+
+
+	movq	$1,%rbx
+	andq	%r8,%rbx
+	jz	.Lshift1_0
+	addq	0(%rdx),%r8
+	adcq	8(%rdx),%r9
+	adcq	16(%rdx),%r10
+	adcq	24(%rdx),%r11
+	adcq	$0,%rdi
+
+.Lshift1_0:
+	shrdq	$1,%r9,%r8
+	shrdq	$1,%r10,%r9
+	shrdq	$1,%r11,%r10
+	shrdq	$1,%rdi,%r11
+	shrq	$1,%rdi
+
+	shlq	$1,%rcx
+
+
+
+
+
+	cmpq	$0x8000000,%rcx
+	jne	.Lbeeu_shift_loop_XB
+
+.Lbeeu_shift_loop_end_XB:
+	bsfq	%rcx,%rcx
+	testq	%rcx,%rcx
+	jz	.Lbeeu_no_shift_XB
+
+
+
+	movq	8+48(%rsp),%rax
+	movq	16+48(%rsp),%rbx
+	movq	24+48(%rsp),%rsi
+
+	shrdq	%cl,%rax,0+48(%rsp)
+	shrdq	%cl,%rbx,8+48(%rsp)
+	shrdq	%cl,%rsi,16+48(%rsp)
+
+	shrq	%cl,%rsi
+	movq	%rsi,24+48(%rsp)
+
+
+.Lbeeu_no_shift_XB:
+
+	movq	$1,%rcx
+
+
+.Lbeeu_shift_loop_YA:
+	movq	%rcx,%rbx
+	andq	16(%rsp),%rbx
+	jnz	.Lbeeu_shift_loop_end_YA
+
+
+	movq	$1,%rbx
+	andq	%r12,%rbx
+	jz	.Lshift1_1
+	addq	0(%rdx),%r12
+	adcq	8(%rdx),%r13
+	adcq	16(%rdx),%r14
+	adcq	24(%rdx),%r15
+	adcq	$0,%rbp
+
+.Lshift1_1:
+	shrdq	$1,%r13,%r12
+	shrdq	$1,%r14,%r13
+	shrdq	$1,%r15,%r14
+	shrdq	$1,%rbp,%r15
+	shrq	$1,%rbp
+
+	shlq	$1,%rcx
+
+
+
+
+
+	cmpq	$0x8000000,%rcx
+	jne	.Lbeeu_shift_loop_YA
+
+.Lbeeu_shift_loop_end_YA:
+	bsfq	%rcx,%rcx
+	testq	%rcx,%rcx
+	jz	.Lbeeu_no_shift_YA
+
+
+
+	movq	8+16(%rsp),%rax
+	movq	16+16(%rsp),%rbx
+	movq	24+16(%rsp),%rsi
+
+	shrdq	%cl,%rax,0+16(%rsp)
+	shrdq	%cl,%rbx,8+16(%rsp)
+	shrdq	%cl,%rsi,16+16(%rsp)
+
+	shrq	%cl,%rsi
+	movq	%rsi,24+16(%rsp)
+
+
+.Lbeeu_no_shift_YA:
+
+	movq	48(%rsp),%rax
+	movq	56(%rsp),%rbx
+	movq	64(%rsp),%rsi
+	movq	72(%rsp),%rcx
+	subq	16(%rsp),%rax
+	sbbq	24(%rsp),%rbx
+	sbbq	32(%rsp),%rsi
+	sbbq	40(%rsp),%rcx
+	jnc	.Lbeeu_B_bigger_than_A
+
+
+	movq	16(%rsp),%rax
+	movq	24(%rsp),%rbx
+	movq	32(%rsp),%rsi
+	movq	40(%rsp),%rcx
+	subq	48(%rsp),%rax
+	sbbq	56(%rsp),%rbx
+	sbbq	64(%rsp),%rsi
+	sbbq	72(%rsp),%rcx
+	movq	%rax,16(%rsp)
+	movq	%rbx,24(%rsp)
+	movq	%rsi,32(%rsp)
+	movq	%rcx,40(%rsp)
+
+
+	addq	%r8,%r12
+	adcq	%r9,%r13
+	adcq	%r10,%r14
+	adcq	%r11,%r15
+	adcq	%rdi,%rbp
+	jmp	.Lbeeu_loop
+
+.Lbeeu_B_bigger_than_A:
+
+	movq	%rax,48(%rsp)
+	movq	%rbx,56(%rsp)
+	movq	%rsi,64(%rsp)
+	movq	%rcx,72(%rsp)
+
+
+	addq	%r12,%r8
+	adcq	%r13,%r9
+	adcq	%r14,%r10
+	adcq	%r15,%r11
+	adcq	%rbp,%rdi
+
+	jmp	.Lbeeu_loop
+
+.Lbeeu_loop_end:
+
+
+
+
+	movq	16(%rsp),%rbx
+	subq	$1,%rbx
+	orq	24(%rsp),%rbx
+	orq	32(%rsp),%rbx
+	orq	40(%rsp),%rbx
+
+	jnz	.Lbeeu_err
+
+
+
+
+	movq	0(%rdx),%r8
+	movq	8(%rdx),%r9
+	movq	16(%rdx),%r10
+	movq	24(%rdx),%r11
+	xorq	%rdi,%rdi
+
+.Lbeeu_reduction_loop:
+	movq	%r12,16(%rsp)
+	movq	%r13,24(%rsp)
+	movq	%r14,32(%rsp)
+	movq	%r15,40(%rsp)
+	movq	%rbp,48(%rsp)
+
+
+	subq	%r8,%r12
+	sbbq	%r9,%r13
+	sbbq	%r10,%r14
+	sbbq	%r11,%r15
+	sbbq	$0,%rbp
+
+
+	cmovcq	16(%rsp),%r12
+	cmovcq	24(%rsp),%r13
+	cmovcq	32(%rsp),%r14
+	cmovcq	40(%rsp),%r15
+	jnc	.Lbeeu_reduction_loop
+
+
+	subq	%r12,%r8
+	sbbq	%r13,%r9
+	sbbq	%r14,%r10
+	sbbq	%r15,%r11
+
+.Lbeeu_save:
+
+	movq	0(%rsp),%rdi
+
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+
+
+	movq	$1,%rax
+	jmp	.Lbeeu_finish
+
+.Lbeeu_err:
+
+	xorq	%rax,%rax
+
+.Lbeeu_finish:
+	addq	$80,%rsp
+.cfi_adjust_cfa_offset	-80
+	popq	%rsi
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	rsi
+	popq	%rbx
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	rbx
+	popq	%r15
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	r15
+	popq	%r14
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	r14
+	popq	%r13
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	r13
+	popq	%r12
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	r12
+	popq	%rbp
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	rbp
+	ret
+.cfi_endproc	
+
+.size	beeu_mod_inverse_vartime, .-beeu_mod_inverse_vartime
+#endif
diff --git a/gen/bcm/p256_beeu-x86_64-asm-win.asm b/gen/bcm/p256_beeu-x86_64-asm-win.asm
new file mode 100644
index 0000000..7c7da68
--- /dev/null
+++ b/gen/bcm/p256_beeu-x86_64-asm-win.asm
@@ -0,0 +1,346 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section	.text code align=64
+
+
+
+
+global	beeu_mod_inverse_vartime
+ALIGN	32
+beeu_mod_inverse_vartime:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_beeu_mod_inverse_vartime:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+_CET_ENDBR
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	push	rbx
+
+	push	rsi
+
+
+	sub	rsp,80
+
+	mov	QWORD[rsp],rdi
+
+
+	mov	r8,1
+	xor	r9,r9
+	xor	r10,r10
+	xor	r11,r11
+	xor	rdi,rdi
+
+	xor	r12,r12
+	xor	r13,r13
+	xor	r14,r14
+	xor	r15,r15
+	xor	rbp,rbp
+
+
+	vmovdqu	xmm0,XMMWORD[rsi]
+	vmovdqu	xmm1,XMMWORD[16+rsi]
+	vmovdqu	XMMWORD[48+rsp],xmm0
+	vmovdqu	XMMWORD[64+rsp],xmm1
+
+	vmovdqu	xmm0,XMMWORD[rdx]
+	vmovdqu	xmm1,XMMWORD[16+rdx]
+	vmovdqu	XMMWORD[16+rsp],xmm0
+	vmovdqu	XMMWORD[32+rsp],xmm1
+
+$L$beeu_loop:
+	xor	rbx,rbx
+	or	rbx,QWORD[48+rsp]
+	or	rbx,QWORD[56+rsp]
+	or	rbx,QWORD[64+rsp]
+	or	rbx,QWORD[72+rsp]
+	jz	NEAR $L$beeu_loop_end
+
+
+
+
+
+
+
+
+
+
+	mov	rcx,1
+
+
+$L$beeu_shift_loop_XB:
+	mov	rbx,rcx
+	and	rbx,QWORD[48+rsp]
+	jnz	NEAR $L$beeu_shift_loop_end_XB
+
+
+	mov	rbx,1
+	and	rbx,r8
+	jz	NEAR $L$shift1_0
+	add	r8,QWORD[rdx]
+	adc	r9,QWORD[8+rdx]
+	adc	r10,QWORD[16+rdx]
+	adc	r11,QWORD[24+rdx]
+	adc	rdi,0
+
+$L$shift1_0:
+	shrd	r8,r9,1
+	shrd	r9,r10,1
+	shrd	r10,r11,1
+	shrd	r11,rdi,1
+	shr	rdi,1
+
+	shl	rcx,1
+
+
+
+
+
+	cmp	rcx,0x8000000
+	jne	NEAR $L$beeu_shift_loop_XB
+
+$L$beeu_shift_loop_end_XB:
+	bsf	rcx,rcx
+	test	rcx,rcx
+	jz	NEAR $L$beeu_no_shift_XB
+
+
+
+	mov	rax,QWORD[((8+48))+rsp]
+	mov	rbx,QWORD[((16+48))+rsp]
+	mov	rsi,QWORD[((24+48))+rsp]
+
+	shrd	QWORD[((0+48))+rsp],rax,cl
+	shrd	QWORD[((8+48))+rsp],rbx,cl
+	shrd	QWORD[((16+48))+rsp],rsi,cl
+
+	shr	rsi,cl
+	mov	QWORD[((24+48))+rsp],rsi
+
+
+$L$beeu_no_shift_XB:
+
+	mov	rcx,1
+
+
+$L$beeu_shift_loop_YA:
+	mov	rbx,rcx
+	and	rbx,QWORD[16+rsp]
+	jnz	NEAR $L$beeu_shift_loop_end_YA
+
+
+	mov	rbx,1
+	and	rbx,r12
+	jz	NEAR $L$shift1_1
+	add	r12,QWORD[rdx]
+	adc	r13,QWORD[8+rdx]
+	adc	r14,QWORD[16+rdx]
+	adc	r15,QWORD[24+rdx]
+	adc	rbp,0
+
+$L$shift1_1:
+	shrd	r12,r13,1
+	shrd	r13,r14,1
+	shrd	r14,r15,1
+	shrd	r15,rbp,1
+	shr	rbp,1
+
+	shl	rcx,1
+
+
+
+
+
+	cmp	rcx,0x8000000
+	jne	NEAR $L$beeu_shift_loop_YA
+
+$L$beeu_shift_loop_end_YA:
+	bsf	rcx,rcx
+	test	rcx,rcx
+	jz	NEAR $L$beeu_no_shift_YA
+
+
+
+	mov	rax,QWORD[((8+16))+rsp]
+	mov	rbx,QWORD[((16+16))+rsp]
+	mov	rsi,QWORD[((24+16))+rsp]
+
+	shrd	QWORD[((0+16))+rsp],rax,cl
+	shrd	QWORD[((8+16))+rsp],rbx,cl
+	shrd	QWORD[((16+16))+rsp],rsi,cl
+
+	shr	rsi,cl
+	mov	QWORD[((24+16))+rsp],rsi
+
+
+$L$beeu_no_shift_YA:
+
+	mov	rax,QWORD[48+rsp]
+	mov	rbx,QWORD[56+rsp]
+	mov	rsi,QWORD[64+rsp]
+	mov	rcx,QWORD[72+rsp]
+	sub	rax,QWORD[16+rsp]
+	sbb	rbx,QWORD[24+rsp]
+	sbb	rsi,QWORD[32+rsp]
+	sbb	rcx,QWORD[40+rsp]
+	jnc	NEAR $L$beeu_B_bigger_than_A
+
+
+	mov	rax,QWORD[16+rsp]
+	mov	rbx,QWORD[24+rsp]
+	mov	rsi,QWORD[32+rsp]
+	mov	rcx,QWORD[40+rsp]
+	sub	rax,QWORD[48+rsp]
+	sbb	rbx,QWORD[56+rsp]
+	sbb	rsi,QWORD[64+rsp]
+	sbb	rcx,QWORD[72+rsp]
+	mov	QWORD[16+rsp],rax
+	mov	QWORD[24+rsp],rbx
+	mov	QWORD[32+rsp],rsi
+	mov	QWORD[40+rsp],rcx
+
+
+	add	r12,r8
+	adc	r13,r9
+	adc	r14,r10
+	adc	r15,r11
+	adc	rbp,rdi
+	jmp	NEAR $L$beeu_loop
+
+$L$beeu_B_bigger_than_A:
+
+	mov	QWORD[48+rsp],rax
+	mov	QWORD[56+rsp],rbx
+	mov	QWORD[64+rsp],rsi
+	mov	QWORD[72+rsp],rcx
+
+
+	add	r8,r12
+	adc	r9,r13
+	adc	r10,r14
+	adc	r11,r15
+	adc	rdi,rbp
+
+	jmp	NEAR $L$beeu_loop
+
+$L$beeu_loop_end:
+
+
+
+
+	mov	rbx,QWORD[16+rsp]
+	sub	rbx,1
+	or	rbx,QWORD[24+rsp]
+	or	rbx,QWORD[32+rsp]
+	or	rbx,QWORD[40+rsp]
+
+	jnz	NEAR $L$beeu_err
+
+
+
+
+	mov	r8,QWORD[rdx]
+	mov	r9,QWORD[8+rdx]
+	mov	r10,QWORD[16+rdx]
+	mov	r11,QWORD[24+rdx]
+	xor	rdi,rdi
+
+$L$beeu_reduction_loop:
+	mov	QWORD[16+rsp],r12
+	mov	QWORD[24+rsp],r13
+	mov	QWORD[32+rsp],r14
+	mov	QWORD[40+rsp],r15
+	mov	QWORD[48+rsp],rbp
+
+
+	sub	r12,r8
+	sbb	r13,r9
+	sbb	r14,r10
+	sbb	r15,r11
+	sbb	rbp,0
+
+
+	cmovc	r12,QWORD[16+rsp]
+	cmovc	r13,QWORD[24+rsp]
+	cmovc	r14,QWORD[32+rsp]
+	cmovc	r15,QWORD[40+rsp]
+	jnc	NEAR $L$beeu_reduction_loop
+
+
+	sub	r8,r12
+	sbb	r9,r13
+	sbb	r10,r14
+	sbb	r11,r15
+
+$L$beeu_save:
+
+	mov	rdi,QWORD[rsp]
+
+	mov	QWORD[rdi],r8
+	mov	QWORD[8+rdi],r9
+	mov	QWORD[16+rdi],r10
+	mov	QWORD[24+rdi],r11
+
+
+	mov	rax,1
+	jmp	NEAR $L$beeu_finish
+
+$L$beeu_err:
+
+	xor	rax,rax
+
+$L$beeu_finish:
+	add	rsp,80
+
+	pop	rsi
+
+	pop	rbx
+
+	pop	r15
+
+	pop	r14
+
+	pop	r13
+
+	pop	r12
+
+	pop	rbp
+
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+
+$L$SEH_end_beeu_mod_inverse_vartime:
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/rdrand-x86_64-apple.S b/gen/bcm/rdrand-x86_64-apple.S
new file mode 100644
index 0000000..5fdf105
--- /dev/null
+++ b/gen/bcm/rdrand-x86_64-apple.S
@@ -0,0 +1,57 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text	
+
+
+
+
+.globl	_CRYPTO_rdrand
+.private_extern _CRYPTO_rdrand
+
+.p2align	4
+_CRYPTO_rdrand:
+
+_CET_ENDBR
+	xorq	%rax,%rax
+.byte	72,15,199,242
+
+	adcq	%rax,%rax
+	movq	%rdx,0(%rdi)
+	ret
+
+
+
+
+
+
+
+.globl	_CRYPTO_rdrand_multiple8_buf
+.private_extern _CRYPTO_rdrand_multiple8_buf
+
+.p2align	4
+_CRYPTO_rdrand_multiple8_buf:
+
+_CET_ENDBR
+	testq	%rsi,%rsi
+	jz	L$out
+	movq	$8,%rdx
+L$loop:
+.byte	72,15,199,241
+	jnc	L$err
+	movq	%rcx,0(%rdi)
+	addq	%rdx,%rdi
+	subq	%rdx,%rsi
+	jnz	L$loop
+L$out:
+	movq	$1,%rax
+	ret
+L$err:
+	xorq	%rax,%rax
+	ret
+
+
+#endif
diff --git a/gen/bcm/rdrand-x86_64-linux.S b/gen/bcm/rdrand-x86_64-linux.S
new file mode 100644
index 0000000..fe81dac
--- /dev/null
+++ b/gen/bcm/rdrand-x86_64-linux.S
@@ -0,0 +1,57 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text	
+
+
+
+
+.globl	CRYPTO_rdrand
+.hidden CRYPTO_rdrand
+.type	CRYPTO_rdrand,@function
+.align	16
+CRYPTO_rdrand:
+.cfi_startproc	
+_CET_ENDBR
+	xorq	%rax,%rax
+.byte	72,15,199,242
+
+	adcq	%rax,%rax
+	movq	%rdx,0(%rdi)
+	ret
+.cfi_endproc	
+.size	CRYPTO_rdrand,.-CRYPTO_rdrand
+
+
+
+
+
+.globl	CRYPTO_rdrand_multiple8_buf
+.hidden CRYPTO_rdrand_multiple8_buf
+.type	CRYPTO_rdrand_multiple8_buf,@function
+.align	16
+CRYPTO_rdrand_multiple8_buf:
+.cfi_startproc	
+_CET_ENDBR
+	testq	%rsi,%rsi
+	jz	.Lout
+	movq	$8,%rdx
+.Lloop:
+.byte	72,15,199,241
+	jnc	.Lerr
+	movq	%rcx,0(%rdi)
+	addq	%rdx,%rdi
+	subq	%rdx,%rsi
+	jnz	.Lloop
+.Lout:
+	movq	$1,%rax
+	ret
+.Lerr:
+	xorq	%rax,%rax
+	ret
+.cfi_endproc	
+.size	CRYPTO_rdrand_multiple8_buf,.-CRYPTO_rdrand_multiple8_buf
+#endif
diff --git a/gen/bcm/rdrand-x86_64-win.asm b/gen/bcm/rdrand-x86_64-win.asm
new file mode 100644
index 0000000..aae3d76
--- /dev/null
+++ b/gen/bcm/rdrand-x86_64-win.asm
@@ -0,0 +1,66 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section	.text code align=64
+
+
+
+
+
+global	CRYPTO_rdrand
+
+ALIGN	16
+CRYPTO_rdrand:
+
+_CET_ENDBR
+	xor	rax,rax
+DB	73,15,199,240
+
+	adc	rax,rax
+	mov	QWORD[rcx],r8
+	ret
+
+
+
+
+
+
+
+global	CRYPTO_rdrand_multiple8_buf
+
+ALIGN	16
+CRYPTO_rdrand_multiple8_buf:
+
+_CET_ENDBR
+	test	rdx,rdx
+	jz	NEAR $L$out
+	mov	r8,8
+$L$loop:
+DB	73,15,199,241
+	jnc	NEAR $L$err
+	mov	QWORD[rcx],r9
+	add	rcx,r8
+	sub	rdx,r8
+	jnz	NEAR $L$loop
+$L$out:
+	mov	rax,1
+	ret
+$L$err:
+	xor	rax,rax
+	ret
+
+
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/rsaz-avx2-apple.S b/gen/bcm/rsaz-avx2-apple.S
new file mode 100644
index 0000000..3672309
--- /dev/null
+++ b/gen/bcm/rsaz-avx2-apple.S
@@ -0,0 +1,1749 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text	
+
+.globl	_rsaz_1024_sqr_avx2
+.private_extern _rsaz_1024_sqr_avx2
+
+.p2align	6
+_rsaz_1024_sqr_avx2:
+
+_CET_ENDBR
+	leaq	(%rsp),%rax
+
+	pushq	%rbx
+
+	pushq	%rbp
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+	vzeroupper
+	movq	%rax,%rbp
+
+	movq	%rdx,%r13
+	subq	$832,%rsp
+	movq	%r13,%r15
+	subq	$-128,%rdi
+	subq	$-128,%rsi
+	subq	$-128,%r13
+
+	andq	$4095,%r15
+	addq	$320,%r15
+	shrq	$12,%r15
+	vpxor	%ymm9,%ymm9,%ymm9
+	jz	L$sqr_1024_no_n_copy
+
+
+
+
+
+	subq	$320,%rsp
+	vmovdqu	0-128(%r13),%ymm0
+	andq	$-2048,%rsp
+	vmovdqu	32-128(%r13),%ymm1
+	vmovdqu	64-128(%r13),%ymm2
+	vmovdqu	96-128(%r13),%ymm3
+	vmovdqu	128-128(%r13),%ymm4
+	vmovdqu	160-128(%r13),%ymm5
+	vmovdqu	192-128(%r13),%ymm6
+	vmovdqu	224-128(%r13),%ymm7
+	vmovdqu	256-128(%r13),%ymm8
+	leaq	832+128(%rsp),%r13
+	vmovdqu	%ymm0,0-128(%r13)
+	vmovdqu	%ymm1,32-128(%r13)
+	vmovdqu	%ymm2,64-128(%r13)
+	vmovdqu	%ymm3,96-128(%r13)
+	vmovdqu	%ymm4,128-128(%r13)
+	vmovdqu	%ymm5,160-128(%r13)
+	vmovdqu	%ymm6,192-128(%r13)
+	vmovdqu	%ymm7,224-128(%r13)
+	vmovdqu	%ymm8,256-128(%r13)
+	vmovdqu	%ymm9,288-128(%r13)
+
+L$sqr_1024_no_n_copy:
+	andq	$-1024,%rsp
+
+	vmovdqu	32-128(%rsi),%ymm1
+	vmovdqu	64-128(%rsi),%ymm2
+	vmovdqu	96-128(%rsi),%ymm3
+	vmovdqu	128-128(%rsi),%ymm4
+	vmovdqu	160-128(%rsi),%ymm5
+	vmovdqu	192-128(%rsi),%ymm6
+	vmovdqu	224-128(%rsi),%ymm7
+	vmovdqu	256-128(%rsi),%ymm8
+
+	leaq	192(%rsp),%rbx
+	vmovdqu	L$and_mask(%rip),%ymm15
+	jmp	L$OOP_GRANDE_SQR_1024
+
+.p2align	5
+L$OOP_GRANDE_SQR_1024:
+	leaq	576+128(%rsp),%r9
+	leaq	448(%rsp),%r12
+
+
+
+
+	vpaddq	%ymm1,%ymm1,%ymm1
+	vpbroadcastq	0-128(%rsi),%ymm10
+	vpaddq	%ymm2,%ymm2,%ymm2
+	vmovdqa	%ymm1,0-128(%r9)
+	vpaddq	%ymm3,%ymm3,%ymm3
+	vmovdqa	%ymm2,32-128(%r9)
+	vpaddq	%ymm4,%ymm4,%ymm4
+	vmovdqa	%ymm3,64-128(%r9)
+	vpaddq	%ymm5,%ymm5,%ymm5
+	vmovdqa	%ymm4,96-128(%r9)
+	vpaddq	%ymm6,%ymm6,%ymm6
+	vmovdqa	%ymm5,128-128(%r9)
+	vpaddq	%ymm7,%ymm7,%ymm7
+	vmovdqa	%ymm6,160-128(%r9)
+	vpaddq	%ymm8,%ymm8,%ymm8
+	vmovdqa	%ymm7,192-128(%r9)
+	vpxor	%ymm9,%ymm9,%ymm9
+	vmovdqa	%ymm8,224-128(%r9)
+
+	vpmuludq	0-128(%rsi),%ymm10,%ymm0
+	vpbroadcastq	32-128(%rsi),%ymm11
+	vmovdqu	%ymm9,288-192(%rbx)
+	vpmuludq	%ymm10,%ymm1,%ymm1
+	vmovdqu	%ymm9,320-448(%r12)
+	vpmuludq	%ymm10,%ymm2,%ymm2
+	vmovdqu	%ymm9,352-448(%r12)
+	vpmuludq	%ymm10,%ymm3,%ymm3
+	vmovdqu	%ymm9,384-448(%r12)
+	vpmuludq	%ymm10,%ymm4,%ymm4
+	vmovdqu	%ymm9,416-448(%r12)
+	vpmuludq	%ymm10,%ymm5,%ymm5
+	vmovdqu	%ymm9,448-448(%r12)
+	vpmuludq	%ymm10,%ymm6,%ymm6
+	vmovdqu	%ymm9,480-448(%r12)
+	vpmuludq	%ymm10,%ymm7,%ymm7
+	vmovdqu	%ymm9,512-448(%r12)
+	vpmuludq	%ymm10,%ymm8,%ymm8
+	vpbroadcastq	64-128(%rsi),%ymm10
+	vmovdqu	%ymm9,544-448(%r12)
+
+	movq	%rsi,%r15
+	movl	$4,%r14d
+	jmp	L$sqr_entry_1024
+.p2align	5
+L$OOP_SQR_1024:
+	vpbroadcastq	32-128(%r15),%ymm11
+	vpmuludq	0-128(%rsi),%ymm10,%ymm0
+	vpaddq	0-192(%rbx),%ymm0,%ymm0
+	vpmuludq	0-128(%r9),%ymm10,%ymm1
+	vpaddq	32-192(%rbx),%ymm1,%ymm1
+	vpmuludq	32-128(%r9),%ymm10,%ymm2
+	vpaddq	64-192(%rbx),%ymm2,%ymm2
+	vpmuludq	64-128(%r9),%ymm10,%ymm3
+	vpaddq	96-192(%rbx),%ymm3,%ymm3
+	vpmuludq	96-128(%r9),%ymm10,%ymm4
+	vpaddq	128-192(%rbx),%ymm4,%ymm4
+	vpmuludq	128-128(%r9),%ymm10,%ymm5
+	vpaddq	160-192(%rbx),%ymm5,%ymm5
+	vpmuludq	160-128(%r9),%ymm10,%ymm6
+	vpaddq	192-192(%rbx),%ymm6,%ymm6
+	vpmuludq	192-128(%r9),%ymm10,%ymm7
+	vpaddq	224-192(%rbx),%ymm7,%ymm7
+	vpmuludq	224-128(%r9),%ymm10,%ymm8
+	vpbroadcastq	64-128(%r15),%ymm10
+	vpaddq	256-192(%rbx),%ymm8,%ymm8
+L$sqr_entry_1024:
+	vmovdqu	%ymm0,0-192(%rbx)
+	vmovdqu	%ymm1,32-192(%rbx)
+
+	vpmuludq	32-128(%rsi),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	32-128(%r9),%ymm11,%ymm14
+	vpaddq	%ymm14,%ymm3,%ymm3
+	vpmuludq	64-128(%r9),%ymm11,%ymm13
+	vpaddq	%ymm13,%ymm4,%ymm4
+	vpmuludq	96-128(%r9),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	128-128(%r9),%ymm11,%ymm14
+	vpaddq	%ymm14,%ymm6,%ymm6
+	vpmuludq	160-128(%r9),%ymm11,%ymm13
+	vpaddq	%ymm13,%ymm7,%ymm7
+	vpmuludq	192-128(%r9),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	224-128(%r9),%ymm11,%ymm0
+	vpbroadcastq	96-128(%r15),%ymm11
+	vpaddq	288-192(%rbx),%ymm0,%ymm0
+
+	vmovdqu	%ymm2,64-192(%rbx)
+	vmovdqu	%ymm3,96-192(%rbx)
+
+	vpmuludq	64-128(%rsi),%ymm10,%ymm13
+	vpaddq	%ymm13,%ymm4,%ymm4
+	vpmuludq	64-128(%r9),%ymm10,%ymm12
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	96-128(%r9),%ymm10,%ymm14
+	vpaddq	%ymm14,%ymm6,%ymm6
+	vpmuludq	128-128(%r9),%ymm10,%ymm13
+	vpaddq	%ymm13,%ymm7,%ymm7
+	vpmuludq	160-128(%r9),%ymm10,%ymm12
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	192-128(%r9),%ymm10,%ymm14
+	vpaddq	%ymm14,%ymm0,%ymm0
+	vpmuludq	224-128(%r9),%ymm10,%ymm1
+	vpbroadcastq	128-128(%r15),%ymm10
+	vpaddq	320-448(%r12),%ymm1,%ymm1
+
+	vmovdqu	%ymm4,128-192(%rbx)
+	vmovdqu	%ymm5,160-192(%rbx)
+
+	vpmuludq	96-128(%rsi),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm6,%ymm6
+	vpmuludq	96-128(%r9),%ymm11,%ymm14
+	vpaddq	%ymm14,%ymm7,%ymm7
+	vpmuludq	128-128(%r9),%ymm11,%ymm13
+	vpaddq	%ymm13,%ymm8,%ymm8
+	vpmuludq	160-128(%r9),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm0,%ymm0
+	vpmuludq	192-128(%r9),%ymm11,%ymm14
+	vpaddq	%ymm14,%ymm1,%ymm1
+	vpmuludq	224-128(%r9),%ymm11,%ymm2
+	vpbroadcastq	160-128(%r15),%ymm11
+	vpaddq	352-448(%r12),%ymm2,%ymm2
+
+	vmovdqu	%ymm6,192-192(%rbx)
+	vmovdqu	%ymm7,224-192(%rbx)
+
+	vpmuludq	128-128(%rsi),%ymm10,%ymm12
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	128-128(%r9),%ymm10,%ymm14
+	vpaddq	%ymm14,%ymm0,%ymm0
+	vpmuludq	160-128(%r9),%ymm10,%ymm13
+	vpaddq	%ymm13,%ymm1,%ymm1
+	vpmuludq	192-128(%r9),%ymm10,%ymm12
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	224-128(%r9),%ymm10,%ymm3
+	vpbroadcastq	192-128(%r15),%ymm10
+	vpaddq	384-448(%r12),%ymm3,%ymm3
+
+	vmovdqu	%ymm8,256-192(%rbx)
+	vmovdqu	%ymm0,288-192(%rbx)
+	leaq	8(%rbx),%rbx
+
+	vpmuludq	160-128(%rsi),%ymm11,%ymm13
+	vpaddq	%ymm13,%ymm1,%ymm1
+	vpmuludq	160-128(%r9),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	192-128(%r9),%ymm11,%ymm14
+	vpaddq	%ymm14,%ymm3,%ymm3
+	vpmuludq	224-128(%r9),%ymm11,%ymm4
+	vpbroadcastq	224-128(%r15),%ymm11
+	vpaddq	416-448(%r12),%ymm4,%ymm4
+
+	vmovdqu	%ymm1,320-448(%r12)
+	vmovdqu	%ymm2,352-448(%r12)
+
+	vpmuludq	192-128(%rsi),%ymm10,%ymm12
+	vpaddq	%ymm12,%ymm3,%ymm3
+	vpmuludq	192-128(%r9),%ymm10,%ymm14
+	vpbroadcastq	256-128(%r15),%ymm0
+	vpaddq	%ymm14,%ymm4,%ymm4
+	vpmuludq	224-128(%r9),%ymm10,%ymm5
+	vpbroadcastq	0+8-128(%r15),%ymm10
+	vpaddq	448-448(%r12),%ymm5,%ymm5
+
+	vmovdqu	%ymm3,384-448(%r12)
+	vmovdqu	%ymm4,416-448(%r12)
+	leaq	8(%r15),%r15
+
+	vpmuludq	224-128(%rsi),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	224-128(%r9),%ymm11,%ymm6
+	vpaddq	480-448(%r12),%ymm6,%ymm6
+
+	vpmuludq	256-128(%rsi),%ymm0,%ymm7
+	vmovdqu	%ymm5,448-448(%r12)
+	vpaddq	512-448(%r12),%ymm7,%ymm7
+	vmovdqu	%ymm6,480-448(%r12)
+	vmovdqu	%ymm7,512-448(%r12)
+	leaq	8(%r12),%r12
+
+	decl	%r14d
+	jnz	L$OOP_SQR_1024
+
+	vmovdqu	256(%rsp),%ymm8
+	vmovdqu	288(%rsp),%ymm1
+	vmovdqu	320(%rsp),%ymm2
+	leaq	192(%rsp),%rbx
+
+	vpsrlq	$29,%ymm8,%ymm14
+	vpand	%ymm15,%ymm8,%ymm8
+	vpsrlq	$29,%ymm1,%ymm11
+	vpand	%ymm15,%ymm1,%ymm1
+
+	vpermq	$0x93,%ymm14,%ymm14
+	vpxor	%ymm9,%ymm9,%ymm9
+	vpermq	$0x93,%ymm11,%ymm11
+
+	vpblendd	$3,%ymm9,%ymm14,%ymm10
+	vpblendd	$3,%ymm14,%ymm11,%ymm14
+	vpaddq	%ymm10,%ymm8,%ymm8
+	vpblendd	$3,%ymm11,%ymm9,%ymm11
+	vpaddq	%ymm14,%ymm1,%ymm1
+	vpaddq	%ymm11,%ymm2,%ymm2
+	vmovdqu	%ymm1,288-192(%rbx)
+	vmovdqu	%ymm2,320-192(%rbx)
+
+	movq	(%rsp),%rax
+	movq	8(%rsp),%r10
+	movq	16(%rsp),%r11
+	movq	24(%rsp),%r12
+	vmovdqu	32(%rsp),%ymm1
+	vmovdqu	64-192(%rbx),%ymm2
+	vmovdqu	96-192(%rbx),%ymm3
+	vmovdqu	128-192(%rbx),%ymm4
+	vmovdqu	160-192(%rbx),%ymm5
+	vmovdqu	192-192(%rbx),%ymm6
+	vmovdqu	224-192(%rbx),%ymm7
+
+	movq	%rax,%r9
+	imull	%ecx,%eax
+	andl	$0x1fffffff,%eax
+	vmovd	%eax,%xmm12
+
+	movq	%rax,%rdx
+	imulq	-128(%r13),%rax
+	vpbroadcastq	%xmm12,%ymm12
+	addq	%rax,%r9
+	movq	%rdx,%rax
+	imulq	8-128(%r13),%rax
+	shrq	$29,%r9
+	addq	%rax,%r10
+	movq	%rdx,%rax
+	imulq	16-128(%r13),%rax
+	addq	%r9,%r10
+	addq	%rax,%r11
+	imulq	24-128(%r13),%rdx
+	addq	%rdx,%r12
+
+	movq	%r10,%rax
+	imull	%ecx,%eax
+	andl	$0x1fffffff,%eax
+
+	movl	$9,%r14d
+	jmp	L$OOP_REDUCE_1024
+
+.p2align	5
+L$OOP_REDUCE_1024:
+	vmovd	%eax,%xmm13
+	vpbroadcastq	%xmm13,%ymm13
+
+	vpmuludq	32-128(%r13),%ymm12,%ymm10
+	movq	%rax,%rdx
+	imulq	-128(%r13),%rax
+	vpaddq	%ymm10,%ymm1,%ymm1
+	addq	%rax,%r10
+	vpmuludq	64-128(%r13),%ymm12,%ymm14
+	movq	%rdx,%rax
+	imulq	8-128(%r13),%rax
+	vpaddq	%ymm14,%ymm2,%ymm2
+	vpmuludq	96-128(%r13),%ymm12,%ymm11
+.byte	0x67
+	addq	%rax,%r11
+.byte	0x67
+	movq	%rdx,%rax
+	imulq	16-128(%r13),%rax
+	shrq	$29,%r10
+	vpaddq	%ymm11,%ymm3,%ymm3
+	vpmuludq	128-128(%r13),%ymm12,%ymm10
+	addq	%rax,%r12
+	addq	%r10,%r11
+	vpaddq	%ymm10,%ymm4,%ymm4
+	vpmuludq	160-128(%r13),%ymm12,%ymm14
+	movq	%r11,%rax
+	imull	%ecx,%eax
+	vpaddq	%ymm14,%ymm5,%ymm5
+	vpmuludq	192-128(%r13),%ymm12,%ymm11
+	andl	$0x1fffffff,%eax
+	vpaddq	%ymm11,%ymm6,%ymm6
+	vpmuludq	224-128(%r13),%ymm12,%ymm10
+	vpaddq	%ymm10,%ymm7,%ymm7
+	vpmuludq	256-128(%r13),%ymm12,%ymm14
+	vmovd	%eax,%xmm12
+
+	vpaddq	%ymm14,%ymm8,%ymm8
+
+	vpbroadcastq	%xmm12,%ymm12
+
+	vpmuludq	32-8-128(%r13),%ymm13,%ymm11
+	vmovdqu	96-8-128(%r13),%ymm14
+	movq	%rax,%rdx
+	imulq	-128(%r13),%rax
+	vpaddq	%ymm11,%ymm1,%ymm1
+	vpmuludq	64-8-128(%r13),%ymm13,%ymm10
+	vmovdqu	128-8-128(%r13),%ymm11
+	addq	%rax,%r11
+	movq	%rdx,%rax
+	imulq	8-128(%r13),%rax
+	vpaddq	%ymm10,%ymm2,%ymm2
+	addq	%r12,%rax
+	shrq	$29,%r11
+	vpmuludq	%ymm13,%ymm14,%ymm14
+	vmovdqu	160-8-128(%r13),%ymm10
+	addq	%r11,%rax
+	vpaddq	%ymm14,%ymm3,%ymm3
+	vpmuludq	%ymm13,%ymm11,%ymm11
+	vmovdqu	192-8-128(%r13),%ymm14
+.byte	0x67
+	movq	%rax,%r12
+	imull	%ecx,%eax
+	vpaddq	%ymm11,%ymm4,%ymm4
+	vpmuludq	%ymm13,%ymm10,%ymm10
+.byte	0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00
+	andl	$0x1fffffff,%eax
+	vpaddq	%ymm10,%ymm5,%ymm5
+	vpmuludq	%ymm13,%ymm14,%ymm14
+	vmovdqu	256-8-128(%r13),%ymm10
+	vpaddq	%ymm14,%ymm6,%ymm6
+	vpmuludq	%ymm13,%ymm11,%ymm11
+	vmovdqu	288-8-128(%r13),%ymm9
+	vmovd	%eax,%xmm0
+	imulq	-128(%r13),%rax
+	vpaddq	%ymm11,%ymm7,%ymm7
+	vpmuludq	%ymm13,%ymm10,%ymm10
+	vmovdqu	32-16-128(%r13),%ymm14
+	vpbroadcastq	%xmm0,%ymm0
+	vpaddq	%ymm10,%ymm8,%ymm8
+	vpmuludq	%ymm13,%ymm9,%ymm9
+	vmovdqu	64-16-128(%r13),%ymm11
+	addq	%rax,%r12
+
+	vmovdqu	32-24-128(%r13),%ymm13
+	vpmuludq	%ymm12,%ymm14,%ymm14
+	vmovdqu	96-16-128(%r13),%ymm10
+	vpaddq	%ymm14,%ymm1,%ymm1
+	vpmuludq	%ymm0,%ymm13,%ymm13
+	vpmuludq	%ymm12,%ymm11,%ymm11
+.byte	0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff
+	vpaddq	%ymm1,%ymm13,%ymm13
+	vpaddq	%ymm11,%ymm2,%ymm2
+	vpmuludq	%ymm12,%ymm10,%ymm10
+	vmovdqu	160-16-128(%r13),%ymm11
+.byte	0x67
+	vmovq	%xmm13,%rax
+	vmovdqu	%ymm13,(%rsp)
+	vpaddq	%ymm10,%ymm3,%ymm3
+	vpmuludq	%ymm12,%ymm14,%ymm14
+	vmovdqu	192-16-128(%r13),%ymm10
+	vpaddq	%ymm14,%ymm4,%ymm4
+	vpmuludq	%ymm12,%ymm11,%ymm11
+	vmovdqu	224-16-128(%r13),%ymm14
+	vpaddq	%ymm11,%ymm5,%ymm5
+	vpmuludq	%ymm12,%ymm10,%ymm10
+	vmovdqu	256-16-128(%r13),%ymm11
+	vpaddq	%ymm10,%ymm6,%ymm6
+	vpmuludq	%ymm12,%ymm14,%ymm14
+	shrq	$29,%r12
+	vmovdqu	288-16-128(%r13),%ymm10
+	addq	%r12,%rax
+	vpaddq	%ymm14,%ymm7,%ymm7
+	vpmuludq	%ymm12,%ymm11,%ymm11
+
+	movq	%rax,%r9
+	imull	%ecx,%eax
+	vpaddq	%ymm11,%ymm8,%ymm8
+	vpmuludq	%ymm12,%ymm10,%ymm10
+	andl	$0x1fffffff,%eax
+	vmovd	%eax,%xmm12
+	vmovdqu	96-24-128(%r13),%ymm11
+.byte	0x67
+	vpaddq	%ymm10,%ymm9,%ymm9
+	vpbroadcastq	%xmm12,%ymm12
+
+	vpmuludq	64-24-128(%r13),%ymm0,%ymm14
+	vmovdqu	128-24-128(%r13),%ymm10
+	movq	%rax,%rdx
+	imulq	-128(%r13),%rax
+	movq	8(%rsp),%r10
+	vpaddq	%ymm14,%ymm2,%ymm1
+	vpmuludq	%ymm0,%ymm11,%ymm11
+	vmovdqu	160-24-128(%r13),%ymm14
+	addq	%rax,%r9
+	movq	%rdx,%rax
+	imulq	8-128(%r13),%rax
+.byte	0x67
+	shrq	$29,%r9
+	movq	16(%rsp),%r11
+	vpaddq	%ymm11,%ymm3,%ymm2
+	vpmuludq	%ymm0,%ymm10,%ymm10
+	vmovdqu	192-24-128(%r13),%ymm11
+	addq	%rax,%r10
+	movq	%rdx,%rax
+	imulq	16-128(%r13),%rax
+	vpaddq	%ymm10,%ymm4,%ymm3
+	vpmuludq	%ymm0,%ymm14,%ymm14
+	vmovdqu	224-24-128(%r13),%ymm10
+	imulq	24-128(%r13),%rdx
+	addq	%rax,%r11
+	leaq	(%r9,%r10,1),%rax
+	vpaddq	%ymm14,%ymm5,%ymm4
+	vpmuludq	%ymm0,%ymm11,%ymm11
+	vmovdqu	256-24-128(%r13),%ymm14
+	movq	%rax,%r10
+	imull	%ecx,%eax
+	vpmuludq	%ymm0,%ymm10,%ymm10
+	vpaddq	%ymm11,%ymm6,%ymm5
+	vmovdqu	288-24-128(%r13),%ymm11
+	andl	$0x1fffffff,%eax
+	vpaddq	%ymm10,%ymm7,%ymm6
+	vpmuludq	%ymm0,%ymm14,%ymm14
+	addq	24(%rsp),%rdx
+	vpaddq	%ymm14,%ymm8,%ymm7
+	vpmuludq	%ymm0,%ymm11,%ymm11
+	vpaddq	%ymm11,%ymm9,%ymm8
+	vmovq	%r12,%xmm9
+	movq	%rdx,%r12
+
+	decl	%r14d
+	jnz	L$OOP_REDUCE_1024
+	leaq	448(%rsp),%r12
+	vpaddq	%ymm9,%ymm13,%ymm0
+	vpxor	%ymm9,%ymm9,%ymm9
+
+	vpaddq	288-192(%rbx),%ymm0,%ymm0
+	vpaddq	320-448(%r12),%ymm1,%ymm1
+	vpaddq	352-448(%r12),%ymm2,%ymm2
+	vpaddq	384-448(%r12),%ymm3,%ymm3
+	vpaddq	416-448(%r12),%ymm4,%ymm4
+	vpaddq	448-448(%r12),%ymm5,%ymm5
+	vpaddq	480-448(%r12),%ymm6,%ymm6
+	vpaddq	512-448(%r12),%ymm7,%ymm7
+	vpaddq	544-448(%r12),%ymm8,%ymm8
+
+	vpsrlq	$29,%ymm0,%ymm14
+	vpand	%ymm15,%ymm0,%ymm0
+	vpsrlq	$29,%ymm1,%ymm11
+	vpand	%ymm15,%ymm1,%ymm1
+	vpsrlq	$29,%ymm2,%ymm12
+	vpermq	$0x93,%ymm14,%ymm14
+	vpand	%ymm15,%ymm2,%ymm2
+	vpsrlq	$29,%ymm3,%ymm13
+	vpermq	$0x93,%ymm11,%ymm11
+	vpand	%ymm15,%ymm3,%ymm3
+	vpermq	$0x93,%ymm12,%ymm12
+
+	vpblendd	$3,%ymm9,%ymm14,%ymm10
+	vpermq	$0x93,%ymm13,%ymm13
+	vpblendd	$3,%ymm14,%ymm11,%ymm14
+	vpaddq	%ymm10,%ymm0,%ymm0
+	vpblendd	$3,%ymm11,%ymm12,%ymm11
+	vpaddq	%ymm14,%ymm1,%ymm1
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm11,%ymm2,%ymm2
+	vpblendd	$3,%ymm13,%ymm9,%ymm13
+	vpaddq	%ymm12,%ymm3,%ymm3
+	vpaddq	%ymm13,%ymm4,%ymm4
+
+	vpsrlq	$29,%ymm0,%ymm14
+	vpand	%ymm15,%ymm0,%ymm0
+	vpsrlq	$29,%ymm1,%ymm11
+	vpand	%ymm15,%ymm1,%ymm1
+	vpsrlq	$29,%ymm2,%ymm12
+	vpermq	$0x93,%ymm14,%ymm14
+	vpand	%ymm15,%ymm2,%ymm2
+	vpsrlq	$29,%ymm3,%ymm13
+	vpermq	$0x93,%ymm11,%ymm11
+	vpand	%ymm15,%ymm3,%ymm3
+	vpermq	$0x93,%ymm12,%ymm12
+
+	vpblendd	$3,%ymm9,%ymm14,%ymm10
+	vpermq	$0x93,%ymm13,%ymm13
+	vpblendd	$3,%ymm14,%ymm11,%ymm14
+	vpaddq	%ymm10,%ymm0,%ymm0
+	vpblendd	$3,%ymm11,%ymm12,%ymm11
+	vpaddq	%ymm14,%ymm1,%ymm1
+	vmovdqu	%ymm0,0-128(%rdi)
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm11,%ymm2,%ymm2
+	vmovdqu	%ymm1,32-128(%rdi)
+	vpblendd	$3,%ymm13,%ymm9,%ymm13
+	vpaddq	%ymm12,%ymm3,%ymm3
+	vmovdqu	%ymm2,64-128(%rdi)
+	vpaddq	%ymm13,%ymm4,%ymm4
+	vmovdqu	%ymm3,96-128(%rdi)
+	vpsrlq	$29,%ymm4,%ymm14
+	vpand	%ymm15,%ymm4,%ymm4
+	vpsrlq	$29,%ymm5,%ymm11
+	vpand	%ymm15,%ymm5,%ymm5
+	vpsrlq	$29,%ymm6,%ymm12
+	vpermq	$0x93,%ymm14,%ymm14
+	vpand	%ymm15,%ymm6,%ymm6
+	vpsrlq	$29,%ymm7,%ymm13
+	vpermq	$0x93,%ymm11,%ymm11
+	vpand	%ymm15,%ymm7,%ymm7
+	vpsrlq	$29,%ymm8,%ymm0
+	vpermq	$0x93,%ymm12,%ymm12
+	vpand	%ymm15,%ymm8,%ymm8
+	vpermq	$0x93,%ymm13,%ymm13
+
+	vpblendd	$3,%ymm9,%ymm14,%ymm10
+	vpermq	$0x93,%ymm0,%ymm0
+	vpblendd	$3,%ymm14,%ymm11,%ymm14
+	vpaddq	%ymm10,%ymm4,%ymm4
+	vpblendd	$3,%ymm11,%ymm12,%ymm11
+	vpaddq	%ymm14,%ymm5,%ymm5
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm11,%ymm6,%ymm6
+	vpblendd	$3,%ymm13,%ymm0,%ymm13
+	vpaddq	%ymm12,%ymm7,%ymm7
+	vpaddq	%ymm13,%ymm8,%ymm8
+
+	vpsrlq	$29,%ymm4,%ymm14
+	vpand	%ymm15,%ymm4,%ymm4
+	vpsrlq	$29,%ymm5,%ymm11
+	vpand	%ymm15,%ymm5,%ymm5
+	vpsrlq	$29,%ymm6,%ymm12
+	vpermq	$0x93,%ymm14,%ymm14
+	vpand	%ymm15,%ymm6,%ymm6
+	vpsrlq	$29,%ymm7,%ymm13
+	vpermq	$0x93,%ymm11,%ymm11
+	vpand	%ymm15,%ymm7,%ymm7
+	vpsrlq	$29,%ymm8,%ymm0
+	vpermq	$0x93,%ymm12,%ymm12
+	vpand	%ymm15,%ymm8,%ymm8
+	vpermq	$0x93,%ymm13,%ymm13
+
+	vpblendd	$3,%ymm9,%ymm14,%ymm10
+	vpermq	$0x93,%ymm0,%ymm0
+	vpblendd	$3,%ymm14,%ymm11,%ymm14
+	vpaddq	%ymm10,%ymm4,%ymm4
+	vpblendd	$3,%ymm11,%ymm12,%ymm11
+	vpaddq	%ymm14,%ymm5,%ymm5
+	vmovdqu	%ymm4,128-128(%rdi)
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm11,%ymm6,%ymm6
+	vmovdqu	%ymm5,160-128(%rdi)
+	vpblendd	$3,%ymm13,%ymm0,%ymm13
+	vpaddq	%ymm12,%ymm7,%ymm7
+	vmovdqu	%ymm6,192-128(%rdi)
+	vpaddq	%ymm13,%ymm8,%ymm8
+	vmovdqu	%ymm7,224-128(%rdi)
+	vmovdqu	%ymm8,256-128(%rdi)
+
+	movq	%rdi,%rsi
+	decl	%r8d
+	jne	L$OOP_GRANDE_SQR_1024
+
+	vzeroall
+	movq	%rbp,%rax
+
+	movq	-48(%rax),%r15
+
+	movq	-40(%rax),%r14
+
+	movq	-32(%rax),%r13
+
+	movq	-24(%rax),%r12
+
+	movq	-16(%rax),%rbp
+
+	movq	-8(%rax),%rbx
+
+	leaq	(%rax),%rsp
+
+L$sqr_1024_epilogue:
+	ret
+
+
+.globl	_rsaz_1024_mul_avx2
+.private_extern _rsaz_1024_mul_avx2
+
+.p2align	6
+_rsaz_1024_mul_avx2:
+
+_CET_ENDBR
+	leaq	(%rsp),%rax
+
+	pushq	%rbx
+
+	pushq	%rbp
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+	movq	%rax,%rbp
+
+	vzeroall
+	movq	%rdx,%r13
+	subq	$64,%rsp
+
+
+
+
+
+
+.byte	0x67,0x67
+	movq	%rsi,%r15
+	andq	$4095,%r15
+	addq	$320,%r15
+	shrq	$12,%r15
+	movq	%rsi,%r15
+	cmovnzq	%r13,%rsi
+	cmovnzq	%r15,%r13
+
+	movq	%rcx,%r15
+	subq	$-128,%rsi
+	subq	$-128,%rcx
+	subq	$-128,%rdi
+
+	andq	$4095,%r15
+	addq	$320,%r15
+.byte	0x67,0x67
+	shrq	$12,%r15
+	jz	L$mul_1024_no_n_copy
+
+
+
+
+
+	subq	$320,%rsp
+	vmovdqu	0-128(%rcx),%ymm0
+	andq	$-512,%rsp
+	vmovdqu	32-128(%rcx),%ymm1
+	vmovdqu	64-128(%rcx),%ymm2
+	vmovdqu	96-128(%rcx),%ymm3
+	vmovdqu	128-128(%rcx),%ymm4
+	vmovdqu	160-128(%rcx),%ymm5
+	vmovdqu	192-128(%rcx),%ymm6
+	vmovdqu	224-128(%rcx),%ymm7
+	vmovdqu	256-128(%rcx),%ymm8
+	leaq	64+128(%rsp),%rcx
+	vmovdqu	%ymm0,0-128(%rcx)
+	vpxor	%ymm0,%ymm0,%ymm0
+	vmovdqu	%ymm1,32-128(%rcx)
+	vpxor	%ymm1,%ymm1,%ymm1
+	vmovdqu	%ymm2,64-128(%rcx)
+	vpxor	%ymm2,%ymm2,%ymm2
+	vmovdqu	%ymm3,96-128(%rcx)
+	vpxor	%ymm3,%ymm3,%ymm3
+	vmovdqu	%ymm4,128-128(%rcx)
+	vpxor	%ymm4,%ymm4,%ymm4
+	vmovdqu	%ymm5,160-128(%rcx)
+	vpxor	%ymm5,%ymm5,%ymm5
+	vmovdqu	%ymm6,192-128(%rcx)
+	vpxor	%ymm6,%ymm6,%ymm6
+	vmovdqu	%ymm7,224-128(%rcx)
+	vpxor	%ymm7,%ymm7,%ymm7
+	vmovdqu	%ymm8,256-128(%rcx)
+	vmovdqa	%ymm0,%ymm8
+	vmovdqu	%ymm9,288-128(%rcx)
+L$mul_1024_no_n_copy:
+	andq	$-64,%rsp
+
+	movq	(%r13),%rbx
+	vpbroadcastq	(%r13),%ymm10
+	vmovdqu	%ymm0,(%rsp)
+	xorq	%r9,%r9
+.byte	0x67
+	xorq	%r10,%r10
+	xorq	%r11,%r11
+	xorq	%r12,%r12
+
+	vmovdqu	L$and_mask(%rip),%ymm15
+	movl	$9,%r14d
+	vmovdqu	%ymm9,288-128(%rdi)
+	jmp	L$oop_mul_1024
+
+.p2align	5
+L$oop_mul_1024:
+	vpsrlq	$29,%ymm3,%ymm9
+	movq	%rbx,%rax
+	imulq	-128(%rsi),%rax
+	addq	%r9,%rax
+	movq	%rbx,%r10
+	imulq	8-128(%rsi),%r10
+	addq	8(%rsp),%r10
+
+	movq	%rax,%r9
+	imull	%r8d,%eax
+	andl	$0x1fffffff,%eax
+
+	movq	%rbx,%r11
+	imulq	16-128(%rsi),%r11
+	addq	16(%rsp),%r11
+
+	movq	%rbx,%r12
+	imulq	24-128(%rsi),%r12
+	addq	24(%rsp),%r12
+	vpmuludq	32-128(%rsi),%ymm10,%ymm0
+	vmovd	%eax,%xmm11
+	vpaddq	%ymm0,%ymm1,%ymm1
+	vpmuludq	64-128(%rsi),%ymm10,%ymm12
+	vpbroadcastq	%xmm11,%ymm11
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	96-128(%rsi),%ymm10,%ymm13
+	vpand	%ymm15,%ymm3,%ymm3
+	vpaddq	%ymm13,%ymm3,%ymm3
+	vpmuludq	128-128(%rsi),%ymm10,%ymm0
+	vpaddq	%ymm0,%ymm4,%ymm4
+	vpmuludq	160-128(%rsi),%ymm10,%ymm12
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	192-128(%rsi),%ymm10,%ymm13
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpmuludq	224-128(%rsi),%ymm10,%ymm0
+	vpermq	$0x93,%ymm9,%ymm9
+	vpaddq	%ymm0,%ymm7,%ymm7
+	vpmuludq	256-128(%rsi),%ymm10,%ymm12
+	vpbroadcastq	8(%r13),%ymm10
+	vpaddq	%ymm12,%ymm8,%ymm8
+
+	movq	%rax,%rdx
+	imulq	-128(%rcx),%rax
+	addq	%rax,%r9
+	movq	%rdx,%rax
+	imulq	8-128(%rcx),%rax
+	addq	%rax,%r10
+	movq	%rdx,%rax
+	imulq	16-128(%rcx),%rax
+	addq	%rax,%r11
+	shrq	$29,%r9
+	imulq	24-128(%rcx),%rdx
+	addq	%rdx,%r12
+	addq	%r9,%r10
+
+	vpmuludq	32-128(%rcx),%ymm11,%ymm13
+	vmovq	%xmm10,%rbx
+	vpaddq	%ymm13,%ymm1,%ymm1
+	vpmuludq	64-128(%rcx),%ymm11,%ymm0
+	vpaddq	%ymm0,%ymm2,%ymm2
+	vpmuludq	96-128(%rcx),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm3,%ymm3
+	vpmuludq	128-128(%rcx),%ymm11,%ymm13
+	vpaddq	%ymm13,%ymm4,%ymm4
+	vpmuludq	160-128(%rcx),%ymm11,%ymm0
+	vpaddq	%ymm0,%ymm5,%ymm5
+	vpmuludq	192-128(%rcx),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm6,%ymm6
+	vpmuludq	224-128(%rcx),%ymm11,%ymm13
+	vpblendd	$3,%ymm14,%ymm9,%ymm12
+	vpaddq	%ymm13,%ymm7,%ymm7
+	vpmuludq	256-128(%rcx),%ymm11,%ymm0
+	vpaddq	%ymm12,%ymm3,%ymm3
+	vpaddq	%ymm0,%ymm8,%ymm8
+
+	movq	%rbx,%rax
+	imulq	-128(%rsi),%rax
+	addq	%rax,%r10
+	vmovdqu	-8+32-128(%rsi),%ymm12
+	movq	%rbx,%rax
+	imulq	8-128(%rsi),%rax
+	addq	%rax,%r11
+	vmovdqu	-8+64-128(%rsi),%ymm13
+
+	movq	%r10,%rax
+	vpblendd	$0xfc,%ymm14,%ymm9,%ymm9
+	imull	%r8d,%eax
+	vpaddq	%ymm9,%ymm4,%ymm4
+	andl	$0x1fffffff,%eax
+
+	imulq	16-128(%rsi),%rbx
+	addq	%rbx,%r12
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vmovd	%eax,%xmm11
+	vmovdqu	-8+96-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm1,%ymm1
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vpbroadcastq	%xmm11,%ymm11
+	vmovdqu	-8+128-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm2,%ymm2
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovdqu	-8+160-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm3,%ymm3
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vmovdqu	-8+192-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm4,%ymm4
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vmovdqu	-8+224-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm5,%ymm5
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovdqu	-8+256-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm6,%ymm6
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vmovdqu	-8+288-128(%rsi),%ymm9
+	vpaddq	%ymm12,%ymm7,%ymm7
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vpaddq	%ymm13,%ymm8,%ymm8
+	vpmuludq	%ymm10,%ymm9,%ymm9
+	vpbroadcastq	16(%r13),%ymm10
+
+	movq	%rax,%rdx
+	imulq	-128(%rcx),%rax
+	addq	%rax,%r10
+	vmovdqu	-8+32-128(%rcx),%ymm0
+	movq	%rdx,%rax
+	imulq	8-128(%rcx),%rax
+	addq	%rax,%r11
+	vmovdqu	-8+64-128(%rcx),%ymm12
+	shrq	$29,%r10
+	imulq	16-128(%rcx),%rdx
+	addq	%rdx,%r12
+	addq	%r10,%r11
+
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovq	%xmm10,%rbx
+	vmovdqu	-8+96-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm1,%ymm1
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	-8+128-128(%rcx),%ymm0
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-8+160-128(%rcx),%ymm12
+	vpaddq	%ymm13,%ymm3,%ymm3
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovdqu	-8+192-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm4,%ymm4
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	-8+224-128(%rcx),%ymm0
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-8+256-128(%rcx),%ymm12
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovdqu	-8+288-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm7,%ymm7
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vpaddq	%ymm13,%ymm9,%ymm9
+
+	vmovdqu	-16+32-128(%rsi),%ymm0
+	movq	%rbx,%rax
+	imulq	-128(%rsi),%rax
+	addq	%r11,%rax
+
+	vmovdqu	-16+64-128(%rsi),%ymm12
+	movq	%rax,%r11
+	imull	%r8d,%eax
+	andl	$0x1fffffff,%eax
+
+	imulq	8-128(%rsi),%rbx
+	addq	%rbx,%r12
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovd	%eax,%xmm11
+	vmovdqu	-16+96-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm1,%ymm1
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vpbroadcastq	%xmm11,%ymm11
+	vmovdqu	-16+128-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vmovdqu	-16+160-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm3,%ymm3
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovdqu	-16+192-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm4,%ymm4
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vmovdqu	-16+224-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vmovdqu	-16+256-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovdqu	-16+288-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm7,%ymm7
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vpbroadcastq	24(%r13),%ymm10
+	vpaddq	%ymm13,%ymm9,%ymm9
+
+	vmovdqu	-16+32-128(%rcx),%ymm0
+	movq	%rax,%rdx
+	imulq	-128(%rcx),%rax
+	addq	%rax,%r11
+	vmovdqu	-16+64-128(%rcx),%ymm12
+	imulq	8-128(%rcx),%rdx
+	addq	%rdx,%r12
+	shrq	$29,%r11
+
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovq	%xmm10,%rbx
+	vmovdqu	-16+96-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm1,%ymm1
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	-16+128-128(%rcx),%ymm0
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-16+160-128(%rcx),%ymm12
+	vpaddq	%ymm13,%ymm3,%ymm3
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovdqu	-16+192-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm4,%ymm4
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	-16+224-128(%rcx),%ymm0
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-16+256-128(%rcx),%ymm12
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovdqu	-16+288-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm7,%ymm7
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	-24+32-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-24+64-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm9,%ymm9
+
+	addq	%r11,%r12
+	imulq	-128(%rsi),%rbx
+	addq	%rbx,%r12
+
+	movq	%r12,%rax
+	imull	%r8d,%eax
+	andl	$0x1fffffff,%eax
+
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovd	%eax,%xmm11
+	vmovdqu	-24+96-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm1,%ymm1
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vpbroadcastq	%xmm11,%ymm11
+	vmovdqu	-24+128-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vmovdqu	-24+160-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm3,%ymm3
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovdqu	-24+192-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm4,%ymm4
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vmovdqu	-24+224-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vmovdqu	-24+256-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovdqu	-24+288-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm7,%ymm7
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vpbroadcastq	32(%r13),%ymm10
+	vpaddq	%ymm13,%ymm9,%ymm9
+	addq	$32,%r13
+
+	vmovdqu	-24+32-128(%rcx),%ymm0
+	imulq	-128(%rcx),%rax
+	addq	%rax,%r12
+	shrq	$29,%r12
+
+	vmovdqu	-24+64-128(%rcx),%ymm12
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovq	%xmm10,%rbx
+	vmovdqu	-24+96-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm1,%ymm0
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	%ymm0,(%rsp)
+	vpaddq	%ymm12,%ymm2,%ymm1
+	vmovdqu	-24+128-128(%rcx),%ymm0
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-24+160-128(%rcx),%ymm12
+	vpaddq	%ymm13,%ymm3,%ymm2
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovdqu	-24+192-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm4,%ymm3
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	-24+224-128(%rcx),%ymm0
+	vpaddq	%ymm12,%ymm5,%ymm4
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-24+256-128(%rcx),%ymm12
+	vpaddq	%ymm13,%ymm6,%ymm5
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovdqu	-24+288-128(%rcx),%ymm13
+	movq	%r12,%r9
+	vpaddq	%ymm0,%ymm7,%ymm6
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	addq	(%rsp),%r9
+	vpaddq	%ymm12,%ymm8,%ymm7
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovq	%r12,%xmm12
+	vpaddq	%ymm13,%ymm9,%ymm8
+
+	decl	%r14d
+	jnz	L$oop_mul_1024
+	vpaddq	(%rsp),%ymm12,%ymm0
+
+	vpsrlq	$29,%ymm0,%ymm12
+	vpand	%ymm15,%ymm0,%ymm0
+	vpsrlq	$29,%ymm1,%ymm13
+	vpand	%ymm15,%ymm1,%ymm1
+	vpsrlq	$29,%ymm2,%ymm10
+	vpermq	$0x93,%ymm12,%ymm12
+	vpand	%ymm15,%ymm2,%ymm2
+	vpsrlq	$29,%ymm3,%ymm11
+	vpermq	$0x93,%ymm13,%ymm13
+	vpand	%ymm15,%ymm3,%ymm3
+
+	vpblendd	$3,%ymm14,%ymm12,%ymm9
+	vpermq	$0x93,%ymm10,%ymm10
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpermq	$0x93,%ymm11,%ymm11
+	vpaddq	%ymm9,%ymm0,%ymm0
+	vpblendd	$3,%ymm13,%ymm10,%ymm13
+	vpaddq	%ymm12,%ymm1,%ymm1
+	vpblendd	$3,%ymm10,%ymm11,%ymm10
+	vpaddq	%ymm13,%ymm2,%ymm2
+	vpblendd	$3,%ymm11,%ymm14,%ymm11
+	vpaddq	%ymm10,%ymm3,%ymm3
+	vpaddq	%ymm11,%ymm4,%ymm4
+
+	vpsrlq	$29,%ymm0,%ymm12
+	vpand	%ymm15,%ymm0,%ymm0
+	vpsrlq	$29,%ymm1,%ymm13
+	vpand	%ymm15,%ymm1,%ymm1
+	vpsrlq	$29,%ymm2,%ymm10
+	vpermq	$0x93,%ymm12,%ymm12
+	vpand	%ymm15,%ymm2,%ymm2
+	vpsrlq	$29,%ymm3,%ymm11
+	vpermq	$0x93,%ymm13,%ymm13
+	vpand	%ymm15,%ymm3,%ymm3
+	vpermq	$0x93,%ymm10,%ymm10
+
+	vpblendd	$3,%ymm14,%ymm12,%ymm9
+	vpermq	$0x93,%ymm11,%ymm11
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm9,%ymm0,%ymm0
+	vpblendd	$3,%ymm13,%ymm10,%ymm13
+	vpaddq	%ymm12,%ymm1,%ymm1
+	vpblendd	$3,%ymm10,%ymm11,%ymm10
+	vpaddq	%ymm13,%ymm2,%ymm2
+	vpblendd	$3,%ymm11,%ymm14,%ymm11
+	vpaddq	%ymm10,%ymm3,%ymm3
+	vpaddq	%ymm11,%ymm4,%ymm4
+
+	vmovdqu	%ymm0,0-128(%rdi)
+	vmovdqu	%ymm1,32-128(%rdi)
+	vmovdqu	%ymm2,64-128(%rdi)
+	vmovdqu	%ymm3,96-128(%rdi)
+	vpsrlq	$29,%ymm4,%ymm12
+	vpand	%ymm15,%ymm4,%ymm4
+	vpsrlq	$29,%ymm5,%ymm13
+	vpand	%ymm15,%ymm5,%ymm5
+	vpsrlq	$29,%ymm6,%ymm10
+	vpermq	$0x93,%ymm12,%ymm12
+	vpand	%ymm15,%ymm6,%ymm6
+	vpsrlq	$29,%ymm7,%ymm11
+	vpermq	$0x93,%ymm13,%ymm13
+	vpand	%ymm15,%ymm7,%ymm7
+	vpsrlq	$29,%ymm8,%ymm0
+	vpermq	$0x93,%ymm10,%ymm10
+	vpand	%ymm15,%ymm8,%ymm8
+	vpermq	$0x93,%ymm11,%ymm11
+
+	vpblendd	$3,%ymm14,%ymm12,%ymm9
+	vpermq	$0x93,%ymm0,%ymm0
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm9,%ymm4,%ymm4
+	vpblendd	$3,%ymm13,%ymm10,%ymm13
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpblendd	$3,%ymm10,%ymm11,%ymm10
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpblendd	$3,%ymm11,%ymm0,%ymm11
+	vpaddq	%ymm10,%ymm7,%ymm7
+	vpaddq	%ymm11,%ymm8,%ymm8
+
+	vpsrlq	$29,%ymm4,%ymm12
+	vpand	%ymm15,%ymm4,%ymm4
+	vpsrlq	$29,%ymm5,%ymm13
+	vpand	%ymm15,%ymm5,%ymm5
+	vpsrlq	$29,%ymm6,%ymm10
+	vpermq	$0x93,%ymm12,%ymm12
+	vpand	%ymm15,%ymm6,%ymm6
+	vpsrlq	$29,%ymm7,%ymm11
+	vpermq	$0x93,%ymm13,%ymm13
+	vpand	%ymm15,%ymm7,%ymm7
+	vpsrlq	$29,%ymm8,%ymm0
+	vpermq	$0x93,%ymm10,%ymm10
+	vpand	%ymm15,%ymm8,%ymm8
+	vpermq	$0x93,%ymm11,%ymm11
+
+	vpblendd	$3,%ymm14,%ymm12,%ymm9
+	vpermq	$0x93,%ymm0,%ymm0
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm9,%ymm4,%ymm4
+	vpblendd	$3,%ymm13,%ymm10,%ymm13
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpblendd	$3,%ymm10,%ymm11,%ymm10
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpblendd	$3,%ymm11,%ymm0,%ymm11
+	vpaddq	%ymm10,%ymm7,%ymm7
+	vpaddq	%ymm11,%ymm8,%ymm8
+
+	vmovdqu	%ymm4,128-128(%rdi)
+	vmovdqu	%ymm5,160-128(%rdi)
+	vmovdqu	%ymm6,192-128(%rdi)
+	vmovdqu	%ymm7,224-128(%rdi)
+	vmovdqu	%ymm8,256-128(%rdi)
+	vzeroupper
+
+	movq	%rbp,%rax
+
+	movq	-48(%rax),%r15
+
+	movq	-40(%rax),%r14
+
+	movq	-32(%rax),%r13
+
+	movq	-24(%rax),%r12
+
+	movq	-16(%rax),%rbp
+
+	movq	-8(%rax),%rbx
+
+	leaq	(%rax),%rsp
+
+L$mul_1024_epilogue:
+	ret
+
+
+.globl	_rsaz_1024_red2norm_avx2
+.private_extern _rsaz_1024_red2norm_avx2
+
+.p2align	5
+_rsaz_1024_red2norm_avx2:
+
+_CET_ENDBR
+	subq	$-128,%rsi
+	xorq	%rax,%rax
+	movq	-128(%rsi),%r8
+	movq	-120(%rsi),%r9
+	movq	-112(%rsi),%r10
+	shlq	$0,%r8
+	shlq	$29,%r9
+	movq	%r10,%r11
+	shlq	$58,%r10
+	shrq	$6,%r11
+	addq	%r8,%rax
+	addq	%r9,%rax
+	addq	%r10,%rax
+	adcq	$0,%r11
+	movq	%rax,0(%rdi)
+	movq	%r11,%rax
+	movq	-104(%rsi),%r8
+	movq	-96(%rsi),%r9
+	shlq	$23,%r8
+	movq	%r9,%r10
+	shlq	$52,%r9
+	shrq	$12,%r10
+	addq	%r8,%rax
+	addq	%r9,%rax
+	adcq	$0,%r10
+	movq	%rax,8(%rdi)
+	movq	%r10,%rax
+	movq	-88(%rsi),%r11
+	movq	-80(%rsi),%r8
+	shlq	$17,%r11
+	movq	%r8,%r9
+	shlq	$46,%r8
+	shrq	$18,%r9
+	addq	%r11,%rax
+	addq	%r8,%rax
+	adcq	$0,%r9
+	movq	%rax,16(%rdi)
+	movq	%r9,%rax
+	movq	-72(%rsi),%r10
+	movq	-64(%rsi),%r11
+	shlq	$11,%r10
+	movq	%r11,%r8
+	shlq	$40,%r11
+	shrq	$24,%r8
+	addq	%r10,%rax
+	addq	%r11,%rax
+	adcq	$0,%r8
+	movq	%rax,24(%rdi)
+	movq	%r8,%rax
+	movq	-56(%rsi),%r9
+	movq	-48(%rsi),%r10
+	movq	-40(%rsi),%r11
+	shlq	$5,%r9
+	shlq	$34,%r10
+	movq	%r11,%r8
+	shlq	$63,%r11
+	shrq	$1,%r8
+	addq	%r9,%rax
+	addq	%r10,%rax
+	addq	%r11,%rax
+	adcq	$0,%r8
+	movq	%rax,32(%rdi)
+	movq	%r8,%rax
+	movq	-32(%rsi),%r9
+	movq	-24(%rsi),%r10
+	shlq	$28,%r9
+	movq	%r10,%r11
+	shlq	$57,%r10
+	shrq	$7,%r11
+	addq	%r9,%rax
+	addq	%r10,%rax
+	adcq	$0,%r11
+	movq	%rax,40(%rdi)
+	movq	%r11,%rax
+	movq	-16(%rsi),%r8
+	movq	-8(%rsi),%r9
+	shlq	$22,%r8
+	movq	%r9,%r10
+	shlq	$51,%r9
+	shrq	$13,%r10
+	addq	%r8,%rax
+	addq	%r9,%rax
+	adcq	$0,%r10
+	movq	%rax,48(%rdi)
+	movq	%r10,%rax
+	movq	0(%rsi),%r11
+	movq	8(%rsi),%r8
+	shlq	$16,%r11
+	movq	%r8,%r9
+	shlq	$45,%r8
+	shrq	$19,%r9
+	addq	%r11,%rax
+	addq	%r8,%rax
+	adcq	$0,%r9
+	movq	%rax,56(%rdi)
+	movq	%r9,%rax
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	shlq	$10,%r10
+	movq	%r11,%r8
+	shlq	$39,%r11
+	shrq	$25,%r8
+	addq	%r10,%rax
+	addq	%r11,%rax
+	adcq	$0,%r8
+	movq	%rax,64(%rdi)
+	movq	%r8,%rax
+	movq	32(%rsi),%r9
+	movq	40(%rsi),%r10
+	movq	48(%rsi),%r11
+	shlq	$4,%r9
+	shlq	$33,%r10
+	movq	%r11,%r8
+	shlq	$62,%r11
+	shrq	$2,%r8
+	addq	%r9,%rax
+	addq	%r10,%rax
+	addq	%r11,%rax
+	adcq	$0,%r8
+	movq	%rax,72(%rdi)
+	movq	%r8,%rax
+	movq	56(%rsi),%r9
+	movq	64(%rsi),%r10
+	shlq	$27,%r9
+	movq	%r10,%r11
+	shlq	$56,%r10
+	shrq	$8,%r11
+	addq	%r9,%rax
+	addq	%r10,%rax
+	adcq	$0,%r11
+	movq	%rax,80(%rdi)
+	movq	%r11,%rax
+	movq	72(%rsi),%r8
+	movq	80(%rsi),%r9
+	shlq	$21,%r8
+	movq	%r9,%r10
+	shlq	$50,%r9
+	shrq	$14,%r10
+	addq	%r8,%rax
+	addq	%r9,%rax
+	adcq	$0,%r10
+	movq	%rax,88(%rdi)
+	movq	%r10,%rax
+	movq	88(%rsi),%r11
+	movq	96(%rsi),%r8
+	shlq	$15,%r11
+	movq	%r8,%r9
+	shlq	$44,%r8
+	shrq	$20,%r9
+	addq	%r11,%rax
+	addq	%r8,%rax
+	adcq	$0,%r9
+	movq	%rax,96(%rdi)
+	movq	%r9,%rax
+	movq	104(%rsi),%r10
+	movq	112(%rsi),%r11
+	shlq	$9,%r10
+	movq	%r11,%r8
+	shlq	$38,%r11
+	shrq	$26,%r8
+	addq	%r10,%rax
+	addq	%r11,%rax
+	adcq	$0,%r8
+	movq	%rax,104(%rdi)
+	movq	%r8,%rax
+	movq	120(%rsi),%r9
+	movq	128(%rsi),%r10
+	movq	136(%rsi),%r11
+	shlq	$3,%r9
+	shlq	$32,%r10
+	movq	%r11,%r8
+	shlq	$61,%r11
+	shrq	$3,%r8
+	addq	%r9,%rax
+	addq	%r10,%rax
+	addq	%r11,%rax
+	adcq	$0,%r8
+	movq	%rax,112(%rdi)
+	movq	%r8,%rax
+	movq	144(%rsi),%r9
+	movq	152(%rsi),%r10
+	shlq	$26,%r9
+	movq	%r10,%r11
+	shlq	$55,%r10
+	shrq	$9,%r11
+	addq	%r9,%rax
+	addq	%r10,%rax
+	adcq	$0,%r11
+	movq	%rax,120(%rdi)
+	movq	%r11,%rax
+	ret
+
+
+
+.globl	_rsaz_1024_norm2red_avx2
+.private_extern _rsaz_1024_norm2red_avx2
+
+.p2align	5
+_rsaz_1024_norm2red_avx2:
+
+_CET_ENDBR
+	subq	$-128,%rdi
+	movq	(%rsi),%r8
+	movl	$0x1fffffff,%eax
+	movq	8(%rsi),%r9
+	movq	%r8,%r11
+	shrq	$0,%r11
+	andq	%rax,%r11
+	movq	%r11,-128(%rdi)
+	movq	%r8,%r10
+	shrq	$29,%r10
+	andq	%rax,%r10
+	movq	%r10,-120(%rdi)
+	shrdq	$58,%r9,%r8
+	andq	%rax,%r8
+	movq	%r8,-112(%rdi)
+	movq	16(%rsi),%r10
+	movq	%r9,%r8
+	shrq	$23,%r8
+	andq	%rax,%r8
+	movq	%r8,-104(%rdi)
+	shrdq	$52,%r10,%r9
+	andq	%rax,%r9
+	movq	%r9,-96(%rdi)
+	movq	24(%rsi),%r11
+	movq	%r10,%r9
+	shrq	$17,%r9
+	andq	%rax,%r9
+	movq	%r9,-88(%rdi)
+	shrdq	$46,%r11,%r10
+	andq	%rax,%r10
+	movq	%r10,-80(%rdi)
+	movq	32(%rsi),%r8
+	movq	%r11,%r10
+	shrq	$11,%r10
+	andq	%rax,%r10
+	movq	%r10,-72(%rdi)
+	shrdq	$40,%r8,%r11
+	andq	%rax,%r11
+	movq	%r11,-64(%rdi)
+	movq	40(%rsi),%r9
+	movq	%r8,%r11
+	shrq	$5,%r11
+	andq	%rax,%r11
+	movq	%r11,-56(%rdi)
+	movq	%r8,%r10
+	shrq	$34,%r10
+	andq	%rax,%r10
+	movq	%r10,-48(%rdi)
+	shrdq	$63,%r9,%r8
+	andq	%rax,%r8
+	movq	%r8,-40(%rdi)
+	movq	48(%rsi),%r10
+	movq	%r9,%r8
+	shrq	$28,%r8
+	andq	%rax,%r8
+	movq	%r8,-32(%rdi)
+	shrdq	$57,%r10,%r9
+	andq	%rax,%r9
+	movq	%r9,-24(%rdi)
+	movq	56(%rsi),%r11
+	movq	%r10,%r9
+	shrq	$22,%r9
+	andq	%rax,%r9
+	movq	%r9,-16(%rdi)
+	shrdq	$51,%r11,%r10
+	andq	%rax,%r10
+	movq	%r10,-8(%rdi)
+	movq	64(%rsi),%r8
+	movq	%r11,%r10
+	shrq	$16,%r10
+	andq	%rax,%r10
+	movq	%r10,0(%rdi)
+	shrdq	$45,%r8,%r11
+	andq	%rax,%r11
+	movq	%r11,8(%rdi)
+	movq	72(%rsi),%r9
+	movq	%r8,%r11
+	shrq	$10,%r11
+	andq	%rax,%r11
+	movq	%r11,16(%rdi)
+	shrdq	$39,%r9,%r8
+	andq	%rax,%r8
+	movq	%r8,24(%rdi)
+	movq	80(%rsi),%r10
+	movq	%r9,%r8
+	shrq	$4,%r8
+	andq	%rax,%r8
+	movq	%r8,32(%rdi)
+	movq	%r9,%r11
+	shrq	$33,%r11
+	andq	%rax,%r11
+	movq	%r11,40(%rdi)
+	shrdq	$62,%r10,%r9
+	andq	%rax,%r9
+	movq	%r9,48(%rdi)
+	movq	88(%rsi),%r11
+	movq	%r10,%r9
+	shrq	$27,%r9
+	andq	%rax,%r9
+	movq	%r9,56(%rdi)
+	shrdq	$56,%r11,%r10
+	andq	%rax,%r10
+	movq	%r10,64(%rdi)
+	movq	96(%rsi),%r8
+	movq	%r11,%r10
+	shrq	$21,%r10
+	andq	%rax,%r10
+	movq	%r10,72(%rdi)
+	shrdq	$50,%r8,%r11
+	andq	%rax,%r11
+	movq	%r11,80(%rdi)
+	movq	104(%rsi),%r9
+	movq	%r8,%r11
+	shrq	$15,%r11
+	andq	%rax,%r11
+	movq	%r11,88(%rdi)
+	shrdq	$44,%r9,%r8
+	andq	%rax,%r8
+	movq	%r8,96(%rdi)
+	movq	112(%rsi),%r10
+	movq	%r9,%r8
+	shrq	$9,%r8
+	andq	%rax,%r8
+	movq	%r8,104(%rdi)
+	shrdq	$38,%r10,%r9
+	andq	%rax,%r9
+	movq	%r9,112(%rdi)
+	movq	120(%rsi),%r11
+	movq	%r10,%r9
+	shrq	$3,%r9
+	andq	%rax,%r9
+	movq	%r9,120(%rdi)
+	movq	%r10,%r8
+	shrq	$32,%r8
+	andq	%rax,%r8
+	movq	%r8,128(%rdi)
+	shrdq	$61,%r11,%r10
+	andq	%rax,%r10
+	movq	%r10,136(%rdi)
+	xorq	%r8,%r8
+	movq	%r11,%r10
+	shrq	$26,%r10
+	andq	%rax,%r10
+	movq	%r10,144(%rdi)
+	shrdq	$55,%r8,%r11
+	andq	%rax,%r11
+	movq	%r11,152(%rdi)
+	movq	%r8,160(%rdi)
+	movq	%r8,168(%rdi)
+	movq	%r8,176(%rdi)
+	movq	%r8,184(%rdi)
+	ret
+
+
+.globl	_rsaz_1024_scatter5_avx2
+.private_extern _rsaz_1024_scatter5_avx2
+
+.p2align	5
+_rsaz_1024_scatter5_avx2:
+
+_CET_ENDBR
+	vzeroupper
+	vmovdqu	L$scatter_permd(%rip),%ymm5
+	shll	$4,%edx
+	leaq	(%rdi,%rdx,1),%rdi
+	movl	$9,%eax
+	jmp	L$oop_scatter_1024
+
+.p2align	5
+L$oop_scatter_1024:
+	vmovdqu	(%rsi),%ymm0
+	leaq	32(%rsi),%rsi
+	vpermd	%ymm0,%ymm5,%ymm0
+	vmovdqu	%xmm0,(%rdi)
+	leaq	512(%rdi),%rdi
+	decl	%eax
+	jnz	L$oop_scatter_1024
+
+	vzeroupper
+	ret
+
+
+
+.globl	_rsaz_1024_gather5_avx2
+.private_extern _rsaz_1024_gather5_avx2
+
+.p2align	5
+_rsaz_1024_gather5_avx2:
+
+_CET_ENDBR
+	vzeroupper
+	movq	%rsp,%r11
+
+	leaq	-256(%rsp),%rsp
+	andq	$-32,%rsp
+	leaq	L$inc(%rip),%r10
+	leaq	-128(%rsp),%rax
+
+	vmovd	%edx,%xmm4
+	vmovdqa	(%r10),%ymm0
+	vmovdqa	32(%r10),%ymm1
+	vmovdqa	64(%r10),%ymm5
+	vpbroadcastd	%xmm4,%ymm4
+
+	vpaddd	%ymm5,%ymm0,%ymm2
+	vpcmpeqd	%ymm4,%ymm0,%ymm0
+	vpaddd	%ymm5,%ymm1,%ymm3
+	vpcmpeqd	%ymm4,%ymm1,%ymm1
+	vmovdqa	%ymm0,0+128(%rax)
+	vpaddd	%ymm5,%ymm2,%ymm0
+	vpcmpeqd	%ymm4,%ymm2,%ymm2
+	vmovdqa	%ymm1,32+128(%rax)
+	vpaddd	%ymm5,%ymm3,%ymm1
+	vpcmpeqd	%ymm4,%ymm3,%ymm3
+	vmovdqa	%ymm2,64+128(%rax)
+	vpaddd	%ymm5,%ymm0,%ymm2
+	vpcmpeqd	%ymm4,%ymm0,%ymm0
+	vmovdqa	%ymm3,96+128(%rax)
+	vpaddd	%ymm5,%ymm1,%ymm3
+	vpcmpeqd	%ymm4,%ymm1,%ymm1
+	vmovdqa	%ymm0,128+128(%rax)
+	vpaddd	%ymm5,%ymm2,%ymm8
+	vpcmpeqd	%ymm4,%ymm2,%ymm2
+	vmovdqa	%ymm1,160+128(%rax)
+	vpaddd	%ymm5,%ymm3,%ymm9
+	vpcmpeqd	%ymm4,%ymm3,%ymm3
+	vmovdqa	%ymm2,192+128(%rax)
+	vpaddd	%ymm5,%ymm8,%ymm10
+	vpcmpeqd	%ymm4,%ymm8,%ymm8
+	vmovdqa	%ymm3,224+128(%rax)
+	vpaddd	%ymm5,%ymm9,%ymm11
+	vpcmpeqd	%ymm4,%ymm9,%ymm9
+	vpaddd	%ymm5,%ymm10,%ymm12
+	vpcmpeqd	%ymm4,%ymm10,%ymm10
+	vpaddd	%ymm5,%ymm11,%ymm13
+	vpcmpeqd	%ymm4,%ymm11,%ymm11
+	vpaddd	%ymm5,%ymm12,%ymm14
+	vpcmpeqd	%ymm4,%ymm12,%ymm12
+	vpaddd	%ymm5,%ymm13,%ymm15
+	vpcmpeqd	%ymm4,%ymm13,%ymm13
+	vpcmpeqd	%ymm4,%ymm14,%ymm14
+	vpcmpeqd	%ymm4,%ymm15,%ymm15
+
+	vmovdqa	-32(%r10),%ymm7
+	leaq	128(%rsi),%rsi
+	movl	$9,%edx
+
+L$oop_gather_1024:
+	vmovdqa	0-128(%rsi),%ymm0
+	vmovdqa	32-128(%rsi),%ymm1
+	vmovdqa	64-128(%rsi),%ymm2
+	vmovdqa	96-128(%rsi),%ymm3
+	vpand	0+128(%rax),%ymm0,%ymm0
+	vpand	32+128(%rax),%ymm1,%ymm1
+	vpand	64+128(%rax),%ymm2,%ymm2
+	vpor	%ymm0,%ymm1,%ymm4
+	vpand	96+128(%rax),%ymm3,%ymm3
+	vmovdqa	128-128(%rsi),%ymm0
+	vmovdqa	160-128(%rsi),%ymm1
+	vpor	%ymm2,%ymm3,%ymm5
+	vmovdqa	192-128(%rsi),%ymm2
+	vmovdqa	224-128(%rsi),%ymm3
+	vpand	128+128(%rax),%ymm0,%ymm0
+	vpand	160+128(%rax),%ymm1,%ymm1
+	vpand	192+128(%rax),%ymm2,%ymm2
+	vpor	%ymm0,%ymm4,%ymm4
+	vpand	224+128(%rax),%ymm3,%ymm3
+	vpand	256-128(%rsi),%ymm8,%ymm0
+	vpor	%ymm1,%ymm5,%ymm5
+	vpand	288-128(%rsi),%ymm9,%ymm1
+	vpor	%ymm2,%ymm4,%ymm4
+	vpand	320-128(%rsi),%ymm10,%ymm2
+	vpor	%ymm3,%ymm5,%ymm5
+	vpand	352-128(%rsi),%ymm11,%ymm3
+	vpor	%ymm0,%ymm4,%ymm4
+	vpand	384-128(%rsi),%ymm12,%ymm0
+	vpor	%ymm1,%ymm5,%ymm5
+	vpand	416-128(%rsi),%ymm13,%ymm1
+	vpor	%ymm2,%ymm4,%ymm4
+	vpand	448-128(%rsi),%ymm14,%ymm2
+	vpor	%ymm3,%ymm5,%ymm5
+	vpand	480-128(%rsi),%ymm15,%ymm3
+	leaq	512(%rsi),%rsi
+	vpor	%ymm0,%ymm4,%ymm4
+	vpor	%ymm1,%ymm5,%ymm5
+	vpor	%ymm2,%ymm4,%ymm4
+	vpor	%ymm3,%ymm5,%ymm5
+
+	vpor	%ymm5,%ymm4,%ymm4
+	vextracti128	$1,%ymm4,%xmm5
+	vpor	%xmm4,%xmm5,%xmm5
+	vpermd	%ymm5,%ymm7,%ymm5
+	vmovdqu	%ymm5,(%rdi)
+	leaq	32(%rdi),%rdi
+	decl	%edx
+	jnz	L$oop_gather_1024
+
+	vpxor	%ymm0,%ymm0,%ymm0
+	vmovdqu	%ymm0,(%rdi)
+	vzeroupper
+	leaq	(%r11),%rsp
+
+	ret
+
+L$SEH_end_rsaz_1024_gather5:
+
+.section	__DATA,__const
+.p2align	6
+L$and_mask:
+.quad	0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
+L$scatter_permd:
+.long	0,2,4,6,7,7,7,7
+L$gather_permd:
+.long	0,7,1,7,2,7,3,7
+L$inc:
+.long	0,0,0,0, 1,1,1,1
+.long	2,2,2,2, 3,3,3,3
+.long	4,4,4,4, 4,4,4,4
+.p2align	6
+.text	
+#endif
diff --git a/gen/bcm/rsaz-avx2-linux.S b/gen/bcm/rsaz-avx2-linux.S
new file mode 100644
index 0000000..65a6c2e
--- /dev/null
+++ b/gen/bcm/rsaz-avx2-linux.S
@@ -0,0 +1,1749 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text	
+
+.globl	rsaz_1024_sqr_avx2
+.hidden rsaz_1024_sqr_avx2
+.type	rsaz_1024_sqr_avx2,@function
+.align	64
+rsaz_1024_sqr_avx2:
+.cfi_startproc	
+_CET_ENDBR
+	leaq	(%rsp),%rax
+.cfi_def_cfa_register	%rax
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+	vzeroupper
+	movq	%rax,%rbp
+.cfi_def_cfa_register	%rbp
+	movq	%rdx,%r13
+	subq	$832,%rsp
+	movq	%r13,%r15
+	subq	$-128,%rdi
+	subq	$-128,%rsi
+	subq	$-128,%r13
+
+	andq	$4095,%r15
+	addq	$320,%r15
+	shrq	$12,%r15
+	vpxor	%ymm9,%ymm9,%ymm9
+	jz	.Lsqr_1024_no_n_copy
+
+
+
+
+
+	subq	$320,%rsp
+	vmovdqu	0-128(%r13),%ymm0
+	andq	$-2048,%rsp
+	vmovdqu	32-128(%r13),%ymm1
+	vmovdqu	64-128(%r13),%ymm2
+	vmovdqu	96-128(%r13),%ymm3
+	vmovdqu	128-128(%r13),%ymm4
+	vmovdqu	160-128(%r13),%ymm5
+	vmovdqu	192-128(%r13),%ymm6
+	vmovdqu	224-128(%r13),%ymm7
+	vmovdqu	256-128(%r13),%ymm8
+	leaq	832+128(%rsp),%r13
+	vmovdqu	%ymm0,0-128(%r13)
+	vmovdqu	%ymm1,32-128(%r13)
+	vmovdqu	%ymm2,64-128(%r13)
+	vmovdqu	%ymm3,96-128(%r13)
+	vmovdqu	%ymm4,128-128(%r13)
+	vmovdqu	%ymm5,160-128(%r13)
+	vmovdqu	%ymm6,192-128(%r13)
+	vmovdqu	%ymm7,224-128(%r13)
+	vmovdqu	%ymm8,256-128(%r13)
+	vmovdqu	%ymm9,288-128(%r13)
+
+.Lsqr_1024_no_n_copy:
+	andq	$-1024,%rsp
+
+	vmovdqu	32-128(%rsi),%ymm1
+	vmovdqu	64-128(%rsi),%ymm2
+	vmovdqu	96-128(%rsi),%ymm3
+	vmovdqu	128-128(%rsi),%ymm4
+	vmovdqu	160-128(%rsi),%ymm5
+	vmovdqu	192-128(%rsi),%ymm6
+	vmovdqu	224-128(%rsi),%ymm7
+	vmovdqu	256-128(%rsi),%ymm8
+
+	leaq	192(%rsp),%rbx
+	vmovdqu	.Land_mask(%rip),%ymm15
+	jmp	.LOOP_GRANDE_SQR_1024
+
+.align	32
+.LOOP_GRANDE_SQR_1024:
+	leaq	576+128(%rsp),%r9
+	leaq	448(%rsp),%r12
+
+
+
+
+	vpaddq	%ymm1,%ymm1,%ymm1
+	vpbroadcastq	0-128(%rsi),%ymm10
+	vpaddq	%ymm2,%ymm2,%ymm2
+	vmovdqa	%ymm1,0-128(%r9)
+	vpaddq	%ymm3,%ymm3,%ymm3
+	vmovdqa	%ymm2,32-128(%r9)
+	vpaddq	%ymm4,%ymm4,%ymm4
+	vmovdqa	%ymm3,64-128(%r9)
+	vpaddq	%ymm5,%ymm5,%ymm5
+	vmovdqa	%ymm4,96-128(%r9)
+	vpaddq	%ymm6,%ymm6,%ymm6
+	vmovdqa	%ymm5,128-128(%r9)
+	vpaddq	%ymm7,%ymm7,%ymm7
+	vmovdqa	%ymm6,160-128(%r9)
+	vpaddq	%ymm8,%ymm8,%ymm8
+	vmovdqa	%ymm7,192-128(%r9)
+	vpxor	%ymm9,%ymm9,%ymm9
+	vmovdqa	%ymm8,224-128(%r9)
+
+	vpmuludq	0-128(%rsi),%ymm10,%ymm0
+	vpbroadcastq	32-128(%rsi),%ymm11
+	vmovdqu	%ymm9,288-192(%rbx)
+	vpmuludq	%ymm10,%ymm1,%ymm1
+	vmovdqu	%ymm9,320-448(%r12)
+	vpmuludq	%ymm10,%ymm2,%ymm2
+	vmovdqu	%ymm9,352-448(%r12)
+	vpmuludq	%ymm10,%ymm3,%ymm3
+	vmovdqu	%ymm9,384-448(%r12)
+	vpmuludq	%ymm10,%ymm4,%ymm4
+	vmovdqu	%ymm9,416-448(%r12)
+	vpmuludq	%ymm10,%ymm5,%ymm5
+	vmovdqu	%ymm9,448-448(%r12)
+	vpmuludq	%ymm10,%ymm6,%ymm6
+	vmovdqu	%ymm9,480-448(%r12)
+	vpmuludq	%ymm10,%ymm7,%ymm7
+	vmovdqu	%ymm9,512-448(%r12)
+	vpmuludq	%ymm10,%ymm8,%ymm8
+	vpbroadcastq	64-128(%rsi),%ymm10
+	vmovdqu	%ymm9,544-448(%r12)
+
+	movq	%rsi,%r15
+	movl	$4,%r14d
+	jmp	.Lsqr_entry_1024
+.align	32
+.LOOP_SQR_1024:
+	vpbroadcastq	32-128(%r15),%ymm11
+	vpmuludq	0-128(%rsi),%ymm10,%ymm0
+	vpaddq	0-192(%rbx),%ymm0,%ymm0
+	vpmuludq	0-128(%r9),%ymm10,%ymm1
+	vpaddq	32-192(%rbx),%ymm1,%ymm1
+	vpmuludq	32-128(%r9),%ymm10,%ymm2
+	vpaddq	64-192(%rbx),%ymm2,%ymm2
+	vpmuludq	64-128(%r9),%ymm10,%ymm3
+	vpaddq	96-192(%rbx),%ymm3,%ymm3
+	vpmuludq	96-128(%r9),%ymm10,%ymm4
+	vpaddq	128-192(%rbx),%ymm4,%ymm4
+	vpmuludq	128-128(%r9),%ymm10,%ymm5
+	vpaddq	160-192(%rbx),%ymm5,%ymm5
+	vpmuludq	160-128(%r9),%ymm10,%ymm6
+	vpaddq	192-192(%rbx),%ymm6,%ymm6
+	vpmuludq	192-128(%r9),%ymm10,%ymm7
+	vpaddq	224-192(%rbx),%ymm7,%ymm7
+	vpmuludq	224-128(%r9),%ymm10,%ymm8
+	vpbroadcastq	64-128(%r15),%ymm10
+	vpaddq	256-192(%rbx),%ymm8,%ymm8
+.Lsqr_entry_1024:
+	vmovdqu	%ymm0,0-192(%rbx)
+	vmovdqu	%ymm1,32-192(%rbx)
+
+	vpmuludq	32-128(%rsi),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	32-128(%r9),%ymm11,%ymm14
+	vpaddq	%ymm14,%ymm3,%ymm3
+	vpmuludq	64-128(%r9),%ymm11,%ymm13
+	vpaddq	%ymm13,%ymm4,%ymm4
+	vpmuludq	96-128(%r9),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	128-128(%r9),%ymm11,%ymm14
+	vpaddq	%ymm14,%ymm6,%ymm6
+	vpmuludq	160-128(%r9),%ymm11,%ymm13
+	vpaddq	%ymm13,%ymm7,%ymm7
+	vpmuludq	192-128(%r9),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	224-128(%r9),%ymm11,%ymm0
+	vpbroadcastq	96-128(%r15),%ymm11
+	vpaddq	288-192(%rbx),%ymm0,%ymm0
+
+	vmovdqu	%ymm2,64-192(%rbx)
+	vmovdqu	%ymm3,96-192(%rbx)
+
+	vpmuludq	64-128(%rsi),%ymm10,%ymm13
+	vpaddq	%ymm13,%ymm4,%ymm4
+	vpmuludq	64-128(%r9),%ymm10,%ymm12
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	96-128(%r9),%ymm10,%ymm14
+	vpaddq	%ymm14,%ymm6,%ymm6
+	vpmuludq	128-128(%r9),%ymm10,%ymm13
+	vpaddq	%ymm13,%ymm7,%ymm7
+	vpmuludq	160-128(%r9),%ymm10,%ymm12
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	192-128(%r9),%ymm10,%ymm14
+	vpaddq	%ymm14,%ymm0,%ymm0
+	vpmuludq	224-128(%r9),%ymm10,%ymm1
+	vpbroadcastq	128-128(%r15),%ymm10
+	vpaddq	320-448(%r12),%ymm1,%ymm1
+
+	vmovdqu	%ymm4,128-192(%rbx)
+	vmovdqu	%ymm5,160-192(%rbx)
+
+	vpmuludq	96-128(%rsi),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm6,%ymm6
+	vpmuludq	96-128(%r9),%ymm11,%ymm14
+	vpaddq	%ymm14,%ymm7,%ymm7
+	vpmuludq	128-128(%r9),%ymm11,%ymm13
+	vpaddq	%ymm13,%ymm8,%ymm8
+	vpmuludq	160-128(%r9),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm0,%ymm0
+	vpmuludq	192-128(%r9),%ymm11,%ymm14
+	vpaddq	%ymm14,%ymm1,%ymm1
+	vpmuludq	224-128(%r9),%ymm11,%ymm2
+	vpbroadcastq	160-128(%r15),%ymm11
+	vpaddq	352-448(%r12),%ymm2,%ymm2
+
+	vmovdqu	%ymm6,192-192(%rbx)
+	vmovdqu	%ymm7,224-192(%rbx)
+
+	vpmuludq	128-128(%rsi),%ymm10,%ymm12
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	128-128(%r9),%ymm10,%ymm14
+	vpaddq	%ymm14,%ymm0,%ymm0
+	vpmuludq	160-128(%r9),%ymm10,%ymm13
+	vpaddq	%ymm13,%ymm1,%ymm1
+	vpmuludq	192-128(%r9),%ymm10,%ymm12
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	224-128(%r9),%ymm10,%ymm3
+	vpbroadcastq	192-128(%r15),%ymm10
+	vpaddq	384-448(%r12),%ymm3,%ymm3
+
+	vmovdqu	%ymm8,256-192(%rbx)
+	vmovdqu	%ymm0,288-192(%rbx)
+	leaq	8(%rbx),%rbx
+
+	vpmuludq	160-128(%rsi),%ymm11,%ymm13
+	vpaddq	%ymm13,%ymm1,%ymm1
+	vpmuludq	160-128(%r9),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	192-128(%r9),%ymm11,%ymm14
+	vpaddq	%ymm14,%ymm3,%ymm3
+	vpmuludq	224-128(%r9),%ymm11,%ymm4
+	vpbroadcastq	224-128(%r15),%ymm11
+	vpaddq	416-448(%r12),%ymm4,%ymm4
+
+	vmovdqu	%ymm1,320-448(%r12)
+	vmovdqu	%ymm2,352-448(%r12)
+
+	vpmuludq	192-128(%rsi),%ymm10,%ymm12
+	vpaddq	%ymm12,%ymm3,%ymm3
+	vpmuludq	192-128(%r9),%ymm10,%ymm14
+	vpbroadcastq	256-128(%r15),%ymm0
+	vpaddq	%ymm14,%ymm4,%ymm4
+	vpmuludq	224-128(%r9),%ymm10,%ymm5
+	vpbroadcastq	0+8-128(%r15),%ymm10
+	vpaddq	448-448(%r12),%ymm5,%ymm5
+
+	vmovdqu	%ymm3,384-448(%r12)
+	vmovdqu	%ymm4,416-448(%r12)
+	leaq	8(%r15),%r15
+
+	vpmuludq	224-128(%rsi),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	224-128(%r9),%ymm11,%ymm6
+	vpaddq	480-448(%r12),%ymm6,%ymm6
+
+	vpmuludq	256-128(%rsi),%ymm0,%ymm7
+	vmovdqu	%ymm5,448-448(%r12)
+	vpaddq	512-448(%r12),%ymm7,%ymm7
+	vmovdqu	%ymm6,480-448(%r12)
+	vmovdqu	%ymm7,512-448(%r12)
+	leaq	8(%r12),%r12
+
+	decl	%r14d
+	jnz	.LOOP_SQR_1024
+
+	vmovdqu	256(%rsp),%ymm8
+	vmovdqu	288(%rsp),%ymm1
+	vmovdqu	320(%rsp),%ymm2
+	leaq	192(%rsp),%rbx
+
+	vpsrlq	$29,%ymm8,%ymm14
+	vpand	%ymm15,%ymm8,%ymm8
+	vpsrlq	$29,%ymm1,%ymm11
+	vpand	%ymm15,%ymm1,%ymm1
+
+	vpermq	$0x93,%ymm14,%ymm14
+	vpxor	%ymm9,%ymm9,%ymm9
+	vpermq	$0x93,%ymm11,%ymm11
+
+	vpblendd	$3,%ymm9,%ymm14,%ymm10
+	vpblendd	$3,%ymm14,%ymm11,%ymm14
+	vpaddq	%ymm10,%ymm8,%ymm8
+	vpblendd	$3,%ymm11,%ymm9,%ymm11
+	vpaddq	%ymm14,%ymm1,%ymm1
+	vpaddq	%ymm11,%ymm2,%ymm2
+	vmovdqu	%ymm1,288-192(%rbx)
+	vmovdqu	%ymm2,320-192(%rbx)
+
+	movq	(%rsp),%rax
+	movq	8(%rsp),%r10
+	movq	16(%rsp),%r11
+	movq	24(%rsp),%r12
+	vmovdqu	32(%rsp),%ymm1
+	vmovdqu	64-192(%rbx),%ymm2
+	vmovdqu	96-192(%rbx),%ymm3
+	vmovdqu	128-192(%rbx),%ymm4
+	vmovdqu	160-192(%rbx),%ymm5
+	vmovdqu	192-192(%rbx),%ymm6
+	vmovdqu	224-192(%rbx),%ymm7
+
+	movq	%rax,%r9
+	imull	%ecx,%eax
+	andl	$0x1fffffff,%eax
+	vmovd	%eax,%xmm12
+
+	movq	%rax,%rdx
+	imulq	-128(%r13),%rax
+	vpbroadcastq	%xmm12,%ymm12
+	addq	%rax,%r9
+	movq	%rdx,%rax
+	imulq	8-128(%r13),%rax
+	shrq	$29,%r9
+	addq	%rax,%r10
+	movq	%rdx,%rax
+	imulq	16-128(%r13),%rax
+	addq	%r9,%r10
+	addq	%rax,%r11
+	imulq	24-128(%r13),%rdx
+	addq	%rdx,%r12
+
+	movq	%r10,%rax
+	imull	%ecx,%eax
+	andl	$0x1fffffff,%eax
+
+	movl	$9,%r14d
+	jmp	.LOOP_REDUCE_1024
+
+.align	32
+.LOOP_REDUCE_1024:
+	vmovd	%eax,%xmm13
+	vpbroadcastq	%xmm13,%ymm13
+
+	vpmuludq	32-128(%r13),%ymm12,%ymm10
+	movq	%rax,%rdx
+	imulq	-128(%r13),%rax
+	vpaddq	%ymm10,%ymm1,%ymm1
+	addq	%rax,%r10
+	vpmuludq	64-128(%r13),%ymm12,%ymm14
+	movq	%rdx,%rax
+	imulq	8-128(%r13),%rax
+	vpaddq	%ymm14,%ymm2,%ymm2
+	vpmuludq	96-128(%r13),%ymm12,%ymm11
+.byte	0x67
+	addq	%rax,%r11
+.byte	0x67
+	movq	%rdx,%rax
+	imulq	16-128(%r13),%rax
+	shrq	$29,%r10
+	vpaddq	%ymm11,%ymm3,%ymm3
+	vpmuludq	128-128(%r13),%ymm12,%ymm10
+	addq	%rax,%r12
+	addq	%r10,%r11
+	vpaddq	%ymm10,%ymm4,%ymm4
+	vpmuludq	160-128(%r13),%ymm12,%ymm14
+	movq	%r11,%rax
+	imull	%ecx,%eax
+	vpaddq	%ymm14,%ymm5,%ymm5
+	vpmuludq	192-128(%r13),%ymm12,%ymm11
+	andl	$0x1fffffff,%eax
+	vpaddq	%ymm11,%ymm6,%ymm6
+	vpmuludq	224-128(%r13),%ymm12,%ymm10
+	vpaddq	%ymm10,%ymm7,%ymm7
+	vpmuludq	256-128(%r13),%ymm12,%ymm14
+	vmovd	%eax,%xmm12
+
+	vpaddq	%ymm14,%ymm8,%ymm8
+
+	vpbroadcastq	%xmm12,%ymm12
+
+	vpmuludq	32-8-128(%r13),%ymm13,%ymm11
+	vmovdqu	96-8-128(%r13),%ymm14
+	movq	%rax,%rdx
+	imulq	-128(%r13),%rax
+	vpaddq	%ymm11,%ymm1,%ymm1
+	vpmuludq	64-8-128(%r13),%ymm13,%ymm10
+	vmovdqu	128-8-128(%r13),%ymm11
+	addq	%rax,%r11
+	movq	%rdx,%rax
+	imulq	8-128(%r13),%rax
+	vpaddq	%ymm10,%ymm2,%ymm2
+	addq	%r12,%rax
+	shrq	$29,%r11
+	vpmuludq	%ymm13,%ymm14,%ymm14
+	vmovdqu	160-8-128(%r13),%ymm10
+	addq	%r11,%rax
+	vpaddq	%ymm14,%ymm3,%ymm3
+	vpmuludq	%ymm13,%ymm11,%ymm11
+	vmovdqu	192-8-128(%r13),%ymm14
+.byte	0x67
+	movq	%rax,%r12
+	imull	%ecx,%eax
+	vpaddq	%ymm11,%ymm4,%ymm4
+	vpmuludq	%ymm13,%ymm10,%ymm10
+.byte	0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00
+	andl	$0x1fffffff,%eax
+	vpaddq	%ymm10,%ymm5,%ymm5
+	vpmuludq	%ymm13,%ymm14,%ymm14
+	vmovdqu	256-8-128(%r13),%ymm10
+	vpaddq	%ymm14,%ymm6,%ymm6
+	vpmuludq	%ymm13,%ymm11,%ymm11
+	vmovdqu	288-8-128(%r13),%ymm9
+	vmovd	%eax,%xmm0
+	imulq	-128(%r13),%rax
+	vpaddq	%ymm11,%ymm7,%ymm7
+	vpmuludq	%ymm13,%ymm10,%ymm10
+	vmovdqu	32-16-128(%r13),%ymm14
+	vpbroadcastq	%xmm0,%ymm0
+	vpaddq	%ymm10,%ymm8,%ymm8
+	vpmuludq	%ymm13,%ymm9,%ymm9
+	vmovdqu	64-16-128(%r13),%ymm11
+	addq	%rax,%r12
+
+	vmovdqu	32-24-128(%r13),%ymm13
+	vpmuludq	%ymm12,%ymm14,%ymm14
+	vmovdqu	96-16-128(%r13),%ymm10
+	vpaddq	%ymm14,%ymm1,%ymm1
+	vpmuludq	%ymm0,%ymm13,%ymm13
+	vpmuludq	%ymm12,%ymm11,%ymm11
+.byte	0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff
+	vpaddq	%ymm1,%ymm13,%ymm13
+	vpaddq	%ymm11,%ymm2,%ymm2
+	vpmuludq	%ymm12,%ymm10,%ymm10
+	vmovdqu	160-16-128(%r13),%ymm11
+.byte	0x67
+	vmovq	%xmm13,%rax
+	vmovdqu	%ymm13,(%rsp)
+	vpaddq	%ymm10,%ymm3,%ymm3
+	vpmuludq	%ymm12,%ymm14,%ymm14
+	vmovdqu	192-16-128(%r13),%ymm10
+	vpaddq	%ymm14,%ymm4,%ymm4
+	vpmuludq	%ymm12,%ymm11,%ymm11
+	vmovdqu	224-16-128(%r13),%ymm14
+	vpaddq	%ymm11,%ymm5,%ymm5
+	vpmuludq	%ymm12,%ymm10,%ymm10
+	vmovdqu	256-16-128(%r13),%ymm11
+	vpaddq	%ymm10,%ymm6,%ymm6
+	vpmuludq	%ymm12,%ymm14,%ymm14
+	shrq	$29,%r12
+	vmovdqu	288-16-128(%r13),%ymm10
+	addq	%r12,%rax
+	vpaddq	%ymm14,%ymm7,%ymm7
+	vpmuludq	%ymm12,%ymm11,%ymm11
+
+	movq	%rax,%r9
+	imull	%ecx,%eax
+	vpaddq	%ymm11,%ymm8,%ymm8
+	vpmuludq	%ymm12,%ymm10,%ymm10
+	andl	$0x1fffffff,%eax
+	vmovd	%eax,%xmm12
+	vmovdqu	96-24-128(%r13),%ymm11
+.byte	0x67
+	vpaddq	%ymm10,%ymm9,%ymm9
+	vpbroadcastq	%xmm12,%ymm12
+
+	vpmuludq	64-24-128(%r13),%ymm0,%ymm14
+	vmovdqu	128-24-128(%r13),%ymm10
+	movq	%rax,%rdx
+	imulq	-128(%r13),%rax
+	movq	8(%rsp),%r10
+	vpaddq	%ymm14,%ymm2,%ymm1
+	vpmuludq	%ymm0,%ymm11,%ymm11
+	vmovdqu	160-24-128(%r13),%ymm14
+	addq	%rax,%r9
+	movq	%rdx,%rax
+	imulq	8-128(%r13),%rax
+.byte	0x67
+	shrq	$29,%r9
+	movq	16(%rsp),%r11
+	vpaddq	%ymm11,%ymm3,%ymm2
+	vpmuludq	%ymm0,%ymm10,%ymm10
+	vmovdqu	192-24-128(%r13),%ymm11
+	addq	%rax,%r10
+	movq	%rdx,%rax
+	imulq	16-128(%r13),%rax
+	vpaddq	%ymm10,%ymm4,%ymm3
+	vpmuludq	%ymm0,%ymm14,%ymm14
+	vmovdqu	224-24-128(%r13),%ymm10
+	imulq	24-128(%r13),%rdx
+	addq	%rax,%r11
+	leaq	(%r9,%r10,1),%rax
+	vpaddq	%ymm14,%ymm5,%ymm4
+	vpmuludq	%ymm0,%ymm11,%ymm11
+	vmovdqu	256-24-128(%r13),%ymm14
+	movq	%rax,%r10
+	imull	%ecx,%eax
+	vpmuludq	%ymm0,%ymm10,%ymm10
+	vpaddq	%ymm11,%ymm6,%ymm5
+	vmovdqu	288-24-128(%r13),%ymm11
+	andl	$0x1fffffff,%eax
+	vpaddq	%ymm10,%ymm7,%ymm6
+	vpmuludq	%ymm0,%ymm14,%ymm14
+	addq	24(%rsp),%rdx
+	vpaddq	%ymm14,%ymm8,%ymm7
+	vpmuludq	%ymm0,%ymm11,%ymm11
+	vpaddq	%ymm11,%ymm9,%ymm8
+	vmovq	%r12,%xmm9
+	movq	%rdx,%r12
+
+	decl	%r14d
+	jnz	.LOOP_REDUCE_1024
+	leaq	448(%rsp),%r12
+	vpaddq	%ymm9,%ymm13,%ymm0
+	vpxor	%ymm9,%ymm9,%ymm9
+
+	vpaddq	288-192(%rbx),%ymm0,%ymm0
+	vpaddq	320-448(%r12),%ymm1,%ymm1
+	vpaddq	352-448(%r12),%ymm2,%ymm2
+	vpaddq	384-448(%r12),%ymm3,%ymm3
+	vpaddq	416-448(%r12),%ymm4,%ymm4
+	vpaddq	448-448(%r12),%ymm5,%ymm5
+	vpaddq	480-448(%r12),%ymm6,%ymm6
+	vpaddq	512-448(%r12),%ymm7,%ymm7
+	vpaddq	544-448(%r12),%ymm8,%ymm8
+
+	vpsrlq	$29,%ymm0,%ymm14
+	vpand	%ymm15,%ymm0,%ymm0
+	vpsrlq	$29,%ymm1,%ymm11
+	vpand	%ymm15,%ymm1,%ymm1
+	vpsrlq	$29,%ymm2,%ymm12
+	vpermq	$0x93,%ymm14,%ymm14
+	vpand	%ymm15,%ymm2,%ymm2
+	vpsrlq	$29,%ymm3,%ymm13
+	vpermq	$0x93,%ymm11,%ymm11
+	vpand	%ymm15,%ymm3,%ymm3
+	vpermq	$0x93,%ymm12,%ymm12
+
+	vpblendd	$3,%ymm9,%ymm14,%ymm10
+	vpermq	$0x93,%ymm13,%ymm13
+	vpblendd	$3,%ymm14,%ymm11,%ymm14
+	vpaddq	%ymm10,%ymm0,%ymm0
+	vpblendd	$3,%ymm11,%ymm12,%ymm11
+	vpaddq	%ymm14,%ymm1,%ymm1
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm11,%ymm2,%ymm2
+	vpblendd	$3,%ymm13,%ymm9,%ymm13
+	vpaddq	%ymm12,%ymm3,%ymm3
+	vpaddq	%ymm13,%ymm4,%ymm4
+
+	vpsrlq	$29,%ymm0,%ymm14
+	vpand	%ymm15,%ymm0,%ymm0
+	vpsrlq	$29,%ymm1,%ymm11
+	vpand	%ymm15,%ymm1,%ymm1
+	vpsrlq	$29,%ymm2,%ymm12
+	vpermq	$0x93,%ymm14,%ymm14
+	vpand	%ymm15,%ymm2,%ymm2
+	vpsrlq	$29,%ymm3,%ymm13
+	vpermq	$0x93,%ymm11,%ymm11
+	vpand	%ymm15,%ymm3,%ymm3
+	vpermq	$0x93,%ymm12,%ymm12
+
+	vpblendd	$3,%ymm9,%ymm14,%ymm10
+	vpermq	$0x93,%ymm13,%ymm13
+	vpblendd	$3,%ymm14,%ymm11,%ymm14
+	vpaddq	%ymm10,%ymm0,%ymm0
+	vpblendd	$3,%ymm11,%ymm12,%ymm11
+	vpaddq	%ymm14,%ymm1,%ymm1
+	vmovdqu	%ymm0,0-128(%rdi)
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm11,%ymm2,%ymm2
+	vmovdqu	%ymm1,32-128(%rdi)
+	vpblendd	$3,%ymm13,%ymm9,%ymm13
+	vpaddq	%ymm12,%ymm3,%ymm3
+	vmovdqu	%ymm2,64-128(%rdi)
+	vpaddq	%ymm13,%ymm4,%ymm4
+	vmovdqu	%ymm3,96-128(%rdi)
+	vpsrlq	$29,%ymm4,%ymm14
+	vpand	%ymm15,%ymm4,%ymm4
+	vpsrlq	$29,%ymm5,%ymm11
+	vpand	%ymm15,%ymm5,%ymm5
+	vpsrlq	$29,%ymm6,%ymm12
+	vpermq	$0x93,%ymm14,%ymm14
+	vpand	%ymm15,%ymm6,%ymm6
+	vpsrlq	$29,%ymm7,%ymm13
+	vpermq	$0x93,%ymm11,%ymm11
+	vpand	%ymm15,%ymm7,%ymm7
+	vpsrlq	$29,%ymm8,%ymm0
+	vpermq	$0x93,%ymm12,%ymm12
+	vpand	%ymm15,%ymm8,%ymm8
+	vpermq	$0x93,%ymm13,%ymm13
+
+	vpblendd	$3,%ymm9,%ymm14,%ymm10
+	vpermq	$0x93,%ymm0,%ymm0
+	vpblendd	$3,%ymm14,%ymm11,%ymm14
+	vpaddq	%ymm10,%ymm4,%ymm4
+	vpblendd	$3,%ymm11,%ymm12,%ymm11
+	vpaddq	%ymm14,%ymm5,%ymm5
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm11,%ymm6,%ymm6
+	vpblendd	$3,%ymm13,%ymm0,%ymm13
+	vpaddq	%ymm12,%ymm7,%ymm7
+	vpaddq	%ymm13,%ymm8,%ymm8
+
+	vpsrlq	$29,%ymm4,%ymm14
+	vpand	%ymm15,%ymm4,%ymm4
+	vpsrlq	$29,%ymm5,%ymm11
+	vpand	%ymm15,%ymm5,%ymm5
+	vpsrlq	$29,%ymm6,%ymm12
+	vpermq	$0x93,%ymm14,%ymm14
+	vpand	%ymm15,%ymm6,%ymm6
+	vpsrlq	$29,%ymm7,%ymm13
+	vpermq	$0x93,%ymm11,%ymm11
+	vpand	%ymm15,%ymm7,%ymm7
+	vpsrlq	$29,%ymm8,%ymm0
+	vpermq	$0x93,%ymm12,%ymm12
+	vpand	%ymm15,%ymm8,%ymm8
+	vpermq	$0x93,%ymm13,%ymm13
+
+	vpblendd	$3,%ymm9,%ymm14,%ymm10
+	vpermq	$0x93,%ymm0,%ymm0
+	vpblendd	$3,%ymm14,%ymm11,%ymm14
+	vpaddq	%ymm10,%ymm4,%ymm4
+	vpblendd	$3,%ymm11,%ymm12,%ymm11
+	vpaddq	%ymm14,%ymm5,%ymm5
+	vmovdqu	%ymm4,128-128(%rdi)
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm11,%ymm6,%ymm6
+	vmovdqu	%ymm5,160-128(%rdi)
+	vpblendd	$3,%ymm13,%ymm0,%ymm13
+	vpaddq	%ymm12,%ymm7,%ymm7
+	vmovdqu	%ymm6,192-128(%rdi)
+	vpaddq	%ymm13,%ymm8,%ymm8
+	vmovdqu	%ymm7,224-128(%rdi)
+	vmovdqu	%ymm8,256-128(%rdi)
+
+	movq	%rdi,%rsi
+	decl	%r8d
+	jne	.LOOP_GRANDE_SQR_1024
+
+	vzeroall
+	movq	%rbp,%rax
+.cfi_def_cfa_register	%rax
+	movq	-48(%rax),%r15
+.cfi_restore	%r15
+	movq	-40(%rax),%r14
+.cfi_restore	%r14
+	movq	-32(%rax),%r13
+.cfi_restore	%r13
+	movq	-24(%rax),%r12
+.cfi_restore	%r12
+	movq	-16(%rax),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rax),%rbx
+.cfi_restore	%rbx
+	leaq	(%rax),%rsp
+.cfi_def_cfa_register	%rsp
+.Lsqr_1024_epilogue:
+	ret
+.cfi_endproc	
+.size	rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
+.globl	rsaz_1024_mul_avx2
+.hidden rsaz_1024_mul_avx2
+.type	rsaz_1024_mul_avx2,@function
+.align	64
+rsaz_1024_mul_avx2:
+.cfi_startproc	
+_CET_ENDBR
+	leaq	(%rsp),%rax
+.cfi_def_cfa_register	%rax
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+	movq	%rax,%rbp
+.cfi_def_cfa_register	%rbp
+	vzeroall
+	movq	%rdx,%r13
+	subq	$64,%rsp
+
+
+
+
+
+
+.byte	0x67,0x67
+	movq	%rsi,%r15
+	andq	$4095,%r15
+	addq	$320,%r15
+	shrq	$12,%r15
+	movq	%rsi,%r15
+	cmovnzq	%r13,%rsi
+	cmovnzq	%r15,%r13
+
+	movq	%rcx,%r15
+	subq	$-128,%rsi
+	subq	$-128,%rcx
+	subq	$-128,%rdi
+
+	andq	$4095,%r15
+	addq	$320,%r15
+.byte	0x67,0x67
+	shrq	$12,%r15
+	jz	.Lmul_1024_no_n_copy
+
+
+
+
+
+	subq	$320,%rsp
+	vmovdqu	0-128(%rcx),%ymm0
+	andq	$-512,%rsp
+	vmovdqu	32-128(%rcx),%ymm1
+	vmovdqu	64-128(%rcx),%ymm2
+	vmovdqu	96-128(%rcx),%ymm3
+	vmovdqu	128-128(%rcx),%ymm4
+	vmovdqu	160-128(%rcx),%ymm5
+	vmovdqu	192-128(%rcx),%ymm6
+	vmovdqu	224-128(%rcx),%ymm7
+	vmovdqu	256-128(%rcx),%ymm8
+	leaq	64+128(%rsp),%rcx
+	vmovdqu	%ymm0,0-128(%rcx)
+	vpxor	%ymm0,%ymm0,%ymm0
+	vmovdqu	%ymm1,32-128(%rcx)
+	vpxor	%ymm1,%ymm1,%ymm1
+	vmovdqu	%ymm2,64-128(%rcx)
+	vpxor	%ymm2,%ymm2,%ymm2
+	vmovdqu	%ymm3,96-128(%rcx)
+	vpxor	%ymm3,%ymm3,%ymm3
+	vmovdqu	%ymm4,128-128(%rcx)
+	vpxor	%ymm4,%ymm4,%ymm4
+	vmovdqu	%ymm5,160-128(%rcx)
+	vpxor	%ymm5,%ymm5,%ymm5
+	vmovdqu	%ymm6,192-128(%rcx)
+	vpxor	%ymm6,%ymm6,%ymm6
+	vmovdqu	%ymm7,224-128(%rcx)
+	vpxor	%ymm7,%ymm7,%ymm7
+	vmovdqu	%ymm8,256-128(%rcx)
+	vmovdqa	%ymm0,%ymm8
+	vmovdqu	%ymm9,288-128(%rcx)
+.Lmul_1024_no_n_copy:
+	andq	$-64,%rsp
+
+	movq	(%r13),%rbx
+	vpbroadcastq	(%r13),%ymm10
+	vmovdqu	%ymm0,(%rsp)
+	xorq	%r9,%r9
+.byte	0x67
+	xorq	%r10,%r10
+	xorq	%r11,%r11
+	xorq	%r12,%r12
+
+	vmovdqu	.Land_mask(%rip),%ymm15
+	movl	$9,%r14d
+	vmovdqu	%ymm9,288-128(%rdi)
+	jmp	.Loop_mul_1024
+
+.align	32
+.Loop_mul_1024:
+	vpsrlq	$29,%ymm3,%ymm9
+	movq	%rbx,%rax
+	imulq	-128(%rsi),%rax
+	addq	%r9,%rax
+	movq	%rbx,%r10
+	imulq	8-128(%rsi),%r10
+	addq	8(%rsp),%r10
+
+	movq	%rax,%r9
+	imull	%r8d,%eax
+	andl	$0x1fffffff,%eax
+
+	movq	%rbx,%r11
+	imulq	16-128(%rsi),%r11
+	addq	16(%rsp),%r11
+
+	movq	%rbx,%r12
+	imulq	24-128(%rsi),%r12
+	addq	24(%rsp),%r12
+	vpmuludq	32-128(%rsi),%ymm10,%ymm0
+	vmovd	%eax,%xmm11
+	vpaddq	%ymm0,%ymm1,%ymm1
+	vpmuludq	64-128(%rsi),%ymm10,%ymm12
+	vpbroadcastq	%xmm11,%ymm11
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	96-128(%rsi),%ymm10,%ymm13
+	vpand	%ymm15,%ymm3,%ymm3
+	vpaddq	%ymm13,%ymm3,%ymm3
+	vpmuludq	128-128(%rsi),%ymm10,%ymm0
+	vpaddq	%ymm0,%ymm4,%ymm4
+	vpmuludq	160-128(%rsi),%ymm10,%ymm12
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	192-128(%rsi),%ymm10,%ymm13
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpmuludq	224-128(%rsi),%ymm10,%ymm0
+	vpermq	$0x93,%ymm9,%ymm9
+	vpaddq	%ymm0,%ymm7,%ymm7
+	vpmuludq	256-128(%rsi),%ymm10,%ymm12
+	vpbroadcastq	8(%r13),%ymm10
+	vpaddq	%ymm12,%ymm8,%ymm8
+
+	movq	%rax,%rdx
+	imulq	-128(%rcx),%rax
+	addq	%rax,%r9
+	movq	%rdx,%rax
+	imulq	8-128(%rcx),%rax
+	addq	%rax,%r10
+	movq	%rdx,%rax
+	imulq	16-128(%rcx),%rax
+	addq	%rax,%r11
+	shrq	$29,%r9
+	imulq	24-128(%rcx),%rdx
+	addq	%rdx,%r12
+	addq	%r9,%r10
+
+	vpmuludq	32-128(%rcx),%ymm11,%ymm13
+	vmovq	%xmm10,%rbx
+	vpaddq	%ymm13,%ymm1,%ymm1
+	vpmuludq	64-128(%rcx),%ymm11,%ymm0
+	vpaddq	%ymm0,%ymm2,%ymm2
+	vpmuludq	96-128(%rcx),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm3,%ymm3
+	vpmuludq	128-128(%rcx),%ymm11,%ymm13
+	vpaddq	%ymm13,%ymm4,%ymm4
+	vpmuludq	160-128(%rcx),%ymm11,%ymm0
+	vpaddq	%ymm0,%ymm5,%ymm5
+	vpmuludq	192-128(%rcx),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm6,%ymm6
+	vpmuludq	224-128(%rcx),%ymm11,%ymm13
+	vpblendd	$3,%ymm14,%ymm9,%ymm12
+	vpaddq	%ymm13,%ymm7,%ymm7
+	vpmuludq	256-128(%rcx),%ymm11,%ymm0
+	vpaddq	%ymm12,%ymm3,%ymm3
+	vpaddq	%ymm0,%ymm8,%ymm8
+
+	movq	%rbx,%rax
+	imulq	-128(%rsi),%rax
+	addq	%rax,%r10
+	vmovdqu	-8+32-128(%rsi),%ymm12
+	movq	%rbx,%rax
+	imulq	8-128(%rsi),%rax
+	addq	%rax,%r11
+	vmovdqu	-8+64-128(%rsi),%ymm13
+
+	movq	%r10,%rax
+	vpblendd	$0xfc,%ymm14,%ymm9,%ymm9
+	imull	%r8d,%eax
+	vpaddq	%ymm9,%ymm4,%ymm4
+	andl	$0x1fffffff,%eax
+
+	imulq	16-128(%rsi),%rbx
+	addq	%rbx,%r12
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vmovd	%eax,%xmm11
+	vmovdqu	-8+96-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm1,%ymm1
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vpbroadcastq	%xmm11,%ymm11
+	vmovdqu	-8+128-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm2,%ymm2
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovdqu	-8+160-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm3,%ymm3
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vmovdqu	-8+192-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm4,%ymm4
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vmovdqu	-8+224-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm5,%ymm5
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovdqu	-8+256-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm6,%ymm6
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vmovdqu	-8+288-128(%rsi),%ymm9
+	vpaddq	%ymm12,%ymm7,%ymm7
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vpaddq	%ymm13,%ymm8,%ymm8
+	vpmuludq	%ymm10,%ymm9,%ymm9
+	vpbroadcastq	16(%r13),%ymm10
+
+	movq	%rax,%rdx
+	imulq	-128(%rcx),%rax
+	addq	%rax,%r10
+	vmovdqu	-8+32-128(%rcx),%ymm0
+	movq	%rdx,%rax
+	imulq	8-128(%rcx),%rax
+	addq	%rax,%r11
+	vmovdqu	-8+64-128(%rcx),%ymm12
+	shrq	$29,%r10
+	imulq	16-128(%rcx),%rdx
+	addq	%rdx,%r12
+	addq	%r10,%r11
+
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovq	%xmm10,%rbx
+	vmovdqu	-8+96-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm1,%ymm1
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	-8+128-128(%rcx),%ymm0
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-8+160-128(%rcx),%ymm12
+	vpaddq	%ymm13,%ymm3,%ymm3
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovdqu	-8+192-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm4,%ymm4
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	-8+224-128(%rcx),%ymm0
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-8+256-128(%rcx),%ymm12
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovdqu	-8+288-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm7,%ymm7
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vpaddq	%ymm13,%ymm9,%ymm9
+
+	vmovdqu	-16+32-128(%rsi),%ymm0
+	movq	%rbx,%rax
+	imulq	-128(%rsi),%rax
+	addq	%r11,%rax
+
+	vmovdqu	-16+64-128(%rsi),%ymm12
+	movq	%rax,%r11
+	imull	%r8d,%eax
+	andl	$0x1fffffff,%eax
+
+	imulq	8-128(%rsi),%rbx
+	addq	%rbx,%r12
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovd	%eax,%xmm11
+	vmovdqu	-16+96-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm1,%ymm1
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vpbroadcastq	%xmm11,%ymm11
+	vmovdqu	-16+128-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vmovdqu	-16+160-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm3,%ymm3
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovdqu	-16+192-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm4,%ymm4
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vmovdqu	-16+224-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vmovdqu	-16+256-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovdqu	-16+288-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm7,%ymm7
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vpbroadcastq	24(%r13),%ymm10
+	vpaddq	%ymm13,%ymm9,%ymm9
+
+	vmovdqu	-16+32-128(%rcx),%ymm0
+	movq	%rax,%rdx
+	imulq	-128(%rcx),%rax
+	addq	%rax,%r11
+	vmovdqu	-16+64-128(%rcx),%ymm12
+	imulq	8-128(%rcx),%rdx
+	addq	%rdx,%r12
+	shrq	$29,%r11
+
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovq	%xmm10,%rbx
+	vmovdqu	-16+96-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm1,%ymm1
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	-16+128-128(%rcx),%ymm0
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-16+160-128(%rcx),%ymm12
+	vpaddq	%ymm13,%ymm3,%ymm3
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovdqu	-16+192-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm4,%ymm4
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	-16+224-128(%rcx),%ymm0
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-16+256-128(%rcx),%ymm12
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovdqu	-16+288-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm7,%ymm7
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	-24+32-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-24+64-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm9,%ymm9
+
+	addq	%r11,%r12
+	imulq	-128(%rsi),%rbx
+	addq	%rbx,%r12
+
+	movq	%r12,%rax
+	imull	%r8d,%eax
+	andl	$0x1fffffff,%eax
+
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovd	%eax,%xmm11
+	vmovdqu	-24+96-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm1,%ymm1
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vpbroadcastq	%xmm11,%ymm11
+	vmovdqu	-24+128-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vmovdqu	-24+160-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm3,%ymm3
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovdqu	-24+192-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm4,%ymm4
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vmovdqu	-24+224-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vmovdqu	-24+256-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovdqu	-24+288-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm7,%ymm7
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vpbroadcastq	32(%r13),%ymm10
+	vpaddq	%ymm13,%ymm9,%ymm9
+	addq	$32,%r13
+
+	vmovdqu	-24+32-128(%rcx),%ymm0
+	imulq	-128(%rcx),%rax
+	addq	%rax,%r12
+	shrq	$29,%r12
+
+	vmovdqu	-24+64-128(%rcx),%ymm12
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovq	%xmm10,%rbx
+	vmovdqu	-24+96-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm1,%ymm0
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	%ymm0,(%rsp)
+	vpaddq	%ymm12,%ymm2,%ymm1
+	vmovdqu	-24+128-128(%rcx),%ymm0
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-24+160-128(%rcx),%ymm12
+	vpaddq	%ymm13,%ymm3,%ymm2
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovdqu	-24+192-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm4,%ymm3
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	-24+224-128(%rcx),%ymm0
+	vpaddq	%ymm12,%ymm5,%ymm4
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-24+256-128(%rcx),%ymm12
+	vpaddq	%ymm13,%ymm6,%ymm5
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovdqu	-24+288-128(%rcx),%ymm13
+	movq	%r12,%r9
+	vpaddq	%ymm0,%ymm7,%ymm6
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	addq	(%rsp),%r9
+	vpaddq	%ymm12,%ymm8,%ymm7
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovq	%r12,%xmm12
+	vpaddq	%ymm13,%ymm9,%ymm8
+
+	decl	%r14d
+	jnz	.Loop_mul_1024
+	vpaddq	(%rsp),%ymm12,%ymm0
+
+	vpsrlq	$29,%ymm0,%ymm12
+	vpand	%ymm15,%ymm0,%ymm0
+	vpsrlq	$29,%ymm1,%ymm13
+	vpand	%ymm15,%ymm1,%ymm1
+	vpsrlq	$29,%ymm2,%ymm10
+	vpermq	$0x93,%ymm12,%ymm12
+	vpand	%ymm15,%ymm2,%ymm2
+	vpsrlq	$29,%ymm3,%ymm11
+	vpermq	$0x93,%ymm13,%ymm13
+	vpand	%ymm15,%ymm3,%ymm3
+
+	vpblendd	$3,%ymm14,%ymm12,%ymm9
+	vpermq	$0x93,%ymm10,%ymm10
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpermq	$0x93,%ymm11,%ymm11
+	vpaddq	%ymm9,%ymm0,%ymm0
+	vpblendd	$3,%ymm13,%ymm10,%ymm13
+	vpaddq	%ymm12,%ymm1,%ymm1
+	vpblendd	$3,%ymm10,%ymm11,%ymm10
+	vpaddq	%ymm13,%ymm2,%ymm2
+	vpblendd	$3,%ymm11,%ymm14,%ymm11
+	vpaddq	%ymm10,%ymm3,%ymm3
+	vpaddq	%ymm11,%ymm4,%ymm4
+
+	vpsrlq	$29,%ymm0,%ymm12
+	vpand	%ymm15,%ymm0,%ymm0
+	vpsrlq	$29,%ymm1,%ymm13
+	vpand	%ymm15,%ymm1,%ymm1
+	vpsrlq	$29,%ymm2,%ymm10
+	vpermq	$0x93,%ymm12,%ymm12
+	vpand	%ymm15,%ymm2,%ymm2
+	vpsrlq	$29,%ymm3,%ymm11
+	vpermq	$0x93,%ymm13,%ymm13
+	vpand	%ymm15,%ymm3,%ymm3
+	vpermq	$0x93,%ymm10,%ymm10
+
+	vpblendd	$3,%ymm14,%ymm12,%ymm9
+	vpermq	$0x93,%ymm11,%ymm11
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm9,%ymm0,%ymm0
+	vpblendd	$3,%ymm13,%ymm10,%ymm13
+	vpaddq	%ymm12,%ymm1,%ymm1
+	vpblendd	$3,%ymm10,%ymm11,%ymm10
+	vpaddq	%ymm13,%ymm2,%ymm2
+	vpblendd	$3,%ymm11,%ymm14,%ymm11
+	vpaddq	%ymm10,%ymm3,%ymm3
+	vpaddq	%ymm11,%ymm4,%ymm4
+
+	vmovdqu	%ymm0,0-128(%rdi)
+	vmovdqu	%ymm1,32-128(%rdi)
+	vmovdqu	%ymm2,64-128(%rdi)
+	vmovdqu	%ymm3,96-128(%rdi)
+	vpsrlq	$29,%ymm4,%ymm12
+	vpand	%ymm15,%ymm4,%ymm4
+	vpsrlq	$29,%ymm5,%ymm13
+	vpand	%ymm15,%ymm5,%ymm5
+	vpsrlq	$29,%ymm6,%ymm10
+	vpermq	$0x93,%ymm12,%ymm12
+	vpand	%ymm15,%ymm6,%ymm6
+	vpsrlq	$29,%ymm7,%ymm11
+	vpermq	$0x93,%ymm13,%ymm13
+	vpand	%ymm15,%ymm7,%ymm7
+	vpsrlq	$29,%ymm8,%ymm0
+	vpermq	$0x93,%ymm10,%ymm10
+	vpand	%ymm15,%ymm8,%ymm8
+	vpermq	$0x93,%ymm11,%ymm11
+
+	vpblendd	$3,%ymm14,%ymm12,%ymm9
+	vpermq	$0x93,%ymm0,%ymm0
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm9,%ymm4,%ymm4
+	vpblendd	$3,%ymm13,%ymm10,%ymm13
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpblendd	$3,%ymm10,%ymm11,%ymm10
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpblendd	$3,%ymm11,%ymm0,%ymm11
+	vpaddq	%ymm10,%ymm7,%ymm7
+	vpaddq	%ymm11,%ymm8,%ymm8
+
+	vpsrlq	$29,%ymm4,%ymm12
+	vpand	%ymm15,%ymm4,%ymm4
+	vpsrlq	$29,%ymm5,%ymm13
+	vpand	%ymm15,%ymm5,%ymm5
+	vpsrlq	$29,%ymm6,%ymm10
+	vpermq	$0x93,%ymm12,%ymm12
+	vpand	%ymm15,%ymm6,%ymm6
+	vpsrlq	$29,%ymm7,%ymm11
+	vpermq	$0x93,%ymm13,%ymm13
+	vpand	%ymm15,%ymm7,%ymm7
+	vpsrlq	$29,%ymm8,%ymm0
+	vpermq	$0x93,%ymm10,%ymm10
+	vpand	%ymm15,%ymm8,%ymm8
+	vpermq	$0x93,%ymm11,%ymm11
+
+	vpblendd	$3,%ymm14,%ymm12,%ymm9
+	vpermq	$0x93,%ymm0,%ymm0
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm9,%ymm4,%ymm4
+	vpblendd	$3,%ymm13,%ymm10,%ymm13
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpblendd	$3,%ymm10,%ymm11,%ymm10
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpblendd	$3,%ymm11,%ymm0,%ymm11
+	vpaddq	%ymm10,%ymm7,%ymm7
+	vpaddq	%ymm11,%ymm8,%ymm8
+
+	vmovdqu	%ymm4,128-128(%rdi)
+	vmovdqu	%ymm5,160-128(%rdi)
+	vmovdqu	%ymm6,192-128(%rdi)
+	vmovdqu	%ymm7,224-128(%rdi)
+	vmovdqu	%ymm8,256-128(%rdi)
+	vzeroupper
+
+	movq	%rbp,%rax
+.cfi_def_cfa_register	%rax
+	movq	-48(%rax),%r15
+.cfi_restore	%r15
+	movq	-40(%rax),%r14
+.cfi_restore	%r14
+	movq	-32(%rax),%r13
+.cfi_restore	%r13
+	movq	-24(%rax),%r12
+.cfi_restore	%r12
+	movq	-16(%rax),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rax),%rbx
+.cfi_restore	%rbx
+	leaq	(%rax),%rsp
+.cfi_def_cfa_register	%rsp
+.Lmul_1024_epilogue:
+	ret
+.cfi_endproc	
+.size	rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
+.globl	rsaz_1024_red2norm_avx2
+.hidden rsaz_1024_red2norm_avx2
+.type	rsaz_1024_red2norm_avx2,@function
+.align	32
+rsaz_1024_red2norm_avx2:
+.cfi_startproc	
+_CET_ENDBR
+	subq	$-128,%rsi
+	xorq	%rax,%rax
+	movq	-128(%rsi),%r8
+	movq	-120(%rsi),%r9
+	movq	-112(%rsi),%r10
+	shlq	$0,%r8
+	shlq	$29,%r9
+	movq	%r10,%r11
+	shlq	$58,%r10
+	shrq	$6,%r11
+	addq	%r8,%rax
+	addq	%r9,%rax
+	addq	%r10,%rax
+	adcq	$0,%r11
+	movq	%rax,0(%rdi)
+	movq	%r11,%rax
+	movq	-104(%rsi),%r8
+	movq	-96(%rsi),%r9
+	shlq	$23,%r8
+	movq	%r9,%r10
+	shlq	$52,%r9
+	shrq	$12,%r10
+	addq	%r8,%rax
+	addq	%r9,%rax
+	adcq	$0,%r10
+	movq	%rax,8(%rdi)
+	movq	%r10,%rax
+	movq	-88(%rsi),%r11
+	movq	-80(%rsi),%r8
+	shlq	$17,%r11
+	movq	%r8,%r9
+	shlq	$46,%r8
+	shrq	$18,%r9
+	addq	%r11,%rax
+	addq	%r8,%rax
+	adcq	$0,%r9
+	movq	%rax,16(%rdi)
+	movq	%r9,%rax
+	movq	-72(%rsi),%r10
+	movq	-64(%rsi),%r11
+	shlq	$11,%r10
+	movq	%r11,%r8
+	shlq	$40,%r11
+	shrq	$24,%r8
+	addq	%r10,%rax
+	addq	%r11,%rax
+	adcq	$0,%r8
+	movq	%rax,24(%rdi)
+	movq	%r8,%rax
+	movq	-56(%rsi),%r9
+	movq	-48(%rsi),%r10
+	movq	-40(%rsi),%r11
+	shlq	$5,%r9
+	shlq	$34,%r10
+	movq	%r11,%r8
+	shlq	$63,%r11
+	shrq	$1,%r8
+	addq	%r9,%rax
+	addq	%r10,%rax
+	addq	%r11,%rax
+	adcq	$0,%r8
+	movq	%rax,32(%rdi)
+	movq	%r8,%rax
+	movq	-32(%rsi),%r9
+	movq	-24(%rsi),%r10
+	shlq	$28,%r9
+	movq	%r10,%r11
+	shlq	$57,%r10
+	shrq	$7,%r11
+	addq	%r9,%rax
+	addq	%r10,%rax
+	adcq	$0,%r11
+	movq	%rax,40(%rdi)
+	movq	%r11,%rax
+	movq	-16(%rsi),%r8
+	movq	-8(%rsi),%r9
+	shlq	$22,%r8
+	movq	%r9,%r10
+	shlq	$51,%r9
+	shrq	$13,%r10
+	addq	%r8,%rax
+	addq	%r9,%rax
+	adcq	$0,%r10
+	movq	%rax,48(%rdi)
+	movq	%r10,%rax
+	movq	0(%rsi),%r11
+	movq	8(%rsi),%r8
+	shlq	$16,%r11
+	movq	%r8,%r9
+	shlq	$45,%r8
+	shrq	$19,%r9
+	addq	%r11,%rax
+	addq	%r8,%rax
+	adcq	$0,%r9
+	movq	%rax,56(%rdi)
+	movq	%r9,%rax
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	shlq	$10,%r10
+	movq	%r11,%r8
+	shlq	$39,%r11
+	shrq	$25,%r8
+	addq	%r10,%rax
+	addq	%r11,%rax
+	adcq	$0,%r8
+	movq	%rax,64(%rdi)
+	movq	%r8,%rax
+	movq	32(%rsi),%r9
+	movq	40(%rsi),%r10
+	movq	48(%rsi),%r11
+	shlq	$4,%r9
+	shlq	$33,%r10
+	movq	%r11,%r8
+	shlq	$62,%r11
+	shrq	$2,%r8
+	addq	%r9,%rax
+	addq	%r10,%rax
+	addq	%r11,%rax
+	adcq	$0,%r8
+	movq	%rax,72(%rdi)
+	movq	%r8,%rax
+	movq	56(%rsi),%r9
+	movq	64(%rsi),%r10
+	shlq	$27,%r9
+	movq	%r10,%r11
+	shlq	$56,%r10
+	shrq	$8,%r11
+	addq	%r9,%rax
+	addq	%r10,%rax
+	adcq	$0,%r11
+	movq	%rax,80(%rdi)
+	movq	%r11,%rax
+	movq	72(%rsi),%r8
+	movq	80(%rsi),%r9
+	shlq	$21,%r8
+	movq	%r9,%r10
+	shlq	$50,%r9
+	shrq	$14,%r10
+	addq	%r8,%rax
+	addq	%r9,%rax
+	adcq	$0,%r10
+	movq	%rax,88(%rdi)
+	movq	%r10,%rax
+	movq	88(%rsi),%r11
+	movq	96(%rsi),%r8
+	shlq	$15,%r11
+	movq	%r8,%r9
+	shlq	$44,%r8
+	shrq	$20,%r9
+	addq	%r11,%rax
+	addq	%r8,%rax
+	adcq	$0,%r9
+	movq	%rax,96(%rdi)
+	movq	%r9,%rax
+	movq	104(%rsi),%r10
+	movq	112(%rsi),%r11
+	shlq	$9,%r10
+	movq	%r11,%r8
+	shlq	$38,%r11
+	shrq	$26,%r8
+	addq	%r10,%rax
+	addq	%r11,%rax
+	adcq	$0,%r8
+	movq	%rax,104(%rdi)
+	movq	%r8,%rax
+	movq	120(%rsi),%r9
+	movq	128(%rsi),%r10
+	movq	136(%rsi),%r11
+	shlq	$3,%r9
+	shlq	$32,%r10
+	movq	%r11,%r8
+	shlq	$61,%r11
+	shrq	$3,%r8
+	addq	%r9,%rax
+	addq	%r10,%rax
+	addq	%r11,%rax
+	adcq	$0,%r8
+	movq	%rax,112(%rdi)
+	movq	%r8,%rax
+	movq	144(%rsi),%r9
+	movq	152(%rsi),%r10
+	shlq	$26,%r9
+	movq	%r10,%r11
+	shlq	$55,%r10
+	shrq	$9,%r11
+	addq	%r9,%rax
+	addq	%r10,%rax
+	adcq	$0,%r11
+	movq	%rax,120(%rdi)
+	movq	%r11,%rax
+	ret
+.cfi_endproc	
+.size	rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
+
+.globl	rsaz_1024_norm2red_avx2
+.hidden rsaz_1024_norm2red_avx2
+.type	rsaz_1024_norm2red_avx2,@function
+.align	32
+rsaz_1024_norm2red_avx2:
+.cfi_startproc	
+_CET_ENDBR
+	subq	$-128,%rdi
+	movq	(%rsi),%r8
+	movl	$0x1fffffff,%eax
+	movq	8(%rsi),%r9
+	movq	%r8,%r11
+	shrq	$0,%r11
+	andq	%rax,%r11
+	movq	%r11,-128(%rdi)
+	movq	%r8,%r10
+	shrq	$29,%r10
+	andq	%rax,%r10
+	movq	%r10,-120(%rdi)
+	shrdq	$58,%r9,%r8
+	andq	%rax,%r8
+	movq	%r8,-112(%rdi)
+	movq	16(%rsi),%r10
+	movq	%r9,%r8
+	shrq	$23,%r8
+	andq	%rax,%r8
+	movq	%r8,-104(%rdi)
+	shrdq	$52,%r10,%r9
+	andq	%rax,%r9
+	movq	%r9,-96(%rdi)
+	movq	24(%rsi),%r11
+	movq	%r10,%r9
+	shrq	$17,%r9
+	andq	%rax,%r9
+	movq	%r9,-88(%rdi)
+	shrdq	$46,%r11,%r10
+	andq	%rax,%r10
+	movq	%r10,-80(%rdi)
+	movq	32(%rsi),%r8
+	movq	%r11,%r10
+	shrq	$11,%r10
+	andq	%rax,%r10
+	movq	%r10,-72(%rdi)
+	shrdq	$40,%r8,%r11
+	andq	%rax,%r11
+	movq	%r11,-64(%rdi)
+	movq	40(%rsi),%r9
+	movq	%r8,%r11
+	shrq	$5,%r11
+	andq	%rax,%r11
+	movq	%r11,-56(%rdi)
+	movq	%r8,%r10
+	shrq	$34,%r10
+	andq	%rax,%r10
+	movq	%r10,-48(%rdi)
+	shrdq	$63,%r9,%r8
+	andq	%rax,%r8
+	movq	%r8,-40(%rdi)
+	movq	48(%rsi),%r10
+	movq	%r9,%r8
+	shrq	$28,%r8
+	andq	%rax,%r8
+	movq	%r8,-32(%rdi)
+	shrdq	$57,%r10,%r9
+	andq	%rax,%r9
+	movq	%r9,-24(%rdi)
+	movq	56(%rsi),%r11
+	movq	%r10,%r9
+	shrq	$22,%r9
+	andq	%rax,%r9
+	movq	%r9,-16(%rdi)
+	shrdq	$51,%r11,%r10
+	andq	%rax,%r10
+	movq	%r10,-8(%rdi)
+	movq	64(%rsi),%r8
+	movq	%r11,%r10
+	shrq	$16,%r10
+	andq	%rax,%r10
+	movq	%r10,0(%rdi)
+	shrdq	$45,%r8,%r11
+	andq	%rax,%r11
+	movq	%r11,8(%rdi)
+	movq	72(%rsi),%r9
+	movq	%r8,%r11
+	shrq	$10,%r11
+	andq	%rax,%r11
+	movq	%r11,16(%rdi)
+	shrdq	$39,%r9,%r8
+	andq	%rax,%r8
+	movq	%r8,24(%rdi)
+	movq	80(%rsi),%r10
+	movq	%r9,%r8
+	shrq	$4,%r8
+	andq	%rax,%r8
+	movq	%r8,32(%rdi)
+	movq	%r9,%r11
+	shrq	$33,%r11
+	andq	%rax,%r11
+	movq	%r11,40(%rdi)
+	shrdq	$62,%r10,%r9
+	andq	%rax,%r9
+	movq	%r9,48(%rdi)
+	movq	88(%rsi),%r11
+	movq	%r10,%r9
+	shrq	$27,%r9
+	andq	%rax,%r9
+	movq	%r9,56(%rdi)
+	shrdq	$56,%r11,%r10
+	andq	%rax,%r10
+	movq	%r10,64(%rdi)
+	movq	96(%rsi),%r8
+	movq	%r11,%r10
+	shrq	$21,%r10
+	andq	%rax,%r10
+	movq	%r10,72(%rdi)
+	shrdq	$50,%r8,%r11
+	andq	%rax,%r11
+	movq	%r11,80(%rdi)
+	movq	104(%rsi),%r9
+	movq	%r8,%r11
+	shrq	$15,%r11
+	andq	%rax,%r11
+	movq	%r11,88(%rdi)
+	shrdq	$44,%r9,%r8
+	andq	%rax,%r8
+	movq	%r8,96(%rdi)
+	movq	112(%rsi),%r10
+	movq	%r9,%r8
+	shrq	$9,%r8
+	andq	%rax,%r8
+	movq	%r8,104(%rdi)
+	shrdq	$38,%r10,%r9
+	andq	%rax,%r9
+	movq	%r9,112(%rdi)
+	movq	120(%rsi),%r11
+	movq	%r10,%r9
+	shrq	$3,%r9
+	andq	%rax,%r9
+	movq	%r9,120(%rdi)
+	movq	%r10,%r8
+	shrq	$32,%r8
+	andq	%rax,%r8
+	movq	%r8,128(%rdi)
+	shrdq	$61,%r11,%r10
+	andq	%rax,%r10
+	movq	%r10,136(%rdi)
+	xorq	%r8,%r8
+	movq	%r11,%r10
+	shrq	$26,%r10
+	andq	%rax,%r10
+	movq	%r10,144(%rdi)
+	shrdq	$55,%r8,%r11
+	andq	%rax,%r11
+	movq	%r11,152(%rdi)
+	movq	%r8,160(%rdi)
+	movq	%r8,168(%rdi)
+	movq	%r8,176(%rdi)
+	movq	%r8,184(%rdi)
+	ret
+.cfi_endproc	
+.size	rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
+.globl	rsaz_1024_scatter5_avx2
+.hidden rsaz_1024_scatter5_avx2
+.type	rsaz_1024_scatter5_avx2,@function
+.align	32
+rsaz_1024_scatter5_avx2:
+.cfi_startproc	
+_CET_ENDBR
+	vzeroupper
+	vmovdqu	.Lscatter_permd(%rip),%ymm5
+	shll	$4,%edx
+	leaq	(%rdi,%rdx,1),%rdi
+	movl	$9,%eax
+	jmp	.Loop_scatter_1024
+
+.align	32
+.Loop_scatter_1024:
+	vmovdqu	(%rsi),%ymm0
+	leaq	32(%rsi),%rsi
+	vpermd	%ymm0,%ymm5,%ymm0
+	vmovdqu	%xmm0,(%rdi)
+	leaq	512(%rdi),%rdi
+	decl	%eax
+	jnz	.Loop_scatter_1024
+
+	vzeroupper
+	ret
+.cfi_endproc	
+.size	rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
+
+.globl	rsaz_1024_gather5_avx2
+.hidden rsaz_1024_gather5_avx2
+.type	rsaz_1024_gather5_avx2,@function
+.align	32
+rsaz_1024_gather5_avx2:
+.cfi_startproc	
+_CET_ENDBR
+	vzeroupper
+	movq	%rsp,%r11
+.cfi_def_cfa_register	%r11
+	leaq	-256(%rsp),%rsp
+	andq	$-32,%rsp
+	leaq	.Linc(%rip),%r10
+	leaq	-128(%rsp),%rax
+
+	vmovd	%edx,%xmm4
+	vmovdqa	(%r10),%ymm0
+	vmovdqa	32(%r10),%ymm1
+	vmovdqa	64(%r10),%ymm5
+	vpbroadcastd	%xmm4,%ymm4
+
+	vpaddd	%ymm5,%ymm0,%ymm2
+	vpcmpeqd	%ymm4,%ymm0,%ymm0
+	vpaddd	%ymm5,%ymm1,%ymm3
+	vpcmpeqd	%ymm4,%ymm1,%ymm1
+	vmovdqa	%ymm0,0+128(%rax)
+	vpaddd	%ymm5,%ymm2,%ymm0
+	vpcmpeqd	%ymm4,%ymm2,%ymm2
+	vmovdqa	%ymm1,32+128(%rax)
+	vpaddd	%ymm5,%ymm3,%ymm1
+	vpcmpeqd	%ymm4,%ymm3,%ymm3
+	vmovdqa	%ymm2,64+128(%rax)
+	vpaddd	%ymm5,%ymm0,%ymm2
+	vpcmpeqd	%ymm4,%ymm0,%ymm0
+	vmovdqa	%ymm3,96+128(%rax)
+	vpaddd	%ymm5,%ymm1,%ymm3
+	vpcmpeqd	%ymm4,%ymm1,%ymm1
+	vmovdqa	%ymm0,128+128(%rax)
+	vpaddd	%ymm5,%ymm2,%ymm8
+	vpcmpeqd	%ymm4,%ymm2,%ymm2
+	vmovdqa	%ymm1,160+128(%rax)
+	vpaddd	%ymm5,%ymm3,%ymm9
+	vpcmpeqd	%ymm4,%ymm3,%ymm3
+	vmovdqa	%ymm2,192+128(%rax)
+	vpaddd	%ymm5,%ymm8,%ymm10
+	vpcmpeqd	%ymm4,%ymm8,%ymm8
+	vmovdqa	%ymm3,224+128(%rax)
+	vpaddd	%ymm5,%ymm9,%ymm11
+	vpcmpeqd	%ymm4,%ymm9,%ymm9
+	vpaddd	%ymm5,%ymm10,%ymm12
+	vpcmpeqd	%ymm4,%ymm10,%ymm10
+	vpaddd	%ymm5,%ymm11,%ymm13
+	vpcmpeqd	%ymm4,%ymm11,%ymm11
+	vpaddd	%ymm5,%ymm12,%ymm14
+	vpcmpeqd	%ymm4,%ymm12,%ymm12
+	vpaddd	%ymm5,%ymm13,%ymm15
+	vpcmpeqd	%ymm4,%ymm13,%ymm13
+	vpcmpeqd	%ymm4,%ymm14,%ymm14
+	vpcmpeqd	%ymm4,%ymm15,%ymm15
+
+	vmovdqa	-32(%r10),%ymm7
+	leaq	128(%rsi),%rsi
+	movl	$9,%edx
+
+.Loop_gather_1024:
+	vmovdqa	0-128(%rsi),%ymm0
+	vmovdqa	32-128(%rsi),%ymm1
+	vmovdqa	64-128(%rsi),%ymm2
+	vmovdqa	96-128(%rsi),%ymm3
+	vpand	0+128(%rax),%ymm0,%ymm0
+	vpand	32+128(%rax),%ymm1,%ymm1
+	vpand	64+128(%rax),%ymm2,%ymm2
+	vpor	%ymm0,%ymm1,%ymm4
+	vpand	96+128(%rax),%ymm3,%ymm3
+	vmovdqa	128-128(%rsi),%ymm0
+	vmovdqa	160-128(%rsi),%ymm1
+	vpor	%ymm2,%ymm3,%ymm5
+	vmovdqa	192-128(%rsi),%ymm2
+	vmovdqa	224-128(%rsi),%ymm3
+	vpand	128+128(%rax),%ymm0,%ymm0
+	vpand	160+128(%rax),%ymm1,%ymm1
+	vpand	192+128(%rax),%ymm2,%ymm2
+	vpor	%ymm0,%ymm4,%ymm4
+	vpand	224+128(%rax),%ymm3,%ymm3
+	vpand	256-128(%rsi),%ymm8,%ymm0
+	vpor	%ymm1,%ymm5,%ymm5
+	vpand	288-128(%rsi),%ymm9,%ymm1
+	vpor	%ymm2,%ymm4,%ymm4
+	vpand	320-128(%rsi),%ymm10,%ymm2
+	vpor	%ymm3,%ymm5,%ymm5
+	vpand	352-128(%rsi),%ymm11,%ymm3
+	vpor	%ymm0,%ymm4,%ymm4
+	vpand	384-128(%rsi),%ymm12,%ymm0
+	vpor	%ymm1,%ymm5,%ymm5
+	vpand	416-128(%rsi),%ymm13,%ymm1
+	vpor	%ymm2,%ymm4,%ymm4
+	vpand	448-128(%rsi),%ymm14,%ymm2
+	vpor	%ymm3,%ymm5,%ymm5
+	vpand	480-128(%rsi),%ymm15,%ymm3
+	leaq	512(%rsi),%rsi
+	vpor	%ymm0,%ymm4,%ymm4
+	vpor	%ymm1,%ymm5,%ymm5
+	vpor	%ymm2,%ymm4,%ymm4
+	vpor	%ymm3,%ymm5,%ymm5
+
+	vpor	%ymm5,%ymm4,%ymm4
+	vextracti128	$1,%ymm4,%xmm5
+	vpor	%xmm4,%xmm5,%xmm5
+	vpermd	%ymm5,%ymm7,%ymm5
+	vmovdqu	%ymm5,(%rdi)
+	leaq	32(%rdi),%rdi
+	decl	%edx
+	jnz	.Loop_gather_1024
+
+	vpxor	%ymm0,%ymm0,%ymm0
+	vmovdqu	%ymm0,(%rdi)
+	vzeroupper
+	leaq	(%r11),%rsp
+.cfi_def_cfa_register	%rsp
+	ret
+.cfi_endproc	
+.LSEH_end_rsaz_1024_gather5:
+.size	rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
+.section	.rodata
+.align	64
+.Land_mask:
+.quad	0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
+.Lscatter_permd:
+.long	0,2,4,6,7,7,7,7
+.Lgather_permd:
+.long	0,7,1,7,2,7,3,7
+.Linc:
+.long	0,0,0,0, 1,1,1,1
+.long	2,2,2,2, 3,3,3,3
+.long	4,4,4,4, 4,4,4,4
+.align	64
+.text	
+#endif
diff --git a/gen/bcm/rsaz-avx2-win.asm b/gen/bcm/rsaz-avx2-win.asm
new file mode 100644
index 0000000..beadbdd
--- /dev/null
+++ b/gen/bcm/rsaz-avx2-win.asm
@@ -0,0 +1,1987 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section	.text code align=64
+
+
+global	rsaz_1024_sqr_avx2
+
+ALIGN	64
+rsaz_1024_sqr_avx2:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_rsaz_1024_sqr_avx2:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+
+
+
+_CET_ENDBR
+	lea	rax,[rsp]
+
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	vzeroupper
+	lea	rsp,[((-168))+rsp]
+	vmovaps	XMMWORD[(-216)+rax],xmm6
+	vmovaps	XMMWORD[(-200)+rax],xmm7
+	vmovaps	XMMWORD[(-184)+rax],xmm8
+	vmovaps	XMMWORD[(-168)+rax],xmm9
+	vmovaps	XMMWORD[(-152)+rax],xmm10
+	vmovaps	XMMWORD[(-136)+rax],xmm11
+	vmovaps	XMMWORD[(-120)+rax],xmm12
+	vmovaps	XMMWORD[(-104)+rax],xmm13
+	vmovaps	XMMWORD[(-88)+rax],xmm14
+	vmovaps	XMMWORD[(-72)+rax],xmm15
+$L$sqr_1024_body:
+	mov	rbp,rax
+
+	mov	r13,rdx
+	sub	rsp,832
+	mov	r15,r13
+	sub	rdi,-128
+	sub	rsi,-128
+	sub	r13,-128
+
+	and	r15,4095
+	add	r15,32*10
+	shr	r15,12
+	vpxor	ymm9,ymm9,ymm9
+	jz	NEAR $L$sqr_1024_no_n_copy
+
+
+
+
+
+	sub	rsp,32*10
+	vmovdqu	ymm0,YMMWORD[((0-128))+r13]
+	and	rsp,-2048
+	vmovdqu	ymm1,YMMWORD[((32-128))+r13]
+	vmovdqu	ymm2,YMMWORD[((64-128))+r13]
+	vmovdqu	ymm3,YMMWORD[((96-128))+r13]
+	vmovdqu	ymm4,YMMWORD[((128-128))+r13]
+	vmovdqu	ymm5,YMMWORD[((160-128))+r13]
+	vmovdqu	ymm6,YMMWORD[((192-128))+r13]
+	vmovdqu	ymm7,YMMWORD[((224-128))+r13]
+	vmovdqu	ymm8,YMMWORD[((256-128))+r13]
+	lea	r13,[((832+128))+rsp]
+	vmovdqu	YMMWORD[(0-128)+r13],ymm0
+	vmovdqu	YMMWORD[(32-128)+r13],ymm1
+	vmovdqu	YMMWORD[(64-128)+r13],ymm2
+	vmovdqu	YMMWORD[(96-128)+r13],ymm3
+	vmovdqu	YMMWORD[(128-128)+r13],ymm4
+	vmovdqu	YMMWORD[(160-128)+r13],ymm5
+	vmovdqu	YMMWORD[(192-128)+r13],ymm6
+	vmovdqu	YMMWORD[(224-128)+r13],ymm7
+	vmovdqu	YMMWORD[(256-128)+r13],ymm8
+	vmovdqu	YMMWORD[(288-128)+r13],ymm9
+
+$L$sqr_1024_no_n_copy:
+	and	rsp,-1024
+
+	vmovdqu	ymm1,YMMWORD[((32-128))+rsi]
+	vmovdqu	ymm2,YMMWORD[((64-128))+rsi]
+	vmovdqu	ymm3,YMMWORD[((96-128))+rsi]
+	vmovdqu	ymm4,YMMWORD[((128-128))+rsi]
+	vmovdqu	ymm5,YMMWORD[((160-128))+rsi]
+	vmovdqu	ymm6,YMMWORD[((192-128))+rsi]
+	vmovdqu	ymm7,YMMWORD[((224-128))+rsi]
+	vmovdqu	ymm8,YMMWORD[((256-128))+rsi]
+
+	lea	rbx,[192+rsp]
+	vmovdqu	ymm15,YMMWORD[$L$and_mask]
+	jmp	NEAR $L$OOP_GRANDE_SQR_1024
+
+ALIGN	32
+$L$OOP_GRANDE_SQR_1024:
+	lea	r9,[((576+128))+rsp]
+	lea	r12,[448+rsp]
+
+
+
+
+	vpaddq	ymm1,ymm1,ymm1
+	vpbroadcastq	ymm10,QWORD[((0-128))+rsi]
+	vpaddq	ymm2,ymm2,ymm2
+	vmovdqa	YMMWORD[(0-128)+r9],ymm1
+	vpaddq	ymm3,ymm3,ymm3
+	vmovdqa	YMMWORD[(32-128)+r9],ymm2
+	vpaddq	ymm4,ymm4,ymm4
+	vmovdqa	YMMWORD[(64-128)+r9],ymm3
+	vpaddq	ymm5,ymm5,ymm5
+	vmovdqa	YMMWORD[(96-128)+r9],ymm4
+	vpaddq	ymm6,ymm6,ymm6
+	vmovdqa	YMMWORD[(128-128)+r9],ymm5
+	vpaddq	ymm7,ymm7,ymm7
+	vmovdqa	YMMWORD[(160-128)+r9],ymm6
+	vpaddq	ymm8,ymm8,ymm8
+	vmovdqa	YMMWORD[(192-128)+r9],ymm7
+	vpxor	ymm9,ymm9,ymm9
+	vmovdqa	YMMWORD[(224-128)+r9],ymm8
+
+	vpmuludq	ymm0,ymm10,YMMWORD[((0-128))+rsi]
+	vpbroadcastq	ymm11,QWORD[((32-128))+rsi]
+	vmovdqu	YMMWORD[(288-192)+rbx],ymm9
+	vpmuludq	ymm1,ymm1,ymm10
+	vmovdqu	YMMWORD[(320-448)+r12],ymm9
+	vpmuludq	ymm2,ymm2,ymm10
+	vmovdqu	YMMWORD[(352-448)+r12],ymm9
+	vpmuludq	ymm3,ymm3,ymm10
+	vmovdqu	YMMWORD[(384-448)+r12],ymm9
+	vpmuludq	ymm4,ymm4,ymm10
+	vmovdqu	YMMWORD[(416-448)+r12],ymm9
+	vpmuludq	ymm5,ymm5,ymm10
+	vmovdqu	YMMWORD[(448-448)+r12],ymm9
+	vpmuludq	ymm6,ymm6,ymm10
+	vmovdqu	YMMWORD[(480-448)+r12],ymm9
+	vpmuludq	ymm7,ymm7,ymm10
+	vmovdqu	YMMWORD[(512-448)+r12],ymm9
+	vpmuludq	ymm8,ymm8,ymm10
+	vpbroadcastq	ymm10,QWORD[((64-128))+rsi]
+	vmovdqu	YMMWORD[(544-448)+r12],ymm9
+
+	mov	r15,rsi
+	mov	r14d,4
+	jmp	NEAR $L$sqr_entry_1024
+ALIGN	32
+$L$OOP_SQR_1024:
+	vpbroadcastq	ymm11,QWORD[((32-128))+r15]
+	vpmuludq	ymm0,ymm10,YMMWORD[((0-128))+rsi]
+	vpaddq	ymm0,ymm0,YMMWORD[((0-192))+rbx]
+	vpmuludq	ymm1,ymm10,YMMWORD[((0-128))+r9]
+	vpaddq	ymm1,ymm1,YMMWORD[((32-192))+rbx]
+	vpmuludq	ymm2,ymm10,YMMWORD[((32-128))+r9]
+	vpaddq	ymm2,ymm2,YMMWORD[((64-192))+rbx]
+	vpmuludq	ymm3,ymm10,YMMWORD[((64-128))+r9]
+	vpaddq	ymm3,ymm3,YMMWORD[((96-192))+rbx]
+	vpmuludq	ymm4,ymm10,YMMWORD[((96-128))+r9]
+	vpaddq	ymm4,ymm4,YMMWORD[((128-192))+rbx]
+	vpmuludq	ymm5,ymm10,YMMWORD[((128-128))+r9]
+	vpaddq	ymm5,ymm5,YMMWORD[((160-192))+rbx]
+	vpmuludq	ymm6,ymm10,YMMWORD[((160-128))+r9]
+	vpaddq	ymm6,ymm6,YMMWORD[((192-192))+rbx]
+	vpmuludq	ymm7,ymm10,YMMWORD[((192-128))+r9]
+	vpaddq	ymm7,ymm7,YMMWORD[((224-192))+rbx]
+	vpmuludq	ymm8,ymm10,YMMWORD[((224-128))+r9]
+	vpbroadcastq	ymm10,QWORD[((64-128))+r15]
+	vpaddq	ymm8,ymm8,YMMWORD[((256-192))+rbx]
+$L$sqr_entry_1024:
+	vmovdqu	YMMWORD[(0-192)+rbx],ymm0
+	vmovdqu	YMMWORD[(32-192)+rbx],ymm1
+
+	vpmuludq	ymm12,ymm11,YMMWORD[((32-128))+rsi]
+	vpaddq	ymm2,ymm2,ymm12
+	vpmuludq	ymm14,ymm11,YMMWORD[((32-128))+r9]
+	vpaddq	ymm3,ymm3,ymm14
+	vpmuludq	ymm13,ymm11,YMMWORD[((64-128))+r9]
+	vpaddq	ymm4,ymm4,ymm13
+	vpmuludq	ymm12,ymm11,YMMWORD[((96-128))+r9]
+	vpaddq	ymm5,ymm5,ymm12
+	vpmuludq	ymm14,ymm11,YMMWORD[((128-128))+r9]
+	vpaddq	ymm6,ymm6,ymm14
+	vpmuludq	ymm13,ymm11,YMMWORD[((160-128))+r9]
+	vpaddq	ymm7,ymm7,ymm13
+	vpmuludq	ymm12,ymm11,YMMWORD[((192-128))+r9]
+	vpaddq	ymm8,ymm8,ymm12
+	vpmuludq	ymm0,ymm11,YMMWORD[((224-128))+r9]
+	vpbroadcastq	ymm11,QWORD[((96-128))+r15]
+	vpaddq	ymm0,ymm0,YMMWORD[((288-192))+rbx]
+
+	vmovdqu	YMMWORD[(64-192)+rbx],ymm2
+	vmovdqu	YMMWORD[(96-192)+rbx],ymm3
+
+	vpmuludq	ymm13,ymm10,YMMWORD[((64-128))+rsi]
+	vpaddq	ymm4,ymm4,ymm13
+	vpmuludq	ymm12,ymm10,YMMWORD[((64-128))+r9]
+	vpaddq	ymm5,ymm5,ymm12
+	vpmuludq	ymm14,ymm10,YMMWORD[((96-128))+r9]
+	vpaddq	ymm6,ymm6,ymm14
+	vpmuludq	ymm13,ymm10,YMMWORD[((128-128))+r9]
+	vpaddq	ymm7,ymm7,ymm13
+	vpmuludq	ymm12,ymm10,YMMWORD[((160-128))+r9]
+	vpaddq	ymm8,ymm8,ymm12
+	vpmuludq	ymm14,ymm10,YMMWORD[((192-128))+r9]
+	vpaddq	ymm0,ymm0,ymm14
+	vpmuludq	ymm1,ymm10,YMMWORD[((224-128))+r9]
+	vpbroadcastq	ymm10,QWORD[((128-128))+r15]
+	vpaddq	ymm1,ymm1,YMMWORD[((320-448))+r12]
+
+	vmovdqu	YMMWORD[(128-192)+rbx],ymm4
+	vmovdqu	YMMWORD[(160-192)+rbx],ymm5
+
+	vpmuludq	ymm12,ymm11,YMMWORD[((96-128))+rsi]
+	vpaddq	ymm6,ymm6,ymm12
+	vpmuludq	ymm14,ymm11,YMMWORD[((96-128))+r9]
+	vpaddq	ymm7,ymm7,ymm14
+	vpmuludq	ymm13,ymm11,YMMWORD[((128-128))+r9]
+	vpaddq	ymm8,ymm8,ymm13
+	vpmuludq	ymm12,ymm11,YMMWORD[((160-128))+r9]
+	vpaddq	ymm0,ymm0,ymm12
+	vpmuludq	ymm14,ymm11,YMMWORD[((192-128))+r9]
+	vpaddq	ymm1,ymm1,ymm14
+	vpmuludq	ymm2,ymm11,YMMWORD[((224-128))+r9]
+	vpbroadcastq	ymm11,QWORD[((160-128))+r15]
+	vpaddq	ymm2,ymm2,YMMWORD[((352-448))+r12]
+
+	vmovdqu	YMMWORD[(192-192)+rbx],ymm6
+	vmovdqu	YMMWORD[(224-192)+rbx],ymm7
+
+	vpmuludq	ymm12,ymm10,YMMWORD[((128-128))+rsi]
+	vpaddq	ymm8,ymm8,ymm12
+	vpmuludq	ymm14,ymm10,YMMWORD[((128-128))+r9]
+	vpaddq	ymm0,ymm0,ymm14
+	vpmuludq	ymm13,ymm10,YMMWORD[((160-128))+r9]
+	vpaddq	ymm1,ymm1,ymm13
+	vpmuludq	ymm12,ymm10,YMMWORD[((192-128))+r9]
+	vpaddq	ymm2,ymm2,ymm12
+	vpmuludq	ymm3,ymm10,YMMWORD[((224-128))+r9]
+	vpbroadcastq	ymm10,QWORD[((192-128))+r15]
+	vpaddq	ymm3,ymm3,YMMWORD[((384-448))+r12]
+
+	vmovdqu	YMMWORD[(256-192)+rbx],ymm8
+	vmovdqu	YMMWORD[(288-192)+rbx],ymm0
+	lea	rbx,[8+rbx]
+
+	vpmuludq	ymm13,ymm11,YMMWORD[((160-128))+rsi]
+	vpaddq	ymm1,ymm1,ymm13
+	vpmuludq	ymm12,ymm11,YMMWORD[((160-128))+r9]
+	vpaddq	ymm2,ymm2,ymm12
+	vpmuludq	ymm14,ymm11,YMMWORD[((192-128))+r9]
+	vpaddq	ymm3,ymm3,ymm14
+	vpmuludq	ymm4,ymm11,YMMWORD[((224-128))+r9]
+	vpbroadcastq	ymm11,QWORD[((224-128))+r15]
+	vpaddq	ymm4,ymm4,YMMWORD[((416-448))+r12]
+
+	vmovdqu	YMMWORD[(320-448)+r12],ymm1
+	vmovdqu	YMMWORD[(352-448)+r12],ymm2
+
+	vpmuludq	ymm12,ymm10,YMMWORD[((192-128))+rsi]
+	vpaddq	ymm3,ymm3,ymm12
+	vpmuludq	ymm14,ymm10,YMMWORD[((192-128))+r9]
+	vpbroadcastq	ymm0,QWORD[((256-128))+r15]
+	vpaddq	ymm4,ymm4,ymm14
+	vpmuludq	ymm5,ymm10,YMMWORD[((224-128))+r9]
+	vpbroadcastq	ymm10,QWORD[((0+8-128))+r15]
+	vpaddq	ymm5,ymm5,YMMWORD[((448-448))+r12]
+
+	vmovdqu	YMMWORD[(384-448)+r12],ymm3
+	vmovdqu	YMMWORD[(416-448)+r12],ymm4
+	lea	r15,[8+r15]
+
+	vpmuludq	ymm12,ymm11,YMMWORD[((224-128))+rsi]
+	vpaddq	ymm5,ymm5,ymm12
+	vpmuludq	ymm6,ymm11,YMMWORD[((224-128))+r9]
+	vpaddq	ymm6,ymm6,YMMWORD[((480-448))+r12]
+
+	vpmuludq	ymm7,ymm0,YMMWORD[((256-128))+rsi]
+	vmovdqu	YMMWORD[(448-448)+r12],ymm5
+	vpaddq	ymm7,ymm7,YMMWORD[((512-448))+r12]
+	vmovdqu	YMMWORD[(480-448)+r12],ymm6
+	vmovdqu	YMMWORD[(512-448)+r12],ymm7
+	lea	r12,[8+r12]
+
+	dec	r14d
+	jnz	NEAR $L$OOP_SQR_1024
+
+	vmovdqu	ymm8,YMMWORD[256+rsp]
+	vmovdqu	ymm1,YMMWORD[288+rsp]
+	vmovdqu	ymm2,YMMWORD[320+rsp]
+	lea	rbx,[192+rsp]
+
+	vpsrlq	ymm14,ymm8,29
+	vpand	ymm8,ymm8,ymm15
+	vpsrlq	ymm11,ymm1,29
+	vpand	ymm1,ymm1,ymm15
+
+	vpermq	ymm14,ymm14,0x93
+	vpxor	ymm9,ymm9,ymm9
+	vpermq	ymm11,ymm11,0x93
+
+	vpblendd	ymm10,ymm14,ymm9,3
+	vpblendd	ymm14,ymm11,ymm14,3
+	vpaddq	ymm8,ymm8,ymm10
+	vpblendd	ymm11,ymm9,ymm11,3
+	vpaddq	ymm1,ymm1,ymm14
+	vpaddq	ymm2,ymm2,ymm11
+	vmovdqu	YMMWORD[(288-192)+rbx],ymm1
+	vmovdqu	YMMWORD[(320-192)+rbx],ymm2
+
+	mov	rax,QWORD[rsp]
+	mov	r10,QWORD[8+rsp]
+	mov	r11,QWORD[16+rsp]
+	mov	r12,QWORD[24+rsp]
+	vmovdqu	ymm1,YMMWORD[32+rsp]
+	vmovdqu	ymm2,YMMWORD[((64-192))+rbx]
+	vmovdqu	ymm3,YMMWORD[((96-192))+rbx]
+	vmovdqu	ymm4,YMMWORD[((128-192))+rbx]
+	vmovdqu	ymm5,YMMWORD[((160-192))+rbx]
+	vmovdqu	ymm6,YMMWORD[((192-192))+rbx]
+	vmovdqu	ymm7,YMMWORD[((224-192))+rbx]
+
+	mov	r9,rax
+	imul	eax,ecx
+	and	eax,0x1fffffff
+	vmovd	xmm12,eax
+
+	mov	rdx,rax
+	imul	rax,QWORD[((-128))+r13]
+	vpbroadcastq	ymm12,xmm12
+	add	r9,rax
+	mov	rax,rdx
+	imul	rax,QWORD[((8-128))+r13]
+	shr	r9,29
+	add	r10,rax
+	mov	rax,rdx
+	imul	rax,QWORD[((16-128))+r13]
+	add	r10,r9
+	add	r11,rax
+	imul	rdx,QWORD[((24-128))+r13]
+	add	r12,rdx
+
+	mov	rax,r10
+	imul	eax,ecx
+	and	eax,0x1fffffff
+
+	mov	r14d,9
+	jmp	NEAR $L$OOP_REDUCE_1024
+
+ALIGN	32
+$L$OOP_REDUCE_1024:
+	vmovd	xmm13,eax
+	vpbroadcastq	ymm13,xmm13
+
+	vpmuludq	ymm10,ymm12,YMMWORD[((32-128))+r13]
+	mov	rdx,rax
+	imul	rax,QWORD[((-128))+r13]
+	vpaddq	ymm1,ymm1,ymm10
+	add	r10,rax
+	vpmuludq	ymm14,ymm12,YMMWORD[((64-128))+r13]
+	mov	rax,rdx
+	imul	rax,QWORD[((8-128))+r13]
+	vpaddq	ymm2,ymm2,ymm14
+	vpmuludq	ymm11,ymm12,YMMWORD[((96-128))+r13]
+	DB	0x67
+	add	r11,rax
+	DB	0x67
+	mov	rax,rdx
+	imul	rax,QWORD[((16-128))+r13]
+	shr	r10,29
+	vpaddq	ymm3,ymm3,ymm11
+	vpmuludq	ymm10,ymm12,YMMWORD[((128-128))+r13]
+	add	r12,rax
+	add	r11,r10
+	vpaddq	ymm4,ymm4,ymm10
+	vpmuludq	ymm14,ymm12,YMMWORD[((160-128))+r13]
+	mov	rax,r11
+	imul	eax,ecx
+	vpaddq	ymm5,ymm5,ymm14
+	vpmuludq	ymm11,ymm12,YMMWORD[((192-128))+r13]
+	and	eax,0x1fffffff
+	vpaddq	ymm6,ymm6,ymm11
+	vpmuludq	ymm10,ymm12,YMMWORD[((224-128))+r13]
+	vpaddq	ymm7,ymm7,ymm10
+	vpmuludq	ymm14,ymm12,YMMWORD[((256-128))+r13]
+	vmovd	xmm12,eax
+
+	vpaddq	ymm8,ymm8,ymm14
+
+	vpbroadcastq	ymm12,xmm12
+
+	vpmuludq	ymm11,ymm13,YMMWORD[((32-8-128))+r13]
+	vmovdqu	ymm14,YMMWORD[((96-8-128))+r13]
+	mov	rdx,rax
+	imul	rax,QWORD[((-128))+r13]
+	vpaddq	ymm1,ymm1,ymm11
+	vpmuludq	ymm10,ymm13,YMMWORD[((64-8-128))+r13]
+	vmovdqu	ymm11,YMMWORD[((128-8-128))+r13]
+	add	r11,rax
+	mov	rax,rdx
+	imul	rax,QWORD[((8-128))+r13]
+	vpaddq	ymm2,ymm2,ymm10
+	add	rax,r12
+	shr	r11,29
+	vpmuludq	ymm14,ymm14,ymm13
+	vmovdqu	ymm10,YMMWORD[((160-8-128))+r13]
+	add	rax,r11
+	vpaddq	ymm3,ymm3,ymm14
+	vpmuludq	ymm11,ymm11,ymm13
+	vmovdqu	ymm14,YMMWORD[((192-8-128))+r13]
+	DB	0x67
+	mov	r12,rax
+	imul	eax,ecx
+	vpaddq	ymm4,ymm4,ymm11
+	vpmuludq	ymm10,ymm10,ymm13
+	DB	0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00
+	and	eax,0x1fffffff
+	vpaddq	ymm5,ymm5,ymm10
+	vpmuludq	ymm14,ymm14,ymm13
+	vmovdqu	ymm10,YMMWORD[((256-8-128))+r13]
+	vpaddq	ymm6,ymm6,ymm14
+	vpmuludq	ymm11,ymm11,ymm13
+	vmovdqu	ymm9,YMMWORD[((288-8-128))+r13]
+	vmovd	xmm0,eax
+	imul	rax,QWORD[((-128))+r13]
+	vpaddq	ymm7,ymm7,ymm11
+	vpmuludq	ymm10,ymm10,ymm13
+	vmovdqu	ymm14,YMMWORD[((32-16-128))+r13]
+	vpbroadcastq	ymm0,xmm0
+	vpaddq	ymm8,ymm8,ymm10
+	vpmuludq	ymm9,ymm9,ymm13
+	vmovdqu	ymm11,YMMWORD[((64-16-128))+r13]
+	add	r12,rax
+
+	vmovdqu	ymm13,YMMWORD[((32-24-128))+r13]
+	vpmuludq	ymm14,ymm14,ymm12
+	vmovdqu	ymm10,YMMWORD[((96-16-128))+r13]
+	vpaddq	ymm1,ymm1,ymm14
+	vpmuludq	ymm13,ymm13,ymm0
+	vpmuludq	ymm11,ymm11,ymm12
+	DB	0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff
+	vpaddq	ymm13,ymm13,ymm1
+	vpaddq	ymm2,ymm2,ymm11
+	vpmuludq	ymm10,ymm10,ymm12
+	vmovdqu	ymm11,YMMWORD[((160-16-128))+r13]
+	DB	0x67
+	vmovq	rax,xmm13
+	vmovdqu	YMMWORD[rsp],ymm13
+	vpaddq	ymm3,ymm3,ymm10
+	vpmuludq	ymm14,ymm14,ymm12
+	vmovdqu	ymm10,YMMWORD[((192-16-128))+r13]
+	vpaddq	ymm4,ymm4,ymm14
+	vpmuludq	ymm11,ymm11,ymm12
+	vmovdqu	ymm14,YMMWORD[((224-16-128))+r13]
+	vpaddq	ymm5,ymm5,ymm11
+	vpmuludq	ymm10,ymm10,ymm12
+	vmovdqu	ymm11,YMMWORD[((256-16-128))+r13]
+	vpaddq	ymm6,ymm6,ymm10
+	vpmuludq	ymm14,ymm14,ymm12
+	shr	r12,29
+	vmovdqu	ymm10,YMMWORD[((288-16-128))+r13]
+	add	rax,r12
+	vpaddq	ymm7,ymm7,ymm14
+	vpmuludq	ymm11,ymm11,ymm12
+
+	mov	r9,rax
+	imul	eax,ecx
+	vpaddq	ymm8,ymm8,ymm11
+	vpmuludq	ymm10,ymm10,ymm12
+	and	eax,0x1fffffff
+	vmovd	xmm12,eax
+	vmovdqu	ymm11,YMMWORD[((96-24-128))+r13]
+	DB	0x67
+	vpaddq	ymm9,ymm9,ymm10
+	vpbroadcastq	ymm12,xmm12
+
+	vpmuludq	ymm14,ymm0,YMMWORD[((64-24-128))+r13]
+	vmovdqu	ymm10,YMMWORD[((128-24-128))+r13]
+	mov	rdx,rax
+	imul	rax,QWORD[((-128))+r13]
+	mov	r10,QWORD[8+rsp]
+	vpaddq	ymm1,ymm2,ymm14
+	vpmuludq	ymm11,ymm11,ymm0
+	vmovdqu	ymm14,YMMWORD[((160-24-128))+r13]
+	add	r9,rax
+	mov	rax,rdx
+	imul	rax,QWORD[((8-128))+r13]
+	DB	0x67
+	shr	r9,29
+	mov	r11,QWORD[16+rsp]
+	vpaddq	ymm2,ymm3,ymm11
+	vpmuludq	ymm10,ymm10,ymm0
+	vmovdqu	ymm11,YMMWORD[((192-24-128))+r13]
+	add	r10,rax
+	mov	rax,rdx
+	imul	rax,QWORD[((16-128))+r13]
+	vpaddq	ymm3,ymm4,ymm10
+	vpmuludq	ymm14,ymm14,ymm0
+	vmovdqu	ymm10,YMMWORD[((224-24-128))+r13]
+	imul	rdx,QWORD[((24-128))+r13]
+	add	r11,rax
+	lea	rax,[r10*1+r9]
+	vpaddq	ymm4,ymm5,ymm14
+	vpmuludq	ymm11,ymm11,ymm0
+	vmovdqu	ymm14,YMMWORD[((256-24-128))+r13]
+	mov	r10,rax
+	imul	eax,ecx
+	vpmuludq	ymm10,ymm10,ymm0
+	vpaddq	ymm5,ymm6,ymm11
+	vmovdqu	ymm11,YMMWORD[((288-24-128))+r13]
+	and	eax,0x1fffffff
+	vpaddq	ymm6,ymm7,ymm10
+	vpmuludq	ymm14,ymm14,ymm0
+	add	rdx,QWORD[24+rsp]
+	vpaddq	ymm7,ymm8,ymm14
+	vpmuludq	ymm11,ymm11,ymm0
+	vpaddq	ymm8,ymm9,ymm11
+	vmovq	xmm9,r12
+	mov	r12,rdx
+
+	dec	r14d
+	jnz	NEAR $L$OOP_REDUCE_1024
+	lea	r12,[448+rsp]
+	vpaddq	ymm0,ymm13,ymm9
+	vpxor	ymm9,ymm9,ymm9
+
+	vpaddq	ymm0,ymm0,YMMWORD[((288-192))+rbx]
+	vpaddq	ymm1,ymm1,YMMWORD[((320-448))+r12]
+	vpaddq	ymm2,ymm2,YMMWORD[((352-448))+r12]
+	vpaddq	ymm3,ymm3,YMMWORD[((384-448))+r12]
+	vpaddq	ymm4,ymm4,YMMWORD[((416-448))+r12]
+	vpaddq	ymm5,ymm5,YMMWORD[((448-448))+r12]
+	vpaddq	ymm6,ymm6,YMMWORD[((480-448))+r12]
+	vpaddq	ymm7,ymm7,YMMWORD[((512-448))+r12]
+	vpaddq	ymm8,ymm8,YMMWORD[((544-448))+r12]
+
+	vpsrlq	ymm14,ymm0,29
+	vpand	ymm0,ymm0,ymm15
+	vpsrlq	ymm11,ymm1,29
+	vpand	ymm1,ymm1,ymm15
+	vpsrlq	ymm12,ymm2,29
+	vpermq	ymm14,ymm14,0x93
+	vpand	ymm2,ymm2,ymm15
+	vpsrlq	ymm13,ymm3,29
+	vpermq	ymm11,ymm11,0x93
+	vpand	ymm3,ymm3,ymm15
+	vpermq	ymm12,ymm12,0x93
+
+	vpblendd	ymm10,ymm14,ymm9,3
+	vpermq	ymm13,ymm13,0x93
+	vpblendd	ymm14,ymm11,ymm14,3
+	vpaddq	ymm0,ymm0,ymm10
+	vpblendd	ymm11,ymm12,ymm11,3
+	vpaddq	ymm1,ymm1,ymm14
+	vpblendd	ymm12,ymm13,ymm12,3
+	vpaddq	ymm2,ymm2,ymm11
+	vpblendd	ymm13,ymm9,ymm13,3
+	vpaddq	ymm3,ymm3,ymm12
+	vpaddq	ymm4,ymm4,ymm13
+
+	vpsrlq	ymm14,ymm0,29
+	vpand	ymm0,ymm0,ymm15
+	vpsrlq	ymm11,ymm1,29
+	vpand	ymm1,ymm1,ymm15
+	vpsrlq	ymm12,ymm2,29
+	vpermq	ymm14,ymm14,0x93
+	vpand	ymm2,ymm2,ymm15
+	vpsrlq	ymm13,ymm3,29
+	vpermq	ymm11,ymm11,0x93
+	vpand	ymm3,ymm3,ymm15
+	vpermq	ymm12,ymm12,0x93
+
+	vpblendd	ymm10,ymm14,ymm9,3
+	vpermq	ymm13,ymm13,0x93
+	vpblendd	ymm14,ymm11,ymm14,3
+	vpaddq	ymm0,ymm0,ymm10
+	vpblendd	ymm11,ymm12,ymm11,3
+	vpaddq	ymm1,ymm1,ymm14
+	vmovdqu	YMMWORD[(0-128)+rdi],ymm0
+	vpblendd	ymm12,ymm13,ymm12,3
+	vpaddq	ymm2,ymm2,ymm11
+	vmovdqu	YMMWORD[(32-128)+rdi],ymm1
+	vpblendd	ymm13,ymm9,ymm13,3
+	vpaddq	ymm3,ymm3,ymm12
+	vmovdqu	YMMWORD[(64-128)+rdi],ymm2
+	vpaddq	ymm4,ymm4,ymm13
+	vmovdqu	YMMWORD[(96-128)+rdi],ymm3
+	vpsrlq	ymm14,ymm4,29
+	vpand	ymm4,ymm4,ymm15
+	vpsrlq	ymm11,ymm5,29
+	vpand	ymm5,ymm5,ymm15
+	vpsrlq	ymm12,ymm6,29
+	vpermq	ymm14,ymm14,0x93
+	vpand	ymm6,ymm6,ymm15
+	vpsrlq	ymm13,ymm7,29
+	vpermq	ymm11,ymm11,0x93
+	vpand	ymm7,ymm7,ymm15
+	vpsrlq	ymm0,ymm8,29
+	vpermq	ymm12,ymm12,0x93
+	vpand	ymm8,ymm8,ymm15
+	vpermq	ymm13,ymm13,0x93
+
+	vpblendd	ymm10,ymm14,ymm9,3
+	vpermq	ymm0,ymm0,0x93
+	vpblendd	ymm14,ymm11,ymm14,3
+	vpaddq	ymm4,ymm4,ymm10
+	vpblendd	ymm11,ymm12,ymm11,3
+	vpaddq	ymm5,ymm5,ymm14
+	vpblendd	ymm12,ymm13,ymm12,3
+	vpaddq	ymm6,ymm6,ymm11
+	vpblendd	ymm13,ymm0,ymm13,3
+	vpaddq	ymm7,ymm7,ymm12
+	vpaddq	ymm8,ymm8,ymm13
+
+	vpsrlq	ymm14,ymm4,29
+	vpand	ymm4,ymm4,ymm15
+	vpsrlq	ymm11,ymm5,29
+	vpand	ymm5,ymm5,ymm15
+	vpsrlq	ymm12,ymm6,29
+	vpermq	ymm14,ymm14,0x93
+	vpand	ymm6,ymm6,ymm15
+	vpsrlq	ymm13,ymm7,29
+	vpermq	ymm11,ymm11,0x93
+	vpand	ymm7,ymm7,ymm15
+	vpsrlq	ymm0,ymm8,29
+	vpermq	ymm12,ymm12,0x93
+	vpand	ymm8,ymm8,ymm15
+	vpermq	ymm13,ymm13,0x93
+
+	vpblendd	ymm10,ymm14,ymm9,3
+	vpermq	ymm0,ymm0,0x93
+	vpblendd	ymm14,ymm11,ymm14,3
+	vpaddq	ymm4,ymm4,ymm10
+	vpblendd	ymm11,ymm12,ymm11,3
+	vpaddq	ymm5,ymm5,ymm14
+	vmovdqu	YMMWORD[(128-128)+rdi],ymm4
+	vpblendd	ymm12,ymm13,ymm12,3
+	vpaddq	ymm6,ymm6,ymm11
+	vmovdqu	YMMWORD[(160-128)+rdi],ymm5
+	vpblendd	ymm13,ymm0,ymm13,3
+	vpaddq	ymm7,ymm7,ymm12
+	vmovdqu	YMMWORD[(192-128)+rdi],ymm6
+	vpaddq	ymm8,ymm8,ymm13
+	vmovdqu	YMMWORD[(224-128)+rdi],ymm7
+	vmovdqu	YMMWORD[(256-128)+rdi],ymm8
+
+	mov	rsi,rdi
+	dec	r8d
+	jne	NEAR $L$OOP_GRANDE_SQR_1024
+
+	vzeroall
+	mov	rax,rbp
+
+$L$sqr_1024_in_tail:
+	movaps	xmm6,XMMWORD[((-216))+rax]
+	movaps	xmm7,XMMWORD[((-200))+rax]
+	movaps	xmm8,XMMWORD[((-184))+rax]
+	movaps	xmm9,XMMWORD[((-168))+rax]
+	movaps	xmm10,XMMWORD[((-152))+rax]
+	movaps	xmm11,XMMWORD[((-136))+rax]
+	movaps	xmm12,XMMWORD[((-120))+rax]
+	movaps	xmm13,XMMWORD[((-104))+rax]
+	movaps	xmm14,XMMWORD[((-88))+rax]
+	movaps	xmm15,XMMWORD[((-72))+rax]
+	mov	r15,QWORD[((-48))+rax]
+
+	mov	r14,QWORD[((-40))+rax]
+
+	mov	r13,QWORD[((-32))+rax]
+
+	mov	r12,QWORD[((-24))+rax]
+
+	mov	rbp,QWORD[((-16))+rax]
+
+	mov	rbx,QWORD[((-8))+rax]
+
+	lea	rsp,[rax]
+
+$L$sqr_1024_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_rsaz_1024_sqr_avx2:
+global	rsaz_1024_mul_avx2
+
+ALIGN	64
+rsaz_1024_mul_avx2:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_rsaz_1024_mul_avx2:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+
+
+
+_CET_ENDBR
+	lea	rax,[rsp]
+
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	vzeroupper
+	lea	rsp,[((-168))+rsp]
+	vmovaps	XMMWORD[(-216)+rax],xmm6
+	vmovaps	XMMWORD[(-200)+rax],xmm7
+	vmovaps	XMMWORD[(-184)+rax],xmm8
+	vmovaps	XMMWORD[(-168)+rax],xmm9
+	vmovaps	XMMWORD[(-152)+rax],xmm10
+	vmovaps	XMMWORD[(-136)+rax],xmm11
+	vmovaps	XMMWORD[(-120)+rax],xmm12
+	vmovaps	XMMWORD[(-104)+rax],xmm13
+	vmovaps	XMMWORD[(-88)+rax],xmm14
+	vmovaps	XMMWORD[(-72)+rax],xmm15
+$L$mul_1024_body:
+	mov	rbp,rax
+
+	vzeroall
+	mov	r13,rdx
+	sub	rsp,64
+
+
+
+
+
+
+	DB	0x67,0x67
+	mov	r15,rsi
+	and	r15,4095
+	add	r15,32*10
+	shr	r15,12
+	mov	r15,rsi
+	cmovnz	rsi,r13
+	cmovnz	r13,r15
+
+	mov	r15,rcx
+	sub	rsi,-128
+	sub	rcx,-128
+	sub	rdi,-128
+
+	and	r15,4095
+	add	r15,32*10
+	DB	0x67,0x67
+	shr	r15,12
+	jz	NEAR $L$mul_1024_no_n_copy
+
+
+
+
+
+	sub	rsp,32*10
+	vmovdqu	ymm0,YMMWORD[((0-128))+rcx]
+	and	rsp,-512
+	vmovdqu	ymm1,YMMWORD[((32-128))+rcx]
+	vmovdqu	ymm2,YMMWORD[((64-128))+rcx]
+	vmovdqu	ymm3,YMMWORD[((96-128))+rcx]
+	vmovdqu	ymm4,YMMWORD[((128-128))+rcx]
+	vmovdqu	ymm5,YMMWORD[((160-128))+rcx]
+	vmovdqu	ymm6,YMMWORD[((192-128))+rcx]
+	vmovdqu	ymm7,YMMWORD[((224-128))+rcx]
+	vmovdqu	ymm8,YMMWORD[((256-128))+rcx]
+	lea	rcx,[((64+128))+rsp]
+	vmovdqu	YMMWORD[(0-128)+rcx],ymm0
+	vpxor	ymm0,ymm0,ymm0
+	vmovdqu	YMMWORD[(32-128)+rcx],ymm1
+	vpxor	ymm1,ymm1,ymm1
+	vmovdqu	YMMWORD[(64-128)+rcx],ymm2
+	vpxor	ymm2,ymm2,ymm2
+	vmovdqu	YMMWORD[(96-128)+rcx],ymm3
+	vpxor	ymm3,ymm3,ymm3
+	vmovdqu	YMMWORD[(128-128)+rcx],ymm4
+	vpxor	ymm4,ymm4,ymm4
+	vmovdqu	YMMWORD[(160-128)+rcx],ymm5
+	vpxor	ymm5,ymm5,ymm5
+	vmovdqu	YMMWORD[(192-128)+rcx],ymm6
+	vpxor	ymm6,ymm6,ymm6
+	vmovdqu	YMMWORD[(224-128)+rcx],ymm7
+	vpxor	ymm7,ymm7,ymm7
+	vmovdqu	YMMWORD[(256-128)+rcx],ymm8
+	vmovdqa	ymm8,ymm0
+	vmovdqu	YMMWORD[(288-128)+rcx],ymm9
+$L$mul_1024_no_n_copy:
+	and	rsp,-64
+
+	mov	rbx,QWORD[r13]
+	vpbroadcastq	ymm10,QWORD[r13]
+	vmovdqu	YMMWORD[rsp],ymm0
+	xor	r9,r9
+	DB	0x67
+	xor	r10,r10
+	xor	r11,r11
+	xor	r12,r12
+
+	vmovdqu	ymm15,YMMWORD[$L$and_mask]
+	mov	r14d,9
+	vmovdqu	YMMWORD[(288-128)+rdi],ymm9
+	jmp	NEAR $L$oop_mul_1024
+
+ALIGN	32
+$L$oop_mul_1024:
+	vpsrlq	ymm9,ymm3,29
+	mov	rax,rbx
+	imul	rax,QWORD[((-128))+rsi]
+	add	rax,r9
+	mov	r10,rbx
+	imul	r10,QWORD[((8-128))+rsi]
+	add	r10,QWORD[8+rsp]
+
+	mov	r9,rax
+	imul	eax,r8d
+	and	eax,0x1fffffff
+
+	mov	r11,rbx
+	imul	r11,QWORD[((16-128))+rsi]
+	add	r11,QWORD[16+rsp]
+
+	mov	r12,rbx
+	imul	r12,QWORD[((24-128))+rsi]
+	add	r12,QWORD[24+rsp]
+	vpmuludq	ymm0,ymm10,YMMWORD[((32-128))+rsi]
+	vmovd	xmm11,eax
+	vpaddq	ymm1,ymm1,ymm0
+	vpmuludq	ymm12,ymm10,YMMWORD[((64-128))+rsi]
+	vpbroadcastq	ymm11,xmm11
+	vpaddq	ymm2,ymm2,ymm12
+	vpmuludq	ymm13,ymm10,YMMWORD[((96-128))+rsi]
+	vpand	ymm3,ymm3,ymm15
+	vpaddq	ymm3,ymm3,ymm13
+	vpmuludq	ymm0,ymm10,YMMWORD[((128-128))+rsi]
+	vpaddq	ymm4,ymm4,ymm0
+	vpmuludq	ymm12,ymm10,YMMWORD[((160-128))+rsi]
+	vpaddq	ymm5,ymm5,ymm12
+	vpmuludq	ymm13,ymm10,YMMWORD[((192-128))+rsi]
+	vpaddq	ymm6,ymm6,ymm13
+	vpmuludq	ymm0,ymm10,YMMWORD[((224-128))+rsi]
+	vpermq	ymm9,ymm9,0x93
+	vpaddq	ymm7,ymm7,ymm0
+	vpmuludq	ymm12,ymm10,YMMWORD[((256-128))+rsi]
+	vpbroadcastq	ymm10,QWORD[8+r13]
+	vpaddq	ymm8,ymm8,ymm12
+
+	mov	rdx,rax
+	imul	rax,QWORD[((-128))+rcx]
+	add	r9,rax
+	mov	rax,rdx
+	imul	rax,QWORD[((8-128))+rcx]
+	add	r10,rax
+	mov	rax,rdx
+	imul	rax,QWORD[((16-128))+rcx]
+	add	r11,rax
+	shr	r9,29
+	imul	rdx,QWORD[((24-128))+rcx]
+	add	r12,rdx
+	add	r10,r9
+
+	vpmuludq	ymm13,ymm11,YMMWORD[((32-128))+rcx]
+	vmovq	rbx,xmm10
+	vpaddq	ymm1,ymm1,ymm13
+	vpmuludq	ymm0,ymm11,YMMWORD[((64-128))+rcx]
+	vpaddq	ymm2,ymm2,ymm0
+	vpmuludq	ymm12,ymm11,YMMWORD[((96-128))+rcx]
+	vpaddq	ymm3,ymm3,ymm12
+	vpmuludq	ymm13,ymm11,YMMWORD[((128-128))+rcx]
+	vpaddq	ymm4,ymm4,ymm13
+	vpmuludq	ymm0,ymm11,YMMWORD[((160-128))+rcx]
+	vpaddq	ymm5,ymm5,ymm0
+	vpmuludq	ymm12,ymm11,YMMWORD[((192-128))+rcx]
+	vpaddq	ymm6,ymm6,ymm12
+	vpmuludq	ymm13,ymm11,YMMWORD[((224-128))+rcx]
+	vpblendd	ymm12,ymm9,ymm14,3
+	vpaddq	ymm7,ymm7,ymm13
+	vpmuludq	ymm0,ymm11,YMMWORD[((256-128))+rcx]
+	vpaddq	ymm3,ymm3,ymm12
+	vpaddq	ymm8,ymm8,ymm0
+
+	mov	rax,rbx
+	imul	rax,QWORD[((-128))+rsi]
+	add	r10,rax
+	vmovdqu	ymm12,YMMWORD[((-8+32-128))+rsi]
+	mov	rax,rbx
+	imul	rax,QWORD[((8-128))+rsi]
+	add	r11,rax
+	vmovdqu	ymm13,YMMWORD[((-8+64-128))+rsi]
+
+	mov	rax,r10
+	vpblendd	ymm9,ymm9,ymm14,0xfc
+	imul	eax,r8d
+	vpaddq	ymm4,ymm4,ymm9
+	and	eax,0x1fffffff
+
+	imul	rbx,QWORD[((16-128))+rsi]
+	add	r12,rbx
+	vpmuludq	ymm12,ymm12,ymm10
+	vmovd	xmm11,eax
+	vmovdqu	ymm0,YMMWORD[((-8+96-128))+rsi]
+	vpaddq	ymm1,ymm1,ymm12
+	vpmuludq	ymm13,ymm13,ymm10
+	vpbroadcastq	ymm11,xmm11
+	vmovdqu	ymm12,YMMWORD[((-8+128-128))+rsi]
+	vpaddq	ymm2,ymm2,ymm13
+	vpmuludq	ymm0,ymm0,ymm10
+	vmovdqu	ymm13,YMMWORD[((-8+160-128))+rsi]
+	vpaddq	ymm3,ymm3,ymm0
+	vpmuludq	ymm12,ymm12,ymm10
+	vmovdqu	ymm0,YMMWORD[((-8+192-128))+rsi]
+	vpaddq	ymm4,ymm4,ymm12
+	vpmuludq	ymm13,ymm13,ymm10
+	vmovdqu	ymm12,YMMWORD[((-8+224-128))+rsi]
+	vpaddq	ymm5,ymm5,ymm13
+	vpmuludq	ymm0,ymm0,ymm10
+	vmovdqu	ymm13,YMMWORD[((-8+256-128))+rsi]
+	vpaddq	ymm6,ymm6,ymm0
+	vpmuludq	ymm12,ymm12,ymm10
+	vmovdqu	ymm9,YMMWORD[((-8+288-128))+rsi]
+	vpaddq	ymm7,ymm7,ymm12
+	vpmuludq	ymm13,ymm13,ymm10
+	vpaddq	ymm8,ymm8,ymm13
+	vpmuludq	ymm9,ymm9,ymm10
+	vpbroadcastq	ymm10,QWORD[16+r13]
+
+	mov	rdx,rax
+	imul	rax,QWORD[((-128))+rcx]
+	add	r10,rax
+	vmovdqu	ymm0,YMMWORD[((-8+32-128))+rcx]
+	mov	rax,rdx
+	imul	rax,QWORD[((8-128))+rcx]
+	add	r11,rax
+	vmovdqu	ymm12,YMMWORD[((-8+64-128))+rcx]
+	shr	r10,29
+	imul	rdx,QWORD[((16-128))+rcx]
+	add	r12,rdx
+	add	r11,r10
+
+	vpmuludq	ymm0,ymm0,ymm11
+	vmovq	rbx,xmm10
+	vmovdqu	ymm13,YMMWORD[((-8+96-128))+rcx]
+	vpaddq	ymm1,ymm1,ymm0
+	vpmuludq	ymm12,ymm12,ymm11
+	vmovdqu	ymm0,YMMWORD[((-8+128-128))+rcx]
+	vpaddq	ymm2,ymm2,ymm12
+	vpmuludq	ymm13,ymm13,ymm11
+	vmovdqu	ymm12,YMMWORD[((-8+160-128))+rcx]
+	vpaddq	ymm3,ymm3,ymm13
+	vpmuludq	ymm0,ymm0,ymm11
+	vmovdqu	ymm13,YMMWORD[((-8+192-128))+rcx]
+	vpaddq	ymm4,ymm4,ymm0
+	vpmuludq	ymm12,ymm12,ymm11
+	vmovdqu	ymm0,YMMWORD[((-8+224-128))+rcx]
+	vpaddq	ymm5,ymm5,ymm12
+	vpmuludq	ymm13,ymm13,ymm11
+	vmovdqu	ymm12,YMMWORD[((-8+256-128))+rcx]
+	vpaddq	ymm6,ymm6,ymm13
+	vpmuludq	ymm0,ymm0,ymm11
+	vmovdqu	ymm13,YMMWORD[((-8+288-128))+rcx]
+	vpaddq	ymm7,ymm7,ymm0
+	vpmuludq	ymm12,ymm12,ymm11
+	vpaddq	ymm8,ymm8,ymm12
+	vpmuludq	ymm13,ymm13,ymm11
+	vpaddq	ymm9,ymm9,ymm13
+
+	vmovdqu	ymm0,YMMWORD[((-16+32-128))+rsi]
+	mov	rax,rbx
+	imul	rax,QWORD[((-128))+rsi]
+	add	rax,r11
+
+	vmovdqu	ymm12,YMMWORD[((-16+64-128))+rsi]
+	mov	r11,rax
+	imul	eax,r8d
+	and	eax,0x1fffffff
+
+	imul	rbx,QWORD[((8-128))+rsi]
+	add	r12,rbx
+	vpmuludq	ymm0,ymm0,ymm10
+	vmovd	xmm11,eax
+	vmovdqu	ymm13,YMMWORD[((-16+96-128))+rsi]
+	vpaddq	ymm1,ymm1,ymm0
+	vpmuludq	ymm12,ymm12,ymm10
+	vpbroadcastq	ymm11,xmm11
+	vmovdqu	ymm0,YMMWORD[((-16+128-128))+rsi]
+	vpaddq	ymm2,ymm2,ymm12
+	vpmuludq	ymm13,ymm13,ymm10
+	vmovdqu	ymm12,YMMWORD[((-16+160-128))+rsi]
+	vpaddq	ymm3,ymm3,ymm13
+	vpmuludq	ymm0,ymm0,ymm10
+	vmovdqu	ymm13,YMMWORD[((-16+192-128))+rsi]
+	vpaddq	ymm4,ymm4,ymm0
+	vpmuludq	ymm12,ymm12,ymm10
+	vmovdqu	ymm0,YMMWORD[((-16+224-128))+rsi]
+	vpaddq	ymm5,ymm5,ymm12
+	vpmuludq	ymm13,ymm13,ymm10
+	vmovdqu	ymm12,YMMWORD[((-16+256-128))+rsi]
+	vpaddq	ymm6,ymm6,ymm13
+	vpmuludq	ymm0,ymm0,ymm10
+	vmovdqu	ymm13,YMMWORD[((-16+288-128))+rsi]
+	vpaddq	ymm7,ymm7,ymm0
+	vpmuludq	ymm12,ymm12,ymm10
+	vpaddq	ymm8,ymm8,ymm12
+	vpmuludq	ymm13,ymm13,ymm10
+	vpbroadcastq	ymm10,QWORD[24+r13]
+	vpaddq	ymm9,ymm9,ymm13
+
+	vmovdqu	ymm0,YMMWORD[((-16+32-128))+rcx]
+	mov	rdx,rax
+	imul	rax,QWORD[((-128))+rcx]
+	add	r11,rax
+	vmovdqu	ymm12,YMMWORD[((-16+64-128))+rcx]
+	imul	rdx,QWORD[((8-128))+rcx]
+	add	r12,rdx
+	shr	r11,29
+
+	vpmuludq	ymm0,ymm0,ymm11
+	vmovq	rbx,xmm10
+	vmovdqu	ymm13,YMMWORD[((-16+96-128))+rcx]
+	vpaddq	ymm1,ymm1,ymm0
+	vpmuludq	ymm12,ymm12,ymm11
+	vmovdqu	ymm0,YMMWORD[((-16+128-128))+rcx]
+	vpaddq	ymm2,ymm2,ymm12
+	vpmuludq	ymm13,ymm13,ymm11
+	vmovdqu	ymm12,YMMWORD[((-16+160-128))+rcx]
+	vpaddq	ymm3,ymm3,ymm13
+	vpmuludq	ymm0,ymm0,ymm11
+	vmovdqu	ymm13,YMMWORD[((-16+192-128))+rcx]
+	vpaddq	ymm4,ymm4,ymm0
+	vpmuludq	ymm12,ymm12,ymm11
+	vmovdqu	ymm0,YMMWORD[((-16+224-128))+rcx]
+	vpaddq	ymm5,ymm5,ymm12
+	vpmuludq	ymm13,ymm13,ymm11
+	vmovdqu	ymm12,YMMWORD[((-16+256-128))+rcx]
+	vpaddq	ymm6,ymm6,ymm13
+	vpmuludq	ymm0,ymm0,ymm11
+	vmovdqu	ymm13,YMMWORD[((-16+288-128))+rcx]
+	vpaddq	ymm7,ymm7,ymm0
+	vpmuludq	ymm12,ymm12,ymm11
+	vmovdqu	ymm0,YMMWORD[((-24+32-128))+rsi]
+	vpaddq	ymm8,ymm8,ymm12
+	vpmuludq	ymm13,ymm13,ymm11
+	vmovdqu	ymm12,YMMWORD[((-24+64-128))+rsi]
+	vpaddq	ymm9,ymm9,ymm13
+
+	add	r12,r11
+	imul	rbx,QWORD[((-128))+rsi]
+	add	r12,rbx
+
+	mov	rax,r12
+	imul	eax,r8d
+	and	eax,0x1fffffff
+
+	vpmuludq	ymm0,ymm0,ymm10
+	vmovd	xmm11,eax
+	vmovdqu	ymm13,YMMWORD[((-24+96-128))+rsi]
+	vpaddq	ymm1,ymm1,ymm0
+	vpmuludq	ymm12,ymm12,ymm10
+	vpbroadcastq	ymm11,xmm11
+	vmovdqu	ymm0,YMMWORD[((-24+128-128))+rsi]
+	vpaddq	ymm2,ymm2,ymm12
+	vpmuludq	ymm13,ymm13,ymm10
+	vmovdqu	ymm12,YMMWORD[((-24+160-128))+rsi]
+	vpaddq	ymm3,ymm3,ymm13
+	vpmuludq	ymm0,ymm0,ymm10
+	vmovdqu	ymm13,YMMWORD[((-24+192-128))+rsi]
+	vpaddq	ymm4,ymm4,ymm0
+	vpmuludq	ymm12,ymm12,ymm10
+	vmovdqu	ymm0,YMMWORD[((-24+224-128))+rsi]
+	vpaddq	ymm5,ymm5,ymm12
+	vpmuludq	ymm13,ymm13,ymm10
+	vmovdqu	ymm12,YMMWORD[((-24+256-128))+rsi]
+	vpaddq	ymm6,ymm6,ymm13
+	vpmuludq	ymm0,ymm0,ymm10
+	vmovdqu	ymm13,YMMWORD[((-24+288-128))+rsi]
+	vpaddq	ymm7,ymm7,ymm0
+	vpmuludq	ymm12,ymm12,ymm10
+	vpaddq	ymm8,ymm8,ymm12
+	vpmuludq	ymm13,ymm13,ymm10
+	vpbroadcastq	ymm10,QWORD[32+r13]
+	vpaddq	ymm9,ymm9,ymm13
+	add	r13,32
+
+	vmovdqu	ymm0,YMMWORD[((-24+32-128))+rcx]
+	imul	rax,QWORD[((-128))+rcx]
+	add	r12,rax
+	shr	r12,29
+
+	vmovdqu	ymm12,YMMWORD[((-24+64-128))+rcx]
+	vpmuludq	ymm0,ymm0,ymm11
+	vmovq	rbx,xmm10
+	vmovdqu	ymm13,YMMWORD[((-24+96-128))+rcx]
+	vpaddq	ymm0,ymm1,ymm0
+	vpmuludq	ymm12,ymm12,ymm11
+	vmovdqu	YMMWORD[rsp],ymm0
+	vpaddq	ymm1,ymm2,ymm12
+	vmovdqu	ymm0,YMMWORD[((-24+128-128))+rcx]
+	vpmuludq	ymm13,ymm13,ymm11
+	vmovdqu	ymm12,YMMWORD[((-24+160-128))+rcx]
+	vpaddq	ymm2,ymm3,ymm13
+	vpmuludq	ymm0,ymm0,ymm11
+	vmovdqu	ymm13,YMMWORD[((-24+192-128))+rcx]
+	vpaddq	ymm3,ymm4,ymm0
+	vpmuludq	ymm12,ymm12,ymm11
+	vmovdqu	ymm0,YMMWORD[((-24+224-128))+rcx]
+	vpaddq	ymm4,ymm5,ymm12
+	vpmuludq	ymm13,ymm13,ymm11
+	vmovdqu	ymm12,YMMWORD[((-24+256-128))+rcx]
+	vpaddq	ymm5,ymm6,ymm13
+	vpmuludq	ymm0,ymm0,ymm11
+	vmovdqu	ymm13,YMMWORD[((-24+288-128))+rcx]
+	mov	r9,r12
+	vpaddq	ymm6,ymm7,ymm0
+	vpmuludq	ymm12,ymm12,ymm11
+	add	r9,QWORD[rsp]
+	vpaddq	ymm7,ymm8,ymm12
+	vpmuludq	ymm13,ymm13,ymm11
+	vmovq	xmm12,r12
+	vpaddq	ymm8,ymm9,ymm13
+
+	dec	r14d
+	jnz	NEAR $L$oop_mul_1024
+	vpaddq	ymm0,ymm12,YMMWORD[rsp]
+
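+; Final carry propagation: each limb is reduced to 29 bits (vpand with
+; the 2^29-1 mask in ymm15) and the carry words are rotated up one lane
+; (vpermq 0x93) and blended back in; two passes suffice to renormalize.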
+	vpsrlq	ymm12,ymm0,29
+	vpand	ymm0,ymm0,ymm15
+	vpsrlq	ymm13,ymm1,29
+	vpand	ymm1,ymm1,ymm15
+	vpsrlq	ymm10,ymm2,29
+	vpermq	ymm12,ymm12,0x93
+	vpand	ymm2,ymm2,ymm15
+	vpsrlq	ymm11,ymm3,29
+	vpermq	ymm13,ymm13,0x93
+	vpand	ymm3,ymm3,ymm15
+
+	vpblendd	ymm9,ymm12,ymm14,3
+	vpermq	ymm10,ymm10,0x93
+	vpblendd	ymm12,ymm13,ymm12,3
+	vpermq	ymm11,ymm11,0x93
+	vpaddq	ymm0,ymm0,ymm9
+	vpblendd	ymm13,ymm10,ymm13,3
+	vpaddq	ymm1,ymm1,ymm12
+	vpblendd	ymm10,ymm11,ymm10,3
+	vpaddq	ymm2,ymm2,ymm13
+	vpblendd	ymm11,ymm14,ymm11,3
+	vpaddq	ymm3,ymm3,ymm10
+	vpaddq	ymm4,ymm4,ymm11
+
+	vpsrlq	ymm12,ymm0,29
+	vpand	ymm0,ymm0,ymm15
+	vpsrlq	ymm13,ymm1,29
+	vpand	ymm1,ymm1,ymm15
+	vpsrlq	ymm10,ymm2,29
+	vpermq	ymm12,ymm12,0x93
+	vpand	ymm2,ymm2,ymm15
+	vpsrlq	ymm11,ymm3,29
+	vpermq	ymm13,ymm13,0x93
+	vpand	ymm3,ymm3,ymm15
+	vpermq	ymm10,ymm10,0x93
+
+	vpblendd	ymm9,ymm12,ymm14,3
+	vpermq	ymm11,ymm11,0x93
+	vpblendd	ymm12,ymm13,ymm12,3
+	vpaddq	ymm0,ymm0,ymm9
+	vpblendd	ymm13,ymm10,ymm13,3
+	vpaddq	ymm1,ymm1,ymm12
+	vpblendd	ymm10,ymm11,ymm10,3
+	vpaddq	ymm2,ymm2,ymm13
+	vpblendd	ymm11,ymm14,ymm11,3
+	vpaddq	ymm3,ymm3,ymm10
+	vpaddq	ymm4,ymm4,ymm11
+
+	vmovdqu	YMMWORD[(0-128)+rdi],ymm0
+	vmovdqu	YMMWORD[(32-128)+rdi],ymm1
+	vmovdqu	YMMWORD[(64-128)+rdi],ymm2
+	vmovdqu	YMMWORD[(96-128)+rdi],ymm3
+	vpsrlq	ymm12,ymm4,29
+	vpand	ymm4,ymm4,ymm15
+	vpsrlq	ymm13,ymm5,29
+	vpand	ymm5,ymm5,ymm15
+	vpsrlq	ymm10,ymm6,29
+	vpermq	ymm12,ymm12,0x93
+	vpand	ymm6,ymm6,ymm15
+	vpsrlq	ymm11,ymm7,29
+	vpermq	ymm13,ymm13,0x93
+	vpand	ymm7,ymm7,ymm15
+	vpsrlq	ymm0,ymm8,29
+	vpermq	ymm10,ymm10,0x93
+	vpand	ymm8,ymm8,ymm15
+	vpermq	ymm11,ymm11,0x93
+
+	vpblendd	ymm9,ymm12,ymm14,3
+	vpermq	ymm0,ymm0,0x93
+	vpblendd	ymm12,ymm13,ymm12,3
+	vpaddq	ymm4,ymm4,ymm9
+	vpblendd	ymm13,ymm10,ymm13,3
+	vpaddq	ymm5,ymm5,ymm12
+	vpblendd	ymm10,ymm11,ymm10,3
+	vpaddq	ymm6,ymm6,ymm13
+	vpblendd	ymm11,ymm0,ymm11,3
+	vpaddq	ymm7,ymm7,ymm10
+	vpaddq	ymm8,ymm8,ymm11
+
+	vpsrlq	ymm12,ymm4,29
+	vpand	ymm4,ymm4,ymm15
+	vpsrlq	ymm13,ymm5,29
+	vpand	ymm5,ymm5,ymm15
+	vpsrlq	ymm10,ymm6,29
+	vpermq	ymm12,ymm12,0x93
+	vpand	ymm6,ymm6,ymm15
+	vpsrlq	ymm11,ymm7,29
+	vpermq	ymm13,ymm13,0x93
+	vpand	ymm7,ymm7,ymm15
+	vpsrlq	ymm0,ymm8,29
+	vpermq	ymm10,ymm10,0x93
+	vpand	ymm8,ymm8,ymm15
+	vpermq	ymm11,ymm11,0x93
+
+	vpblendd	ymm9,ymm12,ymm14,3
+	vpermq	ymm0,ymm0,0x93
+	vpblendd	ymm12,ymm13,ymm12,3
+	vpaddq	ymm4,ymm4,ymm9
+	vpblendd	ymm13,ymm10,ymm13,3
+	vpaddq	ymm5,ymm5,ymm12
+	vpblendd	ymm10,ymm11,ymm10,3
+	vpaddq	ymm6,ymm6,ymm13
+	vpblendd	ymm11,ymm0,ymm11,3
+	vpaddq	ymm7,ymm7,ymm10
+	vpaddq	ymm8,ymm8,ymm11
+
+	vmovdqu	YMMWORD[(128-128)+rdi],ymm4
+	vmovdqu	YMMWORD[(160-128)+rdi],ymm5
+	vmovdqu	YMMWORD[(192-128)+rdi],ymm6
+	vmovdqu	YMMWORD[(224-128)+rdi],ymm7
+	vmovdqu	YMMWORD[(256-128)+rdi],ymm8
+	vzeroupper
+
+	mov	rax,rbp
+
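+; Epilogue: rax points at the original frame; restore the non-volatile
+; xmm6-xmm15 and callee-saved GPRs spilled in the prologue, then unwind
+; rsp.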
+$L$mul_1024_in_tail:
+	movaps	xmm6,XMMWORD[((-216))+rax]
+	movaps	xmm7,XMMWORD[((-200))+rax]
+	movaps	xmm8,XMMWORD[((-184))+rax]
+	movaps	xmm9,XMMWORD[((-168))+rax]
+	movaps	xmm10,XMMWORD[((-152))+rax]
+	movaps	xmm11,XMMWORD[((-136))+rax]
+	movaps	xmm12,XMMWORD[((-120))+rax]
+	movaps	xmm13,XMMWORD[((-104))+rax]
+	movaps	xmm14,XMMWORD[((-88))+rax]
+	movaps	xmm15,XMMWORD[((-72))+rax]
+	mov	r15,QWORD[((-48))+rax]
+
+	mov	r14,QWORD[((-40))+rax]
+
+	mov	r13,QWORD[((-32))+rax]
+
+	mov	r12,QWORD[((-24))+rax]
+
+	mov	rbp,QWORD[((-16))+rax]
+
+	mov	rbx,QWORD[((-8))+rax]
+
+	lea	rsp,[rax]
+
+$L$mul_1024_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_rsaz_1024_mul_avx2:
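+
+; rsaz_1024_red2norm_avx2 converts a 1024-bit value from the redundant
+; representation used above (one 29-bit limb per 64-bit word, input in
+; rdx) back into sixteen contiguous 64-bit words at rcx, carrying between
+; words as it repacks.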
+global	rsaz_1024_red2norm_avx2
+
+ALIGN	32
+rsaz_1024_red2norm_avx2:
+
+_CET_ENDBR
+	sub	rdx,-128
+	xor	rax,rax
+	mov	r8,QWORD[((-128))+rdx]
+	mov	r9,QWORD[((-120))+rdx]
+	mov	r10,QWORD[((-112))+rdx]
+	shl	r8,0
+	shl	r9,29
+	mov	r11,r10
+	shl	r10,58
+	shr	r11,6
+	add	rax,r8
+	add	rax,r9
+	add	rax,r10
+	adc	r11,0
+	mov	QWORD[rcx],rax
+	mov	rax,r11
+	mov	r8,QWORD[((-104))+rdx]
+	mov	r9,QWORD[((-96))+rdx]
+	shl	r8,23
+	mov	r10,r9
+	shl	r9,52
+	shr	r10,12
+	add	rax,r8
+	add	rax,r9
+	adc	r10,0
+	mov	QWORD[8+rcx],rax
+	mov	rax,r10
+	mov	r11,QWORD[((-88))+rdx]
+	mov	r8,QWORD[((-80))+rdx]
+	shl	r11,17
+	mov	r9,r8
+	shl	r8,46
+	shr	r9,18
+	add	rax,r11
+	add	rax,r8
+	adc	r9,0
+	mov	QWORD[16+rcx],rax
+	mov	rax,r9
+	mov	r10,QWORD[((-72))+rdx]
+	mov	r11,QWORD[((-64))+rdx]
+	shl	r10,11
+	mov	r8,r11
+	shl	r11,40
+	shr	r8,24
+	add	rax,r10
+	add	rax,r11
+	adc	r8,0
+	mov	QWORD[24+rcx],rax
+	mov	rax,r8
+	mov	r9,QWORD[((-56))+rdx]
+	mov	r10,QWORD[((-48))+rdx]
+	mov	r11,QWORD[((-40))+rdx]
+	shl	r9,5
+	shl	r10,34
+	mov	r8,r11
+	shl	r11,63
+	shr	r8,1
+	add	rax,r9
+	add	rax,r10
+	add	rax,r11
+	adc	r8,0
+	mov	QWORD[32+rcx],rax
+	mov	rax,r8
+	mov	r9,QWORD[((-32))+rdx]
+	mov	r10,QWORD[((-24))+rdx]
+	shl	r9,28
+	mov	r11,r10
+	shl	r10,57
+	shr	r11,7
+	add	rax,r9
+	add	rax,r10
+	adc	r11,0
+	mov	QWORD[40+rcx],rax
+	mov	rax,r11
+	mov	r8,QWORD[((-16))+rdx]
+	mov	r9,QWORD[((-8))+rdx]
+	shl	r8,22
+	mov	r10,r9
+	shl	r9,51
+	shr	r10,13
+	add	rax,r8
+	add	rax,r9
+	adc	r10,0
+	mov	QWORD[48+rcx],rax
+	mov	rax,r10
+	mov	r11,QWORD[rdx]
+	mov	r8,QWORD[8+rdx]
+	shl	r11,16
+	mov	r9,r8
+	shl	r8,45
+	shr	r9,19
+	add	rax,r11
+	add	rax,r8
+	adc	r9,0
+	mov	QWORD[56+rcx],rax
+	mov	rax,r9
+	mov	r10,QWORD[16+rdx]
+	mov	r11,QWORD[24+rdx]
+	shl	r10,10
+	mov	r8,r11
+	shl	r11,39
+	shr	r8,25
+	add	rax,r10
+	add	rax,r11
+	adc	r8,0
+	mov	QWORD[64+rcx],rax
+	mov	rax,r8
+	mov	r9,QWORD[32+rdx]
+	mov	r10,QWORD[40+rdx]
+	mov	r11,QWORD[48+rdx]
+	shl	r9,4
+	shl	r10,33
+	mov	r8,r11
+	shl	r11,62
+	shr	r8,2
+	add	rax,r9
+	add	rax,r10
+	add	rax,r11
+	adc	r8,0
+	mov	QWORD[72+rcx],rax
+	mov	rax,r8
+	mov	r9,QWORD[56+rdx]
+	mov	r10,QWORD[64+rdx]
+	shl	r9,27
+	mov	r11,r10
+	shl	r10,56
+	shr	r11,8
+	add	rax,r9
+	add	rax,r10
+	adc	r11,0
+	mov	QWORD[80+rcx],rax
+	mov	rax,r11
+	mov	r8,QWORD[72+rdx]
+	mov	r9,QWORD[80+rdx]
+	shl	r8,21
+	mov	r10,r9
+	shl	r9,50
+	shr	r10,14
+	add	rax,r8
+	add	rax,r9
+	adc	r10,0
+	mov	QWORD[88+rcx],rax
+	mov	rax,r10
+	mov	r11,QWORD[88+rdx]
+	mov	r8,QWORD[96+rdx]
+	shl	r11,15
+	mov	r9,r8
+	shl	r8,44
+	shr	r9,20
+	add	rax,r11
+	add	rax,r8
+	adc	r9,0
+	mov	QWORD[96+rcx],rax
+	mov	rax,r9
+	mov	r10,QWORD[104+rdx]
+	mov	r11,QWORD[112+rdx]
+	shl	r10,9
+	mov	r8,r11
+	shl	r11,38
+	shr	r8,26
+	add	rax,r10
+	add	rax,r11
+	adc	r8,0
+	mov	QWORD[104+rcx],rax
+	mov	rax,r8
+	mov	r9,QWORD[120+rdx]
+	mov	r10,QWORD[128+rdx]
+	mov	r11,QWORD[136+rdx]
+	shl	r9,3
+	shl	r10,32
+	mov	r8,r11
+	shl	r11,61
+	shr	r8,3
+	add	rax,r9
+	add	rax,r10
+	add	rax,r11
+	adc	r8,0
+	mov	QWORD[112+rcx],rax
+	mov	rax,r8
+	mov	r9,QWORD[144+rdx]
+	mov	r10,QWORD[152+rdx]
+	shl	r9,26
+	mov	r11,r10
+	shl	r10,55
+	shr	r11,9
+	add	rax,r9
+	add	rax,r10
+	adc	r11,0
+	mov	QWORD[120+rcx],rax
+	mov	rax,r11
+	ret
+
+
+
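+; rsaz_1024_norm2red_avx2 is the inverse of red2norm: it slices sixteen
+; 64-bit words (rdx) into 29-bit limbs, masking each with 0x1fffffff, and
+; stores one limb per 64-bit word at rcx, zero-padding the tail.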
+global	rsaz_1024_norm2red_avx2
+
+ALIGN	32
+rsaz_1024_norm2red_avx2:
+
+_CET_ENDBR
+	sub	rcx,-128
+	mov	r8,QWORD[rdx]
+	mov	eax,0x1fffffff
+	mov	r9,QWORD[8+rdx]
+	mov	r11,r8
+	shr	r11,0
+	and	r11,rax
+	mov	QWORD[((-128))+rcx],r11
+	mov	r10,r8
+	shr	r10,29
+	and	r10,rax
+	mov	QWORD[((-120))+rcx],r10
+	shrd	r8,r9,58
+	and	r8,rax
+	mov	QWORD[((-112))+rcx],r8
+	mov	r10,QWORD[16+rdx]
+	mov	r8,r9
+	shr	r8,23
+	and	r8,rax
+	mov	QWORD[((-104))+rcx],r8
+	shrd	r9,r10,52
+	and	r9,rax
+	mov	QWORD[((-96))+rcx],r9
+	mov	r11,QWORD[24+rdx]
+	mov	r9,r10
+	shr	r9,17
+	and	r9,rax
+	mov	QWORD[((-88))+rcx],r9
+	shrd	r10,r11,46
+	and	r10,rax
+	mov	QWORD[((-80))+rcx],r10
+	mov	r8,QWORD[32+rdx]
+	mov	r10,r11
+	shr	r10,11
+	and	r10,rax
+	mov	QWORD[((-72))+rcx],r10
+	shrd	r11,r8,40
+	and	r11,rax
+	mov	QWORD[((-64))+rcx],r11
+	mov	r9,QWORD[40+rdx]
+	mov	r11,r8
+	shr	r11,5
+	and	r11,rax
+	mov	QWORD[((-56))+rcx],r11
+	mov	r10,r8
+	shr	r10,34
+	and	r10,rax
+	mov	QWORD[((-48))+rcx],r10
+	shrd	r8,r9,63
+	and	r8,rax
+	mov	QWORD[((-40))+rcx],r8
+	mov	r10,QWORD[48+rdx]
+	mov	r8,r9
+	shr	r8,28
+	and	r8,rax
+	mov	QWORD[((-32))+rcx],r8
+	shrd	r9,r10,57
+	and	r9,rax
+	mov	QWORD[((-24))+rcx],r9
+	mov	r11,QWORD[56+rdx]
+	mov	r9,r10
+	shr	r9,22
+	and	r9,rax
+	mov	QWORD[((-16))+rcx],r9
+	shrd	r10,r11,51
+	and	r10,rax
+	mov	QWORD[((-8))+rcx],r10
+	mov	r8,QWORD[64+rdx]
+	mov	r10,r11
+	shr	r10,16
+	and	r10,rax
+	mov	QWORD[rcx],r10
+	shrd	r11,r8,45
+	and	r11,rax
+	mov	QWORD[8+rcx],r11
+	mov	r9,QWORD[72+rdx]
+	mov	r11,r8
+	shr	r11,10
+	and	r11,rax
+	mov	QWORD[16+rcx],r11
+	shrd	r8,r9,39
+	and	r8,rax
+	mov	QWORD[24+rcx],r8
+	mov	r10,QWORD[80+rdx]
+	mov	r8,r9
+	shr	r8,4
+	and	r8,rax
+	mov	QWORD[32+rcx],r8
+	mov	r11,r9
+	shr	r11,33
+	and	r11,rax
+	mov	QWORD[40+rcx],r11
+	shrd	r9,r10,62
+	and	r9,rax
+	mov	QWORD[48+rcx],r9
+	mov	r11,QWORD[88+rdx]
+	mov	r9,r10
+	shr	r9,27
+	and	r9,rax
+	mov	QWORD[56+rcx],r9
+	shrd	r10,r11,56
+	and	r10,rax
+	mov	QWORD[64+rcx],r10
+	mov	r8,QWORD[96+rdx]
+	mov	r10,r11
+	shr	r10,21
+	and	r10,rax
+	mov	QWORD[72+rcx],r10
+	shrd	r11,r8,50
+	and	r11,rax
+	mov	QWORD[80+rcx],r11
+	mov	r9,QWORD[104+rdx]
+	mov	r11,r8
+	shr	r11,15
+	and	r11,rax
+	mov	QWORD[88+rcx],r11
+	shrd	r8,r9,44
+	and	r8,rax
+	mov	QWORD[96+rcx],r8
+	mov	r10,QWORD[112+rdx]
+	mov	r8,r9
+	shr	r8,9
+	and	r8,rax
+	mov	QWORD[104+rcx],r8
+	shrd	r9,r10,38
+	and	r9,rax
+	mov	QWORD[112+rcx],r9
+	mov	r11,QWORD[120+rdx]
+	mov	r9,r10
+	shr	r9,3
+	and	r9,rax
+	mov	QWORD[120+rcx],r9
+	mov	r8,r10
+	shr	r8,32
+	and	r8,rax
+	mov	QWORD[128+rcx],r8
+	shrd	r10,r11,61
+	and	r10,rax
+	mov	QWORD[136+rcx],r10
+	xor	r8,r8
+	mov	r10,r11
+	shr	r10,26
+	and	r10,rax
+	mov	QWORD[144+rcx],r10
+	shrd	r11,r8,55
+	and	r11,rax
+	mov	QWORD[152+rcx],r11
+	mov	QWORD[160+rcx],r8
+	mov	QWORD[168+rcx],r8
+	mov	QWORD[176+rcx],r8
+	mov	QWORD[184+rcx],r8
+	ret
+
+
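+; rsaz_1024_scatter5_avx2 stores the value at rdx into entry r8d of the
+; table at rcx. The nine 16-byte pieces of an entry are written with a
+; 512-byte stride, interleaving the 32 table entries for the gather below.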
+global	rsaz_1024_scatter5_avx2
+
+ALIGN	32
+rsaz_1024_scatter5_avx2:
+
+_CET_ENDBR
+	vzeroupper
+	vmovdqu	ymm5,YMMWORD[$L$scatter_permd]
+	shl	r8d,4
+	lea	rcx,[r8*1+rcx]
+	mov	eax,9
+	jmp	NEAR $L$oop_scatter_1024
+
+ALIGN	32
+$L$oop_scatter_1024:
+	vmovdqu	ymm0,YMMWORD[rdx]
+	lea	rdx,[32+rdx]
+	vpermd	ymm0,ymm5,ymm0
+	vmovdqu	XMMWORD[rcx],xmm0
+	lea	rcx,[512+rcx]
+	dec	eax
+	jnz	NEAR $L$oop_scatter_1024
+
+	vzeroupper
+	ret
+
+
+
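+; rsaz_1024_gather5_avx2 loads entry r8d of the table at rdx into rcx in
+; constant time: it compares every entry number against the requested
+; index to build all-ones/all-zero dword masks, then reads and masks all
+; entries, so the memory access pattern does not depend on the index.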
+global	rsaz_1024_gather5_avx2
+
+ALIGN	32
+rsaz_1024_gather5_avx2:
+
+_CET_ENDBR
+	vzeroupper
+	mov	r11,rsp
+
+	lea	rax,[((-136))+rsp]
+$L$SEH_begin_rsaz_1024_gather5:
+
+	DB	0x48,0x8d,0x60,0xe0	; lea	rsp,[-0x20+rax]
+	DB	0xc5,0xf8,0x29,0x70,0xe0	; vmovaps	XMMWORD[-0x20+rax],xmm6
+	DB	0xc5,0xf8,0x29,0x78,0xf0	; vmovaps	XMMWORD[-0x10+rax],xmm7
+	DB	0xc5,0x78,0x29,0x40,0x00	; vmovaps	XMMWORD[rax],xmm8
+	DB	0xc5,0x78,0x29,0x48,0x10	; vmovaps	XMMWORD[0x10+rax],xmm9
+	DB	0xc5,0x78,0x29,0x50,0x20	; vmovaps	XMMWORD[0x20+rax],xmm10
+	DB	0xc5,0x78,0x29,0x58,0x30	; vmovaps	XMMWORD[0x30+rax],xmm11
+	DB	0xc5,0x78,0x29,0x60,0x40	; vmovaps	XMMWORD[0x40+rax],xmm12
+	DB	0xc5,0x78,0x29,0x68,0x50	; vmovaps	XMMWORD[0x50+rax],xmm13
+	DB	0xc5,0x78,0x29,0x70,0x60	; vmovaps	XMMWORD[0x60+rax],xmm14
+	DB	0xc5,0x78,0x29,0x78,0x70	; vmovaps	XMMWORD[0x70+rax],xmm15
+	lea	rsp,[((-256))+rsp]
+	and	rsp,-32
+	lea	r10,[$L$inc]
+	lea	rax,[((-128))+rsp]
+
+	vmovd	xmm4,r8d
+	vmovdqa	ymm0,YMMWORD[r10]
+	vmovdqa	ymm1,YMMWORD[32+r10]
+	vmovdqa	ymm5,YMMWORD[64+r10]
+	vpbroadcastd	ymm4,xmm4
+
+	vpaddd	ymm2,ymm0,ymm5
+	vpcmpeqd	ymm0,ymm0,ymm4
+	vpaddd	ymm3,ymm1,ymm5
+	vpcmpeqd	ymm1,ymm1,ymm4
+	vmovdqa	YMMWORD[(0+128)+rax],ymm0
+	vpaddd	ymm0,ymm2,ymm5
+	vpcmpeqd	ymm2,ymm2,ymm4
+	vmovdqa	YMMWORD[(32+128)+rax],ymm1
+	vpaddd	ymm1,ymm3,ymm5
+	vpcmpeqd	ymm3,ymm3,ymm4
+	vmovdqa	YMMWORD[(64+128)+rax],ymm2
+	vpaddd	ymm2,ymm0,ymm5
+	vpcmpeqd	ymm0,ymm0,ymm4
+	vmovdqa	YMMWORD[(96+128)+rax],ymm3
+	vpaddd	ymm3,ymm1,ymm5
+	vpcmpeqd	ymm1,ymm1,ymm4
+	vmovdqa	YMMWORD[(128+128)+rax],ymm0
+	vpaddd	ymm8,ymm2,ymm5
+	vpcmpeqd	ymm2,ymm2,ymm4
+	vmovdqa	YMMWORD[(160+128)+rax],ymm1
+	vpaddd	ymm9,ymm3,ymm5
+	vpcmpeqd	ymm3,ymm3,ymm4
+	vmovdqa	YMMWORD[(192+128)+rax],ymm2
+	vpaddd	ymm10,ymm8,ymm5
+	vpcmpeqd	ymm8,ymm8,ymm4
+	vmovdqa	YMMWORD[(224+128)+rax],ymm3
+	vpaddd	ymm11,ymm9,ymm5
+	vpcmpeqd	ymm9,ymm9,ymm4
+	vpaddd	ymm12,ymm10,ymm5
+	vpcmpeqd	ymm10,ymm10,ymm4
+	vpaddd	ymm13,ymm11,ymm5
+	vpcmpeqd	ymm11,ymm11,ymm4
+	vpaddd	ymm14,ymm12,ymm5
+	vpcmpeqd	ymm12,ymm12,ymm4
+	vpaddd	ymm15,ymm13,ymm5
+	vpcmpeqd	ymm13,ymm13,ymm4
+	vpcmpeqd	ymm14,ymm14,ymm4
+	vpcmpeqd	ymm15,ymm15,ymm4
+
+	vmovdqa	ymm7,YMMWORD[((-32))+r10]
+	lea	rdx,[128+rdx]
+	mov	r8d,9
+
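+; Each iteration reads one full 512-byte row of the table, ANDs the
+; sixteen 32-byte pieces with the masks built above, and ORs them all
+; together so that only the selected entry survives.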
+$L$oop_gather_1024:
+	vmovdqa	ymm0,YMMWORD[((0-128))+rdx]
+	vmovdqa	ymm1,YMMWORD[((32-128))+rdx]
+	vmovdqa	ymm2,YMMWORD[((64-128))+rdx]
+	vmovdqa	ymm3,YMMWORD[((96-128))+rdx]
+	vpand	ymm0,ymm0,YMMWORD[((0+128))+rax]
+	vpand	ymm1,ymm1,YMMWORD[((32+128))+rax]
+	vpand	ymm2,ymm2,YMMWORD[((64+128))+rax]
+	vpor	ymm4,ymm1,ymm0
+	vpand	ymm3,ymm3,YMMWORD[((96+128))+rax]
+	vmovdqa	ymm0,YMMWORD[((128-128))+rdx]
+	vmovdqa	ymm1,YMMWORD[((160-128))+rdx]
+	vpor	ymm5,ymm3,ymm2
+	vmovdqa	ymm2,YMMWORD[((192-128))+rdx]
+	vmovdqa	ymm3,YMMWORD[((224-128))+rdx]
+	vpand	ymm0,ymm0,YMMWORD[((128+128))+rax]
+	vpand	ymm1,ymm1,YMMWORD[((160+128))+rax]
+	vpand	ymm2,ymm2,YMMWORD[((192+128))+rax]
+	vpor	ymm4,ymm4,ymm0
+	vpand	ymm3,ymm3,YMMWORD[((224+128))+rax]
+	vpand	ymm0,ymm8,YMMWORD[((256-128))+rdx]
+	vpor	ymm5,ymm5,ymm1
+	vpand	ymm1,ymm9,YMMWORD[((288-128))+rdx]
+	vpor	ymm4,ymm4,ymm2
+	vpand	ymm2,ymm10,YMMWORD[((320-128))+rdx]
+	vpor	ymm5,ymm5,ymm3
+	vpand	ymm3,ymm11,YMMWORD[((352-128))+rdx]
+	vpor	ymm4,ymm4,ymm0
+	vpand	ymm0,ymm12,YMMWORD[((384-128))+rdx]
+	vpor	ymm5,ymm5,ymm1
+	vpand	ymm1,ymm13,YMMWORD[((416-128))+rdx]
+	vpor	ymm4,ymm4,ymm2
+	vpand	ymm2,ymm14,YMMWORD[((448-128))+rdx]
+	vpor	ymm5,ymm5,ymm3
+	vpand	ymm3,ymm15,YMMWORD[((480-128))+rdx]
+	lea	rdx,[512+rdx]
+	vpor	ymm4,ymm4,ymm0
+	vpor	ymm5,ymm5,ymm1
+	vpor	ymm4,ymm4,ymm2
+	vpor	ymm5,ymm5,ymm3
+
+	vpor	ymm4,ymm4,ymm5
+	vextracti128	xmm5,ymm4,1
+	vpor	xmm5,xmm5,xmm4
+	vpermd	ymm5,ymm7,ymm5
+	vmovdqu	YMMWORD[rcx],ymm5
+	lea	rcx,[32+rcx]
+	dec	r8d
+	jnz	NEAR $L$oop_gather_1024
+
+	vpxor	ymm0,ymm0,ymm0
+	vmovdqu	YMMWORD[rcx],ymm0
+	vzeroupper
+	movaps	xmm6,XMMWORD[((-168))+r11]
+	movaps	xmm7,XMMWORD[((-152))+r11]
+	movaps	xmm8,XMMWORD[((-136))+r11]
+	movaps	xmm9,XMMWORD[((-120))+r11]
+	movaps	xmm10,XMMWORD[((-104))+r11]
+	movaps	xmm11,XMMWORD[((-88))+r11]
+	movaps	xmm12,XMMWORD[((-72))+r11]
+	movaps	xmm13,XMMWORD[((-56))+r11]
+	movaps	xmm14,XMMWORD[((-40))+r11]
+	movaps	xmm15,XMMWORD[((-24))+r11]
+	lea	rsp,[r11]
+
+	ret
+
+$L$SEH_end_rsaz_1024_gather5:
+
+section	.rdata rdata align=8
+ALIGN	64
+$L$and_mask:
+	DQ	0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff	; 2^29-1, the 29-bit limb mask
+$L$scatter_permd:
+	DD	0,2,4,6,7,7,7,7
+$L$gather_permd:
+	DD	0,7,1,7,2,7,3,7
+$L$inc:
+	DD	0,0,0,0,1,1,1,1
+	DD	2,2,2,2,3,3,3,3
+	DD	4,4,4,4,4,4,4,4
+ALIGN	64
+section	.text
+
+EXTERN	__imp_RtlVirtualUnwind
+
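+; rsaz_se_handler is the Windows SEH handler for the routines above,
+; wired up through the .pdata/.xdata tables below. When unwinding from
+; inside a function body, it restores the saved non-volatile GPRs and
+; xmm6-xmm15 into the CONTEXT record before deferring to
+; RtlVirtualUnwind.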
+ALIGN	16
+rsaz_se_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	mov	rbp,QWORD[160+r8]
+
+	mov	r10d,DWORD[8+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	cmovc	rax,rbp
+
+	mov	r15,QWORD[((-48))+rax]
+	mov	r14,QWORD[((-40))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	rbp,QWORD[((-16))+rax]
+	mov	rbx,QWORD[((-8))+rax]
+	mov	QWORD[240+r8],r15
+	mov	QWORD[232+r8],r14
+	mov	QWORD[224+r8],r13
+	mov	QWORD[216+r8],r12
+	mov	QWORD[160+r8],rbp
+	mov	QWORD[144+r8],rbx
+
+	lea	rsi,[((-216))+rax]	; saved xmm6-xmm15 on the stack
+	lea	rdi,[512+r8]	; &context->Xmm6
+	mov	ecx,20	; 20 qwords = ten 16-byte xmm registers
+	DD	0xa548f3fc	; cld; rep movsq
+
+$L$common_seh_tail:
+	mov	rdi,QWORD[8+rax]
+	mov	rsi,QWORD[16+rax]
+	mov	QWORD[152+r8],rax
+	mov	QWORD[168+r8],rsi
+	mov	QWORD[176+r8],rdi
+
+	mov	rdi,QWORD[40+r9]
+	mov	rsi,r8
+	mov	ecx,154	; sizeof(CONTEXT)/8
+	DD	0xa548f3fc	; cld; rep movsq
+
+	mov	rsi,r9
+	xor	rcx,rcx
+	mov	rdx,QWORD[8+rsi]
+	mov	r8,QWORD[rsi]
+	mov	r9,QWORD[16+rsi]
+	mov	r10,QWORD[40+rsi]
+	lea	r11,[56+rsi]
+	lea	r12,[24+rsi]
+	mov	QWORD[32+rsp],r10
+	mov	QWORD[40+rsp],r11
+	mov	QWORD[48+rsp],r12
+	mov	QWORD[56+rsp],rcx
+	call	QWORD[__imp_RtlVirtualUnwind]
+
+	mov	eax,1
+	add	rsp,64
+	popfq
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rbp
+	pop	rbx
+	pop	rdi
+	pop	rsi
+	ret
+
+
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_rsaz_1024_sqr_avx2 wrt ..imagebase
+	DD	$L$SEH_end_rsaz_1024_sqr_avx2 wrt ..imagebase
+	DD	$L$SEH_info_rsaz_1024_sqr_avx2 wrt ..imagebase
+
+	DD	$L$SEH_begin_rsaz_1024_mul_avx2 wrt ..imagebase
+	DD	$L$SEH_end_rsaz_1024_mul_avx2 wrt ..imagebase
+	DD	$L$SEH_info_rsaz_1024_mul_avx2 wrt ..imagebase
+
+	DD	$L$SEH_begin_rsaz_1024_gather5 wrt ..imagebase
+	DD	$L$SEH_end_rsaz_1024_gather5 wrt ..imagebase
+	DD	$L$SEH_info_rsaz_1024_gather5 wrt ..imagebase
+section	.xdata rdata align=8
+ALIGN	8
+$L$SEH_info_rsaz_1024_sqr_avx2:
+	DB	9,0,0,0
+	DD	rsaz_se_handler wrt ..imagebase
+	DD	$L$sqr_1024_body wrt ..imagebase,$L$sqr_1024_epilogue wrt ..imagebase,$L$sqr_1024_in_tail wrt ..imagebase
+	DD	0
+$L$SEH_info_rsaz_1024_mul_avx2:
+	DB	9,0,0,0
+	DD	rsaz_se_handler wrt ..imagebase
+	DD	$L$mul_1024_body wrt ..imagebase,$L$mul_1024_epilogue wrt ..imagebase,$L$mul_1024_in_tail wrt ..imagebase
+	DD	0
+$L$SEH_info_rsaz_1024_gather5:
+	DB	0x01,0x36,0x17,0x0b
+	DB	0x36,0xf8,0x09,0x00
+	DB	0x31,0xe8,0x08,0x00
+	DB	0x2c,0xd8,0x07,0x00
+	DB	0x27,0xc8,0x06,0x00
+	DB	0x22,0xb8,0x05,0x00
+	DB	0x1d,0xa8,0x04,0x00
+	DB	0x18,0x98,0x03,0x00
+	DB	0x13,0x88,0x02,0x00
+	DB	0x0e,0x78,0x01,0x00
+	DB	0x09,0x68,0x00,0x00
+	DB	0x04,0x01,0x15,0x00
+	DB	0x00,0xb3,0x00,0x00
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/sha1-586-apple.S b/gen/bcm/sha1-586-apple.S
new file mode 100644
index 0000000..f0ab02b
--- /dev/null
+++ b/gen/bcm/sha1-586-apple.S
@@ -0,0 +1,3782 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
+.text
+.globl	_sha1_block_data_order_nohw
+.private_extern	_sha1_block_data_order_nohw
+.align	4
+_sha1_block_data_order_nohw:
+L_sha1_block_data_order_nohw_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%ebp
+	movl	24(%esp),%esi
+	movl	28(%esp),%eax
+	subl	$76,%esp
+	shll	$6,%eax
+	addl	%esi,%eax
+	movl	%eax,104(%esp)
+	movl	16(%ebp),%edi
+	jmp	L000loop
+.align	4,0x90
+L000loop:
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	8(%esi),%ecx
+	movl	12(%esi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	movl	%eax,(%esp)
+	movl	%ebx,4(%esp)
+	movl	%ecx,8(%esp)
+	movl	%edx,12(%esp)
+	movl	16(%esi),%eax
+	movl	20(%esi),%ebx
+	movl	24(%esi),%ecx
+	movl	28(%esi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	movl	%eax,16(%esp)
+	movl	%ebx,20(%esp)
+	movl	%ecx,24(%esp)
+	movl	%edx,28(%esp)
+	movl	32(%esi),%eax
+	movl	36(%esi),%ebx
+	movl	40(%esi),%ecx
+	movl	44(%esi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	movl	%eax,32(%esp)
+	movl	%ebx,36(%esp)
+	movl	%ecx,40(%esp)
+	movl	%edx,44(%esp)
+	movl	48(%esi),%eax
+	movl	52(%esi),%ebx
+	movl	56(%esi),%ecx
+	movl	60(%esi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	movl	%eax,48(%esp)
+	movl	%ebx,52(%esp)
+	movl	%ecx,56(%esp)
+	movl	%edx,60(%esp)
+	movl	%esi,100(%esp)
+	movl	(%ebp),%eax
+	movl	4(%ebp),%ebx
+	movl	8(%ebp),%ecx
+	movl	12(%ebp),%edx
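+	# 80 fully unrolled rounds follow. Rounds 0-19 use the Ch function
+	# with K=0x5a827999 (1518500249), 20-39 parity with 0x6ed9eba1
+	# (1859775393), 40-59 Maj with 0x8f1bbcdc (2400959708), and 60-79
+	# parity with 0xca62c1d6 (3395469782). The block copied to (%esp)
+	# doubles as the message schedule W[t], updated in place.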
+	# 00_15 0 
+	movl	%ecx,%esi
+	movl	%eax,%ebp
+	roll	$5,%ebp
+	xorl	%edx,%esi
+	addl	%edi,%ebp
+	movl	(%esp),%edi
+	andl	%ebx,%esi
+	rorl	$2,%ebx
+	xorl	%edx,%esi
+	leal	1518500249(%ebp,%edi,1),%ebp
+	addl	%esi,%ebp
+	# 00_15 1 
+	movl	%ebx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	xorl	%ecx,%edi
+	addl	%edx,%ebp
+	movl	4(%esp),%edx
+	andl	%eax,%edi
+	rorl	$2,%eax
+	xorl	%ecx,%edi
+	leal	1518500249(%ebp,%edx,1),%ebp
+	addl	%edi,%ebp
+	# 00_15 2 
+	movl	%eax,%edx
+	movl	%ebp,%edi
+	roll	$5,%ebp
+	xorl	%ebx,%edx
+	addl	%ecx,%ebp
+	movl	8(%esp),%ecx
+	andl	%esi,%edx
+	rorl	$2,%esi
+	xorl	%ebx,%edx
+	leal	1518500249(%ebp,%ecx,1),%ebp
+	addl	%edx,%ebp
+	# 00_15 3 
+	movl	%esi,%ecx
+	movl	%ebp,%edx
+	roll	$5,%ebp
+	xorl	%eax,%ecx
+	addl	%ebx,%ebp
+	movl	12(%esp),%ebx
+	andl	%edi,%ecx
+	rorl	$2,%edi
+	xorl	%eax,%ecx
+	leal	1518500249(%ebp,%ebx,1),%ebp
+	addl	%ecx,%ebp
+	# 00_15 4 
+	movl	%edi,%ebx
+	movl	%ebp,%ecx
+	roll	$5,%ebp
+	xorl	%esi,%ebx
+	addl	%eax,%ebp
+	movl	16(%esp),%eax
+	andl	%edx,%ebx
+	rorl	$2,%edx
+	xorl	%esi,%ebx
+	leal	1518500249(%ebp,%eax,1),%ebp
+	addl	%ebx,%ebp
+	# 00_15 5 
+	movl	%edx,%eax
+	movl	%ebp,%ebx
+	roll	$5,%ebp
+	xorl	%edi,%eax
+	addl	%esi,%ebp
+	movl	20(%esp),%esi
+	andl	%ecx,%eax
+	rorl	$2,%ecx
+	xorl	%edi,%eax
+	leal	1518500249(%ebp,%esi,1),%ebp
+	addl	%eax,%ebp
+	# 00_15 6 
+	movl	%ecx,%esi
+	movl	%ebp,%eax
+	roll	$5,%ebp
+	xorl	%edx,%esi
+	addl	%edi,%ebp
+	movl	24(%esp),%edi
+	andl	%ebx,%esi
+	rorl	$2,%ebx
+	xorl	%edx,%esi
+	leal	1518500249(%ebp,%edi,1),%ebp
+	addl	%esi,%ebp
+	# 00_15 7 
+	movl	%ebx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	xorl	%ecx,%edi
+	addl	%edx,%ebp
+	movl	28(%esp),%edx
+	andl	%eax,%edi
+	rorl	$2,%eax
+	xorl	%ecx,%edi
+	leal	1518500249(%ebp,%edx,1),%ebp
+	addl	%edi,%ebp
+	# 00_15 8 
+	movl	%eax,%edx
+	movl	%ebp,%edi
+	roll	$5,%ebp
+	xorl	%ebx,%edx
+	addl	%ecx,%ebp
+	movl	32(%esp),%ecx
+	andl	%esi,%edx
+	rorl	$2,%esi
+	xorl	%ebx,%edx
+	leal	1518500249(%ebp,%ecx,1),%ebp
+	addl	%edx,%ebp
+	# 00_15 9 
+	movl	%esi,%ecx
+	movl	%ebp,%edx
+	roll	$5,%ebp
+	xorl	%eax,%ecx
+	addl	%ebx,%ebp
+	movl	36(%esp),%ebx
+	andl	%edi,%ecx
+	rorl	$2,%edi
+	xorl	%eax,%ecx
+	leal	1518500249(%ebp,%ebx,1),%ebp
+	addl	%ecx,%ebp
+	# 00_15 10 
+	movl	%edi,%ebx
+	movl	%ebp,%ecx
+	roll	$5,%ebp
+	xorl	%esi,%ebx
+	addl	%eax,%ebp
+	movl	40(%esp),%eax
+	andl	%edx,%ebx
+	rorl	$2,%edx
+	xorl	%esi,%ebx
+	leal	1518500249(%ebp,%eax,1),%ebp
+	addl	%ebx,%ebp
+	# 00_15 11 
+	movl	%edx,%eax
+	movl	%ebp,%ebx
+	roll	$5,%ebp
+	xorl	%edi,%eax
+	addl	%esi,%ebp
+	movl	44(%esp),%esi
+	andl	%ecx,%eax
+	rorl	$2,%ecx
+	xorl	%edi,%eax
+	leal	1518500249(%ebp,%esi,1),%ebp
+	addl	%eax,%ebp
+	# 00_15 12 
+	movl	%ecx,%esi
+	movl	%ebp,%eax
+	roll	$5,%ebp
+	xorl	%edx,%esi
+	addl	%edi,%ebp
+	movl	48(%esp),%edi
+	andl	%ebx,%esi
+	rorl	$2,%ebx
+	xorl	%edx,%esi
+	leal	1518500249(%ebp,%edi,1),%ebp
+	addl	%esi,%ebp
+	# 00_15 13 
+	movl	%ebx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	xorl	%ecx,%edi
+	addl	%edx,%ebp
+	movl	52(%esp),%edx
+	andl	%eax,%edi
+	rorl	$2,%eax
+	xorl	%ecx,%edi
+	leal	1518500249(%ebp,%edx,1),%ebp
+	addl	%edi,%ebp
+	# 00_15 14 
+	movl	%eax,%edx
+	movl	%ebp,%edi
+	roll	$5,%ebp
+	xorl	%ebx,%edx
+	addl	%ecx,%ebp
+	movl	56(%esp),%ecx
+	andl	%esi,%edx
+	rorl	$2,%esi
+	xorl	%ebx,%edx
+	leal	1518500249(%ebp,%ecx,1),%ebp
+	addl	%edx,%ebp
+	# 00_15 15 
+	movl	%esi,%ecx
+	movl	%ebp,%edx
+	roll	$5,%ebp
+	xorl	%eax,%ecx
+	addl	%ebx,%ebp
+	movl	60(%esp),%ebx
+	andl	%edi,%ecx
+	rorl	$2,%edi
+	xorl	%eax,%ecx
+	leal	1518500249(%ebp,%ebx,1),%ebp
+	movl	(%esp),%ebx
+	addl	%ebp,%ecx
+	# 16_19 16 
+	movl	%edi,%ebp
+	xorl	8(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	32(%esp),%ebx
+	andl	%edx,%ebp
+	xorl	52(%esp),%ebx
+	roll	$1,%ebx
+	xorl	%esi,%ebp
+	addl	%ebp,%eax
+	movl	%ecx,%ebp
+	rorl	$2,%edx
+	movl	%ebx,(%esp)
+	roll	$5,%ebp
+	leal	1518500249(%ebx,%eax,1),%ebx
+	movl	4(%esp),%eax
+	addl	%ebp,%ebx
+	# 16_19 17 
+	movl	%edx,%ebp
+	xorl	12(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	36(%esp),%eax
+	andl	%ecx,%ebp
+	xorl	56(%esp),%eax
+	roll	$1,%eax
+	xorl	%edi,%ebp
+	addl	%ebp,%esi
+	movl	%ebx,%ebp
+	rorl	$2,%ecx
+	movl	%eax,4(%esp)
+	roll	$5,%ebp
+	leal	1518500249(%eax,%esi,1),%eax
+	movl	8(%esp),%esi
+	addl	%ebp,%eax
+	# 16_19 18 
+	movl	%ecx,%ebp
+	xorl	16(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	40(%esp),%esi
+	andl	%ebx,%ebp
+	xorl	60(%esp),%esi
+	roll	$1,%esi
+	xorl	%edx,%ebp
+	addl	%ebp,%edi
+	movl	%eax,%ebp
+	rorl	$2,%ebx
+	movl	%esi,8(%esp)
+	roll	$5,%ebp
+	leal	1518500249(%esi,%edi,1),%esi
+	movl	12(%esp),%edi
+	addl	%ebp,%esi
+	# 16_19 19 
+	movl	%ebx,%ebp
+	xorl	20(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	44(%esp),%edi
+	andl	%eax,%ebp
+	xorl	(%esp),%edi
+	roll	$1,%edi
+	xorl	%ecx,%ebp
+	addl	%ebp,%edx
+	movl	%esi,%ebp
+	rorl	$2,%eax
+	movl	%edi,12(%esp)
+	roll	$5,%ebp
+	leal	1518500249(%edi,%edx,1),%edi
+	movl	16(%esp),%edx
+	addl	%ebp,%edi
+	# 20_39 20 
+	movl	%esi,%ebp
+	xorl	24(%esp),%edx
+	xorl	%eax,%ebp
+	xorl	48(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	4(%esp),%edx
+	roll	$1,%edx
+	addl	%ebp,%ecx
+	rorl	$2,%esi
+	movl	%edi,%ebp
+	roll	$5,%ebp
+	movl	%edx,16(%esp)
+	leal	1859775393(%edx,%ecx,1),%edx
+	movl	20(%esp),%ecx
+	addl	%ebp,%edx
+	# 20_39 21 
+	movl	%edi,%ebp
+	xorl	28(%esp),%ecx
+	xorl	%esi,%ebp
+	xorl	52(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	8(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebp,%ebx
+	rorl	$2,%edi
+	movl	%edx,%ebp
+	roll	$5,%ebp
+	movl	%ecx,20(%esp)
+	leal	1859775393(%ecx,%ebx,1),%ecx
+	movl	24(%esp),%ebx
+	addl	%ebp,%ecx
+	# 20_39 22 
+	movl	%edx,%ebp
+	xorl	32(%esp),%ebx
+	xorl	%edi,%ebp
+	xorl	56(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	12(%esp),%ebx
+	roll	$1,%ebx
+	addl	%ebp,%eax
+	rorl	$2,%edx
+	movl	%ecx,%ebp
+	roll	$5,%ebp
+	movl	%ebx,24(%esp)
+	leal	1859775393(%ebx,%eax,1),%ebx
+	movl	28(%esp),%eax
+	addl	%ebp,%ebx
+	# 20_39 23 
+	movl	%ecx,%ebp
+	xorl	36(%esp),%eax
+	xorl	%edx,%ebp
+	xorl	60(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	16(%esp),%eax
+	roll	$1,%eax
+	addl	%ebp,%esi
+	rorl	$2,%ecx
+	movl	%ebx,%ebp
+	roll	$5,%ebp
+	movl	%eax,28(%esp)
+	leal	1859775393(%eax,%esi,1),%eax
+	movl	32(%esp),%esi
+	addl	%ebp,%eax
+	# 20_39 24 
+	movl	%ebx,%ebp
+	xorl	40(%esp),%esi
+	xorl	%ecx,%ebp
+	xorl	(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	20(%esp),%esi
+	roll	$1,%esi
+	addl	%ebp,%edi
+	rorl	$2,%ebx
+	movl	%eax,%ebp
+	roll	$5,%ebp
+	movl	%esi,32(%esp)
+	leal	1859775393(%esi,%edi,1),%esi
+	movl	36(%esp),%edi
+	addl	%ebp,%esi
+	# 20_39 25 
+	movl	%eax,%ebp
+	xorl	44(%esp),%edi
+	xorl	%ebx,%ebp
+	xorl	4(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	24(%esp),%edi
+	roll	$1,%edi
+	addl	%ebp,%edx
+	rorl	$2,%eax
+	movl	%esi,%ebp
+	roll	$5,%ebp
+	movl	%edi,36(%esp)
+	leal	1859775393(%edi,%edx,1),%edi
+	movl	40(%esp),%edx
+	addl	%ebp,%edi
+	# 20_39 26 
+	movl	%esi,%ebp
+	xorl	48(%esp),%edx
+	xorl	%eax,%ebp
+	xorl	8(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	28(%esp),%edx
+	roll	$1,%edx
+	addl	%ebp,%ecx
+	rorl	$2,%esi
+	movl	%edi,%ebp
+	roll	$5,%ebp
+	movl	%edx,40(%esp)
+	leal	1859775393(%edx,%ecx,1),%edx
+	movl	44(%esp),%ecx
+	addl	%ebp,%edx
+	# 20_39 27 
+	movl	%edi,%ebp
+	xorl	52(%esp),%ecx
+	xorl	%esi,%ebp
+	xorl	12(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	32(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebp,%ebx
+	rorl	$2,%edi
+	movl	%edx,%ebp
+	roll	$5,%ebp
+	movl	%ecx,44(%esp)
+	leal	1859775393(%ecx,%ebx,1),%ecx
+	movl	48(%esp),%ebx
+	addl	%ebp,%ecx
+	# 20_39 28 
+	movl	%edx,%ebp
+	xorl	56(%esp),%ebx
+	xorl	%edi,%ebp
+	xorl	16(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	36(%esp),%ebx
+	roll	$1,%ebx
+	addl	%ebp,%eax
+	rorl	$2,%edx
+	movl	%ecx,%ebp
+	roll	$5,%ebp
+	movl	%ebx,48(%esp)
+	leal	1859775393(%ebx,%eax,1),%ebx
+	movl	52(%esp),%eax
+	addl	%ebp,%ebx
+	# 20_39 29 
+	movl	%ecx,%ebp
+	xorl	60(%esp),%eax
+	xorl	%edx,%ebp
+	xorl	20(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	40(%esp),%eax
+	roll	$1,%eax
+	addl	%ebp,%esi
+	rorl	$2,%ecx
+	movl	%ebx,%ebp
+	roll	$5,%ebp
+	movl	%eax,52(%esp)
+	leal	1859775393(%eax,%esi,1),%eax
+	movl	56(%esp),%esi
+	addl	%ebp,%eax
+	# 20_39 30 
+	movl	%ebx,%ebp
+	xorl	(%esp),%esi
+	xorl	%ecx,%ebp
+	xorl	24(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	44(%esp),%esi
+	roll	$1,%esi
+	addl	%ebp,%edi
+	rorl	$2,%ebx
+	movl	%eax,%ebp
+	roll	$5,%ebp
+	movl	%esi,56(%esp)
+	leal	1859775393(%esi,%edi,1),%esi
+	movl	60(%esp),%edi
+	addl	%ebp,%esi
+	# 20_39 31 
+	movl	%eax,%ebp
+	xorl	4(%esp),%edi
+	xorl	%ebx,%ebp
+	xorl	28(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	48(%esp),%edi
+	roll	$1,%edi
+	addl	%ebp,%edx
+	rorl	$2,%eax
+	movl	%esi,%ebp
+	roll	$5,%ebp
+	movl	%edi,60(%esp)
+	leal	1859775393(%edi,%edx,1),%edi
+	movl	(%esp),%edx
+	addl	%ebp,%edi
+	# 20_39 32 
+	movl	%esi,%ebp
+	xorl	8(%esp),%edx
+	xorl	%eax,%ebp
+	xorl	32(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	52(%esp),%edx
+	roll	$1,%edx
+	addl	%ebp,%ecx
+	rorl	$2,%esi
+	movl	%edi,%ebp
+	roll	$5,%ebp
+	movl	%edx,(%esp)
+	leal	1859775393(%edx,%ecx,1),%edx
+	movl	4(%esp),%ecx
+	addl	%ebp,%edx
+	# 20_39 33 
+	movl	%edi,%ebp
+	xorl	12(%esp),%ecx
+	xorl	%esi,%ebp
+	xorl	36(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	56(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebp,%ebx
+	rorl	$2,%edi
+	movl	%edx,%ebp
+	roll	$5,%ebp
+	movl	%ecx,4(%esp)
+	leal	1859775393(%ecx,%ebx,1),%ecx
+	movl	8(%esp),%ebx
+	addl	%ebp,%ecx
+	# 20_39 34 
+	movl	%edx,%ebp
+	xorl	16(%esp),%ebx
+	xorl	%edi,%ebp
+	xorl	40(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	60(%esp),%ebx
+	roll	$1,%ebx
+	addl	%ebp,%eax
+	rorl	$2,%edx
+	movl	%ecx,%ebp
+	roll	$5,%ebp
+	movl	%ebx,8(%esp)
+	leal	1859775393(%ebx,%eax,1),%ebx
+	movl	12(%esp),%eax
+	addl	%ebp,%ebx
+	# 20_39 35 
+	movl	%ecx,%ebp
+	xorl	20(%esp),%eax
+	xorl	%edx,%ebp
+	xorl	44(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	(%esp),%eax
+	roll	$1,%eax
+	addl	%ebp,%esi
+	rorl	$2,%ecx
+	movl	%ebx,%ebp
+	roll	$5,%ebp
+	movl	%eax,12(%esp)
+	leal	1859775393(%eax,%esi,1),%eax
+	movl	16(%esp),%esi
+	addl	%ebp,%eax
+	# 20_39 36 
+	movl	%ebx,%ebp
+	xorl	24(%esp),%esi
+	xorl	%ecx,%ebp
+	xorl	48(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	4(%esp),%esi
+	roll	$1,%esi
+	addl	%ebp,%edi
+	rorl	$2,%ebx
+	movl	%eax,%ebp
+	roll	$5,%ebp
+	movl	%esi,16(%esp)
+	leal	1859775393(%esi,%edi,1),%esi
+	movl	20(%esp),%edi
+	addl	%ebp,%esi
+	# 20_39 37 
+	movl	%eax,%ebp
+	xorl	28(%esp),%edi
+	xorl	%ebx,%ebp
+	xorl	52(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	8(%esp),%edi
+	roll	$1,%edi
+	addl	%ebp,%edx
+	rorl	$2,%eax
+	movl	%esi,%ebp
+	roll	$5,%ebp
+	movl	%edi,20(%esp)
+	leal	1859775393(%edi,%edx,1),%edi
+	movl	24(%esp),%edx
+	addl	%ebp,%edi
+	# 20_39 38 
+	movl	%esi,%ebp
+	xorl	32(%esp),%edx
+	xorl	%eax,%ebp
+	xorl	56(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	12(%esp),%edx
+	roll	$1,%edx
+	addl	%ebp,%ecx
+	rorl	$2,%esi
+	movl	%edi,%ebp
+	roll	$5,%ebp
+	movl	%edx,24(%esp)
+	leal	1859775393(%edx,%ecx,1),%edx
+	movl	28(%esp),%ecx
+	addl	%ebp,%edx
+	# 20_39 39 
+	movl	%edi,%ebp
+	xorl	36(%esp),%ecx
+	xorl	%esi,%ebp
+	xorl	60(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	16(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebp,%ebx
+	rorl	$2,%edi
+	movl	%edx,%ebp
+	roll	$5,%ebp
+	movl	%ecx,28(%esp)
+	leal	1859775393(%ecx,%ebx,1),%ecx
+	movl	32(%esp),%ebx
+	addl	%ebp,%ecx
+	# 40_59 40 
+	movl	%edi,%ebp
+	xorl	40(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	(%esp),%ebx
+	andl	%edx,%ebp
+	xorl	20(%esp),%ebx
+	roll	$1,%ebx
+	addl	%eax,%ebp
+	rorl	$2,%edx
+	movl	%ecx,%eax
+	roll	$5,%eax
+	movl	%ebx,32(%esp)
+	leal	2400959708(%ebx,%ebp,1),%ebx
+	movl	%edi,%ebp
+	addl	%eax,%ebx
+	andl	%esi,%ebp
+	movl	36(%esp),%eax
+	addl	%ebp,%ebx
+	# 40_59 41 
+	movl	%edx,%ebp
+	xorl	44(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	4(%esp),%eax
+	andl	%ecx,%ebp
+	xorl	24(%esp),%eax
+	roll	$1,%eax
+	addl	%esi,%ebp
+	rorl	$2,%ecx
+	movl	%ebx,%esi
+	roll	$5,%esi
+	movl	%eax,36(%esp)
+	leal	2400959708(%eax,%ebp,1),%eax
+	movl	%edx,%ebp
+	addl	%esi,%eax
+	andl	%edi,%ebp
+	movl	40(%esp),%esi
+	addl	%ebp,%eax
+	# 40_59 42 
+	movl	%ecx,%ebp
+	xorl	48(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	8(%esp),%esi
+	andl	%ebx,%ebp
+	xorl	28(%esp),%esi
+	roll	$1,%esi
+	addl	%edi,%ebp
+	rorl	$2,%ebx
+	movl	%eax,%edi
+	roll	$5,%edi
+	movl	%esi,40(%esp)
+	leal	2400959708(%esi,%ebp,1),%esi
+	movl	%ecx,%ebp
+	addl	%edi,%esi
+	andl	%edx,%ebp
+	movl	44(%esp),%edi
+	addl	%ebp,%esi
+	# 40_59 43 
+	movl	%ebx,%ebp
+	xorl	52(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	12(%esp),%edi
+	andl	%eax,%ebp
+	xorl	32(%esp),%edi
+	roll	$1,%edi
+	addl	%edx,%ebp
+	rorl	$2,%eax
+	movl	%esi,%edx
+	roll	$5,%edx
+	movl	%edi,44(%esp)
+	leal	2400959708(%edi,%ebp,1),%edi
+	movl	%ebx,%ebp
+	addl	%edx,%edi
+	andl	%ecx,%ebp
+	movl	48(%esp),%edx
+	addl	%ebp,%edi
+	# 40_59 44 
+	movl	%eax,%ebp
+	xorl	56(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	16(%esp),%edx
+	andl	%esi,%ebp
+	xorl	36(%esp),%edx
+	roll	$1,%edx
+	addl	%ecx,%ebp
+	rorl	$2,%esi
+	movl	%edi,%ecx
+	roll	$5,%ecx
+	movl	%edx,48(%esp)
+	leal	2400959708(%edx,%ebp,1),%edx
+	movl	%eax,%ebp
+	addl	%ecx,%edx
+	andl	%ebx,%ebp
+	movl	52(%esp),%ecx
+	addl	%ebp,%edx
+	# 40_59 45 
+	movl	%esi,%ebp
+	xorl	60(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	20(%esp),%ecx
+	andl	%edi,%ebp
+	xorl	40(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebx,%ebp
+	rorl	$2,%edi
+	movl	%edx,%ebx
+	roll	$5,%ebx
+	movl	%ecx,52(%esp)
+	leal	2400959708(%ecx,%ebp,1),%ecx
+	movl	%esi,%ebp
+	addl	%ebx,%ecx
+	andl	%eax,%ebp
+	movl	56(%esp),%ebx
+	addl	%ebp,%ecx
+	# 40_59 46 
+	movl	%edi,%ebp
+	xorl	(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	24(%esp),%ebx
+	andl	%edx,%ebp
+	xorl	44(%esp),%ebx
+	roll	$1,%ebx
+	addl	%eax,%ebp
+	rorl	$2,%edx
+	movl	%ecx,%eax
+	roll	$5,%eax
+	movl	%ebx,56(%esp)
+	leal	2400959708(%ebx,%ebp,1),%ebx
+	movl	%edi,%ebp
+	addl	%eax,%ebx
+	andl	%esi,%ebp
+	movl	60(%esp),%eax
+	addl	%ebp,%ebx
+	# 40_59 47 
+	movl	%edx,%ebp
+	xorl	4(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	28(%esp),%eax
+	andl	%ecx,%ebp
+	xorl	48(%esp),%eax
+	roll	$1,%eax
+	addl	%esi,%ebp
+	rorl	$2,%ecx
+	movl	%ebx,%esi
+	roll	$5,%esi
+	movl	%eax,60(%esp)
+	leal	2400959708(%eax,%ebp,1),%eax
+	movl	%edx,%ebp
+	addl	%esi,%eax
+	andl	%edi,%ebp
+	movl	(%esp),%esi
+	addl	%ebp,%eax
+	# 40_59 48 
+	movl	%ecx,%ebp
+	xorl	8(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	32(%esp),%esi
+	andl	%ebx,%ebp
+	xorl	52(%esp),%esi
+	roll	$1,%esi
+	addl	%edi,%ebp
+	rorl	$2,%ebx
+	movl	%eax,%edi
+	roll	$5,%edi
+	movl	%esi,(%esp)
+	leal	2400959708(%esi,%ebp,1),%esi
+	movl	%ecx,%ebp
+	addl	%edi,%esi
+	andl	%edx,%ebp
+	movl	4(%esp),%edi
+	addl	%ebp,%esi
+	# 40_59 49 
+	movl	%ebx,%ebp
+	xorl	12(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	36(%esp),%edi
+	andl	%eax,%ebp
+	xorl	56(%esp),%edi
+	roll	$1,%edi
+	addl	%edx,%ebp
+	rorl	$2,%eax
+	movl	%esi,%edx
+	roll	$5,%edx
+	movl	%edi,4(%esp)
+	leal	2400959708(%edi,%ebp,1),%edi
+	movl	%ebx,%ebp
+	addl	%edx,%edi
+	andl	%ecx,%ebp
+	movl	8(%esp),%edx
+	addl	%ebp,%edi
+	# 40_59 50 
+	movl	%eax,%ebp
+	xorl	16(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	40(%esp),%edx
+	andl	%esi,%ebp
+	xorl	60(%esp),%edx
+	roll	$1,%edx
+	addl	%ecx,%ebp
+	rorl	$2,%esi
+	movl	%edi,%ecx
+	roll	$5,%ecx
+	movl	%edx,8(%esp)
+	leal	2400959708(%edx,%ebp,1),%edx
+	movl	%eax,%ebp
+	addl	%ecx,%edx
+	andl	%ebx,%ebp
+	movl	12(%esp),%ecx
+	addl	%ebp,%edx
+	# 40_59 51 
+	movl	%esi,%ebp
+	xorl	20(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	44(%esp),%ecx
+	andl	%edi,%ebp
+	xorl	(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebx,%ebp
+	rorl	$2,%edi
+	movl	%edx,%ebx
+	roll	$5,%ebx
+	movl	%ecx,12(%esp)
+	leal	2400959708(%ecx,%ebp,1),%ecx
+	movl	%esi,%ebp
+	addl	%ebx,%ecx
+	andl	%eax,%ebp
+	movl	16(%esp),%ebx
+	addl	%ebp,%ecx
+	# 40_59 52 
+	movl	%edi,%ebp
+	xorl	24(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	48(%esp),%ebx
+	andl	%edx,%ebp
+	xorl	4(%esp),%ebx
+	roll	$1,%ebx
+	addl	%eax,%ebp
+	rorl	$2,%edx
+	movl	%ecx,%eax
+	roll	$5,%eax
+	movl	%ebx,16(%esp)
+	leal	2400959708(%ebx,%ebp,1),%ebx
+	movl	%edi,%ebp
+	addl	%eax,%ebx
+	andl	%esi,%ebp
+	movl	20(%esp),%eax
+	addl	%ebp,%ebx
+	# 40_59 53 
+	movl	%edx,%ebp
+	xorl	28(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	52(%esp),%eax
+	andl	%ecx,%ebp
+	xorl	8(%esp),%eax
+	roll	$1,%eax
+	addl	%esi,%ebp
+	rorl	$2,%ecx
+	movl	%ebx,%esi
+	roll	$5,%esi
+	movl	%eax,20(%esp)
+	leal	2400959708(%eax,%ebp,1),%eax
+	movl	%edx,%ebp
+	addl	%esi,%eax
+	andl	%edi,%ebp
+	movl	24(%esp),%esi
+	addl	%ebp,%eax
+	# 40_59 54 
+	movl	%ecx,%ebp
+	xorl	32(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	56(%esp),%esi
+	andl	%ebx,%ebp
+	xorl	12(%esp),%esi
+	roll	$1,%esi
+	addl	%edi,%ebp
+	rorl	$2,%ebx
+	movl	%eax,%edi
+	roll	$5,%edi
+	movl	%esi,24(%esp)
+	leal	2400959708(%esi,%ebp,1),%esi
+	movl	%ecx,%ebp
+	addl	%edi,%esi
+	andl	%edx,%ebp
+	movl	28(%esp),%edi
+	addl	%ebp,%esi
+	# 40_59 55 
+	movl	%ebx,%ebp
+	xorl	36(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	60(%esp),%edi
+	andl	%eax,%ebp
+	xorl	16(%esp),%edi
+	roll	$1,%edi
+	addl	%edx,%ebp
+	rorl	$2,%eax
+	movl	%esi,%edx
+	roll	$5,%edx
+	movl	%edi,28(%esp)
+	leal	2400959708(%edi,%ebp,1),%edi
+	movl	%ebx,%ebp
+	addl	%edx,%edi
+	andl	%ecx,%ebp
+	movl	32(%esp),%edx
+	addl	%ebp,%edi
+	# 40_59 56 
+	movl	%eax,%ebp
+	xorl	40(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	(%esp),%edx
+	andl	%esi,%ebp
+	xorl	20(%esp),%edx
+	roll	$1,%edx
+	addl	%ecx,%ebp
+	rorl	$2,%esi
+	movl	%edi,%ecx
+	roll	$5,%ecx
+	movl	%edx,32(%esp)
+	leal	2400959708(%edx,%ebp,1),%edx
+	movl	%eax,%ebp
+	addl	%ecx,%edx
+	andl	%ebx,%ebp
+	movl	36(%esp),%ecx
+	addl	%ebp,%edx
+	# 40_59 57 
+	movl	%esi,%ebp
+	xorl	44(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	4(%esp),%ecx
+	andl	%edi,%ebp
+	xorl	24(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebx,%ebp
+	rorl	$2,%edi
+	movl	%edx,%ebx
+	roll	$5,%ebx
+	movl	%ecx,36(%esp)
+	leal	2400959708(%ecx,%ebp,1),%ecx
+	movl	%esi,%ebp
+	addl	%ebx,%ecx
+	andl	%eax,%ebp
+	movl	40(%esp),%ebx
+	addl	%ebp,%ecx
+	# 40_59 58 
+	movl	%edi,%ebp
+	xorl	48(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	8(%esp),%ebx
+	andl	%edx,%ebp
+	xorl	28(%esp),%ebx
+	roll	$1,%ebx
+	addl	%eax,%ebp
+	rorl	$2,%edx
+	movl	%ecx,%eax
+	roll	$5,%eax
+	movl	%ebx,40(%esp)
+	leal	2400959708(%ebx,%ebp,1),%ebx
+	movl	%edi,%ebp
+	addl	%eax,%ebx
+	andl	%esi,%ebp
+	movl	44(%esp),%eax
+	addl	%ebp,%ebx
+	# 40_59 59 
+	movl	%edx,%ebp
+	xorl	52(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	12(%esp),%eax
+	andl	%ecx,%ebp
+	xorl	32(%esp),%eax
+	roll	$1,%eax
+	addl	%esi,%ebp
+	rorl	$2,%ecx
+	movl	%ebx,%esi
+	roll	$5,%esi
+	movl	%eax,44(%esp)
+	leal	2400959708(%eax,%ebp,1),%eax
+	movl	%edx,%ebp
+	addl	%esi,%eax
+	andl	%edi,%ebp
+	movl	48(%esp),%esi
+	addl	%ebp,%eax
+	# 20_39 60 
+	movl	%ebx,%ebp
+	xorl	56(%esp),%esi
+	xorl	%ecx,%ebp
+	xorl	16(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	36(%esp),%esi
+	roll	$1,%esi
+	addl	%ebp,%edi
+	rorl	$2,%ebx
+	movl	%eax,%ebp
+	roll	$5,%ebp
+	movl	%esi,48(%esp)
+	leal	3395469782(%esi,%edi,1),%esi
+	movl	52(%esp),%edi
+	addl	%ebp,%esi
+	# 20_39 61 
+	movl	%eax,%ebp
+	xorl	60(%esp),%edi
+	xorl	%ebx,%ebp
+	xorl	20(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	40(%esp),%edi
+	roll	$1,%edi
+	addl	%ebp,%edx
+	rorl	$2,%eax
+	movl	%esi,%ebp
+	roll	$5,%ebp
+	movl	%edi,52(%esp)
+	leal	3395469782(%edi,%edx,1),%edi
+	movl	56(%esp),%edx
+	addl	%ebp,%edi
+	# 20_39 62 
+	movl	%esi,%ebp
+	xorl	(%esp),%edx
+	xorl	%eax,%ebp
+	xorl	24(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	44(%esp),%edx
+	roll	$1,%edx
+	addl	%ebp,%ecx
+	rorl	$2,%esi
+	movl	%edi,%ebp
+	roll	$5,%ebp
+	movl	%edx,56(%esp)
+	leal	3395469782(%edx,%ecx,1),%edx
+	movl	60(%esp),%ecx
+	addl	%ebp,%edx
+	# 20_39 63 
+	movl	%edi,%ebp
+	xorl	4(%esp),%ecx
+	xorl	%esi,%ebp
+	xorl	28(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	48(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebp,%ebx
+	rorl	$2,%edi
+	movl	%edx,%ebp
+	roll	$5,%ebp
+	movl	%ecx,60(%esp)
+	leal	3395469782(%ecx,%ebx,1),%ecx
+	movl	(%esp),%ebx
+	addl	%ebp,%ecx
+	# 20_39 64 
+	movl	%edx,%ebp
+	xorl	8(%esp),%ebx
+	xorl	%edi,%ebp
+	xorl	32(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	52(%esp),%ebx
+	roll	$1,%ebx
+	addl	%ebp,%eax
+	rorl	$2,%edx
+	movl	%ecx,%ebp
+	roll	$5,%ebp
+	movl	%ebx,(%esp)
+	leal	3395469782(%ebx,%eax,1),%ebx
+	movl	4(%esp),%eax
+	addl	%ebp,%ebx
+	# 20_39 65 
+	movl	%ecx,%ebp
+	xorl	12(%esp),%eax
+	xorl	%edx,%ebp
+	xorl	36(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	56(%esp),%eax
+	roll	$1,%eax
+	addl	%ebp,%esi
+	rorl	$2,%ecx
+	movl	%ebx,%ebp
+	roll	$5,%ebp
+	movl	%eax,4(%esp)
+	leal	3395469782(%eax,%esi,1),%eax
+	movl	8(%esp),%esi
+	addl	%ebp,%eax
+	# 20_39 66 
+	movl	%ebx,%ebp
+	xorl	16(%esp),%esi
+	xorl	%ecx,%ebp
+	xorl	40(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	60(%esp),%esi
+	roll	$1,%esi
+	addl	%ebp,%edi
+	rorl	$2,%ebx
+	movl	%eax,%ebp
+	roll	$5,%ebp
+	movl	%esi,8(%esp)
+	leal	3395469782(%esi,%edi,1),%esi
+	movl	12(%esp),%edi
+	addl	%ebp,%esi
+	# 20_39 67 
+	movl	%eax,%ebp
+	xorl	20(%esp),%edi
+	xorl	%ebx,%ebp
+	xorl	44(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	(%esp),%edi
+	roll	$1,%edi
+	addl	%ebp,%edx
+	rorl	$2,%eax
+	movl	%esi,%ebp
+	roll	$5,%ebp
+	movl	%edi,12(%esp)
+	leal	3395469782(%edi,%edx,1),%edi
+	movl	16(%esp),%edx
+	addl	%ebp,%edi
+	# 20_39 68 
+	movl	%esi,%ebp
+	xorl	24(%esp),%edx
+	xorl	%eax,%ebp
+	xorl	48(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	4(%esp),%edx
+	roll	$1,%edx
+	addl	%ebp,%ecx
+	rorl	$2,%esi
+	movl	%edi,%ebp
+	roll	$5,%ebp
+	movl	%edx,16(%esp)
+	leal	3395469782(%edx,%ecx,1),%edx
+	movl	20(%esp),%ecx
+	addl	%ebp,%edx
+	# 20_39 69 
+	movl	%edi,%ebp
+	xorl	28(%esp),%ecx
+	xorl	%esi,%ebp
+	xorl	52(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	8(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebp,%ebx
+	rorl	$2,%edi
+	movl	%edx,%ebp
+	roll	$5,%ebp
+	movl	%ecx,20(%esp)
+	leal	3395469782(%ecx,%ebx,1),%ecx
+	movl	24(%esp),%ebx
+	addl	%ebp,%ecx
+	# 20_39 70 
+	movl	%edx,%ebp
+	xorl	32(%esp),%ebx
+	xorl	%edi,%ebp
+	xorl	56(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	12(%esp),%ebx
+	roll	$1,%ebx
+	addl	%ebp,%eax
+	rorl	$2,%edx
+	movl	%ecx,%ebp
+	roll	$5,%ebp
+	movl	%ebx,24(%esp)
+	leal	3395469782(%ebx,%eax,1),%ebx
+	movl	28(%esp),%eax
+	addl	%ebp,%ebx
+	# 20_39 71 
+	movl	%ecx,%ebp
+	xorl	36(%esp),%eax
+	xorl	%edx,%ebp
+	xorl	60(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	16(%esp),%eax
+	roll	$1,%eax
+	addl	%ebp,%esi
+	rorl	$2,%ecx
+	movl	%ebx,%ebp
+	roll	$5,%ebp
+	movl	%eax,28(%esp)
+	leal	3395469782(%eax,%esi,1),%eax
+	movl	32(%esp),%esi
+	addl	%ebp,%eax
+	# 20_39 72 
+	movl	%ebx,%ebp
+	xorl	40(%esp),%esi
+	xorl	%ecx,%ebp
+	xorl	(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	20(%esp),%esi
+	roll	$1,%esi
+	addl	%ebp,%edi
+	rorl	$2,%ebx
+	movl	%eax,%ebp
+	roll	$5,%ebp
+	movl	%esi,32(%esp)
+	leal	3395469782(%esi,%edi,1),%esi
+	movl	36(%esp),%edi
+	addl	%ebp,%esi
+	# 20_39 73 
+	movl	%eax,%ebp
+	xorl	44(%esp),%edi
+	xorl	%ebx,%ebp
+	xorl	4(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	24(%esp),%edi
+	roll	$1,%edi
+	addl	%ebp,%edx
+	rorl	$2,%eax
+	movl	%esi,%ebp
+	roll	$5,%ebp
+	movl	%edi,36(%esp)
+	leal	3395469782(%edi,%edx,1),%edi
+	movl	40(%esp),%edx
+	addl	%ebp,%edi
+	# 20_39 74 
+	movl	%esi,%ebp
+	xorl	48(%esp),%edx
+	xorl	%eax,%ebp
+	xorl	8(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	28(%esp),%edx
+	roll	$1,%edx
+	addl	%ebp,%ecx
+	rorl	$2,%esi
+	movl	%edi,%ebp
+	roll	$5,%ebp
+	movl	%edx,40(%esp)
+	leal	3395469782(%edx,%ecx,1),%edx
+	movl	44(%esp),%ecx
+	addl	%ebp,%edx
+	# 20_39 75 
+	movl	%edi,%ebp
+	xorl	52(%esp),%ecx
+	xorl	%esi,%ebp
+	xorl	12(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	32(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebp,%ebx
+	rorl	$2,%edi
+	movl	%edx,%ebp
+	roll	$5,%ebp
+	movl	%ecx,44(%esp)
+	leal	3395469782(%ecx,%ebx,1),%ecx
+	movl	48(%esp),%ebx
+	addl	%ebp,%ecx
+	# 20_39 76 
+	movl	%edx,%ebp
+	xorl	56(%esp),%ebx
+	xorl	%edi,%ebp
+	xorl	16(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	36(%esp),%ebx
+	roll	$1,%ebx
+	addl	%ebp,%eax
+	rorl	$2,%edx
+	movl	%ecx,%ebp
+	roll	$5,%ebp
+	movl	%ebx,48(%esp)
+	leal	3395469782(%ebx,%eax,1),%ebx
+	movl	52(%esp),%eax
+	addl	%ebp,%ebx
+	# 20_39 77 
+	movl	%ecx,%ebp
+	xorl	60(%esp),%eax
+	xorl	%edx,%ebp
+	xorl	20(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	40(%esp),%eax
+	roll	$1,%eax
+	addl	%ebp,%esi
+	rorl	$2,%ecx
+	movl	%ebx,%ebp
+	roll	$5,%ebp
+	leal	3395469782(%eax,%esi,1),%eax
+	movl	56(%esp),%esi
+	addl	%ebp,%eax
+	# 20_39 78 
+	movl	%ebx,%ebp
+	xorl	(%esp),%esi
+	xorl	%ecx,%ebp
+	xorl	24(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	44(%esp),%esi
+	roll	$1,%esi
+	addl	%ebp,%edi
+	rorl	$2,%ebx
+	movl	%eax,%ebp
+	roll	$5,%ebp
+	leal	3395469782(%esi,%edi,1),%esi
+	movl	60(%esp),%edi
+	addl	%ebp,%esi
+	# 20_39 79 
+	movl	%eax,%ebp
+	xorl	4(%esp),%edi
+	xorl	%ebx,%ebp
+	xorl	28(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	48(%esp),%edi
+	roll	$1,%edi
+	addl	%ebp,%edx
+	rorl	$2,%eax
+	movl	%esi,%ebp
+	roll	$5,%ebp
+	leal	3395469782(%edi,%edx,1),%edi
+	addl	%ebp,%edi
+	movl	96(%esp),%ebp
+	movl	100(%esp),%edx
+	addl	(%ebp),%edi
+	addl	4(%ebp),%esi
+	addl	8(%ebp),%eax
+	addl	12(%ebp),%ebx
+	addl	16(%ebp),%ecx
+	movl	%edi,(%ebp)
+	addl	$64,%edx
+	movl	%esi,4(%ebp)
+	cmpl	104(%esp),%edx
+	movl	%eax,8(%ebp)
+	movl	%ecx,%edi
+	movl	%ebx,12(%ebp)
+	movl	%edx,%esi
+	movl	%ecx,16(%ebp)
+	jb	L000loop
+	addl	$76,%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
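+
+// The SSSE3 variant computes the same 80 rounds, but byte-swaps the
+// input with pshufb (the 102,15,56,0 byte sequences below) and keeps
+// the message schedule in xmm registers, interleaving the W[t] vector
+// updates with the scalar round arithmetic.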
+.globl	_sha1_block_data_order_ssse3
+.private_extern	_sha1_block_data_order_ssse3
+.align	4
+_sha1_block_data_order_ssse3:
+L_sha1_block_data_order_ssse3_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	call	L001pic_point
+L001pic_point:
+	popl	%ebp
+	leal	LK_XX_XX-L001pic_point(%ebp),%ebp
+	movdqa	(%ebp),%xmm7
+	movdqa	16(%ebp),%xmm0
+	movdqa	32(%ebp),%xmm1
+	movdqa	48(%ebp),%xmm2
+	movdqa	64(%ebp),%xmm6
+	movl	20(%esp),%edi
+	movl	24(%esp),%ebp
+	movl	28(%esp),%edx
+	movl	%esp,%esi
+	subl	$208,%esp
+	andl	$-64,%esp
+	movdqa	%xmm0,112(%esp)
+	movdqa	%xmm1,128(%esp)
+	movdqa	%xmm2,144(%esp)
+	shll	$6,%edx
+	movdqa	%xmm7,160(%esp)
+	addl	%ebp,%edx
+	movdqa	%xmm6,176(%esp)
+	addl	$64,%ebp
+	movl	%edi,192(%esp)
+	movl	%ebp,196(%esp)
+	movl	%edx,200(%esp)
+	movl	%esi,204(%esp)
+	movl	(%edi),%eax
+	movl	4(%edi),%ebx
+	movl	8(%edi),%ecx
+	movl	12(%edi),%edx
+	movl	16(%edi),%edi
+	movl	%ebx,%esi
+	movdqu	-64(%ebp),%xmm0
+	movdqu	-48(%ebp),%xmm1
+	movdqu	-32(%ebp),%xmm2
+	movdqu	-16(%ebp),%xmm3
+.byte	102,15,56,0,198
+.byte	102,15,56,0,206
+.byte	102,15,56,0,214
+	movdqa	%xmm7,96(%esp)
+.byte	102,15,56,0,222
+	paddd	%xmm7,%xmm0
+	paddd	%xmm7,%xmm1
+	paddd	%xmm7,%xmm2
+	movdqa	%xmm0,(%esp)
+	psubd	%xmm7,%xmm0
+	movdqa	%xmm1,16(%esp)
+	psubd	%xmm7,%xmm1
+	movdqa	%xmm2,32(%esp)
+	movl	%ecx,%ebp
+	psubd	%xmm7,%xmm2
+	xorl	%edx,%ebp
+	pshufd	$238,%xmm0,%xmm4
+	andl	%ebp,%esi
+	jmp	L002loop
+.align	4,0x90
+L002loop:
+	rorl	$2,%ebx
+	xorl	%edx,%esi
+	movl	%eax,%ebp
+	punpcklqdq	%xmm1,%xmm4
+	movdqa	%xmm3,%xmm6
+	addl	(%esp),%edi
+	xorl	%ecx,%ebx
+	paddd	%xmm3,%xmm7
+	movdqa	%xmm0,64(%esp)
+	roll	$5,%eax
+	addl	%esi,%edi
+	psrldq	$4,%xmm6
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	pxor	%xmm0,%xmm4
+	addl	%eax,%edi
+	rorl	$7,%eax
+	pxor	%xmm2,%xmm6
+	xorl	%ecx,%ebp
+	movl	%edi,%esi
+	addl	4(%esp),%edx
+	pxor	%xmm6,%xmm4
+	xorl	%ebx,%eax
+	roll	$5,%edi
+	movdqa	%xmm7,48(%esp)
+	addl	%ebp,%edx
+	andl	%eax,%esi
+	movdqa	%xmm4,%xmm0
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	rorl	$7,%edi
+	movdqa	%xmm4,%xmm6
+	xorl	%ebx,%esi
+	pslldq	$12,%xmm0
+	paddd	%xmm4,%xmm4
+	movl	%edx,%ebp
+	addl	8(%esp),%ecx
+	psrld	$31,%xmm6
+	xorl	%eax,%edi
+	roll	$5,%edx
+	movdqa	%xmm0,%xmm7
+	addl	%esi,%ecx
+	andl	%edi,%ebp
+	xorl	%eax,%edi
+	psrld	$30,%xmm0
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	por	%xmm6,%xmm4
+	xorl	%eax,%ebp
+	movl	%ecx,%esi
+	addl	12(%esp),%ebx
+	pslld	$2,%xmm7
+	xorl	%edi,%edx
+	roll	$5,%ecx
+	pxor	%xmm0,%xmm4
+	movdqa	96(%esp),%xmm0
+	addl	%ebp,%ebx
+	andl	%edx,%esi
+	pxor	%xmm7,%xmm4
+	pshufd	$238,%xmm1,%xmm5
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	movl	%ebx,%ebp
+	punpcklqdq	%xmm2,%xmm5
+	movdqa	%xmm4,%xmm7
+	addl	16(%esp),%eax
+	xorl	%edx,%ecx
+	paddd	%xmm4,%xmm0
+	movdqa	%xmm1,80(%esp)
+	roll	$5,%ebx
+	addl	%esi,%eax
+	psrldq	$4,%xmm7
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
+	pxor	%xmm1,%xmm5
+	addl	%ebx,%eax
+	rorl	$7,%ebx
+	pxor	%xmm3,%xmm7
+	xorl	%edx,%ebp
+	movl	%eax,%esi
+	addl	20(%esp),%edi
+	pxor	%xmm7,%xmm5
+	xorl	%ecx,%ebx
+	roll	$5,%eax
+	movdqa	%xmm0,(%esp)
+	addl	%ebp,%edi
+	andl	%ebx,%esi
+	movdqa	%xmm5,%xmm1
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	rorl	$7,%eax
+	movdqa	%xmm5,%xmm7
+	xorl	%ecx,%esi
+	pslldq	$12,%xmm1
+	paddd	%xmm5,%xmm5
+	movl	%edi,%ebp
+	addl	24(%esp),%edx
+	psrld	$31,%xmm7
+	xorl	%ebx,%eax
+	roll	$5,%edi
+	movdqa	%xmm1,%xmm0
+	addl	%esi,%edx
+	andl	%eax,%ebp
+	xorl	%ebx,%eax
+	psrld	$30,%xmm1
+	addl	%edi,%edx
+	rorl	$7,%edi
+	por	%xmm7,%xmm5
+	xorl	%ebx,%ebp
+	movl	%edx,%esi
+	addl	28(%esp),%ecx
+	pslld	$2,%xmm0
+	xorl	%eax,%edi
+	roll	$5,%edx
+	pxor	%xmm1,%xmm5
+	movdqa	112(%esp),%xmm1
+	addl	%ebp,%ecx
+	andl	%edi,%esi
+	pxor	%xmm0,%xmm5
+	pshufd	$238,%xmm2,%xmm6
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	xorl	%eax,%esi
+	movl	%ecx,%ebp
+	punpcklqdq	%xmm3,%xmm6
+	movdqa	%xmm5,%xmm0
+	addl	32(%esp),%ebx
+	xorl	%edi,%edx
+	paddd	%xmm5,%xmm1
+	movdqa	%xmm2,96(%esp)
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	psrldq	$4,%xmm0
+	andl	%edx,%ebp
+	xorl	%edi,%edx
+	pxor	%xmm2,%xmm6
+	addl	%ecx,%ebx
+	rorl	$7,%ecx
+	pxor	%xmm4,%xmm0
+	xorl	%edi,%ebp
+	movl	%ebx,%esi
+	addl	36(%esp),%eax
+	pxor	%xmm0,%xmm6
+	xorl	%edx,%ecx
+	roll	$5,%ebx
+	movdqa	%xmm1,16(%esp)
+	addl	%ebp,%eax
+	andl	%ecx,%esi
+	movdqa	%xmm6,%xmm2
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	rorl	$7,%ebx
+	movdqa	%xmm6,%xmm0
+	xorl	%edx,%esi
+	pslldq	$12,%xmm2
+	paddd	%xmm6,%xmm6
+	movl	%eax,%ebp
+	addl	40(%esp),%edi
+	psrld	$31,%xmm0
+	xorl	%ecx,%ebx
+	roll	$5,%eax
+	movdqa	%xmm2,%xmm1
+	addl	%esi,%edi
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	psrld	$30,%xmm2
+	addl	%eax,%edi
+	rorl	$7,%eax
+	por	%xmm0,%xmm6
+	xorl	%ecx,%ebp
+	movdqa	64(%esp),%xmm0
+	movl	%edi,%esi
+	addl	44(%esp),%edx
+	pslld	$2,%xmm1
+	xorl	%ebx,%eax
+	roll	$5,%edi
+	pxor	%xmm2,%xmm6
+	movdqa	112(%esp),%xmm2
+	addl	%ebp,%edx
+	andl	%eax,%esi
+	pxor	%xmm1,%xmm6
+	pshufd	$238,%xmm3,%xmm7
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	rorl	$7,%edi
+	xorl	%ebx,%esi
+	movl	%edx,%ebp
+	punpcklqdq	%xmm4,%xmm7
+	movdqa	%xmm6,%xmm1
+	addl	48(%esp),%ecx
+	xorl	%eax,%edi
+	paddd	%xmm6,%xmm2
+	movdqa	%xmm3,64(%esp)
+	roll	$5,%edx
+	addl	%esi,%ecx
+	psrldq	$4,%xmm1
+	andl	%edi,%ebp
+	xorl	%eax,%edi
+	pxor	%xmm3,%xmm7
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	pxor	%xmm5,%xmm1
+	xorl	%eax,%ebp
+	movl	%ecx,%esi
+	addl	52(%esp),%ebx
+	pxor	%xmm1,%xmm7
+	xorl	%edi,%edx
+	roll	$5,%ecx
+	movdqa	%xmm2,32(%esp)
+	addl	%ebp,%ebx
+	andl	%edx,%esi
+	movdqa	%xmm7,%xmm3
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	rorl	$7,%ecx
+	movdqa	%xmm7,%xmm1
+	xorl	%edi,%esi
+	pslldq	$12,%xmm3
+	paddd	%xmm7,%xmm7
+	movl	%ebx,%ebp
+	addl	56(%esp),%eax
+	psrld	$31,%xmm1
+	xorl	%edx,%ecx
+	roll	$5,%ebx
+	movdqa	%xmm3,%xmm2
+	addl	%esi,%eax
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
+	psrld	$30,%xmm3
+	addl	%ebx,%eax
+	rorl	$7,%ebx
+	por	%xmm1,%xmm7
+	xorl	%edx,%ebp
+	movdqa	80(%esp),%xmm1
+	movl	%eax,%esi
+	addl	60(%esp),%edi
+	pslld	$2,%xmm2
+	xorl	%ecx,%ebx
+	roll	$5,%eax
+	pxor	%xmm3,%xmm7
+	movdqa	112(%esp),%xmm3
+	addl	%ebp,%edi
+	andl	%ebx,%esi
+	pxor	%xmm2,%xmm7
+	pshufd	$238,%xmm6,%xmm2
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	rorl	$7,%eax
+	pxor	%xmm4,%xmm0
+	punpcklqdq	%xmm7,%xmm2
+	xorl	%ecx,%esi
+	movl	%edi,%ebp
+	addl	(%esp),%edx
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm4,80(%esp)
+	xorl	%ebx,%eax
+	roll	$5,%edi
+	movdqa	%xmm3,%xmm4
+	addl	%esi,%edx
+	paddd	%xmm7,%xmm3
+	andl	%eax,%ebp
+	pxor	%xmm2,%xmm0
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	rorl	$7,%edi
+	xorl	%ebx,%ebp
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm3,48(%esp)
+	movl	%edx,%esi
+	addl	4(%esp),%ecx
+	xorl	%eax,%edi
+	roll	$5,%edx
+	pslld	$2,%xmm0
+	addl	%ebp,%ecx
+	andl	%edi,%esi
+	psrld	$30,%xmm2
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	xorl	%eax,%esi
+	movl	%ecx,%ebp
+	addl	8(%esp),%ebx
+	xorl	%edi,%edx
+	roll	$5,%ecx
+	por	%xmm2,%xmm0
+	addl	%esi,%ebx
+	andl	%edx,%ebp
+	movdqa	96(%esp),%xmm2
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	addl	12(%esp),%eax
+	xorl	%edi,%ebp
+	movl	%ebx,%esi
+	pshufd	$238,%xmm7,%xmm3
+	roll	$5,%ebx
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	16(%esp),%edi
+	pxor	%xmm5,%xmm1
+	punpcklqdq	%xmm0,%xmm3
+	xorl	%ecx,%esi
+	movl	%eax,%ebp
+	roll	$5,%eax
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm5,96(%esp)
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	movdqa	%xmm4,%xmm5
+	rorl	$7,%ebx
+	paddd	%xmm0,%xmm4
+	addl	%eax,%edi
+	pxor	%xmm3,%xmm1
+	addl	20(%esp),%edx
+	xorl	%ebx,%ebp
+	movl	%edi,%esi
+	roll	$5,%edi
+	movdqa	%xmm1,%xmm3
+	movdqa	%xmm4,(%esp)
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
+	addl	%edi,%edx
+	pslld	$2,%xmm1
+	addl	24(%esp),%ecx
+	xorl	%eax,%esi
+	psrld	$30,%xmm3
+	movl	%edx,%ebp
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	rorl	$7,%edi
+	addl	%edx,%ecx
+	por	%xmm3,%xmm1
+	addl	28(%esp),%ebx
+	xorl	%edi,%ebp
+	movdqa	64(%esp),%xmm3
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	rorl	$7,%edx
+	pshufd	$238,%xmm0,%xmm4
+	addl	%ecx,%ebx
+	addl	32(%esp),%eax
+	pxor	%xmm6,%xmm2
+	punpcklqdq	%xmm1,%xmm4
+	xorl	%edx,%esi
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	pxor	%xmm3,%xmm2
+	movdqa	%xmm6,64(%esp)
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	movdqa	128(%esp),%xmm6
+	rorl	$7,%ecx
+	paddd	%xmm1,%xmm5
+	addl	%ebx,%eax
+	pxor	%xmm4,%xmm2
+	addl	36(%esp),%edi
+	xorl	%ecx,%ebp
+	movl	%eax,%esi
+	roll	$5,%eax
+	movdqa	%xmm2,%xmm4
+	movdqa	%xmm5,16(%esp)
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	addl	%eax,%edi
+	pslld	$2,%xmm2
+	addl	40(%esp),%edx
+	xorl	%ebx,%esi
+	psrld	$30,%xmm4
+	movl	%edi,%ebp
+	roll	$5,%edi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	rorl	$7,%eax
+	addl	%edi,%edx
+	por	%xmm4,%xmm2
+	addl	44(%esp),%ecx
+	xorl	%eax,%ebp
+	movdqa	80(%esp),%xmm4
+	movl	%edx,%esi
+	roll	$5,%edx
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%edi
+	pshufd	$238,%xmm1,%xmm5
+	addl	%edx,%ecx
+	addl	48(%esp),%ebx
+	pxor	%xmm7,%xmm3
+	punpcklqdq	%xmm2,%xmm5
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	pxor	%xmm4,%xmm3
+	movdqa	%xmm7,80(%esp)
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	movdqa	%xmm6,%xmm7
+	rorl	$7,%edx
+	paddd	%xmm2,%xmm6
+	addl	%ecx,%ebx
+	pxor	%xmm5,%xmm3
+	addl	52(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	movdqa	%xmm3,%xmm5
+	movdqa	%xmm6,32(%esp)
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	pslld	$2,%xmm3
+	addl	56(%esp),%edi
+	xorl	%ecx,%esi
+	psrld	$30,%xmm5
+	movl	%eax,%ebp
+	roll	$5,%eax
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	rorl	$7,%ebx
+	addl	%eax,%edi
+	por	%xmm5,%xmm3
+	addl	60(%esp),%edx
+	xorl	%ebx,%ebp
+	movdqa	96(%esp),%xmm5
+	movl	%edi,%esi
+	roll	$5,%edi
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
+	pshufd	$238,%xmm2,%xmm6
+	addl	%edi,%edx
+	addl	(%esp),%ecx
+	pxor	%xmm0,%xmm4
+	punpcklqdq	%xmm3,%xmm6
+	xorl	%eax,%esi
+	movl	%edx,%ebp
+	roll	$5,%edx
+	pxor	%xmm5,%xmm4
+	movdqa	%xmm0,96(%esp)
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	movdqa	%xmm7,%xmm0
+	rorl	$7,%edi
+	paddd	%xmm3,%xmm7
+	addl	%edx,%ecx
+	pxor	%xmm6,%xmm4
+	addl	4(%esp),%ebx
+	xorl	%edi,%ebp
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	movdqa	%xmm4,%xmm6
+	movdqa	%xmm7,48(%esp)
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	pslld	$2,%xmm4
+	addl	8(%esp),%eax
+	xorl	%edx,%esi
+	psrld	$30,%xmm6
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	por	%xmm6,%xmm4
+	addl	12(%esp),%edi
+	xorl	%ecx,%ebp
+	movdqa	64(%esp),%xmm6
+	movl	%eax,%esi
+	roll	$5,%eax
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	pshufd	$238,%xmm3,%xmm7
+	addl	%eax,%edi
+	addl	16(%esp),%edx
+	pxor	%xmm1,%xmm5
+	punpcklqdq	%xmm4,%xmm7
+	xorl	%ebx,%esi
+	movl	%edi,%ebp
+	roll	$5,%edi
+	pxor	%xmm6,%xmm5
+	movdqa	%xmm1,64(%esp)
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	movdqa	%xmm0,%xmm1
+	rorl	$7,%eax
+	paddd	%xmm4,%xmm0
+	addl	%edi,%edx
+	pxor	%xmm7,%xmm5
+	addl	20(%esp),%ecx
+	xorl	%eax,%ebp
+	movl	%edx,%esi
+	roll	$5,%edx
+	movdqa	%xmm5,%xmm7
+	movdqa	%xmm0,(%esp)
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%edi
+	addl	%edx,%ecx
+	pslld	$2,%xmm5
+	addl	24(%esp),%ebx
+	xorl	%edi,%esi
+	psrld	$30,%xmm7
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	por	%xmm7,%xmm5
+	addl	28(%esp),%eax
+	movdqa	80(%esp),%xmm7
+	rorl	$7,%ecx
+	movl	%ebx,%esi
+	xorl	%edx,%ebp
+	roll	$5,%ebx
+	pshufd	$238,%xmm4,%xmm0
+	addl	%ebp,%eax
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	32(%esp),%edi
+	pxor	%xmm2,%xmm6
+	punpcklqdq	%xmm5,%xmm0
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	rorl	$7,%ebx
+	pxor	%xmm7,%xmm6
+	movdqa	%xmm2,80(%esp)
+	movl	%eax,%ebp
+	xorl	%ecx,%esi
+	roll	$5,%eax
+	movdqa	%xmm1,%xmm2
+	addl	%esi,%edi
+	paddd	%xmm5,%xmm1
+	xorl	%ebx,%ebp
+	pxor	%xmm0,%xmm6
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	addl	36(%esp),%edx
+	andl	%ebx,%ebp
+	movdqa	%xmm6,%xmm0
+	movdqa	%xmm1,16(%esp)
+	xorl	%ecx,%ebx
+	rorl	$7,%eax
+	movl	%edi,%esi
+	xorl	%ebx,%ebp
+	roll	$5,%edi
+	pslld	$2,%xmm6
+	addl	%ebp,%edx
+	xorl	%eax,%esi
+	psrld	$30,%xmm0
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	addl	40(%esp),%ecx
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	rorl	$7,%edi
+	por	%xmm0,%xmm6
+	movl	%edx,%ebp
+	xorl	%eax,%esi
+	movdqa	96(%esp),%xmm0
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%edi,%ebp
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	pshufd	$238,%xmm5,%xmm1
+	addl	44(%esp),%ebx
+	andl	%edi,%ebp
+	xorl	%eax,%edi
+	rorl	$7,%edx
+	movl	%ecx,%esi
+	xorl	%edi,%ebp
+	roll	$5,%ecx
+	addl	%ebp,%ebx
+	xorl	%edx,%esi
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	addl	48(%esp),%eax
+	pxor	%xmm3,%xmm7
+	punpcklqdq	%xmm6,%xmm1
+	andl	%edx,%esi
+	xorl	%edi,%edx
+	rorl	$7,%ecx
+	pxor	%xmm0,%xmm7
+	movdqa	%xmm3,96(%esp)
+	movl	%ebx,%ebp
+	xorl	%edx,%esi
+	roll	$5,%ebx
+	movdqa	144(%esp),%xmm3
+	addl	%esi,%eax
+	paddd	%xmm6,%xmm2
+	xorl	%ecx,%ebp
+	pxor	%xmm1,%xmm7
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	52(%esp),%edi
+	andl	%ecx,%ebp
+	movdqa	%xmm7,%xmm1
+	movdqa	%xmm2,32(%esp)
+	xorl	%edx,%ecx
+	rorl	$7,%ebx
+	movl	%eax,%esi
+	xorl	%ecx,%ebp
+	roll	$5,%eax
+	pslld	$2,%xmm7
+	addl	%ebp,%edi
+	xorl	%ebx,%esi
+	psrld	$30,%xmm1
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	addl	56(%esp),%edx
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	rorl	$7,%eax
+	por	%xmm1,%xmm7
+	movl	%edi,%ebp
+	xorl	%ebx,%esi
+	movdqa	64(%esp),%xmm1
+	roll	$5,%edi
+	addl	%esi,%edx
+	xorl	%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	pshufd	$238,%xmm6,%xmm2
+	addl	60(%esp),%ecx
+	andl	%eax,%ebp
+	xorl	%ebx,%eax
+	rorl	$7,%edi
+	movl	%edx,%esi
+	xorl	%eax,%ebp
+	roll	$5,%edx
+	addl	%ebp,%ecx
+	xorl	%edi,%esi
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	addl	(%esp),%ebx
+	pxor	%xmm4,%xmm0
+	punpcklqdq	%xmm7,%xmm2
+	andl	%edi,%esi
+	xorl	%eax,%edi
+	rorl	$7,%edx
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm4,64(%esp)
+	movl	%ecx,%ebp
+	xorl	%edi,%esi
+	roll	$5,%ecx
+	movdqa	%xmm3,%xmm4
+	addl	%esi,%ebx
+	paddd	%xmm7,%xmm3
+	xorl	%edx,%ebp
+	pxor	%xmm2,%xmm0
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	addl	4(%esp),%eax
+	andl	%edx,%ebp
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm3,48(%esp)
+	xorl	%edi,%edx
+	rorl	$7,%ecx
+	movl	%ebx,%esi
+	xorl	%edx,%ebp
+	roll	$5,%ebx
+	pslld	$2,%xmm0
+	addl	%ebp,%eax
+	xorl	%ecx,%esi
+	psrld	$30,%xmm2
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	8(%esp),%edi
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	rorl	$7,%ebx
+	por	%xmm2,%xmm0
+	movl	%eax,%ebp
+	xorl	%ecx,%esi
+	movdqa	80(%esp),%xmm2
+	roll	$5,%eax
+	addl	%esi,%edi
+	xorl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	pshufd	$238,%xmm7,%xmm3
+	addl	12(%esp),%edx
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	rorl	$7,%eax
+	movl	%edi,%esi
+	xorl	%ebx,%ebp
+	roll	$5,%edi
+	addl	%ebp,%edx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	addl	16(%esp),%ecx
+	pxor	%xmm5,%xmm1
+	punpcklqdq	%xmm0,%xmm3
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	rorl	$7,%edi
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm5,80(%esp)
+	movl	%edx,%ebp
+	xorl	%eax,%esi
+	roll	$5,%edx
+	movdqa	%xmm4,%xmm5
+	addl	%esi,%ecx
+	paddd	%xmm0,%xmm4
+	xorl	%edi,%ebp
+	pxor	%xmm3,%xmm1
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	addl	20(%esp),%ebx
+	andl	%edi,%ebp
+	movdqa	%xmm1,%xmm3
+	movdqa	%xmm4,(%esp)
+	xorl	%eax,%edi
+	rorl	$7,%edx
+	movl	%ecx,%esi
+	xorl	%edi,%ebp
+	roll	$5,%ecx
+	pslld	$2,%xmm1
+	addl	%ebp,%ebx
+	xorl	%edx,%esi
+	psrld	$30,%xmm3
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	addl	24(%esp),%eax
+	andl	%edx,%esi
+	xorl	%edi,%edx
+	rorl	$7,%ecx
+	por	%xmm3,%xmm1
+	movl	%ebx,%ebp
+	xorl	%edx,%esi
+	movdqa	96(%esp),%xmm3
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%ecx,%ebp
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	pshufd	$238,%xmm0,%xmm4
+	addl	28(%esp),%edi
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
+	rorl	$7,%ebx
+	movl	%eax,%esi
+	xorl	%ecx,%ebp
+	roll	$5,%eax
+	addl	%ebp,%edi
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	addl	32(%esp),%edx
+	pxor	%xmm6,%xmm2
+	punpcklqdq	%xmm1,%xmm4
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	rorl	$7,%eax
+	pxor	%xmm3,%xmm2
+	movdqa	%xmm6,96(%esp)
+	movl	%edi,%ebp
+	xorl	%ebx,%esi
+	roll	$5,%edi
+	movdqa	%xmm5,%xmm6
+	addl	%esi,%edx
+	paddd	%xmm1,%xmm5
+	xorl	%eax,%ebp
+	pxor	%xmm4,%xmm2
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	addl	36(%esp),%ecx
+	andl	%eax,%ebp
+	movdqa	%xmm2,%xmm4
+	movdqa	%xmm5,16(%esp)
+	xorl	%ebx,%eax
+	rorl	$7,%edi
+	movl	%edx,%esi
+	xorl	%eax,%ebp
+	roll	$5,%edx
+	pslld	$2,%xmm2
+	addl	%ebp,%ecx
+	xorl	%edi,%esi
+	psrld	$30,%xmm4
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	addl	40(%esp),%ebx
+	andl	%edi,%esi
+	xorl	%eax,%edi
+	rorl	$7,%edx
+	por	%xmm4,%xmm2
+	movl	%ecx,%ebp
+	xorl	%edi,%esi
+	movdqa	64(%esp),%xmm4
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%edx,%ebp
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	pshufd	$238,%xmm1,%xmm5
+	addl	44(%esp),%eax
+	andl	%edx,%ebp
+	xorl	%edi,%edx
+	rorl	$7,%ecx
+	movl	%ebx,%esi
+	xorl	%edx,%ebp
+	roll	$5,%ebx
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	addl	48(%esp),%edi
+	pxor	%xmm7,%xmm3
+	punpcklqdq	%xmm2,%xmm5
+	xorl	%ecx,%esi
+	movl	%eax,%ebp
+	roll	$5,%eax
+	pxor	%xmm4,%xmm3
+	movdqa	%xmm7,64(%esp)
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	movdqa	%xmm6,%xmm7
+	rorl	$7,%ebx
+	paddd	%xmm2,%xmm6
+	addl	%eax,%edi
+	pxor	%xmm5,%xmm3
+	addl	52(%esp),%edx
+	xorl	%ebx,%ebp
+	movl	%edi,%esi
+	roll	$5,%edi
+	movdqa	%xmm3,%xmm5
+	movdqa	%xmm6,32(%esp)
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
+	addl	%edi,%edx
+	pslld	$2,%xmm3
+	addl	56(%esp),%ecx
+	xorl	%eax,%esi
+	psrld	$30,%xmm5
+	movl	%edx,%ebp
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	rorl	$7,%edi
+	addl	%edx,%ecx
+	por	%xmm5,%xmm3
+	addl	60(%esp),%ebx
+	xorl	%edi,%ebp
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	(%esp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	rorl	$7,%ecx
+	paddd	%xmm3,%xmm7
+	addl	%ebx,%eax
+	addl	4(%esp),%edi
+	xorl	%ecx,%ebp
+	movl	%eax,%esi
+	movdqa	%xmm7,48(%esp)
+	roll	$5,%eax
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	addl	%eax,%edi
+	addl	8(%esp),%edx
+	xorl	%ebx,%esi
+	movl	%edi,%ebp
+	roll	$5,%edi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	rorl	$7,%eax
+	addl	%edi,%edx
+	addl	12(%esp),%ecx
+	xorl	%eax,%ebp
+	movl	%edx,%esi
+	roll	$5,%edx
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%edi
+	addl	%edx,%ecx
+	movl	196(%esp),%ebp
+	cmpl	200(%esp),%ebp
+	je	L003done
+	movdqa	160(%esp),%xmm7
+	movdqa	176(%esp),%xmm6
+	movdqu	(%ebp),%xmm0
+	movdqu	16(%ebp),%xmm1
+	movdqu	32(%ebp),%xmm2
+	movdqu	48(%ebp),%xmm3
+	addl	$64,%ebp
+.byte	102,15,56,0,198
+	movl	%ebp,196(%esp)
+	movdqa	%xmm7,96(%esp)
+	addl	16(%esp),%ebx
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	rorl	$7,%edx
+.byte	102,15,56,0,206
+	addl	%ecx,%ebx
+	addl	20(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	paddd	%xmm7,%xmm0
+	roll	$5,%ebx
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
+	movdqa	%xmm0,(%esp)
+	addl	%ebx,%eax
+	addl	24(%esp),%edi
+	xorl	%ecx,%esi
+	movl	%eax,%ebp
+	psubd	%xmm7,%xmm0
+	roll	$5,%eax
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	rorl	$7,%ebx
+	addl	%eax,%edi
+	addl	28(%esp),%edx
+	xorl	%ebx,%ebp
+	movl	%edi,%esi
+	roll	$5,%edi
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
+	addl	%edi,%edx
+	addl	32(%esp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%ebp
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	rorl	$7,%edi
+.byte	102,15,56,0,214
+	addl	%edx,%ecx
+	addl	36(%esp),%ebx
+	xorl	%edi,%ebp
+	movl	%ecx,%esi
+	paddd	%xmm7,%xmm1
+	roll	$5,%ecx
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	rorl	$7,%edx
+	movdqa	%xmm1,16(%esp)
+	addl	%ecx,%ebx
+	addl	40(%esp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%ebp
+	psubd	%xmm7,%xmm1
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	44(%esp),%edi
+	xorl	%ecx,%ebp
+	movl	%eax,%esi
+	roll	$5,%eax
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	addl	%eax,%edi
+	addl	48(%esp),%edx
+	xorl	%ebx,%esi
+	movl	%edi,%ebp
+	roll	$5,%edi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	rorl	$7,%eax
+.byte	102,15,56,0,222
+	addl	%edi,%edx
+	addl	52(%esp),%ecx
+	xorl	%eax,%ebp
+	movl	%edx,%esi
+	paddd	%xmm7,%xmm2
+	roll	$5,%edx
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%edi
+	movdqa	%xmm2,32(%esp)
+	addl	%edx,%ecx
+	addl	56(%esp),%ebx
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	psubd	%xmm7,%xmm2
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	60(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	addl	%ebp,%eax
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	movl	192(%esp),%ebp
+	addl	(%ebp),%eax
+	addl	4(%ebp),%esi
+	addl	8(%ebp),%ecx
+	movl	%eax,(%ebp)
+	addl	12(%ebp),%edx
+	movl	%esi,4(%ebp)
+	addl	16(%ebp),%edi
+	movl	%ecx,8(%ebp)
+	movl	%ecx,%ebx
+	movl	%edx,12(%ebp)
+	xorl	%edx,%ebx
+	movl	%edi,16(%ebp)
+	movl	%esi,%ebp
+	pshufd	$238,%xmm0,%xmm4
+	andl	%ebx,%esi
+	movl	%ebp,%ebx
+	jmp	L002loop
+.align	4,0x90
+L003done:
+	addl	16(%esp),%ebx
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	20(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	24(%esp),%edi
+	xorl	%ecx,%esi
+	movl	%eax,%ebp
+	roll	$5,%eax
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	rorl	$7,%ebx
+	addl	%eax,%edi
+	addl	28(%esp),%edx
+	xorl	%ebx,%ebp
+	movl	%edi,%esi
+	roll	$5,%edi
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
+	addl	%edi,%edx
+	addl	32(%esp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%ebp
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	rorl	$7,%edi
+	addl	%edx,%ecx
+	addl	36(%esp),%ebx
+	xorl	%edi,%ebp
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	40(%esp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	44(%esp),%edi
+	xorl	%ecx,%ebp
+	movl	%eax,%esi
+	roll	$5,%eax
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	addl	%eax,%edi
+	addl	48(%esp),%edx
+	xorl	%ebx,%esi
+	movl	%edi,%ebp
+	roll	$5,%edi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	rorl	$7,%eax
+	addl	%edi,%edx
+	addl	52(%esp),%ecx
+	xorl	%eax,%ebp
+	movl	%edx,%esi
+	roll	$5,%edx
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%edi
+	addl	%edx,%ecx
+	addl	56(%esp),%ebx
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	60(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	addl	%ebp,%eax
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	movl	192(%esp),%ebp
+	addl	(%ebp),%eax
+	movl	204(%esp),%esp
+	addl	4(%ebp),%esi
+	addl	8(%ebp),%ecx
+	movl	%eax,(%ebp)
+	addl	12(%ebp),%edx
+	movl	%esi,4(%ebp)
+	addl	16(%ebp),%edi
+	movl	%ecx,8(%ebp)
+	movl	%edx,12(%ebp)
+	movl	%edi,16(%ebp)
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
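
The pxor/pslld/psrld/por clusters threaded through the rounds above, and the vpalignr/vpxor clusters in the AVX routine below, compute the SHA-1 message-schedule expansion four words at a time, keeping a sliding window of schedule words in %xmm registers and a few stack slots. As a minimal scalar sketch of that recurrence (standard FIPS 180-4 SHA-1, not part of the generated output; the function name is illustrative):

    #include <stdint.h>

    uint32_t rol1(uint32_t x) { return (x << 1) | (x >> 31); }

    /* w[0..15] is the byte-swapped 64-byte input block; the SIMD code
       above never materializes all 80 words at once. */
    void sha1_schedule(uint32_t w[80]) {
        for (int t = 16; t < 80; t++)
            w[t] = rol1(w[t - 3] ^ w[t - 8] ^ w[t - 14] ^ w[t - 16]);
    }
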
+.globl	_sha1_block_data_order_avx
+.private_extern	_sha1_block_data_order_avx
+.align	4
+_sha1_block_data_order_avx:
+L_sha1_block_data_order_avx_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	call	L004pic_point
+L004pic_point:
+	popl	%ebp
+	leal	LK_XX_XX-L004pic_point(%ebp),%ebp
+	vzeroall
+	vmovdqa	(%ebp),%xmm7
+	vmovdqa	16(%ebp),%xmm0
+	vmovdqa	32(%ebp),%xmm1
+	vmovdqa	48(%ebp),%xmm2
+	vmovdqa	64(%ebp),%xmm6
+	movl	20(%esp),%edi
+	movl	24(%esp),%ebp
+	movl	28(%esp),%edx
+	movl	%esp,%esi
+	subl	$208,%esp
+	andl	$-64,%esp
+	vmovdqa	%xmm0,112(%esp)
+	vmovdqa	%xmm1,128(%esp)
+	vmovdqa	%xmm2,144(%esp)
+	shll	$6,%edx
+	vmovdqa	%xmm7,160(%esp)
+	addl	%ebp,%edx
+	vmovdqa	%xmm6,176(%esp)
+	addl	$64,%ebp
+	movl	%edi,192(%esp)
+	movl	%ebp,196(%esp)
+	movl	%edx,200(%esp)
+	movl	%esi,204(%esp)
+	movl	(%edi),%eax
+	movl	4(%edi),%ebx
+	movl	8(%edi),%ecx
+	movl	12(%edi),%edx
+	movl	16(%edi),%edi
+	movl	%ebx,%esi
+	vmovdqu	-64(%ebp),%xmm0
+	vmovdqu	-48(%ebp),%xmm1
+	vmovdqu	-32(%ebp),%xmm2
+	vmovdqu	-16(%ebp),%xmm3
+	vpshufb	%xmm6,%xmm0,%xmm0
+	vpshufb	%xmm6,%xmm1,%xmm1
+	vpshufb	%xmm6,%xmm2,%xmm2
+	vmovdqa	%xmm7,96(%esp)
+	vpshufb	%xmm6,%xmm3,%xmm3
+	vpaddd	%xmm7,%xmm0,%xmm4
+	vpaddd	%xmm7,%xmm1,%xmm5
+	vpaddd	%xmm7,%xmm2,%xmm6
+	vmovdqa	%xmm4,(%esp)
+	movl	%ecx,%ebp
+	vmovdqa	%xmm5,16(%esp)
+	xorl	%edx,%ebp
+	vmovdqa	%xmm6,32(%esp)
+	andl	%ebp,%esi
+	jmp	L005loop
+.align	4,0x90
+L005loop:
+	shrdl	$2,%ebx,%ebx
+	xorl	%edx,%esi
+	vpalignr	$8,%xmm0,%xmm1,%xmm4
+	movl	%eax,%ebp
+	addl	(%esp),%edi
+	vpaddd	%xmm3,%xmm7,%xmm7
+	vmovdqa	%xmm0,64(%esp)
+	xorl	%ecx,%ebx
+	shldl	$5,%eax,%eax
+	vpsrldq	$4,%xmm3,%xmm6
+	addl	%esi,%edi
+	andl	%ebx,%ebp
+	vpxor	%xmm0,%xmm4,%xmm4
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	vpxor	%xmm2,%xmm6,%xmm6
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%ebp
+	vmovdqa	%xmm7,48(%esp)
+	movl	%edi,%esi
+	addl	4(%esp),%edx
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%ebx,%eax
+	shldl	$5,%edi,%edi
+	addl	%ebp,%edx
+	andl	%eax,%esi
+	vpsrld	$31,%xmm4,%xmm6
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	shrdl	$7,%edi,%edi
+	xorl	%ebx,%esi
+	vpslldq	$12,%xmm4,%xmm0
+	vpaddd	%xmm4,%xmm4,%xmm4
+	movl	%edx,%ebp
+	addl	8(%esp),%ecx
+	xorl	%eax,%edi
+	shldl	$5,%edx,%edx
+	vpsrld	$30,%xmm0,%xmm7
+	vpor	%xmm6,%xmm4,%xmm4
+	addl	%esi,%ecx
+	andl	%edi,%ebp
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	vpslld	$2,%xmm0,%xmm0
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%ebp
+	vpxor	%xmm7,%xmm4,%xmm4
+	movl	%ecx,%esi
+	addl	12(%esp),%ebx
+	xorl	%edi,%edx
+	shldl	$5,%ecx,%ecx
+	vpxor	%xmm0,%xmm4,%xmm4
+	addl	%ebp,%ebx
+	andl	%edx,%esi
+	vmovdqa	96(%esp),%xmm0
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	shrdl	$7,%ecx,%ecx
+	xorl	%edi,%esi
+	vpalignr	$8,%xmm1,%xmm2,%xmm5
+	movl	%ebx,%ebp
+	addl	16(%esp),%eax
+	vpaddd	%xmm4,%xmm0,%xmm0
+	vmovdqa	%xmm1,80(%esp)
+	xorl	%edx,%ecx
+	shldl	$5,%ebx,%ebx
+	vpsrldq	$4,%xmm4,%xmm7
+	addl	%esi,%eax
+	andl	%ecx,%ebp
+	vpxor	%xmm1,%xmm5,%xmm5
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	vpxor	%xmm3,%xmm7,%xmm7
+	shrdl	$7,%ebx,%ebx
+	xorl	%edx,%ebp
+	vmovdqa	%xmm0,(%esp)
+	movl	%eax,%esi
+	addl	20(%esp),%edi
+	vpxor	%xmm7,%xmm5,%xmm5
+	xorl	%ecx,%ebx
+	shldl	$5,%eax,%eax
+	addl	%ebp,%edi
+	andl	%ebx,%esi
+	vpsrld	$31,%xmm5,%xmm7
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%esi
+	vpslldq	$12,%xmm5,%xmm1
+	vpaddd	%xmm5,%xmm5,%xmm5
+	movl	%edi,%ebp
+	addl	24(%esp),%edx
+	xorl	%ebx,%eax
+	shldl	$5,%edi,%edi
+	vpsrld	$30,%xmm1,%xmm0
+	vpor	%xmm7,%xmm5,%xmm5
+	addl	%esi,%edx
+	andl	%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	vpslld	$2,%xmm1,%xmm1
+	shrdl	$7,%edi,%edi
+	xorl	%ebx,%ebp
+	vpxor	%xmm0,%xmm5,%xmm5
+	movl	%edx,%esi
+	addl	28(%esp),%ecx
+	xorl	%eax,%edi
+	shldl	$5,%edx,%edx
+	vpxor	%xmm1,%xmm5,%xmm5
+	addl	%ebp,%ecx
+	andl	%edi,%esi
+	vmovdqa	112(%esp),%xmm1
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%esi
+	vpalignr	$8,%xmm2,%xmm3,%xmm6
+	movl	%ecx,%ebp
+	addl	32(%esp),%ebx
+	vpaddd	%xmm5,%xmm1,%xmm1
+	vmovdqa	%xmm2,96(%esp)
+	xorl	%edi,%edx
+	shldl	$5,%ecx,%ecx
+	vpsrldq	$4,%xmm5,%xmm0
+	addl	%esi,%ebx
+	andl	%edx,%ebp
+	vpxor	%xmm2,%xmm6,%xmm6
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	vpxor	%xmm4,%xmm0,%xmm0
+	shrdl	$7,%ecx,%ecx
+	xorl	%edi,%ebp
+	vmovdqa	%xmm1,16(%esp)
+	movl	%ebx,%esi
+	addl	36(%esp),%eax
+	vpxor	%xmm0,%xmm6,%xmm6
+	xorl	%edx,%ecx
+	shldl	$5,%ebx,%ebx
+	addl	%ebp,%eax
+	andl	%ecx,%esi
+	vpsrld	$31,%xmm6,%xmm0
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	shrdl	$7,%ebx,%ebx
+	xorl	%edx,%esi
+	vpslldq	$12,%xmm6,%xmm2
+	vpaddd	%xmm6,%xmm6,%xmm6
+	movl	%eax,%ebp
+	addl	40(%esp),%edi
+	xorl	%ecx,%ebx
+	shldl	$5,%eax,%eax
+	vpsrld	$30,%xmm2,%xmm1
+	vpor	%xmm0,%xmm6,%xmm6
+	addl	%esi,%edi
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	vpslld	$2,%xmm2,%xmm2
+	vmovdqa	64(%esp),%xmm0
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%ebp
+	vpxor	%xmm1,%xmm6,%xmm6
+	movl	%edi,%esi
+	addl	44(%esp),%edx
+	xorl	%ebx,%eax
+	shldl	$5,%edi,%edi
+	vpxor	%xmm2,%xmm6,%xmm6
+	addl	%ebp,%edx
+	andl	%eax,%esi
+	vmovdqa	112(%esp),%xmm2
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	shrdl	$7,%edi,%edi
+	xorl	%ebx,%esi
+	vpalignr	$8,%xmm3,%xmm4,%xmm7
+	movl	%edx,%ebp
+	addl	48(%esp),%ecx
+	vpaddd	%xmm6,%xmm2,%xmm2
+	vmovdqa	%xmm3,64(%esp)
+	xorl	%eax,%edi
+	shldl	$5,%edx,%edx
+	vpsrldq	$4,%xmm6,%xmm1
+	addl	%esi,%ecx
+	andl	%edi,%ebp
+	vpxor	%xmm3,%xmm7,%xmm7
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	vpxor	%xmm5,%xmm1,%xmm1
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%ebp
+	vmovdqa	%xmm2,32(%esp)
+	movl	%ecx,%esi
+	addl	52(%esp),%ebx
+	vpxor	%xmm1,%xmm7,%xmm7
+	xorl	%edi,%edx
+	shldl	$5,%ecx,%ecx
+	addl	%ebp,%ebx
+	andl	%edx,%esi
+	vpsrld	$31,%xmm7,%xmm1
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	shrdl	$7,%ecx,%ecx
+	xorl	%edi,%esi
+	vpslldq	$12,%xmm7,%xmm3
+	vpaddd	%xmm7,%xmm7,%xmm7
+	movl	%ebx,%ebp
+	addl	56(%esp),%eax
+	xorl	%edx,%ecx
+	shldl	$5,%ebx,%ebx
+	vpsrld	$30,%xmm3,%xmm2
+	vpor	%xmm1,%xmm7,%xmm7
+	addl	%esi,%eax
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	vpslld	$2,%xmm3,%xmm3
+	vmovdqa	80(%esp),%xmm1
+	shrdl	$7,%ebx,%ebx
+	xorl	%edx,%ebp
+	vpxor	%xmm2,%xmm7,%xmm7
+	movl	%eax,%esi
+	addl	60(%esp),%edi
+	xorl	%ecx,%ebx
+	shldl	$5,%eax,%eax
+	vpxor	%xmm3,%xmm7,%xmm7
+	addl	%ebp,%edi
+	andl	%ebx,%esi
+	vmovdqa	112(%esp),%xmm3
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	vpalignr	$8,%xmm6,%xmm7,%xmm2
+	vpxor	%xmm4,%xmm0,%xmm0
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%esi
+	movl	%edi,%ebp
+	addl	(%esp),%edx
+	vpxor	%xmm1,%xmm0,%xmm0
+	vmovdqa	%xmm4,80(%esp)
+	xorl	%ebx,%eax
+	shldl	$5,%edi,%edi
+	vmovdqa	%xmm3,%xmm4
+	vpaddd	%xmm7,%xmm3,%xmm3
+	addl	%esi,%edx
+	andl	%eax,%ebp
+	vpxor	%xmm2,%xmm0,%xmm0
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	shrdl	$7,%edi,%edi
+	xorl	%ebx,%ebp
+	vpsrld	$30,%xmm0,%xmm2
+	vmovdqa	%xmm3,48(%esp)
+	movl	%edx,%esi
+	addl	4(%esp),%ecx
+	xorl	%eax,%edi
+	shldl	$5,%edx,%edx
+	vpslld	$2,%xmm0,%xmm0
+	addl	%ebp,%ecx
+	andl	%edi,%esi
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%esi
+	movl	%ecx,%ebp
+	addl	8(%esp),%ebx
+	vpor	%xmm2,%xmm0,%xmm0
+	xorl	%edi,%edx
+	shldl	$5,%ecx,%ecx
+	vmovdqa	96(%esp),%xmm2
+	addl	%esi,%ebx
+	andl	%edx,%ebp
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	addl	12(%esp),%eax
+	xorl	%edi,%ebp
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vpalignr	$8,%xmm7,%xmm0,%xmm3
+	vpxor	%xmm5,%xmm1,%xmm1
+	addl	16(%esp),%edi
+	xorl	%ecx,%esi
+	movl	%eax,%ebp
+	shldl	$5,%eax,%eax
+	vpxor	%xmm2,%xmm1,%xmm1
+	vmovdqa	%xmm5,96(%esp)
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	vmovdqa	%xmm4,%xmm5
+	vpaddd	%xmm0,%xmm4,%xmm4
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	vpxor	%xmm3,%xmm1,%xmm1
+	addl	20(%esp),%edx
+	xorl	%ebx,%ebp
+	movl	%edi,%esi
+	shldl	$5,%edi,%edi
+	vpsrld	$30,%xmm1,%xmm3
+	vmovdqa	%xmm4,(%esp)
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	vpslld	$2,%xmm1,%xmm1
+	addl	24(%esp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%ebp
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	vpor	%xmm3,%xmm1,%xmm1
+	addl	28(%esp),%ebx
+	xorl	%edi,%ebp
+	vmovdqa	64(%esp),%xmm3
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vpalignr	$8,%xmm0,%xmm1,%xmm4
+	vpxor	%xmm6,%xmm2,%xmm2
+	addl	32(%esp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%ebp
+	shldl	$5,%ebx,%ebx
+	vpxor	%xmm3,%xmm2,%xmm2
+	vmovdqa	%xmm6,64(%esp)
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	vmovdqa	128(%esp),%xmm6
+	vpaddd	%xmm1,%xmm5,%xmm5
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vpxor	%xmm4,%xmm2,%xmm2
+	addl	36(%esp),%edi
+	xorl	%ecx,%ebp
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	vpsrld	$30,%xmm2,%xmm4
+	vmovdqa	%xmm5,16(%esp)
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	vpslld	$2,%xmm2,%xmm2
+	addl	40(%esp),%edx
+	xorl	%ebx,%esi
+	movl	%edi,%ebp
+	shldl	$5,%edi,%edi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	vpor	%xmm4,%xmm2,%xmm2
+	addl	44(%esp),%ecx
+	xorl	%eax,%ebp
+	vmovdqa	80(%esp),%xmm4
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	vpalignr	$8,%xmm1,%xmm2,%xmm5
+	vpxor	%xmm7,%xmm3,%xmm3
+	addl	48(%esp),%ebx
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	shldl	$5,%ecx,%ecx
+	vpxor	%xmm4,%xmm3,%xmm3
+	vmovdqa	%xmm7,80(%esp)
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	vmovdqa	%xmm6,%xmm7
+	vpaddd	%xmm2,%xmm6,%xmm6
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vpxor	%xmm5,%xmm3,%xmm3
+	addl	52(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	vpsrld	$30,%xmm3,%xmm5
+	vmovdqa	%xmm6,32(%esp)
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vpslld	$2,%xmm3,%xmm3
+	addl	56(%esp),%edi
+	xorl	%ecx,%esi
+	movl	%eax,%ebp
+	shldl	$5,%eax,%eax
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	vpor	%xmm5,%xmm3,%xmm3
+	addl	60(%esp),%edx
+	xorl	%ebx,%ebp
+	vmovdqa	96(%esp),%xmm5
+	movl	%edi,%esi
+	shldl	$5,%edi,%edi
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	vpalignr	$8,%xmm2,%xmm3,%xmm6
+	vpxor	%xmm0,%xmm4,%xmm4
+	addl	(%esp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%ebp
+	shldl	$5,%edx,%edx
+	vpxor	%xmm5,%xmm4,%xmm4
+	vmovdqa	%xmm0,96(%esp)
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	vmovdqa	%xmm7,%xmm0
+	vpaddd	%xmm3,%xmm7,%xmm7
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	vpxor	%xmm6,%xmm4,%xmm4
+	addl	4(%esp),%ebx
+	xorl	%edi,%ebp
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	vpsrld	$30,%xmm4,%xmm6
+	vmovdqa	%xmm7,48(%esp)
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vpslld	$2,%xmm4,%xmm4
+	addl	8(%esp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%ebp
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vpor	%xmm6,%xmm4,%xmm4
+	addl	12(%esp),%edi
+	xorl	%ecx,%ebp
+	vmovdqa	64(%esp),%xmm6
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	vpalignr	$8,%xmm3,%xmm4,%xmm7
+	vpxor	%xmm1,%xmm5,%xmm5
+	addl	16(%esp),%edx
+	xorl	%ebx,%esi
+	movl	%edi,%ebp
+	shldl	$5,%edi,%edi
+	vpxor	%xmm6,%xmm5,%xmm5
+	vmovdqa	%xmm1,64(%esp)
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	vmovdqa	%xmm0,%xmm1
+	vpaddd	%xmm4,%xmm0,%xmm0
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	vpxor	%xmm7,%xmm5,%xmm5
+	addl	20(%esp),%ecx
+	xorl	%eax,%ebp
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	vpsrld	$30,%xmm5,%xmm7
+	vmovdqa	%xmm0,(%esp)
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	vpslld	$2,%xmm5,%xmm5
+	addl	24(%esp),%ebx
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vpor	%xmm7,%xmm5,%xmm5
+	addl	28(%esp),%eax
+	vmovdqa	80(%esp),%xmm7
+	shrdl	$7,%ecx,%ecx
+	movl	%ebx,%esi
+	xorl	%edx,%ebp
+	shldl	$5,%ebx,%ebx
+	addl	%ebp,%eax
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	vpalignr	$8,%xmm4,%xmm5,%xmm0
+	vpxor	%xmm2,%xmm6,%xmm6
+	addl	32(%esp),%edi
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	vpxor	%xmm7,%xmm6,%xmm6
+	vmovdqa	%xmm2,80(%esp)
+	movl	%eax,%ebp
+	xorl	%ecx,%esi
+	vmovdqa	%xmm1,%xmm2
+	vpaddd	%xmm5,%xmm1,%xmm1
+	shldl	$5,%eax,%eax
+	addl	%esi,%edi
+	vpxor	%xmm0,%xmm6,%xmm6
+	xorl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	addl	36(%esp),%edx
+	vpsrld	$30,%xmm6,%xmm0
+	vmovdqa	%xmm1,16(%esp)
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	movl	%edi,%esi
+	vpslld	$2,%xmm6,%xmm6
+	xorl	%ebx,%ebp
+	shldl	$5,%edi,%edi
+	addl	%ebp,%edx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	addl	40(%esp),%ecx
+	andl	%eax,%esi
+	vpor	%xmm0,%xmm6,%xmm6
+	xorl	%ebx,%eax
+	shrdl	$7,%edi,%edi
+	vmovdqa	96(%esp),%xmm0
+	movl	%edx,%ebp
+	xorl	%eax,%esi
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%edi,%ebp
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	addl	44(%esp),%ebx
+	andl	%edi,%ebp
+	xorl	%eax,%edi
+	shrdl	$7,%edx,%edx
+	movl	%ecx,%esi
+	xorl	%edi,%ebp
+	shldl	$5,%ecx,%ecx
+	addl	%ebp,%ebx
+	xorl	%edx,%esi
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	vpalignr	$8,%xmm5,%xmm6,%xmm1
+	vpxor	%xmm3,%xmm7,%xmm7
+	addl	48(%esp),%eax
+	andl	%edx,%esi
+	xorl	%edi,%edx
+	shrdl	$7,%ecx,%ecx
+	vpxor	%xmm0,%xmm7,%xmm7
+	vmovdqa	%xmm3,96(%esp)
+	movl	%ebx,%ebp
+	xorl	%edx,%esi
+	vmovdqa	144(%esp),%xmm3
+	vpaddd	%xmm6,%xmm2,%xmm2
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	vpxor	%xmm1,%xmm7,%xmm7
+	xorl	%ecx,%ebp
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	52(%esp),%edi
+	vpsrld	$30,%xmm7,%xmm1
+	vmovdqa	%xmm2,32(%esp)
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	movl	%eax,%esi
+	vpslld	$2,%xmm7,%xmm7
+	xorl	%ecx,%ebp
+	shldl	$5,%eax,%eax
+	addl	%ebp,%edi
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	addl	56(%esp),%edx
+	andl	%ebx,%esi
+	vpor	%xmm1,%xmm7,%xmm7
+	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	vmovdqa	64(%esp),%xmm1
+	movl	%edi,%ebp
+	xorl	%ebx,%esi
+	shldl	$5,%edi,%edi
+	addl	%esi,%edx
+	xorl	%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	addl	60(%esp),%ecx
+	andl	%eax,%ebp
+	xorl	%ebx,%eax
+	shrdl	$7,%edi,%edi
+	movl	%edx,%esi
+	xorl	%eax,%ebp
+	shldl	$5,%edx,%edx
+	addl	%ebp,%ecx
+	xorl	%edi,%esi
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	vpalignr	$8,%xmm6,%xmm7,%xmm2
+	vpxor	%xmm4,%xmm0,%xmm0
+	addl	(%esp),%ebx
+	andl	%edi,%esi
+	xorl	%eax,%edi
+	shrdl	$7,%edx,%edx
+	vpxor	%xmm1,%xmm0,%xmm0
+	vmovdqa	%xmm4,64(%esp)
+	movl	%ecx,%ebp
+	xorl	%edi,%esi
+	vmovdqa	%xmm3,%xmm4
+	vpaddd	%xmm7,%xmm3,%xmm3
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	vpxor	%xmm2,%xmm0,%xmm0
+	xorl	%edx,%ebp
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	addl	4(%esp),%eax
+	vpsrld	$30,%xmm0,%xmm2
+	vmovdqa	%xmm3,48(%esp)
+	andl	%edx,%ebp
+	xorl	%edi,%edx
+	shrdl	$7,%ecx,%ecx
+	movl	%ebx,%esi
+	vpslld	$2,%xmm0,%xmm0
+	xorl	%edx,%ebp
+	shldl	$5,%ebx,%ebx
+	addl	%ebp,%eax
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	8(%esp),%edi
+	andl	%ecx,%esi
+	vpor	%xmm2,%xmm0,%xmm0
+	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	vmovdqa	80(%esp),%xmm2
+	movl	%eax,%ebp
+	xorl	%ecx,%esi
+	shldl	$5,%eax,%eax
+	addl	%esi,%edi
+	xorl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	addl	12(%esp),%edx
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	movl	%edi,%esi
+	xorl	%ebx,%ebp
+	shldl	$5,%edi,%edi
+	addl	%ebp,%edx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	vpalignr	$8,%xmm7,%xmm0,%xmm3
+	vpxor	%xmm5,%xmm1,%xmm1
+	addl	16(%esp),%ecx
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	shrdl	$7,%edi,%edi
+	vpxor	%xmm2,%xmm1,%xmm1
+	vmovdqa	%xmm5,80(%esp)
+	movl	%edx,%ebp
+	xorl	%eax,%esi
+	vmovdqa	%xmm4,%xmm5
+	vpaddd	%xmm0,%xmm4,%xmm4
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	vpxor	%xmm3,%xmm1,%xmm1
+	xorl	%edi,%ebp
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	addl	20(%esp),%ebx
+	vpsrld	$30,%xmm1,%xmm3
+	vmovdqa	%xmm4,(%esp)
+	andl	%edi,%ebp
+	xorl	%eax,%edi
+	shrdl	$7,%edx,%edx
+	movl	%ecx,%esi
+	vpslld	$2,%xmm1,%xmm1
+	xorl	%edi,%ebp
+	shldl	$5,%ecx,%ecx
+	addl	%ebp,%ebx
+	xorl	%edx,%esi
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	addl	24(%esp),%eax
+	andl	%edx,%esi
+	vpor	%xmm3,%xmm1,%xmm1
+	xorl	%edi,%edx
+	shrdl	$7,%ecx,%ecx
+	vmovdqa	96(%esp),%xmm3
+	movl	%ebx,%ebp
+	xorl	%edx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%ecx,%ebp
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	28(%esp),%edi
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	movl	%eax,%esi
+	xorl	%ecx,%ebp
+	shldl	$5,%eax,%eax
+	addl	%ebp,%edi
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	vpalignr	$8,%xmm0,%xmm1,%xmm4
+	vpxor	%xmm6,%xmm2,%xmm2
+	addl	32(%esp),%edx
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	vpxor	%xmm3,%xmm2,%xmm2
+	vmovdqa	%xmm6,96(%esp)
+	movl	%edi,%ebp
+	xorl	%ebx,%esi
+	vmovdqa	%xmm5,%xmm6
+	vpaddd	%xmm1,%xmm5,%xmm5
+	shldl	$5,%edi,%edi
+	addl	%esi,%edx
+	vpxor	%xmm4,%xmm2,%xmm2
+	xorl	%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	addl	36(%esp),%ecx
+	vpsrld	$30,%xmm2,%xmm4
+	vmovdqa	%xmm5,16(%esp)
+	andl	%eax,%ebp
+	xorl	%ebx,%eax
+	shrdl	$7,%edi,%edi
+	movl	%edx,%esi
+	vpslld	$2,%xmm2,%xmm2
+	xorl	%eax,%ebp
+	shldl	$5,%edx,%edx
+	addl	%ebp,%ecx
+	xorl	%edi,%esi
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	addl	40(%esp),%ebx
+	andl	%edi,%esi
+	vpor	%xmm4,%xmm2,%xmm2
+	xorl	%eax,%edi
+	shrdl	$7,%edx,%edx
+	vmovdqa	64(%esp),%xmm4
+	movl	%ecx,%ebp
+	xorl	%edi,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%edx,%ebp
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	addl	44(%esp),%eax
+	andl	%edx,%ebp
+	xorl	%edi,%edx
+	shrdl	$7,%ecx,%ecx
+	movl	%ebx,%esi
+	xorl	%edx,%ebp
+	shldl	$5,%ebx,%ebx
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	vpalignr	$8,%xmm1,%xmm2,%xmm5
+	vpxor	%xmm7,%xmm3,%xmm3
+	addl	48(%esp),%edi
+	xorl	%ecx,%esi
+	movl	%eax,%ebp
+	shldl	$5,%eax,%eax
+	vpxor	%xmm4,%xmm3,%xmm3
+	vmovdqa	%xmm7,64(%esp)
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	vmovdqa	%xmm6,%xmm7
+	vpaddd	%xmm2,%xmm6,%xmm6
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	vpxor	%xmm5,%xmm3,%xmm3
+	addl	52(%esp),%edx
+	xorl	%ebx,%ebp
+	movl	%edi,%esi
+	shldl	$5,%edi,%edi
+	vpsrld	$30,%xmm3,%xmm5
+	vmovdqa	%xmm6,32(%esp)
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	vpslld	$2,%xmm3,%xmm3
+	addl	56(%esp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%ebp
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	vpor	%xmm5,%xmm3,%xmm3
+	addl	60(%esp),%ebx
+	xorl	%edi,%ebp
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	(%esp),%eax
+	vpaddd	%xmm3,%xmm7,%xmm7
+	xorl	%edx,%esi
+	movl	%ebx,%ebp
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	vmovdqa	%xmm7,48(%esp)
+	xorl	%edx,%ebp
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	4(%esp),%edi
+	xorl	%ecx,%ebp
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	addl	8(%esp),%edx
+	xorl	%ebx,%esi
+	movl	%edi,%ebp
+	shldl	$5,%edi,%edi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	addl	12(%esp),%ecx
+	xorl	%eax,%ebp
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	movl	196(%esp),%ebp
+	cmpl	200(%esp),%ebp
+	je	L006done
+	vmovdqa	160(%esp),%xmm7
+	vmovdqa	176(%esp),%xmm6
+	vmovdqu	(%ebp),%xmm0
+	vmovdqu	16(%ebp),%xmm1
+	vmovdqu	32(%ebp),%xmm2
+	vmovdqu	48(%ebp),%xmm3
+	addl	$64,%ebp
+	vpshufb	%xmm6,%xmm0,%xmm0
+	movl	%ebp,196(%esp)
+	vmovdqa	%xmm7,96(%esp)
+	addl	16(%esp),%ebx
+	xorl	%edi,%esi
+	vpshufb	%xmm6,%xmm1,%xmm1
+	movl	%ecx,%ebp
+	shldl	$5,%ecx,%ecx
+	vpaddd	%xmm7,%xmm0,%xmm4
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vmovdqa	%xmm4,(%esp)
+	addl	20(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	24(%esp),%edi
+	xorl	%ecx,%esi
+	movl	%eax,%ebp
+	shldl	$5,%eax,%eax
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	addl	28(%esp),%edx
+	xorl	%ebx,%ebp
+	movl	%edi,%esi
+	shldl	$5,%edi,%edi
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	addl	32(%esp),%ecx
+	xorl	%eax,%esi
+	vpshufb	%xmm6,%xmm2,%xmm2
+	movl	%edx,%ebp
+	shldl	$5,%edx,%edx
+	vpaddd	%xmm7,%xmm1,%xmm5
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	vmovdqa	%xmm5,16(%esp)
+	addl	36(%esp),%ebx
+	xorl	%edi,%ebp
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	40(%esp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%ebp
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	44(%esp),%edi
+	xorl	%ecx,%ebp
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	addl	48(%esp),%edx
+	xorl	%ebx,%esi
+	vpshufb	%xmm6,%xmm3,%xmm3
+	movl	%edi,%ebp
+	shldl	$5,%edi,%edi
+	vpaddd	%xmm7,%xmm2,%xmm6
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	vmovdqa	%xmm6,32(%esp)
+	addl	52(%esp),%ecx
+	xorl	%eax,%ebp
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	addl	56(%esp),%ebx
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	60(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%ebp,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	movl	192(%esp),%ebp
+	addl	(%ebp),%eax
+	addl	4(%ebp),%esi
+	addl	8(%ebp),%ecx
+	movl	%eax,(%ebp)
+	addl	12(%ebp),%edx
+	movl	%esi,4(%ebp)
+	addl	16(%ebp),%edi
+	movl	%ecx,%ebx
+	movl	%ecx,8(%ebp)
+	xorl	%edx,%ebx
+	movl	%edx,12(%ebp)
+	movl	%edi,16(%ebp)
+	movl	%esi,%ebp
+	andl	%ebx,%esi
+	movl	%ebp,%ebx
+	jmp	L005loop
+.align	4,0x90
+L006done:
+	addl	16(%esp),%ebx
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	20(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	24(%esp),%edi
+	xorl	%ecx,%esi
+	movl	%eax,%ebp
+	shldl	$5,%eax,%eax
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	addl	28(%esp),%edx
+	xorl	%ebx,%ebp
+	movl	%edi,%esi
+	shldl	$5,%edi,%edi
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	addl	32(%esp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%ebp
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	addl	36(%esp),%ebx
+	xorl	%edi,%ebp
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	40(%esp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%ebp
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	44(%esp),%edi
+	xorl	%ecx,%ebp
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	addl	48(%esp),%edx
+	xorl	%ebx,%esi
+	movl	%edi,%ebp
+	shldl	$5,%edi,%edi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	addl	52(%esp),%ecx
+	xorl	%eax,%ebp
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	addl	56(%esp),%ebx
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	60(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%ebp,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vzeroall
+	movl	192(%esp),%ebp
+	addl	(%ebp),%eax
+	movl	204(%esp),%esp
+	addl	4(%ebp),%esi
+	addl	8(%ebp),%ecx
+	movl	%eax,(%ebp)
+	addl	12(%ebp),%edx
+	movl	%esi,4(%ebp)
+	addl	16(%ebp),%edi
+	movl	%ecx,8(%ebp)
+	movl	%edx,12(%ebp)
+	movl	%edi,16(%ebp)
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	6,0x90
+LK_XX_XX:
+.long	1518500249,1518500249,1518500249,1518500249
+.long	1859775393,1859775393,1859775393,1859775393
+.long	2400959708,2400959708,2400959708,2400959708
+.long	3395469782,3395469782,3395469782,3395469782
+.long	66051,67438087,134810123,202182159
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.byte	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115
+.byte	102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82
+.byte	89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
+.byte	114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
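
For reference, the LK_XX_XX table that closes the file above decodes as follows: the first four .long rows are the SHA-1 round constants 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC and 0xCA62C1D6, each repeated four times so one 128-bit load broadcasts the constant across all SIMD lanes; the fifth .long row and the trailing .byte row are byte-order shuffle masks for the big-endian message load; and the remaining .byte rows spell the CRYPTOGAMS attribution string. A small self-contained check of the constants (values copied from the table; this program is illustrative, not part of the generated output):

    #include <stdio.h>
    #include <stdint.h>

    int main(void) {
        /* The decimal .long values from LK_XX_XX above. */
        static const uint32_t k[4] = {1518500249u, 1859775393u,
                                      2400959708u, 3395469782u};
        for (int i = 0; i < 4; i++)
            printf("K%d = 0x%08X\n", i + 1, k[i]);
        /* Prints 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6,
           matching FIPS 180-4. */
        return 0;
    }
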
diff --git a/gen/bcm/sha1-586-linux.S b/gen/bcm/sha1-586-linux.S
new file mode 100644
index 0000000..0e5754f
--- /dev/null
+++ b/gen/bcm/sha1-586-linux.S
@@ -0,0 +1,3788 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
+.text
+.globl	sha1_block_data_order_nohw
+.hidden	sha1_block_data_order_nohw
+.type	sha1_block_data_order_nohw,@function
+.align	16
+sha1_block_data_order_nohw:
+.L_sha1_block_data_order_nohw_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%ebp
+	movl	24(%esp),%esi
+	movl	28(%esp),%eax
+	subl	$76,%esp
+	shll	$6,%eax
+	addl	%esi,%eax
+	movl	%eax,104(%esp)
+	movl	16(%ebp),%edi
+	jmp	.L000loop
+.align	16
+.L000loop:
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	8(%esi),%ecx
+	movl	12(%esi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	movl	%eax,(%esp)
+	movl	%ebx,4(%esp)
+	movl	%ecx,8(%esp)
+	movl	%edx,12(%esp)
+	movl	16(%esi),%eax
+	movl	20(%esi),%ebx
+	movl	24(%esi),%ecx
+	movl	28(%esi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	movl	%eax,16(%esp)
+	movl	%ebx,20(%esp)
+	movl	%ecx,24(%esp)
+	movl	%edx,28(%esp)
+	movl	32(%esi),%eax
+	movl	36(%esi),%ebx
+	movl	40(%esi),%ecx
+	movl	44(%esi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	movl	%eax,32(%esp)
+	movl	%ebx,36(%esp)
+	movl	%ecx,40(%esp)
+	movl	%edx,44(%esp)
+	movl	48(%esi),%eax
+	movl	52(%esi),%ebx
+	movl	56(%esi),%ecx
+	movl	60(%esi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	movl	%eax,48(%esp)
+	movl	%ebx,52(%esp)
+	movl	%ecx,56(%esp)
+	movl	%edx,60(%esp)
+	movl	%esi,100(%esp)
+	movl	(%ebp),%eax
+	movl	4(%ebp),%ebx
+	movl	8(%ebp),%ecx
+	movl	12(%ebp),%edx
+
+	movl	%ecx,%esi
+	movl	%eax,%ebp
+	roll	$5,%ebp
+	xorl	%edx,%esi
+	addl	%edi,%ebp
+	movl	(%esp),%edi
+	andl	%ebx,%esi
+	rorl	$2,%ebx
+	xorl	%edx,%esi
+	leal	1518500249(%ebp,%edi,1),%ebp
+	addl	%esi,%ebp
+
+	movl	%ebx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	xorl	%ecx,%edi
+	addl	%edx,%ebp
+	movl	4(%esp),%edx
+	andl	%eax,%edi
+	rorl	$2,%eax
+	xorl	%ecx,%edi
+	leal	1518500249(%ebp,%edx,1),%ebp
+	addl	%edi,%ebp
+
+	movl	%eax,%edx
+	movl	%ebp,%edi
+	roll	$5,%ebp
+	xorl	%ebx,%edx
+	addl	%ecx,%ebp
+	movl	8(%esp),%ecx
+	andl	%esi,%edx
+	rorl	$2,%esi
+	xorl	%ebx,%edx
+	leal	1518500249(%ebp,%ecx,1),%ebp
+	addl	%edx,%ebp
+
+	movl	%esi,%ecx
+	movl	%ebp,%edx
+	roll	$5,%ebp
+	xorl	%eax,%ecx
+	addl	%ebx,%ebp
+	movl	12(%esp),%ebx
+	andl	%edi,%ecx
+	rorl	$2,%edi
+	xorl	%eax,%ecx
+	leal	1518500249(%ebp,%ebx,1),%ebp
+	addl	%ecx,%ebp
+
+	movl	%edi,%ebx
+	movl	%ebp,%ecx
+	roll	$5,%ebp
+	xorl	%esi,%ebx
+	addl	%eax,%ebp
+	movl	16(%esp),%eax
+	andl	%edx,%ebx
+	rorl	$2,%edx
+	xorl	%esi,%ebx
+	leal	1518500249(%ebp,%eax,1),%ebp
+	addl	%ebx,%ebp
+
+	movl	%edx,%eax
+	movl	%ebp,%ebx
+	roll	$5,%ebp
+	xorl	%edi,%eax
+	addl	%esi,%ebp
+	movl	20(%esp),%esi
+	andl	%ecx,%eax
+	rorl	$2,%ecx
+	xorl	%edi,%eax
+	leal	1518500249(%ebp,%esi,1),%ebp
+	addl	%eax,%ebp
+
+	movl	%ecx,%esi
+	movl	%ebp,%eax
+	roll	$5,%ebp
+	xorl	%edx,%esi
+	addl	%edi,%ebp
+	movl	24(%esp),%edi
+	andl	%ebx,%esi
+	rorl	$2,%ebx
+	xorl	%edx,%esi
+	leal	1518500249(%ebp,%edi,1),%ebp
+	addl	%esi,%ebp
+
+	movl	%ebx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	xorl	%ecx,%edi
+	addl	%edx,%ebp
+	movl	28(%esp),%edx
+	andl	%eax,%edi
+	rorl	$2,%eax
+	xorl	%ecx,%edi
+	leal	1518500249(%ebp,%edx,1),%ebp
+	addl	%edi,%ebp
+
+	movl	%eax,%edx
+	movl	%ebp,%edi
+	roll	$5,%ebp
+	xorl	%ebx,%edx
+	addl	%ecx,%ebp
+	movl	32(%esp),%ecx
+	andl	%esi,%edx
+	rorl	$2,%esi
+	xorl	%ebx,%edx
+	leal	1518500249(%ebp,%ecx,1),%ebp
+	addl	%edx,%ebp
+
+	movl	%esi,%ecx
+	movl	%ebp,%edx
+	roll	$5,%ebp
+	xorl	%eax,%ecx
+	addl	%ebx,%ebp
+	movl	36(%esp),%ebx
+	andl	%edi,%ecx
+	rorl	$2,%edi
+	xorl	%eax,%ecx
+	leal	1518500249(%ebp,%ebx,1),%ebp
+	addl	%ecx,%ebp
+
+	movl	%edi,%ebx
+	movl	%ebp,%ecx
+	roll	$5,%ebp
+	xorl	%esi,%ebx
+	addl	%eax,%ebp
+	movl	40(%esp),%eax
+	andl	%edx,%ebx
+	rorl	$2,%edx
+	xorl	%esi,%ebx
+	leal	1518500249(%ebp,%eax,1),%ebp
+	addl	%ebx,%ebp
+
+	movl	%edx,%eax
+	movl	%ebp,%ebx
+	roll	$5,%ebp
+	xorl	%edi,%eax
+	addl	%esi,%ebp
+	movl	44(%esp),%esi
+	andl	%ecx,%eax
+	rorl	$2,%ecx
+	xorl	%edi,%eax
+	leal	1518500249(%ebp,%esi,1),%ebp
+	addl	%eax,%ebp
+
+	movl	%ecx,%esi
+	movl	%ebp,%eax
+	roll	$5,%ebp
+	xorl	%edx,%esi
+	addl	%edi,%ebp
+	movl	48(%esp),%edi
+	andl	%ebx,%esi
+	rorl	$2,%ebx
+	xorl	%edx,%esi
+	leal	1518500249(%ebp,%edi,1),%ebp
+	addl	%esi,%ebp
+
+	movl	%ebx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	xorl	%ecx,%edi
+	addl	%edx,%ebp
+	movl	52(%esp),%edx
+	andl	%eax,%edi
+	rorl	$2,%eax
+	xorl	%ecx,%edi
+	leal	1518500249(%ebp,%edx,1),%ebp
+	addl	%edi,%ebp
+
+	movl	%eax,%edx
+	movl	%ebp,%edi
+	roll	$5,%ebp
+	xorl	%ebx,%edx
+	addl	%ecx,%ebp
+	movl	56(%esp),%ecx
+	andl	%esi,%edx
+	rorl	$2,%esi
+	xorl	%ebx,%edx
+	leal	1518500249(%ebp,%ecx,1),%ebp
+	addl	%edx,%ebp
+
+	movl	%esi,%ecx
+	movl	%ebp,%edx
+	roll	$5,%ebp
+	xorl	%eax,%ecx
+	addl	%ebx,%ebp
+	movl	60(%esp),%ebx
+	andl	%edi,%ecx
+	rorl	$2,%edi
+	xorl	%eax,%ecx
+	leal	1518500249(%ebp,%ebx,1),%ebp
+	movl	(%esp),%ebx
+	addl	%ebp,%ecx
+
+	movl	%edi,%ebp
+	xorl	8(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	32(%esp),%ebx
+	andl	%edx,%ebp
+	xorl	52(%esp),%ebx
+	roll	$1,%ebx
+	xorl	%esi,%ebp
+	addl	%ebp,%eax
+	movl	%ecx,%ebp
+	rorl	$2,%edx
+	movl	%ebx,(%esp)
+	roll	$5,%ebp
+	leal	1518500249(%ebx,%eax,1),%ebx
+	movl	4(%esp),%eax
+	addl	%ebp,%ebx
+
+	movl	%edx,%ebp
+	xorl	12(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	36(%esp),%eax
+	andl	%ecx,%ebp
+	xorl	56(%esp),%eax
+	roll	$1,%eax
+	xorl	%edi,%ebp
+	addl	%ebp,%esi
+	movl	%ebx,%ebp
+	rorl	$2,%ecx
+	movl	%eax,4(%esp)
+	roll	$5,%ebp
+	leal	1518500249(%eax,%esi,1),%eax
+	movl	8(%esp),%esi
+	addl	%ebp,%eax
+
+	movl	%ecx,%ebp
+	xorl	16(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	40(%esp),%esi
+	andl	%ebx,%ebp
+	xorl	60(%esp),%esi
+	roll	$1,%esi
+	xorl	%edx,%ebp
+	addl	%ebp,%edi
+	movl	%eax,%ebp
+	rorl	$2,%ebx
+	movl	%esi,8(%esp)
+	roll	$5,%ebp
+	leal	1518500249(%esi,%edi,1),%esi
+	movl	12(%esp),%edi
+	addl	%ebp,%esi
+
+	movl	%ebx,%ebp
+	xorl	20(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	44(%esp),%edi
+	andl	%eax,%ebp
+	xorl	(%esp),%edi
+	roll	$1,%edi
+	xorl	%ecx,%ebp
+	addl	%ebp,%edx
+	movl	%esi,%ebp
+	rorl	$2,%eax
+	movl	%edi,12(%esp)
+	roll	$5,%ebp
+	leal	1518500249(%edi,%edx,1),%edi
+	movl	16(%esp),%edx
+	addl	%ebp,%edi
+
+	movl	%esi,%ebp
+	xorl	24(%esp),%edx
+	xorl	%eax,%ebp
+	xorl	48(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	4(%esp),%edx
+	roll	$1,%edx
+	addl	%ebp,%ecx
+	rorl	$2,%esi
+	movl	%edi,%ebp
+	roll	$5,%ebp
+	movl	%edx,16(%esp)
+	leal	1859775393(%edx,%ecx,1),%edx
+	movl	20(%esp),%ecx
+	addl	%ebp,%edx
+
+	movl	%edi,%ebp
+	xorl	28(%esp),%ecx
+	xorl	%esi,%ebp
+	xorl	52(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	8(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebp,%ebx
+	rorl	$2,%edi
+	movl	%edx,%ebp
+	roll	$5,%ebp
+	movl	%ecx,20(%esp)
+	leal	1859775393(%ecx,%ebx,1),%ecx
+	movl	24(%esp),%ebx
+	addl	%ebp,%ecx
+
+	movl	%edx,%ebp
+	xorl	32(%esp),%ebx
+	xorl	%edi,%ebp
+	xorl	56(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	12(%esp),%ebx
+	roll	$1,%ebx
+	addl	%ebp,%eax
+	rorl	$2,%edx
+	movl	%ecx,%ebp
+	roll	$5,%ebp
+	movl	%ebx,24(%esp)
+	leal	1859775393(%ebx,%eax,1),%ebx
+	movl	28(%esp),%eax
+	addl	%ebp,%ebx
+
+	movl	%ecx,%ebp
+	xorl	36(%esp),%eax
+	xorl	%edx,%ebp
+	xorl	60(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	16(%esp),%eax
+	roll	$1,%eax
+	addl	%ebp,%esi
+	rorl	$2,%ecx
+	movl	%ebx,%ebp
+	roll	$5,%ebp
+	movl	%eax,28(%esp)
+	leal	1859775393(%eax,%esi,1),%eax
+	movl	32(%esp),%esi
+	addl	%ebp,%eax
+
+	movl	%ebx,%ebp
+	xorl	40(%esp),%esi
+	xorl	%ecx,%ebp
+	xorl	(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	20(%esp),%esi
+	roll	$1,%esi
+	addl	%ebp,%edi
+	rorl	$2,%ebx
+	movl	%eax,%ebp
+	roll	$5,%ebp
+	movl	%esi,32(%esp)
+	leal	1859775393(%esi,%edi,1),%esi
+	movl	36(%esp),%edi
+	addl	%ebp,%esi
+
+	movl	%eax,%ebp
+	xorl	44(%esp),%edi
+	xorl	%ebx,%ebp
+	xorl	4(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	24(%esp),%edi
+	roll	$1,%edi
+	addl	%ebp,%edx
+	rorl	$2,%eax
+	movl	%esi,%ebp
+	roll	$5,%ebp
+	movl	%edi,36(%esp)
+	leal	1859775393(%edi,%edx,1),%edi
+	movl	40(%esp),%edx
+	addl	%ebp,%edi
+
+	movl	%esi,%ebp
+	xorl	48(%esp),%edx
+	xorl	%eax,%ebp
+	xorl	8(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	28(%esp),%edx
+	roll	$1,%edx
+	addl	%ebp,%ecx
+	rorl	$2,%esi
+	movl	%edi,%ebp
+	roll	$5,%ebp
+	movl	%edx,40(%esp)
+	leal	1859775393(%edx,%ecx,1),%edx
+	movl	44(%esp),%ecx
+	addl	%ebp,%edx
+
+	movl	%edi,%ebp
+	xorl	52(%esp),%ecx
+	xorl	%esi,%ebp
+	xorl	12(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	32(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebp,%ebx
+	rorl	$2,%edi
+	movl	%edx,%ebp
+	roll	$5,%ebp
+	movl	%ecx,44(%esp)
+	leal	1859775393(%ecx,%ebx,1),%ecx
+	movl	48(%esp),%ebx
+	addl	%ebp,%ecx
+
+	movl	%edx,%ebp
+	xorl	56(%esp),%ebx
+	xorl	%edi,%ebp
+	xorl	16(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	36(%esp),%ebx
+	roll	$1,%ebx
+	addl	%ebp,%eax
+	rorl	$2,%edx
+	movl	%ecx,%ebp
+	roll	$5,%ebp
+	movl	%ebx,48(%esp)
+	leal	1859775393(%ebx,%eax,1),%ebx
+	movl	52(%esp),%eax
+	addl	%ebp,%ebx
+
+	movl	%ecx,%ebp
+	xorl	60(%esp),%eax
+	xorl	%edx,%ebp
+	xorl	20(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	40(%esp),%eax
+	roll	$1,%eax
+	addl	%ebp,%esi
+	rorl	$2,%ecx
+	movl	%ebx,%ebp
+	roll	$5,%ebp
+	movl	%eax,52(%esp)
+	leal	1859775393(%eax,%esi,1),%eax
+	movl	56(%esp),%esi
+	addl	%ebp,%eax
+
+	movl	%ebx,%ebp
+	xorl	(%esp),%esi
+	xorl	%ecx,%ebp
+	xorl	24(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	44(%esp),%esi
+	roll	$1,%esi
+	addl	%ebp,%edi
+	rorl	$2,%ebx
+	movl	%eax,%ebp
+	roll	$5,%ebp
+	movl	%esi,56(%esp)
+	leal	1859775393(%esi,%edi,1),%esi
+	movl	60(%esp),%edi
+	addl	%ebp,%esi
+
+	movl	%eax,%ebp
+	xorl	4(%esp),%edi
+	xorl	%ebx,%ebp
+	xorl	28(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	48(%esp),%edi
+	roll	$1,%edi
+	addl	%ebp,%edx
+	rorl	$2,%eax
+	movl	%esi,%ebp
+	roll	$5,%ebp
+	movl	%edi,60(%esp)
+	leal	1859775393(%edi,%edx,1),%edi
+	movl	(%esp),%edx
+	addl	%ebp,%edi
+
+	movl	%esi,%ebp
+	xorl	8(%esp),%edx
+	xorl	%eax,%ebp
+	xorl	32(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	52(%esp),%edx
+	roll	$1,%edx
+	addl	%ebp,%ecx
+	rorl	$2,%esi
+	movl	%edi,%ebp
+	roll	$5,%ebp
+	movl	%edx,(%esp)
+	leal	1859775393(%edx,%ecx,1),%edx
+	movl	4(%esp),%ecx
+	addl	%ebp,%edx
+
+	movl	%edi,%ebp
+	xorl	12(%esp),%ecx
+	xorl	%esi,%ebp
+	xorl	36(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	56(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebp,%ebx
+	rorl	$2,%edi
+	movl	%edx,%ebp
+	roll	$5,%ebp
+	movl	%ecx,4(%esp)
+	leal	1859775393(%ecx,%ebx,1),%ecx
+	movl	8(%esp),%ebx
+	addl	%ebp,%ecx
+
+	movl	%edx,%ebp
+	xorl	16(%esp),%ebx
+	xorl	%edi,%ebp
+	xorl	40(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	60(%esp),%ebx
+	roll	$1,%ebx
+	addl	%ebp,%eax
+	rorl	$2,%edx
+	movl	%ecx,%ebp
+	roll	$5,%ebp
+	movl	%ebx,8(%esp)
+	leal	1859775393(%ebx,%eax,1),%ebx
+	movl	12(%esp),%eax
+	addl	%ebp,%ebx
+
+	movl	%ecx,%ebp
+	xorl	20(%esp),%eax
+	xorl	%edx,%ebp
+	xorl	44(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	(%esp),%eax
+	roll	$1,%eax
+	addl	%ebp,%esi
+	rorl	$2,%ecx
+	movl	%ebx,%ebp
+	roll	$5,%ebp
+	movl	%eax,12(%esp)
+	leal	1859775393(%eax,%esi,1),%eax
+	movl	16(%esp),%esi
+	addl	%ebp,%eax
+
+	movl	%ebx,%ebp
+	xorl	24(%esp),%esi
+	xorl	%ecx,%ebp
+	xorl	48(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	4(%esp),%esi
+	roll	$1,%esi
+	addl	%ebp,%edi
+	rorl	$2,%ebx
+	movl	%eax,%ebp
+	roll	$5,%ebp
+	movl	%esi,16(%esp)
+	leal	1859775393(%esi,%edi,1),%esi
+	movl	20(%esp),%edi
+	addl	%ebp,%esi
+
+	movl	%eax,%ebp
+	xorl	28(%esp),%edi
+	xorl	%ebx,%ebp
+	xorl	52(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	8(%esp),%edi
+	roll	$1,%edi
+	addl	%ebp,%edx
+	rorl	$2,%eax
+	movl	%esi,%ebp
+	roll	$5,%ebp
+	movl	%edi,20(%esp)
+	leal	1859775393(%edi,%edx,1),%edi
+	movl	24(%esp),%edx
+	addl	%ebp,%edi
+
+	movl	%esi,%ebp
+	xorl	32(%esp),%edx
+	xorl	%eax,%ebp
+	xorl	56(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	12(%esp),%edx
+	roll	$1,%edx
+	addl	%ebp,%ecx
+	rorl	$2,%esi
+	movl	%edi,%ebp
+	roll	$5,%ebp
+	movl	%edx,24(%esp)
+	leal	1859775393(%edx,%ecx,1),%edx
+	movl	28(%esp),%ecx
+	addl	%ebp,%edx
+
+	movl	%edi,%ebp
+	xorl	36(%esp),%ecx
+	xorl	%esi,%ebp
+	xorl	60(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	16(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebp,%ebx
+	rorl	$2,%edi
+	movl	%edx,%ebp
+	roll	$5,%ebp
+	movl	%ecx,28(%esp)
+	leal	1859775393(%ecx,%ebx,1),%ecx
+	movl	32(%esp),%ebx
+	addl	%ebp,%ecx
+
+	movl	%edi,%ebp
+	xorl	40(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	(%esp),%ebx
+	andl	%edx,%ebp
+	xorl	20(%esp),%ebx
+	roll	$1,%ebx
+	addl	%eax,%ebp
+	rorl	$2,%edx
+	movl	%ecx,%eax
+	roll	$5,%eax
+	movl	%ebx,32(%esp)
+	leal	2400959708(%ebx,%ebp,1),%ebx
+	movl	%edi,%ebp
+	addl	%eax,%ebx
+	andl	%esi,%ebp
+	movl	36(%esp),%eax
+	addl	%ebp,%ebx
+
+	movl	%edx,%ebp
+	xorl	44(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	4(%esp),%eax
+	andl	%ecx,%ebp
+	xorl	24(%esp),%eax
+	roll	$1,%eax
+	addl	%esi,%ebp
+	rorl	$2,%ecx
+	movl	%ebx,%esi
+	roll	$5,%esi
+	movl	%eax,36(%esp)
+	leal	2400959708(%eax,%ebp,1),%eax
+	movl	%edx,%ebp
+	addl	%esi,%eax
+	andl	%edi,%ebp
+	movl	40(%esp),%esi
+	addl	%ebp,%eax
+
+	movl	%ecx,%ebp
+	xorl	48(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	8(%esp),%esi
+	andl	%ebx,%ebp
+	xorl	28(%esp),%esi
+	roll	$1,%esi
+	addl	%edi,%ebp
+	rorl	$2,%ebx
+	movl	%eax,%edi
+	roll	$5,%edi
+	movl	%esi,40(%esp)
+	leal	2400959708(%esi,%ebp,1),%esi
+	movl	%ecx,%ebp
+	addl	%edi,%esi
+	andl	%edx,%ebp
+	movl	44(%esp),%edi
+	addl	%ebp,%esi
+
+	movl	%ebx,%ebp
+	xorl	52(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	12(%esp),%edi
+	andl	%eax,%ebp
+	xorl	32(%esp),%edi
+	roll	$1,%edi
+	addl	%edx,%ebp
+	rorl	$2,%eax
+	movl	%esi,%edx
+	roll	$5,%edx
+	movl	%edi,44(%esp)
+	leal	2400959708(%edi,%ebp,1),%edi
+	movl	%ebx,%ebp
+	addl	%edx,%edi
+	andl	%ecx,%ebp
+	movl	48(%esp),%edx
+	addl	%ebp,%edi
+
+	movl	%eax,%ebp
+	xorl	56(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	16(%esp),%edx
+	andl	%esi,%ebp
+	xorl	36(%esp),%edx
+	roll	$1,%edx
+	addl	%ecx,%ebp
+	rorl	$2,%esi
+	movl	%edi,%ecx
+	roll	$5,%ecx
+	movl	%edx,48(%esp)
+	leal	2400959708(%edx,%ebp,1),%edx
+	movl	%eax,%ebp
+	addl	%ecx,%edx
+	andl	%ebx,%ebp
+	movl	52(%esp),%ecx
+	addl	%ebp,%edx
+
+	movl	%esi,%ebp
+	xorl	60(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	20(%esp),%ecx
+	andl	%edi,%ebp
+	xorl	40(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebx,%ebp
+	rorl	$2,%edi
+	movl	%edx,%ebx
+	roll	$5,%ebx
+	movl	%ecx,52(%esp)
+	leal	2400959708(%ecx,%ebp,1),%ecx
+	movl	%esi,%ebp
+	addl	%ebx,%ecx
+	andl	%eax,%ebp
+	movl	56(%esp),%ebx
+	addl	%ebp,%ecx
+
+	movl	%edi,%ebp
+	xorl	(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	24(%esp),%ebx
+	andl	%edx,%ebp
+	xorl	44(%esp),%ebx
+	roll	$1,%ebx
+	addl	%eax,%ebp
+	rorl	$2,%edx
+	movl	%ecx,%eax
+	roll	$5,%eax
+	movl	%ebx,56(%esp)
+	leal	2400959708(%ebx,%ebp,1),%ebx
+	movl	%edi,%ebp
+	addl	%eax,%ebx
+	andl	%esi,%ebp
+	movl	60(%esp),%eax
+	addl	%ebp,%ebx
+
+	movl	%edx,%ebp
+	xorl	4(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	28(%esp),%eax
+	andl	%ecx,%ebp
+	xorl	48(%esp),%eax
+	roll	$1,%eax
+	addl	%esi,%ebp
+	rorl	$2,%ecx
+	movl	%ebx,%esi
+	roll	$5,%esi
+	movl	%eax,60(%esp)
+	leal	2400959708(%eax,%ebp,1),%eax
+	movl	%edx,%ebp
+	addl	%esi,%eax
+	andl	%edi,%ebp
+	movl	(%esp),%esi
+	addl	%ebp,%eax
+
+	movl	%ecx,%ebp
+	xorl	8(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	32(%esp),%esi
+	andl	%ebx,%ebp
+	xorl	52(%esp),%esi
+	roll	$1,%esi
+	addl	%edi,%ebp
+	rorl	$2,%ebx
+	movl	%eax,%edi
+	roll	$5,%edi
+	movl	%esi,(%esp)
+	leal	2400959708(%esi,%ebp,1),%esi
+	movl	%ecx,%ebp
+	addl	%edi,%esi
+	andl	%edx,%ebp
+	movl	4(%esp),%edi
+	addl	%ebp,%esi
+
+	movl	%ebx,%ebp
+	xorl	12(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	36(%esp),%edi
+	andl	%eax,%ebp
+	xorl	56(%esp),%edi
+	roll	$1,%edi
+	addl	%edx,%ebp
+	rorl	$2,%eax
+	movl	%esi,%edx
+	roll	$5,%edx
+	movl	%edi,4(%esp)
+	leal	2400959708(%edi,%ebp,1),%edi
+	movl	%ebx,%ebp
+	addl	%edx,%edi
+	andl	%ecx,%ebp
+	movl	8(%esp),%edx
+	addl	%ebp,%edi
+
+	movl	%eax,%ebp
+	xorl	16(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	40(%esp),%edx
+	andl	%esi,%ebp
+	xorl	60(%esp),%edx
+	roll	$1,%edx
+	addl	%ecx,%ebp
+	rorl	$2,%esi
+	movl	%edi,%ecx
+	roll	$5,%ecx
+	movl	%edx,8(%esp)
+	leal	2400959708(%edx,%ebp,1),%edx
+	movl	%eax,%ebp
+	addl	%ecx,%edx
+	andl	%ebx,%ebp
+	movl	12(%esp),%ecx
+	addl	%ebp,%edx
+
+	movl	%esi,%ebp
+	xorl	20(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	44(%esp),%ecx
+	andl	%edi,%ebp
+	xorl	(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebx,%ebp
+	rorl	$2,%edi
+	movl	%edx,%ebx
+	roll	$5,%ebx
+	movl	%ecx,12(%esp)
+	leal	2400959708(%ecx,%ebp,1),%ecx
+	movl	%esi,%ebp
+	addl	%ebx,%ecx
+	andl	%eax,%ebp
+	movl	16(%esp),%ebx
+	addl	%ebp,%ecx
+
+	movl	%edi,%ebp
+	xorl	24(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	48(%esp),%ebx
+	andl	%edx,%ebp
+	xorl	4(%esp),%ebx
+	roll	$1,%ebx
+	addl	%eax,%ebp
+	rorl	$2,%edx
+	movl	%ecx,%eax
+	roll	$5,%eax
+	movl	%ebx,16(%esp)
+	leal	2400959708(%ebx,%ebp,1),%ebx
+	movl	%edi,%ebp
+	addl	%eax,%ebx
+	andl	%esi,%ebp
+	movl	20(%esp),%eax
+	addl	%ebp,%ebx
+
+	movl	%edx,%ebp
+	xorl	28(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	52(%esp),%eax
+	andl	%ecx,%ebp
+	xorl	8(%esp),%eax
+	roll	$1,%eax
+	addl	%esi,%ebp
+	rorl	$2,%ecx
+	movl	%ebx,%esi
+	roll	$5,%esi
+	movl	%eax,20(%esp)
+	leal	2400959708(%eax,%ebp,1),%eax
+	movl	%edx,%ebp
+	addl	%esi,%eax
+	andl	%edi,%ebp
+	movl	24(%esp),%esi
+	addl	%ebp,%eax
+
+	movl	%ecx,%ebp
+	xorl	32(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	56(%esp),%esi
+	andl	%ebx,%ebp
+	xorl	12(%esp),%esi
+	roll	$1,%esi
+	addl	%edi,%ebp
+	rorl	$2,%ebx
+	movl	%eax,%edi
+	roll	$5,%edi
+	movl	%esi,24(%esp)
+	leal	2400959708(%esi,%ebp,1),%esi
+	movl	%ecx,%ebp
+	addl	%edi,%esi
+	andl	%edx,%ebp
+	movl	28(%esp),%edi
+	addl	%ebp,%esi
+
+	movl	%ebx,%ebp
+	xorl	36(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	60(%esp),%edi
+	andl	%eax,%ebp
+	xorl	16(%esp),%edi
+	roll	$1,%edi
+	addl	%edx,%ebp
+	rorl	$2,%eax
+	movl	%esi,%edx
+	roll	$5,%edx
+	movl	%edi,28(%esp)
+	leal	2400959708(%edi,%ebp,1),%edi
+	movl	%ebx,%ebp
+	addl	%edx,%edi
+	andl	%ecx,%ebp
+	movl	32(%esp),%edx
+	addl	%ebp,%edi
+
+	movl	%eax,%ebp
+	xorl	40(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	(%esp),%edx
+	andl	%esi,%ebp
+	xorl	20(%esp),%edx
+	roll	$1,%edx
+	addl	%ecx,%ebp
+	rorl	$2,%esi
+	movl	%edi,%ecx
+	roll	$5,%ecx
+	movl	%edx,32(%esp)
+	leal	2400959708(%edx,%ebp,1),%edx
+	movl	%eax,%ebp
+	addl	%ecx,%edx
+	andl	%ebx,%ebp
+	movl	36(%esp),%ecx
+	addl	%ebp,%edx
+
+	movl	%esi,%ebp
+	xorl	44(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	4(%esp),%ecx
+	andl	%edi,%ebp
+	xorl	24(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebx,%ebp
+	rorl	$2,%edi
+	movl	%edx,%ebx
+	roll	$5,%ebx
+	movl	%ecx,36(%esp)
+	leal	2400959708(%ecx,%ebp,1),%ecx
+	movl	%esi,%ebp
+	addl	%ebx,%ecx
+	andl	%eax,%ebp
+	movl	40(%esp),%ebx
+	addl	%ebp,%ecx
+
+	movl	%edi,%ebp
+	xorl	48(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	8(%esp),%ebx
+	andl	%edx,%ebp
+	xorl	28(%esp),%ebx
+	roll	$1,%ebx
+	addl	%eax,%ebp
+	rorl	$2,%edx
+	movl	%ecx,%eax
+	roll	$5,%eax
+	movl	%ebx,40(%esp)
+	leal	2400959708(%ebx,%ebp,1),%ebx
+	movl	%edi,%ebp
+	addl	%eax,%ebx
+	andl	%esi,%ebp
+	movl	44(%esp),%eax
+	addl	%ebp,%ebx
+
+	movl	%edx,%ebp
+	xorl	52(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	12(%esp),%eax
+	andl	%ecx,%ebp
+	xorl	32(%esp),%eax
+	roll	$1,%eax
+	addl	%esi,%ebp
+	rorl	$2,%ecx
+	movl	%ebx,%esi
+	roll	$5,%esi
+	movl	%eax,44(%esp)
+	leal	2400959708(%eax,%ebp,1),%eax
+	movl	%edx,%ebp
+	addl	%esi,%eax
+	andl	%edi,%ebp
+	movl	48(%esp),%esi
+	addl	%ebp,%eax
+
+	movl	%ebx,%ebp
+	xorl	56(%esp),%esi
+	xorl	%ecx,%ebp
+	xorl	16(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	36(%esp),%esi
+	roll	$1,%esi
+	addl	%ebp,%edi
+	rorl	$2,%ebx
+	movl	%eax,%ebp
+	roll	$5,%ebp
+	movl	%esi,48(%esp)
+	leal	3395469782(%esi,%edi,1),%esi
+	movl	52(%esp),%edi
+	addl	%ebp,%esi
+
+	movl	%eax,%ebp
+	xorl	60(%esp),%edi
+	xorl	%ebx,%ebp
+	xorl	20(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	40(%esp),%edi
+	roll	$1,%edi
+	addl	%ebp,%edx
+	rorl	$2,%eax
+	movl	%esi,%ebp
+	roll	$5,%ebp
+	movl	%edi,52(%esp)
+	leal	3395469782(%edi,%edx,1),%edi
+	movl	56(%esp),%edx
+	addl	%ebp,%edi
+
+	movl	%esi,%ebp
+	xorl	(%esp),%edx
+	xorl	%eax,%ebp
+	xorl	24(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	44(%esp),%edx
+	roll	$1,%edx
+	addl	%ebp,%ecx
+	rorl	$2,%esi
+	movl	%edi,%ebp
+	roll	$5,%ebp
+	movl	%edx,56(%esp)
+	leal	3395469782(%edx,%ecx,1),%edx
+	movl	60(%esp),%ecx
+	addl	%ebp,%edx
+
+	movl	%edi,%ebp
+	xorl	4(%esp),%ecx
+	xorl	%esi,%ebp
+	xorl	28(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	48(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebp,%ebx
+	rorl	$2,%edi
+	movl	%edx,%ebp
+	roll	$5,%ebp
+	movl	%ecx,60(%esp)
+	leal	3395469782(%ecx,%ebx,1),%ecx
+	movl	(%esp),%ebx
+	addl	%ebp,%ecx
+
+	movl	%edx,%ebp
+	xorl	8(%esp),%ebx
+	xorl	%edi,%ebp
+	xorl	32(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	52(%esp),%ebx
+	roll	$1,%ebx
+	addl	%ebp,%eax
+	rorl	$2,%edx
+	movl	%ecx,%ebp
+	roll	$5,%ebp
+	movl	%ebx,(%esp)
+	leal	3395469782(%ebx,%eax,1),%ebx
+	movl	4(%esp),%eax
+	addl	%ebp,%ebx
+
+	movl	%ecx,%ebp
+	xorl	12(%esp),%eax
+	xorl	%edx,%ebp
+	xorl	36(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	56(%esp),%eax
+	roll	$1,%eax
+	addl	%ebp,%esi
+	rorl	$2,%ecx
+	movl	%ebx,%ebp
+	roll	$5,%ebp
+	movl	%eax,4(%esp)
+	leal	3395469782(%eax,%esi,1),%eax
+	movl	8(%esp),%esi
+	addl	%ebp,%eax
+
+	movl	%ebx,%ebp
+	xorl	16(%esp),%esi
+	xorl	%ecx,%ebp
+	xorl	40(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	60(%esp),%esi
+	roll	$1,%esi
+	addl	%ebp,%edi
+	rorl	$2,%ebx
+	movl	%eax,%ebp
+	roll	$5,%ebp
+	movl	%esi,8(%esp)
+	leal	3395469782(%esi,%edi,1),%esi
+	movl	12(%esp),%edi
+	addl	%ebp,%esi
+
+	movl	%eax,%ebp
+	xorl	20(%esp),%edi
+	xorl	%ebx,%ebp
+	xorl	44(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	(%esp),%edi
+	roll	$1,%edi
+	addl	%ebp,%edx
+	rorl	$2,%eax
+	movl	%esi,%ebp
+	roll	$5,%ebp
+	movl	%edi,12(%esp)
+	leal	3395469782(%edi,%edx,1),%edi
+	movl	16(%esp),%edx
+	addl	%ebp,%edi
+
+	movl	%esi,%ebp
+	xorl	24(%esp),%edx
+	xorl	%eax,%ebp
+	xorl	48(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	4(%esp),%edx
+	roll	$1,%edx
+	addl	%ebp,%ecx
+	rorl	$2,%esi
+	movl	%edi,%ebp
+	roll	$5,%ebp
+	movl	%edx,16(%esp)
+	leal	3395469782(%edx,%ecx,1),%edx
+	movl	20(%esp),%ecx
+	addl	%ebp,%edx
+
+	movl	%edi,%ebp
+	xorl	28(%esp),%ecx
+	xorl	%esi,%ebp
+	xorl	52(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	8(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebp,%ebx
+	rorl	$2,%edi
+	movl	%edx,%ebp
+	roll	$5,%ebp
+	movl	%ecx,20(%esp)
+	leal	3395469782(%ecx,%ebx,1),%ecx
+	movl	24(%esp),%ebx
+	addl	%ebp,%ecx
+
+	movl	%edx,%ebp
+	xorl	32(%esp),%ebx
+	xorl	%edi,%ebp
+	xorl	56(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	12(%esp),%ebx
+	roll	$1,%ebx
+	addl	%ebp,%eax
+	rorl	$2,%edx
+	movl	%ecx,%ebp
+	roll	$5,%ebp
+	movl	%ebx,24(%esp)
+	leal	3395469782(%ebx,%eax,1),%ebx
+	movl	28(%esp),%eax
+	addl	%ebp,%ebx
+
+	movl	%ecx,%ebp
+	xorl	36(%esp),%eax
+	xorl	%edx,%ebp
+	xorl	60(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	16(%esp),%eax
+	roll	$1,%eax
+	addl	%ebp,%esi
+	rorl	$2,%ecx
+	movl	%ebx,%ebp
+	roll	$5,%ebp
+	movl	%eax,28(%esp)
+	leal	3395469782(%eax,%esi,1),%eax
+	movl	32(%esp),%esi
+	addl	%ebp,%eax
+
+	movl	%ebx,%ebp
+	xorl	40(%esp),%esi
+	xorl	%ecx,%ebp
+	xorl	(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	20(%esp),%esi
+	roll	$1,%esi
+	addl	%ebp,%edi
+	rorl	$2,%ebx
+	movl	%eax,%ebp
+	roll	$5,%ebp
+	movl	%esi,32(%esp)
+	leal	3395469782(%esi,%edi,1),%esi
+	movl	36(%esp),%edi
+	addl	%ebp,%esi
+
+	movl	%eax,%ebp
+	xorl	44(%esp),%edi
+	xorl	%ebx,%ebp
+	xorl	4(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	24(%esp),%edi
+	roll	$1,%edi
+	addl	%ebp,%edx
+	rorl	$2,%eax
+	movl	%esi,%ebp
+	roll	$5,%ebp
+	movl	%edi,36(%esp)
+	leal	3395469782(%edi,%edx,1),%edi
+	movl	40(%esp),%edx
+	addl	%ebp,%edi
+
+	movl	%esi,%ebp
+	xorl	48(%esp),%edx
+	xorl	%eax,%ebp
+	xorl	8(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	28(%esp),%edx
+	roll	$1,%edx
+	addl	%ebp,%ecx
+	rorl	$2,%esi
+	movl	%edi,%ebp
+	roll	$5,%ebp
+	movl	%edx,40(%esp)
+	leal	3395469782(%edx,%ecx,1),%edx
+	movl	44(%esp),%ecx
+	addl	%ebp,%edx
+
+	movl	%edi,%ebp
+	xorl	52(%esp),%ecx
+	xorl	%esi,%ebp
+	xorl	12(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	32(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebp,%ebx
+	rorl	$2,%edi
+	movl	%edx,%ebp
+	roll	$5,%ebp
+	movl	%ecx,44(%esp)
+	leal	3395469782(%ecx,%ebx,1),%ecx
+	movl	48(%esp),%ebx
+	addl	%ebp,%ecx
+
+	movl	%edx,%ebp
+	xorl	56(%esp),%ebx
+	xorl	%edi,%ebp
+	xorl	16(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	36(%esp),%ebx
+	roll	$1,%ebx
+	addl	%ebp,%eax
+	rorl	$2,%edx
+	movl	%ecx,%ebp
+	roll	$5,%ebp
+	movl	%ebx,48(%esp)
+	leal	3395469782(%ebx,%eax,1),%ebx
+	movl	52(%esp),%eax
+	addl	%ebp,%ebx
+
+	movl	%ecx,%ebp
+	xorl	60(%esp),%eax
+	xorl	%edx,%ebp
+	xorl	20(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	40(%esp),%eax
+	roll	$1,%eax
+	addl	%ebp,%esi
+	rorl	$2,%ecx
+	movl	%ebx,%ebp
+	roll	$5,%ebp
+	leal	3395469782(%eax,%esi,1),%eax
+	movl	56(%esp),%esi
+	addl	%ebp,%eax
+
+	movl	%ebx,%ebp
+	xorl	(%esp),%esi
+	xorl	%ecx,%ebp
+	xorl	24(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	44(%esp),%esi
+	roll	$1,%esi
+	addl	%ebp,%edi
+	rorl	$2,%ebx
+	movl	%eax,%ebp
+	roll	$5,%ebp
+	leal	3395469782(%esi,%edi,1),%esi
+	movl	60(%esp),%edi
+	addl	%ebp,%esi
+
+	movl	%eax,%ebp
+	xorl	4(%esp),%edi
+	xorl	%ebx,%ebp
+	xorl	28(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	48(%esp),%edi
+	roll	$1,%edi
+	addl	%ebp,%edx
+	rorl	$2,%eax
+	movl	%esi,%ebp
+	roll	$5,%ebp
+	leal	3395469782(%edi,%edx,1),%edi
+	addl	%ebp,%edi
+	movl	96(%esp),%ebp
+	movl	100(%esp),%edx
+	addl	(%ebp),%edi
+	addl	4(%ebp),%esi
+	addl	8(%ebp),%eax
+	addl	12(%ebp),%ebx
+	addl	16(%ebp),%ecx
+	movl	%edi,(%ebp)
+	addl	$64,%edx
+	movl	%esi,4(%ebp)
+	cmpl	104(%esp),%edx
+	movl	%eax,8(%ebp)
+	movl	%ecx,%edi
+	movl	%ebx,12(%ebp)
+	movl	%edx,%esi
+	movl	%ecx,16(%ebp)
+	jb	.L000loop
+	addl	$76,%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	sha1_block_data_order_nohw,.-.L_sha1_block_data_order_nohw_begin
+.globl	sha1_block_data_order_ssse3
+.hidden	sha1_block_data_order_ssse3
+.type	sha1_block_data_order_ssse3,@function
+.align	16
+sha1_block_data_order_ssse3:
+.L_sha1_block_data_order_ssse3_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	call	.L001pic_point
+.L001pic_point:
+	popl	%ebp
+	leal	.LK_XX_XX-.L001pic_point(%ebp),%ebp
+	movdqa	(%ebp),%xmm7
+	movdqa	16(%ebp),%xmm0
+	movdqa	32(%ebp),%xmm1
+	movdqa	48(%ebp),%xmm2
+	movdqa	64(%ebp),%xmm6
+	movl	20(%esp),%edi
+	movl	24(%esp),%ebp
+	movl	28(%esp),%edx
+	movl	%esp,%esi
+	subl	$208,%esp
+	andl	$-64,%esp
+	movdqa	%xmm0,112(%esp)
+	movdqa	%xmm1,128(%esp)
+	movdqa	%xmm2,144(%esp)
+	shll	$6,%edx
+	movdqa	%xmm7,160(%esp)
+	addl	%ebp,%edx
+	movdqa	%xmm6,176(%esp)
+	addl	$64,%ebp
+	movl	%edi,192(%esp)
+	movl	%ebp,196(%esp)
+	movl	%edx,200(%esp)
+	movl	%esi,204(%esp)
+	movl	(%edi),%eax
+	movl	4(%edi),%ebx
+	movl	8(%edi),%ecx
+	movl	12(%edi),%edx
+	movl	16(%edi),%edi
+	movl	%ebx,%esi
+	movdqu	-64(%ebp),%xmm0
+	movdqu	-48(%ebp),%xmm1
+	movdqu	-32(%ebp),%xmm2
+	movdqu	-16(%ebp),%xmm3
+.byte	102,15,56,0,198
+.byte	102,15,56,0,206
+.byte	102,15,56,0,214
+	movdqa	%xmm7,96(%esp)
+.byte	102,15,56,0,222
+	paddd	%xmm7,%xmm0
+	paddd	%xmm7,%xmm1
+	paddd	%xmm7,%xmm2
+	movdqa	%xmm0,(%esp)
+	psubd	%xmm7,%xmm0
+	movdqa	%xmm1,16(%esp)
+	psubd	%xmm7,%xmm1
+	movdqa	%xmm2,32(%esp)
+	movl	%ecx,%ebp
+	psubd	%xmm7,%xmm2
+	xorl	%edx,%ebp
+	pshufd	$238,%xmm0,%xmm4
+	andl	%ebp,%esi
+	jmp	.L002loop
+.align	16
+.L002loop:
+	rorl	$2,%ebx
+	xorl	%edx,%esi
+	movl	%eax,%ebp
+	punpcklqdq	%xmm1,%xmm4
+	movdqa	%xmm3,%xmm6
+	addl	(%esp),%edi
+	xorl	%ecx,%ebx
+	paddd	%xmm3,%xmm7
+	movdqa	%xmm0,64(%esp)
+	roll	$5,%eax
+	addl	%esi,%edi
+	psrldq	$4,%xmm6
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	pxor	%xmm0,%xmm4
+	addl	%eax,%edi
+	rorl	$7,%eax
+	pxor	%xmm2,%xmm6
+	xorl	%ecx,%ebp
+	movl	%edi,%esi
+	addl	4(%esp),%edx
+	pxor	%xmm6,%xmm4
+	xorl	%ebx,%eax
+	roll	$5,%edi
+	movdqa	%xmm7,48(%esp)
+	addl	%ebp,%edx
+	andl	%eax,%esi
+	movdqa	%xmm4,%xmm0
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	rorl	$7,%edi
+	movdqa	%xmm4,%xmm6
+	xorl	%ebx,%esi
+	pslldq	$12,%xmm0
+	paddd	%xmm4,%xmm4
+	movl	%edx,%ebp
+	addl	8(%esp),%ecx
+	psrld	$31,%xmm6
+	xorl	%eax,%edi
+	roll	$5,%edx
+	movdqa	%xmm0,%xmm7
+	addl	%esi,%ecx
+	andl	%edi,%ebp
+	xorl	%eax,%edi
+	psrld	$30,%xmm0
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	por	%xmm6,%xmm4
+	xorl	%eax,%ebp
+	movl	%ecx,%esi
+	addl	12(%esp),%ebx
+	pslld	$2,%xmm7
+	xorl	%edi,%edx
+	roll	$5,%ecx
+	pxor	%xmm0,%xmm4
+	movdqa	96(%esp),%xmm0
+	addl	%ebp,%ebx
+	andl	%edx,%esi
+	pxor	%xmm7,%xmm4
+	pshufd	$238,%xmm1,%xmm5
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	movl	%ebx,%ebp
+	punpcklqdq	%xmm2,%xmm5
+	movdqa	%xmm4,%xmm7
+	addl	16(%esp),%eax
+	xorl	%edx,%ecx
+	paddd	%xmm4,%xmm0
+	movdqa	%xmm1,80(%esp)
+	roll	$5,%ebx
+	addl	%esi,%eax
+	psrldq	$4,%xmm7
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
+	pxor	%xmm1,%xmm5
+	addl	%ebx,%eax
+	rorl	$7,%ebx
+	pxor	%xmm3,%xmm7
+	xorl	%edx,%ebp
+	movl	%eax,%esi
+	addl	20(%esp),%edi
+	pxor	%xmm7,%xmm5
+	xorl	%ecx,%ebx
+	roll	$5,%eax
+	movdqa	%xmm0,(%esp)
+	addl	%ebp,%edi
+	andl	%ebx,%esi
+	movdqa	%xmm5,%xmm1
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	rorl	$7,%eax
+	movdqa	%xmm5,%xmm7
+	xorl	%ecx,%esi
+	pslldq	$12,%xmm1
+	paddd	%xmm5,%xmm5
+	movl	%edi,%ebp
+	addl	24(%esp),%edx
+	psrld	$31,%xmm7
+	xorl	%ebx,%eax
+	roll	$5,%edi
+	movdqa	%xmm1,%xmm0
+	addl	%esi,%edx
+	andl	%eax,%ebp
+	xorl	%ebx,%eax
+	psrld	$30,%xmm1
+	addl	%edi,%edx
+	rorl	$7,%edi
+	por	%xmm7,%xmm5
+	xorl	%ebx,%ebp
+	movl	%edx,%esi
+	addl	28(%esp),%ecx
+	pslld	$2,%xmm0
+	xorl	%eax,%edi
+	roll	$5,%edx
+	pxor	%xmm1,%xmm5
+	movdqa	112(%esp),%xmm1
+	addl	%ebp,%ecx
+	andl	%edi,%esi
+	pxor	%xmm0,%xmm5
+	pshufd	$238,%xmm2,%xmm6
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	xorl	%eax,%esi
+	movl	%ecx,%ebp
+	punpcklqdq	%xmm3,%xmm6
+	movdqa	%xmm5,%xmm0
+	addl	32(%esp),%ebx
+	xorl	%edi,%edx
+	paddd	%xmm5,%xmm1
+	movdqa	%xmm2,96(%esp)
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	psrldq	$4,%xmm0
+	andl	%edx,%ebp
+	xorl	%edi,%edx
+	pxor	%xmm2,%xmm6
+	addl	%ecx,%ebx
+	rorl	$7,%ecx
+	pxor	%xmm4,%xmm0
+	xorl	%edi,%ebp
+	movl	%ebx,%esi
+	addl	36(%esp),%eax
+	pxor	%xmm0,%xmm6
+	xorl	%edx,%ecx
+	roll	$5,%ebx
+	movdqa	%xmm1,16(%esp)
+	addl	%ebp,%eax
+	andl	%ecx,%esi
+	movdqa	%xmm6,%xmm2
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	rorl	$7,%ebx
+	movdqa	%xmm6,%xmm0
+	xorl	%edx,%esi
+	pslldq	$12,%xmm2
+	paddd	%xmm6,%xmm6
+	movl	%eax,%ebp
+	addl	40(%esp),%edi
+	psrld	$31,%xmm0
+	xorl	%ecx,%ebx
+	roll	$5,%eax
+	movdqa	%xmm2,%xmm1
+	addl	%esi,%edi
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	psrld	$30,%xmm2
+	addl	%eax,%edi
+	rorl	$7,%eax
+	por	%xmm0,%xmm6
+	xorl	%ecx,%ebp
+	movdqa	64(%esp),%xmm0
+	movl	%edi,%esi
+	addl	44(%esp),%edx
+	pslld	$2,%xmm1
+	xorl	%ebx,%eax
+	roll	$5,%edi
+	pxor	%xmm2,%xmm6
+	movdqa	112(%esp),%xmm2
+	addl	%ebp,%edx
+	andl	%eax,%esi
+	pxor	%xmm1,%xmm6
+	pshufd	$238,%xmm3,%xmm7
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	rorl	$7,%edi
+	xorl	%ebx,%esi
+	movl	%edx,%ebp
+	punpcklqdq	%xmm4,%xmm7
+	movdqa	%xmm6,%xmm1
+	addl	48(%esp),%ecx
+	xorl	%eax,%edi
+	paddd	%xmm6,%xmm2
+	movdqa	%xmm3,64(%esp)
+	roll	$5,%edx
+	addl	%esi,%ecx
+	psrldq	$4,%xmm1
+	andl	%edi,%ebp
+	xorl	%eax,%edi
+	pxor	%xmm3,%xmm7
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	pxor	%xmm5,%xmm1
+	xorl	%eax,%ebp
+	movl	%ecx,%esi
+	addl	52(%esp),%ebx
+	pxor	%xmm1,%xmm7
+	xorl	%edi,%edx
+	roll	$5,%ecx
+	movdqa	%xmm2,32(%esp)
+	addl	%ebp,%ebx
+	andl	%edx,%esi
+	movdqa	%xmm7,%xmm3
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	rorl	$7,%ecx
+	movdqa	%xmm7,%xmm1
+	xorl	%edi,%esi
+	pslldq	$12,%xmm3
+	paddd	%xmm7,%xmm7
+	movl	%ebx,%ebp
+	addl	56(%esp),%eax
+	psrld	$31,%xmm1
+	xorl	%edx,%ecx
+	roll	$5,%ebx
+	movdqa	%xmm3,%xmm2
+	addl	%esi,%eax
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
+	psrld	$30,%xmm3
+	addl	%ebx,%eax
+	rorl	$7,%ebx
+	por	%xmm1,%xmm7
+	xorl	%edx,%ebp
+	movdqa	80(%esp),%xmm1
+	movl	%eax,%esi
+	addl	60(%esp),%edi
+	pslld	$2,%xmm2
+	xorl	%ecx,%ebx
+	roll	$5,%eax
+	pxor	%xmm3,%xmm7
+	movdqa	112(%esp),%xmm3
+	addl	%ebp,%edi
+	andl	%ebx,%esi
+	pxor	%xmm2,%xmm7
+	pshufd	$238,%xmm6,%xmm2
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	rorl	$7,%eax
+	pxor	%xmm4,%xmm0
+	punpcklqdq	%xmm7,%xmm2
+	xorl	%ecx,%esi
+	movl	%edi,%ebp
+	addl	(%esp),%edx
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm4,80(%esp)
+	xorl	%ebx,%eax
+	roll	$5,%edi
+	movdqa	%xmm3,%xmm4
+	addl	%esi,%edx
+	paddd	%xmm7,%xmm3
+	andl	%eax,%ebp
+	pxor	%xmm2,%xmm0
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	rorl	$7,%edi
+	xorl	%ebx,%ebp
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm3,48(%esp)
+	movl	%edx,%esi
+	addl	4(%esp),%ecx
+	xorl	%eax,%edi
+	roll	$5,%edx
+	pslld	$2,%xmm0
+	addl	%ebp,%ecx
+	andl	%edi,%esi
+	psrld	$30,%xmm2
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	xorl	%eax,%esi
+	movl	%ecx,%ebp
+	addl	8(%esp),%ebx
+	xorl	%edi,%edx
+	roll	$5,%ecx
+	por	%xmm2,%xmm0
+	addl	%esi,%ebx
+	andl	%edx,%ebp
+	movdqa	96(%esp),%xmm2
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	addl	12(%esp),%eax
+	xorl	%edi,%ebp
+	movl	%ebx,%esi
+	pshufd	$238,%xmm7,%xmm3
+	roll	$5,%ebx
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	16(%esp),%edi
+	pxor	%xmm5,%xmm1
+	punpcklqdq	%xmm0,%xmm3
+	xorl	%ecx,%esi
+	movl	%eax,%ebp
+	roll	$5,%eax
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm5,96(%esp)
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	movdqa	%xmm4,%xmm5
+	rorl	$7,%ebx
+	paddd	%xmm0,%xmm4
+	addl	%eax,%edi
+	pxor	%xmm3,%xmm1
+	addl	20(%esp),%edx
+	xorl	%ebx,%ebp
+	movl	%edi,%esi
+	roll	$5,%edi
+	movdqa	%xmm1,%xmm3
+	movdqa	%xmm4,(%esp)
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
+	addl	%edi,%edx
+	pslld	$2,%xmm1
+	addl	24(%esp),%ecx
+	xorl	%eax,%esi
+	psrld	$30,%xmm3
+	movl	%edx,%ebp
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	rorl	$7,%edi
+	addl	%edx,%ecx
+	por	%xmm3,%xmm1
+	addl	28(%esp),%ebx
+	xorl	%edi,%ebp
+	movdqa	64(%esp),%xmm3
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	rorl	$7,%edx
+	pshufd	$238,%xmm0,%xmm4
+	addl	%ecx,%ebx
+	addl	32(%esp),%eax
+	pxor	%xmm6,%xmm2
+	punpcklqdq	%xmm1,%xmm4
+	xorl	%edx,%esi
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	pxor	%xmm3,%xmm2
+	movdqa	%xmm6,64(%esp)
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	movdqa	128(%esp),%xmm6
+	rorl	$7,%ecx
+	paddd	%xmm1,%xmm5
+	addl	%ebx,%eax
+	pxor	%xmm4,%xmm2
+	addl	36(%esp),%edi
+	xorl	%ecx,%ebp
+	movl	%eax,%esi
+	roll	$5,%eax
+	movdqa	%xmm2,%xmm4
+	movdqa	%xmm5,16(%esp)
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	addl	%eax,%edi
+	pslld	$2,%xmm2
+	addl	40(%esp),%edx
+	xorl	%ebx,%esi
+	psrld	$30,%xmm4
+	movl	%edi,%ebp
+	roll	$5,%edi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	rorl	$7,%eax
+	addl	%edi,%edx
+	por	%xmm4,%xmm2
+	addl	44(%esp),%ecx
+	xorl	%eax,%ebp
+	movdqa	80(%esp),%xmm4
+	movl	%edx,%esi
+	roll	$5,%edx
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%edi
+	pshufd	$238,%xmm1,%xmm5
+	addl	%edx,%ecx
+	addl	48(%esp),%ebx
+	pxor	%xmm7,%xmm3
+	punpcklqdq	%xmm2,%xmm5
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	pxor	%xmm4,%xmm3
+	movdqa	%xmm7,80(%esp)
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	movdqa	%xmm6,%xmm7
+	rorl	$7,%edx
+	paddd	%xmm2,%xmm6
+	addl	%ecx,%ebx
+	pxor	%xmm5,%xmm3
+	addl	52(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	movdqa	%xmm3,%xmm5
+	movdqa	%xmm6,32(%esp)
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	pslld	$2,%xmm3
+	addl	56(%esp),%edi
+	xorl	%ecx,%esi
+	psrld	$30,%xmm5
+	movl	%eax,%ebp
+	roll	$5,%eax
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	rorl	$7,%ebx
+	addl	%eax,%edi
+	por	%xmm5,%xmm3
+	addl	60(%esp),%edx
+	xorl	%ebx,%ebp
+	movdqa	96(%esp),%xmm5
+	movl	%edi,%esi
+	roll	$5,%edi
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
+	pshufd	$238,%xmm2,%xmm6
+	addl	%edi,%edx
+	addl	(%esp),%ecx
+	pxor	%xmm0,%xmm4
+	punpcklqdq	%xmm3,%xmm6
+	xorl	%eax,%esi
+	movl	%edx,%ebp
+	roll	$5,%edx
+	pxor	%xmm5,%xmm4
+	movdqa	%xmm0,96(%esp)
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	movdqa	%xmm7,%xmm0
+	rorl	$7,%edi
+	paddd	%xmm3,%xmm7
+	addl	%edx,%ecx
+	pxor	%xmm6,%xmm4
+	addl	4(%esp),%ebx
+	xorl	%edi,%ebp
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	movdqa	%xmm4,%xmm6
+	movdqa	%xmm7,48(%esp)
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	pslld	$2,%xmm4
+	addl	8(%esp),%eax
+	xorl	%edx,%esi
+	psrld	$30,%xmm6
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	por	%xmm6,%xmm4
+	addl	12(%esp),%edi
+	xorl	%ecx,%ebp
+	movdqa	64(%esp),%xmm6
+	movl	%eax,%esi
+	roll	$5,%eax
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	pshufd	$238,%xmm3,%xmm7
+	addl	%eax,%edi
+	addl	16(%esp),%edx
+	pxor	%xmm1,%xmm5
+	punpcklqdq	%xmm4,%xmm7
+	xorl	%ebx,%esi
+	movl	%edi,%ebp
+	roll	$5,%edi
+	pxor	%xmm6,%xmm5
+	movdqa	%xmm1,64(%esp)
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	movdqa	%xmm0,%xmm1
+	rorl	$7,%eax
+	paddd	%xmm4,%xmm0
+	addl	%edi,%edx
+	pxor	%xmm7,%xmm5
+	addl	20(%esp),%ecx
+	xorl	%eax,%ebp
+	movl	%edx,%esi
+	roll	$5,%edx
+	movdqa	%xmm5,%xmm7
+	movdqa	%xmm0,(%esp)
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%edi
+	addl	%edx,%ecx
+	pslld	$2,%xmm5
+	addl	24(%esp),%ebx
+	xorl	%edi,%esi
+	psrld	$30,%xmm7
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	por	%xmm7,%xmm5
+	addl	28(%esp),%eax
+	movdqa	80(%esp),%xmm7
+	rorl	$7,%ecx
+	movl	%ebx,%esi
+	xorl	%edx,%ebp
+	roll	$5,%ebx
+	pshufd	$238,%xmm4,%xmm0
+	addl	%ebp,%eax
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	32(%esp),%edi
+	pxor	%xmm2,%xmm6
+	punpcklqdq	%xmm5,%xmm0
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	rorl	$7,%ebx
+	pxor	%xmm7,%xmm6
+	movdqa	%xmm2,80(%esp)
+	movl	%eax,%ebp
+	xorl	%ecx,%esi
+	roll	$5,%eax
+	movdqa	%xmm1,%xmm2
+	addl	%esi,%edi
+	paddd	%xmm5,%xmm1
+	xorl	%ebx,%ebp
+	pxor	%xmm0,%xmm6
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	addl	36(%esp),%edx
+	andl	%ebx,%ebp
+	movdqa	%xmm6,%xmm0
+	movdqa	%xmm1,16(%esp)
+	xorl	%ecx,%ebx
+	rorl	$7,%eax
+	movl	%edi,%esi
+	xorl	%ebx,%ebp
+	roll	$5,%edi
+	pslld	$2,%xmm6
+	addl	%ebp,%edx
+	xorl	%eax,%esi
+	psrld	$30,%xmm0
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	addl	40(%esp),%ecx
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	rorl	$7,%edi
+	por	%xmm0,%xmm6
+	movl	%edx,%ebp
+	xorl	%eax,%esi
+	movdqa	96(%esp),%xmm0
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%edi,%ebp
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	pshufd	$238,%xmm5,%xmm1
+	addl	44(%esp),%ebx
+	andl	%edi,%ebp
+	xorl	%eax,%edi
+	rorl	$7,%edx
+	movl	%ecx,%esi
+	xorl	%edi,%ebp
+	roll	$5,%ecx
+	addl	%ebp,%ebx
+	xorl	%edx,%esi
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	addl	48(%esp),%eax
+	pxor	%xmm3,%xmm7
+	punpcklqdq	%xmm6,%xmm1
+	andl	%edx,%esi
+	xorl	%edi,%edx
+	rorl	$7,%ecx
+	pxor	%xmm0,%xmm7
+	movdqa	%xmm3,96(%esp)
+	movl	%ebx,%ebp
+	xorl	%edx,%esi
+	roll	$5,%ebx
+	movdqa	144(%esp),%xmm3
+	addl	%esi,%eax
+	paddd	%xmm6,%xmm2
+	xorl	%ecx,%ebp
+	pxor	%xmm1,%xmm7
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	52(%esp),%edi
+	andl	%ecx,%ebp
+	movdqa	%xmm7,%xmm1
+	movdqa	%xmm2,32(%esp)
+	xorl	%edx,%ecx
+	rorl	$7,%ebx
+	movl	%eax,%esi
+	xorl	%ecx,%ebp
+	roll	$5,%eax
+	pslld	$2,%xmm7
+	addl	%ebp,%edi
+	xorl	%ebx,%esi
+	psrld	$30,%xmm1
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	addl	56(%esp),%edx
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	rorl	$7,%eax
+	por	%xmm1,%xmm7
+	movl	%edi,%ebp
+	xorl	%ebx,%esi
+	movdqa	64(%esp),%xmm1
+	roll	$5,%edi
+	addl	%esi,%edx
+	xorl	%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	pshufd	$238,%xmm6,%xmm2
+	addl	60(%esp),%ecx
+	andl	%eax,%ebp
+	xorl	%ebx,%eax
+	rorl	$7,%edi
+	movl	%edx,%esi
+	xorl	%eax,%ebp
+	roll	$5,%edx
+	addl	%ebp,%ecx
+	xorl	%edi,%esi
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	addl	(%esp),%ebx
+	pxor	%xmm4,%xmm0
+	punpcklqdq	%xmm7,%xmm2
+	andl	%edi,%esi
+	xorl	%eax,%edi
+	rorl	$7,%edx
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm4,64(%esp)
+	movl	%ecx,%ebp
+	xorl	%edi,%esi
+	roll	$5,%ecx
+	movdqa	%xmm3,%xmm4
+	addl	%esi,%ebx
+	paddd	%xmm7,%xmm3
+	xorl	%edx,%ebp
+	pxor	%xmm2,%xmm0
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	addl	4(%esp),%eax
+	andl	%edx,%ebp
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm3,48(%esp)
+	xorl	%edi,%edx
+	rorl	$7,%ecx
+	movl	%ebx,%esi
+	xorl	%edx,%ebp
+	roll	$5,%ebx
+	pslld	$2,%xmm0
+	addl	%ebp,%eax
+	xorl	%ecx,%esi
+	psrld	$30,%xmm2
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	8(%esp),%edi
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	rorl	$7,%ebx
+	por	%xmm2,%xmm0
+	movl	%eax,%ebp
+	xorl	%ecx,%esi
+	movdqa	80(%esp),%xmm2
+	roll	$5,%eax
+	addl	%esi,%edi
+	xorl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	pshufd	$238,%xmm7,%xmm3
+	addl	12(%esp),%edx
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	rorl	$7,%eax
+	movl	%edi,%esi
+	xorl	%ebx,%ebp
+	roll	$5,%edi
+	addl	%ebp,%edx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	addl	16(%esp),%ecx
+	pxor	%xmm5,%xmm1
+	punpcklqdq	%xmm0,%xmm3
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	rorl	$7,%edi
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm5,80(%esp)
+	movl	%edx,%ebp
+	xorl	%eax,%esi
+	roll	$5,%edx
+	movdqa	%xmm4,%xmm5
+	addl	%esi,%ecx
+	paddd	%xmm0,%xmm4
+	xorl	%edi,%ebp
+	pxor	%xmm3,%xmm1
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	addl	20(%esp),%ebx
+	andl	%edi,%ebp
+	movdqa	%xmm1,%xmm3
+	movdqa	%xmm4,(%esp)
+	xorl	%eax,%edi
+	rorl	$7,%edx
+	movl	%ecx,%esi
+	xorl	%edi,%ebp
+	roll	$5,%ecx
+	pslld	$2,%xmm1
+	addl	%ebp,%ebx
+	xorl	%edx,%esi
+	psrld	$30,%xmm3
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	addl	24(%esp),%eax
+	andl	%edx,%esi
+	xorl	%edi,%edx
+	rorl	$7,%ecx
+	por	%xmm3,%xmm1
+	movl	%ebx,%ebp
+	xorl	%edx,%esi
+	movdqa	96(%esp),%xmm3
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%ecx,%ebp
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	pshufd	$238,%xmm0,%xmm4
+	addl	28(%esp),%edi
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
+	rorl	$7,%ebx
+	movl	%eax,%esi
+	xorl	%ecx,%ebp
+	roll	$5,%eax
+	addl	%ebp,%edi
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	addl	32(%esp),%edx
+	pxor	%xmm6,%xmm2
+	punpcklqdq	%xmm1,%xmm4
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	rorl	$7,%eax
+	pxor	%xmm3,%xmm2
+	movdqa	%xmm6,96(%esp)
+	movl	%edi,%ebp
+	xorl	%ebx,%esi
+	roll	$5,%edi
+	movdqa	%xmm5,%xmm6
+	addl	%esi,%edx
+	paddd	%xmm1,%xmm5
+	xorl	%eax,%ebp
+	pxor	%xmm4,%xmm2
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	addl	36(%esp),%ecx
+	andl	%eax,%ebp
+	movdqa	%xmm2,%xmm4
+	movdqa	%xmm5,16(%esp)
+	xorl	%ebx,%eax
+	rorl	$7,%edi
+	movl	%edx,%esi
+	xorl	%eax,%ebp
+	roll	$5,%edx
+	pslld	$2,%xmm2
+	addl	%ebp,%ecx
+	xorl	%edi,%esi
+	psrld	$30,%xmm4
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	addl	40(%esp),%ebx
+	andl	%edi,%esi
+	xorl	%eax,%edi
+	rorl	$7,%edx
+	por	%xmm4,%xmm2
+	movl	%ecx,%ebp
+	xorl	%edi,%esi
+	movdqa	64(%esp),%xmm4
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%edx,%ebp
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	pshufd	$238,%xmm1,%xmm5
+	addl	44(%esp),%eax
+	andl	%edx,%ebp
+	xorl	%edi,%edx
+	rorl	$7,%ecx
+	movl	%ebx,%esi
+	xorl	%edx,%ebp
+	roll	$5,%ebx
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	addl	48(%esp),%edi
+	pxor	%xmm7,%xmm3
+	punpcklqdq	%xmm2,%xmm5
+	xorl	%ecx,%esi
+	movl	%eax,%ebp
+	roll	$5,%eax
+	pxor	%xmm4,%xmm3
+	movdqa	%xmm7,64(%esp)
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	movdqa	%xmm6,%xmm7
+	rorl	$7,%ebx
+	paddd	%xmm2,%xmm6
+	addl	%eax,%edi
+	pxor	%xmm5,%xmm3
+	addl	52(%esp),%edx
+	xorl	%ebx,%ebp
+	movl	%edi,%esi
+	roll	$5,%edi
+	movdqa	%xmm3,%xmm5
+	movdqa	%xmm6,32(%esp)
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
+	addl	%edi,%edx
+	pslld	$2,%xmm3
+	addl	56(%esp),%ecx
+	xorl	%eax,%esi
+	psrld	$30,%xmm5
+	movl	%edx,%ebp
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	rorl	$7,%edi
+	addl	%edx,%ecx
+	por	%xmm5,%xmm3
+	addl	60(%esp),%ebx
+	xorl	%edi,%ebp
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	(%esp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	rorl	$7,%ecx
+	paddd	%xmm3,%xmm7
+	addl	%ebx,%eax
+	addl	4(%esp),%edi
+	xorl	%ecx,%ebp
+	movl	%eax,%esi
+	movdqa	%xmm7,48(%esp)
+	roll	$5,%eax
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	addl	%eax,%edi
+	addl	8(%esp),%edx
+	xorl	%ebx,%esi
+	movl	%edi,%ebp
+	roll	$5,%edi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	rorl	$7,%eax
+	addl	%edi,%edx
+	addl	12(%esp),%ecx
+	xorl	%eax,%ebp
+	movl	%edx,%esi
+	roll	$5,%edx
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%edi
+	addl	%edx,%ecx
+	movl	196(%esp),%ebp
+	cmpl	200(%esp),%ebp
+	je	.L003done
+	movdqa	160(%esp),%xmm7
+	movdqa	176(%esp),%xmm6
+	movdqu	(%ebp),%xmm0
+	movdqu	16(%ebp),%xmm1
+	movdqu	32(%ebp),%xmm2
+	movdqu	48(%ebp),%xmm3
+	addl	$64,%ebp
+.byte	102,15,56,0,198
+	movl	%ebp,196(%esp)
+	movdqa	%xmm7,96(%esp)
+	addl	16(%esp),%ebx
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	rorl	$7,%edx
+.byte	102,15,56,0,206
+	addl	%ecx,%ebx
+	addl	20(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	paddd	%xmm7,%xmm0
+	roll	$5,%ebx
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
+	movdqa	%xmm0,(%esp)
+	addl	%ebx,%eax
+	addl	24(%esp),%edi
+	xorl	%ecx,%esi
+	movl	%eax,%ebp
+	psubd	%xmm7,%xmm0
+	roll	$5,%eax
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	rorl	$7,%ebx
+	addl	%eax,%edi
+	addl	28(%esp),%edx
+	xorl	%ebx,%ebp
+	movl	%edi,%esi
+	roll	$5,%edi
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
+	addl	%edi,%edx
+	addl	32(%esp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%ebp
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	rorl	$7,%edi
+.byte	102,15,56,0,214
+	addl	%edx,%ecx
+	addl	36(%esp),%ebx
+	xorl	%edi,%ebp
+	movl	%ecx,%esi
+	paddd	%xmm7,%xmm1
+	roll	$5,%ecx
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	rorl	$7,%edx
+	movdqa	%xmm1,16(%esp)
+	addl	%ecx,%ebx
+	addl	40(%esp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%ebp
+	psubd	%xmm7,%xmm1
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	44(%esp),%edi
+	xorl	%ecx,%ebp
+	movl	%eax,%esi
+	roll	$5,%eax
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	addl	%eax,%edi
+	addl	48(%esp),%edx
+	xorl	%ebx,%esi
+	movl	%edi,%ebp
+	roll	$5,%edi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	rorl	$7,%eax
+.byte	102,15,56,0,222
+	addl	%edi,%edx
+	addl	52(%esp),%ecx
+	xorl	%eax,%ebp
+	movl	%edx,%esi
+	paddd	%xmm7,%xmm2
+	roll	$5,%edx
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%edi
+	movdqa	%xmm2,32(%esp)
+	addl	%edx,%ecx
+	addl	56(%esp),%ebx
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	psubd	%xmm7,%xmm2
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	60(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	addl	%ebp,%eax
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	movl	192(%esp),%ebp
+	addl	(%ebp),%eax
+	addl	4(%ebp),%esi
+	addl	8(%ebp),%ecx
+	movl	%eax,(%ebp)
+	addl	12(%ebp),%edx
+	movl	%esi,4(%ebp)
+	addl	16(%ebp),%edi
+	movl	%ecx,8(%ebp)
+	movl	%ecx,%ebx
+	movl	%edx,12(%ebp)
+	xorl	%edx,%ebx
+	movl	%edi,16(%ebp)
+	movl	%esi,%ebp
+	pshufd	$238,%xmm0,%xmm4
+	andl	%ebx,%esi
+	movl	%ebp,%ebx
+	jmp	.L002loop
+.align	16
+.L003done:
+	addl	16(%esp),%ebx
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	20(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	24(%esp),%edi
+	xorl	%ecx,%esi
+	movl	%eax,%ebp
+	roll	$5,%eax
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	rorl	$7,%ebx
+	addl	%eax,%edi
+	addl	28(%esp),%edx
+	xorl	%ebx,%ebp
+	movl	%edi,%esi
+	roll	$5,%edi
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
+	addl	%edi,%edx
+	addl	32(%esp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%ebp
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	rorl	$7,%edi
+	addl	%edx,%ecx
+	addl	36(%esp),%ebx
+	xorl	%edi,%ebp
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	40(%esp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	44(%esp),%edi
+	xorl	%ecx,%ebp
+	movl	%eax,%esi
+	roll	$5,%eax
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	addl	%eax,%edi
+	addl	48(%esp),%edx
+	xorl	%ebx,%esi
+	movl	%edi,%ebp
+	roll	$5,%edi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	rorl	$7,%eax
+	addl	%edi,%edx
+	addl	52(%esp),%ecx
+	xorl	%eax,%ebp
+	movl	%edx,%esi
+	roll	$5,%edx
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%edi
+	addl	%edx,%ecx
+	addl	56(%esp),%ebx
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	60(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	addl	%ebp,%eax
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	movl	192(%esp),%ebp
+	addl	(%ebp),%eax
+	movl	204(%esp),%esp
+	addl	4(%ebp),%esi
+	addl	8(%ebp),%ecx
+	movl	%eax,(%ebp)
+	addl	12(%ebp),%edx
+	movl	%esi,4(%ebp)
+	addl	16(%ebp),%edi
+	movl	%ecx,8(%ebp)
+	movl	%edx,12(%ebp)
+	movl	%edi,16(%ebp)
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	sha1_block_data_order_ssse3,.-.L_sha1_block_data_order_ssse3_begin
+.globl	sha1_block_data_order_avx
+.hidden	sha1_block_data_order_avx
+.type	sha1_block_data_order_avx,@function
+.align	16
+sha1_block_data_order_avx:
+.L_sha1_block_data_order_avx_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	call	.L004pic_point
+.L004pic_point:
+	popl	%ebp
+	leal	.LK_XX_XX-.L004pic_point(%ebp),%ebp
+	vzeroall
+	vmovdqa	(%ebp),%xmm7
+	vmovdqa	16(%ebp),%xmm0
+	vmovdqa	32(%ebp),%xmm1
+	vmovdqa	48(%ebp),%xmm2
+	vmovdqa	64(%ebp),%xmm6
+	movl	20(%esp),%edi
+	movl	24(%esp),%ebp
+	movl	28(%esp),%edx
+	movl	%esp,%esi
+	subl	$208,%esp
+	andl	$-64,%esp
+	vmovdqa	%xmm0,112(%esp)
+	vmovdqa	%xmm1,128(%esp)
+	vmovdqa	%xmm2,144(%esp)
+	shll	$6,%edx
+	vmovdqa	%xmm7,160(%esp)
+	addl	%ebp,%edx
+	vmovdqa	%xmm6,176(%esp)
+	addl	$64,%ebp
+	movl	%edi,192(%esp)
+	movl	%ebp,196(%esp)
+	movl	%edx,200(%esp)
+	movl	%esi,204(%esp)
+	movl	(%edi),%eax
+	movl	4(%edi),%ebx
+	movl	8(%edi),%ecx
+	movl	12(%edi),%edx
+	movl	16(%edi),%edi
+	movl	%ebx,%esi
+	vmovdqu	-64(%ebp),%xmm0
+	vmovdqu	-48(%ebp),%xmm1
+	vmovdqu	-32(%ebp),%xmm2
+	vmovdqu	-16(%ebp),%xmm3
+	vpshufb	%xmm6,%xmm0,%xmm0
+	vpshufb	%xmm6,%xmm1,%xmm1
+	vpshufb	%xmm6,%xmm2,%xmm2
+	vmovdqa	%xmm7,96(%esp)
+	vpshufb	%xmm6,%xmm3,%xmm3
+	vpaddd	%xmm7,%xmm0,%xmm4
+	vpaddd	%xmm7,%xmm1,%xmm5
+	vpaddd	%xmm7,%xmm2,%xmm6
+	vmovdqa	%xmm4,(%esp)
+	movl	%ecx,%ebp
+	vmovdqa	%xmm5,16(%esp)
+	xorl	%edx,%ebp
+	vmovdqa	%xmm6,32(%esp)
+	andl	%ebp,%esi
+	jmp	.L005loop
+.align	16
+.L005loop:
+	shrdl	$2,%ebx,%ebx
+	xorl	%edx,%esi
+	vpalignr	$8,%xmm0,%xmm1,%xmm4
+	movl	%eax,%ebp
+	addl	(%esp),%edi
+	vpaddd	%xmm3,%xmm7,%xmm7
+	vmovdqa	%xmm0,64(%esp)
+	xorl	%ecx,%ebx
+	shldl	$5,%eax,%eax
+	vpsrldq	$4,%xmm3,%xmm6
+	addl	%esi,%edi
+	andl	%ebx,%ebp
+	vpxor	%xmm0,%xmm4,%xmm4
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	vpxor	%xmm2,%xmm6,%xmm6
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%ebp
+	vmovdqa	%xmm7,48(%esp)
+	movl	%edi,%esi
+	addl	4(%esp),%edx
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%ebx,%eax
+	shldl	$5,%edi,%edi
+	addl	%ebp,%edx
+	andl	%eax,%esi
+	vpsrld	$31,%xmm4,%xmm6
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	shrdl	$7,%edi,%edi
+	xorl	%ebx,%esi
+	vpslldq	$12,%xmm4,%xmm0
+	vpaddd	%xmm4,%xmm4,%xmm4
+	movl	%edx,%ebp
+	addl	8(%esp),%ecx
+	xorl	%eax,%edi
+	shldl	$5,%edx,%edx
+	vpsrld	$30,%xmm0,%xmm7
+	vpor	%xmm6,%xmm4,%xmm4
+	addl	%esi,%ecx
+	andl	%edi,%ebp
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	vpslld	$2,%xmm0,%xmm0
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%ebp
+	vpxor	%xmm7,%xmm4,%xmm4
+	movl	%ecx,%esi
+	addl	12(%esp),%ebx
+	xorl	%edi,%edx
+	shldl	$5,%ecx,%ecx
+	vpxor	%xmm0,%xmm4,%xmm4
+	addl	%ebp,%ebx
+	andl	%edx,%esi
+	vmovdqa	96(%esp),%xmm0
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	shrdl	$7,%ecx,%ecx
+	xorl	%edi,%esi
+	vpalignr	$8,%xmm1,%xmm2,%xmm5
+	movl	%ebx,%ebp
+	addl	16(%esp),%eax
+	vpaddd	%xmm4,%xmm0,%xmm0
+	vmovdqa	%xmm1,80(%esp)
+	xorl	%edx,%ecx
+	shldl	$5,%ebx,%ebx
+	vpsrldq	$4,%xmm4,%xmm7
+	addl	%esi,%eax
+	andl	%ecx,%ebp
+	vpxor	%xmm1,%xmm5,%xmm5
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	vpxor	%xmm3,%xmm7,%xmm7
+	shrdl	$7,%ebx,%ebx
+	xorl	%edx,%ebp
+	vmovdqa	%xmm0,(%esp)
+	movl	%eax,%esi
+	addl	20(%esp),%edi
+	vpxor	%xmm7,%xmm5,%xmm5
+	xorl	%ecx,%ebx
+	shldl	$5,%eax,%eax
+	addl	%ebp,%edi
+	andl	%ebx,%esi
+	vpsrld	$31,%xmm5,%xmm7
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%esi
+	vpslldq	$12,%xmm5,%xmm1
+	vpaddd	%xmm5,%xmm5,%xmm5
+	movl	%edi,%ebp
+	addl	24(%esp),%edx
+	xorl	%ebx,%eax
+	shldl	$5,%edi,%edi
+	vpsrld	$30,%xmm1,%xmm0
+	vpor	%xmm7,%xmm5,%xmm5
+	addl	%esi,%edx
+	andl	%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	vpslld	$2,%xmm1,%xmm1
+	shrdl	$7,%edi,%edi
+	xorl	%ebx,%ebp
+	vpxor	%xmm0,%xmm5,%xmm5
+	movl	%edx,%esi
+	addl	28(%esp),%ecx
+	xorl	%eax,%edi
+	shldl	$5,%edx,%edx
+	vpxor	%xmm1,%xmm5,%xmm5
+	addl	%ebp,%ecx
+	andl	%edi,%esi
+	vmovdqa	112(%esp),%xmm1
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%esi
+	vpalignr	$8,%xmm2,%xmm3,%xmm6
+	movl	%ecx,%ebp
+	addl	32(%esp),%ebx
+	vpaddd	%xmm5,%xmm1,%xmm1
+	vmovdqa	%xmm2,96(%esp)
+	xorl	%edi,%edx
+	shldl	$5,%ecx,%ecx
+	vpsrldq	$4,%xmm5,%xmm0
+	addl	%esi,%ebx
+	andl	%edx,%ebp
+	vpxor	%xmm2,%xmm6,%xmm6
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	vpxor	%xmm4,%xmm0,%xmm0
+	shrdl	$7,%ecx,%ecx
+	xorl	%edi,%ebp
+	vmovdqa	%xmm1,16(%esp)
+	movl	%ebx,%esi
+	addl	36(%esp),%eax
+	vpxor	%xmm0,%xmm6,%xmm6
+	xorl	%edx,%ecx
+	shldl	$5,%ebx,%ebx
+	addl	%ebp,%eax
+	andl	%ecx,%esi
+	vpsrld	$31,%xmm6,%xmm0
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	shrdl	$7,%ebx,%ebx
+	xorl	%edx,%esi
+	vpslldq	$12,%xmm6,%xmm2
+	vpaddd	%xmm6,%xmm6,%xmm6
+	movl	%eax,%ebp
+	addl	40(%esp),%edi
+	xorl	%ecx,%ebx
+	shldl	$5,%eax,%eax
+	vpsrld	$30,%xmm2,%xmm1
+	vpor	%xmm0,%xmm6,%xmm6
+	addl	%esi,%edi
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	vpslld	$2,%xmm2,%xmm2
+	vmovdqa	64(%esp),%xmm0
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%ebp
+	vpxor	%xmm1,%xmm6,%xmm6
+	movl	%edi,%esi
+	addl	44(%esp),%edx
+	xorl	%ebx,%eax
+	shldl	$5,%edi,%edi
+	vpxor	%xmm2,%xmm6,%xmm6
+	addl	%ebp,%edx
+	andl	%eax,%esi
+	vmovdqa	112(%esp),%xmm2
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	shrdl	$7,%edi,%edi
+	xorl	%ebx,%esi
+	vpalignr	$8,%xmm3,%xmm4,%xmm7
+	movl	%edx,%ebp
+	addl	48(%esp),%ecx
+	vpaddd	%xmm6,%xmm2,%xmm2
+	vmovdqa	%xmm3,64(%esp)
+	xorl	%eax,%edi
+	shldl	$5,%edx,%edx
+	vpsrldq	$4,%xmm6,%xmm1
+	addl	%esi,%ecx
+	andl	%edi,%ebp
+	vpxor	%xmm3,%xmm7,%xmm7
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	vpxor	%xmm5,%xmm1,%xmm1
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%ebp
+	vmovdqa	%xmm2,32(%esp)
+	movl	%ecx,%esi
+	addl	52(%esp),%ebx
+	vpxor	%xmm1,%xmm7,%xmm7
+	xorl	%edi,%edx
+	shldl	$5,%ecx,%ecx
+	addl	%ebp,%ebx
+	andl	%edx,%esi
+	vpsrld	$31,%xmm7,%xmm1
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	shrdl	$7,%ecx,%ecx
+	xorl	%edi,%esi
+	vpslldq	$12,%xmm7,%xmm3
+	vpaddd	%xmm7,%xmm7,%xmm7
+	movl	%ebx,%ebp
+	addl	56(%esp),%eax
+	xorl	%edx,%ecx
+	shldl	$5,%ebx,%ebx
+	vpsrld	$30,%xmm3,%xmm2
+	vpor	%xmm1,%xmm7,%xmm7
+	addl	%esi,%eax
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	vpslld	$2,%xmm3,%xmm3
+	vmovdqa	80(%esp),%xmm1
+	shrdl	$7,%ebx,%ebx
+	xorl	%edx,%ebp
+	vpxor	%xmm2,%xmm7,%xmm7
+	movl	%eax,%esi
+	addl	60(%esp),%edi
+	xorl	%ecx,%ebx
+	shldl	$5,%eax,%eax
+	vpxor	%xmm3,%xmm7,%xmm7
+	addl	%ebp,%edi
+	andl	%ebx,%esi
+	vmovdqa	112(%esp),%xmm3
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	vpalignr	$8,%xmm6,%xmm7,%xmm2
+	vpxor	%xmm4,%xmm0,%xmm0
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%esi
+	movl	%edi,%ebp
+	addl	(%esp),%edx
+	vpxor	%xmm1,%xmm0,%xmm0
+	vmovdqa	%xmm4,80(%esp)
+	xorl	%ebx,%eax
+	shldl	$5,%edi,%edi
+	vmovdqa	%xmm3,%xmm4
+	vpaddd	%xmm7,%xmm3,%xmm3
+	addl	%esi,%edx
+	andl	%eax,%ebp
+	vpxor	%xmm2,%xmm0,%xmm0
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	shrdl	$7,%edi,%edi
+	xorl	%ebx,%ebp
+	vpsrld	$30,%xmm0,%xmm2
+	vmovdqa	%xmm3,48(%esp)
+	movl	%edx,%esi
+	addl	4(%esp),%ecx
+	xorl	%eax,%edi
+	shldl	$5,%edx,%edx
+	vpslld	$2,%xmm0,%xmm0
+	addl	%ebp,%ecx
+	andl	%edi,%esi
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%esi
+	movl	%ecx,%ebp
+	addl	8(%esp),%ebx
+	vpor	%xmm2,%xmm0,%xmm0
+	xorl	%edi,%edx
+	shldl	$5,%ecx,%ecx
+	vmovdqa	96(%esp),%xmm2
+	addl	%esi,%ebx
+	andl	%edx,%ebp
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	addl	12(%esp),%eax
+	xorl	%edi,%ebp
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vpalignr	$8,%xmm7,%xmm0,%xmm3
+	vpxor	%xmm5,%xmm1,%xmm1
+	addl	16(%esp),%edi
+	xorl	%ecx,%esi
+	movl	%eax,%ebp
+	shldl	$5,%eax,%eax
+	vpxor	%xmm2,%xmm1,%xmm1
+	vmovdqa	%xmm5,96(%esp)
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	vmovdqa	%xmm4,%xmm5
+	vpaddd	%xmm0,%xmm4,%xmm4
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	vpxor	%xmm3,%xmm1,%xmm1
+	addl	20(%esp),%edx
+	xorl	%ebx,%ebp
+	movl	%edi,%esi
+	shldl	$5,%edi,%edi
+	vpsrld	$30,%xmm1,%xmm3
+	vmovdqa	%xmm4,(%esp)
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	vpslld	$2,%xmm1,%xmm1
+	addl	24(%esp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%ebp
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	vpor	%xmm3,%xmm1,%xmm1
+	addl	28(%esp),%ebx
+	xorl	%edi,%ebp
+	vmovdqa	64(%esp),%xmm3
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vpalignr	$8,%xmm0,%xmm1,%xmm4
+	vpxor	%xmm6,%xmm2,%xmm2
+	addl	32(%esp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%ebp
+	shldl	$5,%ebx,%ebx
+	vpxor	%xmm3,%xmm2,%xmm2
+	vmovdqa	%xmm6,64(%esp)
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	vmovdqa	128(%esp),%xmm6
+	vpaddd	%xmm1,%xmm5,%xmm5
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vpxor	%xmm4,%xmm2,%xmm2
+	addl	36(%esp),%edi
+	xorl	%ecx,%ebp
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	vpsrld	$30,%xmm2,%xmm4
+	vmovdqa	%xmm5,16(%esp)
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	vpslld	$2,%xmm2,%xmm2
+	addl	40(%esp),%edx
+	xorl	%ebx,%esi
+	movl	%edi,%ebp
+	shldl	$5,%edi,%edi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	vpor	%xmm4,%xmm2,%xmm2
+	addl	44(%esp),%ecx
+	xorl	%eax,%ebp
+	vmovdqa	80(%esp),%xmm4
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	vpalignr	$8,%xmm1,%xmm2,%xmm5
+	vpxor	%xmm7,%xmm3,%xmm3
+	addl	48(%esp),%ebx
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	shldl	$5,%ecx,%ecx
+	vpxor	%xmm4,%xmm3,%xmm3
+	vmovdqa	%xmm7,80(%esp)
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	vmovdqa	%xmm6,%xmm7
+	vpaddd	%xmm2,%xmm6,%xmm6
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vpxor	%xmm5,%xmm3,%xmm3
+	addl	52(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	vpsrld	$30,%xmm3,%xmm5
+	vmovdqa	%xmm6,32(%esp)
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vpslld	$2,%xmm3,%xmm3
+	addl	56(%esp),%edi
+	xorl	%ecx,%esi
+	movl	%eax,%ebp
+	shldl	$5,%eax,%eax
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	vpor	%xmm5,%xmm3,%xmm3
+	addl	60(%esp),%edx
+	xorl	%ebx,%ebp
+	vmovdqa	96(%esp),%xmm5
+	movl	%edi,%esi
+	shldl	$5,%edi,%edi
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	vpalignr	$8,%xmm2,%xmm3,%xmm6
+	vpxor	%xmm0,%xmm4,%xmm4
+	addl	(%esp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%ebp
+	shldl	$5,%edx,%edx
+	vpxor	%xmm5,%xmm4,%xmm4
+	vmovdqa	%xmm0,96(%esp)
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	vmovdqa	%xmm7,%xmm0
+	vpaddd	%xmm3,%xmm7,%xmm7
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	vpxor	%xmm6,%xmm4,%xmm4
+	addl	4(%esp),%ebx
+	xorl	%edi,%ebp
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	vpsrld	$30,%xmm4,%xmm6
+	vmovdqa	%xmm7,48(%esp)
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vpslld	$2,%xmm4,%xmm4
+	addl	8(%esp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%ebp
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vpor	%xmm6,%xmm4,%xmm4
+	addl	12(%esp),%edi
+	xorl	%ecx,%ebp
+	vmovdqa	64(%esp),%xmm6
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	vpalignr	$8,%xmm3,%xmm4,%xmm7
+	vpxor	%xmm1,%xmm5,%xmm5
+	addl	16(%esp),%edx
+	xorl	%ebx,%esi
+	movl	%edi,%ebp
+	shldl	$5,%edi,%edi
+	vpxor	%xmm6,%xmm5,%xmm5
+	vmovdqa	%xmm1,64(%esp)
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	vmovdqa	%xmm0,%xmm1
+	vpaddd	%xmm4,%xmm0,%xmm0
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	vpxor	%xmm7,%xmm5,%xmm5
+	addl	20(%esp),%ecx
+	xorl	%eax,%ebp
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	vpsrld	$30,%xmm5,%xmm7
+	vmovdqa	%xmm0,(%esp)
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	vpslld	$2,%xmm5,%xmm5
+	addl	24(%esp),%ebx
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vpor	%xmm7,%xmm5,%xmm5
+	addl	28(%esp),%eax
+	vmovdqa	80(%esp),%xmm7
+	shrdl	$7,%ecx,%ecx
+	movl	%ebx,%esi
+	xorl	%edx,%ebp
+	shldl	$5,%ebx,%ebx
+	addl	%ebp,%eax
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	vpalignr	$8,%xmm4,%xmm5,%xmm0
+	vpxor	%xmm2,%xmm6,%xmm6
+	addl	32(%esp),%edi
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	vpxor	%xmm7,%xmm6,%xmm6
+	vmovdqa	%xmm2,80(%esp)
+	movl	%eax,%ebp
+	xorl	%ecx,%esi
+	vmovdqa	%xmm1,%xmm2
+	vpaddd	%xmm5,%xmm1,%xmm1
+	shldl	$5,%eax,%eax
+	addl	%esi,%edi
+	vpxor	%xmm0,%xmm6,%xmm6
+	xorl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	addl	36(%esp),%edx
+	vpsrld	$30,%xmm6,%xmm0
+	vmovdqa	%xmm1,16(%esp)
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	movl	%edi,%esi
+	vpslld	$2,%xmm6,%xmm6
+	xorl	%ebx,%ebp
+	shldl	$5,%edi,%edi
+	addl	%ebp,%edx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	addl	40(%esp),%ecx
+	andl	%eax,%esi
+	vpor	%xmm0,%xmm6,%xmm6
+	xorl	%ebx,%eax
+	shrdl	$7,%edi,%edi
+	vmovdqa	96(%esp),%xmm0
+	movl	%edx,%ebp
+	xorl	%eax,%esi
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%edi,%ebp
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	addl	44(%esp),%ebx
+	andl	%edi,%ebp
+	xorl	%eax,%edi
+	shrdl	$7,%edx,%edx
+	movl	%ecx,%esi
+	xorl	%edi,%ebp
+	shldl	$5,%ecx,%ecx
+	addl	%ebp,%ebx
+	xorl	%edx,%esi
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	vpalignr	$8,%xmm5,%xmm6,%xmm1
+	vpxor	%xmm3,%xmm7,%xmm7
+	addl	48(%esp),%eax
+	andl	%edx,%esi
+	xorl	%edi,%edx
+	shrdl	$7,%ecx,%ecx
+	vpxor	%xmm0,%xmm7,%xmm7
+	vmovdqa	%xmm3,96(%esp)
+	movl	%ebx,%ebp
+	xorl	%edx,%esi
+	vmovdqa	144(%esp),%xmm3
+	vpaddd	%xmm6,%xmm2,%xmm2
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	vpxor	%xmm1,%xmm7,%xmm7
+	xorl	%ecx,%ebp
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	52(%esp),%edi
+	vpsrld	$30,%xmm7,%xmm1
+	vmovdqa	%xmm2,32(%esp)
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	movl	%eax,%esi
+	vpslld	$2,%xmm7,%xmm7
+	xorl	%ecx,%ebp
+	shldl	$5,%eax,%eax
+	addl	%ebp,%edi
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	addl	56(%esp),%edx
+	andl	%ebx,%esi
+	vpor	%xmm1,%xmm7,%xmm7
+	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	vmovdqa	64(%esp),%xmm1
+	movl	%edi,%ebp
+	xorl	%ebx,%esi
+	shldl	$5,%edi,%edi
+	addl	%esi,%edx
+	xorl	%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	addl	60(%esp),%ecx
+	andl	%eax,%ebp
+	xorl	%ebx,%eax
+	shrdl	$7,%edi,%edi
+	movl	%edx,%esi
+	xorl	%eax,%ebp
+	shldl	$5,%edx,%edx
+	addl	%ebp,%ecx
+	xorl	%edi,%esi
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	vpalignr	$8,%xmm6,%xmm7,%xmm2
+	vpxor	%xmm4,%xmm0,%xmm0
+	addl	(%esp),%ebx
+	andl	%edi,%esi
+	xorl	%eax,%edi
+	shrdl	$7,%edx,%edx
+	vpxor	%xmm1,%xmm0,%xmm0
+	vmovdqa	%xmm4,64(%esp)
+	movl	%ecx,%ebp
+	xorl	%edi,%esi
+	vmovdqa	%xmm3,%xmm4
+	vpaddd	%xmm7,%xmm3,%xmm3
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	vpxor	%xmm2,%xmm0,%xmm0
+	xorl	%edx,%ebp
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	addl	4(%esp),%eax
+	vpsrld	$30,%xmm0,%xmm2
+	vmovdqa	%xmm3,48(%esp)
+	andl	%edx,%ebp
+	xorl	%edi,%edx
+	shrdl	$7,%ecx,%ecx
+	movl	%ebx,%esi
+	vpslld	$2,%xmm0,%xmm0
+	xorl	%edx,%ebp
+	shldl	$5,%ebx,%ebx
+	addl	%ebp,%eax
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	8(%esp),%edi
+	andl	%ecx,%esi
+	vpor	%xmm2,%xmm0,%xmm0
+	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	vmovdqa	80(%esp),%xmm2
+	movl	%eax,%ebp
+	xorl	%ecx,%esi
+	shldl	$5,%eax,%eax
+	addl	%esi,%edi
+	xorl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	addl	12(%esp),%edx
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	movl	%edi,%esi
+	xorl	%ebx,%ebp
+	shldl	$5,%edi,%edi
+	addl	%ebp,%edx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	vpalignr	$8,%xmm7,%xmm0,%xmm3
+	vpxor	%xmm5,%xmm1,%xmm1
+	addl	16(%esp),%ecx
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	shrdl	$7,%edi,%edi
+	vpxor	%xmm2,%xmm1,%xmm1
+	vmovdqa	%xmm5,80(%esp)
+	movl	%edx,%ebp
+	xorl	%eax,%esi
+	vmovdqa	%xmm4,%xmm5
+	vpaddd	%xmm0,%xmm4,%xmm4
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	vpxor	%xmm3,%xmm1,%xmm1
+	xorl	%edi,%ebp
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	addl	20(%esp),%ebx
+	vpsrld	$30,%xmm1,%xmm3
+	vmovdqa	%xmm4,(%esp)
+	andl	%edi,%ebp
+	xorl	%eax,%edi
+	shrdl	$7,%edx,%edx
+	movl	%ecx,%esi
+	vpslld	$2,%xmm1,%xmm1
+	xorl	%edi,%ebp
+	shldl	$5,%ecx,%ecx
+	addl	%ebp,%ebx
+	xorl	%edx,%esi
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	addl	24(%esp),%eax
+	andl	%edx,%esi
+	vpor	%xmm3,%xmm1,%xmm1
+	xorl	%edi,%edx
+	shrdl	$7,%ecx,%ecx
+	vmovdqa	96(%esp),%xmm3
+	movl	%ebx,%ebp
+	xorl	%edx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%ecx,%ebp
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	28(%esp),%edi
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	movl	%eax,%esi
+	xorl	%ecx,%ebp
+	shldl	$5,%eax,%eax
+	addl	%ebp,%edi
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	vpalignr	$8,%xmm0,%xmm1,%xmm4
+	vpxor	%xmm6,%xmm2,%xmm2
+	addl	32(%esp),%edx
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	vpxor	%xmm3,%xmm2,%xmm2
+	vmovdqa	%xmm6,96(%esp)
+	movl	%edi,%ebp
+	xorl	%ebx,%esi
+	vmovdqa	%xmm5,%xmm6
+	vpaddd	%xmm1,%xmm5,%xmm5
+	shldl	$5,%edi,%edi
+	addl	%esi,%edx
+	vpxor	%xmm4,%xmm2,%xmm2
+	xorl	%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	addl	36(%esp),%ecx
+	vpsrld	$30,%xmm2,%xmm4
+	vmovdqa	%xmm5,16(%esp)
+	andl	%eax,%ebp
+	xorl	%ebx,%eax
+	shrdl	$7,%edi,%edi
+	movl	%edx,%esi
+	vpslld	$2,%xmm2,%xmm2
+	xorl	%eax,%ebp
+	shldl	$5,%edx,%edx
+	addl	%ebp,%ecx
+	xorl	%edi,%esi
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	addl	40(%esp),%ebx
+	andl	%edi,%esi
+	vpor	%xmm4,%xmm2,%xmm2
+	xorl	%eax,%edi
+	shrdl	$7,%edx,%edx
+	vmovdqa	64(%esp),%xmm4
+	movl	%ecx,%ebp
+	xorl	%edi,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%edx,%ebp
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	addl	44(%esp),%eax
+	andl	%edx,%ebp
+	xorl	%edi,%edx
+	shrdl	$7,%ecx,%ecx
+	movl	%ebx,%esi
+	xorl	%edx,%ebp
+	shldl	$5,%ebx,%ebx
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	vpalignr	$8,%xmm1,%xmm2,%xmm5
+	vpxor	%xmm7,%xmm3,%xmm3
+	addl	48(%esp),%edi
+	xorl	%ecx,%esi
+	movl	%eax,%ebp
+	shldl	$5,%eax,%eax
+	vpxor	%xmm4,%xmm3,%xmm3
+	vmovdqa	%xmm7,64(%esp)
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	vmovdqa	%xmm6,%xmm7
+	vpaddd	%xmm2,%xmm6,%xmm6
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	vpxor	%xmm5,%xmm3,%xmm3
+	addl	52(%esp),%edx
+	xorl	%ebx,%ebp
+	movl	%edi,%esi
+	shldl	$5,%edi,%edi
+	vpsrld	$30,%xmm3,%xmm5
+	vmovdqa	%xmm6,32(%esp)
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	vpslld	$2,%xmm3,%xmm3
+	addl	56(%esp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%ebp
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	vpor	%xmm5,%xmm3,%xmm3
+	addl	60(%esp),%ebx
+	xorl	%edi,%ebp
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	(%esp),%eax
+	vpaddd	%xmm3,%xmm7,%xmm7
+	xorl	%edx,%esi
+	movl	%ebx,%ebp
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	vmovdqa	%xmm7,48(%esp)
+	xorl	%edx,%ebp
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	4(%esp),%edi
+	xorl	%ecx,%ebp
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	addl	8(%esp),%edx
+	xorl	%ebx,%esi
+	movl	%edi,%ebp
+	shldl	$5,%edi,%edi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	addl	12(%esp),%ecx
+	xorl	%eax,%ebp
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	movl	196(%esp),%ebp
+	cmpl	200(%esp),%ebp
+	je	.L006done
+	vmovdqa	160(%esp),%xmm7
+	vmovdqa	176(%esp),%xmm6
+	vmovdqu	(%ebp),%xmm0
+	vmovdqu	16(%ebp),%xmm1
+	vmovdqu	32(%ebp),%xmm2
+	vmovdqu	48(%ebp),%xmm3
+	addl	$64,%ebp
+	vpshufb	%xmm6,%xmm0,%xmm0
+	movl	%ebp,196(%esp)
+	vmovdqa	%xmm7,96(%esp)
+	addl	16(%esp),%ebx
+	xorl	%edi,%esi
+	vpshufb	%xmm6,%xmm1,%xmm1
+	movl	%ecx,%ebp
+	shldl	$5,%ecx,%ecx
+	vpaddd	%xmm7,%xmm0,%xmm4
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vmovdqa	%xmm4,(%esp)
+	addl	20(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	24(%esp),%edi
+	xorl	%ecx,%esi
+	movl	%eax,%ebp
+	shldl	$5,%eax,%eax
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	addl	28(%esp),%edx
+	xorl	%ebx,%ebp
+	movl	%edi,%esi
+	shldl	$5,%edi,%edi
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	addl	32(%esp),%ecx
+	xorl	%eax,%esi
+	vpshufb	%xmm6,%xmm2,%xmm2
+	movl	%edx,%ebp
+	shldl	$5,%edx,%edx
+	vpaddd	%xmm7,%xmm1,%xmm5
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	vmovdqa	%xmm5,16(%esp)
+	addl	36(%esp),%ebx
+	xorl	%edi,%ebp
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	40(%esp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%ebp
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	44(%esp),%edi
+	xorl	%ecx,%ebp
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	addl	48(%esp),%edx
+	xorl	%ebx,%esi
+	vpshufb	%xmm6,%xmm3,%xmm3
+	movl	%edi,%ebp
+	shldl	$5,%edi,%edi
+	vpaddd	%xmm7,%xmm2,%xmm6
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	vmovdqa	%xmm6,32(%esp)
+	addl	52(%esp),%ecx
+	xorl	%eax,%ebp
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	addl	56(%esp),%ebx
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	60(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%ebp,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	movl	192(%esp),%ebp
+	addl	(%ebp),%eax
+	addl	4(%ebp),%esi
+	addl	8(%ebp),%ecx
+	movl	%eax,(%ebp)
+	addl	12(%ebp),%edx
+	movl	%esi,4(%ebp)
+	addl	16(%ebp),%edi
+	movl	%ecx,%ebx
+	movl	%ecx,8(%ebp)
+	xorl	%edx,%ebx
+	movl	%edx,12(%ebp)
+	movl	%edi,16(%ebp)
+	movl	%esi,%ebp
+	andl	%ebx,%esi
+	movl	%ebp,%ebx
+	jmp	.L005loop
+.align	16
+.L006done:
+	addl	16(%esp),%ebx
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	20(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	24(%esp),%edi
+	xorl	%ecx,%esi
+	movl	%eax,%ebp
+	shldl	$5,%eax,%eax
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	addl	28(%esp),%edx
+	xorl	%ebx,%ebp
+	movl	%edi,%esi
+	shldl	$5,%edi,%edi
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	addl	32(%esp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%ebp
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	addl	36(%esp),%ebx
+	xorl	%edi,%ebp
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	40(%esp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%ebp
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	44(%esp),%edi
+	xorl	%ecx,%ebp
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	addl	48(%esp),%edx
+	xorl	%ebx,%esi
+	movl	%edi,%ebp
+	shldl	$5,%edi,%edi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	addl	52(%esp),%ecx
+	xorl	%eax,%ebp
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	addl	56(%esp),%ebx
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	60(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%ebp,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vzeroall
+	movl	192(%esp),%ebp
+	addl	(%ebp),%eax
+	movl	204(%esp),%esp
+	addl	4(%ebp),%esi
+	addl	8(%ebp),%ecx
+	movl	%eax,(%ebp)
+	addl	12(%ebp),%edx
+	movl	%esi,4(%ebp)
+	addl	16(%ebp),%edi
+	movl	%ecx,8(%ebp)
+	movl	%edx,12(%ebp)
+	movl	%edi,16(%ebp)
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	sha1_block_data_order_avx,.-.L_sha1_block_data_order_avx_begin
+.align	64
+.LK_XX_XX:
+.long	1518500249,1518500249,1518500249,1518500249
+.long	1859775393,1859775393,1859775393,1859775393
+.long	2400959708,2400959708,2400959708,2400959708
+.long	3395469782,3395469782,3395469782,3395469782
+.long	66051,67438087,134810123,202182159
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.byte	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115
+.byte	102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82
+.byte	89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
+.byte	114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
diff --git a/gen/bcm/sha1-586-win.asm b/gen/bcm/sha1-586-win.asm
new file mode 100644
index 0000000..c8823a9
--- /dev/null
+++ b/gen/bcm/sha1-586-win.asm
@@ -0,0 +1,3790 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__, win32
+%ifidn __OUTPUT_FORMAT__,obj
+section	code	use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+$@feat.00 equ 1
+section	.text	code align=64
+%else
+section	.text	code
+%endif
+global	_sha1_block_data_order_nohw
+align	16
+_sha1_block_data_order_nohw:
+L$_sha1_block_data_order_nohw_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	mov	ebp,DWORD [20+esp]
+	mov	esi,DWORD [24+esp]
+	mov	eax,DWORD [28+esp]
+	sub	esp,76
+	shl	eax,6
+	add	eax,esi
+	mov	DWORD [104+esp],eax
+	mov	edi,DWORD [16+ebp]
+	jmp	NEAR L$000loop
+align	16
+L$000loop:
+	mov	eax,DWORD [esi]
+	mov	ebx,DWORD [4+esi]
+	mov	ecx,DWORD [8+esi]
+	mov	edx,DWORD [12+esi]
+	bswap	eax
+	bswap	ebx
+	bswap	ecx
+	bswap	edx
+	mov	DWORD [esp],eax
+	mov	DWORD [4+esp],ebx
+	mov	DWORD [8+esp],ecx
+	mov	DWORD [12+esp],edx
+	mov	eax,DWORD [16+esi]
+	mov	ebx,DWORD [20+esi]
+	mov	ecx,DWORD [24+esi]
+	mov	edx,DWORD [28+esi]
+	bswap	eax
+	bswap	ebx
+	bswap	ecx
+	bswap	edx
+	mov	DWORD [16+esp],eax
+	mov	DWORD [20+esp],ebx
+	mov	DWORD [24+esp],ecx
+	mov	DWORD [28+esp],edx
+	mov	eax,DWORD [32+esi]
+	mov	ebx,DWORD [36+esi]
+	mov	ecx,DWORD [40+esi]
+	mov	edx,DWORD [44+esi]
+	bswap	eax
+	bswap	ebx
+	bswap	ecx
+	bswap	edx
+	mov	DWORD [32+esp],eax
+	mov	DWORD [36+esp],ebx
+	mov	DWORD [40+esp],ecx
+	mov	DWORD [44+esp],edx
+	mov	eax,DWORD [48+esi]
+	mov	ebx,DWORD [52+esi]
+	mov	ecx,DWORD [56+esi]
+	mov	edx,DWORD [60+esi]
+	bswap	eax
+	bswap	ebx
+	bswap	ecx
+	bswap	edx
+	mov	DWORD [48+esp],eax
+	mov	DWORD [52+esp],ebx
+	mov	DWORD [56+esp],ecx
+	mov	DWORD [60+esp],edx
+	mov	DWORD [100+esp],esi
+	mov	eax,DWORD [ebp]
+	mov	ebx,DWORD [4+ebp]
+	mov	ecx,DWORD [8+ebp]
+	mov	edx,DWORD [12+ebp]
+	; 00_15 0
+	mov	esi,ecx
+	mov	ebp,eax
+	rol	ebp,5
+	xor	esi,edx
+	add	ebp,edi
+	mov	edi,DWORD [esp]
+	and	esi,ebx
+	ror	ebx,2
+	xor	esi,edx
+	lea	ebp,[1518500249+edi*1+ebp]
+	add	ebp,esi
+	; 00_15 1
+	mov	edi,ebx
+	mov	esi,ebp
+	rol	ebp,5
+	xor	edi,ecx
+	add	ebp,edx
+	mov	edx,DWORD [4+esp]
+	and	edi,eax
+	ror	eax,2
+	xor	edi,ecx
+	lea	ebp,[1518500249+edx*1+ebp]
+	add	ebp,edi
+	; 00_15 2
+	mov	edx,eax
+	mov	edi,ebp
+	rol	ebp,5
+	xor	edx,ebx
+	add	ebp,ecx
+	mov	ecx,DWORD [8+esp]
+	and	edx,esi
+	ror	esi,2
+	xor	edx,ebx
+	lea	ebp,[1518500249+ecx*1+ebp]
+	add	ebp,edx
+	; 00_15 3
+	mov	ecx,esi
+	mov	edx,ebp
+	rol	ebp,5
+	xor	ecx,eax
+	add	ebp,ebx
+	mov	ebx,DWORD [12+esp]
+	and	ecx,edi
+	ror	edi,2
+	xor	ecx,eax
+	lea	ebp,[1518500249+ebx*1+ebp]
+	add	ebp,ecx
+	; 00_15 4
+	mov	ebx,edi
+	mov	ecx,ebp
+	rol	ebp,5
+	xor	ebx,esi
+	add	ebp,eax
+	mov	eax,DWORD [16+esp]
+	and	ebx,edx
+	ror	edx,2
+	xor	ebx,esi
+	lea	ebp,[1518500249+eax*1+ebp]
+	add	ebp,ebx
+	; 00_15 5
+	mov	eax,edx
+	mov	ebx,ebp
+	rol	ebp,5
+	xor	eax,edi
+	add	ebp,esi
+	mov	esi,DWORD [20+esp]
+	and	eax,ecx
+	ror	ecx,2
+	xor	eax,edi
+	lea	ebp,[1518500249+esi*1+ebp]
+	add	ebp,eax
+	; 00_15 6
+	mov	esi,ecx
+	mov	eax,ebp
+	rol	ebp,5
+	xor	esi,edx
+	add	ebp,edi
+	mov	edi,DWORD [24+esp]
+	and	esi,ebx
+	ror	ebx,2
+	xor	esi,edx
+	lea	ebp,[1518500249+edi*1+ebp]
+	add	ebp,esi
+	; 00_15 7
+	mov	edi,ebx
+	mov	esi,ebp
+	rol	ebp,5
+	xor	edi,ecx
+	add	ebp,edx
+	mov	edx,DWORD [28+esp]
+	and	edi,eax
+	ror	eax,2
+	xor	edi,ecx
+	lea	ebp,[1518500249+edx*1+ebp]
+	add	ebp,edi
+	; 00_15 8
+	mov	edx,eax
+	mov	edi,ebp
+	rol	ebp,5
+	xor	edx,ebx
+	add	ebp,ecx
+	mov	ecx,DWORD [32+esp]
+	and	edx,esi
+	ror	esi,2
+	xor	edx,ebx
+	lea	ebp,[1518500249+ecx*1+ebp]
+	add	ebp,edx
+	; 00_15 9
+	mov	ecx,esi
+	mov	edx,ebp
+	rol	ebp,5
+	xor	ecx,eax
+	add	ebp,ebx
+	mov	ebx,DWORD [36+esp]
+	and	ecx,edi
+	ror	edi,2
+	xor	ecx,eax
+	lea	ebp,[1518500249+ebx*1+ebp]
+	add	ebp,ecx
+	; 00_15 10
+	mov	ebx,edi
+	mov	ecx,ebp
+	rol	ebp,5
+	xor	ebx,esi
+	add	ebp,eax
+	mov	eax,DWORD [40+esp]
+	and	ebx,edx
+	ror	edx,2
+	xor	ebx,esi
+	lea	ebp,[1518500249+eax*1+ebp]
+	add	ebp,ebx
+	; 00_15 11
+	mov	eax,edx
+	mov	ebx,ebp
+	rol	ebp,5
+	xor	eax,edi
+	add	ebp,esi
+	mov	esi,DWORD [44+esp]
+	and	eax,ecx
+	ror	ecx,2
+	xor	eax,edi
+	lea	ebp,[1518500249+esi*1+ebp]
+	add	ebp,eax
+	; 00_15 12
+	mov	esi,ecx
+	mov	eax,ebp
+	rol	ebp,5
+	xor	esi,edx
+	add	ebp,edi
+	mov	edi,DWORD [48+esp]
+	and	esi,ebx
+	ror	ebx,2
+	xor	esi,edx
+	lea	ebp,[1518500249+edi*1+ebp]
+	add	ebp,esi
+	; 00_15 13
+	mov	edi,ebx
+	mov	esi,ebp
+	rol	ebp,5
+	xor	edi,ecx
+	add	ebp,edx
+	mov	edx,DWORD [52+esp]
+	and	edi,eax
+	ror	eax,2
+	xor	edi,ecx
+	lea	ebp,[1518500249+edx*1+ebp]
+	add	ebp,edi
+	; 00_15 14
+	mov	edx,eax
+	mov	edi,ebp
+	rol	ebp,5
+	xor	edx,ebx
+	add	ebp,ecx
+	mov	ecx,DWORD [56+esp]
+	and	edx,esi
+	ror	esi,2
+	xor	edx,ebx
+	lea	ebp,[1518500249+ecx*1+ebp]
+	add	ebp,edx
+	; 00_15 15
+	mov	ecx,esi
+	mov	edx,ebp
+	rol	ebp,5
+	xor	ecx,eax
+	add	ebp,ebx
+	mov	ebx,DWORD [60+esp]
+	and	ecx,edi
+	ror	edi,2
+	xor	ecx,eax
+	lea	ebp,[1518500249+ebx*1+ebp]
+	mov	ebx,DWORD [esp]
+	add	ecx,ebp
+	; 16_19 16
+	mov	ebp,edi
+	xor	ebx,DWORD [8+esp]
+	xor	ebp,esi
+	xor	ebx,DWORD [32+esp]
+	and	ebp,edx
+	xor	ebx,DWORD [52+esp]
+	rol	ebx,1
+	xor	ebp,esi
+	add	eax,ebp
+	mov	ebp,ecx
+	ror	edx,2
+	mov	DWORD [esp],ebx
+	rol	ebp,5
+	lea	ebx,[1518500249+eax*1+ebx]
+	mov	eax,DWORD [4+esp]
+	add	ebx,ebp
+	; 16_19 17
+	mov	ebp,edx
+	xor	eax,DWORD [12+esp]
+	xor	ebp,edi
+	xor	eax,DWORD [36+esp]
+	and	ebp,ecx
+	xor	eax,DWORD [56+esp]
+	rol	eax,1
+	xor	ebp,edi
+	add	esi,ebp
+	mov	ebp,ebx
+	ror	ecx,2
+	mov	DWORD [4+esp],eax
+	rol	ebp,5
+	lea	eax,[1518500249+esi*1+eax]
+	mov	esi,DWORD [8+esp]
+	add	eax,ebp
+	; 16_19 18
+	mov	ebp,ecx
+	xor	esi,DWORD [16+esp]
+	xor	ebp,edx
+	xor	esi,DWORD [40+esp]
+	and	ebp,ebx
+	xor	esi,DWORD [60+esp]
+	rol	esi,1
+	xor	ebp,edx
+	add	edi,ebp
+	mov	ebp,eax
+	ror	ebx,2
+	mov	DWORD [8+esp],esi
+	rol	ebp,5
+	lea	esi,[1518500249+edi*1+esi]
+	mov	edi,DWORD [12+esp]
+	add	esi,ebp
+	; 16_19 19
+	mov	ebp,ebx
+	xor	edi,DWORD [20+esp]
+	xor	ebp,ecx
+	xor	edi,DWORD [44+esp]
+	and	ebp,eax
+	xor	edi,DWORD [esp]
+	rol	edi,1
+	xor	ebp,ecx
+	add	edx,ebp
+	mov	ebp,esi
+	ror	eax,2
+	mov	DWORD [12+esp],edi
+	rol	ebp,5
+	lea	edi,[1518500249+edx*1+edi]
+	mov	edx,DWORD [16+esp]
+	add	edi,ebp
+	; 20_39 20
+	mov	ebp,esi
+	xor	edx,DWORD [24+esp]
+	xor	ebp,eax
+	xor	edx,DWORD [48+esp]
+	xor	ebp,ebx
+	xor	edx,DWORD [4+esp]
+	rol	edx,1
+	add	ecx,ebp
+	ror	esi,2
+	mov	ebp,edi
+	rol	ebp,5
+	mov	DWORD [16+esp],edx
+	lea	edx,[1859775393+ecx*1+edx]
+	mov	ecx,DWORD [20+esp]
+	add	edx,ebp
+	; 20_39 21
+	mov	ebp,edi
+	xor	ecx,DWORD [28+esp]
+	xor	ebp,esi
+	xor	ecx,DWORD [52+esp]
+	xor	ebp,eax
+	xor	ecx,DWORD [8+esp]
+	rol	ecx,1
+	add	ebx,ebp
+	ror	edi,2
+	mov	ebp,edx
+	rol	ebp,5
+	mov	DWORD [20+esp],ecx
+	lea	ecx,[1859775393+ebx*1+ecx]
+	mov	ebx,DWORD [24+esp]
+	add	ecx,ebp
+	; 20_39 22
+	mov	ebp,edx
+	xor	ebx,DWORD [32+esp]
+	xor	ebp,edi
+	xor	ebx,DWORD [56+esp]
+	xor	ebp,esi
+	xor	ebx,DWORD [12+esp]
+	rol	ebx,1
+	add	eax,ebp
+	ror	edx,2
+	mov	ebp,ecx
+	rol	ebp,5
+	mov	DWORD [24+esp],ebx
+	lea	ebx,[1859775393+eax*1+ebx]
+	mov	eax,DWORD [28+esp]
+	add	ebx,ebp
+	; 20_39 23
+	mov	ebp,ecx
+	xor	eax,DWORD [36+esp]
+	xor	ebp,edx
+	xor	eax,DWORD [60+esp]
+	xor	ebp,edi
+	xor	eax,DWORD [16+esp]
+	rol	eax,1
+	add	esi,ebp
+	ror	ecx,2
+	mov	ebp,ebx
+	rol	ebp,5
+	mov	DWORD [28+esp],eax
+	lea	eax,[1859775393+esi*1+eax]
+	mov	esi,DWORD [32+esp]
+	add	eax,ebp
+	; 20_39 24
+	mov	ebp,ebx
+	xor	esi,DWORD [40+esp]
+	xor	ebp,ecx
+	xor	esi,DWORD [esp]
+	xor	ebp,edx
+	xor	esi,DWORD [20+esp]
+	rol	esi,1
+	add	edi,ebp
+	ror	ebx,2
+	mov	ebp,eax
+	rol	ebp,5
+	mov	DWORD [32+esp],esi
+	lea	esi,[1859775393+edi*1+esi]
+	mov	edi,DWORD [36+esp]
+	add	esi,ebp
+	; 20_39 25
+	mov	ebp,eax
+	xor	edi,DWORD [44+esp]
+	xor	ebp,ebx
+	xor	edi,DWORD [4+esp]
+	xor	ebp,ecx
+	xor	edi,DWORD [24+esp]
+	rol	edi,1
+	add	edx,ebp
+	ror	eax,2
+	mov	ebp,esi
+	rol	ebp,5
+	mov	DWORD [36+esp],edi
+	lea	edi,[1859775393+edx*1+edi]
+	mov	edx,DWORD [40+esp]
+	add	edi,ebp
+	; 20_39 26
+	mov	ebp,esi
+	xor	edx,DWORD [48+esp]
+	xor	ebp,eax
+	xor	edx,DWORD [8+esp]
+	xor	ebp,ebx
+	xor	edx,DWORD [28+esp]
+	rol	edx,1
+	add	ecx,ebp
+	ror	esi,2
+	mov	ebp,edi
+	rol	ebp,5
+	mov	DWORD [40+esp],edx
+	lea	edx,[1859775393+ecx*1+edx]
+	mov	ecx,DWORD [44+esp]
+	add	edx,ebp
+	; 20_39 27
+	mov	ebp,edi
+	xor	ecx,DWORD [52+esp]
+	xor	ebp,esi
+	xor	ecx,DWORD [12+esp]
+	xor	ebp,eax
+	xor	ecx,DWORD [32+esp]
+	rol	ecx,1
+	add	ebx,ebp
+	ror	edi,2
+	mov	ebp,edx
+	rol	ebp,5
+	mov	DWORD [44+esp],ecx
+	lea	ecx,[1859775393+ebx*1+ecx]
+	mov	ebx,DWORD [48+esp]
+	add	ecx,ebp
+	; 20_39 28
+	mov	ebp,edx
+	xor	ebx,DWORD [56+esp]
+	xor	ebp,edi
+	xor	ebx,DWORD [16+esp]
+	xor	ebp,esi
+	xor	ebx,DWORD [36+esp]
+	rol	ebx,1
+	add	eax,ebp
+	ror	edx,2
+	mov	ebp,ecx
+	rol	ebp,5
+	mov	DWORD [48+esp],ebx
+	lea	ebx,[1859775393+eax*1+ebx]
+	mov	eax,DWORD [52+esp]
+	add	ebx,ebp
+	; 20_39 29
+	mov	ebp,ecx
+	xor	eax,DWORD [60+esp]
+	xor	ebp,edx
+	xor	eax,DWORD [20+esp]
+	xor	ebp,edi
+	xor	eax,DWORD [40+esp]
+	rol	eax,1
+	add	esi,ebp
+	ror	ecx,2
+	mov	ebp,ebx
+	rol	ebp,5
+	mov	DWORD [52+esp],eax
+	lea	eax,[1859775393+esi*1+eax]
+	mov	esi,DWORD [56+esp]
+	add	eax,ebp
+	; 20_39 30
+	mov	ebp,ebx
+	xor	esi,DWORD [esp]
+	xor	ebp,ecx
+	xor	esi,DWORD [24+esp]
+	xor	ebp,edx
+	xor	esi,DWORD [44+esp]
+	rol	esi,1
+	add	edi,ebp
+	ror	ebx,2
+	mov	ebp,eax
+	rol	ebp,5
+	mov	DWORD [56+esp],esi
+	lea	esi,[1859775393+edi*1+esi]
+	mov	edi,DWORD [60+esp]
+	add	esi,ebp
+	; 20_39 31
+	mov	ebp,eax
+	xor	edi,DWORD [4+esp]
+	xor	ebp,ebx
+	xor	edi,DWORD [28+esp]
+	xor	ebp,ecx
+	xor	edi,DWORD [48+esp]
+	rol	edi,1
+	add	edx,ebp
+	ror	eax,2
+	mov	ebp,esi
+	rol	ebp,5
+	mov	DWORD [60+esp],edi
+	lea	edi,[1859775393+edx*1+edi]
+	mov	edx,DWORD [esp]
+	add	edi,ebp
+	; 20_39 32
+	mov	ebp,esi
+	xor	edx,DWORD [8+esp]
+	xor	ebp,eax
+	xor	edx,DWORD [32+esp]
+	xor	ebp,ebx
+	xor	edx,DWORD [52+esp]
+	rol	edx,1
+	add	ecx,ebp
+	ror	esi,2
+	mov	ebp,edi
+	rol	ebp,5
+	mov	DWORD [esp],edx
+	lea	edx,[1859775393+ecx*1+edx]
+	mov	ecx,DWORD [4+esp]
+	add	edx,ebp
+	; 20_39 33
+	mov	ebp,edi
+	xor	ecx,DWORD [12+esp]
+	xor	ebp,esi
+	xor	ecx,DWORD [36+esp]
+	xor	ebp,eax
+	xor	ecx,DWORD [56+esp]
+	rol	ecx,1
+	add	ebx,ebp
+	ror	edi,2
+	mov	ebp,edx
+	rol	ebp,5
+	mov	DWORD [4+esp],ecx
+	lea	ecx,[1859775393+ebx*1+ecx]
+	mov	ebx,DWORD [8+esp]
+	add	ecx,ebp
+	; 20_39 34
+	mov	ebp,edx
+	xor	ebx,DWORD [16+esp]
+	xor	ebp,edi
+	xor	ebx,DWORD [40+esp]
+	xor	ebp,esi
+	xor	ebx,DWORD [60+esp]
+	rol	ebx,1
+	add	eax,ebp
+	ror	edx,2
+	mov	ebp,ecx
+	rol	ebp,5
+	mov	DWORD [8+esp],ebx
+	lea	ebx,[1859775393+eax*1+ebx]
+	mov	eax,DWORD [12+esp]
+	add	ebx,ebp
+	; 20_39 35
+	mov	ebp,ecx
+	xor	eax,DWORD [20+esp]
+	xor	ebp,edx
+	xor	eax,DWORD [44+esp]
+	xor	ebp,edi
+	xor	eax,DWORD [esp]
+	rol	eax,1
+	add	esi,ebp
+	ror	ecx,2
+	mov	ebp,ebx
+	rol	ebp,5
+	mov	DWORD [12+esp],eax
+	lea	eax,[1859775393+esi*1+eax]
+	mov	esi,DWORD [16+esp]
+	add	eax,ebp
+	; 20_39 36
+	mov	ebp,ebx
+	xor	esi,DWORD [24+esp]
+	xor	ebp,ecx
+	xor	esi,DWORD [48+esp]
+	xor	ebp,edx
+	xor	esi,DWORD [4+esp]
+	rol	esi,1
+	add	edi,ebp
+	ror	ebx,2
+	mov	ebp,eax
+	rol	ebp,5
+	mov	DWORD [16+esp],esi
+	lea	esi,[1859775393+edi*1+esi]
+	mov	edi,DWORD [20+esp]
+	add	esi,ebp
+	; 20_39 37
+	mov	ebp,eax
+	xor	edi,DWORD [28+esp]
+	xor	ebp,ebx
+	xor	edi,DWORD [52+esp]
+	xor	ebp,ecx
+	xor	edi,DWORD [8+esp]
+	rol	edi,1
+	add	edx,ebp
+	ror	eax,2
+	mov	ebp,esi
+	rol	ebp,5
+	mov	DWORD [20+esp],edi
+	lea	edi,[1859775393+edx*1+edi]
+	mov	edx,DWORD [24+esp]
+	add	edi,ebp
+	; 20_39 38
+	mov	ebp,esi
+	xor	edx,DWORD [32+esp]
+	xor	ebp,eax
+	xor	edx,DWORD [56+esp]
+	xor	ebp,ebx
+	xor	edx,DWORD [12+esp]
+	rol	edx,1
+	add	ecx,ebp
+	ror	esi,2
+	mov	ebp,edi
+	rol	ebp,5
+	mov	DWORD [24+esp],edx
+	lea	edx,[1859775393+ecx*1+edx]
+	mov	ecx,DWORD [28+esp]
+	add	edx,ebp
+	; 20_39 39
+	mov	ebp,edi
+	xor	ecx,DWORD [36+esp]
+	xor	ebp,esi
+	xor	ecx,DWORD [60+esp]
+	xor	ebp,eax
+	xor	ecx,DWORD [16+esp]
+	rol	ecx,1
+	add	ebx,ebp
+	ror	edi,2
+	mov	ebp,edx
+	rol	ebp,5
+	mov	DWORD [28+esp],ecx
+	lea	ecx,[1859775393+ebx*1+ecx]
+	mov	ebx,DWORD [32+esp]
+	add	ecx,ebp
+	; 40_59 40
+	mov	ebp,edi
+	xor	ebx,DWORD [40+esp]
+	xor	ebp,esi
+	xor	ebx,DWORD [esp]
+	and	ebp,edx
+	xor	ebx,DWORD [20+esp]
+	rol	ebx,1
+	add	ebp,eax
+	ror	edx,2
+	mov	eax,ecx
+	rol	eax,5
+	mov	DWORD [32+esp],ebx
+	lea	ebx,[2400959708+ebp*1+ebx]
+	mov	ebp,edi
+	add	ebx,eax
+	and	ebp,esi
+	mov	eax,DWORD [36+esp]
+	add	ebx,ebp
+	; 40_59 41
+	mov	ebp,edx
+	xor	eax,DWORD [44+esp]
+	xor	ebp,edi
+	xor	eax,DWORD [4+esp]
+	and	ebp,ecx
+	xor	eax,DWORD [24+esp]
+	rol	eax,1
+	add	ebp,esi
+	ror	ecx,2
+	mov	esi,ebx
+	rol	esi,5
+	mov	DWORD [36+esp],eax
+	lea	eax,[2400959708+ebp*1+eax]
+	mov	ebp,edx
+	add	eax,esi
+	and	ebp,edi
+	mov	esi,DWORD [40+esp]
+	add	eax,ebp
+	; 40_59 42
+	mov	ebp,ecx
+	xor	esi,DWORD [48+esp]
+	xor	ebp,edx
+	xor	esi,DWORD [8+esp]
+	and	ebp,ebx
+	xor	esi,DWORD [28+esp]
+	rol	esi,1
+	add	ebp,edi
+	ror	ebx,2
+	mov	edi,eax
+	rol	edi,5
+	mov	DWORD [40+esp],esi
+	lea	esi,[2400959708+ebp*1+esi]
+	mov	ebp,ecx
+	add	esi,edi
+	and	ebp,edx
+	mov	edi,DWORD [44+esp]
+	add	esi,ebp
+	; 40_59 43
+	mov	ebp,ebx
+	xor	edi,DWORD [52+esp]
+	xor	ebp,ecx
+	xor	edi,DWORD [12+esp]
+	and	ebp,eax
+	xor	edi,DWORD [32+esp]
+	rol	edi,1
+	add	ebp,edx
+	ror	eax,2
+	mov	edx,esi
+	rol	edx,5
+	mov	DWORD [44+esp],edi
+	lea	edi,[2400959708+ebp*1+edi]
+	mov	ebp,ebx
+	add	edi,edx
+	and	ebp,ecx
+	mov	edx,DWORD [48+esp]
+	add	edi,ebp
+	; 40_59 44
+	mov	ebp,eax
+	xor	edx,DWORD [56+esp]
+	xor	ebp,ebx
+	xor	edx,DWORD [16+esp]
+	and	ebp,esi
+	xor	edx,DWORD [36+esp]
+	rol	edx,1
+	add	ebp,ecx
+	ror	esi,2
+	mov	ecx,edi
+	rol	ecx,5
+	mov	DWORD [48+esp],edx
+	lea	edx,[2400959708+ebp*1+edx]
+	mov	ebp,eax
+	add	edx,ecx
+	and	ebp,ebx
+	mov	ecx,DWORD [52+esp]
+	add	edx,ebp
+	; 40_59 45
+	mov	ebp,esi
+	xor	ecx,DWORD [60+esp]
+	xor	ebp,eax
+	xor	ecx,DWORD [20+esp]
+	and	ebp,edi
+	xor	ecx,DWORD [40+esp]
+	rol	ecx,1
+	add	ebp,ebx
+	ror	edi,2
+	mov	ebx,edx
+	rol	ebx,5
+	mov	DWORD [52+esp],ecx
+	lea	ecx,[2400959708+ebp*1+ecx]
+	mov	ebp,esi
+	add	ecx,ebx
+	and	ebp,eax
+	mov	ebx,DWORD [56+esp]
+	add	ecx,ebp
+	; 40_59 46
+	mov	ebp,edi
+	xor	ebx,DWORD [esp]
+	xor	ebp,esi
+	xor	ebx,DWORD [24+esp]
+	and	ebp,edx
+	xor	ebx,DWORD [44+esp]
+	rol	ebx,1
+	add	ebp,eax
+	ror	edx,2
+	mov	eax,ecx
+	rol	eax,5
+	mov	DWORD [56+esp],ebx
+	lea	ebx,[2400959708+ebp*1+ebx]
+	mov	ebp,edi
+	add	ebx,eax
+	and	ebp,esi
+	mov	eax,DWORD [60+esp]
+	add	ebx,ebp
+	; 40_59 47
+	mov	ebp,edx
+	xor	eax,DWORD [4+esp]
+	xor	ebp,edi
+	xor	eax,DWORD [28+esp]
+	and	ebp,ecx
+	xor	eax,DWORD [48+esp]
+	rol	eax,1
+	add	ebp,esi
+	ror	ecx,2
+	mov	esi,ebx
+	rol	esi,5
+	mov	DWORD [60+esp],eax
+	lea	eax,[2400959708+ebp*1+eax]
+	mov	ebp,edx
+	add	eax,esi
+	and	ebp,edi
+	mov	esi,DWORD [esp]
+	add	eax,ebp
+	; 40_59 48
+	mov	ebp,ecx
+	xor	esi,DWORD [8+esp]
+	xor	ebp,edx
+	xor	esi,DWORD [32+esp]
+	and	ebp,ebx
+	xor	esi,DWORD [52+esp]
+	rol	esi,1
+	add	ebp,edi
+	ror	ebx,2
+	mov	edi,eax
+	rol	edi,5
+	mov	DWORD [esp],esi
+	lea	esi,[2400959708+ebp*1+esi]
+	mov	ebp,ecx
+	add	esi,edi
+	and	ebp,edx
+	mov	edi,DWORD [4+esp]
+	add	esi,ebp
+	; 40_59 49
+	mov	ebp,ebx
+	xor	edi,DWORD [12+esp]
+	xor	ebp,ecx
+	xor	edi,DWORD [36+esp]
+	and	ebp,eax
+	xor	edi,DWORD [56+esp]
+	rol	edi,1
+	add	ebp,edx
+	ror	eax,2
+	mov	edx,esi
+	rol	edx,5
+	mov	DWORD [4+esp],edi
+	lea	edi,[2400959708+ebp*1+edi]
+	mov	ebp,ebx
+	add	edi,edx
+	and	ebp,ecx
+	mov	edx,DWORD [8+esp]
+	add	edi,ebp
+	; 40_59 50
+	mov	ebp,eax
+	xor	edx,DWORD [16+esp]
+	xor	ebp,ebx
+	xor	edx,DWORD [40+esp]
+	and	ebp,esi
+	xor	edx,DWORD [60+esp]
+	rol	edx,1
+	add	ebp,ecx
+	ror	esi,2
+	mov	ecx,edi
+	rol	ecx,5
+	mov	DWORD [8+esp],edx
+	lea	edx,[2400959708+ebp*1+edx]
+	mov	ebp,eax
+	add	edx,ecx
+	and	ebp,ebx
+	mov	ecx,DWORD [12+esp]
+	add	edx,ebp
+	; 40_59 51
+	mov	ebp,esi
+	xor	ecx,DWORD [20+esp]
+	xor	ebp,eax
+	xor	ecx,DWORD [44+esp]
+	and	ebp,edi
+	xor	ecx,DWORD [esp]
+	rol	ecx,1
+	add	ebp,ebx
+	ror	edi,2
+	mov	ebx,edx
+	rol	ebx,5
+	mov	DWORD [12+esp],ecx
+	lea	ecx,[2400959708+ebp*1+ecx]
+	mov	ebp,esi
+	add	ecx,ebx
+	and	ebp,eax
+	mov	ebx,DWORD [16+esp]
+	add	ecx,ebp
+	; 40_59 52
+	mov	ebp,edi
+	xor	ebx,DWORD [24+esp]
+	xor	ebp,esi
+	xor	ebx,DWORD [48+esp]
+	and	ebp,edx
+	xor	ebx,DWORD [4+esp]
+	rol	ebx,1
+	add	ebp,eax
+	ror	edx,2
+	mov	eax,ecx
+	rol	eax,5
+	mov	DWORD [16+esp],ebx
+	lea	ebx,[2400959708+ebp*1+ebx]
+	mov	ebp,edi
+	add	ebx,eax
+	and	ebp,esi
+	mov	eax,DWORD [20+esp]
+	add	ebx,ebp
+	; 40_59 53
+	mov	ebp,edx
+	xor	eax,DWORD [28+esp]
+	xor	ebp,edi
+	xor	eax,DWORD [52+esp]
+	and	ebp,ecx
+	xor	eax,DWORD [8+esp]
+	rol	eax,1
+	add	ebp,esi
+	ror	ecx,2
+	mov	esi,ebx
+	rol	esi,5
+	mov	DWORD [20+esp],eax
+	lea	eax,[2400959708+ebp*1+eax]
+	mov	ebp,edx
+	add	eax,esi
+	and	ebp,edi
+	mov	esi,DWORD [24+esp]
+	add	eax,ebp
+	; 40_59 54
+	mov	ebp,ecx
+	xor	esi,DWORD [32+esp]
+	xor	ebp,edx
+	xor	esi,DWORD [56+esp]
+	and	ebp,ebx
+	xor	esi,DWORD [12+esp]
+	rol	esi,1
+	add	ebp,edi
+	ror	ebx,2
+	mov	edi,eax
+	rol	edi,5
+	mov	DWORD [24+esp],esi
+	lea	esi,[2400959708+ebp*1+esi]
+	mov	ebp,ecx
+	add	esi,edi
+	and	ebp,edx
+	mov	edi,DWORD [28+esp]
+	add	esi,ebp
+	; 40_59 55
+	mov	ebp,ebx
+	xor	edi,DWORD [36+esp]
+	xor	ebp,ecx
+	xor	edi,DWORD [60+esp]
+	and	ebp,eax
+	xor	edi,DWORD [16+esp]
+	rol	edi,1
+	add	ebp,edx
+	ror	eax,2
+	mov	edx,esi
+	rol	edx,5
+	mov	DWORD [28+esp],edi
+	lea	edi,[2400959708+ebp*1+edi]
+	mov	ebp,ebx
+	add	edi,edx
+	and	ebp,ecx
+	mov	edx,DWORD [32+esp]
+	add	edi,ebp
+	; 40_59 56
+	mov	ebp,eax
+	xor	edx,DWORD [40+esp]
+	xor	ebp,ebx
+	xor	edx,DWORD [esp]
+	and	ebp,esi
+	xor	edx,DWORD [20+esp]
+	rol	edx,1
+	add	ebp,ecx
+	ror	esi,2
+	mov	ecx,edi
+	rol	ecx,5
+	mov	DWORD [32+esp],edx
+	lea	edx,[2400959708+ebp*1+edx]
+	mov	ebp,eax
+	add	edx,ecx
+	and	ebp,ebx
+	mov	ecx,DWORD [36+esp]
+	add	edx,ebp
+	; 40_59 57
+	mov	ebp,esi
+	xor	ecx,DWORD [44+esp]
+	xor	ebp,eax
+	xor	ecx,DWORD [4+esp]
+	and	ebp,edi
+	xor	ecx,DWORD [24+esp]
+	rol	ecx,1
+	add	ebp,ebx
+	ror	edi,2
+	mov	ebx,edx
+	rol	ebx,5
+	mov	DWORD [36+esp],ecx
+	lea	ecx,[2400959708+ebp*1+ecx]
+	mov	ebp,esi
+	add	ecx,ebx
+	and	ebp,eax
+	mov	ebx,DWORD [40+esp]
+	add	ecx,ebp
+	; 40_59 58
+	mov	ebp,edi
+	xor	ebx,DWORD [48+esp]
+	xor	ebp,esi
+	xor	ebx,DWORD [8+esp]
+	and	ebp,edx
+	xor	ebx,DWORD [28+esp]
+	rol	ebx,1
+	add	ebp,eax
+	ror	edx,2
+	mov	eax,ecx
+	rol	eax,5
+	mov	DWORD [40+esp],ebx
+	lea	ebx,[2400959708+ebp*1+ebx]
+	mov	ebp,edi
+	add	ebx,eax
+	and	ebp,esi
+	mov	eax,DWORD [44+esp]
+	add	ebx,ebp
+	; 40_59 59
+	mov	ebp,edx
+	xor	eax,DWORD [52+esp]
+	xor	ebp,edi
+	xor	eax,DWORD [12+esp]
+	and	ebp,ecx
+	xor	eax,DWORD [32+esp]
+	rol	eax,1
+	add	ebp,esi
+	ror	ecx,2
+	mov	esi,ebx
+	rol	esi,5
+	mov	DWORD [44+esp],eax
+	lea	eax,[2400959708+ebp*1+eax]
+	mov	ebp,edx
+	add	eax,esi
+	and	ebp,edi
+	mov	esi,DWORD [48+esp]
+	add	eax,ebp
+	; 20_39 60
+	mov	ebp,ebx
+	xor	esi,DWORD [56+esp]
+	xor	ebp,ecx
+	xor	esi,DWORD [16+esp]
+	xor	ebp,edx
+	xor	esi,DWORD [36+esp]
+	rol	esi,1
+	add	edi,ebp
+	ror	ebx,2
+	mov	ebp,eax
+	rol	ebp,5
+	mov	DWORD [48+esp],esi
+	lea	esi,[3395469782+edi*1+esi]
+	mov	edi,DWORD [52+esp]
+	add	esi,ebp
+	; 20_39 61
+	mov	ebp,eax
+	xor	edi,DWORD [60+esp]
+	xor	ebp,ebx
+	xor	edi,DWORD [20+esp]
+	xor	ebp,ecx
+	xor	edi,DWORD [40+esp]
+	rol	edi,1
+	add	edx,ebp
+	ror	eax,2
+	mov	ebp,esi
+	rol	ebp,5
+	mov	DWORD [52+esp],edi
+	lea	edi,[3395469782+edx*1+edi]
+	mov	edx,DWORD [56+esp]
+	add	edi,ebp
+	; 20_39 62
+	mov	ebp,esi
+	xor	edx,DWORD [esp]
+	xor	ebp,eax
+	xor	edx,DWORD [24+esp]
+	xor	ebp,ebx
+	xor	edx,DWORD [44+esp]
+	rol	edx,1
+	add	ecx,ebp
+	ror	esi,2
+	mov	ebp,edi
+	rol	ebp,5
+	mov	DWORD [56+esp],edx
+	lea	edx,[3395469782+ecx*1+edx]
+	mov	ecx,DWORD [60+esp]
+	add	edx,ebp
+	; 20_39 63
+	mov	ebp,edi
+	xor	ecx,DWORD [4+esp]
+	xor	ebp,esi
+	xor	ecx,DWORD [28+esp]
+	xor	ebp,eax
+	xor	ecx,DWORD [48+esp]
+	rol	ecx,1
+	add	ebx,ebp
+	ror	edi,2
+	mov	ebp,edx
+	rol	ebp,5
+	mov	DWORD [60+esp],ecx
+	lea	ecx,[3395469782+ebx*1+ecx]
+	mov	ebx,DWORD [esp]
+	add	ecx,ebp
+	; 20_39 64
+	mov	ebp,edx
+	xor	ebx,DWORD [8+esp]
+	xor	ebp,edi
+	xor	ebx,DWORD [32+esp]
+	xor	ebp,esi
+	xor	ebx,DWORD [52+esp]
+	rol	ebx,1
+	add	eax,ebp
+	ror	edx,2
+	mov	ebp,ecx
+	rol	ebp,5
+	mov	DWORD [esp],ebx
+	lea	ebx,[3395469782+eax*1+ebx]
+	mov	eax,DWORD [4+esp]
+	add	ebx,ebp
+	; 20_39 65
+	mov	ebp,ecx
+	xor	eax,DWORD [12+esp]
+	xor	ebp,edx
+	xor	eax,DWORD [36+esp]
+	xor	ebp,edi
+	xor	eax,DWORD [56+esp]
+	rol	eax,1
+	add	esi,ebp
+	ror	ecx,2
+	mov	ebp,ebx
+	rol	ebp,5
+	mov	DWORD [4+esp],eax
+	lea	eax,[3395469782+esi*1+eax]
+	mov	esi,DWORD [8+esp]
+	add	eax,ebp
+	; 20_39 66
+	mov	ebp,ebx
+	xor	esi,DWORD [16+esp]
+	xor	ebp,ecx
+	xor	esi,DWORD [40+esp]
+	xor	ebp,edx
+	xor	esi,DWORD [60+esp]
+	rol	esi,1
+	add	edi,ebp
+	ror	ebx,2
+	mov	ebp,eax
+	rol	ebp,5
+	mov	DWORD [8+esp],esi
+	lea	esi,[3395469782+edi*1+esi]
+	mov	edi,DWORD [12+esp]
+	add	esi,ebp
+	; 20_39 67
+	mov	ebp,eax
+	xor	edi,DWORD [20+esp]
+	xor	ebp,ebx
+	xor	edi,DWORD [44+esp]
+	xor	ebp,ecx
+	xor	edi,DWORD [esp]
+	rol	edi,1
+	add	edx,ebp
+	ror	eax,2
+	mov	ebp,esi
+	rol	ebp,5
+	mov	DWORD [12+esp],edi
+	lea	edi,[3395469782+edx*1+edi]
+	mov	edx,DWORD [16+esp]
+	add	edi,ebp
+	; 20_39 68
+	mov	ebp,esi
+	xor	edx,DWORD [24+esp]
+	xor	ebp,eax
+	xor	edx,DWORD [48+esp]
+	xor	ebp,ebx
+	xor	edx,DWORD [4+esp]
+	rol	edx,1
+	add	ecx,ebp
+	ror	esi,2
+	mov	ebp,edi
+	rol	ebp,5
+	mov	DWORD [16+esp],edx
+	lea	edx,[3395469782+ecx*1+edx]
+	mov	ecx,DWORD [20+esp]
+	add	edx,ebp
+	; 20_39 69
+	mov	ebp,edi
+	xor	ecx,DWORD [28+esp]
+	xor	ebp,esi
+	xor	ecx,DWORD [52+esp]
+	xor	ebp,eax
+	xor	ecx,DWORD [8+esp]
+	rol	ecx,1
+	add	ebx,ebp
+	ror	edi,2
+	mov	ebp,edx
+	rol	ebp,5
+	mov	DWORD [20+esp],ecx
+	lea	ecx,[3395469782+ebx*1+ecx]
+	mov	ebx,DWORD [24+esp]
+	add	ecx,ebp
+	; 20_39 70
+	mov	ebp,edx
+	xor	ebx,DWORD [32+esp]
+	xor	ebp,edi
+	xor	ebx,DWORD [56+esp]
+	xor	ebp,esi
+	xor	ebx,DWORD [12+esp]
+	rol	ebx,1
+	add	eax,ebp
+	ror	edx,2
+	mov	ebp,ecx
+	rol	ebp,5
+	mov	DWORD [24+esp],ebx
+	lea	ebx,[3395469782+eax*1+ebx]
+	mov	eax,DWORD [28+esp]
+	add	ebx,ebp
+	; 20_39 71
+	mov	ebp,ecx
+	xor	eax,DWORD [36+esp]
+	xor	ebp,edx
+	xor	eax,DWORD [60+esp]
+	xor	ebp,edi
+	xor	eax,DWORD [16+esp]
+	rol	eax,1
+	add	esi,ebp
+	ror	ecx,2
+	mov	ebp,ebx
+	rol	ebp,5
+	mov	DWORD [28+esp],eax
+	lea	eax,[3395469782+esi*1+eax]
+	mov	esi,DWORD [32+esp]
+	add	eax,ebp
+	; 20_39 72
+	mov	ebp,ebx
+	xor	esi,DWORD [40+esp]
+	xor	ebp,ecx
+	xor	esi,DWORD [esp]
+	xor	ebp,edx
+	xor	esi,DWORD [20+esp]
+	rol	esi,1
+	add	edi,ebp
+	ror	ebx,2
+	mov	ebp,eax
+	rol	ebp,5
+	mov	DWORD [32+esp],esi
+	lea	esi,[3395469782+edi*1+esi]
+	mov	edi,DWORD [36+esp]
+	add	esi,ebp
+	; 20_39 73
+	mov	ebp,eax
+	xor	edi,DWORD [44+esp]
+	xor	ebp,ebx
+	xor	edi,DWORD [4+esp]
+	xor	ebp,ecx
+	xor	edi,DWORD [24+esp]
+	rol	edi,1
+	add	edx,ebp
+	ror	eax,2
+	mov	ebp,esi
+	rol	ebp,5
+	mov	DWORD [36+esp],edi
+	lea	edi,[3395469782+edx*1+edi]
+	mov	edx,DWORD [40+esp]
+	add	edi,ebp
+	; 20_39 74
+	mov	ebp,esi
+	xor	edx,DWORD [48+esp]
+	xor	ebp,eax
+	xor	edx,DWORD [8+esp]
+	xor	ebp,ebx
+	xor	edx,DWORD [28+esp]
+	rol	edx,1
+	add	ecx,ebp
+	ror	esi,2
+	mov	ebp,edi
+	rol	ebp,5
+	mov	DWORD [40+esp],edx
+	lea	edx,[3395469782+ecx*1+edx]
+	mov	ecx,DWORD [44+esp]
+	add	edx,ebp
+	; 20_39 75
+	mov	ebp,edi
+	xor	ecx,DWORD [52+esp]
+	xor	ebp,esi
+	xor	ecx,DWORD [12+esp]
+	xor	ebp,eax
+	xor	ecx,DWORD [32+esp]
+	rol	ecx,1
+	add	ebx,ebp
+	ror	edi,2
+	mov	ebp,edx
+	rol	ebp,5
+	mov	DWORD [44+esp],ecx
+	lea	ecx,[3395469782+ebx*1+ecx]
+	mov	ebx,DWORD [48+esp]
+	add	ecx,ebp
+	; 20_39 76
+	mov	ebp,edx
+	xor	ebx,DWORD [56+esp]
+	xor	ebp,edi
+	xor	ebx,DWORD [16+esp]
+	xor	ebp,esi
+	xor	ebx,DWORD [36+esp]
+	rol	ebx,1
+	add	eax,ebp
+	ror	edx,2
+	mov	ebp,ecx
+	rol	ebp,5
+	mov	DWORD [48+esp],ebx
+	lea	ebx,[3395469782+eax*1+ebx]
+	mov	eax,DWORD [52+esp]
+	add	ebx,ebp
+	; 20_39 77
+	mov	ebp,ecx
+	xor	eax,DWORD [60+esp]
+	xor	ebp,edx
+	xor	eax,DWORD [20+esp]
+	xor	ebp,edi
+	xor	eax,DWORD [40+esp]
+	rol	eax,1
+	add	esi,ebp
+	ror	ecx,2
+	mov	ebp,ebx
+	rol	ebp,5
+	lea	eax,[3395469782+esi*1+eax]
+	mov	esi,DWORD [56+esp]
+	add	eax,ebp
+	; 20_39 78
+	mov	ebp,ebx
+	xor	esi,DWORD [esp]
+	xor	ebp,ecx
+	xor	esi,DWORD [24+esp]
+	xor	ebp,edx
+	xor	esi,DWORD [44+esp]
+	rol	esi,1
+	add	edi,ebp
+	ror	ebx,2
+	mov	ebp,eax
+	rol	ebp,5
+	lea	esi,[3395469782+edi*1+esi]
+	mov	edi,DWORD [60+esp]
+	add	esi,ebp
+	; 20_39 79
+	mov	ebp,eax
+	xor	edi,DWORD [4+esp]
+	xor	ebp,ebx
+	xor	edi,DWORD [28+esp]
+	xor	ebp,ecx
+	xor	edi,DWORD [48+esp]
+	rol	edi,1
+	add	edx,ebp
+	ror	eax,2
+	mov	ebp,esi
+	rol	ebp,5
+	lea	edi,[3395469782+edx*1+edi]
+	add	edi,ebp
+	mov	ebp,DWORD [96+esp]
+	mov	edx,DWORD [100+esp]
+	add	edi,DWORD [ebp]
+	add	esi,DWORD [4+ebp]
+	add	eax,DWORD [8+ebp]
+	add	ebx,DWORD [12+ebp]
+	add	ecx,DWORD [16+ebp]
+	mov	DWORD [ebp],edi
+	add	edx,64
+	mov	DWORD [4+ebp],esi
+	cmp	edx,DWORD [104+esp]
+	mov	DWORD [8+ebp],eax
+	mov	edi,ecx
+	mov	DWORD [12+ebp],ebx
+	mov	esi,edx
+	mov	DWORD [16+ebp],ecx
+	jb	NEAR L$000loop
+	add	esp,76
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+global	_sha1_block_data_order_ssse3
+align	16
+_sha1_block_data_order_ssse3:
+L$_sha1_block_data_order_ssse3_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	call	L$001pic_point
+L$001pic_point:
+	pop	ebp
+	lea	ebp,[(L$K_XX_XX-L$001pic_point)+ebp]
+	movdqa	xmm7,[ebp]
+	movdqa	xmm0,[16+ebp]
+	movdqa	xmm1,[32+ebp]
+	movdqa	xmm2,[48+ebp]
+	movdqa	xmm6,[64+ebp]
+	mov	edi,DWORD [20+esp]
+	mov	ebp,DWORD [24+esp]
+	mov	edx,DWORD [28+esp]
+	mov	esi,esp
+	sub	esp,208
+	and	esp,-64
+	movdqa	[112+esp],xmm0
+	movdqa	[128+esp],xmm1
+	movdqa	[144+esp],xmm2
+	shl	edx,6
+	movdqa	[160+esp],xmm7
+	add	edx,ebp
+	movdqa	[176+esp],xmm6
+	add	ebp,64
+	mov	DWORD [192+esp],edi
+	mov	DWORD [196+esp],ebp
+	mov	DWORD [200+esp],edx
+	mov	DWORD [204+esp],esi
+	mov	eax,DWORD [edi]
+	mov	ebx,DWORD [4+edi]
+	mov	ecx,DWORD [8+edi]
+	mov	edx,DWORD [12+edi]
+	mov	edi,DWORD [16+edi]
+	mov	esi,ebx
+	movdqu	xmm0,[ebp-64]
+	movdqu	xmm1,[ebp-48]
+	movdqu	xmm2,[ebp-32]
+	movdqu	xmm3,[ebp-16]
+db	102,15,56,0,198
+db	102,15,56,0,206
+db	102,15,56,0,214
+	movdqa	[96+esp],xmm7
+db	102,15,56,0,222
+	paddd	xmm0,xmm7
+	paddd	xmm1,xmm7
+	paddd	xmm2,xmm7
+	movdqa	[esp],xmm0
+	psubd	xmm0,xmm7
+	movdqa	[16+esp],xmm1
+	psubd	xmm1,xmm7
+	movdqa	[32+esp],xmm2
+	mov	ebp,ecx
+	psubd	xmm2,xmm7
+	xor	ebp,edx
+	pshufd	xmm4,xmm0,238
+	and	esi,ebp
+	jmp	NEAR L$002loop
+align	16
+L$002loop:
+	ror	ebx,2
+	xor	esi,edx
+	mov	ebp,eax
+	punpcklqdq	xmm4,xmm1
+	movdqa	xmm6,xmm3
+	add	edi,DWORD [esp]
+	xor	ebx,ecx
+	paddd	xmm7,xmm3
+	movdqa	[64+esp],xmm0
+	rol	eax,5
+	add	edi,esi
+	psrldq	xmm6,4
+	and	ebp,ebx
+	xor	ebx,ecx
+	pxor	xmm4,xmm0
+	add	edi,eax
+	ror	eax,7
+	pxor	xmm6,xmm2
+	xor	ebp,ecx
+	mov	esi,edi
+	add	edx,DWORD [4+esp]
+	pxor	xmm4,xmm6
+	xor	eax,ebx
+	rol	edi,5
+	movdqa	[48+esp],xmm7
+	add	edx,ebp
+	and	esi,eax
+	movdqa	xmm0,xmm4
+	xor	eax,ebx
+	add	edx,edi
+	ror	edi,7
+	movdqa	xmm6,xmm4
+	xor	esi,ebx
+	pslldq	xmm0,12
+	paddd	xmm4,xmm4
+	mov	ebp,edx
+	add	ecx,DWORD [8+esp]
+	psrld	xmm6,31
+	xor	edi,eax
+	rol	edx,5
+	movdqa	xmm7,xmm0
+	add	ecx,esi
+	and	ebp,edi
+	xor	edi,eax
+	psrld	xmm0,30
+	add	ecx,edx
+	ror	edx,7
+	por	xmm4,xmm6
+	xor	ebp,eax
+	mov	esi,ecx
+	add	ebx,DWORD [12+esp]
+	pslld	xmm7,2
+	xor	edx,edi
+	rol	ecx,5
+	pxor	xmm4,xmm0
+	movdqa	xmm0,[96+esp]
+	add	ebx,ebp
+	and	esi,edx
+	pxor	xmm4,xmm7
+	pshufd	xmm5,xmm1,238
+	xor	edx,edi
+	add	ebx,ecx
+	ror	ecx,7
+	xor	esi,edi
+	mov	ebp,ebx
+	punpcklqdq	xmm5,xmm2
+	movdqa	xmm7,xmm4
+	add	eax,DWORD [16+esp]
+	xor	ecx,edx
+	paddd	xmm0,xmm4
+	movdqa	[80+esp],xmm1
+	rol	ebx,5
+	add	eax,esi
+	psrldq	xmm7,4
+	and	ebp,ecx
+	xor	ecx,edx
+	pxor	xmm5,xmm1
+	add	eax,ebx
+	ror	ebx,7
+	pxor	xmm7,xmm3
+	xor	ebp,edx
+	mov	esi,eax
+	add	edi,DWORD [20+esp]
+	pxor	xmm5,xmm7
+	xor	ebx,ecx
+	rol	eax,5
+	movdqa	[esp],xmm0
+	add	edi,ebp
+	and	esi,ebx
+	movdqa	xmm1,xmm5
+	xor	ebx,ecx
+	add	edi,eax
+	ror	eax,7
+	movdqa	xmm7,xmm5
+	xor	esi,ecx
+	pslldq	xmm1,12
+	paddd	xmm5,xmm5
+	mov	ebp,edi
+	add	edx,DWORD [24+esp]
+	psrld	xmm7,31
+	xor	eax,ebx
+	rol	edi,5
+	movdqa	xmm0,xmm1
+	add	edx,esi
+	and	ebp,eax
+	xor	eax,ebx
+	psrld	xmm1,30
+	add	edx,edi
+	ror	edi,7
+	por	xmm5,xmm7
+	xor	ebp,ebx
+	mov	esi,edx
+	add	ecx,DWORD [28+esp]
+	pslld	xmm0,2
+	xor	edi,eax
+	rol	edx,5
+	pxor	xmm5,xmm1
+	movdqa	xmm1,[112+esp]
+	add	ecx,ebp
+	and	esi,edi
+	pxor	xmm5,xmm0
+	pshufd	xmm6,xmm2,238
+	xor	edi,eax
+	add	ecx,edx
+	ror	edx,7
+	xor	esi,eax
+	mov	ebp,ecx
+	punpcklqdq	xmm6,xmm3
+	movdqa	xmm0,xmm5
+	add	ebx,DWORD [32+esp]
+	xor	edx,edi
+	paddd	xmm1,xmm5
+	movdqa	[96+esp],xmm2
+	rol	ecx,5
+	add	ebx,esi
+	psrldq	xmm0,4
+	and	ebp,edx
+	xor	edx,edi
+	pxor	xmm6,xmm2
+	add	ebx,ecx
+	ror	ecx,7
+	pxor	xmm0,xmm4
+	xor	ebp,edi
+	mov	esi,ebx
+	add	eax,DWORD [36+esp]
+	pxor	xmm6,xmm0
+	xor	ecx,edx
+	rol	ebx,5
+	movdqa	[16+esp],xmm1
+	add	eax,ebp
+	and	esi,ecx
+	movdqa	xmm2,xmm6
+	xor	ecx,edx
+	add	eax,ebx
+	ror	ebx,7
+	movdqa	xmm0,xmm6
+	xor	esi,edx
+	pslldq	xmm2,12
+	paddd	xmm6,xmm6
+	mov	ebp,eax
+	add	edi,DWORD [40+esp]
+	psrld	xmm0,31
+	xor	ebx,ecx
+	rol	eax,5
+	movdqa	xmm1,xmm2
+	add	edi,esi
+	and	ebp,ebx
+	xor	ebx,ecx
+	psrld	xmm2,30
+	add	edi,eax
+	ror	eax,7
+	por	xmm6,xmm0
+	xor	ebp,ecx
+	movdqa	xmm0,[64+esp]
+	mov	esi,edi
+	add	edx,DWORD [44+esp]
+	pslld	xmm1,2
+	xor	eax,ebx
+	rol	edi,5
+	pxor	xmm6,xmm2
+	movdqa	xmm2,[112+esp]
+	add	edx,ebp
+	and	esi,eax
+	pxor	xmm6,xmm1
+	pshufd	xmm7,xmm3,238
+	xor	eax,ebx
+	add	edx,edi
+	ror	edi,7
+	xor	esi,ebx
+	mov	ebp,edx
+	punpcklqdq	xmm7,xmm4
+	movdqa	xmm1,xmm6
+	add	ecx,DWORD [48+esp]
+	xor	edi,eax
+	paddd	xmm2,xmm6
+	movdqa	[64+esp],xmm3
+	rol	edx,5
+	add	ecx,esi
+	psrldq	xmm1,4
+	and	ebp,edi
+	xor	edi,eax
+	pxor	xmm7,xmm3
+	add	ecx,edx
+	ror	edx,7
+	pxor	xmm1,xmm5
+	xor	ebp,eax
+	mov	esi,ecx
+	add	ebx,DWORD [52+esp]
+	pxor	xmm7,xmm1
+	xor	edx,edi
+	rol	ecx,5
+	movdqa	[32+esp],xmm2
+	add	ebx,ebp
+	and	esi,edx
+	movdqa	xmm3,xmm7
+	xor	edx,edi
+	add	ebx,ecx
+	ror	ecx,7
+	movdqa	xmm1,xmm7
+	xor	esi,edi
+	pslldq	xmm3,12
+	paddd	xmm7,xmm7
+	mov	ebp,ebx
+	add	eax,DWORD [56+esp]
+	psrld	xmm1,31
+	xor	ecx,edx
+	rol	ebx,5
+	movdqa	xmm2,xmm3
+	add	eax,esi
+	and	ebp,ecx
+	xor	ecx,edx
+	psrld	xmm3,30
+	add	eax,ebx
+	ror	ebx,7
+	por	xmm7,xmm1
+	xor	ebp,edx
+	movdqa	xmm1,[80+esp]
+	mov	esi,eax
+	add	edi,DWORD [60+esp]
+	pslld	xmm2,2
+	xor	ebx,ecx
+	rol	eax,5
+	pxor	xmm7,xmm3
+	movdqa	xmm3,[112+esp]
+	add	edi,ebp
+	and	esi,ebx
+	pxor	xmm7,xmm2
+	pshufd	xmm2,xmm6,238
+	xor	ebx,ecx
+	add	edi,eax
+	ror	eax,7
+	pxor	xmm0,xmm4
+	punpcklqdq	xmm2,xmm7
+	xor	esi,ecx
+	mov	ebp,edi
+	add	edx,DWORD [esp]
+	pxor	xmm0,xmm1
+	movdqa	[80+esp],xmm4
+	xor	eax,ebx
+	rol	edi,5
+	movdqa	xmm4,xmm3
+	add	edx,esi
+	paddd	xmm3,xmm7
+	and	ebp,eax
+	pxor	xmm0,xmm2
+	xor	eax,ebx
+	add	edx,edi
+	ror	edi,7
+	xor	ebp,ebx
+	movdqa	xmm2,xmm0
+	movdqa	[48+esp],xmm3
+	mov	esi,edx
+	add	ecx,DWORD [4+esp]
+	xor	edi,eax
+	rol	edx,5
+	pslld	xmm0,2
+	add	ecx,ebp
+	and	esi,edi
+	psrld	xmm2,30
+	xor	edi,eax
+	add	ecx,edx
+	ror	edx,7
+	xor	esi,eax
+	mov	ebp,ecx
+	add	ebx,DWORD [8+esp]
+	xor	edx,edi
+	rol	ecx,5
+	por	xmm0,xmm2
+	add	ebx,esi
+	and	ebp,edx
+	movdqa	xmm2,[96+esp]
+	xor	edx,edi
+	add	ebx,ecx
+	add	eax,DWORD [12+esp]
+	xor	ebp,edi
+	mov	esi,ebx
+	pshufd	xmm3,xmm7,238
+	rol	ebx,5
+	add	eax,ebp
+	xor	esi,edx
+	ror	ecx,7
+	add	eax,ebx
+	add	edi,DWORD [16+esp]
+	pxor	xmm1,xmm5
+	punpcklqdq	xmm3,xmm0
+	xor	esi,ecx
+	mov	ebp,eax
+	rol	eax,5
+	pxor	xmm1,xmm2
+	movdqa	[96+esp],xmm5
+	add	edi,esi
+	xor	ebp,ecx
+	movdqa	xmm5,xmm4
+	ror	ebx,7
+	paddd	xmm4,xmm0
+	add	edi,eax
+	pxor	xmm1,xmm3
+	add	edx,DWORD [20+esp]
+	xor	ebp,ebx
+	mov	esi,edi
+	rol	edi,5
+	movdqa	xmm3,xmm1
+	movdqa	[esp],xmm4
+	add	edx,ebp
+	xor	esi,ebx
+	ror	eax,7
+	add	edx,edi
+	pslld	xmm1,2
+	add	ecx,DWORD [24+esp]
+	xor	esi,eax
+	psrld	xmm3,30
+	mov	ebp,edx
+	rol	edx,5
+	add	ecx,esi
+	xor	ebp,eax
+	ror	edi,7
+	add	ecx,edx
+	por	xmm1,xmm3
+	add	ebx,DWORD [28+esp]
+	xor	ebp,edi
+	movdqa	xmm3,[64+esp]
+	mov	esi,ecx
+	rol	ecx,5
+	add	ebx,ebp
+	xor	esi,edi
+	ror	edx,7
+	pshufd	xmm4,xmm0,238
+	add	ebx,ecx
+	add	eax,DWORD [32+esp]
+	pxor	xmm2,xmm6
+	punpcklqdq	xmm4,xmm1
+	xor	esi,edx
+	mov	ebp,ebx
+	rol	ebx,5
+	pxor	xmm2,xmm3
+	movdqa	[64+esp],xmm6
+	add	eax,esi
+	xor	ebp,edx
+	movdqa	xmm6,[128+esp]
+	ror	ecx,7
+	paddd	xmm5,xmm1
+	add	eax,ebx
+	pxor	xmm2,xmm4
+	add	edi,DWORD [36+esp]
+	xor	ebp,ecx
+	mov	esi,eax
+	rol	eax,5
+	movdqa	xmm4,xmm2
+	movdqa	[16+esp],xmm5
+	add	edi,ebp
+	xor	esi,ecx
+	ror	ebx,7
+	add	edi,eax
+	pslld	xmm2,2
+	add	edx,DWORD [40+esp]
+	xor	esi,ebx
+	psrld	xmm4,30
+	mov	ebp,edi
+	rol	edi,5
+	add	edx,esi
+	xor	ebp,ebx
+	ror	eax,7
+	add	edx,edi
+	por	xmm2,xmm4
+	add	ecx,DWORD [44+esp]
+	xor	ebp,eax
+	movdqa	xmm4,[80+esp]
+	mov	esi,edx
+	rol	edx,5
+	add	ecx,ebp
+	xor	esi,eax
+	ror	edi,7
+	pshufd	xmm5,xmm1,238
+	add	ecx,edx
+	add	ebx,DWORD [48+esp]
+	pxor	xmm3,xmm7
+	punpcklqdq	xmm5,xmm2
+	xor	esi,edi
+	mov	ebp,ecx
+	rol	ecx,5
+	pxor	xmm3,xmm4
+	movdqa	[80+esp],xmm7
+	add	ebx,esi
+	xor	ebp,edi
+	movdqa	xmm7,xmm6
+	ror	edx,7
+	paddd	xmm6,xmm2
+	add	ebx,ecx
+	pxor	xmm3,xmm5
+	add	eax,DWORD [52+esp]
+	xor	ebp,edx
+	mov	esi,ebx
+	rol	ebx,5
+	movdqa	xmm5,xmm3
+	movdqa	[32+esp],xmm6
+	add	eax,ebp
+	xor	esi,edx
+	ror	ecx,7
+	add	eax,ebx
+	pslld	xmm3,2
+	add	edi,DWORD [56+esp]
+	xor	esi,ecx
+	psrld	xmm5,30
+	mov	ebp,eax
+	rol	eax,5
+	add	edi,esi
+	xor	ebp,ecx
+	ror	ebx,7
+	add	edi,eax
+	por	xmm3,xmm5
+	add	edx,DWORD [60+esp]
+	xor	ebp,ebx
+	movdqa	xmm5,[96+esp]
+	mov	esi,edi
+	rol	edi,5
+	add	edx,ebp
+	xor	esi,ebx
+	ror	eax,7
+	pshufd	xmm6,xmm2,238
+	add	edx,edi
+	add	ecx,DWORD [esp]
+	pxor	xmm4,xmm0
+	punpcklqdq	xmm6,xmm3
+	xor	esi,eax
+	mov	ebp,edx
+	rol	edx,5
+	pxor	xmm4,xmm5
+	movdqa	[96+esp],xmm0
+	add	ecx,esi
+	xor	ebp,eax
+	movdqa	xmm0,xmm7
+	ror	edi,7
+	paddd	xmm7,xmm3
+	add	ecx,edx
+	pxor	xmm4,xmm6
+	add	ebx,DWORD [4+esp]
+	xor	ebp,edi
+	mov	esi,ecx
+	rol	ecx,5
+	movdqa	xmm6,xmm4
+	movdqa	[48+esp],xmm7
+	add	ebx,ebp
+	xor	esi,edi
+	ror	edx,7
+	add	ebx,ecx
+	pslld	xmm4,2
+	add	eax,DWORD [8+esp]
+	xor	esi,edx
+	psrld	xmm6,30
+	mov	ebp,ebx
+	rol	ebx,5
+	add	eax,esi
+	xor	ebp,edx
+	ror	ecx,7
+	add	eax,ebx
+	por	xmm4,xmm6
+	add	edi,DWORD [12+esp]
+	xor	ebp,ecx
+	movdqa	xmm6,[64+esp]
+	mov	esi,eax
+	rol	eax,5
+	add	edi,ebp
+	xor	esi,ecx
+	ror	ebx,7
+	pshufd	xmm7,xmm3,238
+	add	edi,eax
+	add	edx,DWORD [16+esp]
+	pxor	xmm5,xmm1
+	punpcklqdq	xmm7,xmm4
+	xor	esi,ebx
+	mov	ebp,edi
+	rol	edi,5
+	pxor	xmm5,xmm6
+	movdqa	[64+esp],xmm1
+	add	edx,esi
+	xor	ebp,ebx
+	movdqa	xmm1,xmm0
+	ror	eax,7
+	paddd	xmm0,xmm4
+	add	edx,edi
+	pxor	xmm5,xmm7
+	add	ecx,DWORD [20+esp]
+	xor	ebp,eax
+	mov	esi,edx
+	rol	edx,5
+	movdqa	xmm7,xmm5
+	movdqa	[esp],xmm0
+	add	ecx,ebp
+	xor	esi,eax
+	ror	edi,7
+	add	ecx,edx
+	pslld	xmm5,2
+	add	ebx,DWORD [24+esp]
+	xor	esi,edi
+	psrld	xmm7,30
+	mov	ebp,ecx
+	rol	ecx,5
+	add	ebx,esi
+	xor	ebp,edi
+	ror	edx,7
+	add	ebx,ecx
+	por	xmm5,xmm7
+	add	eax,DWORD [28+esp]
+	movdqa	xmm7,[80+esp]
+	ror	ecx,7
+	mov	esi,ebx
+	xor	ebp,edx
+	rol	ebx,5
+	pshufd	xmm0,xmm4,238
+	add	eax,ebp
+	xor	esi,ecx
+	xor	ecx,edx
+	add	eax,ebx
+	add	edi,DWORD [32+esp]
+	pxor	xmm6,xmm2
+	punpcklqdq	xmm0,xmm5
+	and	esi,ecx
+	xor	ecx,edx
+	ror	ebx,7
+	pxor	xmm6,xmm7
+	movdqa	[80+esp],xmm2
+	mov	ebp,eax
+	xor	esi,ecx
+	rol	eax,5
+	movdqa	xmm2,xmm1
+	add	edi,esi
+	paddd	xmm1,xmm5
+	xor	ebp,ebx
+	pxor	xmm6,xmm0
+	xor	ebx,ecx
+	add	edi,eax
+	add	edx,DWORD [36+esp]
+	and	ebp,ebx
+	movdqa	xmm0,xmm6
+	movdqa	[16+esp],xmm1
+	xor	ebx,ecx
+	ror	eax,7
+	mov	esi,edi
+	xor	ebp,ebx
+	rol	edi,5
+	pslld	xmm6,2
+	add	edx,ebp
+	xor	esi,eax
+	psrld	xmm0,30
+	xor	eax,ebx
+	add	edx,edi
+	add	ecx,DWORD [40+esp]
+	and	esi,eax
+	xor	eax,ebx
+	ror	edi,7
+	por	xmm6,xmm0
+	mov	ebp,edx
+	xor	esi,eax
+	movdqa	xmm0,[96+esp]
+	rol	edx,5
+	add	ecx,esi
+	xor	ebp,edi
+	xor	edi,eax
+	add	ecx,edx
+	pshufd	xmm1,xmm5,238
+	add	ebx,DWORD [44+esp]
+	and	ebp,edi
+	xor	edi,eax
+	ror	edx,7
+	mov	esi,ecx
+	xor	ebp,edi
+	rol	ecx,5
+	add	ebx,ebp
+	xor	esi,edx
+	xor	edx,edi
+	add	ebx,ecx
+	add	eax,DWORD [48+esp]
+	pxor	xmm7,xmm3
+	punpcklqdq	xmm1,xmm6
+	and	esi,edx
+	xor	edx,edi
+	ror	ecx,7
+	pxor	xmm7,xmm0
+	movdqa	[96+esp],xmm3
+	mov	ebp,ebx
+	xor	esi,edx
+	rol	ebx,5
+	movdqa	xmm3,[144+esp]
+	add	eax,esi
+	paddd	xmm2,xmm6
+	xor	ebp,ecx
+	pxor	xmm7,xmm1
+	xor	ecx,edx
+	add	eax,ebx
+	add	edi,DWORD [52+esp]
+	and	ebp,ecx
+	movdqa	xmm1,xmm7
+	movdqa	[32+esp],xmm2
+	xor	ecx,edx
+	ror	ebx,7
+	mov	esi,eax
+	xor	ebp,ecx
+	rol	eax,5
+	pslld	xmm7,2
+	add	edi,ebp
+	xor	esi,ebx
+	psrld	xmm1,30
+	xor	ebx,ecx
+	add	edi,eax
+	add	edx,DWORD [56+esp]
+	and	esi,ebx
+	xor	ebx,ecx
+	ror	eax,7
+	por	xmm7,xmm1
+	mov	ebp,edi
+	xor	esi,ebx
+	movdqa	xmm1,[64+esp]
+	rol	edi,5
+	add	edx,esi
+	xor	ebp,eax
+	xor	eax,ebx
+	add	edx,edi
+	pshufd	xmm2,xmm6,238
+	add	ecx,DWORD [60+esp]
+	and	ebp,eax
+	xor	eax,ebx
+	ror	edi,7
+	mov	esi,edx
+	xor	ebp,eax
+	rol	edx,5
+	add	ecx,ebp
+	xor	esi,edi
+	xor	edi,eax
+	add	ecx,edx
+	add	ebx,DWORD [esp]
+	pxor	xmm0,xmm4
+	punpcklqdq	xmm2,xmm7
+	and	esi,edi
+	xor	edi,eax
+	ror	edx,7
+	pxor	xmm0,xmm1
+	movdqa	[64+esp],xmm4
+	mov	ebp,ecx
+	xor	esi,edi
+	rol	ecx,5
+	movdqa	xmm4,xmm3
+	add	ebx,esi
+	paddd	xmm3,xmm7
+	xor	ebp,edx
+	pxor	xmm0,xmm2
+	xor	edx,edi
+	add	ebx,ecx
+	add	eax,DWORD [4+esp]
+	and	ebp,edx
+	movdqa	xmm2,xmm0
+	movdqa	[48+esp],xmm3
+	xor	edx,edi
+	ror	ecx,7
+	mov	esi,ebx
+	xor	ebp,edx
+	rol	ebx,5
+	pslld	xmm0,2
+	add	eax,ebp
+	xor	esi,ecx
+	psrld	xmm2,30
+	xor	ecx,edx
+	add	eax,ebx
+	add	edi,DWORD [8+esp]
+	and	esi,ecx
+	xor	ecx,edx
+	ror	ebx,7
+	por	xmm0,xmm2
+	mov	ebp,eax
+	xor	esi,ecx
+	movdqa	xmm2,[80+esp]
+	rol	eax,5
+	add	edi,esi
+	xor	ebp,ebx
+	xor	ebx,ecx
+	add	edi,eax
+	pshufd	xmm3,xmm7,238
+	add	edx,DWORD [12+esp]
+	and	ebp,ebx
+	xor	ebx,ecx
+	ror	eax,7
+	mov	esi,edi
+	xor	ebp,ebx
+	rol	edi,5
+	add	edx,ebp
+	xor	esi,eax
+	xor	eax,ebx
+	add	edx,edi
+	add	ecx,DWORD [16+esp]
+	pxor	xmm1,xmm5
+	punpcklqdq	xmm3,xmm0
+	and	esi,eax
+	xor	eax,ebx
+	ror	edi,7
+	pxor	xmm1,xmm2
+	movdqa	[80+esp],xmm5
+	mov	ebp,edx
+	xor	esi,eax
+	rol	edx,5
+	movdqa	xmm5,xmm4
+	add	ecx,esi
+	paddd	xmm4,xmm0
+	xor	ebp,edi
+	pxor	xmm1,xmm3
+	xor	edi,eax
+	add	ecx,edx
+	add	ebx,DWORD [20+esp]
+	and	ebp,edi
+	movdqa	xmm3,xmm1
+	movdqa	[esp],xmm4
+	xor	edi,eax
+	ror	edx,7
+	mov	esi,ecx
+	xor	ebp,edi
+	rol	ecx,5
+	pslld	xmm1,2
+	add	ebx,ebp
+	xor	esi,edx
+	psrld	xmm3,30
+	xor	edx,edi
+	add	ebx,ecx
+	add	eax,DWORD [24+esp]
+	and	esi,edx
+	xor	edx,edi
+	ror	ecx,7
+	por	xmm1,xmm3
+	mov	ebp,ebx
+	xor	esi,edx
+	movdqa	xmm3,[96+esp]
+	rol	ebx,5
+	add	eax,esi
+	xor	ebp,ecx
+	xor	ecx,edx
+	add	eax,ebx
+	pshufd	xmm4,xmm0,238
+	add	edi,DWORD [28+esp]
+	and	ebp,ecx
+	xor	ecx,edx
+	ror	ebx,7
+	mov	esi,eax
+	xor	ebp,ecx
+	rol	eax,5
+	add	edi,ebp
+	xor	esi,ebx
+	xor	ebx,ecx
+	add	edi,eax
+	add	edx,DWORD [32+esp]
+	pxor	xmm2,xmm6
+	punpcklqdq	xmm4,xmm1
+	and	esi,ebx
+	xor	ebx,ecx
+	ror	eax,7
+	pxor	xmm2,xmm3
+	movdqa	[96+esp],xmm6
+	mov	ebp,edi
+	xor	esi,ebx
+	rol	edi,5
+	movdqa	xmm6,xmm5
+	add	edx,esi
+	paddd	xmm5,xmm1
+	xor	ebp,eax
+	pxor	xmm2,xmm4
+	xor	eax,ebx
+	add	edx,edi
+	add	ecx,DWORD [36+esp]
+	and	ebp,eax
+	movdqa	xmm4,xmm2
+	movdqa	[16+esp],xmm5
+	xor	eax,ebx
+	ror	edi,7
+	mov	esi,edx
+	xor	ebp,eax
+	rol	edx,5
+	pslld	xmm2,2
+	add	ecx,ebp
+	xor	esi,edi
+	psrld	xmm4,30
+	xor	edi,eax
+	add	ecx,edx
+	add	ebx,DWORD [40+esp]
+	and	esi,edi
+	xor	edi,eax
+	ror	edx,7
+	por	xmm2,xmm4
+	mov	ebp,ecx
+	xor	esi,edi
+	movdqa	xmm4,[64+esp]
+	rol	ecx,5
+	add	ebx,esi
+	xor	ebp,edx
+	xor	edx,edi
+	add	ebx,ecx
+	pshufd	xmm5,xmm1,238
+	add	eax,DWORD [44+esp]
+	and	ebp,edx
+	xor	edx,edi
+	ror	ecx,7
+	mov	esi,ebx
+	xor	ebp,edx
+	rol	ebx,5
+	add	eax,ebp
+	xor	esi,edx
+	add	eax,ebx
+	add	edi,DWORD [48+esp]
+	pxor	xmm3,xmm7
+	punpcklqdq	xmm5,xmm2
+	xor	esi,ecx
+	mov	ebp,eax
+	rol	eax,5
+	pxor	xmm3,xmm4
+	movdqa	[64+esp],xmm7
+	add	edi,esi
+	xor	ebp,ecx
+	movdqa	xmm7,xmm6
+	ror	ebx,7
+	paddd	xmm6,xmm2
+	add	edi,eax
+	pxor	xmm3,xmm5
+	add	edx,DWORD [52+esp]
+	xor	ebp,ebx
+	mov	esi,edi
+	rol	edi,5
+	movdqa	xmm5,xmm3
+	movdqa	[32+esp],xmm6
+	add	edx,ebp
+	xor	esi,ebx
+	ror	eax,7
+	add	edx,edi
+	pslld	xmm3,2
+	add	ecx,DWORD [56+esp]
+	xor	esi,eax
+	psrld	xmm5,30
+	mov	ebp,edx
+	rol	edx,5
+	add	ecx,esi
+	xor	ebp,eax
+	ror	edi,7
+	add	ecx,edx
+	por	xmm3,xmm5
+	add	ebx,DWORD [60+esp]
+	xor	ebp,edi
+	mov	esi,ecx
+	rol	ecx,5
+	add	ebx,ebp
+	xor	esi,edi
+	ror	edx,7
+	add	ebx,ecx
+	add	eax,DWORD [esp]
+	xor	esi,edx
+	mov	ebp,ebx
+	rol	ebx,5
+	add	eax,esi
+	xor	ebp,edx
+	ror	ecx,7
+	paddd	xmm7,xmm3
+	add	eax,ebx
+	add	edi,DWORD [4+esp]
+	xor	ebp,ecx
+	mov	esi,eax
+	movdqa	[48+esp],xmm7
+	rol	eax,5
+	add	edi,ebp
+	xor	esi,ecx
+	ror	ebx,7
+	add	edi,eax
+	add	edx,DWORD [8+esp]
+	xor	esi,ebx
+	mov	ebp,edi
+	rol	edi,5
+	add	edx,esi
+	xor	ebp,ebx
+	ror	eax,7
+	add	edx,edi
+	add	ecx,DWORD [12+esp]
+	xor	ebp,eax
+	mov	esi,edx
+	rol	edx,5
+	add	ecx,ebp
+	xor	esi,eax
+	ror	edi,7
+	add	ecx,edx
+	mov	ebp,DWORD [196+esp]
+	cmp	ebp,DWORD [200+esp]
+	je	NEAR L$003done
+	movdqa	xmm7,[160+esp]
+	movdqa	xmm6,[176+esp]
+	movdqu	xmm0,[ebp]
+	movdqu	xmm1,[16+ebp]
+	movdqu	xmm2,[32+ebp]
+	movdqu	xmm3,[48+ebp]
+	add	ebp,64
+db	102,15,56,0,198
+	mov	DWORD [196+esp],ebp
+	movdqa	[96+esp],xmm7
+	add	ebx,DWORD [16+esp]
+	xor	esi,edi
+	mov	ebp,ecx
+	rol	ecx,5
+	add	ebx,esi
+	xor	ebp,edi
+	ror	edx,7
+db	102,15,56,0,206
+	add	ebx,ecx
+	add	eax,DWORD [20+esp]
+	xor	ebp,edx
+	mov	esi,ebx
+	paddd	xmm0,xmm7
+	rol	ebx,5
+	add	eax,ebp
+	xor	esi,edx
+	ror	ecx,7
+	movdqa	[esp],xmm0
+	add	eax,ebx
+	add	edi,DWORD [24+esp]
+	xor	esi,ecx
+	mov	ebp,eax
+	psubd	xmm0,xmm7
+	rol	eax,5
+	add	edi,esi
+	xor	ebp,ecx
+	ror	ebx,7
+	add	edi,eax
+	add	edx,DWORD [28+esp]
+	xor	ebp,ebx
+	mov	esi,edi
+	rol	edi,5
+	add	edx,ebp
+	xor	esi,ebx
+	ror	eax,7
+	add	edx,edi
+	add	ecx,DWORD [32+esp]
+	xor	esi,eax
+	mov	ebp,edx
+	rol	edx,5
+	add	ecx,esi
+	xor	ebp,eax
+	ror	edi,7
+db	102,15,56,0,214
+	add	ecx,edx
+	add	ebx,DWORD [36+esp]
+	xor	ebp,edi
+	mov	esi,ecx
+	paddd	xmm1,xmm7
+	rol	ecx,5
+	add	ebx,ebp
+	xor	esi,edi
+	ror	edx,7
+	movdqa	[16+esp],xmm1
+	add	ebx,ecx
+	add	eax,DWORD [40+esp]
+	xor	esi,edx
+	mov	ebp,ebx
+	psubd	xmm1,xmm7
+	rol	ebx,5
+	add	eax,esi
+	xor	ebp,edx
+	ror	ecx,7
+	add	eax,ebx
+	add	edi,DWORD [44+esp]
+	xor	ebp,ecx
+	mov	esi,eax
+	rol	eax,5
+	add	edi,ebp
+	xor	esi,ecx
+	ror	ebx,7
+	add	edi,eax
+	add	edx,DWORD [48+esp]
+	xor	esi,ebx
+	mov	ebp,edi
+	rol	edi,5
+	add	edx,esi
+	xor	ebp,ebx
+	ror	eax,7
+db	102,15,56,0,222
+	add	edx,edi
+	add	ecx,DWORD [52+esp]
+	xor	ebp,eax
+	mov	esi,edx
+	paddd	xmm2,xmm7
+	rol	edx,5
+	add	ecx,ebp
+	xor	esi,eax
+	ror	edi,7
+	movdqa	[32+esp],xmm2
+	add	ecx,edx
+	add	ebx,DWORD [56+esp]
+	xor	esi,edi
+	mov	ebp,ecx
+	psubd	xmm2,xmm7
+	rol	ecx,5
+	add	ebx,esi
+	xor	ebp,edi
+	ror	edx,7
+	add	ebx,ecx
+	add	eax,DWORD [60+esp]
+	xor	ebp,edx
+	mov	esi,ebx
+	rol	ebx,5
+	add	eax,ebp
+	ror	ecx,7
+	add	eax,ebx
+	mov	ebp,DWORD [192+esp]
+	add	eax,DWORD [ebp]
+	add	esi,DWORD [4+ebp]
+	add	ecx,DWORD [8+ebp]
+	mov	DWORD [ebp],eax
+	add	edx,DWORD [12+ebp]
+	mov	DWORD [4+ebp],esi
+	add	edi,DWORD [16+ebp]
+	mov	DWORD [8+ebp],ecx
+	mov	ebx,ecx
+	mov	DWORD [12+ebp],edx
+	xor	ebx,edx
+	mov	DWORD [16+ebp],edi
+	mov	ebp,esi
+	pshufd	xmm4,xmm0,238
+	and	esi,ebx
+	mov	ebx,ebp
+	jmp	NEAR L$002loop
+align	16
+L$003done:
+	add	ebx,DWORD [16+esp]
+	xor	esi,edi
+	mov	ebp,ecx
+	rol	ecx,5
+	add	ebx,esi
+	xor	ebp,edi
+	ror	edx,7
+	add	ebx,ecx
+	add	eax,DWORD [20+esp]
+	xor	ebp,edx
+	mov	esi,ebx
+	rol	ebx,5
+	add	eax,ebp
+	xor	esi,edx
+	ror	ecx,7
+	add	eax,ebx
+	add	edi,DWORD [24+esp]
+	xor	esi,ecx
+	mov	ebp,eax
+	rol	eax,5
+	add	edi,esi
+	xor	ebp,ecx
+	ror	ebx,7
+	add	edi,eax
+	add	edx,DWORD [28+esp]
+	xor	ebp,ebx
+	mov	esi,edi
+	rol	edi,5
+	add	edx,ebp
+	xor	esi,ebx
+	ror	eax,7
+	add	edx,edi
+	add	ecx,DWORD [32+esp]
+	xor	esi,eax
+	mov	ebp,edx
+	rol	edx,5
+	add	ecx,esi
+	xor	ebp,eax
+	ror	edi,7
+	add	ecx,edx
+	add	ebx,DWORD [36+esp]
+	xor	ebp,edi
+	mov	esi,ecx
+	rol	ecx,5
+	add	ebx,ebp
+	xor	esi,edi
+	ror	edx,7
+	add	ebx,ecx
+	add	eax,DWORD [40+esp]
+	xor	esi,edx
+	mov	ebp,ebx
+	rol	ebx,5
+	add	eax,esi
+	xor	ebp,edx
+	ror	ecx,7
+	add	eax,ebx
+	add	edi,DWORD [44+esp]
+	xor	ebp,ecx
+	mov	esi,eax
+	rol	eax,5
+	add	edi,ebp
+	xor	esi,ecx
+	ror	ebx,7
+	add	edi,eax
+	add	edx,DWORD [48+esp]
+	xor	esi,ebx
+	mov	ebp,edi
+	rol	edi,5
+	add	edx,esi
+	xor	ebp,ebx
+	ror	eax,7
+	add	edx,edi
+	add	ecx,DWORD [52+esp]
+	xor	ebp,eax
+	mov	esi,edx
+	rol	edx,5
+	add	ecx,ebp
+	xor	esi,eax
+	ror	edi,7
+	add	ecx,edx
+	add	ebx,DWORD [56+esp]
+	xor	esi,edi
+	mov	ebp,ecx
+	rol	ecx,5
+	add	ebx,esi
+	xor	ebp,edi
+	ror	edx,7
+	add	ebx,ecx
+	add	eax,DWORD [60+esp]
+	xor	ebp,edx
+	mov	esi,ebx
+	rol	ebx,5
+	add	eax,ebp
+	ror	ecx,7
+	add	eax,ebx
+	mov	ebp,DWORD [192+esp]
+	add	eax,DWORD [ebp]
+	mov	esp,DWORD [204+esp]
+	add	esi,DWORD [4+ebp]
+	add	ecx,DWORD [8+ebp]
+	mov	DWORD [ebp],eax
+	add	edx,DWORD [12+ebp]
+	mov	DWORD [4+ebp],esi
+	add	edi,DWORD [16+ebp]
+	mov	DWORD [8+ebp],ecx
+	mov	DWORD [12+ebp],edx
+	mov	DWORD [16+ebp],edi
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+global	_sha1_block_data_order_avx
+align	16
+_sha1_block_data_order_avx:
+L$_sha1_block_data_order_avx_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	call	L$004pic_point
+L$004pic_point:
+	pop	ebp
+	lea	ebp,[(L$K_XX_XX-L$004pic_point)+ebp]
+	vzeroall
+	vmovdqa	xmm7,[ebp]
+	vmovdqa	xmm0,[16+ebp]
+	vmovdqa	xmm1,[32+ebp]
+	vmovdqa	xmm2,[48+ebp]
+	vmovdqa	xmm6,[64+ebp]
+	mov	edi,DWORD [20+esp]
+	mov	ebp,DWORD [24+esp]
+	mov	edx,DWORD [28+esp]
+	mov	esi,esp
+	sub	esp,208
+	and	esp,-64
+	vmovdqa	[112+esp],xmm0
+	vmovdqa	[128+esp],xmm1
+	vmovdqa	[144+esp],xmm2
+	shl	edx,6
+	vmovdqa	[160+esp],xmm7
+	add	edx,ebp
+	vmovdqa	[176+esp],xmm6
+	add	ebp,64
+	mov	DWORD [192+esp],edi
+	mov	DWORD [196+esp],ebp
+	mov	DWORD [200+esp],edx
+	mov	DWORD [204+esp],esi
+	mov	eax,DWORD [edi]
+	mov	ebx,DWORD [4+edi]
+	mov	ecx,DWORD [8+edi]
+	mov	edx,DWORD [12+edi]
+	mov	edi,DWORD [16+edi]
+	mov	esi,ebx
+	vmovdqu	xmm0,[ebp-64]
+	vmovdqu	xmm1,[ebp-48]
+	vmovdqu	xmm2,[ebp-32]
+	vmovdqu	xmm3,[ebp-16]
+	vpshufb	xmm0,xmm0,xmm6
+	vpshufb	xmm1,xmm1,xmm6
+	vpshufb	xmm2,xmm2,xmm6
+	vmovdqa	[96+esp],xmm7
+	vpshufb	xmm3,xmm3,xmm6
+	vpaddd	xmm4,xmm0,xmm7
+	vpaddd	xmm5,xmm1,xmm7
+	vpaddd	xmm6,xmm2,xmm7
+	vmovdqa	[esp],xmm4
+	mov	ebp,ecx
+	vmovdqa	[16+esp],xmm5
+	xor	ebp,edx
+	vmovdqa	[32+esp],xmm6
+	and	esi,ebp
+	jmp	NEAR L$005loop
+align	16
+L$005loop:
+	shrd	ebx,ebx,2
+	xor	esi,edx
+	vpalignr	xmm4,xmm1,xmm0,8
+	mov	ebp,eax
+	add	edi,DWORD [esp]
+	vpaddd	xmm7,xmm7,xmm3
+	vmovdqa	[64+esp],xmm0
+	xor	ebx,ecx
+	shld	eax,eax,5
+	vpsrldq	xmm6,xmm3,4
+	add	edi,esi
+	and	ebp,ebx
+	vpxor	xmm4,xmm4,xmm0
+	xor	ebx,ecx
+	add	edi,eax
+	vpxor	xmm6,xmm6,xmm2
+	shrd	eax,eax,7
+	xor	ebp,ecx
+	vmovdqa	[48+esp],xmm7
+	mov	esi,edi
+	add	edx,DWORD [4+esp]
+	vpxor	xmm4,xmm4,xmm6
+	xor	eax,ebx
+	shld	edi,edi,5
+	add	edx,ebp
+	and	esi,eax
+	vpsrld	xmm6,xmm4,31
+	xor	eax,ebx
+	add	edx,edi
+	shrd	edi,edi,7
+	xor	esi,ebx
+	vpslldq	xmm0,xmm4,12
+	vpaddd	xmm4,xmm4,xmm4
+	mov	ebp,edx
+	add	ecx,DWORD [8+esp]
+	xor	edi,eax
+	shld	edx,edx,5
+	vpsrld	xmm7,xmm0,30
+	vpor	xmm4,xmm4,xmm6
+	add	ecx,esi
+	and	ebp,edi
+	xor	edi,eax
+	add	ecx,edx
+	vpslld	xmm0,xmm0,2
+	shrd	edx,edx,7
+	xor	ebp,eax
+	vpxor	xmm4,xmm4,xmm7
+	mov	esi,ecx
+	add	ebx,DWORD [12+esp]
+	xor	edx,edi
+	shld	ecx,ecx,5
+	vpxor	xmm4,xmm4,xmm0
+	add	ebx,ebp
+	and	esi,edx
+	vmovdqa	xmm0,[96+esp]
+	xor	edx,edi
+	add	ebx,ecx
+	shrd	ecx,ecx,7
+	xor	esi,edi
+	vpalignr	xmm5,xmm2,xmm1,8
+	mov	ebp,ebx
+	add	eax,DWORD [16+esp]
+	vpaddd	xmm0,xmm0,xmm4
+	vmovdqa	[80+esp],xmm1
+	xor	ecx,edx
+	shld	ebx,ebx,5
+	vpsrldq	xmm7,xmm4,4
+	add	eax,esi
+	and	ebp,ecx
+	vpxor	xmm5,xmm5,xmm1
+	xor	ecx,edx
+	add	eax,ebx
+	vpxor	xmm7,xmm7,xmm3
+	shrd	ebx,ebx,7
+	xor	ebp,edx
+	vmovdqa	[esp],xmm0
+	mov	esi,eax
+	add	edi,DWORD [20+esp]
+	vpxor	xmm5,xmm5,xmm7
+	xor	ebx,ecx
+	shld	eax,eax,5
+	add	edi,ebp
+	and	esi,ebx
+	vpsrld	xmm7,xmm5,31
+	xor	ebx,ecx
+	add	edi,eax
+	shrd	eax,eax,7
+	xor	esi,ecx
+	vpslldq	xmm1,xmm5,12
+	vpaddd	xmm5,xmm5,xmm5
+	mov	ebp,edi
+	add	edx,DWORD [24+esp]
+	xor	eax,ebx
+	shld	edi,edi,5
+	vpsrld	xmm0,xmm1,30
+	vpor	xmm5,xmm5,xmm7
+	add	edx,esi
+	and	ebp,eax
+	xor	eax,ebx
+	add	edx,edi
+	vpslld	xmm1,xmm1,2
+	shrd	edi,edi,7
+	xor	ebp,ebx
+	vpxor	xmm5,xmm5,xmm0
+	mov	esi,edx
+	add	ecx,DWORD [28+esp]
+	xor	edi,eax
+	shld	edx,edx,5
+	vpxor	xmm5,xmm5,xmm1
+	add	ecx,ebp
+	and	esi,edi
+	vmovdqa	xmm1,[112+esp]
+	xor	edi,eax
+	add	ecx,edx
+	shrd	edx,edx,7
+	xor	esi,eax
+	vpalignr	xmm6,xmm3,xmm2,8
+	mov	ebp,ecx
+	add	ebx,DWORD [32+esp]
+	vpaddd	xmm1,xmm1,xmm5
+	vmovdqa	[96+esp],xmm2
+	xor	edx,edi
+	shld	ecx,ecx,5
+	vpsrldq	xmm0,xmm5,4
+	add	ebx,esi
+	and	ebp,edx
+	vpxor	xmm6,xmm6,xmm2
+	xor	edx,edi
+	add	ebx,ecx
+	vpxor	xmm0,xmm0,xmm4
+	shrd	ecx,ecx,7
+	xor	ebp,edi
+	vmovdqa	[16+esp],xmm1
+	mov	esi,ebx
+	add	eax,DWORD [36+esp]
+	vpxor	xmm6,xmm6,xmm0
+	xor	ecx,edx
+	shld	ebx,ebx,5
+	add	eax,ebp
+	and	esi,ecx
+	vpsrld	xmm0,xmm6,31
+	xor	ecx,edx
+	add	eax,ebx
+	shrd	ebx,ebx,7
+	xor	esi,edx
+	vpslldq	xmm2,xmm6,12
+	vpaddd	xmm6,xmm6,xmm6
+	mov	ebp,eax
+	add	edi,DWORD [40+esp]
+	xor	ebx,ecx
+	shld	eax,eax,5
+	vpsrld	xmm1,xmm2,30
+	vpor	xmm6,xmm6,xmm0
+	add	edi,esi
+	and	ebp,ebx
+	xor	ebx,ecx
+	add	edi,eax
+	vpslld	xmm2,xmm2,2
+	vmovdqa	xmm0,[64+esp]
+	shrd	eax,eax,7
+	xor	ebp,ecx
+	vpxor	xmm6,xmm6,xmm1
+	mov	esi,edi
+	add	edx,DWORD [44+esp]
+	xor	eax,ebx
+	shld	edi,edi,5
+	vpxor	xmm6,xmm6,xmm2
+	add	edx,ebp
+	and	esi,eax
+	vmovdqa	xmm2,[112+esp]
+	xor	eax,ebx
+	add	edx,edi
+	shrd	edi,edi,7
+	xor	esi,ebx
+	vpalignr	xmm7,xmm4,xmm3,8
+	mov	ebp,edx
+	add	ecx,DWORD [48+esp]
+	vpaddd	xmm2,xmm2,xmm6
+	vmovdqa	[64+esp],xmm3
+	xor	edi,eax
+	shld	edx,edx,5
+	vpsrldq	xmm1,xmm6,4
+	add	ecx,esi
+	and	ebp,edi
+	vpxor	xmm7,xmm7,xmm3
+	xor	edi,eax
+	add	ecx,edx
+	vpxor	xmm1,xmm1,xmm5
+	shrd	edx,edx,7
+	xor	ebp,eax
+	vmovdqa	[32+esp],xmm2
+	mov	esi,ecx
+	add	ebx,DWORD [52+esp]
+	vpxor	xmm7,xmm7,xmm1
+	xor	edx,edi
+	shld	ecx,ecx,5
+	add	ebx,ebp
+	and	esi,edx
+	vpsrld	xmm1,xmm7,31
+	xor	edx,edi
+	add	ebx,ecx
+	shrd	ecx,ecx,7
+	xor	esi,edi
+	vpslldq	xmm3,xmm7,12
+	vpaddd	xmm7,xmm7,xmm7
+	mov	ebp,ebx
+	add	eax,DWORD [56+esp]
+	xor	ecx,edx
+	shld	ebx,ebx,5
+	vpsrld	xmm2,xmm3,30
+	vpor	xmm7,xmm7,xmm1
+	add	eax,esi
+	and	ebp,ecx
+	xor	ecx,edx
+	add	eax,ebx
+	vpslld	xmm3,xmm3,2
+	vmovdqa	xmm1,[80+esp]
+	shrd	ebx,ebx,7
+	xor	ebp,edx
+	vpxor	xmm7,xmm7,xmm2
+	mov	esi,eax
+	add	edi,DWORD [60+esp]
+	xor	ebx,ecx
+	shld	eax,eax,5
+	vpxor	xmm7,xmm7,xmm3
+	add	edi,ebp
+	and	esi,ebx
+	vmovdqa	xmm3,[112+esp]
+	xor	ebx,ecx
+	add	edi,eax
+	vpalignr	xmm2,xmm7,xmm6,8
+	vpxor	xmm0,xmm0,xmm4
+	shrd	eax,eax,7
+	xor	esi,ecx
+	mov	ebp,edi
+	add	edx,DWORD [esp]
+	vpxor	xmm0,xmm0,xmm1
+	vmovdqa	[80+esp],xmm4
+	xor	eax,ebx
+	shld	edi,edi,5
+	vmovdqa	xmm4,xmm3
+	vpaddd	xmm3,xmm3,xmm7
+	add	edx,esi
+	and	ebp,eax
+	vpxor	xmm0,xmm0,xmm2
+	xor	eax,ebx
+	add	edx,edi
+	shrd	edi,edi,7
+	xor	ebp,ebx
+	vpsrld	xmm2,xmm0,30
+	vmovdqa	[48+esp],xmm3
+	mov	esi,edx
+	add	ecx,DWORD [4+esp]
+	xor	edi,eax
+	shld	edx,edx,5
+	vpslld	xmm0,xmm0,2
+	add	ecx,ebp
+	and	esi,edi
+	xor	edi,eax
+	add	ecx,edx
+	shrd	edx,edx,7
+	xor	esi,eax
+	mov	ebp,ecx
+	add	ebx,DWORD [8+esp]
+	vpor	xmm0,xmm0,xmm2
+	xor	edx,edi
+	shld	ecx,ecx,5
+	vmovdqa	xmm2,[96+esp]
+	add	ebx,esi
+	and	ebp,edx
+	xor	edx,edi
+	add	ebx,ecx
+	add	eax,DWORD [12+esp]
+	xor	ebp,edi
+	mov	esi,ebx
+	shld	ebx,ebx,5
+	add	eax,ebp
+	xor	esi,edx
+	shrd	ecx,ecx,7
+	add	eax,ebx
+	vpalignr	xmm3,xmm0,xmm7,8
+	vpxor	xmm1,xmm1,xmm5
+	add	edi,DWORD [16+esp]
+	xor	esi,ecx
+	mov	ebp,eax
+	shld	eax,eax,5
+	vpxor	xmm1,xmm1,xmm2
+	vmovdqa	[96+esp],xmm5
+	add	edi,esi
+	xor	ebp,ecx
+	vmovdqa	xmm5,xmm4
+	vpaddd	xmm4,xmm4,xmm0
+	shrd	ebx,ebx,7
+	add	edi,eax
+	vpxor	xmm1,xmm1,xmm3
+	add	edx,DWORD [20+esp]
+	xor	ebp,ebx
+	mov	esi,edi
+	shld	edi,edi,5
+	vpsrld	xmm3,xmm1,30
+	vmovdqa	[esp],xmm4
+	add	edx,ebp
+	xor	esi,ebx
+	shrd	eax,eax,7
+	add	edx,edi
+	vpslld	xmm1,xmm1,2
+	add	ecx,DWORD [24+esp]
+	xor	esi,eax
+	mov	ebp,edx
+	shld	edx,edx,5
+	add	ecx,esi
+	xor	ebp,eax
+	shrd	edi,edi,7
+	add	ecx,edx
+	vpor	xmm1,xmm1,xmm3
+	add	ebx,DWORD [28+esp]
+	xor	ebp,edi
+	vmovdqa	xmm3,[64+esp]
+	mov	esi,ecx
+	shld	ecx,ecx,5
+	add	ebx,ebp
+	xor	esi,edi
+	shrd	edx,edx,7
+	add	ebx,ecx
+	vpalignr	xmm4,xmm1,xmm0,8
+	vpxor	xmm2,xmm2,xmm6
+	add	eax,DWORD [32+esp]
+	xor	esi,edx
+	mov	ebp,ebx
+	shld	ebx,ebx,5
+	vpxor	xmm2,xmm2,xmm3
+	vmovdqa	[64+esp],xmm6
+	add	eax,esi
+	xor	ebp,edx
+	vmovdqa	xmm6,[128+esp]
+	vpaddd	xmm5,xmm5,xmm1
+	shrd	ecx,ecx,7
+	add	eax,ebx
+	vpxor	xmm2,xmm2,xmm4
+	add	edi,DWORD [36+esp]
+	xor	ebp,ecx
+	mov	esi,eax
+	shld	eax,eax,5
+	vpsrld	xmm4,xmm2,30
+	vmovdqa	[16+esp],xmm5
+	add	edi,ebp
+	xor	esi,ecx
+	shrd	ebx,ebx,7
+	add	edi,eax
+	vpslld	xmm2,xmm2,2
+	add	edx,DWORD [40+esp]
+	xor	esi,ebx
+	mov	ebp,edi
+	shld	edi,edi,5
+	add	edx,esi
+	xor	ebp,ebx
+	shrd	eax,eax,7
+	add	edx,edi
+	vpor	xmm2,xmm2,xmm4
+	add	ecx,DWORD [44+esp]
+	xor	ebp,eax
+	vmovdqa	xmm4,[80+esp]
+	mov	esi,edx
+	shld	edx,edx,5
+	add	ecx,ebp
+	xor	esi,eax
+	shrd	edi,edi,7
+	add	ecx,edx
+	vpalignr	xmm5,xmm2,xmm1,8
+	vpxor	xmm3,xmm3,xmm7
+	add	ebx,DWORD [48+esp]
+	xor	esi,edi
+	mov	ebp,ecx
+	shld	ecx,ecx,5
+	vpxor	xmm3,xmm3,xmm4
+	vmovdqa	[80+esp],xmm7
+	add	ebx,esi
+	xor	ebp,edi
+	vmovdqa	xmm7,xmm6
+	vpaddd	xmm6,xmm6,xmm2
+	shrd	edx,edx,7
+	add	ebx,ecx
+	vpxor	xmm3,xmm3,xmm5
+	add	eax,DWORD [52+esp]
+	xor	ebp,edx
+	mov	esi,ebx
+	shld	ebx,ebx,5
+	vpsrld	xmm5,xmm3,30
+	vmovdqa	[32+esp],xmm6
+	add	eax,ebp
+	xor	esi,edx
+	shrd	ecx,ecx,7
+	add	eax,ebx
+	vpslld	xmm3,xmm3,2
+	add	edi,DWORD [56+esp]
+	xor	esi,ecx
+	mov	ebp,eax
+	shld	eax,eax,5
+	add	edi,esi
+	xor	ebp,ecx
+	shrd	ebx,ebx,7
+	add	edi,eax
+	vpor	xmm3,xmm3,xmm5
+	add	edx,DWORD [60+esp]
+	xor	ebp,ebx
+	vmovdqa	xmm5,[96+esp]
+	mov	esi,edi
+	shld	edi,edi,5
+	add	edx,ebp
+	xor	esi,ebx
+	shrd	eax,eax,7
+	add	edx,edi
+	vpalignr	xmm6,xmm3,xmm2,8
+	vpxor	xmm4,xmm4,xmm0
+	add	ecx,DWORD [esp]
+	xor	esi,eax
+	mov	ebp,edx
+	shld	edx,edx,5
+	vpxor	xmm4,xmm4,xmm5
+	vmovdqa	[96+esp],xmm0
+	add	ecx,esi
+	xor	ebp,eax
+	vmovdqa	xmm0,xmm7
+	vpaddd	xmm7,xmm7,xmm3
+	shrd	edi,edi,7
+	add	ecx,edx
+	vpxor	xmm4,xmm4,xmm6
+	add	ebx,DWORD [4+esp]
+	xor	ebp,edi
+	mov	esi,ecx
+	shld	ecx,ecx,5
+	vpsrld	xmm6,xmm4,30
+	vmovdqa	[48+esp],xmm7
+	add	ebx,ebp
+	xor	esi,edi
+	shrd	edx,edx,7
+	add	ebx,ecx
+	vpslld	xmm4,xmm4,2
+	add	eax,DWORD [8+esp]
+	xor	esi,edx
+	mov	ebp,ebx
+	shld	ebx,ebx,5
+	add	eax,esi
+	xor	ebp,edx
+	shrd	ecx,ecx,7
+	add	eax,ebx
+	vpor	xmm4,xmm4,xmm6
+	add	edi,DWORD [12+esp]
+	xor	ebp,ecx
+	vmovdqa	xmm6,[64+esp]
+	mov	esi,eax
+	shld	eax,eax,5
+	add	edi,ebp
+	xor	esi,ecx
+	shrd	ebx,ebx,7
+	add	edi,eax
+	vpalignr	xmm7,xmm4,xmm3,8
+	vpxor	xmm5,xmm5,xmm1
+	add	edx,DWORD [16+esp]
+	xor	esi,ebx
+	mov	ebp,edi
+	shld	edi,edi,5
+	vpxor	xmm5,xmm5,xmm6
+	vmovdqa	[64+esp],xmm1
+	add	edx,esi
+	xor	ebp,ebx
+	vmovdqa	xmm1,xmm0
+	vpaddd	xmm0,xmm0,xmm4
+	shrd	eax,eax,7
+	add	edx,edi
+	vpxor	xmm5,xmm5,xmm7
+	add	ecx,DWORD [20+esp]
+	xor	ebp,eax
+	mov	esi,edx
+	shld	edx,edx,5
+	vpsrld	xmm7,xmm5,30
+	vmovdqa	[esp],xmm0
+	add	ecx,ebp
+	xor	esi,eax
+	shrd	edi,edi,7
+	add	ecx,edx
+	vpslld	xmm5,xmm5,2
+	add	ebx,DWORD [24+esp]
+	xor	esi,edi
+	mov	ebp,ecx
+	shld	ecx,ecx,5
+	add	ebx,esi
+	xor	ebp,edi
+	shrd	edx,edx,7
+	add	ebx,ecx
+	vpor	xmm5,xmm5,xmm7
+	add	eax,DWORD [28+esp]
+	vmovdqa	xmm7,[80+esp]
+	shrd	ecx,ecx,7
+	mov	esi,ebx
+	xor	ebp,edx
+	shld	ebx,ebx,5
+	add	eax,ebp
+	xor	esi,ecx
+	xor	ecx,edx
+	add	eax,ebx
+	vpalignr	xmm0,xmm5,xmm4,8
+	vpxor	xmm6,xmm6,xmm2
+	add	edi,DWORD [32+esp]
+	and	esi,ecx
+	xor	ecx,edx
+	shrd	ebx,ebx,7
+	vpxor	xmm6,xmm6,xmm7
+	vmovdqa	[80+esp],xmm2
+	mov	ebp,eax
+	xor	esi,ecx
+	vmovdqa	xmm2,xmm1
+	vpaddd	xmm1,xmm1,xmm5
+	shld	eax,eax,5
+	add	edi,esi
+	vpxor	xmm6,xmm6,xmm0
+	xor	ebp,ebx
+	xor	ebx,ecx
+	add	edi,eax
+	add	edx,DWORD [36+esp]
+	vpsrld	xmm0,xmm6,30
+	vmovdqa	[16+esp],xmm1
+	and	ebp,ebx
+	xor	ebx,ecx
+	shrd	eax,eax,7
+	mov	esi,edi
+	vpslld	xmm6,xmm6,2
+	xor	ebp,ebx
+	shld	edi,edi,5
+	add	edx,ebp
+	xor	esi,eax
+	xor	eax,ebx
+	add	edx,edi
+	add	ecx,DWORD [40+esp]
+	and	esi,eax
+	vpor	xmm6,xmm6,xmm0
+	xor	eax,ebx
+	shrd	edi,edi,7
+	vmovdqa	xmm0,[96+esp]
+	mov	ebp,edx
+	xor	esi,eax
+	shld	edx,edx,5
+	add	ecx,esi
+	xor	ebp,edi
+	xor	edi,eax
+	add	ecx,edx
+	add	ebx,DWORD [44+esp]
+	and	ebp,edi
+	xor	edi,eax
+	shrd	edx,edx,7
+	mov	esi,ecx
+	xor	ebp,edi
+	shld	ecx,ecx,5
+	add	ebx,ebp
+	xor	esi,edx
+	xor	edx,edi
+	add	ebx,ecx
+	vpalignr	xmm1,xmm6,xmm5,8
+	vpxor	xmm7,xmm7,xmm3
+	add	eax,DWORD [48+esp]
+	and	esi,edx
+	xor	edx,edi
+	shrd	ecx,ecx,7
+	vpxor	xmm7,xmm7,xmm0
+	vmovdqa	[96+esp],xmm3
+	mov	ebp,ebx
+	xor	esi,edx
+	vmovdqa	xmm3,[144+esp]
+	vpaddd	xmm2,xmm2,xmm6
+	shld	ebx,ebx,5
+	add	eax,esi
+	vpxor	xmm7,xmm7,xmm1
+	xor	ebp,ecx
+	xor	ecx,edx
+	add	eax,ebx
+	add	edi,DWORD [52+esp]
+	vpsrld	xmm1,xmm7,30
+	vmovdqa	[32+esp],xmm2
+	and	ebp,ecx
+	xor	ecx,edx
+	shrd	ebx,ebx,7
+	mov	esi,eax
+	vpslld	xmm7,xmm7,2
+	xor	ebp,ecx
+	shld	eax,eax,5
+	add	edi,ebp
+	xor	esi,ebx
+	xor	ebx,ecx
+	add	edi,eax
+	add	edx,DWORD [56+esp]
+	and	esi,ebx
+	vpor	xmm7,xmm7,xmm1
+	xor	ebx,ecx
+	shrd	eax,eax,7
+	vmovdqa	xmm1,[64+esp]
+	mov	ebp,edi
+	xor	esi,ebx
+	shld	edi,edi,5
+	add	edx,esi
+	xor	ebp,eax
+	xor	eax,ebx
+	add	edx,edi
+	add	ecx,DWORD [60+esp]
+	and	ebp,eax
+	xor	eax,ebx
+	shrd	edi,edi,7
+	mov	esi,edx
+	xor	ebp,eax
+	shld	edx,edx,5
+	add	ecx,ebp
+	xor	esi,edi
+	xor	edi,eax
+	add	ecx,edx
+	vpalignr	xmm2,xmm7,xmm6,8
+	vpxor	xmm0,xmm0,xmm4
+	add	ebx,DWORD [esp]
+	and	esi,edi
+	xor	edi,eax
+	shrd	edx,edx,7
+	vpxor	xmm0,xmm0,xmm1
+	vmovdqa	[64+esp],xmm4
+	mov	ebp,ecx
+	xor	esi,edi
+	vmovdqa	xmm4,xmm3
+	vpaddd	xmm3,xmm3,xmm7
+	shld	ecx,ecx,5
+	add	ebx,esi
+	vpxor	xmm0,xmm0,xmm2
+	xor	ebp,edx
+	xor	edx,edi
+	add	ebx,ecx
+	add	eax,DWORD [4+esp]
+	vpsrld	xmm2,xmm0,30
+	vmovdqa	[48+esp],xmm3
+	and	ebp,edx
+	xor	edx,edi
+	shrd	ecx,ecx,7
+	mov	esi,ebx
+	vpslld	xmm0,xmm0,2
+	xor	ebp,edx
+	shld	ebx,ebx,5
+	add	eax,ebp
+	xor	esi,ecx
+	xor	ecx,edx
+	add	eax,ebx
+	add	edi,DWORD [8+esp]
+	and	esi,ecx
+	vpor	xmm0,xmm0,xmm2
+	xor	ecx,edx
+	shrd	ebx,ebx,7
+	vmovdqa	xmm2,[80+esp]
+	mov	ebp,eax
+	xor	esi,ecx
+	shld	eax,eax,5
+	add	edi,esi
+	xor	ebp,ebx
+	xor	ebx,ecx
+	add	edi,eax
+	add	edx,DWORD [12+esp]
+	and	ebp,ebx
+	xor	ebx,ecx
+	shrd	eax,eax,7
+	mov	esi,edi
+	xor	ebp,ebx
+	shld	edi,edi,5
+	add	edx,ebp
+	xor	esi,eax
+	xor	eax,ebx
+	add	edx,edi
+	vpalignr	xmm3,xmm0,xmm7,8
+	vpxor	xmm1,xmm1,xmm5
+	add	ecx,DWORD [16+esp]
+	and	esi,eax
+	xor	eax,ebx
+	shrd	edi,edi,7
+	vpxor	xmm1,xmm1,xmm2
+	vmovdqa	[80+esp],xmm5
+	mov	ebp,edx
+	xor	esi,eax
+	vmovdqa	xmm5,xmm4
+	vpaddd	xmm4,xmm4,xmm0
+	shld	edx,edx,5
+	add	ecx,esi
+	vpxor	xmm1,xmm1,xmm3
+	xor	ebp,edi
+	xor	edi,eax
+	add	ecx,edx
+	add	ebx,DWORD [20+esp]
+	vpsrld	xmm3,xmm1,30
+	vmovdqa	[esp],xmm4
+	and	ebp,edi
+	xor	edi,eax
+	shrd	edx,edx,7
+	mov	esi,ecx
+	vpslld	xmm1,xmm1,2
+	xor	ebp,edi
+	shld	ecx,ecx,5
+	add	ebx,ebp
+	xor	esi,edx
+	xor	edx,edi
+	add	ebx,ecx
+	add	eax,DWORD [24+esp]
+	and	esi,edx
+	vpor	xmm1,xmm1,xmm3
+	xor	edx,edi
+	shrd	ecx,ecx,7
+	vmovdqa	xmm3,[96+esp]
+	mov	ebp,ebx
+	xor	esi,edx
+	shld	ebx,ebx,5
+	add	eax,esi
+	xor	ebp,ecx
+	xor	ecx,edx
+	add	eax,ebx
+	add	edi,DWORD [28+esp]
+	and	ebp,ecx
+	xor	ecx,edx
+	shrd	ebx,ebx,7
+	mov	esi,eax
+	xor	ebp,ecx
+	shld	eax,eax,5
+	add	edi,ebp
+	xor	esi,ebx
+	xor	ebx,ecx
+	add	edi,eax
+	vpalignr	xmm4,xmm1,xmm0,8
+	vpxor	xmm2,xmm2,xmm6
+	add	edx,DWORD [32+esp]
+	and	esi,ebx
+	xor	ebx,ecx
+	shrd	eax,eax,7
+	vpxor	xmm2,xmm2,xmm3
+	vmovdqa	[96+esp],xmm6
+	mov	ebp,edi
+	xor	esi,ebx
+	vmovdqa	xmm6,xmm5
+	vpaddd	xmm5,xmm5,xmm1
+	shld	edi,edi,5
+	add	edx,esi
+	vpxor	xmm2,xmm2,xmm4
+	xor	ebp,eax
+	xor	eax,ebx
+	add	edx,edi
+	add	ecx,DWORD [36+esp]
+	vpsrld	xmm4,xmm2,30
+	vmovdqa	[16+esp],xmm5
+	and	ebp,eax
+	xor	eax,ebx
+	shrd	edi,edi,7
+	mov	esi,edx
+	vpslld	xmm2,xmm2,2
+	xor	ebp,eax
+	shld	edx,edx,5
+	add	ecx,ebp
+	xor	esi,edi
+	xor	edi,eax
+	add	ecx,edx
+	add	ebx,DWORD [40+esp]
+	and	esi,edi
+	vpor	xmm2,xmm2,xmm4
+	xor	edi,eax
+	shrd	edx,edx,7
+	vmovdqa	xmm4,[64+esp]
+	mov	ebp,ecx
+	xor	esi,edi
+	shld	ecx,ecx,5
+	add	ebx,esi
+	xor	ebp,edx
+	xor	edx,edi
+	add	ebx,ecx
+	add	eax,DWORD [44+esp]
+	and	ebp,edx
+	xor	edx,edi
+	shrd	ecx,ecx,7
+	mov	esi,ebx
+	xor	ebp,edx
+	shld	ebx,ebx,5
+	add	eax,ebp
+	xor	esi,edx
+	add	eax,ebx
+	vpalignr	xmm5,xmm2,xmm1,8
+	vpxor	xmm3,xmm3,xmm7
+	add	edi,DWORD [48+esp]
+	xor	esi,ecx
+	mov	ebp,eax
+	shld	eax,eax,5
+	vpxor	xmm3,xmm3,xmm4
+	vmovdqa	[64+esp],xmm7
+	add	edi,esi
+	xor	ebp,ecx
+	vmovdqa	xmm7,xmm6
+	vpaddd	xmm6,xmm6,xmm2
+	shrd	ebx,ebx,7
+	add	edi,eax
+	vpxor	xmm3,xmm3,xmm5
+	add	edx,DWORD [52+esp]
+	xor	ebp,ebx
+	mov	esi,edi
+	shld	edi,edi,5
+	vpsrld	xmm5,xmm3,30
+	vmovdqa	[32+esp],xmm6
+	add	edx,ebp
+	xor	esi,ebx
+	shrd	eax,eax,7
+	add	edx,edi
+	vpslld	xmm3,xmm3,2
+	add	ecx,DWORD [56+esp]
+	xor	esi,eax
+	mov	ebp,edx
+	shld	edx,edx,5
+	add	ecx,esi
+	xor	ebp,eax
+	shrd	edi,edi,7
+	add	ecx,edx
+	vpor	xmm3,xmm3,xmm5
+	add	ebx,DWORD [60+esp]
+	xor	ebp,edi
+	mov	esi,ecx
+	shld	ecx,ecx,5
+	add	ebx,ebp
+	xor	esi,edi
+	shrd	edx,edx,7
+	add	ebx,ecx
+	add	eax,DWORD [esp]
+	vpaddd	xmm7,xmm7,xmm3
+	xor	esi,edx
+	mov	ebp,ebx
+	shld	ebx,ebx,5
+	add	eax,esi
+	vmovdqa	[48+esp],xmm7
+	xor	ebp,edx
+	shrd	ecx,ecx,7
+	add	eax,ebx
+	add	edi,DWORD [4+esp]
+	xor	ebp,ecx
+	mov	esi,eax
+	shld	eax,eax,5
+	add	edi,ebp
+	xor	esi,ecx
+	shrd	ebx,ebx,7
+	add	edi,eax
+	add	edx,DWORD [8+esp]
+	xor	esi,ebx
+	mov	ebp,edi
+	shld	edi,edi,5
+	add	edx,esi
+	xor	ebp,ebx
+	shrd	eax,eax,7
+	add	edx,edi
+	add	ecx,DWORD [12+esp]
+	xor	ebp,eax
+	mov	esi,edx
+	shld	edx,edx,5
+	add	ecx,ebp
+	xor	esi,eax
+	shrd	edi,edi,7
+	add	ecx,edx
+	mov	ebp,DWORD [196+esp]
+	cmp	ebp,DWORD [200+esp]
+	je	NEAR L$006done
+	vmovdqa	xmm7,[160+esp]
+	vmovdqa	xmm6,[176+esp]
+	vmovdqu	xmm0,[ebp]
+	vmovdqu	xmm1,[16+ebp]
+	vmovdqu	xmm2,[32+ebp]
+	vmovdqu	xmm3,[48+ebp]
+	add	ebp,64
+	vpshufb	xmm0,xmm0,xmm6
+	mov	DWORD [196+esp],ebp
+	vmovdqa	[96+esp],xmm7
+	add	ebx,DWORD [16+esp]
+	xor	esi,edi
+	vpshufb	xmm1,xmm1,xmm6
+	mov	ebp,ecx
+	shld	ecx,ecx,5
+	vpaddd	xmm4,xmm0,xmm7
+	add	ebx,esi
+	xor	ebp,edi
+	shrd	edx,edx,7
+	add	ebx,ecx
+	vmovdqa	[esp],xmm4
+	add	eax,DWORD [20+esp]
+	xor	ebp,edx
+	mov	esi,ebx
+	shld	ebx,ebx,5
+	add	eax,ebp
+	xor	esi,edx
+	shrd	ecx,ecx,7
+	add	eax,ebx
+	add	edi,DWORD [24+esp]
+	xor	esi,ecx
+	mov	ebp,eax
+	shld	eax,eax,5
+	add	edi,esi
+	xor	ebp,ecx
+	shrd	ebx,ebx,7
+	add	edi,eax
+	add	edx,DWORD [28+esp]
+	xor	ebp,ebx
+	mov	esi,edi
+	shld	edi,edi,5
+	add	edx,ebp
+	xor	esi,ebx
+	shrd	eax,eax,7
+	add	edx,edi
+	add	ecx,DWORD [32+esp]
+	xor	esi,eax
+	vpshufb	xmm2,xmm2,xmm6
+	mov	ebp,edx
+	shld	edx,edx,5
+	vpaddd	xmm5,xmm1,xmm7
+	add	ecx,esi
+	xor	ebp,eax
+	shrd	edi,edi,7
+	add	ecx,edx
+	vmovdqa	[16+esp],xmm5
+	add	ebx,DWORD [36+esp]
+	xor	ebp,edi
+	mov	esi,ecx
+	shld	ecx,ecx,5
+	add	ebx,ebp
+	xor	esi,edi
+	shrd	edx,edx,7
+	add	ebx,ecx
+	add	eax,DWORD [40+esp]
+	xor	esi,edx
+	mov	ebp,ebx
+	shld	ebx,ebx,5
+	add	eax,esi
+	xor	ebp,edx
+	shrd	ecx,ecx,7
+	add	eax,ebx
+	add	edi,DWORD [44+esp]
+	xor	ebp,ecx
+	mov	esi,eax
+	shld	eax,eax,5
+	add	edi,ebp
+	xor	esi,ecx
+	shrd	ebx,ebx,7
+	add	edi,eax
+	add	edx,DWORD [48+esp]
+	xor	esi,ebx
+	vpshufb	xmm3,xmm3,xmm6
+	mov	ebp,edi
+	shld	edi,edi,5
+	vpaddd	xmm6,xmm2,xmm7
+	add	edx,esi
+	xor	ebp,ebx
+	shrd	eax,eax,7
+	add	edx,edi
+	vmovdqa	[32+esp],xmm6
+	add	ecx,DWORD [52+esp]
+	xor	ebp,eax
+	mov	esi,edx
+	shld	edx,edx,5
+	add	ecx,ebp
+	xor	esi,eax
+	shrd	edi,edi,7
+	add	ecx,edx
+	add	ebx,DWORD [56+esp]
+	xor	esi,edi
+	mov	ebp,ecx
+	shld	ecx,ecx,5
+	add	ebx,esi
+	xor	ebp,edi
+	shrd	edx,edx,7
+	add	ebx,ecx
+	add	eax,DWORD [60+esp]
+	xor	ebp,edx
+	mov	esi,ebx
+	shld	ebx,ebx,5
+	add	eax,ebp
+	shrd	ecx,ecx,7
+	add	eax,ebx
+	mov	ebp,DWORD [192+esp]
+	add	eax,DWORD [ebp]
+	add	esi,DWORD [4+ebp]
+	add	ecx,DWORD [8+ebp]
+	mov	DWORD [ebp],eax
+	add	edx,DWORD [12+ebp]
+	mov	DWORD [4+ebp],esi
+	add	edi,DWORD [16+ebp]
+	mov	ebx,ecx
+	mov	DWORD [8+ebp],ecx
+	xor	ebx,edx
+	mov	DWORD [12+ebp],edx
+	mov	DWORD [16+ebp],edi
+	mov	ebp,esi
+	and	esi,ebx
+	mov	ebx,ebp
+	jmp	NEAR L$005loop
+align	16
+L$006done:
+	add	ebx,DWORD [16+esp]
+	xor	esi,edi
+	mov	ebp,ecx
+	shld	ecx,ecx,5
+	add	ebx,esi
+	xor	ebp,edi
+	shrd	edx,edx,7
+	add	ebx,ecx
+	add	eax,DWORD [20+esp]
+	xor	ebp,edx
+	mov	esi,ebx
+	shld	ebx,ebx,5
+	add	eax,ebp
+	xor	esi,edx
+	shrd	ecx,ecx,7
+	add	eax,ebx
+	add	edi,DWORD [24+esp]
+	xor	esi,ecx
+	mov	ebp,eax
+	shld	eax,eax,5
+	add	edi,esi
+	xor	ebp,ecx
+	shrd	ebx,ebx,7
+	add	edi,eax
+	add	edx,DWORD [28+esp]
+	xor	ebp,ebx
+	mov	esi,edi
+	shld	edi,edi,5
+	add	edx,ebp
+	xor	esi,ebx
+	shrd	eax,eax,7
+	add	edx,edi
+	add	ecx,DWORD [32+esp]
+	xor	esi,eax
+	mov	ebp,edx
+	shld	edx,edx,5
+	add	ecx,esi
+	xor	ebp,eax
+	shrd	edi,edi,7
+	add	ecx,edx
+	add	ebx,DWORD [36+esp]
+	xor	ebp,edi
+	mov	esi,ecx
+	shld	ecx,ecx,5
+	add	ebx,ebp
+	xor	esi,edi
+	shrd	edx,edx,7
+	add	ebx,ecx
+	add	eax,DWORD [40+esp]
+	xor	esi,edx
+	mov	ebp,ebx
+	shld	ebx,ebx,5
+	add	eax,esi
+	xor	ebp,edx
+	shrd	ecx,ecx,7
+	add	eax,ebx
+	add	edi,DWORD [44+esp]
+	xor	ebp,ecx
+	mov	esi,eax
+	shld	eax,eax,5
+	add	edi,ebp
+	xor	esi,ecx
+	shrd	ebx,ebx,7
+	add	edi,eax
+	add	edx,DWORD [48+esp]
+	xor	esi,ebx
+	mov	ebp,edi
+	shld	edi,edi,5
+	add	edx,esi
+	xor	ebp,ebx
+	shrd	eax,eax,7
+	add	edx,edi
+	add	ecx,DWORD [52+esp]
+	xor	ebp,eax
+	mov	esi,edx
+	shld	edx,edx,5
+	add	ecx,ebp
+	xor	esi,eax
+	shrd	edi,edi,7
+	add	ecx,edx
+	add	ebx,DWORD [56+esp]
+	xor	esi,edi
+	mov	ebp,ecx
+	shld	ecx,ecx,5
+	add	ebx,esi
+	xor	ebp,edi
+	shrd	edx,edx,7
+	add	ebx,ecx
+	add	eax,DWORD [60+esp]
+	xor	ebp,edx
+	mov	esi,ebx
+	shld	ebx,ebx,5
+	add	eax,ebp
+	shrd	ecx,ecx,7
+	add	eax,ebx
+	vzeroall
+	mov	ebp,DWORD [192+esp]
+	add	eax,DWORD [ebp]
+	mov	esp,DWORD [204+esp]
+	add	esi,DWORD [4+ebp]
+	add	ecx,DWORD [8+ebp]
+	mov	DWORD [ebp],eax
+	add	edx,DWORD [12+ebp]
+	mov	DWORD [4+ebp],esi
+	add	edi,DWORD [16+ebp]
+	mov	DWORD [8+ebp],ecx
+	mov	DWORD [12+ebp],edx
+	mov	DWORD [16+ebp],edi
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+align	64
+L$K_XX_XX:
+dd	1518500249,1518500249,1518500249,1518500249
+dd	1859775393,1859775393,1859775393,1859775393
+dd	2400959708,2400959708,2400959708,2400959708
+dd	3395469782,3395469782,3395469782,3395469782
+dd	66051,67438087,134810123,202182159
+db	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+db	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115
+db	102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82
+db	89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
+db	114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/sha1-armv4-large-linux.S b/gen/bcm/sha1-armv4-large-linux.S
new file mode 100644
index 0000000..323e6e6
--- /dev/null
+++ b/gen/bcm/sha1-armv4-large-linux.S
@@ -0,0 +1,1481 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+.text
+#if defined(__thumb2__)
+.syntax	unified
+.thumb
+#else
+.code	32
+#endif
+
+.globl	sha1_block_data_order_nohw
+.hidden	sha1_block_data_order_nohw
+.type	sha1_block_data_order_nohw,%function
+
+.align	5
+sha1_block_data_order_nohw:
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+	add	r2,r1,r2,lsl#6	@ r2 to point at the end of r1
+	ldmia	r0,{r3,r4,r5,r6,r7}
+.Lloop:
+	ldr	r8,.LK_00_19
+	mov	r14,sp
+	sub	sp,sp,#15*4
+	mov	r5,r5,ror#30
+	mov	r6,r6,ror#30
+	mov	r7,r7,ror#30		@ [6]
+.L_00_15:
+#if __ARM_ARCH<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
+	add	r7,r8,r7,ror#2			@ E+=K_00_19
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
+	eor	r10,r5,r6			@ F_xx_xx
+	orr	r9,r9,r11,lsl#16
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r7,r8,r7,ror#2			@ E+=K_00_19
+	eor	r10,r5,r6			@ F_xx_xx
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
+	and	r10,r4,r10,ror#2
+	add	r7,r7,r9			@ E+=X[i]
+	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
+	str	r9,[r14,#-4]!
+	add	r7,r7,r10			@ E+=F_00_19(B,C,D)
+#if __ARM_ARCH<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
+	add	r6,r8,r6,ror#2			@ E+=K_00_19
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
+	eor	r10,r4,r5			@ F_xx_xx
+	orr	r9,r9,r11,lsl#16
+	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r6,r8,r6,ror#2			@ E+=K_00_19
+	eor	r10,r4,r5			@ F_xx_xx
+	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
+	and	r10,r3,r10,ror#2
+	add	r6,r6,r9			@ E+=X[i]
+	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
+	str	r9,[r14,#-4]!
+	add	r6,r6,r10			@ E+=F_00_19(B,C,D)
+#if __ARM_ARCH<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
+	add	r5,r8,r5,ror#2			@ E+=K_00_19
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
+	eor	r10,r3,r4			@ F_xx_xx
+	orr	r9,r9,r11,lsl#16
+	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r5,r8,r5,ror#2			@ E+=K_00_19
+	eor	r10,r3,r4			@ F_xx_xx
+	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
+	and	r10,r7,r10,ror#2
+	add	r5,r5,r9			@ E+=X[i]
+	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
+	str	r9,[r14,#-4]!
+	add	r5,r5,r10			@ E+=F_00_19(B,C,D)
+#if __ARM_ARCH<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
+	add	r4,r8,r4,ror#2			@ E+=K_00_19
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
+	eor	r10,r7,r3			@ F_xx_xx
+	orr	r9,r9,r11,lsl#16
+	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r4,r8,r4,ror#2			@ E+=K_00_19
+	eor	r10,r7,r3			@ F_xx_xx
+	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
+	and	r10,r6,r10,ror#2
+	add	r4,r4,r9			@ E+=X[i]
+	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
+	str	r9,[r14,#-4]!
+	add	r4,r4,r10			@ E+=F_00_19(B,C,D)
+#if __ARM_ARCH<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
+	add	r3,r8,r3,ror#2			@ E+=K_00_19
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
+	eor	r10,r6,r7			@ F_xx_xx
+	orr	r9,r9,r11,lsl#16
+	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r3,r8,r3,ror#2			@ E+=K_00_19
+	eor	r10,r6,r7			@ F_xx_xx
+	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
+	and	r10,r5,r10,ror#2
+	add	r3,r3,r9			@ E+=X[i]
+	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
+	str	r9,[r14,#-4]!
+	add	r3,r3,r10			@ E+=F_00_19(B,C,D)
+#if defined(__thumb2__)
+	mov	r12,sp
+	teq	r14,r12
+#else
+	teq	r14,sp
+#endif
+	bne	.L_00_15		@ [((11+4)*5+2)*3]
+	sub	sp,sp,#25*4
+#if __ARM_ARCH<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
+	add	r7,r8,r7,ror#2			@ E+=K_00_19
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
+	eor	r10,r5,r6			@ F_xx_xx
+	orr	r9,r9,r11,lsl#16
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r7,r8,r7,ror#2			@ E+=K_00_19
+	eor	r10,r5,r6			@ F_xx_xx
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
+	and	r10,r4,r10,ror#2
+	add	r7,r7,r9			@ E+=X[i]
+	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
+	str	r9,[r14,#-4]!
+	add	r7,r7,r10			@ E+=F_00_19(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r4,r5			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and	r10,r3,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r6,r6,r9			@ E+=X[i]
+	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
+	add	r6,r6,r10			@ E+=F_00_19(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r3,r4			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and	r10,r7,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r5,r5,r9			@ E+=X[i]
+	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
+	add	r5,r5,r10			@ E+=F_00_19(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r7,r3			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and	r10,r6,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r4,r4,r9			@ E+=X[i]
+	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
+	add	r4,r4,r10			@ E+=F_00_19(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r6,r7			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and	r10,r5,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r3,r3,r9			@ E+=X[i]
+	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
+	add	r3,r3,r10			@ E+=F_00_19(B,C,D)
+
+	ldr	r8,.LK_20_39		@ [+15+16*4]
+	cmn	sp,#0			@ [+3], clear carry to denote 20_39
+.L_20_39_or_60_79:
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r5,r6			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	eor	r10,r4,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r7,r7,r9			@ E+=X[i]
+	add	r7,r7,r10			@ E+=F_20_39(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r4,r5			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	eor	r10,r3,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r6,r6,r9			@ E+=X[i]
+	add	r6,r6,r10			@ E+=F_20_39(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r3,r4			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	eor	r10,r7,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r5,r5,r9			@ E+=X[i]
+	add	r5,r5,r10			@ E+=F_20_39(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r7,r3			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	eor	r10,r6,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r4,r4,r9			@ E+=X[i]
+	add	r4,r4,r10			@ E+=F_20_39(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r6,r7			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	eor	r10,r5,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r3,r3,r9			@ E+=X[i]
+	add	r3,r3,r10			@ E+=F_20_39(B,C,D)
+#if defined(__thumb2__)
+	mov	r12,sp
+	teq	r14,r12
+#else
+	teq	r14,sp			@ preserve carry
+#endif
+	bne	.L_20_39_or_60_79	@ [+((12+3)*5+2)*4]
+	bcs	.L_done			@ [+((12+3)*5+2)*4], spare 300 bytes
+
+	ldr	r8,.LK_40_59
+	sub	sp,sp,#20*4		@ [+2]
+.L_40_59:
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r5,r6			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and	r10,r4,r10,ror#2					@ F_xx_xx
+	and	r11,r5,r6					@ F_xx_xx
+	add	r7,r7,r9			@ E+=X[i]
+	add	r7,r7,r10			@ E+=F_40_59(B,C,D)
+	add	r7,r7,r11,ror#2
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r4,r5			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and	r10,r3,r10,ror#2					@ F_xx_xx
+	and	r11,r4,r5					@ F_xx_xx
+	add	r6,r6,r9			@ E+=X[i]
+	add	r6,r6,r10			@ E+=F_40_59(B,C,D)
+	add	r6,r6,r11,ror#2
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r3,r4			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and	r10,r7,r10,ror#2					@ F_xx_xx
+	and	r11,r3,r4					@ F_xx_xx
+	add	r5,r5,r9			@ E+=X[i]
+	add	r5,r5,r10			@ E+=F_40_59(B,C,D)
+	add	r5,r5,r11,ror#2
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r7,r3			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and	r10,r6,r10,ror#2					@ F_xx_xx
+	and	r11,r7,r3					@ F_xx_xx
+	add	r4,r4,r9			@ E+=X[i]
+	add	r4,r4,r10			@ E+=F_40_59(B,C,D)
+	add	r4,r4,r11,ror#2
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r6,r7			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and	r10,r5,r10,ror#2					@ F_xx_xx
+	and	r11,r6,r7					@ F_xx_xx
+	add	r3,r3,r9			@ E+=X[i]
+	add	r3,r3,r10			@ E+=F_40_59(B,C,D)
+	add	r3,r3,r11,ror#2
+#if defined(__thumb2__)
+	mov	r12,sp
+	teq	r14,r12
+#else
+	teq	r14,sp
+#endif
+	bne	.L_40_59		@ [+((12+5)*5+2)*4]
+
+	ldr	r8,.LK_60_79
+	sub	sp,sp,#20*4
+	cmp	sp,#0			@ set carry to denote 60_79
+	b	.L_20_39_or_60_79	@ [+4], spare 300 bytes
+.L_done:
+	add	sp,sp,#80*4		@ "deallocate" stack frame
+	ldmia	r0,{r8,r9,r10,r11,r12}
+	add	r3,r8,r3
+	add	r4,r9,r4
+	add	r5,r10,r5,ror#2
+	add	r6,r11,r6,ror#2
+	add	r7,r12,r7,ror#2
+	stmia	r0,{r3,r4,r5,r6,r7}
+	teq	r1,r2
+	bne	.Lloop			@ [+18], total 1307
+
+#if __ARM_ARCH>=5
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+#else
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+.size	sha1_block_data_order_nohw,.-sha1_block_data_order_nohw
+
+.align	5
+.LK_00_19:.word	0x5a827999
+.LK_20_39:.word	0x6ed9eba1
+.LK_40_59:.word	0x8f1bbcdc
+.LK_60_79:.word	0xca62c1d6
+.byte	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	5
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.globl	sha1_block_data_order_neon
+.hidden	sha1_block_data_order_neon
+.type	sha1_block_data_order_neon,%function
+.align	4
+sha1_block_data_order_neon:
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+	add	r2,r1,r2,lsl#6	@ r2 to point at the end of r1
+	@ dmb				@ errata #451034 on early Cortex A8
+	@ vstmdb	sp!,{d8-d15}	@ ABI specification says so
+	mov	r14,sp
+	sub	r12,sp,#64
+	adr	r8,.LK_00_19
+	bic	r12,r12,#15		@ align for 128-bit stores
+
+	ldmia	r0,{r3,r4,r5,r6,r7}	@ load context
+	mov	sp,r12		@ alloca
+
+	vld1.8	{q0,q1},[r1]!	@ handles unaligned
+	veor	q15,q15,q15
+	vld1.8	{q2,q3},[r1]!
+	vld1.32	{d28[],d29[]},[r8,:32]!	@ load K_00_19
+	vrev32.8	q0,q0		@ yes, even on
+	vrev32.8	q1,q1		@ big-endian...
+	vrev32.8	q2,q2
+	vadd.i32	q8,q0,q14
+	vrev32.8	q3,q3
+	vadd.i32	q9,q1,q14
+	vst1.32	{q8},[r12,:128]!
+	vadd.i32	q10,q2,q14
+	vst1.32	{q9},[r12,:128]!
+	vst1.32	{q10},[r12,:128]!
+	ldr	r9,[sp]			@ big RAW stall
+
+.Loop_neon:
+	vext.8	q8,q0,q1,#8
+	bic	r10,r6,r4
+	add	r7,r7,r9
+	and	r11,r5,r4
+	vadd.i32	q13,q3,q14
+	ldr	r9,[sp,#4]
+	add	r7,r7,r3,ror#27
+	vext.8	q12,q3,q15,#4
+	eor	r11,r11,r10
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	veor	q8,q8,q0
+	bic	r10,r5,r3
+	add	r6,r6,r9
+	veor	q12,q12,q2
+	and	r11,r4,r3
+	ldr	r9,[sp,#8]
+	veor	q12,q12,q8
+	add	r6,r6,r7,ror#27
+	eor	r11,r11,r10
+	vst1.32	{q13},[r12,:128]!
+	sub	r12,r12,#64
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	vext.8	q13,q15,q12,#4
+	bic	r10,r4,r7
+	add	r5,r5,r9
+	vadd.i32	q8,q12,q12
+	and	r11,r3,r7
+	ldr	r9,[sp,#12]
+	vsri.32	q8,q12,#31
+	add	r5,r5,r6,ror#27
+	eor	r11,r11,r10
+	mov	r7,r7,ror#2
+	vshr.u32	q12,q13,#30
+	add	r5,r5,r11
+	bic	r10,r3,r6
+	vshl.u32	q13,q13,#2
+	add	r4,r4,r9
+	and	r11,r7,r6
+	veor	q8,q8,q12
+	ldr	r9,[sp,#16]
+	add	r4,r4,r5,ror#27
+	veor	q8,q8,q13
+	eor	r11,r11,r10
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	vext.8	q9,q1,q2,#8
+	bic	r10,r7,r5
+	add	r3,r3,r9
+	and	r11,r6,r5
+	vadd.i32	q13,q8,q14
+	ldr	r9,[sp,#20]
+	vld1.32	{d28[],d29[]},[r8,:32]!
+	add	r3,r3,r4,ror#27
+	vext.8	q12,q8,q15,#4
+	eor	r11,r11,r10
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	veor	q9,q9,q1
+	bic	r10,r6,r4
+	add	r7,r7,r9
+	veor	q12,q12,q3
+	and	r11,r5,r4
+	ldr	r9,[sp,#24]
+	veor	q12,q12,q9
+	add	r7,r7,r3,ror#27
+	eor	r11,r11,r10
+	vst1.32	{q13},[r12,:128]!
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	vext.8	q13,q15,q12,#4
+	bic	r10,r5,r3
+	add	r6,r6,r9
+	vadd.i32	q9,q12,q12
+	and	r11,r4,r3
+	ldr	r9,[sp,#28]
+	vsri.32	q9,q12,#31
+	add	r6,r6,r7,ror#27
+	eor	r11,r11,r10
+	mov	r3,r3,ror#2
+	vshr.u32	q12,q13,#30
+	add	r6,r6,r11
+	bic	r10,r4,r7
+	vshl.u32	q13,q13,#2
+	add	r5,r5,r9
+	and	r11,r3,r7
+	veor	q9,q9,q12
+	ldr	r9,[sp,#32]
+	add	r5,r5,r6,ror#27
+	veor	q9,q9,q13
+	eor	r11,r11,r10
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	vext.8	q10,q2,q3,#8
+	bic	r10,r3,r6
+	add	r4,r4,r9
+	and	r11,r7,r6
+	vadd.i32	q13,q9,q14
+	ldr	r9,[sp,#36]
+	add	r4,r4,r5,ror#27
+	vext.8	q12,q9,q15,#4
+	eor	r11,r11,r10
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	veor	q10,q10,q2
+	bic	r10,r7,r5
+	add	r3,r3,r9
+	veor	q12,q12,q8
+	and	r11,r6,r5
+	ldr	r9,[sp,#40]
+	veor	q12,q12,q10
+	add	r3,r3,r4,ror#27
+	eor	r11,r11,r10
+	vst1.32	{q13},[r12,:128]!
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	vext.8	q13,q15,q12,#4
+	bic	r10,r6,r4
+	add	r7,r7,r9
+	vadd.i32	q10,q12,q12
+	and	r11,r5,r4
+	ldr	r9,[sp,#44]
+	vsri.32	q10,q12,#31
+	add	r7,r7,r3,ror#27
+	eor	r11,r11,r10
+	mov	r4,r4,ror#2
+	vshr.u32	q12,q13,#30
+	add	r7,r7,r11
+	bic	r10,r5,r3
+	vshl.u32	q13,q13,#2
+	add	r6,r6,r9
+	and	r11,r4,r3
+	veor	q10,q10,q12
+	ldr	r9,[sp,#48]
+	add	r6,r6,r7,ror#27
+	veor	q10,q10,q13
+	eor	r11,r11,r10
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	vext.8	q11,q3,q8,#8
+	bic	r10,r4,r7
+	add	r5,r5,r9
+	and	r11,r3,r7
+	vadd.i32	q13,q10,q14
+	ldr	r9,[sp,#52]
+	add	r5,r5,r6,ror#27
+	vext.8	q12,q10,q15,#4
+	eor	r11,r11,r10
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	veor	q11,q11,q3
+	bic	r10,r3,r6
+	add	r4,r4,r9
+	veor	q12,q12,q9
+	and	r11,r7,r6
+	ldr	r9,[sp,#56]
+	veor	q12,q12,q11
+	add	r4,r4,r5,ror#27
+	eor	r11,r11,r10
+	vst1.32	{q13},[r12,:128]!
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	vext.8	q13,q15,q12,#4
+	bic	r10,r7,r5
+	add	r3,r3,r9
+	vadd.i32	q11,q12,q12
+	and	r11,r6,r5
+	ldr	r9,[sp,#60]
+	vsri.32	q11,q12,#31
+	add	r3,r3,r4,ror#27
+	eor	r11,r11,r10
+	mov	r5,r5,ror#2
+	vshr.u32	q12,q13,#30
+	add	r3,r3,r11
+	bic	r10,r6,r4
+	vshl.u32	q13,q13,#2
+	add	r7,r7,r9
+	and	r11,r5,r4
+	veor	q11,q11,q12
+	ldr	r9,[sp,#0]
+	add	r7,r7,r3,ror#27
+	veor	q11,q11,q13
+	eor	r11,r11,r10
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	vext.8	q12,q10,q11,#8
+	bic	r10,r5,r3
+	add	r6,r6,r9
+	and	r11,r4,r3
+	veor	q0,q0,q8
+	ldr	r9,[sp,#4]
+	add	r6,r6,r7,ror#27
+	veor	q0,q0,q1
+	eor	r11,r11,r10
+	mov	r3,r3,ror#2
+	vadd.i32	q13,q11,q14
+	add	r6,r6,r11
+	bic	r10,r4,r7
+	veor	q12,q12,q0
+	add	r5,r5,r9
+	and	r11,r3,r7
+	vshr.u32	q0,q12,#30
+	ldr	r9,[sp,#8]
+	add	r5,r5,r6,ror#27
+	vst1.32	{q13},[r12,:128]!
+	sub	r12,r12,#64
+	eor	r11,r11,r10
+	mov	r7,r7,ror#2
+	vsli.32	q0,q12,#2
+	add	r5,r5,r11
+	bic	r10,r3,r6
+	add	r4,r4,r9
+	and	r11,r7,r6
+	ldr	r9,[sp,#12]
+	add	r4,r4,r5,ror#27
+	eor	r11,r11,r10
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	bic	r10,r7,r5
+	add	r3,r3,r9
+	and	r11,r6,r5
+	ldr	r9,[sp,#16]
+	add	r3,r3,r4,ror#27
+	eor	r11,r11,r10
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	vext.8	q12,q11,q0,#8
+	eor	r10,r4,r6
+	add	r7,r7,r9
+	ldr	r9,[sp,#20]
+	veor	q1,q1,q9
+	eor	r11,r10,r5
+	add	r7,r7,r3,ror#27
+	veor	q1,q1,q2
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	vadd.i32	q13,q0,q14
+	eor	r10,r3,r5
+	add	r6,r6,r9
+	veor	q12,q12,q1
+	ldr	r9,[sp,#24]
+	eor	r11,r10,r4
+	vshr.u32	q1,q12,#30
+	add	r6,r6,r7,ror#27
+	mov	r3,r3,ror#2
+	vst1.32	{q13},[r12,:128]!
+	add	r6,r6,r11
+	eor	r10,r7,r4
+	vsli.32	q1,q12,#2
+	add	r5,r5,r9
+	ldr	r9,[sp,#28]
+	eor	r11,r10,r3
+	add	r5,r5,r6,ror#27
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	eor	r10,r6,r3
+	add	r4,r4,r9
+	ldr	r9,[sp,#32]
+	eor	r11,r10,r7
+	add	r4,r4,r5,ror#27
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	vext.8	q12,q0,q1,#8
+	eor	r10,r5,r7
+	add	r3,r3,r9
+	ldr	r9,[sp,#36]
+	veor	q2,q2,q10
+	eor	r11,r10,r6
+	add	r3,r3,r4,ror#27
+	veor	q2,q2,q3
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	vadd.i32	q13,q1,q14
+	eor	r10,r4,r6
+	vld1.32	{d28[],d29[]},[r8,:32]!
+	add	r7,r7,r9
+	veor	q12,q12,q2
+	ldr	r9,[sp,#40]
+	eor	r11,r10,r5
+	vshr.u32	q2,q12,#30
+	add	r7,r7,r3,ror#27
+	mov	r4,r4,ror#2
+	vst1.32	{q13},[r12,:128]!
+	add	r7,r7,r11
+	eor	r10,r3,r5
+	vsli.32	q2,q12,#2
+	add	r6,r6,r9
+	ldr	r9,[sp,#44]
+	eor	r11,r10,r4
+	add	r6,r6,r7,ror#27
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	eor	r10,r7,r4
+	add	r5,r5,r9
+	ldr	r9,[sp,#48]
+	eor	r11,r10,r3
+	add	r5,r5,r6,ror#27
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	vext.8	q12,q1,q2,#8
+	eor	r10,r6,r3
+	add	r4,r4,r9
+	ldr	r9,[sp,#52]
+	veor	q3,q3,q11
+	eor	r11,r10,r7
+	add	r4,r4,r5,ror#27
+	veor	q3,q3,q8
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	vadd.i32	q13,q2,q14
+	eor	r10,r5,r7
+	add	r3,r3,r9
+	veor	q12,q12,q3
+	ldr	r9,[sp,#56]
+	eor	r11,r10,r6
+	vshr.u32	q3,q12,#30
+	add	r3,r3,r4,ror#27
+	mov	r5,r5,ror#2
+	vst1.32	{q13},[r12,:128]!
+	add	r3,r3,r11
+	eor	r10,r4,r6
+	vsli.32	q3,q12,#2
+	add	r7,r7,r9
+	ldr	r9,[sp,#60]
+	eor	r11,r10,r5
+	add	r7,r7,r3,ror#27
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	eor	r10,r3,r5
+	add	r6,r6,r9
+	ldr	r9,[sp,#0]
+	eor	r11,r10,r4
+	add	r6,r6,r7,ror#27
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	vext.8	q12,q2,q3,#8
+	eor	r10,r7,r4
+	add	r5,r5,r9
+	ldr	r9,[sp,#4]
+	veor	q8,q8,q0
+	eor	r11,r10,r3
+	add	r5,r5,r6,ror#27
+	veor	q8,q8,q9
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	vadd.i32	q13,q3,q14
+	eor	r10,r6,r3
+	add	r4,r4,r9
+	veor	q12,q12,q8
+	ldr	r9,[sp,#8]
+	eor	r11,r10,r7
+	vshr.u32	q8,q12,#30
+	add	r4,r4,r5,ror#27
+	mov	r6,r6,ror#2
+	vst1.32	{q13},[r12,:128]!
+	sub	r12,r12,#64
+	add	r4,r4,r11
+	eor	r10,r5,r7
+	vsli.32	q8,q12,#2
+	add	r3,r3,r9
+	ldr	r9,[sp,#12]
+	eor	r11,r10,r6
+	add	r3,r3,r4,ror#27
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	eor	r10,r4,r6
+	add	r7,r7,r9
+	ldr	r9,[sp,#16]
+	eor	r11,r10,r5
+	add	r7,r7,r3,ror#27
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	vext.8	q12,q3,q8,#8
+	eor	r10,r3,r5
+	add	r6,r6,r9
+	ldr	r9,[sp,#20]
+	veor	q9,q9,q1
+	eor	r11,r10,r4
+	add	r6,r6,r7,ror#27
+	veor	q9,q9,q10
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	vadd.i32	q13,q8,q14
+	eor	r10,r7,r4
+	add	r5,r5,r9
+	veor	q12,q12,q9
+	ldr	r9,[sp,#24]
+	eor	r11,r10,r3
+	vshr.u32	q9,q12,#30
+	add	r5,r5,r6,ror#27
+	mov	r7,r7,ror#2
+	vst1.32	{q13},[r12,:128]!
+	add	r5,r5,r11
+	eor	r10,r6,r3
+	vsli.32	q9,q12,#2
+	add	r4,r4,r9
+	ldr	r9,[sp,#28]
+	eor	r11,r10,r7
+	add	r4,r4,r5,ror#27
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	eor	r10,r5,r7
+	add	r3,r3,r9
+	ldr	r9,[sp,#32]
+	eor	r11,r10,r6
+	add	r3,r3,r4,ror#27
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	vext.8	q12,q8,q9,#8
+	add	r7,r7,r9
+	and	r10,r5,r6
+	ldr	r9,[sp,#36]
+	veor	q10,q10,q2
+	add	r7,r7,r3,ror#27
+	eor	r11,r5,r6
+	veor	q10,q10,q11
+	add	r7,r7,r10
+	and	r11,r11,r4
+	vadd.i32	q13,q9,q14
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	veor	q12,q12,q10
+	add	r6,r6,r9
+	and	r10,r4,r5
+	vshr.u32	q10,q12,#30
+	ldr	r9,[sp,#40]
+	add	r6,r6,r7,ror#27
+	vst1.32	{q13},[r12,:128]!
+	eor	r11,r4,r5
+	add	r6,r6,r10
+	vsli.32	q10,q12,#2
+	and	r11,r11,r3
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	add	r5,r5,r9
+	and	r10,r3,r4
+	ldr	r9,[sp,#44]
+	add	r5,r5,r6,ror#27
+	eor	r11,r3,r4
+	add	r5,r5,r10
+	and	r11,r11,r7
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	add	r4,r4,r9
+	and	r10,r7,r3
+	ldr	r9,[sp,#48]
+	add	r4,r4,r5,ror#27
+	eor	r11,r7,r3
+	add	r4,r4,r10
+	and	r11,r11,r6
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	vext.8	q12,q9,q10,#8
+	add	r3,r3,r9
+	and	r10,r6,r7
+	ldr	r9,[sp,#52]
+	veor	q11,q11,q3
+	add	r3,r3,r4,ror#27
+	eor	r11,r6,r7
+	veor	q11,q11,q0
+	add	r3,r3,r10
+	and	r11,r11,r5
+	vadd.i32	q13,q10,q14
+	mov	r5,r5,ror#2
+	vld1.32	{d28[],d29[]},[r8,:32]!
+	add	r3,r3,r11
+	veor	q12,q12,q11
+	add	r7,r7,r9
+	and	r10,r5,r6
+	vshr.u32	q11,q12,#30
+	ldr	r9,[sp,#56]
+	add	r7,r7,r3,ror#27
+	vst1.32	{q13},[r12,:128]!
+	eor	r11,r5,r6
+	add	r7,r7,r10
+	vsli.32	q11,q12,#2
+	and	r11,r11,r4
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	add	r6,r6,r9
+	and	r10,r4,r5
+	ldr	r9,[sp,#60]
+	add	r6,r6,r7,ror#27
+	eor	r11,r4,r5
+	add	r6,r6,r10
+	and	r11,r11,r3
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	add	r5,r5,r9
+	and	r10,r3,r4
+	ldr	r9,[sp,#0]
+	add	r5,r5,r6,ror#27
+	eor	r11,r3,r4
+	add	r5,r5,r10
+	and	r11,r11,r7
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	vext.8	q12,q10,q11,#8
+	add	r4,r4,r9
+	and	r10,r7,r3
+	ldr	r9,[sp,#4]
+	veor	q0,q0,q8
+	add	r4,r4,r5,ror#27
+	eor	r11,r7,r3
+	veor	q0,q0,q1
+	add	r4,r4,r10
+	and	r11,r11,r6
+	vadd.i32	q13,q11,q14
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	veor	q12,q12,q0
+	add	r3,r3,r9
+	and	r10,r6,r7
+	vshr.u32	q0,q12,#30
+	ldr	r9,[sp,#8]
+	add	r3,r3,r4,ror#27
+	vst1.32	{q13},[r12,:128]!
+	sub	r12,r12,#64
+	eor	r11,r6,r7
+	add	r3,r3,r10
+	vsli.32	q0,q12,#2
+	and	r11,r11,r5
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	add	r7,r7,r9
+	and	r10,r5,r6
+	ldr	r9,[sp,#12]
+	add	r7,r7,r3,ror#27
+	eor	r11,r5,r6
+	add	r7,r7,r10
+	and	r11,r11,r4
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	add	r6,r6,r9
+	and	r10,r4,r5
+	ldr	r9,[sp,#16]
+	add	r6,r6,r7,ror#27
+	eor	r11,r4,r5
+	add	r6,r6,r10
+	and	r11,r11,r3
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	vext.8	q12,q11,q0,#8
+	add	r5,r5,r9
+	and	r10,r3,r4
+	ldr	r9,[sp,#20]
+	veor	q1,q1,q9
+	add	r5,r5,r6,ror#27
+	eor	r11,r3,r4
+	veor	q1,q1,q2
+	add	r5,r5,r10
+	and	r11,r11,r7
+	vadd.i32	q13,q0,q14
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	veor	q12,q12,q1
+	add	r4,r4,r9
+	and	r10,r7,r3
+	vshr.u32	q1,q12,#30
+	ldr	r9,[sp,#24]
+	add	r4,r4,r5,ror#27
+	vst1.32	{q13},[r12,:128]!
+	eor	r11,r7,r3
+	add	r4,r4,r10
+	vsli.32	q1,q12,#2
+	and	r11,r11,r6
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	add	r3,r3,r9
+	and	r10,r6,r7
+	ldr	r9,[sp,#28]
+	add	r3,r3,r4,ror#27
+	eor	r11,r6,r7
+	add	r3,r3,r10
+	and	r11,r11,r5
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	add	r7,r7,r9
+	and	r10,r5,r6
+	ldr	r9,[sp,#32]
+	add	r7,r7,r3,ror#27
+	eor	r11,r5,r6
+	add	r7,r7,r10
+	and	r11,r11,r4
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	vext.8	q12,q0,q1,#8
+	add	r6,r6,r9
+	and	r10,r4,r5
+	ldr	r9,[sp,#36]
+	veor	q2,q2,q10
+	add	r6,r6,r7,ror#27
+	eor	r11,r4,r5
+	veor	q2,q2,q3
+	add	r6,r6,r10
+	and	r11,r11,r3
+	vadd.i32	q13,q1,q14
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	veor	q12,q12,q2
+	add	r5,r5,r9
+	and	r10,r3,r4
+	vshr.u32	q2,q12,#30
+	ldr	r9,[sp,#40]
+	add	r5,r5,r6,ror#27
+	vst1.32	{q13},[r12,:128]!
+	eor	r11,r3,r4
+	add	r5,r5,r10
+	vsli.32	q2,q12,#2
+	and	r11,r11,r7
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	add	r4,r4,r9
+	and	r10,r7,r3
+	ldr	r9,[sp,#44]
+	add	r4,r4,r5,ror#27
+	eor	r11,r7,r3
+	add	r4,r4,r10
+	and	r11,r11,r6
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	add	r3,r3,r9
+	and	r10,r6,r7
+	ldr	r9,[sp,#48]
+	add	r3,r3,r4,ror#27
+	eor	r11,r6,r7
+	add	r3,r3,r10
+	and	r11,r11,r5
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	vext.8	q12,q1,q2,#8
+	eor	r10,r4,r6
+	add	r7,r7,r9
+	ldr	r9,[sp,#52]
+	veor	q3,q3,q11
+	eor	r11,r10,r5
+	add	r7,r7,r3,ror#27
+	veor	q3,q3,q8
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	vadd.i32	q13,q2,q14
+	eor	r10,r3,r5
+	add	r6,r6,r9
+	veor	q12,q12,q3
+	ldr	r9,[sp,#56]
+	eor	r11,r10,r4
+	vshr.u32	q3,q12,#30
+	add	r6,r6,r7,ror#27
+	mov	r3,r3,ror#2
+	vst1.32	{q13},[r12,:128]!
+	add	r6,r6,r11
+	eor	r10,r7,r4
+	vsli.32	q3,q12,#2
+	add	r5,r5,r9
+	ldr	r9,[sp,#60]
+	eor	r11,r10,r3
+	add	r5,r5,r6,ror#27
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	eor	r10,r6,r3
+	add	r4,r4,r9
+	ldr	r9,[sp,#0]
+	eor	r11,r10,r7
+	add	r4,r4,r5,ror#27
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	vadd.i32	q13,q3,q14
+	eor	r10,r5,r7
+	add	r3,r3,r9
+	vst1.32	{q13},[r12,:128]!
+	sub	r12,r12,#64
+	teq	r1,r2
+	sub	r8,r8,#16
+	it	eq
+	subeq	r1,r1,#64
+	vld1.8	{q0,q1},[r1]!
+	ldr	r9,[sp,#4]
+	eor	r11,r10,r6
+	vld1.8	{q2,q3},[r1]!
+	add	r3,r3,r4,ror#27
+	mov	r5,r5,ror#2
+	vld1.32	{d28[],d29[]},[r8,:32]!
+	add	r3,r3,r11
+	eor	r10,r4,r6
+	vrev32.8	q0,q0
+	add	r7,r7,r9
+	ldr	r9,[sp,#8]
+	eor	r11,r10,r5
+	add	r7,r7,r3,ror#27
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	eor	r10,r3,r5
+	add	r6,r6,r9
+	ldr	r9,[sp,#12]
+	eor	r11,r10,r4
+	add	r6,r6,r7,ror#27
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	eor	r10,r7,r4
+	add	r5,r5,r9
+	ldr	r9,[sp,#16]
+	eor	r11,r10,r3
+	add	r5,r5,r6,ror#27
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	vrev32.8	q1,q1
+	eor	r10,r6,r3
+	add	r4,r4,r9
+	vadd.i32	q8,q0,q14
+	ldr	r9,[sp,#20]
+	eor	r11,r10,r7
+	vst1.32	{q8},[r12,:128]!
+	add	r4,r4,r5,ror#27
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	eor	r10,r5,r7
+	add	r3,r3,r9
+	ldr	r9,[sp,#24]
+	eor	r11,r10,r6
+	add	r3,r3,r4,ror#27
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	eor	r10,r4,r6
+	add	r7,r7,r9
+	ldr	r9,[sp,#28]
+	eor	r11,r10,r5
+	add	r7,r7,r3,ror#27
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	eor	r10,r3,r5
+	add	r6,r6,r9
+	ldr	r9,[sp,#32]
+	eor	r11,r10,r4
+	add	r6,r6,r7,ror#27
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	vrev32.8	q2,q2
+	eor	r10,r7,r4
+	add	r5,r5,r9
+	vadd.i32	q9,q1,q14
+	ldr	r9,[sp,#36]
+	eor	r11,r10,r3
+	vst1.32	{q9},[r12,:128]!
+	add	r5,r5,r6,ror#27
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	eor	r10,r6,r3
+	add	r4,r4,r9
+	ldr	r9,[sp,#40]
+	eor	r11,r10,r7
+	add	r4,r4,r5,ror#27
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	eor	r10,r5,r7
+	add	r3,r3,r9
+	ldr	r9,[sp,#44]
+	eor	r11,r10,r6
+	add	r3,r3,r4,ror#27
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	eor	r10,r4,r6
+	add	r7,r7,r9
+	ldr	r9,[sp,#48]
+	eor	r11,r10,r5
+	add	r7,r7,r3,ror#27
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	vrev32.8	q3,q3
+	eor	r10,r3,r5
+	add	r6,r6,r9
+	vadd.i32	q10,q2,q14
+	ldr	r9,[sp,#52]
+	eor	r11,r10,r4
+	vst1.32	{q10},[r12,:128]!
+	add	r6,r6,r7,ror#27
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	eor	r10,r7,r4
+	add	r5,r5,r9
+	ldr	r9,[sp,#56]
+	eor	r11,r10,r3
+	add	r5,r5,r6,ror#27
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	eor	r10,r6,r3
+	add	r4,r4,r9
+	ldr	r9,[sp,#60]
+	eor	r11,r10,r7
+	add	r4,r4,r5,ror#27
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	eor	r10,r5,r7
+	add	r3,r3,r9
+	eor	r11,r10,r6
+	add	r3,r3,r4,ror#27
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	ldmia	r0,{r9,r10,r11,r12}	@ accumulate context
+	add	r3,r3,r9
+	ldr	r9,[r0,#16]
+	add	r4,r4,r10
+	add	r5,r5,r11
+	add	r6,r6,r12
+	it	eq
+	moveq	sp,r14
+	add	r7,r7,r9
+	it	ne
+	ldrne	r9,[sp]
+	stmia	r0,{r3,r4,r5,r6,r7}
+	itt	ne
+	addne	r12,sp,#3*16
+	bne	.Loop_neon
+
+	@ vldmia	sp!,{d8-d15}
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+.size	sha1_block_data_order_neon,.-sha1_block_data_order_neon
+#endif
+#if __ARM_MAX_ARCH__>=7
+
+# if defined(__thumb2__)
+#  define INST(a,b,c,d)	.byte	c,d|0xf,a,b
+# else
+#  define INST(a,b,c,d)	.byte	a,b,c,d|0x10
+# endif
+
+.globl	sha1_block_data_order_hw
+.hidden	sha1_block_data_order_hw
+.type	sha1_block_data_order_hw,%function
+.align	5
+sha1_block_data_order_hw:
+	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ ABI specification says so
+
+	veor	q1,q1,q1
+	adr	r3,.LK_00_19
+	vld1.32	{q0},[r0]!
+	vld1.32	{d2[0]},[r0]
+	sub	r0,r0,#16
+	vld1.32	{d16[],d17[]},[r3,:32]!
+	vld1.32	{d18[],d19[]},[r3,:32]!
+	vld1.32	{d20[],d21[]},[r3,:32]!
+	vld1.32	{d22[],d23[]},[r3,:32]
+
+.Loop_v8:
+	vld1.8	{q4,q5},[r1]!
+	vld1.8	{q6,q7},[r1]!
+	vrev32.8	q4,q4
+	vrev32.8	q5,q5
+
+	vadd.i32	q12,q8,q4
+	vrev32.8	q6,q6
+	vmov	q14,q0	@ offload
+	subs	r2,r2,#1
+
+	vadd.i32	q13,q8,q5
+	vrev32.8	q7,q7
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 0
+	INST(0x68,0x0c,0x02,0xe2)	@ sha1c q0,q1,q12
+	vadd.i32	q12,q8,q6
+	INST(0x4c,0x8c,0x3a,0xe2)	@ sha1su0 q4,q5,q6
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 1
+	INST(0x6a,0x0c,0x06,0xe2)	@ sha1c q0,q3,q13
+	vadd.i32	q13,q8,q7
+	INST(0x8e,0x83,0xba,0xf3)	@ sha1su1 q4,q7
+	INST(0x4e,0xac,0x3c,0xe2)	@ sha1su0 q5,q6,q7
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 2
+	INST(0x68,0x0c,0x04,0xe2)	@ sha1c q0,q2,q12
+	vadd.i32	q12,q8,q4
+	INST(0x88,0xa3,0xba,0xf3)	@ sha1su1 q5,q4
+	INST(0x48,0xcc,0x3e,0xe2)	@ sha1su0 q6,q7,q4
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 3
+	INST(0x6a,0x0c,0x06,0xe2)	@ sha1c q0,q3,q13
+	vadd.i32	q13,q9,q5
+	INST(0x8a,0xc3,0xba,0xf3)	@ sha1su1 q6,q5
+	INST(0x4a,0xec,0x38,0xe2)	@ sha1su0 q7,q4,q5
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 4
+	INST(0x68,0x0c,0x04,0xe2)	@ sha1c q0,q2,q12
+	vadd.i32	q12,q9,q6
+	INST(0x8c,0xe3,0xba,0xf3)	@ sha1su1 q7,q6
+	INST(0x4c,0x8c,0x3a,0xe2)	@ sha1su0 q4,q5,q6
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 5
+	INST(0x6a,0x0c,0x16,0xe2)	@ sha1p q0,q3,q13
+	vadd.i32	q13,q9,q7
+	INST(0x8e,0x83,0xba,0xf3)	@ sha1su1 q4,q7
+	INST(0x4e,0xac,0x3c,0xe2)	@ sha1su0 q5,q6,q7
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 6
+	INST(0x68,0x0c,0x14,0xe2)	@ sha1p q0,q2,q12
+	vadd.i32	q12,q9,q4
+	INST(0x88,0xa3,0xba,0xf3)	@ sha1su1 q5,q4
+	INST(0x48,0xcc,0x3e,0xe2)	@ sha1su0 q6,q7,q4
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 7
+	INST(0x6a,0x0c,0x16,0xe2)	@ sha1p q0,q3,q13
+	vadd.i32	q13,q9,q5
+	INST(0x8a,0xc3,0xba,0xf3)	@ sha1su1 q6,q5
+	INST(0x4a,0xec,0x38,0xe2)	@ sha1su0 q7,q4,q5
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 8
+	INST(0x68,0x0c,0x14,0xe2)	@ sha1p q0,q2,q12
+	vadd.i32	q12,q10,q6
+	INST(0x8c,0xe3,0xba,0xf3)	@ sha1su1 q7,q6
+	INST(0x4c,0x8c,0x3a,0xe2)	@ sha1su0 q4,q5,q6
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 9
+	INST(0x6a,0x0c,0x16,0xe2)	@ sha1p q0,q3,q13
+	vadd.i32	q13,q10,q7
+	INST(0x8e,0x83,0xba,0xf3)	@ sha1su1 q4,q7
+	INST(0x4e,0xac,0x3c,0xe2)	@ sha1su0 q5,q6,q7
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 10
+	INST(0x68,0x0c,0x24,0xe2)	@ sha1m q0,q2,q12
+	vadd.i32	q12,q10,q4
+	INST(0x88,0xa3,0xba,0xf3)	@ sha1su1 q5,q4
+	INST(0x48,0xcc,0x3e,0xe2)	@ sha1su0 q6,q7,q4
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 11
+	INST(0x6a,0x0c,0x26,0xe2)	@ sha1m q0,q3,q13
+	vadd.i32	q13,q10,q5
+	INST(0x8a,0xc3,0xba,0xf3)	@ sha1su1 q6,q5
+	INST(0x4a,0xec,0x38,0xe2)	@ sha1su0 q7,q4,q5
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 12
+	INST(0x68,0x0c,0x24,0xe2)	@ sha1m q0,q2,q12
+	vadd.i32	q12,q10,q6
+	INST(0x8c,0xe3,0xba,0xf3)	@ sha1su1 q7,q6
+	INST(0x4c,0x8c,0x3a,0xe2)	@ sha1su0 q4,q5,q6
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 13
+	INST(0x6a,0x0c,0x26,0xe2)	@ sha1m q0,q3,q13
+	vadd.i32	q13,q11,q7
+	INST(0x8e,0x83,0xba,0xf3)	@ sha1su1 q4,q7
+	INST(0x4e,0xac,0x3c,0xe2)	@ sha1su0 q5,q6,q7
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 14
+	INST(0x68,0x0c,0x24,0xe2)	@ sha1m q0,q2,q12
+	vadd.i32	q12,q11,q4
+	INST(0x88,0xa3,0xba,0xf3)	@ sha1su1 q5,q4
+	INST(0x48,0xcc,0x3e,0xe2)	@ sha1su0 q6,q7,q4
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 15
+	INST(0x6a,0x0c,0x16,0xe2)	@ sha1p q0,q3,q13
+	vadd.i32	q13,q11,q5
+	INST(0x8a,0xc3,0xba,0xf3)	@ sha1su1 q6,q5
+	INST(0x4a,0xec,0x38,0xe2)	@ sha1su0 q7,q4,q5
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 16
+	INST(0x68,0x0c,0x14,0xe2)	@ sha1p q0,q2,q12
+	vadd.i32	q12,q11,q6
+	INST(0x8c,0xe3,0xba,0xf3)	@ sha1su1 q7,q6
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 17
+	INST(0x6a,0x0c,0x16,0xe2)	@ sha1p q0,q3,q13
+	vadd.i32	q13,q11,q7
+
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 18
+	INST(0x68,0x0c,0x14,0xe2)	@ sha1p q0,q2,q12
+
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 19
+	INST(0x6a,0x0c,0x16,0xe2)	@ sha1p q0,q3,q13
+
+	vadd.i32	q1,q1,q2
+	vadd.i32	q0,q0,q14
+	bne	.Loop_v8
+
+	vst1.32	{q0},[r0]!
+	vst1.32	{d2[0]},[r0]
+
+	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+	bx	lr					@ bx lr
+.size	sha1_block_data_order_hw,.-sha1_block_data_order_hw
+#endif
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__)
diff --git a/gen/bcm/sha1-armv8-apple.S b/gen/bcm/sha1-armv8-apple.S
new file mode 100644
index 0000000..8f84774
--- /dev/null
+++ b/gen/bcm/sha1-armv8-apple.S
@@ -0,0 +1,1218 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+#include <openssl/arm_arch.h>
+
+.text
+
+.globl	_sha1_block_data_order_nohw
+.private_extern	_sha1_block_data_order_nohw
+
+.align	6
+_sha1_block_data_order_nohw:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+	ldp	w20,w21,[x0]
+	ldp	w22,w23,[x0,#8]
+	ldr	w24,[x0,#16]
+
+Loop:
+	ldr	x3,[x1],#64
+	movz	w28,#0x7999
+	sub	x2,x2,#1
+	movk	w28,#0x5a82,lsl#16
+#ifdef	__AARCH64EB__
+	ror	x3,x3,#32
+#else
+	rev32	x3,x3
+#endif
+	add	w24,w24,w28		// warm it up
+	add	w24,w24,w3
+	lsr	x4,x3,#32
+	ldr	x5,[x1,#-56]
+	bic	w25,w23,w21
+	and	w26,w22,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	add	w23,w23,w4	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x5,x5,#32
+#else
+	rev32	x5,x5
+#endif
+	bic	w25,w22,w20
+	and	w26,w21,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	add	w22,w22,w5	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	lsr	x6,x5,#32
+	ldr	x7,[x1,#-48]
+	bic	w25,w21,w24
+	and	w26,w20,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	add	w21,w21,w6	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x7,x7,#32
+#else
+	rev32	x7,x7
+#endif
+	bic	w25,w20,w23
+	and	w26,w24,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	add	w20,w20,w7	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	lsr	x8,x7,#32
+	ldr	x9,[x1,#-40]
+	bic	w25,w24,w22
+	and	w26,w23,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	add	w24,w24,w8	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x9,x9,#32
+#else
+	rev32	x9,x9
+#endif
+	bic	w25,w23,w21
+	and	w26,w22,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	add	w23,w23,w9	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	lsr	x10,x9,#32
+	ldr	x11,[x1,#-32]
+	bic	w25,w22,w20
+	and	w26,w21,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	add	w22,w22,w10	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x11,x11,#32
+#else
+	rev32	x11,x11
+#endif
+	bic	w25,w21,w24
+	and	w26,w20,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	add	w21,w21,w11	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	lsr	x12,x11,#32
+	ldr	x13,[x1,#-24]
+	bic	w25,w20,w23
+	and	w26,w24,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	add	w20,w20,w12	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x13,x13,#32
+#else
+	rev32	x13,x13
+#endif
+	bic	w25,w24,w22
+	and	w26,w23,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	add	w24,w24,w13	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	lsr	x14,x13,#32
+	ldr	x15,[x1,#-16]
+	bic	w25,w23,w21
+	and	w26,w22,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	add	w23,w23,w14	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x15,x15,#32
+#else
+	rev32	x15,x15
+#endif
+	bic	w25,w22,w20
+	and	w26,w21,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	add	w22,w22,w15	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	lsr	x16,x15,#32
+	ldr	x17,[x1,#-8]
+	bic	w25,w21,w24
+	and	w26,w20,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	add	w21,w21,w16	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x17,x17,#32
+#else
+	rev32	x17,x17
+#endif
+	bic	w25,w20,w23
+	and	w26,w24,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	add	w20,w20,w17	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	lsr	x19,x17,#32
+	eor	w3,w3,w5
+	bic	w25,w24,w22
+	and	w26,w23,w22
+	ror	w27,w21,#27
+	eor	w3,w3,w11
+	add	w24,w24,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w20,w20,w27		// e+=rot(a,5)
+	eor	w3,w3,w16
+	ror	w22,w22,#2
+	add	w24,w24,w19	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w3,w3,#31
+	eor	w4,w4,w6
+	bic	w25,w23,w21
+	and	w26,w22,w21
+	ror	w27,w20,#27
+	eor	w4,w4,w12
+	add	w23,w23,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w24,w24,w27		// e+=rot(a,5)
+	eor	w4,w4,w17
+	ror	w21,w21,#2
+	add	w23,w23,w3	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w4,w4,#31
+	eor	w5,w5,w7
+	bic	w25,w22,w20
+	and	w26,w21,w20
+	ror	w27,w24,#27
+	eor	w5,w5,w13
+	add	w22,w22,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w23,w23,w27		// e+=rot(a,5)
+	eor	w5,w5,w19
+	ror	w20,w20,#2
+	add	w22,w22,w4	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w5,w5,#31
+	eor	w6,w6,w8
+	bic	w25,w21,w24
+	and	w26,w20,w24
+	ror	w27,w23,#27
+	eor	w6,w6,w14
+	add	w21,w21,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w22,w22,w27		// e+=rot(a,5)
+	eor	w6,w6,w3
+	ror	w24,w24,#2
+	add	w21,w21,w5	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w6,w6,#31
+	eor	w7,w7,w9
+	bic	w25,w20,w23
+	and	w26,w24,w23
+	ror	w27,w22,#27
+	eor	w7,w7,w15
+	add	w20,w20,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w21,w21,w27		// e+=rot(a,5)
+	eor	w7,w7,w4
+	ror	w23,w23,#2
+	add	w20,w20,w6	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w7,w7,#31
+	movz	w28,#0xeba1
+	movk	w28,#0x6ed9,lsl#16
+	eor	w8,w8,w10
+	bic	w25,w24,w22
+	and	w26,w23,w22
+	ror	w27,w21,#27
+	eor	w8,w8,w16
+	add	w24,w24,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w20,w20,w27		// e+=rot(a,5)
+	eor	w8,w8,w5
+	ror	w22,w22,#2
+	add	w24,w24,w7	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w8,w8,#31
+	eor	w9,w9,w11
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w9,w9,w17
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w9,w9,w6
+	add	w23,w23,w8	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w9,w9,#31
+	eor	w10,w10,w12
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w10,w10,w19
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w10,w10,w7
+	add	w22,w22,w9	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w10,w10,#31
+	eor	w11,w11,w13
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w11,w11,w3
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w11,w11,w8
+	add	w21,w21,w10	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w11,w11,#31
+	eor	w12,w12,w14
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w12,w12,w4
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w12,w12,w9
+	add	w20,w20,w11	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w12,w12,#31
+	eor	w13,w13,w15
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w13,w13,w5
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w13,w13,w10
+	add	w24,w24,w12	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w13,w13,#31
+	eor	w14,w14,w16
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w14,w14,w6
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w14,w14,w11
+	add	w23,w23,w13	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w14,w14,#31
+	eor	w15,w15,w17
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w15,w15,w7
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w15,w15,w12
+	add	w22,w22,w14	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w15,w15,#31
+	eor	w16,w16,w19
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w16,w16,w8
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w16,w16,w13
+	add	w21,w21,w15	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w16,w16,#31
+	eor	w17,w17,w3
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w17,w17,w9
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w17,w17,w14
+	add	w20,w20,w16	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w17,w17,#31
+	eor	w19,w19,w4
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w19,w19,w10
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w19,w19,w15
+	add	w24,w24,w17	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w19,w19,#31
+	eor	w3,w3,w5
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w3,w3,w11
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w3,w3,w16
+	add	w23,w23,w19	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w3,w3,#31
+	eor	w4,w4,w6
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w4,w4,w12
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w4,w4,w17
+	add	w22,w22,w3	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w4,w4,#31
+	eor	w5,w5,w7
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w5,w5,w13
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w5,w5,w19
+	add	w21,w21,w4	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w5,w5,#31
+	eor	w6,w6,w8
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w6,w6,w14
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w6,w6,w3
+	add	w20,w20,w5	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w6,w6,#31
+	eor	w7,w7,w9
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w7,w7,w15
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w7,w7,w4
+	add	w24,w24,w6	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w7,w7,#31
+	eor	w8,w8,w10
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w8,w8,w16
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w8,w8,w5
+	add	w23,w23,w7	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w8,w8,#31
+	eor	w9,w9,w11
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w9,w9,w17
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w9,w9,w6
+	add	w22,w22,w8	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w9,w9,#31
+	eor	w10,w10,w12
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w10,w10,w19
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w10,w10,w7
+	add	w21,w21,w9	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w10,w10,#31
+	eor	w11,w11,w13
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w11,w11,w3
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w11,w11,w8
+	add	w20,w20,w10	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w11,w11,#31
+	movz	w28,#0xbcdc
+	movk	w28,#0x8f1b,lsl#16
+	eor	w12,w12,w14
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w12,w12,w4
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w12,w12,w9
+	add	w24,w24,w11	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w12,w12,#31
+	orr	w25,w21,w22
+	and	w26,w21,w22
+	eor	w13,w13,w15
+	ror	w27,w20,#27
+	and	w25,w25,w23
+	add	w23,w23,w28		// future e+=K
+	eor	w13,w13,w5
+	add	w24,w24,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w21,w21,#2
+	eor	w13,w13,w10
+	add	w23,w23,w12	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w13,w13,#31
+	orr	w25,w20,w21
+	and	w26,w20,w21
+	eor	w14,w14,w16
+	ror	w27,w24,#27
+	and	w25,w25,w22
+	add	w22,w22,w28		// future e+=K
+	eor	w14,w14,w6
+	add	w23,w23,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w20,w20,#2
+	eor	w14,w14,w11
+	add	w22,w22,w13	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w14,w14,#31
+	orr	w25,w24,w20
+	and	w26,w24,w20
+	eor	w15,w15,w17
+	ror	w27,w23,#27
+	and	w25,w25,w21
+	add	w21,w21,w28		// future e+=K
+	eor	w15,w15,w7
+	add	w22,w22,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w24,w24,#2
+	eor	w15,w15,w12
+	add	w21,w21,w14	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w15,w15,#31
+	orr	w25,w23,w24
+	and	w26,w23,w24
+	eor	w16,w16,w19
+	ror	w27,w22,#27
+	and	w25,w25,w20
+	add	w20,w20,w28		// future e+=K
+	eor	w16,w16,w8
+	add	w21,w21,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w23,w23,#2
+	eor	w16,w16,w13
+	add	w20,w20,w15	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w16,w16,#31
+	orr	w25,w22,w23
+	and	w26,w22,w23
+	eor	w17,w17,w3
+	ror	w27,w21,#27
+	and	w25,w25,w24
+	add	w24,w24,w28		// future e+=K
+	eor	w17,w17,w9
+	add	w20,w20,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w22,w22,#2
+	eor	w17,w17,w14
+	add	w24,w24,w16	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w17,w17,#31
+	orr	w25,w21,w22
+	and	w26,w21,w22
+	eor	w19,w19,w4
+	ror	w27,w20,#27
+	and	w25,w25,w23
+	add	w23,w23,w28		// future e+=K
+	eor	w19,w19,w10
+	add	w24,w24,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w21,w21,#2
+	eor	w19,w19,w15
+	add	w23,w23,w17	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w19,w19,#31
+	orr	w25,w20,w21
+	and	w26,w20,w21
+	eor	w3,w3,w5
+	ror	w27,w24,#27
+	and	w25,w25,w22
+	add	w22,w22,w28		// future e+=K
+	eor	w3,w3,w11
+	add	w23,w23,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w20,w20,#2
+	eor	w3,w3,w16
+	add	w22,w22,w19	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w3,w3,#31
+	orr	w25,w24,w20
+	and	w26,w24,w20
+	eor	w4,w4,w6
+	ror	w27,w23,#27
+	and	w25,w25,w21
+	add	w21,w21,w28		// future e+=K
+	eor	w4,w4,w12
+	add	w22,w22,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w24,w24,#2
+	eor	w4,w4,w17
+	add	w21,w21,w3	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w4,w4,#31
+	orr	w25,w23,w24
+	and	w26,w23,w24
+	eor	w5,w5,w7
+	ror	w27,w22,#27
+	and	w25,w25,w20
+	add	w20,w20,w28		// future e+=K
+	eor	w5,w5,w13
+	add	w21,w21,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w23,w23,#2
+	eor	w5,w5,w19
+	add	w20,w20,w4	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w5,w5,#31
+	orr	w25,w22,w23
+	and	w26,w22,w23
+	eor	w6,w6,w8
+	ror	w27,w21,#27
+	and	w25,w25,w24
+	add	w24,w24,w28		// future e+=K
+	eor	w6,w6,w14
+	add	w20,w20,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w22,w22,#2
+	eor	w6,w6,w3
+	add	w24,w24,w5	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w6,w6,#31
+	orr	w25,w21,w22
+	and	w26,w21,w22
+	eor	w7,w7,w9
+	ror	w27,w20,#27
+	and	w25,w25,w23
+	add	w23,w23,w28		// future e+=K
+	eor	w7,w7,w15
+	add	w24,w24,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w21,w21,#2
+	eor	w7,w7,w4
+	add	w23,w23,w6	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w7,w7,#31
+	orr	w25,w20,w21
+	and	w26,w20,w21
+	eor	w8,w8,w10
+	ror	w27,w24,#27
+	and	w25,w25,w22
+	add	w22,w22,w28		// future e+=K
+	eor	w8,w8,w16
+	add	w23,w23,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w20,w20,#2
+	eor	w8,w8,w5
+	add	w22,w22,w7	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w8,w8,#31
+	orr	w25,w24,w20
+	and	w26,w24,w20
+	eor	w9,w9,w11
+	ror	w27,w23,#27
+	and	w25,w25,w21
+	add	w21,w21,w28		// future e+=K
+	eor	w9,w9,w17
+	add	w22,w22,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w24,w24,#2
+	eor	w9,w9,w6
+	add	w21,w21,w8	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w9,w9,#31
+	orr	w25,w23,w24
+	and	w26,w23,w24
+	eor	w10,w10,w12
+	ror	w27,w22,#27
+	and	w25,w25,w20
+	add	w20,w20,w28		// future e+=K
+	eor	w10,w10,w19
+	add	w21,w21,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w23,w23,#2
+	eor	w10,w10,w7
+	add	w20,w20,w9	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w10,w10,#31
+	orr	w25,w22,w23
+	and	w26,w22,w23
+	eor	w11,w11,w13
+	ror	w27,w21,#27
+	and	w25,w25,w24
+	add	w24,w24,w28		// future e+=K
+	eor	w11,w11,w3
+	add	w20,w20,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w22,w22,#2
+	eor	w11,w11,w8
+	add	w24,w24,w10	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w11,w11,#31
+	orr	w25,w21,w22
+	and	w26,w21,w22
+	eor	w12,w12,w14
+	ror	w27,w20,#27
+	and	w25,w25,w23
+	add	w23,w23,w28		// future e+=K
+	eor	w12,w12,w4
+	add	w24,w24,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w21,w21,#2
+	eor	w12,w12,w9
+	add	w23,w23,w11	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w12,w12,#31
+	orr	w25,w20,w21
+	and	w26,w20,w21
+	eor	w13,w13,w15
+	ror	w27,w24,#27
+	and	w25,w25,w22
+	add	w22,w22,w28		// future e+=K
+	eor	w13,w13,w5
+	add	w23,w23,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w20,w20,#2
+	eor	w13,w13,w10
+	add	w22,w22,w12	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w13,w13,#31
+	orr	w25,w24,w20
+	and	w26,w24,w20
+	eor	w14,w14,w16
+	ror	w27,w23,#27
+	and	w25,w25,w21
+	add	w21,w21,w28		// future e+=K
+	eor	w14,w14,w6
+	add	w22,w22,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w24,w24,#2
+	eor	w14,w14,w11
+	add	w21,w21,w13	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w14,w14,#31
+	orr	w25,w23,w24
+	and	w26,w23,w24
+	eor	w15,w15,w17
+	ror	w27,w22,#27
+	and	w25,w25,w20
+	add	w20,w20,w28		// future e+=K
+	eor	w15,w15,w7
+	add	w21,w21,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w23,w23,#2
+	eor	w15,w15,w12
+	add	w20,w20,w14	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w15,w15,#31
+	movz	w28,#0xc1d6
+	movk	w28,#0xca62,lsl#16
+	orr	w25,w22,w23
+	and	w26,w22,w23
+	eor	w16,w16,w19
+	ror	w27,w21,#27
+	and	w25,w25,w24
+	add	w24,w24,w28		// future e+=K
+	eor	w16,w16,w8
+	add	w20,w20,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w22,w22,#2
+	eor	w16,w16,w13
+	add	w24,w24,w15	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w16,w16,#31
+	eor	w17,w17,w3
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w17,w17,w9
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w17,w17,w14
+	add	w23,w23,w16	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w17,w17,#31
+	eor	w19,w19,w4
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w19,w19,w10
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w19,w19,w15
+	add	w22,w22,w17	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w19,w19,#31
+	eor	w3,w3,w5
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w3,w3,w11
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w3,w3,w16
+	add	w21,w21,w19	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w3,w3,#31
+	eor	w4,w4,w6
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w4,w4,w12
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w4,w4,w17
+	add	w20,w20,w3	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w4,w4,#31
+	eor	w5,w5,w7
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w5,w5,w13
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w5,w5,w19
+	add	w24,w24,w4	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w5,w5,#31
+	eor	w6,w6,w8
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w6,w6,w14
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w6,w6,w3
+	add	w23,w23,w5	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w6,w6,#31
+	eor	w7,w7,w9
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w7,w7,w15
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w7,w7,w4
+	add	w22,w22,w6	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w7,w7,#31
+	eor	w8,w8,w10
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w8,w8,w16
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w8,w8,w5
+	add	w21,w21,w7	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w8,w8,#31
+	eor	w9,w9,w11
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w9,w9,w17
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w9,w9,w6
+	add	w20,w20,w8	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w9,w9,#31
+	eor	w10,w10,w12
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w10,w10,w19
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w10,w10,w7
+	add	w24,w24,w9	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w10,w10,#31
+	eor	w11,w11,w13
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w11,w11,w3
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w11,w11,w8
+	add	w23,w23,w10	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w11,w11,#31
+	eor	w12,w12,w14
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w12,w12,w4
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w12,w12,w9
+	add	w22,w22,w11	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w12,w12,#31
+	eor	w13,w13,w15
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w13,w13,w5
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w13,w13,w10
+	add	w21,w21,w12	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w13,w13,#31
+	eor	w14,w14,w16
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w14,w14,w6
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w14,w14,w11
+	add	w20,w20,w13	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w14,w14,#31
+	eor	w15,w15,w17
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w15,w15,w7
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w15,w15,w12
+	add	w24,w24,w14	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w15,w15,#31
+	eor	w16,w16,w19
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w16,w16,w8
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w16,w16,w13
+	add	w23,w23,w15	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w16,w16,#31
+	eor	w17,w17,w3
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w17,w17,w9
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w17,w17,w14
+	add	w22,w22,w16	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w17,w17,#31
+	eor	w19,w19,w4
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w19,w19,w10
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w19,w19,w15
+	add	w21,w21,w17	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w19,w19,#31
+	ldp	w4,w5,[x0]
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	add	w20,w20,w19	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ldp	w6,w7,[x0,#8]
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	ldr	w8,[x0,#16]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	add	w21,w21,w5
+	add	w22,w22,w6
+	add	w20,w20,w4
+	add	w23,w23,w7
+	add	w24,w24,w8
+	stp	w20,w21,[x0]
+	stp	w22,w23,[x0,#8]
+	str	w24,[x0,#16]
+	cbnz	x2,Loop
+
+	ldp	x19,x20,[sp,#16]
+	ldp	x21,x22,[sp,#32]
+	ldp	x23,x24,[sp,#48]
+	ldp	x25,x26,[sp,#64]
+	ldp	x27,x28,[sp,#80]
+	ldr	x29,[sp],#96
+	ret
+
+.globl	_sha1_block_data_order_hw
+.private_extern	_sha1_block_data_order_hw
+
+.align	6
+_sha1_block_data_order_hw:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	adrp	x4,Lconst@PAGE
+	add	x4,x4,Lconst@PAGEOFF
+	eor	v1.16b,v1.16b,v1.16b
+	ld1	{v0.4s},[x0],#16
+	ld1	{v1.s}[0],[x0]
+	sub	x0,x0,#16
+	ld1	{v16.4s,v17.4s,v18.4s,v19.4s},[x4]
+
+Loop_hw:
+	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+	sub	x2,x2,#1
+	rev32	v4.16b,v4.16b
+	rev32	v5.16b,v5.16b
+
+	add	v20.4s,v16.4s,v4.4s
+	rev32	v6.16b,v6.16b
+	orr	v22.16b,v0.16b,v0.16b	// offload
+
+	add	v21.4s,v16.4s,v5.4s
+	rev32	v7.16b,v7.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b
+.long	0x5e140020	//sha1c v0.16b,v1.16b,v20.4s		// 0
+	add	v20.4s,v16.4s,v6.4s
+.long	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 1
+.long	0x5e150060	//sha1c v0.16b,v3.16b,v21.4s
+	add	v21.4s,v16.4s,v7.4s
+.long	0x5e2818e4	//sha1su1 v4.16b,v7.16b
+.long	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 2
+.long	0x5e140040	//sha1c v0.16b,v2.16b,v20.4s
+	add	v20.4s,v16.4s,v4.4s
+.long	0x5e281885	//sha1su1 v5.16b,v4.16b
+.long	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 3
+.long	0x5e150060	//sha1c v0.16b,v3.16b,v21.4s
+	add	v21.4s,v17.4s,v5.4s
+.long	0x5e2818a6	//sha1su1 v6.16b,v5.16b
+.long	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 4
+.long	0x5e140040	//sha1c v0.16b,v2.16b,v20.4s
+	add	v20.4s,v17.4s,v6.4s
+.long	0x5e2818c7	//sha1su1 v7.16b,v6.16b
+.long	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 5
+.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v17.4s,v7.4s
+.long	0x5e2818e4	//sha1su1 v4.16b,v7.16b
+.long	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 6
+.long	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
+	add	v20.4s,v17.4s,v4.4s
+.long	0x5e281885	//sha1su1 v5.16b,v4.16b
+.long	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 7
+.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v17.4s,v5.4s
+.long	0x5e2818a6	//sha1su1 v6.16b,v5.16b
+.long	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 8
+.long	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
+	add	v20.4s,v18.4s,v6.4s
+.long	0x5e2818c7	//sha1su1 v7.16b,v6.16b
+.long	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 9
+.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v18.4s,v7.4s
+.long	0x5e2818e4	//sha1su1 v4.16b,v7.16b
+.long	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 10
+.long	0x5e142040	//sha1m v0.16b,v2.16b,v20.4s
+	add	v20.4s,v18.4s,v4.4s
+.long	0x5e281885	//sha1su1 v5.16b,v4.16b
+.long	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 11
+.long	0x5e152060	//sha1m v0.16b,v3.16b,v21.4s
+	add	v21.4s,v18.4s,v5.4s
+.long	0x5e2818a6	//sha1su1 v6.16b,v5.16b
+.long	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 12
+.long	0x5e142040	//sha1m v0.16b,v2.16b,v20.4s
+	add	v20.4s,v18.4s,v6.4s
+.long	0x5e2818c7	//sha1su1 v7.16b,v6.16b
+.long	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 13
+.long	0x5e152060	//sha1m v0.16b,v3.16b,v21.4s
+	add	v21.4s,v19.4s,v7.4s
+.long	0x5e2818e4	//sha1su1 v4.16b,v7.16b
+.long	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 14
+.long	0x5e142040	//sha1m v0.16b,v2.16b,v20.4s
+	add	v20.4s,v19.4s,v4.4s
+.long	0x5e281885	//sha1su1 v5.16b,v4.16b
+.long	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 15
+.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v19.4s,v5.4s
+.long	0x5e2818a6	//sha1su1 v6.16b,v5.16b
+.long	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 16
+.long	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
+	add	v20.4s,v19.4s,v6.4s
+.long	0x5e2818c7	//sha1su1 v7.16b,v6.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 17
+.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v19.4s,v7.4s
+
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 18
+.long	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
+
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 19
+.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+
+	add	v1.4s,v1.4s,v2.4s
+	add	v0.4s,v0.4s,v22.4s
+
+	cbnz	x2,Loop_hw
+
+	st1	{v0.4s},[x0],#16
+	st1	{v1.s}[0],[x0]
+
+	ldr	x29,[sp],#16
+	ret
+
+.section	__TEXT,__const
+.align	6
+Lconst:
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	//K_00_19
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	//K_20_39
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	//K_40_59
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	//K_60_79
+.byte	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/gen/bcm/sha1-armv8-linux.S b/gen/bcm/sha1-armv8-linux.S
new file mode 100644
index 0000000..f2df2dd
--- /dev/null
+++ b/gen/bcm/sha1-armv8-linux.S
@@ -0,0 +1,1218 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+.text
+
+.globl	sha1_block_data_order_nohw
+.hidden	sha1_block_data_order_nohw
+.type	sha1_block_data_order_nohw,%function
+.align	6
+sha1_block_data_order_nohw:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+	ldp	w20,w21,[x0]
+	ldp	w22,w23,[x0,#8]
+	ldr	w24,[x0,#16]
+
+.Loop:
+	ldr	x3,[x1],#64
+	movz	w28,#0x7999
+	sub	x2,x2,#1
+	movk	w28,#0x5a82,lsl#16
+#ifdef	__AARCH64EB__
+	ror	x3,x3,#32
+#else
+	rev32	x3,x3
+#endif
+	add	w24,w24,w28		// warm it up
+	add	w24,w24,w3
+	lsr	x4,x3,#32
+	ldr	x5,[x1,#-56]
+	bic	w25,w23,w21
+	and	w26,w22,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	add	w23,w23,w4	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x5,x5,#32
+#else
+	rev32	x5,x5
+#endif
+	bic	w25,w22,w20
+	and	w26,w21,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	add	w22,w22,w5	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	lsr	x6,x5,#32
+	ldr	x7,[x1,#-48]
+	bic	w25,w21,w24
+	and	w26,w20,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	add	w21,w21,w6	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x7,x7,#32
+#else
+	rev32	x7,x7
+#endif
+	bic	w25,w20,w23
+	and	w26,w24,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	add	w20,w20,w7	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	lsr	x8,x7,#32
+	ldr	x9,[x1,#-40]
+	bic	w25,w24,w22
+	and	w26,w23,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	add	w24,w24,w8	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x9,x9,#32
+#else
+	rev32	x9,x9
+#endif
+	bic	w25,w23,w21
+	and	w26,w22,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	add	w23,w23,w9	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	lsr	x10,x9,#32
+	ldr	x11,[x1,#-32]
+	bic	w25,w22,w20
+	and	w26,w21,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	add	w22,w22,w10	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x11,x11,#32
+#else
+	rev32	x11,x11
+#endif
+	bic	w25,w21,w24
+	and	w26,w20,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	add	w21,w21,w11	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	lsr	x12,x11,#32
+	ldr	x13,[x1,#-24]
+	bic	w25,w20,w23
+	and	w26,w24,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	add	w20,w20,w12	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x13,x13,#32
+#else
+	rev32	x13,x13
+#endif
+	bic	w25,w24,w22
+	and	w26,w23,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	add	w24,w24,w13	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	lsr	x14,x13,#32
+	ldr	x15,[x1,#-16]
+	bic	w25,w23,w21
+	and	w26,w22,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	add	w23,w23,w14	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x15,x15,#32
+#else
+	rev32	x15,x15
+#endif
+	bic	w25,w22,w20
+	and	w26,w21,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	add	w22,w22,w15	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	lsr	x16,x15,#32
+	ldr	x17,[x1,#-8]
+	bic	w25,w21,w24
+	and	w26,w20,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	add	w21,w21,w16	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x17,x17,#32
+#else
+	rev32	x17,x17
+#endif
+	bic	w25,w20,w23
+	and	w26,w24,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	add	w20,w20,w17	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	lsr	x19,x17,#32
+	eor	w3,w3,w5
+	bic	w25,w24,w22
+	and	w26,w23,w22
+	ror	w27,w21,#27
+	eor	w3,w3,w11
+	add	w24,w24,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w20,w20,w27		// e+=rot(a,5)
+	eor	w3,w3,w16
+	ror	w22,w22,#2
+	add	w24,w24,w19	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w3,w3,#31
+	eor	w4,w4,w6
+	bic	w25,w23,w21
+	and	w26,w22,w21
+	ror	w27,w20,#27
+	eor	w4,w4,w12
+	add	w23,w23,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w24,w24,w27		// e+=rot(a,5)
+	eor	w4,w4,w17
+	ror	w21,w21,#2
+	add	w23,w23,w3	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w4,w4,#31
+	eor	w5,w5,w7
+	bic	w25,w22,w20
+	and	w26,w21,w20
+	ror	w27,w24,#27
+	eor	w5,w5,w13
+	add	w22,w22,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w23,w23,w27		// e+=rot(a,5)
+	eor	w5,w5,w19
+	ror	w20,w20,#2
+	add	w22,w22,w4	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w5,w5,#31
+	eor	w6,w6,w8
+	bic	w25,w21,w24
+	and	w26,w20,w24
+	ror	w27,w23,#27
+	eor	w6,w6,w14
+	add	w21,w21,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w22,w22,w27		// e+=rot(a,5)
+	eor	w6,w6,w3
+	ror	w24,w24,#2
+	add	w21,w21,w5	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w6,w6,#31
+	eor	w7,w7,w9
+	bic	w25,w20,w23
+	and	w26,w24,w23
+	ror	w27,w22,#27
+	eor	w7,w7,w15
+	add	w20,w20,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w21,w21,w27		// e+=rot(a,5)
+	eor	w7,w7,w4
+	ror	w23,w23,#2
+	add	w20,w20,w6	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w7,w7,#31
+	movz	w28,#0xeba1
+	movk	w28,#0x6ed9,lsl#16
+	eor	w8,w8,w10
+	bic	w25,w24,w22
+	and	w26,w23,w22
+	ror	w27,w21,#27
+	eor	w8,w8,w16
+	add	w24,w24,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w20,w20,w27		// e+=rot(a,5)
+	eor	w8,w8,w5
+	ror	w22,w22,#2
+	add	w24,w24,w7	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w8,w8,#31
+	eor	w9,w9,w11
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w9,w9,w17
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w9,w9,w6
+	add	w23,w23,w8	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w9,w9,#31
+	eor	w10,w10,w12
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w10,w10,w19
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w10,w10,w7
+	add	w22,w22,w9	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w10,w10,#31
+	eor	w11,w11,w13
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w11,w11,w3
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w11,w11,w8
+	add	w21,w21,w10	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w11,w11,#31
+	eor	w12,w12,w14
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w12,w12,w4
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w12,w12,w9
+	add	w20,w20,w11	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w12,w12,#31
+	eor	w13,w13,w15
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w13,w13,w5
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w13,w13,w10
+	add	w24,w24,w12	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w13,w13,#31
+	eor	w14,w14,w16
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w14,w14,w6
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w14,w14,w11
+	add	w23,w23,w13	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w14,w14,#31
+	eor	w15,w15,w17
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w15,w15,w7
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w15,w15,w12
+	add	w22,w22,w14	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w15,w15,#31
+	eor	w16,w16,w19
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w16,w16,w8
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w16,w16,w13
+	add	w21,w21,w15	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w16,w16,#31
+	eor	w17,w17,w3
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w17,w17,w9
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w17,w17,w14
+	add	w20,w20,w16	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w17,w17,#31
+	eor	w19,w19,w4
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w19,w19,w10
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w19,w19,w15
+	add	w24,w24,w17	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w19,w19,#31
+	eor	w3,w3,w5
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w3,w3,w11
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w3,w3,w16
+	add	w23,w23,w19	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w3,w3,#31
+	eor	w4,w4,w6
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w4,w4,w12
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w4,w4,w17
+	add	w22,w22,w3	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w4,w4,#31
+	eor	w5,w5,w7
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w5,w5,w13
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w5,w5,w19
+	add	w21,w21,w4	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w5,w5,#31
+	eor	w6,w6,w8
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w6,w6,w14
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w6,w6,w3
+	add	w20,w20,w5	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w6,w6,#31
+	eor	w7,w7,w9
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w7,w7,w15
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w7,w7,w4
+	add	w24,w24,w6	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w7,w7,#31
+	eor	w8,w8,w10
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w8,w8,w16
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w8,w8,w5
+	add	w23,w23,w7	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w8,w8,#31
+	eor	w9,w9,w11
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w9,w9,w17
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w9,w9,w6
+	add	w22,w22,w8	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w9,w9,#31
+	eor	w10,w10,w12
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w10,w10,w19
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w10,w10,w7
+	add	w21,w21,w9	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w10,w10,#31
+	eor	w11,w11,w13
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w11,w11,w3
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w11,w11,w8
+	add	w20,w20,w10	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w11,w11,#31
+	movz	w28,#0xbcdc
+	movk	w28,#0x8f1b,lsl#16
+	eor	w12,w12,w14
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w12,w12,w4
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w12,w12,w9
+	add	w24,w24,w11	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w12,w12,#31
+	orr	w25,w21,w22
+	and	w26,w21,w22
+	eor	w13,w13,w15
+	ror	w27,w20,#27
+	and	w25,w25,w23
+	add	w23,w23,w28		// future e+=K
+	eor	w13,w13,w5
+	add	w24,w24,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w21,w21,#2
+	eor	w13,w13,w10
+	add	w23,w23,w12	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w13,w13,#31
+	orr	w25,w20,w21
+	and	w26,w20,w21
+	eor	w14,w14,w16
+	ror	w27,w24,#27
+	and	w25,w25,w22
+	add	w22,w22,w28		// future e+=K
+	eor	w14,w14,w6
+	add	w23,w23,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w20,w20,#2
+	eor	w14,w14,w11
+	add	w22,w22,w13	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w14,w14,#31
+	orr	w25,w24,w20
+	and	w26,w24,w20
+	eor	w15,w15,w17
+	ror	w27,w23,#27
+	and	w25,w25,w21
+	add	w21,w21,w28		// future e+=K
+	eor	w15,w15,w7
+	add	w22,w22,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w24,w24,#2
+	eor	w15,w15,w12
+	add	w21,w21,w14	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w15,w15,#31
+	orr	w25,w23,w24
+	and	w26,w23,w24
+	eor	w16,w16,w19
+	ror	w27,w22,#27
+	and	w25,w25,w20
+	add	w20,w20,w28		// future e+=K
+	eor	w16,w16,w8
+	add	w21,w21,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w23,w23,#2
+	eor	w16,w16,w13
+	add	w20,w20,w15	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w16,w16,#31
+	orr	w25,w22,w23
+	and	w26,w22,w23
+	eor	w17,w17,w3
+	ror	w27,w21,#27
+	and	w25,w25,w24
+	add	w24,w24,w28		// future e+=K
+	eor	w17,w17,w9
+	add	w20,w20,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w22,w22,#2
+	eor	w17,w17,w14
+	add	w24,w24,w16	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w17,w17,#31
+	orr	w25,w21,w22
+	and	w26,w21,w22
+	eor	w19,w19,w4
+	ror	w27,w20,#27
+	and	w25,w25,w23
+	add	w23,w23,w28		// future e+=K
+	eor	w19,w19,w10
+	add	w24,w24,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w21,w21,#2
+	eor	w19,w19,w15
+	add	w23,w23,w17	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w19,w19,#31
+	orr	w25,w20,w21
+	and	w26,w20,w21
+	eor	w3,w3,w5
+	ror	w27,w24,#27
+	and	w25,w25,w22
+	add	w22,w22,w28		// future e+=K
+	eor	w3,w3,w11
+	add	w23,w23,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w20,w20,#2
+	eor	w3,w3,w16
+	add	w22,w22,w19	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w3,w3,#31
+	orr	w25,w24,w20
+	and	w26,w24,w20
+	eor	w4,w4,w6
+	ror	w27,w23,#27
+	and	w25,w25,w21
+	add	w21,w21,w28		// future e+=K
+	eor	w4,w4,w12
+	add	w22,w22,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w24,w24,#2
+	eor	w4,w4,w17
+	add	w21,w21,w3	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w4,w4,#31
+	orr	w25,w23,w24
+	and	w26,w23,w24
+	eor	w5,w5,w7
+	ror	w27,w22,#27
+	and	w25,w25,w20
+	add	w20,w20,w28		// future e+=K
+	eor	w5,w5,w13
+	add	w21,w21,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w23,w23,#2
+	eor	w5,w5,w19
+	add	w20,w20,w4	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w5,w5,#31
+	orr	w25,w22,w23
+	and	w26,w22,w23
+	eor	w6,w6,w8
+	ror	w27,w21,#27
+	and	w25,w25,w24
+	add	w24,w24,w28		// future e+=K
+	eor	w6,w6,w14
+	add	w20,w20,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w22,w22,#2
+	eor	w6,w6,w3
+	add	w24,w24,w5	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w6,w6,#31
+	orr	w25,w21,w22
+	and	w26,w21,w22
+	eor	w7,w7,w9
+	ror	w27,w20,#27
+	and	w25,w25,w23
+	add	w23,w23,w28		// future e+=K
+	eor	w7,w7,w15
+	add	w24,w24,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w21,w21,#2
+	eor	w7,w7,w4
+	add	w23,w23,w6	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w7,w7,#31
+	orr	w25,w20,w21
+	and	w26,w20,w21
+	eor	w8,w8,w10
+	ror	w27,w24,#27
+	and	w25,w25,w22
+	add	w22,w22,w28		// future e+=K
+	eor	w8,w8,w16
+	add	w23,w23,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w20,w20,#2
+	eor	w8,w8,w5
+	add	w22,w22,w7	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w8,w8,#31
+	orr	w25,w24,w20
+	and	w26,w24,w20
+	eor	w9,w9,w11
+	ror	w27,w23,#27
+	and	w25,w25,w21
+	add	w21,w21,w28		// future e+=K
+	eor	w9,w9,w17
+	add	w22,w22,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w24,w24,#2
+	eor	w9,w9,w6
+	add	w21,w21,w8	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w9,w9,#31
+	orr	w25,w23,w24
+	and	w26,w23,w24
+	eor	w10,w10,w12
+	ror	w27,w22,#27
+	and	w25,w25,w20
+	add	w20,w20,w28		// future e+=K
+	eor	w10,w10,w19
+	add	w21,w21,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w23,w23,#2
+	eor	w10,w10,w7
+	add	w20,w20,w9	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w10,w10,#31
+	orr	w25,w22,w23
+	and	w26,w22,w23
+	eor	w11,w11,w13
+	ror	w27,w21,#27
+	and	w25,w25,w24
+	add	w24,w24,w28		// future e+=K
+	eor	w11,w11,w3
+	add	w20,w20,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w22,w22,#2
+	eor	w11,w11,w8
+	add	w24,w24,w10	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w11,w11,#31
+	orr	w25,w21,w22
+	and	w26,w21,w22
+	eor	w12,w12,w14
+	ror	w27,w20,#27
+	and	w25,w25,w23
+	add	w23,w23,w28		// future e+=K
+	eor	w12,w12,w4
+	add	w24,w24,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w21,w21,#2
+	eor	w12,w12,w9
+	add	w23,w23,w11	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w12,w12,#31
+	orr	w25,w20,w21
+	and	w26,w20,w21
+	eor	w13,w13,w15
+	ror	w27,w24,#27
+	and	w25,w25,w22
+	add	w22,w22,w28		// future e+=K
+	eor	w13,w13,w5
+	add	w23,w23,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w20,w20,#2
+	eor	w13,w13,w10
+	add	w22,w22,w12	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w13,w13,#31
+	orr	w25,w24,w20
+	and	w26,w24,w20
+	eor	w14,w14,w16
+	ror	w27,w23,#27
+	and	w25,w25,w21
+	add	w21,w21,w28		// future e+=K
+	eor	w14,w14,w6
+	add	w22,w22,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w24,w24,#2
+	eor	w14,w14,w11
+	add	w21,w21,w13	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w14,w14,#31
+	orr	w25,w23,w24
+	and	w26,w23,w24
+	eor	w15,w15,w17
+	ror	w27,w22,#27
+	and	w25,w25,w20
+	add	w20,w20,w28		// future e+=K
+	eor	w15,w15,w7
+	add	w21,w21,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w23,w23,#2
+	eor	w15,w15,w12
+	add	w20,w20,w14	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w15,w15,#31
+	movz	w28,#0xc1d6
+	movk	w28,#0xca62,lsl#16
+	orr	w25,w22,w23
+	and	w26,w22,w23
+	eor	w16,w16,w19
+	ror	w27,w21,#27
+	and	w25,w25,w24
+	add	w24,w24,w28		// future e+=K
+	eor	w16,w16,w8
+	add	w20,w20,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w22,w22,#2
+	eor	w16,w16,w13
+	add	w24,w24,w15	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w16,w16,#31
+	eor	w17,w17,w3
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w17,w17,w9
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w17,w17,w14
+	add	w23,w23,w16	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w17,w17,#31
+	eor	w19,w19,w4
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w19,w19,w10
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w19,w19,w15
+	add	w22,w22,w17	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w19,w19,#31
+	eor	w3,w3,w5
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w3,w3,w11
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w3,w3,w16
+	add	w21,w21,w19	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w3,w3,#31
+	eor	w4,w4,w6
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w4,w4,w12
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w4,w4,w17
+	add	w20,w20,w3	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w4,w4,#31
+	eor	w5,w5,w7
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w5,w5,w13
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w5,w5,w19
+	add	w24,w24,w4	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w5,w5,#31
+	eor	w6,w6,w8
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w6,w6,w14
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w6,w6,w3
+	add	w23,w23,w5	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w6,w6,#31
+	eor	w7,w7,w9
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w7,w7,w15
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w7,w7,w4
+	add	w22,w22,w6	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w7,w7,#31
+	eor	w8,w8,w10
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w8,w8,w16
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w8,w8,w5
+	add	w21,w21,w7	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w8,w8,#31
+	eor	w9,w9,w11
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w9,w9,w17
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w9,w9,w6
+	add	w20,w20,w8	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w9,w9,#31
+	eor	w10,w10,w12
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w10,w10,w19
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w10,w10,w7
+	add	w24,w24,w9	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w10,w10,#31
+	eor	w11,w11,w13
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w11,w11,w3
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w11,w11,w8
+	add	w23,w23,w10	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w11,w11,#31
+	eor	w12,w12,w14
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w12,w12,w4
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w12,w12,w9
+	add	w22,w22,w11	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w12,w12,#31
+	eor	w13,w13,w15
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w13,w13,w5
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w13,w13,w10
+	add	w21,w21,w12	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w13,w13,#31
+	eor	w14,w14,w16
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w14,w14,w6
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w14,w14,w11
+	add	w20,w20,w13	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w14,w14,#31
+	eor	w15,w15,w17
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w15,w15,w7
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w15,w15,w12
+	add	w24,w24,w14	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w15,w15,#31
+	eor	w16,w16,w19
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w16,w16,w8
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w16,w16,w13
+	add	w23,w23,w15	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w16,w16,#31
+	eor	w17,w17,w3
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w17,w17,w9
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w17,w17,w14
+	add	w22,w22,w16	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w17,w17,#31
+	eor	w19,w19,w4
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w19,w19,w10
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w19,w19,w15
+	add	w21,w21,w17	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w19,w19,#31
+	ldp	w4,w5,[x0]
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	add	w20,w20,w19	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ldp	w6,w7,[x0,#8]
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	ldr	w8,[x0,#16]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	add	w21,w21,w5
+	add	w22,w22,w6
+	add	w20,w20,w4
+	add	w23,w23,w7
+	add	w24,w24,w8
+	stp	w20,w21,[x0]
+	stp	w22,w23,[x0,#8]
+	str	w24,[x0,#16]
+	cbnz	x2,.Loop
+
+	ldp	x19,x20,[sp,#16]
+	ldp	x21,x22,[sp,#32]
+	ldp	x23,x24,[sp,#48]
+	ldp	x25,x26,[sp,#64]
+	ldp	x27,x28,[sp,#80]
+	ldr	x29,[sp],#96
+	ret
+.size	sha1_block_data_order_nohw,.-sha1_block_data_order_nohw
+.globl	sha1_block_data_order_hw
+.hidden	sha1_block_data_order_hw
+.type	sha1_block_data_order_hw,%function
+.align	6
+sha1_block_data_order_hw:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	adrp	x4,.Lconst
+	add	x4,x4,:lo12:.Lconst
+	eor	v1.16b,v1.16b,v1.16b
+	ld1	{v0.4s},[x0],#16
+	ld1	{v1.s}[0],[x0]
+	sub	x0,x0,#16
+	ld1	{v16.4s,v17.4s,v18.4s,v19.4s},[x4]
+
+.Loop_hw:
+	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+	sub	x2,x2,#1
+	rev32	v4.16b,v4.16b
+	rev32	v5.16b,v5.16b
+
+	add	v20.4s,v16.4s,v4.4s
+	rev32	v6.16b,v6.16b
+	orr	v22.16b,v0.16b,v0.16b	// offload
+
+	add	v21.4s,v16.4s,v5.4s
+	rev32	v7.16b,v7.16b
+.inst	0x5e280803	//sha1h v3.16b,v0.16b
+.inst	0x5e140020	//sha1c v0.16b,v1.16b,v20.4s		// 0
+	add	v20.4s,v16.4s,v6.4s
+.inst	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
+.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 1
+.inst	0x5e150060	//sha1c v0.16b,v3.16b,v21.4s
+	add	v21.4s,v16.4s,v7.4s
+.inst	0x5e2818e4	//sha1su1 v4.16b,v7.16b
+.inst	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
+.inst	0x5e280803	//sha1h v3.16b,v0.16b		// 2
+.inst	0x5e140040	//sha1c v0.16b,v2.16b,v20.4s
+	add	v20.4s,v16.4s,v4.4s
+.inst	0x5e281885	//sha1su1 v5.16b,v4.16b
+.inst	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
+.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 3
+.inst	0x5e150060	//sha1c v0.16b,v3.16b,v21.4s
+	add	v21.4s,v17.4s,v5.4s
+.inst	0x5e2818a6	//sha1su1 v6.16b,v5.16b
+.inst	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
+.inst	0x5e280803	//sha1h v3.16b,v0.16b		// 4
+.inst	0x5e140040	//sha1c v0.16b,v2.16b,v20.4s
+	add	v20.4s,v17.4s,v6.4s
+.inst	0x5e2818c7	//sha1su1 v7.16b,v6.16b
+.inst	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
+.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 5
+.inst	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v17.4s,v7.4s
+.inst	0x5e2818e4	//sha1su1 v4.16b,v7.16b
+.inst	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
+.inst	0x5e280803	//sha1h v3.16b,v0.16b		// 6
+.inst	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
+	add	v20.4s,v17.4s,v4.4s
+.inst	0x5e281885	//sha1su1 v5.16b,v4.16b
+.inst	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
+.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 7
+.inst	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v17.4s,v5.4s
+.inst	0x5e2818a6	//sha1su1 v6.16b,v5.16b
+.inst	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
+.inst	0x5e280803	//sha1h v3.16b,v0.16b		// 8
+.inst	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
+	add	v20.4s,v18.4s,v6.4s
+.inst	0x5e2818c7	//sha1su1 v7.16b,v6.16b
+.inst	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
+.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 9
+.inst	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v18.4s,v7.4s
+.inst	0x5e2818e4	//sha1su1 v4.16b,v7.16b
+.inst	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
+.inst	0x5e280803	//sha1h v3.16b,v0.16b		// 10
+.inst	0x5e142040	//sha1m v0.16b,v2.16b,v20.4s
+	add	v20.4s,v18.4s,v4.4s
+.inst	0x5e281885	//sha1su1 v5.16b,v4.16b
+.inst	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
+.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 11
+.inst	0x5e152060	//sha1m v0.16b,v3.16b,v21.4s
+	add	v21.4s,v18.4s,v5.4s
+.inst	0x5e2818a6	//sha1su1 v6.16b,v5.16b
+.inst	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
+.inst	0x5e280803	//sha1h v3.16b,v0.16b		// 12
+.inst	0x5e142040	//sha1m v0.16b,v2.16b,v20.4s
+	add	v20.4s,v18.4s,v6.4s
+.inst	0x5e2818c7	//sha1su1 v7.16b,v6.16b
+.inst	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
+.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 13
+.inst	0x5e152060	//sha1m v0.16b,v3.16b,v21.4s
+	add	v21.4s,v19.4s,v7.4s
+.inst	0x5e2818e4	//sha1su1 v4.16b,v7.16b
+.inst	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
+.inst	0x5e280803	//sha1h v3.16b,v0.16b		// 14
+.inst	0x5e142040	//sha1m v0.16b,v2.16b,v20.4s
+	add	v20.4s,v19.4s,v4.4s
+.inst	0x5e281885	//sha1su1 v5.16b,v4.16b
+.inst	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
+.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 15
+.inst	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v19.4s,v5.4s
+.inst	0x5e2818a6	//sha1su1 v6.16b,v5.16b
+.inst	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
+.inst	0x5e280803	//sha1h v3.16b,v0.16b		// 16
+.inst	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
+	add	v20.4s,v19.4s,v6.4s
+.inst	0x5e2818c7	//sha1su1 v7.16b,v6.16b
+.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 17
+.inst	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v19.4s,v7.4s
+
+.inst	0x5e280803	//sha1h v3.16b,v0.16b		// 18
+.inst	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
+
+.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 19
+.inst	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+
+	add	v1.4s,v1.4s,v2.4s
+	add	v0.4s,v0.4s,v22.4s
+
+	cbnz	x2,.Loop_hw
+
+	st1	{v0.4s},[x0],#16
+	st1	{v1.s}[0],[x0]
+
+	ldr	x29,[sp],#16
+	ret
+.size	sha1_block_data_order_hw,.-sha1_block_data_order_hw
+.section	.rodata
+.align	6
+.Lconst:
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	//K_00_19
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	//K_20_39
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	//K_40_59
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	//K_60_79
+.byte	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/bcm/sha1-armv8-win.S b/gen/bcm/sha1-armv8-win.S
new file mode 100644
index 0000000..f8c8b86
--- /dev/null
+++ b/gen/bcm/sha1-armv8-win.S
@@ -0,0 +1,1222 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+#include <openssl/arm_arch.h>
+
+.text
+
+.globl	sha1_block_data_order_nohw
+
+.def sha1_block_data_order_nohw
+   .type 32
+.endef
+.align	6
+sha1_block_data_order_nohw:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+	ldp	w20,w21,[x0]
+	ldp	w22,w23,[x0,#8]
+	ldr	w24,[x0,#16]
+
+Loop:
+	ldr	x3,[x1],#64
+	movz	w28,#0x7999
+	sub	x2,x2,#1
+	movk	w28,#0x5a82,lsl#16
+#ifdef	__AARCH64EB__
+	ror	x3,x3,#32
+#else
+	rev32	x3,x3
+#endif
+	add	w24,w24,w28		// warm it up
+	add	w24,w24,w3
+	lsr	x4,x3,#32
+	ldr	x5,[x1,#-56]
+	bic	w25,w23,w21
+	and	w26,w22,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	add	w23,w23,w4	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x5,x5,#32
+#else
+	rev32	x5,x5
+#endif
+	bic	w25,w22,w20
+	and	w26,w21,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	add	w22,w22,w5	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	lsr	x6,x5,#32
+	ldr	x7,[x1,#-48]
+	bic	w25,w21,w24
+	and	w26,w20,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	add	w21,w21,w6	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x7,x7,#32
+#else
+	rev32	x7,x7
+#endif
+	bic	w25,w20,w23
+	and	w26,w24,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	add	w20,w20,w7	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	lsr	x8,x7,#32
+	ldr	x9,[x1,#-40]
+	bic	w25,w24,w22
+	and	w26,w23,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	add	w24,w24,w8	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x9,x9,#32
+#else
+	rev32	x9,x9
+#endif
+	bic	w25,w23,w21
+	and	w26,w22,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	add	w23,w23,w9	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	lsr	x10,x9,#32
+	ldr	x11,[x1,#-32]
+	bic	w25,w22,w20
+	and	w26,w21,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	add	w22,w22,w10	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x11,x11,#32
+#else
+	rev32	x11,x11
+#endif
+	bic	w25,w21,w24
+	and	w26,w20,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	add	w21,w21,w11	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	lsr	x12,x11,#32
+	ldr	x13,[x1,#-24]
+	bic	w25,w20,w23
+	and	w26,w24,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	add	w20,w20,w12	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x13,x13,#32
+#else
+	rev32	x13,x13
+#endif
+	bic	w25,w24,w22
+	and	w26,w23,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	add	w24,w24,w13	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	lsr	x14,x13,#32
+	ldr	x15,[x1,#-16]
+	bic	w25,w23,w21
+	and	w26,w22,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	add	w23,w23,w14	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x15,x15,#32
+#else
+	rev32	x15,x15
+#endif
+	bic	w25,w22,w20
+	and	w26,w21,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	add	w22,w22,w15	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	lsr	x16,x15,#32
+	ldr	x17,[x1,#-8]
+	bic	w25,w21,w24
+	and	w26,w20,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	add	w21,w21,w16	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x17,x17,#32
+#else
+	rev32	x17,x17
+#endif
+	bic	w25,w20,w23
+	and	w26,w24,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	add	w20,w20,w17	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	lsr	x19,x17,#32
+	eor	w3,w3,w5
+	bic	w25,w24,w22
+	and	w26,w23,w22
+	ror	w27,w21,#27
+	eor	w3,w3,w11
+	add	w24,w24,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w20,w20,w27		// e+=rot(a,5)
+	eor	w3,w3,w16
+	ror	w22,w22,#2
+	add	w24,w24,w19	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w3,w3,#31
+	eor	w4,w4,w6
+	bic	w25,w23,w21
+	and	w26,w22,w21
+	ror	w27,w20,#27
+	eor	w4,w4,w12
+	add	w23,w23,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w24,w24,w27		// e+=rot(a,5)
+	eor	w4,w4,w17
+	ror	w21,w21,#2
+	add	w23,w23,w3	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w4,w4,#31
+	eor	w5,w5,w7
+	bic	w25,w22,w20
+	and	w26,w21,w20
+	ror	w27,w24,#27
+	eor	w5,w5,w13
+	add	w22,w22,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w23,w23,w27		// e+=rot(a,5)
+	eor	w5,w5,w19
+	ror	w20,w20,#2
+	add	w22,w22,w4	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w5,w5,#31
+	eor	w6,w6,w8
+	bic	w25,w21,w24
+	and	w26,w20,w24
+	ror	w27,w23,#27
+	eor	w6,w6,w14
+	add	w21,w21,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w22,w22,w27		// e+=rot(a,5)
+	eor	w6,w6,w3
+	ror	w24,w24,#2
+	add	w21,w21,w5	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w6,w6,#31
+	eor	w7,w7,w9
+	bic	w25,w20,w23
+	and	w26,w24,w23
+	ror	w27,w22,#27
+	eor	w7,w7,w15
+	add	w20,w20,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w21,w21,w27		// e+=rot(a,5)
+	eor	w7,w7,w4
+	ror	w23,w23,#2
+	add	w20,w20,w6	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w7,w7,#31
+	movz	w28,#0xeba1
+	movk	w28,#0x6ed9,lsl#16
+	eor	w8,w8,w10
+	bic	w25,w24,w22
+	and	w26,w23,w22
+	ror	w27,w21,#27
+	eor	w8,w8,w16
+	add	w24,w24,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w20,w20,w27		// e+=rot(a,5)
+	eor	w8,w8,w5
+	ror	w22,w22,#2
+	add	w24,w24,w7	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w8,w8,#31
+	eor	w9,w9,w11
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w9,w9,w17
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w9,w9,w6
+	add	w23,w23,w8	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w9,w9,#31
+	eor	w10,w10,w12
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w10,w10,w19
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w10,w10,w7
+	add	w22,w22,w9	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w10,w10,#31
+	eor	w11,w11,w13
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w11,w11,w3
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w11,w11,w8
+	add	w21,w21,w10	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w11,w11,#31
+	eor	w12,w12,w14
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w12,w12,w4
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w12,w12,w9
+	add	w20,w20,w11	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w12,w12,#31
+	eor	w13,w13,w15
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w13,w13,w5
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w13,w13,w10
+	add	w24,w24,w12	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w13,w13,#31
+	eor	w14,w14,w16
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w14,w14,w6
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w14,w14,w11
+	add	w23,w23,w13	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w14,w14,#31
+	eor	w15,w15,w17
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w15,w15,w7
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w15,w15,w12
+	add	w22,w22,w14	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w15,w15,#31
+	eor	w16,w16,w19
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w16,w16,w8
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w16,w16,w13
+	add	w21,w21,w15	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w16,w16,#31
+	eor	w17,w17,w3
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w17,w17,w9
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w17,w17,w14
+	add	w20,w20,w16	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w17,w17,#31
+	eor	w19,w19,w4
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w19,w19,w10
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w19,w19,w15
+	add	w24,w24,w17	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w19,w19,#31
+	eor	w3,w3,w5
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w3,w3,w11
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w3,w3,w16
+	add	w23,w23,w19	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w3,w3,#31
+	eor	w4,w4,w6
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w4,w4,w12
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w4,w4,w17
+	add	w22,w22,w3	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w4,w4,#31
+	eor	w5,w5,w7
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w5,w5,w13
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w5,w5,w19
+	add	w21,w21,w4	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w5,w5,#31
+	eor	w6,w6,w8
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w6,w6,w14
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w6,w6,w3
+	add	w20,w20,w5	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w6,w6,#31
+	eor	w7,w7,w9
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w7,w7,w15
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w7,w7,w4
+	add	w24,w24,w6	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w7,w7,#31
+	eor	w8,w8,w10
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w8,w8,w16
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w8,w8,w5
+	add	w23,w23,w7	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w8,w8,#31
+	eor	w9,w9,w11
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w9,w9,w17
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w9,w9,w6
+	add	w22,w22,w8	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w9,w9,#31
+	eor	w10,w10,w12
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w10,w10,w19
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w10,w10,w7
+	add	w21,w21,w9	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w10,w10,#31
+	eor	w11,w11,w13
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w11,w11,w3
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w11,w11,w8
+	add	w20,w20,w10	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w11,w11,#31
+	movz	w28,#0xbcdc
+	movk	w28,#0x8f1b,lsl#16
+	eor	w12,w12,w14
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w12,w12,w4
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w12,w12,w9
+	add	w24,w24,w11	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w12,w12,#31
+	orr	w25,w21,w22
+	and	w26,w21,w22
+	eor	w13,w13,w15
+	ror	w27,w20,#27
+	and	w25,w25,w23
+	add	w23,w23,w28		// future e+=K
+	eor	w13,w13,w5
+	add	w24,w24,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w21,w21,#2
+	eor	w13,w13,w10
+	add	w23,w23,w12	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w13,w13,#31
+	orr	w25,w20,w21
+	and	w26,w20,w21
+	eor	w14,w14,w16
+	ror	w27,w24,#27
+	and	w25,w25,w22
+	add	w22,w22,w28		// future e+=K
+	eor	w14,w14,w6
+	add	w23,w23,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w20,w20,#2
+	eor	w14,w14,w11
+	add	w22,w22,w13	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w14,w14,#31
+	orr	w25,w24,w20
+	and	w26,w24,w20
+	eor	w15,w15,w17
+	ror	w27,w23,#27
+	and	w25,w25,w21
+	add	w21,w21,w28		// future e+=K
+	eor	w15,w15,w7
+	add	w22,w22,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w24,w24,#2
+	eor	w15,w15,w12
+	add	w21,w21,w14	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w15,w15,#31
+	orr	w25,w23,w24
+	and	w26,w23,w24
+	eor	w16,w16,w19
+	ror	w27,w22,#27
+	and	w25,w25,w20
+	add	w20,w20,w28		// future e+=K
+	eor	w16,w16,w8
+	add	w21,w21,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w23,w23,#2
+	eor	w16,w16,w13
+	add	w20,w20,w15	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w16,w16,#31
+	orr	w25,w22,w23
+	and	w26,w22,w23
+	eor	w17,w17,w3
+	ror	w27,w21,#27
+	and	w25,w25,w24
+	add	w24,w24,w28		// future e+=K
+	eor	w17,w17,w9
+	add	w20,w20,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w22,w22,#2
+	eor	w17,w17,w14
+	add	w24,w24,w16	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w17,w17,#31
+	orr	w25,w21,w22
+	and	w26,w21,w22
+	eor	w19,w19,w4
+	ror	w27,w20,#27
+	and	w25,w25,w23
+	add	w23,w23,w28		// future e+=K
+	eor	w19,w19,w10
+	add	w24,w24,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w21,w21,#2
+	eor	w19,w19,w15
+	add	w23,w23,w17	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w19,w19,#31
+	orr	w25,w20,w21
+	and	w26,w20,w21
+	eor	w3,w3,w5
+	ror	w27,w24,#27
+	and	w25,w25,w22
+	add	w22,w22,w28		// future e+=K
+	eor	w3,w3,w11
+	add	w23,w23,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w20,w20,#2
+	eor	w3,w3,w16
+	add	w22,w22,w19	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w3,w3,#31
+	orr	w25,w24,w20
+	and	w26,w24,w20
+	eor	w4,w4,w6
+	ror	w27,w23,#27
+	and	w25,w25,w21
+	add	w21,w21,w28		// future e+=K
+	eor	w4,w4,w12
+	add	w22,w22,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w24,w24,#2
+	eor	w4,w4,w17
+	add	w21,w21,w3	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w4,w4,#31
+	orr	w25,w23,w24
+	and	w26,w23,w24
+	eor	w5,w5,w7
+	ror	w27,w22,#27
+	and	w25,w25,w20
+	add	w20,w20,w28		// future e+=K
+	eor	w5,w5,w13
+	add	w21,w21,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w23,w23,#2
+	eor	w5,w5,w19
+	add	w20,w20,w4	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w5,w5,#31
+	orr	w25,w22,w23
+	and	w26,w22,w23
+	eor	w6,w6,w8
+	ror	w27,w21,#27
+	and	w25,w25,w24
+	add	w24,w24,w28		// future e+=K
+	eor	w6,w6,w14
+	add	w20,w20,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w22,w22,#2
+	eor	w6,w6,w3
+	add	w24,w24,w5	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w6,w6,#31
+	orr	w25,w21,w22
+	and	w26,w21,w22
+	eor	w7,w7,w9
+	ror	w27,w20,#27
+	and	w25,w25,w23
+	add	w23,w23,w28		// future e+=K
+	eor	w7,w7,w15
+	add	w24,w24,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w21,w21,#2
+	eor	w7,w7,w4
+	add	w23,w23,w6	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w7,w7,#31
+	orr	w25,w20,w21
+	and	w26,w20,w21
+	eor	w8,w8,w10
+	ror	w27,w24,#27
+	and	w25,w25,w22
+	add	w22,w22,w28		// future e+=K
+	eor	w8,w8,w16
+	add	w23,w23,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w20,w20,#2
+	eor	w8,w8,w5
+	add	w22,w22,w7	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w8,w8,#31
+	orr	w25,w24,w20
+	and	w26,w24,w20
+	eor	w9,w9,w11
+	ror	w27,w23,#27
+	and	w25,w25,w21
+	add	w21,w21,w28		// future e+=K
+	eor	w9,w9,w17
+	add	w22,w22,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w24,w24,#2
+	eor	w9,w9,w6
+	add	w21,w21,w8	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w9,w9,#31
+	orr	w25,w23,w24
+	and	w26,w23,w24
+	eor	w10,w10,w12
+	ror	w27,w22,#27
+	and	w25,w25,w20
+	add	w20,w20,w28		// future e+=K
+	eor	w10,w10,w19
+	add	w21,w21,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w23,w23,#2
+	eor	w10,w10,w7
+	add	w20,w20,w9	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w10,w10,#31
+	orr	w25,w22,w23
+	and	w26,w22,w23
+	eor	w11,w11,w13
+	ror	w27,w21,#27
+	and	w25,w25,w24
+	add	w24,w24,w28		// future e+=K
+	eor	w11,w11,w3
+	add	w20,w20,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w22,w22,#2
+	eor	w11,w11,w8
+	add	w24,w24,w10	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w11,w11,#31
+	orr	w25,w21,w22
+	and	w26,w21,w22
+	eor	w12,w12,w14
+	ror	w27,w20,#27
+	and	w25,w25,w23
+	add	w23,w23,w28		// future e+=K
+	eor	w12,w12,w4
+	add	w24,w24,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w21,w21,#2
+	eor	w12,w12,w9
+	add	w23,w23,w11	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w12,w12,#31
+	orr	w25,w20,w21
+	and	w26,w20,w21
+	eor	w13,w13,w15
+	ror	w27,w24,#27
+	and	w25,w25,w22
+	add	w22,w22,w28		// future e+=K
+	eor	w13,w13,w5
+	add	w23,w23,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w20,w20,#2
+	eor	w13,w13,w10
+	add	w22,w22,w12	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w13,w13,#31
+	orr	w25,w24,w20
+	and	w26,w24,w20
+	eor	w14,w14,w16
+	ror	w27,w23,#27
+	and	w25,w25,w21
+	add	w21,w21,w28		// future e+=K
+	eor	w14,w14,w6
+	add	w22,w22,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w24,w24,#2
+	eor	w14,w14,w11
+	add	w21,w21,w13	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w14,w14,#31
+	orr	w25,w23,w24
+	and	w26,w23,w24
+	eor	w15,w15,w17
+	ror	w27,w22,#27
+	and	w25,w25,w20
+	add	w20,w20,w28		// future e+=K
+	eor	w15,w15,w7
+	add	w21,w21,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w23,w23,#2
+	eor	w15,w15,w12
+	add	w20,w20,w14	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w15,w15,#31
+	movz	w28,#0xc1d6
+	movk	w28,#0xca62,lsl#16
+	orr	w25,w22,w23
+	and	w26,w22,w23
+	eor	w16,w16,w19
+	ror	w27,w21,#27
+	and	w25,w25,w24
+	add	w24,w24,w28		// future e+=K
+	eor	w16,w16,w8
+	add	w20,w20,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w22,w22,#2
+	eor	w16,w16,w13
+	add	w24,w24,w15	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w16,w16,#31
+	eor	w17,w17,w3
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w17,w17,w9
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w17,w17,w14
+	add	w23,w23,w16	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w17,w17,#31
+	eor	w19,w19,w4
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w19,w19,w10
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w19,w19,w15
+	add	w22,w22,w17	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w19,w19,#31
+	eor	w3,w3,w5
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w3,w3,w11
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w3,w3,w16
+	add	w21,w21,w19	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w3,w3,#31
+	eor	w4,w4,w6
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w4,w4,w12
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w4,w4,w17
+	add	w20,w20,w3	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w4,w4,#31
+	eor	w5,w5,w7
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w5,w5,w13
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w5,w5,w19
+	add	w24,w24,w4	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w5,w5,#31
+	eor	w6,w6,w8
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w6,w6,w14
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w6,w6,w3
+	add	w23,w23,w5	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w6,w6,#31
+	eor	w7,w7,w9
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w7,w7,w15
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w7,w7,w4
+	add	w22,w22,w6	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w7,w7,#31
+	eor	w8,w8,w10
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w8,w8,w16
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w8,w8,w5
+	add	w21,w21,w7	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w8,w8,#31
+	eor	w9,w9,w11
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w9,w9,w17
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w9,w9,w6
+	add	w20,w20,w8	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w9,w9,#31
+	eor	w10,w10,w12
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w10,w10,w19
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w10,w10,w7
+	add	w24,w24,w9	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w10,w10,#31
+	eor	w11,w11,w13
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w11,w11,w3
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w11,w11,w8
+	add	w23,w23,w10	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w11,w11,#31
+	eor	w12,w12,w14
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w12,w12,w4
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w12,w12,w9
+	add	w22,w22,w11	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w12,w12,#31
+	eor	w13,w13,w15
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w13,w13,w5
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w13,w13,w10
+	add	w21,w21,w12	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w13,w13,#31
+	eor	w14,w14,w16
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w14,w14,w6
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w14,w14,w11
+	add	w20,w20,w13	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w14,w14,#31
+	eor	w15,w15,w17
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w15,w15,w7
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w15,w15,w12
+	add	w24,w24,w14	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w15,w15,#31
+	eor	w16,w16,w19
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w16,w16,w8
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w16,w16,w13
+	add	w23,w23,w15	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w16,w16,#31
+	eor	w17,w17,w3
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w17,w17,w9
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w17,w17,w14
+	add	w22,w22,w16	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w17,w17,#31
+	eor	w19,w19,w4
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w19,w19,w10
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w19,w19,w15
+	add	w21,w21,w17	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w19,w19,#31
+	ldp	w4,w5,[x0]
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	add	w20,w20,w19	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ldp	w6,w7,[x0,#8]
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	ldr	w8,[x0,#16]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	add	w21,w21,w5
+	add	w22,w22,w6
+	add	w20,w20,w4
+	add	w23,w23,w7
+	add	w24,w24,w8
+	stp	w20,w21,[x0]
+	stp	w22,w23,[x0,#8]
+	str	w24,[x0,#16]
+	cbnz	x2,Loop
+
+	ldp	x19,x20,[sp,#16]
+	ldp	x21,x22,[sp,#32]
+	ldp	x23,x24,[sp,#48]
+	ldp	x25,x26,[sp,#64]
+	ldp	x27,x28,[sp,#80]
+	ldr	x29,[sp],#96
+	ret
+
+.globl	sha1_block_data_order_hw
+
+.def sha1_block_data_order_hw
+   .type 32
+.endef
+.align	6
+sha1_block_data_order_hw:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	adrp	x4,Lconst
+	add	x4,x4,:lo12:Lconst
+	eor	v1.16b,v1.16b,v1.16b
+	ld1	{v0.4s},[x0],#16
+	ld1	{v1.s}[0],[x0]
+	sub	x0,x0,#16
+	ld1	{v16.4s,v17.4s,v18.4s,v19.4s},[x4]
+
+Loop_hw:
+	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+	sub	x2,x2,#1
+	rev32	v4.16b,v4.16b
+	rev32	v5.16b,v5.16b
+
+	add	v20.4s,v16.4s,v4.4s
+	rev32	v6.16b,v6.16b
+	orr	v22.16b,v0.16b,v0.16b	// offload
+
+	add	v21.4s,v16.4s,v5.4s
+	rev32	v7.16b,v7.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b
+.long	0x5e140020	//sha1c v0.16b,v1.16b,v20.4s		// 0
+	add	v20.4s,v16.4s,v6.4s
+.long	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 1
+.long	0x5e150060	//sha1c v0.16b,v3.16b,v21.4s
+	add	v21.4s,v16.4s,v7.4s
+.long	0x5e2818e4	//sha1su1 v4.16b,v7.16b
+.long	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 2
+.long	0x5e140040	//sha1c v0.16b,v2.16b,v20.4s
+	add	v20.4s,v16.4s,v4.4s
+.long	0x5e281885	//sha1su1 v5.16b,v4.16b
+.long	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 3
+.long	0x5e150060	//sha1c v0.16b,v3.16b,v21.4s
+	add	v21.4s,v17.4s,v5.4s
+.long	0x5e2818a6	//sha1su1 v6.16b,v5.16b
+.long	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 4
+.long	0x5e140040	//sha1c v0.16b,v2.16b,v20.4s
+	add	v20.4s,v17.4s,v6.4s
+.long	0x5e2818c7	//sha1su1 v7.16b,v6.16b
+.long	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 5
+.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v17.4s,v7.4s
+.long	0x5e2818e4	//sha1su1 v4.16b,v7.16b
+.long	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 6
+.long	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
+	add	v20.4s,v17.4s,v4.4s
+.long	0x5e281885	//sha1su1 v5.16b,v4.16b
+.long	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 7
+.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v17.4s,v5.4s
+.long	0x5e2818a6	//sha1su1 v6.16b,v5.16b
+.long	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 8
+.long	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
+	add	v20.4s,v18.4s,v6.4s
+.long	0x5e2818c7	//sha1su1 v7.16b,v6.16b
+.long	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 9
+.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v18.4s,v7.4s
+.long	0x5e2818e4	//sha1su1 v4.16b,v7.16b
+.long	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 10
+.long	0x5e142040	//sha1m v0.16b,v2.16b,v20.4s
+	add	v20.4s,v18.4s,v4.4s
+.long	0x5e281885	//sha1su1 v5.16b,v4.16b
+.long	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 11
+.long	0x5e152060	//sha1m v0.16b,v3.16b,v21.4s
+	add	v21.4s,v18.4s,v5.4s
+.long	0x5e2818a6	//sha1su1 v6.16b,v5.16b
+.long	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 12
+.long	0x5e142040	//sha1m v0.16b,v2.16b,v20.4s
+	add	v20.4s,v18.4s,v6.4s
+.long	0x5e2818c7	//sha1su1 v7.16b,v6.16b
+.long	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 13
+.long	0x5e152060	//sha1m v0.16b,v3.16b,v21.4s
+	add	v21.4s,v19.4s,v7.4s
+.long	0x5e2818e4	//sha1su1 v4.16b,v7.16b
+.long	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 14
+.long	0x5e142040	//sha1m v0.16b,v2.16b,v20.4s
+	add	v20.4s,v19.4s,v4.4s
+.long	0x5e281885	//sha1su1 v5.16b,v4.16b
+.long	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 15
+.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v19.4s,v5.4s
+.long	0x5e2818a6	//sha1su1 v6.16b,v5.16b
+.long	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 16
+.long	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
+	add	v20.4s,v19.4s,v6.4s
+.long	0x5e2818c7	//sha1su1 v7.16b,v6.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 17
+.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v19.4s,v7.4s
+
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 18
+.long	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
+
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 19
+.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+
+	add	v1.4s,v1.4s,v2.4s
+	add	v0.4s,v0.4s,v22.4s
+
+	cbnz	x2,Loop_hw
+
+	st1	{v0.4s},[x0],#16
+	st1	{v1.s}[0],[x0]
+
+	ldr	x29,[sp],#16
+	ret
+
+.section	.rodata
+.align	6
+Lconst:
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	//K_00_19
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	//K_20_39
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	//K_40_59
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	//K_60_79
+.byte	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/bcm/sha1-x86_64-apple.S b/gen/bcm/sha1-x86_64-apple.S
new file mode 100644
index 0000000..a1ea1e6
--- /dev/null
+++ b/gen/bcm/sha1-x86_64-apple.S
@@ -0,0 +1,5450 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text	
+
+.globl	_sha1_block_data_order_nohw
+.private_extern _sha1_block_data_order_nohw
+
+.p2align	4
+_sha1_block_data_order_nohw:
+
+_CET_ENDBR
+	movq	%rsp,%rax
+
+	pushq	%rbx
+
+	pushq	%rbp
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	movq	%rdi,%r8
+	subq	$72,%rsp
+	movq	%rsi,%r9
+	andq	$-64,%rsp
+	movq	%rdx,%r10
+	movq	%rax,64(%rsp)
+
+L$prologue:
+
+	movl	0(%r8),%esi
+	movl	4(%r8),%edi
+	movl	8(%r8),%r11d
+	movl	12(%r8),%r12d
+	movl	16(%r8),%r13d
+	jmp	L$loop
+
+.p2align	4
+L$loop:
+	movl	0(%r9),%edx
+	bswapl	%edx
+	movl	4(%r9),%ebp
+	movl	%r12d,%eax
+	movl	%edx,0(%rsp)
+	movl	%esi,%ecx
+	bswapl	%ebp
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	andl	%edi,%eax
+	leal	1518500249(%rdx,%r13,1),%r13d
+	addl	%ecx,%r13d
+	xorl	%r12d,%eax
+	roll	$30,%edi
+	addl	%eax,%r13d
+	movl	8(%r9),%r14d
+	movl	%r11d,%eax
+	movl	%ebp,4(%rsp)
+	movl	%r13d,%ecx
+	bswapl	%r14d
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	andl	%esi,%eax
+	leal	1518500249(%rbp,%r12,1),%r12d
+	addl	%ecx,%r12d
+	xorl	%r11d,%eax
+	roll	$30,%esi
+	addl	%eax,%r12d
+	movl	12(%r9),%edx
+	movl	%edi,%eax
+	movl	%r14d,8(%rsp)
+	movl	%r12d,%ecx
+	bswapl	%edx
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	andl	%r13d,%eax
+	leal	1518500249(%r14,%r11,1),%r11d
+	addl	%ecx,%r11d
+	xorl	%edi,%eax
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	movl	16(%r9),%ebp
+	movl	%esi,%eax
+	movl	%edx,12(%rsp)
+	movl	%r11d,%ecx
+	bswapl	%ebp
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	andl	%r12d,%eax
+	leal	1518500249(%rdx,%rdi,1),%edi
+	addl	%ecx,%edi
+	xorl	%esi,%eax
+	roll	$30,%r12d
+	addl	%eax,%edi
+	movl	20(%r9),%r14d
+	movl	%r13d,%eax
+	movl	%ebp,16(%rsp)
+	movl	%edi,%ecx
+	bswapl	%r14d
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	andl	%r11d,%eax
+	leal	1518500249(%rbp,%rsi,1),%esi
+	addl	%ecx,%esi
+	xorl	%r13d,%eax
+	roll	$30,%r11d
+	addl	%eax,%esi
+	movl	24(%r9),%edx
+	movl	%r12d,%eax
+	movl	%r14d,20(%rsp)
+	movl	%esi,%ecx
+	bswapl	%edx
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	andl	%edi,%eax
+	leal	1518500249(%r14,%r13,1),%r13d
+	addl	%ecx,%r13d
+	xorl	%r12d,%eax
+	roll	$30,%edi
+	addl	%eax,%r13d
+	movl	28(%r9),%ebp
+	movl	%r11d,%eax
+	movl	%edx,24(%rsp)
+	movl	%r13d,%ecx
+	bswapl	%ebp
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	andl	%esi,%eax
+	leal	1518500249(%rdx,%r12,1),%r12d
+	addl	%ecx,%r12d
+	xorl	%r11d,%eax
+	roll	$30,%esi
+	addl	%eax,%r12d
+	movl	32(%r9),%r14d
+	movl	%edi,%eax
+	movl	%ebp,28(%rsp)
+	movl	%r12d,%ecx
+	bswapl	%r14d
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	andl	%r13d,%eax
+	leal	1518500249(%rbp,%r11,1),%r11d
+	addl	%ecx,%r11d
+	xorl	%edi,%eax
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	movl	36(%r9),%edx
+	movl	%esi,%eax
+	movl	%r14d,32(%rsp)
+	movl	%r11d,%ecx
+	bswapl	%edx
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	andl	%r12d,%eax
+	leal	1518500249(%r14,%rdi,1),%edi
+	addl	%ecx,%edi
+	xorl	%esi,%eax
+	roll	$30,%r12d
+	addl	%eax,%edi
+	movl	40(%r9),%ebp
+	movl	%r13d,%eax
+	movl	%edx,36(%rsp)
+	movl	%edi,%ecx
+	bswapl	%ebp
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	andl	%r11d,%eax
+	leal	1518500249(%rdx,%rsi,1),%esi
+	addl	%ecx,%esi
+	xorl	%r13d,%eax
+	roll	$30,%r11d
+	addl	%eax,%esi
+	movl	44(%r9),%r14d
+	movl	%r12d,%eax
+	movl	%ebp,40(%rsp)
+	movl	%esi,%ecx
+	bswapl	%r14d
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	andl	%edi,%eax
+	leal	1518500249(%rbp,%r13,1),%r13d
+	addl	%ecx,%r13d
+	xorl	%r12d,%eax
+	roll	$30,%edi
+	addl	%eax,%r13d
+	movl	48(%r9),%edx
+	movl	%r11d,%eax
+	movl	%r14d,44(%rsp)
+	movl	%r13d,%ecx
+	bswapl	%edx
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	andl	%esi,%eax
+	leal	1518500249(%r14,%r12,1),%r12d
+	addl	%ecx,%r12d
+	xorl	%r11d,%eax
+	roll	$30,%esi
+	addl	%eax,%r12d
+	movl	52(%r9),%ebp
+	movl	%edi,%eax
+	movl	%edx,48(%rsp)
+	movl	%r12d,%ecx
+	bswapl	%ebp
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	andl	%r13d,%eax
+	leal	1518500249(%rdx,%r11,1),%r11d
+	addl	%ecx,%r11d
+	xorl	%edi,%eax
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	movl	56(%r9),%r14d
+	movl	%esi,%eax
+	movl	%ebp,52(%rsp)
+	movl	%r11d,%ecx
+	bswapl	%r14d
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	andl	%r12d,%eax
+	leal	1518500249(%rbp,%rdi,1),%edi
+	addl	%ecx,%edi
+	xorl	%esi,%eax
+	roll	$30,%r12d
+	addl	%eax,%edi
+	movl	60(%r9),%edx
+	movl	%r13d,%eax
+	movl	%r14d,56(%rsp)
+	movl	%edi,%ecx
+	bswapl	%edx
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	andl	%r11d,%eax
+	leal	1518500249(%r14,%rsi,1),%esi
+	addl	%ecx,%esi
+	xorl	%r13d,%eax
+	roll	$30,%r11d
+	addl	%eax,%esi
+	xorl	0(%rsp),%ebp
+	movl	%r12d,%eax
+	movl	%edx,60(%rsp)
+	movl	%esi,%ecx
+	xorl	8(%rsp),%ebp
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	32(%rsp),%ebp
+	andl	%edi,%eax
+	leal	1518500249(%rdx,%r13,1),%r13d
+	roll	$30,%edi
+	xorl	%r12d,%eax
+	addl	%ecx,%r13d
+	roll	$1,%ebp
+	addl	%eax,%r13d
+	xorl	4(%rsp),%r14d
+	movl	%r11d,%eax
+	movl	%ebp,0(%rsp)
+	movl	%r13d,%ecx
+	xorl	12(%rsp),%r14d
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	36(%rsp),%r14d
+	andl	%esi,%eax
+	leal	1518500249(%rbp,%r12,1),%r12d
+	roll	$30,%esi
+	xorl	%r11d,%eax
+	addl	%ecx,%r12d
+	roll	$1,%r14d
+	addl	%eax,%r12d
+	xorl	8(%rsp),%edx
+	movl	%edi,%eax
+	movl	%r14d,4(%rsp)
+	movl	%r12d,%ecx
+	xorl	16(%rsp),%edx
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	40(%rsp),%edx
+	andl	%r13d,%eax
+	leal	1518500249(%r14,%r11,1),%r11d
+	roll	$30,%r13d
+	xorl	%edi,%eax
+	addl	%ecx,%r11d
+	roll	$1,%edx
+	addl	%eax,%r11d
+	xorl	12(%rsp),%ebp
+	movl	%esi,%eax
+	movl	%edx,8(%rsp)
+	movl	%r11d,%ecx
+	xorl	20(%rsp),%ebp
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	xorl	44(%rsp),%ebp
+	andl	%r12d,%eax
+	leal	1518500249(%rdx,%rdi,1),%edi
+	roll	$30,%r12d
+	xorl	%esi,%eax
+	addl	%ecx,%edi
+	roll	$1,%ebp
+	addl	%eax,%edi
+	xorl	16(%rsp),%r14d
+	movl	%r13d,%eax
+	movl	%ebp,12(%rsp)
+	movl	%edi,%ecx
+	xorl	24(%rsp),%r14d
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	48(%rsp),%r14d
+	andl	%r11d,%eax
+	leal	1518500249(%rbp,%rsi,1),%esi
+	roll	$30,%r11d
+	xorl	%r13d,%eax
+	addl	%ecx,%esi
+	roll	$1,%r14d
+	addl	%eax,%esi
+	xorl	20(%rsp),%edx
+	movl	%edi,%eax
+	movl	%r14d,16(%rsp)
+	movl	%esi,%ecx
+	xorl	28(%rsp),%edx
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	52(%rsp),%edx
+	leal	1859775393(%r14,%r13,1),%r13d
+	xorl	%r11d,%eax
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%edx
+	xorl	24(%rsp),%ebp
+	movl	%esi,%eax
+	movl	%edx,20(%rsp)
+	movl	%r13d,%ecx
+	xorl	32(%rsp),%ebp
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	56(%rsp),%ebp
+	leal	1859775393(%rdx,%r12,1),%r12d
+	xorl	%edi,%eax
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%ebp
+	xorl	28(%rsp),%r14d
+	movl	%r13d,%eax
+	movl	%ebp,24(%rsp)
+	movl	%r12d,%ecx
+	xorl	36(%rsp),%r14d
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	60(%rsp),%r14d
+	leal	1859775393(%rbp,%r11,1),%r11d
+	xorl	%esi,%eax
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%r14d
+	xorl	32(%rsp),%edx
+	movl	%r12d,%eax
+	movl	%r14d,28(%rsp)
+	movl	%r11d,%ecx
+	xorl	40(%rsp),%edx
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	0(%rsp),%edx
+	leal	1859775393(%r14,%rdi,1),%edi
+	xorl	%r13d,%eax
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%edx
+	xorl	36(%rsp),%ebp
+	movl	%r11d,%eax
+	movl	%edx,32(%rsp)
+	movl	%edi,%ecx
+	xorl	44(%rsp),%ebp
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	xorl	4(%rsp),%ebp
+	leal	1859775393(%rdx,%rsi,1),%esi
+	xorl	%r12d,%eax
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%ebp
+	xorl	40(%rsp),%r14d
+	movl	%edi,%eax
+	movl	%ebp,36(%rsp)
+	movl	%esi,%ecx
+	xorl	48(%rsp),%r14d
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	8(%rsp),%r14d
+	leal	1859775393(%rbp,%r13,1),%r13d
+	xorl	%r11d,%eax
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%r14d
+	xorl	44(%rsp),%edx
+	movl	%esi,%eax
+	movl	%r14d,40(%rsp)
+	movl	%r13d,%ecx
+	xorl	52(%rsp),%edx
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	12(%rsp),%edx
+	leal	1859775393(%r14,%r12,1),%r12d
+	xorl	%edi,%eax
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%edx
+	xorl	48(%rsp),%ebp
+	movl	%r13d,%eax
+	movl	%edx,44(%rsp)
+	movl	%r12d,%ecx
+	xorl	56(%rsp),%ebp
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	16(%rsp),%ebp
+	leal	1859775393(%rdx,%r11,1),%r11d
+	xorl	%esi,%eax
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%ebp
+	xorl	52(%rsp),%r14d
+	movl	%r12d,%eax
+	movl	%ebp,48(%rsp)
+	movl	%r11d,%ecx
+	xorl	60(%rsp),%r14d
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	20(%rsp),%r14d
+	leal	1859775393(%rbp,%rdi,1),%edi
+	xorl	%r13d,%eax
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%r14d
+	xorl	56(%rsp),%edx
+	movl	%r11d,%eax
+	movl	%r14d,52(%rsp)
+	movl	%edi,%ecx
+	xorl	0(%rsp),%edx
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	xorl	24(%rsp),%edx
+	leal	1859775393(%r14,%rsi,1),%esi
+	xorl	%r12d,%eax
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%edx
+	xorl	60(%rsp),%ebp
+	movl	%edi,%eax
+	movl	%edx,56(%rsp)
+	movl	%esi,%ecx
+	xorl	4(%rsp),%ebp
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	28(%rsp),%ebp
+	leal	1859775393(%rdx,%r13,1),%r13d
+	xorl	%r11d,%eax
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%ebp
+	xorl	0(%rsp),%r14d
+	movl	%esi,%eax
+	movl	%ebp,60(%rsp)
+	movl	%r13d,%ecx
+	xorl	8(%rsp),%r14d
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	32(%rsp),%r14d
+	leal	1859775393(%rbp,%r12,1),%r12d
+	xorl	%edi,%eax
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%r14d
+	xorl	4(%rsp),%edx
+	movl	%r13d,%eax
+	movl	%r14d,0(%rsp)
+	movl	%r12d,%ecx
+	xorl	12(%rsp),%edx
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	36(%rsp),%edx
+	leal	1859775393(%r14,%r11,1),%r11d
+	xorl	%esi,%eax
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%edx
+	xorl	8(%rsp),%ebp
+	movl	%r12d,%eax
+	movl	%edx,4(%rsp)
+	movl	%r11d,%ecx
+	xorl	16(%rsp),%ebp
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	40(%rsp),%ebp
+	leal	1859775393(%rdx,%rdi,1),%edi
+	xorl	%r13d,%eax
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%ebp
+	xorl	12(%rsp),%r14d
+	movl	%r11d,%eax
+	movl	%ebp,8(%rsp)
+	movl	%edi,%ecx
+	xorl	20(%rsp),%r14d
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	xorl	44(%rsp),%r14d
+	leal	1859775393(%rbp,%rsi,1),%esi
+	xorl	%r12d,%eax
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%r14d
+	xorl	16(%rsp),%edx
+	movl	%edi,%eax
+	movl	%r14d,12(%rsp)
+	movl	%esi,%ecx
+	xorl	24(%rsp),%edx
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	48(%rsp),%edx
+	leal	1859775393(%r14,%r13,1),%r13d
+	xorl	%r11d,%eax
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%edx
+	xorl	20(%rsp),%ebp
+	movl	%esi,%eax
+	movl	%edx,16(%rsp)
+	movl	%r13d,%ecx
+	xorl	28(%rsp),%ebp
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	52(%rsp),%ebp
+	leal	1859775393(%rdx,%r12,1),%r12d
+	xorl	%edi,%eax
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%ebp
+	xorl	24(%rsp),%r14d
+	movl	%r13d,%eax
+	movl	%ebp,20(%rsp)
+	movl	%r12d,%ecx
+	xorl	32(%rsp),%r14d
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	56(%rsp),%r14d
+	leal	1859775393(%rbp,%r11,1),%r11d
+	xorl	%esi,%eax
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%r14d
+	xorl	28(%rsp),%edx
+	movl	%r12d,%eax
+	movl	%r14d,24(%rsp)
+	movl	%r11d,%ecx
+	xorl	36(%rsp),%edx
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	60(%rsp),%edx
+	leal	1859775393(%r14,%rdi,1),%edi
+	xorl	%r13d,%eax
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%edx
+	xorl	32(%rsp),%ebp
+	movl	%r11d,%eax
+	movl	%edx,28(%rsp)
+	movl	%edi,%ecx
+	xorl	40(%rsp),%ebp
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	xorl	0(%rsp),%ebp
+	leal	1859775393(%rdx,%rsi,1),%esi
+	xorl	%r12d,%eax
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%ebp
+	xorl	36(%rsp),%r14d
+	movl	%r12d,%eax
+	movl	%ebp,32(%rsp)
+	movl	%r12d,%ebx
+	xorl	44(%rsp),%r14d
+	andl	%r11d,%eax
+	movl	%esi,%ecx
+	xorl	4(%rsp),%r14d
+	leal	-1894007588(%rbp,%r13,1),%r13d
+	xorl	%r11d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r13d
+	roll	$1,%r14d
+	andl	%edi,%ebx
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%ebx,%r13d
+	xorl	40(%rsp),%edx
+	movl	%r11d,%eax
+	movl	%r14d,36(%rsp)
+	movl	%r11d,%ebx
+	xorl	48(%rsp),%edx
+	andl	%edi,%eax
+	movl	%r13d,%ecx
+	xorl	8(%rsp),%edx
+	leal	-1894007588(%r14,%r12,1),%r12d
+	xorl	%edi,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r12d
+	roll	$1,%edx
+	andl	%esi,%ebx
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%ebx,%r12d
+	xorl	44(%rsp),%ebp
+	movl	%edi,%eax
+	movl	%edx,40(%rsp)
+	movl	%edi,%ebx
+	xorl	52(%rsp),%ebp
+	andl	%esi,%eax
+	movl	%r12d,%ecx
+	xorl	12(%rsp),%ebp
+	leal	-1894007588(%rdx,%r11,1),%r11d
+	xorl	%esi,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r11d
+	roll	$1,%ebp
+	andl	%r13d,%ebx
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%ebx,%r11d
+	xorl	48(%rsp),%r14d
+	movl	%esi,%eax
+	movl	%ebp,44(%rsp)
+	movl	%esi,%ebx
+	xorl	56(%rsp),%r14d
+	andl	%r13d,%eax
+	movl	%r11d,%ecx
+	xorl	16(%rsp),%r14d
+	leal	-1894007588(%rbp,%rdi,1),%edi
+	xorl	%r13d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%edi
+	roll	$1,%r14d
+	andl	%r12d,%ebx
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%ebx,%edi
+	xorl	52(%rsp),%edx
+	movl	%r13d,%eax
+	movl	%r14d,48(%rsp)
+	movl	%r13d,%ebx
+	xorl	60(%rsp),%edx
+	andl	%r12d,%eax
+	movl	%edi,%ecx
+	xorl	20(%rsp),%edx
+	leal	-1894007588(%r14,%rsi,1),%esi
+	xorl	%r12d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%esi
+	roll	$1,%edx
+	andl	%r11d,%ebx
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%ebx,%esi
+	xorl	56(%rsp),%ebp
+	movl	%r12d,%eax
+	movl	%edx,52(%rsp)
+	movl	%r12d,%ebx
+	xorl	0(%rsp),%ebp
+	andl	%r11d,%eax
+	movl	%esi,%ecx
+	xorl	24(%rsp),%ebp
+	leal	-1894007588(%rdx,%r13,1),%r13d
+	xorl	%r11d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r13d
+	roll	$1,%ebp
+	andl	%edi,%ebx
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%ebx,%r13d
+	xorl	60(%rsp),%r14d
+	movl	%r11d,%eax
+	movl	%ebp,56(%rsp)
+	movl	%r11d,%ebx
+	xorl	4(%rsp),%r14d
+	andl	%edi,%eax
+	movl	%r13d,%ecx
+	xorl	28(%rsp),%r14d
+	leal	-1894007588(%rbp,%r12,1),%r12d
+	xorl	%edi,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r12d
+	roll	$1,%r14d
+	andl	%esi,%ebx
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%ebx,%r12d
+	xorl	0(%rsp),%edx
+	movl	%edi,%eax
+	movl	%r14d,60(%rsp)
+	movl	%edi,%ebx
+	xorl	8(%rsp),%edx
+	andl	%esi,%eax
+	movl	%r12d,%ecx
+	xorl	32(%rsp),%edx
+	leal	-1894007588(%r14,%r11,1),%r11d
+	xorl	%esi,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r11d
+	roll	$1,%edx
+	andl	%r13d,%ebx
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%ebx,%r11d
+	xorl	4(%rsp),%ebp
+	movl	%esi,%eax
+	movl	%edx,0(%rsp)
+	movl	%esi,%ebx
+	xorl	12(%rsp),%ebp
+	andl	%r13d,%eax
+	movl	%r11d,%ecx
+	xorl	36(%rsp),%ebp
+	leal	-1894007588(%rdx,%rdi,1),%edi
+	xorl	%r13d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%edi
+	roll	$1,%ebp
+	andl	%r12d,%ebx
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%ebx,%edi
+	xorl	8(%rsp),%r14d
+	movl	%r13d,%eax
+	movl	%ebp,4(%rsp)
+	movl	%r13d,%ebx
+	xorl	16(%rsp),%r14d
+	andl	%r12d,%eax
+	movl	%edi,%ecx
+	xorl	40(%rsp),%r14d
+	leal	-1894007588(%rbp,%rsi,1),%esi
+	xorl	%r12d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%esi
+	roll	$1,%r14d
+	andl	%r11d,%ebx
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%ebx,%esi
+	xorl	12(%rsp),%edx
+	movl	%r12d,%eax
+	movl	%r14d,8(%rsp)
+	movl	%r12d,%ebx
+	xorl	20(%rsp),%edx
+	andl	%r11d,%eax
+	movl	%esi,%ecx
+	xorl	44(%rsp),%edx
+	leal	-1894007588(%r14,%r13,1),%r13d
+	xorl	%r11d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r13d
+	roll	$1,%edx
+	andl	%edi,%ebx
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%ebx,%r13d
+	xorl	16(%rsp),%ebp
+	movl	%r11d,%eax
+	movl	%edx,12(%rsp)
+	movl	%r11d,%ebx
+	xorl	24(%rsp),%ebp
+	andl	%edi,%eax
+	movl	%r13d,%ecx
+	xorl	48(%rsp),%ebp
+	leal	-1894007588(%rdx,%r12,1),%r12d
+	xorl	%edi,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r12d
+	roll	$1,%ebp
+	andl	%esi,%ebx
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%ebx,%r12d
+	xorl	20(%rsp),%r14d
+	movl	%edi,%eax
+	movl	%ebp,16(%rsp)
+	movl	%edi,%ebx
+	xorl	28(%rsp),%r14d
+	andl	%esi,%eax
+	movl	%r12d,%ecx
+	xorl	52(%rsp),%r14d
+	leal	-1894007588(%rbp,%r11,1),%r11d
+	xorl	%esi,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r11d
+	roll	$1,%r14d
+	andl	%r13d,%ebx
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%ebx,%r11d
+	xorl	24(%rsp),%edx
+	movl	%esi,%eax
+	movl	%r14d,20(%rsp)
+	movl	%esi,%ebx
+	xorl	32(%rsp),%edx
+	andl	%r13d,%eax
+	movl	%r11d,%ecx
+	xorl	56(%rsp),%edx
+	leal	-1894007588(%r14,%rdi,1),%edi
+	xorl	%r13d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%edi
+	roll	$1,%edx
+	andl	%r12d,%ebx
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%ebx,%edi
+	xorl	28(%rsp),%ebp
+	movl	%r13d,%eax
+	movl	%edx,24(%rsp)
+	movl	%r13d,%ebx
+	xorl	36(%rsp),%ebp
+	andl	%r12d,%eax
+	movl	%edi,%ecx
+	xorl	60(%rsp),%ebp
+	leal	-1894007588(%rdx,%rsi,1),%esi
+	xorl	%r12d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%esi
+	roll	$1,%ebp
+	andl	%r11d,%ebx
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%ebx,%esi
+	xorl	32(%rsp),%r14d
+	movl	%r12d,%eax
+	movl	%ebp,28(%rsp)
+	movl	%r12d,%ebx
+	xorl	40(%rsp),%r14d
+	andl	%r11d,%eax
+	movl	%esi,%ecx
+	xorl	0(%rsp),%r14d
+	leal	-1894007588(%rbp,%r13,1),%r13d
+	xorl	%r11d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r13d
+	roll	$1,%r14d
+	andl	%edi,%ebx
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%ebx,%r13d
+	xorl	36(%rsp),%edx
+	movl	%r11d,%eax
+	movl	%r14d,32(%rsp)
+	movl	%r11d,%ebx
+	xorl	44(%rsp),%edx
+	andl	%edi,%eax
+	movl	%r13d,%ecx
+	xorl	4(%rsp),%edx
+	leal	-1894007588(%r14,%r12,1),%r12d
+	xorl	%edi,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r12d
+	roll	$1,%edx
+	andl	%esi,%ebx
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%ebx,%r12d
+	xorl	40(%rsp),%ebp
+	movl	%edi,%eax
+	movl	%edx,36(%rsp)
+	movl	%edi,%ebx
+	xorl	48(%rsp),%ebp
+	andl	%esi,%eax
+	movl	%r12d,%ecx
+	xorl	8(%rsp),%ebp
+	leal	-1894007588(%rdx,%r11,1),%r11d
+	xorl	%esi,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r11d
+	roll	$1,%ebp
+	andl	%r13d,%ebx
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%ebx,%r11d
+	xorl	44(%rsp),%r14d
+	movl	%esi,%eax
+	movl	%ebp,40(%rsp)
+	movl	%esi,%ebx
+	xorl	52(%rsp),%r14d
+	andl	%r13d,%eax
+	movl	%r11d,%ecx
+	xorl	12(%rsp),%r14d
+	leal	-1894007588(%rbp,%rdi,1),%edi
+	xorl	%r13d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%edi
+	roll	$1,%r14d
+	andl	%r12d,%ebx
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%ebx,%edi
+	xorl	48(%rsp),%edx
+	movl	%r13d,%eax
+	movl	%r14d,44(%rsp)
+	movl	%r13d,%ebx
+	xorl	56(%rsp),%edx
+	andl	%r12d,%eax
+	movl	%edi,%ecx
+	xorl	16(%rsp),%edx
+	leal	-1894007588(%r14,%rsi,1),%esi
+	xorl	%r12d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%esi
+	roll	$1,%edx
+	andl	%r11d,%ebx
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%ebx,%esi
+	xorl	52(%rsp),%ebp
+	movl	%edi,%eax
+	movl	%edx,48(%rsp)
+	movl	%esi,%ecx
+	xorl	60(%rsp),%ebp
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	20(%rsp),%ebp
+	leal	-899497514(%rdx,%r13,1),%r13d
+	xorl	%r11d,%eax
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%ebp
+	xorl	56(%rsp),%r14d
+	movl	%esi,%eax
+	movl	%ebp,52(%rsp)
+	movl	%r13d,%ecx
+	xorl	0(%rsp),%r14d
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	24(%rsp),%r14d
+	leal	-899497514(%rbp,%r12,1),%r12d
+	xorl	%edi,%eax
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%r14d
+	xorl	60(%rsp),%edx
+	movl	%r13d,%eax
+	movl	%r14d,56(%rsp)
+	movl	%r12d,%ecx
+	xorl	4(%rsp),%edx
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	28(%rsp),%edx
+	leal	-899497514(%r14,%r11,1),%r11d
+	xorl	%esi,%eax
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%edx
+	xorl	0(%rsp),%ebp
+	movl	%r12d,%eax
+	movl	%edx,60(%rsp)
+	movl	%r11d,%ecx
+	xorl	8(%rsp),%ebp
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	32(%rsp),%ebp
+	leal	-899497514(%rdx,%rdi,1),%edi
+	xorl	%r13d,%eax
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%ebp
+	xorl	4(%rsp),%r14d
+	movl	%r11d,%eax
+	movl	%ebp,0(%rsp)
+	movl	%edi,%ecx
+	xorl	12(%rsp),%r14d
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	xorl	36(%rsp),%r14d
+	leal	-899497514(%rbp,%rsi,1),%esi
+	xorl	%r12d,%eax
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%r14d
+	xorl	8(%rsp),%edx
+	movl	%edi,%eax
+	movl	%r14d,4(%rsp)
+	movl	%esi,%ecx
+	xorl	16(%rsp),%edx
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	40(%rsp),%edx
+	leal	-899497514(%r14,%r13,1),%r13d
+	xorl	%r11d,%eax
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%edx
+	xorl	12(%rsp),%ebp
+	movl	%esi,%eax
+	movl	%edx,8(%rsp)
+	movl	%r13d,%ecx
+	xorl	20(%rsp),%ebp
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	44(%rsp),%ebp
+	leal	-899497514(%rdx,%r12,1),%r12d
+	xorl	%edi,%eax
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%ebp
+	xorl	16(%rsp),%r14d
+	movl	%r13d,%eax
+	movl	%ebp,12(%rsp)
+	movl	%r12d,%ecx
+	xorl	24(%rsp),%r14d
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	48(%rsp),%r14d
+	leal	-899497514(%rbp,%r11,1),%r11d
+	xorl	%esi,%eax
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%r14d
+	xorl	20(%rsp),%edx
+	movl	%r12d,%eax
+	movl	%r14d,16(%rsp)
+	movl	%r11d,%ecx
+	xorl	28(%rsp),%edx
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	52(%rsp),%edx
+	leal	-899497514(%r14,%rdi,1),%edi
+	xorl	%r13d,%eax
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%edx
+	xorl	24(%rsp),%ebp
+	movl	%r11d,%eax
+	movl	%edx,20(%rsp)
+	movl	%edi,%ecx
+	xorl	32(%rsp),%ebp
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	xorl	56(%rsp),%ebp
+	leal	-899497514(%rdx,%rsi,1),%esi
+	xorl	%r12d,%eax
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%ebp
+	xorl	28(%rsp),%r14d
+	movl	%edi,%eax
+	movl	%ebp,24(%rsp)
+	movl	%esi,%ecx
+	xorl	36(%rsp),%r14d
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	60(%rsp),%r14d
+	leal	-899497514(%rbp,%r13,1),%r13d
+	xorl	%r11d,%eax
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%r14d
+	xorl	32(%rsp),%edx
+	movl	%esi,%eax
+	movl	%r14d,28(%rsp)
+	movl	%r13d,%ecx
+	xorl	40(%rsp),%edx
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	0(%rsp),%edx
+	leal	-899497514(%r14,%r12,1),%r12d
+	xorl	%edi,%eax
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%edx
+	xorl	36(%rsp),%ebp
+	movl	%r13d,%eax
+
+	movl	%r12d,%ecx
+	xorl	44(%rsp),%ebp
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	4(%rsp),%ebp
+	leal	-899497514(%rdx,%r11,1),%r11d
+	xorl	%esi,%eax
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%ebp
+	xorl	40(%rsp),%r14d
+	movl	%r12d,%eax
+
+	movl	%r11d,%ecx
+	xorl	48(%rsp),%r14d
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	8(%rsp),%r14d
+	leal	-899497514(%rbp,%rdi,1),%edi
+	xorl	%r13d,%eax
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%r14d
+	xorl	44(%rsp),%edx
+	movl	%r11d,%eax
+
+	movl	%edi,%ecx
+	xorl	52(%rsp),%edx
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	xorl	12(%rsp),%edx
+	leal	-899497514(%r14,%rsi,1),%esi
+	xorl	%r12d,%eax
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%edx
+	xorl	48(%rsp),%ebp
+	movl	%edi,%eax
+
+	movl	%esi,%ecx
+	xorl	56(%rsp),%ebp
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	16(%rsp),%ebp
+	leal	-899497514(%rdx,%r13,1),%r13d
+	xorl	%r11d,%eax
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%ebp
+	xorl	52(%rsp),%r14d
+	movl	%esi,%eax
+
+	movl	%r13d,%ecx
+	xorl	60(%rsp),%r14d
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	20(%rsp),%r14d
+	leal	-899497514(%rbp,%r12,1),%r12d
+	xorl	%edi,%eax
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%r14d
+	xorl	56(%rsp),%edx
+	movl	%r13d,%eax
+
+	movl	%r12d,%ecx
+	xorl	0(%rsp),%edx
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	24(%rsp),%edx
+	leal	-899497514(%r14,%r11,1),%r11d
+	xorl	%esi,%eax
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%edx
+	xorl	60(%rsp),%ebp
+	movl	%r12d,%eax
+
+	movl	%r11d,%ecx
+	xorl	4(%rsp),%ebp
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	28(%rsp),%ebp
+	leal	-899497514(%rdx,%rdi,1),%edi
+	xorl	%r13d,%eax
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%ebp
+	movl	%r11d,%eax
+	movl	%edi,%ecx
+	xorl	%r13d,%eax
+	leal	-899497514(%rbp,%rsi,1),%esi
+	roll	$5,%ecx
+	xorl	%r12d,%eax
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%eax,%esi
+	addl	0(%r8),%esi
+	addl	4(%r8),%edi
+	addl	8(%r8),%r11d
+	addl	12(%r8),%r12d
+	addl	16(%r8),%r13d
+	movl	%esi,0(%r8)
+	movl	%edi,4(%r8)
+	movl	%r11d,8(%r8)
+	movl	%r12d,12(%r8)
+	movl	%r13d,16(%r8)
+
+	subq	$1,%r10
+	leaq	64(%r9),%r9
+	jnz	L$loop
+
+	movq	64(%rsp),%rsi
+
+	movq	-40(%rsi),%r14
+
+	movq	-32(%rsi),%r13
+
+	movq	-24(%rsi),%r12
+
+	movq	-16(%rsi),%rbp
+
+	movq	-8(%rsi),%rbx
+
+	leaq	(%rsi),%rsp
+
+L$epilogue:
+	ret
+
+
+.globl	_sha1_block_data_order_hw
+.private_extern _sha1_block_data_order_hw
+
+.p2align	5
+_sha1_block_data_order_hw:
+
+_CET_ENDBR
+	movdqu	(%rdi),%xmm0
+	movd	16(%rdi),%xmm1
+	movdqa	K_XX_XX+160(%rip),%xmm3
+
+	movdqu	(%rsi),%xmm4
+	pshufd	$27,%xmm0,%xmm0
+	movdqu	16(%rsi),%xmm5
+	pshufd	$27,%xmm1,%xmm1
+	movdqu	32(%rsi),%xmm6
+.byte	102,15,56,0,227
+	movdqu	48(%rsi),%xmm7
+.byte	102,15,56,0,235
+.byte	102,15,56,0,243
+	movdqa	%xmm1,%xmm9
+.byte	102,15,56,0,251
+	jmp	L$oop_shaext
+
+.p2align	4
+L$oop_shaext:
+	decq	%rdx
+	leaq	64(%rsi),%r8
+	paddd	%xmm4,%xmm1
+	cmovneq	%r8,%rsi
+	prefetcht0	512(%rsi)
+	movdqa	%xmm0,%xmm8
+.byte	15,56,201,229
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,0
+.byte	15,56,200,213
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+.byte	15,56,202,231
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,0
+.byte	15,56,200,206
+	pxor	%xmm7,%xmm5
+.byte	15,56,202,236
+.byte	15,56,201,247
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,0
+.byte	15,56,200,215
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+.byte	15,56,202,245
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,0
+.byte	15,56,200,204
+	pxor	%xmm5,%xmm7
+.byte	15,56,202,254
+.byte	15,56,201,229
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,0
+.byte	15,56,200,213
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+.byte	15,56,202,231
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,1
+.byte	15,56,200,206
+	pxor	%xmm7,%xmm5
+.byte	15,56,202,236
+.byte	15,56,201,247
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,1
+.byte	15,56,200,215
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+.byte	15,56,202,245
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,1
+.byte	15,56,200,204
+	pxor	%xmm5,%xmm7
+.byte	15,56,202,254
+.byte	15,56,201,229
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,1
+.byte	15,56,200,213
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+.byte	15,56,202,231
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,1
+.byte	15,56,200,206
+	pxor	%xmm7,%xmm5
+.byte	15,56,202,236
+.byte	15,56,201,247
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,2
+.byte	15,56,200,215
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+.byte	15,56,202,245
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,2
+.byte	15,56,200,204
+	pxor	%xmm5,%xmm7
+.byte	15,56,202,254
+.byte	15,56,201,229
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,2
+.byte	15,56,200,213
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+.byte	15,56,202,231
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,2
+.byte	15,56,200,206
+	pxor	%xmm7,%xmm5
+.byte	15,56,202,236
+.byte	15,56,201,247
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,2
+.byte	15,56,200,215
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+.byte	15,56,202,245
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,3
+.byte	15,56,200,204
+	pxor	%xmm5,%xmm7
+.byte	15,56,202,254
+	movdqu	(%rsi),%xmm4
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,3
+.byte	15,56,200,213
+	movdqu	16(%rsi),%xmm5
+.byte	102,15,56,0,227
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,3
+.byte	15,56,200,206
+	movdqu	32(%rsi),%xmm6
+.byte	102,15,56,0,235
+
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,3
+.byte	15,56,200,215
+	movdqu	48(%rsi),%xmm7
+.byte	102,15,56,0,243
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,3
+.byte	65,15,56,200,201
+.byte	102,15,56,0,251
+
+	paddd	%xmm8,%xmm0
+	movdqa	%xmm1,%xmm9
+
+	jnz	L$oop_shaext
+
+	pshufd	$27,%xmm0,%xmm0
+	pshufd	$27,%xmm1,%xmm1
+	movdqu	%xmm0,(%rdi)
+	movd	%xmm1,16(%rdi)
+	ret
+
+
+.globl	_sha1_block_data_order_ssse3
+.private_extern _sha1_block_data_order_ssse3
+
+.p2align	4
+_sha1_block_data_order_ssse3:
+
+_CET_ENDBR
+	movq	%rsp,%r11
+
+	pushq	%rbx
+
+	pushq	%rbp
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	leaq	-64(%rsp),%rsp
+	andq	$-64,%rsp
+	movq	%rdi,%r8
+	movq	%rsi,%r9
+	movq	%rdx,%r10
+
+	shlq	$6,%r10
+	addq	%r9,%r10
+	leaq	K_XX_XX+64(%rip),%r14
+
+	movl	0(%r8),%eax
+	movl	4(%r8),%ebx
+	movl	8(%r8),%ecx
+	movl	12(%r8),%edx
+	movl	%ebx,%esi
+	movl	16(%r8),%ebp
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	andl	%edi,%esi
+
+	movdqa	64(%r14),%xmm6
+	movdqa	-64(%r14),%xmm9
+	movdqu	0(%r9),%xmm0
+	movdqu	16(%r9),%xmm1
+	movdqu	32(%r9),%xmm2
+	movdqu	48(%r9),%xmm3
+.byte	102,15,56,0,198
+.byte	102,15,56,0,206
+.byte	102,15,56,0,214
+	addq	$64,%r9
+	paddd	%xmm9,%xmm0
+.byte	102,15,56,0,222
+	paddd	%xmm9,%xmm1
+	paddd	%xmm9,%xmm2
+	movdqa	%xmm0,0(%rsp)
+	psubd	%xmm9,%xmm0
+	movdqa	%xmm1,16(%rsp)
+	psubd	%xmm9,%xmm1
+	movdqa	%xmm2,32(%rsp)
+	psubd	%xmm9,%xmm2
+	jmp	L$oop_ssse3
+.p2align	4
+L$oop_ssse3:
+	rorl	$2,%ebx
+	pshufd	$238,%xmm0,%xmm4
+	xorl	%edx,%esi
+	movdqa	%xmm3,%xmm8
+	paddd	%xmm3,%xmm9
+	movl	%eax,%edi
+	addl	0(%rsp),%ebp
+	punpcklqdq	%xmm1,%xmm4
+	xorl	%ecx,%ebx
+	roll	$5,%eax
+	addl	%esi,%ebp
+	psrldq	$4,%xmm8
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	pxor	%xmm0,%xmm4
+	addl	%eax,%ebp
+	rorl	$7,%eax
+	pxor	%xmm2,%xmm8
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
+	addl	4(%rsp),%edx
+	pxor	%xmm8,%xmm4
+	xorl	%ebx,%eax
+	roll	$5,%ebp
+	movdqa	%xmm9,48(%rsp)
+	addl	%edi,%edx
+	andl	%eax,%esi
+	movdqa	%xmm4,%xmm10
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	rorl	$7,%ebp
+	movdqa	%xmm4,%xmm8
+	xorl	%ebx,%esi
+	pslldq	$12,%xmm10
+	paddd	%xmm4,%xmm4
+	movl	%edx,%edi
+	addl	8(%rsp),%ecx
+	psrld	$31,%xmm8
+	xorl	%eax,%ebp
+	roll	$5,%edx
+	addl	%esi,%ecx
+	movdqa	%xmm10,%xmm9
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	psrld	$30,%xmm10
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	por	%xmm8,%xmm4
+	xorl	%eax,%edi
+	movl	%ecx,%esi
+	addl	12(%rsp),%ebx
+	pslld	$2,%xmm9
+	pxor	%xmm10,%xmm4
+	xorl	%ebp,%edx
+	movdqa	-64(%r14),%xmm10
+	roll	$5,%ecx
+	addl	%edi,%ebx
+	andl	%edx,%esi
+	pxor	%xmm9,%xmm4
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	rorl	$7,%ecx
+	pshufd	$238,%xmm1,%xmm5
+	xorl	%ebp,%esi
+	movdqa	%xmm4,%xmm9
+	paddd	%xmm4,%xmm10
+	movl	%ebx,%edi
+	addl	16(%rsp),%eax
+	punpcklqdq	%xmm2,%xmm5
+	xorl	%edx,%ecx
+	roll	$5,%ebx
+	addl	%esi,%eax
+	psrldq	$4,%xmm9
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	pxor	%xmm1,%xmm5
+	addl	%ebx,%eax
+	rorl	$7,%ebx
+	pxor	%xmm3,%xmm9
+	xorl	%edx,%edi
+	movl	%eax,%esi
+	addl	20(%rsp),%ebp
+	pxor	%xmm9,%xmm5
+	xorl	%ecx,%ebx
+	roll	$5,%eax
+	movdqa	%xmm10,0(%rsp)
+	addl	%edi,%ebp
+	andl	%ebx,%esi
+	movdqa	%xmm5,%xmm8
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	rorl	$7,%eax
+	movdqa	%xmm5,%xmm9
+	xorl	%ecx,%esi
+	pslldq	$12,%xmm8
+	paddd	%xmm5,%xmm5
+	movl	%ebp,%edi
+	addl	24(%rsp),%edx
+	psrld	$31,%xmm9
+	xorl	%ebx,%eax
+	roll	$5,%ebp
+	addl	%esi,%edx
+	movdqa	%xmm8,%xmm10
+	andl	%eax,%edi
+	xorl	%ebx,%eax
+	psrld	$30,%xmm8
+	addl	%ebp,%edx
+	rorl	$7,%ebp
+	por	%xmm9,%xmm5
+	xorl	%ebx,%edi
+	movl	%edx,%esi
+	addl	28(%rsp),%ecx
+	pslld	$2,%xmm10
+	pxor	%xmm8,%xmm5
+	xorl	%eax,%ebp
+	movdqa	-32(%r14),%xmm8
+	roll	$5,%edx
+	addl	%edi,%ecx
+	andl	%ebp,%esi
+	pxor	%xmm10,%xmm5
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	pshufd	$238,%xmm2,%xmm6
+	xorl	%eax,%esi
+	movdqa	%xmm5,%xmm10
+	paddd	%xmm5,%xmm8
+	movl	%ecx,%edi
+	addl	32(%rsp),%ebx
+	punpcklqdq	%xmm3,%xmm6
+	xorl	%ebp,%edx
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	psrldq	$4,%xmm10
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	pxor	%xmm2,%xmm6
+	addl	%ecx,%ebx
+	rorl	$7,%ecx
+	pxor	%xmm4,%xmm10
+	xorl	%ebp,%edi
+	movl	%ebx,%esi
+	addl	36(%rsp),%eax
+	pxor	%xmm10,%xmm6
+	xorl	%edx,%ecx
+	roll	$5,%ebx
+	movdqa	%xmm8,16(%rsp)
+	addl	%edi,%eax
+	andl	%ecx,%esi
+	movdqa	%xmm6,%xmm9
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	rorl	$7,%ebx
+	movdqa	%xmm6,%xmm10
+	xorl	%edx,%esi
+	pslldq	$12,%xmm9
+	paddd	%xmm6,%xmm6
+	movl	%eax,%edi
+	addl	40(%rsp),%ebp
+	psrld	$31,%xmm10
+	xorl	%ecx,%ebx
+	roll	$5,%eax
+	addl	%esi,%ebp
+	movdqa	%xmm9,%xmm8
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	psrld	$30,%xmm9
+	addl	%eax,%ebp
+	rorl	$7,%eax
+	por	%xmm10,%xmm6
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
+	addl	44(%rsp),%edx
+	pslld	$2,%xmm8
+	pxor	%xmm9,%xmm6
+	xorl	%ebx,%eax
+	movdqa	-32(%r14),%xmm9
+	roll	$5,%ebp
+	addl	%edi,%edx
+	andl	%eax,%esi
+	pxor	%xmm8,%xmm6
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	rorl	$7,%ebp
+	pshufd	$238,%xmm3,%xmm7
+	xorl	%ebx,%esi
+	movdqa	%xmm6,%xmm8
+	paddd	%xmm6,%xmm9
+	movl	%edx,%edi
+	addl	48(%rsp),%ecx
+	punpcklqdq	%xmm4,%xmm7
+	xorl	%eax,%ebp
+	roll	$5,%edx
+	addl	%esi,%ecx
+	psrldq	$4,%xmm8
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	pxor	%xmm3,%xmm7
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	pxor	%xmm5,%xmm8
+	xorl	%eax,%edi
+	movl	%ecx,%esi
+	addl	52(%rsp),%ebx
+	pxor	%xmm8,%xmm7
+	xorl	%ebp,%edx
+	roll	$5,%ecx
+	movdqa	%xmm9,32(%rsp)
+	addl	%edi,%ebx
+	andl	%edx,%esi
+	movdqa	%xmm7,%xmm10
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	rorl	$7,%ecx
+	movdqa	%xmm7,%xmm8
+	xorl	%ebp,%esi
+	pslldq	$12,%xmm10
+	paddd	%xmm7,%xmm7
+	movl	%ebx,%edi
+	addl	56(%rsp),%eax
+	psrld	$31,%xmm8
+	xorl	%edx,%ecx
+	roll	$5,%ebx
+	addl	%esi,%eax
+	movdqa	%xmm10,%xmm9
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	psrld	$30,%xmm10
+	addl	%ebx,%eax
+	rorl	$7,%ebx
+	por	%xmm8,%xmm7
+	xorl	%edx,%edi
+	movl	%eax,%esi
+	addl	60(%rsp),%ebp
+	pslld	$2,%xmm9
+	pxor	%xmm10,%xmm7
+	xorl	%ecx,%ebx
+	movdqa	-32(%r14),%xmm10
+	roll	$5,%eax
+	addl	%edi,%ebp
+	andl	%ebx,%esi
+	pxor	%xmm9,%xmm7
+	pshufd	$238,%xmm6,%xmm9
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	rorl	$7,%eax
+	pxor	%xmm4,%xmm0
+	xorl	%ecx,%esi
+	movl	%ebp,%edi
+	addl	0(%rsp),%edx
+	punpcklqdq	%xmm7,%xmm9
+	xorl	%ebx,%eax
+	roll	$5,%ebp
+	pxor	%xmm1,%xmm0
+	addl	%esi,%edx
+	andl	%eax,%edi
+	movdqa	%xmm10,%xmm8
+	xorl	%ebx,%eax
+	paddd	%xmm7,%xmm10
+	addl	%ebp,%edx
+	pxor	%xmm9,%xmm0
+	rorl	$7,%ebp
+	xorl	%ebx,%edi
+	movl	%edx,%esi
+	addl	4(%rsp),%ecx
+	movdqa	%xmm0,%xmm9
+	xorl	%eax,%ebp
+	roll	$5,%edx
+	movdqa	%xmm10,48(%rsp)
+	addl	%edi,%ecx
+	andl	%ebp,%esi
+	xorl	%eax,%ebp
+	pslld	$2,%xmm0
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	psrld	$30,%xmm9
+	xorl	%eax,%esi
+	movl	%ecx,%edi
+	addl	8(%rsp),%ebx
+	por	%xmm9,%xmm0
+	xorl	%ebp,%edx
+	roll	$5,%ecx
+	pshufd	$238,%xmm7,%xmm10
+	addl	%esi,%ebx
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	addl	12(%rsp),%eax
+	xorl	%ebp,%edi
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	pxor	%xmm5,%xmm1
+	addl	16(%rsp),%ebp
+	xorl	%ecx,%esi
+	punpcklqdq	%xmm0,%xmm10
+	movl	%eax,%edi
+	roll	$5,%eax
+	pxor	%xmm2,%xmm1
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	movdqa	%xmm8,%xmm9
+	rorl	$7,%ebx
+	paddd	%xmm0,%xmm8
+	addl	%eax,%ebp
+	pxor	%xmm10,%xmm1
+	addl	20(%rsp),%edx
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	movdqa	%xmm1,%xmm10
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	movdqa	%xmm8,0(%rsp)
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	addl	24(%rsp),%ecx
+	pslld	$2,%xmm1
+	xorl	%eax,%esi
+	movl	%edx,%edi
+	psrld	$30,%xmm10
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	rorl	$7,%ebp
+	por	%xmm10,%xmm1
+	addl	%edx,%ecx
+	addl	28(%rsp),%ebx
+	pshufd	$238,%xmm0,%xmm8
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	pxor	%xmm6,%xmm2
+	addl	32(%rsp),%eax
+	xorl	%edx,%esi
+	punpcklqdq	%xmm1,%xmm8
+	movl	%ebx,%edi
+	roll	$5,%ebx
+	pxor	%xmm3,%xmm2
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	movdqa	0(%r14),%xmm10
+	rorl	$7,%ecx
+	paddd	%xmm1,%xmm9
+	addl	%ebx,%eax
+	pxor	%xmm8,%xmm2
+	addl	36(%rsp),%ebp
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	roll	$5,%eax
+	movdqa	%xmm2,%xmm8
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	movdqa	%xmm9,16(%rsp)
+	rorl	$7,%ebx
+	addl	%eax,%ebp
+	addl	40(%rsp),%edx
+	pslld	$2,%xmm2
+	xorl	%ebx,%esi
+	movl	%ebp,%edi
+	psrld	$30,%xmm8
+	roll	$5,%ebp
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	rorl	$7,%eax
+	por	%xmm8,%xmm2
+	addl	%ebp,%edx
+	addl	44(%rsp),%ecx
+	pshufd	$238,%xmm1,%xmm9
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	roll	$5,%edx
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%ebp
+	addl	%edx,%ecx
+	pxor	%xmm7,%xmm3
+	addl	48(%rsp),%ebx
+	xorl	%ebp,%esi
+	punpcklqdq	%xmm2,%xmm9
+	movl	%ecx,%edi
+	roll	$5,%ecx
+	pxor	%xmm4,%xmm3
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	movdqa	%xmm10,%xmm8
+	rorl	$7,%edx
+	paddd	%xmm2,%xmm10
+	addl	%ecx,%ebx
+	pxor	%xmm9,%xmm3
+	addl	52(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	movdqa	%xmm3,%xmm9
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	movdqa	%xmm10,32(%rsp)
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	56(%rsp),%ebp
+	pslld	$2,%xmm3
+	xorl	%ecx,%esi
+	movl	%eax,%edi
+	psrld	$30,%xmm9
+	roll	$5,%eax
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	rorl	$7,%ebx
+	por	%xmm9,%xmm3
+	addl	%eax,%ebp
+	addl	60(%rsp),%edx
+	pshufd	$238,%xmm2,%xmm10
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	pxor	%xmm0,%xmm4
+	addl	0(%rsp),%ecx
+	xorl	%eax,%esi
+	punpcklqdq	%xmm3,%xmm10
+	movl	%edx,%edi
+	roll	$5,%edx
+	pxor	%xmm5,%xmm4
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	movdqa	%xmm8,%xmm9
+	rorl	$7,%ebp
+	paddd	%xmm3,%xmm8
+	addl	%edx,%ecx
+	pxor	%xmm10,%xmm4
+	addl	4(%rsp),%ebx
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	movdqa	%xmm4,%xmm10
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	movdqa	%xmm8,48(%rsp)
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	8(%rsp),%eax
+	pslld	$2,%xmm4
+	xorl	%edx,%esi
+	movl	%ebx,%edi
+	psrld	$30,%xmm10
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	rorl	$7,%ecx
+	por	%xmm10,%xmm4
+	addl	%ebx,%eax
+	addl	12(%rsp),%ebp
+	pshufd	$238,%xmm3,%xmm8
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	roll	$5,%eax
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	addl	%eax,%ebp
+	pxor	%xmm1,%xmm5
+	addl	16(%rsp),%edx
+	xorl	%ebx,%esi
+	punpcklqdq	%xmm4,%xmm8
+	movl	%ebp,%edi
+	roll	$5,%ebp
+	pxor	%xmm6,%xmm5
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	movdqa	%xmm9,%xmm10
+	rorl	$7,%eax
+	paddd	%xmm4,%xmm9
+	addl	%ebp,%edx
+	pxor	%xmm8,%xmm5
+	addl	20(%rsp),%ecx
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	roll	$5,%edx
+	movdqa	%xmm5,%xmm8
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	movdqa	%xmm9,0(%rsp)
+	rorl	$7,%ebp
+	addl	%edx,%ecx
+	addl	24(%rsp),%ebx
+	pslld	$2,%xmm5
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+	psrld	$30,%xmm8
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	rorl	$7,%edx
+	por	%xmm8,%xmm5
+	addl	%ecx,%ebx
+	addl	28(%rsp),%eax
+	pshufd	$238,%xmm4,%xmm9
+	rorl	$7,%ecx
+	movl	%ebx,%esi
+	xorl	%edx,%edi
+	roll	$5,%ebx
+	addl	%edi,%eax
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	pxor	%xmm2,%xmm6
+	addl	32(%rsp),%ebp
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	rorl	$7,%ebx
+	punpcklqdq	%xmm5,%xmm9
+	movl	%eax,%edi
+	xorl	%ecx,%esi
+	pxor	%xmm7,%xmm6
+	roll	$5,%eax
+	addl	%esi,%ebp
+	movdqa	%xmm10,%xmm8
+	xorl	%ebx,%edi
+	paddd	%xmm5,%xmm10
+	xorl	%ecx,%ebx
+	pxor	%xmm9,%xmm6
+	addl	%eax,%ebp
+	addl	36(%rsp),%edx
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	rorl	$7,%eax
+	movdqa	%xmm6,%xmm9
+	movl	%ebp,%esi
+	xorl	%ebx,%edi
+	movdqa	%xmm10,16(%rsp)
+	roll	$5,%ebp
+	addl	%edi,%edx
+	xorl	%eax,%esi
+	pslld	$2,%xmm6
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	psrld	$30,%xmm9
+	addl	40(%rsp),%ecx
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	por	%xmm9,%xmm6
+	rorl	$7,%ebp
+	movl	%edx,%edi
+	xorl	%eax,%esi
+	roll	$5,%edx
+	pshufd	$238,%xmm5,%xmm10
+	addl	%esi,%ecx
+	xorl	%ebp,%edi
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	addl	44(%rsp),%ebx
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	rorl	$7,%edx
+	movl	%ecx,%esi
+	xorl	%ebp,%edi
+	roll	$5,%ecx
+	addl	%edi,%ebx
+	xorl	%edx,%esi
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	pxor	%xmm3,%xmm7
+	addl	48(%rsp),%eax
+	andl	%edx,%esi
+	xorl	%ebp,%edx
+	rorl	$7,%ecx
+	punpcklqdq	%xmm6,%xmm10
+	movl	%ebx,%edi
+	xorl	%edx,%esi
+	pxor	%xmm0,%xmm7
+	roll	$5,%ebx
+	addl	%esi,%eax
+	movdqa	32(%r14),%xmm9
+	xorl	%ecx,%edi
+	paddd	%xmm6,%xmm8
+	xorl	%edx,%ecx
+	pxor	%xmm10,%xmm7
+	addl	%ebx,%eax
+	addl	52(%rsp),%ebp
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	rorl	$7,%ebx
+	movdqa	%xmm7,%xmm10
+	movl	%eax,%esi
+	xorl	%ecx,%edi
+	movdqa	%xmm8,32(%rsp)
+	roll	$5,%eax
+	addl	%edi,%ebp
+	xorl	%ebx,%esi
+	pslld	$2,%xmm7
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	psrld	$30,%xmm10
+	addl	56(%rsp),%edx
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	por	%xmm10,%xmm7
+	rorl	$7,%eax
+	movl	%ebp,%edi
+	xorl	%ebx,%esi
+	roll	$5,%ebp
+	pshufd	$238,%xmm6,%xmm8
+	addl	%esi,%edx
+	xorl	%eax,%edi
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	addl	60(%rsp),%ecx
+	andl	%eax,%edi
+	xorl	%ebx,%eax
+	rorl	$7,%ebp
+	movl	%edx,%esi
+	xorl	%eax,%edi
+	roll	$5,%edx
+	addl	%edi,%ecx
+	xorl	%ebp,%esi
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	pxor	%xmm4,%xmm0
+	addl	0(%rsp),%ebx
+	andl	%ebp,%esi
+	xorl	%eax,%ebp
+	rorl	$7,%edx
+	punpcklqdq	%xmm7,%xmm8
+	movl	%ecx,%edi
+	xorl	%ebp,%esi
+	pxor	%xmm1,%xmm0
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	movdqa	%xmm9,%xmm10
+	xorl	%edx,%edi
+	paddd	%xmm7,%xmm9
+	xorl	%ebp,%edx
+	pxor	%xmm8,%xmm0
+	addl	%ecx,%ebx
+	addl	4(%rsp),%eax
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	rorl	$7,%ecx
+	movdqa	%xmm0,%xmm8
+	movl	%ebx,%esi
+	xorl	%edx,%edi
+	movdqa	%xmm9,48(%rsp)
+	roll	$5,%ebx
+	addl	%edi,%eax
+	xorl	%ecx,%esi
+	pslld	$2,%xmm0
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	psrld	$30,%xmm8
+	addl	8(%rsp),%ebp
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	por	%xmm8,%xmm0
+	rorl	$7,%ebx
+	movl	%eax,%edi
+	xorl	%ecx,%esi
+	roll	$5,%eax
+	pshufd	$238,%xmm7,%xmm9
+	addl	%esi,%ebp
+	xorl	%ebx,%edi
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	addl	12(%rsp),%edx
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	rorl	$7,%eax
+	movl	%ebp,%esi
+	xorl	%ebx,%edi
+	roll	$5,%ebp
+	addl	%edi,%edx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	pxor	%xmm5,%xmm1
+	addl	16(%rsp),%ecx
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	rorl	$7,%ebp
+	punpcklqdq	%xmm0,%xmm9
+	movl	%edx,%edi
+	xorl	%eax,%esi
+	pxor	%xmm2,%xmm1
+	roll	$5,%edx
+	addl	%esi,%ecx
+	movdqa	%xmm10,%xmm8
+	xorl	%ebp,%edi
+	paddd	%xmm0,%xmm10
+	xorl	%eax,%ebp
+	pxor	%xmm9,%xmm1
+	addl	%edx,%ecx
+	addl	20(%rsp),%ebx
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	rorl	$7,%edx
+	movdqa	%xmm1,%xmm9
+	movl	%ecx,%esi
+	xorl	%ebp,%edi
+	movdqa	%xmm10,0(%rsp)
+	roll	$5,%ecx
+	addl	%edi,%ebx
+	xorl	%edx,%esi
+	pslld	$2,%xmm1
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	psrld	$30,%xmm9
+	addl	24(%rsp),%eax
+	andl	%edx,%esi
+	xorl	%ebp,%edx
+	por	%xmm9,%xmm1
+	rorl	$7,%ecx
+	movl	%ebx,%edi
+	xorl	%edx,%esi
+	roll	$5,%ebx
+	pshufd	$238,%xmm0,%xmm10
+	addl	%esi,%eax
+	xorl	%ecx,%edi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	28(%rsp),%ebp
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	rorl	$7,%ebx
+	movl	%eax,%esi
+	xorl	%ecx,%edi
+	roll	$5,%eax
+	addl	%edi,%ebp
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	pxor	%xmm6,%xmm2
+	addl	32(%rsp),%edx
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	rorl	$7,%eax
+	punpcklqdq	%xmm1,%xmm10
+	movl	%ebp,%edi
+	xorl	%ebx,%esi
+	pxor	%xmm3,%xmm2
+	roll	$5,%ebp
+	addl	%esi,%edx
+	movdqa	%xmm8,%xmm9
+	xorl	%eax,%edi
+	paddd	%xmm1,%xmm8
+	xorl	%ebx,%eax
+	pxor	%xmm10,%xmm2
+	addl	%ebp,%edx
+	addl	36(%rsp),%ecx
+	andl	%eax,%edi
+	xorl	%ebx,%eax
+	rorl	$7,%ebp
+	movdqa	%xmm2,%xmm10
+	movl	%edx,%esi
+	xorl	%eax,%edi
+	movdqa	%xmm8,16(%rsp)
+	roll	$5,%edx
+	addl	%edi,%ecx
+	xorl	%ebp,%esi
+	pslld	$2,%xmm2
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	psrld	$30,%xmm10
+	addl	40(%rsp),%ebx
+	andl	%ebp,%esi
+	xorl	%eax,%ebp
+	por	%xmm10,%xmm2
+	rorl	$7,%edx
+	movl	%ecx,%edi
+	xorl	%ebp,%esi
+	roll	$5,%ecx
+	pshufd	$238,%xmm1,%xmm8
+	addl	%esi,%ebx
+	xorl	%edx,%edi
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	addl	44(%rsp),%eax
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	rorl	$7,%ecx
+	movl	%ebx,%esi
+	xorl	%edx,%edi
+	roll	$5,%ebx
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	pxor	%xmm7,%xmm3
+	addl	48(%rsp),%ebp
+	xorl	%ecx,%esi
+	punpcklqdq	%xmm2,%xmm8
+	movl	%eax,%edi
+	roll	$5,%eax
+	pxor	%xmm4,%xmm3
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	movdqa	%xmm9,%xmm10
+	rorl	$7,%ebx
+	paddd	%xmm2,%xmm9
+	addl	%eax,%ebp
+	pxor	%xmm8,%xmm3
+	addl	52(%rsp),%edx
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	movdqa	%xmm3,%xmm8
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	movdqa	%xmm9,32(%rsp)
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	addl	56(%rsp),%ecx
+	pslld	$2,%xmm3
+	xorl	%eax,%esi
+	movl	%edx,%edi
+	psrld	$30,%xmm8
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	rorl	$7,%ebp
+	por	%xmm8,%xmm3
+	addl	%edx,%ecx
+	addl	60(%rsp),%ebx
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	0(%rsp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%edi
+	roll	$5,%ebx
+	paddd	%xmm3,%xmm10
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	movdqa	%xmm10,48(%rsp)
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	4(%rsp),%ebp
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	roll	$5,%eax
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	addl	%eax,%ebp
+	addl	8(%rsp),%edx
+	xorl	%ebx,%esi
+	movl	%ebp,%edi
+	roll	$5,%ebp
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	addl	12(%rsp),%ecx
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	roll	$5,%edx
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%ebp
+	addl	%edx,%ecx
+	cmpq	%r10,%r9
+	je	L$done_ssse3
+	movdqa	64(%r14),%xmm6
+	movdqa	-64(%r14),%xmm9
+	movdqu	0(%r9),%xmm0
+	movdqu	16(%r9),%xmm1
+	movdqu	32(%r9),%xmm2
+	movdqu	48(%r9),%xmm3
+.byte	102,15,56,0,198
+	addq	$64,%r9
+	addl	16(%rsp),%ebx
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+.byte	102,15,56,0,206
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	rorl	$7,%edx
+	paddd	%xmm9,%xmm0
+	addl	%ecx,%ebx
+	addl	20(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	movdqa	%xmm0,0(%rsp)
+	roll	$5,%ebx
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
+	psubd	%xmm9,%xmm0
+	addl	%ebx,%eax
+	addl	24(%rsp),%ebp
+	xorl	%ecx,%esi
+	movl	%eax,%edi
+	roll	$5,%eax
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	rorl	$7,%ebx
+	addl	%eax,%ebp
+	addl	28(%rsp),%edx
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	addl	32(%rsp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%edi
+.byte	102,15,56,0,214
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	rorl	$7,%ebp
+	paddd	%xmm9,%xmm1
+	addl	%edx,%ecx
+	addl	36(%rsp),%ebx
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	movdqa	%xmm1,16(%rsp)
+	roll	$5,%ecx
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	rorl	$7,%edx
+	psubd	%xmm9,%xmm1
+	addl	%ecx,%ebx
+	addl	40(%rsp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%edi
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	44(%rsp),%ebp
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	roll	$5,%eax
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	addl	%eax,%ebp
+	addl	48(%rsp),%edx
+	xorl	%ebx,%esi
+	movl	%ebp,%edi
+.byte	102,15,56,0,222
+	roll	$5,%ebp
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	rorl	$7,%eax
+	paddd	%xmm9,%xmm2
+	addl	%ebp,%edx
+	addl	52(%rsp),%ecx
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	movdqa	%xmm2,32(%rsp)
+	roll	$5,%edx
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%ebp
+	psubd	%xmm9,%xmm2
+	addl	%edx,%ecx
+	addl	56(%rsp),%ebx
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	60(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	addl	%edi,%eax
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	0(%r8),%eax
+	addl	4(%r8),%esi
+	addl	8(%r8),%ecx
+	addl	12(%r8),%edx
+	movl	%eax,0(%r8)
+	addl	16(%r8),%ebp
+	movl	%esi,4(%r8)
+	movl	%esi,%ebx
+	movl	%ecx,8(%r8)
+	movl	%ecx,%edi
+	movl	%edx,12(%r8)
+	xorl	%edx,%edi
+	movl	%ebp,16(%r8)
+	andl	%edi,%esi
+	jmp	L$oop_ssse3
+
+.p2align	4
+L$done_ssse3:
+	addl	16(%rsp),%ebx
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	20(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	24(%rsp),%ebp
+	xorl	%ecx,%esi
+	movl	%eax,%edi
+	roll	$5,%eax
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	rorl	$7,%ebx
+	addl	%eax,%ebp
+	addl	28(%rsp),%edx
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	addl	32(%rsp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%edi
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	rorl	$7,%ebp
+	addl	%edx,%ecx
+	addl	36(%rsp),%ebx
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	40(%rsp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%edi
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	44(%rsp),%ebp
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	roll	$5,%eax
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	addl	%eax,%ebp
+	addl	48(%rsp),%edx
+	xorl	%ebx,%esi
+	movl	%ebp,%edi
+	roll	$5,%ebp
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	addl	52(%rsp),%ecx
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	roll	$5,%edx
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%ebp
+	addl	%edx,%ecx
+	addl	56(%rsp),%ebx
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	60(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	addl	%edi,%eax
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	0(%r8),%eax
+	addl	4(%r8),%esi
+	addl	8(%r8),%ecx
+	movl	%eax,0(%r8)
+	addl	12(%r8),%edx
+	movl	%esi,4(%r8)
+	addl	16(%r8),%ebp
+	movl	%ecx,8(%r8)
+	movl	%edx,12(%r8)
+	movl	%ebp,16(%r8)
+	movq	-40(%r11),%r14
+
+	movq	-32(%r11),%r13
+
+	movq	-24(%r11),%r12
+
+	movq	-16(%r11),%rbp
+
+	movq	-8(%r11),%rbx
+
+	leaq	(%r11),%rsp
+
+L$epilogue_ssse3:
+	ret
+
+
+.globl	_sha1_block_data_order_avx
+.private_extern _sha1_block_data_order_avx
+
+.p2align	4
+_sha1_block_data_order_avx:
+
+_CET_ENDBR
+	movq	%rsp,%r11
+
+	pushq	%rbx
+
+	pushq	%rbp
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	leaq	-64(%rsp),%rsp
+	vzeroupper
+	andq	$-64,%rsp
+	movq	%rdi,%r8
+	movq	%rsi,%r9
+	movq	%rdx,%r10
+
+	shlq	$6,%r10
+	addq	%r9,%r10
+	leaq	K_XX_XX+64(%rip),%r14
+
+	movl	0(%r8),%eax
+	movl	4(%r8),%ebx
+	movl	8(%r8),%ecx
+	movl	12(%r8),%edx
+	movl	%ebx,%esi
+	movl	16(%r8),%ebp
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	andl	%edi,%esi
+
+	vmovdqa	64(%r14),%xmm6
+	vmovdqa	-64(%r14),%xmm11
+	vmovdqu	0(%r9),%xmm0
+	vmovdqu	16(%r9),%xmm1
+	vmovdqu	32(%r9),%xmm2
+	vmovdqu	48(%r9),%xmm3
+	vpshufb	%xmm6,%xmm0,%xmm0
+	addq	$64,%r9
+	vpshufb	%xmm6,%xmm1,%xmm1
+	vpshufb	%xmm6,%xmm2,%xmm2
+	vpshufb	%xmm6,%xmm3,%xmm3
+	vpaddd	%xmm11,%xmm0,%xmm4
+	vpaddd	%xmm11,%xmm1,%xmm5
+	vpaddd	%xmm11,%xmm2,%xmm6
+	vmovdqa	%xmm4,0(%rsp)
+	vmovdqa	%xmm5,16(%rsp)
+	vmovdqa	%xmm6,32(%rsp)
+	jmp	L$oop_avx
+.p2align	4
+L$oop_avx:
+	shrdl	$2,%ebx,%ebx
+	xorl	%edx,%esi
+	vpalignr	$8,%xmm0,%xmm1,%xmm4
+	movl	%eax,%edi
+	addl	0(%rsp),%ebp
+	vpaddd	%xmm3,%xmm11,%xmm9
+	xorl	%ecx,%ebx
+	shldl	$5,%eax,%eax
+	vpsrldq	$4,%xmm3,%xmm8
+	addl	%esi,%ebp
+	andl	%ebx,%edi
+	vpxor	%xmm0,%xmm4,%xmm4
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	vpxor	%xmm2,%xmm8,%xmm8
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
+	addl	4(%rsp),%edx
+	vpxor	%xmm8,%xmm4,%xmm4
+	xorl	%ebx,%eax
+	shldl	$5,%ebp,%ebp
+	vmovdqa	%xmm9,48(%rsp)
+	addl	%edi,%edx
+	andl	%eax,%esi
+	vpsrld	$31,%xmm4,%xmm8
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	shrdl	$7,%ebp,%ebp
+	xorl	%ebx,%esi
+	vpslldq	$12,%xmm4,%xmm10
+	vpaddd	%xmm4,%xmm4,%xmm4
+	movl	%edx,%edi
+	addl	8(%rsp),%ecx
+	xorl	%eax,%ebp
+	shldl	$5,%edx,%edx
+	vpsrld	$30,%xmm10,%xmm9
+	vpor	%xmm8,%xmm4,%xmm4
+	addl	%esi,%ecx
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	vpslld	$2,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm4,%xmm4
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%edi
+	movl	%ecx,%esi
+	addl	12(%rsp),%ebx
+	vpxor	%xmm10,%xmm4,%xmm4
+	xorl	%ebp,%edx
+	shldl	$5,%ecx,%ecx
+	addl	%edi,%ebx
+	andl	%edx,%esi
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	shrdl	$7,%ecx,%ecx
+	xorl	%ebp,%esi
+	vpalignr	$8,%xmm1,%xmm2,%xmm5
+	movl	%ebx,%edi
+	addl	16(%rsp),%eax
+	vpaddd	%xmm4,%xmm11,%xmm9
+	xorl	%edx,%ecx
+	shldl	$5,%ebx,%ebx
+	vpsrldq	$4,%xmm4,%xmm8
+	addl	%esi,%eax
+	andl	%ecx,%edi
+	vpxor	%xmm1,%xmm5,%xmm5
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	vpxor	%xmm3,%xmm8,%xmm8
+	shrdl	$7,%ebx,%ebx
+	xorl	%edx,%edi
+	movl	%eax,%esi
+	addl	20(%rsp),%ebp
+	vpxor	%xmm8,%xmm5,%xmm5
+	xorl	%ecx,%ebx
+	shldl	$5,%eax,%eax
+	vmovdqa	%xmm9,0(%rsp)
+	addl	%edi,%ebp
+	andl	%ebx,%esi
+	vpsrld	$31,%xmm5,%xmm8
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%esi
+	vpslldq	$12,%xmm5,%xmm10
+	vpaddd	%xmm5,%xmm5,%xmm5
+	movl	%ebp,%edi
+	addl	24(%rsp),%edx
+	xorl	%ebx,%eax
+	shldl	$5,%ebp,%ebp
+	vpsrld	$30,%xmm10,%xmm9
+	vpor	%xmm8,%xmm5,%xmm5
+	addl	%esi,%edx
+	andl	%eax,%edi
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	vpslld	$2,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm5,%xmm5
+	shrdl	$7,%ebp,%ebp
+	xorl	%ebx,%edi
+	movl	%edx,%esi
+	addl	28(%rsp),%ecx
+	vpxor	%xmm10,%xmm5,%xmm5
+	xorl	%eax,%ebp
+	shldl	$5,%edx,%edx
+	vmovdqa	-32(%r14),%xmm11
+	addl	%edi,%ecx
+	andl	%ebp,%esi
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%esi
+	vpalignr	$8,%xmm2,%xmm3,%xmm6
+	movl	%ecx,%edi
+	addl	32(%rsp),%ebx
+	vpaddd	%xmm5,%xmm11,%xmm9
+	xorl	%ebp,%edx
+	shldl	$5,%ecx,%ecx
+	vpsrldq	$4,%xmm5,%xmm8
+	addl	%esi,%ebx
+	andl	%edx,%edi
+	vpxor	%xmm2,%xmm6,%xmm6
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	vpxor	%xmm4,%xmm8,%xmm8
+	shrdl	$7,%ecx,%ecx
+	xorl	%ebp,%edi
+	movl	%ebx,%esi
+	addl	36(%rsp),%eax
+	vpxor	%xmm8,%xmm6,%xmm6
+	xorl	%edx,%ecx
+	shldl	$5,%ebx,%ebx
+	vmovdqa	%xmm9,16(%rsp)
+	addl	%edi,%eax
+	andl	%ecx,%esi
+	vpsrld	$31,%xmm6,%xmm8
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	shrdl	$7,%ebx,%ebx
+	xorl	%edx,%esi
+	vpslldq	$12,%xmm6,%xmm10
+	vpaddd	%xmm6,%xmm6,%xmm6
+	movl	%eax,%edi
+	addl	40(%rsp),%ebp
+	xorl	%ecx,%ebx
+	shldl	$5,%eax,%eax
+	vpsrld	$30,%xmm10,%xmm9
+	vpor	%xmm8,%xmm6,%xmm6
+	addl	%esi,%ebp
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	vpslld	$2,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm6,%xmm6
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
+	addl	44(%rsp),%edx
+	vpxor	%xmm10,%xmm6,%xmm6
+	xorl	%ebx,%eax
+	shldl	$5,%ebp,%ebp
+	addl	%edi,%edx
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	shrdl	$7,%ebp,%ebp
+	xorl	%ebx,%esi
+	vpalignr	$8,%xmm3,%xmm4,%xmm7
+	movl	%edx,%edi
+	addl	48(%rsp),%ecx
+	vpaddd	%xmm6,%xmm11,%xmm9
+	xorl	%eax,%ebp
+	shldl	$5,%edx,%edx
+	vpsrldq	$4,%xmm6,%xmm8
+	addl	%esi,%ecx
+	andl	%ebp,%edi
+	vpxor	%xmm3,%xmm7,%xmm7
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	vpxor	%xmm5,%xmm8,%xmm8
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%edi
+	movl	%ecx,%esi
+	addl	52(%rsp),%ebx
+	vpxor	%xmm8,%xmm7,%xmm7
+	xorl	%ebp,%edx
+	shldl	$5,%ecx,%ecx
+	vmovdqa	%xmm9,32(%rsp)
+	addl	%edi,%ebx
+	andl	%edx,%esi
+	vpsrld	$31,%xmm7,%xmm8
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	shrdl	$7,%ecx,%ecx
+	xorl	%ebp,%esi
+	vpslldq	$12,%xmm7,%xmm10
+	vpaddd	%xmm7,%xmm7,%xmm7
+	movl	%ebx,%edi
+	addl	56(%rsp),%eax
+	xorl	%edx,%ecx
+	shldl	$5,%ebx,%ebx
+	vpsrld	$30,%xmm10,%xmm9
+	vpor	%xmm8,%xmm7,%xmm7
+	addl	%esi,%eax
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	vpslld	$2,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm7,%xmm7
+	shrdl	$7,%ebx,%ebx
+	xorl	%edx,%edi
+	movl	%eax,%esi
+	addl	60(%rsp),%ebp
+	vpxor	%xmm10,%xmm7,%xmm7
+	xorl	%ecx,%ebx
+	shldl	$5,%eax,%eax
+	addl	%edi,%ebp
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	vpalignr	$8,%xmm6,%xmm7,%xmm8
+	vpxor	%xmm4,%xmm0,%xmm0
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%esi
+	movl	%ebp,%edi
+	addl	0(%rsp),%edx
+	vpxor	%xmm1,%xmm0,%xmm0
+	xorl	%ebx,%eax
+	shldl	$5,%ebp,%ebp
+	vpaddd	%xmm7,%xmm11,%xmm9
+	addl	%esi,%edx
+	andl	%eax,%edi
+	vpxor	%xmm8,%xmm0,%xmm0
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	shrdl	$7,%ebp,%ebp
+	xorl	%ebx,%edi
+	vpsrld	$30,%xmm0,%xmm8
+	vmovdqa	%xmm9,48(%rsp)
+	movl	%edx,%esi
+	addl	4(%rsp),%ecx
+	xorl	%eax,%ebp
+	shldl	$5,%edx,%edx
+	vpslld	$2,%xmm0,%xmm0
+	addl	%edi,%ecx
+	andl	%ebp,%esi
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%esi
+	movl	%ecx,%edi
+	addl	8(%rsp),%ebx
+	vpor	%xmm8,%xmm0,%xmm0
+	xorl	%ebp,%edx
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	addl	12(%rsp),%eax
+	xorl	%ebp,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vpalignr	$8,%xmm7,%xmm0,%xmm8
+	vpxor	%xmm5,%xmm1,%xmm1
+	addl	16(%rsp),%ebp
+	xorl	%ecx,%esi
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	vpxor	%xmm2,%xmm1,%xmm1
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	vpaddd	%xmm0,%xmm11,%xmm9
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	vpxor	%xmm8,%xmm1,%xmm1
+	addl	20(%rsp),%edx
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	vpsrld	$30,%xmm1,%xmm8
+	vmovdqa	%xmm9,0(%rsp)
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	vpslld	$2,%xmm1,%xmm1
+	addl	24(%rsp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	vpor	%xmm8,%xmm1,%xmm1
+	addl	28(%rsp),%ebx
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vpalignr	$8,%xmm0,%xmm1,%xmm8
+	vpxor	%xmm6,%xmm2,%xmm2
+	addl	32(%rsp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	vpxor	%xmm3,%xmm2,%xmm2
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	vpaddd	%xmm1,%xmm11,%xmm9
+	vmovdqa	0(%r14),%xmm11
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vpxor	%xmm8,%xmm2,%xmm2
+	addl	36(%rsp),%ebp
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	vpsrld	$30,%xmm2,%xmm8
+	vmovdqa	%xmm9,16(%rsp)
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	vpslld	$2,%xmm2,%xmm2
+	addl	40(%rsp),%edx
+	xorl	%ebx,%esi
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	vpor	%xmm8,%xmm2,%xmm2
+	addl	44(%rsp),%ecx
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	vpalignr	$8,%xmm1,%xmm2,%xmm8
+	vpxor	%xmm7,%xmm3,%xmm3
+	addl	48(%rsp),%ebx
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	vpxor	%xmm4,%xmm3,%xmm3
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	vpaddd	%xmm2,%xmm11,%xmm9
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vpxor	%xmm8,%xmm3,%xmm3
+	addl	52(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	vpsrld	$30,%xmm3,%xmm8
+	vmovdqa	%xmm9,32(%rsp)
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vpslld	$2,%xmm3,%xmm3
+	addl	56(%rsp),%ebp
+	xorl	%ecx,%esi
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	vpor	%xmm8,%xmm3,%xmm3
+	addl	60(%rsp),%edx
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	vpalignr	$8,%xmm2,%xmm3,%xmm8
+	vpxor	%xmm0,%xmm4,%xmm4
+	addl	0(%rsp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	vpxor	%xmm5,%xmm4,%xmm4
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	vpaddd	%xmm3,%xmm11,%xmm9
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	vpxor	%xmm8,%xmm4,%xmm4
+	addl	4(%rsp),%ebx
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	vpsrld	$30,%xmm4,%xmm8
+	vmovdqa	%xmm9,48(%rsp)
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vpslld	$2,%xmm4,%xmm4
+	addl	8(%rsp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vpor	%xmm8,%xmm4,%xmm4
+	addl	12(%rsp),%ebp
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	vpalignr	$8,%xmm3,%xmm4,%xmm8
+	vpxor	%xmm1,%xmm5,%xmm5
+	addl	16(%rsp),%edx
+	xorl	%ebx,%esi
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	vpxor	%xmm6,%xmm5,%xmm5
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	vpaddd	%xmm4,%xmm11,%xmm9
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	vpxor	%xmm8,%xmm5,%xmm5
+	addl	20(%rsp),%ecx
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	vpsrld	$30,%xmm5,%xmm8
+	vmovdqa	%xmm9,0(%rsp)
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	vpslld	$2,%xmm5,%xmm5
+	addl	24(%rsp),%ebx
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vpor	%xmm8,%xmm5,%xmm5
+	addl	28(%rsp),%eax
+	shrdl	$7,%ecx,%ecx
+	movl	%ebx,%esi
+	xorl	%edx,%edi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	vpalignr	$8,%xmm4,%xmm5,%xmm8
+	vpxor	%xmm2,%xmm6,%xmm6
+	addl	32(%rsp),%ebp
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%eax,%edi
+	xorl	%ecx,%esi
+	vpaddd	%xmm5,%xmm11,%xmm9
+	shldl	$5,%eax,%eax
+	addl	%esi,%ebp
+	vpxor	%xmm8,%xmm6,%xmm6
+	xorl	%ebx,%edi
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	addl	36(%rsp),%edx
+	vpsrld	$30,%xmm6,%xmm8
+	vmovdqa	%xmm9,16(%rsp)
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	movl	%ebp,%esi
+	vpslld	$2,%xmm6,%xmm6
+	xorl	%ebx,%edi
+	shldl	$5,%ebp,%ebp
+	addl	%edi,%edx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	addl	40(%rsp),%ecx
+	andl	%eax,%esi
+	vpor	%xmm8,%xmm6,%xmm6
+	xorl	%ebx,%eax
+	shrdl	$7,%ebp,%ebp
+	movl	%edx,%edi
+	xorl	%eax,%esi
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%ebp,%edi
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	addl	44(%rsp),%ebx
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	shrdl	$7,%edx,%edx
+	movl	%ecx,%esi
+	xorl	%ebp,%edi
+	shldl	$5,%ecx,%ecx
+	addl	%edi,%ebx
+	xorl	%edx,%esi
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	vpalignr	$8,%xmm5,%xmm6,%xmm8
+	vpxor	%xmm3,%xmm7,%xmm7
+	addl	48(%rsp),%eax
+	andl	%edx,%esi
+	xorl	%ebp,%edx
+	shrdl	$7,%ecx,%ecx
+	vpxor	%xmm0,%xmm7,%xmm7
+	movl	%ebx,%edi
+	xorl	%edx,%esi
+	vpaddd	%xmm6,%xmm11,%xmm9
+	vmovdqa	32(%r14),%xmm11
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	vpxor	%xmm8,%xmm7,%xmm7
+	xorl	%ecx,%edi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	52(%rsp),%ebp
+	vpsrld	$30,%xmm7,%xmm8
+	vmovdqa	%xmm9,32(%rsp)
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	movl	%eax,%esi
+	vpslld	$2,%xmm7,%xmm7
+	xorl	%ecx,%edi
+	shldl	$5,%eax,%eax
+	addl	%edi,%ebp
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	addl	56(%rsp),%edx
+	andl	%ebx,%esi
+	vpor	%xmm8,%xmm7,%xmm7
+	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	movl	%ebp,%edi
+	xorl	%ebx,%esi
+	shldl	$5,%ebp,%ebp
+	addl	%esi,%edx
+	xorl	%eax,%edi
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	addl	60(%rsp),%ecx
+	andl	%eax,%edi
+	xorl	%ebx,%eax
+	shrdl	$7,%ebp,%ebp
+	movl	%edx,%esi
+	xorl	%eax,%edi
+	shldl	$5,%edx,%edx
+	addl	%edi,%ecx
+	xorl	%ebp,%esi
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	vpalignr	$8,%xmm6,%xmm7,%xmm8
+	vpxor	%xmm4,%xmm0,%xmm0
+	addl	0(%rsp),%ebx
+	andl	%ebp,%esi
+	xorl	%eax,%ebp
+	shrdl	$7,%edx,%edx
+	vpxor	%xmm1,%xmm0,%xmm0
+	movl	%ecx,%edi
+	xorl	%ebp,%esi
+	vpaddd	%xmm7,%xmm11,%xmm9
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	vpxor	%xmm8,%xmm0,%xmm0
+	xorl	%edx,%edi
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	addl	4(%rsp),%eax
+	vpsrld	$30,%xmm0,%xmm8
+	vmovdqa	%xmm9,48(%rsp)
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	shrdl	$7,%ecx,%ecx
+	movl	%ebx,%esi
+	vpslld	$2,%xmm0,%xmm0
+	xorl	%edx,%edi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	8(%rsp),%ebp
+	andl	%ecx,%esi
+	vpor	%xmm8,%xmm0,%xmm0
+	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	movl	%eax,%edi
+	xorl	%ecx,%esi
+	shldl	$5,%eax,%eax
+	addl	%esi,%ebp
+	xorl	%ebx,%edi
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	addl	12(%rsp),%edx
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	movl	%ebp,%esi
+	xorl	%ebx,%edi
+	shldl	$5,%ebp,%ebp
+	addl	%edi,%edx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	vpalignr	$8,%xmm7,%xmm0,%xmm8
+	vpxor	%xmm5,%xmm1,%xmm1
+	addl	16(%rsp),%ecx
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	shrdl	$7,%ebp,%ebp
+	vpxor	%xmm2,%xmm1,%xmm1
+	movl	%edx,%edi
+	xorl	%eax,%esi
+	vpaddd	%xmm0,%xmm11,%xmm9
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	vpxor	%xmm8,%xmm1,%xmm1
+	xorl	%ebp,%edi
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	addl	20(%rsp),%ebx
+	vpsrld	$30,%xmm1,%xmm8
+	vmovdqa	%xmm9,0(%rsp)
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	shrdl	$7,%edx,%edx
+	movl	%ecx,%esi
+	vpslld	$2,%xmm1,%xmm1
+	xorl	%ebp,%edi
+	shldl	$5,%ecx,%ecx
+	addl	%edi,%ebx
+	xorl	%edx,%esi
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	addl	24(%rsp),%eax
+	andl	%edx,%esi
+	vpor	%xmm8,%xmm1,%xmm1
+	xorl	%ebp,%edx
+	shrdl	$7,%ecx,%ecx
+	movl	%ebx,%edi
+	xorl	%edx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%ecx,%edi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	28(%rsp),%ebp
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	movl	%eax,%esi
+	xorl	%ecx,%edi
+	shldl	$5,%eax,%eax
+	addl	%edi,%ebp
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	vpalignr	$8,%xmm0,%xmm1,%xmm8
+	vpxor	%xmm6,%xmm2,%xmm2
+	addl	32(%rsp),%edx
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	vpxor	%xmm3,%xmm2,%xmm2
+	movl	%ebp,%edi
+	xorl	%ebx,%esi
+	vpaddd	%xmm1,%xmm11,%xmm9
+	shldl	$5,%ebp,%ebp
+	addl	%esi,%edx
+	vpxor	%xmm8,%xmm2,%xmm2
+	xorl	%eax,%edi
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	addl	36(%rsp),%ecx
+	vpsrld	$30,%xmm2,%xmm8
+	vmovdqa	%xmm9,16(%rsp)
+	andl	%eax,%edi
+	xorl	%ebx,%eax
+	shrdl	$7,%ebp,%ebp
+	movl	%edx,%esi
+	vpslld	$2,%xmm2,%xmm2
+	xorl	%eax,%edi
+	shldl	$5,%edx,%edx
+	addl	%edi,%ecx
+	xorl	%ebp,%esi
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	addl	40(%rsp),%ebx
+	andl	%ebp,%esi
+	vpor	%xmm8,%xmm2,%xmm2
+	xorl	%eax,%ebp
+	shrdl	$7,%edx,%edx
+	movl	%ecx,%edi
+	xorl	%ebp,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%edx,%edi
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	addl	44(%rsp),%eax
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	shrdl	$7,%ecx,%ecx
+	movl	%ebx,%esi
+	xorl	%edx,%edi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	vpalignr	$8,%xmm1,%xmm2,%xmm8
+	vpxor	%xmm7,%xmm3,%xmm3
+	addl	48(%rsp),%ebp
+	xorl	%ecx,%esi
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	vpxor	%xmm4,%xmm3,%xmm3
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	vpaddd	%xmm2,%xmm11,%xmm9
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	vpxor	%xmm8,%xmm3,%xmm3
+	addl	52(%rsp),%edx
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	vpsrld	$30,%xmm3,%xmm8
+	vmovdqa	%xmm9,32(%rsp)
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	vpslld	$2,%xmm3,%xmm3
+	addl	56(%rsp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	vpor	%xmm8,%xmm3,%xmm3
+	addl	60(%rsp),%ebx
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	0(%rsp),%eax
+	vpaddd	%xmm3,%xmm11,%xmm9
+	xorl	%edx,%esi
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	vmovdqa	%xmm9,48(%rsp)
+	xorl	%edx,%edi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	4(%rsp),%ebp
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	addl	8(%rsp),%edx
+	xorl	%ebx,%esi
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	addl	12(%rsp),%ecx
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	cmpq	%r10,%r9
+	je	L$done_avx
+	vmovdqa	64(%r14),%xmm6
+	vmovdqa	-64(%r14),%xmm11
+	vmovdqu	0(%r9),%xmm0
+	vmovdqu	16(%r9),%xmm1
+	vmovdqu	32(%r9),%xmm2
+	vmovdqu	48(%r9),%xmm3
+	vpshufb	%xmm6,%xmm0,%xmm0
+	addq	$64,%r9
+	addl	16(%rsp),%ebx
+	xorl	%ebp,%esi
+	vpshufb	%xmm6,%xmm1,%xmm1
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	vpaddd	%xmm11,%xmm0,%xmm4
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vmovdqa	%xmm4,0(%rsp)
+	addl	20(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	24(%rsp),%ebp
+	xorl	%ecx,%esi
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	addl	28(%rsp),%edx
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	addl	32(%rsp),%ecx
+	xorl	%eax,%esi
+	vpshufb	%xmm6,%xmm2,%xmm2
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	vpaddd	%xmm11,%xmm1,%xmm5
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	vmovdqa	%xmm5,16(%rsp)
+	addl	36(%rsp),%ebx
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	40(%rsp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	44(%rsp),%ebp
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	addl	48(%rsp),%edx
+	xorl	%ebx,%esi
+	vpshufb	%xmm6,%xmm3,%xmm3
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	vpaddd	%xmm11,%xmm2,%xmm6
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	vmovdqa	%xmm6,32(%rsp)
+	addl	52(%rsp),%ecx
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	addl	56(%rsp),%ebx
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	60(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	0(%r8),%eax
+	addl	4(%r8),%esi
+	addl	8(%r8),%ecx
+	addl	12(%r8),%edx
+	movl	%eax,0(%r8)
+	addl	16(%r8),%ebp
+	movl	%esi,4(%r8)
+	movl	%esi,%ebx
+	movl	%ecx,8(%r8)
+	movl	%ecx,%edi
+	movl	%edx,12(%r8)
+	xorl	%edx,%edi
+	movl	%ebp,16(%r8)
+	andl	%edi,%esi
+	jmp	L$oop_avx
+
+.p2align	4
+L$done_avx:
+	addl	16(%rsp),%ebx
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	20(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	24(%rsp),%ebp
+	xorl	%ecx,%esi
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	addl	28(%rsp),%edx
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	addl	32(%rsp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	addl	36(%rsp),%ebx
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	40(%rsp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	44(%rsp),%ebp
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	addl	48(%rsp),%edx
+	xorl	%ebx,%esi
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	addl	52(%rsp),%ecx
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	addl	56(%rsp),%ebx
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	60(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vzeroupper
+
+	addl	0(%r8),%eax
+	addl	4(%r8),%esi
+	addl	8(%r8),%ecx
+	movl	%eax,0(%r8)
+	addl	12(%r8),%edx
+	movl	%esi,4(%r8)
+	addl	16(%r8),%ebp
+	movl	%ecx,8(%r8)
+	movl	%edx,12(%r8)
+	movl	%ebp,16(%r8)
+	movq	-40(%r11),%r14
+
+	movq	-32(%r11),%r13
+
+	movq	-24(%r11),%r12
+
+	movq	-16(%r11),%rbp
+
+	movq	-8(%r11),%rbx
+
+	leaq	(%r11),%rsp
+
+L$epilogue_avx:
+	ret
+
+
+.globl	_sha1_block_data_order_avx2
+.private_extern _sha1_block_data_order_avx2
+
+.p2align	4
+_sha1_block_data_order_avx2:
+
+_CET_ENDBR
+	movq	%rsp,%r11
+
+	pushq	%rbx
+
+	pushq	%rbp
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	vzeroupper
+	movq	%rdi,%r8
+	movq	%rsi,%r9
+	movq	%rdx,%r10
+
+	leaq	-640(%rsp),%rsp
+	shlq	$6,%r10
+	leaq	64(%r9),%r13
+	andq	$-128,%rsp
+	addq	%r9,%r10
+	leaq	K_XX_XX+64(%rip),%r14
+
+	movl	0(%r8),%eax
+	cmpq	%r10,%r13
+	cmovaeq	%r9,%r13
+	movl	4(%r8),%ebp
+	movl	8(%r8),%ecx
+	movl	12(%r8),%edx
+	movl	16(%r8),%esi
+	vmovdqu	64(%r14),%ymm6
+
+	vmovdqu	(%r9),%xmm0
+	vmovdqu	16(%r9),%xmm1
+	vmovdqu	32(%r9),%xmm2
+	vmovdqu	48(%r9),%xmm3
+	leaq	64(%r9),%r9
+	vinserti128	$1,(%r13),%ymm0,%ymm0
+	vinserti128	$1,16(%r13),%ymm1,%ymm1
+	vpshufb	%ymm6,%ymm0,%ymm0
+	vinserti128	$1,32(%r13),%ymm2,%ymm2
+	vpshufb	%ymm6,%ymm1,%ymm1
+	vinserti128	$1,48(%r13),%ymm3,%ymm3
+	vpshufb	%ymm6,%ymm2,%ymm2
+	vmovdqu	-64(%r14),%ymm11
+	vpshufb	%ymm6,%ymm3,%ymm3
+
+	vpaddd	%ymm11,%ymm0,%ymm4
+	vpaddd	%ymm11,%ymm1,%ymm5
+	vmovdqu	%ymm4,0(%rsp)
+	vpaddd	%ymm11,%ymm2,%ymm6
+	vmovdqu	%ymm5,32(%rsp)
+	vpaddd	%ymm11,%ymm3,%ymm7
+	vmovdqu	%ymm6,64(%rsp)
+	vmovdqu	%ymm7,96(%rsp)
+	vpalignr	$8,%ymm0,%ymm1,%ymm4
+	vpsrldq	$4,%ymm3,%ymm8
+	vpxor	%ymm0,%ymm4,%ymm4
+	vpxor	%ymm2,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$31,%ymm4,%ymm8
+	vpslldq	$12,%ymm4,%ymm10
+	vpaddd	%ymm4,%ymm4,%ymm4
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm4,%ymm4
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm4,%ymm4
+	vpxor	%ymm10,%ymm4,%ymm4
+	vpaddd	%ymm11,%ymm4,%ymm9
+	vmovdqu	%ymm9,128(%rsp)
+	vpalignr	$8,%ymm1,%ymm2,%ymm5
+	vpsrldq	$4,%ymm4,%ymm8
+	vpxor	%ymm1,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$31,%ymm5,%ymm8
+	vmovdqu	-32(%r14),%ymm11
+	vpslldq	$12,%ymm5,%ymm10
+	vpaddd	%ymm5,%ymm5,%ymm5
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm5,%ymm5
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm10,%ymm5,%ymm5
+	vpaddd	%ymm11,%ymm5,%ymm9
+	vmovdqu	%ymm9,160(%rsp)
+	vpalignr	$8,%ymm2,%ymm3,%ymm6
+	vpsrldq	$4,%ymm5,%ymm8
+	vpxor	%ymm2,%ymm6,%ymm6
+	vpxor	%ymm4,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$31,%ymm6,%ymm8
+	vpslldq	$12,%ymm6,%ymm10
+	vpaddd	%ymm6,%ymm6,%ymm6
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm6,%ymm6
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm6,%ymm6
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpaddd	%ymm11,%ymm6,%ymm9
+	vmovdqu	%ymm9,192(%rsp)
+	vpalignr	$8,%ymm3,%ymm4,%ymm7
+	vpsrldq	$4,%ymm6,%ymm8
+	vpxor	%ymm3,%ymm7,%ymm7
+	vpxor	%ymm5,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$31,%ymm7,%ymm8
+	vpslldq	$12,%ymm7,%ymm10
+	vpaddd	%ymm7,%ymm7,%ymm7
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm7,%ymm7
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm7,%ymm7
+	vpaddd	%ymm11,%ymm7,%ymm9
+	vmovdqu	%ymm9,224(%rsp)
+	leaq	128(%rsp),%r13
+	jmp	L$oop_avx2
+.p2align	5
+L$oop_avx2:
+	rorxl	$2,%ebp,%ebx
+	andnl	%edx,%ebp,%edi
+	andl	%ecx,%ebp
+	xorl	%edi,%ebp
+	jmp	L$align32_1
+.p2align	5
+L$align32_1:
+	vpalignr	$8,%ymm6,%ymm7,%ymm8
+	vpxor	%ymm4,%ymm0,%ymm0
+	addl	-128(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	vpxor	%ymm1,%ymm0,%ymm0
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	vpxor	%ymm8,%ymm0,%ymm0
+	andl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	vpsrld	$30,%ymm0,%ymm8
+	vpslld	$2,%ymm0,%ymm0
+	addl	-124(%r13),%edx
+	andnl	%ebx,%esi,%edi
+	addl	%eax,%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	andl	%ebp,%esi
+	vpor	%ymm8,%ymm0,%ymm0
+	addl	%r12d,%edx
+	xorl	%edi,%esi
+	addl	-120(%r13),%ecx
+	andnl	%ebp,%edx,%edi
+	vpaddd	%ymm11,%ymm0,%ymm9
+	addl	%esi,%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	andl	%eax,%edx
+	vmovdqu	%ymm9,256(%rsp)
+	addl	%r12d,%ecx
+	xorl	%edi,%edx
+	addl	-116(%r13),%ebx
+	andnl	%eax,%ecx,%edi
+	addl	%edx,%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	andl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%edi,%ecx
+	addl	-96(%r13),%ebp
+	andnl	%esi,%ebx,%edi
+	addl	%ecx,%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	andl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%edi,%ebx
+	vpalignr	$8,%ymm7,%ymm0,%ymm8
+	vpxor	%ymm5,%ymm1,%ymm1
+	addl	-92(%r13),%eax
+	andnl	%edx,%ebp,%edi
+	vpxor	%ymm2,%ymm1,%ymm1
+	addl	%ebx,%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	vpxor	%ymm8,%ymm1,%ymm1
+	andl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edi,%ebp
+	vpsrld	$30,%ymm1,%ymm8
+	vpslld	$2,%ymm1,%ymm1
+	addl	-88(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	vpor	%ymm8,%ymm1,%ymm1
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	-84(%r13),%edx
+	andnl	%ebx,%esi,%edi
+	vpaddd	%ymm11,%ymm1,%ymm9
+	addl	%eax,%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	andl	%ebp,%esi
+	vmovdqu	%ymm9,288(%rsp)
+	addl	%r12d,%edx
+	xorl	%edi,%esi
+	addl	-64(%r13),%ecx
+	andnl	%ebp,%edx,%edi
+	addl	%esi,%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	andl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%edi,%edx
+	addl	-60(%r13),%ebx
+	andnl	%eax,%ecx,%edi
+	addl	%edx,%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	andl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%edi,%ecx
+	vpalignr	$8,%ymm0,%ymm1,%ymm8
+	vpxor	%ymm6,%ymm2,%ymm2
+	addl	-56(%r13),%ebp
+	andnl	%esi,%ebx,%edi
+	vpxor	%ymm3,%ymm2,%ymm2
+	vmovdqu	0(%r14),%ymm11
+	addl	%ecx,%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	vpxor	%ymm8,%ymm2,%ymm2
+	andl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%edi,%ebx
+	vpsrld	$30,%ymm2,%ymm8
+	vpslld	$2,%ymm2,%ymm2
+	addl	-52(%r13),%eax
+	andnl	%edx,%ebp,%edi
+	addl	%ebx,%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	andl	%ecx,%ebp
+	vpor	%ymm8,%ymm2,%ymm2
+	addl	%r12d,%eax
+	xorl	%edi,%ebp
+	addl	-32(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	vpaddd	%ymm11,%ymm2,%ymm9
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	vmovdqu	%ymm9,320(%rsp)
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	-28(%r13),%edx
+	andnl	%ebx,%esi,%edi
+	addl	%eax,%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	andl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%edi,%esi
+	addl	-24(%r13),%ecx
+	andnl	%ebp,%edx,%edi
+	addl	%esi,%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	andl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%edi,%edx
+	vpalignr	$8,%ymm1,%ymm2,%ymm8
+	vpxor	%ymm7,%ymm3,%ymm3
+	addl	-20(%r13),%ebx
+	andnl	%eax,%ecx,%edi
+	vpxor	%ymm4,%ymm3,%ymm3
+	addl	%edx,%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	vpxor	%ymm8,%ymm3,%ymm3
+	andl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%edi,%ecx
+	vpsrld	$30,%ymm3,%ymm8
+	vpslld	$2,%ymm3,%ymm3
+	addl	0(%r13),%ebp
+	andnl	%esi,%ebx,%edi
+	addl	%ecx,%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	andl	%edx,%ebx
+	vpor	%ymm8,%ymm3,%ymm3
+	addl	%r12d,%ebp
+	xorl	%edi,%ebx
+	addl	4(%r13),%eax
+	andnl	%edx,%ebp,%edi
+	vpaddd	%ymm11,%ymm3,%ymm9
+	addl	%ebx,%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	andl	%ecx,%ebp
+	vmovdqu	%ymm9,352(%rsp)
+	addl	%r12d,%eax
+	xorl	%edi,%ebp
+	addl	8(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	12(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	vpalignr	$8,%ymm2,%ymm3,%ymm8
+	vpxor	%ymm0,%ymm4,%ymm4
+	addl	32(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	vpxor	%ymm5,%ymm4,%ymm4
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	vpxor	%ymm8,%ymm4,%ymm4
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	36(%r13),%ebx
+	vpsrld	$30,%ymm4,%ymm8
+	vpslld	$2,%ymm4,%ymm4
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	vpor	%ymm8,%ymm4,%ymm4
+	addl	40(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	vpaddd	%ymm11,%ymm4,%ymm9
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	44(%r13),%eax
+	vmovdqu	%ymm9,384(%rsp)
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	64(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	vpalignr	$8,%ymm3,%ymm4,%ymm8
+	vpxor	%ymm1,%ymm5,%ymm5
+	addl	68(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	vpxor	%ymm6,%ymm5,%ymm5
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	vpxor	%ymm8,%ymm5,%ymm5
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	72(%r13),%ecx
+	vpsrld	$30,%ymm5,%ymm8
+	vpslld	$2,%ymm5,%ymm5
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	vpor	%ymm8,%ymm5,%ymm5
+	addl	76(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	vpaddd	%ymm11,%ymm5,%ymm9
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	96(%r13),%ebp
+	vmovdqu	%ymm9,416(%rsp)
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	100(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	vpalignr	$8,%ymm4,%ymm5,%ymm8
+	vpxor	%ymm2,%ymm6,%ymm6
+	addl	104(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	vpxor	%ymm7,%ymm6,%ymm6
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	vpxor	%ymm8,%ymm6,%ymm6
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	108(%r13),%edx
+	leaq	256(%r13),%r13
+	vpsrld	$30,%ymm6,%ymm8
+	vpslld	$2,%ymm6,%ymm6
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	vpor	%ymm8,%ymm6,%ymm6
+	addl	-128(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	vpaddd	%ymm11,%ymm6,%ymm9
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-124(%r13),%ebx
+	vmovdqu	%ymm9,448(%rsp)
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	-120(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	vpalignr	$8,%ymm5,%ymm6,%ymm8
+	vpxor	%ymm3,%ymm7,%ymm7
+	addl	-116(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	vpxor	%ymm0,%ymm7,%ymm7
+	vmovdqu	32(%r14),%ymm11
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	vpxor	%ymm8,%ymm7,%ymm7
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	-96(%r13),%esi
+	vpsrld	$30,%ymm7,%ymm8
+	vpslld	$2,%ymm7,%ymm7
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	vpor	%ymm8,%ymm7,%ymm7
+	addl	-92(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	vpaddd	%ymm11,%ymm7,%ymm9
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	-88(%r13),%ecx
+	vmovdqu	%ymm9,480(%rsp)
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-84(%r13),%ebx
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	jmp	L$align32_2
+.p2align	5
+L$align32_2:
+	vpalignr	$8,%ymm6,%ymm7,%ymm8
+	vpxor	%ymm4,%ymm0,%ymm0
+	addl	-64(%r13),%ebp
+	xorl	%esi,%ecx
+	vpxor	%ymm1,%ymm0,%ymm0
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	vpxor	%ymm8,%ymm0,%ymm0
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	vpsrld	$30,%ymm0,%ymm8
+	vpslld	$2,%ymm0,%ymm0
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	-60(%r13),%eax
+	xorl	%edx,%ebx
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	vpor	%ymm8,%ymm0,%ymm0
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	vpaddd	%ymm11,%ymm0,%ymm9
+	addl	%r12d,%eax
+	andl	%edi,%ebp
+	addl	-56(%r13),%esi
+	xorl	%ecx,%ebp
+	vmovdqu	%ymm9,512(%rsp)
+	movl	%ebx,%edi
+	xorl	%ecx,%edi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	andl	%edi,%eax
+	addl	-52(%r13),%edx
+	xorl	%ebx,%eax
+	movl	%ebp,%edi
+	xorl	%ebx,%edi
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	andl	%edi,%esi
+	addl	-32(%r13),%ecx
+	xorl	%ebp,%esi
+	movl	%eax,%edi
+	xorl	%ebp,%edi
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	andl	%edi,%edx
+	vpalignr	$8,%ymm7,%ymm0,%ymm8
+	vpxor	%ymm5,%ymm1,%ymm1
+	addl	-28(%r13),%ebx
+	xorl	%eax,%edx
+	vpxor	%ymm2,%ymm1,%ymm1
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	vpxor	%ymm8,%ymm1,%ymm1
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	vpsrld	$30,%ymm1,%ymm8
+	vpslld	$2,%ymm1,%ymm1
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	addl	-24(%r13),%ebp
+	xorl	%esi,%ecx
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	vpor	%ymm8,%ymm1,%ymm1
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	vpaddd	%ymm11,%ymm1,%ymm9
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	-20(%r13),%eax
+	xorl	%edx,%ebx
+	vmovdqu	%ymm9,544(%rsp)
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	andl	%edi,%ebp
+	addl	0(%r13),%esi
+	xorl	%ecx,%ebp
+	movl	%ebx,%edi
+	xorl	%ecx,%edi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	andl	%edi,%eax
+	addl	4(%r13),%edx
+	xorl	%ebx,%eax
+	movl	%ebp,%edi
+	xorl	%ebx,%edi
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	andl	%edi,%esi
+	vpalignr	$8,%ymm0,%ymm1,%ymm8
+	vpxor	%ymm6,%ymm2,%ymm2
+	addl	8(%r13),%ecx
+	xorl	%ebp,%esi
+	vpxor	%ymm3,%ymm2,%ymm2
+	movl	%eax,%edi
+	xorl	%ebp,%edi
+	leal	(%rcx,%rsi,1),%ecx
+	vpxor	%ymm8,%ymm2,%ymm2
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	vpsrld	$30,%ymm2,%ymm8
+	vpslld	$2,%ymm2,%ymm2
+	addl	%r12d,%ecx
+	andl	%edi,%edx
+	addl	12(%r13),%ebx
+	xorl	%eax,%edx
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	vpor	%ymm8,%ymm2,%ymm2
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	vpaddd	%ymm11,%ymm2,%ymm9
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	addl	32(%r13),%ebp
+	xorl	%esi,%ecx
+	vmovdqu	%ymm9,576(%rsp)
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	36(%r13),%eax
+	xorl	%edx,%ebx
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	andl	%edi,%ebp
+	addl	40(%r13),%esi
+	xorl	%ecx,%ebp
+	movl	%ebx,%edi
+	xorl	%ecx,%edi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	andl	%edi,%eax
+	vpalignr	$8,%ymm1,%ymm2,%ymm8
+	vpxor	%ymm7,%ymm3,%ymm3
+	addl	44(%r13),%edx
+	xorl	%ebx,%eax
+	vpxor	%ymm4,%ymm3,%ymm3
+	movl	%ebp,%edi
+	xorl	%ebx,%edi
+	leal	(%rdx,%rax,1),%edx
+	vpxor	%ymm8,%ymm3,%ymm3
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	vpsrld	$30,%ymm3,%ymm8
+	vpslld	$2,%ymm3,%ymm3
+	addl	%r12d,%edx
+	andl	%edi,%esi
+	addl	64(%r13),%ecx
+	xorl	%ebp,%esi
+	movl	%eax,%edi
+	xorl	%ebp,%edi
+	vpor	%ymm8,%ymm3,%ymm3
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	vpaddd	%ymm11,%ymm3,%ymm9
+	addl	%r12d,%ecx
+	andl	%edi,%edx
+	addl	68(%r13),%ebx
+	xorl	%eax,%edx
+	vmovdqu	%ymm9,608(%rsp)
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	addl	72(%r13),%ebp
+	xorl	%esi,%ecx
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	76(%r13),%eax
+	xorl	%edx,%ebx
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	96(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	100(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	104(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	108(%r13),%ebx
+	leaq	256(%r13),%r13
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	-128(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	-124(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	-120(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	-116(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	-96(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-92(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	-88(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	-84(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	-64(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	-60(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	-56(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-52(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	-32(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	-28(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	-24(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	-20(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	addl	%r12d,%edx
+	leaq	128(%r9),%r13
+	leaq	128(%r9),%rdi
+	cmpq	%r10,%r13
+	cmovaeq	%r9,%r13
+
+
+	addl	0(%r8),%edx
+	addl	4(%r8),%esi
+	addl	8(%r8),%ebp
+	movl	%edx,0(%r8)
+	addl	12(%r8),%ebx
+	movl	%esi,4(%r8)
+	movl	%edx,%eax
+	addl	16(%r8),%ecx
+	movl	%ebp,%r12d
+	movl	%ebp,8(%r8)
+	movl	%ebx,%edx
+
+	movl	%ebx,12(%r8)
+	movl	%esi,%ebp
+	movl	%ecx,16(%r8)
+
+	movl	%ecx,%esi
+	movl	%r12d,%ecx
+
+
+	cmpq	%r10,%r9
+	je	L$done_avx2
+	vmovdqu	64(%r14),%ymm6
+	cmpq	%r10,%rdi
+	ja	L$ast_avx2
+
+	vmovdqu	-64(%rdi),%xmm0
+	vmovdqu	-48(%rdi),%xmm1
+	vmovdqu	-32(%rdi),%xmm2
+	vmovdqu	-16(%rdi),%xmm3
+	vinserti128	$1,0(%r13),%ymm0,%ymm0
+	vinserti128	$1,16(%r13),%ymm1,%ymm1
+	vinserti128	$1,32(%r13),%ymm2,%ymm2
+	vinserti128	$1,48(%r13),%ymm3,%ymm3
+	jmp	L$ast_avx2
+
+.p2align	5
+L$ast_avx2:
+	leaq	128+16(%rsp),%r13
+	rorxl	$2,%ebp,%ebx
+	andnl	%edx,%ebp,%edi
+	andl	%ecx,%ebp
+	xorl	%edi,%ebp
+	subq	$-128,%r9
+	addl	-128(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	-124(%r13),%edx
+	andnl	%ebx,%esi,%edi
+	addl	%eax,%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	andl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%edi,%esi
+	addl	-120(%r13),%ecx
+	andnl	%ebp,%edx,%edi
+	addl	%esi,%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	andl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%edi,%edx
+	addl	-116(%r13),%ebx
+	andnl	%eax,%ecx,%edi
+	addl	%edx,%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	andl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%edi,%ecx
+	addl	-96(%r13),%ebp
+	andnl	%esi,%ebx,%edi
+	addl	%ecx,%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	andl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%edi,%ebx
+	addl	-92(%r13),%eax
+	andnl	%edx,%ebp,%edi
+	addl	%ebx,%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	andl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edi,%ebp
+	addl	-88(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	-84(%r13),%edx
+	andnl	%ebx,%esi,%edi
+	addl	%eax,%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	andl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%edi,%esi
+	addl	-64(%r13),%ecx
+	andnl	%ebp,%edx,%edi
+	addl	%esi,%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	andl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%edi,%edx
+	addl	-60(%r13),%ebx
+	andnl	%eax,%ecx,%edi
+	addl	%edx,%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	andl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%edi,%ecx
+	addl	-56(%r13),%ebp
+	andnl	%esi,%ebx,%edi
+	addl	%ecx,%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	andl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%edi,%ebx
+	addl	-52(%r13),%eax
+	andnl	%edx,%ebp,%edi
+	addl	%ebx,%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	andl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edi,%ebp
+	addl	-32(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	-28(%r13),%edx
+	andnl	%ebx,%esi,%edi
+	addl	%eax,%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	andl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%edi,%esi
+	addl	-24(%r13),%ecx
+	andnl	%ebp,%edx,%edi
+	addl	%esi,%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	andl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%edi,%edx
+	addl	-20(%r13),%ebx
+	andnl	%eax,%ecx,%edi
+	addl	%edx,%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	andl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%edi,%ecx
+	addl	0(%r13),%ebp
+	andnl	%esi,%ebx,%edi
+	addl	%ecx,%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	andl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%edi,%ebx
+	addl	4(%r13),%eax
+	andnl	%edx,%ebp,%edi
+	addl	%ebx,%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	andl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edi,%ebp
+	addl	8(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	12(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	32(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	36(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	40(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	44(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	64(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	vmovdqu	-64(%r14),%ymm11
+	vpshufb	%ymm6,%ymm0,%ymm0
+	addl	68(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	72(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	76(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	96(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	100(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	vpshufb	%ymm6,%ymm1,%ymm1
+	vpaddd	%ymm11,%ymm0,%ymm8
+	addl	104(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	108(%r13),%edx
+	leaq	256(%r13),%r13
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	-128(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-124(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	-120(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	vmovdqu	%ymm8,0(%rsp)
+	vpshufb	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm11,%ymm1,%ymm9
+	addl	-116(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	-96(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	-92(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	-88(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-84(%r13),%ebx
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	vmovdqu	%ymm9,32(%rsp)
+	vpshufb	%ymm6,%ymm3,%ymm3
+	vpaddd	%ymm11,%ymm2,%ymm6
+	addl	-64(%r13),%ebp
+	xorl	%esi,%ecx
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	-60(%r13),%eax
+	xorl	%edx,%ebx
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	andl	%edi,%ebp
+	addl	-56(%r13),%esi
+	xorl	%ecx,%ebp
+	movl	%ebx,%edi
+	xorl	%ecx,%edi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	andl	%edi,%eax
+	addl	-52(%r13),%edx
+	xorl	%ebx,%eax
+	movl	%ebp,%edi
+	xorl	%ebx,%edi
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	andl	%edi,%esi
+	addl	-32(%r13),%ecx
+	xorl	%ebp,%esi
+	movl	%eax,%edi
+	xorl	%ebp,%edi
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	andl	%edi,%edx
+	jmp	L$align32_3
+.p2align	5
+L$align32_3:
+	vmovdqu	%ymm6,64(%rsp)
+	vpaddd	%ymm11,%ymm3,%ymm7
+	addl	-28(%r13),%ebx
+	xorl	%eax,%edx
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	addl	-24(%r13),%ebp
+	xorl	%esi,%ecx
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	-20(%r13),%eax
+	xorl	%edx,%ebx
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	andl	%edi,%ebp
+	addl	0(%r13),%esi
+	xorl	%ecx,%ebp
+	movl	%ebx,%edi
+	xorl	%ecx,%edi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	andl	%edi,%eax
+	addl	4(%r13),%edx
+	xorl	%ebx,%eax
+	movl	%ebp,%edi
+	xorl	%ebx,%edi
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	andl	%edi,%esi
+	vmovdqu	%ymm7,96(%rsp)
+	addl	8(%r13),%ecx
+	xorl	%ebp,%esi
+	movl	%eax,%edi
+	xorl	%ebp,%edi
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	andl	%edi,%edx
+	addl	12(%r13),%ebx
+	xorl	%eax,%edx
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	addl	32(%r13),%ebp
+	xorl	%esi,%ecx
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	36(%r13),%eax
+	xorl	%edx,%ebx
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	andl	%edi,%ebp
+	addl	40(%r13),%esi
+	xorl	%ecx,%ebp
+	movl	%ebx,%edi
+	xorl	%ecx,%edi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	andl	%edi,%eax
+	vpalignr	$8,%ymm0,%ymm1,%ymm4
+	addl	44(%r13),%edx
+	xorl	%ebx,%eax
+	movl	%ebp,%edi
+	xorl	%ebx,%edi
+	vpsrldq	$4,%ymm3,%ymm8
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	vpxor	%ymm0,%ymm4,%ymm4
+	vpxor	%ymm2,%ymm8,%ymm8
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	vpxor	%ymm8,%ymm4,%ymm4
+	andl	%edi,%esi
+	addl	64(%r13),%ecx
+	xorl	%ebp,%esi
+	movl	%eax,%edi
+	vpsrld	$31,%ymm4,%ymm8
+	xorl	%ebp,%edi
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	vpslldq	$12,%ymm4,%ymm10
+	vpaddd	%ymm4,%ymm4,%ymm4
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm4,%ymm4
+	addl	%r12d,%ecx
+	andl	%edi,%edx
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm4,%ymm4
+	addl	68(%r13),%ebx
+	xorl	%eax,%edx
+	vpxor	%ymm10,%ymm4,%ymm4
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	vpaddd	%ymm11,%ymm4,%ymm9
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	vmovdqu	%ymm9,128(%rsp)
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	addl	72(%r13),%ebp
+	xorl	%esi,%ecx
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	76(%r13),%eax
+	xorl	%edx,%ebx
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	vpalignr	$8,%ymm1,%ymm2,%ymm5
+	addl	96(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	vpsrldq	$4,%ymm4,%ymm8
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	vpxor	%ymm1,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm8,%ymm8
+	addl	100(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	vpxor	%ymm8,%ymm5,%ymm5
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	vpsrld	$31,%ymm5,%ymm8
+	vmovdqu	-32(%r14),%ymm11
+	xorl	%ebx,%esi
+	addl	104(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	vpslldq	$12,%ymm5,%ymm10
+	vpaddd	%ymm5,%ymm5,%ymm5
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm5,%ymm5
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm5,%ymm5
+	xorl	%ebp,%edx
+	addl	108(%r13),%ebx
+	leaq	256(%r13),%r13
+	vpxor	%ymm10,%ymm5,%ymm5
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	vpaddd	%ymm11,%ymm5,%ymm9
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	vmovdqu	%ymm9,160(%rsp)
+	addl	-128(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	vpalignr	$8,%ymm2,%ymm3,%ymm6
+	addl	-124(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	vpsrldq	$4,%ymm5,%ymm8
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	vpxor	%ymm2,%ymm6,%ymm6
+	vpxor	%ymm4,%ymm8,%ymm8
+	addl	-120(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	vpxor	%ymm8,%ymm6,%ymm6
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	vpsrld	$31,%ymm6,%ymm8
+	xorl	%ecx,%eax
+	addl	-116(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	vpslldq	$12,%ymm6,%ymm10
+	vpaddd	%ymm6,%ymm6,%ymm6
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm6,%ymm6
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm6,%ymm6
+	xorl	%ebx,%esi
+	addl	-96(%r13),%ecx
+	vpxor	%ymm10,%ymm6,%ymm6
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	vpaddd	%ymm11,%ymm6,%ymm9
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	vmovdqu	%ymm9,192(%rsp)
+	addl	-92(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	vpalignr	$8,%ymm3,%ymm4,%ymm7
+	addl	-88(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	vpsrldq	$4,%ymm6,%ymm8
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	vpxor	%ymm3,%ymm7,%ymm7
+	vpxor	%ymm5,%ymm8,%ymm8
+	addl	-84(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	vpxor	%ymm8,%ymm7,%ymm7
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	vpsrld	$31,%ymm7,%ymm8
+	xorl	%edx,%ebp
+	addl	-64(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	vpslldq	$12,%ymm7,%ymm10
+	vpaddd	%ymm7,%ymm7,%ymm7
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm7,%ymm7
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm7,%ymm7
+	xorl	%ecx,%eax
+	addl	-60(%r13),%edx
+	vpxor	%ymm10,%ymm7,%ymm7
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	vpaddd	%ymm11,%ymm7,%ymm9
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	vmovdqu	%ymm9,224(%rsp)
+	addl	-56(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-52(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	-32(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	-28(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	-24(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	-20(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	addl	%r12d,%edx
+	leaq	128(%rsp),%r13
+
+
+	addl	0(%r8),%edx
+	addl	4(%r8),%esi
+	addl	8(%r8),%ebp
+	movl	%edx,0(%r8)
+	addl	12(%r8),%ebx
+	movl	%esi,4(%r8)
+	movl	%edx,%eax
+	addl	16(%r8),%ecx
+	movl	%ebp,%r12d
+	movl	%ebp,8(%r8)
+	movl	%ebx,%edx
+
+	movl	%ebx,12(%r8)
+	movl	%esi,%ebp
+	movl	%ecx,16(%r8)
+
+	movl	%ecx,%esi
+	movl	%r12d,%ecx
+
+
+	cmpq	%r10,%r9
+	jbe	L$oop_avx2
+
+L$done_avx2:
+	vzeroupper
+	movq	-40(%r11),%r14
+
+	movq	-32(%r11),%r13
+
+	movq	-24(%r11),%r12
+
+	movq	-16(%r11),%rbp
+
+	movq	-8(%r11),%rbx
+
+	leaq	(%r11),%rsp
+
+L$epilogue_avx2:
+	ret
+
+
+.section	__DATA,__const
+.p2align	6
+K_XX_XX:
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
+.byte	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.p2align	6
+.text	
+#endif
diff --git a/gen/bcm/sha1-x86_64-linux.S b/gen/bcm/sha1-x86_64-linux.S
new file mode 100644
index 0000000..39d9ad3
--- /dev/null
+++ b/gen/bcm/sha1-x86_64-linux.S
@@ -0,0 +1,5450 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text	
+
+.globl	sha1_block_data_order_nohw
+.hidden sha1_block_data_order_nohw
+.type	sha1_block_data_order_nohw,@function
+.align	16
+sha1_block_data_order_nohw:
+.cfi_startproc	
+_CET_ENDBR
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	movq	%rdi,%r8
+	subq	$72,%rsp
+	movq	%rsi,%r9
+	andq	$-64,%rsp
+	movq	%rdx,%r10
+	movq	%rax,64(%rsp)
+.cfi_escape	0x0f,0x06,0x77,0xc0,0x00,0x06,0x23,0x08
+.Lprologue:
+
+	movl	0(%r8),%esi
+	movl	4(%r8),%edi
+	movl	8(%r8),%r11d
+	movl	12(%r8),%r12d
+	movl	16(%r8),%r13d
+	jmp	.Lloop
+
+.align	16
+.Lloop:
+	movl	0(%r9),%edx
+	bswapl	%edx
+	movl	4(%r9),%ebp
+	movl	%r12d,%eax
+	movl	%edx,0(%rsp)
+	movl	%esi,%ecx
+	bswapl	%ebp
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	andl	%edi,%eax
+	leal	1518500249(%rdx,%r13,1),%r13d
+	addl	%ecx,%r13d
+	xorl	%r12d,%eax
+	roll	$30,%edi
+	addl	%eax,%r13d
+	movl	8(%r9),%r14d
+	movl	%r11d,%eax
+	movl	%ebp,4(%rsp)
+	movl	%r13d,%ecx
+	bswapl	%r14d
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	andl	%esi,%eax
+	leal	1518500249(%rbp,%r12,1),%r12d
+	addl	%ecx,%r12d
+	xorl	%r11d,%eax
+	roll	$30,%esi
+	addl	%eax,%r12d
+	movl	12(%r9),%edx
+	movl	%edi,%eax
+	movl	%r14d,8(%rsp)
+	movl	%r12d,%ecx
+	bswapl	%edx
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	andl	%r13d,%eax
+	leal	1518500249(%r14,%r11,1),%r11d
+	addl	%ecx,%r11d
+	xorl	%edi,%eax
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	movl	16(%r9),%ebp
+	movl	%esi,%eax
+	movl	%edx,12(%rsp)
+	movl	%r11d,%ecx
+	bswapl	%ebp
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	andl	%r12d,%eax
+	leal	1518500249(%rdx,%rdi,1),%edi
+	addl	%ecx,%edi
+	xorl	%esi,%eax
+	roll	$30,%r12d
+	addl	%eax,%edi
+	movl	20(%r9),%r14d
+	movl	%r13d,%eax
+	movl	%ebp,16(%rsp)
+	movl	%edi,%ecx
+	bswapl	%r14d
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	andl	%r11d,%eax
+	leal	1518500249(%rbp,%rsi,1),%esi
+	addl	%ecx,%esi
+	xorl	%r13d,%eax
+	roll	$30,%r11d
+	addl	%eax,%esi
+	movl	24(%r9),%edx
+	movl	%r12d,%eax
+	movl	%r14d,20(%rsp)
+	movl	%esi,%ecx
+	bswapl	%edx
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	andl	%edi,%eax
+	leal	1518500249(%r14,%r13,1),%r13d
+	addl	%ecx,%r13d
+	xorl	%r12d,%eax
+	roll	$30,%edi
+	addl	%eax,%r13d
+	movl	28(%r9),%ebp
+	movl	%r11d,%eax
+	movl	%edx,24(%rsp)
+	movl	%r13d,%ecx
+	bswapl	%ebp
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	andl	%esi,%eax
+	leal	1518500249(%rdx,%r12,1),%r12d
+	addl	%ecx,%r12d
+	xorl	%r11d,%eax
+	roll	$30,%esi
+	addl	%eax,%r12d
+	movl	32(%r9),%r14d
+	movl	%edi,%eax
+	movl	%ebp,28(%rsp)
+	movl	%r12d,%ecx
+	bswapl	%r14d
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	andl	%r13d,%eax
+	leal	1518500249(%rbp,%r11,1),%r11d
+	addl	%ecx,%r11d
+	xorl	%edi,%eax
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	movl	36(%r9),%edx
+	movl	%esi,%eax
+	movl	%r14d,32(%rsp)
+	movl	%r11d,%ecx
+	bswapl	%edx
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	andl	%r12d,%eax
+	leal	1518500249(%r14,%rdi,1),%edi
+	addl	%ecx,%edi
+	xorl	%esi,%eax
+	roll	$30,%r12d
+	addl	%eax,%edi
+	movl	40(%r9),%ebp
+	movl	%r13d,%eax
+	movl	%edx,36(%rsp)
+	movl	%edi,%ecx
+	bswapl	%ebp
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	andl	%r11d,%eax
+	leal	1518500249(%rdx,%rsi,1),%esi
+	addl	%ecx,%esi
+	xorl	%r13d,%eax
+	roll	$30,%r11d
+	addl	%eax,%esi
+	movl	44(%r9),%r14d
+	movl	%r12d,%eax
+	movl	%ebp,40(%rsp)
+	movl	%esi,%ecx
+	bswapl	%r14d
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	andl	%edi,%eax
+	leal	1518500249(%rbp,%r13,1),%r13d
+	addl	%ecx,%r13d
+	xorl	%r12d,%eax
+	roll	$30,%edi
+	addl	%eax,%r13d
+	movl	48(%r9),%edx
+	movl	%r11d,%eax
+	movl	%r14d,44(%rsp)
+	movl	%r13d,%ecx
+	bswapl	%edx
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	andl	%esi,%eax
+	leal	1518500249(%r14,%r12,1),%r12d
+	addl	%ecx,%r12d
+	xorl	%r11d,%eax
+	roll	$30,%esi
+	addl	%eax,%r12d
+	movl	52(%r9),%ebp
+	movl	%edi,%eax
+	movl	%edx,48(%rsp)
+	movl	%r12d,%ecx
+	bswapl	%ebp
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	andl	%r13d,%eax
+	leal	1518500249(%rdx,%r11,1),%r11d
+	addl	%ecx,%r11d
+	xorl	%edi,%eax
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	movl	56(%r9),%r14d
+	movl	%esi,%eax
+	movl	%ebp,52(%rsp)
+	movl	%r11d,%ecx
+	bswapl	%r14d
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	andl	%r12d,%eax
+	leal	1518500249(%rbp,%rdi,1),%edi
+	addl	%ecx,%edi
+	xorl	%esi,%eax
+	roll	$30,%r12d
+	addl	%eax,%edi
+	movl	60(%r9),%edx
+	movl	%r13d,%eax
+	movl	%r14d,56(%rsp)
+	movl	%edi,%ecx
+	bswapl	%edx
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	andl	%r11d,%eax
+	leal	1518500249(%r14,%rsi,1),%esi
+	addl	%ecx,%esi
+	xorl	%r13d,%eax
+	roll	$30,%r11d
+	addl	%eax,%esi
+	xorl	0(%rsp),%ebp
+	movl	%r12d,%eax
+	movl	%edx,60(%rsp)
+	movl	%esi,%ecx
+	xorl	8(%rsp),%ebp
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	32(%rsp),%ebp
+	andl	%edi,%eax
+	leal	1518500249(%rdx,%r13,1),%r13d
+	roll	$30,%edi
+	xorl	%r12d,%eax
+	addl	%ecx,%r13d
+	roll	$1,%ebp
+	addl	%eax,%r13d
+	xorl	4(%rsp),%r14d
+	movl	%r11d,%eax
+	movl	%ebp,0(%rsp)
+	movl	%r13d,%ecx
+	xorl	12(%rsp),%r14d
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	36(%rsp),%r14d
+	andl	%esi,%eax
+	leal	1518500249(%rbp,%r12,1),%r12d
+	roll	$30,%esi
+	xorl	%r11d,%eax
+	addl	%ecx,%r12d
+	roll	$1,%r14d
+	addl	%eax,%r12d
+	xorl	8(%rsp),%edx
+	movl	%edi,%eax
+	movl	%r14d,4(%rsp)
+	movl	%r12d,%ecx
+	xorl	16(%rsp),%edx
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	40(%rsp),%edx
+	andl	%r13d,%eax
+	leal	1518500249(%r14,%r11,1),%r11d
+	roll	$30,%r13d
+	xorl	%edi,%eax
+	addl	%ecx,%r11d
+	roll	$1,%edx
+	addl	%eax,%r11d
+	xorl	12(%rsp),%ebp
+	movl	%esi,%eax
+	movl	%edx,8(%rsp)
+	movl	%r11d,%ecx
+	xorl	20(%rsp),%ebp
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	xorl	44(%rsp),%ebp
+	andl	%r12d,%eax
+	leal	1518500249(%rdx,%rdi,1),%edi
+	roll	$30,%r12d
+	xorl	%esi,%eax
+	addl	%ecx,%edi
+	roll	$1,%ebp
+	addl	%eax,%edi
+	xorl	16(%rsp),%r14d
+	movl	%r13d,%eax
+	movl	%ebp,12(%rsp)
+	movl	%edi,%ecx
+	xorl	24(%rsp),%r14d
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	48(%rsp),%r14d
+	andl	%r11d,%eax
+	leal	1518500249(%rbp,%rsi,1),%esi
+	roll	$30,%r11d
+	xorl	%r13d,%eax
+	addl	%ecx,%esi
+	roll	$1,%r14d
+	addl	%eax,%esi
+	xorl	20(%rsp),%edx
+	movl	%edi,%eax
+	movl	%r14d,16(%rsp)
+	movl	%esi,%ecx
+	xorl	28(%rsp),%edx
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	52(%rsp),%edx
+	leal	1859775393(%r14,%r13,1),%r13d
+	xorl	%r11d,%eax
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%edx
+	xorl	24(%rsp),%ebp
+	movl	%esi,%eax
+	movl	%edx,20(%rsp)
+	movl	%r13d,%ecx
+	xorl	32(%rsp),%ebp
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	56(%rsp),%ebp
+	leal	1859775393(%rdx,%r12,1),%r12d
+	xorl	%edi,%eax
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%ebp
+	xorl	28(%rsp),%r14d
+	movl	%r13d,%eax
+	movl	%ebp,24(%rsp)
+	movl	%r12d,%ecx
+	xorl	36(%rsp),%r14d
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	60(%rsp),%r14d
+	leal	1859775393(%rbp,%r11,1),%r11d
+	xorl	%esi,%eax
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%r14d
+	xorl	32(%rsp),%edx
+	movl	%r12d,%eax
+	movl	%r14d,28(%rsp)
+	movl	%r11d,%ecx
+	xorl	40(%rsp),%edx
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	0(%rsp),%edx
+	leal	1859775393(%r14,%rdi,1),%edi
+	xorl	%r13d,%eax
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%edx
+	xorl	36(%rsp),%ebp
+	movl	%r11d,%eax
+	movl	%edx,32(%rsp)
+	movl	%edi,%ecx
+	xorl	44(%rsp),%ebp
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	xorl	4(%rsp),%ebp
+	leal	1859775393(%rdx,%rsi,1),%esi
+	xorl	%r12d,%eax
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%ebp
+	xorl	40(%rsp),%r14d
+	movl	%edi,%eax
+	movl	%ebp,36(%rsp)
+	movl	%esi,%ecx
+	xorl	48(%rsp),%r14d
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	8(%rsp),%r14d
+	leal	1859775393(%rbp,%r13,1),%r13d
+	xorl	%r11d,%eax
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%r14d
+	xorl	44(%rsp),%edx
+	movl	%esi,%eax
+	movl	%r14d,40(%rsp)
+	movl	%r13d,%ecx
+	xorl	52(%rsp),%edx
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	12(%rsp),%edx
+	leal	1859775393(%r14,%r12,1),%r12d
+	xorl	%edi,%eax
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%edx
+	xorl	48(%rsp),%ebp
+	movl	%r13d,%eax
+	movl	%edx,44(%rsp)
+	movl	%r12d,%ecx
+	xorl	56(%rsp),%ebp
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	16(%rsp),%ebp
+	leal	1859775393(%rdx,%r11,1),%r11d
+	xorl	%esi,%eax
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%ebp
+	xorl	52(%rsp),%r14d
+	movl	%r12d,%eax
+	movl	%ebp,48(%rsp)
+	movl	%r11d,%ecx
+	xorl	60(%rsp),%r14d
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	20(%rsp),%r14d
+	leal	1859775393(%rbp,%rdi,1),%edi
+	xorl	%r13d,%eax
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%r14d
+	xorl	56(%rsp),%edx
+	movl	%r11d,%eax
+	movl	%r14d,52(%rsp)
+	movl	%edi,%ecx
+	xorl	0(%rsp),%edx
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	xorl	24(%rsp),%edx
+	leal	1859775393(%r14,%rsi,1),%esi
+	xorl	%r12d,%eax
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%edx
+	xorl	60(%rsp),%ebp
+	movl	%edi,%eax
+	movl	%edx,56(%rsp)
+	movl	%esi,%ecx
+	xorl	4(%rsp),%ebp
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	28(%rsp),%ebp
+	leal	1859775393(%rdx,%r13,1),%r13d
+	xorl	%r11d,%eax
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%ebp
+	xorl	0(%rsp),%r14d
+	movl	%esi,%eax
+	movl	%ebp,60(%rsp)
+	movl	%r13d,%ecx
+	xorl	8(%rsp),%r14d
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	32(%rsp),%r14d
+	leal	1859775393(%rbp,%r12,1),%r12d
+	xorl	%edi,%eax
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%r14d
+	xorl	4(%rsp),%edx
+	movl	%r13d,%eax
+	movl	%r14d,0(%rsp)
+	movl	%r12d,%ecx
+	xorl	12(%rsp),%edx
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	36(%rsp),%edx
+	leal	1859775393(%r14,%r11,1),%r11d
+	xorl	%esi,%eax
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%edx
+	xorl	8(%rsp),%ebp
+	movl	%r12d,%eax
+	movl	%edx,4(%rsp)
+	movl	%r11d,%ecx
+	xorl	16(%rsp),%ebp
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	40(%rsp),%ebp
+	leal	1859775393(%rdx,%rdi,1),%edi
+	xorl	%r13d,%eax
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%ebp
+	xorl	12(%rsp),%r14d
+	movl	%r11d,%eax
+	movl	%ebp,8(%rsp)
+	movl	%edi,%ecx
+	xorl	20(%rsp),%r14d
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	xorl	44(%rsp),%r14d
+	leal	1859775393(%rbp,%rsi,1),%esi
+	xorl	%r12d,%eax
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%r14d
+	xorl	16(%rsp),%edx
+	movl	%edi,%eax
+	movl	%r14d,12(%rsp)
+	movl	%esi,%ecx
+	xorl	24(%rsp),%edx
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	48(%rsp),%edx
+	leal	1859775393(%r14,%r13,1),%r13d
+	xorl	%r11d,%eax
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%edx
+	xorl	20(%rsp),%ebp
+	movl	%esi,%eax
+	movl	%edx,16(%rsp)
+	movl	%r13d,%ecx
+	xorl	28(%rsp),%ebp
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	52(%rsp),%ebp
+	leal	1859775393(%rdx,%r12,1),%r12d
+	xorl	%edi,%eax
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%ebp
+	xorl	24(%rsp),%r14d
+	movl	%r13d,%eax
+	movl	%ebp,20(%rsp)
+	movl	%r12d,%ecx
+	xorl	32(%rsp),%r14d
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	56(%rsp),%r14d
+	leal	1859775393(%rbp,%r11,1),%r11d
+	xorl	%esi,%eax
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%r14d
+	xorl	28(%rsp),%edx
+	movl	%r12d,%eax
+	movl	%r14d,24(%rsp)
+	movl	%r11d,%ecx
+	xorl	36(%rsp),%edx
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	60(%rsp),%edx
+	leal	1859775393(%r14,%rdi,1),%edi
+	xorl	%r13d,%eax
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%edx
+	xorl	32(%rsp),%ebp
+	movl	%r11d,%eax
+	movl	%edx,28(%rsp)
+	movl	%edi,%ecx
+	xorl	40(%rsp),%ebp
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	xorl	0(%rsp),%ebp
+	leal	1859775393(%rdx,%rsi,1),%esi
+	xorl	%r12d,%eax
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%ebp
+	xorl	36(%rsp),%r14d
+	movl	%r12d,%eax
+	movl	%ebp,32(%rsp)
+	movl	%r12d,%ebx
+	xorl	44(%rsp),%r14d
+	andl	%r11d,%eax
+	movl	%esi,%ecx
+	xorl	4(%rsp),%r14d
+	leal	-1894007588(%rbp,%r13,1),%r13d
+	xorl	%r11d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r13d
+	roll	$1,%r14d
+	andl	%edi,%ebx
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%ebx,%r13d
+	xorl	40(%rsp),%edx
+	movl	%r11d,%eax
+	movl	%r14d,36(%rsp)
+	movl	%r11d,%ebx
+	xorl	48(%rsp),%edx
+	andl	%edi,%eax
+	movl	%r13d,%ecx
+	xorl	8(%rsp),%edx
+	leal	-1894007588(%r14,%r12,1),%r12d
+	xorl	%edi,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r12d
+	roll	$1,%edx
+	andl	%esi,%ebx
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%ebx,%r12d
+	xorl	44(%rsp),%ebp
+	movl	%edi,%eax
+	movl	%edx,40(%rsp)
+	movl	%edi,%ebx
+	xorl	52(%rsp),%ebp
+	andl	%esi,%eax
+	movl	%r12d,%ecx
+	xorl	12(%rsp),%ebp
+	leal	-1894007588(%rdx,%r11,1),%r11d
+	xorl	%esi,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r11d
+	roll	$1,%ebp
+	andl	%r13d,%ebx
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%ebx,%r11d
+	xorl	48(%rsp),%r14d
+	movl	%esi,%eax
+	movl	%ebp,44(%rsp)
+	movl	%esi,%ebx
+	xorl	56(%rsp),%r14d
+	andl	%r13d,%eax
+	movl	%r11d,%ecx
+	xorl	16(%rsp),%r14d
+	leal	-1894007588(%rbp,%rdi,1),%edi
+	xorl	%r13d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%edi
+	roll	$1,%r14d
+	andl	%r12d,%ebx
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%ebx,%edi
+	xorl	52(%rsp),%edx
+	movl	%r13d,%eax
+	movl	%r14d,48(%rsp)
+	movl	%r13d,%ebx
+	xorl	60(%rsp),%edx
+	andl	%r12d,%eax
+	movl	%edi,%ecx
+	xorl	20(%rsp),%edx
+	leal	-1894007588(%r14,%rsi,1),%esi
+	xorl	%r12d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%esi
+	roll	$1,%edx
+	andl	%r11d,%ebx
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%ebx,%esi
+	xorl	56(%rsp),%ebp
+	movl	%r12d,%eax
+	movl	%edx,52(%rsp)
+	movl	%r12d,%ebx
+	xorl	0(%rsp),%ebp
+	andl	%r11d,%eax
+	movl	%esi,%ecx
+	xorl	24(%rsp),%ebp
+	leal	-1894007588(%rdx,%r13,1),%r13d
+	xorl	%r11d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r13d
+	roll	$1,%ebp
+	andl	%edi,%ebx
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%ebx,%r13d
+	xorl	60(%rsp),%r14d
+	movl	%r11d,%eax
+	movl	%ebp,56(%rsp)
+	movl	%r11d,%ebx
+	xorl	4(%rsp),%r14d
+	andl	%edi,%eax
+	movl	%r13d,%ecx
+	xorl	28(%rsp),%r14d
+	leal	-1894007588(%rbp,%r12,1),%r12d
+	xorl	%edi,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r12d
+	roll	$1,%r14d
+	andl	%esi,%ebx
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%ebx,%r12d
+	xorl	0(%rsp),%edx
+	movl	%edi,%eax
+	movl	%r14d,60(%rsp)
+	movl	%edi,%ebx
+	xorl	8(%rsp),%edx
+	andl	%esi,%eax
+	movl	%r12d,%ecx
+	xorl	32(%rsp),%edx
+	leal	-1894007588(%r14,%r11,1),%r11d
+	xorl	%esi,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r11d
+	roll	$1,%edx
+	andl	%r13d,%ebx
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%ebx,%r11d
+	xorl	4(%rsp),%ebp
+	movl	%esi,%eax
+	movl	%edx,0(%rsp)
+	movl	%esi,%ebx
+	xorl	12(%rsp),%ebp
+	andl	%r13d,%eax
+	movl	%r11d,%ecx
+	xorl	36(%rsp),%ebp
+	leal	-1894007588(%rdx,%rdi,1),%edi
+	xorl	%r13d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%edi
+	roll	$1,%ebp
+	andl	%r12d,%ebx
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%ebx,%edi
+	xorl	8(%rsp),%r14d
+	movl	%r13d,%eax
+	movl	%ebp,4(%rsp)
+	movl	%r13d,%ebx
+	xorl	16(%rsp),%r14d
+	andl	%r12d,%eax
+	movl	%edi,%ecx
+	xorl	40(%rsp),%r14d
+	leal	-1894007588(%rbp,%rsi,1),%esi
+	xorl	%r12d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%esi
+	roll	$1,%r14d
+	andl	%r11d,%ebx
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%ebx,%esi
+	xorl	12(%rsp),%edx
+	movl	%r12d,%eax
+	movl	%r14d,8(%rsp)
+	movl	%r12d,%ebx
+	xorl	20(%rsp),%edx
+	andl	%r11d,%eax
+	movl	%esi,%ecx
+	xorl	44(%rsp),%edx
+	leal	-1894007588(%r14,%r13,1),%r13d
+	xorl	%r11d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r13d
+	roll	$1,%edx
+	andl	%edi,%ebx
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%ebx,%r13d
+	xorl	16(%rsp),%ebp
+	movl	%r11d,%eax
+	movl	%edx,12(%rsp)
+	movl	%r11d,%ebx
+	xorl	24(%rsp),%ebp
+	andl	%edi,%eax
+	movl	%r13d,%ecx
+	xorl	48(%rsp),%ebp
+	leal	-1894007588(%rdx,%r12,1),%r12d
+	xorl	%edi,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r12d
+	roll	$1,%ebp
+	andl	%esi,%ebx
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%ebx,%r12d
+	xorl	20(%rsp),%r14d
+	movl	%edi,%eax
+	movl	%ebp,16(%rsp)
+	movl	%edi,%ebx
+	xorl	28(%rsp),%r14d
+	andl	%esi,%eax
+	movl	%r12d,%ecx
+	xorl	52(%rsp),%r14d
+	leal	-1894007588(%rbp,%r11,1),%r11d
+	xorl	%esi,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r11d
+	roll	$1,%r14d
+	andl	%r13d,%ebx
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%ebx,%r11d
+	xorl	24(%rsp),%edx
+	movl	%esi,%eax
+	movl	%r14d,20(%rsp)
+	movl	%esi,%ebx
+	xorl	32(%rsp),%edx
+	andl	%r13d,%eax
+	movl	%r11d,%ecx
+	xorl	56(%rsp),%edx
+	leal	-1894007588(%r14,%rdi,1),%edi
+	xorl	%r13d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%edi
+	roll	$1,%edx
+	andl	%r12d,%ebx
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%ebx,%edi
+	xorl	28(%rsp),%ebp
+	movl	%r13d,%eax
+	movl	%edx,24(%rsp)
+	movl	%r13d,%ebx
+	xorl	36(%rsp),%ebp
+	andl	%r12d,%eax
+	movl	%edi,%ecx
+	xorl	60(%rsp),%ebp
+	leal	-1894007588(%rdx,%rsi,1),%esi
+	xorl	%r12d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%esi
+	roll	$1,%ebp
+	andl	%r11d,%ebx
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%ebx,%esi
+	xorl	32(%rsp),%r14d
+	movl	%r12d,%eax
+	movl	%ebp,28(%rsp)
+	movl	%r12d,%ebx
+	xorl	40(%rsp),%r14d
+	andl	%r11d,%eax
+	movl	%esi,%ecx
+	xorl	0(%rsp),%r14d
+	leal	-1894007588(%rbp,%r13,1),%r13d
+	xorl	%r11d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r13d
+	roll	$1,%r14d
+	andl	%edi,%ebx
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%ebx,%r13d
+	xorl	36(%rsp),%edx
+	movl	%r11d,%eax
+	movl	%r14d,32(%rsp)
+	movl	%r11d,%ebx
+	xorl	44(%rsp),%edx
+	andl	%edi,%eax
+	movl	%r13d,%ecx
+	xorl	4(%rsp),%edx
+	leal	-1894007588(%r14,%r12,1),%r12d
+	xorl	%edi,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r12d
+	roll	$1,%edx
+	andl	%esi,%ebx
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%ebx,%r12d
+	xorl	40(%rsp),%ebp
+	movl	%edi,%eax
+	movl	%edx,36(%rsp)
+	movl	%edi,%ebx
+	xorl	48(%rsp),%ebp
+	andl	%esi,%eax
+	movl	%r12d,%ecx
+	xorl	8(%rsp),%ebp
+	leal	-1894007588(%rdx,%r11,1),%r11d
+	xorl	%esi,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r11d
+	roll	$1,%ebp
+	andl	%r13d,%ebx
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%ebx,%r11d
+	xorl	44(%rsp),%r14d
+	movl	%esi,%eax
+	movl	%ebp,40(%rsp)
+	movl	%esi,%ebx
+	xorl	52(%rsp),%r14d
+	andl	%r13d,%eax
+	movl	%r11d,%ecx
+	xorl	12(%rsp),%r14d
+	leal	-1894007588(%rbp,%rdi,1),%edi
+	xorl	%r13d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%edi
+	roll	$1,%r14d
+	andl	%r12d,%ebx
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%ebx,%edi
+	xorl	48(%rsp),%edx
+	movl	%r13d,%eax
+	movl	%r14d,44(%rsp)
+	movl	%r13d,%ebx
+	xorl	56(%rsp),%edx
+	andl	%r12d,%eax
+	movl	%edi,%ecx
+	xorl	16(%rsp),%edx
+	leal	-1894007588(%r14,%rsi,1),%esi
+	xorl	%r12d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%esi
+	roll	$1,%edx
+	andl	%r11d,%ebx
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%ebx,%esi
+	xorl	52(%rsp),%ebp
+	movl	%edi,%eax
+	movl	%edx,48(%rsp)
+	movl	%esi,%ecx
+	xorl	60(%rsp),%ebp
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	20(%rsp),%ebp
+	leal	-899497514(%rdx,%r13,1),%r13d
+	xorl	%r11d,%eax
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%ebp
+	xorl	56(%rsp),%r14d
+	movl	%esi,%eax
+	movl	%ebp,52(%rsp)
+	movl	%r13d,%ecx
+	xorl	0(%rsp),%r14d
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	24(%rsp),%r14d
+	leal	-899497514(%rbp,%r12,1),%r12d
+	xorl	%edi,%eax
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%r14d
+	xorl	60(%rsp),%edx
+	movl	%r13d,%eax
+	movl	%r14d,56(%rsp)
+	movl	%r12d,%ecx
+	xorl	4(%rsp),%edx
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	28(%rsp),%edx
+	leal	-899497514(%r14,%r11,1),%r11d
+	xorl	%esi,%eax
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%edx
+	xorl	0(%rsp),%ebp
+	movl	%r12d,%eax
+	movl	%edx,60(%rsp)
+	movl	%r11d,%ecx
+	xorl	8(%rsp),%ebp
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	32(%rsp),%ebp
+	leal	-899497514(%rdx,%rdi,1),%edi
+	xorl	%r13d,%eax
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%ebp
+	xorl	4(%rsp),%r14d
+	movl	%r11d,%eax
+	movl	%ebp,0(%rsp)
+	movl	%edi,%ecx
+	xorl	12(%rsp),%r14d
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	xorl	36(%rsp),%r14d
+	leal	-899497514(%rbp,%rsi,1),%esi
+	xorl	%r12d,%eax
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%r14d
+	xorl	8(%rsp),%edx
+	movl	%edi,%eax
+	movl	%r14d,4(%rsp)
+	movl	%esi,%ecx
+	xorl	16(%rsp),%edx
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	40(%rsp),%edx
+	leal	-899497514(%r14,%r13,1),%r13d
+	xorl	%r11d,%eax
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%edx
+	xorl	12(%rsp),%ebp
+	movl	%esi,%eax
+	movl	%edx,8(%rsp)
+	movl	%r13d,%ecx
+	xorl	20(%rsp),%ebp
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	44(%rsp),%ebp
+	leal	-899497514(%rdx,%r12,1),%r12d
+	xorl	%edi,%eax
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%ebp
+	xorl	16(%rsp),%r14d
+	movl	%r13d,%eax
+	movl	%ebp,12(%rsp)
+	movl	%r12d,%ecx
+	xorl	24(%rsp),%r14d
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	48(%rsp),%r14d
+	leal	-899497514(%rbp,%r11,1),%r11d
+	xorl	%esi,%eax
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%r14d
+	xorl	20(%rsp),%edx
+	movl	%r12d,%eax
+	movl	%r14d,16(%rsp)
+	movl	%r11d,%ecx
+	xorl	28(%rsp),%edx
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	52(%rsp),%edx
+	leal	-899497514(%r14,%rdi,1),%edi
+	xorl	%r13d,%eax
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%edx
+	xorl	24(%rsp),%ebp
+	movl	%r11d,%eax
+	movl	%edx,20(%rsp)
+	movl	%edi,%ecx
+	xorl	32(%rsp),%ebp
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	xorl	56(%rsp),%ebp
+	leal	-899497514(%rdx,%rsi,1),%esi
+	xorl	%r12d,%eax
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%ebp
+	xorl	28(%rsp),%r14d
+	movl	%edi,%eax
+	movl	%ebp,24(%rsp)
+	movl	%esi,%ecx
+	xorl	36(%rsp),%r14d
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	60(%rsp),%r14d
+	leal	-899497514(%rbp,%r13,1),%r13d
+	xorl	%r11d,%eax
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%r14d
+	xorl	32(%rsp),%edx
+	movl	%esi,%eax
+	movl	%r14d,28(%rsp)
+	movl	%r13d,%ecx
+	xorl	40(%rsp),%edx
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	0(%rsp),%edx
+	leal	-899497514(%r14,%r12,1),%r12d
+	xorl	%edi,%eax
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%edx
+	xorl	36(%rsp),%ebp
+	movl	%r13d,%eax
+
+	movl	%r12d,%ecx
+	xorl	44(%rsp),%ebp
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	4(%rsp),%ebp
+	leal	-899497514(%rdx,%r11,1),%r11d
+	xorl	%esi,%eax
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%ebp
+	xorl	40(%rsp),%r14d
+	movl	%r12d,%eax
+
+	movl	%r11d,%ecx
+	xorl	48(%rsp),%r14d
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	8(%rsp),%r14d
+	leal	-899497514(%rbp,%rdi,1),%edi
+	xorl	%r13d,%eax
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%r14d
+	xorl	44(%rsp),%edx
+	movl	%r11d,%eax
+
+	movl	%edi,%ecx
+	xorl	52(%rsp),%edx
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	xorl	12(%rsp),%edx
+	leal	-899497514(%r14,%rsi,1),%esi
+	xorl	%r12d,%eax
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%edx
+	xorl	48(%rsp),%ebp
+	movl	%edi,%eax
+
+	movl	%esi,%ecx
+	xorl	56(%rsp),%ebp
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	16(%rsp),%ebp
+	leal	-899497514(%rdx,%r13,1),%r13d
+	xorl	%r11d,%eax
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%ebp
+	xorl	52(%rsp),%r14d
+	movl	%esi,%eax
+
+	movl	%r13d,%ecx
+	xorl	60(%rsp),%r14d
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	20(%rsp),%r14d
+	leal	-899497514(%rbp,%r12,1),%r12d
+	xorl	%edi,%eax
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%r14d
+	xorl	56(%rsp),%edx
+	movl	%r13d,%eax
+
+	movl	%r12d,%ecx
+	xorl	0(%rsp),%edx
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	24(%rsp),%edx
+	leal	-899497514(%r14,%r11,1),%r11d
+	xorl	%esi,%eax
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%edx
+	xorl	60(%rsp),%ebp
+	movl	%r12d,%eax
+
+	movl	%r11d,%ecx
+	xorl	4(%rsp),%ebp
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	28(%rsp),%ebp
+	leal	-899497514(%rdx,%rdi,1),%edi
+	xorl	%r13d,%eax
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%ebp
+	movl	%r11d,%eax
+	movl	%edi,%ecx
+	xorl	%r13d,%eax
+	leal	-899497514(%rbp,%rsi,1),%esi
+	roll	$5,%ecx
+	xorl	%r12d,%eax
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%eax,%esi
+	addl	0(%r8),%esi
+	addl	4(%r8),%edi
+	addl	8(%r8),%r11d
+	addl	12(%r8),%r12d
+	addl	16(%r8),%r13d
+	movl	%esi,0(%r8)
+	movl	%edi,4(%r8)
+	movl	%r11d,8(%r8)
+	movl	%r12d,12(%r8)
+	movl	%r13d,16(%r8)
+
+	subq	$1,%r10
+	leaq	64(%r9),%r9
+	jnz	.Lloop
+
+	movq	64(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lepilogue:
+	ret
+.cfi_endproc	
+.size	sha1_block_data_order_nohw,.-sha1_block_data_order_nohw
+.globl	sha1_block_data_order_hw
+.hidden sha1_block_data_order_hw
+.type	sha1_block_data_order_hw,@function
+.align	32
+sha1_block_data_order_hw:
+.cfi_startproc	
+_CET_ENDBR
+	movdqu	(%rdi),%xmm0
+	movd	16(%rdi),%xmm1
+	movdqa	K_XX_XX+160(%rip),%xmm3
+
+	movdqu	(%rsi),%xmm4
+	pshufd	$27,%xmm0,%xmm0
+	movdqu	16(%rsi),%xmm5
+	pshufd	$27,%xmm1,%xmm1
+	movdqu	32(%rsi),%xmm6
+.byte	102,15,56,0,227
+	movdqu	48(%rsi),%xmm7
+.byte	102,15,56,0,235
+.byte	102,15,56,0,243
+	movdqa	%xmm1,%xmm9
+.byte	102,15,56,0,251
+	jmp	.Loop_shaext
+
+.align	16
+.Loop_shaext:
+	decq	%rdx
+	leaq	64(%rsi),%r8
+	paddd	%xmm4,%xmm1
+	cmovneq	%r8,%rsi
+	prefetcht0	512(%rsi)
+	movdqa	%xmm0,%xmm8
+.byte	15,56,201,229
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,0
+.byte	15,56,200,213
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+.byte	15,56,202,231
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,0
+.byte	15,56,200,206
+	pxor	%xmm7,%xmm5
+.byte	15,56,202,236
+.byte	15,56,201,247
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,0
+.byte	15,56,200,215
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+.byte	15,56,202,245
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,0
+.byte	15,56,200,204
+	pxor	%xmm5,%xmm7
+.byte	15,56,202,254
+.byte	15,56,201,229
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,0
+.byte	15,56,200,213
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+.byte	15,56,202,231
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,1
+.byte	15,56,200,206
+	pxor	%xmm7,%xmm5
+.byte	15,56,202,236
+.byte	15,56,201,247
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,1
+.byte	15,56,200,215
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+.byte	15,56,202,245
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,1
+.byte	15,56,200,204
+	pxor	%xmm5,%xmm7
+.byte	15,56,202,254
+.byte	15,56,201,229
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,1
+.byte	15,56,200,213
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+.byte	15,56,202,231
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,1
+.byte	15,56,200,206
+	pxor	%xmm7,%xmm5
+.byte	15,56,202,236
+.byte	15,56,201,247
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,2
+.byte	15,56,200,215
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+.byte	15,56,202,245
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,2
+.byte	15,56,200,204
+	pxor	%xmm5,%xmm7
+.byte	15,56,202,254
+.byte	15,56,201,229
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,2
+.byte	15,56,200,213
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+.byte	15,56,202,231
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,2
+.byte	15,56,200,206
+	pxor	%xmm7,%xmm5
+.byte	15,56,202,236
+.byte	15,56,201,247
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,2
+.byte	15,56,200,215
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+.byte	15,56,202,245
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,3
+.byte	15,56,200,204
+	pxor	%xmm5,%xmm7
+.byte	15,56,202,254
+	movdqu	(%rsi),%xmm4
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,3
+.byte	15,56,200,213
+	movdqu	16(%rsi),%xmm5
+.byte	102,15,56,0,227
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,3
+.byte	15,56,200,206
+	movdqu	32(%rsi),%xmm6
+.byte	102,15,56,0,235
+
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,3
+.byte	15,56,200,215
+	movdqu	48(%rsi),%xmm7
+.byte	102,15,56,0,243
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,3
+.byte	65,15,56,200,201
+.byte	102,15,56,0,251
+
+	paddd	%xmm8,%xmm0
+	movdqa	%xmm1,%xmm9
+
+	jnz	.Loop_shaext
+
+	pshufd	$27,%xmm0,%xmm0
+	pshufd	$27,%xmm1,%xmm1
+	movdqu	%xmm0,(%rdi)
+	movd	%xmm1,16(%rdi)
+	ret
+.cfi_endproc	
+.size	sha1_block_data_order_hw,.-sha1_block_data_order_hw
+.globl	sha1_block_data_order_ssse3
+.hidden sha1_block_data_order_ssse3
+.type	sha1_block_data_order_ssse3,@function
+.align	16
+sha1_block_data_order_ssse3:
+.cfi_startproc	
+_CET_ENDBR
+	movq	%rsp,%r11
+.cfi_def_cfa_register	%r11
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	leaq	-64(%rsp),%rsp
+	andq	$-64,%rsp
+	movq	%rdi,%r8
+	movq	%rsi,%r9
+	movq	%rdx,%r10
+
+	shlq	$6,%r10
+	addq	%r9,%r10
+	leaq	K_XX_XX+64(%rip),%r14
+
+	movl	0(%r8),%eax
+	movl	4(%r8),%ebx
+	movl	8(%r8),%ecx
+	movl	12(%r8),%edx
+	movl	%ebx,%esi
+	movl	16(%r8),%ebp
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	andl	%edi,%esi
+
+	movdqa	64(%r14),%xmm6
+	movdqa	-64(%r14),%xmm9
+	movdqu	0(%r9),%xmm0
+	movdqu	16(%r9),%xmm1
+	movdqu	32(%r9),%xmm2
+	movdqu	48(%r9),%xmm3
+.byte	102,15,56,0,198
+.byte	102,15,56,0,206
+.byte	102,15,56,0,214
+	addq	$64,%r9
+	paddd	%xmm9,%xmm0
+.byte	102,15,56,0,222
+	paddd	%xmm9,%xmm1
+	paddd	%xmm9,%xmm2
+	movdqa	%xmm0,0(%rsp)
+	psubd	%xmm9,%xmm0
+	movdqa	%xmm1,16(%rsp)
+	psubd	%xmm9,%xmm1
+	movdqa	%xmm2,32(%rsp)
+	psubd	%xmm9,%xmm2
+	jmp	.Loop_ssse3
+.align	16
+.Loop_ssse3:
+	rorl	$2,%ebx
+	pshufd	$238,%xmm0,%xmm4
+	xorl	%edx,%esi
+	movdqa	%xmm3,%xmm8
+	paddd	%xmm3,%xmm9
+	movl	%eax,%edi
+	addl	0(%rsp),%ebp
+	punpcklqdq	%xmm1,%xmm4
+	xorl	%ecx,%ebx
+	roll	$5,%eax
+	addl	%esi,%ebp
+	psrldq	$4,%xmm8
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	pxor	%xmm0,%xmm4
+	addl	%eax,%ebp
+	rorl	$7,%eax
+	pxor	%xmm2,%xmm8
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
+	addl	4(%rsp),%edx
+	pxor	%xmm8,%xmm4
+	xorl	%ebx,%eax
+	roll	$5,%ebp
+	movdqa	%xmm9,48(%rsp)
+	addl	%edi,%edx
+	andl	%eax,%esi
+	movdqa	%xmm4,%xmm10
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	rorl	$7,%ebp
+	movdqa	%xmm4,%xmm8
+	xorl	%ebx,%esi
+	pslldq	$12,%xmm10
+	paddd	%xmm4,%xmm4
+	movl	%edx,%edi
+	addl	8(%rsp),%ecx
+	psrld	$31,%xmm8
+	xorl	%eax,%ebp
+	roll	$5,%edx
+	addl	%esi,%ecx
+	movdqa	%xmm10,%xmm9
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	psrld	$30,%xmm10
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	por	%xmm8,%xmm4
+	xorl	%eax,%edi
+	movl	%ecx,%esi
+	addl	12(%rsp),%ebx
+	pslld	$2,%xmm9
+	pxor	%xmm10,%xmm4
+	xorl	%ebp,%edx
+	movdqa	-64(%r14),%xmm10
+	roll	$5,%ecx
+	addl	%edi,%ebx
+	andl	%edx,%esi
+	pxor	%xmm9,%xmm4
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	rorl	$7,%ecx
+	pshufd	$238,%xmm1,%xmm5
+	xorl	%ebp,%esi
+	movdqa	%xmm4,%xmm9
+	paddd	%xmm4,%xmm10
+	movl	%ebx,%edi
+	addl	16(%rsp),%eax
+	punpcklqdq	%xmm2,%xmm5
+	xorl	%edx,%ecx
+	roll	$5,%ebx
+	addl	%esi,%eax
+	psrldq	$4,%xmm9
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	pxor	%xmm1,%xmm5
+	addl	%ebx,%eax
+	rorl	$7,%ebx
+	pxor	%xmm3,%xmm9
+	xorl	%edx,%edi
+	movl	%eax,%esi
+	addl	20(%rsp),%ebp
+	pxor	%xmm9,%xmm5
+	xorl	%ecx,%ebx
+	roll	$5,%eax
+	movdqa	%xmm10,0(%rsp)
+	addl	%edi,%ebp
+	andl	%ebx,%esi
+	movdqa	%xmm5,%xmm8
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	rorl	$7,%eax
+	movdqa	%xmm5,%xmm9
+	xorl	%ecx,%esi
+	pslldq	$12,%xmm8
+	paddd	%xmm5,%xmm5
+	movl	%ebp,%edi
+	addl	24(%rsp),%edx
+	psrld	$31,%xmm9
+	xorl	%ebx,%eax
+	roll	$5,%ebp
+	addl	%esi,%edx
+	movdqa	%xmm8,%xmm10
+	andl	%eax,%edi
+	xorl	%ebx,%eax
+	psrld	$30,%xmm8
+	addl	%ebp,%edx
+	rorl	$7,%ebp
+	por	%xmm9,%xmm5
+	xorl	%ebx,%edi
+	movl	%edx,%esi
+	addl	28(%rsp),%ecx
+	pslld	$2,%xmm10
+	pxor	%xmm8,%xmm5
+	xorl	%eax,%ebp
+	movdqa	-32(%r14),%xmm8
+	roll	$5,%edx
+	addl	%edi,%ecx
+	andl	%ebp,%esi
+	pxor	%xmm10,%xmm5
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	pshufd	$238,%xmm2,%xmm6
+	xorl	%eax,%esi
+	movdqa	%xmm5,%xmm10
+	paddd	%xmm5,%xmm8
+	movl	%ecx,%edi
+	addl	32(%rsp),%ebx
+	punpcklqdq	%xmm3,%xmm6
+	xorl	%ebp,%edx
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	psrldq	$4,%xmm10
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	pxor	%xmm2,%xmm6
+	addl	%ecx,%ebx
+	rorl	$7,%ecx
+	pxor	%xmm4,%xmm10
+	xorl	%ebp,%edi
+	movl	%ebx,%esi
+	addl	36(%rsp),%eax
+	pxor	%xmm10,%xmm6
+	xorl	%edx,%ecx
+	roll	$5,%ebx
+	movdqa	%xmm8,16(%rsp)
+	addl	%edi,%eax
+	andl	%ecx,%esi
+	movdqa	%xmm6,%xmm9
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	rorl	$7,%ebx
+	movdqa	%xmm6,%xmm10
+	xorl	%edx,%esi
+	pslldq	$12,%xmm9
+	paddd	%xmm6,%xmm6
+	movl	%eax,%edi
+	addl	40(%rsp),%ebp
+	psrld	$31,%xmm10
+	xorl	%ecx,%ebx
+	roll	$5,%eax
+	addl	%esi,%ebp
+	movdqa	%xmm9,%xmm8
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	psrld	$30,%xmm9
+	addl	%eax,%ebp
+	rorl	$7,%eax
+	por	%xmm10,%xmm6
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
+	addl	44(%rsp),%edx
+	pslld	$2,%xmm8
+	pxor	%xmm9,%xmm6
+	xorl	%ebx,%eax
+	movdqa	-32(%r14),%xmm9
+	roll	$5,%ebp
+	addl	%edi,%edx
+	andl	%eax,%esi
+	pxor	%xmm8,%xmm6
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	rorl	$7,%ebp
+	pshufd	$238,%xmm3,%xmm7
+	xorl	%ebx,%esi
+	movdqa	%xmm6,%xmm8
+	paddd	%xmm6,%xmm9
+	movl	%edx,%edi
+	addl	48(%rsp),%ecx
+	punpcklqdq	%xmm4,%xmm7
+	xorl	%eax,%ebp
+	roll	$5,%edx
+	addl	%esi,%ecx
+	psrldq	$4,%xmm8
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	pxor	%xmm3,%xmm7
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	pxor	%xmm5,%xmm8
+	xorl	%eax,%edi
+	movl	%ecx,%esi
+	addl	52(%rsp),%ebx
+	pxor	%xmm8,%xmm7
+	xorl	%ebp,%edx
+	roll	$5,%ecx
+	movdqa	%xmm9,32(%rsp)
+	addl	%edi,%ebx
+	andl	%edx,%esi
+	movdqa	%xmm7,%xmm10
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	rorl	$7,%ecx
+	movdqa	%xmm7,%xmm8
+	xorl	%ebp,%esi
+	pslldq	$12,%xmm10
+	paddd	%xmm7,%xmm7
+	movl	%ebx,%edi
+	addl	56(%rsp),%eax
+	psrld	$31,%xmm8
+	xorl	%edx,%ecx
+	roll	$5,%ebx
+	addl	%esi,%eax
+	movdqa	%xmm10,%xmm9
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	psrld	$30,%xmm10
+	addl	%ebx,%eax
+	rorl	$7,%ebx
+	por	%xmm8,%xmm7
+	xorl	%edx,%edi
+	movl	%eax,%esi
+	addl	60(%rsp),%ebp
+	pslld	$2,%xmm9
+	pxor	%xmm10,%xmm7
+	xorl	%ecx,%ebx
+	movdqa	-32(%r14),%xmm10
+	roll	$5,%eax
+	addl	%edi,%ebp
+	andl	%ebx,%esi
+	pxor	%xmm9,%xmm7
+	pshufd	$238,%xmm6,%xmm9
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	rorl	$7,%eax
+	pxor	%xmm4,%xmm0
+	xorl	%ecx,%esi
+	movl	%ebp,%edi
+	addl	0(%rsp),%edx
+	punpcklqdq	%xmm7,%xmm9
+	xorl	%ebx,%eax
+	roll	$5,%ebp
+	pxor	%xmm1,%xmm0
+	addl	%esi,%edx
+	andl	%eax,%edi
+	movdqa	%xmm10,%xmm8
+	xorl	%ebx,%eax
+	paddd	%xmm7,%xmm10
+	addl	%ebp,%edx
+	pxor	%xmm9,%xmm0
+	rorl	$7,%ebp
+	xorl	%ebx,%edi
+	movl	%edx,%esi
+	addl	4(%rsp),%ecx
+	movdqa	%xmm0,%xmm9
+	xorl	%eax,%ebp
+	roll	$5,%edx
+	movdqa	%xmm10,48(%rsp)
+	addl	%edi,%ecx
+	andl	%ebp,%esi
+	xorl	%eax,%ebp
+	pslld	$2,%xmm0
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	psrld	$30,%xmm9
+	xorl	%eax,%esi
+	movl	%ecx,%edi
+	addl	8(%rsp),%ebx
+	por	%xmm9,%xmm0
+	xorl	%ebp,%edx
+	roll	$5,%ecx
+	pshufd	$238,%xmm7,%xmm10
+	addl	%esi,%ebx
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	addl	12(%rsp),%eax
+	xorl	%ebp,%edi
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	pxor	%xmm5,%xmm1
+	addl	16(%rsp),%ebp
+	xorl	%ecx,%esi
+	punpcklqdq	%xmm0,%xmm10
+	movl	%eax,%edi
+	roll	$5,%eax
+	pxor	%xmm2,%xmm1
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	movdqa	%xmm8,%xmm9
+	rorl	$7,%ebx
+	paddd	%xmm0,%xmm8
+	addl	%eax,%ebp
+	pxor	%xmm10,%xmm1
+	addl	20(%rsp),%edx
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	movdqa	%xmm1,%xmm10
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	movdqa	%xmm8,0(%rsp)
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	addl	24(%rsp),%ecx
+	pslld	$2,%xmm1
+	xorl	%eax,%esi
+	movl	%edx,%edi
+	psrld	$30,%xmm10
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	rorl	$7,%ebp
+	por	%xmm10,%xmm1
+	addl	%edx,%ecx
+	addl	28(%rsp),%ebx
+	pshufd	$238,%xmm0,%xmm8
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	pxor	%xmm6,%xmm2
+	addl	32(%rsp),%eax
+	xorl	%edx,%esi
+	punpcklqdq	%xmm1,%xmm8
+	movl	%ebx,%edi
+	roll	$5,%ebx
+	pxor	%xmm3,%xmm2
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	movdqa	0(%r14),%xmm10
+	rorl	$7,%ecx
+	paddd	%xmm1,%xmm9
+	addl	%ebx,%eax
+	pxor	%xmm8,%xmm2
+	addl	36(%rsp),%ebp
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	roll	$5,%eax
+	movdqa	%xmm2,%xmm8
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	movdqa	%xmm9,16(%rsp)
+	rorl	$7,%ebx
+	addl	%eax,%ebp
+	addl	40(%rsp),%edx
+	pslld	$2,%xmm2
+	xorl	%ebx,%esi
+	movl	%ebp,%edi
+	psrld	$30,%xmm8
+	roll	$5,%ebp
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	rorl	$7,%eax
+	por	%xmm8,%xmm2
+	addl	%ebp,%edx
+	addl	44(%rsp),%ecx
+	pshufd	$238,%xmm1,%xmm9
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	roll	$5,%edx
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%ebp
+	addl	%edx,%ecx
+	pxor	%xmm7,%xmm3
+	addl	48(%rsp),%ebx
+	xorl	%ebp,%esi
+	punpcklqdq	%xmm2,%xmm9
+	movl	%ecx,%edi
+	roll	$5,%ecx
+	pxor	%xmm4,%xmm3
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	movdqa	%xmm10,%xmm8
+	rorl	$7,%edx
+	paddd	%xmm2,%xmm10
+	addl	%ecx,%ebx
+	pxor	%xmm9,%xmm3
+	addl	52(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	movdqa	%xmm3,%xmm9
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	movdqa	%xmm10,32(%rsp)
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	56(%rsp),%ebp
+	pslld	$2,%xmm3
+	xorl	%ecx,%esi
+	movl	%eax,%edi
+	psrld	$30,%xmm9
+	roll	$5,%eax
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	rorl	$7,%ebx
+	por	%xmm9,%xmm3
+	addl	%eax,%ebp
+	addl	60(%rsp),%edx
+	pshufd	$238,%xmm2,%xmm10
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	pxor	%xmm0,%xmm4
+	addl	0(%rsp),%ecx
+	xorl	%eax,%esi
+	punpcklqdq	%xmm3,%xmm10
+	movl	%edx,%edi
+	roll	$5,%edx
+	pxor	%xmm5,%xmm4
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	movdqa	%xmm8,%xmm9
+	rorl	$7,%ebp
+	paddd	%xmm3,%xmm8
+	addl	%edx,%ecx
+	pxor	%xmm10,%xmm4
+	addl	4(%rsp),%ebx
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	movdqa	%xmm4,%xmm10
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	movdqa	%xmm8,48(%rsp)
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	8(%rsp),%eax
+	pslld	$2,%xmm4
+	xorl	%edx,%esi
+	movl	%ebx,%edi
+	psrld	$30,%xmm10
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	rorl	$7,%ecx
+	por	%xmm10,%xmm4
+	addl	%ebx,%eax
+	addl	12(%rsp),%ebp
+	pshufd	$238,%xmm3,%xmm8
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	roll	$5,%eax
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	addl	%eax,%ebp
+	pxor	%xmm1,%xmm5
+	addl	16(%rsp),%edx
+	xorl	%ebx,%esi
+	punpcklqdq	%xmm4,%xmm8
+	movl	%ebp,%edi
+	roll	$5,%ebp
+	pxor	%xmm6,%xmm5
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	movdqa	%xmm9,%xmm10
+	rorl	$7,%eax
+	paddd	%xmm4,%xmm9
+	addl	%ebp,%edx
+	pxor	%xmm8,%xmm5
+	addl	20(%rsp),%ecx
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	roll	$5,%edx
+	movdqa	%xmm5,%xmm8
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	movdqa	%xmm9,0(%rsp)
+	rorl	$7,%ebp
+	addl	%edx,%ecx
+	addl	24(%rsp),%ebx
+	pslld	$2,%xmm5
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+	psrld	$30,%xmm8
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	rorl	$7,%edx
+	por	%xmm8,%xmm5
+	addl	%ecx,%ebx
+	addl	28(%rsp),%eax
+	pshufd	$238,%xmm4,%xmm9
+	rorl	$7,%ecx
+	movl	%ebx,%esi
+	xorl	%edx,%edi
+	roll	$5,%ebx
+	addl	%edi,%eax
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	pxor	%xmm2,%xmm6
+	addl	32(%rsp),%ebp
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	rorl	$7,%ebx
+	punpcklqdq	%xmm5,%xmm9
+	movl	%eax,%edi
+	xorl	%ecx,%esi
+	pxor	%xmm7,%xmm6
+	roll	$5,%eax
+	addl	%esi,%ebp
+	movdqa	%xmm10,%xmm8
+	xorl	%ebx,%edi
+	paddd	%xmm5,%xmm10
+	xorl	%ecx,%ebx
+	pxor	%xmm9,%xmm6
+	addl	%eax,%ebp
+	addl	36(%rsp),%edx
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	rorl	$7,%eax
+	movdqa	%xmm6,%xmm9
+	movl	%ebp,%esi
+	xorl	%ebx,%edi
+	movdqa	%xmm10,16(%rsp)
+	roll	$5,%ebp
+	addl	%edi,%edx
+	xorl	%eax,%esi
+	pslld	$2,%xmm6
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	psrld	$30,%xmm9
+	addl	40(%rsp),%ecx
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	por	%xmm9,%xmm6
+	rorl	$7,%ebp
+	movl	%edx,%edi
+	xorl	%eax,%esi
+	roll	$5,%edx
+	pshufd	$238,%xmm5,%xmm10
+	addl	%esi,%ecx
+	xorl	%ebp,%edi
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	addl	44(%rsp),%ebx
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	rorl	$7,%edx
+	movl	%ecx,%esi
+	xorl	%ebp,%edi
+	roll	$5,%ecx
+	addl	%edi,%ebx
+	xorl	%edx,%esi
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	pxor	%xmm3,%xmm7
+	addl	48(%rsp),%eax
+	andl	%edx,%esi
+	xorl	%ebp,%edx
+	rorl	$7,%ecx
+	punpcklqdq	%xmm6,%xmm10
+	movl	%ebx,%edi
+	xorl	%edx,%esi
+	pxor	%xmm0,%xmm7
+	roll	$5,%ebx
+	addl	%esi,%eax
+	movdqa	32(%r14),%xmm9
+	xorl	%ecx,%edi
+	paddd	%xmm6,%xmm8
+	xorl	%edx,%ecx
+	pxor	%xmm10,%xmm7
+	addl	%ebx,%eax
+	addl	52(%rsp),%ebp
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	rorl	$7,%ebx
+	movdqa	%xmm7,%xmm10
+	movl	%eax,%esi
+	xorl	%ecx,%edi
+	movdqa	%xmm8,32(%rsp)
+	roll	$5,%eax
+	addl	%edi,%ebp
+	xorl	%ebx,%esi
+	pslld	$2,%xmm7
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	psrld	$30,%xmm10
+	addl	56(%rsp),%edx
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	por	%xmm10,%xmm7
+	rorl	$7,%eax
+	movl	%ebp,%edi
+	xorl	%ebx,%esi
+	roll	$5,%ebp
+	pshufd	$238,%xmm6,%xmm8
+	addl	%esi,%edx
+	xorl	%eax,%edi
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	addl	60(%rsp),%ecx
+	andl	%eax,%edi
+	xorl	%ebx,%eax
+	rorl	$7,%ebp
+	movl	%edx,%esi
+	xorl	%eax,%edi
+	roll	$5,%edx
+	addl	%edi,%ecx
+	xorl	%ebp,%esi
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	pxor	%xmm4,%xmm0
+	addl	0(%rsp),%ebx
+	andl	%ebp,%esi
+	xorl	%eax,%ebp
+	rorl	$7,%edx
+	punpcklqdq	%xmm7,%xmm8
+	movl	%ecx,%edi
+	xorl	%ebp,%esi
+	pxor	%xmm1,%xmm0
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	movdqa	%xmm9,%xmm10
+	xorl	%edx,%edi
+	paddd	%xmm7,%xmm9
+	xorl	%ebp,%edx
+	pxor	%xmm8,%xmm0
+	addl	%ecx,%ebx
+	addl	4(%rsp),%eax
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	rorl	$7,%ecx
+	movdqa	%xmm0,%xmm8
+	movl	%ebx,%esi
+	xorl	%edx,%edi
+	movdqa	%xmm9,48(%rsp)
+	roll	$5,%ebx
+	addl	%edi,%eax
+	xorl	%ecx,%esi
+	pslld	$2,%xmm0
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	psrld	$30,%xmm8
+	addl	8(%rsp),%ebp
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	por	%xmm8,%xmm0
+	rorl	$7,%ebx
+	movl	%eax,%edi
+	xorl	%ecx,%esi
+	roll	$5,%eax
+	pshufd	$238,%xmm7,%xmm9
+	addl	%esi,%ebp
+	xorl	%ebx,%edi
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	addl	12(%rsp),%edx
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	rorl	$7,%eax
+	movl	%ebp,%esi
+	xorl	%ebx,%edi
+	roll	$5,%ebp
+	addl	%edi,%edx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	pxor	%xmm5,%xmm1
+	addl	16(%rsp),%ecx
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	rorl	$7,%ebp
+	punpcklqdq	%xmm0,%xmm9
+	movl	%edx,%edi
+	xorl	%eax,%esi
+	pxor	%xmm2,%xmm1
+	roll	$5,%edx
+	addl	%esi,%ecx
+	movdqa	%xmm10,%xmm8
+	xorl	%ebp,%edi
+	paddd	%xmm0,%xmm10
+	xorl	%eax,%ebp
+	pxor	%xmm9,%xmm1
+	addl	%edx,%ecx
+	addl	20(%rsp),%ebx
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	rorl	$7,%edx
+	movdqa	%xmm1,%xmm9
+	movl	%ecx,%esi
+	xorl	%ebp,%edi
+	movdqa	%xmm10,0(%rsp)
+	roll	$5,%ecx
+	addl	%edi,%ebx
+	xorl	%edx,%esi
+	pslld	$2,%xmm1
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	psrld	$30,%xmm9
+	addl	24(%rsp),%eax
+	andl	%edx,%esi
+	xorl	%ebp,%edx
+	por	%xmm9,%xmm1
+	rorl	$7,%ecx
+	movl	%ebx,%edi
+	xorl	%edx,%esi
+	roll	$5,%ebx
+	pshufd	$238,%xmm0,%xmm10
+	addl	%esi,%eax
+	xorl	%ecx,%edi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	28(%rsp),%ebp
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	rorl	$7,%ebx
+	movl	%eax,%esi
+	xorl	%ecx,%edi
+	roll	$5,%eax
+	addl	%edi,%ebp
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	pxor	%xmm6,%xmm2
+	addl	32(%rsp),%edx
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	rorl	$7,%eax
+	punpcklqdq	%xmm1,%xmm10
+	movl	%ebp,%edi
+	xorl	%ebx,%esi
+	pxor	%xmm3,%xmm2
+	roll	$5,%ebp
+	addl	%esi,%edx
+	movdqa	%xmm8,%xmm9
+	xorl	%eax,%edi
+	paddd	%xmm1,%xmm8
+	xorl	%ebx,%eax
+	pxor	%xmm10,%xmm2
+	addl	%ebp,%edx
+	addl	36(%rsp),%ecx
+	andl	%eax,%edi
+	xorl	%ebx,%eax
+	rorl	$7,%ebp
+	movdqa	%xmm2,%xmm10
+	movl	%edx,%esi
+	xorl	%eax,%edi
+	movdqa	%xmm8,16(%rsp)
+	roll	$5,%edx
+	addl	%edi,%ecx
+	xorl	%ebp,%esi
+	pslld	$2,%xmm2
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	psrld	$30,%xmm10
+	addl	40(%rsp),%ebx
+	andl	%ebp,%esi
+	xorl	%eax,%ebp
+	por	%xmm10,%xmm2
+	rorl	$7,%edx
+	movl	%ecx,%edi
+	xorl	%ebp,%esi
+	roll	$5,%ecx
+	pshufd	$238,%xmm1,%xmm8
+	addl	%esi,%ebx
+	xorl	%edx,%edi
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	addl	44(%rsp),%eax
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	rorl	$7,%ecx
+	movl	%ebx,%esi
+	xorl	%edx,%edi
+	roll	$5,%ebx
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	pxor	%xmm7,%xmm3
+	addl	48(%rsp),%ebp
+	xorl	%ecx,%esi
+	punpcklqdq	%xmm2,%xmm8
+	movl	%eax,%edi
+	roll	$5,%eax
+	pxor	%xmm4,%xmm3
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	movdqa	%xmm9,%xmm10
+	rorl	$7,%ebx
+	paddd	%xmm2,%xmm9
+	addl	%eax,%ebp
+	pxor	%xmm8,%xmm3
+	addl	52(%rsp),%edx
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	movdqa	%xmm3,%xmm8
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	movdqa	%xmm9,32(%rsp)
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	addl	56(%rsp),%ecx
+	pslld	$2,%xmm3
+	xorl	%eax,%esi
+	movl	%edx,%edi
+	psrld	$30,%xmm8
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	rorl	$7,%ebp
+	por	%xmm8,%xmm3
+	addl	%edx,%ecx
+	addl	60(%rsp),%ebx
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	0(%rsp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%edi
+	roll	$5,%ebx
+	paddd	%xmm3,%xmm10
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	movdqa	%xmm10,48(%rsp)
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	4(%rsp),%ebp
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	roll	$5,%eax
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	addl	%eax,%ebp
+	addl	8(%rsp),%edx
+	xorl	%ebx,%esi
+	movl	%ebp,%edi
+	roll	$5,%ebp
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	addl	12(%rsp),%ecx
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	roll	$5,%edx
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%ebp
+	addl	%edx,%ecx
+	cmpq	%r10,%r9
+	je	.Ldone_ssse3
+	movdqa	64(%r14),%xmm6
+	movdqa	-64(%r14),%xmm9
+	movdqu	0(%r9),%xmm0
+	movdqu	16(%r9),%xmm1
+	movdqu	32(%r9),%xmm2
+	movdqu	48(%r9),%xmm3
+.byte	102,15,56,0,198
+	addq	$64,%r9
+	addl	16(%rsp),%ebx
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+.byte	102,15,56,0,206
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	rorl	$7,%edx
+	paddd	%xmm9,%xmm0
+	addl	%ecx,%ebx
+	addl	20(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	movdqa	%xmm0,0(%rsp)
+	roll	$5,%ebx
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
+	psubd	%xmm9,%xmm0
+	addl	%ebx,%eax
+	addl	24(%rsp),%ebp
+	xorl	%ecx,%esi
+	movl	%eax,%edi
+	roll	$5,%eax
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	rorl	$7,%ebx
+	addl	%eax,%ebp
+	addl	28(%rsp),%edx
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	addl	32(%rsp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%edi
+.byte	102,15,56,0,214
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	rorl	$7,%ebp
+	paddd	%xmm9,%xmm1
+	addl	%edx,%ecx
+	addl	36(%rsp),%ebx
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	movdqa	%xmm1,16(%rsp)
+	roll	$5,%ecx
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	rorl	$7,%edx
+	psubd	%xmm9,%xmm1
+	addl	%ecx,%ebx
+	addl	40(%rsp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%edi
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	44(%rsp),%ebp
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	roll	$5,%eax
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	addl	%eax,%ebp
+	addl	48(%rsp),%edx
+	xorl	%ebx,%esi
+	movl	%ebp,%edi
+.byte	102,15,56,0,222
+	roll	$5,%ebp
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	rorl	$7,%eax
+	paddd	%xmm9,%xmm2
+	addl	%ebp,%edx
+	addl	52(%rsp),%ecx
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	movdqa	%xmm2,32(%rsp)
+	roll	$5,%edx
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%ebp
+	psubd	%xmm9,%xmm2
+	addl	%edx,%ecx
+	addl	56(%rsp),%ebx
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	60(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	addl	%edi,%eax
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	0(%r8),%eax
+	addl	4(%r8),%esi
+	addl	8(%r8),%ecx
+	addl	12(%r8),%edx
+	movl	%eax,0(%r8)
+	addl	16(%r8),%ebp
+	movl	%esi,4(%r8)
+	movl	%esi,%ebx
+	movl	%ecx,8(%r8)
+	movl	%ecx,%edi
+	movl	%edx,12(%r8)
+	xorl	%edx,%edi
+	movl	%ebp,16(%r8)
+	andl	%edi,%esi
+	jmp	.Loop_ssse3
+
+.align	16
+.Ldone_ssse3:
+	addl	16(%rsp),%ebx
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	20(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	24(%rsp),%ebp
+	xorl	%ecx,%esi
+	movl	%eax,%edi
+	roll	$5,%eax
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	rorl	$7,%ebx
+	addl	%eax,%ebp
+	addl	28(%rsp),%edx
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	addl	32(%rsp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%edi
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	rorl	$7,%ebp
+	addl	%edx,%ecx
+	addl	36(%rsp),%ebx
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	40(%rsp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%edi
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	44(%rsp),%ebp
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	roll	$5,%eax
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	addl	%eax,%ebp
+	addl	48(%rsp),%edx
+	xorl	%ebx,%esi
+	movl	%ebp,%edi
+	roll	$5,%ebp
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	addl	52(%rsp),%ecx
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	roll	$5,%edx
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%ebp
+	addl	%edx,%ecx
+	addl	56(%rsp),%ebx
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	60(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	addl	%edi,%eax
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	0(%r8),%eax
+	addl	4(%r8),%esi
+	addl	8(%r8),%ecx
+	movl	%eax,0(%r8)
+	addl	12(%r8),%edx
+	movl	%esi,4(%r8)
+	addl	16(%r8),%ebp
+	movl	%ecx,8(%r8)
+	movl	%edx,12(%r8)
+	movl	%ebp,16(%r8)
+	movq	-40(%r11),%r14
+.cfi_restore	%r14
+	movq	-32(%r11),%r13
+.cfi_restore	%r13
+	movq	-24(%r11),%r12
+.cfi_restore	%r12
+	movq	-16(%r11),%rbp
+.cfi_restore	%rbp
+	movq	-8(%r11),%rbx
+.cfi_restore	%rbx
+	leaq	(%r11),%rsp
+.cfi_def_cfa_register	%rsp
+.Lepilogue_ssse3:
+	ret
+.cfi_endproc	
+.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
+.globl	sha1_block_data_order_avx
+.hidden sha1_block_data_order_avx
+.type	sha1_block_data_order_avx,@function
+.align	16
+sha1_block_data_order_avx:
+.cfi_startproc	
+_CET_ENDBR
+	movq	%rsp,%r11
+.cfi_def_cfa_register	%r11
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	leaq	-64(%rsp),%rsp
+	vzeroupper
+	andq	$-64,%rsp
+	movq	%rdi,%r8
+	movq	%rsi,%r9
+	movq	%rdx,%r10
+
+	shlq	$6,%r10
+	addq	%r9,%r10
+	leaq	K_XX_XX+64(%rip),%r14
+
+	movl	0(%r8),%eax
+	movl	4(%r8),%ebx
+	movl	8(%r8),%ecx
+	movl	12(%r8),%edx
+	movl	%ebx,%esi
+	movl	16(%r8),%ebp
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	andl	%edi,%esi
+
+	vmovdqa	64(%r14),%xmm6
+	vmovdqa	-64(%r14),%xmm11
+	vmovdqu	0(%r9),%xmm0
+	vmovdqu	16(%r9),%xmm1
+	vmovdqu	32(%r9),%xmm2
+	vmovdqu	48(%r9),%xmm3
+	vpshufb	%xmm6,%xmm0,%xmm0
+	addq	$64,%r9
+	vpshufb	%xmm6,%xmm1,%xmm1
+	vpshufb	%xmm6,%xmm2,%xmm2
+	vpshufb	%xmm6,%xmm3,%xmm3
+	vpaddd	%xmm11,%xmm0,%xmm4
+	vpaddd	%xmm11,%xmm1,%xmm5
+	vpaddd	%xmm11,%xmm2,%xmm6
+	vmovdqa	%xmm4,0(%rsp)
+	vmovdqa	%xmm5,16(%rsp)
+	vmovdqa	%xmm6,32(%rsp)
+	jmp	.Loop_avx
+.align	16
+.Loop_avx:
+	shrdl	$2,%ebx,%ebx
+	xorl	%edx,%esi
+	vpalignr	$8,%xmm0,%xmm1,%xmm4
+	movl	%eax,%edi
+	addl	0(%rsp),%ebp
+	vpaddd	%xmm3,%xmm11,%xmm9
+	xorl	%ecx,%ebx
+	shldl	$5,%eax,%eax
+	vpsrldq	$4,%xmm3,%xmm8
+	addl	%esi,%ebp
+	andl	%ebx,%edi
+	vpxor	%xmm0,%xmm4,%xmm4
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	vpxor	%xmm2,%xmm8,%xmm8
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
+	addl	4(%rsp),%edx
+	vpxor	%xmm8,%xmm4,%xmm4
+	xorl	%ebx,%eax
+	shldl	$5,%ebp,%ebp
+	vmovdqa	%xmm9,48(%rsp)
+	addl	%edi,%edx
+	andl	%eax,%esi
+	vpsrld	$31,%xmm4,%xmm8
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	shrdl	$7,%ebp,%ebp
+	xorl	%ebx,%esi
+	vpslldq	$12,%xmm4,%xmm10
+	vpaddd	%xmm4,%xmm4,%xmm4
+	movl	%edx,%edi
+	addl	8(%rsp),%ecx
+	xorl	%eax,%ebp
+	shldl	$5,%edx,%edx
+	vpsrld	$30,%xmm10,%xmm9
+	vpor	%xmm8,%xmm4,%xmm4
+	addl	%esi,%ecx
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	vpslld	$2,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm4,%xmm4
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%edi
+	movl	%ecx,%esi
+	addl	12(%rsp),%ebx
+	vpxor	%xmm10,%xmm4,%xmm4
+	xorl	%ebp,%edx
+	shldl	$5,%ecx,%ecx
+	addl	%edi,%ebx
+	andl	%edx,%esi
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	shrdl	$7,%ecx,%ecx
+	xorl	%ebp,%esi
+	vpalignr	$8,%xmm1,%xmm2,%xmm5
+	movl	%ebx,%edi
+	addl	16(%rsp),%eax
+	vpaddd	%xmm4,%xmm11,%xmm9
+	xorl	%edx,%ecx
+	shldl	$5,%ebx,%ebx
+	vpsrldq	$4,%xmm4,%xmm8
+	addl	%esi,%eax
+	andl	%ecx,%edi
+	vpxor	%xmm1,%xmm5,%xmm5
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	vpxor	%xmm3,%xmm8,%xmm8
+	shrdl	$7,%ebx,%ebx
+	xorl	%edx,%edi
+	movl	%eax,%esi
+	addl	20(%rsp),%ebp
+	vpxor	%xmm8,%xmm5,%xmm5
+	xorl	%ecx,%ebx
+	shldl	$5,%eax,%eax
+	vmovdqa	%xmm9,0(%rsp)
+	addl	%edi,%ebp
+	andl	%ebx,%esi
+	vpsrld	$31,%xmm5,%xmm8
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%esi
+	vpslldq	$12,%xmm5,%xmm10
+	vpaddd	%xmm5,%xmm5,%xmm5
+	movl	%ebp,%edi
+	addl	24(%rsp),%edx
+	xorl	%ebx,%eax
+	shldl	$5,%ebp,%ebp
+	vpsrld	$30,%xmm10,%xmm9
+	vpor	%xmm8,%xmm5,%xmm5
+	addl	%esi,%edx
+	andl	%eax,%edi
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	vpslld	$2,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm5,%xmm5
+	shrdl	$7,%ebp,%ebp
+	xorl	%ebx,%edi
+	movl	%edx,%esi
+	addl	28(%rsp),%ecx
+	vpxor	%xmm10,%xmm5,%xmm5
+	xorl	%eax,%ebp
+	shldl	$5,%edx,%edx
+	vmovdqa	-32(%r14),%xmm11
+	addl	%edi,%ecx
+	andl	%ebp,%esi
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%esi
+	vpalignr	$8,%xmm2,%xmm3,%xmm6
+	movl	%ecx,%edi
+	addl	32(%rsp),%ebx
+	vpaddd	%xmm5,%xmm11,%xmm9
+	xorl	%ebp,%edx
+	shldl	$5,%ecx,%ecx
+	vpsrldq	$4,%xmm5,%xmm8
+	addl	%esi,%ebx
+	andl	%edx,%edi
+	vpxor	%xmm2,%xmm6,%xmm6
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	vpxor	%xmm4,%xmm8,%xmm8
+	shrdl	$7,%ecx,%ecx
+	xorl	%ebp,%edi
+	movl	%ebx,%esi
+	addl	36(%rsp),%eax
+	vpxor	%xmm8,%xmm6,%xmm6
+	xorl	%edx,%ecx
+	shldl	$5,%ebx,%ebx
+	vmovdqa	%xmm9,16(%rsp)
+	addl	%edi,%eax
+	andl	%ecx,%esi
+	vpsrld	$31,%xmm6,%xmm8
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	shrdl	$7,%ebx,%ebx
+	xorl	%edx,%esi
+	vpslldq	$12,%xmm6,%xmm10
+	vpaddd	%xmm6,%xmm6,%xmm6
+	movl	%eax,%edi
+	addl	40(%rsp),%ebp
+	xorl	%ecx,%ebx
+	shldl	$5,%eax,%eax
+	vpsrld	$30,%xmm10,%xmm9
+	vpor	%xmm8,%xmm6,%xmm6
+	addl	%esi,%ebp
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	vpslld	$2,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm6,%xmm6
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
+	addl	44(%rsp),%edx
+	vpxor	%xmm10,%xmm6,%xmm6
+	xorl	%ebx,%eax
+	shldl	$5,%ebp,%ebp
+	addl	%edi,%edx
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	shrdl	$7,%ebp,%ebp
+	xorl	%ebx,%esi
+	vpalignr	$8,%xmm3,%xmm4,%xmm7
+	movl	%edx,%edi
+	addl	48(%rsp),%ecx
+	vpaddd	%xmm6,%xmm11,%xmm9
+	xorl	%eax,%ebp
+	shldl	$5,%edx,%edx
+	vpsrldq	$4,%xmm6,%xmm8
+	addl	%esi,%ecx
+	andl	%ebp,%edi
+	vpxor	%xmm3,%xmm7,%xmm7
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	vpxor	%xmm5,%xmm8,%xmm8
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%edi
+	movl	%ecx,%esi
+	addl	52(%rsp),%ebx
+	vpxor	%xmm8,%xmm7,%xmm7
+	xorl	%ebp,%edx
+	shldl	$5,%ecx,%ecx
+	vmovdqa	%xmm9,32(%rsp)
+	addl	%edi,%ebx
+	andl	%edx,%esi
+	vpsrld	$31,%xmm7,%xmm8
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	shrdl	$7,%ecx,%ecx
+	xorl	%ebp,%esi
+	vpslldq	$12,%xmm7,%xmm10
+	vpaddd	%xmm7,%xmm7,%xmm7
+	movl	%ebx,%edi
+	addl	56(%rsp),%eax
+	xorl	%edx,%ecx
+	shldl	$5,%ebx,%ebx
+	vpsrld	$30,%xmm10,%xmm9
+	vpor	%xmm8,%xmm7,%xmm7
+	addl	%esi,%eax
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	vpslld	$2,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm7,%xmm7
+	shrdl	$7,%ebx,%ebx
+	xorl	%edx,%edi
+	movl	%eax,%esi
+	addl	60(%rsp),%ebp
+	vpxor	%xmm10,%xmm7,%xmm7
+	xorl	%ecx,%ebx
+	shldl	$5,%eax,%eax
+	addl	%edi,%ebp
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	vpalignr	$8,%xmm6,%xmm7,%xmm8
+	vpxor	%xmm4,%xmm0,%xmm0
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%esi
+	movl	%ebp,%edi
+	addl	0(%rsp),%edx
+	vpxor	%xmm1,%xmm0,%xmm0
+	xorl	%ebx,%eax
+	shldl	$5,%ebp,%ebp
+	vpaddd	%xmm7,%xmm11,%xmm9
+	addl	%esi,%edx
+	andl	%eax,%edi
+	vpxor	%xmm8,%xmm0,%xmm0
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	shrdl	$7,%ebp,%ebp
+	xorl	%ebx,%edi
+	vpsrld	$30,%xmm0,%xmm8
+	vmovdqa	%xmm9,48(%rsp)
+	movl	%edx,%esi
+	addl	4(%rsp),%ecx
+	xorl	%eax,%ebp
+	shldl	$5,%edx,%edx
+	vpslld	$2,%xmm0,%xmm0
+	addl	%edi,%ecx
+	andl	%ebp,%esi
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%esi
+	movl	%ecx,%edi
+	addl	8(%rsp),%ebx
+	vpor	%xmm8,%xmm0,%xmm0
+	xorl	%ebp,%edx
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	addl	12(%rsp),%eax
+	xorl	%ebp,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vpalignr	$8,%xmm7,%xmm0,%xmm8
+	vpxor	%xmm5,%xmm1,%xmm1
+	addl	16(%rsp),%ebp
+	xorl	%ecx,%esi
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	vpxor	%xmm2,%xmm1,%xmm1
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	vpaddd	%xmm0,%xmm11,%xmm9
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	vpxor	%xmm8,%xmm1,%xmm1
+	addl	20(%rsp),%edx
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	vpsrld	$30,%xmm1,%xmm8
+	vmovdqa	%xmm9,0(%rsp)
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	vpslld	$2,%xmm1,%xmm1
+	addl	24(%rsp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	vpor	%xmm8,%xmm1,%xmm1
+	addl	28(%rsp),%ebx
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vpalignr	$8,%xmm0,%xmm1,%xmm8
+	vpxor	%xmm6,%xmm2,%xmm2
+	addl	32(%rsp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	vpxor	%xmm3,%xmm2,%xmm2
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	vpaddd	%xmm1,%xmm11,%xmm9
+	vmovdqa	0(%r14),%xmm11
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vpxor	%xmm8,%xmm2,%xmm2
+	addl	36(%rsp),%ebp
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	vpsrld	$30,%xmm2,%xmm8
+	vmovdqa	%xmm9,16(%rsp)
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	vpslld	$2,%xmm2,%xmm2
+	addl	40(%rsp),%edx
+	xorl	%ebx,%esi
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	vpor	%xmm8,%xmm2,%xmm2
+	addl	44(%rsp),%ecx
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	vpalignr	$8,%xmm1,%xmm2,%xmm8
+	vpxor	%xmm7,%xmm3,%xmm3
+	addl	48(%rsp),%ebx
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	vpxor	%xmm4,%xmm3,%xmm3
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	vpaddd	%xmm2,%xmm11,%xmm9
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vpxor	%xmm8,%xmm3,%xmm3
+	addl	52(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	vpsrld	$30,%xmm3,%xmm8
+	vmovdqa	%xmm9,32(%rsp)
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vpslld	$2,%xmm3,%xmm3
+	addl	56(%rsp),%ebp
+	xorl	%ecx,%esi
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	vpor	%xmm8,%xmm3,%xmm3
+	addl	60(%rsp),%edx
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	vpalignr	$8,%xmm2,%xmm3,%xmm8
+	vpxor	%xmm0,%xmm4,%xmm4
+	addl	0(%rsp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	vpxor	%xmm5,%xmm4,%xmm4
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	vpaddd	%xmm3,%xmm11,%xmm9
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	vpxor	%xmm8,%xmm4,%xmm4
+	addl	4(%rsp),%ebx
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	vpsrld	$30,%xmm4,%xmm8
+	vmovdqa	%xmm9,48(%rsp)
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vpslld	$2,%xmm4,%xmm4
+	addl	8(%rsp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vpor	%xmm8,%xmm4,%xmm4
+	addl	12(%rsp),%ebp
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	vpalignr	$8,%xmm3,%xmm4,%xmm8
+	vpxor	%xmm1,%xmm5,%xmm5
+	addl	16(%rsp),%edx
+	xorl	%ebx,%esi
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	vpxor	%xmm6,%xmm5,%xmm5
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	vpaddd	%xmm4,%xmm11,%xmm9
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	vpxor	%xmm8,%xmm5,%xmm5
+	addl	20(%rsp),%ecx
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	vpsrld	$30,%xmm5,%xmm8
+	vmovdqa	%xmm9,0(%rsp)
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	vpslld	$2,%xmm5,%xmm5
+	addl	24(%rsp),%ebx
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vpor	%xmm8,%xmm5,%xmm5
+	addl	28(%rsp),%eax
+	shrdl	$7,%ecx,%ecx
+	movl	%ebx,%esi
+	xorl	%edx,%edi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	vpalignr	$8,%xmm4,%xmm5,%xmm8
+	vpxor	%xmm2,%xmm6,%xmm6
+	addl	32(%rsp),%ebp
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%eax,%edi
+	xorl	%ecx,%esi
+	vpaddd	%xmm5,%xmm11,%xmm9
+	shldl	$5,%eax,%eax
+	addl	%esi,%ebp
+	vpxor	%xmm8,%xmm6,%xmm6
+	xorl	%ebx,%edi
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	addl	36(%rsp),%edx
+	vpsrld	$30,%xmm6,%xmm8
+	vmovdqa	%xmm9,16(%rsp)
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	movl	%ebp,%esi
+	vpslld	$2,%xmm6,%xmm6
+	xorl	%ebx,%edi
+	shldl	$5,%ebp,%ebp
+	addl	%edi,%edx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	addl	40(%rsp),%ecx
+	andl	%eax,%esi
+	vpor	%xmm8,%xmm6,%xmm6
+	xorl	%ebx,%eax
+	shrdl	$7,%ebp,%ebp
+	movl	%edx,%edi
+	xorl	%eax,%esi
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%ebp,%edi
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	addl	44(%rsp),%ebx
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	shrdl	$7,%edx,%edx
+	movl	%ecx,%esi
+	xorl	%ebp,%edi
+	shldl	$5,%ecx,%ecx
+	addl	%edi,%ebx
+	xorl	%edx,%esi
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	vpalignr	$8,%xmm5,%xmm6,%xmm8
+	vpxor	%xmm3,%xmm7,%xmm7
+	addl	48(%rsp),%eax
+	andl	%edx,%esi
+	xorl	%ebp,%edx
+	shrdl	$7,%ecx,%ecx
+	vpxor	%xmm0,%xmm7,%xmm7
+	movl	%ebx,%edi
+	xorl	%edx,%esi
+	vpaddd	%xmm6,%xmm11,%xmm9
+	vmovdqa	32(%r14),%xmm11
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	vpxor	%xmm8,%xmm7,%xmm7
+	xorl	%ecx,%edi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	52(%rsp),%ebp
+	vpsrld	$30,%xmm7,%xmm8
+	vmovdqa	%xmm9,32(%rsp)
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	movl	%eax,%esi
+	vpslld	$2,%xmm7,%xmm7
+	xorl	%ecx,%edi
+	shldl	$5,%eax,%eax
+	addl	%edi,%ebp
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	addl	56(%rsp),%edx
+	andl	%ebx,%esi
+	vpor	%xmm8,%xmm7,%xmm7
+	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	movl	%ebp,%edi
+	xorl	%ebx,%esi
+	shldl	$5,%ebp,%ebp
+	addl	%esi,%edx
+	xorl	%eax,%edi
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	addl	60(%rsp),%ecx
+	andl	%eax,%edi
+	xorl	%ebx,%eax
+	shrdl	$7,%ebp,%ebp
+	movl	%edx,%esi
+	xorl	%eax,%edi
+	shldl	$5,%edx,%edx
+	addl	%edi,%ecx
+	xorl	%ebp,%esi
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	vpalignr	$8,%xmm6,%xmm7,%xmm8
+	vpxor	%xmm4,%xmm0,%xmm0
+	addl	0(%rsp),%ebx
+	andl	%ebp,%esi
+	xorl	%eax,%ebp
+	shrdl	$7,%edx,%edx
+	vpxor	%xmm1,%xmm0,%xmm0
+	movl	%ecx,%edi
+	xorl	%ebp,%esi
+	vpaddd	%xmm7,%xmm11,%xmm9
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	vpxor	%xmm8,%xmm0,%xmm0
+	xorl	%edx,%edi
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	addl	4(%rsp),%eax
+	vpsrld	$30,%xmm0,%xmm8
+	vmovdqa	%xmm9,48(%rsp)
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	shrdl	$7,%ecx,%ecx
+	movl	%ebx,%esi
+	vpslld	$2,%xmm0,%xmm0
+	xorl	%edx,%edi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	8(%rsp),%ebp
+	andl	%ecx,%esi
+	vpor	%xmm8,%xmm0,%xmm0
+	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	movl	%eax,%edi
+	xorl	%ecx,%esi
+	shldl	$5,%eax,%eax
+	addl	%esi,%ebp
+	xorl	%ebx,%edi
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	addl	12(%rsp),%edx
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	movl	%ebp,%esi
+	xorl	%ebx,%edi
+	shldl	$5,%ebp,%ebp
+	addl	%edi,%edx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	vpalignr	$8,%xmm7,%xmm0,%xmm8
+	vpxor	%xmm5,%xmm1,%xmm1
+	addl	16(%rsp),%ecx
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	shrdl	$7,%ebp,%ebp
+	vpxor	%xmm2,%xmm1,%xmm1
+	movl	%edx,%edi
+	xorl	%eax,%esi
+	vpaddd	%xmm0,%xmm11,%xmm9
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	vpxor	%xmm8,%xmm1,%xmm1
+	xorl	%ebp,%edi
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	addl	20(%rsp),%ebx
+	vpsrld	$30,%xmm1,%xmm8
+	vmovdqa	%xmm9,0(%rsp)
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	shrdl	$7,%edx,%edx
+	movl	%ecx,%esi
+	vpslld	$2,%xmm1,%xmm1
+	xorl	%ebp,%edi
+	shldl	$5,%ecx,%ecx
+	addl	%edi,%ebx
+	xorl	%edx,%esi
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	addl	24(%rsp),%eax
+	andl	%edx,%esi
+	vpor	%xmm8,%xmm1,%xmm1
+	xorl	%ebp,%edx
+	shrdl	$7,%ecx,%ecx
+	movl	%ebx,%edi
+	xorl	%edx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%ecx,%edi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	28(%rsp),%ebp
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	movl	%eax,%esi
+	xorl	%ecx,%edi
+	shldl	$5,%eax,%eax
+	addl	%edi,%ebp
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	vpalignr	$8,%xmm0,%xmm1,%xmm8
+	vpxor	%xmm6,%xmm2,%xmm2
+	addl	32(%rsp),%edx
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	vpxor	%xmm3,%xmm2,%xmm2
+	movl	%ebp,%edi
+	xorl	%ebx,%esi
+	vpaddd	%xmm1,%xmm11,%xmm9
+	shldl	$5,%ebp,%ebp
+	addl	%esi,%edx
+	vpxor	%xmm8,%xmm2,%xmm2
+	xorl	%eax,%edi
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	addl	36(%rsp),%ecx
+	vpsrld	$30,%xmm2,%xmm8
+	vmovdqa	%xmm9,16(%rsp)
+	andl	%eax,%edi
+	xorl	%ebx,%eax
+	shrdl	$7,%ebp,%ebp
+	movl	%edx,%esi
+	vpslld	$2,%xmm2,%xmm2
+	xorl	%eax,%edi
+	shldl	$5,%edx,%edx
+	addl	%edi,%ecx
+	xorl	%ebp,%esi
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	addl	40(%rsp),%ebx
+	andl	%ebp,%esi
+	vpor	%xmm8,%xmm2,%xmm2
+	xorl	%eax,%ebp
+	shrdl	$7,%edx,%edx
+	movl	%ecx,%edi
+	xorl	%ebp,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%edx,%edi
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	addl	44(%rsp),%eax
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	shrdl	$7,%ecx,%ecx
+	movl	%ebx,%esi
+	xorl	%edx,%edi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	vpalignr	$8,%xmm1,%xmm2,%xmm8
+	vpxor	%xmm7,%xmm3,%xmm3
+	addl	48(%rsp),%ebp
+	xorl	%ecx,%esi
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	vpxor	%xmm4,%xmm3,%xmm3
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	vpaddd	%xmm2,%xmm11,%xmm9
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	vpxor	%xmm8,%xmm3,%xmm3
+	addl	52(%rsp),%edx
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	vpsrld	$30,%xmm3,%xmm8
+	vmovdqa	%xmm9,32(%rsp)
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	vpslld	$2,%xmm3,%xmm3
+	addl	56(%rsp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	vpor	%xmm8,%xmm3,%xmm3
+	addl	60(%rsp),%ebx
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	0(%rsp),%eax
+	vpaddd	%xmm3,%xmm11,%xmm9
+	xorl	%edx,%esi
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	vmovdqa	%xmm9,48(%rsp)
+	xorl	%edx,%edi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	4(%rsp),%ebp
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	addl	8(%rsp),%edx
+	xorl	%ebx,%esi
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	addl	12(%rsp),%ecx
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	cmpq	%r10,%r9
+	je	.Ldone_avx
+	vmovdqa	64(%r14),%xmm6
+	vmovdqa	-64(%r14),%xmm11
+	vmovdqu	0(%r9),%xmm0
+	vmovdqu	16(%r9),%xmm1
+	vmovdqu	32(%r9),%xmm2
+	vmovdqu	48(%r9),%xmm3
+	vpshufb	%xmm6,%xmm0,%xmm0
+	addq	$64,%r9
+	addl	16(%rsp),%ebx
+	xorl	%ebp,%esi
+	vpshufb	%xmm6,%xmm1,%xmm1
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	vpaddd	%xmm11,%xmm0,%xmm4
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vmovdqa	%xmm4,0(%rsp)
+	addl	20(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	24(%rsp),%ebp
+	xorl	%ecx,%esi
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	addl	28(%rsp),%edx
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	addl	32(%rsp),%ecx
+	xorl	%eax,%esi
+	vpshufb	%xmm6,%xmm2,%xmm2
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	vpaddd	%xmm11,%xmm1,%xmm5
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	vmovdqa	%xmm5,16(%rsp)
+	addl	36(%rsp),%ebx
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	40(%rsp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	44(%rsp),%ebp
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	addl	48(%rsp),%edx
+	xorl	%ebx,%esi
+	vpshufb	%xmm6,%xmm3,%xmm3
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	vpaddd	%xmm11,%xmm2,%xmm6
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	vmovdqa	%xmm6,32(%rsp)
+	addl	52(%rsp),%ecx
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	addl	56(%rsp),%ebx
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	60(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	0(%r8),%eax
+	addl	4(%r8),%esi
+	addl	8(%r8),%ecx
+	addl	12(%r8),%edx
+	movl	%eax,0(%r8)
+	addl	16(%r8),%ebp
+	movl	%esi,4(%r8)
+	movl	%esi,%ebx
+	movl	%ecx,8(%r8)
+	movl	%ecx,%edi
+	movl	%edx,12(%r8)
+	xorl	%edx,%edi
+	movl	%ebp,16(%r8)
+	andl	%edi,%esi
+	jmp	.Loop_avx
+
+.align	16
+.Ldone_avx:
+	addl	16(%rsp),%ebx
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	20(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	24(%rsp),%ebp
+	xorl	%ecx,%esi
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	addl	28(%rsp),%edx
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	addl	32(%rsp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	addl	36(%rsp),%ebx
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	40(%rsp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	44(%rsp),%ebp
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	addl	48(%rsp),%edx
+	xorl	%ebx,%esi
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	addl	52(%rsp),%ecx
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	addl	56(%rsp),%ebx
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	60(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vzeroupper
+
+	addl	0(%r8),%eax
+	addl	4(%r8),%esi
+	addl	8(%r8),%ecx
+	movl	%eax,0(%r8)
+	addl	12(%r8),%edx
+	movl	%esi,4(%r8)
+	addl	16(%r8),%ebp
+	movl	%ecx,8(%r8)
+	movl	%edx,12(%r8)
+	movl	%ebp,16(%r8)
+	movq	-40(%r11),%r14
+.cfi_restore	%r14
+	movq	-32(%r11),%r13
+.cfi_restore	%r13
+	movq	-24(%r11),%r12
+.cfi_restore	%r12
+	movq	-16(%r11),%rbp
+.cfi_restore	%rbp
+	movq	-8(%r11),%rbx
+.cfi_restore	%rbx
+	leaq	(%r11),%rsp
+.cfi_def_cfa_register	%rsp
+.Lepilogue_avx:
+	ret
+.cfi_endproc	
+.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
+.globl	sha1_block_data_order_avx2
+.hidden sha1_block_data_order_avx2
+.type	sha1_block_data_order_avx2,@function
+.align	16
+sha1_block_data_order_avx2:
+.cfi_startproc	
+_CET_ENDBR
+	movq	%rsp,%r11
+.cfi_def_cfa_register	%r11
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	vzeroupper
+	movq	%rdi,%r8
+	movq	%rsi,%r9
+	movq	%rdx,%r10
+
+	leaq	-640(%rsp),%rsp
+	shlq	$6,%r10
+	leaq	64(%r9),%r13
+	andq	$-128,%rsp
+	addq	%r9,%r10
+	leaq	K_XX_XX+64(%rip),%r14
+
+	movl	0(%r8),%eax
+	cmpq	%r10,%r13
+	cmovaeq	%r9,%r13
+	movl	4(%r8),%ebp
+	movl	8(%r8),%ecx
+	movl	12(%r8),%edx
+	movl	16(%r8),%esi
+	vmovdqu	64(%r14),%ymm6
+
+	vmovdqu	(%r9),%xmm0
+	vmovdqu	16(%r9),%xmm1
+	vmovdqu	32(%r9),%xmm2
+	vmovdqu	48(%r9),%xmm3
+	leaq	64(%r9),%r9
+	vinserti128	$1,(%r13),%ymm0,%ymm0
+	vinserti128	$1,16(%r13),%ymm1,%ymm1
+	vpshufb	%ymm6,%ymm0,%ymm0
+	vinserti128	$1,32(%r13),%ymm2,%ymm2
+	vpshufb	%ymm6,%ymm1,%ymm1
+	vinserti128	$1,48(%r13),%ymm3,%ymm3
+	vpshufb	%ymm6,%ymm2,%ymm2
+	vmovdqu	-64(%r14),%ymm11
+	vpshufb	%ymm6,%ymm3,%ymm3
+
+	vpaddd	%ymm11,%ymm0,%ymm4
+	vpaddd	%ymm11,%ymm1,%ymm5
+	vmovdqu	%ymm4,0(%rsp)
+	vpaddd	%ymm11,%ymm2,%ymm6
+	vmovdqu	%ymm5,32(%rsp)
+	vpaddd	%ymm11,%ymm3,%ymm7
+	vmovdqu	%ymm6,64(%rsp)
+	vmovdqu	%ymm7,96(%rsp)
+	vpalignr	$8,%ymm0,%ymm1,%ymm4
+	vpsrldq	$4,%ymm3,%ymm8
+	vpxor	%ymm0,%ymm4,%ymm4
+	vpxor	%ymm2,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$31,%ymm4,%ymm8
+	vpslldq	$12,%ymm4,%ymm10
+	vpaddd	%ymm4,%ymm4,%ymm4
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm4,%ymm4
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm4,%ymm4
+	vpxor	%ymm10,%ymm4,%ymm4
+	vpaddd	%ymm11,%ymm4,%ymm9
+	vmovdqu	%ymm9,128(%rsp)
+	vpalignr	$8,%ymm1,%ymm2,%ymm5
+	vpsrldq	$4,%ymm4,%ymm8
+	vpxor	%ymm1,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$31,%ymm5,%ymm8
+	vmovdqu	-32(%r14),%ymm11
+	vpslldq	$12,%ymm5,%ymm10
+	vpaddd	%ymm5,%ymm5,%ymm5
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm5,%ymm5
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm10,%ymm5,%ymm5
+	vpaddd	%ymm11,%ymm5,%ymm9
+	vmovdqu	%ymm9,160(%rsp)
+	vpalignr	$8,%ymm2,%ymm3,%ymm6
+	vpsrldq	$4,%ymm5,%ymm8
+	vpxor	%ymm2,%ymm6,%ymm6
+	vpxor	%ymm4,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$31,%ymm6,%ymm8
+	vpslldq	$12,%ymm6,%ymm10
+	vpaddd	%ymm6,%ymm6,%ymm6
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm6,%ymm6
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm6,%ymm6
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpaddd	%ymm11,%ymm6,%ymm9
+	vmovdqu	%ymm9,192(%rsp)
+	vpalignr	$8,%ymm3,%ymm4,%ymm7
+	vpsrldq	$4,%ymm6,%ymm8
+	vpxor	%ymm3,%ymm7,%ymm7
+	vpxor	%ymm5,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$31,%ymm7,%ymm8
+	vpslldq	$12,%ymm7,%ymm10
+	vpaddd	%ymm7,%ymm7,%ymm7
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm7,%ymm7
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm7,%ymm7
+	vpaddd	%ymm11,%ymm7,%ymm9
+	vmovdqu	%ymm9,224(%rsp)
+	leaq	128(%rsp),%r13
+	jmp	.Loop_avx2
+.align	32
+.Loop_avx2:
+	rorxl	$2,%ebp,%ebx
+	andnl	%edx,%ebp,%edi
+	andl	%ecx,%ebp
+	xorl	%edi,%ebp
+	jmp	.Lalign32_1
+.align	32
+.Lalign32_1:
+	vpalignr	$8,%ymm6,%ymm7,%ymm8
+	vpxor	%ymm4,%ymm0,%ymm0
+	addl	-128(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	vpxor	%ymm1,%ymm0,%ymm0
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	vpxor	%ymm8,%ymm0,%ymm0
+	andl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	vpsrld	$30,%ymm0,%ymm8
+	vpslld	$2,%ymm0,%ymm0
+	addl	-124(%r13),%edx
+	andnl	%ebx,%esi,%edi
+	addl	%eax,%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	andl	%ebp,%esi
+	vpor	%ymm8,%ymm0,%ymm0
+	addl	%r12d,%edx
+	xorl	%edi,%esi
+	addl	-120(%r13),%ecx
+	andnl	%ebp,%edx,%edi
+	vpaddd	%ymm11,%ymm0,%ymm9
+	addl	%esi,%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	andl	%eax,%edx
+	vmovdqu	%ymm9,256(%rsp)
+	addl	%r12d,%ecx
+	xorl	%edi,%edx
+	addl	-116(%r13),%ebx
+	andnl	%eax,%ecx,%edi
+	addl	%edx,%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	andl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%edi,%ecx
+	addl	-96(%r13),%ebp
+	andnl	%esi,%ebx,%edi
+	addl	%ecx,%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	andl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%edi,%ebx
+	vpalignr	$8,%ymm7,%ymm0,%ymm8
+	vpxor	%ymm5,%ymm1,%ymm1
+	addl	-92(%r13),%eax
+	andnl	%edx,%ebp,%edi
+	vpxor	%ymm2,%ymm1,%ymm1
+	addl	%ebx,%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	vpxor	%ymm8,%ymm1,%ymm1
+	andl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edi,%ebp
+	vpsrld	$30,%ymm1,%ymm8
+	vpslld	$2,%ymm1,%ymm1
+	addl	-88(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	vpor	%ymm8,%ymm1,%ymm1
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	-84(%r13),%edx
+	andnl	%ebx,%esi,%edi
+	vpaddd	%ymm11,%ymm1,%ymm9
+	addl	%eax,%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	andl	%ebp,%esi
+	vmovdqu	%ymm9,288(%rsp)
+	addl	%r12d,%edx
+	xorl	%edi,%esi
+	addl	-64(%r13),%ecx
+	andnl	%ebp,%edx,%edi
+	addl	%esi,%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	andl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%edi,%edx
+	addl	-60(%r13),%ebx
+	andnl	%eax,%ecx,%edi
+	addl	%edx,%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	andl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%edi,%ecx
+	vpalignr	$8,%ymm0,%ymm1,%ymm8
+	vpxor	%ymm6,%ymm2,%ymm2
+	addl	-56(%r13),%ebp
+	andnl	%esi,%ebx,%edi
+	vpxor	%ymm3,%ymm2,%ymm2
+	vmovdqu	0(%r14),%ymm11
+	addl	%ecx,%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	vpxor	%ymm8,%ymm2,%ymm2
+	andl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%edi,%ebx
+	vpsrld	$30,%ymm2,%ymm8
+	vpslld	$2,%ymm2,%ymm2
+	addl	-52(%r13),%eax
+	andnl	%edx,%ebp,%edi
+	addl	%ebx,%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	andl	%ecx,%ebp
+	vpor	%ymm8,%ymm2,%ymm2
+	addl	%r12d,%eax
+	xorl	%edi,%ebp
+	addl	-32(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	vpaddd	%ymm11,%ymm2,%ymm9
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	vmovdqu	%ymm9,320(%rsp)
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	-28(%r13),%edx
+	andnl	%ebx,%esi,%edi
+	addl	%eax,%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	andl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%edi,%esi
+	addl	-24(%r13),%ecx
+	andnl	%ebp,%edx,%edi
+	addl	%esi,%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	andl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%edi,%edx
+	vpalignr	$8,%ymm1,%ymm2,%ymm8
+	vpxor	%ymm7,%ymm3,%ymm3
+	addl	-20(%r13),%ebx
+	andnl	%eax,%ecx,%edi
+	vpxor	%ymm4,%ymm3,%ymm3
+	addl	%edx,%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	vpxor	%ymm8,%ymm3,%ymm3
+	andl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%edi,%ecx
+	vpsrld	$30,%ymm3,%ymm8
+	vpslld	$2,%ymm3,%ymm3
+	addl	0(%r13),%ebp
+	andnl	%esi,%ebx,%edi
+	addl	%ecx,%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	andl	%edx,%ebx
+	vpor	%ymm8,%ymm3,%ymm3
+	addl	%r12d,%ebp
+	xorl	%edi,%ebx
+	addl	4(%r13),%eax
+	andnl	%edx,%ebp,%edi
+	vpaddd	%ymm11,%ymm3,%ymm9
+	addl	%ebx,%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	andl	%ecx,%ebp
+	vmovdqu	%ymm9,352(%rsp)
+	addl	%r12d,%eax
+	xorl	%edi,%ebp
+	addl	8(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	12(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	vpalignr	$8,%ymm2,%ymm3,%ymm8
+	vpxor	%ymm0,%ymm4,%ymm4
+	addl	32(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	vpxor	%ymm5,%ymm4,%ymm4
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	vpxor	%ymm8,%ymm4,%ymm4
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	36(%r13),%ebx
+	vpsrld	$30,%ymm4,%ymm8
+	vpslld	$2,%ymm4,%ymm4
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	vpor	%ymm8,%ymm4,%ymm4
+	addl	40(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	vpaddd	%ymm11,%ymm4,%ymm9
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	44(%r13),%eax
+	vmovdqu	%ymm9,384(%rsp)
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	64(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	vpalignr	$8,%ymm3,%ymm4,%ymm8
+	vpxor	%ymm1,%ymm5,%ymm5
+	addl	68(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	vpxor	%ymm6,%ymm5,%ymm5
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	vpxor	%ymm8,%ymm5,%ymm5
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	72(%r13),%ecx
+	vpsrld	$30,%ymm5,%ymm8
+	vpslld	$2,%ymm5,%ymm5
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	vpor	%ymm8,%ymm5,%ymm5
+	addl	76(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	vpaddd	%ymm11,%ymm5,%ymm9
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	96(%r13),%ebp
+	vmovdqu	%ymm9,416(%rsp)
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	100(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	vpalignr	$8,%ymm4,%ymm5,%ymm8
+	vpxor	%ymm2,%ymm6,%ymm6
+	addl	104(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	vpxor	%ymm7,%ymm6,%ymm6
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	vpxor	%ymm8,%ymm6,%ymm6
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	108(%r13),%edx
+	leaq	256(%r13),%r13
+	vpsrld	$30,%ymm6,%ymm8
+	vpslld	$2,%ymm6,%ymm6
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	vpor	%ymm8,%ymm6,%ymm6
+	addl	-128(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	vpaddd	%ymm11,%ymm6,%ymm9
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-124(%r13),%ebx
+	vmovdqu	%ymm9,448(%rsp)
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	-120(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	vpalignr	$8,%ymm5,%ymm6,%ymm8
+	vpxor	%ymm3,%ymm7,%ymm7
+	addl	-116(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	vpxor	%ymm0,%ymm7,%ymm7
+	vmovdqu	32(%r14),%ymm11
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	vpxor	%ymm8,%ymm7,%ymm7
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	-96(%r13),%esi
+	vpsrld	$30,%ymm7,%ymm8
+	vpslld	$2,%ymm7,%ymm7
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	vpor	%ymm8,%ymm7,%ymm7
+	addl	-92(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	vpaddd	%ymm11,%ymm7,%ymm9
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	-88(%r13),%ecx
+	vmovdqu	%ymm9,480(%rsp)
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-84(%r13),%ebx
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	jmp	.Lalign32_2
+.align	32
+.Lalign32_2:
+	vpalignr	$8,%ymm6,%ymm7,%ymm8
+	vpxor	%ymm4,%ymm0,%ymm0
+	addl	-64(%r13),%ebp
+	xorl	%esi,%ecx
+	vpxor	%ymm1,%ymm0,%ymm0
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	vpxor	%ymm8,%ymm0,%ymm0
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	vpsrld	$30,%ymm0,%ymm8
+	vpslld	$2,%ymm0,%ymm0
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	-60(%r13),%eax
+	xorl	%edx,%ebx
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	vpor	%ymm8,%ymm0,%ymm0
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	vpaddd	%ymm11,%ymm0,%ymm9
+	addl	%r12d,%eax
+	andl	%edi,%ebp
+	addl	-56(%r13),%esi
+	xorl	%ecx,%ebp
+	vmovdqu	%ymm9,512(%rsp)
+	movl	%ebx,%edi
+	xorl	%ecx,%edi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	andl	%edi,%eax
+	addl	-52(%r13),%edx
+	xorl	%ebx,%eax
+	movl	%ebp,%edi
+	xorl	%ebx,%edi
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	andl	%edi,%esi
+	addl	-32(%r13),%ecx
+	xorl	%ebp,%esi
+	movl	%eax,%edi
+	xorl	%ebp,%edi
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	andl	%edi,%edx
+	vpalignr	$8,%ymm7,%ymm0,%ymm8
+	vpxor	%ymm5,%ymm1,%ymm1
+	addl	-28(%r13),%ebx
+	xorl	%eax,%edx
+	vpxor	%ymm2,%ymm1,%ymm1
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	vpxor	%ymm8,%ymm1,%ymm1
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	vpsrld	$30,%ymm1,%ymm8
+	vpslld	$2,%ymm1,%ymm1
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	addl	-24(%r13),%ebp
+	xorl	%esi,%ecx
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	vpor	%ymm8,%ymm1,%ymm1
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	vpaddd	%ymm11,%ymm1,%ymm9
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	-20(%r13),%eax
+	xorl	%edx,%ebx
+	vmovdqu	%ymm9,544(%rsp)
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	andl	%edi,%ebp
+	addl	0(%r13),%esi
+	xorl	%ecx,%ebp
+	movl	%ebx,%edi
+	xorl	%ecx,%edi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	andl	%edi,%eax
+	addl	4(%r13),%edx
+	xorl	%ebx,%eax
+	movl	%ebp,%edi
+	xorl	%ebx,%edi
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	andl	%edi,%esi
+	vpalignr	$8,%ymm0,%ymm1,%ymm8
+	vpxor	%ymm6,%ymm2,%ymm2
+	addl	8(%r13),%ecx
+	xorl	%ebp,%esi
+	vpxor	%ymm3,%ymm2,%ymm2
+	movl	%eax,%edi
+	xorl	%ebp,%edi
+	leal	(%rcx,%rsi,1),%ecx
+	vpxor	%ymm8,%ymm2,%ymm2
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	vpsrld	$30,%ymm2,%ymm8
+	vpslld	$2,%ymm2,%ymm2
+	addl	%r12d,%ecx
+	andl	%edi,%edx
+	addl	12(%r13),%ebx
+	xorl	%eax,%edx
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	vpor	%ymm8,%ymm2,%ymm2
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	vpaddd	%ymm11,%ymm2,%ymm9
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	addl	32(%r13),%ebp
+	xorl	%esi,%ecx
+	vmovdqu	%ymm9,576(%rsp)
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	36(%r13),%eax
+	xorl	%edx,%ebx
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	andl	%edi,%ebp
+	addl	40(%r13),%esi
+	xorl	%ecx,%ebp
+	movl	%ebx,%edi
+	xorl	%ecx,%edi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	andl	%edi,%eax
+	vpalignr	$8,%ymm1,%ymm2,%ymm8
+	vpxor	%ymm7,%ymm3,%ymm3
+	addl	44(%r13),%edx
+	xorl	%ebx,%eax
+	vpxor	%ymm4,%ymm3,%ymm3
+	movl	%ebp,%edi
+	xorl	%ebx,%edi
+	leal	(%rdx,%rax,1),%edx
+	vpxor	%ymm8,%ymm3,%ymm3
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	vpsrld	$30,%ymm3,%ymm8
+	vpslld	$2,%ymm3,%ymm3
+	addl	%r12d,%edx
+	andl	%edi,%esi
+	addl	64(%r13),%ecx
+	xorl	%ebp,%esi
+	movl	%eax,%edi
+	xorl	%ebp,%edi
+	vpor	%ymm8,%ymm3,%ymm3
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	vpaddd	%ymm11,%ymm3,%ymm9
+	addl	%r12d,%ecx
+	andl	%edi,%edx
+	addl	68(%r13),%ebx
+	xorl	%eax,%edx
+	vmovdqu	%ymm9,608(%rsp)
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	addl	72(%r13),%ebp
+	xorl	%esi,%ecx
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	76(%r13),%eax
+	xorl	%edx,%ebx
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	96(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	100(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	104(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	108(%r13),%ebx
+	leaq	256(%r13),%r13
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	-128(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	-124(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	-120(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	-116(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	-96(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-92(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	-88(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	-84(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	-64(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	-60(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	-56(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-52(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	-32(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	-28(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	-24(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	-20(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	addl	%r12d,%edx
+	leaq	128(%r9),%r13
+	leaq	128(%r9),%rdi
+	cmpq	%r10,%r13
+	cmovaeq	%r9,%r13
+
+
+	addl	0(%r8),%edx
+	addl	4(%r8),%esi
+	addl	8(%r8),%ebp
+	movl	%edx,0(%r8)
+	addl	12(%r8),%ebx
+	movl	%esi,4(%r8)
+	movl	%edx,%eax
+	addl	16(%r8),%ecx
+	movl	%ebp,%r12d
+	movl	%ebp,8(%r8)
+	movl	%ebx,%edx
+
+	movl	%ebx,12(%r8)
+	movl	%esi,%ebp
+	movl	%ecx,16(%r8)
+
+	movl	%ecx,%esi
+	movl	%r12d,%ecx
+
+
+	cmpq	%r10,%r9
+	je	.Ldone_avx2
+	vmovdqu	64(%r14),%ymm6
+	cmpq	%r10,%rdi
+	ja	.Last_avx2
+
+	vmovdqu	-64(%rdi),%xmm0
+	vmovdqu	-48(%rdi),%xmm1
+	vmovdqu	-32(%rdi),%xmm2
+	vmovdqu	-16(%rdi),%xmm3
+	vinserti128	$1,0(%r13),%ymm0,%ymm0
+	vinserti128	$1,16(%r13),%ymm1,%ymm1
+	vinserti128	$1,32(%r13),%ymm2,%ymm2
+	vinserti128	$1,48(%r13),%ymm3,%ymm3
+	jmp	.Last_avx2
+
+.align	32
+.Last_avx2:
+	leaq	128+16(%rsp),%r13
+	rorxl	$2,%ebp,%ebx
+	andnl	%edx,%ebp,%edi
+	andl	%ecx,%ebp
+	xorl	%edi,%ebp
+	subq	$-128,%r9
+	addl	-128(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	-124(%r13),%edx
+	andnl	%ebx,%esi,%edi
+	addl	%eax,%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	andl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%edi,%esi
+	addl	-120(%r13),%ecx
+	andnl	%ebp,%edx,%edi
+	addl	%esi,%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	andl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%edi,%edx
+	addl	-116(%r13),%ebx
+	andnl	%eax,%ecx,%edi
+	addl	%edx,%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	andl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%edi,%ecx
+	addl	-96(%r13),%ebp
+	andnl	%esi,%ebx,%edi
+	addl	%ecx,%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	andl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%edi,%ebx
+	addl	-92(%r13),%eax
+	andnl	%edx,%ebp,%edi
+	addl	%ebx,%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	andl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edi,%ebp
+	addl	-88(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	-84(%r13),%edx
+	andnl	%ebx,%esi,%edi
+	addl	%eax,%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	andl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%edi,%esi
+	addl	-64(%r13),%ecx
+	andnl	%ebp,%edx,%edi
+	addl	%esi,%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	andl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%edi,%edx
+	addl	-60(%r13),%ebx
+	andnl	%eax,%ecx,%edi
+	addl	%edx,%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	andl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%edi,%ecx
+	addl	-56(%r13),%ebp
+	andnl	%esi,%ebx,%edi
+	addl	%ecx,%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	andl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%edi,%ebx
+	addl	-52(%r13),%eax
+	andnl	%edx,%ebp,%edi
+	addl	%ebx,%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	andl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edi,%ebp
+	addl	-32(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	-28(%r13),%edx
+	andnl	%ebx,%esi,%edi
+	addl	%eax,%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	andl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%edi,%esi
+	addl	-24(%r13),%ecx
+	andnl	%ebp,%edx,%edi
+	addl	%esi,%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	andl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%edi,%edx
+	addl	-20(%r13),%ebx
+	andnl	%eax,%ecx,%edi
+	addl	%edx,%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	andl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%edi,%ecx
+	addl	0(%r13),%ebp
+	andnl	%esi,%ebx,%edi
+	addl	%ecx,%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	andl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%edi,%ebx
+	addl	4(%r13),%eax
+	andnl	%edx,%ebp,%edi
+	addl	%ebx,%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	andl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edi,%ebp
+	addl	8(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	12(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	32(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	36(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	40(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	44(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	64(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	vmovdqu	-64(%r14),%ymm11
+	vpshufb	%ymm6,%ymm0,%ymm0
+	addl	68(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	72(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	76(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	96(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	100(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	vpshufb	%ymm6,%ymm1,%ymm1
+	vpaddd	%ymm11,%ymm0,%ymm8
+	addl	104(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	108(%r13),%edx
+	leaq	256(%r13),%r13
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	-128(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-124(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	-120(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	vmovdqu	%ymm8,0(%rsp)
+	vpshufb	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm11,%ymm1,%ymm9
+	addl	-116(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	-96(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	-92(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	-88(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-84(%r13),%ebx
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	vmovdqu	%ymm9,32(%rsp)
+	vpshufb	%ymm6,%ymm3,%ymm3
+	vpaddd	%ymm11,%ymm2,%ymm6
+	addl	-64(%r13),%ebp
+	xorl	%esi,%ecx
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	-60(%r13),%eax
+	xorl	%edx,%ebx
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	andl	%edi,%ebp
+	addl	-56(%r13),%esi
+	xorl	%ecx,%ebp
+	movl	%ebx,%edi
+	xorl	%ecx,%edi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	andl	%edi,%eax
+	addl	-52(%r13),%edx
+	xorl	%ebx,%eax
+	movl	%ebp,%edi
+	xorl	%ebx,%edi
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	andl	%edi,%esi
+	addl	-32(%r13),%ecx
+	xorl	%ebp,%esi
+	movl	%eax,%edi
+	xorl	%ebp,%edi
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	andl	%edi,%edx
+	jmp	.Lalign32_3
+.align	32
+.Lalign32_3:
+	vmovdqu	%ymm6,64(%rsp)
+	vpaddd	%ymm11,%ymm3,%ymm7
+	addl	-28(%r13),%ebx
+	xorl	%eax,%edx
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	addl	-24(%r13),%ebp
+	xorl	%esi,%ecx
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	-20(%r13),%eax
+	xorl	%edx,%ebx
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	andl	%edi,%ebp
+	addl	0(%r13),%esi
+	xorl	%ecx,%ebp
+	movl	%ebx,%edi
+	xorl	%ecx,%edi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	andl	%edi,%eax
+	addl	4(%r13),%edx
+	xorl	%ebx,%eax
+	movl	%ebp,%edi
+	xorl	%ebx,%edi
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	andl	%edi,%esi
+	vmovdqu	%ymm7,96(%rsp)
+	addl	8(%r13),%ecx
+	xorl	%ebp,%esi
+	movl	%eax,%edi
+	xorl	%ebp,%edi
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	andl	%edi,%edx
+	addl	12(%r13),%ebx
+	xorl	%eax,%edx
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	addl	32(%r13),%ebp
+	xorl	%esi,%ecx
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	36(%r13),%eax
+	xorl	%edx,%ebx
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	andl	%edi,%ebp
+	addl	40(%r13),%esi
+	xorl	%ecx,%ebp
+	movl	%ebx,%edi
+	xorl	%ecx,%edi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	andl	%edi,%eax
+	vpalignr	$8,%ymm0,%ymm1,%ymm4
+	addl	44(%r13),%edx
+	xorl	%ebx,%eax
+	movl	%ebp,%edi
+	xorl	%ebx,%edi
+	vpsrldq	$4,%ymm3,%ymm8
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	vpxor	%ymm0,%ymm4,%ymm4
+	vpxor	%ymm2,%ymm8,%ymm8
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	vpxor	%ymm8,%ymm4,%ymm4
+	andl	%edi,%esi
+	addl	64(%r13),%ecx
+	xorl	%ebp,%esi
+	movl	%eax,%edi
+	vpsrld	$31,%ymm4,%ymm8
+	xorl	%ebp,%edi
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	vpslldq	$12,%ymm4,%ymm10
+	vpaddd	%ymm4,%ymm4,%ymm4
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm4,%ymm4
+	addl	%r12d,%ecx
+	andl	%edi,%edx
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm4,%ymm4
+	addl	68(%r13),%ebx
+	xorl	%eax,%edx
+	vpxor	%ymm10,%ymm4,%ymm4
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	vpaddd	%ymm11,%ymm4,%ymm9
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	vmovdqu	%ymm9,128(%rsp)
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	addl	72(%r13),%ebp
+	xorl	%esi,%ecx
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	76(%r13),%eax
+	xorl	%edx,%ebx
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	vpalignr	$8,%ymm1,%ymm2,%ymm5
+	addl	96(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	vpsrldq	$4,%ymm4,%ymm8
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	vpxor	%ymm1,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm8,%ymm8
+	addl	100(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	vpxor	%ymm8,%ymm5,%ymm5
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	vpsrld	$31,%ymm5,%ymm8
+	vmovdqu	-32(%r14),%ymm11
+	xorl	%ebx,%esi
+	addl	104(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	vpslldq	$12,%ymm5,%ymm10
+	vpaddd	%ymm5,%ymm5,%ymm5
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm5,%ymm5
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm5,%ymm5
+	xorl	%ebp,%edx
+	addl	108(%r13),%ebx
+	leaq	256(%r13),%r13
+	vpxor	%ymm10,%ymm5,%ymm5
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	vpaddd	%ymm11,%ymm5,%ymm9
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	vmovdqu	%ymm9,160(%rsp)
+	addl	-128(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	vpalignr	$8,%ymm2,%ymm3,%ymm6
+	addl	-124(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	vpsrldq	$4,%ymm5,%ymm8
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	vpxor	%ymm2,%ymm6,%ymm6
+	vpxor	%ymm4,%ymm8,%ymm8
+	addl	-120(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	vpxor	%ymm8,%ymm6,%ymm6
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	vpsrld	$31,%ymm6,%ymm8
+	xorl	%ecx,%eax
+	addl	-116(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	vpslldq	$12,%ymm6,%ymm10
+	vpaddd	%ymm6,%ymm6,%ymm6
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm6,%ymm6
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm6,%ymm6
+	xorl	%ebx,%esi
+	addl	-96(%r13),%ecx
+	vpxor	%ymm10,%ymm6,%ymm6
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	vpaddd	%ymm11,%ymm6,%ymm9
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	vmovdqu	%ymm9,192(%rsp)
+	addl	-92(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	vpalignr	$8,%ymm3,%ymm4,%ymm7
+	addl	-88(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	vpsrldq	$4,%ymm6,%ymm8
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	vpxor	%ymm3,%ymm7,%ymm7
+	vpxor	%ymm5,%ymm8,%ymm8
+	addl	-84(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	vpxor	%ymm8,%ymm7,%ymm7
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	vpsrld	$31,%ymm7,%ymm8
+	xorl	%edx,%ebp
+	addl	-64(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	vpslldq	$12,%ymm7,%ymm10
+	vpaddd	%ymm7,%ymm7,%ymm7
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm7,%ymm7
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm7,%ymm7
+	xorl	%ecx,%eax
+	addl	-60(%r13),%edx
+	vpxor	%ymm10,%ymm7,%ymm7
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	vpaddd	%ymm11,%ymm7,%ymm9
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	vmovdqu	%ymm9,224(%rsp)
+	addl	-56(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-52(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	-32(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	-28(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	-24(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	-20(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	addl	%r12d,%edx
+	leaq	128(%rsp),%r13
+
+
+	addl	0(%r8),%edx
+	addl	4(%r8),%esi
+	addl	8(%r8),%ebp
+	movl	%edx,0(%r8)
+	addl	12(%r8),%ebx
+	movl	%esi,4(%r8)
+	movl	%edx,%eax
+	addl	16(%r8),%ecx
+	movl	%ebp,%r12d
+	movl	%ebp,8(%r8)
+	movl	%ebx,%edx
+
+	movl	%ebx,12(%r8)
+	movl	%esi,%ebp
+	movl	%ecx,16(%r8)
+
+	movl	%ecx,%esi
+	movl	%r12d,%ecx
+
+
+	cmpq	%r10,%r9
+	jbe	.Loop_avx2
+
+.Ldone_avx2:
+	vzeroupper
+	movq	-40(%r11),%r14
+.cfi_restore	%r14
+	movq	-32(%r11),%r13
+.cfi_restore	%r13
+	movq	-24(%r11),%r12
+.cfi_restore	%r12
+	movq	-16(%r11),%rbp
+.cfi_restore	%rbp
+	movq	-8(%r11),%rbx
+.cfi_restore	%rbx
+	leaq	(%r11),%rsp
+.cfi_def_cfa_register	%rsp
+.Lepilogue_avx2:
+	ret
+.cfi_endproc	
+.size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
+.section	.rodata
+.align	64
+K_XX_XX:
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
+.byte	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	64
+.text	
+#endif
diff --git a/gen/bcm/sha1-x86_64-win.asm b/gen/bcm/sha1-x86_64-win.asm
new file mode 100644
index 0000000..92e9b9c
--- /dev/null
+++ b/gen/bcm/sha1-x86_64-win.asm
@@ -0,0 +1,5768 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section	.text code align=64
+
+
+global	sha1_block_data_order_nohw
+
+ALIGN	16
+sha1_block_data_order_nohw:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_sha1_block_data_order_nohw:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+_CET_ENDBR
+	mov	rax,rsp
+
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	mov	r8,rdi
+	sub	rsp,72
+	mov	r9,rsi
+	and	rsp,-64
+	mov	r10,rdx
+	mov	QWORD[64+rsp],rax
+
+$L$prologue:
+
+	mov	esi,DWORD[r8]
+	mov	edi,DWORD[4+r8]
+	mov	r11d,DWORD[8+r8]
+	mov	r12d,DWORD[12+r8]
+	mov	r13d,DWORD[16+r8]
+	jmp	NEAR $L$loop
+
+ALIGN	16
+$L$loop:
+	mov	edx,DWORD[r9]
+	bswap	edx
+	mov	ebp,DWORD[4+r9]
+	mov	eax,r12d
+	mov	DWORD[rsp],edx
+	mov	ecx,esi
+	bswap	ebp
+	xor	eax,r11d
+	rol	ecx,5
+	and	eax,edi
+	lea	r13d,[1518500249+r13*1+rdx]
+	add	r13d,ecx
+	xor	eax,r12d
+	rol	edi,30
+	add	r13d,eax
+	mov	r14d,DWORD[8+r9]
+	mov	eax,r11d
+	mov	DWORD[4+rsp],ebp
+	mov	ecx,r13d
+	bswap	r14d
+	xor	eax,edi
+	rol	ecx,5
+	and	eax,esi
+	lea	r12d,[1518500249+r12*1+rbp]
+	add	r12d,ecx
+	xor	eax,r11d
+	rol	esi,30
+	add	r12d,eax
+	mov	edx,DWORD[12+r9]
+	mov	eax,edi
+	mov	DWORD[8+rsp],r14d
+	mov	ecx,r12d
+	bswap	edx
+	xor	eax,esi
+	rol	ecx,5
+	and	eax,r13d
+	lea	r11d,[1518500249+r11*1+r14]
+	add	r11d,ecx
+	xor	eax,edi
+	rol	r13d,30
+	add	r11d,eax
+	mov	ebp,DWORD[16+r9]
+	mov	eax,esi
+	mov	DWORD[12+rsp],edx
+	mov	ecx,r11d
+	bswap	ebp
+	xor	eax,r13d
+	rol	ecx,5
+	and	eax,r12d
+	lea	edi,[1518500249+rdi*1+rdx]
+	add	edi,ecx
+	xor	eax,esi
+	rol	r12d,30
+	add	edi,eax
+	mov	r14d,DWORD[20+r9]
+	mov	eax,r13d
+	mov	DWORD[16+rsp],ebp
+	mov	ecx,edi
+	bswap	r14d
+	xor	eax,r12d
+	rol	ecx,5
+	and	eax,r11d
+	lea	esi,[1518500249+rsi*1+rbp]
+	add	esi,ecx
+	xor	eax,r13d
+	rol	r11d,30
+	add	esi,eax
+	mov	edx,DWORD[24+r9]
+	mov	eax,r12d
+	mov	DWORD[20+rsp],r14d
+	mov	ecx,esi
+	bswap	edx
+	xor	eax,r11d
+	rol	ecx,5
+	and	eax,edi
+	lea	r13d,[1518500249+r13*1+r14]
+	add	r13d,ecx
+	xor	eax,r12d
+	rol	edi,30
+	add	r13d,eax
+	mov	ebp,DWORD[28+r9]
+	mov	eax,r11d
+	mov	DWORD[24+rsp],edx
+	mov	ecx,r13d
+	bswap	ebp
+	xor	eax,edi
+	rol	ecx,5
+	and	eax,esi
+	lea	r12d,[1518500249+r12*1+rdx]
+	add	r12d,ecx
+	xor	eax,r11d
+	rol	esi,30
+	add	r12d,eax
+	mov	r14d,DWORD[32+r9]
+	mov	eax,edi
+	mov	DWORD[28+rsp],ebp
+	mov	ecx,r12d
+	bswap	r14d
+	xor	eax,esi
+	rol	ecx,5
+	and	eax,r13d
+	lea	r11d,[1518500249+r11*1+rbp]
+	add	r11d,ecx
+	xor	eax,edi
+	rol	r13d,30
+	add	r11d,eax
+	mov	edx,DWORD[36+r9]
+	mov	eax,esi
+	mov	DWORD[32+rsp],r14d
+	mov	ecx,r11d
+	bswap	edx
+	xor	eax,r13d
+	rol	ecx,5
+	and	eax,r12d
+	lea	edi,[1518500249+rdi*1+r14]
+	add	edi,ecx
+	xor	eax,esi
+	rol	r12d,30
+	add	edi,eax
+	mov	ebp,DWORD[40+r9]
+	mov	eax,r13d
+	mov	DWORD[36+rsp],edx
+	mov	ecx,edi
+	bswap	ebp
+	xor	eax,r12d
+	rol	ecx,5
+	and	eax,r11d
+	lea	esi,[1518500249+rsi*1+rdx]
+	add	esi,ecx
+	xor	eax,r13d
+	rol	r11d,30
+	add	esi,eax
+	mov	r14d,DWORD[44+r9]
+	mov	eax,r12d
+	mov	DWORD[40+rsp],ebp
+	mov	ecx,esi
+	bswap	r14d
+	xor	eax,r11d
+	rol	ecx,5
+	and	eax,edi
+	lea	r13d,[1518500249+r13*1+rbp]
+	add	r13d,ecx
+	xor	eax,r12d
+	rol	edi,30
+	add	r13d,eax
+	mov	edx,DWORD[48+r9]
+	mov	eax,r11d
+	mov	DWORD[44+rsp],r14d
+	mov	ecx,r13d
+	bswap	edx
+	xor	eax,edi
+	rol	ecx,5
+	and	eax,esi
+	lea	r12d,[1518500249+r12*1+r14]
+	add	r12d,ecx
+	xor	eax,r11d
+	rol	esi,30
+	add	r12d,eax
+	mov	ebp,DWORD[52+r9]
+	mov	eax,edi
+	mov	DWORD[48+rsp],edx
+	mov	ecx,r12d
+	bswap	ebp
+	xor	eax,esi
+	rol	ecx,5
+	and	eax,r13d
+	lea	r11d,[1518500249+r11*1+rdx]
+	add	r11d,ecx
+	xor	eax,edi
+	rol	r13d,30
+	add	r11d,eax
+	mov	r14d,DWORD[56+r9]
+	mov	eax,esi
+	mov	DWORD[52+rsp],ebp
+	mov	ecx,r11d
+	bswap	r14d
+	xor	eax,r13d
+	rol	ecx,5
+	and	eax,r12d
+	lea	edi,[1518500249+rdi*1+rbp]
+	add	edi,ecx
+	xor	eax,esi
+	rol	r12d,30
+	add	edi,eax
+	mov	edx,DWORD[60+r9]
+	mov	eax,r13d
+	mov	DWORD[56+rsp],r14d
+	mov	ecx,edi
+	bswap	edx
+	xor	eax,r12d
+	rol	ecx,5
+	and	eax,r11d
+	lea	esi,[1518500249+rsi*1+r14]
+	add	esi,ecx
+	xor	eax,r13d
+	rol	r11d,30
+	add	esi,eax
+	xor	ebp,DWORD[rsp]
+	mov	eax,r12d
+	mov	DWORD[60+rsp],edx
+	mov	ecx,esi
+	xor	ebp,DWORD[8+rsp]
+	xor	eax,r11d
+	rol	ecx,5
+	xor	ebp,DWORD[32+rsp]
+	and	eax,edi
+	lea	r13d,[1518500249+r13*1+rdx]
+	rol	edi,30
+	xor	eax,r12d
+	add	r13d,ecx
+	rol	ebp,1
+	add	r13d,eax
+	xor	r14d,DWORD[4+rsp]
+	mov	eax,r11d
+	mov	DWORD[rsp],ebp
+	mov	ecx,r13d
+	xor	r14d,DWORD[12+rsp]
+	xor	eax,edi
+	rol	ecx,5
+	xor	r14d,DWORD[36+rsp]
+	and	eax,esi
+	lea	r12d,[1518500249+r12*1+rbp]
+	rol	esi,30
+	xor	eax,r11d
+	add	r12d,ecx
+	rol	r14d,1
+	add	r12d,eax
+	xor	edx,DWORD[8+rsp]
+	mov	eax,edi
+	mov	DWORD[4+rsp],r14d
+	mov	ecx,r12d
+	xor	edx,DWORD[16+rsp]
+	xor	eax,esi
+	rol	ecx,5
+	xor	edx,DWORD[40+rsp]
+	and	eax,r13d
+	lea	r11d,[1518500249+r11*1+r14]
+	rol	r13d,30
+	xor	eax,edi
+	add	r11d,ecx
+	rol	edx,1
+	add	r11d,eax
+	xor	ebp,DWORD[12+rsp]
+	mov	eax,esi
+	mov	DWORD[8+rsp],edx
+	mov	ecx,r11d
+	xor	ebp,DWORD[20+rsp]
+	xor	eax,r13d
+	rol	ecx,5
+	xor	ebp,DWORD[44+rsp]
+	and	eax,r12d
+	lea	edi,[1518500249+rdi*1+rdx]
+	rol	r12d,30
+	xor	eax,esi
+	add	edi,ecx
+	rol	ebp,1
+	add	edi,eax
+	xor	r14d,DWORD[16+rsp]
+	mov	eax,r13d
+	mov	DWORD[12+rsp],ebp
+	mov	ecx,edi
+	xor	r14d,DWORD[24+rsp]
+	xor	eax,r12d
+	rol	ecx,5
+	xor	r14d,DWORD[48+rsp]
+	and	eax,r11d
+	lea	esi,[1518500249+rsi*1+rbp]
+	rol	r11d,30
+	xor	eax,r13d
+	add	esi,ecx
+	rol	r14d,1
+	add	esi,eax
+	xor	edx,DWORD[20+rsp]
+	mov	eax,edi
+	mov	DWORD[16+rsp],r14d
+	mov	ecx,esi
+	xor	edx,DWORD[28+rsp]
+	xor	eax,r12d
+	rol	ecx,5
+	xor	edx,DWORD[52+rsp]
+	lea	r13d,[1859775393+r13*1+r14]
+	xor	eax,r11d
+	add	r13d,ecx
+	rol	edi,30
+	add	r13d,eax
+	rol	edx,1
+	xor	ebp,DWORD[24+rsp]
+	mov	eax,esi
+	mov	DWORD[20+rsp],edx
+	mov	ecx,r13d
+	xor	ebp,DWORD[32+rsp]
+	xor	eax,r11d
+	rol	ecx,5
+	xor	ebp,DWORD[56+rsp]
+	lea	r12d,[1859775393+r12*1+rdx]
+	xor	eax,edi
+	add	r12d,ecx
+	rol	esi,30
+	add	r12d,eax
+	rol	ebp,1
+	xor	r14d,DWORD[28+rsp]
+	mov	eax,r13d
+	mov	DWORD[24+rsp],ebp
+	mov	ecx,r12d
+	xor	r14d,DWORD[36+rsp]
+	xor	eax,edi
+	rol	ecx,5
+	xor	r14d,DWORD[60+rsp]
+	lea	r11d,[1859775393+r11*1+rbp]
+	xor	eax,esi
+	add	r11d,ecx
+	rol	r13d,30
+	add	r11d,eax
+	rol	r14d,1
+	xor	edx,DWORD[32+rsp]
+	mov	eax,r12d
+	mov	DWORD[28+rsp],r14d
+	mov	ecx,r11d
+	xor	edx,DWORD[40+rsp]
+	xor	eax,esi
+	rol	ecx,5
+	xor	edx,DWORD[rsp]
+	lea	edi,[1859775393+rdi*1+r14]
+	xor	eax,r13d
+	add	edi,ecx
+	rol	r12d,30
+	add	edi,eax
+	rol	edx,1
+	xor	ebp,DWORD[36+rsp]
+	mov	eax,r11d
+	mov	DWORD[32+rsp],edx
+	mov	ecx,edi
+	xor	ebp,DWORD[44+rsp]
+	xor	eax,r13d
+	rol	ecx,5
+	xor	ebp,DWORD[4+rsp]
+	lea	esi,[1859775393+rsi*1+rdx]
+	xor	eax,r12d
+	add	esi,ecx
+	rol	r11d,30
+	add	esi,eax
+	rol	ebp,1
+	xor	r14d,DWORD[40+rsp]
+	mov	eax,edi
+	mov	DWORD[36+rsp],ebp
+	mov	ecx,esi
+	xor	r14d,DWORD[48+rsp]
+	xor	eax,r12d
+	rol	ecx,5
+	xor	r14d,DWORD[8+rsp]
+	lea	r13d,[1859775393+r13*1+rbp]
+	xor	eax,r11d
+	add	r13d,ecx
+	rol	edi,30
+	add	r13d,eax
+	rol	r14d,1
+	xor	edx,DWORD[44+rsp]
+	mov	eax,esi
+	mov	DWORD[40+rsp],r14d
+	mov	ecx,r13d
+	xor	edx,DWORD[52+rsp]
+	xor	eax,r11d
+	rol	ecx,5
+	xor	edx,DWORD[12+rsp]
+	lea	r12d,[1859775393+r12*1+r14]
+	xor	eax,edi
+	add	r12d,ecx
+	rol	esi,30
+	add	r12d,eax
+	rol	edx,1
+	xor	ebp,DWORD[48+rsp]
+	mov	eax,r13d
+	mov	DWORD[44+rsp],edx
+	mov	ecx,r12d
+	xor	ebp,DWORD[56+rsp]
+	xor	eax,edi
+	rol	ecx,5
+	xor	ebp,DWORD[16+rsp]
+	lea	r11d,[1859775393+r11*1+rdx]
+	xor	eax,esi
+	add	r11d,ecx
+	rol	r13d,30
+	add	r11d,eax
+	rol	ebp,1
+	xor	r14d,DWORD[52+rsp]
+	mov	eax,r12d
+	mov	DWORD[48+rsp],ebp
+	mov	ecx,r11d
+	xor	r14d,DWORD[60+rsp]
+	xor	eax,esi
+	rol	ecx,5
+	xor	r14d,DWORD[20+rsp]
+	lea	edi,[1859775393+rdi*1+rbp]
+	xor	eax,r13d
+	add	edi,ecx
+	rol	r12d,30
+	add	edi,eax
+	rol	r14d,1
+	xor	edx,DWORD[56+rsp]
+	mov	eax,r11d
+	mov	DWORD[52+rsp],r14d
+	mov	ecx,edi
+	xor	edx,DWORD[rsp]
+	xor	eax,r13d
+	rol	ecx,5
+	xor	edx,DWORD[24+rsp]
+	lea	esi,[1859775393+rsi*1+r14]
+	xor	eax,r12d
+	add	esi,ecx
+	rol	r11d,30
+	add	esi,eax
+	rol	edx,1
+	xor	ebp,DWORD[60+rsp]
+	mov	eax,edi
+	mov	DWORD[56+rsp],edx
+	mov	ecx,esi
+	xor	ebp,DWORD[4+rsp]
+	xor	eax,r12d
+	rol	ecx,5
+	xor	ebp,DWORD[28+rsp]
+	lea	r13d,[1859775393+r13*1+rdx]
+	xor	eax,r11d
+	add	r13d,ecx
+	rol	edi,30
+	add	r13d,eax
+	rol	ebp,1
+	xor	r14d,DWORD[rsp]
+	mov	eax,esi
+	mov	DWORD[60+rsp],ebp
+	mov	ecx,r13d
+	xor	r14d,DWORD[8+rsp]
+	xor	eax,r11d
+	rol	ecx,5
+	xor	r14d,DWORD[32+rsp]
+	lea	r12d,[1859775393+r12*1+rbp]
+	xor	eax,edi
+	add	r12d,ecx
+	rol	esi,30
+	add	r12d,eax
+	rol	r14d,1
+	xor	edx,DWORD[4+rsp]
+	mov	eax,r13d
+	mov	DWORD[rsp],r14d
+	mov	ecx,r12d
+	xor	edx,DWORD[12+rsp]
+	xor	eax,edi
+	rol	ecx,5
+	xor	edx,DWORD[36+rsp]
+	lea	r11d,[1859775393+r11*1+r14]
+	xor	eax,esi
+	add	r11d,ecx
+	rol	r13d,30
+	add	r11d,eax
+	rol	edx,1
+	xor	ebp,DWORD[8+rsp]
+	mov	eax,r12d
+	mov	DWORD[4+rsp],edx
+	mov	ecx,r11d
+	xor	ebp,DWORD[16+rsp]
+	xor	eax,esi
+	rol	ecx,5
+	xor	ebp,DWORD[40+rsp]
+	lea	edi,[1859775393+rdi*1+rdx]
+	xor	eax,r13d
+	add	edi,ecx
+	rol	r12d,30
+	add	edi,eax
+	rol	ebp,1
+	xor	r14d,DWORD[12+rsp]
+	mov	eax,r11d
+	mov	DWORD[8+rsp],ebp
+	mov	ecx,edi
+	xor	r14d,DWORD[20+rsp]
+	xor	eax,r13d
+	rol	ecx,5
+	xor	r14d,DWORD[44+rsp]
+	lea	esi,[1859775393+rsi*1+rbp]
+	xor	eax,r12d
+	add	esi,ecx
+	rol	r11d,30
+	add	esi,eax
+	rol	r14d,1
+	xor	edx,DWORD[16+rsp]
+	mov	eax,edi
+	mov	DWORD[12+rsp],r14d
+	mov	ecx,esi
+	xor	edx,DWORD[24+rsp]
+	xor	eax,r12d
+	rol	ecx,5
+	xor	edx,DWORD[48+rsp]
+	lea	r13d,[1859775393+r13*1+r14]
+	xor	eax,r11d
+	add	r13d,ecx
+	rol	edi,30
+	add	r13d,eax
+	rol	edx,1
+	xor	ebp,DWORD[20+rsp]
+	mov	eax,esi
+	mov	DWORD[16+rsp],edx
+	mov	ecx,r13d
+	xor	ebp,DWORD[28+rsp]
+	xor	eax,r11d
+	rol	ecx,5
+	xor	ebp,DWORD[52+rsp]
+	lea	r12d,[1859775393+r12*1+rdx]
+	xor	eax,edi
+	add	r12d,ecx
+	rol	esi,30
+	add	r12d,eax
+	rol	ebp,1
+	xor	r14d,DWORD[24+rsp]
+	mov	eax,r13d
+	mov	DWORD[20+rsp],ebp
+	mov	ecx,r12d
+	xor	r14d,DWORD[32+rsp]
+	xor	eax,edi
+	rol	ecx,5
+	xor	r14d,DWORD[56+rsp]
+	lea	r11d,[1859775393+r11*1+rbp]
+	xor	eax,esi
+	add	r11d,ecx
+	rol	r13d,30
+	add	r11d,eax
+	rol	r14d,1
+	xor	edx,DWORD[28+rsp]
+	mov	eax,r12d
+	mov	DWORD[24+rsp],r14d
+	mov	ecx,r11d
+	xor	edx,DWORD[36+rsp]
+	xor	eax,esi
+	rol	ecx,5
+	xor	edx,DWORD[60+rsp]
+	lea	edi,[1859775393+rdi*1+r14]
+	xor	eax,r13d
+	add	edi,ecx
+	rol	r12d,30
+	add	edi,eax
+	rol	edx,1
+	xor	ebp,DWORD[32+rsp]
+	mov	eax,r11d
+	mov	DWORD[28+rsp],edx
+	mov	ecx,edi
+	xor	ebp,DWORD[40+rsp]
+	xor	eax,r13d
+	rol	ecx,5
+	xor	ebp,DWORD[rsp]
+	lea	esi,[1859775393+rsi*1+rdx]
+	xor	eax,r12d
+	add	esi,ecx
+	rol	r11d,30
+	add	esi,eax
+	rol	ebp,1
+	xor	r14d,DWORD[36+rsp]
+	mov	eax,r12d
+	mov	DWORD[32+rsp],ebp
+	mov	ebx,r12d
+	xor	r14d,DWORD[44+rsp]
+	and	eax,r11d
+	mov	ecx,esi
+	xor	r14d,DWORD[4+rsp]
+	lea	r13d,[((-1894007588))+r13*1+rbp]
+	xor	ebx,r11d
+	rol	ecx,5
+	add	r13d,eax
+	rol	r14d,1
+	and	ebx,edi
+	add	r13d,ecx
+	rol	edi,30
+	add	r13d,ebx
+	xor	edx,DWORD[40+rsp]
+	mov	eax,r11d
+	mov	DWORD[36+rsp],r14d
+	mov	ebx,r11d
+	xor	edx,DWORD[48+rsp]
+	and	eax,edi
+	mov	ecx,r13d
+	xor	edx,DWORD[8+rsp]
+	lea	r12d,[((-1894007588))+r12*1+r14]
+	xor	ebx,edi
+	rol	ecx,5
+	add	r12d,eax
+	rol	edx,1
+	and	ebx,esi
+	add	r12d,ecx
+	rol	esi,30
+	add	r12d,ebx
+	xor	ebp,DWORD[44+rsp]
+	mov	eax,edi
+	mov	DWORD[40+rsp],edx
+	mov	ebx,edi
+	xor	ebp,DWORD[52+rsp]
+	and	eax,esi
+	mov	ecx,r12d
+	xor	ebp,DWORD[12+rsp]
+	lea	r11d,[((-1894007588))+r11*1+rdx]
+	xor	ebx,esi
+	rol	ecx,5
+	add	r11d,eax
+	rol	ebp,1
+	and	ebx,r13d
+	add	r11d,ecx
+	rol	r13d,30
+	add	r11d,ebx
+	xor	r14d,DWORD[48+rsp]
+	mov	eax,esi
+	mov	DWORD[44+rsp],ebp
+	mov	ebx,esi
+	xor	r14d,DWORD[56+rsp]
+	and	eax,r13d
+	mov	ecx,r11d
+	xor	r14d,DWORD[16+rsp]
+	lea	edi,[((-1894007588))+rdi*1+rbp]
+	xor	ebx,r13d
+	rol	ecx,5
+	add	edi,eax
+	rol	r14d,1
+	and	ebx,r12d
+	add	edi,ecx
+	rol	r12d,30
+	add	edi,ebx
+	xor	edx,DWORD[52+rsp]
+	mov	eax,r13d
+	mov	DWORD[48+rsp],r14d
+	mov	ebx,r13d
+	xor	edx,DWORD[60+rsp]
+	and	eax,r12d
+	mov	ecx,edi
+	xor	edx,DWORD[20+rsp]
+	lea	esi,[((-1894007588))+rsi*1+r14]
+	xor	ebx,r12d
+	rol	ecx,5
+	add	esi,eax
+	rol	edx,1
+	and	ebx,r11d
+	add	esi,ecx
+	rol	r11d,30
+	add	esi,ebx
+	xor	ebp,DWORD[56+rsp]
+	mov	eax,r12d
+	mov	DWORD[52+rsp],edx
+	mov	ebx,r12d
+	xor	ebp,DWORD[rsp]
+	and	eax,r11d
+	mov	ecx,esi
+	xor	ebp,DWORD[24+rsp]
+	lea	r13d,[((-1894007588))+r13*1+rdx]
+	xor	ebx,r11d
+	rol	ecx,5
+	add	r13d,eax
+	rol	ebp,1
+	and	ebx,edi
+	add	r13d,ecx
+	rol	edi,30
+	add	r13d,ebx
+	xor	r14d,DWORD[60+rsp]
+	mov	eax,r11d
+	mov	DWORD[56+rsp],ebp
+	mov	ebx,r11d
+	xor	r14d,DWORD[4+rsp]
+	and	eax,edi
+	mov	ecx,r13d
+	xor	r14d,DWORD[28+rsp]
+	lea	r12d,[((-1894007588))+r12*1+rbp]
+	xor	ebx,edi
+	rol	ecx,5
+	add	r12d,eax
+	rol	r14d,1
+	and	ebx,esi
+	add	r12d,ecx
+	rol	esi,30
+	add	r12d,ebx
+	xor	edx,DWORD[rsp]
+	mov	eax,edi
+	mov	DWORD[60+rsp],r14d
+	mov	ebx,edi
+	xor	edx,DWORD[8+rsp]
+	and	eax,esi
+	mov	ecx,r12d
+	xor	edx,DWORD[32+rsp]
+	lea	r11d,[((-1894007588))+r11*1+r14]
+	xor	ebx,esi
+	rol	ecx,5
+	add	r11d,eax
+	rol	edx,1
+	and	ebx,r13d
+	add	r11d,ecx
+	rol	r13d,30
+	add	r11d,ebx
+	xor	ebp,DWORD[4+rsp]
+	mov	eax,esi
+	mov	DWORD[rsp],edx
+	mov	ebx,esi
+	xor	ebp,DWORD[12+rsp]
+	and	eax,r13d
+	mov	ecx,r11d
+	xor	ebp,DWORD[36+rsp]
+	lea	edi,[((-1894007588))+rdi*1+rdx]
+	xor	ebx,r13d
+	rol	ecx,5
+	add	edi,eax
+	rol	ebp,1
+	and	ebx,r12d
+	add	edi,ecx
+	rol	r12d,30
+	add	edi,ebx
+	xor	r14d,DWORD[8+rsp]
+	mov	eax,r13d
+	mov	DWORD[4+rsp],ebp
+	mov	ebx,r13d
+	xor	r14d,DWORD[16+rsp]
+	and	eax,r12d
+	mov	ecx,edi
+	xor	r14d,DWORD[40+rsp]
+	lea	esi,[((-1894007588))+rsi*1+rbp]
+	xor	ebx,r12d
+	rol	ecx,5
+	add	esi,eax
+	rol	r14d,1
+	and	ebx,r11d
+	add	esi,ecx
+	rol	r11d,30
+	add	esi,ebx
+	xor	edx,DWORD[12+rsp]
+	mov	eax,r12d
+	mov	DWORD[8+rsp],r14d
+	mov	ebx,r12d
+	xor	edx,DWORD[20+rsp]
+	and	eax,r11d
+	mov	ecx,esi
+	xor	edx,DWORD[44+rsp]
+	lea	r13d,[((-1894007588))+r13*1+r14]
+	xor	ebx,r11d
+	rol	ecx,5
+	add	r13d,eax
+	rol	edx,1
+	and	ebx,edi
+	add	r13d,ecx
+	rol	edi,30
+	add	r13d,ebx
+	xor	ebp,DWORD[16+rsp]
+	mov	eax,r11d
+	mov	DWORD[12+rsp],edx
+	mov	ebx,r11d
+	xor	ebp,DWORD[24+rsp]
+	and	eax,edi
+	mov	ecx,r13d
+	xor	ebp,DWORD[48+rsp]
+	lea	r12d,[((-1894007588))+r12*1+rdx]
+	xor	ebx,edi
+	rol	ecx,5
+	add	r12d,eax
+	rol	ebp,1
+	and	ebx,esi
+	add	r12d,ecx
+	rol	esi,30
+	add	r12d,ebx
+	xor	r14d,DWORD[20+rsp]
+	mov	eax,edi
+	mov	DWORD[16+rsp],ebp
+	mov	ebx,edi
+	xor	r14d,DWORD[28+rsp]
+	and	eax,esi
+	mov	ecx,r12d
+	xor	r14d,DWORD[52+rsp]
+	lea	r11d,[((-1894007588))+r11*1+rbp]
+	xor	ebx,esi
+	rol	ecx,5
+	add	r11d,eax
+	rol	r14d,1
+	and	ebx,r13d
+	add	r11d,ecx
+	rol	r13d,30
+	add	r11d,ebx
+	xor	edx,DWORD[24+rsp]
+	mov	eax,esi
+	mov	DWORD[20+rsp],r14d
+	mov	ebx,esi
+	xor	edx,DWORD[32+rsp]
+	and	eax,r13d
+	mov	ecx,r11d
+	xor	edx,DWORD[56+rsp]
+	lea	edi,[((-1894007588))+rdi*1+r14]
+	xor	ebx,r13d
+	rol	ecx,5
+	add	edi,eax
+	rol	edx,1
+	and	ebx,r12d
+	add	edi,ecx
+	rol	r12d,30
+	add	edi,ebx
+	xor	ebp,DWORD[28+rsp]
+	mov	eax,r13d
+	mov	DWORD[24+rsp],edx
+	mov	ebx,r13d
+	xor	ebp,DWORD[36+rsp]
+	and	eax,r12d
+	mov	ecx,edi
+	xor	ebp,DWORD[60+rsp]
+	lea	esi,[((-1894007588))+rsi*1+rdx]
+	xor	ebx,r12d
+	rol	ecx,5
+	add	esi,eax
+	rol	ebp,1
+	and	ebx,r11d
+	add	esi,ecx
+	rol	r11d,30
+	add	esi,ebx
+	xor	r14d,DWORD[32+rsp]
+	mov	eax,r12d
+	mov	DWORD[28+rsp],ebp
+	mov	ebx,r12d
+	xor	r14d,DWORD[40+rsp]
+	and	eax,r11d
+	mov	ecx,esi
+	xor	r14d,DWORD[rsp]
+	lea	r13d,[((-1894007588))+r13*1+rbp]
+	xor	ebx,r11d
+	rol	ecx,5
+	add	r13d,eax
+	rol	r14d,1
+	and	ebx,edi
+	add	r13d,ecx
+	rol	edi,30
+	add	r13d,ebx
+	xor	edx,DWORD[36+rsp]
+	mov	eax,r11d
+	mov	DWORD[32+rsp],r14d
+	mov	ebx,r11d
+	xor	edx,DWORD[44+rsp]
+	and	eax,edi
+	mov	ecx,r13d
+	xor	edx,DWORD[4+rsp]
+	lea	r12d,[((-1894007588))+r12*1+r14]
+	xor	ebx,edi
+	rol	ecx,5
+	add	r12d,eax
+	rol	edx,1
+	and	ebx,esi
+	add	r12d,ecx
+	rol	esi,30
+	add	r12d,ebx
+	xor	ebp,DWORD[40+rsp]
+	mov	eax,edi
+	mov	DWORD[36+rsp],edx
+	mov	ebx,edi
+	xor	ebp,DWORD[48+rsp]
+	and	eax,esi
+	mov	ecx,r12d
+	xor	ebp,DWORD[8+rsp]
+	lea	r11d,[((-1894007588))+r11*1+rdx]
+	xor	ebx,esi
+	rol	ecx,5
+	add	r11d,eax
+	rol	ebp,1
+	and	ebx,r13d
+	add	r11d,ecx
+	rol	r13d,30
+	add	r11d,ebx
+	xor	r14d,DWORD[44+rsp]
+	mov	eax,esi
+	mov	DWORD[40+rsp],ebp
+	mov	ebx,esi
+	xor	r14d,DWORD[52+rsp]
+	and	eax,r13d
+	mov	ecx,r11d
+	xor	r14d,DWORD[12+rsp]
+	lea	edi,[((-1894007588))+rdi*1+rbp]
+	xor	ebx,r13d
+	rol	ecx,5
+	add	edi,eax
+	rol	r14d,1
+	and	ebx,r12d
+	add	edi,ecx
+	rol	r12d,30
+	add	edi,ebx
+	xor	edx,DWORD[48+rsp]
+	mov	eax,r13d
+	mov	DWORD[44+rsp],r14d
+	mov	ebx,r13d
+	xor	edx,DWORD[56+rsp]
+	and	eax,r12d
+	mov	ecx,edi
+	xor	edx,DWORD[16+rsp]
+	lea	esi,[((-1894007588))+rsi*1+r14]
+	xor	ebx,r12d
+	rol	ecx,5
+	add	esi,eax
+	rol	edx,1
+	and	ebx,r11d
+	add	esi,ecx
+	rol	r11d,30
+	add	esi,ebx
+	xor	ebp,DWORD[52+rsp]
+	mov	eax,edi
+	mov	DWORD[48+rsp],edx
+	mov	ecx,esi
+	xor	ebp,DWORD[60+rsp]
+	xor	eax,r12d
+	rol	ecx,5
+	xor	ebp,DWORD[20+rsp]
+	lea	r13d,[((-899497514))+r13*1+rdx]
+	xor	eax,r11d
+	add	r13d,ecx
+	rol	edi,30
+	add	r13d,eax
+	rol	ebp,1
+	xor	r14d,DWORD[56+rsp]
+	mov	eax,esi
+	mov	DWORD[52+rsp],ebp
+	mov	ecx,r13d
+	xor	r14d,DWORD[rsp]
+	xor	eax,r11d
+	rol	ecx,5
+	xor	r14d,DWORD[24+rsp]
+	lea	r12d,[((-899497514))+r12*1+rbp]
+	xor	eax,edi
+	add	r12d,ecx
+	rol	esi,30
+	add	r12d,eax
+	rol	r14d,1
+	xor	edx,DWORD[60+rsp]
+	mov	eax,r13d
+	mov	DWORD[56+rsp],r14d
+	mov	ecx,r12d
+	xor	edx,DWORD[4+rsp]
+	xor	eax,edi
+	rol	ecx,5
+	xor	edx,DWORD[28+rsp]
+	lea	r11d,[((-899497514))+r11*1+r14]
+	xor	eax,esi
+	add	r11d,ecx
+	rol	r13d,30
+	add	r11d,eax
+	rol	edx,1
+	xor	ebp,DWORD[rsp]
+	mov	eax,r12d
+	mov	DWORD[60+rsp],edx
+	mov	ecx,r11d
+	xor	ebp,DWORD[8+rsp]
+	xor	eax,esi
+	rol	ecx,5
+	xor	ebp,DWORD[32+rsp]
+	lea	edi,[((-899497514))+rdi*1+rdx]
+	xor	eax,r13d
+	add	edi,ecx
+	rol	r12d,30
+	add	edi,eax
+	rol	ebp,1
+	xor	r14d,DWORD[4+rsp]
+	mov	eax,r11d
+	mov	DWORD[rsp],ebp
+	mov	ecx,edi
+	xor	r14d,DWORD[12+rsp]
+	xor	eax,r13d
+	rol	ecx,5
+	xor	r14d,DWORD[36+rsp]
+	lea	esi,[((-899497514))+rsi*1+rbp]
+	xor	eax,r12d
+	add	esi,ecx
+	rol	r11d,30
+	add	esi,eax
+	rol	r14d,1
+	xor	edx,DWORD[8+rsp]
+	mov	eax,edi
+	mov	DWORD[4+rsp],r14d
+	mov	ecx,esi
+	xor	edx,DWORD[16+rsp]
+	xor	eax,r12d
+	rol	ecx,5
+	xor	edx,DWORD[40+rsp]
+	lea	r13d,[((-899497514))+r13*1+r14]
+	xor	eax,r11d
+	add	r13d,ecx
+	rol	edi,30
+	add	r13d,eax
+	rol	edx,1
+	xor	ebp,DWORD[12+rsp]
+	mov	eax,esi
+	mov	DWORD[8+rsp],edx
+	mov	ecx,r13d
+	xor	ebp,DWORD[20+rsp]
+	xor	eax,r11d
+	rol	ecx,5
+	xor	ebp,DWORD[44+rsp]
+	lea	r12d,[((-899497514))+r12*1+rdx]
+	xor	eax,edi
+	add	r12d,ecx
+	rol	esi,30
+	add	r12d,eax
+	rol	ebp,1
+	xor	r14d,DWORD[16+rsp]
+	mov	eax,r13d
+	mov	DWORD[12+rsp],ebp
+	mov	ecx,r12d
+	xor	r14d,DWORD[24+rsp]
+	xor	eax,edi
+	rol	ecx,5
+	xor	r14d,DWORD[48+rsp]
+	lea	r11d,[((-899497514))+r11*1+rbp]
+	xor	eax,esi
+	add	r11d,ecx
+	rol	r13d,30
+	add	r11d,eax
+	rol	r14d,1
+	xor	edx,DWORD[20+rsp]
+	mov	eax,r12d
+	mov	DWORD[16+rsp],r14d
+	mov	ecx,r11d
+	xor	edx,DWORD[28+rsp]
+	xor	eax,esi
+	rol	ecx,5
+	xor	edx,DWORD[52+rsp]
+	lea	edi,[((-899497514))+rdi*1+r14]
+	xor	eax,r13d
+	add	edi,ecx
+	rol	r12d,30
+	add	edi,eax
+	rol	edx,1
+	xor	ebp,DWORD[24+rsp]
+	mov	eax,r11d
+	mov	DWORD[20+rsp],edx
+	mov	ecx,edi
+	xor	ebp,DWORD[32+rsp]
+	xor	eax,r13d
+	rol	ecx,5
+	xor	ebp,DWORD[56+rsp]
+	lea	esi,[((-899497514))+rsi*1+rdx]
+	xor	eax,r12d
+	add	esi,ecx
+	rol	r11d,30
+	add	esi,eax
+	rol	ebp,1
+	xor	r14d,DWORD[28+rsp]
+	mov	eax,edi
+	mov	DWORD[24+rsp],ebp
+	mov	ecx,esi
+	xor	r14d,DWORD[36+rsp]
+	xor	eax,r12d
+	rol	ecx,5
+	xor	r14d,DWORD[60+rsp]
+	lea	r13d,[((-899497514))+r13*1+rbp]
+	xor	eax,r11d
+	add	r13d,ecx
+	rol	edi,30
+	add	r13d,eax
+	rol	r14d,1
+	xor	edx,DWORD[32+rsp]
+	mov	eax,esi
+	mov	DWORD[28+rsp],r14d
+	mov	ecx,r13d
+	xor	edx,DWORD[40+rsp]
+	xor	eax,r11d
+	rol	ecx,5
+	xor	edx,DWORD[rsp]
+	lea	r12d,[((-899497514))+r12*1+r14]
+	xor	eax,edi
+	add	r12d,ecx
+	rol	esi,30
+	add	r12d,eax
+	rol	edx,1
+	xor	ebp,DWORD[36+rsp]
+	mov	eax,r13d
+
+	mov	ecx,r12d
+	xor	ebp,DWORD[44+rsp]
+	xor	eax,edi
+	rol	ecx,5
+	xor	ebp,DWORD[4+rsp]
+	lea	r11d,[((-899497514))+r11*1+rdx]
+	xor	eax,esi
+	add	r11d,ecx
+	rol	r13d,30
+	add	r11d,eax
+	rol	ebp,1
+	xor	r14d,DWORD[40+rsp]
+	mov	eax,r12d
+
+	mov	ecx,r11d
+	xor	r14d,DWORD[48+rsp]
+	xor	eax,esi
+	rol	ecx,5
+	xor	r14d,DWORD[8+rsp]
+	lea	edi,[((-899497514))+rdi*1+rbp]
+	xor	eax,r13d
+	add	edi,ecx
+	rol	r12d,30
+	add	edi,eax
+	rol	r14d,1
+	xor	edx,DWORD[44+rsp]
+	mov	eax,r11d
+
+	mov	ecx,edi
+	xor	edx,DWORD[52+rsp]
+	xor	eax,r13d
+	rol	ecx,5
+	xor	edx,DWORD[12+rsp]
+	lea	esi,[((-899497514))+rsi*1+r14]
+	xor	eax,r12d
+	add	esi,ecx
+	rol	r11d,30
+	add	esi,eax
+	rol	edx,1
+	xor	ebp,DWORD[48+rsp]
+	mov	eax,edi
+
+	mov	ecx,esi
+	xor	ebp,DWORD[56+rsp]
+	xor	eax,r12d
+	rol	ecx,5
+	xor	ebp,DWORD[16+rsp]
+	lea	r13d,[((-899497514))+r13*1+rdx]
+	xor	eax,r11d
+	add	r13d,ecx
+	rol	edi,30
+	add	r13d,eax
+	rol	ebp,1
+	xor	r14d,DWORD[52+rsp]
+	mov	eax,esi
+
+	mov	ecx,r13d
+	xor	r14d,DWORD[60+rsp]
+	xor	eax,r11d
+	rol	ecx,5
+	xor	r14d,DWORD[20+rsp]
+	lea	r12d,[((-899497514))+r12*1+rbp]
+	xor	eax,edi
+	add	r12d,ecx
+	rol	esi,30
+	add	r12d,eax
+	rol	r14d,1
+	xor	edx,DWORD[56+rsp]
+	mov	eax,r13d
+
+	mov	ecx,r12d
+	xor	edx,DWORD[rsp]
+	xor	eax,edi
+	rol	ecx,5
+	xor	edx,DWORD[24+rsp]
+	lea	r11d,[((-899497514))+r11*1+r14]
+	xor	eax,esi
+	add	r11d,ecx
+	rol	r13d,30
+	add	r11d,eax
+	rol	edx,1
+	xor	ebp,DWORD[60+rsp]
+	mov	eax,r12d
+
+	mov	ecx,r11d
+	xor	ebp,DWORD[4+rsp]
+	xor	eax,esi
+	rol	ecx,5
+	xor	ebp,DWORD[28+rsp]
+	lea	edi,[((-899497514))+rdi*1+rdx]
+	xor	eax,r13d
+	add	edi,ecx
+	rol	r12d,30
+	add	edi,eax
+	rol	ebp,1
+	mov	eax,r11d
+	mov	ecx,edi
+	xor	eax,r13d
+	lea	esi,[((-899497514))+rsi*1+rbp]
+	rol	ecx,5
+	xor	eax,r12d
+	add	esi,ecx
+	rol	r11d,30
+	add	esi,eax
+	add	esi,DWORD[r8]
+	add	edi,DWORD[4+r8]
+	add	r11d,DWORD[8+r8]
+	add	r12d,DWORD[12+r8]
+	add	r13d,DWORD[16+r8]
+	mov	DWORD[r8],esi
+	mov	DWORD[4+r8],edi
+	mov	DWORD[8+r8],r11d
+	mov	DWORD[12+r8],r12d
+	mov	DWORD[16+r8],r13d
+
+	sub	r10,1
+	lea	r9,[64+r9]
+	jnz	NEAR $L$loop
+
+	mov	rsi,QWORD[64+rsp]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_sha1_block_data_order_nohw:
+global	sha1_block_data_order_hw
+
+ALIGN	32
+sha1_block_data_order_hw:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_sha1_block_data_order_hw:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+_CET_ENDBR
+	lea	rsp,[((-72))+rsp]
+	movaps	XMMWORD[(-8-64)+rax],xmm6
+	movaps	XMMWORD[(-8-48)+rax],xmm7
+	movaps	XMMWORD[(-8-32)+rax],xmm8
+	movaps	XMMWORD[(-8-16)+rax],xmm9
+$L$prologue_shaext:
+	movdqu	xmm0,XMMWORD[rdi]
+	movd	xmm1,DWORD[16+rdi]
+	movdqa	xmm3,XMMWORD[((K_XX_XX+160))]
+
+	movdqu	xmm4,XMMWORD[rsi]
+	pshufd	xmm0,xmm0,27
+	movdqu	xmm5,XMMWORD[16+rsi]
+	pshufd	xmm1,xmm1,27
+	movdqu	xmm6,XMMWORD[32+rsi]
+DB	102,15,56,0,227
+	movdqu	xmm7,XMMWORD[48+rsi]
+DB	102,15,56,0,235
+DB	102,15,56,0,243
+	movdqa	xmm9,xmm1
+DB	102,15,56,0,251
+	jmp	NEAR $L$oop_shaext
+
+ALIGN	16
+$L$oop_shaext:
+	dec	rdx
+	lea	r8,[64+rsi]
+	paddd	xmm1,xmm4
+	cmovne	rsi,r8
+	prefetcht0	[512+rsi]
+	movdqa	xmm8,xmm0
+	DB	15,56,201,229
+	movdqa	xmm2,xmm0
+	DB	15,58,204,193,0
+	DB	15,56,200,213
+	pxor	xmm4,xmm6
+	DB	15,56,201,238
+	DB	15,56,202,231
+
+	movdqa	xmm1,xmm0
+	DB	15,58,204,194,0
+	DB	15,56,200,206
+	pxor	xmm5,xmm7
+	DB	15,56,202,236
+	DB	15,56,201,247
+	movdqa	xmm2,xmm0
+	DB	15,58,204,193,0
+	DB	15,56,200,215
+	pxor	xmm6,xmm4
+	DB	15,56,201,252
+	DB	15,56,202,245
+
+	movdqa	xmm1,xmm0
+	DB	15,58,204,194,0
+	DB	15,56,200,204
+	pxor	xmm7,xmm5
+	DB	15,56,202,254
+	DB	15,56,201,229
+	movdqa	xmm2,xmm0
+	DB	15,58,204,193,0
+	DB	15,56,200,213
+	pxor	xmm4,xmm6
+	DB	15,56,201,238
+	DB	15,56,202,231
+
+	movdqa	xmm1,xmm0
+	DB	15,58,204,194,1
+	DB	15,56,200,206
+	pxor	xmm5,xmm7
+	DB	15,56,202,236
+	DB	15,56,201,247
+	movdqa	xmm2,xmm0
+	DB	15,58,204,193,1
+	DB	15,56,200,215
+	pxor	xmm6,xmm4
+	DB	15,56,201,252
+	DB	15,56,202,245
+
+	movdqa	xmm1,xmm0
+	DB	15,58,204,194,1
+	DB	15,56,200,204
+	pxor	xmm7,xmm5
+	DB	15,56,202,254
+	DB	15,56,201,229
+	movdqa	xmm2,xmm0
+	DB	15,58,204,193,1
+	DB	15,56,200,213
+	pxor	xmm4,xmm6
+	DB	15,56,201,238
+	DB	15,56,202,231
+
+	movdqa	xmm1,xmm0
+	DB	15,58,204,194,1
+	DB	15,56,200,206
+	pxor	xmm5,xmm7
+	DB	15,56,202,236
+	DB	15,56,201,247
+	movdqa	xmm2,xmm0
+	DB	15,58,204,193,2
+	DB	15,56,200,215
+	pxor	xmm6,xmm4
+	DB	15,56,201,252
+	DB	15,56,202,245
+
+	movdqa	xmm1,xmm0
+	DB	15,58,204,194,2
+	DB	15,56,200,204
+	pxor	xmm7,xmm5
+	DB	15,56,202,254
+	DB	15,56,201,229
+	movdqa	xmm2,xmm0
+	DB	15,58,204,193,2
+	DB	15,56,200,213
+	pxor	xmm4,xmm6
+	DB	15,56,201,238
+	DB	15,56,202,231
+
+	movdqa	xmm1,xmm0
+	DB	15,58,204,194,2
+	DB	15,56,200,206
+	pxor	xmm5,xmm7
+	DB	15,56,202,236
+	DB	15,56,201,247
+	movdqa	xmm2,xmm0
+	DB	15,58,204,193,2
+	DB	15,56,200,215
+	pxor	xmm6,xmm4
+	DB	15,56,201,252
+	DB	15,56,202,245
+
+	movdqa	xmm1,xmm0
+	DB	15,58,204,194,3
+	DB	15,56,200,204
+	pxor	xmm7,xmm5
+	DB	15,56,202,254
+	movdqu	xmm4,XMMWORD[rsi]
+	movdqa	xmm2,xmm0
+	DB	15,58,204,193,3
+	DB	15,56,200,213
+	movdqu	xmm5,XMMWORD[16+rsi]
+DB	102,15,56,0,227
+
+	movdqa	xmm1,xmm0
+	DB	15,58,204,194,3
+	DB	15,56,200,206
+	movdqu	xmm6,XMMWORD[32+rsi]
+DB	102,15,56,0,235
+
+	movdqa	xmm2,xmm0
+	DB	15,58,204,193,3
+	DB	15,56,200,215
+	movdqu	xmm7,XMMWORD[48+rsi]
+DB	102,15,56,0,243
+
+	movdqa	xmm1,xmm0
+	DB	15,58,204,194,3
+	DB	65,15,56,200,201
+DB	102,15,56,0,251
+
+	paddd	xmm0,xmm8
+	movdqa	xmm9,xmm1
+
+	jnz	NEAR $L$oop_shaext
+
+	pshufd	xmm0,xmm0,27
+	pshufd	xmm1,xmm1,27
+	movdqu	XMMWORD[rdi],xmm0
+	movd	DWORD[16+rdi],xmm1
+	movaps	xmm6,XMMWORD[((-8-64))+rax]
+	movaps	xmm7,XMMWORD[((-8-48))+rax]
+	movaps	xmm8,XMMWORD[((-8-32))+rax]
+	movaps	xmm9,XMMWORD[((-8-16))+rax]
+	mov	rsp,rax
+$L$epilogue_shaext:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_sha1_block_data_order_hw:
+global	sha1_block_data_order_ssse3
+
+ALIGN	16
+sha1_block_data_order_ssse3:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_sha1_block_data_order_ssse3:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+_CET_ENDBR
+	mov	r11,rsp
+
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	lea	rsp,[((-160))+rsp]
+	movaps	XMMWORD[(-40-96)+r11],xmm6
+	movaps	XMMWORD[(-40-80)+r11],xmm7
+	movaps	XMMWORD[(-40-64)+r11],xmm8
+	movaps	XMMWORD[(-40-48)+r11],xmm9
+	movaps	XMMWORD[(-40-32)+r11],xmm10
+	movaps	XMMWORD[(-40-16)+r11],xmm11
+$L$prologue_ssse3:
+	and	rsp,-64
+	mov	r8,rdi
+	mov	r9,rsi
+	mov	r10,rdx
+
+	shl	r10,6
+	add	r10,r9
+	lea	r14,[((K_XX_XX+64))]
+
+	mov	eax,DWORD[r8]
+	mov	ebx,DWORD[4+r8]
+	mov	ecx,DWORD[8+r8]
+	mov	edx,DWORD[12+r8]
+	mov	esi,ebx
+	mov	ebp,DWORD[16+r8]
+	mov	edi,ecx
+	xor	edi,edx
+	and	esi,edi
+
+	movdqa	xmm6,XMMWORD[64+r14]
+	movdqa	xmm9,XMMWORD[((-64))+r14]
+	movdqu	xmm0,XMMWORD[r9]
+	movdqu	xmm1,XMMWORD[16+r9]
+	movdqu	xmm2,XMMWORD[32+r9]
+	movdqu	xmm3,XMMWORD[48+r9]
+DB	102,15,56,0,198
+DB	102,15,56,0,206
+DB	102,15,56,0,214
+	add	r9,64
+	paddd	xmm0,xmm9
+DB	102,15,56,0,222
+	paddd	xmm1,xmm9
+	paddd	xmm2,xmm9
+	movdqa	XMMWORD[rsp],xmm0
+	psubd	xmm0,xmm9
+	movdqa	XMMWORD[16+rsp],xmm1
+	psubd	xmm1,xmm9
+	movdqa	XMMWORD[32+rsp],xmm2
+	psubd	xmm2,xmm9
+	jmp	NEAR $L$oop_ssse3
+ALIGN	16
+$L$oop_ssse3:
+	ror	ebx,2
+	pshufd	xmm4,xmm0,238
+	xor	esi,edx
+	movdqa	xmm8,xmm3
+	paddd	xmm9,xmm3
+	mov	edi,eax
+	add	ebp,DWORD[rsp]
+	punpcklqdq	xmm4,xmm1
+	xor	ebx,ecx
+	rol	eax,5
+	add	ebp,esi
+	psrldq	xmm8,4
+	and	edi,ebx
+	xor	ebx,ecx
+	pxor	xmm4,xmm0
+	add	ebp,eax
+	ror	eax,7
+	pxor	xmm8,xmm2
+	xor	edi,ecx
+	mov	esi,ebp
+	add	edx,DWORD[4+rsp]
+	pxor	xmm4,xmm8
+	xor	eax,ebx
+	rol	ebp,5
+	movdqa	XMMWORD[48+rsp],xmm9
+	add	edx,edi
+	and	esi,eax
+	movdqa	xmm10,xmm4
+	xor	eax,ebx
+	add	edx,ebp
+	ror	ebp,7
+	movdqa	xmm8,xmm4
+	xor	esi,ebx
+	pslldq	xmm10,12
+	paddd	xmm4,xmm4
+	mov	edi,edx
+	add	ecx,DWORD[8+rsp]
+	psrld	xmm8,31
+	xor	ebp,eax
+	rol	edx,5
+	add	ecx,esi
+	movdqa	xmm9,xmm10
+	and	edi,ebp
+	xor	ebp,eax
+	psrld	xmm10,30
+	add	ecx,edx
+	ror	edx,7
+	por	xmm4,xmm8
+	xor	edi,eax
+	mov	esi,ecx
+	add	ebx,DWORD[12+rsp]
+	pslld	xmm9,2
+	pxor	xmm4,xmm10
+	xor	edx,ebp
+	movdqa	xmm10,XMMWORD[((-64))+r14]
+	rol	ecx,5
+	add	ebx,edi
+	and	esi,edx
+	pxor	xmm4,xmm9
+	xor	edx,ebp
+	add	ebx,ecx
+	ror	ecx,7
+	pshufd	xmm5,xmm1,238
+	xor	esi,ebp
+	movdqa	xmm9,xmm4
+	paddd	xmm10,xmm4
+	mov	edi,ebx
+	add	eax,DWORD[16+rsp]
+	punpcklqdq	xmm5,xmm2
+	xor	ecx,edx
+	rol	ebx,5
+	add	eax,esi
+	psrldq	xmm9,4
+	and	edi,ecx
+	xor	ecx,edx
+	pxor	xmm5,xmm1
+	add	eax,ebx
+	ror	ebx,7
+	pxor	xmm9,xmm3
+	xor	edi,edx
+	mov	esi,eax
+	add	ebp,DWORD[20+rsp]
+	pxor	xmm5,xmm9
+	xor	ebx,ecx
+	rol	eax,5
+	movdqa	XMMWORD[rsp],xmm10
+	add	ebp,edi
+	and	esi,ebx
+	movdqa	xmm8,xmm5
+	xor	ebx,ecx
+	add	ebp,eax
+	ror	eax,7
+	movdqa	xmm9,xmm5
+	xor	esi,ecx
+	pslldq	xmm8,12
+	paddd	xmm5,xmm5
+	mov	edi,ebp
+	add	edx,DWORD[24+rsp]
+	psrld	xmm9,31
+	xor	eax,ebx
+	rol	ebp,5
+	add	edx,esi
+	movdqa	xmm10,xmm8
+	and	edi,eax
+	xor	eax,ebx
+	psrld	xmm8,30
+	add	edx,ebp
+	ror	ebp,7
+	por	xmm5,xmm9
+	xor	edi,ebx
+	mov	esi,edx
+	add	ecx,DWORD[28+rsp]
+	pslld	xmm10,2
+	pxor	xmm5,xmm8
+	xor	ebp,eax
+	movdqa	xmm8,XMMWORD[((-32))+r14]
+	rol	edx,5
+	add	ecx,edi
+	and	esi,ebp
+	pxor	xmm5,xmm10
+	xor	ebp,eax
+	add	ecx,edx
+	ror	edx,7
+	pshufd	xmm6,xmm2,238
+	xor	esi,eax
+	movdqa	xmm10,xmm5
+	paddd	xmm8,xmm5
+	mov	edi,ecx
+	add	ebx,DWORD[32+rsp]
+	punpcklqdq	xmm6,xmm3
+	xor	edx,ebp
+	rol	ecx,5
+	add	ebx,esi
+	psrldq	xmm10,4
+	and	edi,edx
+	xor	edx,ebp
+	pxor	xmm6,xmm2
+	add	ebx,ecx
+	ror	ecx,7
+	pxor	xmm10,xmm4
+	xor	edi,ebp
+	mov	esi,ebx
+	add	eax,DWORD[36+rsp]
+	pxor	xmm6,xmm10
+	xor	ecx,edx
+	rol	ebx,5
+	movdqa	XMMWORD[16+rsp],xmm8
+	add	eax,edi
+	and	esi,ecx
+	movdqa	xmm9,xmm6
+	xor	ecx,edx
+	add	eax,ebx
+	ror	ebx,7
+	movdqa	xmm10,xmm6
+	xor	esi,edx
+	pslldq	xmm9,12
+	paddd	xmm6,xmm6
+	mov	edi,eax
+	add	ebp,DWORD[40+rsp]
+	psrld	xmm10,31
+	xor	ebx,ecx
+	rol	eax,5
+	add	ebp,esi
+	movdqa	xmm8,xmm9
+	and	edi,ebx
+	xor	ebx,ecx
+	psrld	xmm9,30
+	add	ebp,eax
+	ror	eax,7
+	por	xmm6,xmm10
+	xor	edi,ecx
+	mov	esi,ebp
+	add	edx,DWORD[44+rsp]
+	pslld	xmm8,2
+	pxor	xmm6,xmm9
+	xor	eax,ebx
+	movdqa	xmm9,XMMWORD[((-32))+r14]
+	rol	ebp,5
+	add	edx,edi
+	and	esi,eax
+	pxor	xmm6,xmm8
+	xor	eax,ebx
+	add	edx,ebp
+	ror	ebp,7
+	pshufd	xmm7,xmm3,238
+	xor	esi,ebx
+	movdqa	xmm8,xmm6
+	paddd	xmm9,xmm6
+	mov	edi,edx
+	add	ecx,DWORD[48+rsp]
+	punpcklqdq	xmm7,xmm4
+	xor	ebp,eax
+	rol	edx,5
+	add	ecx,esi
+	psrldq	xmm8,4
+	and	edi,ebp
+	xor	ebp,eax
+	pxor	xmm7,xmm3
+	add	ecx,edx
+	ror	edx,7
+	pxor	xmm8,xmm5
+	xor	edi,eax
+	mov	esi,ecx
+	add	ebx,DWORD[52+rsp]
+	pxor	xmm7,xmm8
+	xor	edx,ebp
+	rol	ecx,5
+	movdqa	XMMWORD[32+rsp],xmm9
+	add	ebx,edi
+	and	esi,edx
+	movdqa	xmm10,xmm7
+	xor	edx,ebp
+	add	ebx,ecx
+	ror	ecx,7
+	movdqa	xmm8,xmm7
+	xor	esi,ebp
+	pslldq	xmm10,12
+	paddd	xmm7,xmm7
+	mov	edi,ebx
+	add	eax,DWORD[56+rsp]
+	psrld	xmm8,31
+	xor	ecx,edx
+	rol	ebx,5
+	add	eax,esi
+	movdqa	xmm9,xmm10
+	and	edi,ecx
+	xor	ecx,edx
+	psrld	xmm10,30
+	add	eax,ebx
+	ror	ebx,7
+	por	xmm7,xmm8
+	xor	edi,edx
+	mov	esi,eax
+	add	ebp,DWORD[60+rsp]
+	pslld	xmm9,2
+	pxor	xmm7,xmm10
+	xor	ebx,ecx
+	movdqa	xmm10,XMMWORD[((-32))+r14]
+	rol	eax,5
+	add	ebp,edi
+	and	esi,ebx
+	pxor	xmm7,xmm9
+	pshufd	xmm9,xmm6,238
+	xor	ebx,ecx
+	add	ebp,eax
+	ror	eax,7
+	pxor	xmm0,xmm4
+	xor	esi,ecx
+	mov	edi,ebp
+	add	edx,DWORD[rsp]
+	punpcklqdq	xmm9,xmm7
+	xor	eax,ebx
+	rol	ebp,5
+	pxor	xmm0,xmm1
+	add	edx,esi
+	and	edi,eax
+	movdqa	xmm8,xmm10
+	xor	eax,ebx
+	paddd	xmm10,xmm7
+	add	edx,ebp
+	pxor	xmm0,xmm9
+	ror	ebp,7
+	xor	edi,ebx
+	mov	esi,edx
+	add	ecx,DWORD[4+rsp]
+	movdqa	xmm9,xmm0
+	xor	ebp,eax
+	rol	edx,5
+	movdqa	XMMWORD[48+rsp],xmm10
+	add	ecx,edi
+	and	esi,ebp
+	xor	ebp,eax
+	pslld	xmm0,2
+	add	ecx,edx
+	ror	edx,7
+	psrld	xmm9,30
+	xor	esi,eax
+	mov	edi,ecx
+	add	ebx,DWORD[8+rsp]
+	por	xmm0,xmm9
+	xor	edx,ebp
+	rol	ecx,5
+	pshufd	xmm10,xmm7,238
+	add	ebx,esi
+	and	edi,edx
+	xor	edx,ebp
+	add	ebx,ecx
+	add	eax,DWORD[12+rsp]
+	xor	edi,ebp
+	mov	esi,ebx
+	rol	ebx,5
+	add	eax,edi
+	xor	esi,edx
+	ror	ecx,7
+	add	eax,ebx
+	pxor	xmm1,xmm5
+	add	ebp,DWORD[16+rsp]
+	xor	esi,ecx
+	punpcklqdq	xmm10,xmm0
+	mov	edi,eax
+	rol	eax,5
+	pxor	xmm1,xmm2
+	add	ebp,esi
+	xor	edi,ecx
+	movdqa	xmm9,xmm8
+	ror	ebx,7
+	paddd	xmm8,xmm0
+	add	ebp,eax
+	pxor	xmm1,xmm10
+	add	edx,DWORD[20+rsp]
+	xor	edi,ebx
+	mov	esi,ebp
+	rol	ebp,5
+	movdqa	xmm10,xmm1
+	add	edx,edi
+	xor	esi,ebx
+	movdqa	XMMWORD[rsp],xmm8
+	ror	eax,7
+	add	edx,ebp
+	add	ecx,DWORD[24+rsp]
+	pslld	xmm1,2
+	xor	esi,eax
+	mov	edi,edx
+	psrld	xmm10,30
+	rol	edx,5
+	add	ecx,esi
+	xor	edi,eax
+	ror	ebp,7
+	por	xmm1,xmm10
+	add	ecx,edx
+	add	ebx,DWORD[28+rsp]
+	pshufd	xmm8,xmm0,238
+	xor	edi,ebp
+	mov	esi,ecx
+	rol	ecx,5
+	add	ebx,edi
+	xor	esi,ebp
+	ror	edx,7
+	add	ebx,ecx
+	pxor	xmm2,xmm6
+	add	eax,DWORD[32+rsp]
+	xor	esi,edx
+	punpcklqdq	xmm8,xmm1
+	mov	edi,ebx
+	rol	ebx,5
+	pxor	xmm2,xmm3
+	add	eax,esi
+	xor	edi,edx
+	movdqa	xmm10,XMMWORD[r14]
+	ror	ecx,7
+	paddd	xmm9,xmm1
+	add	eax,ebx
+	pxor	xmm2,xmm8
+	add	ebp,DWORD[36+rsp]
+	xor	edi,ecx
+	mov	esi,eax
+	rol	eax,5
+	movdqa	xmm8,xmm2
+	add	ebp,edi
+	xor	esi,ecx
+	movdqa	XMMWORD[16+rsp],xmm9
+	ror	ebx,7
+	add	ebp,eax
+	add	edx,DWORD[40+rsp]
+	pslld	xmm2,2
+	xor	esi,ebx
+	mov	edi,ebp
+	psrld	xmm8,30
+	rol	ebp,5
+	add	edx,esi
+	xor	edi,ebx
+	ror	eax,7
+	por	xmm2,xmm8
+	add	edx,ebp
+	add	ecx,DWORD[44+rsp]
+	pshufd	xmm9,xmm1,238
+	xor	edi,eax
+	mov	esi,edx
+	rol	edx,5
+	add	ecx,edi
+	xor	esi,eax
+	ror	ebp,7
+	add	ecx,edx
+	pxor	xmm3,xmm7
+	add	ebx,DWORD[48+rsp]
+	xor	esi,ebp
+	punpcklqdq	xmm9,xmm2
+	mov	edi,ecx
+	rol	ecx,5
+	pxor	xmm3,xmm4
+	add	ebx,esi
+	xor	edi,ebp
+	movdqa	xmm8,xmm10
+	ror	edx,7
+	paddd	xmm10,xmm2
+	add	ebx,ecx
+	pxor	xmm3,xmm9
+	add	eax,DWORD[52+rsp]
+	xor	edi,edx
+	mov	esi,ebx
+	rol	ebx,5
+	movdqa	xmm9,xmm3
+	add	eax,edi
+	xor	esi,edx
+	movdqa	XMMWORD[32+rsp],xmm10
+	ror	ecx,7
+	add	eax,ebx
+	add	ebp,DWORD[56+rsp]
+	pslld	xmm3,2
+	xor	esi,ecx
+	mov	edi,eax
+	psrld	xmm9,30
+	rol	eax,5
+	add	ebp,esi
+	xor	edi,ecx
+	ror	ebx,7
+	por	xmm3,xmm9
+	add	ebp,eax
+	add	edx,DWORD[60+rsp]
+	pshufd	xmm10,xmm2,238
+	xor	edi,ebx
+	mov	esi,ebp
+	rol	ebp,5
+	add	edx,edi
+	xor	esi,ebx
+	ror	eax,7
+	add	edx,ebp
+	pxor	xmm4,xmm0
+	add	ecx,DWORD[rsp]
+	xor	esi,eax
+	punpcklqdq	xmm10,xmm3
+	mov	edi,edx
+	rol	edx,5
+	pxor	xmm4,xmm5
+	add	ecx,esi
+	xor	edi,eax
+	movdqa	xmm9,xmm8
+	ror	ebp,7
+	paddd	xmm8,xmm3
+	add	ecx,edx
+	pxor	xmm4,xmm10
+	add	ebx,DWORD[4+rsp]
+	xor	edi,ebp
+	mov	esi,ecx
+	rol	ecx,5
+	movdqa	xmm10,xmm4
+	add	ebx,edi
+	xor	esi,ebp
+	movdqa	XMMWORD[48+rsp],xmm8
+	ror	edx,7
+	add	ebx,ecx
+	add	eax,DWORD[8+rsp]
+	pslld	xmm4,2
+	xor	esi,edx
+	mov	edi,ebx
+	psrld	xmm10,30
+	rol	ebx,5
+	add	eax,esi
+	xor	edi,edx
+	ror	ecx,7
+	por	xmm4,xmm10
+	add	eax,ebx
+	add	ebp,DWORD[12+rsp]
+	pshufd	xmm8,xmm3,238
+	xor	edi,ecx
+	mov	esi,eax
+	rol	eax,5
+	add	ebp,edi
+	xor	esi,ecx
+	ror	ebx,7
+	add	ebp,eax
+	pxor	xmm5,xmm1
+	add	edx,DWORD[16+rsp]
+	xor	esi,ebx
+	punpcklqdq	xmm8,xmm4
+	mov	edi,ebp
+	rol	ebp,5
+	pxor	xmm5,xmm6
+	add	edx,esi
+	xor	edi,ebx
+	movdqa	xmm10,xmm9
+	ror	eax,7
+	paddd	xmm9,xmm4
+	add	edx,ebp
+	pxor	xmm5,xmm8
+	add	ecx,DWORD[20+rsp]
+	xor	edi,eax
+	mov	esi,edx
+	rol	edx,5
+	movdqa	xmm8,xmm5
+	add	ecx,edi
+	xor	esi,eax
+	movdqa	XMMWORD[rsp],xmm9
+	ror	ebp,7
+	add	ecx,edx
+	add	ebx,DWORD[24+rsp]
+	pslld	xmm5,2
+	xor	esi,ebp
+	mov	edi,ecx
+	psrld	xmm8,30
+	rol	ecx,5
+	add	ebx,esi
+	xor	edi,ebp
+	ror	edx,7
+	por	xmm5,xmm8
+	add	ebx,ecx
+	add	eax,DWORD[28+rsp]
+	pshufd	xmm9,xmm4,238
+	ror	ecx,7
+	mov	esi,ebx
+	xor	edi,edx
+	rol	ebx,5
+	add	eax,edi
+	xor	esi,ecx
+	xor	ecx,edx
+	add	eax,ebx
+	pxor	xmm6,xmm2
+	add	ebp,DWORD[32+rsp]
+	and	esi,ecx
+	xor	ecx,edx
+	ror	ebx,7
+	punpcklqdq	xmm9,xmm5
+	mov	edi,eax
+	xor	esi,ecx
+	pxor	xmm6,xmm7
+	rol	eax,5
+	add	ebp,esi
+	movdqa	xmm8,xmm10
+	xor	edi,ebx
+	paddd	xmm10,xmm5
+	xor	ebx,ecx
+	pxor	xmm6,xmm9
+	add	ebp,eax
+	add	edx,DWORD[36+rsp]
+	and	edi,ebx
+	xor	ebx,ecx
+	ror	eax,7
+	movdqa	xmm9,xmm6
+	mov	esi,ebp
+	xor	edi,ebx
+	movdqa	XMMWORD[16+rsp],xmm10
+	rol	ebp,5
+	add	edx,edi
+	xor	esi,eax
+	pslld	xmm6,2
+	xor	eax,ebx
+	add	edx,ebp
+	psrld	xmm9,30
+	add	ecx,DWORD[40+rsp]
+	and	esi,eax
+	xor	eax,ebx
+	por	xmm6,xmm9
+	ror	ebp,7
+	mov	edi,edx
+	xor	esi,eax
+	rol	edx,5
+	pshufd	xmm10,xmm5,238
+	add	ecx,esi
+	xor	edi,ebp
+	xor	ebp,eax
+	add	ecx,edx
+	add	ebx,DWORD[44+rsp]
+	and	edi,ebp
+	xor	ebp,eax
+	ror	edx,7
+	mov	esi,ecx
+	xor	edi,ebp
+	rol	ecx,5
+	add	ebx,edi
+	xor	esi,edx
+	xor	edx,ebp
+	add	ebx,ecx
+	pxor	xmm7,xmm3
+	add	eax,DWORD[48+rsp]
+	and	esi,edx
+	xor	edx,ebp
+	ror	ecx,7
+	punpcklqdq	xmm10,xmm6
+	mov	edi,ebx
+	xor	esi,edx
+	pxor	xmm7,xmm0
+	rol	ebx,5
+	add	eax,esi
+	movdqa	xmm9,XMMWORD[32+r14]
+	xor	edi,ecx
+	paddd	xmm8,xmm6
+	xor	ecx,edx
+	pxor	xmm7,xmm10
+	add	eax,ebx
+	add	ebp,DWORD[52+rsp]
+	and	edi,ecx
+	xor	ecx,edx
+	ror	ebx,7
+	movdqa	xmm10,xmm7
+	mov	esi,eax
+	xor	edi,ecx
+	movdqa	XMMWORD[32+rsp],xmm8
+	rol	eax,5
+	add	ebp,edi
+	xor	esi,ebx
+	pslld	xmm7,2
+	xor	ebx,ecx
+	add	ebp,eax
+	psrld	xmm10,30
+	add	edx,DWORD[56+rsp]
+	and	esi,ebx
+	xor	ebx,ecx
+	por	xmm7,xmm10
+	ror	eax,7
+	mov	edi,ebp
+	xor	esi,ebx
+	rol	ebp,5
+	pshufd	xmm8,xmm6,238
+	add	edx,esi
+	xor	edi,eax
+	xor	eax,ebx
+	add	edx,ebp
+	add	ecx,DWORD[60+rsp]
+	and	edi,eax
+	xor	eax,ebx
+	ror	ebp,7
+	mov	esi,edx
+	xor	edi,eax
+	rol	edx,5
+	add	ecx,edi
+	xor	esi,ebp
+	xor	ebp,eax
+	add	ecx,edx
+	pxor	xmm0,xmm4
+	add	ebx,DWORD[rsp]
+	and	esi,ebp
+	xor	ebp,eax
+	ror	edx,7
+	punpcklqdq	xmm8,xmm7
+	mov	edi,ecx
+	xor	esi,ebp
+	pxor	xmm0,xmm1
+	rol	ecx,5
+	add	ebx,esi
+	movdqa	xmm10,xmm9
+	xor	edi,edx
+	paddd	xmm9,xmm7
+	xor	edx,ebp
+	pxor	xmm0,xmm8
+	add	ebx,ecx
+	add	eax,DWORD[4+rsp]
+	and	edi,edx
+	xor	edx,ebp
+	ror	ecx,7
+	movdqa	xmm8,xmm0
+	mov	esi,ebx
+	xor	edi,edx
+	movdqa	XMMWORD[48+rsp],xmm9
+	rol	ebx,5
+	add	eax,edi
+	xor	esi,ecx
+	pslld	xmm0,2
+	xor	ecx,edx
+	add	eax,ebx
+	psrld	xmm8,30
+	add	ebp,DWORD[8+rsp]
+	and	esi,ecx
+	xor	ecx,edx
+	por	xmm0,xmm8
+	ror	ebx,7
+	mov	edi,eax
+	xor	esi,ecx
+	rol	eax,5
+	pshufd	xmm9,xmm7,238
+	add	ebp,esi
+	xor	edi,ebx
+	xor	ebx,ecx
+	add	ebp,eax
+	add	edx,DWORD[12+rsp]
+	and	edi,ebx
+	xor	ebx,ecx
+	ror	eax,7
+	mov	esi,ebp
+	xor	edi,ebx
+	rol	ebp,5
+	add	edx,edi
+	xor	esi,eax
+	xor	eax,ebx
+	add	edx,ebp
+	pxor	xmm1,xmm5
+	add	ecx,DWORD[16+rsp]
+	and	esi,eax
+	xor	eax,ebx
+	ror	ebp,7
+	punpcklqdq	xmm9,xmm0
+	mov	edi,edx
+	xor	esi,eax
+	pxor	xmm1,xmm2
+	rol	edx,5
+	add	ecx,esi
+	movdqa	xmm8,xmm10
+	xor	edi,ebp
+	paddd	xmm10,xmm0
+	xor	ebp,eax
+	pxor	xmm1,xmm9
+	add	ecx,edx
+	add	ebx,DWORD[20+rsp]
+	and	edi,ebp
+	xor	ebp,eax
+	ror	edx,7
+	movdqa	xmm9,xmm1
+	mov	esi,ecx
+	xor	edi,ebp
+	movdqa	XMMWORD[rsp],xmm10
+	rol	ecx,5
+	add	ebx,edi
+	xor	esi,edx
+	pslld	xmm1,2
+	xor	edx,ebp
+	add	ebx,ecx
+	psrld	xmm9,30
+	add	eax,DWORD[24+rsp]
+	and	esi,edx
+	xor	edx,ebp
+	por	xmm1,xmm9
+	ror	ecx,7
+	mov	edi,ebx
+	xor	esi,edx
+	rol	ebx,5
+	pshufd	xmm10,xmm0,238
+	add	eax,esi
+	xor	edi,ecx
+	xor	ecx,edx
+	add	eax,ebx
+	add	ebp,DWORD[28+rsp]
+	and	edi,ecx
+	xor	ecx,edx
+	ror	ebx,7
+	mov	esi,eax
+	xor	edi,ecx
+	rol	eax,5
+	add	ebp,edi
+	xor	esi,ebx
+	xor	ebx,ecx
+	add	ebp,eax
+	pxor	xmm2,xmm6
+	add	edx,DWORD[32+rsp]
+	and	esi,ebx
+	xor	ebx,ecx
+	ror	eax,7
+	punpcklqdq	xmm10,xmm1
+	mov	edi,ebp
+	xor	esi,ebx
+	pxor	xmm2,xmm3
+	rol	ebp,5
+	add	edx,esi
+	movdqa	xmm9,xmm8
+	xor	edi,eax
+	paddd	xmm8,xmm1
+	xor	eax,ebx
+	pxor	xmm2,xmm10
+	add	edx,ebp
+	add	ecx,DWORD[36+rsp]
+	and	edi,eax
+	xor	eax,ebx
+	ror	ebp,7
+	movdqa	xmm10,xmm2
+	mov	esi,edx
+	xor	edi,eax
+	movdqa	XMMWORD[16+rsp],xmm8
+	rol	edx,5
+	add	ecx,edi
+	xor	esi,ebp
+	pslld	xmm2,2
+	xor	ebp,eax
+	add	ecx,edx
+	psrld	xmm10,30
+	add	ebx,DWORD[40+rsp]
+	and	esi,ebp
+	xor	ebp,eax
+	por	xmm2,xmm10
+	ror	edx,7
+	mov	edi,ecx
+	xor	esi,ebp
+	rol	ecx,5
+	pshufd	xmm8,xmm1,238
+	add	ebx,esi
+	xor	edi,edx
+	xor	edx,ebp
+	add	ebx,ecx
+	add	eax,DWORD[44+rsp]
+	and	edi,edx
+	xor	edx,ebp
+	ror	ecx,7
+	mov	esi,ebx
+	xor	edi,edx
+	rol	ebx,5
+	add	eax,edi
+	xor	esi,edx
+	add	eax,ebx
+	pxor	xmm3,xmm7
+	add	ebp,DWORD[48+rsp]
+	xor	esi,ecx
+	punpcklqdq	xmm8,xmm2
+	mov	edi,eax
+	rol	eax,5
+	pxor	xmm3,xmm4
+	add	ebp,esi
+	xor	edi,ecx
+	movdqa	xmm10,xmm9
+	ror	ebx,7
+	paddd	xmm9,xmm2
+	add	ebp,eax
+	pxor	xmm3,xmm8
+	add	edx,DWORD[52+rsp]
+	xor	edi,ebx
+	mov	esi,ebp
+	rol	ebp,5
+	movdqa	xmm8,xmm3
+	add	edx,edi
+	xor	esi,ebx
+	movdqa	XMMWORD[32+rsp],xmm9
+	ror	eax,7
+	add	edx,ebp
+	add	ecx,DWORD[56+rsp]
+	pslld	xmm3,2
+	xor	esi,eax
+	mov	edi,edx
+	psrld	xmm8,30
+	rol	edx,5
+	add	ecx,esi
+	xor	edi,eax
+	ror	ebp,7
+	por	xmm3,xmm8
+	add	ecx,edx
+	add	ebx,DWORD[60+rsp]
+	xor	edi,ebp
+	mov	esi,ecx
+	rol	ecx,5
+	add	ebx,edi
+	xor	esi,ebp
+	ror	edx,7
+	add	ebx,ecx
+	add	eax,DWORD[rsp]
+	xor	esi,edx
+	mov	edi,ebx
+	rol	ebx,5
+	paddd	xmm10,xmm3
+	add	eax,esi
+	xor	edi,edx
+	movdqa	XMMWORD[48+rsp],xmm10
+	ror	ecx,7
+	add	eax,ebx
+	add	ebp,DWORD[4+rsp]
+	xor	edi,ecx
+	mov	esi,eax
+	rol	eax,5
+	add	ebp,edi
+	xor	esi,ecx
+	ror	ebx,7
+	add	ebp,eax
+	add	edx,DWORD[8+rsp]
+	xor	esi,ebx
+	mov	edi,ebp
+	rol	ebp,5
+	add	edx,esi
+	xor	edi,ebx
+	ror	eax,7
+	add	edx,ebp
+	add	ecx,DWORD[12+rsp]
+	xor	edi,eax
+	mov	esi,edx
+	rol	edx,5
+	add	ecx,edi
+	xor	esi,eax
+	ror	ebp,7
+	add	ecx,edx
+	cmp	r9,r10
+	je	NEAR $L$done_ssse3
+	movdqa	xmm6,XMMWORD[64+r14]
+	movdqa	xmm9,XMMWORD[((-64))+r14]
+	movdqu	xmm0,XMMWORD[r9]
+	movdqu	xmm1,XMMWORD[16+r9]
+	movdqu	xmm2,XMMWORD[32+r9]
+	movdqu	xmm3,XMMWORD[48+r9]
+DB	102,15,56,0,198
+	add	r9,64
+	add	ebx,DWORD[16+rsp]
+	xor	esi,ebp
+	mov	edi,ecx
+DB	102,15,56,0,206
+	rol	ecx,5
+	add	ebx,esi
+	xor	edi,ebp
+	ror	edx,7
+	paddd	xmm0,xmm9
+	add	ebx,ecx
+	add	eax,DWORD[20+rsp]
+	xor	edi,edx
+	mov	esi,ebx
+	movdqa	XMMWORD[rsp],xmm0
+	rol	ebx,5
+	add	eax,edi
+	xor	esi,edx
+	ror	ecx,7
+	psubd	xmm0,xmm9
+	add	eax,ebx
+	add	ebp,DWORD[24+rsp]
+	xor	esi,ecx
+	mov	edi,eax
+	rol	eax,5
+	add	ebp,esi
+	xor	edi,ecx
+	ror	ebx,7
+	add	ebp,eax
+	add	edx,DWORD[28+rsp]
+	xor	edi,ebx
+	mov	esi,ebp
+	rol	ebp,5
+	add	edx,edi
+	xor	esi,ebx
+	ror	eax,7
+	add	edx,ebp
+	add	ecx,DWORD[32+rsp]
+	xor	esi,eax
+	mov	edi,edx
+DB	102,15,56,0,214
+	rol	edx,5
+	add	ecx,esi
+	xor	edi,eax
+	ror	ebp,7
+	paddd	xmm1,xmm9
+	add	ecx,edx
+	add	ebx,DWORD[36+rsp]
+	xor	edi,ebp
+	mov	esi,ecx
+	movdqa	XMMWORD[16+rsp],xmm1
+	rol	ecx,5
+	add	ebx,edi
+	xor	esi,ebp
+	ror	edx,7
+	psubd	xmm1,xmm9
+	add	ebx,ecx
+	add	eax,DWORD[40+rsp]
+	xor	esi,edx
+	mov	edi,ebx
+	rol	ebx,5
+	add	eax,esi
+	xor	edi,edx
+	ror	ecx,7
+	add	eax,ebx
+	add	ebp,DWORD[44+rsp]
+	xor	edi,ecx
+	mov	esi,eax
+	rol	eax,5
+	add	ebp,edi
+	xor	esi,ecx
+	ror	ebx,7
+	add	ebp,eax
+	add	edx,DWORD[48+rsp]
+	xor	esi,ebx
+	mov	edi,ebp
+DB	102,15,56,0,222
+	rol	ebp,5
+	add	edx,esi
+	xor	edi,ebx
+	ror	eax,7
+	paddd	xmm2,xmm9
+	add	edx,ebp
+	add	ecx,DWORD[52+rsp]
+	xor	edi,eax
+	mov	esi,edx
+	movdqa	XMMWORD[32+rsp],xmm2
+	rol	edx,5
+	add	ecx,edi
+	xor	esi,eax
+	ror	ebp,7
+	psubd	xmm2,xmm9
+	add	ecx,edx
+	add	ebx,DWORD[56+rsp]
+	xor	esi,ebp
+	mov	edi,ecx
+	rol	ecx,5
+	add	ebx,esi
+	xor	edi,ebp
+	ror	edx,7
+	add	ebx,ecx
+	add	eax,DWORD[60+rsp]
+	xor	edi,edx
+	mov	esi,ebx
+	rol	ebx,5
+	add	eax,edi
+	ror	ecx,7
+	add	eax,ebx
+	add	eax,DWORD[r8]
+	add	esi,DWORD[4+r8]
+	add	ecx,DWORD[8+r8]
+	add	edx,DWORD[12+r8]
+	mov	DWORD[r8],eax
+	add	ebp,DWORD[16+r8]
+	mov	DWORD[4+r8],esi
+	mov	ebx,esi
+	mov	DWORD[8+r8],ecx
+	mov	edi,ecx
+	mov	DWORD[12+r8],edx
+	xor	edi,edx
+	mov	DWORD[16+r8],ebp
+	and	esi,edi
+	jmp	NEAR $L$oop_ssse3
+
+ALIGN	16
+$L$done_ssse3:
+	add	ebx,DWORD[16+rsp]
+	xor	esi,ebp
+	mov	edi,ecx
+	rol	ecx,5
+	add	ebx,esi
+	xor	edi,ebp
+	ror	edx,7
+	add	ebx,ecx
+	add	eax,DWORD[20+rsp]
+	xor	edi,edx
+	mov	esi,ebx
+	rol	ebx,5
+	add	eax,edi
+	xor	esi,edx
+	ror	ecx,7
+	add	eax,ebx
+	add	ebp,DWORD[24+rsp]
+	xor	esi,ecx
+	mov	edi,eax
+	rol	eax,5
+	add	ebp,esi
+	xor	edi,ecx
+	ror	ebx,7
+	add	ebp,eax
+	add	edx,DWORD[28+rsp]
+	xor	edi,ebx
+	mov	esi,ebp
+	rol	ebp,5
+	add	edx,edi
+	xor	esi,ebx
+	ror	eax,7
+	add	edx,ebp
+	add	ecx,DWORD[32+rsp]
+	xor	esi,eax
+	mov	edi,edx
+	rol	edx,5
+	add	ecx,esi
+	xor	edi,eax
+	ror	ebp,7
+	add	ecx,edx
+	add	ebx,DWORD[36+rsp]
+	xor	edi,ebp
+	mov	esi,ecx
+	rol	ecx,5
+	add	ebx,edi
+	xor	esi,ebp
+	ror	edx,7
+	add	ebx,ecx
+	add	eax,DWORD[40+rsp]
+	xor	esi,edx
+	mov	edi,ebx
+	rol	ebx,5
+	add	eax,esi
+	xor	edi,edx
+	ror	ecx,7
+	add	eax,ebx
+	add	ebp,DWORD[44+rsp]
+	xor	edi,ecx
+	mov	esi,eax
+	rol	eax,5
+	add	ebp,edi
+	xor	esi,ecx
+	ror	ebx,7
+	add	ebp,eax
+	add	edx,DWORD[48+rsp]
+	xor	esi,ebx
+	mov	edi,ebp
+	rol	ebp,5
+	add	edx,esi
+	xor	edi,ebx
+	ror	eax,7
+	add	edx,ebp
+	add	ecx,DWORD[52+rsp]
+	xor	edi,eax
+	mov	esi,edx
+	rol	edx,5
+	add	ecx,edi
+	xor	esi,eax
+	ror	ebp,7
+	add	ecx,edx
+	add	ebx,DWORD[56+rsp]
+	xor	esi,ebp
+	mov	edi,ecx
+	rol	ecx,5
+	add	ebx,esi
+	xor	edi,ebp
+	ror	edx,7
+	add	ebx,ecx
+	add	eax,DWORD[60+rsp]
+	xor	edi,edx
+	mov	esi,ebx
+	rol	ebx,5
+	add	eax,edi
+	ror	ecx,7
+	add	eax,ebx
+	add	eax,DWORD[r8]
+	add	esi,DWORD[4+r8]
+	add	ecx,DWORD[8+r8]
+	mov	DWORD[r8],eax
+	add	edx,DWORD[12+r8]
+	mov	DWORD[4+r8],esi
+	add	ebp,DWORD[16+r8]
+	mov	DWORD[8+r8],ecx
+	mov	DWORD[12+r8],edx
+	mov	DWORD[16+r8],ebp
+	movaps	xmm6,XMMWORD[((-40-96))+r11]
+	movaps	xmm7,XMMWORD[((-40-80))+r11]
+	movaps	xmm8,XMMWORD[((-40-64))+r11]
+	movaps	xmm9,XMMWORD[((-40-48))+r11]
+	movaps	xmm10,XMMWORD[((-40-32))+r11]
+	movaps	xmm11,XMMWORD[((-40-16))+r11]
+	mov	r14,QWORD[((-40))+r11]
+
+	mov	r13,QWORD[((-32))+r11]
+
+	mov	r12,QWORD[((-24))+r11]
+
+	mov	rbp,QWORD[((-16))+r11]
+
+	mov	rbx,QWORD[((-8))+r11]
+
+	lea	rsp,[r11]
+
+$L$epilogue_ssse3:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_sha1_block_data_order_ssse3:
+global	sha1_block_data_order_avx
+
+ALIGN	16
+sha1_block_data_order_avx:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_sha1_block_data_order_avx:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+_CET_ENDBR
+	mov	r11,rsp
+
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	lea	rsp,[((-160))+rsp]
+	vzeroupper
+	vmovaps	XMMWORD[(-40-96)+r11],xmm6
+	vmovaps	XMMWORD[(-40-80)+r11],xmm7
+	vmovaps	XMMWORD[(-40-64)+r11],xmm8
+	vmovaps	XMMWORD[(-40-48)+r11],xmm9
+	vmovaps	XMMWORD[(-40-32)+r11],xmm10
+	vmovaps	XMMWORD[(-40-16)+r11],xmm11
+$L$prologue_avx:
+	and	rsp,-64
+	mov	r8,rdi
+	mov	r9,rsi
+	mov	r10,rdx
+
+	shl	r10,6
+	add	r10,r9
+	lea	r14,[((K_XX_XX+64))]
+
+	mov	eax,DWORD[r8]
+	mov	ebx,DWORD[4+r8]
+	mov	ecx,DWORD[8+r8]
+	mov	edx,DWORD[12+r8]
+	mov	esi,ebx
+	mov	ebp,DWORD[16+r8]
+	mov	edi,ecx
+	xor	edi,edx
+	and	esi,edi
+
+	vmovdqa	xmm6,XMMWORD[64+r14]
+	vmovdqa	xmm11,XMMWORD[((-64))+r14]
+	vmovdqu	xmm0,XMMWORD[r9]
+	vmovdqu	xmm1,XMMWORD[16+r9]
+	vmovdqu	xmm2,XMMWORD[32+r9]
+	vmovdqu	xmm3,XMMWORD[48+r9]
+	vpshufb	xmm0,xmm0,xmm6
+	add	r9,64
+	vpshufb	xmm1,xmm1,xmm6
+	vpshufb	xmm2,xmm2,xmm6
+	vpshufb	xmm3,xmm3,xmm6
+	vpaddd	xmm4,xmm0,xmm11
+	vpaddd	xmm5,xmm1,xmm11
+	vpaddd	xmm6,xmm2,xmm11
+	vmovdqa	XMMWORD[rsp],xmm4
+	vmovdqa	XMMWORD[16+rsp],xmm5
+	vmovdqa	XMMWORD[32+rsp],xmm6
+	jmp	NEAR $L$oop_avx
+ALIGN	16
+$L$oop_avx:
+	shrd	ebx,ebx,2
+	xor	esi,edx
+	vpalignr	xmm4,xmm1,xmm0,8
+	mov	edi,eax
+	add	ebp,DWORD[rsp]
+	vpaddd	xmm9,xmm11,xmm3
+	xor	ebx,ecx
+	shld	eax,eax,5
+	vpsrldq	xmm8,xmm3,4
+	add	ebp,esi
+	and	edi,ebx
+	vpxor	xmm4,xmm4,xmm0
+	xor	ebx,ecx
+	add	ebp,eax
+	vpxor	xmm8,xmm8,xmm2
+	shrd	eax,eax,7
+	xor	edi,ecx
+	mov	esi,ebp
+	add	edx,DWORD[4+rsp]
+	vpxor	xmm4,xmm4,xmm8
+	xor	eax,ebx
+	shld	ebp,ebp,5
+	vmovdqa	XMMWORD[48+rsp],xmm9
+	add	edx,edi
+	and	esi,eax
+	vpsrld	xmm8,xmm4,31
+	xor	eax,ebx
+	add	edx,ebp
+	shrd	ebp,ebp,7
+	xor	esi,ebx
+	vpslldq	xmm10,xmm4,12
+	vpaddd	xmm4,xmm4,xmm4
+	mov	edi,edx
+	add	ecx,DWORD[8+rsp]
+	xor	ebp,eax
+	shld	edx,edx,5
+	vpsrld	xmm9,xmm10,30
+	vpor	xmm4,xmm4,xmm8
+	add	ecx,esi
+	and	edi,ebp
+	xor	ebp,eax
+	add	ecx,edx
+	vpslld	xmm10,xmm10,2
+	vpxor	xmm4,xmm4,xmm9
+	shrd	edx,edx,7
+	xor	edi,eax
+	mov	esi,ecx
+	add	ebx,DWORD[12+rsp]
+	vpxor	xmm4,xmm4,xmm10
+	xor	edx,ebp
+	shld	ecx,ecx,5
+	add	ebx,edi
+	and	esi,edx
+	xor	edx,ebp
+	add	ebx,ecx
+	shrd	ecx,ecx,7
+	xor	esi,ebp
+	vpalignr	xmm5,xmm2,xmm1,8
+	mov	edi,ebx
+	add	eax,DWORD[16+rsp]
+	vpaddd	xmm9,xmm11,xmm4
+	xor	ecx,edx
+	shld	ebx,ebx,5
+	vpsrldq	xmm8,xmm4,4
+	add	eax,esi
+	and	edi,ecx
+	vpxor	xmm5,xmm5,xmm1
+	xor	ecx,edx
+	add	eax,ebx
+	vpxor	xmm8,xmm8,xmm3
+	shrd	ebx,ebx,7
+	xor	edi,edx
+	mov	esi,eax
+	add	ebp,DWORD[20+rsp]
+	vpxor	xmm5,xmm5,xmm8
+	xor	ebx,ecx
+	shld	eax,eax,5
+	vmovdqa	XMMWORD[rsp],xmm9
+	add	ebp,edi
+	and	esi,ebx
+	vpsrld	xmm8,xmm5,31
+	xor	ebx,ecx
+	add	ebp,eax
+	shrd	eax,eax,7
+	xor	esi,ecx
+	vpslldq	xmm10,xmm5,12
+	vpaddd	xmm5,xmm5,xmm5
+	mov	edi,ebp
+	add	edx,DWORD[24+rsp]
+	xor	eax,ebx
+	shld	ebp,ebp,5
+	vpsrld	xmm9,xmm10,30
+	vpor	xmm5,xmm5,xmm8
+	add	edx,esi
+	and	edi,eax
+	xor	eax,ebx
+	add	edx,ebp
+	vpslld	xmm10,xmm10,2
+	vpxor	xmm5,xmm5,xmm9
+	shrd	ebp,ebp,7
+	xor	edi,ebx
+	mov	esi,edx
+	add	ecx,DWORD[28+rsp]
+	vpxor	xmm5,xmm5,xmm10
+	xor	ebp,eax
+	shld	edx,edx,5
+	vmovdqa	xmm11,XMMWORD[((-32))+r14]
+	add	ecx,edi
+	and	esi,ebp
+	xor	ebp,eax
+	add	ecx,edx
+	shrd	edx,edx,7
+	xor	esi,eax
+	vpalignr	xmm6,xmm3,xmm2,8
+	mov	edi,ecx
+	add	ebx,DWORD[32+rsp]
+	vpaddd	xmm9,xmm11,xmm5
+	xor	edx,ebp
+	shld	ecx,ecx,5
+	vpsrldq	xmm8,xmm5,4
+	add	ebx,esi
+	and	edi,edx
+	vpxor	xmm6,xmm6,xmm2
+	xor	edx,ebp
+	add	ebx,ecx
+	vpxor	xmm8,xmm8,xmm4
+	shrd	ecx,ecx,7
+	xor	edi,ebp
+	mov	esi,ebx
+	add	eax,DWORD[36+rsp]
+	vpxor	xmm6,xmm6,xmm8
+	xor	ecx,edx
+	shld	ebx,ebx,5
+	vmovdqa	XMMWORD[16+rsp],xmm9
+	add	eax,edi
+	and	esi,ecx
+	vpsrld	xmm8,xmm6,31
+	xor	ecx,edx
+	add	eax,ebx
+	shrd	ebx,ebx,7
+	xor	esi,edx
+	vpslldq	xmm10,xmm6,12
+	vpaddd	xmm6,xmm6,xmm6
+	mov	edi,eax
+	add	ebp,DWORD[40+rsp]
+	xor	ebx,ecx
+	shld	eax,eax,5
+	vpsrld	xmm9,xmm10,30
+	vpor	xmm6,xmm6,xmm8
+	add	ebp,esi
+	and	edi,ebx
+	xor	ebx,ecx
+	add	ebp,eax
+	vpslld	xmm10,xmm10,2
+	vpxor	xmm6,xmm6,xmm9
+	shrd	eax,eax,7
+	xor	edi,ecx
+	mov	esi,ebp
+	add	edx,DWORD[44+rsp]
+	vpxor	xmm6,xmm6,xmm10
+	xor	eax,ebx
+	shld	ebp,ebp,5
+	add	edx,edi
+	and	esi,eax
+	xor	eax,ebx
+	add	edx,ebp
+	shrd	ebp,ebp,7
+	xor	esi,ebx
+	vpalignr	xmm7,xmm4,xmm3,8
+	mov	edi,edx
+	add	ecx,DWORD[48+rsp]
+	vpaddd	xmm9,xmm11,xmm6
+	xor	ebp,eax
+	shld	edx,edx,5
+	vpsrldq	xmm8,xmm6,4
+	add	ecx,esi
+	and	edi,ebp
+	vpxor	xmm7,xmm7,xmm3
+	xor	ebp,eax
+	add	ecx,edx
+	vpxor	xmm8,xmm8,xmm5
+	shrd	edx,edx,7
+	xor	edi,eax
+	mov	esi,ecx
+	add	ebx,DWORD[52+rsp]
+	vpxor	xmm7,xmm7,xmm8
+	xor	edx,ebp
+	shld	ecx,ecx,5
+	vmovdqa	XMMWORD[32+rsp],xmm9
+	add	ebx,edi
+	and	esi,edx
+	vpsrld	xmm8,xmm7,31
+	xor	edx,ebp
+	add	ebx,ecx
+	shrd	ecx,ecx,7
+	xor	esi,ebp
+	vpslldq	xmm10,xmm7,12
+	vpaddd	xmm7,xmm7,xmm7
+	mov	edi,ebx
+	add	eax,DWORD[56+rsp]
+	xor	ecx,edx
+	shld	ebx,ebx,5
+	vpsrld	xmm9,xmm10,30
+	vpor	xmm7,xmm7,xmm8
+	add	eax,esi
+	and	edi,ecx
+	xor	ecx,edx
+	add	eax,ebx
+	vpslld	xmm10,xmm10,2
+	vpxor	xmm7,xmm7,xmm9
+	shrd	ebx,ebx,7
+	xor	edi,edx
+	mov	esi,eax
+	add	ebp,DWORD[60+rsp]
+	vpxor	xmm7,xmm7,xmm10
+	xor	ebx,ecx
+	shld	eax,eax,5
+	add	ebp,edi
+	and	esi,ebx
+	xor	ebx,ecx
+	add	ebp,eax
+	vpalignr	xmm8,xmm7,xmm6,8
+	vpxor	xmm0,xmm0,xmm4
+	shrd	eax,eax,7
+	xor	esi,ecx
+	mov	edi,ebp
+	add	edx,DWORD[rsp]
+	vpxor	xmm0,xmm0,xmm1
+	xor	eax,ebx
+	shld	ebp,ebp,5
+	vpaddd	xmm9,xmm11,xmm7
+	add	edx,esi
+	and	edi,eax
+	vpxor	xmm0,xmm0,xmm8
+	xor	eax,ebx
+	add	edx,ebp
+	shrd	ebp,ebp,7
+	xor	edi,ebx
+	vpsrld	xmm8,xmm0,30
+	vmovdqa	XMMWORD[48+rsp],xmm9
+	mov	esi,edx
+	add	ecx,DWORD[4+rsp]
+	xor	ebp,eax
+	shld	edx,edx,5
+	vpslld	xmm0,xmm0,2
+	add	ecx,edi
+	and	esi,ebp
+	xor	ebp,eax
+	add	ecx,edx
+	shrd	edx,edx,7
+	xor	esi,eax
+	mov	edi,ecx
+	add	ebx,DWORD[8+rsp]
+	vpor	xmm0,xmm0,xmm8
+	xor	edx,ebp
+	shld	ecx,ecx,5
+	add	ebx,esi
+	and	edi,edx
+	xor	edx,ebp
+	add	ebx,ecx
+	add	eax,DWORD[12+rsp]
+	xor	edi,ebp
+	mov	esi,ebx
+	shld	ebx,ebx,5
+	add	eax,edi
+	xor	esi,edx
+	shrd	ecx,ecx,7
+	add	eax,ebx
+	vpalignr	xmm8,xmm0,xmm7,8
+	vpxor	xmm1,xmm1,xmm5
+	add	ebp,DWORD[16+rsp]
+	xor	esi,ecx
+	mov	edi,eax
+	shld	eax,eax,5
+	vpxor	xmm1,xmm1,xmm2
+	add	ebp,esi
+	xor	edi,ecx
+	vpaddd	xmm9,xmm11,xmm0
+	shrd	ebx,ebx,7
+	add	ebp,eax
+	vpxor	xmm1,xmm1,xmm8
+	add	edx,DWORD[20+rsp]
+	xor	edi,ebx
+	mov	esi,ebp
+	shld	ebp,ebp,5
+	vpsrld	xmm8,xmm1,30
+	vmovdqa	XMMWORD[rsp],xmm9
+	add	edx,edi
+	xor	esi,ebx
+	shrd	eax,eax,7
+	add	edx,ebp
+	vpslld	xmm1,xmm1,2
+	add	ecx,DWORD[24+rsp]
+	xor	esi,eax
+	mov	edi,edx
+	shld	edx,edx,5
+	add	ecx,esi
+	xor	edi,eax
+	shrd	ebp,ebp,7
+	add	ecx,edx
+	vpor	xmm1,xmm1,xmm8
+	add	ebx,DWORD[28+rsp]
+	xor	edi,ebp
+	mov	esi,ecx
+	shld	ecx,ecx,5
+	add	ebx,edi
+	xor	esi,ebp
+	shrd	edx,edx,7
+	add	ebx,ecx
+	vpalignr	xmm8,xmm1,xmm0,8
+	vpxor	xmm2,xmm2,xmm6
+	add	eax,DWORD[32+rsp]
+	xor	esi,edx
+	mov	edi,ebx
+	shld	ebx,ebx,5
+	vpxor	xmm2,xmm2,xmm3
+	add	eax,esi
+	xor	edi,edx
+	vpaddd	xmm9,xmm11,xmm1
+	vmovdqa	xmm11,XMMWORD[r14]
+	shrd	ecx,ecx,7
+	add	eax,ebx
+	vpxor	xmm2,xmm2,xmm8
+	add	ebp,DWORD[36+rsp]
+	xor	edi,ecx
+	mov	esi,eax
+	shld	eax,eax,5
+	vpsrld	xmm8,xmm2,30
+	vmovdqa	XMMWORD[16+rsp],xmm9
+	add	ebp,edi
+	xor	esi,ecx
+	shrd	ebx,ebx,7
+	add	ebp,eax
+	vpslld	xmm2,xmm2,2
+	add	edx,DWORD[40+rsp]
+	xor	esi,ebx
+	mov	edi,ebp
+	shld	ebp,ebp,5
+	add	edx,esi
+	xor	edi,ebx
+	shrd	eax,eax,7
+	add	edx,ebp
+	vpor	xmm2,xmm2,xmm8
+	add	ecx,DWORD[44+rsp]
+	xor	edi,eax
+	mov	esi,edx
+	shld	edx,edx,5
+	add	ecx,edi
+	xor	esi,eax
+	shrd	ebp,ebp,7
+	add	ecx,edx
+	vpalignr	xmm8,xmm2,xmm1,8
+	vpxor	xmm3,xmm3,xmm7
+	add	ebx,DWORD[48+rsp]
+	xor	esi,ebp
+	mov	edi,ecx
+	shld	ecx,ecx,5
+	vpxor	xmm3,xmm3,xmm4
+	add	ebx,esi
+	xor	edi,ebp
+	vpaddd	xmm9,xmm11,xmm2
+	shrd	edx,edx,7
+	add	ebx,ecx
+	vpxor	xmm3,xmm3,xmm8
+	add	eax,DWORD[52+rsp]
+	xor	edi,edx
+	mov	esi,ebx
+	shld	ebx,ebx,5
+	vpsrld	xmm8,xmm3,30
+	vmovdqa	XMMWORD[32+rsp],xmm9
+	add	eax,edi
+	xor	esi,edx
+	shrd	ecx,ecx,7
+	add	eax,ebx
+	vpslld	xmm3,xmm3,2
+	add	ebp,DWORD[56+rsp]
+	xor	esi,ecx
+	mov	edi,eax
+	shld	eax,eax,5
+	add	ebp,esi
+	xor	edi,ecx
+	shrd	ebx,ebx,7
+	add	ebp,eax
+	vpor	xmm3,xmm3,xmm8
+	add	edx,DWORD[60+rsp]
+	xor	edi,ebx
+	mov	esi,ebp
+	shld	ebp,ebp,5
+	add	edx,edi
+	xor	esi,ebx
+	shrd	eax,eax,7
+	add	edx,ebp
+	vpalignr	xmm8,xmm3,xmm2,8
+	vpxor	xmm4,xmm4,xmm0
+	add	ecx,DWORD[rsp]
+	xor	esi,eax
+	mov	edi,edx
+	shld	edx,edx,5
+	vpxor	xmm4,xmm4,xmm5
+	add	ecx,esi
+	xor	edi,eax
+	vpaddd	xmm9,xmm11,xmm3
+	shrd	ebp,ebp,7
+	add	ecx,edx
+	vpxor	xmm4,xmm4,xmm8
+	add	ebx,DWORD[4+rsp]
+	xor	edi,ebp
+	mov	esi,ecx
+	shld	ecx,ecx,5
+	vpsrld	xmm8,xmm4,30
+	vmovdqa	XMMWORD[48+rsp],xmm9
+	add	ebx,edi
+	xor	esi,ebp
+	shrd	edx,edx,7
+	add	ebx,ecx
+	vpslld	xmm4,xmm4,2
+	add	eax,DWORD[8+rsp]
+	xor	esi,edx
+	mov	edi,ebx
+	shld	ebx,ebx,5
+	add	eax,esi
+	xor	edi,edx
+	shrd	ecx,ecx,7
+	add	eax,ebx
+	vpor	xmm4,xmm4,xmm8
+	add	ebp,DWORD[12+rsp]
+	xor	edi,ecx
+	mov	esi,eax
+	shld	eax,eax,5
+	add	ebp,edi
+	xor	esi,ecx
+	shrd	ebx,ebx,7
+	add	ebp,eax
+	vpalignr	xmm8,xmm4,xmm3,8
+	vpxor	xmm5,xmm5,xmm1
+	add	edx,DWORD[16+rsp]
+	xor	esi,ebx
+	mov	edi,ebp
+	shld	ebp,ebp,5
+	vpxor	xmm5,xmm5,xmm6
+	add	edx,esi
+	xor	edi,ebx
+	vpaddd	xmm9,xmm11,xmm4
+	shrd	eax,eax,7
+	add	edx,ebp
+	vpxor	xmm5,xmm5,xmm8
+	add	ecx,DWORD[20+rsp]
+	xor	edi,eax
+	mov	esi,edx
+	shld	edx,edx,5
+	vpsrld	xmm8,xmm5,30
+	vmovdqa	XMMWORD[rsp],xmm9
+	add	ecx,edi
+	xor	esi,eax
+	shrd	ebp,ebp,7
+	add	ecx,edx
+	vpslld	xmm5,xmm5,2
+	add	ebx,DWORD[24+rsp]
+	xor	esi,ebp
+	mov	edi,ecx
+	shld	ecx,ecx,5
+	add	ebx,esi
+	xor	edi,ebp
+	shrd	edx,edx,7
+	add	ebx,ecx
+	vpor	xmm5,xmm5,xmm8
+	add	eax,DWORD[28+rsp]
+	shrd	ecx,ecx,7
+	mov	esi,ebx
+	xor	edi,edx
+	shld	ebx,ebx,5
+	add	eax,edi
+	xor	esi,ecx
+	xor	ecx,edx
+	add	eax,ebx
+	vpalignr	xmm8,xmm5,xmm4,8
+	vpxor	xmm6,xmm6,xmm2
+	add	ebp,DWORD[32+rsp]
+	and	esi,ecx
+	xor	ecx,edx
+	shrd	ebx,ebx,7
+	vpxor	xmm6,xmm6,xmm7
+	mov	edi,eax
+	xor	esi,ecx
+	vpaddd	xmm9,xmm11,xmm5
+	shld	eax,eax,5
+	add	ebp,esi
+	vpxor	xmm6,xmm6,xmm8
+	xor	edi,ebx
+	xor	ebx,ecx
+	add	ebp,eax
+	add	edx,DWORD[36+rsp]
+	vpsrld	xmm8,xmm6,30
+	vmovdqa	XMMWORD[16+rsp],xmm9
+	and	edi,ebx
+	xor	ebx,ecx
+	shrd	eax,eax,7
+	mov	esi,ebp
+	vpslld	xmm6,xmm6,2
+	xor	edi,ebx
+	shld	ebp,ebp,5
+	add	edx,edi
+	xor	esi,eax
+	xor	eax,ebx
+	add	edx,ebp
+	add	ecx,DWORD[40+rsp]
+	and	esi,eax
+	vpor	xmm6,xmm6,xmm8
+	xor	eax,ebx
+	shrd	ebp,ebp,7
+	mov	edi,edx
+	xor	esi,eax
+	shld	edx,edx,5
+	add	ecx,esi
+	xor	edi,ebp
+	xor	ebp,eax
+	add	ecx,edx
+	add	ebx,DWORD[44+rsp]
+	and	edi,ebp
+	xor	ebp,eax
+	shrd	edx,edx,7
+	mov	esi,ecx
+	xor	edi,ebp
+	shld	ecx,ecx,5
+	add	ebx,edi
+	xor	esi,edx
+	xor	edx,ebp
+	add	ebx,ecx
+	vpalignr	xmm8,xmm6,xmm5,8
+	vpxor	xmm7,xmm7,xmm3
+	add	eax,DWORD[48+rsp]
+	and	esi,edx
+	xor	edx,ebp
+	shrd	ecx,ecx,7
+	vpxor	xmm7,xmm7,xmm0
+	mov	edi,ebx
+	xor	esi,edx
+	vpaddd	xmm9,xmm11,xmm6
+	vmovdqa	xmm11,XMMWORD[32+r14]
+	shld	ebx,ebx,5
+	add	eax,esi
+	vpxor	xmm7,xmm7,xmm8
+	xor	edi,ecx
+	xor	ecx,edx
+	add	eax,ebx
+	add	ebp,DWORD[52+rsp]
+	vpsrld	xmm8,xmm7,30
+	vmovdqa	XMMWORD[32+rsp],xmm9
+	and	edi,ecx
+	xor	ecx,edx
+	shrd	ebx,ebx,7
+	mov	esi,eax
+	vpslld	xmm7,xmm7,2
+	xor	edi,ecx
+	shld	eax,eax,5
+	add	ebp,edi
+	xor	esi,ebx
+	xor	ebx,ecx
+	add	ebp,eax
+	add	edx,DWORD[56+rsp]
+	and	esi,ebx
+	vpor	xmm7,xmm7,xmm8
+	xor	ebx,ecx
+	shrd	eax,eax,7
+	mov	edi,ebp
+	xor	esi,ebx
+	shld	ebp,ebp,5
+	add	edx,esi
+	xor	edi,eax
+	xor	eax,ebx
+	add	edx,ebp
+	add	ecx,DWORD[60+rsp]
+	and	edi,eax
+	xor	eax,ebx
+	shrd	ebp,ebp,7
+	mov	esi,edx
+	xor	edi,eax
+	shld	edx,edx,5
+	add	ecx,edi
+	xor	esi,ebp
+	xor	ebp,eax
+	add	ecx,edx
+	vpalignr	xmm8,xmm7,xmm6,8
+	vpxor	xmm0,xmm0,xmm4
+	add	ebx,DWORD[rsp]
+	and	esi,ebp
+	xor	ebp,eax
+	shrd	edx,edx,7
+	vpxor	xmm0,xmm0,xmm1
+	mov	edi,ecx
+	xor	esi,ebp
+	vpaddd	xmm9,xmm11,xmm7
+	shld	ecx,ecx,5
+	add	ebx,esi
+	vpxor	xmm0,xmm0,xmm8
+	xor	edi,edx
+	xor	edx,ebp
+	add	ebx,ecx
+	add	eax,DWORD[4+rsp]
+	vpsrld	xmm8,xmm0,30
+	vmovdqa	XMMWORD[48+rsp],xmm9
+	and	edi,edx
+	xor	edx,ebp
+	shrd	ecx,ecx,7
+	mov	esi,ebx
+	vpslld	xmm0,xmm0,2
+	xor	edi,edx
+	shld	ebx,ebx,5
+	add	eax,edi
+	xor	esi,ecx
+	xor	ecx,edx
+	add	eax,ebx
+	add	ebp,DWORD[8+rsp]
+	and	esi,ecx
+	vpor	xmm0,xmm0,xmm8
+	xor	ecx,edx
+	shrd	ebx,ebx,7
+	mov	edi,eax
+	xor	esi,ecx
+	shld	eax,eax,5
+	add	ebp,esi
+	xor	edi,ebx
+	xor	ebx,ecx
+	add	ebp,eax
+	add	edx,DWORD[12+rsp]
+	and	edi,ebx
+	xor	ebx,ecx
+	shrd	eax,eax,7
+	mov	esi,ebp
+	xor	edi,ebx
+	shld	ebp,ebp,5
+	add	edx,edi
+	xor	esi,eax
+	xor	eax,ebx
+	add	edx,ebp
+	vpalignr	xmm8,xmm0,xmm7,8
+	vpxor	xmm1,xmm1,xmm5
+	add	ecx,DWORD[16+rsp]
+	and	esi,eax
+	xor	eax,ebx
+	shrd	ebp,ebp,7
+	vpxor	xmm1,xmm1,xmm2
+	mov	edi,edx
+	xor	esi,eax
+	vpaddd	xmm9,xmm11,xmm0
+	shld	edx,edx,5
+	add	ecx,esi
+	vpxor	xmm1,xmm1,xmm8
+	xor	edi,ebp
+	xor	ebp,eax
+	add	ecx,edx
+	add	ebx,DWORD[20+rsp]
+	vpsrld	xmm8,xmm1,30
+	vmovdqa	XMMWORD[rsp],xmm9
+	and	edi,ebp
+	xor	ebp,eax
+	shrd	edx,edx,7
+	mov	esi,ecx
+	vpslld	xmm1,xmm1,2
+	xor	edi,ebp
+	shld	ecx,ecx,5
+	add	ebx,edi
+	xor	esi,edx
+	xor	edx,ebp
+	add	ebx,ecx
+	add	eax,DWORD[24+rsp]
+	and	esi,edx
+	vpor	xmm1,xmm1,xmm8
+	xor	edx,ebp
+	shrd	ecx,ecx,7
+	mov	edi,ebx
+	xor	esi,edx
+	shld	ebx,ebx,5
+	add	eax,esi
+	xor	edi,ecx
+	xor	ecx,edx
+	add	eax,ebx
+	add	ebp,DWORD[28+rsp]
+	and	edi,ecx
+	xor	ecx,edx
+	shrd	ebx,ebx,7
+	mov	esi,eax
+	xor	edi,ecx
+	shld	eax,eax,5
+	add	ebp,edi
+	xor	esi,ebx
+	xor	ebx,ecx
+	add	ebp,eax
+	vpalignr	xmm8,xmm1,xmm0,8
+	vpxor	xmm2,xmm2,xmm6
+	add	edx,DWORD[32+rsp]
+	and	esi,ebx
+	xor	ebx,ecx
+	shrd	eax,eax,7
+	vpxor	xmm2,xmm2,xmm3
+	mov	edi,ebp
+	xor	esi,ebx
+	vpaddd	xmm9,xmm11,xmm1
+	shld	ebp,ebp,5
+	add	edx,esi
+	vpxor	xmm2,xmm2,xmm8
+	xor	edi,eax
+	xor	eax,ebx
+	add	edx,ebp
+	add	ecx,DWORD[36+rsp]
+	vpsrld	xmm8,xmm2,30
+	vmovdqa	XMMWORD[16+rsp],xmm9
+	and	edi,eax
+	xor	eax,ebx
+	shrd	ebp,ebp,7
+	mov	esi,edx
+	vpslld	xmm2,xmm2,2
+	xor	edi,eax
+	shld	edx,edx,5
+	add	ecx,edi
+	xor	esi,ebp
+	xor	ebp,eax
+	add	ecx,edx
+	add	ebx,DWORD[40+rsp]
+	and	esi,ebp
+	vpor	xmm2,xmm2,xmm8
+	xor	ebp,eax
+	shrd	edx,edx,7
+	mov	edi,ecx
+	xor	esi,ebp
+	shld	ecx,ecx,5
+	add	ebx,esi
+	xor	edi,edx
+	xor	edx,ebp
+	add	ebx,ecx
+	add	eax,DWORD[44+rsp]
+	and	edi,edx
+	xor	edx,ebp
+	shrd	ecx,ecx,7
+	mov	esi,ebx
+	xor	edi,edx
+	shld	ebx,ebx,5
+	add	eax,edi
+	xor	esi,edx
+	add	eax,ebx
+	vpalignr	xmm8,xmm2,xmm1,8
+	vpxor	xmm3,xmm3,xmm7
+	add	ebp,DWORD[48+rsp]
+	xor	esi,ecx
+	mov	edi,eax
+	shld	eax,eax,5
+	vpxor	xmm3,xmm3,xmm4
+	add	ebp,esi
+	xor	edi,ecx
+	vpaddd	xmm9,xmm11,xmm2
+	shrd	ebx,ebx,7
+	add	ebp,eax
+	vpxor	xmm3,xmm3,xmm8
+	add	edx,DWORD[52+rsp]
+	xor	edi,ebx
+	mov	esi,ebp
+	shld	ebp,ebp,5
+	vpsrld	xmm8,xmm3,30
+	vmovdqa	XMMWORD[32+rsp],xmm9
+	add	edx,edi
+	xor	esi,ebx
+	shrd	eax,eax,7
+	add	edx,ebp
+	vpslld	xmm3,xmm3,2
+	add	ecx,DWORD[56+rsp]
+	xor	esi,eax
+	mov	edi,edx
+	shld	edx,edx,5
+	add	ecx,esi
+	xor	edi,eax
+	shrd	ebp,ebp,7
+	add	ecx,edx
+	vpor	xmm3,xmm3,xmm8
+	add	ebx,DWORD[60+rsp]
+	xor	edi,ebp
+	mov	esi,ecx
+	shld	ecx,ecx,5
+	add	ebx,edi
+	xor	esi,ebp
+	shrd	edx,edx,7
+	add	ebx,ecx
+	add	eax,DWORD[rsp]
+	vpaddd	xmm9,xmm11,xmm3
+	xor	esi,edx
+	mov	edi,ebx
+	shld	ebx,ebx,5
+	add	eax,esi
+	vmovdqa	XMMWORD[48+rsp],xmm9
+	xor	edi,edx
+	shrd	ecx,ecx,7
+	add	eax,ebx
+	add	ebp,DWORD[4+rsp]
+	xor	edi,ecx
+	mov	esi,eax
+	shld	eax,eax,5
+	add	ebp,edi
+	xor	esi,ecx
+	shrd	ebx,ebx,7
+	add	ebp,eax
+	add	edx,DWORD[8+rsp]
+	xor	esi,ebx
+	mov	edi,ebp
+	shld	ebp,ebp,5
+	add	edx,esi
+	xor	edi,ebx
+	shrd	eax,eax,7
+	add	edx,ebp
+	add	ecx,DWORD[12+rsp]
+	xor	edi,eax
+	mov	esi,edx
+	shld	edx,edx,5
+	add	ecx,edi
+	xor	esi,eax
+	shrd	ebp,ebp,7
+	add	ecx,edx
+	cmp	r9,r10
+	je	NEAR $L$done_avx
+	vmovdqa	xmm6,XMMWORD[64+r14]
+	vmovdqa	xmm11,XMMWORD[((-64))+r14]
+	vmovdqu	xmm0,XMMWORD[r9]
+	vmovdqu	xmm1,XMMWORD[16+r9]
+	vmovdqu	xmm2,XMMWORD[32+r9]
+	vmovdqu	xmm3,XMMWORD[48+r9]
+	vpshufb	xmm0,xmm0,xmm6
+	add	r9,64
+	add	ebx,DWORD[16+rsp]
+	xor	esi,ebp
+	vpshufb	xmm1,xmm1,xmm6
+	mov	edi,ecx
+	shld	ecx,ecx,5
+	vpaddd	xmm4,xmm0,xmm11
+	add	ebx,esi
+	xor	edi,ebp
+	shrd	edx,edx,7
+	add	ebx,ecx
+	vmovdqa	XMMWORD[rsp],xmm4
+	add	eax,DWORD[20+rsp]
+	xor	edi,edx
+	mov	esi,ebx
+	shld	ebx,ebx,5
+	add	eax,edi
+	xor	esi,edx
+	shrd	ecx,ecx,7
+	add	eax,ebx
+	add	ebp,DWORD[24+rsp]
+	xor	esi,ecx
+	mov	edi,eax
+	shld	eax,eax,5
+	add	ebp,esi
+	xor	edi,ecx
+	shrd	ebx,ebx,7
+	add	ebp,eax
+	add	edx,DWORD[28+rsp]
+	xor	edi,ebx
+	mov	esi,ebp
+	shld	ebp,ebp,5
+	add	edx,edi
+	xor	esi,ebx
+	shrd	eax,eax,7
+	add	edx,ebp
+	add	ecx,DWORD[32+rsp]
+	xor	esi,eax
+	vpshufb	xmm2,xmm2,xmm6
+	mov	edi,edx
+	shld	edx,edx,5
+	vpaddd	xmm5,xmm1,xmm11
+	add	ecx,esi
+	xor	edi,eax
+	shrd	ebp,ebp,7
+	add	ecx,edx
+	vmovdqa	XMMWORD[16+rsp],xmm5
+	add	ebx,DWORD[36+rsp]
+	xor	edi,ebp
+	mov	esi,ecx
+	shld	ecx,ecx,5
+	add	ebx,edi
+	xor	esi,ebp
+	shrd	edx,edx,7
+	add	ebx,ecx
+	add	eax,DWORD[40+rsp]
+	xor	esi,edx
+	mov	edi,ebx
+	shld	ebx,ebx,5
+	add	eax,esi
+	xor	edi,edx
+	shrd	ecx,ecx,7
+	add	eax,ebx
+	add	ebp,DWORD[44+rsp]
+	xor	edi,ecx
+	mov	esi,eax
+	shld	eax,eax,5
+	add	ebp,edi
+	xor	esi,ecx
+	shrd	ebx,ebx,7
+	add	ebp,eax
+	add	edx,DWORD[48+rsp]
+	xor	esi,ebx
+	vpshufb	xmm3,xmm3,xmm6
+	mov	edi,ebp
+	shld	ebp,ebp,5
+	vpaddd	xmm6,xmm2,xmm11
+	add	edx,esi
+	xor	edi,ebx
+	shrd	eax,eax,7
+	add	edx,ebp
+	vmovdqa	XMMWORD[32+rsp],xmm6
+	add	ecx,DWORD[52+rsp]
+	xor	edi,eax
+	mov	esi,edx
+	shld	edx,edx,5
+	add	ecx,edi
+	xor	esi,eax
+	shrd	ebp,ebp,7
+	add	ecx,edx
+	add	ebx,DWORD[56+rsp]
+	xor	esi,ebp
+	mov	edi,ecx
+	shld	ecx,ecx,5
+	add	ebx,esi
+	xor	edi,ebp
+	shrd	edx,edx,7
+	add	ebx,ecx
+	add	eax,DWORD[60+rsp]
+	xor	edi,edx
+	mov	esi,ebx
+	shld	ebx,ebx,5
+	add	eax,edi
+	shrd	ecx,ecx,7
+	add	eax,ebx
+	add	eax,DWORD[r8]
+	add	esi,DWORD[4+r8]
+	add	ecx,DWORD[8+r8]
+	add	edx,DWORD[12+r8]
+	mov	DWORD[r8],eax
+	add	ebp,DWORD[16+r8]
+	mov	DWORD[4+r8],esi
+	mov	ebx,esi
+	mov	DWORD[8+r8],ecx
+	mov	edi,ecx
+	mov	DWORD[12+r8],edx
+	xor	edi,edx
+	mov	DWORD[16+r8],ebp
+	and	esi,edi
+	jmp	NEAR $L$oop_avx
+
+ALIGN	16
+$L$done_avx:
+	add	ebx,DWORD[16+rsp]
+	xor	esi,ebp
+	mov	edi,ecx
+	shld	ecx,ecx,5
+	add	ebx,esi
+	xor	edi,ebp
+	shrd	edx,edx,7
+	add	ebx,ecx
+	add	eax,DWORD[20+rsp]
+	xor	edi,edx
+	mov	esi,ebx
+	shld	ebx,ebx,5
+	add	eax,edi
+	xor	esi,edx
+	shrd	ecx,ecx,7
+	add	eax,ebx
+	add	ebp,DWORD[24+rsp]
+	xor	esi,ecx
+	mov	edi,eax
+	shld	eax,eax,5
+	add	ebp,esi
+	xor	edi,ecx
+	shrd	ebx,ebx,7
+	add	ebp,eax
+	add	edx,DWORD[28+rsp]
+	xor	edi,ebx
+	mov	esi,ebp
+	shld	ebp,ebp,5
+	add	edx,edi
+	xor	esi,ebx
+	shrd	eax,eax,7
+	add	edx,ebp
+	add	ecx,DWORD[32+rsp]
+	xor	esi,eax
+	mov	edi,edx
+	shld	edx,edx,5
+	add	ecx,esi
+	xor	edi,eax
+	shrd	ebp,ebp,7
+	add	ecx,edx
+	add	ebx,DWORD[36+rsp]
+	xor	edi,ebp
+	mov	esi,ecx
+	shld	ecx,ecx,5
+	add	ebx,edi
+	xor	esi,ebp
+	shrd	edx,edx,7
+	add	ebx,ecx
+	add	eax,DWORD[40+rsp]
+	xor	esi,edx
+	mov	edi,ebx
+	shld	ebx,ebx,5
+	add	eax,esi
+	xor	edi,edx
+	shrd	ecx,ecx,7
+	add	eax,ebx
+	add	ebp,DWORD[44+rsp]
+	xor	edi,ecx
+	mov	esi,eax
+	shld	eax,eax,5
+	add	ebp,edi
+	xor	esi,ecx
+	shrd	ebx,ebx,7
+	add	ebp,eax
+	add	edx,DWORD[48+rsp]
+	xor	esi,ebx
+	mov	edi,ebp
+	shld	ebp,ebp,5
+	add	edx,esi
+	xor	edi,ebx
+	shrd	eax,eax,7
+	add	edx,ebp
+	add	ecx,DWORD[52+rsp]
+	xor	edi,eax
+	mov	esi,edx
+	shld	edx,edx,5
+	add	ecx,edi
+	xor	esi,eax
+	shrd	ebp,ebp,7
+	add	ecx,edx
+	add	ebx,DWORD[56+rsp]
+	xor	esi,ebp
+	mov	edi,ecx
+	shld	ecx,ecx,5
+	add	ebx,esi
+	xor	edi,ebp
+	shrd	edx,edx,7
+	add	ebx,ecx
+	add	eax,DWORD[60+rsp]
+	xor	edi,edx
+	mov	esi,ebx
+	shld	ebx,ebx,5
+	add	eax,edi
+	shrd	ecx,ecx,7
+	add	eax,ebx
+	vzeroupper
+
+	add	eax,DWORD[r8]
+	add	esi,DWORD[4+r8]
+	add	ecx,DWORD[8+r8]
+	mov	DWORD[r8],eax
+	add	edx,DWORD[12+r8]
+	mov	DWORD[4+r8],esi
+	add	ebp,DWORD[16+r8]
+	mov	DWORD[8+r8],ecx
+	mov	DWORD[12+r8],edx
+	mov	DWORD[16+r8],ebp
+	movaps	xmm6,XMMWORD[((-40-96))+r11]
+	movaps	xmm7,XMMWORD[((-40-80))+r11]
+	movaps	xmm8,XMMWORD[((-40-64))+r11]
+	movaps	xmm9,XMMWORD[((-40-48))+r11]
+	movaps	xmm10,XMMWORD[((-40-32))+r11]
+	movaps	xmm11,XMMWORD[((-40-16))+r11]
+	mov	r14,QWORD[((-40))+r11]
+
+	mov	r13,QWORD[((-32))+r11]
+
+	mov	r12,QWORD[((-24))+r11]
+
+	mov	rbp,QWORD[((-16))+r11]
+
+	mov	rbx,QWORD[((-8))+r11]
+
+	lea	rsp,[r11]
+
+$L$epilogue_avx:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_sha1_block_data_order_avx:
+global	sha1_block_data_order_avx2
+
+ALIGN	16
+sha1_block_data_order_avx2:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_sha1_block_data_order_avx2:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+_CET_ENDBR
+	mov	r11,rsp
+
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	vzeroupper
+	lea	rsp,[((-96))+rsp]
+	vmovaps	XMMWORD[(-40-96)+r11],xmm6
+	vmovaps	XMMWORD[(-40-80)+r11],xmm7
+	vmovaps	XMMWORD[(-40-64)+r11],xmm8
+	vmovaps	XMMWORD[(-40-48)+r11],xmm9
+	vmovaps	XMMWORD[(-40-32)+r11],xmm10
+	vmovaps	XMMWORD[(-40-16)+r11],xmm11
+$L$prologue_avx2:
+	mov	r8,rdi
+	mov	r9,rsi
+	mov	r10,rdx
+
+	lea	rsp,[((-640))+rsp]
+	shl	r10,6
+	lea	r13,[64+r9]
+	and	rsp,-128
+	add	r10,r9
+	lea	r14,[((K_XX_XX+64))]
+
+	mov	eax,DWORD[r8]
+	cmp	r13,r10
+	cmovae	r13,r9
+	mov	ebp,DWORD[4+r8]
+	mov	ecx,DWORD[8+r8]
+	mov	edx,DWORD[12+r8]
+	mov	esi,DWORD[16+r8]
+	vmovdqu	ymm6,YMMWORD[64+r14]
+
+	vmovdqu	xmm0,XMMWORD[r9]
+	vmovdqu	xmm1,XMMWORD[16+r9]
+	vmovdqu	xmm2,XMMWORD[32+r9]
+	vmovdqu	xmm3,XMMWORD[48+r9]
+	lea	r9,[64+r9]
+	vinserti128	ymm0,ymm0,XMMWORD[r13],1
+	vinserti128	ymm1,ymm1,XMMWORD[16+r13],1
+	vpshufb	ymm0,ymm0,ymm6
+	vinserti128	ymm2,ymm2,XMMWORD[32+r13],1
+	vpshufb	ymm1,ymm1,ymm6
+	vinserti128	ymm3,ymm3,XMMWORD[48+r13],1
+	vpshufb	ymm2,ymm2,ymm6
+	vmovdqu	ymm11,YMMWORD[((-64))+r14]
+	vpshufb	ymm3,ymm3,ymm6
+
+	vpaddd	ymm4,ymm0,ymm11
+	vpaddd	ymm5,ymm1,ymm11
+	vmovdqu	YMMWORD[rsp],ymm4
+	vpaddd	ymm6,ymm2,ymm11
+	vmovdqu	YMMWORD[32+rsp],ymm5
+	vpaddd	ymm7,ymm3,ymm11
+	vmovdqu	YMMWORD[64+rsp],ymm6
+	vmovdqu	YMMWORD[96+rsp],ymm7
+	vpalignr	ymm4,ymm1,ymm0,8
+	vpsrldq	ymm8,ymm3,4
+	vpxor	ymm4,ymm4,ymm0
+	vpxor	ymm8,ymm8,ymm2
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm8,ymm4,31
+	vpslldq	ymm10,ymm4,12
+	vpaddd	ymm4,ymm4,ymm4
+	vpsrld	ymm9,ymm10,30
+	vpor	ymm4,ymm4,ymm8
+	vpslld	ymm10,ymm10,2
+	vpxor	ymm4,ymm4,ymm9
+	vpxor	ymm4,ymm4,ymm10
+	vpaddd	ymm9,ymm4,ymm11
+	vmovdqu	YMMWORD[128+rsp],ymm9
+	vpalignr	ymm5,ymm2,ymm1,8
+	vpsrldq	ymm8,ymm4,4
+	vpxor	ymm5,ymm5,ymm1
+	vpxor	ymm8,ymm8,ymm3
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm5,31
+	vmovdqu	ymm11,YMMWORD[((-32))+r14]
+	vpslldq	ymm10,ymm5,12
+	vpaddd	ymm5,ymm5,ymm5
+	vpsrld	ymm9,ymm10,30
+	vpor	ymm5,ymm5,ymm8
+	vpslld	ymm10,ymm10,2
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm5,ymm5,ymm10
+	vpaddd	ymm9,ymm5,ymm11
+	vmovdqu	YMMWORD[160+rsp],ymm9
+	vpalignr	ymm6,ymm3,ymm2,8
+	vpsrldq	ymm8,ymm5,4
+	vpxor	ymm6,ymm6,ymm2
+	vpxor	ymm8,ymm8,ymm4
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm6,31
+	vpslldq	ymm10,ymm6,12
+	vpaddd	ymm6,ymm6,ymm6
+	vpsrld	ymm9,ymm10,30
+	vpor	ymm6,ymm6,ymm8
+	vpslld	ymm10,ymm10,2
+	vpxor	ymm6,ymm6,ymm9
+	vpxor	ymm6,ymm6,ymm10
+	vpaddd	ymm9,ymm6,ymm11
+	vmovdqu	YMMWORD[192+rsp],ymm9
+	vpalignr	ymm7,ymm4,ymm3,8
+	vpsrldq	ymm8,ymm6,4
+	vpxor	ymm7,ymm7,ymm3
+	vpxor	ymm8,ymm8,ymm5
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm7,31
+	vpslldq	ymm10,ymm7,12
+	vpaddd	ymm7,ymm7,ymm7
+	vpsrld	ymm9,ymm10,30
+	vpor	ymm7,ymm7,ymm8
+	vpslld	ymm10,ymm10,2
+	vpxor	ymm7,ymm7,ymm9
+	vpxor	ymm7,ymm7,ymm10
+	vpaddd	ymm9,ymm7,ymm11
+	vmovdqu	YMMWORD[224+rsp],ymm9
+	lea	r13,[128+rsp]
+	jmp	NEAR $L$oop_avx2
+ALIGN	32
+$L$oop_avx2:
+	rorx	ebx,ebp,2
+	andn	edi,ebp,edx
+	and	ebp,ecx
+	xor	ebp,edi
+	jmp	NEAR $L$align32_1
+ALIGN	32
+$L$align32_1:
+	vpalignr	ymm8,ymm7,ymm6,8
+	vpxor	ymm0,ymm0,ymm4
+	add	esi,DWORD[((-128))+r13]
+	andn	edi,eax,ecx
+	vpxor	ymm0,ymm0,ymm1
+	add	esi,ebp
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	vpxor	ymm0,ymm0,ymm8
+	and	eax,ebx
+	add	esi,r12d
+	xor	eax,edi
+	vpsrld	ymm8,ymm0,30
+	vpslld	ymm0,ymm0,2
+	add	edx,DWORD[((-124))+r13]
+	andn	edi,esi,ebx
+	add	edx,eax
+	rorx	r12d,esi,27
+	rorx	eax,esi,2
+	and	esi,ebp
+	vpor	ymm0,ymm0,ymm8
+	add	edx,r12d
+	xor	esi,edi
+	add	ecx,DWORD[((-120))+r13]
+	andn	edi,edx,ebp
+	vpaddd	ymm9,ymm0,ymm11
+	add	ecx,esi
+	rorx	r12d,edx,27
+	rorx	esi,edx,2
+	and	edx,eax
+	vmovdqu	YMMWORD[256+rsp],ymm9
+	add	ecx,r12d
+	xor	edx,edi
+	add	ebx,DWORD[((-116))+r13]
+	andn	edi,ecx,eax
+	add	ebx,edx
+	rorx	r12d,ecx,27
+	rorx	edx,ecx,2
+	and	ecx,esi
+	add	ebx,r12d
+	xor	ecx,edi
+	add	ebp,DWORD[((-96))+r13]
+	andn	edi,ebx,esi
+	add	ebp,ecx
+	rorx	r12d,ebx,27
+	rorx	ecx,ebx,2
+	and	ebx,edx
+	add	ebp,r12d
+	xor	ebx,edi
+	vpalignr	ymm8,ymm0,ymm7,8
+	vpxor	ymm1,ymm1,ymm5
+	add	eax,DWORD[((-92))+r13]
+	andn	edi,ebp,edx
+	vpxor	ymm1,ymm1,ymm2
+	add	eax,ebx
+	rorx	r12d,ebp,27
+	rorx	ebx,ebp,2
+	vpxor	ymm1,ymm1,ymm8
+	and	ebp,ecx
+	add	eax,r12d
+	xor	ebp,edi
+	vpsrld	ymm8,ymm1,30
+	vpslld	ymm1,ymm1,2
+	add	esi,DWORD[((-88))+r13]
+	andn	edi,eax,ecx
+	add	esi,ebp
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	and	eax,ebx
+	vpor	ymm1,ymm1,ymm8
+	add	esi,r12d
+	xor	eax,edi
+	add	edx,DWORD[((-84))+r13]
+	andn	edi,esi,ebx
+	vpaddd	ymm9,ymm1,ymm11
+	add	edx,eax
+	rorx	r12d,esi,27
+	rorx	eax,esi,2
+	and	esi,ebp
+	vmovdqu	YMMWORD[288+rsp],ymm9
+	add	edx,r12d
+	xor	esi,edi
+	add	ecx,DWORD[((-64))+r13]
+	andn	edi,edx,ebp
+	add	ecx,esi
+	rorx	r12d,edx,27
+	rorx	esi,edx,2
+	and	edx,eax
+	add	ecx,r12d
+	xor	edx,edi
+	add	ebx,DWORD[((-60))+r13]
+	andn	edi,ecx,eax
+	add	ebx,edx
+	rorx	r12d,ecx,27
+	rorx	edx,ecx,2
+	and	ecx,esi
+	add	ebx,r12d
+	xor	ecx,edi
+	vpalignr	ymm8,ymm1,ymm0,8
+	vpxor	ymm2,ymm2,ymm6
+	add	ebp,DWORD[((-56))+r13]
+	andn	edi,ebx,esi
+	vpxor	ymm2,ymm2,ymm3
+	vmovdqu	ymm11,YMMWORD[r14]
+	add	ebp,ecx
+	rorx	r12d,ebx,27
+	rorx	ecx,ebx,2
+	vpxor	ymm2,ymm2,ymm8
+	and	ebx,edx
+	add	ebp,r12d
+	xor	ebx,edi
+	vpsrld	ymm8,ymm2,30
+	vpslld	ymm2,ymm2,2
+	add	eax,DWORD[((-52))+r13]
+	andn	edi,ebp,edx
+	add	eax,ebx
+	rorx	r12d,ebp,27
+	rorx	ebx,ebp,2
+	and	ebp,ecx
+	vpor	ymm2,ymm2,ymm8
+	add	eax,r12d
+	xor	ebp,edi
+	add	esi,DWORD[((-32))+r13]
+	andn	edi,eax,ecx
+	vpaddd	ymm9,ymm2,ymm11
+	add	esi,ebp
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	and	eax,ebx
+	vmovdqu	YMMWORD[320+rsp],ymm9
+	add	esi,r12d
+	xor	eax,edi
+	add	edx,DWORD[((-28))+r13]
+	andn	edi,esi,ebx
+	add	edx,eax
+	rorx	r12d,esi,27
+	rorx	eax,esi,2
+	and	esi,ebp
+	add	edx,r12d
+	xor	esi,edi
+	add	ecx,DWORD[((-24))+r13]
+	andn	edi,edx,ebp
+	add	ecx,esi
+	rorx	r12d,edx,27
+	rorx	esi,edx,2
+	and	edx,eax
+	add	ecx,r12d
+	xor	edx,edi
+	vpalignr	ymm8,ymm2,ymm1,8
+	vpxor	ymm3,ymm3,ymm7
+	add	ebx,DWORD[((-20))+r13]
+	andn	edi,ecx,eax
+	vpxor	ymm3,ymm3,ymm4
+	add	ebx,edx
+	rorx	r12d,ecx,27
+	rorx	edx,ecx,2
+	vpxor	ymm3,ymm3,ymm8
+	and	ecx,esi
+	add	ebx,r12d
+	xor	ecx,edi
+	vpsrld	ymm8,ymm3,30
+	vpslld	ymm3,ymm3,2
+	add	ebp,DWORD[r13]
+	andn	edi,ebx,esi
+	add	ebp,ecx
+	rorx	r12d,ebx,27
+	rorx	ecx,ebx,2
+	and	ebx,edx
+	vpor	ymm3,ymm3,ymm8
+	add	ebp,r12d
+	xor	ebx,edi
+	add	eax,DWORD[4+r13]
+	andn	edi,ebp,edx
+	vpaddd	ymm9,ymm3,ymm11
+	add	eax,ebx
+	rorx	r12d,ebp,27
+	rorx	ebx,ebp,2
+	and	ebp,ecx
+	vmovdqu	YMMWORD[352+rsp],ymm9
+	add	eax,r12d
+	xor	ebp,edi
+	add	esi,DWORD[8+r13]
+	andn	edi,eax,ecx
+	add	esi,ebp
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	and	eax,ebx
+	add	esi,r12d
+	xor	eax,edi
+	add	edx,DWORD[12+r13]
+	lea	edx,[rax*1+rdx]
+	rorx	r12d,esi,27
+	rorx	eax,esi,2
+	xor	esi,ebp
+	add	edx,r12d
+	xor	esi,ebx
+	vpalignr	ymm8,ymm3,ymm2,8
+	vpxor	ymm4,ymm4,ymm0
+	add	ecx,DWORD[32+r13]
+	lea	ecx,[rsi*1+rcx]
+	vpxor	ymm4,ymm4,ymm5
+	rorx	r12d,edx,27
+	rorx	esi,edx,2
+	xor	edx,eax
+	vpxor	ymm4,ymm4,ymm8
+	add	ecx,r12d
+	xor	edx,ebp
+	add	ebx,DWORD[36+r13]
+	vpsrld	ymm8,ymm4,30
+	vpslld	ymm4,ymm4,2
+	lea	ebx,[rdx*1+rbx]
+	rorx	r12d,ecx,27
+	rorx	edx,ecx,2
+	xor	ecx,esi
+	add	ebx,r12d
+	xor	ecx,eax
+	vpor	ymm4,ymm4,ymm8
+	add	ebp,DWORD[40+r13]
+	lea	ebp,[rbp*1+rcx]
+	rorx	r12d,ebx,27
+	rorx	ecx,ebx,2
+	vpaddd	ymm9,ymm4,ymm11
+	xor	ebx,edx
+	add	ebp,r12d
+	xor	ebx,esi
+	add	eax,DWORD[44+r13]
+	vmovdqu	YMMWORD[384+rsp],ymm9
+	lea	eax,[rbx*1+rax]
+	rorx	r12d,ebp,27
+	rorx	ebx,ebp,2
+	xor	ebp,ecx
+	add	eax,r12d
+	xor	ebp,edx
+	add	esi,DWORD[64+r13]
+	lea	esi,[rbp*1+rsi]
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	xor	eax,ebx
+	add	esi,r12d
+	xor	eax,ecx
+	vpalignr	ymm8,ymm4,ymm3,8
+	vpxor	ymm5,ymm5,ymm1
+	add	edx,DWORD[68+r13]
+	lea	edx,[rax*1+rdx]
+	vpxor	ymm5,ymm5,ymm6
+	rorx	r12d,esi,27
+	rorx	eax,esi,2
+	xor	esi,ebp
+	vpxor	ymm5,ymm5,ymm8
+	add	edx,r12d
+	xor	esi,ebx
+	add	ecx,DWORD[72+r13]
+	vpsrld	ymm8,ymm5,30
+	vpslld	ymm5,ymm5,2
+	lea	ecx,[rsi*1+rcx]
+	rorx	r12d,edx,27
+	rorx	esi,edx,2
+	xor	edx,eax
+	add	ecx,r12d
+	xor	edx,ebp
+	vpor	ymm5,ymm5,ymm8
+	add	ebx,DWORD[76+r13]
+	lea	ebx,[rdx*1+rbx]
+	rorx	r12d,ecx,27
+	rorx	edx,ecx,2
+	vpaddd	ymm9,ymm5,ymm11
+	xor	ecx,esi
+	add	ebx,r12d
+	xor	ecx,eax
+	add	ebp,DWORD[96+r13]
+	vmovdqu	YMMWORD[416+rsp],ymm9
+	lea	ebp,[rbp*1+rcx]
+	rorx	r12d,ebx,27
+	rorx	ecx,ebx,2
+	xor	ebx,edx
+	add	ebp,r12d
+	xor	ebx,esi
+	add	eax,DWORD[100+r13]
+	lea	eax,[rbx*1+rax]
+	rorx	r12d,ebp,27
+	rorx	ebx,ebp,2
+	xor	ebp,ecx
+	add	eax,r12d
+	xor	ebp,edx
+	vpalignr	ymm8,ymm5,ymm4,8
+	vpxor	ymm6,ymm6,ymm2
+	add	esi,DWORD[104+r13]
+	lea	esi,[rbp*1+rsi]
+	vpxor	ymm6,ymm6,ymm7
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	xor	eax,ebx
+	vpxor	ymm6,ymm6,ymm8
+	add	esi,r12d
+	xor	eax,ecx
+	add	edx,DWORD[108+r13]
+	lea	r13,[256+r13]
+	vpsrld	ymm8,ymm6,30
+	vpslld	ymm6,ymm6,2
+	lea	edx,[rax*1+rdx]
+	rorx	r12d,esi,27
+	rorx	eax,esi,2
+	xor	esi,ebp
+	add	edx,r12d
+	xor	esi,ebx
+	vpor	ymm6,ymm6,ymm8
+	add	ecx,DWORD[((-128))+r13]
+	lea	ecx,[rsi*1+rcx]
+	rorx	r12d,edx,27
+	rorx	esi,edx,2
+	vpaddd	ymm9,ymm6,ymm11
+	xor	edx,eax
+	add	ecx,r12d
+	xor	edx,ebp
+	add	ebx,DWORD[((-124))+r13]
+	vmovdqu	YMMWORD[448+rsp],ymm9
+	lea	ebx,[rdx*1+rbx]
+	rorx	r12d,ecx,27
+	rorx	edx,ecx,2
+	xor	ecx,esi
+	add	ebx,r12d
+	xor	ecx,eax
+	add	ebp,DWORD[((-120))+r13]
+	lea	ebp,[rbp*1+rcx]
+	rorx	r12d,ebx,27
+	rorx	ecx,ebx,2
+	xor	ebx,edx
+	add	ebp,r12d
+	xor	ebx,esi
+	vpalignr	ymm8,ymm6,ymm5,8
+	vpxor	ymm7,ymm7,ymm3
+	add	eax,DWORD[((-116))+r13]
+	lea	eax,[rbx*1+rax]
+	vpxor	ymm7,ymm7,ymm0
+	vmovdqu	ymm11,YMMWORD[32+r14]
+	rorx	r12d,ebp,27
+	rorx	ebx,ebp,2
+	xor	ebp,ecx
+	vpxor	ymm7,ymm7,ymm8
+	add	eax,r12d
+	xor	ebp,edx
+	add	esi,DWORD[((-96))+r13]
+	vpsrld	ymm8,ymm7,30
+	vpslld	ymm7,ymm7,2
+	lea	esi,[rbp*1+rsi]
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	xor	eax,ebx
+	add	esi,r12d
+	xor	eax,ecx
+	vpor	ymm7,ymm7,ymm8
+	add	edx,DWORD[((-92))+r13]
+	lea	edx,[rax*1+rdx]
+	rorx	r12d,esi,27
+	rorx	eax,esi,2
+	vpaddd	ymm9,ymm7,ymm11
+	xor	esi,ebp
+	add	edx,r12d
+	xor	esi,ebx
+	add	ecx,DWORD[((-88))+r13]
+	vmovdqu	YMMWORD[480+rsp],ymm9
+	lea	ecx,[rsi*1+rcx]
+	rorx	r12d,edx,27
+	rorx	esi,edx,2
+	xor	edx,eax
+	add	ecx,r12d
+	xor	edx,ebp
+	add	ebx,DWORD[((-84))+r13]
+	mov	edi,esi
+	xor	edi,eax
+	lea	ebx,[rdx*1+rbx]
+	rorx	r12d,ecx,27
+	rorx	edx,ecx,2
+	xor	ecx,esi
+	add	ebx,r12d
+	and	ecx,edi
+	jmp	NEAR $L$align32_2
+ALIGN	32
+$L$align32_2:
+	vpalignr	ymm8,ymm7,ymm6,8
+	vpxor	ymm0,ymm0,ymm4
+	add	ebp,DWORD[((-64))+r13]
+	xor	ecx,esi
+	vpxor	ymm0,ymm0,ymm1
+	mov	edi,edx
+	xor	edi,esi
+	lea	ebp,[rbp*1+rcx]
+	vpxor	ymm0,ymm0,ymm8
+	rorx	r12d,ebx,27
+	rorx	ecx,ebx,2
+	xor	ebx,edx
+	vpsrld	ymm8,ymm0,30
+	vpslld	ymm0,ymm0,2
+	add	ebp,r12d
+	and	ebx,edi
+	add	eax,DWORD[((-60))+r13]
+	xor	ebx,edx
+	mov	edi,ecx
+	xor	edi,edx
+	vpor	ymm0,ymm0,ymm8
+	lea	eax,[rbx*1+rax]
+	rorx	r12d,ebp,27
+	rorx	ebx,ebp,2
+	xor	ebp,ecx
+	vpaddd	ymm9,ymm0,ymm11
+	add	eax,r12d
+	and	ebp,edi
+	add	esi,DWORD[((-56))+r13]
+	xor	ebp,ecx
+	vmovdqu	YMMWORD[512+rsp],ymm9
+	mov	edi,ebx
+	xor	edi,ecx
+	lea	esi,[rbp*1+rsi]
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	xor	eax,ebx
+	add	esi,r12d
+	and	eax,edi
+	add	edx,DWORD[((-52))+r13]
+	xor	eax,ebx
+	mov	edi,ebp
+	xor	edi,ebx
+	lea	edx,[rax*1+rdx]
+	rorx	r12d,esi,27
+	rorx	eax,esi,2
+	xor	esi,ebp
+	add	edx,r12d
+	and	esi,edi
+	add	ecx,DWORD[((-32))+r13]
+	xor	esi,ebp
+	mov	edi,eax
+	xor	edi,ebp
+	lea	ecx,[rsi*1+rcx]
+	rorx	r12d,edx,27
+	rorx	esi,edx,2
+	xor	edx,eax
+	add	ecx,r12d
+	and	edx,edi
+	vpalignr	ymm8,ymm0,ymm7,8
+	vpxor	ymm1,ymm1,ymm5
+	add	ebx,DWORD[((-28))+r13]
+	xor	edx,eax
+	vpxor	ymm1,ymm1,ymm2
+	mov	edi,esi
+	xor	edi,eax
+	lea	ebx,[rdx*1+rbx]
+	vpxor	ymm1,ymm1,ymm8
+	rorx	r12d,ecx,27
+	rorx	edx,ecx,2
+	xor	ecx,esi
+	vpsrld	ymm8,ymm1,30
+	vpslld	ymm1,ymm1,2
+	add	ebx,r12d
+	and	ecx,edi
+	add	ebp,DWORD[((-24))+r13]
+	xor	ecx,esi
+	mov	edi,edx
+	xor	edi,esi
+	vpor	ymm1,ymm1,ymm8
+	lea	ebp,[rbp*1+rcx]
+	rorx	r12d,ebx,27
+	rorx	ecx,ebx,2
+	xor	ebx,edx
+	vpaddd	ymm9,ymm1,ymm11
+	add	ebp,r12d
+	and	ebx,edi
+	add	eax,DWORD[((-20))+r13]
+	xor	ebx,edx
+	vmovdqu	YMMWORD[544+rsp],ymm9
+	mov	edi,ecx
+	xor	edi,edx
+	lea	eax,[rbx*1+rax]
+	rorx	r12d,ebp,27
+	rorx	ebx,ebp,2
+	xor	ebp,ecx
+	add	eax,r12d
+	and	ebp,edi
+	add	esi,DWORD[r13]
+	xor	ebp,ecx
+	mov	edi,ebx
+	xor	edi,ecx
+	lea	esi,[rbp*1+rsi]
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	xor	eax,ebx
+	add	esi,r12d
+	and	eax,edi
+	add	edx,DWORD[4+r13]
+	xor	eax,ebx
+	mov	edi,ebp
+	xor	edi,ebx
+	lea	edx,[rax*1+rdx]
+	rorx	r12d,esi,27
+	rorx	eax,esi,2
+	xor	esi,ebp
+	add	edx,r12d
+	and	esi,edi
+	vpalignr	ymm8,ymm1,ymm0,8
+	vpxor	ymm2,ymm2,ymm6
+	add	ecx,DWORD[8+r13]
+	xor	esi,ebp
+	vpxor	ymm2,ymm2,ymm3
+	mov	edi,eax
+	xor	edi,ebp
+	lea	ecx,[rsi*1+rcx]
+	vpxor	ymm2,ymm2,ymm8
+	rorx	r12d,edx,27
+	rorx	esi,edx,2
+	xor	edx,eax
+	vpsrld	ymm8,ymm2,30
+	vpslld	ymm2,ymm2,2
+	add	ecx,r12d
+	and	edx,edi
+	add	ebx,DWORD[12+r13]
+	xor	edx,eax
+	mov	edi,esi
+	xor	edi,eax
+	vpor	ymm2,ymm2,ymm8
+	lea	ebx,[rdx*1+rbx]
+	rorx	r12d,ecx,27
+	rorx	edx,ecx,2
+	xor	ecx,esi
+	vpaddd	ymm9,ymm2,ymm11
+	add	ebx,r12d
+	and	ecx,edi
+	add	ebp,DWORD[32+r13]
+	xor	ecx,esi
+	vmovdqu	YMMWORD[576+rsp],ymm9
+	mov	edi,edx
+	xor	edi,esi
+	lea	ebp,[rbp*1+rcx]
+	rorx	r12d,ebx,27
+	rorx	ecx,ebx,2
+	xor	ebx,edx
+	add	ebp,r12d
+	and	ebx,edi
+	add	eax,DWORD[36+r13]
+	xor	ebx,edx
+	mov	edi,ecx
+	xor	edi,edx
+	lea	eax,[rbx*1+rax]
+	rorx	r12d,ebp,27
+	rorx	ebx,ebp,2
+	xor	ebp,ecx
+	add	eax,r12d
+	and	ebp,edi
+	add	esi,DWORD[40+r13]
+	xor	ebp,ecx
+	mov	edi,ebx
+	xor	edi,ecx
+	lea	esi,[rbp*1+rsi]
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	xor	eax,ebx
+	add	esi,r12d
+	and	eax,edi
+	vpalignr	ymm8,ymm2,ymm1,8
+	vpxor	ymm3,ymm3,ymm7
+	add	edx,DWORD[44+r13]
+	xor	eax,ebx
+	vpxor	ymm3,ymm3,ymm4
+	mov	edi,ebp
+	xor	edi,ebx
+	lea	edx,[rax*1+rdx]
+	vpxor	ymm3,ymm3,ymm8
+	rorx	r12d,esi,27
+	rorx	eax,esi,2
+	xor	esi,ebp
+	vpsrld	ymm8,ymm3,30
+	vpslld	ymm3,ymm3,2
+	add	edx,r12d
+	and	esi,edi
+	add	ecx,DWORD[64+r13]
+	xor	esi,ebp
+	mov	edi,eax
+	xor	edi,ebp
+	vpor	ymm3,ymm3,ymm8
+	lea	ecx,[rsi*1+rcx]
+	rorx	r12d,edx,27
+	rorx	esi,edx,2
+	xor	edx,eax
+	vpaddd	ymm9,ymm3,ymm11
+	add	ecx,r12d
+	and	edx,edi
+	add	ebx,DWORD[68+r13]
+	xor	edx,eax
+	vmovdqu	YMMWORD[608+rsp],ymm9
+	mov	edi,esi
+	xor	edi,eax
+	lea	ebx,[rdx*1+rbx]
+	rorx	r12d,ecx,27
+	rorx	edx,ecx,2
+	xor	ecx,esi
+	add	ebx,r12d
+	and	ecx,edi
+	add	ebp,DWORD[72+r13]
+	xor	ecx,esi
+	mov	edi,edx
+	xor	edi,esi
+	lea	ebp,[rbp*1+rcx]
+	rorx	r12d,ebx,27
+	rorx	ecx,ebx,2
+	xor	ebx,edx
+	add	ebp,r12d
+	and	ebx,edi
+	add	eax,DWORD[76+r13]
+	xor	ebx,edx
+	lea	eax,[rbx*1+rax]
+	rorx	r12d,ebp,27
+	rorx	ebx,ebp,2
+	xor	ebp,ecx
+	add	eax,r12d
+	xor	ebp,edx
+	add	esi,DWORD[96+r13]
+	lea	esi,[rbp*1+rsi]
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	xor	eax,ebx
+	add	esi,r12d
+	xor	eax,ecx
+	add	edx,DWORD[100+r13]
+	lea	edx,[rax*1+rdx]
+	rorx	r12d,esi,27
+	rorx	eax,esi,2
+	xor	esi,ebp
+	add	edx,r12d
+	xor	esi,ebx
+	add	ecx,DWORD[104+r13]
+	lea	ecx,[rsi*1+rcx]
+	rorx	r12d,edx,27
+	rorx	esi,edx,2
+	xor	edx,eax
+	add	ecx,r12d
+	xor	edx,ebp
+	add	ebx,DWORD[108+r13]
+	lea	r13,[256+r13]
+	lea	ebx,[rdx*1+rbx]
+	rorx	r12d,ecx,27
+	rorx	edx,ecx,2
+	xor	ecx,esi
+	add	ebx,r12d
+	xor	ecx,eax
+	add	ebp,DWORD[((-128))+r13]
+	lea	ebp,[rbp*1+rcx]
+	rorx	r12d,ebx,27
+	rorx	ecx,ebx,2
+	xor	ebx,edx
+	add	ebp,r12d
+	xor	ebx,esi
+	add	eax,DWORD[((-124))+r13]
+	lea	eax,[rbx*1+rax]
+	rorx	r12d,ebp,27
+	rorx	ebx,ebp,2
+	xor	ebp,ecx
+	add	eax,r12d
+	xor	ebp,edx
+	add	esi,DWORD[((-120))+r13]
+	lea	esi,[rbp*1+rsi]
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	xor	eax,ebx
+	add	esi,r12d
+	xor	eax,ecx
+	add	edx,DWORD[((-116))+r13]
+	lea	edx,[rax*1+rdx]
+	rorx	r12d,esi,27
+	rorx	eax,esi,2
+	xor	esi,ebp
+	add	edx,r12d
+	xor	esi,ebx
+	add	ecx,DWORD[((-96))+r13]
+	lea	ecx,[rsi*1+rcx]
+	rorx	r12d,edx,27
+	rorx	esi,edx,2
+	xor	edx,eax
+	add	ecx,r12d
+	xor	edx,ebp
+	add	ebx,DWORD[((-92))+r13]
+	lea	ebx,[rdx*1+rbx]
+	rorx	r12d,ecx,27
+	rorx	edx,ecx,2
+	xor	ecx,esi
+	add	ebx,r12d
+	xor	ecx,eax
+	add	ebp,DWORD[((-88))+r13]
+	lea	ebp,[rbp*1+rcx]
+	rorx	r12d,ebx,27
+	rorx	ecx,ebx,2
+	xor	ebx,edx
+	add	ebp,r12d
+	xor	ebx,esi
+	add	eax,DWORD[((-84))+r13]
+	lea	eax,[rbx*1+rax]
+	rorx	r12d,ebp,27
+	rorx	ebx,ebp,2
+	xor	ebp,ecx
+	add	eax,r12d
+	xor	ebp,edx
+	add	esi,DWORD[((-64))+r13]
+	lea	esi,[rbp*1+rsi]
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	xor	eax,ebx
+	add	esi,r12d
+	xor	eax,ecx
+	add	edx,DWORD[((-60))+r13]
+	lea	edx,[rax*1+rdx]
+	rorx	r12d,esi,27
+	rorx	eax,esi,2
+	xor	esi,ebp
+	add	edx,r12d
+	xor	esi,ebx
+	add	ecx,DWORD[((-56))+r13]
+	lea	ecx,[rsi*1+rcx]
+	rorx	r12d,edx,27
+	rorx	esi,edx,2
+	xor	edx,eax
+	add	ecx,r12d
+	xor	edx,ebp
+	add	ebx,DWORD[((-52))+r13]
+	lea	ebx,[rdx*1+rbx]
+	rorx	r12d,ecx,27
+	rorx	edx,ecx,2
+	xor	ecx,esi
+	add	ebx,r12d
+	xor	ecx,eax
+	add	ebp,DWORD[((-32))+r13]
+	lea	ebp,[rbp*1+rcx]
+	rorx	r12d,ebx,27
+	rorx	ecx,ebx,2
+	xor	ebx,edx
+	add	ebp,r12d
+	xor	ebx,esi
+	add	eax,DWORD[((-28))+r13]
+	lea	eax,[rbx*1+rax]
+	rorx	r12d,ebp,27
+	rorx	ebx,ebp,2
+	xor	ebp,ecx
+	add	eax,r12d
+	xor	ebp,edx
+	add	esi,DWORD[((-24))+r13]
+	lea	esi,[rbp*1+rsi]
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	xor	eax,ebx
+	add	esi,r12d
+	xor	eax,ecx
+	add	edx,DWORD[((-20))+r13]
+	lea	edx,[rax*1+rdx]
+	rorx	r12d,esi,27
+	add	edx,r12d
+	lea	r13,[128+r9]
+	lea	rdi,[128+r9]
+	cmp	r13,r10
+	cmovae	r13,r9
+
+
+	add	edx,DWORD[r8]
+	add	esi,DWORD[4+r8]
+	add	ebp,DWORD[8+r8]
+	mov	DWORD[r8],edx
+	add	ebx,DWORD[12+r8]
+	mov	DWORD[4+r8],esi
+	mov	eax,edx
+	add	ecx,DWORD[16+r8]
+	mov	r12d,ebp
+	mov	DWORD[8+r8],ebp
+	mov	edx,ebx
+
+	mov	DWORD[12+r8],ebx
+	mov	ebp,esi
+	mov	DWORD[16+r8],ecx
+
+	mov	esi,ecx
+	mov	ecx,r12d
+
+
+	cmp	r9,r10
+	je	NEAR $L$done_avx2
+	vmovdqu	ymm6,YMMWORD[64+r14]
+	cmp	rdi,r10
+	ja	NEAR $L$ast_avx2
+
+	vmovdqu	xmm0,XMMWORD[((-64))+rdi]
+	vmovdqu	xmm1,XMMWORD[((-48))+rdi]
+	vmovdqu	xmm2,XMMWORD[((-32))+rdi]
+	vmovdqu	xmm3,XMMWORD[((-16))+rdi]
+	vinserti128	ymm0,ymm0,XMMWORD[r13],1
+	vinserti128	ymm1,ymm1,XMMWORD[16+r13],1
+	vinserti128	ymm2,ymm2,XMMWORD[32+r13],1
+	vinserti128	ymm3,ymm3,XMMWORD[48+r13],1
+	jmp	NEAR $L$ast_avx2
+
+ALIGN	32
+$L$ast_avx2:
+	lea	r13,[((128+16))+rsp]
+	rorx	ebx,ebp,2
+	andn	edi,ebp,edx
+	and	ebp,ecx
+	xor	ebp,edi
+	sub	r9,-128
+	add	esi,DWORD[((-128))+r13]
+	andn	edi,eax,ecx
+	add	esi,ebp
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	and	eax,ebx
+	add	esi,r12d
+	xor	eax,edi
+	add	edx,DWORD[((-124))+r13]
+	andn	edi,esi,ebx
+	add	edx,eax
+	rorx	r12d,esi,27
+	rorx	eax,esi,2
+	and	esi,ebp
+	add	edx,r12d
+	xor	esi,edi
+	add	ecx,DWORD[((-120))+r13]
+	andn	edi,edx,ebp
+	add	ecx,esi
+	rorx	r12d,edx,27
+	rorx	esi,edx,2
+	and	edx,eax
+	add	ecx,r12d
+	xor	edx,edi
+	add	ebx,DWORD[((-116))+r13]
+	andn	edi,ecx,eax
+	add	ebx,edx
+	rorx	r12d,ecx,27
+	rorx	edx,ecx,2
+	and	ecx,esi
+	add	ebx,r12d
+	xor	ecx,edi
+	add	ebp,DWORD[((-96))+r13]
+	andn	edi,ebx,esi
+	add	ebp,ecx
+	rorx	r12d,ebx,27
+	rorx	ecx,ebx,2
+	and	ebx,edx
+	add	ebp,r12d
+	xor	ebx,edi
+	add	eax,DWORD[((-92))+r13]
+	andn	edi,ebp,edx
+	add	eax,ebx
+	rorx	r12d,ebp,27
+	rorx	ebx,ebp,2
+	and	ebp,ecx
+	add	eax,r12d
+	xor	ebp,edi
+	add	esi,DWORD[((-88))+r13]
+	andn	edi,eax,ecx
+	add	esi,ebp
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	and	eax,ebx
+	add	esi,r12d
+	xor	eax,edi
+	add	edx,DWORD[((-84))+r13]
+	andn	edi,esi,ebx
+	add	edx,eax
+	rorx	r12d,esi,27
+	rorx	eax,esi,2
+	and	esi,ebp
+	add	edx,r12d
+	xor	esi,edi
+	add	ecx,DWORD[((-64))+r13]
+	andn	edi,edx,ebp
+	add	ecx,esi
+	rorx	r12d,edx,27
+	rorx	esi,edx,2
+	and	edx,eax
+	add	ecx,r12d
+	xor	edx,edi
+	add	ebx,DWORD[((-60))+r13]
+	andn	edi,ecx,eax
+	add	ebx,edx
+	rorx	r12d,ecx,27
+	rorx	edx,ecx,2
+	and	ecx,esi
+	add	ebx,r12d
+	xor	ecx,edi
+	add	ebp,DWORD[((-56))+r13]
+	andn	edi,ebx,esi
+	add	ebp,ecx
+	rorx	r12d,ebx,27
+	rorx	ecx,ebx,2
+	and	ebx,edx
+	add	ebp,r12d
+	xor	ebx,edi
+	add	eax,DWORD[((-52))+r13]
+	andn	edi,ebp,edx
+	add	eax,ebx
+	rorx	r12d,ebp,27
+	rorx	ebx,ebp,2
+	and	ebp,ecx
+	add	eax,r12d
+	xor	ebp,edi
+	add	esi,DWORD[((-32))+r13]
+	andn	edi,eax,ecx
+	add	esi,ebp
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	and	eax,ebx
+	add	esi,r12d
+	xor	eax,edi
+	add	edx,DWORD[((-28))+r13]
+	andn	edi,esi,ebx
+	add	edx,eax
+	rorx	r12d,esi,27
+	rorx	eax,esi,2
+	and	esi,ebp
+	add	edx,r12d
+	xor	esi,edi
+	add	ecx,DWORD[((-24))+r13]
+	andn	edi,edx,ebp
+	add	ecx,esi
+	rorx	r12d,edx,27
+	rorx	esi,edx,2
+	and	edx,eax
+	add	ecx,r12d
+	xor	edx,edi
+	add	ebx,DWORD[((-20))+r13]
+	andn	edi,ecx,eax
+	add	ebx,edx
+	rorx	r12d,ecx,27
+	rorx	edx,ecx,2
+	and	ecx,esi
+	add	ebx,r12d
+	xor	ecx,edi
+	add	ebp,DWORD[r13]
+	andn	edi,ebx,esi
+	add	ebp,ecx
+	rorx	r12d,ebx,27
+	rorx	ecx,ebx,2
+	and	ebx,edx
+	add	ebp,r12d
+	xor	ebx,edi
+	add	eax,DWORD[4+r13]
+	andn	edi,ebp,edx
+	add	eax,ebx
+	rorx	r12d,ebp,27
+	rorx	ebx,ebp,2
+	and	ebp,ecx
+	add	eax,r12d
+	xor	ebp,edi
+	add	esi,DWORD[8+r13]
+	andn	edi,eax,ecx
+	add	esi,ebp
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	and	eax,ebx
+	add	esi,r12d
+	xor	eax,edi
+	add	edx,DWORD[12+r13]
+	lea	edx,[rax*1+rdx]
+	rorx	r12d,esi,27
+	rorx	eax,esi,2
+	xor	esi,ebp
+	add	edx,r12d
+	xor	esi,ebx
+	add	ecx,DWORD[32+r13]
+	lea	ecx,[rsi*1+rcx]
+	rorx	r12d,edx,27
+	rorx	esi,edx,2
+	xor	edx,eax
+	add	ecx,r12d
+	xor	edx,ebp
+	add	ebx,DWORD[36+r13]
+	lea	ebx,[rdx*1+rbx]
+	rorx	r12d,ecx,27
+	rorx	edx,ecx,2
+	xor	ecx,esi
+	add	ebx,r12d
+	xor	ecx,eax
+	add	ebp,DWORD[40+r13]
+	lea	ebp,[rbp*1+rcx]
+	rorx	r12d,ebx,27
+	rorx	ecx,ebx,2
+	xor	ebx,edx
+	add	ebp,r12d
+	xor	ebx,esi
+	add	eax,DWORD[44+r13]
+	lea	eax,[rbx*1+rax]
+	rorx	r12d,ebp,27
+	rorx	ebx,ebp,2
+	xor	ebp,ecx
+	add	eax,r12d
+	xor	ebp,edx
+	add	esi,DWORD[64+r13]
+	lea	esi,[rbp*1+rsi]
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	xor	eax,ebx
+	add	esi,r12d
+	xor	eax,ecx
+	vmovdqu	ymm11,YMMWORD[((-64))+r14]
+	vpshufb	ymm0,ymm0,ymm6
+	add	edx,DWORD[68+r13]
+	lea	edx,[rax*1+rdx]
+	rorx	r12d,esi,27
+	rorx	eax,esi,2
+	xor	esi,ebp
+	add	edx,r12d
+	xor	esi,ebx
+	add	ecx,DWORD[72+r13]
+	lea	ecx,[rsi*1+rcx]
+	rorx	r12d,edx,27
+	rorx	esi,edx,2
+	xor	edx,eax
+	add	ecx,r12d
+	xor	edx,ebp
+	add	ebx,DWORD[76+r13]
+	lea	ebx,[rdx*1+rbx]
+	rorx	r12d,ecx,27
+	rorx	edx,ecx,2
+	xor	ecx,esi
+	add	ebx,r12d
+	xor	ecx,eax
+	add	ebp,DWORD[96+r13]
+	lea	ebp,[rbp*1+rcx]
+	rorx	r12d,ebx,27
+	rorx	ecx,ebx,2
+	xor	ebx,edx
+	add	ebp,r12d
+	xor	ebx,esi
+	add	eax,DWORD[100+r13]
+	lea	eax,[rbx*1+rax]
+	rorx	r12d,ebp,27
+	rorx	ebx,ebp,2
+	xor	ebp,ecx
+	add	eax,r12d
+	xor	ebp,edx
+	vpshufb	ymm1,ymm1,ymm6
+	vpaddd	ymm8,ymm0,ymm11
+	add	esi,DWORD[104+r13]
+	lea	esi,[rbp*1+rsi]
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	xor	eax,ebx
+	add	esi,r12d
+	xor	eax,ecx
+	add	edx,DWORD[108+r13]
+	lea	r13,[256+r13]
+	lea	edx,[rax*1+rdx]
+	rorx	r12d,esi,27
+	rorx	eax,esi,2
+	xor	esi,ebp
+	add	edx,r12d
+	xor	esi,ebx
+	add	ecx,DWORD[((-128))+r13]
+	lea	ecx,[rsi*1+rcx]
+	rorx	r12d,edx,27
+	rorx	esi,edx,2
+	xor	edx,eax
+	add	ecx,r12d
+	xor	edx,ebp
+	add	ebx,DWORD[((-124))+r13]
+	lea	ebx,[rdx*1+rbx]
+	rorx	r12d,ecx,27
+	rorx	edx,ecx,2
+	xor	ecx,esi
+	add	ebx,r12d
+	xor	ecx,eax
+	add	ebp,DWORD[((-120))+r13]
+	lea	ebp,[rbp*1+rcx]
+	rorx	r12d,ebx,27
+	rorx	ecx,ebx,2
+	xor	ebx,edx
+	add	ebp,r12d
+	xor	ebx,esi
+	vmovdqu	YMMWORD[rsp],ymm8
+	vpshufb	ymm2,ymm2,ymm6
+	vpaddd	ymm9,ymm1,ymm11
+	add	eax,DWORD[((-116))+r13]
+	lea	eax,[rbx*1+rax]
+	rorx	r12d,ebp,27
+	rorx	ebx,ebp,2
+	xor	ebp,ecx
+	add	eax,r12d
+	xor	ebp,edx
+	add	esi,DWORD[((-96))+r13]
+	lea	esi,[rbp*1+rsi]
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	xor	eax,ebx
+	add	esi,r12d
+	xor	eax,ecx
+	add	edx,DWORD[((-92))+r13]
+	lea	edx,[rax*1+rdx]
+	rorx	r12d,esi,27
+	rorx	eax,esi,2
+	xor	esi,ebp
+	add	edx,r12d
+	xor	esi,ebx
+	add	ecx,DWORD[((-88))+r13]
+	lea	ecx,[rsi*1+rcx]
+	rorx	r12d,edx,27
+	rorx	esi,edx,2
+	xor	edx,eax
+	add	ecx,r12d
+	xor	edx,ebp
+	add	ebx,DWORD[((-84))+r13]
+	mov	edi,esi
+	xor	edi,eax
+	lea	ebx,[rdx*1+rbx]
+	rorx	r12d,ecx,27
+	rorx	edx,ecx,2
+	xor	ecx,esi
+	add	ebx,r12d
+	and	ecx,edi
+	vmovdqu	YMMWORD[32+rsp],ymm9
+	vpshufb	ymm3,ymm3,ymm6
+	vpaddd	ymm6,ymm2,ymm11
+	add	ebp,DWORD[((-64))+r13]
+	xor	ecx,esi
+	mov	edi,edx
+	xor	edi,esi
+	lea	ebp,[rbp*1+rcx]
+	rorx	r12d,ebx,27
+	rorx	ecx,ebx,2
+	xor	ebx,edx
+	add	ebp,r12d
+	and	ebx,edi
+	add	eax,DWORD[((-60))+r13]
+	xor	ebx,edx
+	mov	edi,ecx
+	xor	edi,edx
+	lea	eax,[rbx*1+rax]
+	rorx	r12d,ebp,27
+	rorx	ebx,ebp,2
+	xor	ebp,ecx
+	add	eax,r12d
+	and	ebp,edi
+	add	esi,DWORD[((-56))+r13]
+	xor	ebp,ecx
+	mov	edi,ebx
+	xor	edi,ecx
+	lea	esi,[rbp*1+rsi]
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	xor	eax,ebx
+	add	esi,r12d
+	and	eax,edi
+	add	edx,DWORD[((-52))+r13]
+	xor	eax,ebx
+	mov	edi,ebp
+	xor	edi,ebx
+	lea	edx,[rax*1+rdx]
+	rorx	r12d,esi,27
+	rorx	eax,esi,2
+	xor	esi,ebp
+	add	edx,r12d
+	and	esi,edi
+	add	ecx,DWORD[((-32))+r13]
+	xor	esi,ebp
+	mov	edi,eax
+	xor	edi,ebp
+	lea	ecx,[rsi*1+rcx]
+	rorx	r12d,edx,27
+	rorx	esi,edx,2
+	xor	edx,eax
+	add	ecx,r12d
+	and	edx,edi
+	jmp	NEAR $L$align32_3
+ALIGN	32
+$L$align32_3:
+	vmovdqu	YMMWORD[64+rsp],ymm6
+	vpaddd	ymm7,ymm3,ymm11
+	add	ebx,DWORD[((-28))+r13]
+	xor	edx,eax
+	mov	edi,esi
+	xor	edi,eax
+	lea	ebx,[rdx*1+rbx]
+	rorx	r12d,ecx,27
+	rorx	edx,ecx,2
+	xor	ecx,esi
+	add	ebx,r12d
+	and	ecx,edi
+	add	ebp,DWORD[((-24))+r13]
+	xor	ecx,esi
+	mov	edi,edx
+	xor	edi,esi
+	lea	ebp,[rbp*1+rcx]
+	rorx	r12d,ebx,27
+	rorx	ecx,ebx,2
+	xor	ebx,edx
+	add	ebp,r12d
+	and	ebx,edi
+	add	eax,DWORD[((-20))+r13]
+	xor	ebx,edx
+	mov	edi,ecx
+	xor	edi,edx
+	lea	eax,[rbx*1+rax]
+	rorx	r12d,ebp,27
+	rorx	ebx,ebp,2
+	xor	ebp,ecx
+	add	eax,r12d
+	and	ebp,edi
+	add	esi,DWORD[r13]
+	xor	ebp,ecx
+	mov	edi,ebx
+	xor	edi,ecx
+	lea	esi,[rbp*1+rsi]
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	xor	eax,ebx
+	add	esi,r12d
+	and	eax,edi
+	add	edx,DWORD[4+r13]
+	xor	eax,ebx
+	mov	edi,ebp
+	xor	edi,ebx
+	lea	edx,[rax*1+rdx]
+	rorx	r12d,esi,27
+	rorx	eax,esi,2
+	xor	esi,ebp
+	add	edx,r12d
+	and	esi,edi
+	vmovdqu	YMMWORD[96+rsp],ymm7
+	add	ecx,DWORD[8+r13]
+	xor	esi,ebp
+	mov	edi,eax
+	xor	edi,ebp
+	lea	ecx,[rsi*1+rcx]
+	rorx	r12d,edx,27
+	rorx	esi,edx,2
+	xor	edx,eax
+	add	ecx,r12d
+	and	edx,edi
+	add	ebx,DWORD[12+r13]
+	xor	edx,eax
+	mov	edi,esi
+	xor	edi,eax
+	lea	ebx,[rdx*1+rbx]
+	rorx	r12d,ecx,27
+	rorx	edx,ecx,2
+	xor	ecx,esi
+	add	ebx,r12d
+	and	ecx,edi
+	add	ebp,DWORD[32+r13]
+	xor	ecx,esi
+	mov	edi,edx
+	xor	edi,esi
+	lea	ebp,[rbp*1+rcx]
+	rorx	r12d,ebx,27
+	rorx	ecx,ebx,2
+	xor	ebx,edx
+	add	ebp,r12d
+	and	ebx,edi
+	add	eax,DWORD[36+r13]
+	xor	ebx,edx
+	mov	edi,ecx
+	xor	edi,edx
+	lea	eax,[rbx*1+rax]
+	rorx	r12d,ebp,27
+	rorx	ebx,ebp,2
+	xor	ebp,ecx
+	add	eax,r12d
+	and	ebp,edi
+	add	esi,DWORD[40+r13]
+	xor	ebp,ecx
+	mov	edi,ebx
+	xor	edi,ecx
+	lea	esi,[rbp*1+rsi]
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	xor	eax,ebx
+	add	esi,r12d
+	and	eax,edi
+	vpalignr	ymm4,ymm1,ymm0,8
+	add	edx,DWORD[44+r13]
+	xor	eax,ebx
+	mov	edi,ebp
+	xor	edi,ebx
+	vpsrldq	ymm8,ymm3,4
+	lea	edx,[rax*1+rdx]
+	rorx	r12d,esi,27
+	rorx	eax,esi,2
+	vpxor	ymm4,ymm4,ymm0
+	vpxor	ymm8,ymm8,ymm2
+	xor	esi,ebp
+	add	edx,r12d
+	vpxor	ymm4,ymm4,ymm8
+	and	esi,edi
+	add	ecx,DWORD[64+r13]
+	xor	esi,ebp
+	mov	edi,eax
+	vpsrld	ymm8,ymm4,31
+	xor	edi,ebp
+	lea	ecx,[rsi*1+rcx]
+	rorx	r12d,edx,27
+	vpslldq	ymm10,ymm4,12
+	vpaddd	ymm4,ymm4,ymm4
+	rorx	esi,edx,2
+	xor	edx,eax
+	vpsrld	ymm9,ymm10,30
+	vpor	ymm4,ymm4,ymm8
+	add	ecx,r12d
+	and	edx,edi
+	vpslld	ymm10,ymm10,2
+	vpxor	ymm4,ymm4,ymm9
+	add	ebx,DWORD[68+r13]
+	xor	edx,eax
+	vpxor	ymm4,ymm4,ymm10
+	mov	edi,esi
+	xor	edi,eax
+	lea	ebx,[rdx*1+rbx]
+	vpaddd	ymm9,ymm4,ymm11
+	rorx	r12d,ecx,27
+	rorx	edx,ecx,2
+	xor	ecx,esi
+	vmovdqu	YMMWORD[128+rsp],ymm9
+	add	ebx,r12d
+	and	ecx,edi
+	add	ebp,DWORD[72+r13]
+	xor	ecx,esi
+	mov	edi,edx
+	xor	edi,esi
+	lea	ebp,[rbp*1+rcx]
+	rorx	r12d,ebx,27
+	rorx	ecx,ebx,2
+	xor	ebx,edx
+	add	ebp,r12d
+	and	ebx,edi
+	add	eax,DWORD[76+r13]
+	xor	ebx,edx
+	lea	eax,[rbx*1+rax]
+	rorx	r12d,ebp,27
+	rorx	ebx,ebp,2
+	xor	ebp,ecx
+	add	eax,r12d
+	xor	ebp,edx
+	vpalignr	ymm5,ymm2,ymm1,8
+	add	esi,DWORD[96+r13]
+	lea	esi,[rbp*1+rsi]
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	vpsrldq	ymm8,ymm4,4
+	xor	eax,ebx
+	add	esi,r12d
+	xor	eax,ecx
+	vpxor	ymm5,ymm5,ymm1
+	vpxor	ymm8,ymm8,ymm3
+	add	edx,DWORD[100+r13]
+	lea	edx,[rax*1+rdx]
+	vpxor	ymm5,ymm5,ymm8
+	rorx	r12d,esi,27
+	rorx	eax,esi,2
+	xor	esi,ebp
+	add	edx,r12d
+	vpsrld	ymm8,ymm5,31
+	vmovdqu	ymm11,YMMWORD[((-32))+r14]
+	xor	esi,ebx
+	add	ecx,DWORD[104+r13]
+	lea	ecx,[rsi*1+rcx]
+	vpslldq	ymm10,ymm5,12
+	vpaddd	ymm5,ymm5,ymm5
+	rorx	r12d,edx,27
+	rorx	esi,edx,2
+	vpsrld	ymm9,ymm10,30
+	vpor	ymm5,ymm5,ymm8
+	xor	edx,eax
+	add	ecx,r12d
+	vpslld	ymm10,ymm10,2
+	vpxor	ymm5,ymm5,ymm9
+	xor	edx,ebp
+	add	ebx,DWORD[108+r13]
+	lea	r13,[256+r13]
+	vpxor	ymm5,ymm5,ymm10
+	lea	ebx,[rdx*1+rbx]
+	rorx	r12d,ecx,27
+	rorx	edx,ecx,2
+	vpaddd	ymm9,ymm5,ymm11
+	xor	ecx,esi
+	add	ebx,r12d
+	xor	ecx,eax
+	vmovdqu	YMMWORD[160+rsp],ymm9
+	add	ebp,DWORD[((-128))+r13]
+	lea	ebp,[rbp*1+rcx]
+	rorx	r12d,ebx,27
+	rorx	ecx,ebx,2
+	xor	ebx,edx
+	add	ebp,r12d
+	xor	ebx,esi
+	vpalignr	ymm6,ymm3,ymm2,8
+	add	eax,DWORD[((-124))+r13]
+	lea	eax,[rbx*1+rax]
+	rorx	r12d,ebp,27
+	rorx	ebx,ebp,2
+	vpsrldq	ymm8,ymm5,4
+	xor	ebp,ecx
+	add	eax,r12d
+	xor	ebp,edx
+	vpxor	ymm6,ymm6,ymm2
+	vpxor	ymm8,ymm8,ymm4
+	add	esi,DWORD[((-120))+r13]
+	lea	esi,[rbp*1+rsi]
+	vpxor	ymm6,ymm6,ymm8
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	xor	eax,ebx
+	add	esi,r12d
+	vpsrld	ymm8,ymm6,31
+	xor	eax,ecx
+	add	edx,DWORD[((-116))+r13]
+	lea	edx,[rax*1+rdx]
+	vpslldq	ymm10,ymm6,12
+	vpaddd	ymm6,ymm6,ymm6
+	rorx	r12d,esi,27
+	rorx	eax,esi,2
+	vpsrld	ymm9,ymm10,30
+	vpor	ymm6,ymm6,ymm8
+	xor	esi,ebp
+	add	edx,r12d
+	vpslld	ymm10,ymm10,2
+	vpxor	ymm6,ymm6,ymm9
+	xor	esi,ebx
+	add	ecx,DWORD[((-96))+r13]
+	vpxor	ymm6,ymm6,ymm10
+	lea	ecx,[rsi*1+rcx]
+	rorx	r12d,edx,27
+	rorx	esi,edx,2
+	vpaddd	ymm9,ymm6,ymm11
+	xor	edx,eax
+	add	ecx,r12d
+	xor	edx,ebp
+	vmovdqu	YMMWORD[192+rsp],ymm9
+	add	ebx,DWORD[((-92))+r13]
+	lea	ebx,[rdx*1+rbx]
+	rorx	r12d,ecx,27
+	rorx	edx,ecx,2
+	xor	ecx,esi
+	add	ebx,r12d
+	xor	ecx,eax
+	vpalignr	ymm7,ymm4,ymm3,8
+	add	ebp,DWORD[((-88))+r13]
+	lea	ebp,[rbp*1+rcx]
+	rorx	r12d,ebx,27
+	rorx	ecx,ebx,2
+	vpsrldq	ymm8,ymm6,4
+	xor	ebx,edx
+	add	ebp,r12d
+	xor	ebx,esi
+	vpxor	ymm7,ymm7,ymm3
+	vpxor	ymm8,ymm8,ymm5
+	add	eax,DWORD[((-84))+r13]
+	lea	eax,[rbx*1+rax]
+	vpxor	ymm7,ymm7,ymm8
+	rorx	r12d,ebp,27
+	rorx	ebx,ebp,2
+	xor	ebp,ecx
+	add	eax,r12d
+	vpsrld	ymm8,ymm7,31
+	xor	ebp,edx
+	add	esi,DWORD[((-64))+r13]
+	lea	esi,[rbp*1+rsi]
+	vpslldq	ymm10,ymm7,12
+	vpaddd	ymm7,ymm7,ymm7
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	vpsrld	ymm9,ymm10,30
+	vpor	ymm7,ymm7,ymm8
+	xor	eax,ebx
+	add	esi,r12d
+	vpslld	ymm10,ymm10,2
+	vpxor	ymm7,ymm7,ymm9
+	xor	eax,ecx
+	add	edx,DWORD[((-60))+r13]
+	vpxor	ymm7,ymm7,ymm10
+	lea	edx,[rax*1+rdx]
+	rorx	r12d,esi,27
+	rorx	eax,esi,2
+	vpaddd	ymm9,ymm7,ymm11
+	xor	esi,ebp
+	add	edx,r12d
+	xor	esi,ebx
+	vmovdqu	YMMWORD[224+rsp],ymm9
+	add	ecx,DWORD[((-56))+r13]
+	lea	ecx,[rsi*1+rcx]
+	rorx	r12d,edx,27
+	rorx	esi,edx,2
+	xor	edx,eax
+	add	ecx,r12d
+	xor	edx,ebp
+	add	ebx,DWORD[((-52))+r13]
+	lea	ebx,[rdx*1+rbx]
+	rorx	r12d,ecx,27
+	rorx	edx,ecx,2
+	xor	ecx,esi
+	add	ebx,r12d
+	xor	ecx,eax
+	add	ebp,DWORD[((-32))+r13]
+	lea	ebp,[rbp*1+rcx]
+	rorx	r12d,ebx,27
+	rorx	ecx,ebx,2
+	xor	ebx,edx
+	add	ebp,r12d
+	xor	ebx,esi
+	add	eax,DWORD[((-28))+r13]
+	lea	eax,[rbx*1+rax]
+	rorx	r12d,ebp,27
+	rorx	ebx,ebp,2
+	xor	ebp,ecx
+	add	eax,r12d
+	xor	ebp,edx
+	add	esi,DWORD[((-24))+r13]
+	lea	esi,[rbp*1+rsi]
+	rorx	r12d,eax,27
+	rorx	ebp,eax,2
+	xor	eax,ebx
+	add	esi,r12d
+	xor	eax,ecx
+	add	edx,DWORD[((-20))+r13]
+	lea	edx,[rax*1+rdx]
+	rorx	r12d,esi,27
+	add	edx,r12d
+	lea	r13,[128+rsp]
+
+
+	add	edx,DWORD[r8]
+	add	esi,DWORD[4+r8]
+	add	ebp,DWORD[8+r8]
+	mov	DWORD[r8],edx
+	add	ebx,DWORD[12+r8]
+	mov	DWORD[4+r8],esi
+	mov	eax,edx
+	add	ecx,DWORD[16+r8]
+	mov	r12d,ebp
+	mov	DWORD[8+r8],ebp
+	mov	edx,ebx
+
+	mov	DWORD[12+r8],ebx
+	mov	ebp,esi
+	mov	DWORD[16+r8],ecx
+
+	mov	esi,ecx
+	mov	ecx,r12d
+
+
+	cmp	r9,r10
+	jbe	NEAR $L$oop_avx2
+
+$L$done_avx2:
+	vzeroupper
+	movaps	xmm6,XMMWORD[((-40-96))+r11]
+	movaps	xmm7,XMMWORD[((-40-80))+r11]
+	movaps	xmm8,XMMWORD[((-40-64))+r11]
+	movaps	xmm9,XMMWORD[((-40-48))+r11]
+	movaps	xmm10,XMMWORD[((-40-32))+r11]
+	movaps	xmm11,XMMWORD[((-40-16))+r11]
+	mov	r14,QWORD[((-40))+r11]
+
+	mov	r13,QWORD[((-32))+r11]
+
+	mov	r12,QWORD[((-24))+r11]
+
+	mov	rbp,QWORD[((-16))+r11]
+
+	mov	rbx,QWORD[((-8))+r11]
+
+	lea	rsp,[r11]
+
+$L$epilogue_avx2:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_sha1_block_data_order_avx2:
+section	.rdata rdata align=8
+ALIGN	64
+K_XX_XX:
+	DD	0x5a827999,0x5a827999,0x5a827999,0x5a827999
+	DD	0x5a827999,0x5a827999,0x5a827999,0x5a827999
+	DD	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+	DD	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+	DD	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+	DD	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+	DD	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+	DD	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+	DD	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+	DD	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+	DB	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
+	DB	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115
+	DB	102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44
+	DB	32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60
+	DB	97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114
+	DB	103,62,0
+ALIGN	64
+section	.text
+
+EXTERN	__imp_RtlVirtualUnwind
+
+ALIGN	16
+se_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	lea	r10,[$L$prologue]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[152+r8]
+
+	lea	r10,[$L$epilogue]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[64+rax]
+
+	mov	rbx,QWORD[((-8))+rax]
+	mov	rbp,QWORD[((-16))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r14,QWORD[((-40))+rax]
+	mov	QWORD[144+r8],rbx
+	mov	QWORD[160+r8],rbp
+	mov	QWORD[216+r8],r12
+	mov	QWORD[224+r8],r13
+	mov	QWORD[232+r8],r14
+
+	jmp	NEAR $L$common_seh_tail
+
+
+ALIGN	16
+shaext_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	lea	r10,[$L$prologue_shaext]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	lea	r10,[$L$epilogue_shaext]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	lea	rsi,[((-8-64))+rax]
+	lea	rdi,[512+r8]
+	mov	ecx,8
+	DD	0xa548f3fc
+
+	jmp	NEAR $L$common_seh_tail
+
+
+ALIGN	16
+ssse3_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[208+r8]
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	lea	rsi,[((-40-96))+rax]
+	lea	rdi,[512+r8]
+	mov	ecx,12
+	DD	0xa548f3fc
+
+	mov	rbx,QWORD[((-8))+rax]
+	mov	rbp,QWORD[((-16))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r14,QWORD[((-40))+rax]
+	mov	QWORD[144+r8],rbx
+	mov	QWORD[160+r8],rbp
+	mov	QWORD[216+r8],r12
+	mov	QWORD[224+r8],r13
+	mov	QWORD[232+r8],r14
+
+$L$common_seh_tail:
+	mov	rdi,QWORD[8+rax]
+	mov	rsi,QWORD[16+rax]
+	mov	QWORD[152+r8],rax
+	mov	QWORD[168+r8],rsi
+	mov	QWORD[176+r8],rdi
+
+	mov	rdi,QWORD[40+r9]
+	mov	rsi,r8
+	mov	ecx,154
+	DD	0xa548f3fc
+
+	mov	rsi,r9
+	xor	rcx,rcx
+	mov	rdx,QWORD[8+rsi]
+	mov	r8,QWORD[rsi]
+	mov	r9,QWORD[16+rsi]
+	mov	r10,QWORD[40+rsi]
+	lea	r11,[56+rsi]
+	lea	r12,[24+rsi]
+	mov	QWORD[32+rsp],r10
+	mov	QWORD[40+rsp],r11
+	mov	QWORD[48+rsp],r12
+	mov	QWORD[56+rsp],rcx
+	call	QWORD[__imp_RtlVirtualUnwind]
+
+	mov	eax,1
+	add	rsp,64
+	popfq
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rbp
+	pop	rbx
+	pop	rdi
+	pop	rsi
+	ret
+
+
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_sha1_block_data_order_nohw wrt ..imagebase
+	DD	$L$SEH_end_sha1_block_data_order_nohw wrt ..imagebase
+	DD	$L$SEH_info_sha1_block_data_order_nohw wrt ..imagebase
+	DD	$L$SEH_begin_sha1_block_data_order_hw wrt ..imagebase
+	DD	$L$SEH_end_sha1_block_data_order_hw wrt ..imagebase
+	DD	$L$SEH_info_sha1_block_data_order_hw wrt ..imagebase
+	DD	$L$SEH_begin_sha1_block_data_order_ssse3 wrt ..imagebase
+	DD	$L$SEH_end_sha1_block_data_order_ssse3 wrt ..imagebase
+	DD	$L$SEH_info_sha1_block_data_order_ssse3 wrt ..imagebase
+	DD	$L$SEH_begin_sha1_block_data_order_avx wrt ..imagebase
+	DD	$L$SEH_end_sha1_block_data_order_avx wrt ..imagebase
+	DD	$L$SEH_info_sha1_block_data_order_avx wrt ..imagebase
+	DD	$L$SEH_begin_sha1_block_data_order_avx2 wrt ..imagebase
+	DD	$L$SEH_end_sha1_block_data_order_avx2 wrt ..imagebase
+	DD	$L$SEH_info_sha1_block_data_order_avx2 wrt ..imagebase
+section	.xdata rdata align=8
+ALIGN	8
+$L$SEH_info_sha1_block_data_order_nohw:
+	DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+$L$SEH_info_sha1_block_data_order_hw:
+	DB	9,0,0,0
+	DD	shaext_handler wrt ..imagebase
+$L$SEH_info_sha1_block_data_order_ssse3:
+	DB	9,0,0,0
+	DD	ssse3_handler wrt ..imagebase
+	DD	$L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase
+$L$SEH_info_sha1_block_data_order_avx:
+	DB	9,0,0,0
+	DD	ssse3_handler wrt ..imagebase
+	DD	$L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase
+$L$SEH_info_sha1_block_data_order_avx2:
+	DB	9,0,0,0
+	DD	ssse3_handler wrt ..imagebase
+	DD	$L$prologue_avx2 wrt ..imagebase,$L$epilogue_avx2 wrt ..imagebase
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/sha256-586-apple.S b/gen/bcm/sha256-586-apple.S
new file mode 100644
index 0000000..8e74e68
--- /dev/null
+++ b/gen/bcm/sha256-586-apple.S
@@ -0,0 +1,5593 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
+.text
+.globl	_sha256_block_data_order_nohw
+.private_extern	_sha256_block_data_order_nohw
+.align	4
+_sha256_block_data_order_nohw:
+L_sha256_block_data_order_nohw_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	%esp,%ebx
+	call	L000pic_point
+L000pic_point:
+	popl	%ebp
+	leal	LK256-L000pic_point(%ebp),%ebp
+	subl	$16,%esp
+	andl	$-64,%esp
+	shll	$6,%eax
+	addl	%edi,%eax
+	movl	%esi,(%esp)
+	movl	%edi,4(%esp)
+	movl	%eax,8(%esp)
+	movl	%ebx,12(%esp)
+L001no_xmm:
+	subl	%edi,%eax
+	cmpl	$256,%eax
+	jae	L002unrolled
+	jmp	L003loop
+.align	4,0x90
+L003loop:
+	movl	(%edi),%eax
+	movl	4(%edi),%ebx
+	movl	8(%edi),%ecx
+	bswap	%eax
+	movl	12(%edi),%edx
+	bswap	%ebx
+	pushl	%eax
+	bswap	%ecx
+	pushl	%ebx
+	bswap	%edx
+	pushl	%ecx
+	pushl	%edx
+	movl	16(%edi),%eax
+	movl	20(%edi),%ebx
+	movl	24(%edi),%ecx
+	bswap	%eax
+	movl	28(%edi),%edx
+	bswap	%ebx
+	pushl	%eax
+	bswap	%ecx
+	pushl	%ebx
+	bswap	%edx
+	pushl	%ecx
+	pushl	%edx
+	movl	32(%edi),%eax
+	movl	36(%edi),%ebx
+	movl	40(%edi),%ecx
+	bswap	%eax
+	movl	44(%edi),%edx
+	bswap	%ebx
+	pushl	%eax
+	bswap	%ecx
+	pushl	%ebx
+	bswap	%edx
+	pushl	%ecx
+	pushl	%edx
+	movl	48(%edi),%eax
+	movl	52(%edi),%ebx
+	movl	56(%edi),%ecx
+	bswap	%eax
+	movl	60(%edi),%edx
+	bswap	%ebx
+	pushl	%eax
+	bswap	%ecx
+	pushl	%ebx
+	bswap	%edx
+	pushl	%ecx
+	pushl	%edx
+	addl	$64,%edi
+	leal	-36(%esp),%esp
+	movl	%edi,104(%esp)
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	8(%esi),%ecx
+	movl	12(%esi),%edi
+	movl	%ebx,8(%esp)
+	xorl	%ecx,%ebx
+	movl	%ecx,12(%esp)
+	movl	%edi,16(%esp)
+	movl	%ebx,(%esp)
+	movl	16(%esi),%edx
+	movl	20(%esi),%ebx
+	movl	24(%esi),%ecx
+	movl	28(%esi),%edi
+	movl	%ebx,24(%esp)
+	movl	%ecx,28(%esp)
+	movl	%edi,32(%esp)
+.align	4,0x90
+L00400_15:
+	movl	%edx,%ecx
+	movl	24(%esp),%esi
+	rorl	$14,%ecx
+	movl	28(%esp),%edi
+	xorl	%edx,%ecx
+	xorl	%edi,%esi
+	movl	96(%esp),%ebx
+	rorl	$5,%ecx
+	andl	%edx,%esi
+	movl	%edx,20(%esp)
+	xorl	%ecx,%edx
+	addl	32(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%esi,%ebx
+	rorl	$9,%ecx
+	addl	%edx,%ebx
+	movl	8(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,4(%esp)
+	leal	-4(%esp),%esp
+	rorl	$11,%ecx
+	movl	(%ebp),%esi
+	xorl	%eax,%ecx
+	movl	20(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%esi,%ebx
+	movl	%eax,(%esp)
+	addl	%ebx,%edx
+	andl	4(%esp),%eax
+	addl	%ecx,%ebx
+	xorl	%edi,%eax
+	addl	$4,%ebp
+	addl	%ebx,%eax
+	cmpl	$3248222580,%esi
+	jne	L00400_15
+	movl	156(%esp),%ecx
+	jmp	L00516_63
+.align	4,0x90
+L00516_63:
+	movl	%ecx,%ebx
+	movl	104(%esp),%esi
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	160(%esp),%ebx
+	shrl	$10,%edi
+	addl	124(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	24(%esp),%esi
+	rorl	$14,%ecx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%edx,%ecx
+	xorl	%edi,%esi
+	movl	%ebx,96(%esp)
+	rorl	$5,%ecx
+	andl	%edx,%esi
+	movl	%edx,20(%esp)
+	xorl	%ecx,%edx
+	addl	32(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%esi,%ebx
+	rorl	$9,%ecx
+	addl	%edx,%ebx
+	movl	8(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,4(%esp)
+	leal	-4(%esp),%esp
+	rorl	$11,%ecx
+	movl	(%ebp),%esi
+	xorl	%eax,%ecx
+	movl	20(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%esi,%ebx
+	movl	%eax,(%esp)
+	addl	%ebx,%edx
+	andl	4(%esp),%eax
+	addl	%ecx,%ebx
+	xorl	%edi,%eax
+	movl	156(%esp),%ecx
+	addl	$4,%ebp
+	addl	%ebx,%eax
+	cmpl	$3329325298,%esi
+	jne	L00516_63
+	movl	356(%esp),%esi
+	movl	8(%esp),%ebx
+	movl	16(%esp),%ecx
+	addl	(%esi),%eax
+	addl	4(%esi),%ebx
+	addl	8(%esi),%edi
+	addl	12(%esi),%ecx
+	movl	%eax,(%esi)
+	movl	%ebx,4(%esi)
+	movl	%edi,8(%esi)
+	movl	%ecx,12(%esi)
+	movl	24(%esp),%eax
+	movl	28(%esp),%ebx
+	movl	32(%esp),%ecx
+	movl	360(%esp),%edi
+	addl	16(%esi),%edx
+	addl	20(%esi),%eax
+	addl	24(%esi),%ebx
+	addl	28(%esi),%ecx
+	movl	%edx,16(%esi)
+	movl	%eax,20(%esi)
+	movl	%ebx,24(%esi)
+	movl	%ecx,28(%esi)
+	leal	356(%esp),%esp
+	subl	$256,%ebp
+	cmpl	8(%esp),%edi
+	jb	L003loop
+	movl	12(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	6,0x90
+LK256:
+.long	1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298
+.long	66051,67438087,134810123,202182159
+.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
+.byte	110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
+.byte	67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+.byte	112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+.byte	62,0
+.align	4,0x90
+L002unrolled:
+	leal	-96(%esp),%esp
+	movl	(%esi),%eax
+	movl	4(%esi),%ebp
+	movl	8(%esi),%ecx
+	movl	12(%esi),%ebx
+	movl	%ebp,4(%esp)
+	xorl	%ecx,%ebp
+	movl	%ecx,8(%esp)
+	movl	%ebx,12(%esp)
+	movl	16(%esi),%edx
+	movl	20(%esi),%ebx
+	movl	24(%esi),%ecx
+	movl	28(%esi),%esi
+	movl	%ebx,20(%esp)
+	movl	%ecx,24(%esp)
+	movl	%esi,28(%esp)
+	jmp	L006grand_loop
+.align	4,0x90
+L006grand_loop:
+	movl	(%edi),%ebx
+	movl	4(%edi),%ecx
+	bswap	%ebx
+	movl	8(%edi),%esi
+	bswap	%ecx
+	movl	%ebx,32(%esp)
+	bswap	%esi
+	movl	%ecx,36(%esp)
+	movl	%esi,40(%esp)
+	movl	12(%edi),%ebx
+	movl	16(%edi),%ecx
+	bswap	%ebx
+	movl	20(%edi),%esi
+	bswap	%ecx
+	movl	%ebx,44(%esp)
+	bswap	%esi
+	movl	%ecx,48(%esp)
+	movl	%esi,52(%esp)
+	movl	24(%edi),%ebx
+	movl	28(%edi),%ecx
+	bswap	%ebx
+	movl	32(%edi),%esi
+	bswap	%ecx
+	movl	%ebx,56(%esp)
+	bswap	%esi
+	movl	%ecx,60(%esp)
+	movl	%esi,64(%esp)
+	movl	36(%edi),%ebx
+	movl	40(%edi),%ecx
+	bswap	%ebx
+	movl	44(%edi),%esi
+	bswap	%ecx
+	movl	%ebx,68(%esp)
+	bswap	%esi
+	movl	%ecx,72(%esp)
+	movl	%esi,76(%esp)
+	movl	48(%edi),%ebx
+	movl	52(%edi),%ecx
+	bswap	%ebx
+	movl	56(%edi),%esi
+	bswap	%ecx
+	movl	%ebx,80(%esp)
+	bswap	%esi
+	movl	%ecx,84(%esp)
+	movl	%esi,88(%esp)
+	movl	60(%edi),%ebx
+	addl	$64,%edi
+	bswap	%ebx
+	movl	%edi,100(%esp)
+	movl	%ebx,92(%esp)
+	movl	%edx,%ecx
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	32(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1116352408(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	36(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1899447441(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	40(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3049323471(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	44(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3921009573(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	48(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	961987163(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	52(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1508970993(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	56(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2453635748(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	60(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2870763221(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	64(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3624381080(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	68(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	310598401(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	72(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	607225278(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	76(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1426881987(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	80(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1925078388(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	84(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2162078206(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	88(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2614888103(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	92(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3248222580(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	36(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	88(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	32(%esp),%ebx
+	shrl	$10,%edi
+	addl	68(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,32(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3835390401(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	40(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	92(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	36(%esp),%ebx
+	shrl	$10,%edi
+	addl	72(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,36(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	4022224774(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	44(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	32(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	40(%esp),%ebx
+	shrl	$10,%edi
+	addl	76(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,40(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	264347078(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	48(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	36(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	44(%esp),%ebx
+	shrl	$10,%edi
+	addl	80(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,44(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	604807628(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	52(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	40(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	48(%esp),%ebx
+	shrl	$10,%edi
+	addl	84(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,48(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	770255983(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	56(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	44(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	52(%esp),%ebx
+	shrl	$10,%edi
+	addl	88(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,52(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1249150122(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	60(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	48(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	56(%esp),%ebx
+	shrl	$10,%edi
+	addl	92(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,56(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1555081692(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	64(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	52(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	60(%esp),%ebx
+	shrl	$10,%edi
+	addl	32(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,60(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1996064986(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	68(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	56(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	64(%esp),%ebx
+	shrl	$10,%edi
+	addl	36(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,64(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2554220882(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	72(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	60(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	68(%esp),%ebx
+	shrl	$10,%edi
+	addl	40(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,68(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2821834349(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	76(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	64(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	72(%esp),%ebx
+	shrl	$10,%edi
+	addl	44(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,72(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2952996808(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	80(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	68(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	76(%esp),%ebx
+	shrl	$10,%edi
+	addl	48(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,76(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3210313671(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	84(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	72(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	80(%esp),%ebx
+	shrl	$10,%edi
+	addl	52(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,80(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3336571891(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	88(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	76(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	84(%esp),%ebx
+	shrl	$10,%edi
+	addl	56(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,84(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3584528711(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	92(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	80(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	88(%esp),%ebx
+	shrl	$10,%edi
+	addl	60(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,88(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	113926993(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	32(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	84(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	92(%esp),%ebx
+	shrl	$10,%edi
+	addl	64(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,92(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	338241895(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	36(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	88(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	32(%esp),%ebx
+	shrl	$10,%edi
+	addl	68(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,32(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	666307205(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	40(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	92(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	36(%esp),%ebx
+	shrl	$10,%edi
+	addl	72(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,36(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	773529912(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	44(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	32(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	40(%esp),%ebx
+	shrl	$10,%edi
+	addl	76(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,40(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1294757372(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	48(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	36(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	44(%esp),%ebx
+	shrl	$10,%edi
+	addl	80(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,44(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1396182291(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	52(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	40(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	48(%esp),%ebx
+	shrl	$10,%edi
+	addl	84(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,48(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1695183700(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	56(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	44(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	52(%esp),%ebx
+	shrl	$10,%edi
+	addl	88(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,52(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1986661051(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	60(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	48(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	56(%esp),%ebx
+	shrl	$10,%edi
+	addl	92(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,56(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2177026350(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	64(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	52(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	60(%esp),%ebx
+	shrl	$10,%edi
+	addl	32(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,60(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2456956037(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	68(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	56(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	64(%esp),%ebx
+	shrl	$10,%edi
+	addl	36(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,64(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2730485921(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	72(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	60(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	68(%esp),%ebx
+	shrl	$10,%edi
+	addl	40(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,68(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2820302411(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	76(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	64(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	72(%esp),%ebx
+	shrl	$10,%edi
+	addl	44(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,72(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3259730800(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	80(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	68(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	76(%esp),%ebx
+	shrl	$10,%edi
+	addl	48(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,76(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3345764771(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	84(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	72(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	80(%esp),%ebx
+	shrl	$10,%edi
+	addl	52(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,80(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3516065817(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	88(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	76(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	84(%esp),%ebx
+	shrl	$10,%edi
+	addl	56(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,84(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3600352804(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	92(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	80(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	88(%esp),%ebx
+	shrl	$10,%edi
+	addl	60(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,88(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	4094571909(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	32(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	84(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	92(%esp),%ebx
+	shrl	$10,%edi
+	addl	64(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,92(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	275423344(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	36(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	88(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	32(%esp),%ebx
+	shrl	$10,%edi
+	addl	68(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,32(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	430227734(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	40(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	92(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	36(%esp),%ebx
+	shrl	$10,%edi
+	addl	72(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,36(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	506948616(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	44(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	32(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	40(%esp),%ebx
+	shrl	$10,%edi
+	addl	76(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,40(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	659060556(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	48(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	36(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	44(%esp),%ebx
+	shrl	$10,%edi
+	addl	80(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,44(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	883997877(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	52(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	40(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	48(%esp),%ebx
+	shrl	$10,%edi
+	addl	84(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,48(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	958139571(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	56(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	44(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	52(%esp),%ebx
+	shrl	$10,%edi
+	addl	88(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,52(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1322822218(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	60(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	48(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	56(%esp),%ebx
+	shrl	$10,%edi
+	addl	92(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,56(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1537002063(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	64(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	52(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	60(%esp),%ebx
+	shrl	$10,%edi
+	addl	32(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,60(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1747873779(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	68(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	56(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	64(%esp),%ebx
+	shrl	$10,%edi
+	addl	36(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,64(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1955562222(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	72(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	60(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	68(%esp),%ebx
+	shrl	$10,%edi
+	addl	40(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,68(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2024104815(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	76(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	64(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	72(%esp),%ebx
+	shrl	$10,%edi
+	addl	44(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,72(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2227730452(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	80(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	68(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	76(%esp),%ebx
+	shrl	$10,%edi
+	addl	48(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,76(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2361852424(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	84(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	72(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	80(%esp),%ebx
+	shrl	$10,%edi
+	addl	52(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,80(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2428436474(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	88(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	76(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	84(%esp),%ebx
+	shrl	$10,%edi
+	addl	56(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,84(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2756734187(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	92(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	80(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	88(%esp),%ebx
+	shrl	$10,%edi
+	addl	60(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3204031479(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	32(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	84(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	92(%esp),%ebx
+	shrl	$10,%edi
+	addl	64(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3329325298(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	96(%esp),%esi
+	xorl	%edi,%ebp
+	movl	12(%esp),%ecx
+	addl	(%esi),%eax
+	addl	4(%esi),%ebp
+	addl	8(%esi),%edi
+	addl	12(%esi),%ecx
+	movl	%eax,(%esi)
+	movl	%ebp,4(%esi)
+	movl	%edi,8(%esi)
+	movl	%ecx,12(%esi)
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	movl	%edi,8(%esp)
+	movl	%ecx,12(%esp)
+	movl	20(%esp),%edi
+	movl	24(%esp),%ebx
+	movl	28(%esp),%ecx
+	addl	16(%esi),%edx
+	addl	20(%esi),%edi
+	addl	24(%esi),%ebx
+	addl	28(%esi),%ecx
+	movl	%edx,16(%esi)
+	movl	%edi,20(%esi)
+	movl	%ebx,24(%esi)
+	movl	%ecx,28(%esi)
+	movl	%edi,20(%esp)
+	movl	100(%esp),%edi
+	movl	%ebx,24(%esp)
+	movl	%ecx,28(%esp)
+	cmpl	104(%esp),%edi
+	jb	L006grand_loop
+	movl	108(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.globl	_sha256_block_data_order_ssse3
+.private_extern	_sha256_block_data_order_ssse3
+.align	4
+_sha256_block_data_order_ssse3:
+L_sha256_block_data_order_ssse3_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	%esp,%ebx
+	call	L007pic_point
+L007pic_point:
+	popl	%ebp
+	leal	LK256-L007pic_point(%ebp),%ebp
+	subl	$16,%esp
+	andl	$-64,%esp
+	shll	$6,%eax
+	addl	%edi,%eax
+	movl	%esi,(%esp)
+	movl	%edi,4(%esp)
+	movl	%eax,8(%esp)
+	movl	%ebx,12(%esp)
+	leal	-96(%esp),%esp
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	8(%esi),%ecx
+	movl	12(%esi),%edi
+	movl	%ebx,4(%esp)
+	xorl	%ecx,%ebx
+	movl	%ecx,8(%esp)
+	movl	%edi,12(%esp)
+	movl	16(%esi),%edx
+	movl	20(%esi),%edi
+	movl	24(%esi),%ecx
+	movl	28(%esi),%esi
+	movl	%edi,20(%esp)
+	movl	100(%esp),%edi
+	movl	%ecx,24(%esp)
+	movl	%esi,28(%esp)
+	movdqa	256(%ebp),%xmm7
+	jmp	L008grand_ssse3
+.align	4,0x90
+L008grand_ssse3:
+	movdqu	(%edi),%xmm0
+	movdqu	16(%edi),%xmm1
+	movdqu	32(%edi),%xmm2
+	movdqu	48(%edi),%xmm3
+	addl	$64,%edi
+.byte	102,15,56,0,199
+	movl	%edi,100(%esp)
+.byte	102,15,56,0,207
+	movdqa	(%ebp),%xmm4
+.byte	102,15,56,0,215
+	movdqa	16(%ebp),%xmm5
+	paddd	%xmm0,%xmm4
+.byte	102,15,56,0,223
+	movdqa	32(%ebp),%xmm6
+	paddd	%xmm1,%xmm5
+	movdqa	48(%ebp),%xmm7
+	movdqa	%xmm4,32(%esp)
+	paddd	%xmm2,%xmm6
+	movdqa	%xmm5,48(%esp)
+	paddd	%xmm3,%xmm7
+	movdqa	%xmm6,64(%esp)
+	movdqa	%xmm7,80(%esp)
+	jmp	L009ssse3_00_47
+.align	4,0x90
+L009ssse3_00_47:
+	addl	$64,%ebp
+	movl	%edx,%ecx
+	movdqa	%xmm1,%xmm4
+	rorl	$14,%edx
+	movl	20(%esp),%esi
+	movdqa	%xmm3,%xmm7
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+.byte	102,15,58,15,224,4
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+.byte	102,15,58,15,250,4
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	movdqa	%xmm4,%xmm5
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	movdqa	%xmm4,%xmm6
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	psrld	$3,%xmm4
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm0
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	psrld	$7,%xmm6
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	pshufd	$250,%xmm3,%xmm7
+	xorl	%esi,%ecx
+	addl	32(%esp),%edx
+	pslld	$14,%xmm5
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm4
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	psrld	$11,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm5,%xmm4
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	pslld	$11,%xmm5
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	pxor	%xmm6,%xmm4
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	movdqa	%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	pxor	%xmm5,%xmm4
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	psrld	$10,%xmm7
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm4,%xmm0
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	psrlq	$17,%xmm6
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	rorl	$11,%ecx
+	pxor	%xmm6,%xmm7
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	psrlq	$2,%xmm6
+	addl	36(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	pshufd	$128,%xmm7,%xmm7
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	psrldq	$8,%xmm7
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	paddd	%xmm7,%xmm0
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,24(%esp)
+	pshufd	$80,%xmm0,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	movdqa	%xmm7,%xmm6
+	rorl	$11,%ecx
+	psrld	$10,%xmm7
+	andl	%eax,%ebx
+	psrlq	$17,%xmm6
+	xorl	%esi,%ecx
+	addl	40(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	psrlq	$2,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm6,%xmm7
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	movdqa	(%ebp),%xmm6
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	pslldq	$8,%xmm7
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm0
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	paddd	%xmm0,%xmm6
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	44(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movdqa	%xmm6,32(%esp)
+	movl	%edx,%ecx
+	movdqa	%xmm2,%xmm4
+	rorl	$14,%edx
+	movl	4(%esp),%esi
+	movdqa	%xmm0,%xmm7
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+.byte	102,15,58,15,225,4
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+.byte	102,15,58,15,251,4
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	movdqa	%xmm4,%xmm5
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	movdqa	%xmm4,%xmm6
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	psrld	$3,%xmm4
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm1
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	psrld	$7,%xmm6
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	pshufd	$250,%xmm0,%xmm7
+	xorl	%esi,%ecx
+	addl	48(%esp),%edx
+	pslld	$14,%xmm5
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm4
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	psrld	$11,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm5,%xmm4
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	pslld	$11,%xmm5
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	pxor	%xmm6,%xmm4
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	movdqa	%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	pxor	%xmm5,%xmm4
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	psrld	$10,%xmm7
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm4,%xmm1
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	psrlq	$17,%xmm6
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	rorl	$11,%ecx
+	pxor	%xmm6,%xmm7
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	psrlq	$2,%xmm6
+	addl	52(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	pshufd	$128,%xmm7,%xmm7
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	psrldq	$8,%xmm7
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	paddd	%xmm7,%xmm1
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,8(%esp)
+	pshufd	$80,%xmm1,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	movdqa	%xmm7,%xmm6
+	rorl	$11,%ecx
+	psrld	$10,%xmm7
+	andl	%eax,%ebx
+	psrlq	$17,%xmm6
+	xorl	%esi,%ecx
+	addl	56(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	psrlq	$2,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm6,%xmm7
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	movdqa	16(%ebp),%xmm6
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	pslldq	$8,%xmm7
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm1
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	paddd	%xmm1,%xmm6
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	60(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movdqa	%xmm6,48(%esp)
+	movl	%edx,%ecx
+	movdqa	%xmm3,%xmm4
+	rorl	$14,%edx
+	movl	20(%esp),%esi
+	movdqa	%xmm1,%xmm7
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+.byte	102,15,58,15,226,4
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+.byte	102,15,58,15,248,4
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	movdqa	%xmm4,%xmm5
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	movdqa	%xmm4,%xmm6
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	psrld	$3,%xmm4
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm2
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	psrld	$7,%xmm6
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	pshufd	$250,%xmm1,%xmm7
+	xorl	%esi,%ecx
+	addl	64(%esp),%edx
+	pslld	$14,%xmm5
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm4
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	psrld	$11,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm5,%xmm4
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	pslld	$11,%xmm5
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	pxor	%xmm6,%xmm4
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	movdqa	%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	pxor	%xmm5,%xmm4
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	psrld	$10,%xmm7
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm4,%xmm2
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	psrlq	$17,%xmm6
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	rorl	$11,%ecx
+	pxor	%xmm6,%xmm7
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	psrlq	$2,%xmm6
+	addl	68(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	pshufd	$128,%xmm7,%xmm7
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	psrldq	$8,%xmm7
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	paddd	%xmm7,%xmm2
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,24(%esp)
+	pshufd	$80,%xmm2,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	movdqa	%xmm7,%xmm6
+	rorl	$11,%ecx
+	psrld	$10,%xmm7
+	andl	%eax,%ebx
+	psrlq	$17,%xmm6
+	xorl	%esi,%ecx
+	addl	72(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	psrlq	$2,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm6,%xmm7
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	movdqa	32(%ebp),%xmm6
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	pslldq	$8,%xmm7
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm2
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	paddd	%xmm2,%xmm6
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	76(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movdqa	%xmm6,64(%esp)
+	movl	%edx,%ecx
+	movdqa	%xmm0,%xmm4
+	rorl	$14,%edx
+	movl	4(%esp),%esi
+	movdqa	%xmm2,%xmm7
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+.byte	102,15,58,15,227,4
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+.byte	102,15,58,15,249,4
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	movdqa	%xmm4,%xmm5
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	movdqa	%xmm4,%xmm6
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	psrld	$3,%xmm4
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm3
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	psrld	$7,%xmm6
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	pshufd	$250,%xmm2,%xmm7
+	xorl	%esi,%ecx
+	addl	80(%esp),%edx
+	pslld	$14,%xmm5
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm4
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	psrld	$11,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm5,%xmm4
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	pslld	$11,%xmm5
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	pxor	%xmm6,%xmm4
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	movdqa	%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	pxor	%xmm5,%xmm4
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	psrld	$10,%xmm7
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm4,%xmm3
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	psrlq	$17,%xmm6
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	rorl	$11,%ecx
+	pxor	%xmm6,%xmm7
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	psrlq	$2,%xmm6
+	addl	84(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	pshufd	$128,%xmm7,%xmm7
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	psrldq	$8,%xmm7
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	paddd	%xmm7,%xmm3
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,8(%esp)
+	pshufd	$80,%xmm3,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	movdqa	%xmm7,%xmm6
+	rorl	$11,%ecx
+	psrld	$10,%xmm7
+	andl	%eax,%ebx
+	psrlq	$17,%xmm6
+	xorl	%esi,%ecx
+	addl	88(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	psrlq	$2,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm6,%xmm7
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	movdqa	48(%ebp),%xmm6
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	pslldq	$8,%xmm7
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm3
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	paddd	%xmm3,%xmm6
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	92(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movdqa	%xmm6,80(%esp)
+	cmpl	$66051,64(%ebp)
+	jne	L009ssse3_00_47
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	20(%esp),%esi
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	32(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	36(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,24(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	40(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	44(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	4(%esp),%esi
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	48(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	52(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,8(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	56(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	60(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	20(%esp),%esi
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	64(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	68(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,24(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	72(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	76(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	4(%esp),%esi
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	80(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	84(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,8(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	88(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	92(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movl	96(%esp),%esi
+	xorl	%edi,%ebx
+	movl	12(%esp),%ecx
+	addl	(%esi),%eax
+	addl	4(%esi),%ebx
+	addl	8(%esi),%edi
+	addl	12(%esi),%ecx
+	movl	%eax,(%esi)
+	movl	%ebx,4(%esi)
+	movl	%edi,8(%esi)
+	movl	%ecx,12(%esi)
+	movl	%ebx,4(%esp)
+	xorl	%edi,%ebx
+	movl	%edi,8(%esp)
+	movl	%ecx,12(%esp)
+	movl	20(%esp),%edi
+	movl	24(%esp),%ecx
+	addl	16(%esi),%edx
+	addl	20(%esi),%edi
+	addl	24(%esi),%ecx
+	movl	%edx,16(%esi)
+	movl	%edi,20(%esi)
+	movl	%edi,20(%esp)
+	movl	28(%esp),%edi
+	movl	%ecx,24(%esi)
+	addl	28(%esi),%edi
+	movl	%ecx,24(%esp)
+	movl	%edi,28(%esi)
+	movl	%edi,28(%esp)
+	movl	100(%esp),%edi
+	movdqa	64(%ebp),%xmm7
+	subl	$192,%ebp
+	cmpl	104(%esp),%edi
+	jb	L008grand_ssse3
+	movl	108(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.globl	_sha256_block_data_order_avx
+.private_extern	_sha256_block_data_order_avx
+.align	4
+_sha256_block_data_order_avx:
+L_sha256_block_data_order_avx_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	%esp,%ebx
+	call	L010pic_point
+L010pic_point:
+	popl	%ebp
+	leal	LK256-L010pic_point(%ebp),%ebp
+	subl	$16,%esp
+	andl	$-64,%esp
+	shll	$6,%eax
+	addl	%edi,%eax
+	movl	%esi,(%esp)
+	movl	%edi,4(%esp)
+	movl	%eax,8(%esp)
+	movl	%ebx,12(%esp)
+	leal	-96(%esp),%esp
+	vzeroall
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	8(%esi),%ecx
+	movl	12(%esi),%edi
+	movl	%ebx,4(%esp)
+	xorl	%ecx,%ebx
+	movl	%ecx,8(%esp)
+	movl	%edi,12(%esp)
+	movl	16(%esi),%edx
+	movl	20(%esi),%edi
+	movl	24(%esi),%ecx
+	movl	28(%esi),%esi
+	movl	%edi,20(%esp)
+	movl	100(%esp),%edi
+	movl	%ecx,24(%esp)
+	movl	%esi,28(%esp)
+	vmovdqa	256(%ebp),%xmm7
+	jmp	L011grand_avx
+.align	5,0x90
+L011grand_avx:
+	vmovdqu	(%edi),%xmm0
+	vmovdqu	16(%edi),%xmm1
+	vmovdqu	32(%edi),%xmm2
+	vmovdqu	48(%edi),%xmm3
+	addl	$64,%edi
+	vpshufb	%xmm7,%xmm0,%xmm0
+	movl	%edi,100(%esp)
+	vpshufb	%xmm7,%xmm1,%xmm1
+	vpshufb	%xmm7,%xmm2,%xmm2
+	vpaddd	(%ebp),%xmm0,%xmm4
+	vpshufb	%xmm7,%xmm3,%xmm3
+	vpaddd	16(%ebp),%xmm1,%xmm5
+	vpaddd	32(%ebp),%xmm2,%xmm6
+	vpaddd	48(%ebp),%xmm3,%xmm7
+	vmovdqa	%xmm4,32(%esp)
+	vmovdqa	%xmm5,48(%esp)
+	vmovdqa	%xmm6,64(%esp)
+	vmovdqa	%xmm7,80(%esp)
+	jmp	L012avx_00_47
+.align	4,0x90
+L012avx_00_47:
+	addl	$64,%ebp
+	vpalignr	$4,%xmm0,%xmm1,%xmm4
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	20(%esp),%esi
+	vpalignr	$4,%xmm2,%xmm3,%xmm7
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	vpaddd	%xmm7,%xmm0,%xmm0
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrld	$3,%xmm4,%xmm7
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	vpslld	$14,%xmm4,%xmm5
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,(%esp)
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	vpshufd	$250,%xmm3,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpsrld	$11,%xmm6,%xmm6
+	addl	32(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpxor	%xmm5,%xmm4,%xmm4
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	vpslld	$11,%xmm5,%xmm5
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	16(%esp),%esi
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	vpaddd	%xmm4,%xmm0,%xmm0
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,28(%esp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	vpsrlq	$19,%xmm7,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	36(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	vpshufd	$132,%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	vpsrldq	$8,%xmm7,%xmm7
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	12(%esp),%esi
+	vpaddd	%xmm7,%xmm0,%xmm0
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	vpshufd	$80,%xmm0,%xmm7
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,24(%esp)
+	vpsrlq	$19,%xmm7,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpshufd	$232,%xmm6,%xmm7
+	addl	40(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpslldq	$8,%xmm7,%xmm7
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	vpaddd	%xmm7,%xmm0,%xmm0
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	8(%esp),%esi
+	vpaddd	(%ebp),%xmm0,%xmm6
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	44(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	vmovdqa	%xmm6,32(%esp)
+	vpalignr	$4,%xmm1,%xmm2,%xmm4
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	4(%esp),%esi
+	vpalignr	$4,%xmm3,%xmm0,%xmm7
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	vpaddd	%xmm7,%xmm1,%xmm1
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrld	$3,%xmm4,%xmm7
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	vpslld	$14,%xmm4,%xmm5
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,16(%esp)
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	vpshufd	$250,%xmm0,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpsrld	$11,%xmm6,%xmm6
+	addl	48(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpxor	%xmm5,%xmm4,%xmm4
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	vpslld	$11,%xmm5,%xmm5
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	(%esp),%esi
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	vpaddd	%xmm4,%xmm1,%xmm1
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,12(%esp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	vpsrlq	$19,%xmm7,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	52(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	vpshufd	$132,%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	vpsrldq	$8,%xmm7,%xmm7
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	28(%esp),%esi
+	vpaddd	%xmm7,%xmm1,%xmm1
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	vpshufd	$80,%xmm1,%xmm7
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,8(%esp)
+	vpsrlq	$19,%xmm7,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpshufd	$232,%xmm6,%xmm7
+	addl	56(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpslldq	$8,%xmm7,%xmm7
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	vpaddd	%xmm7,%xmm1,%xmm1
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	24(%esp),%esi
+	vpaddd	16(%ebp),%xmm1,%xmm6
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	60(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	vmovdqa	%xmm6,48(%esp)
+	vpalignr	$4,%xmm2,%xmm3,%xmm4
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	20(%esp),%esi
+	vpalignr	$4,%xmm0,%xmm1,%xmm7
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	vpaddd	%xmm7,%xmm2,%xmm2
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrld	$3,%xmm4,%xmm7
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	vpslld	$14,%xmm4,%xmm5
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,(%esp)
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	vpshufd	$250,%xmm1,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpsrld	$11,%xmm6,%xmm6
+	addl	64(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpxor	%xmm5,%xmm4,%xmm4
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	vpslld	$11,%xmm5,%xmm5
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	16(%esp),%esi
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	vpaddd	%xmm4,%xmm2,%xmm2
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,28(%esp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	vpsrlq	$19,%xmm7,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	68(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	vpshufd	$132,%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	vpsrldq	$8,%xmm7,%xmm7
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	12(%esp),%esi
+	vpaddd	%xmm7,%xmm2,%xmm2
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	vpshufd	$80,%xmm2,%xmm7
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,24(%esp)
+	vpsrlq	$19,%xmm7,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpshufd	$232,%xmm6,%xmm7
+	addl	72(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpslldq	$8,%xmm7,%xmm7
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	vpaddd	%xmm7,%xmm2,%xmm2
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	8(%esp),%esi
+	vpaddd	32(%ebp),%xmm2,%xmm6
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	76(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	vmovdqa	%xmm6,64(%esp)
+	vpalignr	$4,%xmm3,%xmm0,%xmm4
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	4(%esp),%esi
+	vpalignr	$4,%xmm1,%xmm2,%xmm7
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	vpaddd	%xmm7,%xmm3,%xmm3
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrld	$3,%xmm4,%xmm7
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	vpslld	$14,%xmm4,%xmm5
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,16(%esp)
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	vpshufd	$250,%xmm2,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpsrld	$11,%xmm6,%xmm6
+	addl	80(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpxor	%xmm5,%xmm4,%xmm4
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	vpslld	$11,%xmm5,%xmm5
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	(%esp),%esi
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	vpaddd	%xmm4,%xmm3,%xmm3
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,12(%esp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	vpsrlq	$19,%xmm7,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	84(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	vpshufd	$132,%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	vpsrldq	$8,%xmm7,%xmm7
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	28(%esp),%esi
+	vpaddd	%xmm7,%xmm3,%xmm3
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	vpshufd	$80,%xmm3,%xmm7
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,8(%esp)
+	vpsrlq	$19,%xmm7,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpshufd	$232,%xmm6,%xmm7
+	addl	88(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpslldq	$8,%xmm7,%xmm7
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	vpaddd	%xmm7,%xmm3,%xmm3
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	24(%esp),%esi
+	vpaddd	48(%ebp),%xmm3,%xmm6
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	92(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	vmovdqa	%xmm6,80(%esp)
+	cmpl	$66051,64(%ebp)
+	jne	L012avx_00_47
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	20(%esp),%esi
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	32(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	36(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,24(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	40(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	44(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	4(%esp),%esi
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	48(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	52(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,8(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	56(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	60(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	20(%esp),%esi
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	64(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	68(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,24(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	72(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	76(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	4(%esp),%esi
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	80(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	84(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,8(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	88(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	92(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movl	96(%esp),%esi
+	xorl	%edi,%ebx
+	movl	12(%esp),%ecx
+	addl	(%esi),%eax
+	addl	4(%esi),%ebx
+	addl	8(%esi),%edi
+	addl	12(%esi),%ecx
+	movl	%eax,(%esi)
+	movl	%ebx,4(%esi)
+	movl	%edi,8(%esi)
+	movl	%ecx,12(%esi)
+	movl	%ebx,4(%esp)
+	xorl	%edi,%ebx
+	movl	%edi,8(%esp)
+	movl	%ecx,12(%esp)
+	movl	20(%esp),%edi
+	movl	24(%esp),%ecx
+	addl	16(%esi),%edx
+	addl	20(%esi),%edi
+	addl	24(%esi),%ecx
+	movl	%edx,16(%esi)
+	movl	%edi,20(%esi)
+	movl	%edi,20(%esp)
+	movl	28(%esp),%edi
+	movl	%ecx,24(%esi)
+	addl	28(%esi),%edi
+	movl	%ecx,24(%esp)
+	movl	%edi,28(%esi)
+	movl	%edi,28(%esp)
+	movl	100(%esp),%edi
+	vmovdqa	64(%ebp),%xmm7
+	subl	$192,%ebp
+	cmpl	104(%esp),%edi
+	jb	L011grand_avx
+	movl	108(%esp),%esp
+	vzeroall
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/bcm/sha256-586-linux.S b/gen/bcm/sha256-586-linux.S
new file mode 100644
index 0000000..41b3759
--- /dev/null
+++ b/gen/bcm/sha256-586-linux.S
@@ -0,0 +1,5599 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
+.text
+.globl	sha256_block_data_order_nohw
+.hidden	sha256_block_data_order_nohw
+.type	sha256_block_data_order_nohw,@function
+.align	16
+sha256_block_data_order_nohw:
+.L_sha256_block_data_order_nohw_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	%esp,%ebx
+	call	.L000pic_point
+.L000pic_point:
+	popl	%ebp
+	leal	.LK256-.L000pic_point(%ebp),%ebp
+	subl	$16,%esp
+	andl	$-64,%esp
+	shll	$6,%eax
+	addl	%edi,%eax
+	movl	%esi,(%esp)
+	movl	%edi,4(%esp)
+	movl	%eax,8(%esp)
+	movl	%ebx,12(%esp)
+.L001no_xmm:
+	subl	%edi,%eax
+	cmpl	$256,%eax
+	jae	.L002unrolled
+	jmp	.L003loop
+.align	16
+.L003loop:
+	movl	(%edi),%eax
+	movl	4(%edi),%ebx
+	movl	8(%edi),%ecx
+	bswap	%eax
+	movl	12(%edi),%edx
+	bswap	%ebx
+	pushl	%eax
+	bswap	%ecx
+	pushl	%ebx
+	bswap	%edx
+	pushl	%ecx
+	pushl	%edx
+	movl	16(%edi),%eax
+	movl	20(%edi),%ebx
+	movl	24(%edi),%ecx
+	bswap	%eax
+	movl	28(%edi),%edx
+	bswap	%ebx
+	pushl	%eax
+	bswap	%ecx
+	pushl	%ebx
+	bswap	%edx
+	pushl	%ecx
+	pushl	%edx
+	movl	32(%edi),%eax
+	movl	36(%edi),%ebx
+	movl	40(%edi),%ecx
+	bswap	%eax
+	movl	44(%edi),%edx
+	bswap	%ebx
+	pushl	%eax
+	bswap	%ecx
+	pushl	%ebx
+	bswap	%edx
+	pushl	%ecx
+	pushl	%edx
+	movl	48(%edi),%eax
+	movl	52(%edi),%ebx
+	movl	56(%edi),%ecx
+	bswap	%eax
+	movl	60(%edi),%edx
+	bswap	%ebx
+	pushl	%eax
+	bswap	%ecx
+	pushl	%ebx
+	bswap	%edx
+	pushl	%ecx
+	pushl	%edx
+	addl	$64,%edi
+	leal	-36(%esp),%esp
+	movl	%edi,104(%esp)
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	8(%esi),%ecx
+	movl	12(%esi),%edi
+	movl	%ebx,8(%esp)
+	xorl	%ecx,%ebx
+	movl	%ecx,12(%esp)
+	movl	%edi,16(%esp)
+	movl	%ebx,(%esp)
+	movl	16(%esi),%edx
+	movl	20(%esi),%ebx
+	movl	24(%esi),%ecx
+	movl	28(%esi),%edi
+	movl	%ebx,24(%esp)
+	movl	%ecx,28(%esp)
+	movl	%edi,32(%esp)
+.align	16
+.L00400_15:
+	movl	%edx,%ecx
+	movl	24(%esp),%esi
+	rorl	$14,%ecx
+	movl	28(%esp),%edi
+	xorl	%edx,%ecx
+	xorl	%edi,%esi
+	movl	96(%esp),%ebx
+	rorl	$5,%ecx
+	andl	%edx,%esi
+	movl	%edx,20(%esp)
+	xorl	%ecx,%edx
+	addl	32(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%esi,%ebx
+	rorl	$9,%ecx
+	addl	%edx,%ebx
+	movl	8(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,4(%esp)
+	leal	-4(%esp),%esp
+	rorl	$11,%ecx
+	movl	(%ebp),%esi
+	xorl	%eax,%ecx
+	movl	20(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%esi,%ebx
+	movl	%eax,(%esp)
+	addl	%ebx,%edx
+	andl	4(%esp),%eax
+	addl	%ecx,%ebx
+	xorl	%edi,%eax
+	addl	$4,%ebp
+	addl	%ebx,%eax
+	cmpl	$3248222580,%esi
+	jne	.L00400_15
+	movl	156(%esp),%ecx
+	jmp	.L00516_63
+.align	16
+.L00516_63:
+	movl	%ecx,%ebx
+	movl	104(%esp),%esi
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	160(%esp),%ebx
+	shrl	$10,%edi
+	addl	124(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	24(%esp),%esi
+	rorl	$14,%ecx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%edx,%ecx
+	xorl	%edi,%esi
+	movl	%ebx,96(%esp)
+	rorl	$5,%ecx
+	andl	%edx,%esi
+	movl	%edx,20(%esp)
+	xorl	%ecx,%edx
+	addl	32(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%esi,%ebx
+	rorl	$9,%ecx
+	addl	%edx,%ebx
+	movl	8(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,4(%esp)
+	leal	-4(%esp),%esp
+	rorl	$11,%ecx
+	movl	(%ebp),%esi
+	xorl	%eax,%ecx
+	movl	20(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%esi,%ebx
+	movl	%eax,(%esp)
+	addl	%ebx,%edx
+	andl	4(%esp),%eax
+	addl	%ecx,%ebx
+	xorl	%edi,%eax
+	movl	156(%esp),%ecx
+	addl	$4,%ebp
+	addl	%ebx,%eax
+	cmpl	$3329325298,%esi
+	jne	.L00516_63
+	movl	356(%esp),%esi
+	movl	8(%esp),%ebx
+	movl	16(%esp),%ecx
+	addl	(%esi),%eax
+	addl	4(%esi),%ebx
+	addl	8(%esi),%edi
+	addl	12(%esi),%ecx
+	movl	%eax,(%esi)
+	movl	%ebx,4(%esi)
+	movl	%edi,8(%esi)
+	movl	%ecx,12(%esi)
+	movl	24(%esp),%eax
+	movl	28(%esp),%ebx
+	movl	32(%esp),%ecx
+	movl	360(%esp),%edi
+	addl	16(%esi),%edx
+	addl	20(%esi),%eax
+	addl	24(%esi),%ebx
+	addl	28(%esi),%ecx
+	movl	%edx,16(%esi)
+	movl	%eax,20(%esi)
+	movl	%ebx,24(%esi)
+	movl	%ecx,28(%esi)
+	leal	356(%esp),%esp
+	subl	$256,%ebp
+	cmpl	8(%esp),%edi
+	jb	.L003loop
+	movl	12(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	64
+.LK256:
+.long	1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298
+.long	66051,67438087,134810123,202182159
+.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
+.byte	110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
+.byte	67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+.byte	112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+.byte	62,0
+.align	16
+.L002unrolled:
+	leal	-96(%esp),%esp
+	movl	(%esi),%eax
+	movl	4(%esi),%ebp
+	movl	8(%esi),%ecx
+	movl	12(%esi),%ebx
+	movl	%ebp,4(%esp)
+	xorl	%ecx,%ebp
+	movl	%ecx,8(%esp)
+	movl	%ebx,12(%esp)
+	movl	16(%esi),%edx
+	movl	20(%esi),%ebx
+	movl	24(%esi),%ecx
+	movl	28(%esi),%esi
+	movl	%ebx,20(%esp)
+	movl	%ecx,24(%esp)
+	movl	%esi,28(%esp)
+	jmp	.L006grand_loop
+.align	16
+.L006grand_loop:
+	movl	(%edi),%ebx
+	movl	4(%edi),%ecx
+	bswap	%ebx
+	movl	8(%edi),%esi
+	bswap	%ecx
+	movl	%ebx,32(%esp)
+	bswap	%esi
+	movl	%ecx,36(%esp)
+	movl	%esi,40(%esp)
+	movl	12(%edi),%ebx
+	movl	16(%edi),%ecx
+	bswap	%ebx
+	movl	20(%edi),%esi
+	bswap	%ecx
+	movl	%ebx,44(%esp)
+	bswap	%esi
+	movl	%ecx,48(%esp)
+	movl	%esi,52(%esp)
+	movl	24(%edi),%ebx
+	movl	28(%edi),%ecx
+	bswap	%ebx
+	movl	32(%edi),%esi
+	bswap	%ecx
+	movl	%ebx,56(%esp)
+	bswap	%esi
+	movl	%ecx,60(%esp)
+	movl	%esi,64(%esp)
+	movl	36(%edi),%ebx
+	movl	40(%edi),%ecx
+	bswap	%ebx
+	movl	44(%edi),%esi
+	bswap	%ecx
+	movl	%ebx,68(%esp)
+	bswap	%esi
+	movl	%ecx,72(%esp)
+	movl	%esi,76(%esp)
+	movl	48(%edi),%ebx
+	movl	52(%edi),%ecx
+	bswap	%ebx
+	movl	56(%edi),%esi
+	bswap	%ecx
+	movl	%ebx,80(%esp)
+	bswap	%esi
+	movl	%ecx,84(%esp)
+	movl	%esi,88(%esp)
+	movl	60(%edi),%ebx
+	addl	$64,%edi
+	bswap	%ebx
+	movl	%edi,100(%esp)
+	movl	%ebx,92(%esp)
+	movl	%edx,%ecx
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	32(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1116352408(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	36(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1899447441(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	40(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3049323471(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	44(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3921009573(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	48(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	961987163(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	52(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1508970993(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	56(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2453635748(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	60(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2870763221(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	64(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3624381080(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	68(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	310598401(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	72(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	607225278(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	76(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1426881987(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	80(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1925078388(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	84(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2162078206(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	88(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2614888103(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	92(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3248222580(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	36(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	88(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	32(%esp),%ebx
+	shrl	$10,%edi
+	addl	68(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,32(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3835390401(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	40(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	92(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	36(%esp),%ebx
+	shrl	$10,%edi
+	addl	72(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,36(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	4022224774(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	44(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	32(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	40(%esp),%ebx
+	shrl	$10,%edi
+	addl	76(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,40(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	264347078(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	48(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	36(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	44(%esp),%ebx
+	shrl	$10,%edi
+	addl	80(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,44(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	604807628(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	52(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	40(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	48(%esp),%ebx
+	shrl	$10,%edi
+	addl	84(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,48(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	770255983(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	56(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	44(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	52(%esp),%ebx
+	shrl	$10,%edi
+	addl	88(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,52(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1249150122(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	60(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	48(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	56(%esp),%ebx
+	shrl	$10,%edi
+	addl	92(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,56(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1555081692(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	64(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	52(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	60(%esp),%ebx
+	shrl	$10,%edi
+	addl	32(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,60(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1996064986(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	68(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	56(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	64(%esp),%ebx
+	shrl	$10,%edi
+	addl	36(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,64(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2554220882(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	72(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	60(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	68(%esp),%ebx
+	shrl	$10,%edi
+	addl	40(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,68(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2821834349(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	76(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	64(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	72(%esp),%ebx
+	shrl	$10,%edi
+	addl	44(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,72(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2952996808(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	80(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	68(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	76(%esp),%ebx
+	shrl	$10,%edi
+	addl	48(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,76(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3210313671(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	84(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	72(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	80(%esp),%ebx
+	shrl	$10,%edi
+	addl	52(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,80(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3336571891(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	88(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	76(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	84(%esp),%ebx
+	shrl	$10,%edi
+	addl	56(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,84(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3584528711(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	92(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	80(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	88(%esp),%ebx
+	shrl	$10,%edi
+	addl	60(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,88(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	113926993(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	32(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	84(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	92(%esp),%ebx
+	shrl	$10,%edi
+	addl	64(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,92(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	338241895(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	36(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	88(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	32(%esp),%ebx
+	shrl	$10,%edi
+	addl	68(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,32(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	666307205(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	40(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	92(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	36(%esp),%ebx
+	shrl	$10,%edi
+	addl	72(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,36(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	773529912(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	44(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	32(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	40(%esp),%ebx
+	shrl	$10,%edi
+	addl	76(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,40(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1294757372(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	48(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	36(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	44(%esp),%ebx
+	shrl	$10,%edi
+	addl	80(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,44(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1396182291(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	52(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	40(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	48(%esp),%ebx
+	shrl	$10,%edi
+	addl	84(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,48(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1695183700(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	56(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	44(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	52(%esp),%ebx
+	shrl	$10,%edi
+	addl	88(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,52(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1986661051(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	60(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	48(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	56(%esp),%ebx
+	shrl	$10,%edi
+	addl	92(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,56(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2177026350(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	64(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	52(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	60(%esp),%ebx
+	shrl	$10,%edi
+	addl	32(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,60(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2456956037(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	68(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	56(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	64(%esp),%ebx
+	shrl	$10,%edi
+	addl	36(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,64(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2730485921(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	72(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	60(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	68(%esp),%ebx
+	shrl	$10,%edi
+	addl	40(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,68(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2820302411(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	76(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	64(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	72(%esp),%ebx
+	shrl	$10,%edi
+	addl	44(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,72(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3259730800(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	80(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	68(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	76(%esp),%ebx
+	shrl	$10,%edi
+	addl	48(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,76(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3345764771(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	84(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	72(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	80(%esp),%ebx
+	shrl	$10,%edi
+	addl	52(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,80(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3516065817(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	88(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	76(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	84(%esp),%ebx
+	shrl	$10,%edi
+	addl	56(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,84(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3600352804(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	92(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	80(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	88(%esp),%ebx
+	shrl	$10,%edi
+	addl	60(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,88(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	4094571909(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	32(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	84(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	92(%esp),%ebx
+	shrl	$10,%edi
+	addl	64(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,92(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	275423344(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	36(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	88(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	32(%esp),%ebx
+	shrl	$10,%edi
+	addl	68(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,32(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	430227734(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	40(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	92(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	36(%esp),%ebx
+	shrl	$10,%edi
+	addl	72(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,36(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	506948616(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	44(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	32(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	40(%esp),%ebx
+	shrl	$10,%edi
+	addl	76(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,40(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	659060556(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	48(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	36(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	44(%esp),%ebx
+	shrl	$10,%edi
+	addl	80(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,44(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	883997877(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	52(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	40(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	48(%esp),%ebx
+	shrl	$10,%edi
+	addl	84(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,48(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	958139571(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	56(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	44(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	52(%esp),%ebx
+	shrl	$10,%edi
+	addl	88(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,52(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1322822218(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	60(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	48(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	56(%esp),%ebx
+	shrl	$10,%edi
+	addl	92(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,56(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1537002063(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	64(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	52(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	60(%esp),%ebx
+	shrl	$10,%edi
+	addl	32(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,60(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1747873779(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	68(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	56(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	64(%esp),%ebx
+	shrl	$10,%edi
+	addl	36(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,64(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1955562222(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	72(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	60(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	68(%esp),%ebx
+	shrl	$10,%edi
+	addl	40(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,68(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2024104815(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	76(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	64(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	72(%esp),%ebx
+	shrl	$10,%edi
+	addl	44(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,72(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2227730452(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	80(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	68(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	76(%esp),%ebx
+	shrl	$10,%edi
+	addl	48(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,76(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2361852424(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	84(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	72(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	80(%esp),%ebx
+	shrl	$10,%edi
+	addl	52(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,80(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2428436474(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	88(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	76(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	84(%esp),%ebx
+	shrl	$10,%edi
+	addl	56(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,84(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2756734187(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	92(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	80(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	88(%esp),%ebx
+	shrl	$10,%edi
+	addl	60(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3204031479(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	32(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	84(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	92(%esp),%ebx
+	shrl	$10,%edi
+	addl	64(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3329325298(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	96(%esp),%esi
+	xorl	%edi,%ebp
+	movl	12(%esp),%ecx
+	addl	(%esi),%eax
+	addl	4(%esi),%ebp
+	addl	8(%esi),%edi
+	addl	12(%esi),%ecx
+	movl	%eax,(%esi)
+	movl	%ebp,4(%esi)
+	movl	%edi,8(%esi)
+	movl	%ecx,12(%esi)
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	movl	%edi,8(%esp)
+	movl	%ecx,12(%esp)
+	movl	20(%esp),%edi
+	movl	24(%esp),%ebx
+	movl	28(%esp),%ecx
+	addl	16(%esi),%edx
+	addl	20(%esi),%edi
+	addl	24(%esi),%ebx
+	addl	28(%esi),%ecx
+	movl	%edx,16(%esi)
+	movl	%edi,20(%esi)
+	movl	%ebx,24(%esi)
+	movl	%ecx,28(%esi)
+	movl	%edi,20(%esp)
+	movl	100(%esp),%edi
+	movl	%ebx,24(%esp)
+	movl	%ecx,28(%esp)
+	cmpl	104(%esp),%edi
+	jb	.L006grand_loop
+	movl	108(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	sha256_block_data_order_nohw,.-.L_sha256_block_data_order_nohw_begin
+.globl	sha256_block_data_order_ssse3
+.hidden	sha256_block_data_order_ssse3
+.type	sha256_block_data_order_ssse3,@function
+.align	16
+sha256_block_data_order_ssse3:
+.L_sha256_block_data_order_ssse3_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	%esp,%ebx
+	call	.L007pic_point
+.L007pic_point:
+	popl	%ebp
+	leal	.LK256-.L007pic_point(%ebp),%ebp
+	subl	$16,%esp
+	andl	$-64,%esp
+	shll	$6,%eax
+	addl	%edi,%eax
+	movl	%esi,(%esp)
+	movl	%edi,4(%esp)
+	movl	%eax,8(%esp)
+	movl	%ebx,12(%esp)
+	leal	-96(%esp),%esp
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	8(%esi),%ecx
+	movl	12(%esi),%edi
+	movl	%ebx,4(%esp)
+	xorl	%ecx,%ebx
+	movl	%ecx,8(%esp)
+	movl	%edi,12(%esp)
+	movl	16(%esi),%edx
+	movl	20(%esi),%edi
+	movl	24(%esi),%ecx
+	movl	28(%esi),%esi
+	movl	%edi,20(%esp)
+	movl	100(%esp),%edi
+	movl	%ecx,24(%esp)
+	movl	%esi,28(%esp)
+	movdqa	256(%ebp),%xmm7
+	jmp	.L008grand_ssse3
+.align	16
+.L008grand_ssse3:
+	movdqu	(%edi),%xmm0
+	movdqu	16(%edi),%xmm1
+	movdqu	32(%edi),%xmm2
+	movdqu	48(%edi),%xmm3
+	addl	$64,%edi
+.byte	102,15,56,0,199
+	movl	%edi,100(%esp)
+.byte	102,15,56,0,207
+	movdqa	(%ebp),%xmm4
+.byte	102,15,56,0,215
+	movdqa	16(%ebp),%xmm5
+	paddd	%xmm0,%xmm4
+.byte	102,15,56,0,223
+	movdqa	32(%ebp),%xmm6
+	paddd	%xmm1,%xmm5
+	movdqa	48(%ebp),%xmm7
+	movdqa	%xmm4,32(%esp)
+	paddd	%xmm2,%xmm6
+	movdqa	%xmm5,48(%esp)
+	paddd	%xmm3,%xmm7
+	movdqa	%xmm6,64(%esp)
+	movdqa	%xmm7,80(%esp)
+	jmp	.L009ssse3_00_47
+.align	16
+.L009ssse3_00_47:
+	addl	$64,%ebp
+	movl	%edx,%ecx
+	movdqa	%xmm1,%xmm4
+	rorl	$14,%edx
+	movl	20(%esp),%esi
+	movdqa	%xmm3,%xmm7
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+.byte	102,15,58,15,224,4
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+.byte	102,15,58,15,250,4
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	movdqa	%xmm4,%xmm5
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	movdqa	%xmm4,%xmm6
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	psrld	$3,%xmm4
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm0
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	psrld	$7,%xmm6
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	pshufd	$250,%xmm3,%xmm7
+	xorl	%esi,%ecx
+	addl	32(%esp),%edx
+	pslld	$14,%xmm5
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm4
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	psrld	$11,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm5,%xmm4
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	pslld	$11,%xmm5
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	pxor	%xmm6,%xmm4
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	movdqa	%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	pxor	%xmm5,%xmm4
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	psrld	$10,%xmm7
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm4,%xmm0
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	psrlq	$17,%xmm6
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	rorl	$11,%ecx
+	pxor	%xmm6,%xmm7
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	psrlq	$2,%xmm6
+	addl	36(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	pshufd	$128,%xmm7,%xmm7
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	psrldq	$8,%xmm7
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	paddd	%xmm7,%xmm0
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,24(%esp)
+	pshufd	$80,%xmm0,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	movdqa	%xmm7,%xmm6
+	rorl	$11,%ecx
+	psrld	$10,%xmm7
+	andl	%eax,%ebx
+	psrlq	$17,%xmm6
+	xorl	%esi,%ecx
+	addl	40(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	psrlq	$2,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm6,%xmm7
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	movdqa	(%ebp),%xmm6
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	pslldq	$8,%xmm7
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm0
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	paddd	%xmm0,%xmm6
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	44(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movdqa	%xmm6,32(%esp)
+	movl	%edx,%ecx
+	movdqa	%xmm2,%xmm4
+	rorl	$14,%edx
+	movl	4(%esp),%esi
+	movdqa	%xmm0,%xmm7
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+.byte	102,15,58,15,225,4
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+.byte	102,15,58,15,251,4
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	movdqa	%xmm4,%xmm5
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	movdqa	%xmm4,%xmm6
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	psrld	$3,%xmm4
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm1
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	psrld	$7,%xmm6
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	pshufd	$250,%xmm0,%xmm7
+	xorl	%esi,%ecx
+	addl	48(%esp),%edx
+	pslld	$14,%xmm5
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm4
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	psrld	$11,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm5,%xmm4
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	pslld	$11,%xmm5
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	pxor	%xmm6,%xmm4
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	movdqa	%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	pxor	%xmm5,%xmm4
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	psrld	$10,%xmm7
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm4,%xmm1
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	psrlq	$17,%xmm6
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	rorl	$11,%ecx
+	pxor	%xmm6,%xmm7
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	psrlq	$2,%xmm6
+	addl	52(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	pshufd	$128,%xmm7,%xmm7
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	psrldq	$8,%xmm7
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	paddd	%xmm7,%xmm1
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,8(%esp)
+	pshufd	$80,%xmm1,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	movdqa	%xmm7,%xmm6
+	rorl	$11,%ecx
+	psrld	$10,%xmm7
+	andl	%eax,%ebx
+	psrlq	$17,%xmm6
+	xorl	%esi,%ecx
+	addl	56(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	psrlq	$2,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm6,%xmm7
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	movdqa	16(%ebp),%xmm6
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	pslldq	$8,%xmm7
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm1
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	paddd	%xmm1,%xmm6
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	60(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movdqa	%xmm6,48(%esp)
+	movl	%edx,%ecx
+	movdqa	%xmm3,%xmm4
+	rorl	$14,%edx
+	movl	20(%esp),%esi
+	movdqa	%xmm1,%xmm7
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+.byte	102,15,58,15,226,4
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+.byte	102,15,58,15,248,4
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	movdqa	%xmm4,%xmm5
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	movdqa	%xmm4,%xmm6
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	psrld	$3,%xmm4
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm2
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	psrld	$7,%xmm6
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	pshufd	$250,%xmm1,%xmm7
+	xorl	%esi,%ecx
+	addl	64(%esp),%edx
+	pslld	$14,%xmm5
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm4
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	psrld	$11,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm5,%xmm4
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	pslld	$11,%xmm5
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	pxor	%xmm6,%xmm4
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	movdqa	%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	pxor	%xmm5,%xmm4
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	psrld	$10,%xmm7
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm4,%xmm2
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	psrlq	$17,%xmm6
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	rorl	$11,%ecx
+	pxor	%xmm6,%xmm7
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	psrlq	$2,%xmm6
+	addl	68(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	pshufd	$128,%xmm7,%xmm7
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	psrldq	$8,%xmm7
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	paddd	%xmm7,%xmm2
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,24(%esp)
+	pshufd	$80,%xmm2,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	movdqa	%xmm7,%xmm6
+	rorl	$11,%ecx
+	psrld	$10,%xmm7
+	andl	%eax,%ebx
+	psrlq	$17,%xmm6
+	xorl	%esi,%ecx
+	addl	72(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	psrlq	$2,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm6,%xmm7
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	movdqa	32(%ebp),%xmm6
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	pslldq	$8,%xmm7
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm2
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	paddd	%xmm2,%xmm6
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	76(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movdqa	%xmm6,64(%esp)
+	movl	%edx,%ecx
+	movdqa	%xmm0,%xmm4
+	rorl	$14,%edx
+	movl	4(%esp),%esi
+	movdqa	%xmm2,%xmm7
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+.byte	102,15,58,15,227,4
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+.byte	102,15,58,15,249,4
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	movdqa	%xmm4,%xmm5
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	movdqa	%xmm4,%xmm6
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	psrld	$3,%xmm4
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm3
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	psrld	$7,%xmm6
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	pshufd	$250,%xmm2,%xmm7
+	xorl	%esi,%ecx
+	addl	80(%esp),%edx
+	pslld	$14,%xmm5
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm4
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	psrld	$11,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm5,%xmm4
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	pslld	$11,%xmm5
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	pxor	%xmm6,%xmm4
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	movdqa	%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	pxor	%xmm5,%xmm4
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	psrld	$10,%xmm7
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm4,%xmm3
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	psrlq	$17,%xmm6
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	rorl	$11,%ecx
+	pxor	%xmm6,%xmm7
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	psrlq	$2,%xmm6
+	addl	84(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	pshufd	$128,%xmm7,%xmm7
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	psrldq	$8,%xmm7
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	paddd	%xmm7,%xmm3
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,8(%esp)
+	pshufd	$80,%xmm3,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	movdqa	%xmm7,%xmm6
+	rorl	$11,%ecx
+	psrld	$10,%xmm7
+	andl	%eax,%ebx
+	psrlq	$17,%xmm6
+	xorl	%esi,%ecx
+	addl	88(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	psrlq	$2,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm6,%xmm7
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	movdqa	48(%ebp),%xmm6
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	pslldq	$8,%xmm7
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm3
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	paddd	%xmm3,%xmm6
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	92(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movdqa	%xmm6,80(%esp)
+	cmpl	$66051,64(%ebp)
+	jne	.L009ssse3_00_47
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	20(%esp),%esi
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	32(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	36(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,24(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	40(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	44(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	4(%esp),%esi
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	48(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	52(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,8(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	56(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	60(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	20(%esp),%esi
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	64(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	68(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,24(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	72(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	76(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	4(%esp),%esi
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	80(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	84(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,8(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	88(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	92(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movl	96(%esp),%esi
+	xorl	%edi,%ebx
+	movl	12(%esp),%ecx
+	addl	(%esi),%eax
+	addl	4(%esi),%ebx
+	addl	8(%esi),%edi
+	addl	12(%esi),%ecx
+	movl	%eax,(%esi)
+	movl	%ebx,4(%esi)
+	movl	%edi,8(%esi)
+	movl	%ecx,12(%esi)
+	movl	%ebx,4(%esp)
+	xorl	%edi,%ebx
+	movl	%edi,8(%esp)
+	movl	%ecx,12(%esp)
+	movl	20(%esp),%edi
+	movl	24(%esp),%ecx
+	addl	16(%esi),%edx
+	addl	20(%esi),%edi
+	addl	24(%esi),%ecx
+	movl	%edx,16(%esi)
+	movl	%edi,20(%esi)
+	movl	%edi,20(%esp)
+	movl	28(%esp),%edi
+	movl	%ecx,24(%esi)
+	addl	28(%esi),%edi
+	movl	%ecx,24(%esp)
+	movl	%edi,28(%esi)
+	movl	%edi,28(%esp)
+	movl	100(%esp),%edi
+	movdqa	64(%ebp),%xmm7
+	subl	$192,%ebp
+	cmpl	104(%esp),%edi
+	jb	.L008grand_ssse3
+	movl	108(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	sha256_block_data_order_ssse3,.-.L_sha256_block_data_order_ssse3_begin
+.globl	sha256_block_data_order_avx
+.hidden	sha256_block_data_order_avx
+.type	sha256_block_data_order_avx,@function
+.align	16
+sha256_block_data_order_avx:
+.L_sha256_block_data_order_avx_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	%esp,%ebx
+	call	.L010pic_point
+.L010pic_point:
+	popl	%ebp
+	leal	.LK256-.L010pic_point(%ebp),%ebp
+	subl	$16,%esp
+	andl	$-64,%esp
+	shll	$6,%eax
+	addl	%edi,%eax
+	movl	%esi,(%esp)
+	movl	%edi,4(%esp)
+	movl	%eax,8(%esp)
+	movl	%ebx,12(%esp)
+	leal	-96(%esp),%esp
+	vzeroall
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	8(%esi),%ecx
+	movl	12(%esi),%edi
+	movl	%ebx,4(%esp)
+	xorl	%ecx,%ebx
+	movl	%ecx,8(%esp)
+	movl	%edi,12(%esp)
+	movl	16(%esi),%edx
+	movl	20(%esi),%edi
+	movl	24(%esi),%ecx
+	movl	28(%esi),%esi
+	movl	%edi,20(%esp)
+	movl	100(%esp),%edi
+	movl	%ecx,24(%esp)
+	movl	%esi,28(%esp)
+	vmovdqa	256(%ebp),%xmm7
+	jmp	.L011grand_avx
+.align	32
+.L011grand_avx:
+	vmovdqu	(%edi),%xmm0
+	vmovdqu	16(%edi),%xmm1
+	vmovdqu	32(%edi),%xmm2
+	vmovdqu	48(%edi),%xmm3
+	addl	$64,%edi
+	vpshufb	%xmm7,%xmm0,%xmm0
+	movl	%edi,100(%esp)
+	vpshufb	%xmm7,%xmm1,%xmm1
+	vpshufb	%xmm7,%xmm2,%xmm2
+	vpaddd	(%ebp),%xmm0,%xmm4
+	vpshufb	%xmm7,%xmm3,%xmm3
+	vpaddd	16(%ebp),%xmm1,%xmm5
+	vpaddd	32(%ebp),%xmm2,%xmm6
+	vpaddd	48(%ebp),%xmm3,%xmm7
+	vmovdqa	%xmm4,32(%esp)
+	vmovdqa	%xmm5,48(%esp)
+	vmovdqa	%xmm6,64(%esp)
+	vmovdqa	%xmm7,80(%esp)
+	jmp	.L012avx_00_47
+.align	16
+.L012avx_00_47:
+	addl	$64,%ebp
+	vpalignr	$4,%xmm0,%xmm1,%xmm4
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	20(%esp),%esi
+	vpalignr	$4,%xmm2,%xmm3,%xmm7
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	vpaddd	%xmm7,%xmm0,%xmm0
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrld	$3,%xmm4,%xmm7
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	vpslld	$14,%xmm4,%xmm5
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,(%esp)
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	vpshufd	$250,%xmm3,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpsrld	$11,%xmm6,%xmm6
+	addl	32(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpxor	%xmm5,%xmm4,%xmm4
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	vpslld	$11,%xmm5,%xmm5
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	16(%esp),%esi
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	vpaddd	%xmm4,%xmm0,%xmm0
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,28(%esp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	vpsrlq	$19,%xmm7,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	36(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	vpshufd	$132,%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	vpsrldq	$8,%xmm7,%xmm7
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	12(%esp),%esi
+	vpaddd	%xmm7,%xmm0,%xmm0
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	vpshufd	$80,%xmm0,%xmm7
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,24(%esp)
+	vpsrlq	$19,%xmm7,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpshufd	$232,%xmm6,%xmm7
+	addl	40(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpslldq	$8,%xmm7,%xmm7
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	vpaddd	%xmm7,%xmm0,%xmm0
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	8(%esp),%esi
+	vpaddd	(%ebp),%xmm0,%xmm6
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	44(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	vmovdqa	%xmm6,32(%esp)
+	vpalignr	$4,%xmm1,%xmm2,%xmm4
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	4(%esp),%esi
+	vpalignr	$4,%xmm3,%xmm0,%xmm7
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	vpaddd	%xmm7,%xmm1,%xmm1
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrld	$3,%xmm4,%xmm7
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	vpslld	$14,%xmm4,%xmm5
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,16(%esp)
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	vpshufd	$250,%xmm0,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpsrld	$11,%xmm6,%xmm6
+	addl	48(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpxor	%xmm5,%xmm4,%xmm4
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	vpslld	$11,%xmm5,%xmm5
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	(%esp),%esi
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	vpaddd	%xmm4,%xmm1,%xmm1
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,12(%esp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	vpsrlq	$19,%xmm7,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	52(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	vpshufd	$132,%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	vpsrldq	$8,%xmm7,%xmm7
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	28(%esp),%esi
+	vpaddd	%xmm7,%xmm1,%xmm1
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	vpshufd	$80,%xmm1,%xmm7
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,8(%esp)
+	vpsrlq	$19,%xmm7,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpshufd	$232,%xmm6,%xmm7
+	addl	56(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpslldq	$8,%xmm7,%xmm7
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	vpaddd	%xmm7,%xmm1,%xmm1
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	24(%esp),%esi
+	vpaddd	16(%ebp),%xmm1,%xmm6
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	60(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	vmovdqa	%xmm6,48(%esp)
+	vpalignr	$4,%xmm2,%xmm3,%xmm4
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	20(%esp),%esi
+	vpalignr	$4,%xmm0,%xmm1,%xmm7
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	vpaddd	%xmm7,%xmm2,%xmm2
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrld	$3,%xmm4,%xmm7
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	vpslld	$14,%xmm4,%xmm5
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,(%esp)
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	vpshufd	$250,%xmm1,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpsrld	$11,%xmm6,%xmm6
+	addl	64(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpxor	%xmm5,%xmm4,%xmm4
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	vpslld	$11,%xmm5,%xmm5
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	16(%esp),%esi
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	vpaddd	%xmm4,%xmm2,%xmm2
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,28(%esp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	vpsrlq	$19,%xmm7,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	68(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	vpshufd	$132,%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	vpsrldq	$8,%xmm7,%xmm7
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	12(%esp),%esi
+	vpaddd	%xmm7,%xmm2,%xmm2
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	vpshufd	$80,%xmm2,%xmm7
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,24(%esp)
+	vpsrlq	$19,%xmm7,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpshufd	$232,%xmm6,%xmm7
+	addl	72(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpslldq	$8,%xmm7,%xmm7
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	vpaddd	%xmm7,%xmm2,%xmm2
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	8(%esp),%esi
+	vpaddd	32(%ebp),%xmm2,%xmm6
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	76(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	vmovdqa	%xmm6,64(%esp)
+	vpalignr	$4,%xmm3,%xmm0,%xmm4
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	4(%esp),%esi
+	vpalignr	$4,%xmm1,%xmm2,%xmm7
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	vpaddd	%xmm7,%xmm3,%xmm3
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrld	$3,%xmm4,%xmm7
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	vpslld	$14,%xmm4,%xmm5
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,16(%esp)
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	vpshufd	$250,%xmm2,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpsrld	$11,%xmm6,%xmm6
+	addl	80(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpxor	%xmm5,%xmm4,%xmm4
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	vpslld	$11,%xmm5,%xmm5
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	(%esp),%esi
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	vpaddd	%xmm4,%xmm3,%xmm3
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,12(%esp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	vpsrlq	$19,%xmm7,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	84(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	vpshufd	$132,%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	vpsrldq	$8,%xmm7,%xmm7
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	28(%esp),%esi
+	vpaddd	%xmm7,%xmm3,%xmm3
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	vpshufd	$80,%xmm3,%xmm7
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,8(%esp)
+	vpsrlq	$19,%xmm7,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpshufd	$232,%xmm6,%xmm7
+	addl	88(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpslldq	$8,%xmm7,%xmm7
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	vpaddd	%xmm7,%xmm3,%xmm3
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	24(%esp),%esi
+	vpaddd	48(%ebp),%xmm3,%xmm6
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	92(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	vmovdqa	%xmm6,80(%esp)
+	cmpl	$66051,64(%ebp)
+	jne	.L012avx_00_47
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	20(%esp),%esi
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	32(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	36(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,24(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	40(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	44(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	4(%esp),%esi
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	48(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	52(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,8(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	56(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	60(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	20(%esp),%esi
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	64(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	68(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,24(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	72(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	76(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	4(%esp),%esi
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	80(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	84(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,8(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	88(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	92(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movl	96(%esp),%esi
+	xorl	%edi,%ebx
+	movl	12(%esp),%ecx
+	addl	(%esi),%eax
+	addl	4(%esi),%ebx
+	addl	8(%esi),%edi
+	addl	12(%esi),%ecx
+	movl	%eax,(%esi)
+	movl	%ebx,4(%esi)
+	movl	%edi,8(%esi)
+	movl	%ecx,12(%esi)
+	movl	%ebx,4(%esp)
+	xorl	%edi,%ebx
+	movl	%edi,8(%esp)
+	movl	%ecx,12(%esp)
+	movl	20(%esp),%edi
+	movl	24(%esp),%ecx
+	addl	16(%esi),%edx
+	addl	20(%esi),%edi
+	addl	24(%esi),%ecx
+	movl	%edx,16(%esi)
+	movl	%edi,20(%esi)
+	movl	%edi,20(%esp)
+	movl	28(%esp),%edi
+	movl	%ecx,24(%esi)
+	addl	28(%esi),%edi
+	movl	%ecx,24(%esp)
+	movl	%edi,28(%esi)
+	movl	%edi,28(%esp)
+	movl	100(%esp),%edi
+	vmovdqa	64(%ebp),%xmm7
+	subl	$192,%ebp
+	cmpl	104(%esp),%edi
+	jb	.L011grand_avx
+	movl	108(%esp),%esp
+	vzeroall
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	sha256_block_data_order_avx,.-.L_sha256_block_data_order_avx_begin
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
diff --git a/gen/bcm/sha256-586-win.asm b/gen/bcm/sha256-586-win.asm
new file mode 100644
index 0000000..0ef244d
--- /dev/null
+++ b/gen/bcm/sha256-586-win.asm
@@ -0,0 +1,5601 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__, win32
+%ifidn __OUTPUT_FORMAT__,obj
+section	code	use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+$@feat.00 equ 1
+section	.text	code align=64
+%else
+section	.text	code
+%endif
+global	_sha256_block_data_order_nohw
+align	16
+_sha256_block_data_order_nohw:
+L$_sha256_block_data_order_nohw_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	mov	esi,DWORD [20+esp]
+	mov	edi,DWORD [24+esp]
+	mov	eax,DWORD [28+esp]
+	mov	ebx,esp
+	call	L$000pic_point
+L$000pic_point:
+	pop	ebp
+	lea	ebp,[(L$K256-L$000pic_point)+ebp]
+	sub	esp,16
+	and	esp,-64
+	shl	eax,6
+	add	eax,edi
+	mov	DWORD [esp],esi
+	mov	DWORD [4+esp],edi
+	mov	DWORD [8+esp],eax
+	mov	DWORD [12+esp],ebx
+L$001no_xmm:
+	sub	eax,edi
+	cmp	eax,256
+	jae	NEAR L$002unrolled
+	jmp	NEAR L$003loop
+align	16
+L$003loop:
+	mov	eax,DWORD [edi]
+	mov	ebx,DWORD [4+edi]
+	mov	ecx,DWORD [8+edi]
+	bswap	eax
+	mov	edx,DWORD [12+edi]
+	bswap	ebx
+	push	eax
+	bswap	ecx
+	push	ebx
+	bswap	edx
+	push	ecx
+	push	edx
+	mov	eax,DWORD [16+edi]
+	mov	ebx,DWORD [20+edi]
+	mov	ecx,DWORD [24+edi]
+	bswap	eax
+	mov	edx,DWORD [28+edi]
+	bswap	ebx
+	push	eax
+	bswap	ecx
+	push	ebx
+	bswap	edx
+	push	ecx
+	push	edx
+	mov	eax,DWORD [32+edi]
+	mov	ebx,DWORD [36+edi]
+	mov	ecx,DWORD [40+edi]
+	bswap	eax
+	mov	edx,DWORD [44+edi]
+	bswap	ebx
+	push	eax
+	bswap	ecx
+	push	ebx
+	bswap	edx
+	push	ecx
+	push	edx
+	mov	eax,DWORD [48+edi]
+	mov	ebx,DWORD [52+edi]
+	mov	ecx,DWORD [56+edi]
+	bswap	eax
+	mov	edx,DWORD [60+edi]
+	bswap	ebx
+	push	eax
+	bswap	ecx
+	push	ebx
+	bswap	edx
+	push	ecx
+	push	edx
+	add	edi,64
+	lea	esp,[esp-36]
+	mov	DWORD [104+esp],edi
+	mov	eax,DWORD [esi]
+	mov	ebx,DWORD [4+esi]
+	mov	ecx,DWORD [8+esi]
+	mov	edi,DWORD [12+esi]
+	mov	DWORD [8+esp],ebx
+	xor	ebx,ecx
+	mov	DWORD [12+esp],ecx
+	mov	DWORD [16+esp],edi
+	mov	DWORD [esp],ebx
+	mov	edx,DWORD [16+esi]
+	mov	ebx,DWORD [20+esi]
+	mov	ecx,DWORD [24+esi]
+	mov	edi,DWORD [28+esi]
+	mov	DWORD [24+esp],ebx
+	mov	DWORD [28+esp],ecx
+	mov	DWORD [32+esp],edi
+align	16
+L$00400_15:
+	mov	ecx,edx
+	mov	esi,DWORD [24+esp]
+	ror	ecx,14
+	mov	edi,DWORD [28+esp]
+	xor	ecx,edx
+	xor	esi,edi
+	mov	ebx,DWORD [96+esp]
+	ror	ecx,5
+	and	esi,edx
+	mov	DWORD [20+esp],edx
+	xor	edx,ecx
+	add	ebx,DWORD [32+esp]
+	xor	esi,edi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,esi
+	ror	ecx,9
+	add	ebx,edx
+	mov	edi,DWORD [8+esp]
+	xor	ecx,eax
+	mov	DWORD [4+esp],eax
+	lea	esp,[esp-4]
+	ror	ecx,11
+	mov	esi,DWORD [ebp]
+	xor	ecx,eax
+	mov	edx,DWORD [20+esp]
+	xor	eax,edi
+	ror	ecx,2
+	add	ebx,esi
+	mov	DWORD [esp],eax
+	add	edx,ebx
+	and	eax,DWORD [4+esp]
+	add	ebx,ecx
+	xor	eax,edi
+	add	ebp,4
+	add	eax,ebx
+	cmp	esi,3248222580
+	jne	NEAR L$00400_15
+	mov	ecx,DWORD [156+esp]
+	jmp	NEAR L$00516_63
+align	16
+L$00516_63:
+	mov	ebx,ecx
+	mov	esi,DWORD [104+esp]
+	ror	ecx,11
+	mov	edi,esi
+	ror	esi,2
+	xor	ecx,ebx
+	shr	ebx,3
+	ror	ecx,7
+	xor	esi,edi
+	xor	ebx,ecx
+	ror	esi,17
+	add	ebx,DWORD [160+esp]
+	shr	edi,10
+	add	ebx,DWORD [124+esp]
+	mov	ecx,edx
+	xor	edi,esi
+	mov	esi,DWORD [24+esp]
+	ror	ecx,14
+	add	ebx,edi
+	mov	edi,DWORD [28+esp]
+	xor	ecx,edx
+	xor	esi,edi
+	mov	DWORD [96+esp],ebx
+	ror	ecx,5
+	and	esi,edx
+	mov	DWORD [20+esp],edx
+	xor	edx,ecx
+	add	ebx,DWORD [32+esp]
+	xor	esi,edi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,esi
+	ror	ecx,9
+	add	ebx,edx
+	mov	edi,DWORD [8+esp]
+	xor	ecx,eax
+	mov	DWORD [4+esp],eax
+	lea	esp,[esp-4]
+	ror	ecx,11
+	mov	esi,DWORD [ebp]
+	xor	ecx,eax
+	mov	edx,DWORD [20+esp]
+	xor	eax,edi
+	ror	ecx,2
+	add	ebx,esi
+	mov	DWORD [esp],eax
+	add	edx,ebx
+	and	eax,DWORD [4+esp]
+	add	ebx,ecx
+	xor	eax,edi
+	mov	ecx,DWORD [156+esp]
+	add	ebp,4
+	add	eax,ebx
+	cmp	esi,3329325298
+	jne	NEAR L$00516_63
+	mov	esi,DWORD [356+esp]
+	mov	ebx,DWORD [8+esp]
+	mov	ecx,DWORD [16+esp]
+	add	eax,DWORD [esi]
+	add	ebx,DWORD [4+esi]
+	add	edi,DWORD [8+esi]
+	add	ecx,DWORD [12+esi]
+	mov	DWORD [esi],eax
+	mov	DWORD [4+esi],ebx
+	mov	DWORD [8+esi],edi
+	mov	DWORD [12+esi],ecx
+	mov	eax,DWORD [24+esp]
+	mov	ebx,DWORD [28+esp]
+	mov	ecx,DWORD [32+esp]
+	mov	edi,DWORD [360+esp]
+	add	edx,DWORD [16+esi]
+	add	eax,DWORD [20+esi]
+	add	ebx,DWORD [24+esi]
+	add	ecx,DWORD [28+esi]
+	mov	DWORD [16+esi],edx
+	mov	DWORD [20+esi],eax
+	mov	DWORD [24+esi],ebx
+	mov	DWORD [28+esi],ecx
+	lea	esp,[356+esp]
+	sub	ebp,256
+	cmp	edi,DWORD [8+esp]
+	jb	NEAR L$003loop
+	mov	esp,DWORD [12+esp]
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+align	64
+L$K256:
+dd	1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298
+dd	66051,67438087,134810123,202182159
+db	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
+db	110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
+db	67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+db	112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+db	62,0
+align	16
+L$002unrolled:
+	lea	esp,[esp-96]
+	mov	eax,DWORD [esi]
+	mov	ebp,DWORD [4+esi]
+	mov	ecx,DWORD [8+esi]
+	mov	ebx,DWORD [12+esi]
+	mov	DWORD [4+esp],ebp
+	xor	ebp,ecx
+	mov	DWORD [8+esp],ecx
+	mov	DWORD [12+esp],ebx
+	mov	edx,DWORD [16+esi]
+	mov	ebx,DWORD [20+esi]
+	mov	ecx,DWORD [24+esi]
+	mov	esi,DWORD [28+esi]
+	mov	DWORD [20+esp],ebx
+	mov	DWORD [24+esp],ecx
+	mov	DWORD [28+esp],esi
+	jmp	NEAR L$006grand_loop
+align	16
+L$006grand_loop:
+	mov	ebx,DWORD [edi]
+	mov	ecx,DWORD [4+edi]
+	bswap	ebx
+	mov	esi,DWORD [8+edi]
+	bswap	ecx
+	mov	DWORD [32+esp],ebx
+	bswap	esi
+	mov	DWORD [36+esp],ecx
+	mov	DWORD [40+esp],esi
+	mov	ebx,DWORD [12+edi]
+	mov	ecx,DWORD [16+edi]
+	bswap	ebx
+	mov	esi,DWORD [20+edi]
+	bswap	ecx
+	mov	DWORD [44+esp],ebx
+	bswap	esi
+	mov	DWORD [48+esp],ecx
+	mov	DWORD [52+esp],esi
+	mov	ebx,DWORD [24+edi]
+	mov	ecx,DWORD [28+edi]
+	bswap	ebx
+	mov	esi,DWORD [32+edi]
+	bswap	ecx
+	mov	DWORD [56+esp],ebx
+	bswap	esi
+	mov	DWORD [60+esp],ecx
+	mov	DWORD [64+esp],esi
+	mov	ebx,DWORD [36+edi]
+	mov	ecx,DWORD [40+edi]
+	bswap	ebx
+	mov	esi,DWORD [44+edi]
+	bswap	ecx
+	mov	DWORD [68+esp],ebx
+	bswap	esi
+	mov	DWORD [72+esp],ecx
+	mov	DWORD [76+esp],esi
+	mov	ebx,DWORD [48+edi]
+	mov	ecx,DWORD [52+edi]
+	bswap	ebx
+	mov	esi,DWORD [56+edi]
+	bswap	ecx
+	mov	DWORD [80+esp],ebx
+	bswap	esi
+	mov	DWORD [84+esp],ecx
+	mov	DWORD [88+esp],esi
+	mov	ebx,DWORD [60+edi]
+	add	edi,64
+	bswap	ebx
+	mov	DWORD [100+esp],edi
+	mov	DWORD [92+esp],ebx
+	mov	ecx,edx
+	mov	esi,DWORD [20+esp]
+	ror	edx,14
+	mov	edi,DWORD [24+esp]
+	xor	edx,ecx
+	mov	ebx,DWORD [32+esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [16+esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [28+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [4+esp]
+	xor	ecx,eax
+	mov	DWORD [esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[1116352408+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [12+esp]
+	add	ebp,ecx
+	mov	esi,edx
+	mov	ecx,DWORD [16+esp]
+	ror	edx,14
+	mov	edi,DWORD [20+esp]
+	xor	edx,esi
+	mov	ebx,DWORD [36+esp]
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [12+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [24+esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [esp]
+	xor	esi,ebp
+	mov	DWORD [28+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[1899447441+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [8+esp]
+	add	eax,esi
+	mov	ecx,edx
+	mov	esi,DWORD [12+esp]
+	ror	edx,14
+	mov	edi,DWORD [16+esp]
+	xor	edx,ecx
+	mov	ebx,DWORD [40+esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [8+esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [20+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [28+esp]
+	xor	ecx,eax
+	mov	DWORD [24+esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[3049323471+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [4+esp]
+	add	ebp,ecx
+	mov	esi,edx
+	mov	ecx,DWORD [8+esp]
+	ror	edx,14
+	mov	edi,DWORD [12+esp]
+	xor	edx,esi
+	mov	ebx,DWORD [44+esp]
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [4+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [16+esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [24+esp]
+	xor	esi,ebp
+	mov	DWORD [20+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[3921009573+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [esp]
+	add	eax,esi
+	mov	ecx,edx
+	mov	esi,DWORD [4+esp]
+	ror	edx,14
+	mov	edi,DWORD [8+esp]
+	xor	edx,ecx
+	mov	ebx,DWORD [48+esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [12+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [20+esp]
+	xor	ecx,eax
+	mov	DWORD [16+esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[961987163+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [28+esp]
+	add	ebp,ecx
+	mov	esi,edx
+	mov	ecx,DWORD [esp]
+	ror	edx,14
+	mov	edi,DWORD [4+esp]
+	xor	edx,esi
+	mov	ebx,DWORD [52+esp]
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [28+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [8+esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [16+esp]
+	xor	esi,ebp
+	mov	DWORD [12+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[1508970993+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [24+esp]
+	add	eax,esi
+	mov	ecx,edx
+	mov	esi,DWORD [28+esp]
+	ror	edx,14
+	mov	edi,DWORD [esp]
+	xor	edx,ecx
+	mov	ebx,DWORD [56+esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [24+esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [4+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [12+esp]
+	xor	ecx,eax
+	mov	DWORD [8+esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[2453635748+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [20+esp]
+	add	ebp,ecx
+	mov	esi,edx
+	mov	ecx,DWORD [24+esp]
+	ror	edx,14
+	mov	edi,DWORD [28+esp]
+	xor	edx,esi
+	mov	ebx,DWORD [60+esp]
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [20+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [8+esp]
+	xor	esi,ebp
+	mov	DWORD [4+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[2870763221+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [16+esp]
+	add	eax,esi
+	mov	ecx,edx
+	mov	esi,DWORD [20+esp]
+	ror	edx,14
+	mov	edi,DWORD [24+esp]
+	xor	edx,ecx
+	mov	ebx,DWORD [64+esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [16+esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [28+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [4+esp]
+	xor	ecx,eax
+	mov	DWORD [esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[3624381080+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [12+esp]
+	add	ebp,ecx
+	mov	esi,edx
+	mov	ecx,DWORD [16+esp]
+	ror	edx,14
+	mov	edi,DWORD [20+esp]
+	xor	edx,esi
+	mov	ebx,DWORD [68+esp]
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [12+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [24+esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [esp]
+	xor	esi,ebp
+	mov	DWORD [28+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[310598401+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [8+esp]
+	add	eax,esi
+	mov	ecx,edx
+	mov	esi,DWORD [12+esp]
+	ror	edx,14
+	mov	edi,DWORD [16+esp]
+	xor	edx,ecx
+	mov	ebx,DWORD [72+esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [8+esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [20+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [28+esp]
+	xor	ecx,eax
+	mov	DWORD [24+esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[607225278+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [4+esp]
+	add	ebp,ecx
+	mov	esi,edx
+	mov	ecx,DWORD [8+esp]
+	ror	edx,14
+	mov	edi,DWORD [12+esp]
+	xor	edx,esi
+	mov	ebx,DWORD [76+esp]
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [4+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [16+esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [24+esp]
+	xor	esi,ebp
+	mov	DWORD [20+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[1426881987+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [esp]
+	add	eax,esi
+	mov	ecx,edx
+	mov	esi,DWORD [4+esp]
+	ror	edx,14
+	mov	edi,DWORD [8+esp]
+	xor	edx,ecx
+	mov	ebx,DWORD [80+esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [12+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [20+esp]
+	xor	ecx,eax
+	mov	DWORD [16+esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[1925078388+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [28+esp]
+	add	ebp,ecx
+	mov	esi,edx
+	mov	ecx,DWORD [esp]
+	ror	edx,14
+	mov	edi,DWORD [4+esp]
+	xor	edx,esi
+	mov	ebx,DWORD [84+esp]
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [28+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [8+esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [16+esp]
+	xor	esi,ebp
+	mov	DWORD [12+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[2162078206+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [24+esp]
+	add	eax,esi
+	mov	ecx,edx
+	mov	esi,DWORD [28+esp]
+	ror	edx,14
+	mov	edi,DWORD [esp]
+	xor	edx,ecx
+	mov	ebx,DWORD [88+esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [24+esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [4+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [12+esp]
+	xor	ecx,eax
+	mov	DWORD [8+esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[2614888103+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [20+esp]
+	add	ebp,ecx
+	mov	esi,edx
+	mov	ecx,DWORD [24+esp]
+	ror	edx,14
+	mov	edi,DWORD [28+esp]
+	xor	edx,esi
+	mov	ebx,DWORD [92+esp]
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [20+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [8+esp]
+	xor	esi,ebp
+	mov	DWORD [4+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[3248222580+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	mov	ecx,DWORD [36+esp]
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [16+esp]
+	add	eax,esi
+	mov	esi,DWORD [88+esp]
+	mov	ebx,ecx
+	ror	ecx,11
+	mov	edi,esi
+	ror	esi,2
+	xor	ecx,ebx
+	shr	ebx,3
+	ror	ecx,7
+	xor	esi,edi
+	xor	ebx,ecx
+	ror	esi,17
+	add	ebx,DWORD [32+esp]
+	shr	edi,10
+	add	ebx,DWORD [68+esp]
+	mov	ecx,edx
+	xor	edi,esi
+	mov	esi,DWORD [20+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [24+esp]
+	xor	edx,ecx
+	mov	DWORD [32+esp],ebx
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [16+esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [28+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [4+esp]
+	xor	ecx,eax
+	mov	DWORD [esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[3835390401+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	mov	esi,DWORD [40+esp]
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [12+esp]
+	add	ebp,ecx
+	mov	ecx,DWORD [92+esp]
+	mov	ebx,esi
+	ror	esi,11
+	mov	edi,ecx
+	ror	ecx,2
+	xor	esi,ebx
+	shr	ebx,3
+	ror	esi,7
+	xor	ecx,edi
+	xor	ebx,esi
+	ror	ecx,17
+	add	ebx,DWORD [36+esp]
+	shr	edi,10
+	add	ebx,DWORD [72+esp]
+	mov	esi,edx
+	xor	edi,ecx
+	mov	ecx,DWORD [16+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [20+esp]
+	xor	edx,esi
+	mov	DWORD [36+esp],ebx
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [12+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [24+esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [esp]
+	xor	esi,ebp
+	mov	DWORD [28+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[4022224774+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	mov	ecx,DWORD [44+esp]
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [8+esp]
+	add	eax,esi
+	mov	esi,DWORD [32+esp]
+	mov	ebx,ecx
+	ror	ecx,11
+	mov	edi,esi
+	ror	esi,2
+	xor	ecx,ebx
+	shr	ebx,3
+	ror	ecx,7
+	xor	esi,edi
+	xor	ebx,ecx
+	ror	esi,17
+	add	ebx,DWORD [40+esp]
+	shr	edi,10
+	add	ebx,DWORD [76+esp]
+	mov	ecx,edx
+	xor	edi,esi
+	mov	esi,DWORD [12+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [16+esp]
+	xor	edx,ecx
+	mov	DWORD [40+esp],ebx
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [8+esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [20+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [28+esp]
+	xor	ecx,eax
+	mov	DWORD [24+esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[264347078+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	mov	esi,DWORD [48+esp]
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [4+esp]
+	add	ebp,ecx
+	mov	ecx,DWORD [36+esp]
+	mov	ebx,esi
+	ror	esi,11
+	mov	edi,ecx
+	ror	ecx,2
+	xor	esi,ebx
+	shr	ebx,3
+	ror	esi,7
+	xor	ecx,edi
+	xor	ebx,esi
+	ror	ecx,17
+	add	ebx,DWORD [44+esp]
+	shr	edi,10
+	add	ebx,DWORD [80+esp]
+	mov	esi,edx
+	xor	edi,ecx
+	mov	ecx,DWORD [8+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [12+esp]
+	xor	edx,esi
+	mov	DWORD [44+esp],ebx
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [4+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [16+esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [24+esp]
+	xor	esi,ebp
+	mov	DWORD [20+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[604807628+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	mov	ecx,DWORD [52+esp]
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [esp]
+	add	eax,esi
+	mov	esi,DWORD [40+esp]
+	mov	ebx,ecx
+	ror	ecx,11
+	mov	edi,esi
+	ror	esi,2
+	xor	ecx,ebx
+	shr	ebx,3
+	ror	ecx,7
+	xor	esi,edi
+	xor	ebx,ecx
+	ror	esi,17
+	add	ebx,DWORD [48+esp]
+	shr	edi,10
+	add	ebx,DWORD [84+esp]
+	mov	ecx,edx
+	xor	edi,esi
+	mov	esi,DWORD [4+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [8+esp]
+	xor	edx,ecx
+	mov	DWORD [48+esp],ebx
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [12+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [20+esp]
+	xor	ecx,eax
+	mov	DWORD [16+esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[770255983+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	mov	esi,DWORD [56+esp]
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [28+esp]
+	add	ebp,ecx
+	mov	ecx,DWORD [44+esp]
+	mov	ebx,esi
+	ror	esi,11
+	mov	edi,ecx
+	ror	ecx,2
+	xor	esi,ebx
+	shr	ebx,3
+	ror	esi,7
+	xor	ecx,edi
+	xor	ebx,esi
+	ror	ecx,17
+	add	ebx,DWORD [52+esp]
+	shr	edi,10
+	add	ebx,DWORD [88+esp]
+	mov	esi,edx
+	xor	edi,ecx
+	mov	ecx,DWORD [esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [4+esp]
+	xor	edx,esi
+	mov	DWORD [52+esp],ebx
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [28+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [8+esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [16+esp]
+	xor	esi,ebp
+	mov	DWORD [12+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[1249150122+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	mov	ecx,DWORD [60+esp]
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [24+esp]
+	add	eax,esi
+	mov	esi,DWORD [48+esp]
+	mov	ebx,ecx
+	ror	ecx,11
+	mov	edi,esi
+	ror	esi,2
+	xor	ecx,ebx
+	shr	ebx,3
+	ror	ecx,7
+	xor	esi,edi
+	xor	ebx,ecx
+	ror	esi,17
+	add	ebx,DWORD [56+esp]
+	shr	edi,10
+	add	ebx,DWORD [92+esp]
+	mov	ecx,edx
+	xor	edi,esi
+	mov	esi,DWORD [28+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [esp]
+	xor	edx,ecx
+	mov	DWORD [56+esp],ebx
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [24+esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [4+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [12+esp]
+	xor	ecx,eax
+	mov	DWORD [8+esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[1555081692+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	mov	esi,DWORD [64+esp]
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [20+esp]
+	add	ebp,ecx
+	mov	ecx,DWORD [52+esp]
+	mov	ebx,esi
+	ror	esi,11
+	mov	edi,ecx
+	ror	ecx,2
+	xor	esi,ebx
+	shr	ebx,3
+	ror	esi,7
+	xor	ecx,edi
+	xor	ebx,esi
+	ror	ecx,17
+	add	ebx,DWORD [60+esp]
+	shr	edi,10
+	add	ebx,DWORD [32+esp]
+	mov	esi,edx
+	xor	edi,ecx
+	mov	ecx,DWORD [24+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [28+esp]
+	xor	edx,esi
+	mov	DWORD [60+esp],ebx
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [20+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [8+esp]
+	xor	esi,ebp
+	mov	DWORD [4+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[1996064986+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	mov	ecx,DWORD [68+esp]
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [16+esp]
+	add	eax,esi
+	mov	esi,DWORD [56+esp]
+	mov	ebx,ecx
+	ror	ecx,11
+	mov	edi,esi
+	ror	esi,2
+	xor	ecx,ebx
+	shr	ebx,3
+	ror	ecx,7
+	xor	esi,edi
+	xor	ebx,ecx
+	ror	esi,17
+	add	ebx,DWORD [64+esp]
+	shr	edi,10
+	add	ebx,DWORD [36+esp]
+	mov	ecx,edx
+	xor	edi,esi
+	mov	esi,DWORD [20+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [24+esp]
+	xor	edx,ecx
+	mov	DWORD [64+esp],ebx
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [16+esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [28+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [4+esp]
+	xor	ecx,eax
+	mov	DWORD [esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[2554220882+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	mov	esi,DWORD [72+esp]
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [12+esp]
+	add	ebp,ecx
+	mov	ecx,DWORD [60+esp]
+	mov	ebx,esi
+	ror	esi,11
+	mov	edi,ecx
+	ror	ecx,2
+	xor	esi,ebx
+	shr	ebx,3
+	ror	esi,7
+	xor	ecx,edi
+	xor	ebx,esi
+	ror	ecx,17
+	add	ebx,DWORD [68+esp]
+	shr	edi,10
+	add	ebx,DWORD [40+esp]
+	mov	esi,edx
+	xor	edi,ecx
+	mov	ecx,DWORD [16+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [20+esp]
+	xor	edx,esi
+	mov	DWORD [68+esp],ebx
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [12+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [24+esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [esp]
+	xor	esi,ebp
+	mov	DWORD [28+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[2821834349+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	mov	ecx,DWORD [76+esp]
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [8+esp]
+	add	eax,esi
+	mov	esi,DWORD [64+esp]
+	mov	ebx,ecx
+	ror	ecx,11
+	mov	edi,esi
+	ror	esi,2
+	xor	ecx,ebx
+	shr	ebx,3
+	ror	ecx,7
+	xor	esi,edi
+	xor	ebx,ecx
+	ror	esi,17
+	add	ebx,DWORD [72+esp]
+	shr	edi,10
+	add	ebx,DWORD [44+esp]
+	mov	ecx,edx
+	xor	edi,esi
+	mov	esi,DWORD [12+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [16+esp]
+	xor	edx,ecx
+	mov	DWORD [72+esp],ebx
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [8+esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [20+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [28+esp]
+	xor	ecx,eax
+	mov	DWORD [24+esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[2952996808+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	mov	esi,DWORD [80+esp]
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [4+esp]
+	add	ebp,ecx
+	mov	ecx,DWORD [68+esp]
+	mov	ebx,esi
+	ror	esi,11
+	mov	edi,ecx
+	ror	ecx,2
+	xor	esi,ebx
+	shr	ebx,3
+	ror	esi,7
+	xor	ecx,edi
+	xor	ebx,esi
+	ror	ecx,17
+	add	ebx,DWORD [76+esp]
+	shr	edi,10
+	add	ebx,DWORD [48+esp]
+	mov	esi,edx
+	xor	edi,ecx
+	mov	ecx,DWORD [8+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [12+esp]
+	xor	edx,esi
+	mov	DWORD [76+esp],ebx
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [4+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [16+esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [24+esp]
+	xor	esi,ebp
+	mov	DWORD [20+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[3210313671+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	mov	ecx,DWORD [84+esp]
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [esp]
+	add	eax,esi
+	mov	esi,DWORD [72+esp]
+	mov	ebx,ecx
+	ror	ecx,11
+	mov	edi,esi
+	ror	esi,2
+	xor	ecx,ebx
+	shr	ebx,3
+	ror	ecx,7
+	xor	esi,edi
+	xor	ebx,ecx
+	ror	esi,17
+	add	ebx,DWORD [80+esp]
+	shr	edi,10
+	add	ebx,DWORD [52+esp]
+	mov	ecx,edx
+	xor	edi,esi
+	mov	esi,DWORD [4+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [8+esp]
+	xor	edx,ecx
+	mov	DWORD [80+esp],ebx
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [12+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [20+esp]
+	xor	ecx,eax
+	mov	DWORD [16+esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[3336571891+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	mov	esi,DWORD [88+esp]
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [28+esp]
+	add	ebp,ecx
+	mov	ecx,DWORD [76+esp]
+	mov	ebx,esi
+	ror	esi,11
+	mov	edi,ecx
+	ror	ecx,2
+	xor	esi,ebx
+	shr	ebx,3
+	ror	esi,7
+	xor	ecx,edi
+	xor	ebx,esi
+	ror	ecx,17
+	add	ebx,DWORD [84+esp]
+	shr	edi,10
+	add	ebx,DWORD [56+esp]
+	mov	esi,edx
+	xor	edi,ecx
+	mov	ecx,DWORD [esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [4+esp]
+	xor	edx,esi
+	mov	DWORD [84+esp],ebx
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [28+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [8+esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [16+esp]
+	xor	esi,ebp
+	mov	DWORD [12+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[3584528711+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	mov	ecx,DWORD [92+esp]
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [24+esp]
+	add	eax,esi
+	mov	esi,DWORD [80+esp]
+	mov	ebx,ecx
+	ror	ecx,11
+	mov	edi,esi
+	ror	esi,2
+	xor	ecx,ebx
+	shr	ebx,3
+	ror	ecx,7
+	xor	esi,edi
+	xor	ebx,ecx
+	ror	esi,17
+	add	ebx,DWORD [88+esp]
+	shr	edi,10
+	add	ebx,DWORD [60+esp]
+	mov	ecx,edx
+	xor	edi,esi
+	mov	esi,DWORD [28+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [esp]
+	xor	edx,ecx
+	mov	DWORD [88+esp],ebx
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [24+esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [4+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [12+esp]
+	xor	ecx,eax
+	mov	DWORD [8+esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[113926993+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	mov	esi,DWORD [32+esp]
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [20+esp]
+	add	ebp,ecx
+	mov	ecx,DWORD [84+esp]
+	mov	ebx,esi
+	ror	esi,11
+	mov	edi,ecx
+	ror	ecx,2
+	xor	esi,ebx
+	shr	ebx,3
+	ror	esi,7
+	xor	ecx,edi
+	xor	ebx,esi
+	ror	ecx,17
+	add	ebx,DWORD [92+esp]
+	shr	edi,10
+	add	ebx,DWORD [64+esp]
+	mov	esi,edx
+	xor	edi,ecx
+	mov	ecx,DWORD [24+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [28+esp]
+	xor	edx,esi
+	mov	DWORD [92+esp],ebx
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [20+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [8+esp]
+	xor	esi,ebp
+	mov	DWORD [4+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[338241895+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	mov	ecx,DWORD [36+esp]
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [16+esp]
+	add	eax,esi
+	mov	esi,DWORD [88+esp]
+	mov	ebx,ecx
+	ror	ecx,11
+	mov	edi,esi
+	ror	esi,2
+	xor	ecx,ebx
+	shr	ebx,3
+	ror	ecx,7
+	xor	esi,edi
+	xor	ebx,ecx
+	ror	esi,17
+	add	ebx,DWORD [32+esp]
+	shr	edi,10
+	add	ebx,DWORD [68+esp]
+	mov	ecx,edx
+	xor	edi,esi
+	mov	esi,DWORD [20+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [24+esp]
+	xor	edx,ecx
+	mov	DWORD [32+esp],ebx
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [16+esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [28+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [4+esp]
+	xor	ecx,eax
+	mov	DWORD [esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[666307205+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	mov	esi,DWORD [40+esp]
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [12+esp]
+	add	ebp,ecx
+	mov	ecx,DWORD [92+esp]
+	mov	ebx,esi
+	ror	esi,11
+	mov	edi,ecx
+	ror	ecx,2
+	xor	esi,ebx
+	shr	ebx,3
+	ror	esi,7
+	xor	ecx,edi
+	xor	ebx,esi
+	ror	ecx,17
+	add	ebx,DWORD [36+esp]
+	shr	edi,10
+	add	ebx,DWORD [72+esp]
+	mov	esi,edx
+	xor	edi,ecx
+	mov	ecx,DWORD [16+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [20+esp]
+	xor	edx,esi
+	mov	DWORD [36+esp],ebx
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [12+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [24+esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [esp]
+	xor	esi,ebp
+	mov	DWORD [28+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[773529912+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	mov	ecx,DWORD [44+esp]
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [8+esp]
+	add	eax,esi
+	mov	esi,DWORD [32+esp]
+	mov	ebx,ecx
+	ror	ecx,11
+	mov	edi,esi
+	ror	esi,2
+	xor	ecx,ebx
+	shr	ebx,3
+	ror	ecx,7
+	xor	esi,edi
+	xor	ebx,ecx
+	ror	esi,17
+	add	ebx,DWORD [40+esp]
+	shr	edi,10
+	add	ebx,DWORD [76+esp]
+	mov	ecx,edx
+	xor	edi,esi
+	mov	esi,DWORD [12+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [16+esp]
+	xor	edx,ecx
+	mov	DWORD [40+esp],ebx
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [8+esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [20+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [28+esp]
+	xor	ecx,eax
+	mov	DWORD [24+esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[1294757372+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	mov	esi,DWORD [48+esp]
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [4+esp]
+	add	ebp,ecx
+	mov	ecx,DWORD [36+esp]
+	mov	ebx,esi
+	ror	esi,11
+	mov	edi,ecx
+	ror	ecx,2
+	xor	esi,ebx
+	shr	ebx,3
+	ror	esi,7
+	xor	ecx,edi
+	xor	ebx,esi
+	ror	ecx,17
+	add	ebx,DWORD [44+esp]
+	shr	edi,10
+	add	ebx,DWORD [80+esp]
+	mov	esi,edx
+	xor	edi,ecx
+	mov	ecx,DWORD [8+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [12+esp]
+	xor	edx,esi
+	mov	DWORD [44+esp],ebx
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [4+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [16+esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [24+esp]
+	xor	esi,ebp
+	mov	DWORD [20+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[1396182291+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	mov	ecx,DWORD [52+esp]
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [esp]
+	add	eax,esi
+	mov	esi,DWORD [40+esp]
+	mov	ebx,ecx
+	ror	ecx,11
+	mov	edi,esi
+	ror	esi,2
+	xor	ecx,ebx
+	shr	ebx,3
+	ror	ecx,7
+	xor	esi,edi
+	xor	ebx,ecx
+	ror	esi,17
+	add	ebx,DWORD [48+esp]
+	shr	edi,10
+	add	ebx,DWORD [84+esp]
+	mov	ecx,edx
+	xor	edi,esi
+	mov	esi,DWORD [4+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [8+esp]
+	xor	edx,ecx
+	mov	DWORD [48+esp],ebx
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [12+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [20+esp]
+	xor	ecx,eax
+	mov	DWORD [16+esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[1695183700+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	mov	esi,DWORD [56+esp]
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [28+esp]
+	add	ebp,ecx
+	mov	ecx,DWORD [44+esp]
+	mov	ebx,esi
+	ror	esi,11
+	mov	edi,ecx
+	ror	ecx,2
+	xor	esi,ebx
+	shr	ebx,3
+	ror	esi,7
+	xor	ecx,edi
+	xor	ebx,esi
+	ror	ecx,17
+	add	ebx,DWORD [52+esp]
+	shr	edi,10
+	add	ebx,DWORD [88+esp]
+	mov	esi,edx
+	xor	edi,ecx
+	mov	ecx,DWORD [esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [4+esp]
+	xor	edx,esi
+	mov	DWORD [52+esp],ebx
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [28+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [8+esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [16+esp]
+	xor	esi,ebp
+	mov	DWORD [12+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[1986661051+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	mov	ecx,DWORD [60+esp]
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [24+esp]
+	add	eax,esi
+	mov	esi,DWORD [48+esp]
+	mov	ebx,ecx
+	ror	ecx,11
+	mov	edi,esi
+	ror	esi,2
+	xor	ecx,ebx
+	shr	ebx,3
+	ror	ecx,7
+	xor	esi,edi
+	xor	ebx,ecx
+	ror	esi,17
+	add	ebx,DWORD [56+esp]
+	shr	edi,10
+	add	ebx,DWORD [92+esp]
+	mov	ecx,edx
+	xor	edi,esi
+	mov	esi,DWORD [28+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [esp]
+	xor	edx,ecx
+	mov	DWORD [56+esp],ebx
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [24+esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [4+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [12+esp]
+	xor	ecx,eax
+	mov	DWORD [8+esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[2177026350+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	mov	esi,DWORD [64+esp]
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [20+esp]
+	add	ebp,ecx
+	mov	ecx,DWORD [52+esp]
+	mov	ebx,esi
+	ror	esi,11
+	mov	edi,ecx
+	ror	ecx,2
+	xor	esi,ebx
+	shr	ebx,3
+	ror	esi,7
+	xor	ecx,edi
+	xor	ebx,esi
+	ror	ecx,17
+	add	ebx,DWORD [60+esp]
+	shr	edi,10
+	add	ebx,DWORD [32+esp]
+	mov	esi,edx
+	xor	edi,ecx
+	mov	ecx,DWORD [24+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [28+esp]
+	xor	edx,esi
+	mov	DWORD [60+esp],ebx
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [20+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [8+esp]
+	xor	esi,ebp
+	mov	DWORD [4+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[2456956037+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	mov	ecx,DWORD [68+esp]
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [16+esp]
+	add	eax,esi
+	mov	esi,DWORD [56+esp]
+	mov	ebx,ecx
+	ror	ecx,11
+	mov	edi,esi
+	ror	esi,2
+	xor	ecx,ebx
+	shr	ebx,3
+	ror	ecx,7
+	xor	esi,edi
+	xor	ebx,ecx
+	ror	esi,17
+	add	ebx,DWORD [64+esp]
+	shr	edi,10
+	add	ebx,DWORD [36+esp]
+	mov	ecx,edx
+	xor	edi,esi
+	mov	esi,DWORD [20+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [24+esp]
+	xor	edx,ecx
+	mov	DWORD [64+esp],ebx
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [16+esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [28+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [4+esp]
+	xor	ecx,eax
+	mov	DWORD [esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[2730485921+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	mov	esi,DWORD [72+esp]
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [12+esp]
+	add	ebp,ecx
+	mov	ecx,DWORD [60+esp]
+	mov	ebx,esi
+	ror	esi,11
+	mov	edi,ecx
+	ror	ecx,2
+	xor	esi,ebx
+	shr	ebx,3
+	ror	esi,7
+	xor	ecx,edi
+	xor	ebx,esi
+	ror	ecx,17
+	add	ebx,DWORD [68+esp]
+	shr	edi,10
+	add	ebx,DWORD [40+esp]
+	mov	esi,edx
+	xor	edi,ecx
+	mov	ecx,DWORD [16+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [20+esp]
+	xor	edx,esi
+	mov	DWORD [68+esp],ebx
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [12+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [24+esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [esp]
+	xor	esi,ebp
+	mov	DWORD [28+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[2820302411+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	mov	ecx,DWORD [76+esp]
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [8+esp]
+	add	eax,esi
+	mov	esi,DWORD [64+esp]
+	mov	ebx,ecx
+	ror	ecx,11
+	mov	edi,esi
+	ror	esi,2
+	xor	ecx,ebx
+	shr	ebx,3
+	ror	ecx,7
+	xor	esi,edi
+	xor	ebx,ecx
+	ror	esi,17
+	add	ebx,DWORD [72+esp]
+	shr	edi,10
+	add	ebx,DWORD [44+esp]
+	mov	ecx,edx
+	xor	edi,esi
+	mov	esi,DWORD [12+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [16+esp]
+	xor	edx,ecx
+	mov	DWORD [72+esp],ebx
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [8+esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [20+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [28+esp]
+	xor	ecx,eax
+	mov	DWORD [24+esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[3259730800+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	mov	esi,DWORD [80+esp]
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [4+esp]
+	add	ebp,ecx
+	mov	ecx,DWORD [68+esp]
+	mov	ebx,esi
+	ror	esi,11
+	mov	edi,ecx
+	ror	ecx,2
+	xor	esi,ebx
+	shr	ebx,3
+	ror	esi,7
+	xor	ecx,edi
+	xor	ebx,esi
+	ror	ecx,17
+	add	ebx,DWORD [76+esp]
+	shr	edi,10
+	add	ebx,DWORD [48+esp]
+	mov	esi,edx
+	xor	edi,ecx
+	mov	ecx,DWORD [8+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [12+esp]
+	xor	edx,esi
+	mov	DWORD [76+esp],ebx
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [4+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [16+esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [24+esp]
+	xor	esi,ebp
+	mov	DWORD [20+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[3345764771+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	mov	ecx,DWORD [84+esp]
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [esp]
+	add	eax,esi
+	mov	esi,DWORD [72+esp]
+	mov	ebx,ecx
+	ror	ecx,11
+	mov	edi,esi
+	ror	esi,2
+	xor	ecx,ebx
+	shr	ebx,3
+	ror	ecx,7
+	xor	esi,edi
+	xor	ebx,ecx
+	ror	esi,17
+	add	ebx,DWORD [80+esp]
+	shr	edi,10
+	add	ebx,DWORD [52+esp]
+	mov	ecx,edx
+	xor	edi,esi
+	mov	esi,DWORD [4+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [8+esp]
+	xor	edx,ecx
+	mov	DWORD [80+esp],ebx
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [12+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [20+esp]
+	xor	ecx,eax
+	mov	DWORD [16+esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[3516065817+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	mov	esi,DWORD [88+esp]
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [28+esp]
+	add	ebp,ecx
+	mov	ecx,DWORD [76+esp]
+	mov	ebx,esi
+	ror	esi,11
+	mov	edi,ecx
+	ror	ecx,2
+	xor	esi,ebx
+	shr	ebx,3
+	ror	esi,7
+	xor	ecx,edi
+	xor	ebx,esi
+	ror	ecx,17
+	add	ebx,DWORD [84+esp]
+	shr	edi,10
+	add	ebx,DWORD [56+esp]
+	mov	esi,edx
+	xor	edi,ecx
+	mov	ecx,DWORD [esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [4+esp]
+	xor	edx,esi
+	mov	DWORD [84+esp],ebx
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [28+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [8+esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [16+esp]
+	xor	esi,ebp
+	mov	DWORD [12+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[3600352804+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	mov	ecx,DWORD [92+esp]
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [24+esp]
+	add	eax,esi
+	mov	esi,DWORD [80+esp]
+	mov	ebx,ecx
+	ror	ecx,11
+	mov	edi,esi
+	ror	esi,2
+	xor	ecx,ebx
+	shr	ebx,3
+	ror	ecx,7
+	xor	esi,edi
+	xor	ebx,ecx
+	ror	esi,17
+	add	ebx,DWORD [88+esp]
+	shr	edi,10
+	add	ebx,DWORD [60+esp]
+	mov	ecx,edx
+	xor	edi,esi
+	mov	esi,DWORD [28+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [esp]
+	xor	edx,ecx
+	mov	DWORD [88+esp],ebx
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [24+esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [4+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [12+esp]
+	xor	ecx,eax
+	mov	DWORD [8+esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[4094571909+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	mov	esi,DWORD [32+esp]
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [20+esp]
+	add	ebp,ecx
+	mov	ecx,DWORD [84+esp]
+	mov	ebx,esi
+	ror	esi,11
+	mov	edi,ecx
+	ror	ecx,2
+	xor	esi,ebx
+	shr	ebx,3
+	ror	esi,7
+	xor	ecx,edi
+	xor	ebx,esi
+	ror	ecx,17
+	add	ebx,DWORD [92+esp]
+	shr	edi,10
+	add	ebx,DWORD [64+esp]
+	mov	esi,edx
+	xor	edi,ecx
+	mov	ecx,DWORD [24+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [28+esp]
+	xor	edx,esi
+	mov	DWORD [92+esp],ebx
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [20+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [8+esp]
+	xor	esi,ebp
+	mov	DWORD [4+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[275423344+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	mov	ecx,DWORD [36+esp]
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [16+esp]
+	add	eax,esi
+	mov	esi,DWORD [88+esp]
+	mov	ebx,ecx
+	ror	ecx,11
+	mov	edi,esi
+	ror	esi,2
+	xor	ecx,ebx
+	shr	ebx,3
+	ror	ecx,7
+	xor	esi,edi
+	xor	ebx,ecx
+	ror	esi,17
+	add	ebx,DWORD [32+esp]
+	shr	edi,10
+	add	ebx,DWORD [68+esp]
+	mov	ecx,edx
+	xor	edi,esi
+	mov	esi,DWORD [20+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [24+esp]
+	xor	edx,ecx
+	mov	DWORD [32+esp],ebx
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [16+esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [28+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [4+esp]
+	xor	ecx,eax
+	mov	DWORD [esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[430227734+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	mov	esi,DWORD [40+esp]
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [12+esp]
+	add	ebp,ecx
+	mov	ecx,DWORD [92+esp]
+	mov	ebx,esi
+	ror	esi,11
+	mov	edi,ecx
+	ror	ecx,2
+	xor	esi,ebx
+	shr	ebx,3
+	ror	esi,7
+	xor	ecx,edi
+	xor	ebx,esi
+	ror	ecx,17
+	add	ebx,DWORD [36+esp]
+	shr	edi,10
+	add	ebx,DWORD [72+esp]
+	mov	esi,edx
+	xor	edi,ecx
+	mov	ecx,DWORD [16+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [20+esp]
+	xor	edx,esi
+	mov	DWORD [36+esp],ebx
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [12+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [24+esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [esp]
+	xor	esi,ebp
+	mov	DWORD [28+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[506948616+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	mov	ecx,DWORD [44+esp]
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [8+esp]
+	add	eax,esi
+	mov	esi,DWORD [32+esp]
+	mov	ebx,ecx
+	ror	ecx,11
+	mov	edi,esi
+	ror	esi,2
+	xor	ecx,ebx
+	shr	ebx,3
+	ror	ecx,7
+	xor	esi,edi
+	xor	ebx,ecx
+	ror	esi,17
+	add	ebx,DWORD [40+esp]
+	shr	edi,10
+	add	ebx,DWORD [76+esp]
+	mov	ecx,edx
+	xor	edi,esi
+	mov	esi,DWORD [12+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [16+esp]
+	xor	edx,ecx
+	mov	DWORD [40+esp],ebx
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [8+esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [20+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [28+esp]
+	xor	ecx,eax
+	mov	DWORD [24+esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[659060556+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	mov	esi,DWORD [48+esp]
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [4+esp]
+	add	ebp,ecx
+	mov	ecx,DWORD [36+esp]
+	mov	ebx,esi
+	ror	esi,11
+	mov	edi,ecx
+	ror	ecx,2
+	xor	esi,ebx
+	shr	ebx,3
+	ror	esi,7
+	xor	ecx,edi
+	xor	ebx,esi
+	ror	ecx,17
+	add	ebx,DWORD [44+esp]
+	shr	edi,10
+	add	ebx,DWORD [80+esp]
+	mov	esi,edx
+	xor	edi,ecx
+	mov	ecx,DWORD [8+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [12+esp]
+	xor	edx,esi
+	mov	DWORD [44+esp],ebx
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [4+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [16+esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [24+esp]
+	xor	esi,ebp
+	mov	DWORD [20+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[883997877+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	mov	ecx,DWORD [52+esp]
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [esp]
+	add	eax,esi
+	mov	esi,DWORD [40+esp]
+	mov	ebx,ecx
+	ror	ecx,11
+	mov	edi,esi
+	ror	esi,2
+	xor	ecx,ebx
+	shr	ebx,3
+	ror	ecx,7
+	xor	esi,edi
+	xor	ebx,ecx
+	ror	esi,17
+	add	ebx,DWORD [48+esp]
+	shr	edi,10
+	add	ebx,DWORD [84+esp]
+	mov	ecx,edx
+	xor	edi,esi
+	mov	esi,DWORD [4+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [8+esp]
+	xor	edx,ecx
+	mov	DWORD [48+esp],ebx
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [12+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [20+esp]
+	xor	ecx,eax
+	mov	DWORD [16+esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[958139571+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	mov	esi,DWORD [56+esp]
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [28+esp]
+	add	ebp,ecx
+	mov	ecx,DWORD [44+esp]
+	mov	ebx,esi
+	ror	esi,11
+	mov	edi,ecx
+	ror	ecx,2
+	xor	esi,ebx
+	shr	ebx,3
+	ror	esi,7
+	xor	ecx,edi
+	xor	ebx,esi
+	ror	ecx,17
+	add	ebx,DWORD [52+esp]
+	shr	edi,10
+	add	ebx,DWORD [88+esp]
+	mov	esi,edx
+	xor	edi,ecx
+	mov	ecx,DWORD [esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [4+esp]
+	xor	edx,esi
+	mov	DWORD [52+esp],ebx
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [28+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [8+esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [16+esp]
+	xor	esi,ebp
+	mov	DWORD [12+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[1322822218+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	mov	ecx,DWORD [60+esp]
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [24+esp]
+	add	eax,esi
+	mov	esi,DWORD [48+esp]
+	mov	ebx,ecx
+	ror	ecx,11
+	mov	edi,esi
+	ror	esi,2
+	xor	ecx,ebx
+	shr	ebx,3
+	ror	ecx,7
+	xor	esi,edi
+	xor	ebx,ecx
+	ror	esi,17
+	add	ebx,DWORD [56+esp]
+	shr	edi,10
+	add	ebx,DWORD [92+esp]
+	mov	ecx,edx
+	xor	edi,esi
+	mov	esi,DWORD [28+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [esp]
+	xor	edx,ecx
+	mov	DWORD [56+esp],ebx
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [24+esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [4+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [12+esp]
+	xor	ecx,eax
+	mov	DWORD [8+esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[1537002063+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	mov	esi,DWORD [64+esp]
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [20+esp]
+	add	ebp,ecx
+	mov	ecx,DWORD [52+esp]
+	mov	ebx,esi
+	ror	esi,11
+	mov	edi,ecx
+	ror	ecx,2
+	xor	esi,ebx
+	shr	ebx,3
+	ror	esi,7
+	xor	ecx,edi
+	xor	ebx,esi
+	ror	ecx,17
+	add	ebx,DWORD [60+esp]
+	shr	edi,10
+	add	ebx,DWORD [32+esp]
+	mov	esi,edx
+	xor	edi,ecx
+	mov	ecx,DWORD [24+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [28+esp]
+	xor	edx,esi
+	mov	DWORD [60+esp],ebx
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [20+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [8+esp]
+	xor	esi,ebp
+	mov	DWORD [4+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[1747873779+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	mov	ecx,DWORD [68+esp]
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [16+esp]
+	add	eax,esi
+	mov	esi,DWORD [56+esp]
+	mov	ebx,ecx
+	ror	ecx,11
+	mov	edi,esi
+	ror	esi,2
+	xor	ecx,ebx
+	shr	ebx,3
+	ror	ecx,7
+	xor	esi,edi
+	xor	ebx,ecx
+	ror	esi,17
+	add	ebx,DWORD [64+esp]
+	shr	edi,10
+	add	ebx,DWORD [36+esp]
+	mov	ecx,edx
+	xor	edi,esi
+	mov	esi,DWORD [20+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [24+esp]
+	xor	edx,ecx
+	mov	DWORD [64+esp],ebx
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [16+esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [28+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [4+esp]
+	xor	ecx,eax
+	mov	DWORD [esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[1955562222+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	mov	esi,DWORD [72+esp]
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [12+esp]
+	add	ebp,ecx
+	mov	ecx,DWORD [60+esp]
+	mov	ebx,esi
+	ror	esi,11
+	mov	edi,ecx
+	ror	ecx,2
+	xor	esi,ebx
+	shr	ebx,3
+	ror	esi,7
+	xor	ecx,edi
+	xor	ebx,esi
+	ror	ecx,17
+	add	ebx,DWORD [68+esp]
+	shr	edi,10
+	add	ebx,DWORD [40+esp]
+	mov	esi,edx
+	xor	edi,ecx
+	mov	ecx,DWORD [16+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [20+esp]
+	xor	edx,esi
+	mov	DWORD [68+esp],ebx
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [12+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [24+esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [esp]
+	xor	esi,ebp
+	mov	DWORD [28+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[2024104815+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	mov	ecx,DWORD [76+esp]
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [8+esp]
+	add	eax,esi
+	mov	esi,DWORD [64+esp]
+	mov	ebx,ecx
+	ror	ecx,11
+	mov	edi,esi
+	ror	esi,2
+	xor	ecx,ebx
+	shr	ebx,3
+	ror	ecx,7
+	xor	esi,edi
+	xor	ebx,ecx
+	ror	esi,17
+	add	ebx,DWORD [72+esp]
+	shr	edi,10
+	add	ebx,DWORD [44+esp]
+	mov	ecx,edx
+	xor	edi,esi
+	mov	esi,DWORD [12+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [16+esp]
+	xor	edx,ecx
+	mov	DWORD [72+esp],ebx
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [8+esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [20+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [28+esp]
+	xor	ecx,eax
+	mov	DWORD [24+esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[2227730452+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	mov	esi,DWORD [80+esp]
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [4+esp]
+	add	ebp,ecx
+	mov	ecx,DWORD [68+esp]
+	mov	ebx,esi
+	ror	esi,11
+	mov	edi,ecx
+	ror	ecx,2
+	xor	esi,ebx
+	shr	ebx,3
+	ror	esi,7
+	xor	ecx,edi
+	xor	ebx,esi
+	ror	ecx,17
+	add	ebx,DWORD [76+esp]
+	shr	edi,10
+	add	ebx,DWORD [48+esp]
+	mov	esi,edx
+	xor	edi,ecx
+	mov	ecx,DWORD [8+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [12+esp]
+	xor	edx,esi
+	mov	DWORD [76+esp],ebx
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [4+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [16+esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [24+esp]
+	xor	esi,ebp
+	mov	DWORD [20+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[2361852424+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	mov	ecx,DWORD [84+esp]
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [esp]
+	add	eax,esi
+	mov	esi,DWORD [72+esp]
+	mov	ebx,ecx
+	ror	ecx,11
+	mov	edi,esi
+	ror	esi,2
+	xor	ecx,ebx
+	shr	ebx,3
+	ror	ecx,7
+	xor	esi,edi
+	xor	ebx,ecx
+	ror	esi,17
+	add	ebx,DWORD [80+esp]
+	shr	edi,10
+	add	ebx,DWORD [52+esp]
+	mov	ecx,edx
+	xor	edi,esi
+	mov	esi,DWORD [4+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [8+esp]
+	xor	edx,ecx
+	mov	DWORD [80+esp],ebx
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [12+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [20+esp]
+	xor	ecx,eax
+	mov	DWORD [16+esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[2428436474+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	mov	esi,DWORD [88+esp]
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [28+esp]
+	add	ebp,ecx
+	mov	ecx,DWORD [76+esp]
+	mov	ebx,esi
+	ror	esi,11
+	mov	edi,ecx
+	ror	ecx,2
+	xor	esi,ebx
+	shr	ebx,3
+	ror	esi,7
+	xor	ecx,edi
+	xor	ebx,esi
+	ror	ecx,17
+	add	ebx,DWORD [84+esp]
+	shr	edi,10
+	add	ebx,DWORD [56+esp]
+	mov	esi,edx
+	xor	edi,ecx
+	mov	ecx,DWORD [esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [4+esp]
+	xor	edx,esi
+	mov	DWORD [84+esp],ebx
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [28+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [8+esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [16+esp]
+	xor	esi,ebp
+	mov	DWORD [12+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[2756734187+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	mov	ecx,DWORD [92+esp]
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [24+esp]
+	add	eax,esi
+	mov	esi,DWORD [80+esp]
+	mov	ebx,ecx
+	ror	ecx,11
+	mov	edi,esi
+	ror	esi,2
+	xor	ecx,ebx
+	shr	ebx,3
+	ror	ecx,7
+	xor	esi,edi
+	xor	ebx,ecx
+	ror	esi,17
+	add	ebx,DWORD [88+esp]
+	shr	edi,10
+	add	ebx,DWORD [60+esp]
+	mov	ecx,edx
+	xor	edi,esi
+	mov	esi,DWORD [28+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [esp]
+	xor	edx,ecx
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [24+esp],ecx
+	xor	edx,ecx
+	add	ebx,DWORD [4+esp]
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	ebx,edi
+	ror	ecx,9
+	mov	esi,eax
+	mov	edi,DWORD [12+esp]
+	xor	ecx,eax
+	mov	DWORD [8+esp],eax
+	xor	eax,edi
+	ror	ecx,11
+	and	ebp,eax
+	lea	edx,[3204031479+edx*1+ebx]
+	xor	ecx,esi
+	xor	ebp,edi
+	mov	esi,DWORD [32+esp]
+	ror	ecx,2
+	add	ebp,edx
+	add	edx,DWORD [20+esp]
+	add	ebp,ecx
+	mov	ecx,DWORD [84+esp]
+	mov	ebx,esi
+	ror	esi,11
+	mov	edi,ecx
+	ror	ecx,2
+	xor	esi,ebx
+	shr	ebx,3
+	ror	esi,7
+	xor	ecx,edi
+	xor	ebx,esi
+	ror	ecx,17
+	add	ebx,DWORD [92+esp]
+	shr	edi,10
+	add	ebx,DWORD [64+esp]
+	mov	esi,edx
+	xor	edi,ecx
+	mov	ecx,DWORD [24+esp]
+	ror	edx,14
+	add	ebx,edi
+	mov	edi,DWORD [28+esp]
+	xor	edx,esi
+	xor	ecx,edi
+	ror	edx,5
+	and	ecx,esi
+	mov	DWORD [20+esp],esi
+	xor	edx,esi
+	add	ebx,DWORD [esp]
+	xor	edi,ecx
+	ror	edx,6
+	mov	esi,ebp
+	add	ebx,edi
+	ror	esi,9
+	mov	ecx,ebp
+	mov	edi,DWORD [8+esp]
+	xor	esi,ebp
+	mov	DWORD [4+esp],ebp
+	xor	ebp,edi
+	ror	esi,11
+	and	eax,ebp
+	lea	edx,[3329325298+edx*1+ebx]
+	xor	esi,ecx
+	xor	eax,edi
+	ror	esi,2
+	add	eax,edx
+	add	edx,DWORD [16+esp]
+	add	eax,esi
+	mov	esi,DWORD [96+esp]
+	xor	ebp,edi
+	mov	ecx,DWORD [12+esp]
+	add	eax,DWORD [esi]
+	add	ebp,DWORD [4+esi]
+	add	edi,DWORD [8+esi]
+	add	ecx,DWORD [12+esi]
+	mov	DWORD [esi],eax
+	mov	DWORD [4+esi],ebp
+	mov	DWORD [8+esi],edi
+	mov	DWORD [12+esi],ecx
+	mov	DWORD [4+esp],ebp
+	xor	ebp,edi
+	mov	DWORD [8+esp],edi
+	mov	DWORD [12+esp],ecx
+	mov	edi,DWORD [20+esp]
+	mov	ebx,DWORD [24+esp]
+	mov	ecx,DWORD [28+esp]
+	add	edx,DWORD [16+esi]
+	add	edi,DWORD [20+esi]
+	add	ebx,DWORD [24+esi]
+	add	ecx,DWORD [28+esi]
+	mov	DWORD [16+esi],edx
+	mov	DWORD [20+esi],edi
+	mov	DWORD [24+esi],ebx
+	mov	DWORD [28+esi],ecx
+	mov	DWORD [20+esp],edi
+	mov	edi,DWORD [100+esp]
+	mov	DWORD [24+esp],ebx
+	mov	DWORD [28+esp],ecx
+	cmp	edi,DWORD [104+esp]
+	jb	NEAR L$006grand_loop
+	mov	esp,DWORD [108+esp]
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+global	_sha256_block_data_order_ssse3
+align	16
+_sha256_block_data_order_ssse3:
+L$_sha256_block_data_order_ssse3_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	mov	esi,DWORD [20+esp]
+	mov	edi,DWORD [24+esp]
+	mov	eax,DWORD [28+esp]
+	mov	ebx,esp
+	call	L$007pic_point
+L$007pic_point:
+	pop	ebp
+	lea	ebp,[(L$K256-L$007pic_point)+ebp]
+	sub	esp,16
+	and	esp,-64
+	shl	eax,6
+	add	eax,edi
+	mov	DWORD [esp],esi
+	mov	DWORD [4+esp],edi
+	mov	DWORD [8+esp],eax
+	mov	DWORD [12+esp],ebx
+	lea	esp,[esp-96]
+	mov	eax,DWORD [esi]
+	mov	ebx,DWORD [4+esi]
+	mov	ecx,DWORD [8+esi]
+	mov	edi,DWORD [12+esi]
+	mov	DWORD [4+esp],ebx
+	xor	ebx,ecx
+	mov	DWORD [8+esp],ecx
+	mov	DWORD [12+esp],edi
+	mov	edx,DWORD [16+esi]
+	mov	edi,DWORD [20+esi]
+	mov	ecx,DWORD [24+esi]
+	mov	esi,DWORD [28+esi]
+	mov	DWORD [20+esp],edi
+	mov	edi,DWORD [100+esp]
+	mov	DWORD [24+esp],ecx
+	mov	DWORD [28+esp],esi
+	movdqa	xmm7,[256+ebp]
+	jmp	NEAR L$008grand_ssse3
+align	16
+L$008grand_ssse3:
+	movdqu	xmm0,[edi]
+	movdqu	xmm1,[16+edi]
+	movdqu	xmm2,[32+edi]
+	movdqu	xmm3,[48+edi]
+	add	edi,64
+db	102,15,56,0,199
+	mov	DWORD [100+esp],edi
+db	102,15,56,0,207
+	movdqa	xmm4,[ebp]
+db	102,15,56,0,215
+	movdqa	xmm5,[16+ebp]
+	paddd	xmm4,xmm0
+db	102,15,56,0,223
+	movdqa	xmm6,[32+ebp]
+	paddd	xmm5,xmm1
+	movdqa	xmm7,[48+ebp]
+	movdqa	[32+esp],xmm4
+	paddd	xmm6,xmm2
+	movdqa	[48+esp],xmm5
+	paddd	xmm7,xmm3
+	movdqa	[64+esp],xmm6
+	movdqa	[80+esp],xmm7
+	jmp	NEAR L$009ssse3_00_47
+align	16
+L$009ssse3_00_47:
+	add	ebp,64
+	mov	ecx,edx
+	movdqa	xmm4,xmm1
+	ror	edx,14
+	mov	esi,DWORD [20+esp]
+	movdqa	xmm7,xmm3
+	xor	edx,ecx
+	mov	edi,DWORD [24+esp]
+db	102,15,58,15,224,4
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+db	102,15,58,15,250,4
+	mov	DWORD [16+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	movdqa	xmm5,xmm4
+	ror	edx,6
+	mov	ecx,eax
+	movdqa	xmm6,xmm4
+	add	edx,edi
+	mov	edi,DWORD [4+esp]
+	psrld	xmm4,3
+	mov	esi,eax
+	ror	ecx,9
+	paddd	xmm0,xmm7
+	mov	DWORD [esp],eax
+	xor	ecx,eax
+	psrld	xmm6,7
+	xor	eax,edi
+	add	edx,DWORD [28+esp]
+	ror	ecx,11
+	and	ebx,eax
+	pshufd	xmm7,xmm3,250
+	xor	ecx,esi
+	add	edx,DWORD [32+esp]
+	pslld	xmm5,14
+	xor	ebx,edi
+	ror	ecx,2
+	pxor	xmm4,xmm6
+	add	ebx,edx
+	add	edx,DWORD [12+esp]
+	psrld	xmm6,11
+	add	ebx,ecx
+	mov	ecx,edx
+	ror	edx,14
+	pxor	xmm4,xmm5
+	mov	esi,DWORD [16+esp]
+	xor	edx,ecx
+	pslld	xmm5,11
+	mov	edi,DWORD [20+esp]
+	xor	esi,edi
+	ror	edx,5
+	pxor	xmm4,xmm6
+	and	esi,ecx
+	mov	DWORD [12+esp],ecx
+	movdqa	xmm6,xmm7
+	xor	edx,ecx
+	xor	edi,esi
+	ror	edx,6
+	pxor	xmm4,xmm5
+	mov	ecx,ebx
+	add	edx,edi
+	psrld	xmm7,10
+	mov	edi,DWORD [esp]
+	mov	esi,ebx
+	ror	ecx,9
+	paddd	xmm0,xmm4
+	mov	DWORD [28+esp],ebx
+	xor	ecx,ebx
+	psrlq	xmm6,17
+	xor	ebx,edi
+	add	edx,DWORD [24+esp]
+	ror	ecx,11
+	pxor	xmm7,xmm6
+	and	eax,ebx
+	xor	ecx,esi
+	psrlq	xmm6,2
+	add	edx,DWORD [36+esp]
+	xor	eax,edi
+	ror	ecx,2
+	pxor	xmm7,xmm6
+	add	eax,edx
+	add	edx,DWORD [8+esp]
+	pshufd	xmm7,xmm7,128
+	add	eax,ecx
+	mov	ecx,edx
+	ror	edx,14
+	mov	esi,DWORD [12+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [16+esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	psrldq	xmm7,8
+	mov	DWORD [8+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	paddd	xmm0,xmm7
+	ror	edx,6
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [28+esp]
+	mov	esi,eax
+	ror	ecx,9
+	mov	DWORD [24+esp],eax
+	pshufd	xmm7,xmm0,80
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [20+esp]
+	movdqa	xmm6,xmm7
+	ror	ecx,11
+	psrld	xmm7,10
+	and	ebx,eax
+	psrlq	xmm6,17
+	xor	ecx,esi
+	add	edx,DWORD [40+esp]
+	xor	ebx,edi
+	ror	ecx,2
+	pxor	xmm7,xmm6
+	add	ebx,edx
+	add	edx,DWORD [4+esp]
+	psrlq	xmm6,2
+	add	ebx,ecx
+	mov	ecx,edx
+	ror	edx,14
+	pxor	xmm7,xmm6
+	mov	esi,DWORD [8+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [12+esp]
+	pshufd	xmm7,xmm7,8
+	xor	esi,edi
+	ror	edx,5
+	movdqa	xmm6,[ebp]
+	and	esi,ecx
+	mov	DWORD [4+esp],ecx
+	pslldq	xmm7,8
+	xor	edx,ecx
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [24+esp]
+	mov	esi,ebx
+	ror	ecx,9
+	paddd	xmm0,xmm7
+	mov	DWORD [20+esp],ebx
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [16+esp]
+	paddd	xmm6,xmm0
+	ror	ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	add	edx,DWORD [44+esp]
+	xor	eax,edi
+	ror	ecx,2
+	add	eax,edx
+	add	edx,DWORD [esp]
+	add	eax,ecx
+	movdqa	[32+esp],xmm6
+	mov	ecx,edx
+	movdqa	xmm4,xmm2
+	ror	edx,14
+	mov	esi,DWORD [4+esp]
+	movdqa	xmm7,xmm0
+	xor	edx,ecx
+	mov	edi,DWORD [8+esp]
+db	102,15,58,15,225,4
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+db	102,15,58,15,251,4
+	mov	DWORD [esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	movdqa	xmm5,xmm4
+	ror	edx,6
+	mov	ecx,eax
+	movdqa	xmm6,xmm4
+	add	edx,edi
+	mov	edi,DWORD [20+esp]
+	psrld	xmm4,3
+	mov	esi,eax
+	ror	ecx,9
+	paddd	xmm1,xmm7
+	mov	DWORD [16+esp],eax
+	xor	ecx,eax
+	psrld	xmm6,7
+	xor	eax,edi
+	add	edx,DWORD [12+esp]
+	ror	ecx,11
+	and	ebx,eax
+	pshufd	xmm7,xmm0,250
+	xor	ecx,esi
+	add	edx,DWORD [48+esp]
+	pslld	xmm5,14
+	xor	ebx,edi
+	ror	ecx,2
+	pxor	xmm4,xmm6
+	add	ebx,edx
+	add	edx,DWORD [28+esp]
+	psrld	xmm6,11
+	add	ebx,ecx
+	mov	ecx,edx
+	ror	edx,14
+	pxor	xmm4,xmm5
+	mov	esi,DWORD [esp]
+	xor	edx,ecx
+	pslld	xmm5,11
+	mov	edi,DWORD [4+esp]
+	xor	esi,edi
+	ror	edx,5
+	pxor	xmm4,xmm6
+	and	esi,ecx
+	mov	DWORD [28+esp],ecx
+	movdqa	xmm6,xmm7
+	xor	edx,ecx
+	xor	edi,esi
+	ror	edx,6
+	pxor	xmm4,xmm5
+	mov	ecx,ebx
+	add	edx,edi
+	psrld	xmm7,10
+	mov	edi,DWORD [16+esp]
+	mov	esi,ebx
+	ror	ecx,9
+	paddd	xmm1,xmm4
+	mov	DWORD [12+esp],ebx
+	xor	ecx,ebx
+	psrlq	xmm6,17
+	xor	ebx,edi
+	add	edx,DWORD [8+esp]
+	ror	ecx,11
+	pxor	xmm7,xmm6
+	and	eax,ebx
+	xor	ecx,esi
+	psrlq	xmm6,2
+	add	edx,DWORD [52+esp]
+	xor	eax,edi
+	ror	ecx,2
+	pxor	xmm7,xmm6
+	add	eax,edx
+	add	edx,DWORD [24+esp]
+	pshufd	xmm7,xmm7,128
+	add	eax,ecx
+	mov	ecx,edx
+	ror	edx,14
+	mov	esi,DWORD [28+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	psrldq	xmm7,8
+	mov	DWORD [24+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	paddd	xmm1,xmm7
+	ror	edx,6
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [12+esp]
+	mov	esi,eax
+	ror	ecx,9
+	mov	DWORD [8+esp],eax
+	pshufd	xmm7,xmm1,80
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [4+esp]
+	movdqa	xmm6,xmm7
+	ror	ecx,11
+	psrld	xmm7,10
+	and	ebx,eax
+	psrlq	xmm6,17
+	xor	ecx,esi
+	add	edx,DWORD [56+esp]
+	xor	ebx,edi
+	ror	ecx,2
+	pxor	xmm7,xmm6
+	add	ebx,edx
+	add	edx,DWORD [20+esp]
+	psrlq	xmm6,2
+	add	ebx,ecx
+	mov	ecx,edx
+	ror	edx,14
+	pxor	xmm7,xmm6
+	mov	esi,DWORD [24+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [28+esp]
+	pshufd	xmm7,xmm7,8
+	xor	esi,edi
+	ror	edx,5
+	movdqa	xmm6,[16+ebp]
+	and	esi,ecx
+	mov	DWORD [20+esp],ecx
+	pslldq	xmm7,8
+	xor	edx,ecx
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [8+esp]
+	mov	esi,ebx
+	ror	ecx,9
+	paddd	xmm1,xmm7
+	mov	DWORD [4+esp],ebx
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [esp]
+	paddd	xmm6,xmm1
+	ror	ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	add	edx,DWORD [60+esp]
+	xor	eax,edi
+	ror	ecx,2
+	add	eax,edx
+	add	edx,DWORD [16+esp]
+	add	eax,ecx
+	movdqa	[48+esp],xmm6
+	mov	ecx,edx
+	movdqa	xmm4,xmm3
+	ror	edx,14
+	mov	esi,DWORD [20+esp]
+	movdqa	xmm7,xmm1
+	xor	edx,ecx
+	mov	edi,DWORD [24+esp]
+db	102,15,58,15,226,4
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+db	102,15,58,15,248,4
+	mov	DWORD [16+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	movdqa	xmm5,xmm4
+	ror	edx,6
+	mov	ecx,eax
+	movdqa	xmm6,xmm4
+	add	edx,edi
+	mov	edi,DWORD [4+esp]
+	psrld	xmm4,3
+	mov	esi,eax
+	ror	ecx,9
+	paddd	xmm2,xmm7
+	mov	DWORD [esp],eax
+	xor	ecx,eax
+	psrld	xmm6,7
+	xor	eax,edi
+	add	edx,DWORD [28+esp]
+	ror	ecx,11
+	and	ebx,eax
+	pshufd	xmm7,xmm1,250
+	xor	ecx,esi
+	add	edx,DWORD [64+esp]
+	pslld	xmm5,14
+	xor	ebx,edi
+	ror	ecx,2
+	pxor	xmm4,xmm6
+	add	ebx,edx
+	add	edx,DWORD [12+esp]
+	psrld	xmm6,11
+	add	ebx,ecx
+	mov	ecx,edx
+	ror	edx,14
+	pxor	xmm4,xmm5
+	mov	esi,DWORD [16+esp]
+	xor	edx,ecx
+	pslld	xmm5,11
+	mov	edi,DWORD [20+esp]
+	xor	esi,edi
+	ror	edx,5
+	pxor	xmm4,xmm6
+	and	esi,ecx
+	mov	DWORD [12+esp],ecx
+	movdqa	xmm6,xmm7
+	xor	edx,ecx
+	xor	edi,esi
+	ror	edx,6
+	pxor	xmm4,xmm5
+	mov	ecx,ebx
+	add	edx,edi
+	psrld	xmm7,10
+	mov	edi,DWORD [esp]
+	mov	esi,ebx
+	ror	ecx,9
+	paddd	xmm2,xmm4
+	mov	DWORD [28+esp],ebx
+	xor	ecx,ebx
+	psrlq	xmm6,17
+	xor	ebx,edi
+	add	edx,DWORD [24+esp]
+	ror	ecx,11
+	pxor	xmm7,xmm6
+	and	eax,ebx
+	xor	ecx,esi
+	psrlq	xmm6,2
+	add	edx,DWORD [68+esp]
+	xor	eax,edi
+	ror	ecx,2
+	pxor	xmm7,xmm6
+	add	eax,edx
+	add	edx,DWORD [8+esp]
+	pshufd	xmm7,xmm7,128
+	add	eax,ecx
+	mov	ecx,edx
+	ror	edx,14
+	mov	esi,DWORD [12+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [16+esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	psrldq	xmm7,8
+	mov	DWORD [8+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	paddd	xmm2,xmm7
+	ror	edx,6
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [28+esp]
+	mov	esi,eax
+	ror	ecx,9
+	mov	DWORD [24+esp],eax
+	pshufd	xmm7,xmm2,80
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [20+esp]
+	movdqa	xmm6,xmm7
+	ror	ecx,11
+	psrld	xmm7,10
+	and	ebx,eax
+	psrlq	xmm6,17
+	xor	ecx,esi
+	add	edx,DWORD [72+esp]
+	xor	ebx,edi
+	ror	ecx,2
+	pxor	xmm7,xmm6
+	add	ebx,edx
+	add	edx,DWORD [4+esp]
+	psrlq	xmm6,2
+	add	ebx,ecx
+	mov	ecx,edx
+	ror	edx,14
+	pxor	xmm7,xmm6
+	mov	esi,DWORD [8+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [12+esp]
+	pshufd	xmm7,xmm7,8
+	xor	esi,edi
+	ror	edx,5
+	movdqa	xmm6,[32+ebp]
+	and	esi,ecx
+	mov	DWORD [4+esp],ecx
+	pslldq	xmm7,8
+	xor	edx,ecx
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [24+esp]
+	mov	esi,ebx
+	ror	ecx,9
+	paddd	xmm2,xmm7
+	mov	DWORD [20+esp],ebx
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [16+esp]
+	paddd	xmm6,xmm2
+	ror	ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	add	edx,DWORD [76+esp]
+	xor	eax,edi
+	ror	ecx,2
+	add	eax,edx
+	add	edx,DWORD [esp]
+	add	eax,ecx
+	movdqa	[64+esp],xmm6
+	mov	ecx,edx
+	movdqa	xmm4,xmm0
+	ror	edx,14
+	mov	esi,DWORD [4+esp]
+	movdqa	xmm7,xmm2
+	xor	edx,ecx
+	mov	edi,DWORD [8+esp]
+db	102,15,58,15,227,4
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+db	102,15,58,15,249,4
+	mov	DWORD [esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	movdqa	xmm5,xmm4
+	ror	edx,6
+	mov	ecx,eax
+	movdqa	xmm6,xmm4
+	add	edx,edi
+	mov	edi,DWORD [20+esp]
+	psrld	xmm4,3
+	mov	esi,eax
+	ror	ecx,9
+	paddd	xmm3,xmm7
+	mov	DWORD [16+esp],eax
+	xor	ecx,eax
+	psrld	xmm6,7
+	xor	eax,edi
+	add	edx,DWORD [12+esp]
+	ror	ecx,11
+	and	ebx,eax
+	pshufd	xmm7,xmm2,250
+	xor	ecx,esi
+	add	edx,DWORD [80+esp]
+	pslld	xmm5,14
+	xor	ebx,edi
+	ror	ecx,2
+	pxor	xmm4,xmm6
+	add	ebx,edx
+	add	edx,DWORD [28+esp]
+	psrld	xmm6,11
+	add	ebx,ecx
+	mov	ecx,edx
+	ror	edx,14
+	pxor	xmm4,xmm5
+	mov	esi,DWORD [esp]
+	xor	edx,ecx
+	pslld	xmm5,11
+	mov	edi,DWORD [4+esp]
+	xor	esi,edi
+	ror	edx,5
+	pxor	xmm4,xmm6
+	and	esi,ecx
+	mov	DWORD [28+esp],ecx
+	movdqa	xmm6,xmm7
+	xor	edx,ecx
+	xor	edi,esi
+	ror	edx,6
+	pxor	xmm4,xmm5
+	mov	ecx,ebx
+	add	edx,edi
+	psrld	xmm7,10
+	mov	edi,DWORD [16+esp]
+	mov	esi,ebx
+	ror	ecx,9
+	paddd	xmm3,xmm4
+	mov	DWORD [12+esp],ebx
+	xor	ecx,ebx
+	psrlq	xmm6,17
+	xor	ebx,edi
+	add	edx,DWORD [8+esp]
+	ror	ecx,11
+	pxor	xmm7,xmm6
+	and	eax,ebx
+	xor	ecx,esi
+	psrlq	xmm6,2
+	add	edx,DWORD [84+esp]
+	xor	eax,edi
+	ror	ecx,2
+	pxor	xmm7,xmm6
+	add	eax,edx
+	add	edx,DWORD [24+esp]
+	pshufd	xmm7,xmm7,128
+	add	eax,ecx
+	mov	ecx,edx
+	ror	edx,14
+	mov	esi,DWORD [28+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	psrldq	xmm7,8
+	mov	DWORD [24+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	paddd	xmm3,xmm7
+	ror	edx,6
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [12+esp]
+	mov	esi,eax
+	ror	ecx,9
+	mov	DWORD [8+esp],eax
+	pshufd	xmm7,xmm3,80
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [4+esp]
+	movdqa	xmm6,xmm7
+	ror	ecx,11
+	psrld	xmm7,10
+	and	ebx,eax
+	psrlq	xmm6,17
+	xor	ecx,esi
+	add	edx,DWORD [88+esp]
+	xor	ebx,edi
+	ror	ecx,2
+	pxor	xmm7,xmm6
+	add	ebx,edx
+	add	edx,DWORD [20+esp]
+	psrlq	xmm6,2
+	add	ebx,ecx
+	mov	ecx,edx
+	ror	edx,14
+	pxor	xmm7,xmm6
+	mov	esi,DWORD [24+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [28+esp]
+	pshufd	xmm7,xmm7,8
+	xor	esi,edi
+	ror	edx,5
+	movdqa	xmm6,[48+ebp]
+	and	esi,ecx
+	mov	DWORD [20+esp],ecx
+	pslldq	xmm7,8
+	xor	edx,ecx
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [8+esp]
+	mov	esi,ebx
+	ror	ecx,9
+	paddd	xmm3,xmm7
+	mov	DWORD [4+esp],ebx
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [esp]
+	paddd	xmm6,xmm3
+	ror	ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	add	edx,DWORD [92+esp]
+	xor	eax,edi
+	ror	ecx,2
+	add	eax,edx
+	add	edx,DWORD [16+esp]
+	add	eax,ecx
+	movdqa	[80+esp],xmm6
+	cmp	DWORD [64+ebp],66051
+	jne	NEAR L$009ssse3_00_47
+	mov	ecx,edx
+	ror	edx,14
+	mov	esi,DWORD [20+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [24+esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [16+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [4+esp]
+	mov	esi,eax
+	ror	ecx,9
+	mov	DWORD [esp],eax
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [28+esp]
+	ror	ecx,11
+	and	ebx,eax
+	xor	ecx,esi
+	add	edx,DWORD [32+esp]
+	xor	ebx,edi
+	ror	ecx,2
+	add	ebx,edx
+	add	edx,DWORD [12+esp]
+	add	ebx,ecx
+	mov	ecx,edx
+	ror	edx,14
+	mov	esi,DWORD [16+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [20+esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [12+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [esp]
+	mov	esi,ebx
+	ror	ecx,9
+	mov	DWORD [28+esp],ebx
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [24+esp]
+	ror	ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	add	edx,DWORD [36+esp]
+	xor	eax,edi
+	ror	ecx,2
+	add	eax,edx
+	add	edx,DWORD [8+esp]
+	add	eax,ecx
+	mov	ecx,edx
+	ror	edx,14
+	mov	esi,DWORD [12+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [16+esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [8+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [28+esp]
+	mov	esi,eax
+	ror	ecx,9
+	mov	DWORD [24+esp],eax
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [20+esp]
+	ror	ecx,11
+	and	ebx,eax
+	xor	ecx,esi
+	add	edx,DWORD [40+esp]
+	xor	ebx,edi
+	ror	ecx,2
+	add	ebx,edx
+	add	edx,DWORD [4+esp]
+	add	ebx,ecx
+	mov	ecx,edx
+	ror	edx,14
+	mov	esi,DWORD [8+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [12+esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [4+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [24+esp]
+	mov	esi,ebx
+	ror	ecx,9
+	mov	DWORD [20+esp],ebx
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [16+esp]
+	ror	ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	add	edx,DWORD [44+esp]
+	xor	eax,edi
+	ror	ecx,2
+	add	eax,edx
+	add	edx,DWORD [esp]
+	add	eax,ecx
+	mov	ecx,edx
+	ror	edx,14
+	mov	esi,DWORD [4+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [8+esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [20+esp]
+	mov	esi,eax
+	ror	ecx,9
+	mov	DWORD [16+esp],eax
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [12+esp]
+	ror	ecx,11
+	and	ebx,eax
+	xor	ecx,esi
+	add	edx,DWORD [48+esp]
+	xor	ebx,edi
+	ror	ecx,2
+	add	ebx,edx
+	add	edx,DWORD [28+esp]
+	add	ebx,ecx
+	mov	ecx,edx
+	ror	edx,14
+	mov	esi,DWORD [esp]
+	xor	edx,ecx
+	mov	edi,DWORD [4+esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [28+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [16+esp]
+	mov	esi,ebx
+	ror	ecx,9
+	mov	DWORD [12+esp],ebx
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [8+esp]
+	ror	ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	add	edx,DWORD [52+esp]
+	xor	eax,edi
+	ror	ecx,2
+	add	eax,edx
+	add	edx,DWORD [24+esp]
+	add	eax,ecx
+	mov	ecx,edx
+	ror	edx,14
+	mov	esi,DWORD [28+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [24+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [12+esp]
+	mov	esi,eax
+	ror	ecx,9
+	mov	DWORD [8+esp],eax
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [4+esp]
+	ror	ecx,11
+	and	ebx,eax
+	xor	ecx,esi
+	add	edx,DWORD [56+esp]
+	xor	ebx,edi
+	ror	ecx,2
+	add	ebx,edx
+	add	edx,DWORD [20+esp]
+	add	ebx,ecx
+	mov	ecx,edx
+	ror	edx,14
+	mov	esi,DWORD [24+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [28+esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [20+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [8+esp]
+	mov	esi,ebx
+	ror	ecx,9
+	mov	DWORD [4+esp],ebx
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [esp]
+	ror	ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	add	edx,DWORD [60+esp]
+	xor	eax,edi
+	ror	ecx,2
+	add	eax,edx
+	add	edx,DWORD [16+esp]
+	add	eax,ecx
+	mov	ecx,edx
+	ror	edx,14
+	mov	esi,DWORD [20+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [24+esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [16+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [4+esp]
+	mov	esi,eax
+	ror	ecx,9
+	mov	DWORD [esp],eax
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [28+esp]
+	ror	ecx,11
+	and	ebx,eax
+	xor	ecx,esi
+	add	edx,DWORD [64+esp]
+	xor	ebx,edi
+	ror	ecx,2
+	add	ebx,edx
+	add	edx,DWORD [12+esp]
+	add	ebx,ecx
+	mov	ecx,edx
+	ror	edx,14
+	mov	esi,DWORD [16+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [20+esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [12+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [esp]
+	mov	esi,ebx
+	ror	ecx,9
+	mov	DWORD [28+esp],ebx
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [24+esp]
+	ror	ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	add	edx,DWORD [68+esp]
+	xor	eax,edi
+	ror	ecx,2
+	add	eax,edx
+	add	edx,DWORD [8+esp]
+	add	eax,ecx
+	mov	ecx,edx
+	ror	edx,14
+	mov	esi,DWORD [12+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [16+esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [8+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [28+esp]
+	mov	esi,eax
+	ror	ecx,9
+	mov	DWORD [24+esp],eax
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [20+esp]
+	ror	ecx,11
+	and	ebx,eax
+	xor	ecx,esi
+	add	edx,DWORD [72+esp]
+	xor	ebx,edi
+	ror	ecx,2
+	add	ebx,edx
+	add	edx,DWORD [4+esp]
+	add	ebx,ecx
+	mov	ecx,edx
+	ror	edx,14
+	mov	esi,DWORD [8+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [12+esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [4+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [24+esp]
+	mov	esi,ebx
+	ror	ecx,9
+	mov	DWORD [20+esp],ebx
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [16+esp]
+	ror	ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	add	edx,DWORD [76+esp]
+	xor	eax,edi
+	ror	ecx,2
+	add	eax,edx
+	add	edx,DWORD [esp]
+	add	eax,ecx
+	mov	ecx,edx
+	ror	edx,14
+	mov	esi,DWORD [4+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [8+esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [20+esp]
+	mov	esi,eax
+	ror	ecx,9
+	mov	DWORD [16+esp],eax
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [12+esp]
+	ror	ecx,11
+	and	ebx,eax
+	xor	ecx,esi
+	add	edx,DWORD [80+esp]
+	xor	ebx,edi
+	ror	ecx,2
+	add	ebx,edx
+	add	edx,DWORD [28+esp]
+	add	ebx,ecx
+	mov	ecx,edx
+	ror	edx,14
+	mov	esi,DWORD [esp]
+	xor	edx,ecx
+	mov	edi,DWORD [4+esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [28+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [16+esp]
+	mov	esi,ebx
+	ror	ecx,9
+	mov	DWORD [12+esp],ebx
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [8+esp]
+	ror	ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	add	edx,DWORD [84+esp]
+	xor	eax,edi
+	ror	ecx,2
+	add	eax,edx
+	add	edx,DWORD [24+esp]
+	add	eax,ecx
+	mov	ecx,edx
+	ror	edx,14
+	mov	esi,DWORD [28+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [24+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [12+esp]
+	mov	esi,eax
+	ror	ecx,9
+	mov	DWORD [8+esp],eax
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [4+esp]
+	ror	ecx,11
+	and	ebx,eax
+	xor	ecx,esi
+	add	edx,DWORD [88+esp]
+	xor	ebx,edi
+	ror	ecx,2
+	add	ebx,edx
+	add	edx,DWORD [20+esp]
+	add	ebx,ecx
+	mov	ecx,edx
+	ror	edx,14
+	mov	esi,DWORD [24+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [28+esp]
+	xor	esi,edi
+	ror	edx,5
+	and	esi,ecx
+	mov	DWORD [20+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	ror	edx,6
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [8+esp]
+	mov	esi,ebx
+	ror	ecx,9
+	mov	DWORD [4+esp],ebx
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [esp]
+	ror	ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	add	edx,DWORD [92+esp]
+	xor	eax,edi
+	ror	ecx,2
+	add	eax,edx
+	add	edx,DWORD [16+esp]
+	add	eax,ecx
+	mov	esi,DWORD [96+esp]
+	xor	ebx,edi
+	mov	ecx,DWORD [12+esp]
+	add	eax,DWORD [esi]
+	add	ebx,DWORD [4+esi]
+	add	edi,DWORD [8+esi]
+	add	ecx,DWORD [12+esi]
+	mov	DWORD [esi],eax
+	mov	DWORD [4+esi],ebx
+	mov	DWORD [8+esi],edi
+	mov	DWORD [12+esi],ecx
+	mov	DWORD [4+esp],ebx
+	xor	ebx,edi
+	mov	DWORD [8+esp],edi
+	mov	DWORD [12+esp],ecx
+	mov	edi,DWORD [20+esp]
+	mov	ecx,DWORD [24+esp]
+	add	edx,DWORD [16+esi]
+	add	edi,DWORD [20+esi]
+	add	ecx,DWORD [24+esi]
+	mov	DWORD [16+esi],edx
+	mov	DWORD [20+esi],edi
+	mov	DWORD [20+esp],edi
+	mov	edi,DWORD [28+esp]
+	mov	DWORD [24+esi],ecx
+	add	edi,DWORD [28+esi]
+	mov	DWORD [24+esp],ecx
+	mov	DWORD [28+esi],edi
+	mov	DWORD [28+esp],edi
+	mov	edi,DWORD [100+esp]
+	movdqa	xmm7,[64+ebp]
+	sub	ebp,192
+	cmp	edi,DWORD [104+esp]
+	jb	NEAR L$008grand_ssse3
+	mov	esp,DWORD [108+esp]
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+global	_sha256_block_data_order_avx
+align	16
+_sha256_block_data_order_avx:
+L$_sha256_block_data_order_avx_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	mov	esi,DWORD [20+esp]
+	mov	edi,DWORD [24+esp]
+	mov	eax,DWORD [28+esp]
+	mov	ebx,esp
+	call	L$010pic_point
+L$010pic_point:
+	pop	ebp
+	lea	ebp,[(L$K256-L$010pic_point)+ebp]
+	sub	esp,16
+	and	esp,-64
+	shl	eax,6
+	add	eax,edi
+	mov	DWORD [esp],esi
+	mov	DWORD [4+esp],edi
+	mov	DWORD [8+esp],eax
+	mov	DWORD [12+esp],ebx
+	lea	esp,[esp-96]
+	vzeroall
+	mov	eax,DWORD [esi]
+	mov	ebx,DWORD [4+esi]
+	mov	ecx,DWORD [8+esi]
+	mov	edi,DWORD [12+esi]
+	mov	DWORD [4+esp],ebx
+	xor	ebx,ecx
+	mov	DWORD [8+esp],ecx
+	mov	DWORD [12+esp],edi
+	mov	edx,DWORD [16+esi]
+	mov	edi,DWORD [20+esi]
+	mov	ecx,DWORD [24+esi]
+	mov	esi,DWORD [28+esi]
+	mov	DWORD [20+esp],edi
+	mov	edi,DWORD [100+esp]
+	mov	DWORD [24+esp],ecx
+	mov	DWORD [28+esp],esi
+	vmovdqa	xmm7,[256+ebp]
+	jmp	NEAR L$011grand_avx
+align	32
+L$011grand_avx:
+	vmovdqu	xmm0,[edi]
+	vmovdqu	xmm1,[16+edi]
+	vmovdqu	xmm2,[32+edi]
+	vmovdqu	xmm3,[48+edi]
+	add	edi,64
+	vpshufb	xmm0,xmm0,xmm7
+	mov	DWORD [100+esp],edi
+	vpshufb	xmm1,xmm1,xmm7
+	vpshufb	xmm2,xmm2,xmm7
+	vpaddd	xmm4,xmm0,[ebp]
+	vpshufb	xmm3,xmm3,xmm7
+	vpaddd	xmm5,xmm1,[16+ebp]
+	vpaddd	xmm6,xmm2,[32+ebp]
+	vpaddd	xmm7,xmm3,[48+ebp]
+	vmovdqa	[32+esp],xmm4
+	vmovdqa	[48+esp],xmm5
+	vmovdqa	[64+esp],xmm6
+	vmovdqa	[80+esp],xmm7
+	jmp	NEAR L$012avx_00_47
+align	16
+L$012avx_00_47:
+	add	ebp,64
+	vpalignr	xmm4,xmm1,xmm0,4
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [20+esp]
+	vpalignr	xmm7,xmm3,xmm2,4
+	xor	edx,ecx
+	mov	edi,DWORD [24+esp]
+	xor	esi,edi
+	vpsrld	xmm6,xmm4,7
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [16+esp],ecx
+	vpaddd	xmm0,xmm0,xmm7
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	vpsrld	xmm7,xmm4,3
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [4+esp]
+	vpslld	xmm5,xmm4,14
+	mov	esi,eax
+	shrd	ecx,ecx,9
+	mov	DWORD [esp],eax
+	vpxor	xmm4,xmm7,xmm6
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [28+esp]
+	vpshufd	xmm7,xmm3,250
+	shrd	ecx,ecx,11
+	and	ebx,eax
+	xor	ecx,esi
+	vpsrld	xmm6,xmm6,11
+	add	edx,DWORD [32+esp]
+	xor	ebx,edi
+	shrd	ecx,ecx,2
+	vpxor	xmm4,xmm4,xmm5
+	add	ebx,edx
+	add	edx,DWORD [12+esp]
+	add	ebx,ecx
+	vpslld	xmm5,xmm5,11
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [16+esp]
+	vpxor	xmm4,xmm4,xmm6
+	xor	edx,ecx
+	mov	edi,DWORD [20+esp]
+	xor	esi,edi
+	vpsrld	xmm6,xmm7,10
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [12+esp],ecx
+	vpxor	xmm4,xmm4,xmm5
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	vpsrlq	xmm5,xmm7,17
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [esp]
+	vpaddd	xmm0,xmm0,xmm4
+	mov	esi,ebx
+	shrd	ecx,ecx,9
+	mov	DWORD [28+esp],ebx
+	vpxor	xmm6,xmm6,xmm5
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [24+esp]
+	vpsrlq	xmm7,xmm7,19
+	shrd	ecx,ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	vpxor	xmm6,xmm6,xmm7
+	add	edx,DWORD [36+esp]
+	xor	eax,edi
+	shrd	ecx,ecx,2
+	vpshufd	xmm7,xmm6,132
+	add	eax,edx
+	add	edx,DWORD [8+esp]
+	add	eax,ecx
+	vpsrldq	xmm7,xmm7,8
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [12+esp]
+	vpaddd	xmm0,xmm0,xmm7
+	xor	edx,ecx
+	mov	edi,DWORD [16+esp]
+	xor	esi,edi
+	vpshufd	xmm7,xmm0,80
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [8+esp],ecx
+	vpsrld	xmm6,xmm7,10
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	vpsrlq	xmm5,xmm7,17
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [28+esp]
+	vpxor	xmm6,xmm6,xmm5
+	mov	esi,eax
+	shrd	ecx,ecx,9
+	mov	DWORD [24+esp],eax
+	vpsrlq	xmm7,xmm7,19
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [20+esp]
+	vpxor	xmm6,xmm6,xmm7
+	shrd	ecx,ecx,11
+	and	ebx,eax
+	xor	ecx,esi
+	vpshufd	xmm7,xmm6,232
+	add	edx,DWORD [40+esp]
+	xor	ebx,edi
+	shrd	ecx,ecx,2
+	vpslldq	xmm7,xmm7,8
+	add	ebx,edx
+	add	edx,DWORD [4+esp]
+	add	ebx,ecx
+	vpaddd	xmm0,xmm0,xmm7
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [8+esp]
+	vpaddd	xmm6,xmm0,[ebp]
+	xor	edx,ecx
+	mov	edi,DWORD [12+esp]
+	xor	esi,edi
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [4+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [24+esp]
+	mov	esi,ebx
+	shrd	ecx,ecx,9
+	mov	DWORD [20+esp],ebx
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [16+esp]
+	shrd	ecx,ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	add	edx,DWORD [44+esp]
+	xor	eax,edi
+	shrd	ecx,ecx,2
+	add	eax,edx
+	add	edx,DWORD [esp]
+	add	eax,ecx
+	vmovdqa	[32+esp],xmm6
+	vpalignr	xmm4,xmm2,xmm1,4
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [4+esp]
+	vpalignr	xmm7,xmm0,xmm3,4
+	xor	edx,ecx
+	mov	edi,DWORD [8+esp]
+	xor	esi,edi
+	vpsrld	xmm6,xmm4,7
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [esp],ecx
+	vpaddd	xmm1,xmm1,xmm7
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	vpsrld	xmm7,xmm4,3
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [20+esp]
+	vpslld	xmm5,xmm4,14
+	mov	esi,eax
+	shrd	ecx,ecx,9
+	mov	DWORD [16+esp],eax
+	vpxor	xmm4,xmm7,xmm6
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [12+esp]
+	vpshufd	xmm7,xmm0,250
+	shrd	ecx,ecx,11
+	and	ebx,eax
+	xor	ecx,esi
+	vpsrld	xmm6,xmm6,11
+	add	edx,DWORD [48+esp]
+	xor	ebx,edi
+	shrd	ecx,ecx,2
+	vpxor	xmm4,xmm4,xmm5
+	add	ebx,edx
+	add	edx,DWORD [28+esp]
+	add	ebx,ecx
+	vpslld	xmm5,xmm5,11
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [esp]
+	vpxor	xmm4,xmm4,xmm6
+	xor	edx,ecx
+	mov	edi,DWORD [4+esp]
+	xor	esi,edi
+	vpsrld	xmm6,xmm7,10
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [28+esp],ecx
+	vpxor	xmm4,xmm4,xmm5
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	vpsrlq	xmm5,xmm7,17
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [16+esp]
+	vpaddd	xmm1,xmm1,xmm4
+	mov	esi,ebx
+	shrd	ecx,ecx,9
+	mov	DWORD [12+esp],ebx
+	vpxor	xmm6,xmm6,xmm5
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [8+esp]
+	vpsrlq	xmm7,xmm7,19
+	shrd	ecx,ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	vpxor	xmm6,xmm6,xmm7
+	add	edx,DWORD [52+esp]
+	xor	eax,edi
+	shrd	ecx,ecx,2
+	vpshufd	xmm7,xmm6,132
+	add	eax,edx
+	add	edx,DWORD [24+esp]
+	add	eax,ecx
+	vpsrldq	xmm7,xmm7,8
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [28+esp]
+	vpaddd	xmm1,xmm1,xmm7
+	xor	edx,ecx
+	mov	edi,DWORD [esp]
+	xor	esi,edi
+	vpshufd	xmm7,xmm1,80
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [24+esp],ecx
+	vpsrld	xmm6,xmm7,10
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	vpsrlq	xmm5,xmm7,17
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [12+esp]
+	vpxor	xmm6,xmm6,xmm5
+	mov	esi,eax
+	shrd	ecx,ecx,9
+	mov	DWORD [8+esp],eax
+	vpsrlq	xmm7,xmm7,19
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [4+esp]
+	vpxor	xmm6,xmm6,xmm7
+	shrd	ecx,ecx,11
+	and	ebx,eax
+	xor	ecx,esi
+	vpshufd	xmm7,xmm6,232
+	add	edx,DWORD [56+esp]
+	xor	ebx,edi
+	shrd	ecx,ecx,2
+	vpslldq	xmm7,xmm7,8
+	add	ebx,edx
+	add	edx,DWORD [20+esp]
+	add	ebx,ecx
+	vpaddd	xmm1,xmm1,xmm7
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [24+esp]
+	vpaddd	xmm6,xmm1,[16+ebp]
+	xor	edx,ecx
+	mov	edi,DWORD [28+esp]
+	xor	esi,edi
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [20+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [8+esp]
+	mov	esi,ebx
+	shrd	ecx,ecx,9
+	mov	DWORD [4+esp],ebx
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [esp]
+	shrd	ecx,ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	add	edx,DWORD [60+esp]
+	xor	eax,edi
+	shrd	ecx,ecx,2
+	add	eax,edx
+	add	edx,DWORD [16+esp]
+	add	eax,ecx
+	vmovdqa	[48+esp],xmm6
+	vpalignr	xmm4,xmm3,xmm2,4
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [20+esp]
+	vpalignr	xmm7,xmm1,xmm0,4
+	xor	edx,ecx
+	mov	edi,DWORD [24+esp]
+	xor	esi,edi
+	vpsrld	xmm6,xmm4,7
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [16+esp],ecx
+	vpaddd	xmm2,xmm2,xmm7
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	vpsrld	xmm7,xmm4,3
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [4+esp]
+	vpslld	xmm5,xmm4,14
+	mov	esi,eax
+	shrd	ecx,ecx,9
+	mov	DWORD [esp],eax
+	vpxor	xmm4,xmm7,xmm6
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [28+esp]
+	vpshufd	xmm7,xmm1,250
+	shrd	ecx,ecx,11
+	and	ebx,eax
+	xor	ecx,esi
+	vpsrld	xmm6,xmm6,11
+	add	edx,DWORD [64+esp]
+	xor	ebx,edi
+	shrd	ecx,ecx,2
+	vpxor	xmm4,xmm4,xmm5
+	add	ebx,edx
+	add	edx,DWORD [12+esp]
+	add	ebx,ecx
+	vpslld	xmm5,xmm5,11
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [16+esp]
+	vpxor	xmm4,xmm4,xmm6
+	xor	edx,ecx
+	mov	edi,DWORD [20+esp]
+	xor	esi,edi
+	vpsrld	xmm6,xmm7,10
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [12+esp],ecx
+	vpxor	xmm4,xmm4,xmm5
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	vpsrlq	xmm5,xmm7,17
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [esp]
+	vpaddd	xmm2,xmm2,xmm4
+	mov	esi,ebx
+	shrd	ecx,ecx,9
+	mov	DWORD [28+esp],ebx
+	vpxor	xmm6,xmm6,xmm5
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [24+esp]
+	vpsrlq	xmm7,xmm7,19
+	shrd	ecx,ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	vpxor	xmm6,xmm6,xmm7
+	add	edx,DWORD [68+esp]
+	xor	eax,edi
+	shrd	ecx,ecx,2
+	vpshufd	xmm7,xmm6,132
+	add	eax,edx
+	add	edx,DWORD [8+esp]
+	add	eax,ecx
+	vpsrldq	xmm7,xmm7,8
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [12+esp]
+	vpaddd	xmm2,xmm2,xmm7
+	xor	edx,ecx
+	mov	edi,DWORD [16+esp]
+	xor	esi,edi
+	vpshufd	xmm7,xmm2,80
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [8+esp],ecx
+	vpsrld	xmm6,xmm7,10
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	vpsrlq	xmm5,xmm7,17
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [28+esp]
+	vpxor	xmm6,xmm6,xmm5
+	mov	esi,eax
+	shrd	ecx,ecx,9
+	mov	DWORD [24+esp],eax
+	vpsrlq	xmm7,xmm7,19
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [20+esp]
+	vpxor	xmm6,xmm6,xmm7
+	shrd	ecx,ecx,11
+	and	ebx,eax
+	xor	ecx,esi
+	vpshufd	xmm7,xmm6,232
+	add	edx,DWORD [72+esp]
+	xor	ebx,edi
+	shrd	ecx,ecx,2
+	vpslldq	xmm7,xmm7,8
+	add	ebx,edx
+	add	edx,DWORD [4+esp]
+	add	ebx,ecx
+	vpaddd	xmm2,xmm2,xmm7
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [8+esp]
+	vpaddd	xmm6,xmm2,[32+ebp]
+	xor	edx,ecx
+	mov	edi,DWORD [12+esp]
+	xor	esi,edi
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [4+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [24+esp]
+	mov	esi,ebx
+	shrd	ecx,ecx,9
+	mov	DWORD [20+esp],ebx
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [16+esp]
+	shrd	ecx,ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	add	edx,DWORD [76+esp]
+	xor	eax,edi
+	shrd	ecx,ecx,2
+	add	eax,edx
+	add	edx,DWORD [esp]
+	add	eax,ecx
+	vmovdqa	[64+esp],xmm6
+	vpalignr	xmm4,xmm0,xmm3,4
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [4+esp]
+	vpalignr	xmm7,xmm2,xmm1,4
+	xor	edx,ecx
+	mov	edi,DWORD [8+esp]
+	xor	esi,edi
+	vpsrld	xmm6,xmm4,7
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [esp],ecx
+	vpaddd	xmm3,xmm3,xmm7
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	vpsrld	xmm7,xmm4,3
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [20+esp]
+	vpslld	xmm5,xmm4,14
+	mov	esi,eax
+	shrd	ecx,ecx,9
+	mov	DWORD [16+esp],eax
+	vpxor	xmm4,xmm7,xmm6
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [12+esp]
+	vpshufd	xmm7,xmm2,250
+	shrd	ecx,ecx,11
+	and	ebx,eax
+	xor	ecx,esi
+	vpsrld	xmm6,xmm6,11
+	add	edx,DWORD [80+esp]
+	xor	ebx,edi
+	shrd	ecx,ecx,2
+	vpxor	xmm4,xmm4,xmm5
+	add	ebx,edx
+	add	edx,DWORD [28+esp]
+	add	ebx,ecx
+	vpslld	xmm5,xmm5,11
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [esp]
+	vpxor	xmm4,xmm4,xmm6
+	xor	edx,ecx
+	mov	edi,DWORD [4+esp]
+	xor	esi,edi
+	vpsrld	xmm6,xmm7,10
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [28+esp],ecx
+	vpxor	xmm4,xmm4,xmm5
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	vpsrlq	xmm5,xmm7,17
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [16+esp]
+	vpaddd	xmm3,xmm3,xmm4
+	mov	esi,ebx
+	shrd	ecx,ecx,9
+	mov	DWORD [12+esp],ebx
+	vpxor	xmm6,xmm6,xmm5
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [8+esp]
+	vpsrlq	xmm7,xmm7,19
+	shrd	ecx,ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	vpxor	xmm6,xmm6,xmm7
+	add	edx,DWORD [84+esp]
+	xor	eax,edi
+	shrd	ecx,ecx,2
+	vpshufd	xmm7,xmm6,132
+	add	eax,edx
+	add	edx,DWORD [24+esp]
+	add	eax,ecx
+	vpsrldq	xmm7,xmm7,8
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [28+esp]
+	vpaddd	xmm3,xmm3,xmm7
+	xor	edx,ecx
+	mov	edi,DWORD [esp]
+	xor	esi,edi
+	vpshufd	xmm7,xmm3,80
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [24+esp],ecx
+	vpsrld	xmm6,xmm7,10
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	vpsrlq	xmm5,xmm7,17
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [12+esp]
+	vpxor	xmm6,xmm6,xmm5
+	mov	esi,eax
+	shrd	ecx,ecx,9
+	mov	DWORD [8+esp],eax
+	vpsrlq	xmm7,xmm7,19
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [4+esp]
+	vpxor	xmm6,xmm6,xmm7
+	shrd	ecx,ecx,11
+	and	ebx,eax
+	xor	ecx,esi
+	vpshufd	xmm7,xmm6,232
+	add	edx,DWORD [88+esp]
+	xor	ebx,edi
+	shrd	ecx,ecx,2
+	vpslldq	xmm7,xmm7,8
+	add	ebx,edx
+	add	edx,DWORD [20+esp]
+	add	ebx,ecx
+	vpaddd	xmm3,xmm3,xmm7
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [24+esp]
+	vpaddd	xmm6,xmm3,[48+ebp]
+	xor	edx,ecx
+	mov	edi,DWORD [28+esp]
+	xor	esi,edi
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [20+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [8+esp]
+	mov	esi,ebx
+	shrd	ecx,ecx,9
+	mov	DWORD [4+esp],ebx
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [esp]
+	shrd	ecx,ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	add	edx,DWORD [92+esp]
+	xor	eax,edi
+	shrd	ecx,ecx,2
+	add	eax,edx
+	add	edx,DWORD [16+esp]
+	add	eax,ecx
+	vmovdqa	[80+esp],xmm6
+	cmp	DWORD [64+ebp],66051
+	jne	NEAR L$012avx_00_47
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [20+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [24+esp]
+	xor	esi,edi
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [16+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [4+esp]
+	mov	esi,eax
+	shrd	ecx,ecx,9
+	mov	DWORD [esp],eax
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [28+esp]
+	shrd	ecx,ecx,11
+	and	ebx,eax
+	xor	ecx,esi
+	add	edx,DWORD [32+esp]
+	xor	ebx,edi
+	shrd	ecx,ecx,2
+	add	ebx,edx
+	add	edx,DWORD [12+esp]
+	add	ebx,ecx
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [16+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [20+esp]
+	xor	esi,edi
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [12+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [esp]
+	mov	esi,ebx
+	shrd	ecx,ecx,9
+	mov	DWORD [28+esp],ebx
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [24+esp]
+	shrd	ecx,ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	add	edx,DWORD [36+esp]
+	xor	eax,edi
+	shrd	ecx,ecx,2
+	add	eax,edx
+	add	edx,DWORD [8+esp]
+	add	eax,ecx
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [12+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [16+esp]
+	xor	esi,edi
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [8+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [28+esp]
+	mov	esi,eax
+	shrd	ecx,ecx,9
+	mov	DWORD [24+esp],eax
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [20+esp]
+	shrd	ecx,ecx,11
+	and	ebx,eax
+	xor	ecx,esi
+	add	edx,DWORD [40+esp]
+	xor	ebx,edi
+	shrd	ecx,ecx,2
+	add	ebx,edx
+	add	edx,DWORD [4+esp]
+	add	ebx,ecx
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [8+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [12+esp]
+	xor	esi,edi
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [4+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [24+esp]
+	mov	esi,ebx
+	shrd	ecx,ecx,9
+	mov	DWORD [20+esp],ebx
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [16+esp]
+	shrd	ecx,ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	add	edx,DWORD [44+esp]
+	xor	eax,edi
+	shrd	ecx,ecx,2
+	add	eax,edx
+	add	edx,DWORD [esp]
+	add	eax,ecx
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [4+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [8+esp]
+	xor	esi,edi
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [20+esp]
+	mov	esi,eax
+	shrd	ecx,ecx,9
+	mov	DWORD [16+esp],eax
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [12+esp]
+	shrd	ecx,ecx,11
+	and	ebx,eax
+	xor	ecx,esi
+	add	edx,DWORD [48+esp]
+	xor	ebx,edi
+	shrd	ecx,ecx,2
+	add	ebx,edx
+	add	edx,DWORD [28+esp]
+	add	ebx,ecx
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [esp]
+	xor	edx,ecx
+	mov	edi,DWORD [4+esp]
+	xor	esi,edi
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [28+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [16+esp]
+	mov	esi,ebx
+	shrd	ecx,ecx,9
+	mov	DWORD [12+esp],ebx
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [8+esp]
+	shrd	ecx,ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	add	edx,DWORD [52+esp]
+	xor	eax,edi
+	shrd	ecx,ecx,2
+	add	eax,edx
+	add	edx,DWORD [24+esp]
+	add	eax,ecx
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [28+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [esp]
+	xor	esi,edi
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [24+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [12+esp]
+	mov	esi,eax
+	shrd	ecx,ecx,9
+	mov	DWORD [8+esp],eax
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [4+esp]
+	shrd	ecx,ecx,11
+	and	ebx,eax
+	xor	ecx,esi
+	add	edx,DWORD [56+esp]
+	xor	ebx,edi
+	shrd	ecx,ecx,2
+	add	ebx,edx
+	add	edx,DWORD [20+esp]
+	add	ebx,ecx
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [24+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [28+esp]
+	xor	esi,edi
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [20+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [8+esp]
+	mov	esi,ebx
+	shrd	ecx,ecx,9
+	mov	DWORD [4+esp],ebx
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [esp]
+	shrd	ecx,ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	add	edx,DWORD [60+esp]
+	xor	eax,edi
+	shrd	ecx,ecx,2
+	add	eax,edx
+	add	edx,DWORD [16+esp]
+	add	eax,ecx
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [20+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [24+esp]
+	xor	esi,edi
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [16+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [4+esp]
+	mov	esi,eax
+	shrd	ecx,ecx,9
+	mov	DWORD [esp],eax
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [28+esp]
+	shrd	ecx,ecx,11
+	and	ebx,eax
+	xor	ecx,esi
+	add	edx,DWORD [64+esp]
+	xor	ebx,edi
+	shrd	ecx,ecx,2
+	add	ebx,edx
+	add	edx,DWORD [12+esp]
+	add	ebx,ecx
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [16+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [20+esp]
+	xor	esi,edi
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [12+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [esp]
+	mov	esi,ebx
+	shrd	ecx,ecx,9
+	mov	DWORD [28+esp],ebx
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [24+esp]
+	shrd	ecx,ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	add	edx,DWORD [68+esp]
+	xor	eax,edi
+	shrd	ecx,ecx,2
+	add	eax,edx
+	add	edx,DWORD [8+esp]
+	add	eax,ecx
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [12+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [16+esp]
+	xor	esi,edi
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [8+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [28+esp]
+	mov	esi,eax
+	shrd	ecx,ecx,9
+	mov	DWORD [24+esp],eax
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [20+esp]
+	shrd	ecx,ecx,11
+	and	ebx,eax
+	xor	ecx,esi
+	add	edx,DWORD [72+esp]
+	xor	ebx,edi
+	shrd	ecx,ecx,2
+	add	ebx,edx
+	add	edx,DWORD [4+esp]
+	add	ebx,ecx
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [8+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [12+esp]
+	xor	esi,edi
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [4+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [24+esp]
+	mov	esi,ebx
+	shrd	ecx,ecx,9
+	mov	DWORD [20+esp],ebx
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [16+esp]
+	shrd	ecx,ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	add	edx,DWORD [76+esp]
+	xor	eax,edi
+	shrd	ecx,ecx,2
+	add	eax,edx
+	add	edx,DWORD [esp]
+	add	eax,ecx
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [4+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [8+esp]
+	xor	esi,edi
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [20+esp]
+	mov	esi,eax
+	shrd	ecx,ecx,9
+	mov	DWORD [16+esp],eax
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [12+esp]
+	shrd	ecx,ecx,11
+	and	ebx,eax
+	xor	ecx,esi
+	add	edx,DWORD [80+esp]
+	xor	ebx,edi
+	shrd	ecx,ecx,2
+	add	ebx,edx
+	add	edx,DWORD [28+esp]
+	add	ebx,ecx
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [esp]
+	xor	edx,ecx
+	mov	edi,DWORD [4+esp]
+	xor	esi,edi
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [28+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [16+esp]
+	mov	esi,ebx
+	shrd	ecx,ecx,9
+	mov	DWORD [12+esp],ebx
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [8+esp]
+	shrd	ecx,ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	add	edx,DWORD [84+esp]
+	xor	eax,edi
+	shrd	ecx,ecx,2
+	add	eax,edx
+	add	edx,DWORD [24+esp]
+	add	eax,ecx
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [28+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [esp]
+	xor	esi,edi
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [24+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	mov	ecx,eax
+	add	edx,edi
+	mov	edi,DWORD [12+esp]
+	mov	esi,eax
+	shrd	ecx,ecx,9
+	mov	DWORD [8+esp],eax
+	xor	ecx,eax
+	xor	eax,edi
+	add	edx,DWORD [4+esp]
+	shrd	ecx,ecx,11
+	and	ebx,eax
+	xor	ecx,esi
+	add	edx,DWORD [88+esp]
+	xor	ebx,edi
+	shrd	ecx,ecx,2
+	add	ebx,edx
+	add	edx,DWORD [20+esp]
+	add	ebx,ecx
+	mov	ecx,edx
+	shrd	edx,edx,14
+	mov	esi,DWORD [24+esp]
+	xor	edx,ecx
+	mov	edi,DWORD [28+esp]
+	xor	esi,edi
+	shrd	edx,edx,5
+	and	esi,ecx
+	mov	DWORD [20+esp],ecx
+	xor	edx,ecx
+	xor	edi,esi
+	shrd	edx,edx,6
+	mov	ecx,ebx
+	add	edx,edi
+	mov	edi,DWORD [8+esp]
+	mov	esi,ebx
+	shrd	ecx,ecx,9
+	mov	DWORD [4+esp],ebx
+	xor	ecx,ebx
+	xor	ebx,edi
+	add	edx,DWORD [esp]
+	shrd	ecx,ecx,11
+	and	eax,ebx
+	xor	ecx,esi
+	add	edx,DWORD [92+esp]
+	xor	eax,edi
+	shrd	ecx,ecx,2
+	add	eax,edx
+	add	edx,DWORD [16+esp]
+	add	eax,ecx
+	mov	esi,DWORD [96+esp]
+	xor	ebx,edi
+	mov	ecx,DWORD [12+esp]
+	add	eax,DWORD [esi]
+	add	ebx,DWORD [4+esi]
+	add	edi,DWORD [8+esi]
+	add	ecx,DWORD [12+esi]
+	mov	DWORD [esi],eax
+	mov	DWORD [4+esi],ebx
+	mov	DWORD [8+esi],edi
+	mov	DWORD [12+esi],ecx
+	mov	DWORD [4+esp],ebx
+	xor	ebx,edi
+	mov	DWORD [8+esp],edi
+	mov	DWORD [12+esp],ecx
+	mov	edi,DWORD [20+esp]
+	mov	ecx,DWORD [24+esp]
+	add	edx,DWORD [16+esi]
+	add	edi,DWORD [20+esi]
+	add	ecx,DWORD [24+esi]
+	mov	DWORD [16+esi],edx
+	mov	DWORD [20+esi],edi
+	mov	DWORD [20+esp],edi
+	mov	edi,DWORD [28+esp]
+	mov	DWORD [24+esi],ecx
+	add	edi,DWORD [28+esi]
+	mov	DWORD [24+esp],ecx
+	mov	DWORD [28+esi],edi
+	mov	DWORD [28+esp],edi
+	mov	edi,DWORD [100+esp]
+	vmovdqa	xmm7,[64+ebp]
+	sub	ebp,192
+	cmp	edi,DWORD [104+esp]
+	jb	NEAR L$011grand_avx
+	mov	esp,DWORD [108+esp]
+	vzeroall
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/sha256-armv4-linux.S b/gen/bcm/sha256-armv4-linux.S
new file mode 100644
index 0000000..fca0681
--- /dev/null
+++ b/gen/bcm/sha256-armv4-linux.S
@@ -0,0 +1,2839 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
+@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+@
+@ Licensed under the OpenSSL license (the "License").  You may not use
+@ this file except in compliance with the License.  You can obtain a copy
+@ in the file LICENSE in the source distribution or at
+@ https://www.openssl.org/source/license.html
+
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ SHA256 block procedure for ARMv4. May 2007.
+
+@ Performance is ~2x better than gcc 3.4 generated code and in "abso-
+@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+@ byte [on single-issue Xscale PXA250 core].
+
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
+@ Cortex A8 core and ~20 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 16%
+@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+@ September 2013.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process one
+@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+@ code (meaning that latter performs sub-optimally, nothing was done
+@ about it).
+
+@ May 2014.
+@
+@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+#else
+# define __ARM_ARCH __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those
+@ instructions are manually-encoded. (See unsha256.)
+.arch	armv7-a
+
+.text
+#if defined(__thumb2__)
+.syntax	unified
+.thumb
+#else
+.code	32
+#endif
+
+.type	K256,%object
+.align	5
+K256:
+.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size	K256,.-K256
+.word	0				@ terminator
+.align	5
+
+.globl	sha256_block_data_order_nohw
+.hidden	sha256_block_data_order_nohw
+.type	sha256_block_data_order_nohw,%function
+sha256_block_data_order_nohw:
+	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
+	stmdb	sp!,{r0,r1,r2,r4-r11,lr}
+	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11}
+	adr	r14,K256
+	sub	sp,sp,#16*4		@ alloca(X[16])
+.Loop:
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6		@ magic
+	eor	r12,r12,r12
+#if __ARM_ARCH>=7
+	@ ldr	r2,[r1],#4			@ 0
+# if 0==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 0
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 0==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#0*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 0==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 0<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#2*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#15*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+	@ ldr	r2,[r1],#4			@ 1
+# if 1==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 1
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 1==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#1*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 1==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 1<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#3*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#0*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+	@ ldr	r2,[r1],#4			@ 2
+# if 2==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 2
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 2==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#2*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 2==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 2<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#4*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#1*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+	@ ldr	r2,[r1],#4			@ 3
+# if 3==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 3
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 3==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#3*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 3==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 3<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#5*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#2*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+	@ ldr	r2,[r1],#4			@ 4
+# if 4==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 4
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 4==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#4*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 4==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 4<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#6*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#3*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+	@ ldr	r2,[r1],#4			@ 5
+# if 5==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 5
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 5==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#5*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 5==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 5<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#7*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#4*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+	@ ldr	r2,[r1],#4			@ 6
+# if 6==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 6
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 6==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#6*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 6==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 6<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#8*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#5*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+	@ ldr	r2,[r1],#4			@ 7
+# if 7==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 7
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 7==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#7*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 7==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 7<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#9*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#6*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+	@ ldr	r2,[r1],#4			@ 8
+# if 8==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 8
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 8==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#8*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 8==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 8<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#10*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#7*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+	@ ldr	r2,[r1],#4			@ 9
+# if 9==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 9
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 9==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#9*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 9==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 9<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#11*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#8*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+	@ ldr	r2,[r1],#4			@ 10
+# if 10==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 10
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 10==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#10*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 10==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 10<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#12*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#9*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+	@ ldr	r2,[r1],#4			@ 11
+# if 11==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 11
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 11==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#11*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 11==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 11<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#13*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#10*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+	@ ldr	r2,[r1],#4			@ 12
+# if 12==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 12
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 12==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#12*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 12==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 12<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#14*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#11*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+	@ ldr	r2,[r1],#4			@ 13
+# if 13==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 13
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 13==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#13*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 13==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 13<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#15*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#12*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+	@ ldr	r2,[r1],#4			@ 14
+# if 14==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 14
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 14==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#14*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 14==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 14<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#0*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#13*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+	@ ldr	r2,[r1],#4			@ 15
+# if 15==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 15
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 15==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#15*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 15==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 15<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#1*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#14*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+.Lrounds_16_xx:
+	@ ldr	r2,[sp,#1*4]		@ 16
+	@ ldr	r1,[sp,#14*4]
+	mov	r0,r2,ror#7
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#0*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#9*4]
+
+	add	r12,r12,r0
+	eor	r0,r8,r8,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#0*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 16==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 16<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#2*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#15*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#2*4]		@ 17
+	@ ldr	r1,[sp,#15*4]
+	mov	r0,r2,ror#7
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#1*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#10*4]
+
+	add	r3,r3,r0
+	eor	r0,r7,r7,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#1*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 17==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 17<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#3*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#0*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#3*4]		@ 18
+	@ ldr	r1,[sp,#0*4]
+	mov	r0,r2,ror#7
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#2*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#11*4]
+
+	add	r12,r12,r0
+	eor	r0,r6,r6,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#2*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 18==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 18<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#4*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#1*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#4*4]		@ 19
+	@ ldr	r1,[sp,#1*4]
+	mov	r0,r2,ror#7
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#3*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#12*4]
+
+	add	r3,r3,r0
+	eor	r0,r5,r5,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#3*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 19==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 19<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#5*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#2*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#5*4]		@ 20
+	@ ldr	r1,[sp,#2*4]
+	mov	r0,r2,ror#7
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#4*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#13*4]
+
+	add	r12,r12,r0
+	eor	r0,r4,r4,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#4*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 20==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 20<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#6*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#3*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#6*4]		@ 21
+	@ ldr	r1,[sp,#3*4]
+	mov	r0,r2,ror#7
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#5*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#14*4]
+
+	add	r3,r3,r0
+	eor	r0,r11,r11,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#5*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 21==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 21<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#7*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#4*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#7*4]		@ 22
+	@ ldr	r1,[sp,#4*4]
+	mov	r0,r2,ror#7
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#6*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#15*4]
+
+	add	r12,r12,r0
+	eor	r0,r10,r10,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#6*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 22==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 22<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#8*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#5*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#8*4]		@ 23
+	@ ldr	r1,[sp,#5*4]
+	mov	r0,r2,ror#7
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#7*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#0*4]
+
+	add	r3,r3,r0
+	eor	r0,r9,r9,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#7*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 23==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 23<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#9*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#6*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#9*4]		@ 24
+	@ ldr	r1,[sp,#6*4]
+	mov	r0,r2,ror#7
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#8*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#1*4]
+
+	add	r12,r12,r0
+	eor	r0,r8,r8,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#8*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 24==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 24<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#10*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#7*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#10*4]		@ 25
+	@ ldr	r1,[sp,#7*4]
+	mov	r0,r2,ror#7
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#9*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#2*4]
+
+	add	r3,r3,r0
+	eor	r0,r7,r7,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#9*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 25==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 25<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#11*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#8*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#11*4]		@ 26
+	@ ldr	r1,[sp,#8*4]
+	mov	r0,r2,ror#7
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#10*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#3*4]
+
+	add	r12,r12,r0
+	eor	r0,r6,r6,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#10*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 26==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 26<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#12*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#9*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#12*4]		@ 27
+	@ ldr	r1,[sp,#9*4]
+	mov	r0,r2,ror#7
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#11*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#4*4]
+
+	add	r3,r3,r0
+	eor	r0,r5,r5,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#11*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 27==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 27<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#13*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#10*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#13*4]		@ 28
+	@ ldr	r1,[sp,#10*4]
+	mov	r0,r2,ror#7
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#12*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#5*4]
+
+	add	r12,r12,r0
+	eor	r0,r4,r4,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#12*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 28==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 28<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#14*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#11*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#14*4]		@ 29
+	@ ldr	r1,[sp,#11*4]
+	mov	r0,r2,ror#7
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#13*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#6*4]
+
+	add	r3,r3,r0
+	eor	r0,r11,r11,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#13*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 29==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 29<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#15*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#12*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#15*4]		@ 30
+	@ ldr	r1,[sp,#12*4]
+	mov	r0,r2,ror#7
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#14*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#7*4]
+
+	add	r12,r12,r0
+	eor	r0,r10,r10,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#14*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 30==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 30<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#0*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#13*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#0*4]		@ 31
+	@ ldr	r1,[sp,#13*4]
+	mov	r0,r2,ror#7
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#15*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#8*4]
+
+	add	r3,r3,r0
+	eor	r0,r9,r9,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#15*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 31==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 31<15
+# if __ARM_ARCH>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#1*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#14*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+	ite	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	ldreq	r3,[sp,#16*4]		@ pull ctx
+	bne	.Lrounds_16_xx
+
+	add	r4,r4,r12		@ h+=Maj(a,b,c) from the past
+	ldr	r0,[r3,#0]
+	ldr	r2,[r3,#4]
+	ldr	r12,[r3,#8]
+	add	r4,r4,r0
+	ldr	r0,[r3,#12]
+	add	r5,r5,r2
+	ldr	r2,[r3,#16]
+	add	r6,r6,r12
+	ldr	r12,[r3,#20]
+	add	r7,r7,r0
+	ldr	r0,[r3,#24]
+	add	r8,r8,r2
+	ldr	r2,[r3,#28]
+	add	r9,r9,r12
+	ldr	r1,[sp,#17*4]		@ pull inp
+	ldr	r12,[sp,#18*4]		@ pull inp+len
+	add	r10,r10,r0
+	add	r11,r11,r2
+	stmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}
+	cmp	r1,r12
+	sub	r14,r14,#256	@ rewind Ktbl
+	bne	.Loop
+
+	add	sp,sp,#19*4	@ destroy frame
+#if __ARM_ARCH>=5
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
+#else
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+.size	sha256_block_data_order_nohw,.-sha256_block_data_order_nohw
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.LK256_shortcut_neon:
+@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
+#if defined(__thumb2__)
+.word	K256-(.LK256_add_neon+4)
+#else
+.word	K256-(.LK256_add_neon+8)
+#endif
+
+.globl	sha256_block_data_order_neon
+.hidden	sha256_block_data_order_neon
+.type	sha256_block_data_order_neon,%function
+.align	5
+.skip	16
+sha256_block_data_order_neon:
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+
+	sub	r11,sp,#16*4+16
+
+	@ K256 is just at the boundary of being easily referenced by an ADR from
+	@ this function. In Arm mode, when building with __ARM_ARCH=6, it does
+	@ not fit. By moving code around, we could make it fit, but this is too
+	@ fragile. For simplicity, just load the offset from
+	@ .LK256_shortcut_neon.
+	@
+	@ TODO(davidben): adrl would avoid a load, but clang-assembler does not
+	@ support it. We might be able to emulate it with a macro, but Android's
+	@ did not work when I tried it.
+	@ https://android.googlesource.com/platform/ndk/+/refs/heads/master/docs/ClangMigration.md#arm
+	ldr	r14,.LK256_shortcut_neon
+.LK256_add_neon:
+	add	r14,pc,r14
+
+	bic	r11,r11,#15		@ align for 128-bit stores
+	mov	r12,sp
+	mov	sp,r11			@ alloca
+	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
+
+	vld1.8	{q0},[r1]!
+	vld1.8	{q1},[r1]!
+	vld1.8	{q2},[r1]!
+	vld1.8	{q3},[r1]!
+	vld1.32	{q8},[r14,:128]!
+	vld1.32	{q9},[r14,:128]!
+	vld1.32	{q10},[r14,:128]!
+	vld1.32	{q11},[r14,:128]!
+	vrev32.8	q0,q0		@ yes, even on
+	str	r0,[sp,#64]
+	vrev32.8	q1,q1		@ big-endian
+	str	r1,[sp,#68]
+	mov	r1,sp
+	vrev32.8	q2,q2
+	str	r2,[sp,#72]
+	vrev32.8	q3,q3
+	str	r12,[sp,#76]		@ save original sp
+	vadd.i32	q8,q8,q0
+	vadd.i32	q9,q9,q1
+	vst1.32	{q8},[r1,:128]!
+	vadd.i32	q10,q10,q2
+	vst1.32	{q9},[r1,:128]!
+	vadd.i32	q11,q11,q3
+	vst1.32	{q10},[r1,:128]!
+	vst1.32	{q11},[r1,:128]!
+
+	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11}
+	sub	r1,r1,#64
+	ldr	r2,[sp,#0]
+	eor	r12,r12,r12
+	eor	r3,r5,r6
+	b	.L_00_48
+
+.align	4
+.L_00_48:
+	vext.8	q8,q0,q1,#4
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	vext.8	q9,q2,q3,#4
+	add	r4,r4,r12
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vadd.i32	q0,q0,q9
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#4]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	veor	q9,q9,q10
+	add	r10,r10,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	vshr.u32	d24,d7,#17
+	add	r11,r11,r3
+	and	r2,r2,r7
+	veor	q9,q9,q11
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	vsli.32	d24,d7,#15
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	vshr.u32	d25,d7,#10
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	vadd.i32	q0,q0,q9
+	add	r10,r10,r2
+	ldr	r2,[sp,#8]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r6,r6,r10
+	vshr.u32	d24,d7,#19
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	vsli.32	d24,d7,#13
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	veor	d25,d25,d24
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	vadd.i32	d0,d0,d25
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	vshr.u32	d24,d0,#17
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	vsli.32	d24,d0,#15
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	vshr.u32	d25,d0,#10
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#12]
+	and	r3,r3,r12
+	vshr.u32	d24,d0,#19
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	vld1.32	{q8},[r14,:128]!
+	add	r8,r8,r2
+	vsli.32	d24,d0,#13
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	veor	d25,d25,d24
+	add	r9,r9,r3
+	and	r2,r2,r5
+	vadd.i32	d1,d1,d25
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	vadd.i32	q8,q8,q0
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#16]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	vst1.32	{q8},[r1,:128]!
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vext.8	q8,q1,q2,#4
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	vext.8	q9,q3,q0,#4
+	add	r8,r8,r12
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vadd.i32	q1,q1,q9
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#20]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	veor	q9,q9,q10
+	add	r6,r6,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	vshr.u32	d24,d1,#17
+	add	r7,r7,r3
+	and	r2,r2,r11
+	veor	q9,q9,q11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	vsli.32	d24,d1,#15
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	vshr.u32	d25,d1,#10
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	vadd.i32	q1,q1,q9
+	add	r6,r6,r2
+	ldr	r2,[sp,#24]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r10,r10,r6
+	vshr.u32	d24,d1,#19
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	vsli.32	d24,d1,#13
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	veor	d25,d25,d24
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	vadd.i32	d2,d2,d25
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	vshr.u32	d24,d2,#17
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	vsli.32	d24,d2,#15
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	vshr.u32	d25,d2,#10
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#28]
+	and	r3,r3,r12
+	vshr.u32	d24,d2,#19
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	vld1.32	{q8},[r14,:128]!
+	add	r4,r4,r2
+	vsli.32	d24,d2,#13
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	veor	d25,d25,d24
+	add	r5,r5,r3
+	and	r2,r2,r9
+	vadd.i32	d3,d3,d25
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	vadd.i32	q8,q8,q1
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[sp,#32]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	vst1.32	{q8},[r1,:128]!
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	vext.8	q8,q2,q3,#4
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	vext.8	q9,q0,q1,#4
+	add	r4,r4,r12
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vadd.i32	q2,q2,q9
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#36]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	veor	q9,q9,q10
+	add	r10,r10,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	vshr.u32	d24,d3,#17
+	add	r11,r11,r3
+	and	r2,r2,r7
+	veor	q9,q9,q11
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	vsli.32	d24,d3,#15
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	vshr.u32	d25,d3,#10
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	vadd.i32	q2,q2,q9
+	add	r10,r10,r2
+	ldr	r2,[sp,#40]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r6,r6,r10
+	vshr.u32	d24,d3,#19
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	vsli.32	d24,d3,#13
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	veor	d25,d25,d24
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	vadd.i32	d4,d4,d25
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	vshr.u32	d24,d4,#17
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	vsli.32	d24,d4,#15
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	vshr.u32	d25,d4,#10
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#44]
+	and	r3,r3,r12
+	vshr.u32	d24,d4,#19
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	vld1.32	{q8},[r14,:128]!
+	add	r8,r8,r2
+	vsli.32	d24,d4,#13
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	veor	d25,d25,d24
+	add	r9,r9,r3
+	and	r2,r2,r5
+	vadd.i32	d5,d5,d25
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	vadd.i32	q8,q8,q2
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#48]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	vst1.32	{q8},[r1,:128]!
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vext.8	q8,q3,q0,#4
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	vext.8	q9,q1,q2,#4
+	add	r8,r8,r12
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vadd.i32	q3,q3,q9
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#52]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	veor	q9,q9,q10
+	add	r6,r6,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	vshr.u32	d24,d5,#17
+	add	r7,r7,r3
+	and	r2,r2,r11
+	veor	q9,q9,q11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	vsli.32	d24,d5,#15
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	vshr.u32	d25,d5,#10
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	vadd.i32	q3,q3,q9
+	add	r6,r6,r2
+	ldr	r2,[sp,#56]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r10,r10,r6
+	vshr.u32	d24,d5,#19
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	vsli.32	d24,d5,#13
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	veor	d25,d25,d24
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	vadd.i32	d6,d6,d25
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	vshr.u32	d24,d6,#17
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	vsli.32	d24,d6,#15
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	vshr.u32	d25,d6,#10
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#60]
+	and	r3,r3,r12
+	vshr.u32	d24,d6,#19
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	vld1.32	{q8},[r14,:128]!
+	add	r4,r4,r2
+	vsli.32	d24,d6,#13
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	veor	d25,d25,d24
+	add	r5,r5,r3
+	and	r2,r2,r9
+	vadd.i32	d7,d7,d25
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	vadd.i32	q8,q8,q3
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[r14]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	vst1.32	{q8},[r1,:128]!
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	teq	r2,#0				@ check for K256 terminator
+	ldr	r2,[sp,#0]
+	sub	r1,r1,#64
+	bne	.L_00_48
+
+	ldr	r1,[sp,#68]
+	ldr	r0,[sp,#72]
+	sub	r14,r14,#256	@ rewind r14
+	teq	r1,r0
+	it	eq
+	subeq	r1,r1,#64		@ avoid SEGV
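+	@ If the input is exhausted, the last block is read again rather than
+	@ reading past the end; the prefetched data goes unused after the loop.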
+	vld1.8	{q0},[r1]!		@ load next input block
+	vld1.8	{q1},[r1]!
+	vld1.8	{q2},[r1]!
+	vld1.8	{q3},[r1]!
+	it	ne
+	strne	r1,[sp,#68]
+	mov	r1,sp
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vrev32.8	q0,q0
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vadd.i32	q8,q8,q0
+	ldr	r2,[sp,#4]
+	and	r3,r3,r12
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	add	r10,r10,r2
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3
+	and	r2,r2,r7
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	add	r10,r10,r2
+	ldr	r2,[sp,#8]
+	and	r12,r12,r3
+	add	r6,r6,r10
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	ldr	r2,[sp,#12]
+	and	r3,r3,r12
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	add	r8,r8,r2
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3
+	and	r2,r2,r5
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#16]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vst1.32	{q8},[r1,:128]!
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vrev32.8	q1,q1
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vadd.i32	q8,q8,q1
+	ldr	r2,[sp,#20]
+	and	r3,r3,r12
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	add	r6,r6,r2
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3
+	and	r2,r2,r11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	add	r6,r6,r2
+	ldr	r2,[sp,#24]
+	and	r12,r12,r3
+	add	r10,r10,r6
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	ldr	r2,[sp,#28]
+	and	r3,r3,r12
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	add	r4,r4,r2
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3
+	and	r2,r2,r9
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[sp,#32]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	vst1.32	{q8},[r1,:128]!
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vrev32.8	q2,q2
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vadd.i32	q8,q8,q2
+	ldr	r2,[sp,#36]
+	and	r3,r3,r12
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	add	r10,r10,r2
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3
+	and	r2,r2,r7
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	add	r10,r10,r2
+	ldr	r2,[sp,#40]
+	and	r12,r12,r3
+	add	r6,r6,r10
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	ldr	r2,[sp,#44]
+	and	r3,r3,r12
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	add	r8,r8,r2
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3
+	and	r2,r2,r5
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#48]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vst1.32	{q8},[r1,:128]!
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vrev32.8	q3,q3
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vadd.i32	q8,q8,q3
+	ldr	r2,[sp,#52]
+	and	r3,r3,r12
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	add	r6,r6,r2
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3
+	and	r2,r2,r11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	add	r6,r6,r2
+	ldr	r2,[sp,#56]
+	and	r12,r12,r3
+	add	r10,r10,r6
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	ldr	r2,[sp,#60]
+	and	r3,r3,r12
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	add	r4,r4,r2
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3
+	and	r2,r2,r9
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[sp,#64]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	vst1.32	{q8},[r1,:128]!
+	ldr	r0,[r2,#0]
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	ldr	r12,[r2,#4]
+	ldr	r3,[r2,#8]
+	ldr	r1,[r2,#12]
+	add	r4,r4,r0			@ accumulate
+	ldr	r0,[r2,#16]
+	add	r5,r5,r12
+	ldr	r12,[r2,#20]
+	add	r6,r6,r3
+	ldr	r3,[r2,#24]
+	add	r7,r7,r1
+	ldr	r1,[r2,#28]
+	add	r8,r8,r0
+	str	r4,[r2],#4
+	add	r9,r9,r12
+	str	r5,[r2],#4
+	add	r10,r10,r3
+	str	r6,[r2],#4
+	add	r11,r11,r1
+	str	r7,[r2],#4
+	stmia	r2,{r8,r9,r10,r11}
+
+	ittte	ne
+	movne	r1,sp
+	ldrne	r2,[sp,#0]
+	eorne	r12,r12,r12
+	ldreq	sp,[sp,#76]			@ restore original sp
+	itt	ne
+	eorne	r3,r5,r6
+	bne	.L_00_48
+
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# if defined(__thumb2__)
+#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
+# else
+#  define INST(a,b,c,d)	.byte	a,b,c,d
+# endif
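+
+@ INST(a,b,c,d) emits an Armv8 SHA-256 crypto instruction as raw bytes, so the
+@ file assembles even when the assembler lacks crypto-extension support. In
+@ Thumb-2 mode the two halfwords are stored in the opposite order from Arm
+@ mode, and d|0xc rewrites the leading 0xf3 byte to the 0xff of the Thumb
+@ encoding.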
+
+.LK256_shortcut_hw:
+@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
+#if defined(__thumb2__)
+.word	K256-(.LK256_add_hw+4)
+#else
+.word	K256-(.LK256_add_hw+8)
+#endif
+
+.globl	sha256_block_data_order_hw
+.hidden	sha256_block_data_order_hw
+.type	sha256_block_data_order_hw,%function
+.align	5
+sha256_block_data_order_hw:
+	@ K256 is too far to reference from one ADR instruction in Thumb mode. In
+	@ Arm mode, we could make it fit by aligning the ADR offset to a 64-byte
+	@ boundary. For simplicity, just load the offset from .LK256_shortcut_hw.
+	ldr	r3,.LK256_shortcut_hw
+.LK256_add_hw:
+	add	r3,pc,r3
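+	@ As in the NEON path above, r3 now holds the address of K256.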
+
+	vld1.32	{q0,q1},[r0]
+	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
+	b	.Loop_v8
+
+.align	4
+.Loop_v8:
+	vld1.8	{q8,q9},[r1]!
+	vld1.8	{q10,q11},[r1]!
+	vld1.32	{q12},[r3]!
+	vrev32.8	q8,q8
+	vrev32.8	q9,q9
+	vrev32.8	q10,q10
+	vrev32.8	q11,q11
+	vmov	q14,q0	@ offload
+	vmov	q15,q1
+	teq	r1,r2
+	vld1.32	{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
+	vmov	q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
+	vld1.32	{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
+	vmov	q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
+	vld1.32	{q13},[r3]!
+	vadd.i32	q12,q12,q10
+	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
+	vmov	q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
+	vld1.32	{q12},[r3]!
+	vadd.i32	q13,q13,q11
+	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
+	vmov	q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
+	vld1.32	{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
+	vmov	q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
+	vld1.32	{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
+	vmov	q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
+	vld1.32	{q13},[r3]!
+	vadd.i32	q12,q12,q10
+	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
+	vmov	q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
+	vld1.32	{q12},[r3]!
+	vadd.i32	q13,q13,q11
+	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
+	vmov	q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
+	vld1.32	{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
+	vmov	q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
+	vld1.32	{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
+	vmov	q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
+	vld1.32	{q13},[r3]!
+	vadd.i32	q12,q12,q10
+	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
+	vmov	q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
+	vld1.32	{q12},[r3]!
+	vadd.i32	q13,q13,q11
+	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
+	vmov	q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
+	vld1.32	{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	vmov	q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+
+	vld1.32	{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	vmov	q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+
+	vld1.32	{q13},[r3]
+	vadd.i32	q12,q12,q10
+	sub	r3,r3,#256-16	@ rewind
+	vmov	q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+
+	vadd.i32	q13,q13,q11
+	vmov	q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+
+	vadd.i32	q0,q0,q14
+	vadd.i32	q1,q1,q15
+	it	ne
+	bne	.Loop_v8
+
+	vst1.32	{q0,q1},[r0]
+
+	bx	lr		@ bx lr
+.size	sha256_block_data_order_hw,.-sha256_block_data_order_hw
+#endif
+.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__)
diff --git a/gen/bcm/sha256-armv8-apple.S b/gen/bcm/sha256-armv8-apple.S
new file mode 100644
index 0000000..a78236b
--- /dev/null
+++ b/gen/bcm/sha256-armv8-apple.S
@@ -0,0 +1,1193 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License").  You may not use
+// this file except in compliance with the License.  You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ====================================================================
+//
+// SHA256/512 for ARMv8.
+//
+// Performance in cycles per processed byte and improvement coefficient
+// over code generated with "default" compiler:
+//
+//		SHA256-hw	SHA256(*)	SHA512
+// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
+// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
+// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
+// Denver	2.01		10.5 (+26%)	6.70 (+8%)
+// X-Gene			20.0 (+100%)	12.8 (+300%(***))
+// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
+// Kryo		1.92		17.4 (+30%)	11.2 (+8%)
+//
+// (*)	Software SHA256 results are of lesser relevance, presented
+//	mostly for informational purposes.
+// (**)	The result is a trade-off: it's possible to improve it by
+//	10% (or by 1 cycle per round), but at the cost of 20% loss
+//	on Cortex-A53 (or by 4 cycles per round).
+// (***)	Super-impressive coefficients over gcc-generated code are an
+//	indication of some compiler "pathology"; most notably, code
+//	generated with -mgeneral-regs-only is significantly faster
+//	and the gap is only 40-90%.
+
+#ifndef	__KERNEL__
+# include <openssl/arm_arch.h>
+#endif
+
+.text
+
+.globl	_sha256_block_data_order_nohw
+.private_extern	_sha256_block_data_order_nohw
+
+.align	6
+_sha256_block_data_order_nohw:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#4*4
+
+	ldp	w20,w21,[x0]				// load context
+	ldp	w22,w23,[x0,#2*4]
+	ldp	w24,w25,[x0,#4*4]
+	add	x2,x1,x2,lsl#6	// end of input
+	ldp	w26,w27,[x0,#6*4]
+	adrp	x30,LK256@PAGE
+	add	x30,x30,LK256@PAGEOFF
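+	// x30 now points at LK256, the SHA-256 round-constant table.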
+	stp	x0,x2,[x29,#96]
+
+Loop:
+	ldp	w3,w4,[x1],#2*4
+	ldr	w19,[x30],#4			// *K++
+	eor	w28,w21,w22				// magic seed
+	str	x1,[x29,#112]
+#ifndef	__AARCH64EB__
+	rev	w3,w3			// 0
+#endif
+	ror	w16,w24,#6
+	add	w27,w27,w19			// h+=K[i]
+	eor	w6,w24,w24,ror#14
+	and	w17,w25,w24
+	bic	w19,w26,w24
+	add	w27,w27,w3			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w20,w21			// a^b, b^c in next round
+	eor	w16,w16,w6,ror#11	// Sigma1(e)
+	ror	w6,w20,#2
+	add	w27,w27,w17			// h+=Ch(e,f,g)
+	eor	w17,w20,w20,ror#9
+	add	w27,w27,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w23,w23,w27			// d+=h
+	eor	w28,w28,w21			// Maj(a,b,c)
+	eor	w17,w6,w17,ror#13	// Sigma0(a)
+	add	w27,w27,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w27,w27,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w4,w4			// 1
+#endif
+	ldp	w5,w6,[x1],#2*4
+	add	w27,w27,w17			// h+=Sigma0(a)
+	ror	w16,w23,#6
+	add	w26,w26,w28			// h+=K[i]
+	eor	w7,w23,w23,ror#14
+	and	w17,w24,w23
+	bic	w28,w25,w23
+	add	w26,w26,w4			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w27,w20			// a^b, b^c in next round
+	eor	w16,w16,w7,ror#11	// Sigma1(e)
+	ror	w7,w27,#2
+	add	w26,w26,w17			// h+=Ch(e,f,g)
+	eor	w17,w27,w27,ror#9
+	add	w26,w26,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w22,w22,w26			// d+=h
+	eor	w19,w19,w20			// Maj(a,b,c)
+	eor	w17,w7,w17,ror#13	// Sigma0(a)
+	add	w26,w26,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w26,w26,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w5,w5			// 2
+#endif
+	add	w26,w26,w17			// h+=Sigma0(a)
+	ror	w16,w22,#6
+	add	w25,w25,w19			// h+=K[i]
+	eor	w8,w22,w22,ror#14
+	and	w17,w23,w22
+	bic	w19,w24,w22
+	add	w25,w25,w5			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w26,w27			// a^b, b^c in next round
+	eor	w16,w16,w8,ror#11	// Sigma1(e)
+	ror	w8,w26,#2
+	add	w25,w25,w17			// h+=Ch(e,f,g)
+	eor	w17,w26,w26,ror#9
+	add	w25,w25,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w21,w21,w25			// d+=h
+	eor	w28,w28,w27			// Maj(a,b,c)
+	eor	w17,w8,w17,ror#13	// Sigma0(a)
+	add	w25,w25,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w25,w25,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w6,w6			// 3
+#endif
+	ldp	w7,w8,[x1],#2*4
+	add	w25,w25,w17			// h+=Sigma0(a)
+	ror	w16,w21,#6
+	add	w24,w24,w28			// h+=K[i]
+	eor	w9,w21,w21,ror#14
+	and	w17,w22,w21
+	bic	w28,w23,w21
+	add	w24,w24,w6			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w25,w26			// a^b, b^c in next round
+	eor	w16,w16,w9,ror#11	// Sigma1(e)
+	ror	w9,w25,#2
+	add	w24,w24,w17			// h+=Ch(e,f,g)
+	eor	w17,w25,w25,ror#9
+	add	w24,w24,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w20,w20,w24			// d+=h
+	eor	w19,w19,w26			// Maj(a,b,c)
+	eor	w17,w9,w17,ror#13	// Sigma0(a)
+	add	w24,w24,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w24,w24,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w7,w7			// 4
+#endif
+	add	w24,w24,w17			// h+=Sigma0(a)
+	ror	w16,w20,#6
+	add	w23,w23,w19			// h+=K[i]
+	eor	w10,w20,w20,ror#14
+	and	w17,w21,w20
+	bic	w19,w22,w20
+	add	w23,w23,w7			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w24,w25			// a^b, b^c in next round
+	eor	w16,w16,w10,ror#11	// Sigma1(e)
+	ror	w10,w24,#2
+	add	w23,w23,w17			// h+=Ch(e,f,g)
+	eor	w17,w24,w24,ror#9
+	add	w23,w23,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w27,w27,w23			// d+=h
+	eor	w28,w28,w25			// Maj(a,b,c)
+	eor	w17,w10,w17,ror#13	// Sigma0(a)
+	add	w23,w23,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w23,w23,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w8,w8			// 5
+#endif
+	ldp	w9,w10,[x1],#2*4
+	add	w23,w23,w17			// h+=Sigma0(a)
+	ror	w16,w27,#6
+	add	w22,w22,w28			// h+=K[i]
+	eor	w11,w27,w27,ror#14
+	and	w17,w20,w27
+	bic	w28,w21,w27
+	add	w22,w22,w8			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w23,w24			// a^b, b^c in next round
+	eor	w16,w16,w11,ror#11	// Sigma1(e)
+	ror	w11,w23,#2
+	add	w22,w22,w17			// h+=Ch(e,f,g)
+	eor	w17,w23,w23,ror#9
+	add	w22,w22,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w26,w26,w22			// d+=h
+	eor	w19,w19,w24			// Maj(a,b,c)
+	eor	w17,w11,w17,ror#13	// Sigma0(a)
+	add	w22,w22,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w22,w22,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w9,w9			// 6
+#endif
+	add	w22,w22,w17			// h+=Sigma0(a)
+	ror	w16,w26,#6
+	add	w21,w21,w19			// h+=K[i]
+	eor	w12,w26,w26,ror#14
+	and	w17,w27,w26
+	bic	w19,w20,w26
+	add	w21,w21,w9			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w22,w23			// a^b, b^c in next round
+	eor	w16,w16,w12,ror#11	// Sigma1(e)
+	ror	w12,w22,#2
+	add	w21,w21,w17			// h+=Ch(e,f,g)
+	eor	w17,w22,w22,ror#9
+	add	w21,w21,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w25,w25,w21			// d+=h
+	eor	w28,w28,w23			// Maj(a,b,c)
+	eor	w17,w12,w17,ror#13	// Sigma0(a)
+	add	w21,w21,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w21,w21,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w10,w10			// 7
+#endif
+	ldp	w11,w12,[x1],#2*4
+	add	w21,w21,w17			// h+=Sigma0(a)
+	ror	w16,w25,#6
+	add	w20,w20,w28			// h+=K[i]
+	eor	w13,w25,w25,ror#14
+	and	w17,w26,w25
+	bic	w28,w27,w25
+	add	w20,w20,w10			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w21,w22			// a^b, b^c in next round
+	eor	w16,w16,w13,ror#11	// Sigma1(e)
+	ror	w13,w21,#2
+	add	w20,w20,w17			// h+=Ch(e,f,g)
+	eor	w17,w21,w21,ror#9
+	add	w20,w20,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w24,w24,w20			// d+=h
+	eor	w19,w19,w22			// Maj(a,b,c)
+	eor	w17,w13,w17,ror#13	// Sigma0(a)
+	add	w20,w20,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w20,w20,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w11,w11			// 8
+#endif
+	add	w20,w20,w17			// h+=Sigma0(a)
+	ror	w16,w24,#6
+	add	w27,w27,w19			// h+=K[i]
+	eor	w14,w24,w24,ror#14
+	and	w17,w25,w24
+	bic	w19,w26,w24
+	add	w27,w27,w11			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w20,w21			// a^b, b^c in next round
+	eor	w16,w16,w14,ror#11	// Sigma1(e)
+	ror	w14,w20,#2
+	add	w27,w27,w17			// h+=Ch(e,f,g)
+	eor	w17,w20,w20,ror#9
+	add	w27,w27,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w23,w23,w27			// d+=h
+	eor	w28,w28,w21			// Maj(a,b,c)
+	eor	w17,w14,w17,ror#13	// Sigma0(a)
+	add	w27,w27,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w27,w27,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w12,w12			// 9
+#endif
+	ldp	w13,w14,[x1],#2*4
+	add	w27,w27,w17			// h+=Sigma0(a)
+	ror	w16,w23,#6
+	add	w26,w26,w28			// h+=K[i]
+	eor	w15,w23,w23,ror#14
+	and	w17,w24,w23
+	bic	w28,w25,w23
+	add	w26,w26,w12			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w27,w20			// a^b, b^c in next round
+	eor	w16,w16,w15,ror#11	// Sigma1(e)
+	ror	w15,w27,#2
+	add	w26,w26,w17			// h+=Ch(e,f,g)
+	eor	w17,w27,w27,ror#9
+	add	w26,w26,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w22,w22,w26			// d+=h
+	eor	w19,w19,w20			// Maj(a,b,c)
+	eor	w17,w15,w17,ror#13	// Sigma0(a)
+	add	w26,w26,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w26,w26,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w13,w13			// 10
+#endif
+	add	w26,w26,w17			// h+=Sigma0(a)
+	ror	w16,w22,#6
+	add	w25,w25,w19			// h+=K[i]
+	eor	w0,w22,w22,ror#14
+	and	w17,w23,w22
+	bic	w19,w24,w22
+	add	w25,w25,w13			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w26,w27			// a^b, b^c in next round
+	eor	w16,w16,w0,ror#11	// Sigma1(e)
+	ror	w0,w26,#2
+	add	w25,w25,w17			// h+=Ch(e,f,g)
+	eor	w17,w26,w26,ror#9
+	add	w25,w25,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w21,w21,w25			// d+=h
+	eor	w28,w28,w27			// Maj(a,b,c)
+	eor	w17,w0,w17,ror#13	// Sigma0(a)
+	add	w25,w25,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w25,w25,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w14,w14			// 11
+#endif
+	ldp	w15,w0,[x1],#2*4
+	add	w25,w25,w17			// h+=Sigma0(a)
+	str	w6,[sp,#12]
+	ror	w16,w21,#6
+	add	w24,w24,w28			// h+=K[i]
+	eor	w6,w21,w21,ror#14
+	and	w17,w22,w21
+	bic	w28,w23,w21
+	add	w24,w24,w14			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w25,w26			// a^b, b^c in next round
+	eor	w16,w16,w6,ror#11	// Sigma1(e)
+	ror	w6,w25,#2
+	add	w24,w24,w17			// h+=Ch(e,f,g)
+	eor	w17,w25,w25,ror#9
+	add	w24,w24,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w20,w20,w24			// d+=h
+	eor	w19,w19,w26			// Maj(a,b,c)
+	eor	w17,w6,w17,ror#13	// Sigma0(a)
+	add	w24,w24,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w24,w24,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w15,w15			// 12
+#endif
+	add	w24,w24,w17			// h+=Sigma0(a)
+	str	w7,[sp,#0]
+	ror	w16,w20,#6
+	add	w23,w23,w19			// h+=K[i]
+	eor	w7,w20,w20,ror#14
+	and	w17,w21,w20
+	bic	w19,w22,w20
+	add	w23,w23,w15			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w24,w25			// a^b, b^c in next round
+	eor	w16,w16,w7,ror#11	// Sigma1(e)
+	ror	w7,w24,#2
+	add	w23,w23,w17			// h+=Ch(e,f,g)
+	eor	w17,w24,w24,ror#9
+	add	w23,w23,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w27,w27,w23			// d+=h
+	eor	w28,w28,w25			// Maj(a,b,c)
+	eor	w17,w7,w17,ror#13	// Sigma0(a)
+	add	w23,w23,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w23,w23,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w0,w0			// 13
+#endif
+	ldp	w1,w2,[x1]
+	add	w23,w23,w17			// h+=Sigma0(a)
+	str	w8,[sp,#4]
+	ror	w16,w27,#6
+	add	w22,w22,w28			// h+=K[i]
+	eor	w8,w27,w27,ror#14
+	and	w17,w20,w27
+	bic	w28,w21,w27
+	add	w22,w22,w0			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w23,w24			// a^b, b^c in next round
+	eor	w16,w16,w8,ror#11	// Sigma1(e)
+	ror	w8,w23,#2
+	add	w22,w22,w17			// h+=Ch(e,f,g)
+	eor	w17,w23,w23,ror#9
+	add	w22,w22,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w26,w26,w22			// d+=h
+	eor	w19,w19,w24			// Maj(a,b,c)
+	eor	w17,w8,w17,ror#13	// Sigma0(a)
+	add	w22,w22,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w22,w22,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w1,w1			// 14
+#endif
+	ldr	w6,[sp,#12]
+	add	w22,w22,w17			// h+=Sigma0(a)
+	str	w9,[sp,#8]
+	ror	w16,w26,#6
+	add	w21,w21,w19			// h+=K[i]
+	eor	w9,w26,w26,ror#14
+	and	w17,w27,w26
+	bic	w19,w20,w26
+	add	w21,w21,w1			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w22,w23			// a^b, b^c in next round
+	eor	w16,w16,w9,ror#11	// Sigma1(e)
+	ror	w9,w22,#2
+	add	w21,w21,w17			// h+=Ch(e,f,g)
+	eor	w17,w22,w22,ror#9
+	add	w21,w21,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w25,w25,w21			// d+=h
+	eor	w28,w28,w23			// Maj(a,b,c)
+	eor	w17,w9,w17,ror#13	// Sigma0(a)
+	add	w21,w21,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w21,w21,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w2,w2			// 15
+#endif
+	ldr	w7,[sp,#0]
+	add	w21,w21,w17			// h+=Sigma0(a)
+	str	w10,[sp,#12]
+	ror	w16,w25,#6
+	add	w20,w20,w28			// h+=K[i]
+	ror	w9,w4,#7
+	and	w17,w26,w25
+	ror	w8,w1,#17
+	bic	w28,w27,w25
+	ror	w10,w21,#2
+	add	w20,w20,w2			// h+=X[i]
+	eor	w16,w16,w25,ror#11
+	eor	w9,w9,w4,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w21,w22			// a^b, b^c in next round
+	eor	w16,w16,w25,ror#25	// Sigma1(e)
+	eor	w10,w10,w21,ror#13
+	add	w20,w20,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w8,w8,w1,ror#19
+	eor	w9,w9,w4,lsr#3	// sigma0(X[i+1])
+	add	w20,w20,w16			// h+=Sigma1(e)
+	eor	w19,w19,w22			// Maj(a,b,c)
+	eor	w17,w10,w21,ror#22	// Sigma0(a)
+	eor	w8,w8,w1,lsr#10	// sigma1(X[i+14])
+	add	w3,w3,w12
+	add	w24,w24,w20			// d+=h
+	add	w20,w20,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w3,w3,w9
+	add	w20,w20,w17			// h+=Sigma0(a)
+	add	w3,w3,w8
+Loop_16_xx:
+	ldr	w8,[sp,#4]
+	str	w11,[sp,#0]
+	ror	w16,w24,#6
+	add	w27,w27,w19			// h+=K[i]
+	ror	w10,w5,#7
+	and	w17,w25,w24
+	ror	w9,w2,#17
+	bic	w19,w26,w24
+	ror	w11,w20,#2
+	add	w27,w27,w3			// h+=X[i]
+	eor	w16,w16,w24,ror#11
+	eor	w10,w10,w5,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w20,w21			// a^b, b^c in next round
+	eor	w16,w16,w24,ror#25	// Sigma1(e)
+	eor	w11,w11,w20,ror#13
+	add	w27,w27,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w9,w9,w2,ror#19
+	eor	w10,w10,w5,lsr#3	// sigma0(X[i+1])
+	add	w27,w27,w16			// h+=Sigma1(e)
+	eor	w28,w28,w21			// Maj(a,b,c)
+	eor	w17,w11,w20,ror#22	// Sigma0(a)
+	eor	w9,w9,w2,lsr#10	// sigma1(X[i+14])
+	add	w4,w4,w13
+	add	w23,w23,w27			// d+=h
+	add	w27,w27,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w4,w4,w10
+	add	w27,w27,w17			// h+=Sigma0(a)
+	add	w4,w4,w9
+	ldr	w9,[sp,#8]
+	str	w12,[sp,#4]
+	ror	w16,w23,#6
+	add	w26,w26,w28			// h+=K[i]
+	ror	w11,w6,#7
+	and	w17,w24,w23
+	ror	w10,w3,#17
+	bic	w28,w25,w23
+	ror	w12,w27,#2
+	add	w26,w26,w4			// h+=X[i]
+	eor	w16,w16,w23,ror#11
+	eor	w11,w11,w6,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w27,w20			// a^b, b^c in next round
+	eor	w16,w16,w23,ror#25	// Sigma1(e)
+	eor	w12,w12,w27,ror#13
+	add	w26,w26,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w10,w10,w3,ror#19
+	eor	w11,w11,w6,lsr#3	// sigma0(X[i+1])
+	add	w26,w26,w16			// h+=Sigma1(e)
+	eor	w19,w19,w20			// Maj(a,b,c)
+	eor	w17,w12,w27,ror#22	// Sigma0(a)
+	eor	w10,w10,w3,lsr#10	// sigma1(X[i+14])
+	add	w5,w5,w14
+	add	w22,w22,w26			// d+=h
+	add	w26,w26,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w5,w5,w11
+	add	w26,w26,w17			// h+=Sigma0(a)
+	add	w5,w5,w10
+	ldr	w10,[sp,#12]
+	str	w13,[sp,#8]
+	ror	w16,w22,#6
+	add	w25,w25,w19			// h+=K[i]
+	ror	w12,w7,#7
+	and	w17,w23,w22
+	ror	w11,w4,#17
+	bic	w19,w24,w22
+	ror	w13,w26,#2
+	add	w25,w25,w5			// h+=X[i]
+	eor	w16,w16,w22,ror#11
+	eor	w12,w12,w7,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w26,w27			// a^b, b^c in next round
+	eor	w16,w16,w22,ror#25	// Sigma1(e)
+	eor	w13,w13,w26,ror#13
+	add	w25,w25,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w11,w11,w4,ror#19
+	eor	w12,w12,w7,lsr#3	// sigma0(X[i+1])
+	add	w25,w25,w16			// h+=Sigma1(e)
+	eor	w28,w28,w27			// Maj(a,b,c)
+	eor	w17,w13,w26,ror#22	// Sigma0(a)
+	eor	w11,w11,w4,lsr#10	// sigma1(X[i+14])
+	add	w6,w6,w15
+	add	w21,w21,w25			// d+=h
+	add	w25,w25,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w6,w6,w12
+	add	w25,w25,w17			// h+=Sigma0(a)
+	add	w6,w6,w11
+	ldr	w11,[sp,#0]
+	str	w14,[sp,#12]
+	ror	w16,w21,#6
+	add	w24,w24,w28			// h+=K[i]
+	ror	w13,w8,#7
+	and	w17,w22,w21
+	ror	w12,w5,#17
+	bic	w28,w23,w21
+	ror	w14,w25,#2
+	add	w24,w24,w6			// h+=X[i]
+	eor	w16,w16,w21,ror#11
+	eor	w13,w13,w8,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w25,w26			// a^b, b^c in next round
+	eor	w16,w16,w21,ror#25	// Sigma1(e)
+	eor	w14,w14,w25,ror#13
+	add	w24,w24,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w12,w12,w5,ror#19
+	eor	w13,w13,w8,lsr#3	// sigma0(X[i+1])
+	add	w24,w24,w16			// h+=Sigma1(e)
+	eor	w19,w19,w26			// Maj(a,b,c)
+	eor	w17,w14,w25,ror#22	// Sigma0(a)
+	eor	w12,w12,w5,lsr#10	// sigma1(X[i+14])
+	add	w7,w7,w0
+	add	w20,w20,w24			// d+=h
+	add	w24,w24,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w7,w7,w13
+	add	w24,w24,w17			// h+=Sigma0(a)
+	add	w7,w7,w12
+	ldr	w12,[sp,#4]
+	str	w15,[sp,#0]
+	ror	w16,w20,#6
+	add	w23,w23,w19			// h+=K[i]
+	ror	w14,w9,#7
+	and	w17,w21,w20
+	ror	w13,w6,#17
+	bic	w19,w22,w20
+	ror	w15,w24,#2
+	add	w23,w23,w7			// h+=X[i]
+	eor	w16,w16,w20,ror#11
+	eor	w14,w14,w9,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w24,w25			// a^b, b^c in next round
+	eor	w16,w16,w20,ror#25	// Sigma1(e)
+	eor	w15,w15,w24,ror#13
+	add	w23,w23,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w13,w13,w6,ror#19
+	eor	w14,w14,w9,lsr#3	// sigma0(X[i+1])
+	add	w23,w23,w16			// h+=Sigma1(e)
+	eor	w28,w28,w25			// Maj(a,b,c)
+	eor	w17,w15,w24,ror#22	// Sigma0(a)
+	eor	w13,w13,w6,lsr#10	// sigma1(X[i+14])
+	add	w8,w8,w1
+	add	w27,w27,w23			// d+=h
+	add	w23,w23,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w8,w8,w14
+	add	w23,w23,w17			// h+=Sigma0(a)
+	add	w8,w8,w13
+	ldr	w13,[sp,#8]
+	str	w0,[sp,#4]
+	ror	w16,w27,#6
+	add	w22,w22,w28			// h+=K[i]
+	ror	w15,w10,#7
+	and	w17,w20,w27
+	ror	w14,w7,#17
+	bic	w28,w21,w27
+	ror	w0,w23,#2
+	add	w22,w22,w8			// h+=X[i]
+	eor	w16,w16,w27,ror#11
+	eor	w15,w15,w10,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w23,w24			// a^b, b^c in next round
+	eor	w16,w16,w27,ror#25	// Sigma1(e)
+	eor	w0,w0,w23,ror#13
+	add	w22,w22,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w14,w14,w7,ror#19
+	eor	w15,w15,w10,lsr#3	// sigma0(X[i+1])
+	add	w22,w22,w16			// h+=Sigma1(e)
+	eor	w19,w19,w24			// Maj(a,b,c)
+	eor	w17,w0,w23,ror#22	// Sigma0(a)
+	eor	w14,w14,w7,lsr#10	// sigma1(X[i+14])
+	add	w9,w9,w2
+	add	w26,w26,w22			// d+=h
+	add	w22,w22,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w9,w9,w15
+	add	w22,w22,w17			// h+=Sigma0(a)
+	add	w9,w9,w14
+	ldr	w14,[sp,#12]
+	str	w1,[sp,#8]
+	ror	w16,w26,#6
+	add	w21,w21,w19			// h+=K[i]
+	ror	w0,w11,#7
+	and	w17,w27,w26
+	ror	w15,w8,#17
+	bic	w19,w20,w26
+	ror	w1,w22,#2
+	add	w21,w21,w9			// h+=X[i]
+	eor	w16,w16,w26,ror#11
+	eor	w0,w0,w11,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w22,w23			// a^b, b^c in next round
+	eor	w16,w16,w26,ror#25	// Sigma1(e)
+	eor	w1,w1,w22,ror#13
+	add	w21,w21,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w15,w15,w8,ror#19
+	eor	w0,w0,w11,lsr#3	// sigma0(X[i+1])
+	add	w21,w21,w16			// h+=Sigma1(e)
+	eor	w28,w28,w23			// Maj(a,b,c)
+	eor	w17,w1,w22,ror#22	// Sigma0(a)
+	eor	w15,w15,w8,lsr#10	// sigma1(X[i+14])
+	add	w10,w10,w3
+	add	w25,w25,w21			// d+=h
+	add	w21,w21,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w10,w10,w0
+	add	w21,w21,w17			// h+=Sigma0(a)
+	add	w10,w10,w15
+	ldr	w15,[sp,#0]
+	str	w2,[sp,#12]
+	ror	w16,w25,#6
+	add	w20,w20,w28			// h+=K[i]
+	ror	w1,w12,#7
+	and	w17,w26,w25
+	ror	w0,w9,#17
+	bic	w28,w27,w25
+	ror	w2,w21,#2
+	add	w20,w20,w10			// h+=X[i]
+	eor	w16,w16,w25,ror#11
+	eor	w1,w1,w12,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w21,w22			// a^b, b^c in next round
+	eor	w16,w16,w25,ror#25	// Sigma1(e)
+	eor	w2,w2,w21,ror#13
+	add	w20,w20,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w0,w0,w9,ror#19
+	eor	w1,w1,w12,lsr#3	// sigma0(X[i+1])
+	add	w20,w20,w16			// h+=Sigma1(e)
+	eor	w19,w19,w22			// Maj(a,b,c)
+	eor	w17,w2,w21,ror#22	// Sigma0(a)
+	eor	w0,w0,w9,lsr#10	// sigma1(X[i+14])
+	add	w11,w11,w4
+	add	w24,w24,w20			// d+=h
+	add	w20,w20,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w11,w11,w1
+	add	w20,w20,w17			// h+=Sigma0(a)
+	add	w11,w11,w0
+	ldr	w0,[sp,#4]
+	str	w3,[sp,#0]
+	ror	w16,w24,#6
+	add	w27,w27,w19			// h+=K[i]
+	ror	w2,w13,#7
+	and	w17,w25,w24
+	ror	w1,w10,#17
+	bic	w19,w26,w24
+	ror	w3,w20,#2
+	add	w27,w27,w11			// h+=X[i]
+	eor	w16,w16,w24,ror#11
+	eor	w2,w2,w13,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w20,w21			// a^b, b^c in next round
+	eor	w16,w16,w24,ror#25	// Sigma1(e)
+	eor	w3,w3,w20,ror#13
+	add	w27,w27,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w1,w1,w10,ror#19
+	eor	w2,w2,w13,lsr#3	// sigma0(X[i+1])
+	add	w27,w27,w16			// h+=Sigma1(e)
+	eor	w28,w28,w21			// Maj(a,b,c)
+	eor	w17,w3,w20,ror#22	// Sigma0(a)
+	eor	w1,w1,w10,lsr#10	// sigma1(X[i+14])
+	add	w12,w12,w5
+	add	w23,w23,w27			// d+=h
+	add	w27,w27,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w12,w12,w2
+	add	w27,w27,w17			// h+=Sigma0(a)
+	add	w12,w12,w1
+	ldr	w1,[sp,#8]
+	str	w4,[sp,#4]
+	ror	w16,w23,#6
+	add	w26,w26,w28			// h+=K[i]
+	ror	w3,w14,#7
+	and	w17,w24,w23
+	ror	w2,w11,#17
+	bic	w28,w25,w23
+	ror	w4,w27,#2
+	add	w26,w26,w12			// h+=X[i]
+	eor	w16,w16,w23,ror#11
+	eor	w3,w3,w14,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w27,w20			// a^b, b^c in next round
+	eor	w16,w16,w23,ror#25	// Sigma1(e)
+	eor	w4,w4,w27,ror#13
+	add	w26,w26,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w2,w2,w11,ror#19
+	eor	w3,w3,w14,lsr#3	// sigma0(X[i+1])
+	add	w26,w26,w16			// h+=Sigma1(e)
+	eor	w19,w19,w20			// Maj(a,b,c)
+	eor	w17,w4,w27,ror#22	// Sigma0(a)
+	eor	w2,w2,w11,lsr#10	// sigma1(X[i+14])
+	add	w13,w13,w6
+	add	w22,w22,w26			// d+=h
+	add	w26,w26,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w13,w13,w3
+	add	w26,w26,w17			// h+=Sigma0(a)
+	add	w13,w13,w2
+	ldr	w2,[sp,#12]
+	str	w5,[sp,#8]
+	ror	w16,w22,#6
+	add	w25,w25,w19			// h+=K[i]
+	ror	w4,w15,#7
+	and	w17,w23,w22
+	ror	w3,w12,#17
+	bic	w19,w24,w22
+	ror	w5,w26,#2
+	add	w25,w25,w13			// h+=X[i]
+	eor	w16,w16,w22,ror#11
+	eor	w4,w4,w15,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w26,w27			// a^b, b^c in next round
+	eor	w16,w16,w22,ror#25	// Sigma1(e)
+	eor	w5,w5,w26,ror#13
+	add	w25,w25,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w3,w3,w12,ror#19
+	eor	w4,w4,w15,lsr#3	// sigma0(X[i+1])
+	add	w25,w25,w16			// h+=Sigma1(e)
+	eor	w28,w28,w27			// Maj(a,b,c)
+	eor	w17,w5,w26,ror#22	// Sigma0(a)
+	eor	w3,w3,w12,lsr#10	// sigma1(X[i+14])
+	add	w14,w14,w7
+	add	w21,w21,w25			// d+=h
+	add	w25,w25,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w14,w14,w4
+	add	w25,w25,w17			// h+=Sigma0(a)
+	add	w14,w14,w3
+	ldr	w3,[sp,#0]
+	str	w6,[sp,#12]
+	ror	w16,w21,#6
+	add	w24,w24,w28			// h+=K[i]
+	ror	w5,w0,#7
+	and	w17,w22,w21
+	ror	w4,w13,#17
+	bic	w28,w23,w21
+	ror	w6,w25,#2
+	add	w24,w24,w14			// h+=X[i]
+	eor	w16,w16,w21,ror#11
+	eor	w5,w5,w0,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w25,w26			// a^b, b^c in next round
+	eor	w16,w16,w21,ror#25	// Sigma1(e)
+	eor	w6,w6,w25,ror#13
+	add	w24,w24,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w4,w4,w13,ror#19
+	eor	w5,w5,w0,lsr#3	// sigma0(X[i+1])
+	add	w24,w24,w16			// h+=Sigma1(e)
+	eor	w19,w19,w26			// Maj(a,b,c)
+	eor	w17,w6,w25,ror#22	// Sigma0(a)
+	eor	w4,w4,w13,lsr#10	// sigma1(X[i+14])
+	add	w15,w15,w8
+	add	w20,w20,w24			// d+=h
+	add	w24,w24,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w15,w15,w5
+	add	w24,w24,w17			// h+=Sigma0(a)
+	add	w15,w15,w4
+	ldr	w4,[sp,#4]
+	str	w7,[sp,#0]
+	ror	w16,w20,#6
+	add	w23,w23,w19			// h+=K[i]
+	ror	w6,w1,#7
+	and	w17,w21,w20
+	ror	w5,w14,#17
+	bic	w19,w22,w20
+	ror	w7,w24,#2
+	add	w23,w23,w15			// h+=X[i]
+	eor	w16,w16,w20,ror#11
+	eor	w6,w6,w1,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w24,w25			// a^b, b^c in next round
+	eor	w16,w16,w20,ror#25	// Sigma1(e)
+	eor	w7,w7,w24,ror#13
+	add	w23,w23,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w5,w5,w14,ror#19
+	eor	w6,w6,w1,lsr#3	// sigma0(X[i+1])
+	add	w23,w23,w16			// h+=Sigma1(e)
+	eor	w28,w28,w25			// Maj(a,b,c)
+	eor	w17,w7,w24,ror#22	// Sigma0(a)
+	eor	w5,w5,w14,lsr#10	// sigma1(X[i+14])
+	add	w0,w0,w9
+	add	w27,w27,w23			// d+=h
+	add	w23,w23,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w0,w0,w6
+	add	w23,w23,w17			// h+=Sigma0(a)
+	add	w0,w0,w5
+	ldr	w5,[sp,#8]
+	str	w8,[sp,#4]
+	ror	w16,w27,#6
+	add	w22,w22,w28			// h+=K[i]
+	ror	w7,w2,#7
+	and	w17,w20,w27
+	ror	w6,w15,#17
+	bic	w28,w21,w27
+	ror	w8,w23,#2
+	add	w22,w22,w0			// h+=X[i]
+	eor	w16,w16,w27,ror#11
+	eor	w7,w7,w2,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w23,w24			// a^b, b^c in next round
+	eor	w16,w16,w27,ror#25	// Sigma1(e)
+	eor	w8,w8,w23,ror#13
+	add	w22,w22,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w6,w6,w15,ror#19
+	eor	w7,w7,w2,lsr#3	// sigma0(X[i+1])
+	add	w22,w22,w16			// h+=Sigma1(e)
+	eor	w19,w19,w24			// Maj(a,b,c)
+	eor	w17,w8,w23,ror#22	// Sigma0(a)
+	eor	w6,w6,w15,lsr#10	// sigma1(X[i+14])
+	add	w1,w1,w10
+	add	w26,w26,w22			// d+=h
+	add	w22,w22,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w1,w1,w7
+	add	w22,w22,w17			// h+=Sigma0(a)
+	add	w1,w1,w6
+	ldr	w6,[sp,#12]
+	str	w9,[sp,#8]
+	ror	w16,w26,#6
+	add	w21,w21,w19			// h+=K[i]
+	ror	w8,w3,#7
+	and	w17,w27,w26
+	ror	w7,w0,#17
+	bic	w19,w20,w26
+	ror	w9,w22,#2
+	add	w21,w21,w1			// h+=X[i]
+	eor	w16,w16,w26,ror#11
+	eor	w8,w8,w3,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w22,w23			// a^b, b^c in next round
+	eor	w16,w16,w26,ror#25	// Sigma1(e)
+	eor	w9,w9,w22,ror#13
+	add	w21,w21,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w7,w7,w0,ror#19
+	eor	w8,w8,w3,lsr#3	// sigma0(X[i+1])
+	add	w21,w21,w16			// h+=Sigma1(e)
+	eor	w28,w28,w23			// Maj(a,b,c)
+	eor	w17,w9,w22,ror#22	// Sigma0(a)
+	eor	w7,w7,w0,lsr#10	// sigma1(X[i+14])
+	add	w2,w2,w11
+	add	w25,w25,w21			// d+=h
+	add	w21,w21,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w2,w2,w8
+	add	w21,w21,w17			// h+=Sigma0(a)
+	add	w2,w2,w7
+	ldr	w7,[sp,#0]
+	str	w10,[sp,#12]
+	ror	w16,w25,#6
+	add	w20,w20,w28			// h+=K[i]
+	ror	w9,w4,#7
+	and	w17,w26,w25
+	ror	w8,w1,#17
+	bic	w28,w27,w25
+	ror	w10,w21,#2
+	add	w20,w20,w2			// h+=X[i]
+	eor	w16,w16,w25,ror#11
+	eor	w9,w9,w4,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w21,w22			// a^b, b^c in next round
+	eor	w16,w16,w25,ror#25	// Sigma1(e)
+	eor	w10,w10,w21,ror#13
+	add	w20,w20,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w8,w8,w1,ror#19
+	eor	w9,w9,w4,lsr#3	// sigma0(X[i+1])
+	add	w20,w20,w16			// h+=Sigma1(e)
+	eor	w19,w19,w22			// Maj(a,b,c)
+	eor	w17,w10,w21,ror#22	// Sigma0(a)
+	eor	w8,w8,w1,lsr#10	// sigma1(X[i+14])
+	add	w3,w3,w12
+	add	w24,w24,w20			// d+=h
+	add	w20,w20,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w3,w3,w9
+	add	w20,w20,w17			// h+=Sigma0(a)
+	add	w3,w3,w8
+	cbnz	w19,Loop_16_xx
+
+	ldp	x0,x2,[x29,#96]
+	ldr	x1,[x29,#112]
+	sub	x30,x30,#260		// rewind
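+	// 65 words were consumed: 64 round constants plus the zero terminator.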
+
+	ldp	w3,w4,[x0]
+	ldp	w5,w6,[x0,#2*4]
+	add	x1,x1,#14*4			// advance input pointer
+	ldp	w7,w8,[x0,#4*4]
+	add	w20,w20,w3
+	ldp	w9,w10,[x0,#6*4]
+	add	w21,w21,w4
+	add	w22,w22,w5
+	add	w23,w23,w6
+	stp	w20,w21,[x0]
+	add	w24,w24,w7
+	add	w25,w25,w8
+	stp	w22,w23,[x0,#2*4]
+	add	w26,w26,w9
+	add	w27,w27,w10
+	cmp	x1,x2
+	stp	w24,w25,[x0,#4*4]
+	stp	w26,w27,[x0,#6*4]
+	b.ne	Loop
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#4*4
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#128
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+.section	__TEXT,__const
+.align	6
+
+LK256:
+.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long	0	//terminator
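+// Loop_16_xx exits when it loads this zero as the next K word (see the cbnz
+// at the end of that loop).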
+
+.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+.text
+#ifndef	__KERNEL__
+.globl	_sha256_block_data_order_hw
+.private_extern	_sha256_block_data_order_hw
+
+.align	6
+_sha256_block_data_order_hw:
+	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ld1	{v0.4s,v1.4s},[x0]
+	adrp	x3,LK256@PAGE
+	add	x3,x3,LK256@PAGEOFF
+
+Loop_hw:
+	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+	sub	x2,x2,#1
+	ld1	{v16.4s},[x3],#16
+	rev32	v4.16b,v4.16b
+	rev32	v5.16b,v5.16b
+	rev32	v6.16b,v6.16b
+	rev32	v7.16b,v7.16b
+	orr	v18.16b,v0.16b,v0.16b		// offload
+	orr	v19.16b,v1.16b,v1.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+.long	0x5e2828a4	//sha256su0 v4.16b,v5.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.long	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+.long	0x5e2828c5	//sha256su0 v5.16b,v6.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.long	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v6.4s
+.long	0x5e2828e6	//sha256su0 v6.16b,v7.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.long	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v7.4s
+.long	0x5e282887	//sha256su0 v7.16b,v4.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.long	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+.long	0x5e2828a4	//sha256su0 v4.16b,v5.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.long	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+.long	0x5e2828c5	//sha256su0 v5.16b,v6.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.long	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v6.4s
+.long	0x5e2828e6	//sha256su0 v6.16b,v7.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.long	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v7.4s
+.long	0x5e282887	//sha256su0 v7.16b,v4.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.long	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+.long	0x5e2828a4	//sha256su0 v4.16b,v5.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.long	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+.long	0x5e2828c5	//sha256su0 v5.16b,v6.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.long	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v6.4s
+.long	0x5e2828e6	//sha256su0 v6.16b,v7.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.long	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v7.4s
+.long	0x5e282887	//sha256su0 v7.16b,v4.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.long	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+
+	ld1	{v17.4s},[x3]
+	add	v16.4s,v16.4s,v6.4s
+	sub	x3,x3,#64*4-16	// rewind
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+
+	add	v17.4s,v17.4s,v7.4s
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+
+	add	v0.4s,v0.4s,v18.4s
+	add	v1.4s,v1.4s,v19.4s
+
+	cbnz	x2,Loop_hw
+
+	st1	{v0.4s,v1.4s},[x0]
+
+	ldr	x29,[sp],#16
+	ret
+
+#endif
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/gen/bcm/sha256-armv8-linux.S b/gen/bcm/sha256-armv8-linux.S
new file mode 100644
index 0000000..4420108
--- /dev/null
+++ b/gen/bcm/sha256-armv8-linux.S
@@ -0,0 +1,1193 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License").  You may not use
+// this file except in compliance with the License.  You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ====================================================================
+//
+// SHA256/512 for ARMv8.
+//
+// Performance in cycles per processed byte and improvement coefficient
+// over code generated with "default" compiler:
+//
+//		SHA256-hw	SHA256(*)	SHA512
+// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
+// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
+// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
+// Denver	2.01		10.5 (+26%)	6.70 (+8%)
+// X-Gene			20.0 (+100%)	12.8 (+300%(***))
+// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
+// Kryo		1.92		17.4 (+30%)	11.2 (+8%)
+//
+// (*)	Software SHA256 results are of lesser relevance, presented
+//	mostly for informational purposes.
+// (**)	The result is a trade-off: it's possible to improve it by
+//	10% (or by 1 cycle per round), but at the cost of 20% loss
+//	on Cortex-A53 (or by 4 cycles per round).
+// (***)	Super-impressive coefficients over gcc-generated code are an
+//	indication of some compiler "pathology"; most notably, code
+//	generated with -mgeneral-regs-only is significantly faster
+//	and the gap is only 40-90%.
+
+#ifndef	__KERNEL__
+# include <openssl/arm_arch.h>
+#endif
+
+.text
+
+.globl	sha256_block_data_order_nohw
+.hidden	sha256_block_data_order_nohw
+.type	sha256_block_data_order_nohw,%function
+.align	6
+sha256_block_data_order_nohw:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#4*4
+
+	ldp	w20,w21,[x0]				// load context
+	ldp	w22,w23,[x0,#2*4]
+	ldp	w24,w25,[x0,#4*4]
+	add	x2,x1,x2,lsl#6	// end of input
+	ldp	w26,w27,[x0,#6*4]
+	adrp	x30,.LK256
+	add	x30,x30,:lo12:.LK256
+	stp	x0,x2,[x29,#96]
+
+.Loop:
+	ldp	w3,w4,[x1],#2*4
+	ldr	w19,[x30],#4			// *K++
+	eor	w28,w21,w22				// magic seed
+	str	x1,[x29,#112]
+#ifndef	__AARCH64EB__
+	rev	w3,w3			// 0
+#endif
+	ror	w16,w24,#6
+	add	w27,w27,w19			// h+=K[i]
+	eor	w6,w24,w24,ror#14
+	and	w17,w25,w24
+	bic	w19,w26,w24
+	add	w27,w27,w3			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w20,w21			// a^b, b^c in next round
+	eor	w16,w16,w6,ror#11	// Sigma1(e)
+	ror	w6,w20,#2
+	add	w27,w27,w17			// h+=Ch(e,f,g)
+	eor	w17,w20,w20,ror#9
+	add	w27,w27,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w23,w23,w27			// d+=h
+	eor	w28,w28,w21			// Maj(a,b,c)
+	eor	w17,w6,w17,ror#13	// Sigma0(a)
+	add	w27,w27,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w27,w27,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w4,w4			// 1
+#endif
+	ldp	w5,w6,[x1],#2*4
+	add	w27,w27,w17			// h+=Sigma0(a)
+	ror	w16,w23,#6
+	add	w26,w26,w28			// h+=K[i]
+	eor	w7,w23,w23,ror#14
+	and	w17,w24,w23
+	bic	w28,w25,w23
+	add	w26,w26,w4			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w27,w20			// a^b, b^c in next round
+	eor	w16,w16,w7,ror#11	// Sigma1(e)
+	ror	w7,w27,#2
+	add	w26,w26,w17			// h+=Ch(e,f,g)
+	eor	w17,w27,w27,ror#9
+	add	w26,w26,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w22,w22,w26			// d+=h
+	eor	w19,w19,w20			// Maj(a,b,c)
+	eor	w17,w7,w17,ror#13	// Sigma0(a)
+	add	w26,w26,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w26,w26,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w5,w5			// 2
+#endif
+	add	w26,w26,w17			// h+=Sigma0(a)
+	ror	w16,w22,#6
+	add	w25,w25,w19			// h+=K[i]
+	eor	w8,w22,w22,ror#14
+	and	w17,w23,w22
+	bic	w19,w24,w22
+	add	w25,w25,w5			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w26,w27			// a^b, b^c in next round
+	eor	w16,w16,w8,ror#11	// Sigma1(e)
+	ror	w8,w26,#2
+	add	w25,w25,w17			// h+=Ch(e,f,g)
+	eor	w17,w26,w26,ror#9
+	add	w25,w25,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w21,w21,w25			// d+=h
+	eor	w28,w28,w27			// Maj(a,b,c)
+	eor	w17,w8,w17,ror#13	// Sigma0(a)
+	add	w25,w25,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w25,w25,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w6,w6			// 3
+#endif
+	ldp	w7,w8,[x1],#2*4
+	add	w25,w25,w17			// h+=Sigma0(a)
+	ror	w16,w21,#6
+	add	w24,w24,w28			// h+=K[i]
+	eor	w9,w21,w21,ror#14
+	and	w17,w22,w21
+	bic	w28,w23,w21
+	add	w24,w24,w6			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w25,w26			// a^b, b^c in next round
+	eor	w16,w16,w9,ror#11	// Sigma1(e)
+	ror	w9,w25,#2
+	add	w24,w24,w17			// h+=Ch(e,f,g)
+	eor	w17,w25,w25,ror#9
+	add	w24,w24,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w20,w20,w24			// d+=h
+	eor	w19,w19,w26			// Maj(a,b,c)
+	eor	w17,w9,w17,ror#13	// Sigma0(a)
+	add	w24,w24,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w24,w24,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w7,w7			// 4
+#endif
+	add	w24,w24,w17			// h+=Sigma0(a)
+	ror	w16,w20,#6
+	add	w23,w23,w19			// h+=K[i]
+	eor	w10,w20,w20,ror#14
+	and	w17,w21,w20
+	bic	w19,w22,w20
+	add	w23,w23,w7			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w24,w25			// a^b, b^c in next round
+	eor	w16,w16,w10,ror#11	// Sigma1(e)
+	ror	w10,w24,#2
+	add	w23,w23,w17			// h+=Ch(e,f,g)
+	eor	w17,w24,w24,ror#9
+	add	w23,w23,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w27,w27,w23			// d+=h
+	eor	w28,w28,w25			// Maj(a,b,c)
+	eor	w17,w10,w17,ror#13	// Sigma0(a)
+	add	w23,w23,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w23,w23,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w8,w8			// 5
+#endif
+	ldp	w9,w10,[x1],#2*4
+	add	w23,w23,w17			// h+=Sigma0(a)
+	ror	w16,w27,#6
+	add	w22,w22,w28			// h+=K[i]
+	eor	w11,w27,w27,ror#14
+	and	w17,w20,w27
+	bic	w28,w21,w27
+	add	w22,w22,w8			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w23,w24			// a^b, b^c in next round
+	eor	w16,w16,w11,ror#11	// Sigma1(e)
+	ror	w11,w23,#2
+	add	w22,w22,w17			// h+=Ch(e,f,g)
+	eor	w17,w23,w23,ror#9
+	add	w22,w22,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w26,w26,w22			// d+=h
+	eor	w19,w19,w24			// Maj(a,b,c)
+	eor	w17,w11,w17,ror#13	// Sigma0(a)
+	add	w22,w22,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w22,w22,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w9,w9			// 6
+#endif
+	add	w22,w22,w17			// h+=Sigma0(a)
+	ror	w16,w26,#6
+	add	w21,w21,w19			// h+=K[i]
+	eor	w12,w26,w26,ror#14
+	and	w17,w27,w26
+	bic	w19,w20,w26
+	add	w21,w21,w9			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w22,w23			// a^b, b^c in next round
+	eor	w16,w16,w12,ror#11	// Sigma1(e)
+	ror	w12,w22,#2
+	add	w21,w21,w17			// h+=Ch(e,f,g)
+	eor	w17,w22,w22,ror#9
+	add	w21,w21,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w25,w25,w21			// d+=h
+	eor	w28,w28,w23			// Maj(a,b,c)
+	eor	w17,w12,w17,ror#13	// Sigma0(a)
+	add	w21,w21,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w21,w21,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w10,w10			// 7
+#endif
+	ldp	w11,w12,[x1],#2*4
+	add	w21,w21,w17			// h+=Sigma0(a)
+	ror	w16,w25,#6
+	add	w20,w20,w28			// h+=K[i]
+	eor	w13,w25,w25,ror#14
+	and	w17,w26,w25
+	bic	w28,w27,w25
+	add	w20,w20,w10			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w21,w22			// a^b, b^c in next round
+	eor	w16,w16,w13,ror#11	// Sigma1(e)
+	ror	w13,w21,#2
+	add	w20,w20,w17			// h+=Ch(e,f,g)
+	eor	w17,w21,w21,ror#9
+	add	w20,w20,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w24,w24,w20			// d+=h
+	eor	w19,w19,w22			// Maj(a,b,c)
+	eor	w17,w13,w17,ror#13	// Sigma0(a)
+	add	w20,w20,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w20,w20,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w11,w11			// 8
+#endif
+	add	w20,w20,w17			// h+=Sigma0(a)
+	ror	w16,w24,#6
+	add	w27,w27,w19			// h+=K[i]
+	eor	w14,w24,w24,ror#14
+	and	w17,w25,w24
+	bic	w19,w26,w24
+	add	w27,w27,w11			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w20,w21			// a^b, b^c in next round
+	eor	w16,w16,w14,ror#11	// Sigma1(e)
+	ror	w14,w20,#2
+	add	w27,w27,w17			// h+=Ch(e,f,g)
+	eor	w17,w20,w20,ror#9
+	add	w27,w27,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w23,w23,w27			// d+=h
+	eor	w28,w28,w21			// Maj(a,b,c)
+	eor	w17,w14,w17,ror#13	// Sigma0(a)
+	add	w27,w27,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w27,w27,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w12,w12			// 9
+#endif
+	ldp	w13,w14,[x1],#2*4
+	add	w27,w27,w17			// h+=Sigma0(a)
+	ror	w16,w23,#6
+	add	w26,w26,w28			// h+=K[i]
+	eor	w15,w23,w23,ror#14
+	and	w17,w24,w23
+	bic	w28,w25,w23
+	add	w26,w26,w12			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w27,w20			// a^b, b^c in next round
+	eor	w16,w16,w15,ror#11	// Sigma1(e)
+	ror	w15,w27,#2
+	add	w26,w26,w17			// h+=Ch(e,f,g)
+	eor	w17,w27,w27,ror#9
+	add	w26,w26,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w22,w22,w26			// d+=h
+	eor	w19,w19,w20			// Maj(a,b,c)
+	eor	w17,w15,w17,ror#13	// Sigma0(a)
+	add	w26,w26,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w26,w26,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w13,w13			// 10
+#endif
+	add	w26,w26,w17			// h+=Sigma0(a)
+	ror	w16,w22,#6
+	add	w25,w25,w19			// h+=K[i]
+	eor	w0,w22,w22,ror#14
+	and	w17,w23,w22
+	bic	w19,w24,w22
+	add	w25,w25,w13			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w26,w27			// a^b, b^c in next round
+	eor	w16,w16,w0,ror#11	// Sigma1(e)
+	ror	w0,w26,#2
+	add	w25,w25,w17			// h+=Ch(e,f,g)
+	eor	w17,w26,w26,ror#9
+	add	w25,w25,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w21,w21,w25			// d+=h
+	eor	w28,w28,w27			// Maj(a,b,c)
+	eor	w17,w0,w17,ror#13	// Sigma0(a)
+	add	w25,w25,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w25,w25,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w14,w14			// 11
+#endif
+	ldp	w15,w0,[x1],#2*4
+	add	w25,w25,w17			// h+=Sigma0(a)
+	str	w6,[sp,#12]
+	ror	w16,w21,#6
+	add	w24,w24,w28			// h+=K[i]
+	eor	w6,w21,w21,ror#14
+	and	w17,w22,w21
+	bic	w28,w23,w21
+	add	w24,w24,w14			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w25,w26			// a^b, b^c in next round
+	eor	w16,w16,w6,ror#11	// Sigma1(e)
+	ror	w6,w25,#2
+	add	w24,w24,w17			// h+=Ch(e,f,g)
+	eor	w17,w25,w25,ror#9
+	add	w24,w24,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w20,w20,w24			// d+=h
+	eor	w19,w19,w26			// Maj(a,b,c)
+	eor	w17,w6,w17,ror#13	// Sigma0(a)
+	add	w24,w24,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w24,w24,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w15,w15			// 12
+#endif
+	add	w24,w24,w17			// h+=Sigma0(a)
+	str	w7,[sp,#0]
+	ror	w16,w20,#6
+	add	w23,w23,w19			// h+=K[i]
+	eor	w7,w20,w20,ror#14
+	and	w17,w21,w20
+	bic	w19,w22,w20
+	add	w23,w23,w15			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w24,w25			// a^b, b^c in next round
+	eor	w16,w16,w7,ror#11	// Sigma1(e)
+	ror	w7,w24,#2
+	add	w23,w23,w17			// h+=Ch(e,f,g)
+	eor	w17,w24,w24,ror#9
+	add	w23,w23,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w27,w27,w23			// d+=h
+	eor	w28,w28,w25			// Maj(a,b,c)
+	eor	w17,w7,w17,ror#13	// Sigma0(a)
+	add	w23,w23,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w23,w23,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w0,w0			// 13
+#endif
+	ldp	w1,w2,[x1]
+	add	w23,w23,w17			// h+=Sigma0(a)
+	str	w8,[sp,#4]
+	ror	w16,w27,#6
+	add	w22,w22,w28			// h+=K[i]
+	eor	w8,w27,w27,ror#14
+	and	w17,w20,w27
+	bic	w28,w21,w27
+	add	w22,w22,w0			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w23,w24			// a^b, b^c in next round
+	eor	w16,w16,w8,ror#11	// Sigma1(e)
+	ror	w8,w23,#2
+	add	w22,w22,w17			// h+=Ch(e,f,g)
+	eor	w17,w23,w23,ror#9
+	add	w22,w22,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w26,w26,w22			// d+=h
+	eor	w19,w19,w24			// Maj(a,b,c)
+	eor	w17,w8,w17,ror#13	// Sigma0(a)
+	add	w22,w22,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w22,w22,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w1,w1			// 14
+#endif
+	ldr	w6,[sp,#12]
+	add	w22,w22,w17			// h+=Sigma0(a)
+	str	w9,[sp,#8]
+	ror	w16,w26,#6
+	add	w21,w21,w19			// h+=K[i]
+	eor	w9,w26,w26,ror#14
+	and	w17,w27,w26
+	bic	w19,w20,w26
+	add	w21,w21,w1			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w22,w23			// a^b, b^c in next round
+	eor	w16,w16,w9,ror#11	// Sigma1(e)
+	ror	w9,w22,#2
+	add	w21,w21,w17			// h+=Ch(e,f,g)
+	eor	w17,w22,w22,ror#9
+	add	w21,w21,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w25,w25,w21			// d+=h
+	eor	w28,w28,w23			// Maj(a,b,c)
+	eor	w17,w9,w17,ror#13	// Sigma0(a)
+	add	w21,w21,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w21,w21,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w2,w2			// 15
+#endif
+	ldr	w7,[sp,#0]
+	add	w21,w21,w17			// h+=Sigma0(a)
+	str	w10,[sp,#12]
+	ror	w16,w25,#6
+	add	w20,w20,w28			// h+=K[i]
+	ror	w9,w4,#7
+	and	w17,w26,w25
+	ror	w8,w1,#17
+	bic	w28,w27,w25
+	ror	w10,w21,#2
+	add	w20,w20,w2			// h+=X[i]
+	eor	w16,w16,w25,ror#11
+	eor	w9,w9,w4,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w21,w22			// a^b, b^c in next round
+	eor	w16,w16,w25,ror#25	// Sigma1(e)
+	eor	w10,w10,w21,ror#13
+	add	w20,w20,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w8,w8,w1,ror#19
+	eor	w9,w9,w4,lsr#3	// sigma0(X[i+1])
+	add	w20,w20,w16			// h+=Sigma1(e)
+	eor	w19,w19,w22			// Maj(a,b,c)
+	eor	w17,w10,w21,ror#22	// Sigma0(a)
+	eor	w8,w8,w1,lsr#10	// sigma1(X[i+14])
+	add	w3,w3,w12
+	add	w24,w24,w20			// d+=h
+	add	w20,w20,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w3,w3,w9
+	add	w20,w20,w17			// h+=Sigma0(a)
+	add	w3,w3,w8
+.Loop_16_xx:
+	ldr	w8,[sp,#4]
+	str	w11,[sp,#0]
+	ror	w16,w24,#6
+	add	w27,w27,w19			// h+=K[i]
+	ror	w10,w5,#7
+	and	w17,w25,w24
+	ror	w9,w2,#17
+	bic	w19,w26,w24
+	ror	w11,w20,#2
+	add	w27,w27,w3			// h+=X[i]
+	eor	w16,w16,w24,ror#11
+	eor	w10,w10,w5,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w20,w21			// a^b, b^c in next round
+	eor	w16,w16,w24,ror#25	// Sigma1(e)
+	eor	w11,w11,w20,ror#13
+	add	w27,w27,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w9,w9,w2,ror#19
+	eor	w10,w10,w5,lsr#3	// sigma0(X[i+1])
+	add	w27,w27,w16			// h+=Sigma1(e)
+	eor	w28,w28,w21			// Maj(a,b,c)
+	eor	w17,w11,w20,ror#22	// Sigma0(a)
+	eor	w9,w9,w2,lsr#10	// sigma1(X[i+14])
+	add	w4,w4,w13
+	add	w23,w23,w27			// d+=h
+	add	w27,w27,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w4,w4,w10
+	add	w27,w27,w17			// h+=Sigma0(a)
+	add	w4,w4,w9
+	ldr	w9,[sp,#8]
+	str	w12,[sp,#4]
+	ror	w16,w23,#6
+	add	w26,w26,w28			// h+=K[i]
+	ror	w11,w6,#7
+	and	w17,w24,w23
+	ror	w10,w3,#17
+	bic	w28,w25,w23
+	ror	w12,w27,#2
+	add	w26,w26,w4			// h+=X[i]
+	eor	w16,w16,w23,ror#11
+	eor	w11,w11,w6,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w27,w20			// a^b, b^c in next round
+	eor	w16,w16,w23,ror#25	// Sigma1(e)
+	eor	w12,w12,w27,ror#13
+	add	w26,w26,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w10,w10,w3,ror#19
+	eor	w11,w11,w6,lsr#3	// sigma0(X[i+1])
+	add	w26,w26,w16			// h+=Sigma1(e)
+	eor	w19,w19,w20			// Maj(a,b,c)
+	eor	w17,w12,w27,ror#22	// Sigma0(a)
+	eor	w10,w10,w3,lsr#10	// sigma1(X[i+14])
+	add	w5,w5,w14
+	add	w22,w22,w26			// d+=h
+	add	w26,w26,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w5,w5,w11
+	add	w26,w26,w17			// h+=Sigma0(a)
+	add	w5,w5,w10
+	ldr	w10,[sp,#12]
+	str	w13,[sp,#8]
+	ror	w16,w22,#6
+	add	w25,w25,w19			// h+=K[i]
+	ror	w12,w7,#7
+	and	w17,w23,w22
+	ror	w11,w4,#17
+	bic	w19,w24,w22
+	ror	w13,w26,#2
+	add	w25,w25,w5			// h+=X[i]
+	eor	w16,w16,w22,ror#11
+	eor	w12,w12,w7,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w26,w27			// a^b, b^c in next round
+	eor	w16,w16,w22,ror#25	// Sigma1(e)
+	eor	w13,w13,w26,ror#13
+	add	w25,w25,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w11,w11,w4,ror#19
+	eor	w12,w12,w7,lsr#3	// sigma0(X[i+1])
+	add	w25,w25,w16			// h+=Sigma1(e)
+	eor	w28,w28,w27			// Maj(a,b,c)
+	eor	w17,w13,w26,ror#22	// Sigma0(a)
+	eor	w11,w11,w4,lsr#10	// sigma1(X[i+14])
+	add	w6,w6,w15
+	add	w21,w21,w25			// d+=h
+	add	w25,w25,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w6,w6,w12
+	add	w25,w25,w17			// h+=Sigma0(a)
+	add	w6,w6,w11
+	ldr	w11,[sp,#0]
+	str	w14,[sp,#12]
+	ror	w16,w21,#6
+	add	w24,w24,w28			// h+=K[i]
+	ror	w13,w8,#7
+	and	w17,w22,w21
+	ror	w12,w5,#17
+	bic	w28,w23,w21
+	ror	w14,w25,#2
+	add	w24,w24,w6			// h+=X[i]
+	eor	w16,w16,w21,ror#11
+	eor	w13,w13,w8,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w25,w26			// a^b, b^c in next round
+	eor	w16,w16,w21,ror#25	// Sigma1(e)
+	eor	w14,w14,w25,ror#13
+	add	w24,w24,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w12,w12,w5,ror#19
+	eor	w13,w13,w8,lsr#3	// sigma0(X[i+1])
+	add	w24,w24,w16			// h+=Sigma1(e)
+	eor	w19,w19,w26			// Maj(a,b,c)
+	eor	w17,w14,w25,ror#22	// Sigma0(a)
+	eor	w12,w12,w5,lsr#10	// sigma1(X[i+14])
+	add	w7,w7,w0
+	add	w20,w20,w24			// d+=h
+	add	w24,w24,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w7,w7,w13
+	add	w24,w24,w17			// h+=Sigma0(a)
+	add	w7,w7,w12
+	ldr	w12,[sp,#4]
+	str	w15,[sp,#0]
+	ror	w16,w20,#6
+	add	w23,w23,w19			// h+=K[i]
+	ror	w14,w9,#7
+	and	w17,w21,w20
+	ror	w13,w6,#17
+	bic	w19,w22,w20
+	ror	w15,w24,#2
+	add	w23,w23,w7			// h+=X[i]
+	eor	w16,w16,w20,ror#11
+	eor	w14,w14,w9,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w24,w25			// a^b, b^c in next round
+	eor	w16,w16,w20,ror#25	// Sigma1(e)
+	eor	w15,w15,w24,ror#13
+	add	w23,w23,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w13,w13,w6,ror#19
+	eor	w14,w14,w9,lsr#3	// sigma0(X[i+1])
+	add	w23,w23,w16			// h+=Sigma1(e)
+	eor	w28,w28,w25			// Maj(a,b,c)
+	eor	w17,w15,w24,ror#22	// Sigma0(a)
+	eor	w13,w13,w6,lsr#10	// sigma1(X[i+14])
+	add	w8,w8,w1
+	add	w27,w27,w23			// d+=h
+	add	w23,w23,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w8,w8,w14
+	add	w23,w23,w17			// h+=Sigma0(a)
+	add	w8,w8,w13
+	ldr	w13,[sp,#8]
+	str	w0,[sp,#4]
+	ror	w16,w27,#6
+	add	w22,w22,w28			// h+=K[i]
+	ror	w15,w10,#7
+	and	w17,w20,w27
+	ror	w14,w7,#17
+	bic	w28,w21,w27
+	ror	w0,w23,#2
+	add	w22,w22,w8			// h+=X[i]
+	eor	w16,w16,w27,ror#11
+	eor	w15,w15,w10,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w23,w24			// a^b, b^c in next round
+	eor	w16,w16,w27,ror#25	// Sigma1(e)
+	eor	w0,w0,w23,ror#13
+	add	w22,w22,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w14,w14,w7,ror#19
+	eor	w15,w15,w10,lsr#3	// sigma0(X[i+1])
+	add	w22,w22,w16			// h+=Sigma1(e)
+	eor	w19,w19,w24			// Maj(a,b,c)
+	eor	w17,w0,w23,ror#22	// Sigma0(a)
+	eor	w14,w14,w7,lsr#10	// sigma1(X[i+14])
+	add	w9,w9,w2
+	add	w26,w26,w22			// d+=h
+	add	w22,w22,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w9,w9,w15
+	add	w22,w22,w17			// h+=Sigma0(a)
+	add	w9,w9,w14
+	ldr	w14,[sp,#12]
+	str	w1,[sp,#8]
+	ror	w16,w26,#6
+	add	w21,w21,w19			// h+=K[i]
+	ror	w0,w11,#7
+	and	w17,w27,w26
+	ror	w15,w8,#17
+	bic	w19,w20,w26
+	ror	w1,w22,#2
+	add	w21,w21,w9			// h+=X[i]
+	eor	w16,w16,w26,ror#11
+	eor	w0,w0,w11,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w22,w23			// a^b, b^c in next round
+	eor	w16,w16,w26,ror#25	// Sigma1(e)
+	eor	w1,w1,w22,ror#13
+	add	w21,w21,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w15,w15,w8,ror#19
+	eor	w0,w0,w11,lsr#3	// sigma0(X[i+1])
+	add	w21,w21,w16			// h+=Sigma1(e)
+	eor	w28,w28,w23			// Maj(a,b,c)
+	eor	w17,w1,w22,ror#22	// Sigma0(a)
+	eor	w15,w15,w8,lsr#10	// sigma1(X[i+14])
+	add	w10,w10,w3
+	add	w25,w25,w21			// d+=h
+	add	w21,w21,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w10,w10,w0
+	add	w21,w21,w17			// h+=Sigma0(a)
+	add	w10,w10,w15
+	ldr	w15,[sp,#0]
+	str	w2,[sp,#12]
+	ror	w16,w25,#6
+	add	w20,w20,w28			// h+=K[i]
+	ror	w1,w12,#7
+	and	w17,w26,w25
+	ror	w0,w9,#17
+	bic	w28,w27,w25
+	ror	w2,w21,#2
+	add	w20,w20,w10			// h+=X[i]
+	eor	w16,w16,w25,ror#11
+	eor	w1,w1,w12,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w21,w22			// a^b, b^c in next round
+	eor	w16,w16,w25,ror#25	// Sigma1(e)
+	eor	w2,w2,w21,ror#13
+	add	w20,w20,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w0,w0,w9,ror#19
+	eor	w1,w1,w12,lsr#3	// sigma0(X[i+1])
+	add	w20,w20,w16			// h+=Sigma1(e)
+	eor	w19,w19,w22			// Maj(a,b,c)
+	eor	w17,w2,w21,ror#22	// Sigma0(a)
+	eor	w0,w0,w9,lsr#10	// sigma1(X[i+14])
+	add	w11,w11,w4
+	add	w24,w24,w20			// d+=h
+	add	w20,w20,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w11,w11,w1
+	add	w20,w20,w17			// h+=Sigma0(a)
+	add	w11,w11,w0
+	ldr	w0,[sp,#4]
+	str	w3,[sp,#0]
+	ror	w16,w24,#6
+	add	w27,w27,w19			// h+=K[i]
+	ror	w2,w13,#7
+	and	w17,w25,w24
+	ror	w1,w10,#17
+	bic	w19,w26,w24
+	ror	w3,w20,#2
+	add	w27,w27,w11			// h+=X[i]
+	eor	w16,w16,w24,ror#11
+	eor	w2,w2,w13,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w20,w21			// a^b, b^c in next round
+	eor	w16,w16,w24,ror#25	// Sigma1(e)
+	eor	w3,w3,w20,ror#13
+	add	w27,w27,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w1,w1,w10,ror#19
+	eor	w2,w2,w13,lsr#3	// sigma0(X[i+1])
+	add	w27,w27,w16			// h+=Sigma1(e)
+	eor	w28,w28,w21			// Maj(a,b,c)
+	eor	w17,w3,w20,ror#22	// Sigma0(a)
+	eor	w1,w1,w10,lsr#10	// sigma1(X[i+14])
+	add	w12,w12,w5
+	add	w23,w23,w27			// d+=h
+	add	w27,w27,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w12,w12,w2
+	add	w27,w27,w17			// h+=Sigma0(a)
+	add	w12,w12,w1
+	ldr	w1,[sp,#8]
+	str	w4,[sp,#4]
+	ror	w16,w23,#6
+	add	w26,w26,w28			// h+=K[i]
+	ror	w3,w14,#7
+	and	w17,w24,w23
+	ror	w2,w11,#17
+	bic	w28,w25,w23
+	ror	w4,w27,#2
+	add	w26,w26,w12			// h+=X[i]
+	eor	w16,w16,w23,ror#11
+	eor	w3,w3,w14,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w27,w20			// a^b, b^c in next round
+	eor	w16,w16,w23,ror#25	// Sigma1(e)
+	eor	w4,w4,w27,ror#13
+	add	w26,w26,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w2,w2,w11,ror#19
+	eor	w3,w3,w14,lsr#3	// sigma0(X[i+1])
+	add	w26,w26,w16			// h+=Sigma1(e)
+	eor	w19,w19,w20			// Maj(a,b,c)
+	eor	w17,w4,w27,ror#22	// Sigma0(a)
+	eor	w2,w2,w11,lsr#10	// sigma1(X[i+14])
+	add	w13,w13,w6
+	add	w22,w22,w26			// d+=h
+	add	w26,w26,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w13,w13,w3
+	add	w26,w26,w17			// h+=Sigma0(a)
+	add	w13,w13,w2
+	ldr	w2,[sp,#12]
+	str	w5,[sp,#8]
+	ror	w16,w22,#6
+	add	w25,w25,w19			// h+=K[i]
+	ror	w4,w15,#7
+	and	w17,w23,w22
+	ror	w3,w12,#17
+	bic	w19,w24,w22
+	ror	w5,w26,#2
+	add	w25,w25,w13			// h+=X[i]
+	eor	w16,w16,w22,ror#11
+	eor	w4,w4,w15,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w26,w27			// a^b, b^c in next round
+	eor	w16,w16,w22,ror#25	// Sigma1(e)
+	eor	w5,w5,w26,ror#13
+	add	w25,w25,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w3,w3,w12,ror#19
+	eor	w4,w4,w15,lsr#3	// sigma0(X[i+1])
+	add	w25,w25,w16			// h+=Sigma1(e)
+	eor	w28,w28,w27			// Maj(a,b,c)
+	eor	w17,w5,w26,ror#22	// Sigma0(a)
+	eor	w3,w3,w12,lsr#10	// sigma1(X[i+14])
+	add	w14,w14,w7
+	add	w21,w21,w25			// d+=h
+	add	w25,w25,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w14,w14,w4
+	add	w25,w25,w17			// h+=Sigma0(a)
+	add	w14,w14,w3
+	ldr	w3,[sp,#0]
+	str	w6,[sp,#12]
+	ror	w16,w21,#6
+	add	w24,w24,w28			// h+=K[i]
+	ror	w5,w0,#7
+	and	w17,w22,w21
+	ror	w4,w13,#17
+	bic	w28,w23,w21
+	ror	w6,w25,#2
+	add	w24,w24,w14			// h+=X[i]
+	eor	w16,w16,w21,ror#11
+	eor	w5,w5,w0,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w25,w26			// a^b, b^c in next round
+	eor	w16,w16,w21,ror#25	// Sigma1(e)
+	eor	w6,w6,w25,ror#13
+	add	w24,w24,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w4,w4,w13,ror#19
+	eor	w5,w5,w0,lsr#3	// sigma0(X[i+1])
+	add	w24,w24,w16			// h+=Sigma1(e)
+	eor	w19,w19,w26			// Maj(a,b,c)
+	eor	w17,w6,w25,ror#22	// Sigma0(a)
+	eor	w4,w4,w13,lsr#10	// sigma1(X[i+14])
+	add	w15,w15,w8
+	add	w20,w20,w24			// d+=h
+	add	w24,w24,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w15,w15,w5
+	add	w24,w24,w17			// h+=Sigma0(a)
+	add	w15,w15,w4
+	ldr	w4,[sp,#4]
+	str	w7,[sp,#0]
+	ror	w16,w20,#6
+	add	w23,w23,w19			// h+=K[i]
+	ror	w6,w1,#7
+	and	w17,w21,w20
+	ror	w5,w14,#17
+	bic	w19,w22,w20
+	ror	w7,w24,#2
+	add	w23,w23,w15			// h+=X[i]
+	eor	w16,w16,w20,ror#11
+	eor	w6,w6,w1,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w24,w25			// a^b, b^c in next round
+	eor	w16,w16,w20,ror#25	// Sigma1(e)
+	eor	w7,w7,w24,ror#13
+	add	w23,w23,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w5,w5,w14,ror#19
+	eor	w6,w6,w1,lsr#3	// sigma0(X[i+1])
+	add	w23,w23,w16			// h+=Sigma1(e)
+	eor	w28,w28,w25			// Maj(a,b,c)
+	eor	w17,w7,w24,ror#22	// Sigma0(a)
+	eor	w5,w5,w14,lsr#10	// sigma1(X[i+14])
+	add	w0,w0,w9
+	add	w27,w27,w23			// d+=h
+	add	w23,w23,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w0,w0,w6
+	add	w23,w23,w17			// h+=Sigma0(a)
+	add	w0,w0,w5
+	ldr	w5,[sp,#8]
+	str	w8,[sp,#4]
+	ror	w16,w27,#6
+	add	w22,w22,w28			// h+=K[i]
+	ror	w7,w2,#7
+	and	w17,w20,w27
+	ror	w6,w15,#17
+	bic	w28,w21,w27
+	ror	w8,w23,#2
+	add	w22,w22,w0			// h+=X[i]
+	eor	w16,w16,w27,ror#11
+	eor	w7,w7,w2,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w23,w24			// a^b, b^c in next round
+	eor	w16,w16,w27,ror#25	// Sigma1(e)
+	eor	w8,w8,w23,ror#13
+	add	w22,w22,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w6,w6,w15,ror#19
+	eor	w7,w7,w2,lsr#3	// sigma0(X[i+1])
+	add	w22,w22,w16			// h+=Sigma1(e)
+	eor	w19,w19,w24			// Maj(a,b,c)
+	eor	w17,w8,w23,ror#22	// Sigma0(a)
+	eor	w6,w6,w15,lsr#10	// sigma1(X[i+14])
+	add	w1,w1,w10
+	add	w26,w26,w22			// d+=h
+	add	w22,w22,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w1,w1,w7
+	add	w22,w22,w17			// h+=Sigma0(a)
+	add	w1,w1,w6
+	ldr	w6,[sp,#12]
+	str	w9,[sp,#8]
+	ror	w16,w26,#6
+	add	w21,w21,w19			// h+=K[i]
+	ror	w8,w3,#7
+	and	w17,w27,w26
+	ror	w7,w0,#17
+	bic	w19,w20,w26
+	ror	w9,w22,#2
+	add	w21,w21,w1			// h+=X[i]
+	eor	w16,w16,w26,ror#11
+	eor	w8,w8,w3,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w22,w23			// a^b, b^c in next round
+	eor	w16,w16,w26,ror#25	// Sigma1(e)
+	eor	w9,w9,w22,ror#13
+	add	w21,w21,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w7,w7,w0,ror#19
+	eor	w8,w8,w3,lsr#3	// sigma0(X[i+1])
+	add	w21,w21,w16			// h+=Sigma1(e)
+	eor	w28,w28,w23			// Maj(a,b,c)
+	eor	w17,w9,w22,ror#22	// Sigma0(a)
+	eor	w7,w7,w0,lsr#10	// sigma1(X[i+14])
+	add	w2,w2,w11
+	add	w25,w25,w21			// d+=h
+	add	w21,w21,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w2,w2,w8
+	add	w21,w21,w17			// h+=Sigma0(a)
+	add	w2,w2,w7
+	ldr	w7,[sp,#0]
+	str	w10,[sp,#12]
+	ror	w16,w25,#6
+	add	w20,w20,w28			// h+=K[i]
+	ror	w9,w4,#7
+	and	w17,w26,w25
+	ror	w8,w1,#17
+	bic	w28,w27,w25
+	ror	w10,w21,#2
+	add	w20,w20,w2			// h+=X[i]
+	eor	w16,w16,w25,ror#11
+	eor	w9,w9,w4,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w21,w22			// a^b, b^c in next round
+	eor	w16,w16,w25,ror#25	// Sigma1(e)
+	eor	w10,w10,w21,ror#13
+	add	w20,w20,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w8,w8,w1,ror#19
+	eor	w9,w9,w4,lsr#3	// sigma0(X[i+1])
+	add	w20,w20,w16			// h+=Sigma1(e)
+	eor	w19,w19,w22			// Maj(a,b,c)
+	eor	w17,w10,w21,ror#22	// Sigma0(a)
+	eor	w8,w8,w1,lsr#10	// sigma1(X[i+14])
+	add	w3,w3,w12
+	add	w24,w24,w20			// d+=h
+	add	w20,w20,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w3,w3,w9
+	add	w20,w20,w17			// h+=Sigma0(a)
+	add	w3,w3,w8
+	cbnz	w19,.Loop_16_xx
+
+	ldp	x0,x2,[x29,#96]
+	ldr	x1,[x29,#112]
+	sub	x30,x30,#260		// rewind
+
+	ldp	w3,w4,[x0]
+	ldp	w5,w6,[x0,#2*4]
+	add	x1,x1,#14*4			// advance input pointer
+	ldp	w7,w8,[x0,#4*4]
+	add	w20,w20,w3
+	ldp	w9,w10,[x0,#6*4]
+	add	w21,w21,w4
+	add	w22,w22,w5
+	add	w23,w23,w6
+	stp	w20,w21,[x0]
+	add	w24,w24,w7
+	add	w25,w25,w8
+	stp	w22,w23,[x0,#2*4]
+	add	w26,w26,w9
+	add	w27,w27,w10
+	cmp	x1,x2
+	stp	w24,w25,[x0,#4*4]
+	stp	w26,w27,[x0,#6*4]
+	b.ne	.Loop
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#4*4
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#128
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	sha256_block_data_order_nohw,.-sha256_block_data_order_nohw
+
+.section	.rodata
+.align	6
+.type	.LK256,%object
+.LK256:
+.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long	0	//terminator
+.size	.LK256,.-.LK256
+.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+.text
+#ifndef	__KERNEL__
+.globl	sha256_block_data_order_hw
+.hidden	sha256_block_data_order_hw
+.type	sha256_block_data_order_hw,%function
+.align	6
+sha256_block_data_order_hw:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ld1	{v0.4s,v1.4s},[x0]
+	adrp	x3,.LK256
+	add	x3,x3,:lo12:.LK256
+
+.Loop_hw:
+	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+	sub	x2,x2,#1
+	ld1	{v16.4s},[x3],#16
+	rev32	v4.16b,v4.16b
+	rev32	v5.16b,v5.16b
+	rev32	v6.16b,v6.16b
+	rev32	v7.16b,v7.16b
+	orr	v18.16b,v0.16b,v0.16b		// offload
+	orr	v19.16b,v1.16b,v1.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+.inst	0x5e2828a4	//sha256su0 v4.16b,v5.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.inst	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+.inst	0x5e2828c5	//sha256su0 v5.16b,v6.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.inst	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v6.4s
+.inst	0x5e2828e6	//sha256su0 v6.16b,v7.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.inst	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v7.4s
+.inst	0x5e282887	//sha256su0 v7.16b,v4.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.inst	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+.inst	0x5e2828a4	//sha256su0 v4.16b,v5.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.inst	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+.inst	0x5e2828c5	//sha256su0 v5.16b,v6.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.inst	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v6.4s
+.inst	0x5e2828e6	//sha256su0 v6.16b,v7.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.inst	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v7.4s
+.inst	0x5e282887	//sha256su0 v7.16b,v4.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.inst	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+.inst	0x5e2828a4	//sha256su0 v4.16b,v5.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.inst	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+.inst	0x5e2828c5	//sha256su0 v5.16b,v6.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.inst	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v6.4s
+.inst	0x5e2828e6	//sha256su0 v6.16b,v7.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.inst	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v7.4s
+.inst	0x5e282887	//sha256su0 v7.16b,v4.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.inst	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+
+	ld1	{v17.4s},[x3]
+	add	v16.4s,v16.4s,v6.4s
+	sub	x3,x3,#64*4-16	// rewind
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+
+	add	v17.4s,v17.4s,v7.4s
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+
+	add	v0.4s,v0.4s,v18.4s
+	add	v1.4s,v1.4s,v19.4s
+
+	cbnz	x2,.Loop_hw
+
+	st1	{v0.4s,v1.4s},[x0]
+
+	ldr	x29,[sp],#16
+	ret
+.size	sha256_block_data_order_hw,.-sha256_block_data_order_hw
+#endif
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/bcm/sha256-armv8-win.S b/gen/bcm/sha256-armv8-win.S
new file mode 100644
index 0000000..89d3944
--- /dev/null
+++ b/gen/bcm/sha256-armv8-win.S
@@ -0,0 +1,1197 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License").  You may not use
+// this file except in compliance with the License.  You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ====================================================================
+//
+// SHA256/512 for ARMv8.
+//
+// Performance in cycles per processed byte and improvement coefficient
+// over code generated with "default" compiler:
+//
+//		SHA256-hw	SHA256(*)	SHA512
+// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
+// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
+// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
+// Denver	2.01		10.5 (+26%)	6.70 (+8%)
+// X-Gene			20.0 (+100%)	12.8 (+300%(***))
+// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
+// Kryo		1.92		17.4 (+30%)	11.2 (+8%)
+//
+// (*)	Software SHA256 results are of lesser relevance, presented
+//	mostly for informational purposes.
+// (**)	The result is a trade-off: it's possible to improve it by
+//	10% (or by 1 cycle per round), but at the cost of 20% loss
+//	on Cortex-A53 (or by 4 cycles per round).
+// (***)	Super-impressive coefficients over gcc-generated code are
+//	indication of some compiler "pathology", most notably code
+//	generated with -mgeneral-regs-only is significantly faster
+//	and the gap is only 40-90%.
+
+#ifndef	__KERNEL__
+# include <openssl/arm_arch.h>
+#endif
+
+.text
+
+.globl	sha256_block_data_order_nohw
+
+.def sha256_block_data_order_nohw
+   .type 32
+.endef
+.align	6
+sha256_block_data_order_nohw:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#4*4
+
+	ldp	w20,w21,[x0]				// load context
+	ldp	w22,w23,[x0,#2*4]
+	ldp	w24,w25,[x0,#4*4]
+	add	x2,x1,x2,lsl#6	// end of input
+	ldp	w26,w27,[x0,#6*4]
+	adrp	x30,LK256
+	add	x30,x30,:lo12:LK256
+	stp	x0,x2,[x29,#96]
+
+Loop:
+	ldp	w3,w4,[x1],#2*4
+	ldr	w19,[x30],#4			// *K++
+	eor	w28,w21,w22				// magic seed
+	str	x1,[x29,#112]
+#ifndef	__AARCH64EB__
+	rev	w3,w3			// 0
+#endif
+	ror	w16,w24,#6
+	add	w27,w27,w19			// h+=K[i]
+	eor	w6,w24,w24,ror#14
+	and	w17,w25,w24
+	bic	w19,w26,w24
+	add	w27,w27,w3			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w20,w21			// a^b, b^c in next round
+	eor	w16,w16,w6,ror#11	// Sigma1(e)
+	ror	w6,w20,#2
+	add	w27,w27,w17			// h+=Ch(e,f,g)
+	eor	w17,w20,w20,ror#9
+	add	w27,w27,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w23,w23,w27			// d+=h
+	eor	w28,w28,w21			// Maj(a,b,c)
+	eor	w17,w6,w17,ror#13	// Sigma0(a)
+	add	w27,w27,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w27,w27,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w4,w4			// 1
+#endif
+	ldp	w5,w6,[x1],#2*4
+	add	w27,w27,w17			// h+=Sigma0(a)
+	ror	w16,w23,#6
+	add	w26,w26,w28			// h+=K[i]
+	eor	w7,w23,w23,ror#14
+	and	w17,w24,w23
+	bic	w28,w25,w23
+	add	w26,w26,w4			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w27,w20			// a^b, b^c in next round
+	eor	w16,w16,w7,ror#11	// Sigma1(e)
+	ror	w7,w27,#2
+	add	w26,w26,w17			// h+=Ch(e,f,g)
+	eor	w17,w27,w27,ror#9
+	add	w26,w26,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w22,w22,w26			// d+=h
+	eor	w19,w19,w20			// Maj(a,b,c)
+	eor	w17,w7,w17,ror#13	// Sigma0(a)
+	add	w26,w26,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w26,w26,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w5,w5			// 2
+#endif
+	add	w26,w26,w17			// h+=Sigma0(a)
+	ror	w16,w22,#6
+	add	w25,w25,w19			// h+=K[i]
+	eor	w8,w22,w22,ror#14
+	and	w17,w23,w22
+	bic	w19,w24,w22
+	add	w25,w25,w5			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w26,w27			// a^b, b^c in next round
+	eor	w16,w16,w8,ror#11	// Sigma1(e)
+	ror	w8,w26,#2
+	add	w25,w25,w17			// h+=Ch(e,f,g)
+	eor	w17,w26,w26,ror#9
+	add	w25,w25,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w21,w21,w25			// d+=h
+	eor	w28,w28,w27			// Maj(a,b,c)
+	eor	w17,w8,w17,ror#13	// Sigma0(a)
+	add	w25,w25,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w25,w25,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w6,w6			// 3
+#endif
+	ldp	w7,w8,[x1],#2*4
+	add	w25,w25,w17			// h+=Sigma0(a)
+	ror	w16,w21,#6
+	add	w24,w24,w28			// h+=K[i]
+	eor	w9,w21,w21,ror#14
+	and	w17,w22,w21
+	bic	w28,w23,w21
+	add	w24,w24,w6			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w25,w26			// a^b, b^c in next round
+	eor	w16,w16,w9,ror#11	// Sigma1(e)
+	ror	w9,w25,#2
+	add	w24,w24,w17			// h+=Ch(e,f,g)
+	eor	w17,w25,w25,ror#9
+	add	w24,w24,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w20,w20,w24			// d+=h
+	eor	w19,w19,w26			// Maj(a,b,c)
+	eor	w17,w9,w17,ror#13	// Sigma0(a)
+	add	w24,w24,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w24,w24,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w7,w7			// 4
+#endif
+	add	w24,w24,w17			// h+=Sigma0(a)
+	ror	w16,w20,#6
+	add	w23,w23,w19			// h+=K[i]
+	eor	w10,w20,w20,ror#14
+	and	w17,w21,w20
+	bic	w19,w22,w20
+	add	w23,w23,w7			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w24,w25			// a^b, b^c in next round
+	eor	w16,w16,w10,ror#11	// Sigma1(e)
+	ror	w10,w24,#2
+	add	w23,w23,w17			// h+=Ch(e,f,g)
+	eor	w17,w24,w24,ror#9
+	add	w23,w23,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w27,w27,w23			// d+=h
+	eor	w28,w28,w25			// Maj(a,b,c)
+	eor	w17,w10,w17,ror#13	// Sigma0(a)
+	add	w23,w23,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w23,w23,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w8,w8			// 5
+#endif
+	ldp	w9,w10,[x1],#2*4
+	add	w23,w23,w17			// h+=Sigma0(a)
+	ror	w16,w27,#6
+	add	w22,w22,w28			// h+=K[i]
+	eor	w11,w27,w27,ror#14
+	and	w17,w20,w27
+	bic	w28,w21,w27
+	add	w22,w22,w8			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w23,w24			// a^b, b^c in next round
+	eor	w16,w16,w11,ror#11	// Sigma1(e)
+	ror	w11,w23,#2
+	add	w22,w22,w17			// h+=Ch(e,f,g)
+	eor	w17,w23,w23,ror#9
+	add	w22,w22,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w26,w26,w22			// d+=h
+	eor	w19,w19,w24			// Maj(a,b,c)
+	eor	w17,w11,w17,ror#13	// Sigma0(a)
+	add	w22,w22,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w22,w22,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w9,w9			// 6
+#endif
+	add	w22,w22,w17			// h+=Sigma0(a)
+	ror	w16,w26,#6
+	add	w21,w21,w19			// h+=K[i]
+	eor	w12,w26,w26,ror#14
+	and	w17,w27,w26
+	bic	w19,w20,w26
+	add	w21,w21,w9			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w22,w23			// a^b, b^c in next round
+	eor	w16,w16,w12,ror#11	// Sigma1(e)
+	ror	w12,w22,#2
+	add	w21,w21,w17			// h+=Ch(e,f,g)
+	eor	w17,w22,w22,ror#9
+	add	w21,w21,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w25,w25,w21			// d+=h
+	eor	w28,w28,w23			// Maj(a,b,c)
+	eor	w17,w12,w17,ror#13	// Sigma0(a)
+	add	w21,w21,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w21,w21,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w10,w10			// 7
+#endif
+	ldp	w11,w12,[x1],#2*4
+	add	w21,w21,w17			// h+=Sigma0(a)
+	ror	w16,w25,#6
+	add	w20,w20,w28			// h+=K[i]
+	eor	w13,w25,w25,ror#14
+	and	w17,w26,w25
+	bic	w28,w27,w25
+	add	w20,w20,w10			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w21,w22			// a^b, b^c in next round
+	eor	w16,w16,w13,ror#11	// Sigma1(e)
+	ror	w13,w21,#2
+	add	w20,w20,w17			// h+=Ch(e,f,g)
+	eor	w17,w21,w21,ror#9
+	add	w20,w20,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w24,w24,w20			// d+=h
+	eor	w19,w19,w22			// Maj(a,b,c)
+	eor	w17,w13,w17,ror#13	// Sigma0(a)
+	add	w20,w20,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w20,w20,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w11,w11			// 8
+#endif
+	add	w20,w20,w17			// h+=Sigma0(a)
+	ror	w16,w24,#6
+	add	w27,w27,w19			// h+=K[i]
+	eor	w14,w24,w24,ror#14
+	and	w17,w25,w24
+	bic	w19,w26,w24
+	add	w27,w27,w11			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w20,w21			// a^b, b^c in next round
+	eor	w16,w16,w14,ror#11	// Sigma1(e)
+	ror	w14,w20,#2
+	add	w27,w27,w17			// h+=Ch(e,f,g)
+	eor	w17,w20,w20,ror#9
+	add	w27,w27,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w23,w23,w27			// d+=h
+	eor	w28,w28,w21			// Maj(a,b,c)
+	eor	w17,w14,w17,ror#13	// Sigma0(a)
+	add	w27,w27,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w27,w27,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w12,w12			// 9
+#endif
+	ldp	w13,w14,[x1],#2*4
+	add	w27,w27,w17			// h+=Sigma0(a)
+	ror	w16,w23,#6
+	add	w26,w26,w28			// h+=K[i]
+	eor	w15,w23,w23,ror#14
+	and	w17,w24,w23
+	bic	w28,w25,w23
+	add	w26,w26,w12			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w27,w20			// a^b, b^c in next round
+	eor	w16,w16,w15,ror#11	// Sigma1(e)
+	ror	w15,w27,#2
+	add	w26,w26,w17			// h+=Ch(e,f,g)
+	eor	w17,w27,w27,ror#9
+	add	w26,w26,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w22,w22,w26			// d+=h
+	eor	w19,w19,w20			// Maj(a,b,c)
+	eor	w17,w15,w17,ror#13	// Sigma0(a)
+	add	w26,w26,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w26,w26,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w13,w13			// 10
+#endif
+	add	w26,w26,w17			// h+=Sigma0(a)
+	ror	w16,w22,#6
+	add	w25,w25,w19			// h+=K[i]
+	eor	w0,w22,w22,ror#14
+	and	w17,w23,w22
+	bic	w19,w24,w22
+	add	w25,w25,w13			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w26,w27			// a^b, b^c in next round
+	eor	w16,w16,w0,ror#11	// Sigma1(e)
+	ror	w0,w26,#2
+	add	w25,w25,w17			// h+=Ch(e,f,g)
+	eor	w17,w26,w26,ror#9
+	add	w25,w25,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w21,w21,w25			// d+=h
+	eor	w28,w28,w27			// Maj(a,b,c)
+	eor	w17,w0,w17,ror#13	// Sigma0(a)
+	add	w25,w25,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w25,w25,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w14,w14			// 11
+#endif
+	ldp	w15,w0,[x1],#2*4
+	add	w25,w25,w17			// h+=Sigma0(a)
+	str	w6,[sp,#12]
+	ror	w16,w21,#6
+	add	w24,w24,w28			// h+=K[i]
+	eor	w6,w21,w21,ror#14
+	and	w17,w22,w21
+	bic	w28,w23,w21
+	add	w24,w24,w14			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w25,w26			// a^b, b^c in next round
+	eor	w16,w16,w6,ror#11	// Sigma1(e)
+	ror	w6,w25,#2
+	add	w24,w24,w17			// h+=Ch(e,f,g)
+	eor	w17,w25,w25,ror#9
+	add	w24,w24,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w20,w20,w24			// d+=h
+	eor	w19,w19,w26			// Maj(a,b,c)
+	eor	w17,w6,w17,ror#13	// Sigma0(a)
+	add	w24,w24,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w24,w24,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w15,w15			// 12
+#endif
+	add	w24,w24,w17			// h+=Sigma0(a)
+	str	w7,[sp,#0]
+	ror	w16,w20,#6
+	add	w23,w23,w19			// h+=K[i]
+	eor	w7,w20,w20,ror#14
+	and	w17,w21,w20
+	bic	w19,w22,w20
+	add	w23,w23,w15			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w24,w25			// a^b, b^c in next round
+	eor	w16,w16,w7,ror#11	// Sigma1(e)
+	ror	w7,w24,#2
+	add	w23,w23,w17			// h+=Ch(e,f,g)
+	eor	w17,w24,w24,ror#9
+	add	w23,w23,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w27,w27,w23			// d+=h
+	eor	w28,w28,w25			// Maj(a,b,c)
+	eor	w17,w7,w17,ror#13	// Sigma0(a)
+	add	w23,w23,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w23,w23,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w0,w0			// 13
+#endif
+	ldp	w1,w2,[x1]
+	add	w23,w23,w17			// h+=Sigma0(a)
+	str	w8,[sp,#4]
+	ror	w16,w27,#6
+	add	w22,w22,w28			// h+=K[i]
+	eor	w8,w27,w27,ror#14
+	and	w17,w20,w27
+	bic	w28,w21,w27
+	add	w22,w22,w0			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w23,w24			// a^b, b^c in next round
+	eor	w16,w16,w8,ror#11	// Sigma1(e)
+	ror	w8,w23,#2
+	add	w22,w22,w17			// h+=Ch(e,f,g)
+	eor	w17,w23,w23,ror#9
+	add	w22,w22,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w26,w26,w22			// d+=h
+	eor	w19,w19,w24			// Maj(a,b,c)
+	eor	w17,w8,w17,ror#13	// Sigma0(a)
+	add	w22,w22,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w22,w22,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w1,w1			// 14
+#endif
+	ldr	w6,[sp,#12]
+	add	w22,w22,w17			// h+=Sigma0(a)
+	str	w9,[sp,#8]
+	ror	w16,w26,#6
+	add	w21,w21,w19			// h+=K[i]
+	eor	w9,w26,w26,ror#14
+	and	w17,w27,w26
+	bic	w19,w20,w26
+	add	w21,w21,w1			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w22,w23			// a^b, b^c in next round
+	eor	w16,w16,w9,ror#11	// Sigma1(e)
+	ror	w9,w22,#2
+	add	w21,w21,w17			// h+=Ch(e,f,g)
+	eor	w17,w22,w22,ror#9
+	add	w21,w21,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w25,w25,w21			// d+=h
+	eor	w28,w28,w23			// Maj(a,b,c)
+	eor	w17,w9,w17,ror#13	// Sigma0(a)
+	add	w21,w21,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w21,w21,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w2,w2			// 15
+#endif
+	ldr	w7,[sp,#0]
+	add	w21,w21,w17			// h+=Sigma0(a)
+	str	w10,[sp,#12]
+	ror	w16,w25,#6
+	add	w20,w20,w28			// h+=K[i]
+	ror	w9,w4,#7
+	and	w17,w26,w25
+	ror	w8,w1,#17
+	bic	w28,w27,w25
+	ror	w10,w21,#2
+	add	w20,w20,w2			// h+=X[i]
+	eor	w16,w16,w25,ror#11
+	eor	w9,w9,w4,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w21,w22			// a^b, b^c in next round
+	eor	w16,w16,w25,ror#25	// Sigma1(e)
+	eor	w10,w10,w21,ror#13
+	add	w20,w20,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w8,w8,w1,ror#19
+	eor	w9,w9,w4,lsr#3	// sigma0(X[i+1])
+	add	w20,w20,w16			// h+=Sigma1(e)
+	eor	w19,w19,w22			// Maj(a,b,c)
+	eor	w17,w10,w21,ror#22	// Sigma0(a)
+	eor	w8,w8,w1,lsr#10	// sigma1(X[i+14])
+	add	w3,w3,w12
+	add	w24,w24,w20			// d+=h
+	add	w20,w20,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w3,w3,w9
+	add	w20,w20,w17			// h+=Sigma0(a)
+	add	w3,w3,w8
+Loop_16_xx:
+	ldr	w8,[sp,#4]
+	str	w11,[sp,#0]
+	ror	w16,w24,#6
+	add	w27,w27,w19			// h+=K[i]
+	ror	w10,w5,#7
+	and	w17,w25,w24
+	ror	w9,w2,#17
+	bic	w19,w26,w24
+	ror	w11,w20,#2
+	add	w27,w27,w3			// h+=X[i]
+	eor	w16,w16,w24,ror#11
+	eor	w10,w10,w5,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w20,w21			// a^b, b^c in next round
+	eor	w16,w16,w24,ror#25	// Sigma1(e)
+	eor	w11,w11,w20,ror#13
+	add	w27,w27,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w9,w9,w2,ror#19
+	eor	w10,w10,w5,lsr#3	// sigma0(X[i+1])
+	add	w27,w27,w16			// h+=Sigma1(e)
+	eor	w28,w28,w21			// Maj(a,b,c)
+	eor	w17,w11,w20,ror#22	// Sigma0(a)
+	eor	w9,w9,w2,lsr#10	// sigma1(X[i+14])
+	add	w4,w4,w13
+	add	w23,w23,w27			// d+=h
+	add	w27,w27,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w4,w4,w10
+	add	w27,w27,w17			// h+=Sigma0(a)
+	add	w4,w4,w9
+	ldr	w9,[sp,#8]
+	str	w12,[sp,#4]
+	ror	w16,w23,#6
+	add	w26,w26,w28			// h+=K[i]
+	ror	w11,w6,#7
+	and	w17,w24,w23
+	ror	w10,w3,#17
+	bic	w28,w25,w23
+	ror	w12,w27,#2
+	add	w26,w26,w4			// h+=X[i]
+	eor	w16,w16,w23,ror#11
+	eor	w11,w11,w6,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w27,w20			// a^b, b^c in next round
+	eor	w16,w16,w23,ror#25	// Sigma1(e)
+	eor	w12,w12,w27,ror#13
+	add	w26,w26,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w10,w10,w3,ror#19
+	eor	w11,w11,w6,lsr#3	// sigma0(X[i+1])
+	add	w26,w26,w16			// h+=Sigma1(e)
+	eor	w19,w19,w20			// Maj(a,b,c)
+	eor	w17,w12,w27,ror#22	// Sigma0(a)
+	eor	w10,w10,w3,lsr#10	// sigma1(X[i+14])
+	add	w5,w5,w14
+	add	w22,w22,w26			// d+=h
+	add	w26,w26,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w5,w5,w11
+	add	w26,w26,w17			// h+=Sigma0(a)
+	add	w5,w5,w10
+	ldr	w10,[sp,#12]
+	str	w13,[sp,#8]
+	ror	w16,w22,#6
+	add	w25,w25,w19			// h+=K[i]
+	ror	w12,w7,#7
+	and	w17,w23,w22
+	ror	w11,w4,#17
+	bic	w19,w24,w22
+	ror	w13,w26,#2
+	add	w25,w25,w5			// h+=X[i]
+	eor	w16,w16,w22,ror#11
+	eor	w12,w12,w7,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w26,w27			// a^b, b^c in next round
+	eor	w16,w16,w22,ror#25	// Sigma1(e)
+	eor	w13,w13,w26,ror#13
+	add	w25,w25,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w11,w11,w4,ror#19
+	eor	w12,w12,w7,lsr#3	// sigma0(X[i+1])
+	add	w25,w25,w16			// h+=Sigma1(e)
+	eor	w28,w28,w27			// Maj(a,b,c)
+	eor	w17,w13,w26,ror#22	// Sigma0(a)
+	eor	w11,w11,w4,lsr#10	// sigma1(X[i+14])
+	add	w6,w6,w15
+	add	w21,w21,w25			// d+=h
+	add	w25,w25,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w6,w6,w12
+	add	w25,w25,w17			// h+=Sigma0(a)
+	add	w6,w6,w11
+	ldr	w11,[sp,#0]
+	str	w14,[sp,#12]
+	ror	w16,w21,#6
+	add	w24,w24,w28			// h+=K[i]
+	ror	w13,w8,#7
+	and	w17,w22,w21
+	ror	w12,w5,#17
+	bic	w28,w23,w21
+	ror	w14,w25,#2
+	add	w24,w24,w6			// h+=X[i]
+	eor	w16,w16,w21,ror#11
+	eor	w13,w13,w8,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w25,w26			// a^b, b^c in next round
+	eor	w16,w16,w21,ror#25	// Sigma1(e)
+	eor	w14,w14,w25,ror#13
+	add	w24,w24,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w12,w12,w5,ror#19
+	eor	w13,w13,w8,lsr#3	// sigma0(X[i+1])
+	add	w24,w24,w16			// h+=Sigma1(e)
+	eor	w19,w19,w26			// Maj(a,b,c)
+	eor	w17,w14,w25,ror#22	// Sigma0(a)
+	eor	w12,w12,w5,lsr#10	// sigma1(X[i+14])
+	add	w7,w7,w0
+	add	w20,w20,w24			// d+=h
+	add	w24,w24,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w7,w7,w13
+	add	w24,w24,w17			// h+=Sigma0(a)
+	add	w7,w7,w12
+	ldr	w12,[sp,#4]
+	str	w15,[sp,#0]
+	ror	w16,w20,#6
+	add	w23,w23,w19			// h+=K[i]
+	ror	w14,w9,#7
+	and	w17,w21,w20
+	ror	w13,w6,#17
+	bic	w19,w22,w20
+	ror	w15,w24,#2
+	add	w23,w23,w7			// h+=X[i]
+	eor	w16,w16,w20,ror#11
+	eor	w14,w14,w9,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w24,w25			// a^b, b^c in next round
+	eor	w16,w16,w20,ror#25	// Sigma1(e)
+	eor	w15,w15,w24,ror#13
+	add	w23,w23,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w13,w13,w6,ror#19
+	eor	w14,w14,w9,lsr#3	// sigma0(X[i+1])
+	add	w23,w23,w16			// h+=Sigma1(e)
+	eor	w28,w28,w25			// Maj(a,b,c)
+	eor	w17,w15,w24,ror#22	// Sigma0(a)
+	eor	w13,w13,w6,lsr#10	// sigma1(X[i+14])
+	add	w8,w8,w1
+	add	w27,w27,w23			// d+=h
+	add	w23,w23,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w8,w8,w14
+	add	w23,w23,w17			// h+=Sigma0(a)
+	add	w8,w8,w13
+	ldr	w13,[sp,#8]
+	str	w0,[sp,#4]
+	ror	w16,w27,#6
+	add	w22,w22,w28			// h+=K[i]
+	ror	w15,w10,#7
+	and	w17,w20,w27
+	ror	w14,w7,#17
+	bic	w28,w21,w27
+	ror	w0,w23,#2
+	add	w22,w22,w8			// h+=X[i]
+	eor	w16,w16,w27,ror#11
+	eor	w15,w15,w10,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w23,w24			// a^b, b^c in next round
+	eor	w16,w16,w27,ror#25	// Sigma1(e)
+	eor	w0,w0,w23,ror#13
+	add	w22,w22,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w14,w14,w7,ror#19
+	eor	w15,w15,w10,lsr#3	// sigma0(X[i+1])
+	add	w22,w22,w16			// h+=Sigma1(e)
+	eor	w19,w19,w24			// Maj(a,b,c)
+	eor	w17,w0,w23,ror#22	// Sigma0(a)
+	eor	w14,w14,w7,lsr#10	// sigma1(X[i+14])
+	add	w9,w9,w2
+	add	w26,w26,w22			// d+=h
+	add	w22,w22,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w9,w9,w15
+	add	w22,w22,w17			// h+=Sigma0(a)
+	add	w9,w9,w14
+	ldr	w14,[sp,#12]
+	str	w1,[sp,#8]
+	ror	w16,w26,#6
+	add	w21,w21,w19			// h+=K[i]
+	ror	w0,w11,#7
+	and	w17,w27,w26
+	ror	w15,w8,#17
+	bic	w19,w20,w26
+	ror	w1,w22,#2
+	add	w21,w21,w9			// h+=X[i]
+	eor	w16,w16,w26,ror#11
+	eor	w0,w0,w11,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w22,w23			// a^b, b^c in next round
+	eor	w16,w16,w26,ror#25	// Sigma1(e)
+	eor	w1,w1,w22,ror#13
+	add	w21,w21,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w15,w15,w8,ror#19
+	eor	w0,w0,w11,lsr#3	// sigma0(X[i+1])
+	add	w21,w21,w16			// h+=Sigma1(e)
+	eor	w28,w28,w23			// Maj(a,b,c)
+	eor	w17,w1,w22,ror#22	// Sigma0(a)
+	eor	w15,w15,w8,lsr#10	// sigma1(X[i+14])
+	add	w10,w10,w3
+	add	w25,w25,w21			// d+=h
+	add	w21,w21,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w10,w10,w0
+	add	w21,w21,w17			// h+=Sigma0(a)
+	add	w10,w10,w15
+	ldr	w15,[sp,#0]
+	str	w2,[sp,#12]
+	ror	w16,w25,#6
+	add	w20,w20,w28			// h+=K[i]
+	ror	w1,w12,#7
+	and	w17,w26,w25
+	ror	w0,w9,#17
+	bic	w28,w27,w25
+	ror	w2,w21,#2
+	add	w20,w20,w10			// h+=X[i]
+	eor	w16,w16,w25,ror#11
+	eor	w1,w1,w12,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w21,w22			// a^b, b^c in next round
+	eor	w16,w16,w25,ror#25	// Sigma1(e)
+	eor	w2,w2,w21,ror#13
+	add	w20,w20,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w0,w0,w9,ror#19
+	eor	w1,w1,w12,lsr#3	// sigma0(X[i+1])
+	add	w20,w20,w16			// h+=Sigma1(e)
+	eor	w19,w19,w22			// Maj(a,b,c)
+	eor	w17,w2,w21,ror#22	// Sigma0(a)
+	eor	w0,w0,w9,lsr#10	// sigma1(X[i+14])
+	add	w11,w11,w4
+	add	w24,w24,w20			// d+=h
+	add	w20,w20,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w11,w11,w1
+	add	w20,w20,w17			// h+=Sigma0(a)
+	add	w11,w11,w0
+	ldr	w0,[sp,#4]
+	str	w3,[sp,#0]
+	ror	w16,w24,#6
+	add	w27,w27,w19			// h+=K[i]
+	ror	w2,w13,#7
+	and	w17,w25,w24
+	ror	w1,w10,#17
+	bic	w19,w26,w24
+	ror	w3,w20,#2
+	add	w27,w27,w11			// h+=X[i]
+	eor	w16,w16,w24,ror#11
+	eor	w2,w2,w13,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w20,w21			// a^b, b^c in next round
+	eor	w16,w16,w24,ror#25	// Sigma1(e)
+	eor	w3,w3,w20,ror#13
+	add	w27,w27,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w1,w1,w10,ror#19
+	eor	w2,w2,w13,lsr#3	// sigma0(X[i+1])
+	add	w27,w27,w16			// h+=Sigma1(e)
+	eor	w28,w28,w21			// Maj(a,b,c)
+	eor	w17,w3,w20,ror#22	// Sigma0(a)
+	eor	w1,w1,w10,lsr#10	// sigma1(X[i+14])
+	add	w12,w12,w5
+	add	w23,w23,w27			// d+=h
+	add	w27,w27,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w12,w12,w2
+	add	w27,w27,w17			// h+=Sigma0(a)
+	add	w12,w12,w1
+	ldr	w1,[sp,#8]
+	str	w4,[sp,#4]
+	ror	w16,w23,#6
+	add	w26,w26,w28			// h+=K[i]
+	ror	w3,w14,#7
+	and	w17,w24,w23
+	ror	w2,w11,#17
+	bic	w28,w25,w23
+	ror	w4,w27,#2
+	add	w26,w26,w12			// h+=X[i]
+	eor	w16,w16,w23,ror#11
+	eor	w3,w3,w14,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w27,w20			// a^b, b^c in next round
+	eor	w16,w16,w23,ror#25	// Sigma1(e)
+	eor	w4,w4,w27,ror#13
+	add	w26,w26,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w2,w2,w11,ror#19
+	eor	w3,w3,w14,lsr#3	// sigma0(X[i+1])
+	add	w26,w26,w16			// h+=Sigma1(e)
+	eor	w19,w19,w20			// Maj(a,b,c)
+	eor	w17,w4,w27,ror#22	// Sigma0(a)
+	eor	w2,w2,w11,lsr#10	// sigma1(X[i+14])
+	add	w13,w13,w6
+	add	w22,w22,w26			// d+=h
+	add	w26,w26,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w13,w13,w3
+	add	w26,w26,w17			// h+=Sigma0(a)
+	add	w13,w13,w2
+	ldr	w2,[sp,#12]
+	str	w5,[sp,#8]
+	ror	w16,w22,#6
+	add	w25,w25,w19			// h+=K[i]
+	ror	w4,w15,#7
+	and	w17,w23,w22
+	ror	w3,w12,#17
+	bic	w19,w24,w22
+	ror	w5,w26,#2
+	add	w25,w25,w13			// h+=X[i]
+	eor	w16,w16,w22,ror#11
+	eor	w4,w4,w15,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w26,w27			// a^b, b^c in next round
+	eor	w16,w16,w22,ror#25	// Sigma1(e)
+	eor	w5,w5,w26,ror#13
+	add	w25,w25,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w3,w3,w12,ror#19
+	eor	w4,w4,w15,lsr#3	// sigma0(X[i+1])
+	add	w25,w25,w16			// h+=Sigma1(e)
+	eor	w28,w28,w27			// Maj(a,b,c)
+	eor	w17,w5,w26,ror#22	// Sigma0(a)
+	eor	w3,w3,w12,lsr#10	// sigma1(X[i+14])
+	add	w14,w14,w7
+	add	w21,w21,w25			// d+=h
+	add	w25,w25,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w14,w14,w4
+	add	w25,w25,w17			// h+=Sigma0(a)
+	add	w14,w14,w3
+	ldr	w3,[sp,#0]
+	str	w6,[sp,#12]
+	ror	w16,w21,#6
+	add	w24,w24,w28			// h+=K[i]
+	ror	w5,w0,#7
+	and	w17,w22,w21
+	ror	w4,w13,#17
+	bic	w28,w23,w21
+	ror	w6,w25,#2
+	add	w24,w24,w14			// h+=X[i]
+	eor	w16,w16,w21,ror#11
+	eor	w5,w5,w0,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w25,w26			// a^b, b^c in next round
+	eor	w16,w16,w21,ror#25	// Sigma1(e)
+	eor	w6,w6,w25,ror#13
+	add	w24,w24,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w4,w4,w13,ror#19
+	eor	w5,w5,w0,lsr#3	// sigma0(X[i+1])
+	add	w24,w24,w16			// h+=Sigma1(e)
+	eor	w19,w19,w26			// Maj(a,b,c)
+	eor	w17,w6,w25,ror#22	// Sigma0(a)
+	eor	w4,w4,w13,lsr#10	// sigma1(X[i+14])
+	add	w15,w15,w8
+	add	w20,w20,w24			// d+=h
+	add	w24,w24,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w15,w15,w5
+	add	w24,w24,w17			// h+=Sigma0(a)
+	add	w15,w15,w4
+	ldr	w4,[sp,#4]
+	str	w7,[sp,#0]
+	ror	w16,w20,#6
+	add	w23,w23,w19			// h+=K[i]
+	ror	w6,w1,#7
+	and	w17,w21,w20
+	ror	w5,w14,#17
+	bic	w19,w22,w20
+	ror	w7,w24,#2
+	add	w23,w23,w15			// h+=X[i]
+	eor	w16,w16,w20,ror#11
+	eor	w6,w6,w1,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w24,w25			// a^b, b^c in next round
+	eor	w16,w16,w20,ror#25	// Sigma1(e)
+	eor	w7,w7,w24,ror#13
+	add	w23,w23,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w5,w5,w14,ror#19
+	eor	w6,w6,w1,lsr#3	// sigma0(X[i+1])
+	add	w23,w23,w16			// h+=Sigma1(e)
+	eor	w28,w28,w25			// Maj(a,b,c)
+	eor	w17,w7,w24,ror#22	// Sigma0(a)
+	eor	w5,w5,w14,lsr#10	// sigma1(X[i+14])
+	add	w0,w0,w9
+	add	w27,w27,w23			// d+=h
+	add	w23,w23,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w0,w0,w6
+	add	w23,w23,w17			// h+=Sigma0(a)
+	add	w0,w0,w5
+	ldr	w5,[sp,#8]
+	str	w8,[sp,#4]
+	ror	w16,w27,#6
+	add	w22,w22,w28			// h+=K[i]
+	ror	w7,w2,#7
+	and	w17,w20,w27
+	ror	w6,w15,#17
+	bic	w28,w21,w27
+	ror	w8,w23,#2
+	add	w22,w22,w0			// h+=X[i]
+	eor	w16,w16,w27,ror#11
+	eor	w7,w7,w2,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w23,w24			// a^b, b^c in next round
+	eor	w16,w16,w27,ror#25	// Sigma1(e)
+	eor	w8,w8,w23,ror#13
+	add	w22,w22,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w6,w6,w15,ror#19
+	eor	w7,w7,w2,lsr#3	// sigma0(X[i+1])
+	add	w22,w22,w16			// h+=Sigma1(e)
+	eor	w19,w19,w24			// Maj(a,b,c)
+	eor	w17,w8,w23,ror#22	// Sigma0(a)
+	eor	w6,w6,w15,lsr#10	// sigma1(X[i+14])
+	add	w1,w1,w10
+	add	w26,w26,w22			// d+=h
+	add	w22,w22,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w1,w1,w7
+	add	w22,w22,w17			// h+=Sigma0(a)
+	add	w1,w1,w6
+	ldr	w6,[sp,#12]
+	str	w9,[sp,#8]
+	ror	w16,w26,#6
+	add	w21,w21,w19			// h+=K[i]
+	ror	w8,w3,#7
+	and	w17,w27,w26
+	ror	w7,w0,#17
+	bic	w19,w20,w26
+	ror	w9,w22,#2
+	add	w21,w21,w1			// h+=X[i]
+	eor	w16,w16,w26,ror#11
+	eor	w8,w8,w3,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w22,w23			// a^b, b^c in next round
+	eor	w16,w16,w26,ror#25	// Sigma1(e)
+	eor	w9,w9,w22,ror#13
+	add	w21,w21,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w7,w7,w0,ror#19
+	eor	w8,w8,w3,lsr#3	// sigma0(X[i+1])
+	add	w21,w21,w16			// h+=Sigma1(e)
+	eor	w28,w28,w23			// Maj(a,b,c)
+	eor	w17,w9,w22,ror#22	// Sigma0(a)
+	eor	w7,w7,w0,lsr#10	// sigma1(X[i+14])
+	add	w2,w2,w11
+	add	w25,w25,w21			// d+=h
+	add	w21,w21,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w2,w2,w8
+	add	w21,w21,w17			// h+=Sigma0(a)
+	add	w2,w2,w7
+	ldr	w7,[sp,#0]
+	str	w10,[sp,#12]
+	ror	w16,w25,#6
+	add	w20,w20,w28			// h+=K[i]
+	ror	w9,w4,#7
+	and	w17,w26,w25
+	ror	w8,w1,#17
+	bic	w28,w27,w25
+	ror	w10,w21,#2
+	add	w20,w20,w2			// h+=X[i]
+	eor	w16,w16,w25,ror#11
+	eor	w9,w9,w4,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w21,w22			// a^b, b^c in next round
+	eor	w16,w16,w25,ror#25	// Sigma1(e)
+	eor	w10,w10,w21,ror#13
+	add	w20,w20,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w8,w8,w1,ror#19
+	eor	w9,w9,w4,lsr#3	// sigma0(X[i+1])
+	add	w20,w20,w16			// h+=Sigma1(e)
+	eor	w19,w19,w22			// Maj(a,b,c)
+	eor	w17,w10,w21,ror#22	// Sigma0(a)
+	eor	w8,w8,w1,lsr#10	// sigma1(X[i+14])
+	add	w3,w3,w12
+	add	w24,w24,w20			// d+=h
+	add	w20,w20,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w3,w3,w9
+	add	w20,w20,w17			// h+=Sigma0(a)
+	add	w3,w3,w8
+	cbnz	w19,Loop_16_xx
+
+	ldp	x0,x2,[x29,#96]
+	ldr	x1,[x29,#112]
+	sub	x30,x30,#260		// rewind
+
+	ldp	w3,w4,[x0]
+	ldp	w5,w6,[x0,#2*4]
+	add	x1,x1,#14*4			// advance input pointer
+	ldp	w7,w8,[x0,#4*4]
+	add	w20,w20,w3
+	ldp	w9,w10,[x0,#6*4]
+	add	w21,w21,w4
+	add	w22,w22,w5
+	add	w23,w23,w6
+	stp	w20,w21,[x0]
+	add	w24,w24,w7
+	add	w25,w25,w8
+	stp	w22,w23,[x0,#2*4]
+	add	w26,w26,w9
+	add	w27,w27,w10
+	cmp	x1,x2
+	stp	w24,w25,[x0,#4*4]
+	stp	w26,w27,[x0,#6*4]
+	b.ne	Loop
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#4*4
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#128
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+.section	.rodata
+.align	6
+
+LK256:
+.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long	0	//terminator
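+// Note: the zero word above terminates LK256. In the Loop_16_xx rounds, the
+// "*K++" load that reads it leaves zero in the register, so the cbnz falls
+// through and x30 is rewound by 65*4 = 260 bytes to the start of the table.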
+
+.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0	// "SHA256 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
+.align	2
+.align	2
+.text
+#ifndef	__KERNEL__
+.globl	sha256_block_data_order_hw
+
+.def sha256_block_data_order_hw
+   .type 32
+.endef
+.align	6
+sha256_block_data_order_hw:
+	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ld1	{v0.4s,v1.4s},[x0]
+	adrp	x3,LK256
+	add	x3,x3,:lo12:LK256
+
+Loop_hw:
+	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+	sub	x2,x2,#1
+	ld1	{v16.4s},[x3],#16
+	rev32	v4.16b,v4.16b
+	rev32	v5.16b,v5.16b
+	rev32	v6.16b,v6.16b
+	rev32	v7.16b,v7.16b
+	orr	v18.16b,v0.16b,v0.16b		// offload
+	orr	v19.16b,v1.16b,v1.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+.long	0x5e2828a4	//sha256su0 v4.16b,v5.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.long	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+.long	0x5e2828c5	//sha256su0 v5.16b,v6.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.long	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v6.4s
+.long	0x5e2828e6	//sha256su0 v6.16b,v7.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.long	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v7.4s
+.long	0x5e282887	//sha256su0 v7.16b,v4.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.long	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+.long	0x5e2828a4	//sha256su0 v4.16b,v5.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.long	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+.long	0x5e2828c5	//sha256su0 v5.16b,v6.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.long	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v6.4s
+.long	0x5e2828e6	//sha256su0 v6.16b,v7.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.long	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v7.4s
+.long	0x5e282887	//sha256su0 v7.16b,v4.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.long	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+.long	0x5e2828a4	//sha256su0 v4.16b,v5.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.long	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+.long	0x5e2828c5	//sha256su0 v5.16b,v6.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.long	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v6.4s
+.long	0x5e2828e6	//sha256su0 v6.16b,v7.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.long	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v7.4s
+.long	0x5e282887	//sha256su0 v7.16b,v4.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.long	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+
+	ld1	{v17.4s},[x3]
+	add	v16.4s,v16.4s,v6.4s
+	sub	x3,x3,#64*4-16	// rewind
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+
+	add	v17.4s,v17.4s,v7.4s
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+
+	add	v0.4s,v0.4s,v18.4s
+	add	v1.4s,v1.4s,v19.4s
+
+	cbnz	x2,Loop_hw
+
+	st1	{v0.4s,v1.4s},[x0]
+
+	ldr	x29,[sp],#16
+	ret
+
+#endif
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
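
The unrolled rounds in this file, and in the x86-64 files that follow, all compute the standard FIPS 180-4 SHA-256 compression function; only the instruction scheduling differs per target. As a reading aid, here is a minimal plain-Go sketch of the same round structure, with names matching the assembly comments (`Sigma1(e)`, `Ch(e,f,g)`, `Maj(a,b,c)`, `Sigma0(a)`, and the schedule's `sigma0`/`sigma1`), checked against `crypto/sha256`. This is an orientation sketch, not code from the tree.

```go
package main

import (
	"crypto/sha256"
	"encoding/binary"
	"fmt"
	"math/bits"
)

// K is the same constant table as LK256/K256 in the generated files.
var K = [64]uint32{
	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
}

func ror(x uint32, n int) uint32 { return bits.RotateLeft32(x, -n) }

// The functions the assembly comments name.
func Ch(e, f, g uint32) uint32  { return (e & f) ^ (^e & g) }
func Maj(a, b, c uint32) uint32 { return (a & b) ^ (a & c) ^ (b & c) }
func Sigma0(a uint32) uint32    { return ror(a, 2) ^ ror(a, 13) ^ ror(a, 22) }
func Sigma1(e uint32) uint32    { return ror(e, 6) ^ ror(e, 11) ^ ror(e, 25) }
func sigma0(x uint32) uint32    { return ror(x, 7) ^ ror(x, 18) ^ (x >> 3) }
func sigma1(x uint32) uint32    { return ror(x, 17) ^ ror(x, 19) ^ (x >> 10) }

// compress processes one 64-byte block, mirroring the round comments:
// h += K[i]+X[i]+Sigma1(e)+Ch(e,f,g); d += h; h += Sigma0(a)+Maj(a,b,c);
// and, for i >= 16, W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16].
func compress(h *[8]uint32, block []byte) {
	var w [64]uint32
	for i := 0; i < 16; i++ {
		w[i] = binary.BigEndian.Uint32(block[4*i:]) // the rev32/bswapl loads
	}
	for i := 16; i < 64; i++ {
		w[i] = sigma1(w[i-2]) + w[i-7] + sigma0(w[i-15]) + w[i-16]
	}
	a, b, c, d, e, f, g, hh := h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
	for i := 0; i < 64; i++ {
		t1 := hh + Sigma1(e) + Ch(e, f, g) + K[i] + w[i]
		t2 := Sigma0(a) + Maj(a, b, c)
		hh, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
	}
	for i, v := range [8]uint32{a, b, c, d, e, f, g, hh} {
		h[i] += v
	}
}

func main() {
	// One padded block for the empty message: 0x80, then zeros, length 0.
	var block [64]byte
	block[0] = 0x80
	h := [8]uint32{0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
		0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19}
	compress(&h, block[:])
	var got [32]byte
	for i, v := range h {
		binary.BigEndian.PutUint32(got[4*i:], v)
	}
	fmt.Println(got == sha256.Sum256(nil)) // true
}
```

The assembly computes the same functions with fewer operations: `Maj(a,b,c)` as `b ^ ((b^c) & (a^b))` (the `eor`/`and`/`eor` sequence commented "a^b, b^c in next round") and `Ch(e,f,g)` as `(f&e) | (g&~e)` (the `and`/`bic`/`orr` sequence).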
diff --git a/gen/bcm/sha256-x86_64-apple.S b/gen/bcm/sha256-x86_64-apple.S
new file mode 100644
index 0000000..b33f807
--- /dev/null
+++ b/gen/bcm/sha256-x86_64-apple.S
@@ -0,0 +1,4170 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text	
+
+.globl	_sha256_block_data_order_nohw
+.private_extern _sha256_block_data_order_nohw
+
+.p2align	4
+_sha256_block_data_order_nohw:
+
+_CET_ENDBR
+	movq	%rsp,%rax
+
+	pushq	%rbx
+
+	pushq	%rbp
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+	shlq	$4,%rdx
+	subq	$64+32,%rsp
+	leaq	(%rsi,%rdx,4),%rdx
+	andq	$-64,%rsp
+	movq	%rdi,64+0(%rsp)
+	movq	%rsi,64+8(%rsp)
+	movq	%rdx,64+16(%rsp)
+	movq	%rax,88(%rsp)
+
+L$prologue:
+
+	movl	0(%rdi),%eax
+	movl	4(%rdi),%ebx
+	movl	8(%rdi),%ecx
+	movl	12(%rdi),%edx
+	movl	16(%rdi),%r8d
+	movl	20(%rdi),%r9d
+	movl	24(%rdi),%r10d
+	movl	28(%rdi),%r11d
+	jmp	L$loop
+
+.p2align	4
+L$loop:
+	movl	%ebx,%edi
+	leaq	K256(%rip),%rbp
+	xorl	%ecx,%edi
+	movl	0(%rsi),%r12d
+	movl	%r8d,%r13d
+	movl	%eax,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r9d,%r15d
+
+	xorl	%r8d,%r13d
+	rorl	$9,%r14d
+	xorl	%r10d,%r15d
+
+	movl	%r12d,0(%rsp)
+	xorl	%eax,%r14d
+	andl	%r8d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r11d,%r12d
+	xorl	%r10d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r8d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%eax,%r15d
+	addl	(%rbp),%r12d
+	xorl	%eax,%r14d
+
+	xorl	%ebx,%r15d
+	rorl	$6,%r13d
+	movl	%ebx,%r11d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r11d
+	addl	%r12d,%edx
+	addl	%r12d,%r11d
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%r11d
+	movl	4(%rsi),%r12d
+	movl	%edx,%r13d
+	movl	%r11d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r8d,%edi
+
+	xorl	%edx,%r13d
+	rorl	$9,%r14d
+	xorl	%r9d,%edi
+
+	movl	%r12d,4(%rsp)
+	xorl	%r11d,%r14d
+	andl	%edx,%edi
+
+	rorl	$5,%r13d
+	addl	%r10d,%r12d
+	xorl	%r9d,%edi
+
+	rorl	$11,%r14d
+	xorl	%edx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r11d,%edi
+	addl	(%rbp),%r12d
+	xorl	%r11d,%r14d
+
+	xorl	%eax,%edi
+	rorl	$6,%r13d
+	movl	%eax,%r10d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r10d
+	addl	%r12d,%ecx
+	addl	%r12d,%r10d
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%r10d
+	movl	8(%rsi),%r12d
+	movl	%ecx,%r13d
+	movl	%r10d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%edx,%r15d
+
+	xorl	%ecx,%r13d
+	rorl	$9,%r14d
+	xorl	%r8d,%r15d
+
+	movl	%r12d,8(%rsp)
+	xorl	%r10d,%r14d
+	andl	%ecx,%r15d
+
+	rorl	$5,%r13d
+	addl	%r9d,%r12d
+	xorl	%r8d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%ecx,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r10d,%r15d
+	addl	(%rbp),%r12d
+	xorl	%r10d,%r14d
+
+	xorl	%r11d,%r15d
+	rorl	$6,%r13d
+	movl	%r11d,%r9d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r9d
+	addl	%r12d,%ebx
+	addl	%r12d,%r9d
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%r9d
+	movl	12(%rsi),%r12d
+	movl	%ebx,%r13d
+	movl	%r9d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%ecx,%edi
+
+	xorl	%ebx,%r13d
+	rorl	$9,%r14d
+	xorl	%edx,%edi
+
+	movl	%r12d,12(%rsp)
+	xorl	%r9d,%r14d
+	andl	%ebx,%edi
+
+	rorl	$5,%r13d
+	addl	%r8d,%r12d
+	xorl	%edx,%edi
+
+	rorl	$11,%r14d
+	xorl	%ebx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r9d,%edi
+	addl	(%rbp),%r12d
+	xorl	%r9d,%r14d
+
+	xorl	%r10d,%edi
+	rorl	$6,%r13d
+	movl	%r10d,%r8d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r8d
+	addl	%r12d,%eax
+	addl	%r12d,%r8d
+
+	leaq	20(%rbp),%rbp
+	addl	%r14d,%r8d
+	movl	16(%rsi),%r12d
+	movl	%eax,%r13d
+	movl	%r8d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%ebx,%r15d
+
+	xorl	%eax,%r13d
+	rorl	$9,%r14d
+	xorl	%ecx,%r15d
+
+	movl	%r12d,16(%rsp)
+	xorl	%r8d,%r14d
+	andl	%eax,%r15d
+
+	rorl	$5,%r13d
+	addl	%edx,%r12d
+	xorl	%ecx,%r15d
+
+	rorl	$11,%r14d
+	xorl	%eax,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r8d,%r15d
+	addl	(%rbp),%r12d
+	xorl	%r8d,%r14d
+
+	xorl	%r9d,%r15d
+	rorl	$6,%r13d
+	movl	%r9d,%edx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%edx
+	addl	%r12d,%r11d
+	addl	%r12d,%edx
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%edx
+	movl	20(%rsi),%r12d
+	movl	%r11d,%r13d
+	movl	%edx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%eax,%edi
+
+	xorl	%r11d,%r13d
+	rorl	$9,%r14d
+	xorl	%ebx,%edi
+
+	movl	%r12d,20(%rsp)
+	xorl	%edx,%r14d
+	andl	%r11d,%edi
+
+	rorl	$5,%r13d
+	addl	%ecx,%r12d
+	xorl	%ebx,%edi
+
+	rorl	$11,%r14d
+	xorl	%r11d,%r13d
+	addl	%edi,%r12d
+
+	movl	%edx,%edi
+	addl	(%rbp),%r12d
+	xorl	%edx,%r14d
+
+	xorl	%r8d,%edi
+	rorl	$6,%r13d
+	movl	%r8d,%ecx
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%ecx
+	addl	%r12d,%r10d
+	addl	%r12d,%ecx
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%ecx
+	movl	24(%rsi),%r12d
+	movl	%r10d,%r13d
+	movl	%ecx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r11d,%r15d
+
+	xorl	%r10d,%r13d
+	rorl	$9,%r14d
+	xorl	%eax,%r15d
+
+	movl	%r12d,24(%rsp)
+	xorl	%ecx,%r14d
+	andl	%r10d,%r15d
+
+	rorl	$5,%r13d
+	addl	%ebx,%r12d
+	xorl	%eax,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r10d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%ecx,%r15d
+	addl	(%rbp),%r12d
+	xorl	%ecx,%r14d
+
+	xorl	%edx,%r15d
+	rorl	$6,%r13d
+	movl	%edx,%ebx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%ebx
+	addl	%r12d,%r9d
+	addl	%r12d,%ebx
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%ebx
+	movl	28(%rsi),%r12d
+	movl	%r9d,%r13d
+	movl	%ebx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r10d,%edi
+
+	xorl	%r9d,%r13d
+	rorl	$9,%r14d
+	xorl	%r11d,%edi
+
+	movl	%r12d,28(%rsp)
+	xorl	%ebx,%r14d
+	andl	%r9d,%edi
+
+	rorl	$5,%r13d
+	addl	%eax,%r12d
+	xorl	%r11d,%edi
+
+	rorl	$11,%r14d
+	xorl	%r9d,%r13d
+	addl	%edi,%r12d
+
+	movl	%ebx,%edi
+	addl	(%rbp),%r12d
+	xorl	%ebx,%r14d
+
+	xorl	%ecx,%edi
+	rorl	$6,%r13d
+	movl	%ecx,%eax
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%eax
+	addl	%r12d,%r8d
+	addl	%r12d,%eax
+
+	leaq	20(%rbp),%rbp
+	addl	%r14d,%eax
+	movl	32(%rsi),%r12d
+	movl	%r8d,%r13d
+	movl	%eax,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r9d,%r15d
+
+	xorl	%r8d,%r13d
+	rorl	$9,%r14d
+	xorl	%r10d,%r15d
+
+	movl	%r12d,32(%rsp)
+	xorl	%eax,%r14d
+	andl	%r8d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r11d,%r12d
+	xorl	%r10d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r8d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%eax,%r15d
+	addl	(%rbp),%r12d
+	xorl	%eax,%r14d
+
+	xorl	%ebx,%r15d
+	rorl	$6,%r13d
+	movl	%ebx,%r11d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r11d
+	addl	%r12d,%edx
+	addl	%r12d,%r11d
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%r11d
+	movl	36(%rsi),%r12d
+	movl	%edx,%r13d
+	movl	%r11d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r8d,%edi
+
+	xorl	%edx,%r13d
+	rorl	$9,%r14d
+	xorl	%r9d,%edi
+
+	movl	%r12d,36(%rsp)
+	xorl	%r11d,%r14d
+	andl	%edx,%edi
+
+	rorl	$5,%r13d
+	addl	%r10d,%r12d
+	xorl	%r9d,%edi
+
+	rorl	$11,%r14d
+	xorl	%edx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r11d,%edi
+	addl	(%rbp),%r12d
+	xorl	%r11d,%r14d
+
+	xorl	%eax,%edi
+	rorl	$6,%r13d
+	movl	%eax,%r10d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r10d
+	addl	%r12d,%ecx
+	addl	%r12d,%r10d
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%r10d
+	movl	40(%rsi),%r12d
+	movl	%ecx,%r13d
+	movl	%r10d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%edx,%r15d
+
+	xorl	%ecx,%r13d
+	rorl	$9,%r14d
+	xorl	%r8d,%r15d
+
+	movl	%r12d,40(%rsp)
+	xorl	%r10d,%r14d
+	andl	%ecx,%r15d
+
+	rorl	$5,%r13d
+	addl	%r9d,%r12d
+	xorl	%r8d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%ecx,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r10d,%r15d
+	addl	(%rbp),%r12d
+	xorl	%r10d,%r14d
+
+	xorl	%r11d,%r15d
+	rorl	$6,%r13d
+	movl	%r11d,%r9d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r9d
+	addl	%r12d,%ebx
+	addl	%r12d,%r9d
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%r9d
+	movl	44(%rsi),%r12d
+	movl	%ebx,%r13d
+	movl	%r9d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%ecx,%edi
+
+	xorl	%ebx,%r13d
+	rorl	$9,%r14d
+	xorl	%edx,%edi
+
+	movl	%r12d,44(%rsp)
+	xorl	%r9d,%r14d
+	andl	%ebx,%edi
+
+	rorl	$5,%r13d
+	addl	%r8d,%r12d
+	xorl	%edx,%edi
+
+	rorl	$11,%r14d
+	xorl	%ebx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r9d,%edi
+	addl	(%rbp),%r12d
+	xorl	%r9d,%r14d
+
+	xorl	%r10d,%edi
+	rorl	$6,%r13d
+	movl	%r10d,%r8d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r8d
+	addl	%r12d,%eax
+	addl	%r12d,%r8d
+
+	leaq	20(%rbp),%rbp
+	addl	%r14d,%r8d
+	movl	48(%rsi),%r12d
+	movl	%eax,%r13d
+	movl	%r8d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%ebx,%r15d
+
+	xorl	%eax,%r13d
+	rorl	$9,%r14d
+	xorl	%ecx,%r15d
+
+	movl	%r12d,48(%rsp)
+	xorl	%r8d,%r14d
+	andl	%eax,%r15d
+
+	rorl	$5,%r13d
+	addl	%edx,%r12d
+	xorl	%ecx,%r15d
+
+	rorl	$11,%r14d
+	xorl	%eax,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r8d,%r15d
+	addl	(%rbp),%r12d
+	xorl	%r8d,%r14d
+
+	xorl	%r9d,%r15d
+	rorl	$6,%r13d
+	movl	%r9d,%edx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%edx
+	addl	%r12d,%r11d
+	addl	%r12d,%edx
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%edx
+	movl	52(%rsi),%r12d
+	movl	%r11d,%r13d
+	movl	%edx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%eax,%edi
+
+	xorl	%r11d,%r13d
+	rorl	$9,%r14d
+	xorl	%ebx,%edi
+
+	movl	%r12d,52(%rsp)
+	xorl	%edx,%r14d
+	andl	%r11d,%edi
+
+	rorl	$5,%r13d
+	addl	%ecx,%r12d
+	xorl	%ebx,%edi
+
+	rorl	$11,%r14d
+	xorl	%r11d,%r13d
+	addl	%edi,%r12d
+
+	movl	%edx,%edi
+	addl	(%rbp),%r12d
+	xorl	%edx,%r14d
+
+	xorl	%r8d,%edi
+	rorl	$6,%r13d
+	movl	%r8d,%ecx
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%ecx
+	addl	%r12d,%r10d
+	addl	%r12d,%ecx
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%ecx
+	movl	56(%rsi),%r12d
+	movl	%r10d,%r13d
+	movl	%ecx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r11d,%r15d
+
+	xorl	%r10d,%r13d
+	rorl	$9,%r14d
+	xorl	%eax,%r15d
+
+	movl	%r12d,56(%rsp)
+	xorl	%ecx,%r14d
+	andl	%r10d,%r15d
+
+	rorl	$5,%r13d
+	addl	%ebx,%r12d
+	xorl	%eax,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r10d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%ecx,%r15d
+	addl	(%rbp),%r12d
+	xorl	%ecx,%r14d
+
+	xorl	%edx,%r15d
+	rorl	$6,%r13d
+	movl	%edx,%ebx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%ebx
+	addl	%r12d,%r9d
+	addl	%r12d,%ebx
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%ebx
+	movl	60(%rsi),%r12d
+	movl	%r9d,%r13d
+	movl	%ebx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r10d,%edi
+
+	xorl	%r9d,%r13d
+	rorl	$9,%r14d
+	xorl	%r11d,%edi
+
+	movl	%r12d,60(%rsp)
+	xorl	%ebx,%r14d
+	andl	%r9d,%edi
+
+	rorl	$5,%r13d
+	addl	%eax,%r12d
+	xorl	%r11d,%edi
+
+	rorl	$11,%r14d
+	xorl	%r9d,%r13d
+	addl	%edi,%r12d
+
+	movl	%ebx,%edi
+	addl	(%rbp),%r12d
+	xorl	%ebx,%r14d
+
+	xorl	%ecx,%edi
+	rorl	$6,%r13d
+	movl	%ecx,%eax
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%eax
+	addl	%r12d,%r8d
+	addl	%r12d,%eax
+
+	leaq	20(%rbp),%rbp
+	jmp	L$rounds_16_xx
+.p2align	4
+L$rounds_16_xx:
+	movl	4(%rsp),%r13d
+	movl	56(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%eax
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	36(%rsp),%r12d
+
+	addl	0(%rsp),%r12d
+	movl	%r8d,%r13d
+	addl	%r15d,%r12d
+	movl	%eax,%r14d
+	rorl	$14,%r13d
+	movl	%r9d,%r15d
+
+	xorl	%r8d,%r13d
+	rorl	$9,%r14d
+	xorl	%r10d,%r15d
+
+	movl	%r12d,0(%rsp)
+	xorl	%eax,%r14d
+	andl	%r8d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r11d,%r12d
+	xorl	%r10d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r8d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%eax,%r15d
+	addl	(%rbp),%r12d
+	xorl	%eax,%r14d
+
+	xorl	%ebx,%r15d
+	rorl	$6,%r13d
+	movl	%ebx,%r11d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r11d
+	addl	%r12d,%edx
+	addl	%r12d,%r11d
+
+	leaq	4(%rbp),%rbp
+	movl	8(%rsp),%r13d
+	movl	60(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r11d
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	40(%rsp),%r12d
+
+	addl	4(%rsp),%r12d
+	movl	%edx,%r13d
+	addl	%edi,%r12d
+	movl	%r11d,%r14d
+	rorl	$14,%r13d
+	movl	%r8d,%edi
+
+	xorl	%edx,%r13d
+	rorl	$9,%r14d
+	xorl	%r9d,%edi
+
+	movl	%r12d,4(%rsp)
+	xorl	%r11d,%r14d
+	andl	%edx,%edi
+
+	rorl	$5,%r13d
+	addl	%r10d,%r12d
+	xorl	%r9d,%edi
+
+	rorl	$11,%r14d
+	xorl	%edx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r11d,%edi
+	addl	(%rbp),%r12d
+	xorl	%r11d,%r14d
+
+	xorl	%eax,%edi
+	rorl	$6,%r13d
+	movl	%eax,%r10d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r10d
+	addl	%r12d,%ecx
+	addl	%r12d,%r10d
+
+	leaq	4(%rbp),%rbp
+	movl	12(%rsp),%r13d
+	movl	0(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r10d
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	44(%rsp),%r12d
+
+	addl	8(%rsp),%r12d
+	movl	%ecx,%r13d
+	addl	%r15d,%r12d
+	movl	%r10d,%r14d
+	rorl	$14,%r13d
+	movl	%edx,%r15d
+
+	xorl	%ecx,%r13d
+	rorl	$9,%r14d
+	xorl	%r8d,%r15d
+
+	movl	%r12d,8(%rsp)
+	xorl	%r10d,%r14d
+	andl	%ecx,%r15d
+
+	rorl	$5,%r13d
+	addl	%r9d,%r12d
+	xorl	%r8d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%ecx,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r10d,%r15d
+	addl	(%rbp),%r12d
+	xorl	%r10d,%r14d
+
+	xorl	%r11d,%r15d
+	rorl	$6,%r13d
+	movl	%r11d,%r9d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r9d
+	addl	%r12d,%ebx
+	addl	%r12d,%r9d
+
+	leaq	4(%rbp),%rbp
+	movl	16(%rsp),%r13d
+	movl	4(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r9d
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	48(%rsp),%r12d
+
+	addl	12(%rsp),%r12d
+	movl	%ebx,%r13d
+	addl	%edi,%r12d
+	movl	%r9d,%r14d
+	rorl	$14,%r13d
+	movl	%ecx,%edi
+
+	xorl	%ebx,%r13d
+	rorl	$9,%r14d
+	xorl	%edx,%edi
+
+	movl	%r12d,12(%rsp)
+	xorl	%r9d,%r14d
+	andl	%ebx,%edi
+
+	rorl	$5,%r13d
+	addl	%r8d,%r12d
+	xorl	%edx,%edi
+
+	rorl	$11,%r14d
+	xorl	%ebx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r9d,%edi
+	addl	(%rbp),%r12d
+	xorl	%r9d,%r14d
+
+	xorl	%r10d,%edi
+	rorl	$6,%r13d
+	movl	%r10d,%r8d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r8d
+	addl	%r12d,%eax
+	addl	%r12d,%r8d
+
+	leaq	20(%rbp),%rbp
+	movl	20(%rsp),%r13d
+	movl	8(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r8d
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	52(%rsp),%r12d
+
+	addl	16(%rsp),%r12d
+	movl	%eax,%r13d
+	addl	%r15d,%r12d
+	movl	%r8d,%r14d
+	rorl	$14,%r13d
+	movl	%ebx,%r15d
+
+	xorl	%eax,%r13d
+	rorl	$9,%r14d
+	xorl	%ecx,%r15d
+
+	movl	%r12d,16(%rsp)
+	xorl	%r8d,%r14d
+	andl	%eax,%r15d
+
+	rorl	$5,%r13d
+	addl	%edx,%r12d
+	xorl	%ecx,%r15d
+
+	rorl	$11,%r14d
+	xorl	%eax,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r8d,%r15d
+	addl	(%rbp),%r12d
+	xorl	%r8d,%r14d
+
+	xorl	%r9d,%r15d
+	rorl	$6,%r13d
+	movl	%r9d,%edx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%edx
+	addl	%r12d,%r11d
+	addl	%r12d,%edx
+
+	leaq	4(%rbp),%rbp
+	movl	24(%rsp),%r13d
+	movl	12(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%edx
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	56(%rsp),%r12d
+
+	addl	20(%rsp),%r12d
+	movl	%r11d,%r13d
+	addl	%edi,%r12d
+	movl	%edx,%r14d
+	rorl	$14,%r13d
+	movl	%eax,%edi
+
+	xorl	%r11d,%r13d
+	rorl	$9,%r14d
+	xorl	%ebx,%edi
+
+	movl	%r12d,20(%rsp)
+	xorl	%edx,%r14d
+	andl	%r11d,%edi
+
+	rorl	$5,%r13d
+	addl	%ecx,%r12d
+	xorl	%ebx,%edi
+
+	rorl	$11,%r14d
+	xorl	%r11d,%r13d
+	addl	%edi,%r12d
+
+	movl	%edx,%edi
+	addl	(%rbp),%r12d
+	xorl	%edx,%r14d
+
+	xorl	%r8d,%edi
+	rorl	$6,%r13d
+	movl	%r8d,%ecx
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%ecx
+	addl	%r12d,%r10d
+	addl	%r12d,%ecx
+
+	leaq	4(%rbp),%rbp
+	movl	28(%rsp),%r13d
+	movl	16(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%ecx
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	60(%rsp),%r12d
+
+	addl	24(%rsp),%r12d
+	movl	%r10d,%r13d
+	addl	%r15d,%r12d
+	movl	%ecx,%r14d
+	rorl	$14,%r13d
+	movl	%r11d,%r15d
+
+	xorl	%r10d,%r13d
+	rorl	$9,%r14d
+	xorl	%eax,%r15d
+
+	movl	%r12d,24(%rsp)
+	xorl	%ecx,%r14d
+	andl	%r10d,%r15d
+
+	rorl	$5,%r13d
+	addl	%ebx,%r12d
+	xorl	%eax,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r10d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%ecx,%r15d
+	addl	(%rbp),%r12d
+	xorl	%ecx,%r14d
+
+	xorl	%edx,%r15d
+	rorl	$6,%r13d
+	movl	%edx,%ebx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%ebx
+	addl	%r12d,%r9d
+	addl	%r12d,%ebx
+
+	leaq	4(%rbp),%rbp
+	movl	32(%rsp),%r13d
+	movl	20(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%ebx
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	0(%rsp),%r12d
+
+	addl	28(%rsp),%r12d
+	movl	%r9d,%r13d
+	addl	%edi,%r12d
+	movl	%ebx,%r14d
+	rorl	$14,%r13d
+	movl	%r10d,%edi
+
+	xorl	%r9d,%r13d
+	rorl	$9,%r14d
+	xorl	%r11d,%edi
+
+	movl	%r12d,28(%rsp)
+	xorl	%ebx,%r14d
+	andl	%r9d,%edi
+
+	rorl	$5,%r13d
+	addl	%eax,%r12d
+	xorl	%r11d,%edi
+
+	rorl	$11,%r14d
+	xorl	%r9d,%r13d
+	addl	%edi,%r12d
+
+	movl	%ebx,%edi
+	addl	(%rbp),%r12d
+	xorl	%ebx,%r14d
+
+	xorl	%ecx,%edi
+	rorl	$6,%r13d
+	movl	%ecx,%eax
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%eax
+	addl	%r12d,%r8d
+	addl	%r12d,%eax
+
+	leaq	20(%rbp),%rbp
+	movl	36(%rsp),%r13d
+	movl	24(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%eax
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	4(%rsp),%r12d
+
+	addl	32(%rsp),%r12d
+	movl	%r8d,%r13d
+	addl	%r15d,%r12d
+	movl	%eax,%r14d
+	rorl	$14,%r13d
+	movl	%r9d,%r15d
+
+	xorl	%r8d,%r13d
+	rorl	$9,%r14d
+	xorl	%r10d,%r15d
+
+	movl	%r12d,32(%rsp)
+	xorl	%eax,%r14d
+	andl	%r8d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r11d,%r12d
+	xorl	%r10d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r8d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%eax,%r15d
+	addl	(%rbp),%r12d
+	xorl	%eax,%r14d
+
+	xorl	%ebx,%r15d
+	rorl	$6,%r13d
+	movl	%ebx,%r11d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r11d
+	addl	%r12d,%edx
+	addl	%r12d,%r11d
+
+	leaq	4(%rbp),%rbp
+	movl	40(%rsp),%r13d
+	movl	28(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r11d
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	8(%rsp),%r12d
+
+	addl	36(%rsp),%r12d
+	movl	%edx,%r13d
+	addl	%edi,%r12d
+	movl	%r11d,%r14d
+	rorl	$14,%r13d
+	movl	%r8d,%edi
+
+	xorl	%edx,%r13d
+	rorl	$9,%r14d
+	xorl	%r9d,%edi
+
+	movl	%r12d,36(%rsp)
+	xorl	%r11d,%r14d
+	andl	%edx,%edi
+
+	rorl	$5,%r13d
+	addl	%r10d,%r12d
+	xorl	%r9d,%edi
+
+	rorl	$11,%r14d
+	xorl	%edx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r11d,%edi
+	addl	(%rbp),%r12d
+	xorl	%r11d,%r14d
+
+	xorl	%eax,%edi
+	rorl	$6,%r13d
+	movl	%eax,%r10d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r10d
+	addl	%r12d,%ecx
+	addl	%r12d,%r10d
+
+	leaq	4(%rbp),%rbp
+	movl	44(%rsp),%r13d
+	movl	32(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r10d
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	12(%rsp),%r12d
+
+	addl	40(%rsp),%r12d
+	movl	%ecx,%r13d
+	addl	%r15d,%r12d
+	movl	%r10d,%r14d
+	rorl	$14,%r13d
+	movl	%edx,%r15d
+
+	xorl	%ecx,%r13d
+	rorl	$9,%r14d
+	xorl	%r8d,%r15d
+
+	movl	%r12d,40(%rsp)
+	xorl	%r10d,%r14d
+	andl	%ecx,%r15d
+
+	rorl	$5,%r13d
+	addl	%r9d,%r12d
+	xorl	%r8d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%ecx,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r10d,%r15d
+	addl	(%rbp),%r12d
+	xorl	%r10d,%r14d
+
+	xorl	%r11d,%r15d
+	rorl	$6,%r13d
+	movl	%r11d,%r9d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r9d
+	addl	%r12d,%ebx
+	addl	%r12d,%r9d
+
+	leaq	4(%rbp),%rbp
+	movl	48(%rsp),%r13d
+	movl	36(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r9d
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	16(%rsp),%r12d
+
+	addl	44(%rsp),%r12d
+	movl	%ebx,%r13d
+	addl	%edi,%r12d
+	movl	%r9d,%r14d
+	rorl	$14,%r13d
+	movl	%ecx,%edi
+
+	xorl	%ebx,%r13d
+	rorl	$9,%r14d
+	xorl	%edx,%edi
+
+	movl	%r12d,44(%rsp)
+	xorl	%r9d,%r14d
+	andl	%ebx,%edi
+
+	rorl	$5,%r13d
+	addl	%r8d,%r12d
+	xorl	%edx,%edi
+
+	rorl	$11,%r14d
+	xorl	%ebx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r9d,%edi
+	addl	(%rbp),%r12d
+	xorl	%r9d,%r14d
+
+	xorl	%r10d,%edi
+	rorl	$6,%r13d
+	movl	%r10d,%r8d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r8d
+	addl	%r12d,%eax
+	addl	%r12d,%r8d
+
+	leaq	20(%rbp),%rbp
+	movl	52(%rsp),%r13d
+	movl	40(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r8d
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	20(%rsp),%r12d
+
+	addl	48(%rsp),%r12d
+	movl	%eax,%r13d
+	addl	%r15d,%r12d
+	movl	%r8d,%r14d
+	rorl	$14,%r13d
+	movl	%ebx,%r15d
+
+	xorl	%eax,%r13d
+	rorl	$9,%r14d
+	xorl	%ecx,%r15d
+
+	movl	%r12d,48(%rsp)
+	xorl	%r8d,%r14d
+	andl	%eax,%r15d
+
+	rorl	$5,%r13d
+	addl	%edx,%r12d
+	xorl	%ecx,%r15d
+
+	rorl	$11,%r14d
+	xorl	%eax,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r8d,%r15d
+	addl	(%rbp),%r12d
+	xorl	%r8d,%r14d
+
+	xorl	%r9d,%r15d
+	rorl	$6,%r13d
+	movl	%r9d,%edx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%edx
+	addl	%r12d,%r11d
+	addl	%r12d,%edx
+
+	leaq	4(%rbp),%rbp
+	movl	56(%rsp),%r13d
+	movl	44(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%edx
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	24(%rsp),%r12d
+
+	addl	52(%rsp),%r12d
+	movl	%r11d,%r13d
+	addl	%edi,%r12d
+	movl	%edx,%r14d
+	rorl	$14,%r13d
+	movl	%eax,%edi
+
+	xorl	%r11d,%r13d
+	rorl	$9,%r14d
+	xorl	%ebx,%edi
+
+	movl	%r12d,52(%rsp)
+	xorl	%edx,%r14d
+	andl	%r11d,%edi
+
+	rorl	$5,%r13d
+	addl	%ecx,%r12d
+	xorl	%ebx,%edi
+
+	rorl	$11,%r14d
+	xorl	%r11d,%r13d
+	addl	%edi,%r12d
+
+	movl	%edx,%edi
+	addl	(%rbp),%r12d
+	xorl	%edx,%r14d
+
+	xorl	%r8d,%edi
+	rorl	$6,%r13d
+	movl	%r8d,%ecx
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%ecx
+	addl	%r12d,%r10d
+	addl	%r12d,%ecx
+
+	leaq	4(%rbp),%rbp
+	movl	60(%rsp),%r13d
+	movl	48(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%ecx
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	28(%rsp),%r12d
+
+	addl	56(%rsp),%r12d
+	movl	%r10d,%r13d
+	addl	%r15d,%r12d
+	movl	%ecx,%r14d
+	rorl	$14,%r13d
+	movl	%r11d,%r15d
+
+	xorl	%r10d,%r13d
+	rorl	$9,%r14d
+	xorl	%eax,%r15d
+
+	movl	%r12d,56(%rsp)
+	xorl	%ecx,%r14d
+	andl	%r10d,%r15d
+
+	rorl	$5,%r13d
+	addl	%ebx,%r12d
+	xorl	%eax,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r10d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%ecx,%r15d
+	addl	(%rbp),%r12d
+	xorl	%ecx,%r14d
+
+	xorl	%edx,%r15d
+	rorl	$6,%r13d
+	movl	%edx,%ebx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%ebx
+	addl	%r12d,%r9d
+	addl	%r12d,%ebx
+
+	leaq	4(%rbp),%rbp
+	movl	0(%rsp),%r13d
+	movl	52(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%ebx
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	32(%rsp),%r12d
+
+	addl	60(%rsp),%r12d
+	movl	%r9d,%r13d
+	addl	%edi,%r12d
+	movl	%ebx,%r14d
+	rorl	$14,%r13d
+	movl	%r10d,%edi
+
+	xorl	%r9d,%r13d
+	rorl	$9,%r14d
+	xorl	%r11d,%edi
+
+	movl	%r12d,60(%rsp)
+	xorl	%ebx,%r14d
+	andl	%r9d,%edi
+
+	rorl	$5,%r13d
+	addl	%eax,%r12d
+	xorl	%r11d,%edi
+
+	rorl	$11,%r14d
+	xorl	%r9d,%r13d
+	addl	%edi,%r12d
+
+	movl	%ebx,%edi
+	addl	(%rbp),%r12d
+	xorl	%ebx,%r14d
+
+	xorl	%ecx,%edi
+	rorl	$6,%r13d
+	movl	%ecx,%eax
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%eax
+	addl	%r12d,%r8d
+	addl	%r12d,%eax
+
+	leaq	20(%rbp),%rbp
+	cmpb	$0,3(%rbp)
+	jnz	L$rounds_16_xx
+
+	movq	64+0(%rsp),%rdi
+	addl	%r14d,%eax
+	leaq	64(%rsi),%rsi
+
+	addl	0(%rdi),%eax
+	addl	4(%rdi),%ebx
+	addl	8(%rdi),%ecx
+	addl	12(%rdi),%edx
+	addl	16(%rdi),%r8d
+	addl	20(%rdi),%r9d
+	addl	24(%rdi),%r10d
+	addl	28(%rdi),%r11d
+
+	cmpq	64+16(%rsp),%rsi
+
+	movl	%eax,0(%rdi)
+	movl	%ebx,4(%rdi)
+	movl	%ecx,8(%rdi)
+	movl	%edx,12(%rdi)
+	movl	%r8d,16(%rdi)
+	movl	%r9d,20(%rdi)
+	movl	%r10d,24(%rdi)
+	movl	%r11d,28(%rdi)
+	jb	L$loop
+
+	movq	88(%rsp),%rsi
+
+	movq	-48(%rsi),%r15
+
+	movq	-40(%rsi),%r14
+
+	movq	-32(%rsi),%r13
+
+	movq	-24(%rsi),%r12
+
+	movq	-16(%rsi),%rbp
+
+	movq	-8(%rsi),%rbx
+
+	leaq	(%rsi),%rsp
+
+L$epilogue:
+	ret
+
+
+.section	__DATA,__const
+.p2align	6
+
+K256:
+.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
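+// Each group of four K words above is stored twice so that 256-bit loads can
+// pick up the same constants in both 128-bit lanes. The 0x00010203... row at
+// K256+512 is the pshufb byte-swap mask: the hw path loads it as
+// 512-128(%rcx) with %rcx = K256+128, and the ssse3 path as K256+512(%rip).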
+.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0	// "SHA256 block transform for x86_64, CRYPTOGAMS by <appro@openssl.org>"
+.text	
+.globl	_sha256_block_data_order_hw
+.private_extern _sha256_block_data_order_hw
+
+.p2align	6
+_sha256_block_data_order_hw:
+
+_CET_ENDBR
+	leaq	K256+128(%rip),%rcx
+	movdqu	(%rdi),%xmm1
+	movdqu	16(%rdi),%xmm2
+	movdqa	512-128(%rcx),%xmm7
+
+	pshufd	$0x1b,%xmm1,%xmm0
+	pshufd	$0xb1,%xmm1,%xmm1
+	pshufd	$0x1b,%xmm2,%xmm2
+	movdqa	%xmm7,%xmm8
+.byte	102,15,58,15,202,8	//palignr $8,%xmm2,%xmm1
+	punpcklqdq	%xmm0,%xmm2
+	jmp	L$oop_shaext
+
+.p2align	4
+L$oop_shaext:
+	movdqu	(%rsi),%xmm3
+	movdqu	16(%rsi),%xmm4
+	movdqu	32(%rsi),%xmm5
+.byte	102,15,56,0,223	//pshufb %xmm7,%xmm3
+	movdqu	48(%rsi),%xmm6
+
+	movdqa	0-128(%rcx),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	102,15,56,0,231	//pshufb %xmm7,%xmm4
+	movdqa	%xmm2,%xmm10
+.byte	15,56,203,209	//sha256rnds2 %xmm1,%xmm2
+	pshufd	$0x0e,%xmm0,%xmm0
+	nop
+	movdqa	%xmm1,%xmm9
+.byte	15,56,203,202	//sha256rnds2 %xmm2,%xmm1
+
+	movdqa	32-128(%rcx),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	102,15,56,0,239	//pshufb %xmm7,%xmm5
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	leaq	64(%rsi),%rsi
+.byte	15,56,204,220	//sha256msg1 %xmm4,%xmm3
+.byte	15,56,203,202
+
+	movdqa	64-128(%rcx),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	102,15,56,0,247	//pshufb %xmm7,%xmm6
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm6,%xmm7
+.byte	102,15,58,15,253,4
+	nop
+	paddd	%xmm7,%xmm3
+.byte	15,56,204,229
+.byte	15,56,203,202
+
+	movdqa	96-128(%rcx),%xmm0
+	paddd	%xmm6,%xmm0
+.byte	15,56,205,222	//sha256msg2 %xmm6,%xmm3
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm3,%xmm7
+.byte	102,15,58,15,254,4
+	nop
+	paddd	%xmm7,%xmm4
+.byte	15,56,204,238
+.byte	15,56,203,202
+	movdqa	128-128(%rcx),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	15,56,205,227
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm4,%xmm7
+.byte	102,15,58,15,251,4
+	nop
+	paddd	%xmm7,%xmm5
+.byte	15,56,204,243
+.byte	15,56,203,202
+	movdqa	160-128(%rcx),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	15,56,205,236
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm5,%xmm7
+.byte	102,15,58,15,252,4
+	nop
+	paddd	%xmm7,%xmm6
+.byte	15,56,204,220
+.byte	15,56,203,202
+	movdqa	192-128(%rcx),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	15,56,205,245
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm6,%xmm7
+.byte	102,15,58,15,253,4
+	nop
+	paddd	%xmm7,%xmm3
+.byte	15,56,204,229
+.byte	15,56,203,202
+	movdqa	224-128(%rcx),%xmm0
+	paddd	%xmm6,%xmm0
+.byte	15,56,205,222
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm3,%xmm7
+.byte	102,15,58,15,254,4
+	nop
+	paddd	%xmm7,%xmm4
+.byte	15,56,204,238
+.byte	15,56,203,202
+	movdqa	256-128(%rcx),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	15,56,205,227
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm4,%xmm7
+.byte	102,15,58,15,251,4
+	nop
+	paddd	%xmm7,%xmm5
+.byte	15,56,204,243
+.byte	15,56,203,202
+	movdqa	288-128(%rcx),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	15,56,205,236
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm5,%xmm7
+.byte	102,15,58,15,252,4
+	nop
+	paddd	%xmm7,%xmm6
+.byte	15,56,204,220
+.byte	15,56,203,202
+	movdqa	320-128(%rcx),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	15,56,205,245
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm6,%xmm7
+.byte	102,15,58,15,253,4
+	nop
+	paddd	%xmm7,%xmm3
+.byte	15,56,204,229
+.byte	15,56,203,202
+	movdqa	352-128(%rcx),%xmm0
+	paddd	%xmm6,%xmm0
+.byte	15,56,205,222
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm3,%xmm7
+.byte	102,15,58,15,254,4
+	nop
+	paddd	%xmm7,%xmm4
+.byte	15,56,204,238
+.byte	15,56,203,202
+	movdqa	384-128(%rcx),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	15,56,205,227
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm4,%xmm7
+.byte	102,15,58,15,251,4
+	nop
+	paddd	%xmm7,%xmm5
+.byte	15,56,204,243
+.byte	15,56,203,202
+	movdqa	416-128(%rcx),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	15,56,205,236
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm5,%xmm7
+.byte	102,15,58,15,252,4
+.byte	15,56,203,202
+	paddd	%xmm7,%xmm6
+
+	movdqa	448-128(%rcx),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+.byte	15,56,205,245
+	movdqa	%xmm8,%xmm7
+.byte	15,56,203,202
+
+	movdqa	480-128(%rcx),%xmm0
+	paddd	%xmm6,%xmm0
+	nop
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	decq	%rdx
+	nop
+.byte	15,56,203,202
+
+	paddd	%xmm10,%xmm2
+	paddd	%xmm9,%xmm1
+	jnz	L$oop_shaext
+
+	pshufd	$0xb1,%xmm2,%xmm2
+	pshufd	$0x1b,%xmm1,%xmm7
+	pshufd	$0xb1,%xmm1,%xmm1
+	punpckhqdq	%xmm2,%xmm1
+.byte	102,15,58,15,215,8	//palignr $8,%xmm7,%xmm2
+
+	movdqu	%xmm1,(%rdi)
+	movdqu	%xmm2,16(%rdi)
+	ret
+
+
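The `.byte` runs in `_sha256_block_data_order_hw` above are fixed encodings of SHA-NI and SSSE3 instructions, written out as raw bytes (much like the `.long`-encoded `sha256h`/`sha256su0`/`sha256su1` words in the AArch64 files) so the file assembles even with assemblers that predate these extensions. Only five patterns occur: `15,56,203` (`0F 38 CB`) is `sha256rnds2`, `15,56,204` is `sha256msg1`, `15,56,205` is `sha256msg2`, `102,15,56,0` is `pshufb`, and `102,15,58,15` is `palignr`; the trailing byte(s) are the ModRM register selector and, for `palignr`, an immediate. A small sketch of a decoder for the register-to-register forms used here (`decodeFixed` is a hypothetical helper, not part of the tree):

```go
package main

import "fmt"

// decodeFixed decodes the few fixed ".byte" patterns used in this file,
// register-to-register (mod=11) forms only.
func decodeFixed(b []byte) string {
	if len(b) > 0 && b[0] == 0x66 { // operand-size prefix on pshufb/palignr
		b = b[1:]
	}
	if len(b) < 4 || b[0] != 0x0F {
		return "unknown"
	}
	var name string
	switch {
	case b[1] == 0x38 && b[2] == 0xCB:
		name = "sha256rnds2" // implicit third operand %xmm0
	case b[1] == 0x38 && b[2] == 0xCC:
		name = "sha256msg1"
	case b[1] == 0x38 && b[2] == 0xCD:
		name = "sha256msg2"
	case b[1] == 0x38 && b[2] == 0x00:
		name = "pshufb"
	case b[1] == 0x3A && b[2] == 0x0F:
		name = "palignr"
	default:
		return "unknown"
	}
	modrm := b[3]
	reg, rm := (modrm>>3)&7, modrm&7 // ModRM: reg = destination, rm = source
	if name == "palignr" && len(b) == 5 {
		return fmt.Sprintf("palignr $%d,%%xmm%d,%%xmm%d", b[4], rm, reg)
	}
	return fmt.Sprintf("%s %%xmm%d,%%xmm%d", name, rm, reg)
}

func main() {
	fmt.Println(decodeFixed([]byte{15, 56, 203, 209}))        // sha256rnds2 %xmm1,%xmm2
	fmt.Println(decodeFixed([]byte{102, 15, 58, 15, 202, 8})) // palignr $8,%xmm2,%xmm1
}
```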
+.globl	_sha256_block_data_order_ssse3
+.private_extern _sha256_block_data_order_ssse3
+
+.p2align	6
+_sha256_block_data_order_ssse3:
+
+_CET_ENDBR
+	movq	%rsp,%rax
+
+	pushq	%rbx
+
+	pushq	%rbp
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+	shlq	$4,%rdx
+	subq	$96,%rsp
+	leaq	(%rsi,%rdx,4),%rdx
+	andq	$-64,%rsp
+	movq	%rdi,64+0(%rsp)
+	movq	%rsi,64+8(%rsp)
+	movq	%rdx,64+16(%rsp)
+	movq	%rax,88(%rsp)
+
+L$prologue_ssse3:
+
+	movl	0(%rdi),%eax
+	movl	4(%rdi),%ebx
+	movl	8(%rdi),%ecx
+	movl	12(%rdi),%edx
+	movl	16(%rdi),%r8d
+	movl	20(%rdi),%r9d
+	movl	24(%rdi),%r10d
+	movl	28(%rdi),%r11d
+
+
+	jmp	L$loop_ssse3
+.p2align	4
+L$loop_ssse3:
+	movdqa	K256+512(%rip),%xmm7
+	movdqu	0(%rsi),%xmm0
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+.byte	102,15,56,0,199
+	movdqu	48(%rsi),%xmm3
+	leaq	K256(%rip),%rbp
+.byte	102,15,56,0,207
+	movdqa	0(%rbp),%xmm4
+	movdqa	32(%rbp),%xmm5
+.byte	102,15,56,0,215
+	paddd	%xmm0,%xmm4
+	movdqa	64(%rbp),%xmm6
+.byte	102,15,56,0,223
+	movdqa	96(%rbp),%xmm7
+	paddd	%xmm1,%xmm5
+	paddd	%xmm2,%xmm6
+	paddd	%xmm3,%xmm7
+	movdqa	%xmm4,0(%rsp)
+	movl	%eax,%r14d
+	movdqa	%xmm5,16(%rsp)
+	movl	%ebx,%edi
+	movdqa	%xmm6,32(%rsp)
+	xorl	%ecx,%edi
+	movdqa	%xmm7,48(%rsp)
+	movl	%r8d,%r13d
+	jmp	L$ssse3_00_47
+
+.p2align	4
+L$ssse3_00_47:
+	subq	$-128,%rbp
+	rorl	$14,%r13d
+	movdqa	%xmm1,%xmm4
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	movdqa	%xmm3,%xmm7
+	rorl	$9,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	rorl	$5,%r13d
+	xorl	%eax,%r14d
+.byte	102,15,58,15,224,4	//palignr $4,%xmm0,%xmm4
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+.byte	102,15,58,15,250,4	//palignr $4,%xmm2,%xmm7
+	addl	0(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm4,%xmm5
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	movdqa	%xmm4,%xmm6
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	psrld	$3,%xmm4
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	paddd	%xmm7,%xmm0
+	rorl	$2,%r14d
+	addl	%r11d,%edx
+	psrld	$7,%xmm6
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	pshufd	$250,%xmm3,%xmm7
+	addl	%r11d,%r14d
+	rorl	$14,%r13d
+	pslld	$14,%xmm5
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	pxor	%xmm6,%xmm4
+	rorl	$9,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	rorl	$5,%r13d
+	psrld	$11,%xmm6
+	xorl	%r11d,%r14d
+	pxor	%xmm5,%xmm4
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	pslld	$11,%xmm5
+	addl	4(%rsp),%r10d
+	movl	%r11d,%edi
+	pxor	%xmm6,%xmm4
+	xorl	%r9d,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm7,%xmm6
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	pxor	%xmm5,%xmm4
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	psrld	$10,%xmm7
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	paddd	%xmm4,%xmm0
+	rorl	$2,%r14d
+	addl	%r10d,%ecx
+	psrlq	$17,%xmm6
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	pxor	%xmm6,%xmm7
+	rorl	$14,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	rorl	$9,%r14d
+	psrlq	$2,%xmm6
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$5,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	pshufd	$128,%xmm7,%xmm7
+	xorl	%ecx,%r13d
+	addl	8(%rsp),%r9d
+	movl	%r10d,%r15d
+	psrldq	$8,%xmm7
+	xorl	%r8d,%r12d
+	rorl	$11,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	rorl	$6,%r13d
+	paddd	%xmm7,%xmm0
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	pshufd	$80,%xmm0,%xmm7
+	xorl	%r11d,%edi
+	rorl	$2,%r14d
+	addl	%r9d,%ebx
+	movdqa	%xmm7,%xmm6
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	psrld	$10,%xmm7
+	addl	%r9d,%r14d
+	rorl	$14,%r13d
+	psrlq	$17,%xmm6
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$9,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	rorl	$5,%r13d
+	xorl	%r9d,%r14d
+	psrlq	$2,%xmm6
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	12(%rsp),%r8d
+	pxor	%xmm6,%xmm7
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	rorl	$11,%r14d
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	movdqa	0(%rbp),%xmm6
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	pslldq	$8,%xmm7
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	paddd	%xmm7,%xmm0
+	rorl	$2,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	paddd	%xmm0,%xmm6
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	movdqa	%xmm6,0(%rsp)
+	rorl	$14,%r13d
+	movdqa	%xmm2,%xmm4
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	movdqa	%xmm0,%xmm7
+	rorl	$9,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	rorl	$5,%r13d
+	xorl	%r8d,%r14d
+.byte	102,15,58,15,225,4
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+.byte	102,15,58,15,251,4
+	addl	16(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm4,%xmm5
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	movdqa	%xmm4,%xmm6
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	psrld	$3,%xmm4
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	paddd	%xmm7,%xmm1
+	rorl	$2,%r14d
+	addl	%edx,%r11d
+	psrld	$7,%xmm6
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	pshufd	$250,%xmm0,%xmm7
+	addl	%edx,%r14d
+	rorl	$14,%r13d
+	pslld	$14,%xmm5
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	pxor	%xmm6,%xmm4
+	rorl	$9,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	rorl	$5,%r13d
+	psrld	$11,%xmm6
+	xorl	%edx,%r14d
+	pxor	%xmm5,%xmm4
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	pslld	$11,%xmm5
+	addl	20(%rsp),%ecx
+	movl	%edx,%edi
+	pxor	%xmm6,%xmm4
+	xorl	%ebx,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm7,%xmm6
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	pxor	%xmm5,%xmm4
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	psrld	$10,%xmm7
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	paddd	%xmm4,%xmm1
+	rorl	$2,%r14d
+	addl	%ecx,%r10d
+	psrlq	$17,%xmm6
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	pxor	%xmm6,%xmm7
+	rorl	$14,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	rorl	$9,%r14d
+	psrlq	$2,%xmm6
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$5,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	pshufd	$128,%xmm7,%xmm7
+	xorl	%r10d,%r13d
+	addl	24(%rsp),%ebx
+	movl	%ecx,%r15d
+	psrldq	$8,%xmm7
+	xorl	%eax,%r12d
+	rorl	$11,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	rorl	$6,%r13d
+	paddd	%xmm7,%xmm1
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	pshufd	$80,%xmm1,%xmm7
+	xorl	%edx,%edi
+	rorl	$2,%r14d
+	addl	%ebx,%r9d
+	movdqa	%xmm7,%xmm6
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	psrld	$10,%xmm7
+	addl	%ebx,%r14d
+	rorl	$14,%r13d
+	psrlq	$17,%xmm6
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$9,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	rorl	$5,%r13d
+	xorl	%ebx,%r14d
+	psrlq	$2,%xmm6
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	28(%rsp),%eax
+	pxor	%xmm6,%xmm7
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	rorl	$11,%r14d
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	movdqa	32(%rbp),%xmm6
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	pslldq	$8,%xmm7
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	paddd	%xmm7,%xmm1
+	rorl	$2,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	paddd	%xmm1,%xmm6
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	movdqa	%xmm6,16(%rsp)
+	rorl	$14,%r13d
+	movdqa	%xmm3,%xmm4
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	movdqa	%xmm1,%xmm7
+	rorl	$9,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	rorl	$5,%r13d
+	xorl	%eax,%r14d
+.byte	102,15,58,15,226,4
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+.byte	102,15,58,15,248,4
+	addl	32(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm4,%xmm5
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	movdqa	%xmm4,%xmm6
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	psrld	$3,%xmm4
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	paddd	%xmm7,%xmm2
+	rorl	$2,%r14d
+	addl	%r11d,%edx
+	psrld	$7,%xmm6
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	pshufd	$250,%xmm1,%xmm7
+	addl	%r11d,%r14d
+	rorl	$14,%r13d
+	pslld	$14,%xmm5
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	pxor	%xmm6,%xmm4
+	rorl	$9,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	rorl	$5,%r13d
+	psrld	$11,%xmm6
+	xorl	%r11d,%r14d
+	pxor	%xmm5,%xmm4
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	pslld	$11,%xmm5
+	addl	36(%rsp),%r10d
+	movl	%r11d,%edi
+	pxor	%xmm6,%xmm4
+	xorl	%r9d,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm7,%xmm6
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	pxor	%xmm5,%xmm4
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	psrld	$10,%xmm7
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	paddd	%xmm4,%xmm2
+	rorl	$2,%r14d
+	addl	%r10d,%ecx
+	psrlq	$17,%xmm6
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	pxor	%xmm6,%xmm7
+	rorl	$14,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	rorl	$9,%r14d
+	psrlq	$2,%xmm6
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$5,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	pshufd	$128,%xmm7,%xmm7
+	xorl	%ecx,%r13d
+	addl	40(%rsp),%r9d
+	movl	%r10d,%r15d
+	psrldq	$8,%xmm7
+	xorl	%r8d,%r12d
+	rorl	$11,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	rorl	$6,%r13d
+	paddd	%xmm7,%xmm2
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	pshufd	$80,%xmm2,%xmm7
+	xorl	%r11d,%edi
+	rorl	$2,%r14d
+	addl	%r9d,%ebx
+	movdqa	%xmm7,%xmm6
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	psrld	$10,%xmm7
+	addl	%r9d,%r14d
+	rorl	$14,%r13d
+	psrlq	$17,%xmm6
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$9,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	rorl	$5,%r13d
+	xorl	%r9d,%r14d
+	psrlq	$2,%xmm6
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	44(%rsp),%r8d
+	pxor	%xmm6,%xmm7
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	rorl	$11,%r14d
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	movdqa	64(%rbp),%xmm6
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	pslldq	$8,%xmm7
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	paddd	%xmm7,%xmm2
+	rorl	$2,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	paddd	%xmm2,%xmm6
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	movdqa	%xmm6,32(%rsp)
+	rorl	$14,%r13d
+	movdqa	%xmm0,%xmm4
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	movdqa	%xmm2,%xmm7
+	rorl	$9,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	rorl	$5,%r13d
+	xorl	%r8d,%r14d
+.byte	102,15,58,15,227,4
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+.byte	102,15,58,15,249,4
+	addl	48(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm4,%xmm5
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	movdqa	%xmm4,%xmm6
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	psrld	$3,%xmm4
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	paddd	%xmm7,%xmm3
+	rorl	$2,%r14d
+	addl	%edx,%r11d
+	psrld	$7,%xmm6
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	pshufd	$250,%xmm2,%xmm7
+	addl	%edx,%r14d
+	rorl	$14,%r13d
+	pslld	$14,%xmm5
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	pxor	%xmm6,%xmm4
+	rorl	$9,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	rorl	$5,%r13d
+	psrld	$11,%xmm6
+	xorl	%edx,%r14d
+	pxor	%xmm5,%xmm4
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	pslld	$11,%xmm5
+	addl	52(%rsp),%ecx
+	movl	%edx,%edi
+	pxor	%xmm6,%xmm4
+	xorl	%ebx,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm7,%xmm6
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	pxor	%xmm5,%xmm4
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	psrld	$10,%xmm7
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	paddd	%xmm4,%xmm3
+	rorl	$2,%r14d
+	addl	%ecx,%r10d
+	psrlq	$17,%xmm6
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	pxor	%xmm6,%xmm7
+	rorl	$14,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	rorl	$9,%r14d
+	psrlq	$2,%xmm6
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$5,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	pshufd	$128,%xmm7,%xmm7
+	xorl	%r10d,%r13d
+	addl	56(%rsp),%ebx
+	movl	%ecx,%r15d
+	psrldq	$8,%xmm7
+	xorl	%eax,%r12d
+	rorl	$11,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	rorl	$6,%r13d
+	paddd	%xmm7,%xmm3
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	pshufd	$80,%xmm3,%xmm7
+	xorl	%edx,%edi
+	rorl	$2,%r14d
+	addl	%ebx,%r9d
+	movdqa	%xmm7,%xmm6
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	psrld	$10,%xmm7
+	addl	%ebx,%r14d
+	rorl	$14,%r13d
+	psrlq	$17,%xmm6
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$9,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	rorl	$5,%r13d
+	xorl	%ebx,%r14d
+	psrlq	$2,%xmm6
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	60(%rsp),%eax
+	pxor	%xmm6,%xmm7
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	rorl	$11,%r14d
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	movdqa	96(%rbp),%xmm6
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	pslldq	$8,%xmm7
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	paddd	%xmm7,%xmm3
+	rorl	$2,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	paddd	%xmm3,%xmm6
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	movdqa	%xmm6,48(%rsp)
+	cmpb	$0,131(%rbp)
+	jne	L$ssse3_00_47
+	rorl	$14,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	rorl	$9,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	rorl	$5,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+	addl	0(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	rorl	$11,%r14d
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	rorl	$2,%r14d
+	addl	%r11d,%edx
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	rorl	$9,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	rorl	$5,%r13d
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	addl	4(%rsp),%r10d
+	movl	%r11d,%edi
+	xorl	%r9d,%r12d
+	rorl	$11,%r14d
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	rorl	$2,%r14d
+	addl	%r10d,%ecx
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	rorl	$9,%r14d
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	rorl	$5,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	xorl	%ecx,%r13d
+	addl	8(%rsp),%r9d
+	movl	%r10d,%r15d
+	xorl	%r8d,%r12d
+	rorl	$11,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	xorl	%r11d,%edi
+	rorl	$2,%r14d
+	addl	%r9d,%ebx
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	rorl	$9,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	rorl	$5,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	12(%rsp),%r8d
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	rorl	$11,%r14d
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	rorl	$2,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	rorl	$9,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	rorl	$5,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+	addl	16(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	rorl	$11,%r14d
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	rorl	$2,%r14d
+	addl	%edx,%r11d
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	rorl	$9,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	rorl	$5,%r13d
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	addl	20(%rsp),%ecx
+	movl	%edx,%edi
+	xorl	%ebx,%r12d
+	rorl	$11,%r14d
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	rorl	$2,%r14d
+	addl	%ecx,%r10d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	rorl	$9,%r14d
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	rorl	$5,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	xorl	%r10d,%r13d
+	addl	24(%rsp),%ebx
+	movl	%ecx,%r15d
+	xorl	%eax,%r12d
+	rorl	$11,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	xorl	%edx,%edi
+	rorl	$2,%r14d
+	addl	%ebx,%r9d
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	rorl	$9,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	rorl	$5,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	28(%rsp),%eax
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	rorl	$11,%r14d
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	rorl	$2,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	rorl	$9,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	rorl	$5,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+	addl	32(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	rorl	$11,%r14d
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	rorl	$2,%r14d
+	addl	%r11d,%edx
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	rorl	$9,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	rorl	$5,%r13d
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	addl	36(%rsp),%r10d
+	movl	%r11d,%edi
+	xorl	%r9d,%r12d
+	rorl	$11,%r14d
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	rorl	$2,%r14d
+	addl	%r10d,%ecx
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	rorl	$9,%r14d
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	rorl	$5,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	xorl	%ecx,%r13d
+	addl	40(%rsp),%r9d
+	movl	%r10d,%r15d
+	xorl	%r8d,%r12d
+	rorl	$11,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	xorl	%r11d,%edi
+	rorl	$2,%r14d
+	addl	%r9d,%ebx
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	rorl	$9,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	rorl	$5,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	44(%rsp),%r8d
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	rorl	$11,%r14d
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	rorl	$2,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	rorl	$9,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	rorl	$5,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+	addl	48(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	rorl	$11,%r14d
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	rorl	$2,%r14d
+	addl	%edx,%r11d
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	rorl	$9,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	rorl	$5,%r13d
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	addl	52(%rsp),%ecx
+	movl	%edx,%edi
+	xorl	%ebx,%r12d
+	rorl	$11,%r14d
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	rorl	$2,%r14d
+	addl	%ecx,%r10d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	rorl	$9,%r14d
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	rorl	$5,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	xorl	%r10d,%r13d
+	addl	56(%rsp),%ebx
+	movl	%ecx,%r15d
+	xorl	%eax,%r12d
+	rorl	$11,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	xorl	%edx,%edi
+	rorl	$2,%r14d
+	addl	%ebx,%r9d
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	rorl	$9,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	rorl	$5,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	60(%rsp),%eax
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	rorl	$11,%r14d
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	rorl	$2,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	movq	64+0(%rsp),%rdi
+	movl	%r14d,%eax
+
+	addl	0(%rdi),%eax
+	leaq	64(%rsi),%rsi
+	addl	4(%rdi),%ebx
+	addl	8(%rdi),%ecx
+	addl	12(%rdi),%edx
+	addl	16(%rdi),%r8d
+	addl	20(%rdi),%r9d
+	addl	24(%rdi),%r10d
+	addl	28(%rdi),%r11d
+
+	cmpq	64+16(%rsp),%rsi
+
+	movl	%eax,0(%rdi)
+	movl	%ebx,4(%rdi)
+	movl	%ecx,8(%rdi)
+	movl	%edx,12(%rdi)
+	movl	%r8d,16(%rdi)
+	movl	%r9d,20(%rdi)
+	movl	%r10d,24(%rdi)
+	movl	%r11d,28(%rdi)
+	jb	L$loop_ssse3
+
+	movq	88(%rsp),%rsi
+
+	movq	-48(%rsi),%r15
+
+	movq	-40(%rsi),%r14
+
+	movq	-32(%rsi),%r13
+
+	movq	-24(%rsi),%r12
+
+	movq	-16(%rsi),%rbp
+
+	movq	-8(%rsi),%rbx
+
+	leaq	(%rsi),%rsp
+
+L$epilogue_ssse3:
+	ret
+
+
+.globl	_sha256_block_data_order_avx
+.private_extern _sha256_block_data_order_avx
+
+.p2align	6
+_sha256_block_data_order_avx:
+
+_CET_ENDBR
+	movq	%rsp,%rax
+
+	pushq	%rbx
+
+	pushq	%rbp
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+	shlq	$4,%rdx
+	subq	$96,%rsp
+	leaq	(%rsi,%rdx,4),%rdx
+	andq	$-64,%rsp
+	movq	%rdi,64+0(%rsp)
+	movq	%rsi,64+8(%rsp)
+	movq	%rdx,64+16(%rsp)
+	movq	%rax,88(%rsp)
+
+L$prologue_avx:
+
+	vzeroupper
+	movl	0(%rdi),%eax
+	movl	4(%rdi),%ebx
+	movl	8(%rdi),%ecx
+	movl	12(%rdi),%edx
+	movl	16(%rdi),%r8d
+	movl	20(%rdi),%r9d
+	movl	24(%rdi),%r10d
+	movl	28(%rdi),%r11d
+	vmovdqa	K256+512+32(%rip),%xmm8
+	vmovdqa	K256+512+64(%rip),%xmm9
+	jmp	L$loop_avx
+.p2align	4
+L$loop_avx:
+	vmovdqa	K256+512(%rip),%xmm7
+	vmovdqu	0(%rsi),%xmm0
+	vmovdqu	16(%rsi),%xmm1
+	vmovdqu	32(%rsi),%xmm2
+	vmovdqu	48(%rsi),%xmm3
+	vpshufb	%xmm7,%xmm0,%xmm0
+	leaq	K256(%rip),%rbp
+	vpshufb	%xmm7,%xmm1,%xmm1
+	vpshufb	%xmm7,%xmm2,%xmm2
+	vpaddd	0(%rbp),%xmm0,%xmm4
+	vpshufb	%xmm7,%xmm3,%xmm3
+	vpaddd	32(%rbp),%xmm1,%xmm5
+	vpaddd	64(%rbp),%xmm2,%xmm6
+	vpaddd	96(%rbp),%xmm3,%xmm7
+	vmovdqa	%xmm4,0(%rsp)
+	movl	%eax,%r14d
+	vmovdqa	%xmm5,16(%rsp)
+	movl	%ebx,%edi
+	vmovdqa	%xmm6,32(%rsp)
+	xorl	%ecx,%edi
+	vmovdqa	%xmm7,48(%rsp)
+	movl	%r8d,%r13d
+	jmp	L$avx_00_47
+
+.p2align	4
+L$avx_00_47:
+	subq	$-128,%rbp
+	vpalignr	$4,%xmm0,%xmm1,%xmm4
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	vpalignr	$4,%xmm2,%xmm3,%xmm7
+	shrdl	$9,%r14d,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%r13d,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	vpaddd	%xmm7,%xmm0,%xmm0
+	xorl	%r8d,%r13d
+	addl	0(%rsp),%r11d
+	movl	%eax,%r15d
+	vpsrld	$3,%xmm4,%xmm7
+	xorl	%r10d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ebx,%r15d
+	vpslld	$14,%xmm4,%xmm5
+	addl	%r12d,%r11d
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	vpshufd	$250,%xmm3,%xmm7
+	shrdl	$2,%r14d,%r14d
+	addl	%r11d,%edx
+	addl	%edi,%r11d
+	vpsrld	$11,%xmm6,%xmm6
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	shrdl	$14,%r13d,%r13d
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	shrdl	$9,%r14d,%r14d
+	vpslld	$11,%xmm5,%xmm5
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	shrdl	$5,%r13d,%r13d
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	vpsrld	$10,%xmm7,%xmm6
+	addl	4(%rsp),%r10d
+	movl	%r11d,%edi
+	xorl	%r9d,%r12d
+	vpxor	%xmm5,%xmm4,%xmm4
+	shrdl	$11,%r14d,%r14d
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	vpsrlq	$17,%xmm7,%xmm7
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	vpaddd	%xmm4,%xmm0,%xmm0
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	shrdl	$2,%r14d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	%r10d,%ecx
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%r10d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r10d
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%edx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ecx,%r13d
+	vpshufb	%xmm8,%xmm6,%xmm6
+	xorl	%r8d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r10d,%r14d
+	vpaddd	%xmm6,%xmm0,%xmm0
+	andl	%ecx,%r12d
+	xorl	%ecx,%r13d
+	addl	8(%rsp),%r9d
+	vpshufd	$80,%xmm0,%xmm7
+	movl	%r10d,%r15d
+	xorl	%r8d,%r12d
+	shrdl	$11,%r14d,%r14d
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	shrdl	$6,%r13d,%r13d
+	vpsrlq	$17,%xmm7,%xmm7
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	vpxor	%xmm7,%xmm6,%xmm6
+	xorl	%r11d,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%r9d,%ebx
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	vpshufb	%xmm9,%xmm6,%xmm6
+	shrdl	$9,%r14d,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	vpaddd	%xmm6,%xmm0,%xmm0
+	shrdl	$5,%r13d,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	vpaddd	0(%rbp),%xmm0,%xmm6
+	xorl	%ebx,%r13d
+	addl	12(%rsp),%r8d
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	vmovdqa	%xmm6,0(%rsp)
+	vpalignr	$4,%xmm1,%xmm2,%xmm4
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	vpalignr	$4,%xmm3,%xmm0,%xmm7
+	shrdl	$9,%r14d,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%r13d,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	vpaddd	%xmm7,%xmm1,%xmm1
+	xorl	%eax,%r13d
+	addl	16(%rsp),%edx
+	movl	%r8d,%r15d
+	vpsrld	$3,%xmm4,%xmm7
+	xorl	%ecx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r9d,%r15d
+	vpslld	$14,%xmm4,%xmm5
+	addl	%r12d,%edx
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	vpshufd	$250,%xmm0,%xmm7
+	shrdl	$2,%r14d,%r14d
+	addl	%edx,%r11d
+	addl	%edi,%edx
+	vpsrld	$11,%xmm6,%xmm6
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	shrdl	$14,%r13d,%r13d
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	shrdl	$9,%r14d,%r14d
+	vpslld	$11,%xmm5,%xmm5
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	shrdl	$5,%r13d,%r13d
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	vpsrld	$10,%xmm7,%xmm6
+	addl	20(%rsp),%ecx
+	movl	%edx,%edi
+	xorl	%ebx,%r12d
+	vpxor	%xmm5,%xmm4,%xmm4
+	shrdl	$11,%r14d,%r14d
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	vpsrlq	$17,%xmm7,%xmm7
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	vpaddd	%xmm4,%xmm1,%xmm1
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	shrdl	$2,%r14d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	%ecx,%r10d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%ecx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%r11d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r10d,%r13d
+	vpshufb	%xmm8,%xmm6,%xmm6
+	xorl	%eax,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ecx,%r14d
+	vpaddd	%xmm6,%xmm1,%xmm1
+	andl	%r10d,%r12d
+	xorl	%r10d,%r13d
+	addl	24(%rsp),%ebx
+	vpshufd	$80,%xmm1,%xmm7
+	movl	%ecx,%r15d
+	xorl	%eax,%r12d
+	shrdl	$11,%r14d,%r14d
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	shrdl	$6,%r13d,%r13d
+	vpsrlq	$17,%xmm7,%xmm7
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	vpxor	%xmm7,%xmm6,%xmm6
+	xorl	%edx,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%ebx,%r9d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	vpshufb	%xmm9,%xmm6,%xmm6
+	shrdl	$9,%r14d,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	vpaddd	%xmm6,%xmm1,%xmm1
+	shrdl	$5,%r13d,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	vpaddd	32(%rbp),%xmm1,%xmm6
+	xorl	%r9d,%r13d
+	addl	28(%rsp),%eax
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	vmovdqa	%xmm6,16(%rsp)
+	vpalignr	$4,%xmm2,%xmm3,%xmm4
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	vpalignr	$4,%xmm0,%xmm1,%xmm7
+	shrdl	$9,%r14d,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%r13d,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	vpaddd	%xmm7,%xmm2,%xmm2
+	xorl	%r8d,%r13d
+	addl	32(%rsp),%r11d
+	movl	%eax,%r15d
+	vpsrld	$3,%xmm4,%xmm7
+	xorl	%r10d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ebx,%r15d
+	vpslld	$14,%xmm4,%xmm5
+	addl	%r12d,%r11d
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	vpshufd	$250,%xmm1,%xmm7
+	shrdl	$2,%r14d,%r14d
+	addl	%r11d,%edx
+	addl	%edi,%r11d
+	vpsrld	$11,%xmm6,%xmm6
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	shrdl	$14,%r13d,%r13d
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	shrdl	$9,%r14d,%r14d
+	vpslld	$11,%xmm5,%xmm5
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	shrdl	$5,%r13d,%r13d
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	vpsrld	$10,%xmm7,%xmm6
+	addl	36(%rsp),%r10d
+	movl	%r11d,%edi
+	xorl	%r9d,%r12d
+	vpxor	%xmm5,%xmm4,%xmm4
+	shrdl	$11,%r14d,%r14d
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	vpsrlq	$17,%xmm7,%xmm7
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	vpaddd	%xmm4,%xmm2,%xmm2
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	shrdl	$2,%r14d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	%r10d,%ecx
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%r10d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r10d
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%edx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ecx,%r13d
+	vpshufb	%xmm8,%xmm6,%xmm6
+	xorl	%r8d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r10d,%r14d
+	vpaddd	%xmm6,%xmm2,%xmm2
+	andl	%ecx,%r12d
+	xorl	%ecx,%r13d
+	addl	40(%rsp),%r9d
+	vpshufd	$80,%xmm2,%xmm7
+	movl	%r10d,%r15d
+	xorl	%r8d,%r12d
+	shrdl	$11,%r14d,%r14d
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	shrdl	$6,%r13d,%r13d
+	vpsrlq	$17,%xmm7,%xmm7
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	vpxor	%xmm7,%xmm6,%xmm6
+	xorl	%r11d,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%r9d,%ebx
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	vpshufb	%xmm9,%xmm6,%xmm6
+	shrdl	$9,%r14d,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	vpaddd	%xmm6,%xmm2,%xmm2
+	shrdl	$5,%r13d,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	vpaddd	64(%rbp),%xmm2,%xmm6
+	xorl	%ebx,%r13d
+	addl	44(%rsp),%r8d
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	vmovdqa	%xmm6,32(%rsp)
+	vpalignr	$4,%xmm3,%xmm0,%xmm4
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	vpalignr	$4,%xmm1,%xmm2,%xmm7
+	shrdl	$9,%r14d,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%r13d,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	vpaddd	%xmm7,%xmm3,%xmm3
+	xorl	%eax,%r13d
+	addl	48(%rsp),%edx
+	movl	%r8d,%r15d
+	vpsrld	$3,%xmm4,%xmm7
+	xorl	%ecx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r9d,%r15d
+	vpslld	$14,%xmm4,%xmm5
+	addl	%r12d,%edx
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	vpshufd	$250,%xmm2,%xmm7
+	shrdl	$2,%r14d,%r14d
+	addl	%edx,%r11d
+	addl	%edi,%edx
+	vpsrld	$11,%xmm6,%xmm6
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	shrdl	$14,%r13d,%r13d
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	shrdl	$9,%r14d,%r14d
+	vpslld	$11,%xmm5,%xmm5
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	shrdl	$5,%r13d,%r13d
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	vpsrld	$10,%xmm7,%xmm6
+	addl	52(%rsp),%ecx
+	movl	%edx,%edi
+	xorl	%ebx,%r12d
+	vpxor	%xmm5,%xmm4,%xmm4
+	shrdl	$11,%r14d,%r14d
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	vpsrlq	$17,%xmm7,%xmm7
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	vpaddd	%xmm4,%xmm3,%xmm3
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	shrdl	$2,%r14d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	%ecx,%r10d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%ecx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%r11d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r10d,%r13d
+	vpshufb	%xmm8,%xmm6,%xmm6
+	xorl	%eax,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ecx,%r14d
+	vpaddd	%xmm6,%xmm3,%xmm3
+	andl	%r10d,%r12d
+	xorl	%r10d,%r13d
+	addl	56(%rsp),%ebx
+	vpshufd	$80,%xmm3,%xmm7
+	movl	%ecx,%r15d
+	xorl	%eax,%r12d
+	shrdl	$11,%r14d,%r14d
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	shrdl	$6,%r13d,%r13d
+	vpsrlq	$17,%xmm7,%xmm7
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	vpxor	%xmm7,%xmm6,%xmm6
+	xorl	%edx,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%ebx,%r9d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	vpshufb	%xmm9,%xmm6,%xmm6
+	shrdl	$9,%r14d,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	vpaddd	%xmm6,%xmm3,%xmm3
+	shrdl	$5,%r13d,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	vpaddd	96(%rbp),%xmm3,%xmm6
+	xorl	%r9d,%r13d
+	addl	60(%rsp),%eax
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	vmovdqa	%xmm6,48(%rsp)
+	cmpb	$0,131(%rbp)
+	jne	L$avx_00_47
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+	addl	0(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%r11d,%edx
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	addl	4(%rsp),%r10d
+	movl	%r11d,%edi
+	xorl	%r9d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%r10d,%ecx
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	xorl	%ecx,%r13d
+	addl	8(%rsp),%r9d
+	movl	%r10d,%r15d
+	xorl	%r8d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	xorl	%r11d,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%r9d,%ebx
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	12(%rsp),%r8d
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+	addl	16(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%edx,%r11d
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	addl	20(%rsp),%ecx
+	movl	%edx,%edi
+	xorl	%ebx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%ecx,%r10d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	xorl	%r10d,%r13d
+	addl	24(%rsp),%ebx
+	movl	%ecx,%r15d
+	xorl	%eax,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	xorl	%edx,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%ebx,%r9d
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	28(%rsp),%eax
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+	addl	32(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%r11d,%edx
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	addl	36(%rsp),%r10d
+	movl	%r11d,%edi
+	xorl	%r9d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%r10d,%ecx
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	xorl	%ecx,%r13d
+	addl	40(%rsp),%r9d
+	movl	%r10d,%r15d
+	xorl	%r8d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	xorl	%r11d,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%r9d,%ebx
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	44(%rsp),%r8d
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+	addl	48(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%edx,%r11d
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	addl	52(%rsp),%ecx
+	movl	%edx,%edi
+	xorl	%ebx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%ecx,%r10d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	xorl	%r10d,%r13d
+	addl	56(%rsp),%ebx
+	movl	%ecx,%r15d
+	xorl	%eax,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	xorl	%edx,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%ebx,%r9d
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	60(%rsp),%eax
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	movq	64+0(%rsp),%rdi
+	movl	%r14d,%eax
+
+	addl	0(%rdi),%eax
+	leaq	64(%rsi),%rsi
+	addl	4(%rdi),%ebx
+	addl	8(%rdi),%ecx
+	addl	12(%rdi),%edx
+	addl	16(%rdi),%r8d
+	addl	20(%rdi),%r9d
+	addl	24(%rdi),%r10d
+	addl	28(%rdi),%r11d
+
+	cmpq	64+16(%rsp),%rsi
+
+	movl	%eax,0(%rdi)
+	movl	%ebx,4(%rdi)
+	movl	%ecx,8(%rdi)
+	movl	%edx,12(%rdi)
+	movl	%r8d,16(%rdi)
+	movl	%r9d,20(%rdi)
+	movl	%r10d,24(%rdi)
+	movl	%r11d,28(%rdi)
+	jb	L$loop_avx
+
+	movq	88(%rsp),%rsi
+
+	vzeroupper
+	movq	-48(%rsi),%r15
+
+	movq	-40(%rsi),%r14
+
+	movq	-32(%rsi),%r13
+
+	movq	-24(%rsi),%r12
+
+	movq	-16(%rsi),%rbp
+
+	movq	-8(%rsi),%rbx
+
+	leaq	(%rsi),%rsp
+
+L$epilogue_avx:
+	ret
+
+
+#endif
diff --git a/gen/bcm/sha256-x86_64-linux.S b/gen/bcm/sha256-x86_64-linux.S
new file mode 100644
index 0000000..8476b03
--- /dev/null
+++ b/gen/bcm/sha256-x86_64-linux.S
@@ -0,0 +1,4170 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text	
+
+.globl	sha256_block_data_order_nohw
+.hidden sha256_block_data_order_nohw
+.type	sha256_block_data_order_nohw,@function
+.align	16
+sha256_block_data_order_nohw:
+.cfi_startproc	
+_CET_ENDBR
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+	shlq	$4,%rdx
+	subq	$64+32,%rsp
+	leaq	(%rsi,%rdx,4),%rdx
+	andq	$-64,%rsp
+	movq	%rdi,64+0(%rsp)
+	movq	%rsi,64+8(%rsp)
+	movq	%rdx,64+16(%rsp)
+	movq	%rax,88(%rsp)
+.cfi_escape	0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+.Lprologue:
+
+	movl	0(%rdi),%eax
+	movl	4(%rdi),%ebx
+	movl	8(%rdi),%ecx
+	movl	12(%rdi),%edx
+	movl	16(%rdi),%r8d
+	movl	20(%rdi),%r9d
+	movl	24(%rdi),%r10d
+	movl	28(%rdi),%r11d
+	jmp	.Lloop
+
+.align	16
+.Lloop:
+	movl	%ebx,%edi
+	leaq	K256(%rip),%rbp
+	xorl	%ecx,%edi
+	movl	0(%rsi),%r12d
+	movl	%r8d,%r13d
+	movl	%eax,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r9d,%r15d
+
+	xorl	%r8d,%r13d
+	rorl	$9,%r14d
+	xorl	%r10d,%r15d
+
+	movl	%r12d,0(%rsp)
+	xorl	%eax,%r14d
+	andl	%r8d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r11d,%r12d
+	xorl	%r10d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r8d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%eax,%r15d
+	addl	(%rbp),%r12d
+	xorl	%eax,%r14d
+
+	xorl	%ebx,%r15d
+	rorl	$6,%r13d
+	movl	%ebx,%r11d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r11d
+	addl	%r12d,%edx
+	addl	%r12d,%r11d
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%r11d
+	movl	4(%rsi),%r12d
+	movl	%edx,%r13d
+	movl	%r11d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r8d,%edi
+
+	xorl	%edx,%r13d
+	rorl	$9,%r14d
+	xorl	%r9d,%edi
+
+	movl	%r12d,4(%rsp)
+	xorl	%r11d,%r14d
+	andl	%edx,%edi
+
+	rorl	$5,%r13d
+	addl	%r10d,%r12d
+	xorl	%r9d,%edi
+
+	rorl	$11,%r14d
+	xorl	%edx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r11d,%edi
+	addl	(%rbp),%r12d
+	xorl	%r11d,%r14d
+
+	xorl	%eax,%edi
+	rorl	$6,%r13d
+	movl	%eax,%r10d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r10d
+	addl	%r12d,%ecx
+	addl	%r12d,%r10d
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%r10d
+	movl	8(%rsi),%r12d
+	movl	%ecx,%r13d
+	movl	%r10d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%edx,%r15d
+
+	xorl	%ecx,%r13d
+	rorl	$9,%r14d
+	xorl	%r8d,%r15d
+
+	movl	%r12d,8(%rsp)
+	xorl	%r10d,%r14d
+	andl	%ecx,%r15d
+
+	rorl	$5,%r13d
+	addl	%r9d,%r12d
+	xorl	%r8d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%ecx,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r10d,%r15d
+	addl	(%rbp),%r12d
+	xorl	%r10d,%r14d
+
+	xorl	%r11d,%r15d
+	rorl	$6,%r13d
+	movl	%r11d,%r9d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r9d
+	addl	%r12d,%ebx
+	addl	%r12d,%r9d
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%r9d
+	movl	12(%rsi),%r12d
+	movl	%ebx,%r13d
+	movl	%r9d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%ecx,%edi
+
+	xorl	%ebx,%r13d
+	rorl	$9,%r14d
+	xorl	%edx,%edi
+
+	movl	%r12d,12(%rsp)
+	xorl	%r9d,%r14d
+	andl	%ebx,%edi
+
+	rorl	$5,%r13d
+	addl	%r8d,%r12d
+	xorl	%edx,%edi
+
+	rorl	$11,%r14d
+	xorl	%ebx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r9d,%edi
+	addl	(%rbp),%r12d
+	xorl	%r9d,%r14d
+
+	xorl	%r10d,%edi
+	rorl	$6,%r13d
+	movl	%r10d,%r8d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r8d
+	addl	%r12d,%eax
+	addl	%r12d,%r8d
+
+	leaq	20(%rbp),%rbp
+	addl	%r14d,%r8d
+	movl	16(%rsi),%r12d
+	movl	%eax,%r13d
+	movl	%r8d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%ebx,%r15d
+
+	xorl	%eax,%r13d
+	rorl	$9,%r14d
+	xorl	%ecx,%r15d
+
+	movl	%r12d,16(%rsp)
+	xorl	%r8d,%r14d
+	andl	%eax,%r15d
+
+	rorl	$5,%r13d
+	addl	%edx,%r12d
+	xorl	%ecx,%r15d
+
+	rorl	$11,%r14d
+	xorl	%eax,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r8d,%r15d
+	addl	(%rbp),%r12d
+	xorl	%r8d,%r14d
+
+	xorl	%r9d,%r15d
+	rorl	$6,%r13d
+	movl	%r9d,%edx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%edx
+	addl	%r12d,%r11d
+	addl	%r12d,%edx
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%edx
+	movl	20(%rsi),%r12d
+	movl	%r11d,%r13d
+	movl	%edx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%eax,%edi
+
+	xorl	%r11d,%r13d
+	rorl	$9,%r14d
+	xorl	%ebx,%edi
+
+	movl	%r12d,20(%rsp)
+	xorl	%edx,%r14d
+	andl	%r11d,%edi
+
+	rorl	$5,%r13d
+	addl	%ecx,%r12d
+	xorl	%ebx,%edi
+
+	rorl	$11,%r14d
+	xorl	%r11d,%r13d
+	addl	%edi,%r12d
+
+	movl	%edx,%edi
+	addl	(%rbp),%r12d
+	xorl	%edx,%r14d
+
+	xorl	%r8d,%edi
+	rorl	$6,%r13d
+	movl	%r8d,%ecx
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%ecx
+	addl	%r12d,%r10d
+	addl	%r12d,%ecx
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%ecx
+	movl	24(%rsi),%r12d
+	movl	%r10d,%r13d
+	movl	%ecx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r11d,%r15d
+
+	xorl	%r10d,%r13d
+	rorl	$9,%r14d
+	xorl	%eax,%r15d
+
+	movl	%r12d,24(%rsp)
+	xorl	%ecx,%r14d
+	andl	%r10d,%r15d
+
+	rorl	$5,%r13d
+	addl	%ebx,%r12d
+	xorl	%eax,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r10d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%ecx,%r15d
+	addl	(%rbp),%r12d
+	xorl	%ecx,%r14d
+
+	xorl	%edx,%r15d
+	rorl	$6,%r13d
+	movl	%edx,%ebx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%ebx
+	addl	%r12d,%r9d
+	addl	%r12d,%ebx
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%ebx
+	movl	28(%rsi),%r12d
+	movl	%r9d,%r13d
+	movl	%ebx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r10d,%edi
+
+	xorl	%r9d,%r13d
+	rorl	$9,%r14d
+	xorl	%r11d,%edi
+
+	movl	%r12d,28(%rsp)
+	xorl	%ebx,%r14d
+	andl	%r9d,%edi
+
+	rorl	$5,%r13d
+	addl	%eax,%r12d
+	xorl	%r11d,%edi
+
+	rorl	$11,%r14d
+	xorl	%r9d,%r13d
+	addl	%edi,%r12d
+
+	movl	%ebx,%edi
+	addl	(%rbp),%r12d
+	xorl	%ebx,%r14d
+
+	xorl	%ecx,%edi
+	rorl	$6,%r13d
+	movl	%ecx,%eax
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%eax
+	addl	%r12d,%r8d
+	addl	%r12d,%eax
+
+	leaq	20(%rbp),%rbp
+	addl	%r14d,%eax
+	movl	32(%rsi),%r12d
+	movl	%r8d,%r13d
+	movl	%eax,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r9d,%r15d
+
+	xorl	%r8d,%r13d
+	rorl	$9,%r14d
+	xorl	%r10d,%r15d
+
+	movl	%r12d,32(%rsp)
+	xorl	%eax,%r14d
+	andl	%r8d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r11d,%r12d
+	xorl	%r10d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r8d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%eax,%r15d
+	addl	(%rbp),%r12d
+	xorl	%eax,%r14d
+
+	xorl	%ebx,%r15d
+	rorl	$6,%r13d
+	movl	%ebx,%r11d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r11d
+	addl	%r12d,%edx
+	addl	%r12d,%r11d
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%r11d
+	movl	36(%rsi),%r12d
+	movl	%edx,%r13d
+	movl	%r11d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r8d,%edi
+
+	xorl	%edx,%r13d
+	rorl	$9,%r14d
+	xorl	%r9d,%edi
+
+	movl	%r12d,36(%rsp)
+	xorl	%r11d,%r14d
+	andl	%edx,%edi
+
+	rorl	$5,%r13d
+	addl	%r10d,%r12d
+	xorl	%r9d,%edi
+
+	rorl	$11,%r14d
+	xorl	%edx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r11d,%edi
+	addl	(%rbp),%r12d
+	xorl	%r11d,%r14d
+
+	xorl	%eax,%edi
+	rorl	$6,%r13d
+	movl	%eax,%r10d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r10d
+	addl	%r12d,%ecx
+	addl	%r12d,%r10d
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%r10d
+	movl	40(%rsi),%r12d
+	movl	%ecx,%r13d
+	movl	%r10d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%edx,%r15d
+
+	xorl	%ecx,%r13d
+	rorl	$9,%r14d
+	xorl	%r8d,%r15d
+
+	movl	%r12d,40(%rsp)
+	xorl	%r10d,%r14d
+	andl	%ecx,%r15d
+
+	rorl	$5,%r13d
+	addl	%r9d,%r12d
+	xorl	%r8d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%ecx,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r10d,%r15d
+	addl	(%rbp),%r12d
+	xorl	%r10d,%r14d
+
+	xorl	%r11d,%r15d
+	rorl	$6,%r13d
+	movl	%r11d,%r9d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r9d
+	addl	%r12d,%ebx
+	addl	%r12d,%r9d
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%r9d
+	movl	44(%rsi),%r12d
+	movl	%ebx,%r13d
+	movl	%r9d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%ecx,%edi
+
+	xorl	%ebx,%r13d
+	rorl	$9,%r14d
+	xorl	%edx,%edi
+
+	movl	%r12d,44(%rsp)
+	xorl	%r9d,%r14d
+	andl	%ebx,%edi
+
+	rorl	$5,%r13d
+	addl	%r8d,%r12d
+	xorl	%edx,%edi
+
+	rorl	$11,%r14d
+	xorl	%ebx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r9d,%edi
+	addl	(%rbp),%r12d
+	xorl	%r9d,%r14d
+
+	xorl	%r10d,%edi
+	rorl	$6,%r13d
+	movl	%r10d,%r8d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r8d
+	addl	%r12d,%eax
+	addl	%r12d,%r8d
+
+	leaq	20(%rbp),%rbp
+	addl	%r14d,%r8d
+	movl	48(%rsi),%r12d
+	movl	%eax,%r13d
+	movl	%r8d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%ebx,%r15d
+
+	xorl	%eax,%r13d
+	rorl	$9,%r14d
+	xorl	%ecx,%r15d
+
+	movl	%r12d,48(%rsp)
+	xorl	%r8d,%r14d
+	andl	%eax,%r15d
+
+	rorl	$5,%r13d
+	addl	%edx,%r12d
+	xorl	%ecx,%r15d
+
+	rorl	$11,%r14d
+	xorl	%eax,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r8d,%r15d
+	addl	(%rbp),%r12d
+	xorl	%r8d,%r14d
+
+	xorl	%r9d,%r15d
+	rorl	$6,%r13d
+	movl	%r9d,%edx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%edx
+	addl	%r12d,%r11d
+	addl	%r12d,%edx
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%edx
+	movl	52(%rsi),%r12d
+	movl	%r11d,%r13d
+	movl	%edx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%eax,%edi
+
+	xorl	%r11d,%r13d
+	rorl	$9,%r14d
+	xorl	%ebx,%edi
+
+	movl	%r12d,52(%rsp)
+	xorl	%edx,%r14d
+	andl	%r11d,%edi
+
+	rorl	$5,%r13d
+	addl	%ecx,%r12d
+	xorl	%ebx,%edi
+
+	rorl	$11,%r14d
+	xorl	%r11d,%r13d
+	addl	%edi,%r12d
+
+	movl	%edx,%edi
+	addl	(%rbp),%r12d
+	xorl	%edx,%r14d
+
+	xorl	%r8d,%edi
+	rorl	$6,%r13d
+	movl	%r8d,%ecx
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%ecx
+	addl	%r12d,%r10d
+	addl	%r12d,%ecx
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%ecx
+	movl	56(%rsi),%r12d
+	movl	%r10d,%r13d
+	movl	%ecx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r11d,%r15d
+
+	xorl	%r10d,%r13d
+	rorl	$9,%r14d
+	xorl	%eax,%r15d
+
+	movl	%r12d,56(%rsp)
+	xorl	%ecx,%r14d
+	andl	%r10d,%r15d
+
+	rorl	$5,%r13d
+	addl	%ebx,%r12d
+	xorl	%eax,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r10d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%ecx,%r15d
+	addl	(%rbp),%r12d
+	xorl	%ecx,%r14d
+
+	xorl	%edx,%r15d
+	rorl	$6,%r13d
+	movl	%edx,%ebx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%ebx
+	addl	%r12d,%r9d
+	addl	%r12d,%ebx
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%ebx
+	movl	60(%rsi),%r12d
+	movl	%r9d,%r13d
+	movl	%ebx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r10d,%edi
+
+	xorl	%r9d,%r13d
+	rorl	$9,%r14d
+	xorl	%r11d,%edi
+
+	movl	%r12d,60(%rsp)
+	xorl	%ebx,%r14d
+	andl	%r9d,%edi
+
+	rorl	$5,%r13d
+	addl	%eax,%r12d
+	xorl	%r11d,%edi
+
+	rorl	$11,%r14d
+	xorl	%r9d,%r13d
+	addl	%edi,%r12d
+
+	movl	%ebx,%edi
+	addl	(%rbp),%r12d
+	xorl	%ebx,%r14d
+
+	xorl	%ecx,%edi
+	rorl	$6,%r13d
+	movl	%ecx,%eax
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%eax
+	addl	%r12d,%r8d
+	addl	%r12d,%eax
+
+	leaq	20(%rbp),%rbp
+	jmp	.Lrounds_16_xx
+.align	16
+.Lrounds_16_xx:
+	movl	4(%rsp),%r13d
+	movl	56(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%eax
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	36(%rsp),%r12d
+
+	addl	0(%rsp),%r12d
+	movl	%r8d,%r13d
+	addl	%r15d,%r12d
+	movl	%eax,%r14d
+	rorl	$14,%r13d
+	movl	%r9d,%r15d
+
+	xorl	%r8d,%r13d
+	rorl	$9,%r14d
+	xorl	%r10d,%r15d
+
+	movl	%r12d,0(%rsp)
+	xorl	%eax,%r14d
+	andl	%r8d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r11d,%r12d
+	xorl	%r10d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r8d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%eax,%r15d
+	addl	(%rbp),%r12d
+	xorl	%eax,%r14d
+
+	xorl	%ebx,%r15d
+	rorl	$6,%r13d
+	movl	%ebx,%r11d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r11d
+	addl	%r12d,%edx
+	addl	%r12d,%r11d
+
+	leaq	4(%rbp),%rbp
+	movl	8(%rsp),%r13d
+	movl	60(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r11d
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	40(%rsp),%r12d
+
+	addl	4(%rsp),%r12d
+	movl	%edx,%r13d
+	addl	%edi,%r12d
+	movl	%r11d,%r14d
+	rorl	$14,%r13d
+	movl	%r8d,%edi
+
+	xorl	%edx,%r13d
+	rorl	$9,%r14d
+	xorl	%r9d,%edi
+
+	movl	%r12d,4(%rsp)
+	xorl	%r11d,%r14d
+	andl	%edx,%edi
+
+	rorl	$5,%r13d
+	addl	%r10d,%r12d
+	xorl	%r9d,%edi
+
+	rorl	$11,%r14d
+	xorl	%edx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r11d,%edi
+	addl	(%rbp),%r12d
+	xorl	%r11d,%r14d
+
+	xorl	%eax,%edi
+	rorl	$6,%r13d
+	movl	%eax,%r10d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r10d
+	addl	%r12d,%ecx
+	addl	%r12d,%r10d
+
+	leaq	4(%rbp),%rbp
+	movl	12(%rsp),%r13d
+	movl	0(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r10d
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	44(%rsp),%r12d
+
+	addl	8(%rsp),%r12d
+	movl	%ecx,%r13d
+	addl	%r15d,%r12d
+	movl	%r10d,%r14d
+	rorl	$14,%r13d
+	movl	%edx,%r15d
+
+	xorl	%ecx,%r13d
+	rorl	$9,%r14d
+	xorl	%r8d,%r15d
+
+	movl	%r12d,8(%rsp)
+	xorl	%r10d,%r14d
+	andl	%ecx,%r15d
+
+	rorl	$5,%r13d
+	addl	%r9d,%r12d
+	xorl	%r8d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%ecx,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r10d,%r15d
+	addl	(%rbp),%r12d
+	xorl	%r10d,%r14d
+
+	xorl	%r11d,%r15d
+	rorl	$6,%r13d
+	movl	%r11d,%r9d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r9d
+	addl	%r12d,%ebx
+	addl	%r12d,%r9d
+
+	leaq	4(%rbp),%rbp
+	movl	16(%rsp),%r13d
+	movl	4(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r9d
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	48(%rsp),%r12d
+
+	addl	12(%rsp),%r12d
+	movl	%ebx,%r13d
+	addl	%edi,%r12d
+	movl	%r9d,%r14d
+	rorl	$14,%r13d
+	movl	%ecx,%edi
+
+	xorl	%ebx,%r13d
+	rorl	$9,%r14d
+	xorl	%edx,%edi
+
+	movl	%r12d,12(%rsp)
+	xorl	%r9d,%r14d
+	andl	%ebx,%edi
+
+	rorl	$5,%r13d
+	addl	%r8d,%r12d
+	xorl	%edx,%edi
+
+	rorl	$11,%r14d
+	xorl	%ebx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r9d,%edi
+	addl	(%rbp),%r12d
+	xorl	%r9d,%r14d
+
+	xorl	%r10d,%edi
+	rorl	$6,%r13d
+	movl	%r10d,%r8d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r8d
+	addl	%r12d,%eax
+	addl	%r12d,%r8d
+
+	leaq	20(%rbp),%rbp
+	movl	20(%rsp),%r13d
+	movl	8(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r8d
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	52(%rsp),%r12d
+
+	addl	16(%rsp),%r12d
+	movl	%eax,%r13d
+	addl	%r15d,%r12d
+	movl	%r8d,%r14d
+	rorl	$14,%r13d
+	movl	%ebx,%r15d
+
+	xorl	%eax,%r13d
+	rorl	$9,%r14d
+	xorl	%ecx,%r15d
+
+	movl	%r12d,16(%rsp)
+	xorl	%r8d,%r14d
+	andl	%eax,%r15d
+
+	rorl	$5,%r13d
+	addl	%edx,%r12d
+	xorl	%ecx,%r15d
+
+	rorl	$11,%r14d
+	xorl	%eax,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r8d,%r15d
+	addl	(%rbp),%r12d
+	xorl	%r8d,%r14d
+
+	xorl	%r9d,%r15d
+	rorl	$6,%r13d
+	movl	%r9d,%edx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%edx
+	addl	%r12d,%r11d
+	addl	%r12d,%edx
+
+	leaq	4(%rbp),%rbp
+	movl	24(%rsp),%r13d
+	movl	12(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%edx
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	56(%rsp),%r12d
+
+	addl	20(%rsp),%r12d
+	movl	%r11d,%r13d
+	addl	%edi,%r12d
+	movl	%edx,%r14d
+	rorl	$14,%r13d
+	movl	%eax,%edi
+
+	xorl	%r11d,%r13d
+	rorl	$9,%r14d
+	xorl	%ebx,%edi
+
+	movl	%r12d,20(%rsp)
+	xorl	%edx,%r14d
+	andl	%r11d,%edi
+
+	rorl	$5,%r13d
+	addl	%ecx,%r12d
+	xorl	%ebx,%edi
+
+	rorl	$11,%r14d
+	xorl	%r11d,%r13d
+	addl	%edi,%r12d
+
+	movl	%edx,%edi
+	addl	(%rbp),%r12d
+	xorl	%edx,%r14d
+
+	xorl	%r8d,%edi
+	rorl	$6,%r13d
+	movl	%r8d,%ecx
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%ecx
+	addl	%r12d,%r10d
+	addl	%r12d,%ecx
+
+	leaq	4(%rbp),%rbp
+	movl	28(%rsp),%r13d
+	movl	16(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%ecx
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	60(%rsp),%r12d
+
+	addl	24(%rsp),%r12d
+	movl	%r10d,%r13d
+	addl	%r15d,%r12d
+	movl	%ecx,%r14d
+	rorl	$14,%r13d
+	movl	%r11d,%r15d
+
+	xorl	%r10d,%r13d
+	rorl	$9,%r14d
+	xorl	%eax,%r15d
+
+	movl	%r12d,24(%rsp)
+	xorl	%ecx,%r14d
+	andl	%r10d,%r15d
+
+	rorl	$5,%r13d
+	addl	%ebx,%r12d
+	xorl	%eax,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r10d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%ecx,%r15d
+	addl	(%rbp),%r12d
+	xorl	%ecx,%r14d
+
+	xorl	%edx,%r15d
+	rorl	$6,%r13d
+	movl	%edx,%ebx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%ebx
+	addl	%r12d,%r9d
+	addl	%r12d,%ebx
+
+	leaq	4(%rbp),%rbp
+	movl	32(%rsp),%r13d
+	movl	20(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%ebx
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	0(%rsp),%r12d
+
+	addl	28(%rsp),%r12d
+	movl	%r9d,%r13d
+	addl	%edi,%r12d
+	movl	%ebx,%r14d
+	rorl	$14,%r13d
+	movl	%r10d,%edi
+
+	xorl	%r9d,%r13d
+	rorl	$9,%r14d
+	xorl	%r11d,%edi
+
+	movl	%r12d,28(%rsp)
+	xorl	%ebx,%r14d
+	andl	%r9d,%edi
+
+	rorl	$5,%r13d
+	addl	%eax,%r12d
+	xorl	%r11d,%edi
+
+	rorl	$11,%r14d
+	xorl	%r9d,%r13d
+	addl	%edi,%r12d
+
+	movl	%ebx,%edi
+	addl	(%rbp),%r12d
+	xorl	%ebx,%r14d
+
+	xorl	%ecx,%edi
+	rorl	$6,%r13d
+	movl	%ecx,%eax
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%eax
+	addl	%r12d,%r8d
+	addl	%r12d,%eax
+
+	leaq	20(%rbp),%rbp
+	movl	36(%rsp),%r13d
+	movl	24(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%eax
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	4(%rsp),%r12d
+
+	addl	32(%rsp),%r12d
+	movl	%r8d,%r13d
+	addl	%r15d,%r12d
+	movl	%eax,%r14d
+	rorl	$14,%r13d
+	movl	%r9d,%r15d
+
+	xorl	%r8d,%r13d
+	rorl	$9,%r14d
+	xorl	%r10d,%r15d
+
+	movl	%r12d,32(%rsp)
+	xorl	%eax,%r14d
+	andl	%r8d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r11d,%r12d
+	xorl	%r10d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r8d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%eax,%r15d
+	addl	(%rbp),%r12d
+	xorl	%eax,%r14d
+
+	xorl	%ebx,%r15d
+	rorl	$6,%r13d
+	movl	%ebx,%r11d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r11d
+	addl	%r12d,%edx
+	addl	%r12d,%r11d
+
+	leaq	4(%rbp),%rbp
+	movl	40(%rsp),%r13d
+	movl	28(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r11d
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	8(%rsp),%r12d
+
+	addl	36(%rsp),%r12d
+	movl	%edx,%r13d
+	addl	%edi,%r12d
+	movl	%r11d,%r14d
+	rorl	$14,%r13d
+	movl	%r8d,%edi
+
+	xorl	%edx,%r13d
+	rorl	$9,%r14d
+	xorl	%r9d,%edi
+
+	movl	%r12d,36(%rsp)
+	xorl	%r11d,%r14d
+	andl	%edx,%edi
+
+	rorl	$5,%r13d
+	addl	%r10d,%r12d
+	xorl	%r9d,%edi
+
+	rorl	$11,%r14d
+	xorl	%edx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r11d,%edi
+	addl	(%rbp),%r12d
+	xorl	%r11d,%r14d
+
+	xorl	%eax,%edi
+	rorl	$6,%r13d
+	movl	%eax,%r10d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r10d
+	addl	%r12d,%ecx
+	addl	%r12d,%r10d
+
+	leaq	4(%rbp),%rbp
+	movl	44(%rsp),%r13d
+	movl	32(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r10d
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	12(%rsp),%r12d
+
+	addl	40(%rsp),%r12d
+	movl	%ecx,%r13d
+	addl	%r15d,%r12d
+	movl	%r10d,%r14d
+	rorl	$14,%r13d
+	movl	%edx,%r15d
+
+	xorl	%ecx,%r13d
+	rorl	$9,%r14d
+	xorl	%r8d,%r15d
+
+	movl	%r12d,40(%rsp)
+	xorl	%r10d,%r14d
+	andl	%ecx,%r15d
+
+	rorl	$5,%r13d
+	addl	%r9d,%r12d
+	xorl	%r8d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%ecx,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r10d,%r15d
+	addl	(%rbp),%r12d
+	xorl	%r10d,%r14d
+
+	xorl	%r11d,%r15d
+	rorl	$6,%r13d
+	movl	%r11d,%r9d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r9d
+	addl	%r12d,%ebx
+	addl	%r12d,%r9d
+
+	leaq	4(%rbp),%rbp
+	movl	48(%rsp),%r13d
+	movl	36(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r9d
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	16(%rsp),%r12d
+
+	addl	44(%rsp),%r12d
+	movl	%ebx,%r13d
+	addl	%edi,%r12d
+	movl	%r9d,%r14d
+	rorl	$14,%r13d
+	movl	%ecx,%edi
+
+	xorl	%ebx,%r13d
+	rorl	$9,%r14d
+	xorl	%edx,%edi
+
+	movl	%r12d,44(%rsp)
+	xorl	%r9d,%r14d
+	andl	%ebx,%edi
+
+	rorl	$5,%r13d
+	addl	%r8d,%r12d
+	xorl	%edx,%edi
+
+	rorl	$11,%r14d
+	xorl	%ebx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r9d,%edi
+	addl	(%rbp),%r12d
+	xorl	%r9d,%r14d
+
+	xorl	%r10d,%edi
+	rorl	$6,%r13d
+	movl	%r10d,%r8d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r8d
+	addl	%r12d,%eax
+	addl	%r12d,%r8d
+
+	leaq	20(%rbp),%rbp
+	movl	52(%rsp),%r13d
+	movl	40(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r8d
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	20(%rsp),%r12d
+
+	addl	48(%rsp),%r12d
+	movl	%eax,%r13d
+	addl	%r15d,%r12d
+	movl	%r8d,%r14d
+	rorl	$14,%r13d
+	movl	%ebx,%r15d
+
+	xorl	%eax,%r13d
+	rorl	$9,%r14d
+	xorl	%ecx,%r15d
+
+	movl	%r12d,48(%rsp)
+	xorl	%r8d,%r14d
+	andl	%eax,%r15d
+
+	rorl	$5,%r13d
+	addl	%edx,%r12d
+	xorl	%ecx,%r15d
+
+	rorl	$11,%r14d
+	xorl	%eax,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r8d,%r15d
+	addl	(%rbp),%r12d
+	xorl	%r8d,%r14d
+
+	xorl	%r9d,%r15d
+	rorl	$6,%r13d
+	movl	%r9d,%edx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%edx
+	addl	%r12d,%r11d
+	addl	%r12d,%edx
+
+	leaq	4(%rbp),%rbp
+	movl	56(%rsp),%r13d
+	movl	44(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%edx
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	24(%rsp),%r12d
+
+	addl	52(%rsp),%r12d
+	movl	%r11d,%r13d
+	addl	%edi,%r12d
+	movl	%edx,%r14d
+	rorl	$14,%r13d
+	movl	%eax,%edi
+
+	xorl	%r11d,%r13d
+	rorl	$9,%r14d
+	xorl	%ebx,%edi
+
+	movl	%r12d,52(%rsp)
+	xorl	%edx,%r14d
+	andl	%r11d,%edi
+
+	rorl	$5,%r13d
+	addl	%ecx,%r12d
+	xorl	%ebx,%edi
+
+	rorl	$11,%r14d
+	xorl	%r11d,%r13d
+	addl	%edi,%r12d
+
+	movl	%edx,%edi
+	addl	(%rbp),%r12d
+	xorl	%edx,%r14d
+
+	xorl	%r8d,%edi
+	rorl	$6,%r13d
+	movl	%r8d,%ecx
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%ecx
+	addl	%r12d,%r10d
+	addl	%r12d,%ecx
+
+	leaq	4(%rbp),%rbp
+	movl	60(%rsp),%r13d
+	movl	48(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%ecx
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	28(%rsp),%r12d
+
+	addl	56(%rsp),%r12d
+	movl	%r10d,%r13d
+	addl	%r15d,%r12d
+	movl	%ecx,%r14d
+	rorl	$14,%r13d
+	movl	%r11d,%r15d
+
+	xorl	%r10d,%r13d
+	rorl	$9,%r14d
+	xorl	%eax,%r15d
+
+	movl	%r12d,56(%rsp)
+	xorl	%ecx,%r14d
+	andl	%r10d,%r15d
+
+	rorl	$5,%r13d
+	addl	%ebx,%r12d
+	xorl	%eax,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r10d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%ecx,%r15d
+	addl	(%rbp),%r12d
+	xorl	%ecx,%r14d
+
+	xorl	%edx,%r15d
+	rorl	$6,%r13d
+	movl	%edx,%ebx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%ebx
+	addl	%r12d,%r9d
+	addl	%r12d,%ebx
+
+	leaq	4(%rbp),%rbp
+	movl	0(%rsp),%r13d
+	movl	52(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%ebx
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	32(%rsp),%r12d
+
+	addl	60(%rsp),%r12d
+	movl	%r9d,%r13d
+	addl	%edi,%r12d
+	movl	%ebx,%r14d
+	rorl	$14,%r13d
+	movl	%r10d,%edi
+
+	xorl	%r9d,%r13d
+	rorl	$9,%r14d
+	xorl	%r11d,%edi
+
+	movl	%r12d,60(%rsp)
+	xorl	%ebx,%r14d
+	andl	%r9d,%edi
+
+	rorl	$5,%r13d
+	addl	%eax,%r12d
+	xorl	%r11d,%edi
+
+	rorl	$11,%r14d
+	xorl	%r9d,%r13d
+	addl	%edi,%r12d
+
+	movl	%ebx,%edi
+	addl	(%rbp),%r12d
+	xorl	%ebx,%r14d
+
+	xorl	%ecx,%edi
+	rorl	$6,%r13d
+	movl	%ecx,%eax
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%eax
+	addl	%r12d,%r8d
+	addl	%r12d,%eax
+
+	leaq	20(%rbp),%rbp
+	cmpb	$0,3(%rbp)
+	jnz	.Lrounds_16_xx
+
+	movq	64+0(%rsp),%rdi
+	addl	%r14d,%eax
+	leaq	64(%rsi),%rsi
+
+	addl	0(%rdi),%eax
+	addl	4(%rdi),%ebx
+	addl	8(%rdi),%ecx
+	addl	12(%rdi),%edx
+	addl	16(%rdi),%r8d
+	addl	20(%rdi),%r9d
+	addl	24(%rdi),%r10d
+	addl	28(%rdi),%r11d
+
+	cmpq	64+16(%rsp),%rsi
+
+	movl	%eax,0(%rdi)
+	movl	%ebx,4(%rdi)
+	movl	%ecx,8(%rdi)
+	movl	%edx,12(%rdi)
+	movl	%r8d,16(%rdi)
+	movl	%r9d,20(%rdi)
+	movl	%r10d,24(%rdi)
+	movl	%r11d,28(%rdi)
+	jb	.Lloop
+
+	movq	88(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lepilogue:
+	ret
+.cfi_endproc	
+.size	sha256_block_data_order_nohw,.-sha256_block_data_order_nohw
+.section	.rodata
+.align	64
+.type	K256,@object
+K256:
+.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.text	
+.globl	sha256_block_data_order_hw
+.hidden sha256_block_data_order_hw
+.type	sha256_block_data_order_hw,@function
+.align	64
+sha256_block_data_order_hw:
+.cfi_startproc	
+_CET_ENDBR
+	leaq	K256+128(%rip),%rcx
+	movdqu	(%rdi),%xmm1
+	movdqu	16(%rdi),%xmm2
+	movdqa	512-128(%rcx),%xmm7
+
+	pshufd	$0x1b,%xmm1,%xmm0
+	pshufd	$0xb1,%xmm1,%xmm1
+	pshufd	$0x1b,%xmm2,%xmm2
+	movdqa	%xmm7,%xmm8
+.byte	102,15,58,15,202,8
+	punpcklqdq	%xmm0,%xmm2
+	jmp	.Loop_shaext
+
+.align	16
+.Loop_shaext:
+	movdqu	(%rsi),%xmm3
+	movdqu	16(%rsi),%xmm4
+	movdqu	32(%rsi),%xmm5
+.byte	102,15,56,0,223
+	movdqu	48(%rsi),%xmm6
+
+	movdqa	0-128(%rcx),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	102,15,56,0,231
+	movdqa	%xmm2,%xmm10
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	nop
+	movdqa	%xmm1,%xmm9
+.byte	15,56,203,202
+
+	movdqa	32-128(%rcx),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	102,15,56,0,239
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	leaq	64(%rsi),%rsi
+.byte	15,56,204,220
+.byte	15,56,203,202
+
+	movdqa	64-128(%rcx),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	102,15,56,0,247
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm6,%xmm7
+.byte	102,15,58,15,253,4
+	nop
+	paddd	%xmm7,%xmm3
+.byte	15,56,204,229
+.byte	15,56,203,202
+
+	movdqa	96-128(%rcx),%xmm0
+	paddd	%xmm6,%xmm0
+.byte	15,56,205,222
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm3,%xmm7
+.byte	102,15,58,15,254,4
+	nop
+	paddd	%xmm7,%xmm4
+.byte	15,56,204,238
+.byte	15,56,203,202
+	movdqa	128-128(%rcx),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	15,56,205,227
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm4,%xmm7
+.byte	102,15,58,15,251,4
+	nop
+	paddd	%xmm7,%xmm5
+.byte	15,56,204,243
+.byte	15,56,203,202
+	movdqa	160-128(%rcx),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	15,56,205,236
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm5,%xmm7
+.byte	102,15,58,15,252,4
+	nop
+	paddd	%xmm7,%xmm6
+.byte	15,56,204,220
+.byte	15,56,203,202
+	movdqa	192-128(%rcx),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	15,56,205,245
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm6,%xmm7
+.byte	102,15,58,15,253,4
+	nop
+	paddd	%xmm7,%xmm3
+.byte	15,56,204,229
+.byte	15,56,203,202
+	movdqa	224-128(%rcx),%xmm0
+	paddd	%xmm6,%xmm0
+.byte	15,56,205,222
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm3,%xmm7
+.byte	102,15,58,15,254,4
+	nop
+	paddd	%xmm7,%xmm4
+.byte	15,56,204,238
+.byte	15,56,203,202
+	movdqa	256-128(%rcx),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	15,56,205,227
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm4,%xmm7
+.byte	102,15,58,15,251,4
+	nop
+	paddd	%xmm7,%xmm5
+.byte	15,56,204,243
+.byte	15,56,203,202
+	movdqa	288-128(%rcx),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	15,56,205,236
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm5,%xmm7
+.byte	102,15,58,15,252,4
+	nop
+	paddd	%xmm7,%xmm6
+.byte	15,56,204,220
+.byte	15,56,203,202
+	movdqa	320-128(%rcx),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	15,56,205,245
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm6,%xmm7
+.byte	102,15,58,15,253,4
+	nop
+	paddd	%xmm7,%xmm3
+.byte	15,56,204,229
+.byte	15,56,203,202
+	movdqa	352-128(%rcx),%xmm0
+	paddd	%xmm6,%xmm0
+.byte	15,56,205,222
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm3,%xmm7
+.byte	102,15,58,15,254,4
+	nop
+	paddd	%xmm7,%xmm4
+.byte	15,56,204,238
+.byte	15,56,203,202
+	movdqa	384-128(%rcx),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	15,56,205,227
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm4,%xmm7
+.byte	102,15,58,15,251,4
+	nop
+	paddd	%xmm7,%xmm5
+.byte	15,56,204,243
+.byte	15,56,203,202
+	movdqa	416-128(%rcx),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	15,56,205,236
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm5,%xmm7
+.byte	102,15,58,15,252,4
+.byte	15,56,203,202
+	paddd	%xmm7,%xmm6
+
+	movdqa	448-128(%rcx),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+.byte	15,56,205,245
+	movdqa	%xmm8,%xmm7
+.byte	15,56,203,202
+
+	movdqa	480-128(%rcx),%xmm0
+	paddd	%xmm6,%xmm0
+	nop
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	decq	%rdx
+	nop
+.byte	15,56,203,202
+
+	paddd	%xmm10,%xmm2
+	paddd	%xmm9,%xmm1
+	jnz	.Loop_shaext
+
+	pshufd	$0xb1,%xmm2,%xmm2
+	pshufd	$0x1b,%xmm1,%xmm7
+	pshufd	$0xb1,%xmm1,%xmm1
+	punpckhqdq	%xmm2,%xmm1
+.byte	102,15,58,15,215,8
+
+	movdqu	%xmm1,(%rdi)
+	movdqu	%xmm2,16(%rdi)
+	ret
+.cfi_endproc	
+.size	sha256_block_data_order_hw,.-sha256_block_data_order_hw
+.globl	sha256_block_data_order_ssse3
+.hidden sha256_block_data_order_ssse3
+.type	sha256_block_data_order_ssse3,@function
+.align	64
+sha256_block_data_order_ssse3:
+.cfi_startproc	
+_CET_ENDBR
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+	shlq	$4,%rdx
+	subq	$96,%rsp
+	leaq	(%rsi,%rdx,4),%rdx
+	andq	$-64,%rsp
+	movq	%rdi,64+0(%rsp)
+	movq	%rsi,64+8(%rsp)
+	movq	%rdx,64+16(%rsp)
+	movq	%rax,88(%rsp)
+.cfi_escape	0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+.Lprologue_ssse3:
+
+	movl	0(%rdi),%eax
+	movl	4(%rdi),%ebx
+	movl	8(%rdi),%ecx
+	movl	12(%rdi),%edx
+	movl	16(%rdi),%r8d
+	movl	20(%rdi),%r9d
+	movl	24(%rdi),%r10d
+	movl	28(%rdi),%r11d
+
+
+	jmp	.Lloop_ssse3
+.align	16
+.Lloop_ssse3:
+	movdqa	K256+512(%rip),%xmm7
+	movdqu	0(%rsi),%xmm0
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+.byte	102,15,56,0,199
+	movdqu	48(%rsi),%xmm3
+	leaq	K256(%rip),%rbp
+.byte	102,15,56,0,207
+	movdqa	0(%rbp),%xmm4
+	movdqa	32(%rbp),%xmm5
+.byte	102,15,56,0,215
+	paddd	%xmm0,%xmm4
+	movdqa	64(%rbp),%xmm6
+.byte	102,15,56,0,223
+	movdqa	96(%rbp),%xmm7
+	paddd	%xmm1,%xmm5
+	paddd	%xmm2,%xmm6
+	paddd	%xmm3,%xmm7
+	movdqa	%xmm4,0(%rsp)
+	movl	%eax,%r14d
+	movdqa	%xmm5,16(%rsp)
+	movl	%ebx,%edi
+	movdqa	%xmm6,32(%rsp)
+	xorl	%ecx,%edi
+	movdqa	%xmm7,48(%rsp)
+	movl	%r8d,%r13d
+	jmp	.Lssse3_00_47
+
+.align	16
+.Lssse3_00_47:
+	subq	$-128,%rbp
+	rorl	$14,%r13d
+	movdqa	%xmm1,%xmm4
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	movdqa	%xmm3,%xmm7
+	rorl	$9,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	rorl	$5,%r13d
+	xorl	%eax,%r14d
+.byte	102,15,58,15,224,4
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+.byte	102,15,58,15,250,4
+	addl	0(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm4,%xmm5
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	movdqa	%xmm4,%xmm6
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	psrld	$3,%xmm4
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	paddd	%xmm7,%xmm0
+	rorl	$2,%r14d
+	addl	%r11d,%edx
+	psrld	$7,%xmm6
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	pshufd	$250,%xmm3,%xmm7
+	addl	%r11d,%r14d
+	rorl	$14,%r13d
+	pslld	$14,%xmm5
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	pxor	%xmm6,%xmm4
+	rorl	$9,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	rorl	$5,%r13d
+	psrld	$11,%xmm6
+	xorl	%r11d,%r14d
+	pxor	%xmm5,%xmm4
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	pslld	$11,%xmm5
+	addl	4(%rsp),%r10d
+	movl	%r11d,%edi
+	pxor	%xmm6,%xmm4
+	xorl	%r9d,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm7,%xmm6
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	pxor	%xmm5,%xmm4
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	psrld	$10,%xmm7
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	paddd	%xmm4,%xmm0
+	rorl	$2,%r14d
+	addl	%r10d,%ecx
+	psrlq	$17,%xmm6
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	pxor	%xmm6,%xmm7
+	rorl	$14,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	rorl	$9,%r14d
+	psrlq	$2,%xmm6
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$5,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	pshufd	$128,%xmm7,%xmm7
+	xorl	%ecx,%r13d
+	addl	8(%rsp),%r9d
+	movl	%r10d,%r15d
+	psrldq	$8,%xmm7
+	xorl	%r8d,%r12d
+	rorl	$11,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	rorl	$6,%r13d
+	paddd	%xmm7,%xmm0
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	pshufd	$80,%xmm0,%xmm7
+	xorl	%r11d,%edi
+	rorl	$2,%r14d
+	addl	%r9d,%ebx
+	movdqa	%xmm7,%xmm6
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	psrld	$10,%xmm7
+	addl	%r9d,%r14d
+	rorl	$14,%r13d
+	psrlq	$17,%xmm6
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$9,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	rorl	$5,%r13d
+	xorl	%r9d,%r14d
+	psrlq	$2,%xmm6
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	12(%rsp),%r8d
+	pxor	%xmm6,%xmm7
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	rorl	$11,%r14d
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	movdqa	0(%rbp),%xmm6
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	pslldq	$8,%xmm7
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	paddd	%xmm7,%xmm0
+	rorl	$2,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	paddd	%xmm0,%xmm6
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	movdqa	%xmm6,0(%rsp)
+	rorl	$14,%r13d
+	movdqa	%xmm2,%xmm4
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	movdqa	%xmm0,%xmm7
+	rorl	$9,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	rorl	$5,%r13d
+	xorl	%r8d,%r14d
+.byte	102,15,58,15,225,4
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+.byte	102,15,58,15,251,4
+	addl	16(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm4,%xmm5
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	movdqa	%xmm4,%xmm6
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	psrld	$3,%xmm4
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	paddd	%xmm7,%xmm1
+	rorl	$2,%r14d
+	addl	%edx,%r11d
+	psrld	$7,%xmm6
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	pshufd	$250,%xmm0,%xmm7
+	addl	%edx,%r14d
+	rorl	$14,%r13d
+	pslld	$14,%xmm5
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	pxor	%xmm6,%xmm4
+	rorl	$9,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	rorl	$5,%r13d
+	psrld	$11,%xmm6
+	xorl	%edx,%r14d
+	pxor	%xmm5,%xmm4
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	pslld	$11,%xmm5
+	addl	20(%rsp),%ecx
+	movl	%edx,%edi
+	pxor	%xmm6,%xmm4
+	xorl	%ebx,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm7,%xmm6
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	pxor	%xmm5,%xmm4
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	psrld	$10,%xmm7
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	paddd	%xmm4,%xmm1
+	rorl	$2,%r14d
+	addl	%ecx,%r10d
+	psrlq	$17,%xmm6
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	pxor	%xmm6,%xmm7
+	rorl	$14,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	rorl	$9,%r14d
+	psrlq	$2,%xmm6
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$5,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	pshufd	$128,%xmm7,%xmm7
+	xorl	%r10d,%r13d
+	addl	24(%rsp),%ebx
+	movl	%ecx,%r15d
+	psrldq	$8,%xmm7
+	xorl	%eax,%r12d
+	rorl	$11,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	rorl	$6,%r13d
+	paddd	%xmm7,%xmm1
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	pshufd	$80,%xmm1,%xmm7
+	xorl	%edx,%edi
+	rorl	$2,%r14d
+	addl	%ebx,%r9d
+	movdqa	%xmm7,%xmm6
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	psrld	$10,%xmm7
+	addl	%ebx,%r14d
+	rorl	$14,%r13d
+	psrlq	$17,%xmm6
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$9,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	rorl	$5,%r13d
+	xorl	%ebx,%r14d
+	psrlq	$2,%xmm6
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	28(%rsp),%eax
+	pxor	%xmm6,%xmm7
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	rorl	$11,%r14d
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	movdqa	32(%rbp),%xmm6
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	pslldq	$8,%xmm7
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	paddd	%xmm7,%xmm1
+	rorl	$2,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	paddd	%xmm1,%xmm6
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	movdqa	%xmm6,16(%rsp)
+	rorl	$14,%r13d
+	movdqa	%xmm3,%xmm4
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	movdqa	%xmm1,%xmm7
+	rorl	$9,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	rorl	$5,%r13d
+	xorl	%eax,%r14d
+.byte	102,15,58,15,226,4
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+.byte	102,15,58,15,248,4
+	addl	32(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm4,%xmm5
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	movdqa	%xmm4,%xmm6
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	psrld	$3,%xmm4
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	paddd	%xmm7,%xmm2
+	rorl	$2,%r14d
+	addl	%r11d,%edx
+	psrld	$7,%xmm6
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	pshufd	$250,%xmm1,%xmm7
+	addl	%r11d,%r14d
+	rorl	$14,%r13d
+	pslld	$14,%xmm5
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	pxor	%xmm6,%xmm4
+	rorl	$9,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	rorl	$5,%r13d
+	psrld	$11,%xmm6
+	xorl	%r11d,%r14d
+	pxor	%xmm5,%xmm4
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	pslld	$11,%xmm5
+	addl	36(%rsp),%r10d
+	movl	%r11d,%edi
+	pxor	%xmm6,%xmm4
+	xorl	%r9d,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm7,%xmm6
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	pxor	%xmm5,%xmm4
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	psrld	$10,%xmm7
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	paddd	%xmm4,%xmm2
+	rorl	$2,%r14d
+	addl	%r10d,%ecx
+	psrlq	$17,%xmm6
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	pxor	%xmm6,%xmm7
+	rorl	$14,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	rorl	$9,%r14d
+	psrlq	$2,%xmm6
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$5,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	pshufd	$128,%xmm7,%xmm7
+	xorl	%ecx,%r13d
+	addl	40(%rsp),%r9d
+	movl	%r10d,%r15d
+	psrldq	$8,%xmm7
+	xorl	%r8d,%r12d
+	rorl	$11,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	rorl	$6,%r13d
+	paddd	%xmm7,%xmm2
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	pshufd	$80,%xmm2,%xmm7
+	xorl	%r11d,%edi
+	rorl	$2,%r14d
+	addl	%r9d,%ebx
+	movdqa	%xmm7,%xmm6
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	psrld	$10,%xmm7
+	addl	%r9d,%r14d
+	rorl	$14,%r13d
+	psrlq	$17,%xmm6
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$9,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	rorl	$5,%r13d
+	xorl	%r9d,%r14d
+	psrlq	$2,%xmm6
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	44(%rsp),%r8d
+	pxor	%xmm6,%xmm7
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	rorl	$11,%r14d
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	movdqa	64(%rbp),%xmm6
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	pslldq	$8,%xmm7
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	paddd	%xmm7,%xmm2
+	rorl	$2,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	paddd	%xmm2,%xmm6
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	movdqa	%xmm6,32(%rsp)
+	rorl	$14,%r13d
+	movdqa	%xmm0,%xmm4
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	movdqa	%xmm2,%xmm7
+	rorl	$9,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	rorl	$5,%r13d
+	xorl	%r8d,%r14d
+.byte	102,15,58,15,227,4
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+.byte	102,15,58,15,249,4
+	addl	48(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm4,%xmm5
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	movdqa	%xmm4,%xmm6
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	psrld	$3,%xmm4
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	paddd	%xmm7,%xmm3
+	rorl	$2,%r14d
+	addl	%edx,%r11d
+	psrld	$7,%xmm6
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	pshufd	$250,%xmm2,%xmm7
+	addl	%edx,%r14d
+	rorl	$14,%r13d
+	pslld	$14,%xmm5
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	pxor	%xmm6,%xmm4
+	rorl	$9,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	rorl	$5,%r13d
+	psrld	$11,%xmm6
+	xorl	%edx,%r14d
+	pxor	%xmm5,%xmm4
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	pslld	$11,%xmm5
+	addl	52(%rsp),%ecx
+	movl	%edx,%edi
+	pxor	%xmm6,%xmm4
+	xorl	%ebx,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm7,%xmm6
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	pxor	%xmm5,%xmm4
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	psrld	$10,%xmm7
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	paddd	%xmm4,%xmm3
+	rorl	$2,%r14d
+	addl	%ecx,%r10d
+	psrlq	$17,%xmm6
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	pxor	%xmm6,%xmm7
+	rorl	$14,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	rorl	$9,%r14d
+	psrlq	$2,%xmm6
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$5,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	pshufd	$128,%xmm7,%xmm7
+	xorl	%r10d,%r13d
+	addl	56(%rsp),%ebx
+	movl	%ecx,%r15d
+	psrldq	$8,%xmm7
+	xorl	%eax,%r12d
+	rorl	$11,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	rorl	$6,%r13d
+	paddd	%xmm7,%xmm3
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	pshufd	$80,%xmm3,%xmm7
+	xorl	%edx,%edi
+	rorl	$2,%r14d
+	addl	%ebx,%r9d
+	movdqa	%xmm7,%xmm6
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	psrld	$10,%xmm7
+	addl	%ebx,%r14d
+	rorl	$14,%r13d
+	psrlq	$17,%xmm6
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$9,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	rorl	$5,%r13d
+	xorl	%ebx,%r14d
+	psrlq	$2,%xmm6
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	60(%rsp),%eax
+	pxor	%xmm6,%xmm7
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	rorl	$11,%r14d
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	movdqa	96(%rbp),%xmm6
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	pslldq	$8,%xmm7
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	paddd	%xmm7,%xmm3
+	rorl	$2,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	paddd	%xmm3,%xmm6
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	movdqa	%xmm6,48(%rsp)
+	cmpb	$0,131(%rbp)
+	jne	.Lssse3_00_47
+	rorl	$14,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	rorl	$9,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	rorl	$5,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+	addl	0(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	rorl	$11,%r14d
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	rorl	$2,%r14d
+	addl	%r11d,%edx
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	rorl	$9,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	rorl	$5,%r13d
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	addl	4(%rsp),%r10d
+	movl	%r11d,%edi
+	xorl	%r9d,%r12d
+	rorl	$11,%r14d
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	rorl	$2,%r14d
+	addl	%r10d,%ecx
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	rorl	$9,%r14d
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	rorl	$5,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	xorl	%ecx,%r13d
+	addl	8(%rsp),%r9d
+	movl	%r10d,%r15d
+	xorl	%r8d,%r12d
+	rorl	$11,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	xorl	%r11d,%edi
+	rorl	$2,%r14d
+	addl	%r9d,%ebx
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	rorl	$9,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	rorl	$5,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	12(%rsp),%r8d
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	rorl	$11,%r14d
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	rorl	$2,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	rorl	$9,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	rorl	$5,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+	addl	16(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	rorl	$11,%r14d
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	rorl	$2,%r14d
+	addl	%edx,%r11d
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	rorl	$9,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	rorl	$5,%r13d
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	addl	20(%rsp),%ecx
+	movl	%edx,%edi
+	xorl	%ebx,%r12d
+	rorl	$11,%r14d
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	rorl	$2,%r14d
+	addl	%ecx,%r10d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	rorl	$9,%r14d
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	rorl	$5,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	xorl	%r10d,%r13d
+	addl	24(%rsp),%ebx
+	movl	%ecx,%r15d
+	xorl	%eax,%r12d
+	rorl	$11,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	xorl	%edx,%edi
+	rorl	$2,%r14d
+	addl	%ebx,%r9d
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	rorl	$9,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	rorl	$5,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	28(%rsp),%eax
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	rorl	$11,%r14d
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	rorl	$2,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	rorl	$9,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	rorl	$5,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+	addl	32(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	rorl	$11,%r14d
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	rorl	$2,%r14d
+	addl	%r11d,%edx
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	rorl	$9,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	rorl	$5,%r13d
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	addl	36(%rsp),%r10d
+	movl	%r11d,%edi
+	xorl	%r9d,%r12d
+	rorl	$11,%r14d
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	rorl	$2,%r14d
+	addl	%r10d,%ecx
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	rorl	$9,%r14d
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	rorl	$5,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	xorl	%ecx,%r13d
+	addl	40(%rsp),%r9d
+	movl	%r10d,%r15d
+	xorl	%r8d,%r12d
+	rorl	$11,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	xorl	%r11d,%edi
+	rorl	$2,%r14d
+	addl	%r9d,%ebx
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	rorl	$9,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	rorl	$5,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	44(%rsp),%r8d
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	rorl	$11,%r14d
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	rorl	$2,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	rorl	$9,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	rorl	$5,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+	addl	48(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	rorl	$11,%r14d
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	rorl	$2,%r14d
+	addl	%edx,%r11d
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	rorl	$9,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	rorl	$5,%r13d
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	addl	52(%rsp),%ecx
+	movl	%edx,%edi
+	xorl	%ebx,%r12d
+	rorl	$11,%r14d
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	rorl	$2,%r14d
+	addl	%ecx,%r10d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	rorl	$9,%r14d
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	rorl	$5,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	xorl	%r10d,%r13d
+	addl	56(%rsp),%ebx
+	movl	%ecx,%r15d
+	xorl	%eax,%r12d
+	rorl	$11,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	xorl	%edx,%edi
+	rorl	$2,%r14d
+	addl	%ebx,%r9d
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	rorl	$9,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	rorl	$5,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	60(%rsp),%eax
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	rorl	$11,%r14d
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	rorl	$2,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	movq	64+0(%rsp),%rdi
+	movl	%r14d,%eax
+
+	addl	0(%rdi),%eax
+	leaq	64(%rsi),%rsi
+	addl	4(%rdi),%ebx
+	addl	8(%rdi),%ecx
+	addl	12(%rdi),%edx
+	addl	16(%rdi),%r8d
+	addl	20(%rdi),%r9d
+	addl	24(%rdi),%r10d
+	addl	28(%rdi),%r11d
+
+	cmpq	64+16(%rsp),%rsi
+
+	movl	%eax,0(%rdi)
+	movl	%ebx,4(%rdi)
+	movl	%ecx,8(%rdi)
+	movl	%edx,12(%rdi)
+	movl	%r8d,16(%rdi)
+	movl	%r9d,20(%rdi)
+	movl	%r10d,24(%rdi)
+	movl	%r11d,28(%rdi)
+	jb	.Lloop_ssse3
+
+	movq	88(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lepilogue_ssse3:
+	ret
+.cfi_endproc	
+.size	sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3
+.globl	sha256_block_data_order_avx
+.hidden sha256_block_data_order_avx
+.type	sha256_block_data_order_avx,@function
+.align	64
+sha256_block_data_order_avx:
+.cfi_startproc	
+_CET_ENDBR
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+	shlq	$4,%rdx
+	subq	$96,%rsp
+	leaq	(%rsi,%rdx,4),%rdx
+	andq	$-64,%rsp
+	movq	%rdi,64+0(%rsp)
+	movq	%rsi,64+8(%rsp)
+	movq	%rdx,64+16(%rsp)
+	movq	%rax,88(%rsp)
+.cfi_escape	0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+.Lprologue_avx:
+
+	vzeroupper
+	movl	0(%rdi),%eax
+	movl	4(%rdi),%ebx
+	movl	8(%rdi),%ecx
+	movl	12(%rdi),%edx
+	movl	16(%rdi),%r8d
+	movl	20(%rdi),%r9d
+	movl	24(%rdi),%r10d
+	movl	28(%rdi),%r11d
+	vmovdqa	K256+512+32(%rip),%xmm8
+	vmovdqa	K256+512+64(%rip),%xmm9
+	jmp	.Lloop_avx
+.align	16
+.Lloop_avx:
+	vmovdqa	K256+512(%rip),%xmm7
+	vmovdqu	0(%rsi),%xmm0
+	vmovdqu	16(%rsi),%xmm1
+	vmovdqu	32(%rsi),%xmm2
+	vmovdqu	48(%rsi),%xmm3
+	vpshufb	%xmm7,%xmm0,%xmm0
+	leaq	K256(%rip),%rbp
+	vpshufb	%xmm7,%xmm1,%xmm1
+	vpshufb	%xmm7,%xmm2,%xmm2
+	vpaddd	0(%rbp),%xmm0,%xmm4
+	vpshufb	%xmm7,%xmm3,%xmm3
+	vpaddd	32(%rbp),%xmm1,%xmm5
+	vpaddd	64(%rbp),%xmm2,%xmm6
+	vpaddd	96(%rbp),%xmm3,%xmm7
+	vmovdqa	%xmm4,0(%rsp)
+	movl	%eax,%r14d
+	vmovdqa	%xmm5,16(%rsp)
+	movl	%ebx,%edi
+	vmovdqa	%xmm6,32(%rsp)
+	xorl	%ecx,%edi
+	vmovdqa	%xmm7,48(%rsp)
+	movl	%r8d,%r13d
+	jmp	.Lavx_00_47
+
+.align	16
+.Lavx_00_47:
+	subq	$-128,%rbp
+	vpalignr	$4,%xmm0,%xmm1,%xmm4
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	vpalignr	$4,%xmm2,%xmm3,%xmm7
+	shrdl	$9,%r14d,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%r13d,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	vpaddd	%xmm7,%xmm0,%xmm0
+	xorl	%r8d,%r13d
+	addl	0(%rsp),%r11d
+	movl	%eax,%r15d
+	vpsrld	$3,%xmm4,%xmm7
+	xorl	%r10d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ebx,%r15d
+	vpslld	$14,%xmm4,%xmm5
+	addl	%r12d,%r11d
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	vpshufd	$250,%xmm3,%xmm7
+	shrdl	$2,%r14d,%r14d
+	addl	%r11d,%edx
+	addl	%edi,%r11d
+	vpsrld	$11,%xmm6,%xmm6
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	shrdl	$14,%r13d,%r13d
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	shrdl	$9,%r14d,%r14d
+	vpslld	$11,%xmm5,%xmm5
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	shrdl	$5,%r13d,%r13d
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	vpsrld	$10,%xmm7,%xmm6
+	addl	4(%rsp),%r10d
+	movl	%r11d,%edi
+	xorl	%r9d,%r12d
+	vpxor	%xmm5,%xmm4,%xmm4
+	shrdl	$11,%r14d,%r14d
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	vpsrlq	$17,%xmm7,%xmm7
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	vpaddd	%xmm4,%xmm0,%xmm0
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	shrdl	$2,%r14d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	%r10d,%ecx
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%r10d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r10d
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%edx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ecx,%r13d
+	vpshufb	%xmm8,%xmm6,%xmm6
+	xorl	%r8d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r10d,%r14d
+	vpaddd	%xmm6,%xmm0,%xmm0
+	andl	%ecx,%r12d
+	xorl	%ecx,%r13d
+	addl	8(%rsp),%r9d
+	vpshufd	$80,%xmm0,%xmm7
+	movl	%r10d,%r15d
+	xorl	%r8d,%r12d
+	shrdl	$11,%r14d,%r14d
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	shrdl	$6,%r13d,%r13d
+	vpsrlq	$17,%xmm7,%xmm7
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	vpxor	%xmm7,%xmm6,%xmm6
+	xorl	%r11d,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%r9d,%ebx
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	vpshufb	%xmm9,%xmm6,%xmm6
+	shrdl	$9,%r14d,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	vpaddd	%xmm6,%xmm0,%xmm0
+	shrdl	$5,%r13d,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	vpaddd	0(%rbp),%xmm0,%xmm6
+	xorl	%ebx,%r13d
+	addl	12(%rsp),%r8d
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	vmovdqa	%xmm6,0(%rsp)
+	vpalignr	$4,%xmm1,%xmm2,%xmm4
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	vpalignr	$4,%xmm3,%xmm0,%xmm7
+	shrdl	$9,%r14d,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%r13d,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	vpaddd	%xmm7,%xmm1,%xmm1
+	xorl	%eax,%r13d
+	addl	16(%rsp),%edx
+	movl	%r8d,%r15d
+	vpsrld	$3,%xmm4,%xmm7
+	xorl	%ecx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r9d,%r15d
+	vpslld	$14,%xmm4,%xmm5
+	addl	%r12d,%edx
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	vpshufd	$250,%xmm0,%xmm7
+	shrdl	$2,%r14d,%r14d
+	addl	%edx,%r11d
+	addl	%edi,%edx
+	vpsrld	$11,%xmm6,%xmm6
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	shrdl	$14,%r13d,%r13d
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	shrdl	$9,%r14d,%r14d
+	vpslld	$11,%xmm5,%xmm5
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	shrdl	$5,%r13d,%r13d
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	vpsrld	$10,%xmm7,%xmm6
+	addl	20(%rsp),%ecx
+	movl	%edx,%edi
+	xorl	%ebx,%r12d
+	vpxor	%xmm5,%xmm4,%xmm4
+	shrdl	$11,%r14d,%r14d
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	vpsrlq	$17,%xmm7,%xmm7
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	vpaddd	%xmm4,%xmm1,%xmm1
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	shrdl	$2,%r14d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	%ecx,%r10d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%ecx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%r11d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r10d,%r13d
+	vpshufb	%xmm8,%xmm6,%xmm6
+	xorl	%eax,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ecx,%r14d
+	vpaddd	%xmm6,%xmm1,%xmm1
+	andl	%r10d,%r12d
+	xorl	%r10d,%r13d
+	addl	24(%rsp),%ebx
+	vpshufd	$80,%xmm1,%xmm7
+	movl	%ecx,%r15d
+	xorl	%eax,%r12d
+	shrdl	$11,%r14d,%r14d
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	shrdl	$6,%r13d,%r13d
+	vpsrlq	$17,%xmm7,%xmm7
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	vpxor	%xmm7,%xmm6,%xmm6
+	xorl	%edx,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%ebx,%r9d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	vpshufb	%xmm9,%xmm6,%xmm6
+	shrdl	$9,%r14d,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	vpaddd	%xmm6,%xmm1,%xmm1
+	shrdl	$5,%r13d,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	vpaddd	32(%rbp),%xmm1,%xmm6
+	xorl	%r9d,%r13d
+	addl	28(%rsp),%eax
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	vmovdqa	%xmm6,16(%rsp)
+	vpalignr	$4,%xmm2,%xmm3,%xmm4
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	vpalignr	$4,%xmm0,%xmm1,%xmm7
+	shrdl	$9,%r14d,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%r13d,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	vpaddd	%xmm7,%xmm2,%xmm2
+	xorl	%r8d,%r13d
+	addl	32(%rsp),%r11d
+	movl	%eax,%r15d
+	vpsrld	$3,%xmm4,%xmm7
+	xorl	%r10d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ebx,%r15d
+	vpslld	$14,%xmm4,%xmm5
+	addl	%r12d,%r11d
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	vpshufd	$250,%xmm1,%xmm7
+	shrdl	$2,%r14d,%r14d
+	addl	%r11d,%edx
+	addl	%edi,%r11d
+	vpsrld	$11,%xmm6,%xmm6
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	shrdl	$14,%r13d,%r13d
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	shrdl	$9,%r14d,%r14d
+	vpslld	$11,%xmm5,%xmm5
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	shrdl	$5,%r13d,%r13d
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	vpsrld	$10,%xmm7,%xmm6
+	addl	36(%rsp),%r10d
+	movl	%r11d,%edi
+	xorl	%r9d,%r12d
+	vpxor	%xmm5,%xmm4,%xmm4
+	shrdl	$11,%r14d,%r14d
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	vpsrlq	$17,%xmm7,%xmm7
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	vpaddd	%xmm4,%xmm2,%xmm2
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	shrdl	$2,%r14d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	%r10d,%ecx
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%r10d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r10d
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%edx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ecx,%r13d
+	vpshufb	%xmm8,%xmm6,%xmm6
+	xorl	%r8d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r10d,%r14d
+	vpaddd	%xmm6,%xmm2,%xmm2
+	andl	%ecx,%r12d
+	xorl	%ecx,%r13d
+	addl	40(%rsp),%r9d
+	vpshufd	$80,%xmm2,%xmm7
+	movl	%r10d,%r15d
+	xorl	%r8d,%r12d
+	shrdl	$11,%r14d,%r14d
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	shrdl	$6,%r13d,%r13d
+	vpsrlq	$17,%xmm7,%xmm7
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	vpxor	%xmm7,%xmm6,%xmm6
+	xorl	%r11d,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%r9d,%ebx
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	vpshufb	%xmm9,%xmm6,%xmm6
+	shrdl	$9,%r14d,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	vpaddd	%xmm6,%xmm2,%xmm2
+	shrdl	$5,%r13d,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	vpaddd	64(%rbp),%xmm2,%xmm6
+	xorl	%ebx,%r13d
+	addl	44(%rsp),%r8d
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	vmovdqa	%xmm6,32(%rsp)
+	vpalignr	$4,%xmm3,%xmm0,%xmm4
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	vpalignr	$4,%xmm1,%xmm2,%xmm7
+	shrdl	$9,%r14d,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%r13d,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	vpaddd	%xmm7,%xmm3,%xmm3
+	xorl	%eax,%r13d
+	addl	48(%rsp),%edx
+	movl	%r8d,%r15d
+	vpsrld	$3,%xmm4,%xmm7
+	xorl	%ecx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r9d,%r15d
+	vpslld	$14,%xmm4,%xmm5
+	addl	%r12d,%edx
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	vpshufd	$250,%xmm2,%xmm7
+	shrdl	$2,%r14d,%r14d
+	addl	%edx,%r11d
+	addl	%edi,%edx
+	vpsrld	$11,%xmm6,%xmm6
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	shrdl	$14,%r13d,%r13d
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	shrdl	$9,%r14d,%r14d
+	vpslld	$11,%xmm5,%xmm5
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	shrdl	$5,%r13d,%r13d
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	vpsrld	$10,%xmm7,%xmm6
+	addl	52(%rsp),%ecx
+	movl	%edx,%edi
+	xorl	%ebx,%r12d
+	vpxor	%xmm5,%xmm4,%xmm4
+	shrdl	$11,%r14d,%r14d
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	vpsrlq	$17,%xmm7,%xmm7
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	vpaddd	%xmm4,%xmm3,%xmm3
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	shrdl	$2,%r14d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	%ecx,%r10d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%ecx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%r11d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r10d,%r13d
+	vpshufb	%xmm8,%xmm6,%xmm6
+	xorl	%eax,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ecx,%r14d
+	vpaddd	%xmm6,%xmm3,%xmm3
+	andl	%r10d,%r12d
+	xorl	%r10d,%r13d
+	addl	56(%rsp),%ebx
+	vpshufd	$80,%xmm3,%xmm7
+	movl	%ecx,%r15d
+	xorl	%eax,%r12d
+	shrdl	$11,%r14d,%r14d
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	shrdl	$6,%r13d,%r13d
+	vpsrlq	$17,%xmm7,%xmm7
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	vpxor	%xmm7,%xmm6,%xmm6
+	xorl	%edx,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%ebx,%r9d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	vpshufb	%xmm9,%xmm6,%xmm6
+	shrdl	$9,%r14d,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	vpaddd	%xmm6,%xmm3,%xmm3
+	shrdl	$5,%r13d,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	vpaddd	96(%rbp),%xmm3,%xmm6
+	xorl	%r9d,%r13d
+	addl	60(%rsp),%eax
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	vmovdqa	%xmm6,48(%rsp)
+	cmpb	$0,131(%rbp)
+	jne	.Lavx_00_47
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+	addl	0(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%r11d,%edx
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	addl	4(%rsp),%r10d
+	movl	%r11d,%edi
+	xorl	%r9d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%r10d,%ecx
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	xorl	%ecx,%r13d
+	addl	8(%rsp),%r9d
+	movl	%r10d,%r15d
+	xorl	%r8d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	xorl	%r11d,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%r9d,%ebx
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	12(%rsp),%r8d
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+	addl	16(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%edx,%r11d
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	addl	20(%rsp),%ecx
+	movl	%edx,%edi
+	xorl	%ebx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%ecx,%r10d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	xorl	%r10d,%r13d
+	addl	24(%rsp),%ebx
+	movl	%ecx,%r15d
+	xorl	%eax,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	xorl	%edx,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%ebx,%r9d
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	28(%rsp),%eax
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+	addl	32(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%r11d,%edx
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	addl	36(%rsp),%r10d
+	movl	%r11d,%edi
+	xorl	%r9d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%r10d,%ecx
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	xorl	%ecx,%r13d
+	addl	40(%rsp),%r9d
+	movl	%r10d,%r15d
+	xorl	%r8d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	xorl	%r11d,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%r9d,%ebx
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	44(%rsp),%r8d
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+	addl	48(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%edx,%r11d
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	addl	52(%rsp),%ecx
+	movl	%edx,%edi
+	xorl	%ebx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%ecx,%r10d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	xorl	%r10d,%r13d
+	addl	56(%rsp),%ebx
+	movl	%ecx,%r15d
+	xorl	%eax,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	xorl	%edx,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%ebx,%r9d
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	60(%rsp),%eax
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	movq	64+0(%rsp),%rdi
+	movl	%r14d,%eax
+
+	addl	0(%rdi),%eax
+	leaq	64(%rsi),%rsi
+	addl	4(%rdi),%ebx
+	addl	8(%rdi),%ecx
+	addl	12(%rdi),%edx
+	addl	16(%rdi),%r8d
+	addl	20(%rdi),%r9d
+	addl	24(%rdi),%r10d
+	addl	28(%rdi),%r11d
+
+	cmpq	64+16(%rsp),%rsi
+
+	movl	%eax,0(%rdi)
+	movl	%ebx,4(%rdi)
+	movl	%ecx,8(%rdi)
+	movl	%edx,12(%rdi)
+	movl	%r8d,16(%rdi)
+	movl	%r9d,20(%rdi)
+	movl	%r10d,24(%rdi)
+	movl	%r11d,28(%rdi)
+	jb	.Lloop_avx
+
+	movq	88(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	vzeroupper
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lepilogue_avx:
+	ret
+.cfi_endproc	
+.size	sha256_block_data_order_avx,.-sha256_block_data_order_avx
+#endif
diff --git a/gen/bcm/sha256-x86_64-win.asm b/gen/bcm/sha256-x86_64-win.asm
new file mode 100644
index 0000000..ada8dba
--- /dev/null
+++ b/gen/bcm/sha256-x86_64-win.asm
@@ -0,0 +1,4415 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section	.text code align=64
+
+
+global	sha256_block_data_order_nohw
+
+ALIGN	16
+sha256_block_data_order_nohw:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_sha256_block_data_order_nohw:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+_CET_ENDBR
+	mov	rax,rsp
+
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	shl	rdx,4
+	sub	rsp,16*4+4*8
+	lea	rdx,[rdx*4+rsi]
+	and	rsp,-64
+	mov	QWORD[((64+0))+rsp],rdi
+	mov	QWORD[((64+8))+rsp],rsi
+	mov	QWORD[((64+16))+rsp],rdx
+	mov	QWORD[88+rsp],rax
+
+$L$prologue:
+
+	mov	eax,DWORD[rdi]
+	mov	ebx,DWORD[4+rdi]
+	mov	ecx,DWORD[8+rdi]
+	mov	edx,DWORD[12+rdi]
+	mov	r8d,DWORD[16+rdi]
+	mov	r9d,DWORD[20+rdi]
+	mov	r10d,DWORD[24+rdi]
+	mov	r11d,DWORD[28+rdi]
+	jmp	NEAR $L$loop
+
+ALIGN	16
+$L$loop:
+	mov	edi,ebx
+	lea	rbp,[K256]
+	xor	edi,ecx
+	mov	r12d,DWORD[rsi]
+	mov	r13d,r8d
+	mov	r14d,eax
+	bswap	r12d
+	ror	r13d,14
+	mov	r15d,r9d
+
+	xor	r13d,r8d
+	ror	r14d,9
+	xor	r15d,r10d
+
+	mov	DWORD[rsp],r12d
+	xor	r14d,eax
+	and	r15d,r8d
+
+	ror	r13d,5
+	add	r12d,r11d
+	xor	r15d,r10d
+
+	ror	r14d,11
+	xor	r13d,r8d
+	add	r12d,r15d
+
+	mov	r15d,eax
+	add	r12d,DWORD[rbp]
+	xor	r14d,eax
+
+	xor	r15d,ebx
+	ror	r13d,6
+	mov	r11d,ebx
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r11d,edi
+	add	edx,r12d
+	add	r11d,r12d
+
+	lea	rbp,[4+rbp]
+	add	r11d,r14d
+	mov	r12d,DWORD[4+rsi]
+	mov	r13d,edx
+	mov	r14d,r11d
+	bswap	r12d
+	ror	r13d,14
+	mov	edi,r8d
+
+	xor	r13d,edx
+	ror	r14d,9
+	xor	edi,r9d
+
+	mov	DWORD[4+rsp],r12d
+	xor	r14d,r11d
+	and	edi,edx
+
+	ror	r13d,5
+	add	r12d,r10d
+	xor	edi,r9d
+
+	ror	r14d,11
+	xor	r13d,edx
+	add	r12d,edi
+
+	mov	edi,r11d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r11d
+
+	xor	edi,eax
+	ror	r13d,6
+	mov	r10d,eax
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r10d,r15d
+	add	ecx,r12d
+	add	r10d,r12d
+
+	lea	rbp,[4+rbp]
+	add	r10d,r14d
+	mov	r12d,DWORD[8+rsi]
+	mov	r13d,ecx
+	mov	r14d,r10d
+	bswap	r12d
+	ror	r13d,14
+	mov	r15d,edx
+
+	xor	r13d,ecx
+	ror	r14d,9
+	xor	r15d,r8d
+
+	mov	DWORD[8+rsp],r12d
+	xor	r14d,r10d
+	and	r15d,ecx
+
+	ror	r13d,5
+	add	r12d,r9d
+	xor	r15d,r8d
+
+	ror	r14d,11
+	xor	r13d,ecx
+	add	r12d,r15d
+
+	mov	r15d,r10d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r10d
+
+	xor	r15d,r11d
+	ror	r13d,6
+	mov	r9d,r11d
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r9d,edi
+	add	ebx,r12d
+	add	r9d,r12d
+
+	lea	rbp,[4+rbp]
+	add	r9d,r14d
+	mov	r12d,DWORD[12+rsi]
+	mov	r13d,ebx
+	mov	r14d,r9d
+	bswap	r12d
+	ror	r13d,14
+	mov	edi,ecx
+
+	xor	r13d,ebx
+	ror	r14d,9
+	xor	edi,edx
+
+	mov	DWORD[12+rsp],r12d
+	xor	r14d,r9d
+	and	edi,ebx
+
+	ror	r13d,5
+	add	r12d,r8d
+	xor	edi,edx
+
+	ror	r14d,11
+	xor	r13d,ebx
+	add	r12d,edi
+
+	mov	edi,r9d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r9d
+
+	xor	edi,r10d
+	ror	r13d,6
+	mov	r8d,r10d
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r8d,r15d
+	add	eax,r12d
+	add	r8d,r12d
+
+	lea	rbp,[20+rbp]
+	add	r8d,r14d
+	mov	r12d,DWORD[16+rsi]
+	mov	r13d,eax
+	mov	r14d,r8d
+	bswap	r12d
+	ror	r13d,14
+	mov	r15d,ebx
+
+	xor	r13d,eax
+	ror	r14d,9
+	xor	r15d,ecx
+
+	mov	DWORD[16+rsp],r12d
+	xor	r14d,r8d
+	and	r15d,eax
+
+	ror	r13d,5
+	add	r12d,edx
+	xor	r15d,ecx
+
+	ror	r14d,11
+	xor	r13d,eax
+	add	r12d,r15d
+
+	mov	r15d,r8d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r8d
+
+	xor	r15d,r9d
+	ror	r13d,6
+	mov	edx,r9d
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	edx,edi
+	add	r11d,r12d
+	add	edx,r12d
+
+	lea	rbp,[4+rbp]
+	add	edx,r14d
+	mov	r12d,DWORD[20+rsi]
+	mov	r13d,r11d
+	mov	r14d,edx
+	bswap	r12d
+	ror	r13d,14
+	mov	edi,eax
+
+	xor	r13d,r11d
+	ror	r14d,9
+	xor	edi,ebx
+
+	mov	DWORD[20+rsp],r12d
+	xor	r14d,edx
+	and	edi,r11d
+
+	ror	r13d,5
+	add	r12d,ecx
+	xor	edi,ebx
+
+	ror	r14d,11
+	xor	r13d,r11d
+	add	r12d,edi
+
+	mov	edi,edx
+	add	r12d,DWORD[rbp]
+	xor	r14d,edx
+
+	xor	edi,r8d
+	ror	r13d,6
+	mov	ecx,r8d
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	ecx,r15d
+	add	r10d,r12d
+	add	ecx,r12d
+
+	lea	rbp,[4+rbp]
+	add	ecx,r14d
+	mov	r12d,DWORD[24+rsi]
+	mov	r13d,r10d
+	mov	r14d,ecx
+	bswap	r12d
+	ror	r13d,14
+	mov	r15d,r11d
+
+	xor	r13d,r10d
+	ror	r14d,9
+	xor	r15d,eax
+
+	mov	DWORD[24+rsp],r12d
+	xor	r14d,ecx
+	and	r15d,r10d
+
+	ror	r13d,5
+	add	r12d,ebx
+	xor	r15d,eax
+
+	ror	r14d,11
+	xor	r13d,r10d
+	add	r12d,r15d
+
+	mov	r15d,ecx
+	add	r12d,DWORD[rbp]
+	xor	r14d,ecx
+
+	xor	r15d,edx
+	ror	r13d,6
+	mov	ebx,edx
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	ebx,edi
+	add	r9d,r12d
+	add	ebx,r12d
+
+	lea	rbp,[4+rbp]
+	add	ebx,r14d
+	mov	r12d,DWORD[28+rsi]
+	mov	r13d,r9d
+	mov	r14d,ebx
+	bswap	r12d
+	ror	r13d,14
+	mov	edi,r10d
+
+	xor	r13d,r9d
+	ror	r14d,9
+	xor	edi,r11d
+
+	mov	DWORD[28+rsp],r12d
+	xor	r14d,ebx
+	and	edi,r9d
+
+	ror	r13d,5
+	add	r12d,eax
+	xor	edi,r11d
+
+	ror	r14d,11
+	xor	r13d,r9d
+	add	r12d,edi
+
+	mov	edi,ebx
+	add	r12d,DWORD[rbp]
+	xor	r14d,ebx
+
+	xor	edi,ecx
+	ror	r13d,6
+	mov	eax,ecx
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	eax,r15d
+	add	r8d,r12d
+	add	eax,r12d
+
+	lea	rbp,[20+rbp]
+	add	eax,r14d
+	mov	r12d,DWORD[32+rsi]
+	mov	r13d,r8d
+	mov	r14d,eax
+	bswap	r12d
+	ror	r13d,14
+	mov	r15d,r9d
+
+	xor	r13d,r8d
+	ror	r14d,9
+	xor	r15d,r10d
+
+	mov	DWORD[32+rsp],r12d
+	xor	r14d,eax
+	and	r15d,r8d
+
+	ror	r13d,5
+	add	r12d,r11d
+	xor	r15d,r10d
+
+	ror	r14d,11
+	xor	r13d,r8d
+	add	r12d,r15d
+
+	mov	r15d,eax
+	add	r12d,DWORD[rbp]
+	xor	r14d,eax
+
+	xor	r15d,ebx
+	ror	r13d,6
+	mov	r11d,ebx
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r11d,edi
+	add	edx,r12d
+	add	r11d,r12d
+
+	lea	rbp,[4+rbp]
+	add	r11d,r14d
+	mov	r12d,DWORD[36+rsi]
+	mov	r13d,edx
+	mov	r14d,r11d
+	bswap	r12d
+	ror	r13d,14
+	mov	edi,r8d
+
+	xor	r13d,edx
+	ror	r14d,9
+	xor	edi,r9d
+
+	mov	DWORD[36+rsp],r12d
+	xor	r14d,r11d
+	and	edi,edx
+
+	ror	r13d,5
+	add	r12d,r10d
+	xor	edi,r9d
+
+	ror	r14d,11
+	xor	r13d,edx
+	add	r12d,edi
+
+	mov	edi,r11d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r11d
+
+	xor	edi,eax
+	ror	r13d,6
+	mov	r10d,eax
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r10d,r15d
+	add	ecx,r12d
+	add	r10d,r12d
+
+	lea	rbp,[4+rbp]
+	add	r10d,r14d
+	mov	r12d,DWORD[40+rsi]
+	mov	r13d,ecx
+	mov	r14d,r10d
+	bswap	r12d
+	ror	r13d,14
+	mov	r15d,edx
+
+	xor	r13d,ecx
+	ror	r14d,9
+	xor	r15d,r8d
+
+	mov	DWORD[40+rsp],r12d
+	xor	r14d,r10d
+	and	r15d,ecx
+
+	ror	r13d,5
+	add	r12d,r9d
+	xor	r15d,r8d
+
+	ror	r14d,11
+	xor	r13d,ecx
+	add	r12d,r15d
+
+	mov	r15d,r10d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r10d
+
+	xor	r15d,r11d
+	ror	r13d,6
+	mov	r9d,r11d
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r9d,edi
+	add	ebx,r12d
+	add	r9d,r12d
+
+	lea	rbp,[4+rbp]
+	add	r9d,r14d
+	mov	r12d,DWORD[44+rsi]
+	mov	r13d,ebx
+	mov	r14d,r9d
+	bswap	r12d
+	ror	r13d,14
+	mov	edi,ecx
+
+	xor	r13d,ebx
+	ror	r14d,9
+	xor	edi,edx
+
+	mov	DWORD[44+rsp],r12d
+	xor	r14d,r9d
+	and	edi,ebx
+
+	ror	r13d,5
+	add	r12d,r8d
+	xor	edi,edx
+
+	ror	r14d,11
+	xor	r13d,ebx
+	add	r12d,edi
+
+	mov	edi,r9d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r9d
+
+	xor	edi,r10d
+	ror	r13d,6
+	mov	r8d,r10d
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r8d,r15d
+	add	eax,r12d
+	add	r8d,r12d
+
+	lea	rbp,[20+rbp]
+	add	r8d,r14d
+	mov	r12d,DWORD[48+rsi]
+	mov	r13d,eax
+	mov	r14d,r8d
+	bswap	r12d
+	ror	r13d,14
+	mov	r15d,ebx
+
+	xor	r13d,eax
+	ror	r14d,9
+	xor	r15d,ecx
+
+	mov	DWORD[48+rsp],r12d
+	xor	r14d,r8d
+	and	r15d,eax
+
+	ror	r13d,5
+	add	r12d,edx
+	xor	r15d,ecx
+
+	ror	r14d,11
+	xor	r13d,eax
+	add	r12d,r15d
+
+	mov	r15d,r8d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r8d
+
+	xor	r15d,r9d
+	ror	r13d,6
+	mov	edx,r9d
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	edx,edi
+	add	r11d,r12d
+	add	edx,r12d
+
+	lea	rbp,[4+rbp]
+	add	edx,r14d
+	mov	r12d,DWORD[52+rsi]
+	mov	r13d,r11d
+	mov	r14d,edx
+	bswap	r12d
+	ror	r13d,14
+	mov	edi,eax
+
+	xor	r13d,r11d
+	ror	r14d,9
+	xor	edi,ebx
+
+	mov	DWORD[52+rsp],r12d
+	xor	r14d,edx
+	and	edi,r11d
+
+	ror	r13d,5
+	add	r12d,ecx
+	xor	edi,ebx
+
+	ror	r14d,11
+	xor	r13d,r11d
+	add	r12d,edi
+
+	mov	edi,edx
+	add	r12d,DWORD[rbp]
+	xor	r14d,edx
+
+	xor	edi,r8d
+	ror	r13d,6
+	mov	ecx,r8d
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	ecx,r15d
+	add	r10d,r12d
+	add	ecx,r12d
+
+	lea	rbp,[4+rbp]
+	add	ecx,r14d
+	mov	r12d,DWORD[56+rsi]
+	mov	r13d,r10d
+	mov	r14d,ecx
+	bswap	r12d
+	ror	r13d,14
+	mov	r15d,r11d
+
+	xor	r13d,r10d
+	ror	r14d,9
+	xor	r15d,eax
+
+	mov	DWORD[56+rsp],r12d
+	xor	r14d,ecx
+	and	r15d,r10d
+
+	ror	r13d,5
+	add	r12d,ebx
+	xor	r15d,eax
+
+	ror	r14d,11
+	xor	r13d,r10d
+	add	r12d,r15d
+
+	mov	r15d,ecx
+	add	r12d,DWORD[rbp]
+	xor	r14d,ecx
+
+	xor	r15d,edx
+	ror	r13d,6
+	mov	ebx,edx
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	ebx,edi
+	add	r9d,r12d
+	add	ebx,r12d
+
+	lea	rbp,[4+rbp]
+	add	ebx,r14d
+	mov	r12d,DWORD[60+rsi]
+	mov	r13d,r9d
+	mov	r14d,ebx
+	bswap	r12d
+	ror	r13d,14
+	mov	edi,r10d
+
+	xor	r13d,r9d
+	ror	r14d,9
+	xor	edi,r11d
+
+	mov	DWORD[60+rsp],r12d
+	xor	r14d,ebx
+	and	edi,r9d
+
+	ror	r13d,5
+	add	r12d,eax
+	xor	edi,r11d
+
+	ror	r14d,11
+	xor	r13d,r9d
+	add	r12d,edi
+
+	mov	edi,ebx
+	add	r12d,DWORD[rbp]
+	xor	r14d,ebx
+
+	xor	edi,ecx
+	ror	r13d,6
+	mov	eax,ecx
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	eax,r15d
+	add	r8d,r12d
+	add	eax,r12d
+
+	lea	rbp,[20+rbp]
+	jmp	NEAR $L$rounds_16_xx
+ALIGN	16
+$L$rounds_16_xx:
+	mov	r13d,DWORD[4+rsp]
+	mov	r15d,DWORD[56+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	eax,r14d
+	mov	r14d,r15d
+	ror	r15d,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	r15d,r14d
+	shr	r14d,10
+
+	ror	r15d,17
+	xor	r12d,r13d
+	xor	r15d,r14d
+	add	r12d,DWORD[36+rsp]
+
+	add	r12d,DWORD[rsp]
+	mov	r13d,r8d
+	add	r12d,r15d
+	mov	r14d,eax
+	ror	r13d,14
+	mov	r15d,r9d
+
+	xor	r13d,r8d
+	ror	r14d,9
+	xor	r15d,r10d
+
+	mov	DWORD[rsp],r12d
+	xor	r14d,eax
+	and	r15d,r8d
+
+	ror	r13d,5
+	add	r12d,r11d
+	xor	r15d,r10d
+
+	ror	r14d,11
+	xor	r13d,r8d
+	add	r12d,r15d
+
+	mov	r15d,eax
+	add	r12d,DWORD[rbp]
+	xor	r14d,eax
+
+	xor	r15d,ebx
+	ror	r13d,6
+	mov	r11d,ebx
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r11d,edi
+	add	edx,r12d
+	add	r11d,r12d
+
+	lea	rbp,[4+rbp]
+	mov	r13d,DWORD[8+rsp]
+	mov	edi,DWORD[60+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	r11d,r14d
+	mov	r14d,edi
+	ror	edi,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	edi,r14d
+	shr	r14d,10
+
+	ror	edi,17
+	xor	r12d,r13d
+	xor	edi,r14d
+	add	r12d,DWORD[40+rsp]
+
+	add	r12d,DWORD[4+rsp]
+	mov	r13d,edx
+	add	r12d,edi
+	mov	r14d,r11d
+	ror	r13d,14
+	mov	edi,r8d
+
+	xor	r13d,edx
+	ror	r14d,9
+	xor	edi,r9d
+
+	mov	DWORD[4+rsp],r12d
+	xor	r14d,r11d
+	and	edi,edx
+
+	ror	r13d,5
+	add	r12d,r10d
+	xor	edi,r9d
+
+	ror	r14d,11
+	xor	r13d,edx
+	add	r12d,edi
+
+	mov	edi,r11d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r11d
+
+	xor	edi,eax
+	ror	r13d,6
+	mov	r10d,eax
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r10d,r15d
+	add	ecx,r12d
+	add	r10d,r12d
+
+	lea	rbp,[4+rbp]
+	mov	r13d,DWORD[12+rsp]
+	mov	r15d,DWORD[rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	r10d,r14d
+	mov	r14d,r15d
+	ror	r15d,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	r15d,r14d
+	shr	r14d,10
+
+	ror	r15d,17
+	xor	r12d,r13d
+	xor	r15d,r14d
+	add	r12d,DWORD[44+rsp]
+
+	add	r12d,DWORD[8+rsp]
+	mov	r13d,ecx
+	add	r12d,r15d
+	mov	r14d,r10d
+	ror	r13d,14
+	mov	r15d,edx
+
+	xor	r13d,ecx
+	ror	r14d,9
+	xor	r15d,r8d
+
+	mov	DWORD[8+rsp],r12d
+	xor	r14d,r10d
+	and	r15d,ecx
+
+	ror	r13d,5
+	add	r12d,r9d
+	xor	r15d,r8d
+
+	ror	r14d,11
+	xor	r13d,ecx
+	add	r12d,r15d
+
+	mov	r15d,r10d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r10d
+
+	xor	r15d,r11d
+	ror	r13d,6
+	mov	r9d,r11d
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r9d,edi
+	add	ebx,r12d
+	add	r9d,r12d
+
+	lea	rbp,[4+rbp]
+	mov	r13d,DWORD[16+rsp]
+	mov	edi,DWORD[4+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	r9d,r14d
+	mov	r14d,edi
+	ror	edi,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	edi,r14d
+	shr	r14d,10
+
+	ror	edi,17
+	xor	r12d,r13d
+	xor	edi,r14d
+	add	r12d,DWORD[48+rsp]
+
+	add	r12d,DWORD[12+rsp]
+	mov	r13d,ebx
+	add	r12d,edi
+	mov	r14d,r9d
+	ror	r13d,14
+	mov	edi,ecx
+
+	xor	r13d,ebx
+	ror	r14d,9
+	xor	edi,edx
+
+	mov	DWORD[12+rsp],r12d
+	xor	r14d,r9d
+	and	edi,ebx
+
+	ror	r13d,5
+	add	r12d,r8d
+	xor	edi,edx
+
+	ror	r14d,11
+	xor	r13d,ebx
+	add	r12d,edi
+
+	mov	edi,r9d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r9d
+
+	xor	edi,r10d
+	ror	r13d,6
+	mov	r8d,r10d
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r8d,r15d
+	add	eax,r12d
+	add	r8d,r12d
+
+	lea	rbp,[20+rbp]
+	mov	r13d,DWORD[20+rsp]
+	mov	r15d,DWORD[8+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	r8d,r14d
+	mov	r14d,r15d
+	ror	r15d,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	r15d,r14d
+	shr	r14d,10
+
+	ror	r15d,17
+	xor	r12d,r13d
+	xor	r15d,r14d
+	add	r12d,DWORD[52+rsp]
+
+	add	r12d,DWORD[16+rsp]
+	mov	r13d,eax
+	add	r12d,r15d
+	mov	r14d,r8d
+	ror	r13d,14
+	mov	r15d,ebx
+
+	xor	r13d,eax
+	ror	r14d,9
+	xor	r15d,ecx
+
+	mov	DWORD[16+rsp],r12d
+	xor	r14d,r8d
+	and	r15d,eax
+
+	ror	r13d,5
+	add	r12d,edx
+	xor	r15d,ecx
+
+	ror	r14d,11
+	xor	r13d,eax
+	add	r12d,r15d
+
+	mov	r15d,r8d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r8d
+
+	xor	r15d,r9d
+	ror	r13d,6
+	mov	edx,r9d
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	edx,edi
+	add	r11d,r12d
+	add	edx,r12d
+
+	lea	rbp,[4+rbp]
+	mov	r13d,DWORD[24+rsp]
+	mov	edi,DWORD[12+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	edx,r14d
+	mov	r14d,edi
+	ror	edi,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	edi,r14d
+	shr	r14d,10
+
+	ror	edi,17
+	xor	r12d,r13d
+	xor	edi,r14d
+	add	r12d,DWORD[56+rsp]
+
+	add	r12d,DWORD[20+rsp]
+	mov	r13d,r11d
+	add	r12d,edi
+	mov	r14d,edx
+	ror	r13d,14
+	mov	edi,eax
+
+	xor	r13d,r11d
+	ror	r14d,9
+	xor	edi,ebx
+
+	mov	DWORD[20+rsp],r12d
+	xor	r14d,edx
+	and	edi,r11d
+
+	ror	r13d,5
+	add	r12d,ecx
+	xor	edi,ebx
+
+	ror	r14d,11
+	xor	r13d,r11d
+	add	r12d,edi
+
+	mov	edi,edx
+	add	r12d,DWORD[rbp]
+	xor	r14d,edx
+
+	xor	edi,r8d
+	ror	r13d,6
+	mov	ecx,r8d
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	ecx,r15d
+	add	r10d,r12d
+	add	ecx,r12d
+
+	lea	rbp,[4+rbp]
+	mov	r13d,DWORD[28+rsp]
+	mov	r15d,DWORD[16+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	ecx,r14d
+	mov	r14d,r15d
+	ror	r15d,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	r15d,r14d
+	shr	r14d,10
+
+	ror	r15d,17
+	xor	r12d,r13d
+	xor	r15d,r14d
+	add	r12d,DWORD[60+rsp]
+
+	add	r12d,DWORD[24+rsp]
+	mov	r13d,r10d
+	add	r12d,r15d
+	mov	r14d,ecx
+	ror	r13d,14
+	mov	r15d,r11d
+
+	xor	r13d,r10d
+	ror	r14d,9
+	xor	r15d,eax
+
+	mov	DWORD[24+rsp],r12d
+	xor	r14d,ecx
+	and	r15d,r10d
+
+	ror	r13d,5
+	add	r12d,ebx
+	xor	r15d,eax
+
+	ror	r14d,11
+	xor	r13d,r10d
+	add	r12d,r15d
+
+	mov	r15d,ecx
+	add	r12d,DWORD[rbp]
+	xor	r14d,ecx
+
+	xor	r15d,edx
+	ror	r13d,6
+	mov	ebx,edx
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	ebx,edi
+	add	r9d,r12d
+	add	ebx,r12d
+
+	lea	rbp,[4+rbp]
+	mov	r13d,DWORD[32+rsp]
+	mov	edi,DWORD[20+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	ebx,r14d
+	mov	r14d,edi
+	ror	edi,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	edi,r14d
+	shr	r14d,10
+
+	ror	edi,17
+	xor	r12d,r13d
+	xor	edi,r14d
+	add	r12d,DWORD[rsp]
+
+	add	r12d,DWORD[28+rsp]
+	mov	r13d,r9d
+	add	r12d,edi
+	mov	r14d,ebx
+	ror	r13d,14
+	mov	edi,r10d
+
+	xor	r13d,r9d
+	ror	r14d,9
+	xor	edi,r11d
+
+	mov	DWORD[28+rsp],r12d
+	xor	r14d,ebx
+	and	edi,r9d
+
+	ror	r13d,5
+	add	r12d,eax
+	xor	edi,r11d
+
+	ror	r14d,11
+	xor	r13d,r9d
+	add	r12d,edi
+
+	mov	edi,ebx
+	add	r12d,DWORD[rbp]
+	xor	r14d,ebx
+
+	xor	edi,ecx
+	ror	r13d,6
+	mov	eax,ecx
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	eax,r15d
+	add	r8d,r12d
+	add	eax,r12d
+
+	lea	rbp,[20+rbp]
+	mov	r13d,DWORD[36+rsp]
+	mov	r15d,DWORD[24+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	eax,r14d
+	mov	r14d,r15d
+	ror	r15d,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	r15d,r14d
+	shr	r14d,10
+
+	ror	r15d,17
+	xor	r12d,r13d
+	xor	r15d,r14d
+	add	r12d,DWORD[4+rsp]
+
+	add	r12d,DWORD[32+rsp]
+	mov	r13d,r8d
+	add	r12d,r15d
+	mov	r14d,eax
+	ror	r13d,14
+	mov	r15d,r9d
+
+	xor	r13d,r8d
+	ror	r14d,9
+	xor	r15d,r10d
+
+	mov	DWORD[32+rsp],r12d
+	xor	r14d,eax
+	and	r15d,r8d
+
+	ror	r13d,5
+	add	r12d,r11d
+	xor	r15d,r10d
+
+	ror	r14d,11
+	xor	r13d,r8d
+	add	r12d,r15d
+
+	mov	r15d,eax
+	add	r12d,DWORD[rbp]
+	xor	r14d,eax
+
+	xor	r15d,ebx
+	ror	r13d,6
+	mov	r11d,ebx
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r11d,edi
+	add	edx,r12d
+	add	r11d,r12d
+
+	lea	rbp,[4+rbp]
+	mov	r13d,DWORD[40+rsp]
+	mov	edi,DWORD[28+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	r11d,r14d
+	mov	r14d,edi
+	ror	edi,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	edi,r14d
+	shr	r14d,10
+
+	ror	edi,17
+	xor	r12d,r13d
+	xor	edi,r14d
+	add	r12d,DWORD[8+rsp]
+
+	add	r12d,DWORD[36+rsp]
+	mov	r13d,edx
+	add	r12d,edi
+	mov	r14d,r11d
+	ror	r13d,14
+	mov	edi,r8d
+
+	xor	r13d,edx
+	ror	r14d,9
+	xor	edi,r9d
+
+	mov	DWORD[36+rsp],r12d
+	xor	r14d,r11d
+	and	edi,edx
+
+	ror	r13d,5
+	add	r12d,r10d
+	xor	edi,r9d
+
+	ror	r14d,11
+	xor	r13d,edx
+	add	r12d,edi
+
+	mov	edi,r11d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r11d
+
+	xor	edi,eax
+	ror	r13d,6
+	mov	r10d,eax
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r10d,r15d
+	add	ecx,r12d
+	add	r10d,r12d
+
+	lea	rbp,[4+rbp]
+	mov	r13d,DWORD[44+rsp]
+	mov	r15d,DWORD[32+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	r10d,r14d
+	mov	r14d,r15d
+	ror	r15d,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	r15d,r14d
+	shr	r14d,10
+
+	ror	r15d,17
+	xor	r12d,r13d
+	xor	r15d,r14d
+	add	r12d,DWORD[12+rsp]
+
+	add	r12d,DWORD[40+rsp]
+	mov	r13d,ecx
+	add	r12d,r15d
+	mov	r14d,r10d
+	ror	r13d,14
+	mov	r15d,edx
+
+	xor	r13d,ecx
+	ror	r14d,9
+	xor	r15d,r8d
+
+	mov	DWORD[40+rsp],r12d
+	xor	r14d,r10d
+	and	r15d,ecx
+
+	ror	r13d,5
+	add	r12d,r9d
+	xor	r15d,r8d
+
+	ror	r14d,11
+	xor	r13d,ecx
+	add	r12d,r15d
+
+	mov	r15d,r10d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r10d
+
+	xor	r15d,r11d
+	ror	r13d,6
+	mov	r9d,r11d
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r9d,edi
+	add	ebx,r12d
+	add	r9d,r12d
+
+	lea	rbp,[4+rbp]
+	mov	r13d,DWORD[48+rsp]
+	mov	edi,DWORD[36+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	r9d,r14d
+	mov	r14d,edi
+	ror	edi,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	edi,r14d
+	shr	r14d,10
+
+	ror	edi,17
+	xor	r12d,r13d
+	xor	edi,r14d
+	add	r12d,DWORD[16+rsp]
+
+	add	r12d,DWORD[44+rsp]
+	mov	r13d,ebx
+	add	r12d,edi
+	mov	r14d,r9d
+	ror	r13d,14
+	mov	edi,ecx
+
+	xor	r13d,ebx
+	ror	r14d,9
+	xor	edi,edx
+
+	mov	DWORD[44+rsp],r12d
+	xor	r14d,r9d
+	and	edi,ebx
+
+	ror	r13d,5
+	add	r12d,r8d
+	xor	edi,edx
+
+	ror	r14d,11
+	xor	r13d,ebx
+	add	r12d,edi
+
+	mov	edi,r9d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r9d
+
+	xor	edi,r10d
+	ror	r13d,6
+	mov	r8d,r10d
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r8d,r15d
+	add	eax,r12d
+	add	r8d,r12d
+
+	lea	rbp,[20+rbp]
+	mov	r13d,DWORD[52+rsp]
+	mov	r15d,DWORD[40+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	r8d,r14d
+	mov	r14d,r15d
+	ror	r15d,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	r15d,r14d
+	shr	r14d,10
+
+	ror	r15d,17
+	xor	r12d,r13d
+	xor	r15d,r14d
+	add	r12d,DWORD[20+rsp]
+
+	add	r12d,DWORD[48+rsp]
+	mov	r13d,eax
+	add	r12d,r15d
+	mov	r14d,r8d
+	ror	r13d,14
+	mov	r15d,ebx
+
+	xor	r13d,eax
+	ror	r14d,9
+	xor	r15d,ecx
+
+	mov	DWORD[48+rsp],r12d
+	xor	r14d,r8d
+	and	r15d,eax
+
+	ror	r13d,5
+	add	r12d,edx
+	xor	r15d,ecx
+
+	ror	r14d,11
+	xor	r13d,eax
+	add	r12d,r15d
+
+	mov	r15d,r8d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r8d
+
+	xor	r15d,r9d
+	ror	r13d,6
+	mov	edx,r9d
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	edx,edi
+	add	r11d,r12d
+	add	edx,r12d
+
+	lea	rbp,[4+rbp]
+	mov	r13d,DWORD[56+rsp]
+	mov	edi,DWORD[44+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	edx,r14d
+	mov	r14d,edi
+	ror	edi,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	edi,r14d
+	shr	r14d,10
+
+	ror	edi,17
+	xor	r12d,r13d
+	xor	edi,r14d
+	add	r12d,DWORD[24+rsp]
+
+	add	r12d,DWORD[52+rsp]
+	mov	r13d,r11d
+	add	r12d,edi
+	mov	r14d,edx
+	ror	r13d,14
+	mov	edi,eax
+
+	xor	r13d,r11d
+	ror	r14d,9
+	xor	edi,ebx
+
+	mov	DWORD[52+rsp],r12d
+	xor	r14d,edx
+	and	edi,r11d
+
+	ror	r13d,5
+	add	r12d,ecx
+	xor	edi,ebx
+
+	ror	r14d,11
+	xor	r13d,r11d
+	add	r12d,edi
+
+	mov	edi,edx
+	add	r12d,DWORD[rbp]
+	xor	r14d,edx
+
+	xor	edi,r8d
+	ror	r13d,6
+	mov	ecx,r8d
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	ecx,r15d
+	add	r10d,r12d
+	add	ecx,r12d
+
+	lea	rbp,[4+rbp]
+	mov	r13d,DWORD[60+rsp]
+	mov	r15d,DWORD[48+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	ecx,r14d
+	mov	r14d,r15d
+	ror	r15d,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	r15d,r14d
+	shr	r14d,10
+
+	ror	r15d,17
+	xor	r12d,r13d
+	xor	r15d,r14d
+	add	r12d,DWORD[28+rsp]
+
+	add	r12d,DWORD[56+rsp]
+	mov	r13d,r10d
+	add	r12d,r15d
+	mov	r14d,ecx
+	ror	r13d,14
+	mov	r15d,r11d
+
+	xor	r13d,r10d
+	ror	r14d,9
+	xor	r15d,eax
+
+	mov	DWORD[56+rsp],r12d
+	xor	r14d,ecx
+	and	r15d,r10d
+
+	ror	r13d,5
+	add	r12d,ebx
+	xor	r15d,eax
+
+	ror	r14d,11
+	xor	r13d,r10d
+	add	r12d,r15d
+
+	mov	r15d,ecx
+	add	r12d,DWORD[rbp]
+	xor	r14d,ecx
+
+	xor	r15d,edx
+	ror	r13d,6
+	mov	ebx,edx
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	ebx,edi
+	add	r9d,r12d
+	add	ebx,r12d
+
+	lea	rbp,[4+rbp]
+	mov	r13d,DWORD[rsp]
+	mov	edi,DWORD[52+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	ebx,r14d
+	mov	r14d,edi
+	ror	edi,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	edi,r14d
+	shr	r14d,10
+
+	ror	edi,17
+	xor	r12d,r13d
+	xor	edi,r14d
+	add	r12d,DWORD[32+rsp]
+
+	add	r12d,DWORD[60+rsp]
+	mov	r13d,r9d
+	add	r12d,edi
+	mov	r14d,ebx
+	ror	r13d,14
+	mov	edi,r10d
+
+	xor	r13d,r9d
+	ror	r14d,9
+	xor	edi,r11d
+
+	mov	DWORD[60+rsp],r12d
+	xor	r14d,ebx
+	and	edi,r9d
+
+	ror	r13d,5
+	add	r12d,eax
+	xor	edi,r11d
+
+	ror	r14d,11
+	xor	r13d,r9d
+	add	r12d,edi
+
+	mov	edi,ebx
+	add	r12d,DWORD[rbp]
+	xor	r14d,ebx
+
+	xor	edi,ecx
+	ror	r13d,6
+	mov	eax,ecx
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	eax,r15d
+	add	r8d,r12d
+	add	eax,r12d
+
+	lea	rbp,[20+rbp]
+	cmp	BYTE[3+rbp],0
+	jnz	NEAR $L$rounds_16_xx
+
+	mov	rdi,QWORD[((64+0))+rsp]
+	add	eax,r14d
+	lea	rsi,[64+rsi]
+
+	add	eax,DWORD[rdi]
+	add	ebx,DWORD[4+rdi]
+	add	ecx,DWORD[8+rdi]
+	add	edx,DWORD[12+rdi]
+	add	r8d,DWORD[16+rdi]
+	add	r9d,DWORD[20+rdi]
+	add	r10d,DWORD[24+rdi]
+	add	r11d,DWORD[28+rdi]
+
+	cmp	rsi,QWORD[((64+16))+rsp]
+
+	mov	DWORD[rdi],eax
+	mov	DWORD[4+rdi],ebx
+	mov	DWORD[8+rdi],ecx
+	mov	DWORD[12+rdi],edx
+	mov	DWORD[16+rdi],r8d
+	mov	DWORD[20+rdi],r9d
+	mov	DWORD[24+rdi],r10d
+	mov	DWORD[28+rdi],r11d
+	jb	NEAR $L$loop
+
+	mov	rsi,QWORD[88+rsp]
+
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_sha256_block_data_order_nohw:
+section	.rdata rdata align=8
+ALIGN	64
+
+K256:
+	DD	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	DD	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	DD	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	DD	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	DD	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	DD	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	DD	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	DD	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	DD	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	DD	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	DD	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	DD	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	DD	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	DD	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	DD	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	DD	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	DD	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	DD	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	DD	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	DD	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	DD	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	DD	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	DD	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	DD	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	DD	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	DD	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	DD	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	DD	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	DD	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	DD	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	DD	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+	DD	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+	DD	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+	DD	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+	DD	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+	DD	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+	DD	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+	DD	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+	DB	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
+	DB	110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54
+	DB	52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
+	DB	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
+	DB	111,114,103,62,0
+section	.text
+
+global	sha256_block_data_order_hw
+
+ALIGN	64
+sha256_block_data_order_hw:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_sha256_block_data_order_hw:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+_CET_ENDBR
+	lea	rsp,[((-88))+rsp]
+	movaps	XMMWORD[(-8-80)+rax],xmm6
+	movaps	XMMWORD[(-8-64)+rax],xmm7
+	movaps	XMMWORD[(-8-48)+rax],xmm8
+	movaps	XMMWORD[(-8-32)+rax],xmm9
+	movaps	XMMWORD[(-8-16)+rax],xmm10
+$L$prologue_shaext:
+	lea	rcx,[((K256+128))]
+	movdqu	xmm1,XMMWORD[rdi]
+	movdqu	xmm2,XMMWORD[16+rdi]
+	movdqa	xmm7,XMMWORD[((512-128))+rcx]
+
+	pshufd	xmm0,xmm1,0x1b
+	pshufd	xmm1,xmm1,0xb1
+	pshufd	xmm2,xmm2,0x1b
+	movdqa	xmm8,xmm7
+DB	102,15,58,15,202,8
+	punpcklqdq	xmm2,xmm0
+	jmp	NEAR $L$oop_shaext
+
+ALIGN	16
+$L$oop_shaext:
+	movdqu	xmm3,XMMWORD[rsi]
+	movdqu	xmm4,XMMWORD[16+rsi]
+	movdqu	xmm5,XMMWORD[32+rsi]
+DB	102,15,56,0,223
+	movdqu	xmm6,XMMWORD[48+rsi]
+
+	movdqa	xmm0,XMMWORD[((0-128))+rcx]
+	paddd	xmm0,xmm3
+DB	102,15,56,0,231
+	movdqa	xmm10,xmm2
+	DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	nop
+	movdqa	xmm9,xmm1
+	DB	15,56,203,202
+
+	movdqa	xmm0,XMMWORD[((32-128))+rcx]
+	paddd	xmm0,xmm4
+DB	102,15,56,0,239
+	DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	lea	rsi,[64+rsi]
+	DB	15,56,204,220
+	DB	15,56,203,202
+
+	movdqa	xmm0,XMMWORD[((64-128))+rcx]
+	paddd	xmm0,xmm5
+DB	102,15,56,0,247
+	DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	movdqa	xmm7,xmm6
+DB	102,15,58,15,253,4
+	nop
+	paddd	xmm3,xmm7
+	DB	15,56,204,229
+	DB	15,56,203,202
+
+	movdqa	xmm0,XMMWORD[((96-128))+rcx]
+	paddd	xmm0,xmm6
+	DB	15,56,205,222
+	DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	movdqa	xmm7,xmm3
+DB	102,15,58,15,254,4
+	nop
+	paddd	xmm4,xmm7
+	DB	15,56,204,238
+	DB	15,56,203,202
+	movdqa	xmm0,XMMWORD[((128-128))+rcx]
+	paddd	xmm0,xmm3
+	DB	15,56,205,227
+	DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	movdqa	xmm7,xmm4
+DB	102,15,58,15,251,4
+	nop
+	paddd	xmm5,xmm7
+	DB	15,56,204,243
+	DB	15,56,203,202
+	movdqa	xmm0,XMMWORD[((160-128))+rcx]
+	paddd	xmm0,xmm4
+	DB	15,56,205,236
+	DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	movdqa	xmm7,xmm5
+DB	102,15,58,15,252,4
+	nop
+	paddd	xmm6,xmm7
+	DB	15,56,204,220
+	DB	15,56,203,202
+	movdqa	xmm0,XMMWORD[((192-128))+rcx]
+	paddd	xmm0,xmm5
+	DB	15,56,205,245
+	DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	movdqa	xmm7,xmm6
+DB	102,15,58,15,253,4
+	nop
+	paddd	xmm3,xmm7
+	DB	15,56,204,229
+	DB	15,56,203,202
+	movdqa	xmm0,XMMWORD[((224-128))+rcx]
+	paddd	xmm0,xmm6
+	DB	15,56,205,222
+	DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	movdqa	xmm7,xmm3
+DB	102,15,58,15,254,4
+	nop
+	paddd	xmm4,xmm7
+	DB	15,56,204,238
+	DB	15,56,203,202
+	movdqa	xmm0,XMMWORD[((256-128))+rcx]
+	paddd	xmm0,xmm3
+	DB	15,56,205,227
+	DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	movdqa	xmm7,xmm4
+DB	102,15,58,15,251,4
+	nop
+	paddd	xmm5,xmm7
+	DB	15,56,204,243
+	DB	15,56,203,202
+	movdqa	xmm0,XMMWORD[((288-128))+rcx]
+	paddd	xmm0,xmm4
+	DB	15,56,205,236
+	DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	movdqa	xmm7,xmm5
+DB	102,15,58,15,252,4
+	nop
+	paddd	xmm6,xmm7
+	DB	15,56,204,220
+	DB	15,56,203,202
+	movdqa	xmm0,XMMWORD[((320-128))+rcx]
+	paddd	xmm0,xmm5
+	DB	15,56,205,245
+	DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	movdqa	xmm7,xmm6
+DB	102,15,58,15,253,4
+	nop
+	paddd	xmm3,xmm7
+	DB	15,56,204,229
+	DB	15,56,203,202
+	movdqa	xmm0,XMMWORD[((352-128))+rcx]
+	paddd	xmm0,xmm6
+	DB	15,56,205,222
+	DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	movdqa	xmm7,xmm3
+DB	102,15,58,15,254,4
+	nop
+	paddd	xmm4,xmm7
+	DB	15,56,204,238
+	DB	15,56,203,202
+	movdqa	xmm0,XMMWORD[((384-128))+rcx]
+	paddd	xmm0,xmm3
+	DB	15,56,205,227
+	DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	movdqa	xmm7,xmm4
+DB	102,15,58,15,251,4
+	nop
+	paddd	xmm5,xmm7
+	DB	15,56,204,243
+	DB	15,56,203,202
+	movdqa	xmm0,XMMWORD[((416-128))+rcx]
+	paddd	xmm0,xmm4
+	DB	15,56,205,236
+	DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	movdqa	xmm7,xmm5
+DB	102,15,58,15,252,4
+	DB	15,56,203,202
+	paddd	xmm6,xmm7
+
+	movdqa	xmm0,XMMWORD[((448-128))+rcx]
+	paddd	xmm0,xmm5
+	DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	DB	15,56,205,245
+	movdqa	xmm7,xmm8
+	DB	15,56,203,202
+
+	movdqa	xmm0,XMMWORD[((480-128))+rcx]
+	paddd	xmm0,xmm6
+	nop
+	DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	dec	rdx
+	nop
+	DB	15,56,203,202
+
+	paddd	xmm2,xmm10
+	paddd	xmm1,xmm9
+	jnz	NEAR $L$oop_shaext
+
+	pshufd	xmm2,xmm2,0xb1
+	pshufd	xmm7,xmm1,0x1b
+	pshufd	xmm1,xmm1,0xb1
+	punpckhqdq	xmm1,xmm2
+DB	102,15,58,15,215,8
+
+	movdqu	XMMWORD[rdi],xmm1
+	movdqu	XMMWORD[16+rdi],xmm2
+	movaps	xmm6,XMMWORD[((-8-80))+rax]
+	movaps	xmm7,XMMWORD[((-8-64))+rax]
+	movaps	xmm8,XMMWORD[((-8-48))+rax]
+	movaps	xmm9,XMMWORD[((-8-32))+rax]
+	movaps	xmm10,XMMWORD[((-8-16))+rax]
+	mov	rsp,rax
+$L$epilogue_shaext:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_sha256_block_data_order_hw:
+global	sha256_block_data_order_ssse3
+
+ALIGN	64
+sha256_block_data_order_ssse3:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_sha256_block_data_order_ssse3:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+_CET_ENDBR
+	mov	rax,rsp
+
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	shl	rdx,4
+	sub	rsp,160
+	lea	rdx,[rdx*4+rsi]
+	and	rsp,-64
+	mov	QWORD[((64+0))+rsp],rdi
+	mov	QWORD[((64+8))+rsp],rsi
+	mov	QWORD[((64+16))+rsp],rdx
+	mov	QWORD[88+rsp],rax
+
+	movaps	XMMWORD[(64+32)+rsp],xmm6
+	movaps	XMMWORD[(64+48)+rsp],xmm7
+	movaps	XMMWORD[(64+64)+rsp],xmm8
+	movaps	XMMWORD[(64+80)+rsp],xmm9
+$L$prologue_ssse3:
+
+	mov	eax,DWORD[rdi]
+	mov	ebx,DWORD[4+rdi]
+	mov	ecx,DWORD[8+rdi]
+	mov	edx,DWORD[12+rdi]
+	mov	r8d,DWORD[16+rdi]
+	mov	r9d,DWORD[20+rdi]
+	mov	r10d,DWORD[24+rdi]
+	mov	r11d,DWORD[28+rdi]
+
+
+	jmp	NEAR $L$loop_ssse3
+ALIGN	16
+$L$loop_ssse3:
+	movdqa	xmm7,XMMWORD[((K256+512))]
+	movdqu	xmm0,XMMWORD[rsi]
+	movdqu	xmm1,XMMWORD[16+rsi]
+	movdqu	xmm2,XMMWORD[32+rsi]
+DB	102,15,56,0,199
+	movdqu	xmm3,XMMWORD[48+rsi]
+	lea	rbp,[K256]
+DB	102,15,56,0,207
+	movdqa	xmm4,XMMWORD[rbp]
+	movdqa	xmm5,XMMWORD[32+rbp]
+DB	102,15,56,0,215
+	paddd	xmm4,xmm0
+	movdqa	xmm6,XMMWORD[64+rbp]
+DB	102,15,56,0,223
+	movdqa	xmm7,XMMWORD[96+rbp]
+	paddd	xmm5,xmm1
+	paddd	xmm6,xmm2
+	paddd	xmm7,xmm3
+	movdqa	XMMWORD[rsp],xmm4
+	mov	r14d,eax
+	movdqa	XMMWORD[16+rsp],xmm5
+	mov	edi,ebx
+	movdqa	XMMWORD[32+rsp],xmm6
+	xor	edi,ecx
+	movdqa	XMMWORD[48+rsp],xmm7
+	mov	r13d,r8d
+	jmp	NEAR $L$ssse3_00_47
+
+ALIGN	16
+$L$ssse3_00_47:
+	sub	rbp,-128
+	ror	r13d,14
+	movdqa	xmm4,xmm1
+	mov	eax,r14d
+	mov	r12d,r9d
+	movdqa	xmm7,xmm3
+	ror	r14d,9
+	xor	r13d,r8d
+	xor	r12d,r10d
+	ror	r13d,5
+	xor	r14d,eax
+DB	102,15,58,15,224,4
+	and	r12d,r8d
+	xor	r13d,r8d
+DB	102,15,58,15,250,4
+	add	r11d,DWORD[rsp]
+	mov	r15d,eax
+	xor	r12d,r10d
+	ror	r14d,11
+	movdqa	xmm5,xmm4
+	xor	r15d,ebx
+	add	r11d,r12d
+	movdqa	xmm6,xmm4
+	ror	r13d,6
+	and	edi,r15d
+	psrld	xmm4,3
+	xor	r14d,eax
+	add	r11d,r13d
+	xor	edi,ebx
+	paddd	xmm0,xmm7
+	ror	r14d,2
+	add	edx,r11d
+	psrld	xmm6,7
+	add	r11d,edi
+	mov	r13d,edx
+	pshufd	xmm7,xmm3,250
+	add	r14d,r11d
+	ror	r13d,14
+	pslld	xmm5,14
+	mov	r11d,r14d
+	mov	r12d,r8d
+	pxor	xmm4,xmm6
+	ror	r14d,9
+	xor	r13d,edx
+	xor	r12d,r9d
+	ror	r13d,5
+	psrld	xmm6,11
+	xor	r14d,r11d
+	pxor	xmm4,xmm5
+	and	r12d,edx
+	xor	r13d,edx
+	pslld	xmm5,11
+	add	r10d,DWORD[4+rsp]
+	mov	edi,r11d
+	pxor	xmm4,xmm6
+	xor	r12d,r9d
+	ror	r14d,11
+	movdqa	xmm6,xmm7
+	xor	edi,eax
+	add	r10d,r12d
+	pxor	xmm4,xmm5
+	ror	r13d,6
+	and	r15d,edi
+	xor	r14d,r11d
+	psrld	xmm7,10
+	add	r10d,r13d
+	xor	r15d,eax
+	paddd	xmm0,xmm4
+	ror	r14d,2
+	add	ecx,r10d
+	psrlq	xmm6,17
+	add	r10d,r15d
+	mov	r13d,ecx
+	add	r14d,r10d
+	pxor	xmm7,xmm6
+	ror	r13d,14
+	mov	r10d,r14d
+	mov	r12d,edx
+	ror	r14d,9
+	psrlq	xmm6,2
+	xor	r13d,ecx
+	xor	r12d,r8d
+	pxor	xmm7,xmm6
+	ror	r13d,5
+	xor	r14d,r10d
+	and	r12d,ecx
+	pshufd	xmm7,xmm7,128
+	xor	r13d,ecx
+	add	r9d,DWORD[8+rsp]
+	mov	r15d,r10d
+	psrldq	xmm7,8
+	xor	r12d,r8d
+	ror	r14d,11
+	xor	r15d,r11d
+	add	r9d,r12d
+	ror	r13d,6
+	paddd	xmm0,xmm7
+	and	edi,r15d
+	xor	r14d,r10d
+	add	r9d,r13d
+	pshufd	xmm7,xmm0,80
+	xor	edi,r11d
+	ror	r14d,2
+	add	ebx,r9d
+	movdqa	xmm6,xmm7
+	add	r9d,edi
+	mov	r13d,ebx
+	psrld	xmm7,10
+	add	r14d,r9d
+	ror	r13d,14
+	psrlq	xmm6,17
+	mov	r9d,r14d
+	mov	r12d,ecx
+	pxor	xmm7,xmm6
+	ror	r14d,9
+	xor	r13d,ebx
+	xor	r12d,edx
+	ror	r13d,5
+	xor	r14d,r9d
+	psrlq	xmm6,2
+	and	r12d,ebx
+	xor	r13d,ebx
+	add	r8d,DWORD[12+rsp]
+	pxor	xmm7,xmm6
+	mov	edi,r9d
+	xor	r12d,edx
+	ror	r14d,11
+	pshufd	xmm7,xmm7,8
+	xor	edi,r10d
+	add	r8d,r12d
+	movdqa	xmm6,XMMWORD[rbp]
+	ror	r13d,6
+	and	r15d,edi
+	pslldq	xmm7,8
+	xor	r14d,r9d
+	add	r8d,r13d
+	xor	r15d,r10d
+	paddd	xmm0,xmm7
+	ror	r14d,2
+	add	eax,r8d
+	add	r8d,r15d
+	paddd	xmm6,xmm0
+	mov	r13d,eax
+	add	r14d,r8d
+	movdqa	XMMWORD[rsp],xmm6
+	ror	r13d,14
+	movdqa	xmm4,xmm2
+	mov	r8d,r14d
+	mov	r12d,ebx
+	movdqa	xmm7,xmm0
+	ror	r14d,9
+	xor	r13d,eax
+	xor	r12d,ecx
+	ror	r13d,5
+	xor	r14d,r8d
+DB	102,15,58,15,225,4
+	and	r12d,eax
+	xor	r13d,eax
+DB	102,15,58,15,251,4
+	add	edx,DWORD[16+rsp]
+	mov	r15d,r8d
+	xor	r12d,ecx
+	ror	r14d,11
+	movdqa	xmm5,xmm4
+	xor	r15d,r9d
+	add	edx,r12d
+	movdqa	xmm6,xmm4
+	ror	r13d,6
+	and	edi,r15d
+	psrld	xmm4,3
+	xor	r14d,r8d
+	add	edx,r13d
+	xor	edi,r9d
+	paddd	xmm1,xmm7
+	ror	r14d,2
+	add	r11d,edx
+	psrld	xmm6,7
+	add	edx,edi
+	mov	r13d,r11d
+	pshufd	xmm7,xmm0,250
+	add	r14d,edx
+	ror	r13d,14
+	pslld	xmm5,14
+	mov	edx,r14d
+	mov	r12d,eax
+	pxor	xmm4,xmm6
+	ror	r14d,9
+	xor	r13d,r11d
+	xor	r12d,ebx
+	ror	r13d,5
+	psrld	xmm6,11
+	xor	r14d,edx
+	pxor	xmm4,xmm5
+	and	r12d,r11d
+	xor	r13d,r11d
+	pslld	xmm5,11
+	add	ecx,DWORD[20+rsp]
+	mov	edi,edx
+	pxor	xmm4,xmm6
+	xor	r12d,ebx
+	ror	r14d,11
+	movdqa	xmm6,xmm7
+	xor	edi,r8d
+	add	ecx,r12d
+	pxor	xmm4,xmm5
+	ror	r13d,6
+	and	r15d,edi
+	xor	r14d,edx
+	psrld	xmm7,10
+	add	ecx,r13d
+	xor	r15d,r8d
+	paddd	xmm1,xmm4
+	ror	r14d,2
+	add	r10d,ecx
+	psrlq	xmm6,17
+	add	ecx,r15d
+	mov	r13d,r10d
+	add	r14d,ecx
+	pxor	xmm7,xmm6
+	ror	r13d,14
+	mov	ecx,r14d
+	mov	r12d,r11d
+	ror	r14d,9
+	psrlq	xmm6,2
+	xor	r13d,r10d
+	xor	r12d,eax
+	pxor	xmm7,xmm6
+	ror	r13d,5
+	xor	r14d,ecx
+	and	r12d,r10d
+	pshufd	xmm7,xmm7,128
+	xor	r13d,r10d
+	add	ebx,DWORD[24+rsp]
+	mov	r15d,ecx
+	psrldq	xmm7,8
+	xor	r12d,eax
+	ror	r14d,11
+	xor	r15d,edx
+	add	ebx,r12d
+	ror	r13d,6
+	paddd	xmm1,xmm7
+	and	edi,r15d
+	xor	r14d,ecx
+	add	ebx,r13d
+	pshufd	xmm7,xmm1,80
+	xor	edi,edx
+	ror	r14d,2
+	add	r9d,ebx
+	movdqa	xmm6,xmm7
+	add	ebx,edi
+	mov	r13d,r9d
+	psrld	xmm7,10
+	add	r14d,ebx
+	ror	r13d,14
+	psrlq	xmm6,17
+	mov	ebx,r14d
+	mov	r12d,r10d
+	pxor	xmm7,xmm6
+	ror	r14d,9
+	xor	r13d,r9d
+	xor	r12d,r11d
+	ror	r13d,5
+	xor	r14d,ebx
+	psrlq	xmm6,2
+	and	r12d,r9d
+	xor	r13d,r9d
+	add	eax,DWORD[28+rsp]
+	pxor	xmm7,xmm6
+	mov	edi,ebx
+	xor	r12d,r11d
+	ror	r14d,11
+	pshufd	xmm7,xmm7,8
+	xor	edi,ecx
+	add	eax,r12d
+	movdqa	xmm6,XMMWORD[32+rbp]
+	ror	r13d,6
+	and	r15d,edi
+	pslldq	xmm7,8
+	xor	r14d,ebx
+	add	eax,r13d
+	xor	r15d,ecx
+	paddd	xmm1,xmm7
+	ror	r14d,2
+	add	r8d,eax
+	add	eax,r15d
+	paddd	xmm6,xmm1
+	mov	r13d,r8d
+	add	r14d,eax
+	movdqa	XMMWORD[16+rsp],xmm6
+	ror	r13d,14
+	movdqa	xmm4,xmm3
+	mov	eax,r14d
+	mov	r12d,r9d
+	movdqa	xmm7,xmm1
+	ror	r14d,9
+	xor	r13d,r8d
+	xor	r12d,r10d
+	ror	r13d,5
+	xor	r14d,eax
+DB	102,15,58,15,226,4
+	and	r12d,r8d
+	xor	r13d,r8d
+DB	102,15,58,15,248,4
+	add	r11d,DWORD[32+rsp]
+	mov	r15d,eax
+	xor	r12d,r10d
+	ror	r14d,11
+	movdqa	xmm5,xmm4
+	xor	r15d,ebx
+	add	r11d,r12d
+	movdqa	xmm6,xmm4
+	ror	r13d,6
+	and	edi,r15d
+	psrld	xmm4,3
+	xor	r14d,eax
+	add	r11d,r13d
+	xor	edi,ebx
+	paddd	xmm2,xmm7
+	ror	r14d,2
+	add	edx,r11d
+	psrld	xmm6,7
+	add	r11d,edi
+	mov	r13d,edx
+	pshufd	xmm7,xmm1,250
+	add	r14d,r11d
+	ror	r13d,14
+	pslld	xmm5,14
+	mov	r11d,r14d
+	mov	r12d,r8d
+	pxor	xmm4,xmm6
+	ror	r14d,9
+	xor	r13d,edx
+	xor	r12d,r9d
+	ror	r13d,5
+	psrld	xmm6,11
+	xor	r14d,r11d
+	pxor	xmm4,xmm5
+	and	r12d,edx
+	xor	r13d,edx
+	pslld	xmm5,11
+	add	r10d,DWORD[36+rsp]
+	mov	edi,r11d
+	pxor	xmm4,xmm6
+	xor	r12d,r9d
+	ror	r14d,11
+	movdqa	xmm6,xmm7
+	xor	edi,eax
+	add	r10d,r12d
+	pxor	xmm4,xmm5
+	ror	r13d,6
+	and	r15d,edi
+	xor	r14d,r11d
+	psrld	xmm7,10
+	add	r10d,r13d
+	xor	r15d,eax
+	paddd	xmm2,xmm4
+	ror	r14d,2
+	add	ecx,r10d
+	psrlq	xmm6,17
+	add	r10d,r15d
+	mov	r13d,ecx
+	add	r14d,r10d
+	pxor	xmm7,xmm6
+	ror	r13d,14
+	mov	r10d,r14d
+	mov	r12d,edx
+	ror	r14d,9
+	psrlq	xmm6,2
+	xor	r13d,ecx
+	xor	r12d,r8d
+	pxor	xmm7,xmm6
+	ror	r13d,5
+	xor	r14d,r10d
+	and	r12d,ecx
+	pshufd	xmm7,xmm7,128
+	xor	r13d,ecx
+	add	r9d,DWORD[40+rsp]
+	mov	r15d,r10d
+	psrldq	xmm7,8
+	xor	r12d,r8d
+	ror	r14d,11
+	xor	r15d,r11d
+	add	r9d,r12d
+	ror	r13d,6
+	paddd	xmm2,xmm7
+	and	edi,r15d
+	xor	r14d,r10d
+	add	r9d,r13d
+	pshufd	xmm7,xmm2,80
+	xor	edi,r11d
+	ror	r14d,2
+	add	ebx,r9d
+	movdqa	xmm6,xmm7
+	add	r9d,edi
+	mov	r13d,ebx
+	psrld	xmm7,10
+	add	r14d,r9d
+	ror	r13d,14
+	psrlq	xmm6,17
+	mov	r9d,r14d
+	mov	r12d,ecx
+	pxor	xmm7,xmm6
+	ror	r14d,9
+	xor	r13d,ebx
+	xor	r12d,edx
+	ror	r13d,5
+	xor	r14d,r9d
+	psrlq	xmm6,2
+	and	r12d,ebx
+	xor	r13d,ebx
+	add	r8d,DWORD[44+rsp]
+	pxor	xmm7,xmm6
+	mov	edi,r9d
+	xor	r12d,edx
+	ror	r14d,11
+	pshufd	xmm7,xmm7,8
+	xor	edi,r10d
+	add	r8d,r12d
+	movdqa	xmm6,XMMWORD[64+rbp]
+	ror	r13d,6
+	and	r15d,edi
+	pslldq	xmm7,8
+	xor	r14d,r9d
+	add	r8d,r13d
+	xor	r15d,r10d
+	paddd	xmm2,xmm7
+	ror	r14d,2
+	add	eax,r8d
+	add	r8d,r15d
+	paddd	xmm6,xmm2
+	mov	r13d,eax
+	add	r14d,r8d
+	movdqa	XMMWORD[32+rsp],xmm6
+	ror	r13d,14
+	movdqa	xmm4,xmm0
+	mov	r8d,r14d
+	mov	r12d,ebx
+	movdqa	xmm7,xmm2
+	ror	r14d,9
+	xor	r13d,eax
+	xor	r12d,ecx
+	ror	r13d,5
+	xor	r14d,r8d
+DB	102,15,58,15,227,4
+	and	r12d,eax
+	xor	r13d,eax
+DB	102,15,58,15,249,4
+	add	edx,DWORD[48+rsp]
+	mov	r15d,r8d
+	xor	r12d,ecx
+	ror	r14d,11
+	movdqa	xmm5,xmm4
+	xor	r15d,r9d
+	add	edx,r12d
+	movdqa	xmm6,xmm4
+	ror	r13d,6
+	and	edi,r15d
+	psrld	xmm4,3
+	xor	r14d,r8d
+	add	edx,r13d
+	xor	edi,r9d
+	paddd	xmm3,xmm7
+	ror	r14d,2
+	add	r11d,edx
+	psrld	xmm6,7
+	add	edx,edi
+	mov	r13d,r11d
+	pshufd	xmm7,xmm2,250
+	add	r14d,edx
+	ror	r13d,14
+	pslld	xmm5,14
+	mov	edx,r14d
+	mov	r12d,eax
+	pxor	xmm4,xmm6
+	ror	r14d,9
+	xor	r13d,r11d
+	xor	r12d,ebx
+	ror	r13d,5
+	psrld	xmm6,11
+	xor	r14d,edx
+	pxor	xmm4,xmm5
+	and	r12d,r11d
+	xor	r13d,r11d
+	pslld	xmm5,11
+	add	ecx,DWORD[52+rsp]
+	mov	edi,edx
+	pxor	xmm4,xmm6
+	xor	r12d,ebx
+	ror	r14d,11
+	movdqa	xmm6,xmm7
+	xor	edi,r8d
+	add	ecx,r12d
+	pxor	xmm4,xmm5
+	ror	r13d,6
+	and	r15d,edi
+	xor	r14d,edx
+	psrld	xmm7,10
+	add	ecx,r13d
+	xor	r15d,r8d
+	paddd	xmm3,xmm4
+	ror	r14d,2
+	add	r10d,ecx
+	psrlq	xmm6,17
+	add	ecx,r15d
+	mov	r13d,r10d
+	add	r14d,ecx
+	pxor	xmm7,xmm6
+	ror	r13d,14
+	mov	ecx,r14d
+	mov	r12d,r11d
+	ror	r14d,9
+	psrlq	xmm6,2
+	xor	r13d,r10d
+	xor	r12d,eax
+	pxor	xmm7,xmm6
+	ror	r13d,5
+	xor	r14d,ecx
+	and	r12d,r10d
+	pshufd	xmm7,xmm7,128
+	xor	r13d,r10d
+	add	ebx,DWORD[56+rsp]
+	mov	r15d,ecx
+	psrldq	xmm7,8
+	xor	r12d,eax
+	ror	r14d,11
+	xor	r15d,edx
+	add	ebx,r12d
+	ror	r13d,6
+	paddd	xmm3,xmm7
+	and	edi,r15d
+	xor	r14d,ecx
+	add	ebx,r13d
+	pshufd	xmm7,xmm3,80
+	xor	edi,edx
+	ror	r14d,2
+	add	r9d,ebx
+	movdqa	xmm6,xmm7
+	add	ebx,edi
+	mov	r13d,r9d
+	psrld	xmm7,10
+	add	r14d,ebx
+	ror	r13d,14
+	psrlq	xmm6,17
+	mov	ebx,r14d
+	mov	r12d,r10d
+	pxor	xmm7,xmm6
+	ror	r14d,9
+	xor	r13d,r9d
+	xor	r12d,r11d
+	ror	r13d,5
+	xor	r14d,ebx
+	psrlq	xmm6,2
+	and	r12d,r9d
+	xor	r13d,r9d
+	add	eax,DWORD[60+rsp]
+	pxor	xmm7,xmm6
+	mov	edi,ebx
+	xor	r12d,r11d
+	ror	r14d,11
+	pshufd	xmm7,xmm7,8
+	xor	edi,ecx
+	add	eax,r12d
+	movdqa	xmm6,XMMWORD[96+rbp]
+	ror	r13d,6
+	and	r15d,edi
+	pslldq	xmm7,8
+	xor	r14d,ebx
+	add	eax,r13d
+	xor	r15d,ecx
+	paddd	xmm3,xmm7
+	ror	r14d,2
+	add	r8d,eax
+	add	eax,r15d
+	paddd	xmm6,xmm3
+	mov	r13d,r8d
+	add	r14d,eax
+	movdqa	XMMWORD[48+rsp],xmm6
+	cmp	BYTE[131+rbp],0
+	jne	NEAR $L$ssse3_00_47
+	ror	r13d,14
+	mov	eax,r14d
+	mov	r12d,r9d
+	ror	r14d,9
+	xor	r13d,r8d
+	xor	r12d,r10d
+	ror	r13d,5
+	xor	r14d,eax
+	and	r12d,r8d
+	xor	r13d,r8d
+	add	r11d,DWORD[rsp]
+	mov	r15d,eax
+	xor	r12d,r10d
+	ror	r14d,11
+	xor	r15d,ebx
+	add	r11d,r12d
+	ror	r13d,6
+	and	edi,r15d
+	xor	r14d,eax
+	add	r11d,r13d
+	xor	edi,ebx
+	ror	r14d,2
+	add	edx,r11d
+	add	r11d,edi
+	mov	r13d,edx
+	add	r14d,r11d
+	ror	r13d,14
+	mov	r11d,r14d
+	mov	r12d,r8d
+	ror	r14d,9
+	xor	r13d,edx
+	xor	r12d,r9d
+	ror	r13d,5
+	xor	r14d,r11d
+	and	r12d,edx
+	xor	r13d,edx
+	add	r10d,DWORD[4+rsp]
+	mov	edi,r11d
+	xor	r12d,r9d
+	ror	r14d,11
+	xor	edi,eax
+	add	r10d,r12d
+	ror	r13d,6
+	and	r15d,edi
+	xor	r14d,r11d
+	add	r10d,r13d
+	xor	r15d,eax
+	ror	r14d,2
+	add	ecx,r10d
+	add	r10d,r15d
+	mov	r13d,ecx
+	add	r14d,r10d
+	ror	r13d,14
+	mov	r10d,r14d
+	mov	r12d,edx
+	ror	r14d,9
+	xor	r13d,ecx
+	xor	r12d,r8d
+	ror	r13d,5
+	xor	r14d,r10d
+	and	r12d,ecx
+	xor	r13d,ecx
+	add	r9d,DWORD[8+rsp]
+	mov	r15d,r10d
+	xor	r12d,r8d
+	ror	r14d,11
+	xor	r15d,r11d
+	add	r9d,r12d
+	ror	r13d,6
+	and	edi,r15d
+	xor	r14d,r10d
+	add	r9d,r13d
+	xor	edi,r11d
+	ror	r14d,2
+	add	ebx,r9d
+	add	r9d,edi
+	mov	r13d,ebx
+	add	r14d,r9d
+	ror	r13d,14
+	mov	r9d,r14d
+	mov	r12d,ecx
+	ror	r14d,9
+	xor	r13d,ebx
+	xor	r12d,edx
+	ror	r13d,5
+	xor	r14d,r9d
+	and	r12d,ebx
+	xor	r13d,ebx
+	add	r8d,DWORD[12+rsp]
+	mov	edi,r9d
+	xor	r12d,edx
+	ror	r14d,11
+	xor	edi,r10d
+	add	r8d,r12d
+	ror	r13d,6
+	and	r15d,edi
+	xor	r14d,r9d
+	add	r8d,r13d
+	xor	r15d,r10d
+	ror	r14d,2
+	add	eax,r8d
+	add	r8d,r15d
+	mov	r13d,eax
+	add	r14d,r8d
+	ror	r13d,14
+	mov	r8d,r14d
+	mov	r12d,ebx
+	ror	r14d,9
+	xor	r13d,eax
+	xor	r12d,ecx
+	ror	r13d,5
+	xor	r14d,r8d
+	and	r12d,eax
+	xor	r13d,eax
+	add	edx,DWORD[16+rsp]
+	mov	r15d,r8d
+	xor	r12d,ecx
+	ror	r14d,11
+	xor	r15d,r9d
+	add	edx,r12d
+	ror	r13d,6
+	and	edi,r15d
+	xor	r14d,r8d
+	add	edx,r13d
+	xor	edi,r9d
+	ror	r14d,2
+	add	r11d,edx
+	add	edx,edi
+	mov	r13d,r11d
+	add	r14d,edx
+	ror	r13d,14
+	mov	edx,r14d
+	mov	r12d,eax
+	ror	r14d,9
+	xor	r13d,r11d
+	xor	r12d,ebx
+	ror	r13d,5
+	xor	r14d,edx
+	and	r12d,r11d
+	xor	r13d,r11d
+	add	ecx,DWORD[20+rsp]
+	mov	edi,edx
+	xor	r12d,ebx
+	ror	r14d,11
+	xor	edi,r8d
+	add	ecx,r12d
+	ror	r13d,6
+	and	r15d,edi
+	xor	r14d,edx
+	add	ecx,r13d
+	xor	r15d,r8d
+	ror	r14d,2
+	add	r10d,ecx
+	add	ecx,r15d
+	mov	r13d,r10d
+	add	r14d,ecx
+	ror	r13d,14
+	mov	ecx,r14d
+	mov	r12d,r11d
+	ror	r14d,9
+	xor	r13d,r10d
+	xor	r12d,eax
+	ror	r13d,5
+	xor	r14d,ecx
+	and	r12d,r10d
+	xor	r13d,r10d
+	add	ebx,DWORD[24+rsp]
+	mov	r15d,ecx
+	xor	r12d,eax
+	ror	r14d,11
+	xor	r15d,edx
+	add	ebx,r12d
+	ror	r13d,6
+	and	edi,r15d
+	xor	r14d,ecx
+	add	ebx,r13d
+	xor	edi,edx
+	ror	r14d,2
+	add	r9d,ebx
+	add	ebx,edi
+	mov	r13d,r9d
+	add	r14d,ebx
+	ror	r13d,14
+	mov	ebx,r14d
+	mov	r12d,r10d
+	ror	r14d,9
+	xor	r13d,r9d
+	xor	r12d,r11d
+	ror	r13d,5
+	xor	r14d,ebx
+	and	r12d,r9d
+	xor	r13d,r9d
+	add	eax,DWORD[28+rsp]
+	mov	edi,ebx
+	xor	r12d,r11d
+	ror	r14d,11
+	xor	edi,ecx
+	add	eax,r12d
+	ror	r13d,6
+	and	r15d,edi
+	xor	r14d,ebx
+	add	eax,r13d
+	xor	r15d,ecx
+	ror	r14d,2
+	add	r8d,eax
+	add	eax,r15d
+	mov	r13d,r8d
+	add	r14d,eax
+	ror	r13d,14
+	mov	eax,r14d
+	mov	r12d,r9d
+	ror	r14d,9
+	xor	r13d,r8d
+	xor	r12d,r10d
+	ror	r13d,5
+	xor	r14d,eax
+	and	r12d,r8d
+	xor	r13d,r8d
+	add	r11d,DWORD[32+rsp]
+	mov	r15d,eax
+	xor	r12d,r10d
+	ror	r14d,11
+	xor	r15d,ebx
+	add	r11d,r12d
+	ror	r13d,6
+	and	edi,r15d
+	xor	r14d,eax
+	add	r11d,r13d
+	xor	edi,ebx
+	ror	r14d,2
+	add	edx,r11d
+	add	r11d,edi
+	mov	r13d,edx
+	add	r14d,r11d
+	ror	r13d,14
+	mov	r11d,r14d
+	mov	r12d,r8d
+	ror	r14d,9
+	xor	r13d,edx
+	xor	r12d,r9d
+	ror	r13d,5
+	xor	r14d,r11d
+	and	r12d,edx
+	xor	r13d,edx
+	add	r10d,DWORD[36+rsp]
+	mov	edi,r11d
+	xor	r12d,r9d
+	ror	r14d,11
+	xor	edi,eax
+	add	r10d,r12d
+	ror	r13d,6
+	and	r15d,edi
+	xor	r14d,r11d
+	add	r10d,r13d
+	xor	r15d,eax
+	ror	r14d,2
+	add	ecx,r10d
+	add	r10d,r15d
+	mov	r13d,ecx
+	add	r14d,r10d
+	ror	r13d,14
+	mov	r10d,r14d
+	mov	r12d,edx
+	ror	r14d,9
+	xor	r13d,ecx
+	xor	r12d,r8d
+	ror	r13d,5
+	xor	r14d,r10d
+	and	r12d,ecx
+	xor	r13d,ecx
+	add	r9d,DWORD[40+rsp]
+	mov	r15d,r10d
+	xor	r12d,r8d
+	ror	r14d,11
+	xor	r15d,r11d
+	add	r9d,r12d
+	ror	r13d,6
+	and	edi,r15d
+	xor	r14d,r10d
+	add	r9d,r13d
+	xor	edi,r11d
+	ror	r14d,2
+	add	ebx,r9d
+	add	r9d,edi
+	mov	r13d,ebx
+	add	r14d,r9d
+	ror	r13d,14
+	mov	r9d,r14d
+	mov	r12d,ecx
+	ror	r14d,9
+	xor	r13d,ebx
+	xor	r12d,edx
+	ror	r13d,5
+	xor	r14d,r9d
+	and	r12d,ebx
+	xor	r13d,ebx
+	add	r8d,DWORD[44+rsp]
+	mov	edi,r9d
+	xor	r12d,edx
+	ror	r14d,11
+	xor	edi,r10d
+	add	r8d,r12d
+	ror	r13d,6
+	and	r15d,edi
+	xor	r14d,r9d
+	add	r8d,r13d
+	xor	r15d,r10d
+	ror	r14d,2
+	add	eax,r8d
+	add	r8d,r15d
+	mov	r13d,eax
+	add	r14d,r8d
+	ror	r13d,14
+	mov	r8d,r14d
+	mov	r12d,ebx
+	ror	r14d,9
+	xor	r13d,eax
+	xor	r12d,ecx
+	ror	r13d,5
+	xor	r14d,r8d
+	and	r12d,eax
+	xor	r13d,eax
+	add	edx,DWORD[48+rsp]
+	mov	r15d,r8d
+	xor	r12d,ecx
+	ror	r14d,11
+	xor	r15d,r9d
+	add	edx,r12d
+	ror	r13d,6
+	and	edi,r15d
+	xor	r14d,r8d
+	add	edx,r13d
+	xor	edi,r9d
+	ror	r14d,2
+	add	r11d,edx
+	add	edx,edi
+	mov	r13d,r11d
+	add	r14d,edx
+	ror	r13d,14
+	mov	edx,r14d
+	mov	r12d,eax
+	ror	r14d,9
+	xor	r13d,r11d
+	xor	r12d,ebx
+	ror	r13d,5
+	xor	r14d,edx
+	and	r12d,r11d
+	xor	r13d,r11d
+	add	ecx,DWORD[52+rsp]
+	mov	edi,edx
+	xor	r12d,ebx
+	ror	r14d,11
+	xor	edi,r8d
+	add	ecx,r12d
+	ror	r13d,6
+	and	r15d,edi
+	xor	r14d,edx
+	add	ecx,r13d
+	xor	r15d,r8d
+	ror	r14d,2
+	add	r10d,ecx
+	add	ecx,r15d
+	mov	r13d,r10d
+	add	r14d,ecx
+	ror	r13d,14
+	mov	ecx,r14d
+	mov	r12d,r11d
+	ror	r14d,9
+	xor	r13d,r10d
+	xor	r12d,eax
+	ror	r13d,5
+	xor	r14d,ecx
+	and	r12d,r10d
+	xor	r13d,r10d
+	add	ebx,DWORD[56+rsp]
+	mov	r15d,ecx
+	xor	r12d,eax
+	ror	r14d,11
+	xor	r15d,edx
+	add	ebx,r12d
+	ror	r13d,6
+	and	edi,r15d
+	xor	r14d,ecx
+	add	ebx,r13d
+	xor	edi,edx
+	ror	r14d,2
+	add	r9d,ebx
+	add	ebx,edi
+	mov	r13d,r9d
+	add	r14d,ebx
+	ror	r13d,14
+	mov	ebx,r14d
+	mov	r12d,r10d
+	ror	r14d,9
+	xor	r13d,r9d
+	xor	r12d,r11d
+	ror	r13d,5
+	xor	r14d,ebx
+	and	r12d,r9d
+	xor	r13d,r9d
+	add	eax,DWORD[60+rsp]
+	mov	edi,ebx
+	xor	r12d,r11d
+	ror	r14d,11
+	xor	edi,ecx
+	add	eax,r12d
+	ror	r13d,6
+	and	r15d,edi
+	xor	r14d,ebx
+	add	eax,r13d
+	xor	r15d,ecx
+	ror	r14d,2
+	add	r8d,eax
+	add	eax,r15d
+	mov	r13d,r8d
+	add	r14d,eax
+	mov	rdi,QWORD[((64+0))+rsp]
+	mov	eax,r14d
+
+	add	eax,DWORD[rdi]
+	lea	rsi,[64+rsi]
+	add	ebx,DWORD[4+rdi]
+	add	ecx,DWORD[8+rdi]
+	add	edx,DWORD[12+rdi]
+	add	r8d,DWORD[16+rdi]
+	add	r9d,DWORD[20+rdi]
+	add	r10d,DWORD[24+rdi]
+	add	r11d,DWORD[28+rdi]
+
+	cmp	rsi,QWORD[((64+16))+rsp]
+
+	mov	DWORD[rdi],eax
+	mov	DWORD[4+rdi],ebx
+	mov	DWORD[8+rdi],ecx
+	mov	DWORD[12+rdi],edx
+	mov	DWORD[16+rdi],r8d
+	mov	DWORD[20+rdi],r9d
+	mov	DWORD[24+rdi],r10d
+	mov	DWORD[28+rdi],r11d
+	jb	NEAR $L$loop_ssse3
+
+	mov	rsi,QWORD[88+rsp]
+
+	movaps	xmm6,XMMWORD[((64+32))+rsp]
+	movaps	xmm7,XMMWORD[((64+48))+rsp]
+	movaps	xmm8,XMMWORD[((64+64))+rsp]
+	movaps	xmm9,XMMWORD[((64+80))+rsp]
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$epilogue_ssse3:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_sha256_block_data_order_ssse3:
+global	sha256_block_data_order_avx
+
+ALIGN	64
+sha256_block_data_order_avx:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_sha256_block_data_order_avx:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+_CET_ENDBR
+	mov	rax,rsp
+
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	shl	rdx,4
+	sub	rsp,160
+	lea	rdx,[rdx*4+rsi]
+	and	rsp,-64
+	mov	QWORD[((64+0))+rsp],rdi
+	mov	QWORD[((64+8))+rsp],rsi
+	mov	QWORD[((64+16))+rsp],rdx
+	mov	QWORD[88+rsp],rax
+
+	movaps	XMMWORD[(64+32)+rsp],xmm6
+	movaps	XMMWORD[(64+48)+rsp],xmm7
+	movaps	XMMWORD[(64+64)+rsp],xmm8
+	movaps	XMMWORD[(64+80)+rsp],xmm9
+$L$prologue_avx:
+
+	vzeroupper
+	mov	eax,DWORD[rdi]
+	mov	ebx,DWORD[4+rdi]
+	mov	ecx,DWORD[8+rdi]
+	mov	edx,DWORD[12+rdi]
+	mov	r8d,DWORD[16+rdi]
+	mov	r9d,DWORD[20+rdi]
+	mov	r10d,DWORD[24+rdi]
+	mov	r11d,DWORD[28+rdi]
+	vmovdqa	xmm8,XMMWORD[((K256+512+32))]
+	vmovdqa	xmm9,XMMWORD[((K256+512+64))]
+	jmp	NEAR $L$loop_avx
+ALIGN	16
+$L$loop_avx:
+	vmovdqa	xmm7,XMMWORD[((K256+512))]
+	vmovdqu	xmm0,XMMWORD[rsi]
+	vmovdqu	xmm1,XMMWORD[16+rsi]
+	vmovdqu	xmm2,XMMWORD[32+rsi]
+	vmovdqu	xmm3,XMMWORD[48+rsi]
+	vpshufb	xmm0,xmm0,xmm7
+	lea	rbp,[K256]
+	vpshufb	xmm1,xmm1,xmm7
+	vpshufb	xmm2,xmm2,xmm7
+	vpaddd	xmm4,xmm0,XMMWORD[rbp]
+	vpshufb	xmm3,xmm3,xmm7
+	vpaddd	xmm5,xmm1,XMMWORD[32+rbp]
+	vpaddd	xmm6,xmm2,XMMWORD[64+rbp]
+	vpaddd	xmm7,xmm3,XMMWORD[96+rbp]
+	vmovdqa	XMMWORD[rsp],xmm4
+	mov	r14d,eax
+	vmovdqa	XMMWORD[16+rsp],xmm5
+	mov	edi,ebx
+	vmovdqa	XMMWORD[32+rsp],xmm6
+	xor	edi,ecx
+	vmovdqa	XMMWORD[48+rsp],xmm7
+	mov	r13d,r8d
+	jmp	NEAR $L$avx_00_47
+
+ALIGN	16
+$L$avx_00_47:
+	sub	rbp,-128
+	vpalignr	xmm4,xmm1,xmm0,4
+	shrd	r13d,r13d,14
+	mov	eax,r14d
+	mov	r12d,r9d
+	vpalignr	xmm7,xmm3,xmm2,4
+	shrd	r14d,r14d,9
+	xor	r13d,r8d
+	xor	r12d,r10d
+	vpsrld	xmm6,xmm4,7
+	shrd	r13d,r13d,5
+	xor	r14d,eax
+	and	r12d,r8d
+	vpaddd	xmm0,xmm0,xmm7
+	xor	r13d,r8d
+	add	r11d,DWORD[rsp]
+	mov	r15d,eax
+	vpsrld	xmm7,xmm4,3
+	xor	r12d,r10d
+	shrd	r14d,r14d,11
+	xor	r15d,ebx
+	vpslld	xmm5,xmm4,14
+	add	r11d,r12d
+	shrd	r13d,r13d,6
+	and	edi,r15d
+	vpxor	xmm4,xmm7,xmm6
+	xor	r14d,eax
+	add	r11d,r13d
+	xor	edi,ebx
+	vpshufd	xmm7,xmm3,250
+	shrd	r14d,r14d,2
+	add	edx,r11d
+	add	r11d,edi
+	vpsrld	xmm6,xmm6,11
+	mov	r13d,edx
+	add	r14d,r11d
+	shrd	r13d,r13d,14
+	vpxor	xmm4,xmm4,xmm5
+	mov	r11d,r14d
+	mov	r12d,r8d
+	shrd	r14d,r14d,9
+	vpslld	xmm5,xmm5,11
+	xor	r13d,edx
+	xor	r12d,r9d
+	shrd	r13d,r13d,5
+	vpxor	xmm4,xmm4,xmm6
+	xor	r14d,r11d
+	and	r12d,edx
+	xor	r13d,edx
+	vpsrld	xmm6,xmm7,10
+	add	r10d,DWORD[4+rsp]
+	mov	edi,r11d
+	xor	r12d,r9d
+	vpxor	xmm4,xmm4,xmm5
+	shrd	r14d,r14d,11
+	xor	edi,eax
+	add	r10d,r12d
+	vpsrlq	xmm7,xmm7,17
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,r11d
+	vpaddd	xmm0,xmm0,xmm4
+	add	r10d,r13d
+	xor	r15d,eax
+	shrd	r14d,r14d,2
+	vpxor	xmm6,xmm6,xmm7
+	add	ecx,r10d
+	add	r10d,r15d
+	mov	r13d,ecx
+	vpsrlq	xmm7,xmm7,2
+	add	r14d,r10d
+	shrd	r13d,r13d,14
+	mov	r10d,r14d
+	vpxor	xmm6,xmm6,xmm7
+	mov	r12d,edx
+	shrd	r14d,r14d,9
+	xor	r13d,ecx
+	vpshufb	xmm6,xmm6,xmm8
+	xor	r12d,r8d
+	shrd	r13d,r13d,5
+	xor	r14d,r10d
+	vpaddd	xmm0,xmm0,xmm6
+	and	r12d,ecx
+	xor	r13d,ecx
+	add	r9d,DWORD[8+rsp]
+	vpshufd	xmm7,xmm0,80
+	mov	r15d,r10d
+	xor	r12d,r8d
+	shrd	r14d,r14d,11
+	vpsrld	xmm6,xmm7,10
+	xor	r15d,r11d
+	add	r9d,r12d
+	shrd	r13d,r13d,6
+	vpsrlq	xmm7,xmm7,17
+	and	edi,r15d
+	xor	r14d,r10d
+	add	r9d,r13d
+	vpxor	xmm6,xmm6,xmm7
+	xor	edi,r11d
+	shrd	r14d,r14d,2
+	add	ebx,r9d
+	vpsrlq	xmm7,xmm7,2
+	add	r9d,edi
+	mov	r13d,ebx
+	add	r14d,r9d
+	vpxor	xmm6,xmm6,xmm7
+	shrd	r13d,r13d,14
+	mov	r9d,r14d
+	mov	r12d,ecx
+	vpshufb	xmm6,xmm6,xmm9
+	shrd	r14d,r14d,9
+	xor	r13d,ebx
+	xor	r12d,edx
+	vpaddd	xmm0,xmm0,xmm6
+	shrd	r13d,r13d,5
+	xor	r14d,r9d
+	and	r12d,ebx
+	vpaddd	xmm6,xmm0,XMMWORD[rbp]
+	xor	r13d,ebx
+	add	r8d,DWORD[12+rsp]
+	mov	edi,r9d
+	xor	r12d,edx
+	shrd	r14d,r14d,11
+	xor	edi,r10d
+	add	r8d,r12d
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,r9d
+	add	r8d,r13d
+	xor	r15d,r10d
+	shrd	r14d,r14d,2
+	add	eax,r8d
+	add	r8d,r15d
+	mov	r13d,eax
+	add	r14d,r8d
+	vmovdqa	XMMWORD[rsp],xmm6
+	vpalignr	xmm4,xmm2,xmm1,4
+	shrd	r13d,r13d,14
+	mov	r8d,r14d
+	mov	r12d,ebx
+	vpalignr	xmm7,xmm0,xmm3,4
+	shrd	r14d,r14d,9
+	xor	r13d,eax
+	xor	r12d,ecx
+	vpsrld	xmm6,xmm4,7
+	shrd	r13d,r13d,5
+	xor	r14d,r8d
+	and	r12d,eax
+	vpaddd	xmm1,xmm1,xmm7
+	xor	r13d,eax
+	add	edx,DWORD[16+rsp]
+	mov	r15d,r8d
+	vpsrld	xmm7,xmm4,3
+	xor	r12d,ecx
+	shrd	r14d,r14d,11
+	xor	r15d,r9d
+	vpslld	xmm5,xmm4,14
+	add	edx,r12d
+	shrd	r13d,r13d,6
+	and	edi,r15d
+	vpxor	xmm4,xmm7,xmm6
+	xor	r14d,r8d
+	add	edx,r13d
+	xor	edi,r9d
+	vpshufd	xmm7,xmm0,250
+	shrd	r14d,r14d,2
+	add	r11d,edx
+	add	edx,edi
+	vpsrld	xmm6,xmm6,11
+	mov	r13d,r11d
+	add	r14d,edx
+	shrd	r13d,r13d,14
+	vpxor	xmm4,xmm4,xmm5
+	mov	edx,r14d
+	mov	r12d,eax
+	shrd	r14d,r14d,9
+	vpslld	xmm5,xmm5,11
+	xor	r13d,r11d
+	xor	r12d,ebx
+	shrd	r13d,r13d,5
+	vpxor	xmm4,xmm4,xmm6
+	xor	r14d,edx
+	and	r12d,r11d
+	xor	r13d,r11d
+	vpsrld	xmm6,xmm7,10
+	add	ecx,DWORD[20+rsp]
+	mov	edi,edx
+	xor	r12d,ebx
+	vpxor	xmm4,xmm4,xmm5
+	shrd	r14d,r14d,11
+	xor	edi,r8d
+	add	ecx,r12d
+	vpsrlq	xmm7,xmm7,17
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,edx
+	vpaddd	xmm1,xmm1,xmm4
+	add	ecx,r13d
+	xor	r15d,r8d
+	shrd	r14d,r14d,2
+	vpxor	xmm6,xmm6,xmm7
+	add	r10d,ecx
+	add	ecx,r15d
+	mov	r13d,r10d
+	vpsrlq	xmm7,xmm7,2
+	add	r14d,ecx
+	shrd	r13d,r13d,14
+	mov	ecx,r14d
+	vpxor	xmm6,xmm6,xmm7
+	mov	r12d,r11d
+	shrd	r14d,r14d,9
+	xor	r13d,r10d
+	vpshufb	xmm6,xmm6,xmm8
+	xor	r12d,eax
+	shrd	r13d,r13d,5
+	xor	r14d,ecx
+	vpaddd	xmm1,xmm1,xmm6
+	and	r12d,r10d
+	xor	r13d,r10d
+	add	ebx,DWORD[24+rsp]
+	vpshufd	xmm7,xmm1,80
+	mov	r15d,ecx
+	xor	r12d,eax
+	shrd	r14d,r14d,11
+	vpsrld	xmm6,xmm7,10
+	xor	r15d,edx
+	add	ebx,r12d
+	shrd	r13d,r13d,6
+	vpsrlq	xmm7,xmm7,17
+	and	edi,r15d
+	xor	r14d,ecx
+	add	ebx,r13d
+	vpxor	xmm6,xmm6,xmm7
+	xor	edi,edx
+	shrd	r14d,r14d,2
+	add	r9d,ebx
+	vpsrlq	xmm7,xmm7,2
+	add	ebx,edi
+	mov	r13d,r9d
+	add	r14d,ebx
+	vpxor	xmm6,xmm6,xmm7
+	shrd	r13d,r13d,14
+	mov	ebx,r14d
+	mov	r12d,r10d
+	vpshufb	xmm6,xmm6,xmm9
+	shrd	r14d,r14d,9
+	xor	r13d,r9d
+	xor	r12d,r11d
+	vpaddd	xmm1,xmm1,xmm6
+	shrd	r13d,r13d,5
+	xor	r14d,ebx
+	and	r12d,r9d
+	vpaddd	xmm6,xmm1,XMMWORD[32+rbp]
+	xor	r13d,r9d
+	add	eax,DWORD[28+rsp]
+	mov	edi,ebx
+	xor	r12d,r11d
+	shrd	r14d,r14d,11
+	xor	edi,ecx
+	add	eax,r12d
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,ebx
+	add	eax,r13d
+	xor	r15d,ecx
+	shrd	r14d,r14d,2
+	add	r8d,eax
+	add	eax,r15d
+	mov	r13d,r8d
+	add	r14d,eax
+	vmovdqa	XMMWORD[16+rsp],xmm6
+	vpalignr	xmm4,xmm3,xmm2,4
+	shrd	r13d,r13d,14
+	mov	eax,r14d
+	mov	r12d,r9d
+	vpalignr	xmm7,xmm1,xmm0,4
+	shrd	r14d,r14d,9
+	xor	r13d,r8d
+	xor	r12d,r10d
+	vpsrld	xmm6,xmm4,7
+	shrd	r13d,r13d,5
+	xor	r14d,eax
+	and	r12d,r8d
+	vpaddd	xmm2,xmm2,xmm7
+	xor	r13d,r8d
+	add	r11d,DWORD[32+rsp]
+	mov	r15d,eax
+	vpsrld	xmm7,xmm4,3
+	xor	r12d,r10d
+	shrd	r14d,r14d,11
+	xor	r15d,ebx
+	vpslld	xmm5,xmm4,14
+	add	r11d,r12d
+	shrd	r13d,r13d,6
+	and	edi,r15d
+	vpxor	xmm4,xmm7,xmm6
+	xor	r14d,eax
+	add	r11d,r13d
+	xor	edi,ebx
+	vpshufd	xmm7,xmm1,250
+	shrd	r14d,r14d,2
+	add	edx,r11d
+	add	r11d,edi
+	vpsrld	xmm6,xmm6,11
+	mov	r13d,edx
+	add	r14d,r11d
+	shrd	r13d,r13d,14
+	vpxor	xmm4,xmm4,xmm5
+	mov	r11d,r14d
+	mov	r12d,r8d
+	shrd	r14d,r14d,9
+	vpslld	xmm5,xmm5,11
+	xor	r13d,edx
+	xor	r12d,r9d
+	shrd	r13d,r13d,5
+	vpxor	xmm4,xmm4,xmm6
+	xor	r14d,r11d
+	and	r12d,edx
+	xor	r13d,edx
+	vpsrld	xmm6,xmm7,10
+	add	r10d,DWORD[36+rsp]
+	mov	edi,r11d
+	xor	r12d,r9d
+	vpxor	xmm4,xmm4,xmm5
+	shrd	r14d,r14d,11
+	xor	edi,eax
+	add	r10d,r12d
+	vpsrlq	xmm7,xmm7,17
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,r11d
+	vpaddd	xmm2,xmm2,xmm4
+	add	r10d,r13d
+	xor	r15d,eax
+	shrd	r14d,r14d,2
+	vpxor	xmm6,xmm6,xmm7
+	add	ecx,r10d
+	add	r10d,r15d
+	mov	r13d,ecx
+	vpsrlq	xmm7,xmm7,2
+	add	r14d,r10d
+	shrd	r13d,r13d,14
+	mov	r10d,r14d
+	vpxor	xmm6,xmm6,xmm7
+	mov	r12d,edx
+	shrd	r14d,r14d,9
+	xor	r13d,ecx
+	vpshufb	xmm6,xmm6,xmm8
+	xor	r12d,r8d
+	shrd	r13d,r13d,5
+	xor	r14d,r10d
+	vpaddd	xmm2,xmm2,xmm6
+	and	r12d,ecx
+	xor	r13d,ecx
+	add	r9d,DWORD[40+rsp]
+	vpshufd	xmm7,xmm2,80
+	mov	r15d,r10d
+	xor	r12d,r8d
+	shrd	r14d,r14d,11
+	vpsrld	xmm6,xmm7,10
+	xor	r15d,r11d
+	add	r9d,r12d
+	shrd	r13d,r13d,6
+	vpsrlq	xmm7,xmm7,17
+	and	edi,r15d
+	xor	r14d,r10d
+	add	r9d,r13d
+	vpxor	xmm6,xmm6,xmm7
+	xor	edi,r11d
+	shrd	r14d,r14d,2
+	add	ebx,r9d
+	vpsrlq	xmm7,xmm7,2
+	add	r9d,edi
+	mov	r13d,ebx
+	add	r14d,r9d
+	vpxor	xmm6,xmm6,xmm7
+	shrd	r13d,r13d,14
+	mov	r9d,r14d
+	mov	r12d,ecx
+	vpshufb	xmm6,xmm6,xmm9
+	shrd	r14d,r14d,9
+	xor	r13d,ebx
+	xor	r12d,edx
+	vpaddd	xmm2,xmm2,xmm6
+	shrd	r13d,r13d,5
+	xor	r14d,r9d
+	and	r12d,ebx
+	vpaddd	xmm6,xmm2,XMMWORD[64+rbp]
+	xor	r13d,ebx
+	add	r8d,DWORD[44+rsp]
+	mov	edi,r9d
+	xor	r12d,edx
+	shrd	r14d,r14d,11
+	xor	edi,r10d
+	add	r8d,r12d
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,r9d
+	add	r8d,r13d
+	xor	r15d,r10d
+	shrd	r14d,r14d,2
+	add	eax,r8d
+	add	r8d,r15d
+	mov	r13d,eax
+	add	r14d,r8d
+	vmovdqa	XMMWORD[32+rsp],xmm6
+	vpalignr	xmm4,xmm0,xmm3,4
+	shrd	r13d,r13d,14
+	mov	r8d,r14d
+	mov	r12d,ebx
+	vpalignr	xmm7,xmm2,xmm1,4
+	shrd	r14d,r14d,9
+	xor	r13d,eax
+	xor	r12d,ecx
+	vpsrld	xmm6,xmm4,7
+	shrd	r13d,r13d,5
+	xor	r14d,r8d
+	and	r12d,eax
+	vpaddd	xmm3,xmm3,xmm7
+	xor	r13d,eax
+	add	edx,DWORD[48+rsp]
+	mov	r15d,r8d
+	vpsrld	xmm7,xmm4,3
+	xor	r12d,ecx
+	shrd	r14d,r14d,11
+	xor	r15d,r9d
+	vpslld	xmm5,xmm4,14
+	add	edx,r12d
+	shrd	r13d,r13d,6
+	and	edi,r15d
+	vpxor	xmm4,xmm7,xmm6
+	xor	r14d,r8d
+	add	edx,r13d
+	xor	edi,r9d
+	vpshufd	xmm7,xmm2,250
+	shrd	r14d,r14d,2
+	add	r11d,edx
+	add	edx,edi
+	vpsrld	xmm6,xmm6,11
+	mov	r13d,r11d
+	add	r14d,edx
+	shrd	r13d,r13d,14
+	vpxor	xmm4,xmm4,xmm5
+	mov	edx,r14d
+	mov	r12d,eax
+	shrd	r14d,r14d,9
+	vpslld	xmm5,xmm5,11
+	xor	r13d,r11d
+	xor	r12d,ebx
+	shrd	r13d,r13d,5
+	vpxor	xmm4,xmm4,xmm6
+	xor	r14d,edx
+	and	r12d,r11d
+	xor	r13d,r11d
+	vpsrld	xmm6,xmm7,10
+	add	ecx,DWORD[52+rsp]
+	mov	edi,edx
+	xor	r12d,ebx
+	vpxor	xmm4,xmm4,xmm5
+	shrd	r14d,r14d,11
+	xor	edi,r8d
+	add	ecx,r12d
+	vpsrlq	xmm7,xmm7,17
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,edx
+	vpaddd	xmm3,xmm3,xmm4
+	add	ecx,r13d
+	xor	r15d,r8d
+	shrd	r14d,r14d,2
+	vpxor	xmm6,xmm6,xmm7
+	add	r10d,ecx
+	add	ecx,r15d
+	mov	r13d,r10d
+	vpsrlq	xmm7,xmm7,2
+	add	r14d,ecx
+	shrd	r13d,r13d,14
+	mov	ecx,r14d
+	vpxor	xmm6,xmm6,xmm7
+	mov	r12d,r11d
+	shrd	r14d,r14d,9
+	xor	r13d,r10d
+	vpshufb	xmm6,xmm6,xmm8
+	xor	r12d,eax
+	shrd	r13d,r13d,5
+	xor	r14d,ecx
+	vpaddd	xmm3,xmm3,xmm6
+	and	r12d,r10d
+	xor	r13d,r10d
+	add	ebx,DWORD[56+rsp]
+	vpshufd	xmm7,xmm3,80
+	mov	r15d,ecx
+	xor	r12d,eax
+	shrd	r14d,r14d,11
+	vpsrld	xmm6,xmm7,10
+	xor	r15d,edx
+	add	ebx,r12d
+	shrd	r13d,r13d,6
+	vpsrlq	xmm7,xmm7,17
+	and	edi,r15d
+	xor	r14d,ecx
+	add	ebx,r13d
+	vpxor	xmm6,xmm6,xmm7
+	xor	edi,edx
+	shrd	r14d,r14d,2
+	add	r9d,ebx
+	vpsrlq	xmm7,xmm7,2
+	add	ebx,edi
+	mov	r13d,r9d
+	add	r14d,ebx
+	vpxor	xmm6,xmm6,xmm7
+	shrd	r13d,r13d,14
+	mov	ebx,r14d
+	mov	r12d,r10d
+	vpshufb	xmm6,xmm6,xmm9
+	shrd	r14d,r14d,9
+	xor	r13d,r9d
+	xor	r12d,r11d
+	vpaddd	xmm3,xmm3,xmm6
+	shrd	r13d,r13d,5
+	xor	r14d,ebx
+	and	r12d,r9d
+	vpaddd	xmm6,xmm3,XMMWORD[96+rbp]
+	xor	r13d,r9d
+	add	eax,DWORD[60+rsp]
+	mov	edi,ebx
+	xor	r12d,r11d
+	shrd	r14d,r14d,11
+	xor	edi,ecx
+	add	eax,r12d
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,ebx
+	add	eax,r13d
+	xor	r15d,ecx
+	shrd	r14d,r14d,2
+	add	r8d,eax
+	add	eax,r15d
+	mov	r13d,r8d
+	add	r14d,eax
+	vmovdqa	XMMWORD[48+rsp],xmm6
+	cmp	BYTE[131+rbp],0
+	jne	NEAR $L$avx_00_47
+	shrd	r13d,r13d,14
+	mov	eax,r14d
+	mov	r12d,r9d
+	shrd	r14d,r14d,9
+	xor	r13d,r8d
+	xor	r12d,r10d
+	shrd	r13d,r13d,5
+	xor	r14d,eax
+	and	r12d,r8d
+	xor	r13d,r8d
+	add	r11d,DWORD[rsp]
+	mov	r15d,eax
+	xor	r12d,r10d
+	shrd	r14d,r14d,11
+	xor	r15d,ebx
+	add	r11d,r12d
+	shrd	r13d,r13d,6
+	and	edi,r15d
+	xor	r14d,eax
+	add	r11d,r13d
+	xor	edi,ebx
+	shrd	r14d,r14d,2
+	add	edx,r11d
+	add	r11d,edi
+	mov	r13d,edx
+	add	r14d,r11d
+	shrd	r13d,r13d,14
+	mov	r11d,r14d
+	mov	r12d,r8d
+	shrd	r14d,r14d,9
+	xor	r13d,edx
+	xor	r12d,r9d
+	shrd	r13d,r13d,5
+	xor	r14d,r11d
+	and	r12d,edx
+	xor	r13d,edx
+	add	r10d,DWORD[4+rsp]
+	mov	edi,r11d
+	xor	r12d,r9d
+	shrd	r14d,r14d,11
+	xor	edi,eax
+	add	r10d,r12d
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,r11d
+	add	r10d,r13d
+	xor	r15d,eax
+	shrd	r14d,r14d,2
+	add	ecx,r10d
+	add	r10d,r15d
+	mov	r13d,ecx
+	add	r14d,r10d
+	shrd	r13d,r13d,14
+	mov	r10d,r14d
+	mov	r12d,edx
+	shrd	r14d,r14d,9
+	xor	r13d,ecx
+	xor	r12d,r8d
+	shrd	r13d,r13d,5
+	xor	r14d,r10d
+	and	r12d,ecx
+	xor	r13d,ecx
+	add	r9d,DWORD[8+rsp]
+	mov	r15d,r10d
+	xor	r12d,r8d
+	shrd	r14d,r14d,11
+	xor	r15d,r11d
+	add	r9d,r12d
+	shrd	r13d,r13d,6
+	and	edi,r15d
+	xor	r14d,r10d
+	add	r9d,r13d
+	xor	edi,r11d
+	shrd	r14d,r14d,2
+	add	ebx,r9d
+	add	r9d,edi
+	mov	r13d,ebx
+	add	r14d,r9d
+	shrd	r13d,r13d,14
+	mov	r9d,r14d
+	mov	r12d,ecx
+	shrd	r14d,r14d,9
+	xor	r13d,ebx
+	xor	r12d,edx
+	shrd	r13d,r13d,5
+	xor	r14d,r9d
+	and	r12d,ebx
+	xor	r13d,ebx
+	add	r8d,DWORD[12+rsp]
+	mov	edi,r9d
+	xor	r12d,edx
+	shrd	r14d,r14d,11
+	xor	edi,r10d
+	add	r8d,r12d
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,r9d
+	add	r8d,r13d
+	xor	r15d,r10d
+	shrd	r14d,r14d,2
+	add	eax,r8d
+	add	r8d,r15d
+	mov	r13d,eax
+	add	r14d,r8d
+	shrd	r13d,r13d,14
+	mov	r8d,r14d
+	mov	r12d,ebx
+	shrd	r14d,r14d,9
+	xor	r13d,eax
+	xor	r12d,ecx
+	shrd	r13d,r13d,5
+	xor	r14d,r8d
+	and	r12d,eax
+	xor	r13d,eax
+	add	edx,DWORD[16+rsp]
+	mov	r15d,r8d
+	xor	r12d,ecx
+	shrd	r14d,r14d,11
+	xor	r15d,r9d
+	add	edx,r12d
+	shrd	r13d,r13d,6
+	and	edi,r15d
+	xor	r14d,r8d
+	add	edx,r13d
+	xor	edi,r9d
+	shrd	r14d,r14d,2
+	add	r11d,edx
+	add	edx,edi
+	mov	r13d,r11d
+	add	r14d,edx
+	shrd	r13d,r13d,14
+	mov	edx,r14d
+	mov	r12d,eax
+	shrd	r14d,r14d,9
+	xor	r13d,r11d
+	xor	r12d,ebx
+	shrd	r13d,r13d,5
+	xor	r14d,edx
+	and	r12d,r11d
+	xor	r13d,r11d
+	add	ecx,DWORD[20+rsp]
+	mov	edi,edx
+	xor	r12d,ebx
+	shrd	r14d,r14d,11
+	xor	edi,r8d
+	add	ecx,r12d
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,edx
+	add	ecx,r13d
+	xor	r15d,r8d
+	shrd	r14d,r14d,2
+	add	r10d,ecx
+	add	ecx,r15d
+	mov	r13d,r10d
+	add	r14d,ecx
+	shrd	r13d,r13d,14
+	mov	ecx,r14d
+	mov	r12d,r11d
+	shrd	r14d,r14d,9
+	xor	r13d,r10d
+	xor	r12d,eax
+	shrd	r13d,r13d,5
+	xor	r14d,ecx
+	and	r12d,r10d
+	xor	r13d,r10d
+	add	ebx,DWORD[24+rsp]
+	mov	r15d,ecx
+	xor	r12d,eax
+	shrd	r14d,r14d,11
+	xor	r15d,edx
+	add	ebx,r12d
+	shrd	r13d,r13d,6
+	and	edi,r15d
+	xor	r14d,ecx
+	add	ebx,r13d
+	xor	edi,edx
+	shrd	r14d,r14d,2
+	add	r9d,ebx
+	add	ebx,edi
+	mov	r13d,r9d
+	add	r14d,ebx
+	shrd	r13d,r13d,14
+	mov	ebx,r14d
+	mov	r12d,r10d
+	shrd	r14d,r14d,9
+	xor	r13d,r9d
+	xor	r12d,r11d
+	shrd	r13d,r13d,5
+	xor	r14d,ebx
+	and	r12d,r9d
+	xor	r13d,r9d
+	add	eax,DWORD[28+rsp]
+	mov	edi,ebx
+	xor	r12d,r11d
+	shrd	r14d,r14d,11
+	xor	edi,ecx
+	add	eax,r12d
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,ebx
+	add	eax,r13d
+	xor	r15d,ecx
+	shrd	r14d,r14d,2
+	add	r8d,eax
+	add	eax,r15d
+	mov	r13d,r8d
+	add	r14d,eax
+	shrd	r13d,r13d,14
+	mov	eax,r14d
+	mov	r12d,r9d
+	shrd	r14d,r14d,9
+	xor	r13d,r8d
+	xor	r12d,r10d
+	shrd	r13d,r13d,5
+	xor	r14d,eax
+	and	r12d,r8d
+	xor	r13d,r8d
+	add	r11d,DWORD[32+rsp]
+	mov	r15d,eax
+	xor	r12d,r10d
+	shrd	r14d,r14d,11
+	xor	r15d,ebx
+	add	r11d,r12d
+	shrd	r13d,r13d,6
+	and	edi,r15d
+	xor	r14d,eax
+	add	r11d,r13d
+	xor	edi,ebx
+	shrd	r14d,r14d,2
+	add	edx,r11d
+	add	r11d,edi
+	mov	r13d,edx
+	add	r14d,r11d
+	shrd	r13d,r13d,14
+	mov	r11d,r14d
+	mov	r12d,r8d
+	shrd	r14d,r14d,9
+	xor	r13d,edx
+	xor	r12d,r9d
+	shrd	r13d,r13d,5
+	xor	r14d,r11d
+	and	r12d,edx
+	xor	r13d,edx
+	add	r10d,DWORD[36+rsp]
+	mov	edi,r11d
+	xor	r12d,r9d
+	shrd	r14d,r14d,11
+	xor	edi,eax
+	add	r10d,r12d
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,r11d
+	add	r10d,r13d
+	xor	r15d,eax
+	shrd	r14d,r14d,2
+	add	ecx,r10d
+	add	r10d,r15d
+	mov	r13d,ecx
+	add	r14d,r10d
+	shrd	r13d,r13d,14
+	mov	r10d,r14d
+	mov	r12d,edx
+	shrd	r14d,r14d,9
+	xor	r13d,ecx
+	xor	r12d,r8d
+	shrd	r13d,r13d,5
+	xor	r14d,r10d
+	and	r12d,ecx
+	xor	r13d,ecx
+	add	r9d,DWORD[40+rsp]
+	mov	r15d,r10d
+	xor	r12d,r8d
+	shrd	r14d,r14d,11
+	xor	r15d,r11d
+	add	r9d,r12d
+	shrd	r13d,r13d,6
+	and	edi,r15d
+	xor	r14d,r10d
+	add	r9d,r13d
+	xor	edi,r11d
+	shrd	r14d,r14d,2
+	add	ebx,r9d
+	add	r9d,edi
+	mov	r13d,ebx
+	add	r14d,r9d
+	shrd	r13d,r13d,14
+	mov	r9d,r14d
+	mov	r12d,ecx
+	shrd	r14d,r14d,9
+	xor	r13d,ebx
+	xor	r12d,edx
+	shrd	r13d,r13d,5
+	xor	r14d,r9d
+	and	r12d,ebx
+	xor	r13d,ebx
+	add	r8d,DWORD[44+rsp]
+	mov	edi,r9d
+	xor	r12d,edx
+	shrd	r14d,r14d,11
+	xor	edi,r10d
+	add	r8d,r12d
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,r9d
+	add	r8d,r13d
+	xor	r15d,r10d
+	shrd	r14d,r14d,2
+	add	eax,r8d
+	add	r8d,r15d
+	mov	r13d,eax
+	add	r14d,r8d
+	shrd	r13d,r13d,14
+	mov	r8d,r14d
+	mov	r12d,ebx
+	shrd	r14d,r14d,9
+	xor	r13d,eax
+	xor	r12d,ecx
+	shrd	r13d,r13d,5
+	xor	r14d,r8d
+	and	r12d,eax
+	xor	r13d,eax
+	add	edx,DWORD[48+rsp]
+	mov	r15d,r8d
+	xor	r12d,ecx
+	shrd	r14d,r14d,11
+	xor	r15d,r9d
+	add	edx,r12d
+	shrd	r13d,r13d,6
+	and	edi,r15d
+	xor	r14d,r8d
+	add	edx,r13d
+	xor	edi,r9d
+	shrd	r14d,r14d,2
+	add	r11d,edx
+	add	edx,edi
+	mov	r13d,r11d
+	add	r14d,edx
+	shrd	r13d,r13d,14
+	mov	edx,r14d
+	mov	r12d,eax
+	shrd	r14d,r14d,9
+	xor	r13d,r11d
+	xor	r12d,ebx
+	shrd	r13d,r13d,5
+	xor	r14d,edx
+	and	r12d,r11d
+	xor	r13d,r11d
+	add	ecx,DWORD[52+rsp]
+	mov	edi,edx
+	xor	r12d,ebx
+	shrd	r14d,r14d,11
+	xor	edi,r8d
+	add	ecx,r12d
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,edx
+	add	ecx,r13d
+	xor	r15d,r8d
+	shrd	r14d,r14d,2
+	add	r10d,ecx
+	add	ecx,r15d
+	mov	r13d,r10d
+	add	r14d,ecx
+	shrd	r13d,r13d,14
+	mov	ecx,r14d
+	mov	r12d,r11d
+	shrd	r14d,r14d,9
+	xor	r13d,r10d
+	xor	r12d,eax
+	shrd	r13d,r13d,5
+	xor	r14d,ecx
+	and	r12d,r10d
+	xor	r13d,r10d
+	add	ebx,DWORD[56+rsp]
+	mov	r15d,ecx
+	xor	r12d,eax
+	shrd	r14d,r14d,11
+	xor	r15d,edx
+	add	ebx,r12d
+	shrd	r13d,r13d,6
+	and	edi,r15d
+	xor	r14d,ecx
+	add	ebx,r13d
+	xor	edi,edx
+	shrd	r14d,r14d,2
+	add	r9d,ebx
+	add	ebx,edi
+	mov	r13d,r9d
+	add	r14d,ebx
+	shrd	r13d,r13d,14
+	mov	ebx,r14d
+	mov	r12d,r10d
+	shrd	r14d,r14d,9
+	xor	r13d,r9d
+	xor	r12d,r11d
+	shrd	r13d,r13d,5
+	xor	r14d,ebx
+	and	r12d,r9d
+	xor	r13d,r9d
+	add	eax,DWORD[60+rsp]
+	mov	edi,ebx
+	xor	r12d,r11d
+	shrd	r14d,r14d,11
+	xor	edi,ecx
+	add	eax,r12d
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,ebx
+	add	eax,r13d
+	xor	r15d,ecx
+	shrd	r14d,r14d,2
+	add	r8d,eax
+	add	eax,r15d
+	mov	r13d,r8d
+	add	r14d,eax
+	mov	rdi,QWORD[((64+0))+rsp]
+	mov	eax,r14d
+
+	add	eax,DWORD[rdi]
+	lea	rsi,[64+rsi]
+	add	ebx,DWORD[4+rdi]
+	add	ecx,DWORD[8+rdi]
+	add	edx,DWORD[12+rdi]
+	add	r8d,DWORD[16+rdi]
+	add	r9d,DWORD[20+rdi]
+	add	r10d,DWORD[24+rdi]
+	add	r11d,DWORD[28+rdi]
+
+	cmp	rsi,QWORD[((64+16))+rsp]
+
+	mov	DWORD[rdi],eax
+	mov	DWORD[4+rdi],ebx
+	mov	DWORD[8+rdi],ecx
+	mov	DWORD[12+rdi],edx
+	mov	DWORD[16+rdi],r8d
+	mov	DWORD[20+rdi],r9d
+	mov	DWORD[24+rdi],r10d
+	mov	DWORD[28+rdi],r11d
+	jb	NEAR $L$loop_avx
+
+	mov	rsi,QWORD[88+rsp]
+
+	vzeroupper
+	movaps	xmm6,XMMWORD[((64+32))+rsp]
+	movaps	xmm7,XMMWORD[((64+48))+rsp]
+	movaps	xmm8,XMMWORD[((64+64))+rsp]
+	movaps	xmm9,XMMWORD[((64+80))+rsp]
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$epilogue_avx:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_sha256_block_data_order_avx:
+EXTERN	__imp_RtlVirtualUnwind
+
+ALIGN	16
+se_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$in_prologue
+
+	mov	rax,QWORD[152+r8]
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$in_prologue
+	mov	rsi,rax
+	mov	rax,QWORD[((64+24))+rax]
+
+	mov	rbx,QWORD[((-8))+rax]
+	mov	rbp,QWORD[((-16))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r14,QWORD[((-40))+rax]
+	mov	r15,QWORD[((-48))+rax]
+	mov	QWORD[144+r8],rbx
+	mov	QWORD[160+r8],rbp
+	mov	QWORD[216+r8],r12
+	mov	QWORD[224+r8],r13
+	mov	QWORD[232+r8],r14
+	mov	QWORD[240+r8],r15
+
+	lea	r10,[$L$epilogue]
+	cmp	rbx,r10
+	jb	NEAR $L$in_prologue
+
+	lea	rsi,[((64+32))+rsi]
+	lea	rdi,[512+r8]
+	mov	ecx,8
+	DD	0xa548f3fc
+
+$L$in_prologue:
+	mov	rdi,QWORD[8+rax]
+	mov	rsi,QWORD[16+rax]
+	mov	QWORD[152+r8],rax
+	mov	QWORD[168+r8],rsi
+	mov	QWORD[176+r8],rdi
+
+	mov	rdi,QWORD[40+r9]
+	mov	rsi,r8
+	mov	ecx,154
+	DD	0xa548f3fc
+
+	mov	rsi,r9
+	xor	rcx,rcx
+	mov	rdx,QWORD[8+rsi]
+	mov	r8,QWORD[rsi]
+	mov	r9,QWORD[16+rsi]
+	mov	r10,QWORD[40+rsi]
+	lea	r11,[56+rsi]
+	lea	r12,[24+rsi]
+	mov	QWORD[32+rsp],r10
+	mov	QWORD[40+rsp],r11
+	mov	QWORD[48+rsp],r12
+	mov	QWORD[56+rsp],rcx
+	call	QWORD[__imp_RtlVirtualUnwind]
+
+	mov	eax,1
+	add	rsp,64
+	popfq
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rbp
+	pop	rbx
+	pop	rdi
+	pop	rsi
+	ret
+
+
+ALIGN	16
+shaext_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	lea	r10,[$L$prologue_shaext]
+	cmp	rbx,r10
+	jb	NEAR $L$in_prologue
+
+	lea	r10,[$L$epilogue_shaext]
+	cmp	rbx,r10
+	jae	NEAR $L$in_prologue
+
+	lea	rsi,[((-8-80))+rax]
+	lea	rdi,[512+r8]
+	mov	ecx,10
+	DD	0xa548f3fc
+
+	jmp	NEAR $L$in_prologue
+
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_sha256_block_data_order_nohw wrt ..imagebase
+	DD	$L$SEH_end_sha256_block_data_order_nohw wrt ..imagebase
+	DD	$L$SEH_info_sha256_block_data_order_nohw wrt ..imagebase
+	DD	$L$SEH_begin_sha256_block_data_order_hw wrt ..imagebase
+	DD	$L$SEH_end_sha256_block_data_order_hw wrt ..imagebase
+	DD	$L$SEH_info_sha256_block_data_order_hw wrt ..imagebase
+	DD	$L$SEH_begin_sha256_block_data_order_ssse3 wrt ..imagebase
+	DD	$L$SEH_end_sha256_block_data_order_ssse3 wrt ..imagebase
+	DD	$L$SEH_info_sha256_block_data_order_ssse3 wrt ..imagebase
+	DD	$L$SEH_begin_sha256_block_data_order_avx wrt ..imagebase
+	DD	$L$SEH_end_sha256_block_data_order_avx wrt ..imagebase
+	DD	$L$SEH_info_sha256_block_data_order_avx wrt ..imagebase
+section	.xdata rdata align=8
+ALIGN	8
+$L$SEH_info_sha256_block_data_order_nohw:
+	DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+	DD	$L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase
+$L$SEH_info_sha256_block_data_order_hw:
+	DB	9,0,0,0
+	DD	shaext_handler wrt ..imagebase
+$L$SEH_info_sha256_block_data_order_ssse3:
+	DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+	DD	$L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase
+$L$SEH_info_sha256_block_data_order_avx:
+	DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+	DD	$L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/sha512-586-apple.S b/gen/bcm/sha512-586-apple.S
new file mode 100644
index 0000000..cfdeac1
--- /dev/null
+++ b/gen/bcm/sha512-586-apple.S
@@ -0,0 +1,2837 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
+.text
+.globl	_sha512_block_data_order
+.private_extern	_sha512_block_data_order
+.align	4
+_sha512_block_data_order:
+L_sha512_block_data_order_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	%esp,%ebx
+	call	L000pic_point
+L000pic_point:
+	popl	%ebp
+	leal	L001K512-L000pic_point(%ebp),%ebp
+	subl	$16,%esp
+	andl	$-64,%esp
+	shll	$7,%eax
+	addl	%edi,%eax
+	movl	%esi,(%esp)
+	movl	%edi,4(%esp)
+	movl	%eax,8(%esp)
+	movl	%ebx,12(%esp)
+	movl	L_OPENSSL_ia32cap_P$non_lazy_ptr-L001K512(%ebp),%edx
+	movl	(%edx),%ecx
+	testl	$67108864,%ecx
+	jz	L002loop_x86
+	movl	4(%edx),%edx
+	movq	(%esi),%mm0
+	andl	$16777216,%ecx
+	movq	8(%esi),%mm1
+	andl	$512,%edx
+	movq	16(%esi),%mm2
+	orl	%edx,%ecx
+	movq	24(%esi),%mm3
+	movq	32(%esi),%mm4
+	movq	40(%esi),%mm5
+	movq	48(%esi),%mm6
+	movq	56(%esi),%mm7
+	cmpl	$16777728,%ecx
+	je	L003SSSE3
+	subl	$80,%esp
+	jmp	L004loop_sse2
+.align	4,0x90
+L004loop_sse2:
+	movq	%mm1,8(%esp)
+	movq	%mm2,16(%esp)
+	movq	%mm3,24(%esp)
+	movq	%mm5,40(%esp)
+	movq	%mm6,48(%esp)
+	pxor	%mm1,%mm2
+	movq	%mm7,56(%esp)
+	movq	%mm0,%mm3
+	movl	(%edi),%eax
+	movl	4(%edi),%ebx
+	addl	$8,%edi
+	movl	$15,%edx
+	bswap	%eax
+	bswap	%ebx
+	jmp	L00500_14_sse2
+.align	4,0x90
+L00500_14_sse2:
+	movd	%eax,%mm1
+	movl	(%edi),%eax
+	movd	%ebx,%mm7
+	movl	4(%edi),%ebx
+	addl	$8,%edi
+	bswap	%eax
+	bswap	%ebx
+	punpckldq	%mm1,%mm7
+	movq	%mm4,%mm1
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	movq	%mm3,%mm0
+	movq	%mm7,72(%esp)
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	paddq	(%ebp),%mm7
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	subl	$8,%esp
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	40(%esp),%mm5
+	paddq	%mm2,%mm3
+	movq	%mm0,%mm2
+	addl	$8,%ebp
+	paddq	%mm6,%mm3
+	movq	48(%esp),%mm6
+	decl	%edx
+	jnz	L00500_14_sse2
+	movd	%eax,%mm1
+	movd	%ebx,%mm7
+	punpckldq	%mm1,%mm7
+	movq	%mm4,%mm1
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	movq	%mm3,%mm0
+	movq	%mm7,72(%esp)
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	paddq	(%ebp),%mm7
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	subl	$8,%esp
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	192(%esp),%mm7
+	paddq	%mm2,%mm3
+	movq	%mm0,%mm2
+	addl	$8,%ebp
+	paddq	%mm6,%mm3
+	pxor	%mm0,%mm0
+	movl	$32,%edx
+	jmp	L00616_79_sse2
+.align	4,0x90
+L00616_79_sse2:
+	movq	88(%esp),%mm5
+	movq	%mm7,%mm1
+	psrlq	$1,%mm7
+	movq	%mm5,%mm6
+	psrlq	$6,%mm5
+	psllq	$56,%mm1
+	paddq	%mm3,%mm0
+	movq	%mm7,%mm3
+	psrlq	$6,%mm7
+	pxor	%mm1,%mm3
+	psllq	$7,%mm1
+	pxor	%mm7,%mm3
+	psrlq	$1,%mm7
+	pxor	%mm1,%mm3
+	movq	%mm5,%mm1
+	psrlq	$13,%mm5
+	pxor	%mm3,%mm7
+	psllq	$3,%mm6
+	pxor	%mm5,%mm1
+	paddq	200(%esp),%mm7
+	pxor	%mm6,%mm1
+	psrlq	$42,%mm5
+	paddq	128(%esp),%mm7
+	pxor	%mm5,%mm1
+	psllq	$42,%mm6
+	movq	40(%esp),%mm5
+	pxor	%mm6,%mm1
+	movq	48(%esp),%mm6
+	paddq	%mm1,%mm7
+	movq	%mm4,%mm1
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	movq	%mm7,72(%esp)
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	paddq	(%ebp),%mm7
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	subl	$8,%esp
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	192(%esp),%mm7
+	paddq	%mm6,%mm2
+	addl	$8,%ebp
+	movq	88(%esp),%mm5
+	movq	%mm7,%mm1
+	psrlq	$1,%mm7
+	movq	%mm5,%mm6
+	psrlq	$6,%mm5
+	psllq	$56,%mm1
+	paddq	%mm3,%mm2
+	movq	%mm7,%mm3
+	psrlq	$6,%mm7
+	pxor	%mm1,%mm3
+	psllq	$7,%mm1
+	pxor	%mm7,%mm3
+	psrlq	$1,%mm7
+	pxor	%mm1,%mm3
+	movq	%mm5,%mm1
+	psrlq	$13,%mm5
+	pxor	%mm3,%mm7
+	psllq	$3,%mm6
+	pxor	%mm5,%mm1
+	paddq	200(%esp),%mm7
+	pxor	%mm6,%mm1
+	psrlq	$42,%mm5
+	paddq	128(%esp),%mm7
+	pxor	%mm5,%mm1
+	psllq	$42,%mm6
+	movq	40(%esp),%mm5
+	pxor	%mm6,%mm1
+	movq	48(%esp),%mm6
+	paddq	%mm1,%mm7
+	movq	%mm4,%mm1
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	movq	%mm7,72(%esp)
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	paddq	(%ebp),%mm7
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	subl	$8,%esp
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	192(%esp),%mm7
+	paddq	%mm6,%mm0
+	addl	$8,%ebp
+	decl	%edx
+	jnz	L00616_79_sse2
+	paddq	%mm3,%mm0
+	movq	8(%esp),%mm1
+	movq	24(%esp),%mm3
+	movq	40(%esp),%mm5
+	movq	48(%esp),%mm6
+	movq	56(%esp),%mm7
+	pxor	%mm1,%mm2
+	paddq	(%esi),%mm0
+	paddq	8(%esi),%mm1
+	paddq	16(%esi),%mm2
+	paddq	24(%esi),%mm3
+	paddq	32(%esi),%mm4
+	paddq	40(%esi),%mm5
+	paddq	48(%esi),%mm6
+	paddq	56(%esi),%mm7
+	movl	$640,%eax
+	movq	%mm0,(%esi)
+	movq	%mm1,8(%esi)
+	movq	%mm2,16(%esi)
+	movq	%mm3,24(%esi)
+	movq	%mm4,32(%esi)
+	movq	%mm5,40(%esi)
+	movq	%mm6,48(%esi)
+	movq	%mm7,56(%esi)
+	leal	(%esp,%eax,1),%esp
+	subl	%eax,%ebp
+	cmpl	88(%esp),%edi
+	jb	L004loop_sse2
+	movl	92(%esp),%esp
+	emms
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	5,0x90
+L003SSSE3:
+	leal	-64(%esp),%edx
+	subl	$256,%esp
+	movdqa	640(%ebp),%xmm1
+	movdqu	(%edi),%xmm0
+.byte	102,15,56,0,193
+	movdqa	(%ebp),%xmm3
+	movdqa	%xmm1,%xmm2
+	movdqu	16(%edi),%xmm1
+	paddq	%xmm0,%xmm3
+.byte	102,15,56,0,202
+	movdqa	%xmm3,-128(%edx)
+	movdqa	16(%ebp),%xmm4
+	movdqa	%xmm2,%xmm3
+	movdqu	32(%edi),%xmm2
+	paddq	%xmm1,%xmm4
+.byte	102,15,56,0,211
+	movdqa	%xmm4,-112(%edx)
+	movdqa	32(%ebp),%xmm5
+	movdqa	%xmm3,%xmm4
+	movdqu	48(%edi),%xmm3
+	paddq	%xmm2,%xmm5
+.byte	102,15,56,0,220
+	movdqa	%xmm5,-96(%edx)
+	movdqa	48(%ebp),%xmm6
+	movdqa	%xmm4,%xmm5
+	movdqu	64(%edi),%xmm4
+	paddq	%xmm3,%xmm6
+.byte	102,15,56,0,229
+	movdqa	%xmm6,-80(%edx)
+	movdqa	64(%ebp),%xmm7
+	movdqa	%xmm5,%xmm6
+	movdqu	80(%edi),%xmm5
+	paddq	%xmm4,%xmm7
+.byte	102,15,56,0,238
+	movdqa	%xmm7,-64(%edx)
+	movdqa	%xmm0,(%edx)
+	movdqa	80(%ebp),%xmm0
+	movdqa	%xmm6,%xmm7
+	movdqu	96(%edi),%xmm6
+	paddq	%xmm5,%xmm0
+.byte	102,15,56,0,247
+	movdqa	%xmm0,-48(%edx)
+	movdqa	%xmm1,16(%edx)
+	movdqa	96(%ebp),%xmm1
+	movdqa	%xmm7,%xmm0
+	movdqu	112(%edi),%xmm7
+	paddq	%xmm6,%xmm1
+.byte	102,15,56,0,248
+	movdqa	%xmm1,-32(%edx)
+	movdqa	%xmm2,32(%edx)
+	movdqa	112(%ebp),%xmm2
+	movdqa	(%edx),%xmm0
+	paddq	%xmm7,%xmm2
+	movdqa	%xmm2,-16(%edx)
+	nop
+.align	5,0x90
+L007loop_ssse3:
+	movdqa	16(%edx),%xmm2
+	movdqa	%xmm3,48(%edx)
+	leal	128(%ebp),%ebp
+	movq	%mm1,8(%esp)
+	movl	%edi,%ebx
+	movq	%mm2,16(%esp)
+	leal	128(%edi),%edi
+	movq	%mm3,24(%esp)
+	cmpl	%eax,%edi
+	movq	%mm5,40(%esp)
+	cmovbl	%edi,%ebx
+	movq	%mm6,48(%esp)
+	movl	$4,%ecx
+	pxor	%mm1,%mm2
+	movq	%mm7,56(%esp)
+	pxor	%mm3,%mm3
+	jmp	L00800_47_ssse3
+.align	5,0x90
+L00800_47_ssse3:
+	movdqa	%xmm5,%xmm3
+	movdqa	%xmm2,%xmm1
+.byte	102,15,58,15,208,8
+	movdqa	%xmm4,(%edx)
+.byte	102,15,58,15,220,8
+	movdqa	%xmm2,%xmm4
+	psrlq	$7,%xmm2
+	paddq	%xmm3,%xmm0
+	movdqa	%xmm4,%xmm3
+	psrlq	$1,%xmm4
+	psllq	$56,%xmm3
+	pxor	%xmm4,%xmm2
+	psrlq	$7,%xmm4
+	pxor	%xmm3,%xmm2
+	psllq	$7,%xmm3
+	pxor	%xmm4,%xmm2
+	movdqa	%xmm7,%xmm4
+	pxor	%xmm3,%xmm2
+	movdqa	%xmm7,%xmm3
+	psrlq	$6,%xmm4
+	paddq	%xmm2,%xmm0
+	movdqa	%xmm7,%xmm2
+	psrlq	$19,%xmm3
+	psllq	$3,%xmm2
+	pxor	%xmm3,%xmm4
+	psrlq	$42,%xmm3
+	pxor	%xmm2,%xmm4
+	psllq	$42,%xmm2
+	pxor	%xmm3,%xmm4
+	movdqa	32(%edx),%xmm3
+	pxor	%xmm2,%xmm4
+	movdqa	(%ebp),%xmm2
+	movq	%mm4,%mm1
+	paddq	%xmm4,%xmm0
+	movq	-128(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	paddq	%xmm0,%xmm2
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	32(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	40(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-120(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,24(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,56(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	48(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	16(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	24(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	32(%esp),%mm6
+	movdqa	%xmm2,-128(%edx)
+	movdqa	%xmm6,%xmm4
+	movdqa	%xmm3,%xmm2
+.byte	102,15,58,15,217,8
+	movdqa	%xmm5,16(%edx)
+.byte	102,15,58,15,229,8
+	movdqa	%xmm3,%xmm5
+	psrlq	$7,%xmm3
+	paddq	%xmm4,%xmm1
+	movdqa	%xmm5,%xmm4
+	psrlq	$1,%xmm5
+	psllq	$56,%xmm4
+	pxor	%xmm5,%xmm3
+	psrlq	$7,%xmm5
+	pxor	%xmm4,%xmm3
+	psllq	$7,%xmm4
+	pxor	%xmm5,%xmm3
+	movdqa	%xmm0,%xmm5
+	pxor	%xmm4,%xmm3
+	movdqa	%xmm0,%xmm4
+	psrlq	$6,%xmm5
+	paddq	%xmm3,%xmm1
+	movdqa	%xmm0,%xmm3
+	psrlq	$19,%xmm4
+	psllq	$3,%xmm3
+	pxor	%xmm4,%xmm5
+	psrlq	$42,%xmm4
+	pxor	%xmm3,%xmm5
+	psllq	$42,%xmm3
+	pxor	%xmm4,%xmm5
+	movdqa	48(%edx),%xmm4
+	pxor	%xmm3,%xmm5
+	movdqa	16(%ebp),%xmm3
+	movq	%mm4,%mm1
+	paddq	%xmm5,%xmm1
+	movq	-112(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,16(%esp)
+	paddq	%xmm1,%xmm3
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,48(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	40(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	8(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	56(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	16(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	24(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-104(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,8(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,40(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	32(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	48(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	8(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	16(%esp),%mm6
+	movdqa	%xmm3,-112(%edx)
+	movdqa	%xmm7,%xmm5
+	movdqa	%xmm4,%xmm3
+.byte	102,15,58,15,226,8
+	movdqa	%xmm6,32(%edx)
+.byte	102,15,58,15,238,8
+	movdqa	%xmm4,%xmm6
+	psrlq	$7,%xmm4
+	paddq	%xmm5,%xmm2
+	movdqa	%xmm6,%xmm5
+	psrlq	$1,%xmm6
+	psllq	$56,%xmm5
+	pxor	%xmm6,%xmm4
+	psrlq	$7,%xmm6
+	pxor	%xmm5,%xmm4
+	psllq	$7,%xmm5
+	pxor	%xmm6,%xmm4
+	movdqa	%xmm1,%xmm6
+	pxor	%xmm5,%xmm4
+	movdqa	%xmm1,%xmm5
+	psrlq	$6,%xmm6
+	paddq	%xmm4,%xmm2
+	movdqa	%xmm1,%xmm4
+	psrlq	$19,%xmm5
+	psllq	$3,%xmm4
+	pxor	%xmm5,%xmm6
+	psrlq	$42,%xmm5
+	pxor	%xmm4,%xmm6
+	psllq	$42,%xmm4
+	pxor	%xmm5,%xmm6
+	movdqa	(%edx),%xmm5
+	pxor	%xmm4,%xmm6
+	movdqa	32(%ebp),%xmm4
+	movq	%mm4,%mm1
+	paddq	%xmm6,%xmm2
+	movq	-96(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,(%esp)
+	paddq	%xmm2,%xmm4
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,32(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	24(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	56(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	40(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	8(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-88(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,56(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,24(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	16(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	48(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	32(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	56(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	(%esp),%mm6
+	movdqa	%xmm4,-96(%edx)
+	movdqa	%xmm0,%xmm6
+	movdqa	%xmm5,%xmm4
+.byte	102,15,58,15,235,8
+	movdqa	%xmm7,48(%edx)
+.byte	102,15,58,15,247,8
+	movdqa	%xmm5,%xmm7
+	psrlq	$7,%xmm5
+	paddq	%xmm6,%xmm3
+	movdqa	%xmm7,%xmm6
+	psrlq	$1,%xmm7
+	psllq	$56,%xmm6
+	pxor	%xmm7,%xmm5
+	psrlq	$7,%xmm7
+	pxor	%xmm6,%xmm5
+	psllq	$7,%xmm6
+	pxor	%xmm7,%xmm5
+	movdqa	%xmm2,%xmm7
+	pxor	%xmm6,%xmm5
+	movdqa	%xmm2,%xmm6
+	psrlq	$6,%xmm7
+	paddq	%xmm5,%xmm3
+	movdqa	%xmm2,%xmm5
+	psrlq	$19,%xmm6
+	psllq	$3,%xmm5
+	pxor	%xmm6,%xmm7
+	psrlq	$42,%xmm6
+	pxor	%xmm5,%xmm7
+	psllq	$42,%xmm5
+	pxor	%xmm6,%xmm7
+	movdqa	16(%edx),%xmm6
+	pxor	%xmm5,%xmm7
+	movdqa	48(%ebp),%xmm5
+	movq	%mm4,%mm1
+	paddq	%xmm7,%xmm3
+	movq	-80(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,48(%esp)
+	paddq	%xmm3,%xmm5
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,16(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	8(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	40(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	24(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	48(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	56(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-72(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,40(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,8(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	32(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	16(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	40(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	48(%esp),%mm6
+	movdqa	%xmm5,-80(%edx)
+	movdqa	%xmm1,%xmm7
+	movdqa	%xmm6,%xmm5
+.byte	102,15,58,15,244,8
+	movdqa	%xmm0,(%edx)
+.byte	102,15,58,15,248,8
+	movdqa	%xmm6,%xmm0
+	psrlq	$7,%xmm6
+	paddq	%xmm7,%xmm4
+	movdqa	%xmm0,%xmm7
+	psrlq	$1,%xmm0
+	psllq	$56,%xmm7
+	pxor	%xmm0,%xmm6
+	psrlq	$7,%xmm0
+	pxor	%xmm7,%xmm6
+	psllq	$7,%xmm7
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm3,%xmm0
+	pxor	%xmm7,%xmm6
+	movdqa	%xmm3,%xmm7
+	psrlq	$6,%xmm0
+	paddq	%xmm6,%xmm4
+	movdqa	%xmm3,%xmm6
+	psrlq	$19,%xmm7
+	psllq	$3,%xmm6
+	pxor	%xmm7,%xmm0
+	psrlq	$42,%xmm7
+	pxor	%xmm6,%xmm0
+	psllq	$42,%xmm6
+	pxor	%xmm7,%xmm0
+	movdqa	32(%edx),%xmm7
+	pxor	%xmm6,%xmm0
+	movdqa	64(%ebp),%xmm6
+	movq	%mm4,%mm1
+	paddq	%xmm0,%xmm4
+	movq	-64(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	paddq	%xmm4,%xmm6
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	32(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	40(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-56(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,24(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,56(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	48(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	16(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	24(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	32(%esp),%mm6
+	movdqa	%xmm6,-64(%edx)
+	movdqa	%xmm2,%xmm0
+	movdqa	%xmm7,%xmm6
+.byte	102,15,58,15,253,8
+	movdqa	%xmm1,16(%edx)
+.byte	102,15,58,15,193,8
+	movdqa	%xmm7,%xmm1
+	psrlq	$7,%xmm7
+	paddq	%xmm0,%xmm5
+	movdqa	%xmm1,%xmm0
+	psrlq	$1,%xmm1
+	psllq	$56,%xmm0
+	pxor	%xmm1,%xmm7
+	psrlq	$7,%xmm1
+	pxor	%xmm0,%xmm7
+	psllq	$7,%xmm0
+	pxor	%xmm1,%xmm7
+	movdqa	%xmm4,%xmm1
+	pxor	%xmm0,%xmm7
+	movdqa	%xmm4,%xmm0
+	psrlq	$6,%xmm1
+	paddq	%xmm7,%xmm5
+	movdqa	%xmm4,%xmm7
+	psrlq	$19,%xmm0
+	psllq	$3,%xmm7
+	pxor	%xmm0,%xmm1
+	psrlq	$42,%xmm0
+	pxor	%xmm7,%xmm1
+	psllq	$42,%xmm7
+	pxor	%xmm0,%xmm1
+	movdqa	48(%edx),%xmm0
+	pxor	%xmm7,%xmm1
+	movdqa	80(%ebp),%xmm7
+	movq	%mm4,%mm1
+	paddq	%xmm1,%xmm5
+	movq	-48(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,16(%esp)
+	paddq	%xmm5,%xmm7
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,48(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	40(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	8(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	56(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	16(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	24(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-40(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,8(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,40(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	32(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	48(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	8(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	16(%esp),%mm6
+	movdqa	%xmm7,-48(%edx)
+	movdqa	%xmm3,%xmm1
+	movdqa	%xmm0,%xmm7
+.byte	102,15,58,15,198,8
+	movdqa	%xmm2,32(%edx)
+.byte	102,15,58,15,202,8
+	movdqa	%xmm0,%xmm2
+	psrlq	$7,%xmm0
+	paddq	%xmm1,%xmm6
+	movdqa	%xmm2,%xmm1
+	psrlq	$1,%xmm2
+	psllq	$56,%xmm1
+	pxor	%xmm2,%xmm0
+	psrlq	$7,%xmm2
+	pxor	%xmm1,%xmm0
+	psllq	$7,%xmm1
+	pxor	%xmm2,%xmm0
+	movdqa	%xmm5,%xmm2
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm5,%xmm1
+	psrlq	$6,%xmm2
+	paddq	%xmm0,%xmm6
+	movdqa	%xmm5,%xmm0
+	psrlq	$19,%xmm1
+	psllq	$3,%xmm0
+	pxor	%xmm1,%xmm2
+	psrlq	$42,%xmm1
+	pxor	%xmm0,%xmm2
+	psllq	$42,%xmm0
+	pxor	%xmm1,%xmm2
+	movdqa	(%edx),%xmm1
+	pxor	%xmm0,%xmm2
+	movdqa	96(%ebp),%xmm0
+	movq	%mm4,%mm1
+	paddq	%xmm2,%xmm6
+	movq	-32(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,(%esp)
+	paddq	%xmm6,%xmm0
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,32(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	24(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	56(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	40(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	8(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-24(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,56(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,24(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	16(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	48(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	32(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	56(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	(%esp),%mm6
+	movdqa	%xmm0,-32(%edx)
+	movdqa	%xmm4,%xmm2
+	movdqa	%xmm1,%xmm0
+.byte	102,15,58,15,207,8
+	movdqa	%xmm3,48(%edx)
+.byte	102,15,58,15,211,8
+	movdqa	%xmm1,%xmm3
+	psrlq	$7,%xmm1
+	paddq	%xmm2,%xmm7
+	movdqa	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	psllq	$56,%xmm2
+	pxor	%xmm3,%xmm1
+	psrlq	$7,%xmm3
+	pxor	%xmm2,%xmm1
+	psllq	$7,%xmm2
+	pxor	%xmm3,%xmm1
+	movdqa	%xmm6,%xmm3
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm6,%xmm2
+	psrlq	$6,%xmm3
+	paddq	%xmm1,%xmm7
+	movdqa	%xmm6,%xmm1
+	psrlq	$19,%xmm2
+	psllq	$3,%xmm1
+	pxor	%xmm2,%xmm3
+	psrlq	$42,%xmm2
+	pxor	%xmm1,%xmm3
+	psllq	$42,%xmm1
+	pxor	%xmm2,%xmm3
+	movdqa	16(%edx),%xmm2
+	pxor	%xmm1,%xmm3
+	movdqa	112(%ebp),%xmm1
+	movq	%mm4,%mm1
+	paddq	%xmm3,%xmm7
+	movq	-16(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,48(%esp)
+	paddq	%xmm7,%xmm1
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,16(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	8(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	40(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	24(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	48(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	56(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-8(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,40(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,8(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	32(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	16(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	40(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	48(%esp),%mm6
+	movdqa	%xmm1,-16(%edx)
+	leal	128(%ebp),%ebp
+	decl	%ecx
+	jnz	L00800_47_ssse3
+	movdqa	(%ebp),%xmm1
+	leal	-640(%ebp),%ebp
+	movdqu	(%ebx),%xmm0
+.byte	102,15,56,0,193
+	movdqa	(%ebp),%xmm3
+	movdqa	%xmm1,%xmm2
+	movdqu	16(%ebx),%xmm1
+	paddq	%xmm0,%xmm3
+.byte	102,15,56,0,202
+	movq	%mm4,%mm1
+	movq	-128(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	32(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	40(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-120(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,24(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,56(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	48(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	16(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	24(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	32(%esp),%mm6
+	movdqa	%xmm3,-128(%edx)
+	movdqa	16(%ebp),%xmm4
+	movdqa	%xmm2,%xmm3
+	movdqu	32(%ebx),%xmm2
+	paddq	%xmm1,%xmm4
+.byte	102,15,56,0,211
+	movq	%mm4,%mm1
+	movq	-112(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,16(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,48(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	40(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	8(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	56(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	16(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	24(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-104(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,8(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,40(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	32(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	48(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	8(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	16(%esp),%mm6
+	movdqa	%xmm4,-112(%edx)
+	movdqa	32(%ebp),%xmm5
+	movdqa	%xmm3,%xmm4
+	movdqu	48(%ebx),%xmm3
+	paddq	%xmm2,%xmm5
+.byte	102,15,56,0,220
+	movq	%mm4,%mm1
+	movq	-96(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,32(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	24(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	56(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	40(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	8(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-88(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,56(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,24(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	16(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	48(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	32(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	56(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	(%esp),%mm6
+	movdqa	%xmm5,-96(%edx)
+	movdqa	48(%ebp),%xmm6
+	movdqa	%xmm4,%xmm5
+	movdqu	64(%ebx),%xmm4
+	paddq	%xmm3,%xmm6
+.byte	102,15,56,0,229
+	movq	%mm4,%mm1
+	movq	-80(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,48(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,16(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	8(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	40(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	24(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	48(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	56(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-72(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,40(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,8(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	32(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	16(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	40(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	48(%esp),%mm6
+	movdqa	%xmm6,-80(%edx)
+	movdqa	64(%ebp),%xmm7
+	movdqa	%xmm5,%xmm6
+	movdqu	80(%ebx),%xmm5
+	paddq	%xmm4,%xmm7
+.byte	102,15,56,0,238
+	movq	%mm4,%mm1
+	movq	-64(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	32(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	40(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-56(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,24(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,56(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	48(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	16(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	24(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	32(%esp),%mm6
+	movdqa	%xmm7,-64(%edx)
+	movdqa	%xmm0,(%edx)
+	movdqa	80(%ebp),%xmm0
+	movdqa	%xmm6,%xmm7
+	movdqu	96(%ebx),%xmm6
+	paddq	%xmm5,%xmm0
+.byte	102,15,56,0,247
+	movq	%mm4,%mm1
+	movq	-48(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,16(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,48(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	40(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	8(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	56(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	16(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	24(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-40(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,8(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,40(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	32(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	48(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	8(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	16(%esp),%mm6
+	movdqa	%xmm0,-48(%edx)
+	movdqa	%xmm1,16(%edx)
+	movdqa	96(%ebp),%xmm1
+	movdqa	%xmm7,%xmm0
+	movdqu	112(%ebx),%xmm7
+	paddq	%xmm6,%xmm1
+.byte	102,15,56,0,248
+	movq	%mm4,%mm1
+	movq	-32(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,32(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	24(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	56(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	40(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	8(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-24(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,56(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,24(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	16(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	48(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	32(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	56(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	(%esp),%mm6
+	movdqa	%xmm1,-32(%edx)
+	movdqa	%xmm2,32(%edx)
+	movdqa	112(%ebp),%xmm2
+	movdqa	(%edx),%xmm0
+	paddq	%xmm7,%xmm2
+	movq	%mm4,%mm1
+	movq	-16(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,48(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,16(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	8(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	40(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	24(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	48(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	56(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-8(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,40(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,8(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	32(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	16(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	40(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	48(%esp),%mm6
+	movdqa	%xmm2,-16(%edx)
+	movq	8(%esp),%mm1
+	paddq	%mm3,%mm0
+	movq	24(%esp),%mm3
+	movq	56(%esp),%mm7
+	pxor	%mm1,%mm2
+	paddq	(%esi),%mm0
+	paddq	8(%esi),%mm1
+	paddq	16(%esi),%mm2
+	paddq	24(%esi),%mm3
+	paddq	32(%esi),%mm4
+	paddq	40(%esi),%mm5
+	paddq	48(%esi),%mm6
+	paddq	56(%esi),%mm7
+	movq	%mm0,(%esi)
+	movq	%mm1,8(%esi)
+	movq	%mm2,16(%esi)
+	movq	%mm3,24(%esi)
+	movq	%mm4,32(%esi)
+	movq	%mm5,40(%esi)
+	movq	%mm6,48(%esi)
+	movq	%mm7,56(%esi)
+	cmpl	%eax,%edi
+	jb	L007loop_ssse3
+	movl	76(%edx),%esp
+	emms
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	4,0x90
+L002loop_x86:
+	movl	(%edi),%eax
+	movl	4(%edi),%ebx
+	movl	8(%edi),%ecx
+	movl	12(%edi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+	movl	16(%edi),%eax
+	movl	20(%edi),%ebx
+	movl	24(%edi),%ecx
+	movl	28(%edi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+	movl	32(%edi),%eax
+	movl	36(%edi),%ebx
+	movl	40(%edi),%ecx
+	movl	44(%edi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+	movl	48(%edi),%eax
+	movl	52(%edi),%ebx
+	movl	56(%edi),%ecx
+	movl	60(%edi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+	movl	64(%edi),%eax
+	movl	68(%edi),%ebx
+	movl	72(%edi),%ecx
+	movl	76(%edi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+	movl	80(%edi),%eax
+	movl	84(%edi),%ebx
+	movl	88(%edi),%ecx
+	movl	92(%edi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+	movl	96(%edi),%eax
+	movl	100(%edi),%ebx
+	movl	104(%edi),%ecx
+	movl	108(%edi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+	movl	112(%edi),%eax
+	movl	116(%edi),%ebx
+	movl	120(%edi),%ecx
+	movl	124(%edi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+	addl	$128,%edi
+	subl	$72,%esp
+	movl	%edi,204(%esp)
+	leal	8(%esp),%edi
+	movl	$16,%ecx
+.long	2784229001
+.align	4,0x90
+L00900_15_x86:
+	movl	40(%esp),%ecx
+	movl	44(%esp),%edx
+	movl	%ecx,%esi
+	shrl	$9,%ecx
+	movl	%edx,%edi
+	shrl	$9,%edx
+	movl	%ecx,%ebx
+	shll	$14,%esi
+	movl	%edx,%eax
+	shll	$14,%edi
+	xorl	%esi,%ebx
+	shrl	$5,%ecx
+	xorl	%edi,%eax
+	shrl	$5,%edx
+	xorl	%ecx,%eax
+	shll	$4,%esi
+	xorl	%edx,%ebx
+	shll	$4,%edi
+	xorl	%esi,%ebx
+	shrl	$4,%ecx
+	xorl	%edi,%eax
+	shrl	$4,%edx
+	xorl	%ecx,%eax
+	shll	$5,%esi
+	xorl	%edx,%ebx
+	shll	$5,%edi
+	xorl	%esi,%eax
+	xorl	%edi,%ebx
+	movl	48(%esp),%ecx
+	movl	52(%esp),%edx
+	movl	56(%esp),%esi
+	movl	60(%esp),%edi
+	addl	64(%esp),%eax
+	adcl	68(%esp),%ebx
+	xorl	%esi,%ecx
+	xorl	%edi,%edx
+	andl	40(%esp),%ecx
+	andl	44(%esp),%edx
+	addl	192(%esp),%eax
+	adcl	196(%esp),%ebx
+	xorl	%esi,%ecx
+	xorl	%edi,%edx
+	movl	(%ebp),%esi
+	movl	4(%ebp),%edi
+	addl	%ecx,%eax
+	adcl	%edx,%ebx
+	movl	32(%esp),%ecx
+	movl	36(%esp),%edx
+	addl	%esi,%eax
+	adcl	%edi,%ebx
+	movl	%eax,(%esp)
+	movl	%ebx,4(%esp)
+	addl	%ecx,%eax
+	adcl	%edx,%ebx
+	movl	8(%esp),%ecx
+	movl	12(%esp),%edx
+	movl	%eax,32(%esp)
+	movl	%ebx,36(%esp)
+	movl	%ecx,%esi
+	shrl	$2,%ecx
+	movl	%edx,%edi
+	shrl	$2,%edx
+	movl	%ecx,%ebx
+	shll	$4,%esi
+	movl	%edx,%eax
+	shll	$4,%edi
+	xorl	%esi,%ebx
+	shrl	$5,%ecx
+	xorl	%edi,%eax
+	shrl	$5,%edx
+	xorl	%ecx,%ebx
+	shll	$21,%esi
+	xorl	%edx,%eax
+	shll	$21,%edi
+	xorl	%esi,%eax
+	shrl	$21,%ecx
+	xorl	%edi,%ebx
+	shrl	$21,%edx
+	xorl	%ecx,%eax
+	shll	$5,%esi
+	xorl	%edx,%ebx
+	shll	$5,%edi
+	xorl	%esi,%eax
+	xorl	%edi,%ebx
+	movl	8(%esp),%ecx
+	movl	12(%esp),%edx
+	movl	16(%esp),%esi
+	movl	20(%esp),%edi
+	addl	(%esp),%eax
+	adcl	4(%esp),%ebx
+	orl	%esi,%ecx
+	orl	%edi,%edx
+	andl	24(%esp),%ecx
+	andl	28(%esp),%edx
+	andl	8(%esp),%esi
+	andl	12(%esp),%edi
+	orl	%esi,%ecx
+	orl	%edi,%edx
+	addl	%ecx,%eax
+	adcl	%edx,%ebx
+	movl	%eax,(%esp)
+	movl	%ebx,4(%esp)
+	movb	(%ebp),%dl
+	subl	$8,%esp
+	leal	8(%ebp),%ebp
+	cmpb	$148,%dl
+	jne	L00900_15_x86
+.align	4,0x90
+L01016_79_x86:
+	movl	312(%esp),%ecx
+	movl	316(%esp),%edx
+	movl	%ecx,%esi
+	shrl	$1,%ecx
+	movl	%edx,%edi
+	shrl	$1,%edx
+	movl	%ecx,%eax
+	shll	$24,%esi
+	movl	%edx,%ebx
+	shll	$24,%edi
+	xorl	%esi,%ebx
+	shrl	$6,%ecx
+	xorl	%edi,%eax
+	shrl	$6,%edx
+	xorl	%ecx,%eax
+	shll	$7,%esi
+	xorl	%edx,%ebx
+	shll	$1,%edi
+	xorl	%esi,%ebx
+	shrl	$1,%ecx
+	xorl	%edi,%eax
+	shrl	$1,%edx
+	xorl	%ecx,%eax
+	shll	$6,%edi
+	xorl	%edx,%ebx
+	xorl	%edi,%eax
+	movl	%eax,(%esp)
+	movl	%ebx,4(%esp)
+	movl	208(%esp),%ecx
+	movl	212(%esp),%edx
+	movl	%ecx,%esi
+	shrl	$6,%ecx
+	movl	%edx,%edi
+	shrl	$6,%edx
+	movl	%ecx,%eax
+	shll	$3,%esi
+	movl	%edx,%ebx
+	shll	$3,%edi
+	xorl	%esi,%eax
+	shrl	$13,%ecx
+	xorl	%edi,%ebx
+	shrl	$13,%edx
+	xorl	%ecx,%eax
+	shll	$10,%esi
+	xorl	%edx,%ebx
+	shll	$10,%edi
+	xorl	%esi,%ebx
+	shrl	$10,%ecx
+	xorl	%edi,%eax
+	shrl	$10,%edx
+	xorl	%ecx,%ebx
+	shll	$13,%edi
+	xorl	%edx,%eax
+	xorl	%edi,%eax
+	movl	320(%esp),%ecx
+	movl	324(%esp),%edx
+	addl	(%esp),%eax
+	adcl	4(%esp),%ebx
+	movl	248(%esp),%esi
+	movl	252(%esp),%edi
+	addl	%ecx,%eax
+	adcl	%edx,%ebx
+	addl	%esi,%eax
+	adcl	%edi,%ebx
+	movl	%eax,192(%esp)
+	movl	%ebx,196(%esp)
+	movl	40(%esp),%ecx
+	movl	44(%esp),%edx
+	movl	%ecx,%esi
+	shrl	$9,%ecx
+	movl	%edx,%edi
+	shrl	$9,%edx
+	movl	%ecx,%ebx
+	shll	$14,%esi
+	movl	%edx,%eax
+	shll	$14,%edi
+	xorl	%esi,%ebx
+	shrl	$5,%ecx
+	xorl	%edi,%eax
+	shrl	$5,%edx
+	xorl	%ecx,%eax
+	shll	$4,%esi
+	xorl	%edx,%ebx
+	shll	$4,%edi
+	xorl	%esi,%ebx
+	shrl	$4,%ecx
+	xorl	%edi,%eax
+	shrl	$4,%edx
+	xorl	%ecx,%eax
+	shll	$5,%esi
+	xorl	%edx,%ebx
+	shll	$5,%edi
+	xorl	%esi,%eax
+	xorl	%edi,%ebx
+	movl	48(%esp),%ecx
+	movl	52(%esp),%edx
+	movl	56(%esp),%esi
+	movl	60(%esp),%edi
+	addl	64(%esp),%eax
+	adcl	68(%esp),%ebx
+	xorl	%esi,%ecx
+	xorl	%edi,%edx
+	andl	40(%esp),%ecx
+	andl	44(%esp),%edx
+	addl	192(%esp),%eax
+	adcl	196(%esp),%ebx
+	xorl	%esi,%ecx
+	xorl	%edi,%edx
+	movl	(%ebp),%esi
+	movl	4(%ebp),%edi
+	addl	%ecx,%eax
+	adcl	%edx,%ebx
+	movl	32(%esp),%ecx
+	movl	36(%esp),%edx
+	addl	%esi,%eax
+	adcl	%edi,%ebx
+	movl	%eax,(%esp)
+	movl	%ebx,4(%esp)
+	addl	%ecx,%eax
+	adcl	%edx,%ebx
+	movl	8(%esp),%ecx
+	movl	12(%esp),%edx
+	movl	%eax,32(%esp)
+	movl	%ebx,36(%esp)
+	movl	%ecx,%esi
+	shrl	$2,%ecx
+	movl	%edx,%edi
+	shrl	$2,%edx
+	movl	%ecx,%ebx
+	shll	$4,%esi
+	movl	%edx,%eax
+	shll	$4,%edi
+	xorl	%esi,%ebx
+	shrl	$5,%ecx
+	xorl	%edi,%eax
+	shrl	$5,%edx
+	xorl	%ecx,%ebx
+	shll	$21,%esi
+	xorl	%edx,%eax
+	shll	$21,%edi
+	xorl	%esi,%eax
+	shrl	$21,%ecx
+	xorl	%edi,%ebx
+	shrl	$21,%edx
+	xorl	%ecx,%eax
+	shll	$5,%esi
+	xorl	%edx,%ebx
+	shll	$5,%edi
+	xorl	%esi,%eax
+	xorl	%edi,%ebx
+	movl	8(%esp),%ecx
+	movl	12(%esp),%edx
+	movl	16(%esp),%esi
+	movl	20(%esp),%edi
+	addl	(%esp),%eax
+	adcl	4(%esp),%ebx
+	orl	%esi,%ecx
+	orl	%edi,%edx
+	andl	24(%esp),%ecx
+	andl	28(%esp),%edx
+	andl	8(%esp),%esi
+	andl	12(%esp),%edi
+	orl	%esi,%ecx
+	orl	%edi,%edx
+	addl	%ecx,%eax
+	adcl	%edx,%ebx
+	movl	%eax,(%esp)
+	movl	%ebx,4(%esp)
+	movb	(%ebp),%dl
+	subl	$8,%esp
+	leal	8(%ebp),%ebp
+	cmpb	$23,%dl
+	jne	L01016_79_x86
+	movl	840(%esp),%esi
+	movl	844(%esp),%edi
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	8(%esi),%ecx
+	movl	12(%esi),%edx
+	addl	8(%esp),%eax
+	adcl	12(%esp),%ebx
+	movl	%eax,(%esi)
+	movl	%ebx,4(%esi)
+	addl	16(%esp),%ecx
+	adcl	20(%esp),%edx
+	movl	%ecx,8(%esi)
+	movl	%edx,12(%esi)
+	movl	16(%esi),%eax
+	movl	20(%esi),%ebx
+	movl	24(%esi),%ecx
+	movl	28(%esi),%edx
+	addl	24(%esp),%eax
+	adcl	28(%esp),%ebx
+	movl	%eax,16(%esi)
+	movl	%ebx,20(%esi)
+	addl	32(%esp),%ecx
+	adcl	36(%esp),%edx
+	movl	%ecx,24(%esi)
+	movl	%edx,28(%esi)
+	movl	32(%esi),%eax
+	movl	36(%esi),%ebx
+	movl	40(%esi),%ecx
+	movl	44(%esi),%edx
+	addl	40(%esp),%eax
+	adcl	44(%esp),%ebx
+	movl	%eax,32(%esi)
+	movl	%ebx,36(%esi)
+	addl	48(%esp),%ecx
+	adcl	52(%esp),%edx
+	movl	%ecx,40(%esi)
+	movl	%edx,44(%esi)
+	movl	48(%esi),%eax
+	movl	52(%esi),%ebx
+	movl	56(%esi),%ecx
+	movl	60(%esi),%edx
+	addl	56(%esp),%eax
+	adcl	60(%esp),%ebx
+	movl	%eax,48(%esi)
+	movl	%ebx,52(%esi)
+	addl	64(%esp),%ecx
+	adcl	68(%esp),%edx
+	movl	%ecx,56(%esi)
+	movl	%edx,60(%esi)
+	addl	$840,%esp
+	subl	$640,%ebp
+	cmpl	8(%esp),%edi
+	jb	L002loop_x86
+	movl	12(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	6,0x90
+L001K512:
+.long	3609767458,1116352408
+.long	602891725,1899447441
+.long	3964484399,3049323471
+.long	2173295548,3921009573
+.long	4081628472,961987163
+.long	3053834265,1508970993
+.long	2937671579,2453635748
+.long	3664609560,2870763221
+.long	2734883394,3624381080
+.long	1164996542,310598401
+.long	1323610764,607225278
+.long	3590304994,1426881987
+.long	4068182383,1925078388
+.long	991336113,2162078206
+.long	633803317,2614888103
+.long	3479774868,3248222580
+.long	2666613458,3835390401
+.long	944711139,4022224774
+.long	2341262773,264347078
+.long	2007800933,604807628
+.long	1495990901,770255983
+.long	1856431235,1249150122
+.long	3175218132,1555081692
+.long	2198950837,1996064986
+.long	3999719339,2554220882
+.long	766784016,2821834349
+.long	2566594879,2952996808
+.long	3203337956,3210313671
+.long	1034457026,3336571891
+.long	2466948901,3584528711
+.long	3758326383,113926993
+.long	168717936,338241895
+.long	1188179964,666307205
+.long	1546045734,773529912
+.long	1522805485,1294757372
+.long	2643833823,1396182291
+.long	2343527390,1695183700
+.long	1014477480,1986661051
+.long	1206759142,2177026350
+.long	344077627,2456956037
+.long	1290863460,2730485921
+.long	3158454273,2820302411
+.long	3505952657,3259730800
+.long	106217008,3345764771
+.long	3606008344,3516065817
+.long	1432725776,3600352804
+.long	1467031594,4094571909
+.long	851169720,275423344
+.long	3100823752,430227734
+.long	1363258195,506948616
+.long	3750685593,659060556
+.long	3785050280,883997877
+.long	3318307427,958139571
+.long	3812723403,1322822218
+.long	2003034995,1537002063
+.long	3602036899,1747873779
+.long	1575990012,1955562222
+.long	1125592928,2024104815
+.long	2716904306,2227730452
+.long	442776044,2361852424
+.long	593698344,2428436474
+.long	3733110249,2756734187
+.long	2999351573,3204031479
+.long	3815920427,3329325298
+.long	3928383900,3391569614
+.long	566280711,3515267271
+.long	3454069534,3940187606
+.long	4000239992,4118630271
+.long	1914138554,116418474
+.long	2731055270,174292421
+.long	3203993006,289380356
+.long	320620315,460393269
+.long	587496836,685471733
+.long	1086792851,852142971
+.long	365543100,1017036298
+.long	2618297676,1126000580
+.long	3409855158,1288033470
+.long	4234509866,1501505948
+.long	987167468,1607167915
+.long	1246189591,1816402316
+.long	67438087,66051
+.long	202182159,134810123
+.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97
+.byte	110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
+.byte	67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+.byte	112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+.byte	62,0
+.section __IMPORT,__pointers,non_lazy_symbol_pointers
+L_OPENSSL_ia32cap_P$non_lazy_ptr:
+.indirect_symbol	_OPENSSL_ia32cap_P
+.long	0
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/bcm/sha512-586-linux.S b/gen/bcm/sha512-586-linux.S
new file mode 100644
index 0000000..bb2884d
--- /dev/null
+++ b/gen/bcm/sha512-586-linux.S
@@ -0,0 +1,2835 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
+.text
+.globl	sha512_block_data_order
+.hidden	sha512_block_data_order
+.type	sha512_block_data_order,@function
+.align	16
+sha512_block_data_order:
+.L_sha512_block_data_order_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	%esp,%ebx
+	call	.L000pic_point
+.L000pic_point:
+	popl	%ebp
+	leal	.L001K512-.L000pic_point(%ebp),%ebp
+	subl	$16,%esp
+	andl	$-64,%esp
+	shll	$7,%eax
+	addl	%edi,%eax
+	movl	%esi,(%esp)
+	movl	%edi,4(%esp)
+	movl	%eax,8(%esp)
+	movl	%ebx,12(%esp)
+	leal	OPENSSL_ia32cap_P-.L001K512(%ebp),%edx
+	movl	(%edx),%ecx
+	testl	$67108864,%ecx
+	jz	.L002loop_x86
+	movl	4(%edx),%edx
+	movq	(%esi),%mm0
+	andl	$16777216,%ecx
+	movq	8(%esi),%mm1
+	andl	$512,%edx
+	movq	16(%esi),%mm2
+	orl	%edx,%ecx
+	movq	24(%esi),%mm3
+	movq	32(%esi),%mm4
+	movq	40(%esi),%mm5
+	movq	48(%esi),%mm6
+	movq	56(%esi),%mm7
+	cmpl	$16777728,%ecx
+	je	.L003SSSE3
+	subl	$80,%esp
+	jmp	.L004loop_sse2
+.align	16
+.L004loop_sse2:
+	movq	%mm1,8(%esp)
+	movq	%mm2,16(%esp)
+	movq	%mm3,24(%esp)
+	movq	%mm5,40(%esp)
+	movq	%mm6,48(%esp)
+	pxor	%mm1,%mm2
+	movq	%mm7,56(%esp)
+	movq	%mm0,%mm3
+	movl	(%edi),%eax
+	movl	4(%edi),%ebx
+	addl	$8,%edi
+	movl	$15,%edx
+	bswap	%eax
+	bswap	%ebx
+	jmp	.L00500_14_sse2
+.align	16
+.L00500_14_sse2:
+	movd	%eax,%mm1
+	movl	(%edi),%eax
+	movd	%ebx,%mm7
+	movl	4(%edi),%ebx
+	addl	$8,%edi
+	bswap	%eax
+	bswap	%ebx
+	punpckldq	%mm1,%mm7
+	movq	%mm4,%mm1
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	movq	%mm3,%mm0
+	movq	%mm7,72(%esp)
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	paddq	(%ebp),%mm7
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	subl	$8,%esp
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	40(%esp),%mm5
+	paddq	%mm2,%mm3
+	movq	%mm0,%mm2
+	addl	$8,%ebp
+	paddq	%mm6,%mm3
+	movq	48(%esp),%mm6
+	decl	%edx
+	jnz	.L00500_14_sse2
+	movd	%eax,%mm1
+	movd	%ebx,%mm7
+	punpckldq	%mm1,%mm7
+	movq	%mm4,%mm1
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	movq	%mm3,%mm0
+	movq	%mm7,72(%esp)
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	paddq	(%ebp),%mm7
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	subl	$8,%esp
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	192(%esp),%mm7
+	paddq	%mm2,%mm3
+	movq	%mm0,%mm2
+	addl	$8,%ebp
+	paddq	%mm6,%mm3
+	pxor	%mm0,%mm0
+	movl	$32,%edx
+	jmp	.L00616_79_sse2
+.align	16
+.L00616_79_sse2:
+	movq	88(%esp),%mm5
+	movq	%mm7,%mm1
+	psrlq	$1,%mm7
+	movq	%mm5,%mm6
+	psrlq	$6,%mm5
+	psllq	$56,%mm1
+	paddq	%mm3,%mm0
+	movq	%mm7,%mm3
+	psrlq	$6,%mm7
+	pxor	%mm1,%mm3
+	psllq	$7,%mm1
+	pxor	%mm7,%mm3
+	psrlq	$1,%mm7
+	pxor	%mm1,%mm3
+	movq	%mm5,%mm1
+	psrlq	$13,%mm5
+	pxor	%mm3,%mm7
+	psllq	$3,%mm6
+	pxor	%mm5,%mm1
+	paddq	200(%esp),%mm7
+	pxor	%mm6,%mm1
+	psrlq	$42,%mm5
+	paddq	128(%esp),%mm7
+	pxor	%mm5,%mm1
+	psllq	$42,%mm6
+	movq	40(%esp),%mm5
+	pxor	%mm6,%mm1
+	movq	48(%esp),%mm6
+	paddq	%mm1,%mm7
+	movq	%mm4,%mm1
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	movq	%mm7,72(%esp)
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	paddq	(%ebp),%mm7
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	subl	$8,%esp
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	192(%esp),%mm7
+	paddq	%mm6,%mm2
+	addl	$8,%ebp
+	movq	88(%esp),%mm5
+	movq	%mm7,%mm1
+	psrlq	$1,%mm7
+	movq	%mm5,%mm6
+	psrlq	$6,%mm5
+	psllq	$56,%mm1
+	paddq	%mm3,%mm2
+	movq	%mm7,%mm3
+	psrlq	$6,%mm7
+	pxor	%mm1,%mm3
+	psllq	$7,%mm1
+	pxor	%mm7,%mm3
+	psrlq	$1,%mm7
+	pxor	%mm1,%mm3
+	movq	%mm5,%mm1
+	psrlq	$13,%mm5
+	pxor	%mm3,%mm7
+	psllq	$3,%mm6
+	pxor	%mm5,%mm1
+	paddq	200(%esp),%mm7
+	pxor	%mm6,%mm1
+	psrlq	$42,%mm5
+	paddq	128(%esp),%mm7
+	pxor	%mm5,%mm1
+	psllq	$42,%mm6
+	movq	40(%esp),%mm5
+	pxor	%mm6,%mm1
+	movq	48(%esp),%mm6
+	paddq	%mm1,%mm7
+	movq	%mm4,%mm1
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	movq	%mm7,72(%esp)
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	paddq	(%ebp),%mm7
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	subl	$8,%esp
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	192(%esp),%mm7
+	paddq	%mm6,%mm0
+	addl	$8,%ebp
+	decl	%edx
+	jnz	.L00616_79_sse2
+	paddq	%mm3,%mm0
+	movq	8(%esp),%mm1
+	movq	24(%esp),%mm3
+	movq	40(%esp),%mm5
+	movq	48(%esp),%mm6
+	movq	56(%esp),%mm7
+	pxor	%mm1,%mm2
+	paddq	(%esi),%mm0
+	paddq	8(%esi),%mm1
+	paddq	16(%esi),%mm2
+	paddq	24(%esi),%mm3
+	paddq	32(%esi),%mm4
+	paddq	40(%esi),%mm5
+	paddq	48(%esi),%mm6
+	paddq	56(%esi),%mm7
+	movl	$640,%eax
+	movq	%mm0,(%esi)
+	movq	%mm1,8(%esi)
+	movq	%mm2,16(%esi)
+	movq	%mm3,24(%esi)
+	movq	%mm4,32(%esi)
+	movq	%mm5,40(%esi)
+	movq	%mm6,48(%esi)
+	movq	%mm7,56(%esi)
+	leal	(%esp,%eax,1),%esp
+	subl	%eax,%ebp
+	cmpl	88(%esp),%edi
+	jb	.L004loop_sse2
+	movl	92(%esp),%esp
+	emms
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	32
+.L003SSSE3:
+	leal	-64(%esp),%edx
+	subl	$256,%esp
+	movdqa	640(%ebp),%xmm1
+	movdqu	(%edi),%xmm0
+.byte	102,15,56,0,193
+	movdqa	(%ebp),%xmm3
+	movdqa	%xmm1,%xmm2
+	movdqu	16(%edi),%xmm1
+	paddq	%xmm0,%xmm3
+.byte	102,15,56,0,202
+	movdqa	%xmm3,-128(%edx)
+	movdqa	16(%ebp),%xmm4
+	movdqa	%xmm2,%xmm3
+	movdqu	32(%edi),%xmm2
+	paddq	%xmm1,%xmm4
+.byte	102,15,56,0,211
+	movdqa	%xmm4,-112(%edx)
+	movdqa	32(%ebp),%xmm5
+	movdqa	%xmm3,%xmm4
+	movdqu	48(%edi),%xmm3
+	paddq	%xmm2,%xmm5
+.byte	102,15,56,0,220
+	movdqa	%xmm5,-96(%edx)
+	movdqa	48(%ebp),%xmm6
+	movdqa	%xmm4,%xmm5
+	movdqu	64(%edi),%xmm4
+	paddq	%xmm3,%xmm6
+.byte	102,15,56,0,229
+	movdqa	%xmm6,-80(%edx)
+	movdqa	64(%ebp),%xmm7
+	movdqa	%xmm5,%xmm6
+	movdqu	80(%edi),%xmm5
+	paddq	%xmm4,%xmm7
+.byte	102,15,56,0,238
+	movdqa	%xmm7,-64(%edx)
+	movdqa	%xmm0,(%edx)
+	movdqa	80(%ebp),%xmm0
+	movdqa	%xmm6,%xmm7
+	movdqu	96(%edi),%xmm6
+	paddq	%xmm5,%xmm0
+.byte	102,15,56,0,247
+	movdqa	%xmm0,-48(%edx)
+	movdqa	%xmm1,16(%edx)
+	movdqa	96(%ebp),%xmm1
+	movdqa	%xmm7,%xmm0
+	movdqu	112(%edi),%xmm7
+	paddq	%xmm6,%xmm1
+.byte	102,15,56,0,248
+	movdqa	%xmm1,-32(%edx)
+	movdqa	%xmm2,32(%edx)
+	movdqa	112(%ebp),%xmm2
+	movdqa	(%edx),%xmm0
+	paddq	%xmm7,%xmm2
+	movdqa	%xmm2,-16(%edx)
+	nop
+.align	32
+.L007loop_ssse3:
+	movdqa	16(%edx),%xmm2
+	movdqa	%xmm3,48(%edx)
+	leal	128(%ebp),%ebp
+	movq	%mm1,8(%esp)
+	movl	%edi,%ebx
+	movq	%mm2,16(%esp)
+	leal	128(%edi),%edi
+	movq	%mm3,24(%esp)
+	cmpl	%eax,%edi
+	movq	%mm5,40(%esp)
+	cmovbl	%edi,%ebx
+	movq	%mm6,48(%esp)
+	movl	$4,%ecx
+	pxor	%mm1,%mm2
+	movq	%mm7,56(%esp)
+	pxor	%mm3,%mm3
+	jmp	.L00800_47_ssse3
+.align	32
+.L00800_47_ssse3:
+	movdqa	%xmm5,%xmm3
+	movdqa	%xmm2,%xmm1
+.byte	102,15,58,15,208,8
+	movdqa	%xmm4,(%edx)
+.byte	102,15,58,15,220,8
+	movdqa	%xmm2,%xmm4
+	psrlq	$7,%xmm2
+	paddq	%xmm3,%xmm0
+	movdqa	%xmm4,%xmm3
+	psrlq	$1,%xmm4
+	psllq	$56,%xmm3
+	pxor	%xmm4,%xmm2
+	psrlq	$7,%xmm4
+	pxor	%xmm3,%xmm2
+	psllq	$7,%xmm3
+	pxor	%xmm4,%xmm2
+	movdqa	%xmm7,%xmm4
+	pxor	%xmm3,%xmm2
+	movdqa	%xmm7,%xmm3
+	psrlq	$6,%xmm4
+	paddq	%xmm2,%xmm0
+	movdqa	%xmm7,%xmm2
+	psrlq	$19,%xmm3
+	psllq	$3,%xmm2
+	pxor	%xmm3,%xmm4
+	psrlq	$42,%xmm3
+	pxor	%xmm2,%xmm4
+	psllq	$42,%xmm2
+	pxor	%xmm3,%xmm4
+	movdqa	32(%edx),%xmm3
+	pxor	%xmm2,%xmm4
+	movdqa	(%ebp),%xmm2
+	movq	%mm4,%mm1
+	paddq	%xmm4,%xmm0
+	movq	-128(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	paddq	%xmm0,%xmm2
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	32(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	40(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-120(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,24(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,56(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	48(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	16(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	24(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	32(%esp),%mm6
+	movdqa	%xmm2,-128(%edx)
+	movdqa	%xmm6,%xmm4
+	movdqa	%xmm3,%xmm2
+.byte	102,15,58,15,217,8
+	movdqa	%xmm5,16(%edx)
+.byte	102,15,58,15,229,8
+	movdqa	%xmm3,%xmm5
+	psrlq	$7,%xmm3
+	paddq	%xmm4,%xmm1
+	movdqa	%xmm5,%xmm4
+	psrlq	$1,%xmm5
+	psllq	$56,%xmm4
+	pxor	%xmm5,%xmm3
+	psrlq	$7,%xmm5
+	pxor	%xmm4,%xmm3
+	psllq	$7,%xmm4
+	pxor	%xmm5,%xmm3
+	movdqa	%xmm0,%xmm5
+	pxor	%xmm4,%xmm3
+	movdqa	%xmm0,%xmm4
+	psrlq	$6,%xmm5
+	paddq	%xmm3,%xmm1
+	movdqa	%xmm0,%xmm3
+	psrlq	$19,%xmm4
+	psllq	$3,%xmm3
+	pxor	%xmm4,%xmm5
+	psrlq	$42,%xmm4
+	pxor	%xmm3,%xmm5
+	psllq	$42,%xmm3
+	pxor	%xmm4,%xmm5
+	movdqa	48(%edx),%xmm4
+	pxor	%xmm3,%xmm5
+	movdqa	16(%ebp),%xmm3
+	movq	%mm4,%mm1
+	paddq	%xmm5,%xmm1
+	movq	-112(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,16(%esp)
+	paddq	%xmm1,%xmm3
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,48(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	40(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	8(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	56(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	16(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	24(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-104(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,8(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,40(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	32(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	48(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	8(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	16(%esp),%mm6
+	movdqa	%xmm3,-112(%edx)
+	movdqa	%xmm7,%xmm5
+	movdqa	%xmm4,%xmm3
+.byte	102,15,58,15,226,8
+	movdqa	%xmm6,32(%edx)
+.byte	102,15,58,15,238,8
+	movdqa	%xmm4,%xmm6
+	psrlq	$7,%xmm4
+	paddq	%xmm5,%xmm2
+	movdqa	%xmm6,%xmm5
+	psrlq	$1,%xmm6
+	psllq	$56,%xmm5
+	pxor	%xmm6,%xmm4
+	psrlq	$7,%xmm6
+	pxor	%xmm5,%xmm4
+	psllq	$7,%xmm5
+	pxor	%xmm6,%xmm4
+	movdqa	%xmm1,%xmm6
+	pxor	%xmm5,%xmm4
+	movdqa	%xmm1,%xmm5
+	psrlq	$6,%xmm6
+	paddq	%xmm4,%xmm2
+	movdqa	%xmm1,%xmm4
+	psrlq	$19,%xmm5
+	psllq	$3,%xmm4
+	pxor	%xmm5,%xmm6
+	psrlq	$42,%xmm5
+	pxor	%xmm4,%xmm6
+	psllq	$42,%xmm4
+	pxor	%xmm5,%xmm6
+	movdqa	(%edx),%xmm5
+	pxor	%xmm4,%xmm6
+	movdqa	32(%ebp),%xmm4
+	movq	%mm4,%mm1
+	paddq	%xmm6,%xmm2
+	movq	-96(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,(%esp)
+	paddq	%xmm2,%xmm4
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,32(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	24(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	56(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	40(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	8(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-88(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,56(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,24(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	16(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	48(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	32(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	56(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	(%esp),%mm6
+	movdqa	%xmm4,-96(%edx)
+	movdqa	%xmm0,%xmm6
+	movdqa	%xmm5,%xmm4
+.byte	102,15,58,15,235,8
+	movdqa	%xmm7,48(%edx)
+.byte	102,15,58,15,247,8
+	movdqa	%xmm5,%xmm7
+	psrlq	$7,%xmm5
+	paddq	%xmm6,%xmm3
+	movdqa	%xmm7,%xmm6
+	psrlq	$1,%xmm7
+	psllq	$56,%xmm6
+	pxor	%xmm7,%xmm5
+	psrlq	$7,%xmm7
+	pxor	%xmm6,%xmm5
+	psllq	$7,%xmm6
+	pxor	%xmm7,%xmm5
+	movdqa	%xmm2,%xmm7
+	pxor	%xmm6,%xmm5
+	movdqa	%xmm2,%xmm6
+	psrlq	$6,%xmm7
+	paddq	%xmm5,%xmm3
+	movdqa	%xmm2,%xmm5
+	psrlq	$19,%xmm6
+	psllq	$3,%xmm5
+	pxor	%xmm6,%xmm7
+	psrlq	$42,%xmm6
+	pxor	%xmm5,%xmm7
+	psllq	$42,%xmm5
+	pxor	%xmm6,%xmm7
+	movdqa	16(%edx),%xmm6
+	pxor	%xmm5,%xmm7
+	movdqa	48(%ebp),%xmm5
+	movq	%mm4,%mm1
+	paddq	%xmm7,%xmm3
+	movq	-80(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,48(%esp)
+	paddq	%xmm3,%xmm5
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,16(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	8(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	40(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	24(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	48(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	56(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-72(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,40(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,8(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	32(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	16(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	40(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	48(%esp),%mm6
+	movdqa	%xmm5,-80(%edx)
+	movdqa	%xmm1,%xmm7
+	movdqa	%xmm6,%xmm5
+.byte	102,15,58,15,244,8
+	movdqa	%xmm0,(%edx)
+.byte	102,15,58,15,248,8
+	movdqa	%xmm6,%xmm0
+	psrlq	$7,%xmm6
+	paddq	%xmm7,%xmm4
+	movdqa	%xmm0,%xmm7
+	psrlq	$1,%xmm0
+	psllq	$56,%xmm7
+	pxor	%xmm0,%xmm6
+	psrlq	$7,%xmm0
+	pxor	%xmm7,%xmm6
+	psllq	$7,%xmm7
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm3,%xmm0
+	pxor	%xmm7,%xmm6
+	movdqa	%xmm3,%xmm7
+	psrlq	$6,%xmm0
+	paddq	%xmm6,%xmm4
+	movdqa	%xmm3,%xmm6
+	psrlq	$19,%xmm7
+	psllq	$3,%xmm6
+	pxor	%xmm7,%xmm0
+	psrlq	$42,%xmm7
+	pxor	%xmm6,%xmm0
+	psllq	$42,%xmm6
+	pxor	%xmm7,%xmm0
+	movdqa	32(%edx),%xmm7
+	pxor	%xmm6,%xmm0
+	movdqa	64(%ebp),%xmm6
+	movq	%mm4,%mm1
+	paddq	%xmm0,%xmm4
+	movq	-64(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	paddq	%xmm4,%xmm6
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	32(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	40(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-56(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,24(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,56(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	48(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	16(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	24(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	32(%esp),%mm6
+	movdqa	%xmm6,-64(%edx)
+	movdqa	%xmm2,%xmm0
+	movdqa	%xmm7,%xmm6
+.byte	102,15,58,15,253,8
+	movdqa	%xmm1,16(%edx)
+.byte	102,15,58,15,193,8
+	movdqa	%xmm7,%xmm1
+	psrlq	$7,%xmm7
+	paddq	%xmm0,%xmm5
+	movdqa	%xmm1,%xmm0
+	psrlq	$1,%xmm1
+	psllq	$56,%xmm0
+	pxor	%xmm1,%xmm7
+	psrlq	$7,%xmm1
+	pxor	%xmm0,%xmm7
+	psllq	$7,%xmm0
+	pxor	%xmm1,%xmm7
+	movdqa	%xmm4,%xmm1
+	pxor	%xmm0,%xmm7
+	movdqa	%xmm4,%xmm0
+	psrlq	$6,%xmm1
+	paddq	%xmm7,%xmm5
+	movdqa	%xmm4,%xmm7
+	psrlq	$19,%xmm0
+	psllq	$3,%xmm7
+	pxor	%xmm0,%xmm1
+	psrlq	$42,%xmm0
+	pxor	%xmm7,%xmm1
+	psllq	$42,%xmm7
+	pxor	%xmm0,%xmm1
+	movdqa	48(%edx),%xmm0
+	pxor	%xmm7,%xmm1
+	movdqa	80(%ebp),%xmm7
+	movq	%mm4,%mm1
+	paddq	%xmm1,%xmm5
+	movq	-48(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,16(%esp)
+	paddq	%xmm5,%xmm7
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,48(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	40(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	8(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	56(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	16(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	24(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-40(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,8(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,40(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	32(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	48(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	8(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	16(%esp),%mm6
+	movdqa	%xmm7,-48(%edx)
+	movdqa	%xmm3,%xmm1
+	movdqa	%xmm0,%xmm7
+.byte	102,15,58,15,198,8
+	movdqa	%xmm2,32(%edx)
+.byte	102,15,58,15,202,8
+	movdqa	%xmm0,%xmm2
+	psrlq	$7,%xmm0
+	paddq	%xmm1,%xmm6
+	movdqa	%xmm2,%xmm1
+	psrlq	$1,%xmm2
+	psllq	$56,%xmm1
+	pxor	%xmm2,%xmm0
+	psrlq	$7,%xmm2
+	pxor	%xmm1,%xmm0
+	psllq	$7,%xmm1
+	pxor	%xmm2,%xmm0
+	movdqa	%xmm5,%xmm2
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm5,%xmm1
+	psrlq	$6,%xmm2
+	paddq	%xmm0,%xmm6
+	movdqa	%xmm5,%xmm0
+	psrlq	$19,%xmm1
+	psllq	$3,%xmm0
+	pxor	%xmm1,%xmm2
+	psrlq	$42,%xmm1
+	pxor	%xmm0,%xmm2
+	psllq	$42,%xmm0
+	pxor	%xmm1,%xmm2
+	movdqa	(%edx),%xmm1
+	pxor	%xmm0,%xmm2
+	movdqa	96(%ebp),%xmm0
+	movq	%mm4,%mm1
+	paddq	%xmm2,%xmm6
+	movq	-32(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,(%esp)
+	paddq	%xmm6,%xmm0
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,32(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	24(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	56(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	40(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	8(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-24(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,56(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,24(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	16(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	48(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	32(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	56(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	(%esp),%mm6
+	movdqa	%xmm0,-32(%edx)
+	movdqa	%xmm4,%xmm2
+	movdqa	%xmm1,%xmm0
+.byte	102,15,58,15,207,8
+	movdqa	%xmm3,48(%edx)
+.byte	102,15,58,15,211,8
+	movdqa	%xmm1,%xmm3
+	psrlq	$7,%xmm1
+	paddq	%xmm2,%xmm7
+	movdqa	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	psllq	$56,%xmm2
+	pxor	%xmm3,%xmm1
+	psrlq	$7,%xmm3
+	pxor	%xmm2,%xmm1
+	psllq	$7,%xmm2
+	pxor	%xmm3,%xmm1
+	movdqa	%xmm6,%xmm3
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm6,%xmm2
+	psrlq	$6,%xmm3
+	paddq	%xmm1,%xmm7
+	movdqa	%xmm6,%xmm1
+	psrlq	$19,%xmm2
+	psllq	$3,%xmm1
+	pxor	%xmm2,%xmm3
+	psrlq	$42,%xmm2
+	pxor	%xmm1,%xmm3
+	psllq	$42,%xmm1
+	pxor	%xmm2,%xmm3
+	movdqa	16(%edx),%xmm2
+	pxor	%xmm1,%xmm3
+	movdqa	112(%ebp),%xmm1
+	movq	%mm4,%mm1
+	paddq	%xmm3,%xmm7
+	movq	-16(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,48(%esp)
+	paddq	%xmm7,%xmm1
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,16(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	8(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	40(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	24(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	48(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	56(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-8(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,40(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,8(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	32(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	16(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	40(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	48(%esp),%mm6
+	movdqa	%xmm1,-16(%edx)
+	leal	128(%ebp),%ebp
+	decl	%ecx
+	jnz	.L00800_47_ssse3
+	movdqa	(%ebp),%xmm1
+	leal	-640(%ebp),%ebp
+	movdqu	(%ebx),%xmm0
+.byte	102,15,56,0,193
+	movdqa	(%ebp),%xmm3
+	movdqa	%xmm1,%xmm2
+	movdqu	16(%ebx),%xmm1
+	paddq	%xmm0,%xmm3
+.byte	102,15,56,0,202
+	movq	%mm4,%mm1
+	movq	-128(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	32(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	40(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-120(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,24(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,56(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	48(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	16(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	24(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	32(%esp),%mm6
+	movdqa	%xmm3,-128(%edx)
+	movdqa	16(%ebp),%xmm4
+	movdqa	%xmm2,%xmm3
+	movdqu	32(%ebx),%xmm2
+	paddq	%xmm1,%xmm4
+.byte	102,15,56,0,211
+	movq	%mm4,%mm1
+	movq	-112(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,16(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,48(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	40(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	8(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	56(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	16(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	24(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-104(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,8(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,40(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	32(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	48(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	8(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	16(%esp),%mm6
+	movdqa	%xmm4,-112(%edx)
+	movdqa	32(%ebp),%xmm5
+	movdqa	%xmm3,%xmm4
+	movdqu	48(%ebx),%xmm3
+	paddq	%xmm2,%xmm5
+.byte	102,15,56,0,220
+	movq	%mm4,%mm1
+	movq	-96(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,32(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	24(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	56(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	40(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	8(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-88(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,56(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,24(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	16(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	48(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	32(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	56(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	(%esp),%mm6
+	movdqa	%xmm5,-96(%edx)
+	movdqa	48(%ebp),%xmm6
+	movdqa	%xmm4,%xmm5
+	movdqu	64(%ebx),%xmm4
+	paddq	%xmm3,%xmm6
+.byte	102,15,56,0,229
+	movq	%mm4,%mm1
+	movq	-80(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,48(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,16(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	8(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	40(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	24(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	48(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	56(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-72(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,40(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,8(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	32(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	16(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	40(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	48(%esp),%mm6
+	movdqa	%xmm6,-80(%edx)
+	movdqa	64(%ebp),%xmm7
+	movdqa	%xmm5,%xmm6
+	movdqu	80(%ebx),%xmm5
+	paddq	%xmm4,%xmm7
+.byte	102,15,56,0,238
+	movq	%mm4,%mm1
+	movq	-64(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	32(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	40(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-56(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,24(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,56(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	48(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	16(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	24(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	32(%esp),%mm6
+	movdqa	%xmm7,-64(%edx)
+	movdqa	%xmm0,(%edx)
+	movdqa	80(%ebp),%xmm0
+	movdqa	%xmm6,%xmm7
+	movdqu	96(%ebx),%xmm6
+	paddq	%xmm5,%xmm0
+.byte	102,15,56,0,247
+	movq	%mm4,%mm1
+	movq	-48(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,16(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,48(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	40(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	8(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	56(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	16(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	24(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-40(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,8(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,40(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	32(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	48(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	8(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	16(%esp),%mm6
+	movdqa	%xmm0,-48(%edx)
+	movdqa	%xmm1,16(%edx)
+	movdqa	96(%ebp),%xmm1
+	movdqa	%xmm7,%xmm0
+	movdqu	112(%ebx),%xmm7
+	paddq	%xmm6,%xmm1
+.byte	102,15,56,0,248
+	movq	%mm4,%mm1
+	movq	-32(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,32(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	24(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	56(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	40(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	8(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-24(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,56(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,24(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	16(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	48(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	32(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	56(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	(%esp),%mm6
+	movdqa	%xmm1,-32(%edx)
+	movdqa	%xmm2,32(%edx)
+	movdqa	112(%ebp),%xmm2
+	movdqa	(%edx),%xmm0
+	paddq	%xmm7,%xmm2
+	movq	%mm4,%mm1
+	movq	-16(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,48(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,16(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	8(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	40(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	24(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	48(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	56(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-8(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,40(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,8(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	32(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	16(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	40(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	48(%esp),%mm6
+	movdqa	%xmm2,-16(%edx)
+	movq	8(%esp),%mm1
+	paddq	%mm3,%mm0
+	movq	24(%esp),%mm3
+	movq	56(%esp),%mm7
+	pxor	%mm1,%mm2
+	paddq	(%esi),%mm0
+	paddq	8(%esi),%mm1
+	paddq	16(%esi),%mm2
+	paddq	24(%esi),%mm3
+	paddq	32(%esi),%mm4
+	paddq	40(%esi),%mm5
+	paddq	48(%esi),%mm6
+	paddq	56(%esi),%mm7
+	movq	%mm0,(%esi)
+	movq	%mm1,8(%esi)
+	movq	%mm2,16(%esi)
+	movq	%mm3,24(%esi)
+	movq	%mm4,32(%esi)
+	movq	%mm5,40(%esi)
+	movq	%mm6,48(%esi)
+	movq	%mm7,56(%esi)
+	cmpl	%eax,%edi
+	jb	.L007loop_ssse3
+	movl	76(%edx),%esp
+	emms
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	16
+.L002loop_x86:
+	movl	(%edi),%eax
+	movl	4(%edi),%ebx
+	movl	8(%edi),%ecx
+	movl	12(%edi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+	movl	16(%edi),%eax
+	movl	20(%edi),%ebx
+	movl	24(%edi),%ecx
+	movl	28(%edi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+	movl	32(%edi),%eax
+	movl	36(%edi),%ebx
+	movl	40(%edi),%ecx
+	movl	44(%edi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+	movl	48(%edi),%eax
+	movl	52(%edi),%ebx
+	movl	56(%edi),%ecx
+	movl	60(%edi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+	movl	64(%edi),%eax
+	movl	68(%edi),%ebx
+	movl	72(%edi),%ecx
+	movl	76(%edi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+	movl	80(%edi),%eax
+	movl	84(%edi),%ebx
+	movl	88(%edi),%ecx
+	movl	92(%edi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+	movl	96(%edi),%eax
+	movl	100(%edi),%ebx
+	movl	104(%edi),%ecx
+	movl	108(%edi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+	movl	112(%edi),%eax
+	movl	116(%edi),%ebx
+	movl	120(%edi),%ecx
+	movl	124(%edi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+	addl	$128,%edi
+	subl	$72,%esp
+	movl	%edi,204(%esp)
+	leal	8(%esp),%edi
+	movl	$16,%ecx
+.long	2784229001
+.align	16
+.L00900_15_x86:
+	movl	40(%esp),%ecx
+	movl	44(%esp),%edx
+	movl	%ecx,%esi
+	shrl	$9,%ecx
+	movl	%edx,%edi
+	shrl	$9,%edx
+	movl	%ecx,%ebx
+	shll	$14,%esi
+	movl	%edx,%eax
+	shll	$14,%edi
+	xorl	%esi,%ebx
+	shrl	$5,%ecx
+	xorl	%edi,%eax
+	shrl	$5,%edx
+	xorl	%ecx,%eax
+	shll	$4,%esi
+	xorl	%edx,%ebx
+	shll	$4,%edi
+	xorl	%esi,%ebx
+	shrl	$4,%ecx
+	xorl	%edi,%eax
+	shrl	$4,%edx
+	xorl	%ecx,%eax
+	shll	$5,%esi
+	xorl	%edx,%ebx
+	shll	$5,%edi
+	xorl	%esi,%eax
+	xorl	%edi,%ebx
+	movl	48(%esp),%ecx
+	movl	52(%esp),%edx
+	movl	56(%esp),%esi
+	movl	60(%esp),%edi
+	addl	64(%esp),%eax
+	adcl	68(%esp),%ebx
+	xorl	%esi,%ecx
+	xorl	%edi,%edx
+	andl	40(%esp),%ecx
+	andl	44(%esp),%edx
+	addl	192(%esp),%eax
+	adcl	196(%esp),%ebx
+	xorl	%esi,%ecx
+	xorl	%edi,%edx
+	movl	(%ebp),%esi
+	movl	4(%ebp),%edi
+	addl	%ecx,%eax
+	adcl	%edx,%ebx
+	movl	32(%esp),%ecx
+	movl	36(%esp),%edx
+	addl	%esi,%eax
+	adcl	%edi,%ebx
+	movl	%eax,(%esp)
+	movl	%ebx,4(%esp)
+	addl	%ecx,%eax
+	adcl	%edx,%ebx
+	movl	8(%esp),%ecx
+	movl	12(%esp),%edx
+	movl	%eax,32(%esp)
+	movl	%ebx,36(%esp)
+	movl	%ecx,%esi
+	shrl	$2,%ecx
+	movl	%edx,%edi
+	shrl	$2,%edx
+	movl	%ecx,%ebx
+	shll	$4,%esi
+	movl	%edx,%eax
+	shll	$4,%edi
+	xorl	%esi,%ebx
+	shrl	$5,%ecx
+	xorl	%edi,%eax
+	shrl	$5,%edx
+	xorl	%ecx,%ebx
+	shll	$21,%esi
+	xorl	%edx,%eax
+	shll	$21,%edi
+	xorl	%esi,%eax
+	shrl	$21,%ecx
+	xorl	%edi,%ebx
+	shrl	$21,%edx
+	xorl	%ecx,%eax
+	shll	$5,%esi
+	xorl	%edx,%ebx
+	shll	$5,%edi
+	xorl	%esi,%eax
+	xorl	%edi,%ebx
+	movl	8(%esp),%ecx
+	movl	12(%esp),%edx
+	movl	16(%esp),%esi
+	movl	20(%esp),%edi
+	addl	(%esp),%eax
+	adcl	4(%esp),%ebx
+	orl	%esi,%ecx
+	orl	%edi,%edx
+	andl	24(%esp),%ecx
+	andl	28(%esp),%edx
+	andl	8(%esp),%esi
+	andl	12(%esp),%edi
+	orl	%esi,%ecx
+	orl	%edi,%edx
+	addl	%ecx,%eax
+	adcl	%edx,%ebx
+	movl	%eax,(%esp)
+	movl	%ebx,4(%esp)
+	movb	(%ebp),%dl
+	subl	$8,%esp
+	leal	8(%ebp),%ebp
+	cmpb	$148,%dl
+	jne	.L00900_15_x86
+.align	16
+.L01016_79_x86:
+	movl	312(%esp),%ecx
+	movl	316(%esp),%edx
+	movl	%ecx,%esi
+	shrl	$1,%ecx
+	movl	%edx,%edi
+	shrl	$1,%edx
+	movl	%ecx,%eax
+	shll	$24,%esi
+	movl	%edx,%ebx
+	shll	$24,%edi
+	xorl	%esi,%ebx
+	shrl	$6,%ecx
+	xorl	%edi,%eax
+	shrl	$6,%edx
+	xorl	%ecx,%eax
+	shll	$7,%esi
+	xorl	%edx,%ebx
+	shll	$1,%edi
+	xorl	%esi,%ebx
+	shrl	$1,%ecx
+	xorl	%edi,%eax
+	shrl	$1,%edx
+	xorl	%ecx,%eax
+	shll	$6,%edi
+	xorl	%edx,%ebx
+	xorl	%edi,%eax
+	movl	%eax,(%esp)
+	movl	%ebx,4(%esp)
+	movl	208(%esp),%ecx
+	movl	212(%esp),%edx
+	movl	%ecx,%esi
+	shrl	$6,%ecx
+	movl	%edx,%edi
+	shrl	$6,%edx
+	movl	%ecx,%eax
+	shll	$3,%esi
+	movl	%edx,%ebx
+	shll	$3,%edi
+	xorl	%esi,%eax
+	shrl	$13,%ecx
+	xorl	%edi,%ebx
+	shrl	$13,%edx
+	xorl	%ecx,%eax
+	shll	$10,%esi
+	xorl	%edx,%ebx
+	shll	$10,%edi
+	xorl	%esi,%ebx
+	shrl	$10,%ecx
+	xorl	%edi,%eax
+	shrl	$10,%edx
+	xorl	%ecx,%ebx
+	shll	$13,%edi
+	xorl	%edx,%eax
+	xorl	%edi,%eax
+	movl	320(%esp),%ecx
+	movl	324(%esp),%edx
+	addl	(%esp),%eax
+	adcl	4(%esp),%ebx
+	movl	248(%esp),%esi
+	movl	252(%esp),%edi
+	addl	%ecx,%eax
+	adcl	%edx,%ebx
+	addl	%esi,%eax
+	adcl	%edi,%ebx
+	movl	%eax,192(%esp)
+	movl	%ebx,196(%esp)
+	movl	40(%esp),%ecx
+	movl	44(%esp),%edx
+	movl	%ecx,%esi
+	shrl	$9,%ecx
+	movl	%edx,%edi
+	shrl	$9,%edx
+	movl	%ecx,%ebx
+	shll	$14,%esi
+	movl	%edx,%eax
+	shll	$14,%edi
+	xorl	%esi,%ebx
+	shrl	$5,%ecx
+	xorl	%edi,%eax
+	shrl	$5,%edx
+	xorl	%ecx,%eax
+	shll	$4,%esi
+	xorl	%edx,%ebx
+	shll	$4,%edi
+	xorl	%esi,%ebx
+	shrl	$4,%ecx
+	xorl	%edi,%eax
+	shrl	$4,%edx
+	xorl	%ecx,%eax
+	shll	$5,%esi
+	xorl	%edx,%ebx
+	shll	$5,%edi
+	xorl	%esi,%eax
+	xorl	%edi,%ebx
+	movl	48(%esp),%ecx
+	movl	52(%esp),%edx
+	movl	56(%esp),%esi
+	movl	60(%esp),%edi
+	addl	64(%esp),%eax
+	adcl	68(%esp),%ebx
+	xorl	%esi,%ecx
+	xorl	%edi,%edx
+	andl	40(%esp),%ecx
+	andl	44(%esp),%edx
+	addl	192(%esp),%eax
+	adcl	196(%esp),%ebx
+	xorl	%esi,%ecx
+	xorl	%edi,%edx
+	movl	(%ebp),%esi
+	movl	4(%ebp),%edi
+	addl	%ecx,%eax
+	adcl	%edx,%ebx
+	movl	32(%esp),%ecx
+	movl	36(%esp),%edx
+	addl	%esi,%eax
+	adcl	%edi,%ebx
+	movl	%eax,(%esp)
+	movl	%ebx,4(%esp)
+	addl	%ecx,%eax
+	adcl	%edx,%ebx
+	movl	8(%esp),%ecx
+	movl	12(%esp),%edx
+	movl	%eax,32(%esp)
+	movl	%ebx,36(%esp)
+	movl	%ecx,%esi
+	shrl	$2,%ecx
+	movl	%edx,%edi
+	shrl	$2,%edx
+	movl	%ecx,%ebx
+	shll	$4,%esi
+	movl	%edx,%eax
+	shll	$4,%edi
+	xorl	%esi,%ebx
+	shrl	$5,%ecx
+	xorl	%edi,%eax
+	shrl	$5,%edx
+	xorl	%ecx,%ebx
+	shll	$21,%esi
+	xorl	%edx,%eax
+	shll	$21,%edi
+	xorl	%esi,%eax
+	shrl	$21,%ecx
+	xorl	%edi,%ebx
+	shrl	$21,%edx
+	xorl	%ecx,%eax
+	shll	$5,%esi
+	xorl	%edx,%ebx
+	shll	$5,%edi
+	xorl	%esi,%eax
+	xorl	%edi,%ebx
+	movl	8(%esp),%ecx
+	movl	12(%esp),%edx
+	movl	16(%esp),%esi
+	movl	20(%esp),%edi
+	addl	(%esp),%eax
+	adcl	4(%esp),%ebx
+	orl	%esi,%ecx
+	orl	%edi,%edx
+	andl	24(%esp),%ecx
+	andl	28(%esp),%edx
+	andl	8(%esp),%esi
+	andl	12(%esp),%edi
+	orl	%esi,%ecx
+	orl	%edi,%edx
+	addl	%ecx,%eax
+	adcl	%edx,%ebx
+	movl	%eax,(%esp)
+	movl	%ebx,4(%esp)
+	movb	(%ebp),%dl
+	subl	$8,%esp
+	leal	8(%ebp),%ebp
+	cmpb	$23,%dl
+	jne	.L01016_79_x86
+	movl	840(%esp),%esi
+	movl	844(%esp),%edi
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	8(%esi),%ecx
+	movl	12(%esi),%edx
+	addl	8(%esp),%eax
+	adcl	12(%esp),%ebx
+	movl	%eax,(%esi)
+	movl	%ebx,4(%esi)
+	addl	16(%esp),%ecx
+	adcl	20(%esp),%edx
+	movl	%ecx,8(%esi)
+	movl	%edx,12(%esi)
+	movl	16(%esi),%eax
+	movl	20(%esi),%ebx
+	movl	24(%esi),%ecx
+	movl	28(%esi),%edx
+	addl	24(%esp),%eax
+	adcl	28(%esp),%ebx
+	movl	%eax,16(%esi)
+	movl	%ebx,20(%esi)
+	addl	32(%esp),%ecx
+	adcl	36(%esp),%edx
+	movl	%ecx,24(%esi)
+	movl	%edx,28(%esi)
+	movl	32(%esi),%eax
+	movl	36(%esi),%ebx
+	movl	40(%esi),%ecx
+	movl	44(%esi),%edx
+	addl	40(%esp),%eax
+	adcl	44(%esp),%ebx
+	movl	%eax,32(%esi)
+	movl	%ebx,36(%esi)
+	addl	48(%esp),%ecx
+	adcl	52(%esp),%edx
+	movl	%ecx,40(%esi)
+	movl	%edx,44(%esi)
+	movl	48(%esi),%eax
+	movl	52(%esi),%ebx
+	movl	56(%esi),%ecx
+	movl	60(%esi),%edx
+	addl	56(%esp),%eax
+	adcl	60(%esp),%ebx
+	movl	%eax,48(%esi)
+	movl	%ebx,52(%esi)
+	addl	64(%esp),%ecx
+	adcl	68(%esp),%edx
+	movl	%ecx,56(%esi)
+	movl	%edx,60(%esi)
+	addl	$840,%esp
+	subl	$640,%ebp
+	cmpl	8(%esp),%edi
+	jb	.L002loop_x86
+	movl	12(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	64
+.L001K512:
+.long	3609767458,1116352408
+.long	602891725,1899447441
+.long	3964484399,3049323471
+.long	2173295548,3921009573
+.long	4081628472,961987163
+.long	3053834265,1508970993
+.long	2937671579,2453635748
+.long	3664609560,2870763221
+.long	2734883394,3624381080
+.long	1164996542,310598401
+.long	1323610764,607225278
+.long	3590304994,1426881987
+.long	4068182383,1925078388
+.long	991336113,2162078206
+.long	633803317,2614888103
+.long	3479774868,3248222580
+.long	2666613458,3835390401
+.long	944711139,4022224774
+.long	2341262773,264347078
+.long	2007800933,604807628
+.long	1495990901,770255983
+.long	1856431235,1249150122
+.long	3175218132,1555081692
+.long	2198950837,1996064986
+.long	3999719339,2554220882
+.long	766784016,2821834349
+.long	2566594879,2952996808
+.long	3203337956,3210313671
+.long	1034457026,3336571891
+.long	2466948901,3584528711
+.long	3758326383,113926993
+.long	168717936,338241895
+.long	1188179964,666307205
+.long	1546045734,773529912
+.long	1522805485,1294757372
+.long	2643833823,1396182291
+.long	2343527390,1695183700
+.long	1014477480,1986661051
+.long	1206759142,2177026350
+.long	344077627,2456956037
+.long	1290863460,2730485921
+.long	3158454273,2820302411
+.long	3505952657,3259730800
+.long	106217008,3345764771
+.long	3606008344,3516065817
+.long	1432725776,3600352804
+.long	1467031594,4094571909
+.long	851169720,275423344
+.long	3100823752,430227734
+.long	1363258195,506948616
+.long	3750685593,659060556
+.long	3785050280,883997877
+.long	3318307427,958139571
+.long	3812723403,1322822218
+.long	2003034995,1537002063
+.long	3602036899,1747873779
+.long	1575990012,1955562222
+.long	1125592928,2024104815
+.long	2716904306,2227730452
+.long	442776044,2361852424
+.long	593698344,2428436474
+.long	3733110249,2756734187
+.long	2999351573,3204031479
+.long	3815920427,3329325298
+.long	3928383900,3391569614
+.long	566280711,3515267271
+.long	3454069534,3940187606
+.long	4000239992,4118630271
+.long	1914138554,116418474
+.long	2731055270,174292421
+.long	3203993006,289380356
+.long	320620315,460393269
+.long	587496836,685471733
+.long	1086792851,852142971
+.long	365543100,1017036298
+.long	2618297676,1126000580
+.long	3409855158,1288033470
+.long	4234509866,1501505948
+.long	987167468,1607167915
+.long	1246189591,1816402316
+.long	67438087,66051
+.long	202182159,134810123
+.size	sha512_block_data_order,.-.L_sha512_block_data_order_begin
+.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97
+.byte	110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
+.byte	67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+.byte	112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+.byte	62,0
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
diff --git a/gen/bcm/sha512-586-win.asm b/gen/bcm/sha512-586-win.asm
new file mode 100644
index 0000000..3603a6d
--- /dev/null
+++ b/gen/bcm/sha512-586-win.asm
@@ -0,0 +1,2846 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__, win32
+%ifidn __OUTPUT_FORMAT__,obj
+section	code	use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+$@feat.00 equ 1
+section	.text	code align=64
+%else
+section	.text	code
+%endif
+;extern	_OPENSSL_ia32cap_P
+global	_sha512_block_data_order
+align	16
+_sha512_block_data_order:
+L$_sha512_block_data_order_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	mov	esi,DWORD [20+esp]
+	mov	edi,DWORD [24+esp]
+	mov	eax,DWORD [28+esp]
+	mov	ebx,esp
+	call	L$000pic_point
+L$000pic_point:
+	pop	ebp
+	lea	ebp,[(L$001K512-L$000pic_point)+ebp]
+	sub	esp,16
+	and	esp,-64
+	shl	eax,7
+	add	eax,edi
+	mov	DWORD [esp],esi
+	mov	DWORD [4+esp],edi
+	mov	DWORD [8+esp],eax
+	mov	DWORD [12+esp],ebx
+	lea	edx,[_OPENSSL_ia32cap_P]
+	mov	ecx,DWORD [edx]
+	test	ecx,67108864
+	jz	NEAR L$002loop_x86
+	mov	edx,DWORD [4+edx]
+	movq	mm0,[esi]
+	and	ecx,16777216
+	movq	mm1,[8+esi]
+	and	edx,512
+	movq	mm2,[16+esi]
+	or	ecx,edx
+	movq	mm3,[24+esi]
+	movq	mm4,[32+esi]
+	movq	mm5,[40+esi]
+	movq	mm6,[48+esi]
+	movq	mm7,[56+esi]
+	cmp	ecx,16777728
+	je	NEAR L$003SSSE3
+	sub	esp,80
+	jmp	NEAR L$004loop_sse2
+align	16
+L$004loop_sse2:
+	movq	[8+esp],mm1
+	movq	[16+esp],mm2
+	movq	[24+esp],mm3
+	movq	[40+esp],mm5
+	movq	[48+esp],mm6
+	pxor	mm2,mm1
+	movq	[56+esp],mm7
+	movq	mm3,mm0
+	mov	eax,DWORD [edi]
+	mov	ebx,DWORD [4+edi]
+	add	edi,8
+	mov	edx,15
+	bswap	eax
+	bswap	ebx
+	jmp	NEAR L$00500_14_sse2
+align	16
+L$00500_14_sse2:
+	movd	mm1,eax
+	mov	eax,DWORD [edi]
+	movd	mm7,ebx
+	mov	ebx,DWORD [4+edi]
+	add	edi,8
+	bswap	eax
+	bswap	ebx
+	punpckldq	mm7,mm1
+	movq	mm1,mm4
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[32+esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	movq	mm0,mm3
+	movq	[72+esp],mm7
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[esp],mm0
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[56+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	paddq	mm7,[ebp]
+	pxor	mm3,mm4
+	movq	mm4,[24+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm0
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm0
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[8+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	sub	esp,8
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm2,mm0
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	pxor	mm6,mm7
+	movq	mm5,[40+esp]
+	paddq	mm3,mm2
+	movq	mm2,mm0
+	add	ebp,8
+	paddq	mm3,mm6
+	movq	mm6,[48+esp]
+	dec	edx
+	jnz	NEAR L$00500_14_sse2
+	movd	mm1,eax
+	movd	mm7,ebx
+	punpckldq	mm7,mm1
+	movq	mm1,mm4
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[32+esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	movq	mm0,mm3
+	movq	[72+esp],mm7
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[esp],mm0
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[56+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	paddq	mm7,[ebp]
+	pxor	mm3,mm4
+	movq	mm4,[24+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm0
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm0
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[8+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	sub	esp,8
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm2,mm0
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	pxor	mm6,mm7
+	movq	mm7,[192+esp]
+	paddq	mm3,mm2
+	movq	mm2,mm0
+	add	ebp,8
+	paddq	mm3,mm6
+	pxor	mm0,mm0
+	mov	edx,32
+	jmp	NEAR L$00616_79_sse2
+align	16
+L$00616_79_sse2:
+	movq	mm5,[88+esp]
+	movq	mm1,mm7
+	psrlq	mm7,1
+	movq	mm6,mm5
+	psrlq	mm5,6
+	psllq	mm1,56
+	paddq	mm0,mm3
+	movq	mm3,mm7
+	psrlq	mm7,6
+	pxor	mm3,mm1
+	psllq	mm1,7
+	pxor	mm3,mm7
+	psrlq	mm7,1
+	pxor	mm3,mm1
+	movq	mm1,mm5
+	psrlq	mm5,13
+	pxor	mm7,mm3
+	psllq	mm6,3
+	pxor	mm1,mm5
+	paddq	mm7,[200+esp]
+	pxor	mm1,mm6
+	psrlq	mm5,42
+	paddq	mm7,[128+esp]
+	pxor	mm1,mm5
+	psllq	mm6,42
+	movq	mm5,[40+esp]
+	pxor	mm1,mm6
+	movq	mm6,[48+esp]
+	paddq	mm7,mm1
+	movq	mm1,mm4
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[32+esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	movq	[72+esp],mm7
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[esp],mm0
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[56+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	paddq	mm7,[ebp]
+	pxor	mm3,mm4
+	movq	mm4,[24+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm0
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm0
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[8+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	sub	esp,8
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm2,mm0
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	pxor	mm6,mm7
+	movq	mm7,[192+esp]
+	paddq	mm2,mm6
+	add	ebp,8
+	movq	mm5,[88+esp]
+	movq	mm1,mm7
+	psrlq	mm7,1
+	movq	mm6,mm5
+	psrlq	mm5,6
+	psllq	mm1,56
+	paddq	mm2,mm3
+	movq	mm3,mm7
+	psrlq	mm7,6
+	pxor	mm3,mm1
+	psllq	mm1,7
+	pxor	mm3,mm7
+	psrlq	mm7,1
+	pxor	mm3,mm1
+	movq	mm1,mm5
+	psrlq	mm5,13
+	pxor	mm7,mm3
+	psllq	mm6,3
+	pxor	mm1,mm5
+	paddq	mm7,[200+esp]
+	pxor	mm1,mm6
+	psrlq	mm5,42
+	paddq	mm7,[128+esp]
+	pxor	mm1,mm5
+	psllq	mm6,42
+	movq	mm5,[40+esp]
+	pxor	mm1,mm6
+	movq	mm6,[48+esp]
+	paddq	mm7,mm1
+	movq	mm1,mm4
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[32+esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	movq	[72+esp],mm7
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[esp],mm2
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[56+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	paddq	mm7,[ebp]
+	pxor	mm3,mm4
+	movq	mm4,[24+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm2
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm2
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[8+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	sub	esp,8
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm0,mm2
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	pxor	mm6,mm7
+	movq	mm7,[192+esp]
+	paddq	mm0,mm6
+	add	ebp,8
+	dec	edx
+	jnz	NEAR L$00616_79_sse2
+	paddq	mm0,mm3
+	movq	mm1,[8+esp]
+	movq	mm3,[24+esp]
+	movq	mm5,[40+esp]
+	movq	mm6,[48+esp]
+	movq	mm7,[56+esp]
+	pxor	mm2,mm1
+	paddq	mm0,[esi]
+	paddq	mm1,[8+esi]
+	paddq	mm2,[16+esi]
+	paddq	mm3,[24+esi]
+	paddq	mm4,[32+esi]
+	paddq	mm5,[40+esi]
+	paddq	mm6,[48+esi]
+	paddq	mm7,[56+esi]
+	mov	eax,640
+	movq	[esi],mm0
+	movq	[8+esi],mm1
+	movq	[16+esi],mm2
+	movq	[24+esi],mm3
+	movq	[32+esi],mm4
+	movq	[40+esi],mm5
+	movq	[48+esi],mm6
+	movq	[56+esi],mm7
+	lea	esp,[eax*1+esp]
+	sub	ebp,eax
+	cmp	edi,DWORD [88+esp]
+	jb	NEAR L$004loop_sse2
+	mov	esp,DWORD [92+esp]
+	emms
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+align	32
+L$003SSSE3:
+	lea	edx,[esp-64]
+	sub	esp,256
+	movdqa	xmm1,[640+ebp]
+	movdqu	xmm0,[edi]
+db	102,15,56,0,193
+	movdqa	xmm3,[ebp]
+	movdqa	xmm2,xmm1
+	movdqu	xmm1,[16+edi]
+	paddq	xmm3,xmm0
+db	102,15,56,0,202
+	movdqa	[edx-128],xmm3
+	movdqa	xmm4,[16+ebp]
+	movdqa	xmm3,xmm2
+	movdqu	xmm2,[32+edi]
+	paddq	xmm4,xmm1
+db	102,15,56,0,211
+	movdqa	[edx-112],xmm4
+	movdqa	xmm5,[32+ebp]
+	movdqa	xmm4,xmm3
+	movdqu	xmm3,[48+edi]
+	paddq	xmm5,xmm2
+db	102,15,56,0,220
+	movdqa	[edx-96],xmm5
+	movdqa	xmm6,[48+ebp]
+	movdqa	xmm5,xmm4
+	movdqu	xmm4,[64+edi]
+	paddq	xmm6,xmm3
+db	102,15,56,0,229
+	movdqa	[edx-80],xmm6
+	movdqa	xmm7,[64+ebp]
+	movdqa	xmm6,xmm5
+	movdqu	xmm5,[80+edi]
+	paddq	xmm7,xmm4
+db	102,15,56,0,238
+	movdqa	[edx-64],xmm7
+	movdqa	[edx],xmm0
+	movdqa	xmm0,[80+ebp]
+	movdqa	xmm7,xmm6
+	movdqu	xmm6,[96+edi]
+	paddq	xmm0,xmm5
+db	102,15,56,0,247
+	movdqa	[edx-48],xmm0
+	movdqa	[16+edx],xmm1
+	movdqa	xmm1,[96+ebp]
+	movdqa	xmm0,xmm7
+	movdqu	xmm7,[112+edi]
+	paddq	xmm1,xmm6
+db	102,15,56,0,248
+	movdqa	[edx-32],xmm1
+	movdqa	[32+edx],xmm2
+	movdqa	xmm2,[112+ebp]
+	movdqa	xmm0,[edx]
+	paddq	xmm2,xmm7
+	movdqa	[edx-16],xmm2
+	nop
+align	32
+L$007loop_ssse3:
+	movdqa	xmm2,[16+edx]
+	movdqa	[48+edx],xmm3
+	lea	ebp,[128+ebp]
+	movq	[8+esp],mm1
+	mov	ebx,edi
+	movq	[16+esp],mm2
+	lea	edi,[128+edi]
+	movq	[24+esp],mm3
+	cmp	edi,eax
+	movq	[40+esp],mm5
+	cmovb	ebx,edi
+	movq	[48+esp],mm6
+	mov	ecx,4
+	pxor	mm2,mm1
+	movq	[56+esp],mm7
+	pxor	mm3,mm3
+	jmp	NEAR L$00800_47_ssse3
+align	32
+L$00800_47_ssse3:
+	movdqa	xmm3,xmm5
+	movdqa	xmm1,xmm2
+db	102,15,58,15,208,8
+	movdqa	[edx],xmm4
+db	102,15,58,15,220,8
+	movdqa	xmm4,xmm2
+	psrlq	xmm2,7
+	paddq	xmm0,xmm3
+	movdqa	xmm3,xmm4
+	psrlq	xmm4,1
+	psllq	xmm3,56
+	pxor	xmm2,xmm4
+	psrlq	xmm4,7
+	pxor	xmm2,xmm3
+	psllq	xmm3,7
+	pxor	xmm2,xmm4
+	movdqa	xmm4,xmm7
+	pxor	xmm2,xmm3
+	movdqa	xmm3,xmm7
+	psrlq	xmm4,6
+	paddq	xmm0,xmm2
+	movdqa	xmm2,xmm7
+	psrlq	xmm3,19
+	psllq	xmm2,3
+	pxor	xmm4,xmm3
+	psrlq	xmm3,42
+	pxor	xmm4,xmm2
+	psllq	xmm2,42
+	pxor	xmm4,xmm3
+	movdqa	xmm3,[32+edx]
+	pxor	xmm4,xmm2
+	movdqa	xmm2,[ebp]
+	movq	mm1,mm4
+	paddq	xmm0,xmm4
+	movq	mm7,[edx-128]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[32+esp],mm4
+	paddq	xmm2,xmm0
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm0,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[esp],mm0
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[56+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[24+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm0
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm0
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[8+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm2,mm0
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	pxor	mm6,mm7
+	movq	mm5,[32+esp]
+	paddq	mm2,mm6
+	movq	mm6,[40+esp]
+	movq	mm1,mm4
+	movq	mm7,[edx-120]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[24+esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm2,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[56+esp],mm2
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[48+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[16+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm2
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm2
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm0,mm2
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	pxor	mm6,mm7
+	movq	mm5,[24+esp]
+	paddq	mm0,mm6
+	movq	mm6,[32+esp]
+	movdqa	[edx-128],xmm2
+	movdqa	xmm4,xmm6
+	movdqa	xmm2,xmm3
+db	102,15,58,15,217,8
+	movdqa	[16+edx],xmm5
+db	102,15,58,15,229,8
+	movdqa	xmm5,xmm3
+	psrlq	xmm3,7
+	paddq	xmm1,xmm4
+	movdqa	xmm4,xmm5
+	psrlq	xmm5,1
+	psllq	xmm4,56
+	pxor	xmm3,xmm5
+	psrlq	xmm5,7
+	pxor	xmm3,xmm4
+	psllq	xmm4,7
+	pxor	xmm3,xmm5
+	movdqa	xmm5,xmm0
+	pxor	xmm3,xmm4
+	movdqa	xmm4,xmm0
+	psrlq	xmm5,6
+	paddq	xmm1,xmm3
+	movdqa	xmm3,xmm0
+	psrlq	xmm4,19
+	psllq	xmm3,3
+	pxor	xmm5,xmm4
+	psrlq	xmm4,42
+	pxor	xmm5,xmm3
+	psllq	xmm3,42
+	pxor	xmm5,xmm4
+	movdqa	xmm4,[48+edx]
+	pxor	xmm5,xmm3
+	movdqa	xmm3,[16+ebp]
+	movq	mm1,mm4
+	paddq	xmm1,xmm5
+	movq	mm7,[edx-112]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[16+esp],mm4
+	paddq	xmm3,xmm1
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm0,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[48+esp],mm0
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[40+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[8+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm0
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm0
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[56+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm2,mm0
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	pxor	mm6,mm7
+	movq	mm5,[16+esp]
+	paddq	mm2,mm6
+	movq	mm6,[24+esp]
+	movq	mm1,mm4
+	movq	mm7,[edx-104]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[8+esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm2,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[40+esp],mm2
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[32+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[esp]
+	paddq	mm3,mm7
+	movq	mm5,mm2
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm2
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[48+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm0,mm2
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	pxor	mm6,mm7
+	movq	mm5,[8+esp]
+	paddq	mm0,mm6
+	movq	mm6,[16+esp]
+	movdqa	[edx-112],xmm3
+	movdqa	xmm5,xmm7
+	movdqa	xmm3,xmm4
+db	102,15,58,15,226,8
+	movdqa	[32+edx],xmm6
+db	102,15,58,15,238,8
+	movdqa	xmm6,xmm4
+	psrlq	xmm4,7
+	paddq	xmm2,xmm5
+	movdqa	xmm5,xmm6
+	psrlq	xmm6,1
+	psllq	xmm5,56
+	pxor	xmm4,xmm6
+	psrlq	xmm6,7
+	pxor	xmm4,xmm5
+	psllq	xmm5,7
+	pxor	xmm4,xmm6
+	movdqa	xmm6,xmm1
+	pxor	xmm4,xmm5
+	movdqa	xmm5,xmm1
+	psrlq	xmm6,6
+	paddq	xmm2,xmm4
+	movdqa	xmm4,xmm1
+	psrlq	xmm5,19
+	psllq	xmm4,3
+	pxor	xmm6,xmm5
+	psrlq	xmm5,42
+	pxor	xmm6,xmm4
+	psllq	xmm4,42
+	pxor	xmm6,xmm5
+	movdqa	xmm5,[edx]
+	pxor	xmm6,xmm4
+	movdqa	xmm4,[32+ebp]
+	movq	mm1,mm4
+	paddq	xmm2,xmm6
+	movq	mm7,[edx-96]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[esp],mm4
+	paddq	xmm4,xmm2
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm0,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[32+esp],mm0
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[24+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[56+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm0
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm0
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[40+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm2,mm0
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	pxor	mm6,mm7
+	movq	mm5,[esp]
+	paddq	mm2,mm6
+	movq	mm6,[8+esp]
+	movq	mm1,mm4
+	movq	mm7,[edx-88]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[56+esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm2,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[24+esp],mm2
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[16+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[48+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm2
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm2
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[32+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm0,mm2
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	pxor	mm6,mm7
+	movq	mm5,[56+esp]
+	paddq	mm0,mm6
+	movq	mm6,[esp]
+	movdqa	[edx-96],xmm4
+	movdqa	xmm6,xmm0
+	movdqa	xmm4,xmm5
+db	102,15,58,15,235,8
+	movdqa	[48+edx],xmm7
+db	102,15,58,15,247,8
+	movdqa	xmm7,xmm5
+	psrlq	xmm5,7
+	paddq	xmm3,xmm6
+	movdqa	xmm6,xmm7
+	psrlq	xmm7,1
+	psllq	xmm6,56
+	pxor	xmm5,xmm7
+	psrlq	xmm7,7
+	pxor	xmm5,xmm6
+	psllq	xmm6,7
+	pxor	xmm5,xmm7
+	movdqa	xmm7,xmm2
+	pxor	xmm5,xmm6
+	movdqa	xmm6,xmm2
+	psrlq	xmm7,6
+	paddq	xmm3,xmm5
+	movdqa	xmm5,xmm2
+	psrlq	xmm6,19
+	psllq	xmm5,3
+	pxor	xmm7,xmm6
+	psrlq	xmm6,42
+	pxor	xmm7,xmm5
+	psllq	xmm5,42
+	pxor	xmm7,xmm6
+	movdqa	xmm6,[16+edx]
+	pxor	xmm7,xmm5
+	movdqa	xmm5,[48+ebp]
+	movq	mm1,mm4
+	paddq	xmm3,xmm7
+	movq	mm7,[edx-80]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[48+esp],mm4
+	paddq	xmm5,xmm3
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm0,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[16+esp],mm0
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[8+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[40+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm0
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm0
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[24+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm2,mm0
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	pxor	mm6,mm7
+	movq	mm5,[48+esp]
+	paddq	mm2,mm6
+	movq	mm6,[56+esp]
+	movq	mm1,mm4
+	movq	mm7,[edx-72]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[40+esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm2,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[8+esp],mm2
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[32+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm2
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm2
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[16+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm0,mm2
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	pxor	mm6,mm7
+	movq	mm5,[40+esp]
+	paddq	mm0,mm6
+	movq	mm6,[48+esp]
+	movdqa	[edx-80],xmm5
+	movdqa	xmm7,xmm1
+	movdqa	xmm5,xmm6
+db	102,15,58,15,244,8
+	movdqa	[edx],xmm0
+db	102,15,58,15,248,8
+	movdqa	xmm0,xmm6
+	psrlq	xmm6,7
+	paddq	xmm4,xmm7
+	movdqa	xmm7,xmm0
+	psrlq	xmm0,1
+	psllq	xmm7,56
+	pxor	xmm6,xmm0
+	psrlq	xmm0,7
+	pxor	xmm6,xmm7
+	psllq	xmm7,7
+	pxor	xmm6,xmm0
+	movdqa	xmm0,xmm3
+	pxor	xmm6,xmm7
+	movdqa	xmm7,xmm3
+	psrlq	xmm0,6
+	paddq	xmm4,xmm6
+	movdqa	xmm6,xmm3
+	psrlq	xmm7,19
+	psllq	xmm6,3
+	pxor	xmm0,xmm7
+	psrlq	xmm7,42
+	pxor	xmm0,xmm6
+	psllq	xmm6,42
+	pxor	xmm0,xmm7
+	movdqa	xmm7,[32+edx]
+	pxor	xmm0,xmm6
+	movdqa	xmm6,[64+ebp]
+	movq	mm1,mm4
+	paddq	xmm4,xmm0
+	movq	mm7,[edx-64]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[32+esp],mm4
+	paddq	xmm6,xmm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm0,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[esp],mm0
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[56+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[24+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm0
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm0
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[8+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm2,mm0
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	pxor	mm6,mm7
+	movq	mm5,[32+esp]
+	paddq	mm2,mm6
+	movq	mm6,[40+esp]
+	movq	mm1,mm4
+	movq	mm7,[edx-56]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[24+esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm2,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[56+esp],mm2
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[48+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[16+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm2
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm2
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm0,mm2
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	pxor	mm6,mm7
+	movq	mm5,[24+esp]
+	paddq	mm0,mm6
+	movq	mm6,[32+esp]
+	movdqa	[edx-64],xmm6
+	movdqa	xmm0,xmm2
+	movdqa	xmm6,xmm7
+db	102,15,58,15,253,8
+	movdqa	[16+edx],xmm1
+db	102,15,58,15,193,8
+	movdqa	xmm1,xmm7
+	psrlq	xmm7,7
+	paddq	xmm5,xmm0
+	movdqa	xmm0,xmm1
+	psrlq	xmm1,1
+	psllq	xmm0,56
+	pxor	xmm7,xmm1
+	psrlq	xmm1,7
+	pxor	xmm7,xmm0
+	psllq	xmm0,7
+	pxor	xmm7,xmm1
+	movdqa	xmm1,xmm4
+	pxor	xmm7,xmm0
+	movdqa	xmm0,xmm4
+	psrlq	xmm1,6
+	paddq	xmm5,xmm7
+	movdqa	xmm7,xmm4
+	psrlq	xmm0,19
+	psllq	xmm7,3
+	pxor	xmm1,xmm0
+	psrlq	xmm0,42
+	pxor	xmm1,xmm7
+	psllq	xmm7,42
+	pxor	xmm1,xmm0
+	movdqa	xmm0,[48+edx]
+	pxor	xmm1,xmm7
+	movdqa	xmm7,[80+ebp]
+	movq	mm1,mm4
+	paddq	xmm5,xmm1
+	movq	mm7,[edx-48]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[16+esp],mm4
+	paddq	xmm7,xmm5
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm0,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[48+esp],mm0
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[40+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[8+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm0
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm0
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[56+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm2,mm0
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	pxor	mm6,mm7
+	movq	mm5,[16+esp]
+	paddq	mm2,mm6
+	movq	mm6,[24+esp]
+	movq	mm1,mm4
+	movq	mm7,[edx-40]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[8+esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm2,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[40+esp],mm2
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[32+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[esp]
+	paddq	mm3,mm7
+	movq	mm5,mm2
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm2
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[48+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm0,mm2
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	pxor	mm6,mm7
+	movq	mm5,[8+esp]
+	paddq	mm0,mm6
+	movq	mm6,[16+esp]
+	movdqa	[edx-48],xmm7
+	movdqa	xmm1,xmm3
+	movdqa	xmm7,xmm0
+db	102,15,58,15,198,8
+	movdqa	[32+edx],xmm2
+db	102,15,58,15,202,8
+	movdqa	xmm2,xmm0
+	psrlq	xmm0,7
+	paddq	xmm6,xmm1
+	movdqa	xmm1,xmm2
+	psrlq	xmm2,1
+	psllq	xmm1,56
+	pxor	xmm0,xmm2
+	psrlq	xmm2,7
+	pxor	xmm0,xmm1
+	psllq	xmm1,7
+	pxor	xmm0,xmm2
+	movdqa	xmm2,xmm5
+	pxor	xmm0,xmm1
+	movdqa	xmm1,xmm5
+	psrlq	xmm2,6
+	paddq	xmm6,xmm0
+	movdqa	xmm0,xmm5
+	psrlq	xmm1,19
+	psllq	xmm0,3
+	pxor	xmm2,xmm1
+	psrlq	xmm1,42
+	pxor	xmm2,xmm0
+	psllq	xmm0,42
+	pxor	xmm2,xmm1
+	movdqa	xmm1,[edx]
+	pxor	xmm2,xmm0
+	movdqa	xmm0,[96+ebp]
+	movq	mm1,mm4
+	paddq	xmm6,xmm2
+	movq	mm7,[edx-32]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[esp],mm4
+	paddq	xmm0,xmm6
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm0,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[32+esp],mm0
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[24+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[56+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm0
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm0
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[40+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm2,mm0
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	pxor	mm6,mm7
+	movq	mm5,[esp]
+	paddq	mm2,mm6
+	movq	mm6,[8+esp]
+	movq	mm1,mm4
+	movq	mm7,[edx-24]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[56+esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm2,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[24+esp],mm2
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[16+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[48+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm2
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm2
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[32+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm0,mm2
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	pxor	mm6,mm7
+	movq	mm5,[56+esp]
+	paddq	mm0,mm6
+	movq	mm6,[esp]
+	movdqa	[edx-32],xmm0
+	movdqa	xmm2,xmm4
+	movdqa	xmm0,xmm1
+db	102,15,58,15,207,8
+	movdqa	[48+edx],xmm3
+db	102,15,58,15,211,8
+	movdqa	xmm3,xmm1
+	psrlq	xmm1,7
+	paddq	xmm7,xmm2
+	movdqa	xmm2,xmm3
+	psrlq	xmm3,1
+	psllq	xmm2,56
+	pxor	xmm1,xmm3
+	psrlq	xmm3,7
+	pxor	xmm1,xmm2
+	psllq	xmm2,7
+	pxor	xmm1,xmm3
+	movdqa	xmm3,xmm6
+	pxor	xmm1,xmm2
+	movdqa	xmm2,xmm6
+	psrlq	xmm3,6
+	paddq	xmm7,xmm1
+	movdqa	xmm1,xmm6
+	psrlq	xmm2,19
+	psllq	xmm1,3
+	pxor	xmm3,xmm2
+	psrlq	xmm2,42
+	pxor	xmm3,xmm1
+	psllq	xmm1,42
+	pxor	xmm3,xmm2
+	movdqa	xmm2,[16+edx]
+	pxor	xmm3,xmm1
+	movdqa	xmm1,[112+ebp]
+	movq	mm1,mm4
+	paddq	xmm7,xmm3
+	movq	mm7,[edx-16]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[48+esp],mm4
+	paddq	xmm1,xmm7
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm0,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[16+esp],mm0
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[8+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[40+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm0
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm0
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[24+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm2,mm0
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	pxor	mm6,mm7
+	movq	mm5,[48+esp]
+	paddq	mm2,mm6
+	movq	mm6,[56+esp]
+	movq	mm1,mm4
+	movq	mm7,[edx-8]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[40+esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm2,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[8+esp],mm2
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[32+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm2
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm2
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[16+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm0,mm2
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	pxor	mm6,mm7
+	movq	mm5,[40+esp]
+	paddq	mm0,mm6
+	movq	mm6,[48+esp]
+	movdqa	[edx-16],xmm1
+	lea	ebp,[128+ebp]
+	dec	ecx
+	jnz	NEAR L$00800_47_ssse3
+	movdqa	xmm1,[ebp]
+	lea	ebp,[ebp-640]
+	movdqu	xmm0,[ebx]
+db	102,15,56,0,193
+	movdqa	xmm3,[ebp]
+	movdqa	xmm2,xmm1
+	movdqu	xmm1,[16+ebx]
+	paddq	xmm3,xmm0
+db	102,15,56,0,202
+	movq	mm1,mm4
+	movq	mm7,[edx-128]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[32+esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm0,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[esp],mm0
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[56+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[24+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm0
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm0
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[8+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm2,mm0
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	pxor	mm6,mm7
+	movq	mm5,[32+esp]
+	paddq	mm2,mm6
+	movq	mm6,[40+esp]
+	movq	mm1,mm4
+	movq	mm7,[edx-120]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[24+esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm2,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[56+esp],mm2
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[48+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[16+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm2
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm2
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm0,mm2
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	pxor	mm6,mm7
+	movq	mm5,[24+esp]
+	paddq	mm0,mm6
+	movq	mm6,[32+esp]
+	movdqa	[edx-128],xmm3
+	movdqa	xmm4,[16+ebp]
+	movdqa	xmm3,xmm2
+	movdqu	xmm2,[32+ebx]
+	paddq	xmm4,xmm1
+db	102,15,56,0,211
+	movq	mm1,mm4
+	movq	mm7,[edx-112]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[16+esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm0,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[48+esp],mm0
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[40+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[8+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm0
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm0
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[56+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm2,mm0
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	pxor	mm6,mm7
+	movq	mm5,[16+esp]
+	paddq	mm2,mm6
+	movq	mm6,[24+esp]
+	movq	mm1,mm4
+	movq	mm7,[edx-104]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[8+esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm2,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[40+esp],mm2
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[32+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[esp]
+	paddq	mm3,mm7
+	movq	mm5,mm2
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm2
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[48+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm0,mm2
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	pxor	mm6,mm7
+	movq	mm5,[8+esp]
+	paddq	mm0,mm6
+	movq	mm6,[16+esp]
+	movdqa	[edx-112],xmm4
+	movdqa	xmm5,[32+ebp]
+	movdqa	xmm4,xmm3
+	movdqu	xmm3,[48+ebx]
+	paddq	xmm5,xmm2
+db	102,15,56,0,220
+	movq	mm1,mm4
+	movq	mm7,[edx-96]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm0,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[32+esp],mm0
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[24+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[56+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm0
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm0
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[40+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm2,mm0
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	pxor	mm6,mm7
+	movq	mm5,[esp]
+	paddq	mm2,mm6
+	movq	mm6,[8+esp]
+	movq	mm1,mm4
+	movq	mm7,[edx-88]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[56+esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm2,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[24+esp],mm2
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[16+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[48+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm2
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm2
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[32+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm0,mm2
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	pxor	mm6,mm7
+	movq	mm5,[56+esp]
+	paddq	mm0,mm6
+	movq	mm6,[esp]
+	movdqa	[edx-96],xmm5
+	movdqa	xmm6,[48+ebp]
+	movdqa	xmm5,xmm4
+	movdqu	xmm4,[64+ebx]
+	paddq	xmm6,xmm3
+db	102,15,56,0,229
+	movq	mm1,mm4
+	movq	mm7,[edx-80]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[48+esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm0,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[16+esp],mm0
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[8+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[40+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm0
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm0
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[24+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm2,mm0
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	pxor	mm6,mm7
+	movq	mm5,[48+esp]
+	paddq	mm2,mm6
+	movq	mm6,[56+esp]
+	movq	mm1,mm4
+	movq	mm7,[edx-72]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[40+esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm2,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[8+esp],mm2
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[32+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm2
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm2
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[16+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm0,mm2
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	pxor	mm6,mm7
+	movq	mm5,[40+esp]
+	paddq	mm0,mm6
+	movq	mm6,[48+esp]
+	movdqa	[edx-80],xmm6
+	movdqa	xmm7,[64+ebp]
+	movdqa	xmm6,xmm5
+	movdqu	xmm5,[80+ebx]
+	paddq	xmm7,xmm4
+db	102,15,56,0,238
+	movq	mm1,mm4
+	movq	mm7,[edx-64]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[32+esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm0,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[esp],mm0
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[56+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[24+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm0
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm0
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[8+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm2,mm0
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	pxor	mm6,mm7
+	movq	mm5,[32+esp]
+	paddq	mm2,mm6
+	movq	mm6,[40+esp]
+	movq	mm1,mm4
+	movq	mm7,[edx-56]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[24+esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm2,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[56+esp],mm2
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[48+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[16+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm2
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm2
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm0,mm2
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	pxor	mm6,mm7
+	movq	mm5,[24+esp]
+	paddq	mm0,mm6
+	movq	mm6,[32+esp]
+	movdqa	[edx-64],xmm7
+	movdqa	[edx],xmm0
+	movdqa	xmm0,[80+ebp]
+	movdqa	xmm7,xmm6
+	movdqu	xmm6,[96+ebx]
+	paddq	xmm0,xmm5
+db	102,15,56,0,247
+	movq	mm1,mm4
+	movq	mm7,[edx-48]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[16+esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm0,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[48+esp],mm0
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[40+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[8+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm0
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm0
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[56+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm2,mm0
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	pxor	mm6,mm7
+	movq	mm5,[16+esp]
+	paddq	mm2,mm6
+	movq	mm6,[24+esp]
+	movq	mm1,mm4
+	movq	mm7,[edx-40]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[8+esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm2,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[40+esp],mm2
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[32+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[esp]
+	paddq	mm3,mm7
+	movq	mm5,mm2
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm2
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[48+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm0,mm2
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	pxor	mm6,mm7
+	movq	mm5,[8+esp]
+	paddq	mm0,mm6
+	movq	mm6,[16+esp]
+	movdqa	[edx-48],xmm0
+	movdqa	[16+edx],xmm1
+	movdqa	xmm1,[96+ebp]
+	movdqa	xmm0,xmm7
+	movdqu	xmm7,[112+ebx]
+	paddq	xmm1,xmm6
+db	102,15,56,0,248
+	movq	mm1,mm4
+	movq	mm7,[edx-32]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm0,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[32+esp],mm0
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[24+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[56+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm0
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm0
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[40+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm2,mm0
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	pxor	mm6,mm7
+	movq	mm5,[esp]
+	paddq	mm2,mm6
+	movq	mm6,[8+esp]
+	movq	mm1,mm4
+	movq	mm7,[edx-24]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[56+esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm2,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[24+esp],mm2
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[16+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[48+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm2
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm2
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[32+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm0,mm2
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	pxor	mm6,mm7
+	movq	mm5,[56+esp]
+	paddq	mm0,mm6
+	movq	mm6,[esp]
+	movdqa	[edx-32],xmm1
+	movdqa	[32+edx],xmm2
+	movdqa	xmm2,[112+ebp]
+	movdqa	xmm0,[edx]
+	paddq	xmm2,xmm7
+	movq	mm1,mm4
+	movq	mm7,[edx-16]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[48+esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm0,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[16+esp],mm0
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[8+esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[40+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm0
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm0
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[24+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm2,mm0
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	pxor	mm6,mm7
+	movq	mm5,[48+esp]
+	paddq	mm2,mm6
+	movq	mm6,[56+esp]
+	movq	mm1,mm4
+	movq	mm7,[edx-8]
+	pxor	mm5,mm6
+	psrlq	mm1,14
+	movq	[40+esp],mm4
+	pand	mm5,mm4
+	psllq	mm4,23
+	paddq	mm2,mm3
+	movq	mm3,mm1
+	psrlq	mm1,4
+	pxor	mm5,mm6
+	pxor	mm3,mm4
+	psllq	mm4,23
+	pxor	mm3,mm1
+	movq	[8+esp],mm2
+	paddq	mm7,mm5
+	pxor	mm3,mm4
+	psrlq	mm1,23
+	paddq	mm7,[esp]
+	pxor	mm3,mm1
+	psllq	mm4,4
+	pxor	mm3,mm4
+	movq	mm4,[32+esp]
+	paddq	mm3,mm7
+	movq	mm5,mm2
+	psrlq	mm5,28
+	paddq	mm4,mm3
+	movq	mm6,mm2
+	movq	mm7,mm5
+	psllq	mm6,25
+	movq	mm1,[16+esp]
+	psrlq	mm5,6
+	pxor	mm7,mm6
+	psllq	mm6,5
+	pxor	mm7,mm5
+	pxor	mm2,mm1
+	psrlq	mm5,5
+	pxor	mm7,mm6
+	pand	mm0,mm2
+	psllq	mm6,6
+	pxor	mm7,mm5
+	pxor	mm0,mm1
+	pxor	mm6,mm7
+	movq	mm5,[40+esp]
+	paddq	mm0,mm6
+	movq	mm6,[48+esp]
+	movdqa	[edx-16],xmm2
+	movq	mm1,[8+esp]
+	paddq	mm0,mm3
+	movq	mm3,[24+esp]
+	movq	mm7,[56+esp]
+	pxor	mm2,mm1
+	paddq	mm0,[esi]
+	paddq	mm1,[8+esi]
+	paddq	mm2,[16+esi]
+	paddq	mm3,[24+esi]
+	paddq	mm4,[32+esi]
+	paddq	mm5,[40+esi]
+	paddq	mm6,[48+esi]
+	paddq	mm7,[56+esi]
+	movq	[esi],mm0
+	movq	[8+esi],mm1
+	movq	[16+esi],mm2
+	movq	[24+esi],mm3
+	movq	[32+esi],mm4
+	movq	[40+esi],mm5
+	movq	[48+esi],mm6
+	movq	[56+esi],mm7
+	cmp	edi,eax
+	jb	NEAR L$007loop_ssse3
+	mov	esp,DWORD [76+edx]
+	emms
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+align	16
+L$002loop_x86:
+	mov	eax,DWORD [edi]
+	mov	ebx,DWORD [4+edi]
+	mov	ecx,DWORD [8+edi]
+	mov	edx,DWORD [12+edi]
+	bswap	eax
+	bswap	ebx
+	bswap	ecx
+	bswap	edx
+	push	eax
+	push	ebx
+	push	ecx
+	push	edx
+	mov	eax,DWORD [16+edi]
+	mov	ebx,DWORD [20+edi]
+	mov	ecx,DWORD [24+edi]
+	mov	edx,DWORD [28+edi]
+	bswap	eax
+	bswap	ebx
+	bswap	ecx
+	bswap	edx
+	push	eax
+	push	ebx
+	push	ecx
+	push	edx
+	mov	eax,DWORD [32+edi]
+	mov	ebx,DWORD [36+edi]
+	mov	ecx,DWORD [40+edi]
+	mov	edx,DWORD [44+edi]
+	bswap	eax
+	bswap	ebx
+	bswap	ecx
+	bswap	edx
+	push	eax
+	push	ebx
+	push	ecx
+	push	edx
+	mov	eax,DWORD [48+edi]
+	mov	ebx,DWORD [52+edi]
+	mov	ecx,DWORD [56+edi]
+	mov	edx,DWORD [60+edi]
+	bswap	eax
+	bswap	ebx
+	bswap	ecx
+	bswap	edx
+	push	eax
+	push	ebx
+	push	ecx
+	push	edx
+	mov	eax,DWORD [64+edi]
+	mov	ebx,DWORD [68+edi]
+	mov	ecx,DWORD [72+edi]
+	mov	edx,DWORD [76+edi]
+	bswap	eax
+	bswap	ebx
+	bswap	ecx
+	bswap	edx
+	push	eax
+	push	ebx
+	push	ecx
+	push	edx
+	mov	eax,DWORD [80+edi]
+	mov	ebx,DWORD [84+edi]
+	mov	ecx,DWORD [88+edi]
+	mov	edx,DWORD [92+edi]
+	bswap	eax
+	bswap	ebx
+	bswap	ecx
+	bswap	edx
+	push	eax
+	push	ebx
+	push	ecx
+	push	edx
+	mov	eax,DWORD [96+edi]
+	mov	ebx,DWORD [100+edi]
+	mov	ecx,DWORD [104+edi]
+	mov	edx,DWORD [108+edi]
+	bswap	eax
+	bswap	ebx
+	bswap	ecx
+	bswap	edx
+	push	eax
+	push	ebx
+	push	ecx
+	push	edx
+	mov	eax,DWORD [112+edi]
+	mov	ebx,DWORD [116+edi]
+	mov	ecx,DWORD [120+edi]
+	mov	edx,DWORD [124+edi]
+	bswap	eax
+	bswap	ebx
+	bswap	ecx
+	bswap	edx
+	push	eax
+	push	ebx
+	push	ecx
+	push	edx
+	add	edi,128
+	sub	esp,72
+	mov	DWORD [204+esp],edi
+	lea	edi,[8+esp]
+	mov	ecx,16
+dd	2784229001
+align	16
+L$00900_15_x86:
+	mov	ecx,DWORD [40+esp]
+	mov	edx,DWORD [44+esp]
+	mov	esi,ecx
+	shr	ecx,9
+	mov	edi,edx
+	shr	edx,9
+	mov	ebx,ecx
+	shl	esi,14
+	mov	eax,edx
+	shl	edi,14
+	xor	ebx,esi
+	shr	ecx,5
+	xor	eax,edi
+	shr	edx,5
+	xor	eax,ecx
+	shl	esi,4
+	xor	ebx,edx
+	shl	edi,4
+	xor	ebx,esi
+	shr	ecx,4
+	xor	eax,edi
+	shr	edx,4
+	xor	eax,ecx
+	shl	esi,5
+	xor	ebx,edx
+	shl	edi,5
+	xor	eax,esi
+	xor	ebx,edi
+	mov	ecx,DWORD [48+esp]
+	mov	edx,DWORD [52+esp]
+	mov	esi,DWORD [56+esp]
+	mov	edi,DWORD [60+esp]
+	add	eax,DWORD [64+esp]
+	adc	ebx,DWORD [68+esp]
+	xor	ecx,esi
+	xor	edx,edi
+	and	ecx,DWORD [40+esp]
+	and	edx,DWORD [44+esp]
+	add	eax,DWORD [192+esp]
+	adc	ebx,DWORD [196+esp]
+	xor	ecx,esi
+	xor	edx,edi
+	mov	esi,DWORD [ebp]
+	mov	edi,DWORD [4+ebp]
+	add	eax,ecx
+	adc	ebx,edx
+	mov	ecx,DWORD [32+esp]
+	mov	edx,DWORD [36+esp]
+	add	eax,esi
+	adc	ebx,edi
+	mov	DWORD [esp],eax
+	mov	DWORD [4+esp],ebx
+	add	eax,ecx
+	adc	ebx,edx
+	mov	ecx,DWORD [8+esp]
+	mov	edx,DWORD [12+esp]
+	mov	DWORD [32+esp],eax
+	mov	DWORD [36+esp],ebx
+	mov	esi,ecx
+	shr	ecx,2
+	mov	edi,edx
+	shr	edx,2
+	mov	ebx,ecx
+	shl	esi,4
+	mov	eax,edx
+	shl	edi,4
+	xor	ebx,esi
+	shr	ecx,5
+	xor	eax,edi
+	shr	edx,5
+	xor	ebx,ecx
+	shl	esi,21
+	xor	eax,edx
+	shl	edi,21
+	xor	eax,esi
+	shr	ecx,21
+	xor	ebx,edi
+	shr	edx,21
+	xor	eax,ecx
+	shl	esi,5
+	xor	ebx,edx
+	shl	edi,5
+	xor	eax,esi
+	xor	ebx,edi
+	mov	ecx,DWORD [8+esp]
+	mov	edx,DWORD [12+esp]
+	mov	esi,DWORD [16+esp]
+	mov	edi,DWORD [20+esp]
+	add	eax,DWORD [esp]
+	adc	ebx,DWORD [4+esp]
+	or	ecx,esi
+	or	edx,edi
+	and	ecx,DWORD [24+esp]
+	and	edx,DWORD [28+esp]
+	and	esi,DWORD [8+esp]
+	and	edi,DWORD [12+esp]
+	or	ecx,esi
+	or	edx,edi
+	add	eax,ecx
+	adc	ebx,edx
+	mov	DWORD [esp],eax
+	mov	DWORD [4+esp],ebx
+	mov	dl,BYTE [ebp]
+	sub	esp,8
+	lea	ebp,[8+ebp]
+	cmp	dl,148
+	jne	NEAR L$00900_15_x86
+align	16
+L$01016_79_x86:
+	mov	ecx,DWORD [312+esp]
+	mov	edx,DWORD [316+esp]
+	mov	esi,ecx
+	shr	ecx,1
+	mov	edi,edx
+	shr	edx,1
+	mov	eax,ecx
+	shl	esi,24
+	mov	ebx,edx
+	shl	edi,24
+	xor	ebx,esi
+	shr	ecx,6
+	xor	eax,edi
+	shr	edx,6
+	xor	eax,ecx
+	shl	esi,7
+	xor	ebx,edx
+	shl	edi,1
+	xor	ebx,esi
+	shr	ecx,1
+	xor	eax,edi
+	shr	edx,1
+	xor	eax,ecx
+	shl	edi,6
+	xor	ebx,edx
+	xor	eax,edi
+	mov	DWORD [esp],eax
+	mov	DWORD [4+esp],ebx
+	mov	ecx,DWORD [208+esp]
+	mov	edx,DWORD [212+esp]
+	mov	esi,ecx
+	shr	ecx,6
+	mov	edi,edx
+	shr	edx,6
+	mov	eax,ecx
+	shl	esi,3
+	mov	ebx,edx
+	shl	edi,3
+	xor	eax,esi
+	shr	ecx,13
+	xor	ebx,edi
+	shr	edx,13
+	xor	eax,ecx
+	shl	esi,10
+	xor	ebx,edx
+	shl	edi,10
+	xor	ebx,esi
+	shr	ecx,10
+	xor	eax,edi
+	shr	edx,10
+	xor	ebx,ecx
+	shl	edi,13
+	xor	eax,edx
+	xor	eax,edi
+	mov	ecx,DWORD [320+esp]
+	mov	edx,DWORD [324+esp]
+	add	eax,DWORD [esp]
+	adc	ebx,DWORD [4+esp]
+	mov	esi,DWORD [248+esp]
+	mov	edi,DWORD [252+esp]
+	add	eax,ecx
+	adc	ebx,edx
+	add	eax,esi
+	adc	ebx,edi
+	mov	DWORD [192+esp],eax
+	mov	DWORD [196+esp],ebx
+	mov	ecx,DWORD [40+esp]
+	mov	edx,DWORD [44+esp]
+	mov	esi,ecx
+	shr	ecx,9
+	mov	edi,edx
+	shr	edx,9
+	mov	ebx,ecx
+	shl	esi,14
+	mov	eax,edx
+	shl	edi,14
+	xor	ebx,esi
+	shr	ecx,5
+	xor	eax,edi
+	shr	edx,5
+	xor	eax,ecx
+	shl	esi,4
+	xor	ebx,edx
+	shl	edi,4
+	xor	ebx,esi
+	shr	ecx,4
+	xor	eax,edi
+	shr	edx,4
+	xor	eax,ecx
+	shl	esi,5
+	xor	ebx,edx
+	shl	edi,5
+	xor	eax,esi
+	xor	ebx,edi
+	mov	ecx,DWORD [48+esp]
+	mov	edx,DWORD [52+esp]
+	mov	esi,DWORD [56+esp]
+	mov	edi,DWORD [60+esp]
+	add	eax,DWORD [64+esp]
+	adc	ebx,DWORD [68+esp]
+	xor	ecx,esi
+	xor	edx,edi
+	and	ecx,DWORD [40+esp]
+	and	edx,DWORD [44+esp]
+	add	eax,DWORD [192+esp]
+	adc	ebx,DWORD [196+esp]
+	xor	ecx,esi
+	xor	edx,edi
+	mov	esi,DWORD [ebp]
+	mov	edi,DWORD [4+ebp]
+	add	eax,ecx
+	adc	ebx,edx
+	mov	ecx,DWORD [32+esp]
+	mov	edx,DWORD [36+esp]
+	add	eax,esi
+	adc	ebx,edi
+	mov	DWORD [esp],eax
+	mov	DWORD [4+esp],ebx
+	add	eax,ecx
+	adc	ebx,edx
+	mov	ecx,DWORD [8+esp]
+	mov	edx,DWORD [12+esp]
+	mov	DWORD [32+esp],eax
+	mov	DWORD [36+esp],ebx
+	mov	esi,ecx
+	shr	ecx,2
+	mov	edi,edx
+	shr	edx,2
+	mov	ebx,ecx
+	shl	esi,4
+	mov	eax,edx
+	shl	edi,4
+	xor	ebx,esi
+	shr	ecx,5
+	xor	eax,edi
+	shr	edx,5
+	xor	ebx,ecx
+	shl	esi,21
+	xor	eax,edx
+	shl	edi,21
+	xor	eax,esi
+	shr	ecx,21
+	xor	ebx,edi
+	shr	edx,21
+	xor	eax,ecx
+	shl	esi,5
+	xor	ebx,edx
+	shl	edi,5
+	xor	eax,esi
+	xor	ebx,edi
+	mov	ecx,DWORD [8+esp]
+	mov	edx,DWORD [12+esp]
+	mov	esi,DWORD [16+esp]
+	mov	edi,DWORD [20+esp]
+	add	eax,DWORD [esp]
+	adc	ebx,DWORD [4+esp]
+	or	ecx,esi
+	or	edx,edi
+	and	ecx,DWORD [24+esp]
+	and	edx,DWORD [28+esp]
+	and	esi,DWORD [8+esp]
+	and	edi,DWORD [12+esp]
+	or	ecx,esi
+	or	edx,edi
+	add	eax,ecx
+	adc	ebx,edx
+	mov	DWORD [esp],eax
+	mov	DWORD [4+esp],ebx
+	mov	dl,BYTE [ebp]
+	sub	esp,8
+	lea	ebp,[8+ebp]
+	cmp	dl,23
+	jne	NEAR L$01016_79_x86
+	mov	esi,DWORD [840+esp]
+	mov	edi,DWORD [844+esp]
+	mov	eax,DWORD [esi]
+	mov	ebx,DWORD [4+esi]
+	mov	ecx,DWORD [8+esi]
+	mov	edx,DWORD [12+esi]
+	add	eax,DWORD [8+esp]
+	adc	ebx,DWORD [12+esp]
+	mov	DWORD [esi],eax
+	mov	DWORD [4+esi],ebx
+	add	ecx,DWORD [16+esp]
+	adc	edx,DWORD [20+esp]
+	mov	DWORD [8+esi],ecx
+	mov	DWORD [12+esi],edx
+	mov	eax,DWORD [16+esi]
+	mov	ebx,DWORD [20+esi]
+	mov	ecx,DWORD [24+esi]
+	mov	edx,DWORD [28+esi]
+	add	eax,DWORD [24+esp]
+	adc	ebx,DWORD [28+esp]
+	mov	DWORD [16+esi],eax
+	mov	DWORD [20+esi],ebx
+	add	ecx,DWORD [32+esp]
+	adc	edx,DWORD [36+esp]
+	mov	DWORD [24+esi],ecx
+	mov	DWORD [28+esi],edx
+	mov	eax,DWORD [32+esi]
+	mov	ebx,DWORD [36+esi]
+	mov	ecx,DWORD [40+esi]
+	mov	edx,DWORD [44+esi]
+	add	eax,DWORD [40+esp]
+	adc	ebx,DWORD [44+esp]
+	mov	DWORD [32+esi],eax
+	mov	DWORD [36+esi],ebx
+	add	ecx,DWORD [48+esp]
+	adc	edx,DWORD [52+esp]
+	mov	DWORD [40+esi],ecx
+	mov	DWORD [44+esi],edx
+	mov	eax,DWORD [48+esi]
+	mov	ebx,DWORD [52+esi]
+	mov	ecx,DWORD [56+esi]
+	mov	edx,DWORD [60+esi]
+	add	eax,DWORD [56+esp]
+	adc	ebx,DWORD [60+esp]
+	mov	DWORD [48+esi],eax
+	mov	DWORD [52+esi],ebx
+	add	ecx,DWORD [64+esp]
+	adc	edx,DWORD [68+esp]
+	mov	DWORD [56+esi],ecx
+	mov	DWORD [60+esi],edx
+	add	esp,840
+	sub	ebp,640
+	cmp	edi,DWORD [8+esp]
+	jb	NEAR L$002loop_x86
+	mov	esp,DWORD [12+esp]
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+align	64
+L$001K512:
+dd	3609767458,1116352408
+dd	602891725,1899447441
+dd	3964484399,3049323471
+dd	2173295548,3921009573
+dd	4081628472,961987163
+dd	3053834265,1508970993
+dd	2937671579,2453635748
+dd	3664609560,2870763221
+dd	2734883394,3624381080
+dd	1164996542,310598401
+dd	1323610764,607225278
+dd	3590304994,1426881987
+dd	4068182383,1925078388
+dd	991336113,2162078206
+dd	633803317,2614888103
+dd	3479774868,3248222580
+dd	2666613458,3835390401
+dd	944711139,4022224774
+dd	2341262773,264347078
+dd	2007800933,604807628
+dd	1495990901,770255983
+dd	1856431235,1249150122
+dd	3175218132,1555081692
+dd	2198950837,1996064986
+dd	3999719339,2554220882
+dd	766784016,2821834349
+dd	2566594879,2952996808
+dd	3203337956,3210313671
+dd	1034457026,3336571891
+dd	2466948901,3584528711
+dd	3758326383,113926993
+dd	168717936,338241895
+dd	1188179964,666307205
+dd	1546045734,773529912
+dd	1522805485,1294757372
+dd	2643833823,1396182291
+dd	2343527390,1695183700
+dd	1014477480,1986661051
+dd	1206759142,2177026350
+dd	344077627,2456956037
+dd	1290863460,2730485921
+dd	3158454273,2820302411
+dd	3505952657,3259730800
+dd	106217008,3345764771
+dd	3606008344,3516065817
+dd	1432725776,3600352804
+dd	1467031594,4094571909
+dd	851169720,275423344
+dd	3100823752,430227734
+dd	1363258195,506948616
+dd	3750685593,659060556
+dd	3785050280,883997877
+dd	3318307427,958139571
+dd	3812723403,1322822218
+dd	2003034995,1537002063
+dd	3602036899,1747873779
+dd	1575990012,1955562222
+dd	1125592928,2024104815
+dd	2716904306,2227730452
+dd	442776044,2361852424
+dd	593698344,2428436474
+dd	3733110249,2756734187
+dd	2999351573,3204031479
+dd	3815920427,3329325298
+dd	3928383900,3391569614
+dd	566280711,3515267271
+dd	3454069534,3940187606
+dd	4000239992,4118630271
+dd	1914138554,116418474
+dd	2731055270,174292421
+dd	3203993006,289380356
+dd	320620315,460393269
+dd	587496836,685471733
+dd	1086792851,852142971
+dd	365543100,1017036298
+dd	2618297676,1126000580
+dd	3409855158,1288033470
+dd	4234509866,1501505948
+dd	987167468,1607167915
+dd	1246189591,1816402316
+dd	67438087,66051
+dd	202182159,134810123
+db	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97
+db	110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
+db	67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+db	112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+db	62,0
+segment	.bss
+common	_OPENSSL_ia32cap_P 16
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/sha512-armv4-linux.S b/gen/bcm/sha512-armv4-linux.S
new file mode 100644
index 0000000..5500686
--- /dev/null
+++ b/gen/bcm/sha512-armv4-linux.S
@@ -0,0 +1,1855 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
+@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+@
+@ Licensed under the OpenSSL license (the "License").  You may not use
+@ this file except in compliance with the License.  You can obtain a copy
+@ in the file LICENSE in the source distribution or at
+@ https://www.openssl.org/source/license.html
+
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ SHA512 block procedure for ARMv4. September 2007.
+
+@ This code is ~4.5 (four and a half) times faster than code generated
+@ by gcc 3.4 and it spends ~72 clock cycles per byte [on a single-issue
+@ Xscale PXA250 core].
+@
+@ July 2010.
+@
+@ Rescheduling for the dual-issue pipeline resulted in a 6% improvement on
+@ the Cortex A8 core and ~40 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in a 7%
+@ improvement on the Cortex A8 core and ~38 cycles per byte.
+
+@ March 2011.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process
+@ one byte in 23.3 cycles or ~60% faster than integer-only code.
+
+@ August 2012.
+@
+@ Improve NEON performance by 12% on Snapdragon S4. In absolute
+@ terms it's 22.6 cycles per byte, which is a disappointing result.
+@ Technical writers asserted that the 3-way S4 pipeline can sustain
+@ multiple NEON instructions per cycle, but dual NEON issue could
+@ not be observed; see http://www.openssl.org/~appro/Snapdragon-S4.html
+@ for further details. On a side note, Cortex-A15 processes one byte in
+@ 16 cycles.
+
+@ Byte order [in]dependence. =========================================
+@
+@ Originally the caller was expected to maintain a specific *dword* order in
+@ h[0-7], namely with the most significant dword at the *lower* address, which
+@ was reflected in the two parameters below as 0 and 4. Now the caller is
+@ expected to maintain native byte order for whole 64-bit values.
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
+# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
+#else
+# define __ARM_MAX_ARCH__ 7
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+#endif
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
+.arch	armv7-a
+
+#ifdef __ARMEL__
+# define LO 0
+# define HI 4
+# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
+#else
+# define HI 0
+# define LO 4
+# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
+#endif
+
+.text
+#if defined(__thumb2__)
+.syntax	unified
+.thumb
+# define adrl adr
+#else
+.code	32
+#endif
+
+.type	K512,%object
+.align	5
+K512:
+	WORD64(0x428a2f98,0xd728ae22,	0x71374491,0x23ef65cd)
+	WORD64(0xb5c0fbcf,0xec4d3b2f,	0xe9b5dba5,0x8189dbbc)
+	WORD64(0x3956c25b,0xf348b538,	0x59f111f1,0xb605d019)
+	WORD64(0x923f82a4,0xaf194f9b,	0xab1c5ed5,0xda6d8118)
+	WORD64(0xd807aa98,0xa3030242,	0x12835b01,0x45706fbe)
+	WORD64(0x243185be,0x4ee4b28c,	0x550c7dc3,0xd5ffb4e2)
+	WORD64(0x72be5d74,0xf27b896f,	0x80deb1fe,0x3b1696b1)
+	WORD64(0x9bdc06a7,0x25c71235,	0xc19bf174,0xcf692694)
+	WORD64(0xe49b69c1,0x9ef14ad2,	0xefbe4786,0x384f25e3)
+	WORD64(0x0fc19dc6,0x8b8cd5b5,	0x240ca1cc,0x77ac9c65)
+	WORD64(0x2de92c6f,0x592b0275,	0x4a7484aa,0x6ea6e483)
+	WORD64(0x5cb0a9dc,0xbd41fbd4,	0x76f988da,0x831153b5)
+	WORD64(0x983e5152,0xee66dfab,	0xa831c66d,0x2db43210)
+	WORD64(0xb00327c8,0x98fb213f,	0xbf597fc7,0xbeef0ee4)
+	WORD64(0xc6e00bf3,0x3da88fc2,	0xd5a79147,0x930aa725)
+	WORD64(0x06ca6351,0xe003826f,	0x14292967,0x0a0e6e70)
+	WORD64(0x27b70a85,0x46d22ffc,	0x2e1b2138,0x5c26c926)
+	WORD64(0x4d2c6dfc,0x5ac42aed,	0x53380d13,0x9d95b3df)
+	WORD64(0x650a7354,0x8baf63de,	0x766a0abb,0x3c77b2a8)
+	WORD64(0x81c2c92e,0x47edaee6,	0x92722c85,0x1482353b)
+	WORD64(0xa2bfe8a1,0x4cf10364,	0xa81a664b,0xbc423001)
+	WORD64(0xc24b8b70,0xd0f89791,	0xc76c51a3,0x0654be30)
+	WORD64(0xd192e819,0xd6ef5218,	0xd6990624,0x5565a910)
+	WORD64(0xf40e3585,0x5771202a,	0x106aa070,0x32bbd1b8)
+	WORD64(0x19a4c116,0xb8d2d0c8,	0x1e376c08,0x5141ab53)
+	WORD64(0x2748774c,0xdf8eeb99,	0x34b0bcb5,0xe19b48a8)
+	WORD64(0x391c0cb3,0xc5c95a63,	0x4ed8aa4a,0xe3418acb)
+	WORD64(0x5b9cca4f,0x7763e373,	0x682e6ff3,0xd6b2b8a3)
+	WORD64(0x748f82ee,0x5defb2fc,	0x78a5636f,0x43172f60)
+	WORD64(0x84c87814,0xa1f0ab72,	0x8cc70208,0x1a6439ec)
+	WORD64(0x90befffa,0x23631e28,	0xa4506ceb,0xde82bde9)
+	WORD64(0xbef9a3f7,0xb2c67915,	0xc67178f2,0xe372532b)
+	WORD64(0xca273ece,0xea26619c,	0xd186b8c7,0x21c0c207)
+	WORD64(0xeada7dd6,0xcde0eb1e,	0xf57d4f7f,0xee6ed178)
+	WORD64(0x06f067aa,0x72176fba,	0x0a637dc5,0xa2c898a6)
+	WORD64(0x113f9804,0xbef90dae,	0x1b710b35,0x131c471b)
+	WORD64(0x28db77f5,0x23047d84,	0x32caab7b,0x40c72493)
+	WORD64(0x3c9ebe0a,0x15c9bebc,	0x431d67c4,0x9c100d4c)
+	WORD64(0x4cc5d4be,0xcb3e42b6,	0x597f299c,0xfc657e2a)
+	WORD64(0x5fcb6fab,0x3ad6faec,	0x6c44198c,0x4a475817)
+.size	K512,.-K512
+
+.globl	sha512_block_data_order_nohw
+.hidden	sha512_block_data_order_nohw
+.type	sha512_block_data_order_nohw,%function
+sha512_block_data_order_nohw:
+	add	r2,r1,r2,lsl#7	@ len to point at the end of inp
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+	adr	r14,K512
+	sub	sp,sp,#9*8
+
+	ldr	r7,[r0,#32+LO]
+	ldr	r8,[r0,#32+HI]
+	ldr	r9, [r0,#48+LO]
+	ldr	r10, [r0,#48+HI]
+	ldr	r11, [r0,#56+LO]
+	ldr	r12, [r0,#56+HI]
+.Loop:
+	str	r9, [sp,#48+0]
+	str	r10, [sp,#48+4]
+	str	r11, [sp,#56+0]
+	str	r12, [sp,#56+4]
+	ldr	r5,[r0,#0+LO]
+	ldr	r6,[r0,#0+HI]
+	ldr	r3,[r0,#8+LO]
+	ldr	r4,[r0,#8+HI]
+	ldr	r9, [r0,#16+LO]
+	ldr	r10, [r0,#16+HI]
+	ldr	r11, [r0,#24+LO]
+	ldr	r12, [r0,#24+HI]
+	str	r3,[sp,#8+0]
+	str	r4,[sp,#8+4]
+	str	r9, [sp,#16+0]
+	str	r10, [sp,#16+4]
+	str	r11, [sp,#24+0]
+	str	r12, [sp,#24+4]
+	ldr	r3,[r0,#40+LO]
+	ldr	r4,[r0,#40+HI]
+	str	r3,[sp,#40+0]
+	str	r4,[sp,#40+4]
+
+.L00_15:
+#if __ARM_ARCH<7
+	ldrb	r3,[r1,#7]
+	ldrb	r9, [r1,#6]
+	ldrb	r10, [r1,#5]
+	ldrb	r11, [r1,#4]
+	ldrb	r4,[r1,#3]
+	ldrb	r12, [r1,#2]
+	orr	r3,r3,r9,lsl#8
+	ldrb	r9, [r1,#1]
+	orr	r3,r3,r10,lsl#16
+	ldrb	r10, [r1],#8
+	orr	r3,r3,r11,lsl#24
+	orr	r4,r4,r12,lsl#8
+	orr	r4,r4,r9,lsl#16
+	orr	r4,r4,r10,lsl#24
+#else
+	ldr	r3,[r1,#4]
+	ldr	r4,[r1],#8
+#ifdef __ARMEL__
+	rev	r3,r3
+	rev	r4,r4
+#endif
+#endif
+	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
+	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
+	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
+	mov	r9,r7,lsr#14
+	str	r3,[sp,#64+0]
+	mov	r10,r8,lsr#14
+	str	r4,[sp,#64+4]
+	eor	r9,r9,r8,lsl#18
+	ldr	r11,[sp,#56+0]	@ h.lo
+	eor	r10,r10,r7,lsl#18
+	ldr	r12,[sp,#56+4]	@ h.hi
+	eor	r9,r9,r7,lsr#18
+	eor	r10,r10,r8,lsr#18
+	eor	r9,r9,r8,lsl#14
+	eor	r10,r10,r7,lsl#14
+	eor	r9,r9,r8,lsr#9
+	eor	r10,r10,r7,lsr#9
+	eor	r9,r9,r7,lsl#23
+	eor	r10,r10,r8,lsl#23	@ Sigma1(e)
+	adds	r3,r3,r9
+	ldr	r9,[sp,#40+0]	@ f.lo
+	adc	r4,r4,r10		@ T += Sigma1(e)
+	ldr	r10,[sp,#40+4]	@ f.hi
+	adds	r3,r3,r11
+	ldr	r11,[sp,#48+0]	@ g.lo
+	adc	r4,r4,r12		@ T += h
+	ldr	r12,[sp,#48+4]	@ g.hi
+
+	eor	r9,r9,r11
+	str	r7,[sp,#32+0]
+	eor	r10,r10,r12
+	str	r8,[sp,#32+4]
+	and	r9,r9,r7
+	str	r5,[sp,#0+0]
+	and	r10,r10,r8
+	str	r6,[sp,#0+4]
+	eor	r9,r9,r11
+	ldr	r11,[r14,#LO]	@ K[i].lo
+	eor	r10,r10,r12		@ Ch(e,f,g)
+	ldr	r12,[r14,#HI]	@ K[i].hi
+
+	adds	r3,r3,r9
+	ldr	r7,[sp,#24+0]	@ d.lo
+	adc	r4,r4,r10		@ T += Ch(e,f,g)
+	ldr	r8,[sp,#24+4]	@ d.hi
+	adds	r3,r3,r11
+	and	r9,r11,#0xff
+	adc	r4,r4,r12		@ T += K[i]
+	adds	r7,r7,r3
+	ldr	r11,[sp,#8+0]	@ b.lo
+	adc	r8,r8,r4		@ d += T
+	teq	r9,#148
+
+	ldr	r12,[sp,#16+0]	@ c.lo
+#if __ARM_ARCH>=7
+	it	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	orreq	r14,r14,#1
+	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
+	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
+	mov	r9,r5,lsr#28
+	mov	r10,r6,lsr#28
+	eor	r9,r9,r6,lsl#4
+	eor	r10,r10,r5,lsl#4
+	eor	r9,r9,r6,lsr#2
+	eor	r10,r10,r5,lsr#2
+	eor	r9,r9,r5,lsl#30
+	eor	r10,r10,r6,lsl#30
+	eor	r9,r9,r6,lsr#7
+	eor	r10,r10,r5,lsr#7
+	eor	r9,r9,r5,lsl#25
+	eor	r10,r10,r6,lsl#25	@ Sigma0(a)
+	adds	r3,r3,r9
+	and	r9,r5,r11
+	adc	r4,r4,r10		@ T += Sigma0(a)
+
+	ldr	r10,[sp,#8+4]	@ b.hi
+	orr	r5,r5,r11
+	ldr	r11,[sp,#16+4]	@ c.hi
+	and	r5,r5,r12
+	and	r12,r6,r10
+	orr	r6,r6,r10
+	orr	r5,r5,r9		@ Maj(a,b,c).lo
+	and	r6,r6,r11
+	adds	r5,r5,r3
+	orr	r6,r6,r12		@ Maj(a,b,c).hi
+	sub	sp,sp,#8
+	adc	r6,r6,r4		@ h += T
+	tst	r14,#1
+	add	r14,r14,#8
+	tst	r14,#1
+	beq	.L00_15
+	ldr	r9,[sp,#184+0]
+	ldr	r10,[sp,#184+4]
+	bic	r14,r14,#1
+.L16_79:
+	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
+	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
+	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
+	mov	r3,r9,lsr#1
+	ldr	r11,[sp,#80+0]
+	mov	r4,r10,lsr#1
+	ldr	r12,[sp,#80+4]
+	eor	r3,r3,r10,lsl#31
+	eor	r4,r4,r9,lsl#31
+	eor	r3,r3,r9,lsr#8
+	eor	r4,r4,r10,lsr#8
+	eor	r3,r3,r10,lsl#24
+	eor	r4,r4,r9,lsl#24
+	eor	r3,r3,r9,lsr#7
+	eor	r4,r4,r10,lsr#7
+	eor	r3,r3,r10,lsl#25
+
+	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
+	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
+	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
+	mov	r9,r11,lsr#19
+	mov	r10,r12,lsr#19
+	eor	r9,r9,r12,lsl#13
+	eor	r10,r10,r11,lsl#13
+	eor	r9,r9,r12,lsr#29
+	eor	r10,r10,r11,lsr#29
+	eor	r9,r9,r11,lsl#3
+	eor	r10,r10,r12,lsl#3
+	eor	r9,r9,r11,lsr#6
+	eor	r10,r10,r12,lsr#6
+	ldr	r11,[sp,#120+0]
+	eor	r9,r9,r12,lsl#26
+
+	ldr	r12,[sp,#120+4]
+	adds	r3,r3,r9
+	ldr	r9,[sp,#192+0]
+	adc	r4,r4,r10
+
+	ldr	r10,[sp,#192+4]
+	adds	r3,r3,r11
+	adc	r4,r4,r12
+	adds	r3,r3,r9
+	adc	r4,r4,r10
+	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
+	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
+	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
+	mov	r9,r7,lsr#14
+	str	r3,[sp,#64+0]
+	mov	r10,r8,lsr#14
+	str	r4,[sp,#64+4]
+	eor	r9,r9,r8,lsl#18
+	ldr	r11,[sp,#56+0]	@ h.lo
+	eor	r10,r10,r7,lsl#18
+	ldr	r12,[sp,#56+4]	@ h.hi
+	eor	r9,r9,r7,lsr#18
+	eor	r10,r10,r8,lsr#18
+	eor	r9,r9,r8,lsl#14
+	eor	r10,r10,r7,lsl#14
+	eor	r9,r9,r8,lsr#9
+	eor	r10,r10,r7,lsr#9
+	eor	r9,r9,r7,lsl#23
+	eor	r10,r10,r8,lsl#23	@ Sigma1(e)
+	adds	r3,r3,r9
+	ldr	r9,[sp,#40+0]	@ f.lo
+	adc	r4,r4,r10		@ T += Sigma1(e)
+	ldr	r10,[sp,#40+4]	@ f.hi
+	adds	r3,r3,r11
+	ldr	r11,[sp,#48+0]	@ g.lo
+	adc	r4,r4,r12		@ T += h
+	ldr	r12,[sp,#48+4]	@ g.hi
+
+	eor	r9,r9,r11
+	str	r7,[sp,#32+0]
+	eor	r10,r10,r12
+	str	r8,[sp,#32+4]
+	and	r9,r9,r7
+	str	r5,[sp,#0+0]
+	and	r10,r10,r8
+	str	r6,[sp,#0+4]
+	eor	r9,r9,r11
+	ldr	r11,[r14,#LO]	@ K[i].lo
+	eor	r10,r10,r12		@ Ch(e,f,g)
+	ldr	r12,[r14,#HI]	@ K[i].hi
+
+	adds	r3,r3,r9
+	ldr	r7,[sp,#24+0]	@ d.lo
+	adc	r4,r4,r10		@ T += Ch(e,f,g)
+	ldr	r8,[sp,#24+4]	@ d.hi
+	adds	r3,r3,r11
+	and	r9,r11,#0xff
+	adc	r4,r4,r12		@ T += K[i]
+	adds	r7,r7,r3
+	ldr	r11,[sp,#8+0]	@ b.lo
+	adc	r8,r8,r4		@ d += T
+	teq	r9,#23
+
+	ldr	r12,[sp,#16+0]	@ c.lo
+#if __ARM_ARCH>=7
+	it	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	orreq	r14,r14,#1
+	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
+	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
+	mov	r9,r5,lsr#28
+	mov	r10,r6,lsr#28
+	eor	r9,r9,r6,lsl#4
+	eor	r10,r10,r5,lsl#4
+	eor	r9,r9,r6,lsr#2
+	eor	r10,r10,r5,lsr#2
+	eor	r9,r9,r5,lsl#30
+	eor	r10,r10,r6,lsl#30
+	eor	r9,r9,r6,lsr#7
+	eor	r10,r10,r5,lsr#7
+	eor	r9,r9,r5,lsl#25
+	eor	r10,r10,r6,lsl#25	@ Sigma0(a)
+	adds	r3,r3,r9
+	and	r9,r5,r11
+	adc	r4,r4,r10		@ T += Sigma0(a)
+
+	ldr	r10,[sp,#8+4]	@ b.hi
+	orr	r5,r5,r11
+	ldr	r11,[sp,#16+4]	@ c.hi
+	and	r5,r5,r12
+	and	r12,r6,r10
+	orr	r6,r6,r10
+	orr	r5,r5,r9		@ Maj(a,b,c).lo
+	and	r6,r6,r11
+	adds	r5,r5,r3
+	orr	r6,r6,r12		@ Maj(a,b,c).hi
+	sub	sp,sp,#8
+	adc	r6,r6,r4		@ h += T
+	tst	r14,#1
+	add	r14,r14,#8
+#if __ARM_ARCH>=7
+	ittt	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	ldreq	r9,[sp,#184+0]
+	ldreq	r10,[sp,#184+4]
+	beq	.L16_79
+	bic	r14,r14,#1
+
+	ldr	r3,[sp,#8+0]
+	ldr	r4,[sp,#8+4]
+	ldr	r9, [r0,#0+LO]
+	ldr	r10, [r0,#0+HI]
+	ldr	r11, [r0,#8+LO]
+	ldr	r12, [r0,#8+HI]
+	adds	r9,r5,r9
+	str	r9, [r0,#0+LO]
+	adc	r10,r6,r10
+	str	r10, [r0,#0+HI]
+	adds	r11,r3,r11
+	str	r11, [r0,#8+LO]
+	adc	r12,r4,r12
+	str	r12, [r0,#8+HI]
+
+	ldr	r5,[sp,#16+0]
+	ldr	r6,[sp,#16+4]
+	ldr	r3,[sp,#24+0]
+	ldr	r4,[sp,#24+4]
+	ldr	r9, [r0,#16+LO]
+	ldr	r10, [r0,#16+HI]
+	ldr	r11, [r0,#24+LO]
+	ldr	r12, [r0,#24+HI]
+	adds	r9,r5,r9
+	str	r9, [r0,#16+LO]
+	adc	r10,r6,r10
+	str	r10, [r0,#16+HI]
+	adds	r11,r3,r11
+	str	r11, [r0,#24+LO]
+	adc	r12,r4,r12
+	str	r12, [r0,#24+HI]
+
+	ldr	r3,[sp,#40+0]
+	ldr	r4,[sp,#40+4]
+	ldr	r9, [r0,#32+LO]
+	ldr	r10, [r0,#32+HI]
+	ldr	r11, [r0,#40+LO]
+	ldr	r12, [r0,#40+HI]
+	adds	r7,r7,r9
+	str	r7,[r0,#32+LO]
+	adc	r8,r8,r10
+	str	r8,[r0,#32+HI]
+	adds	r11,r3,r11
+	str	r11, [r0,#40+LO]
+	adc	r12,r4,r12
+	str	r12, [r0,#40+HI]
+
+	ldr	r5,[sp,#48+0]
+	ldr	r6,[sp,#48+4]
+	ldr	r3,[sp,#56+0]
+	ldr	r4,[sp,#56+4]
+	ldr	r9, [r0,#48+LO]
+	ldr	r10, [r0,#48+HI]
+	ldr	r11, [r0,#56+LO]
+	ldr	r12, [r0,#56+HI]
+	adds	r9,r5,r9
+	str	r9, [r0,#48+LO]
+	adc	r10,r6,r10
+	str	r10, [r0,#48+HI]
+	adds	r11,r3,r11
+	str	r11, [r0,#56+LO]
+	adc	r12,r4,r12
+	str	r12, [r0,#56+HI]
+
+	add	sp,sp,#640
+	sub	r14,r14,#640
+
+	teq	r1,r2
+	bne	.Loop
+
+	add	sp,sp,#8*9		@ destroy frame
+#if __ARM_ARCH>=5
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+#else
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+.size	sha512_block_data_order_nohw,.-sha512_block_data_order_nohw
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.globl	sha512_block_data_order_neon
+.hidden	sha512_block_data_order_neon
+.type	sha512_block_data_order_neon,%function
+.align	4
+sha512_block_data_order_neon:
+	dmb	@ errata #451034 on early Cortex A8
+	add	r2,r1,r2,lsl#7	@ len to point at the end of inp
+	adr	r3,K512
+	VFP_ABI_PUSH
+	vldmia	r0,{d16,d17,d18,d19,d20,d21,d22,d23}		@ load context
+.Loop_neon:
+	vshr.u64	d24,d20,#14	@ 0
+#if 0<16
+	vld1.64	{d0},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d20,#18
+#if 0>0
+	vadd.i64	d16,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d20,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d20,#50
+	vsli.64	d25,d20,#46
+	vmov	d29,d20
+	vsli.64	d26,d20,#23
+#if 0<16 && defined(__ARMEL__)
+	vrev64.8	d0,d0
+#endif
+	veor	d25,d24
+	vbsl	d29,d21,d22		@ Ch(e,f,g)
+	vshr.u64	d24,d16,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d23
+	vshr.u64	d25,d16,#34
+	vsli.64	d24,d16,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d16,#39
+	vadd.i64	d28,d0
+	vsli.64	d25,d16,#30
+	veor	d30,d16,d17
+	vsli.64	d26,d16,#25
+	veor	d23,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d18,d17		@ Maj(a,b,c)
+	veor	d23,d26			@ Sigma0(a)
+	vadd.i64	d19,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d23,d30
+	vshr.u64	d24,d19,#14	@ 1
+#if 1<16
+	vld1.64	{d1},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d19,#18
+#if 1>0
+	vadd.i64	d23,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d19,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d19,#50
+	vsli.64	d25,d19,#46
+	vmov	d29,d19
+	vsli.64	d26,d19,#23
+#if 1<16 && defined(__ARMEL__)
+	vrev64.8	d1,d1
+#endif
+	veor	d25,d24
+	vbsl	d29,d20,d21		@ Ch(e,f,g)
+	vshr.u64	d24,d23,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d22
+	vshr.u64	d25,d23,#34
+	vsli.64	d24,d23,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d23,#39
+	vadd.i64	d28,d1
+	vsli.64	d25,d23,#30
+	veor	d30,d23,d16
+	vsli.64	d26,d23,#25
+	veor	d22,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d17,d16		@ Maj(a,b,c)
+	veor	d22,d26			@ Sigma0(a)
+	vadd.i64	d18,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d22,d30
+	vshr.u64	d24,d18,#14	@ 2
+#if 2<16
+	vld1.64	{d2},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d18,#18
+#if 2>0
+	vadd.i64	d22,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d18,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d18,#50
+	vsli.64	d25,d18,#46
+	vmov	d29,d18
+	vsli.64	d26,d18,#23
+#if 2<16 && defined(__ARMEL__)
+	vrev64.8	d2,d2
+#endif
+	veor	d25,d24
+	vbsl	d29,d19,d20		@ Ch(e,f,g)
+	vshr.u64	d24,d22,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d21
+	vshr.u64	d25,d22,#34
+	vsli.64	d24,d22,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d22,#39
+	vadd.i64	d28,d2
+	vsli.64	d25,d22,#30
+	veor	d30,d22,d23
+	vsli.64	d26,d22,#25
+	veor	d21,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d16,d23		@ Maj(a,b,c)
+	veor	d21,d26			@ Sigma0(a)
+	vadd.i64	d17,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d21,d30
+	vshr.u64	d24,d17,#14	@ 3
+#if 3<16
+	vld1.64	{d3},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d17,#18
+#if 3>0
+	vadd.i64	d21,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d17,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d17,#50
+	vsli.64	d25,d17,#46
+	vmov	d29,d17
+	vsli.64	d26,d17,#23
+#if 3<16 && defined(__ARMEL__)
+	vrev64.8	d3,d3
+#endif
+	veor	d25,d24
+	vbsl	d29,d18,d19		@ Ch(e,f,g)
+	vshr.u64	d24,d21,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d20
+	vshr.u64	d25,d21,#34
+	vsli.64	d24,d21,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d21,#39
+	vadd.i64	d28,d3
+	vsli.64	d25,d21,#30
+	veor	d30,d21,d22
+	vsli.64	d26,d21,#25
+	veor	d20,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d23,d22		@ Maj(a,b,c)
+	veor	d20,d26			@ Sigma0(a)
+	vadd.i64	d16,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d20,d30
+	vshr.u64	d24,d16,#14	@ 4
+#if 4<16
+	vld1.64	{d4},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d16,#18
+#if 4>0
+	vadd.i64	d20,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d16,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d16,#50
+	vsli.64	d25,d16,#46
+	vmov	d29,d16
+	vsli.64	d26,d16,#23
+#if 4<16 && defined(__ARMEL__)
+	vrev64.8	d4,d4
+#endif
+	veor	d25,d24
+	vbsl	d29,d17,d18		@ Ch(e,f,g)
+	vshr.u64	d24,d20,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d19
+	vshr.u64	d25,d20,#34
+	vsli.64	d24,d20,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d20,#39
+	vadd.i64	d28,d4
+	vsli.64	d25,d20,#30
+	veor	d30,d20,d21
+	vsli.64	d26,d20,#25
+	veor	d19,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d22,d21		@ Maj(a,b,c)
+	veor	d19,d26			@ Sigma0(a)
+	vadd.i64	d23,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d19,d30
+	vshr.u64	d24,d23,#14	@ 5
+#if 5<16
+	vld1.64	{d5},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d23,#18
+#if 5>0
+	vadd.i64	d19,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d23,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d23,#50
+	vsli.64	d25,d23,#46
+	vmov	d29,d23
+	vsli.64	d26,d23,#23
+#if 5<16 && defined(__ARMEL__)
+	vrev64.8	d5,d5
+#endif
+	veor	d25,d24
+	vbsl	d29,d16,d17		@ Ch(e,f,g)
+	vshr.u64	d24,d19,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d18
+	vshr.u64	d25,d19,#34
+	vsli.64	d24,d19,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d19,#39
+	vadd.i64	d28,d5
+	vsli.64	d25,d19,#30
+	veor	d30,d19,d20
+	vsli.64	d26,d19,#25
+	veor	d18,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d21,d20		@ Maj(a,b,c)
+	veor	d18,d26			@ Sigma0(a)
+	vadd.i64	d22,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d18,d30
+	vshr.u64	d24,d22,#14	@ 6
+#if 6<16
+	vld1.64	{d6},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d22,#18
+#if 6>0
+	vadd.i64	d18,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d22,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d22,#50
+	vsli.64	d25,d22,#46
+	vmov	d29,d22
+	vsli.64	d26,d22,#23
+#if 6<16 && defined(__ARMEL__)
+	vrev64.8	d6,d6
+#endif
+	veor	d25,d24
+	vbsl	d29,d23,d16		@ Ch(e,f,g)
+	vshr.u64	d24,d18,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d17
+	vshr.u64	d25,d18,#34
+	vsli.64	d24,d18,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d18,#39
+	vadd.i64	d28,d6
+	vsli.64	d25,d18,#30
+	veor	d30,d18,d19
+	vsli.64	d26,d18,#25
+	veor	d17,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d20,d19		@ Maj(a,b,c)
+	veor	d17,d26			@ Sigma0(a)
+	vadd.i64	d21,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d17,d30
+	vshr.u64	d24,d21,#14	@ 7
+#if 7<16
+	vld1.64	{d7},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d21,#18
+#if 7>0
+	vadd.i64	d17,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d21,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d21,#50
+	vsli.64	d25,d21,#46
+	vmov	d29,d21
+	vsli.64	d26,d21,#23
+#if 7<16 && defined(__ARMEL__)
+	vrev64.8	d7,d7
+#endif
+	veor	d25,d24
+	vbsl	d29,d22,d23		@ Ch(e,f,g)
+	vshr.u64	d24,d17,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d16
+	vshr.u64	d25,d17,#34
+	vsli.64	d24,d17,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d17,#39
+	vadd.i64	d28,d7
+	vsli.64	d25,d17,#30
+	veor	d30,d17,d18
+	vsli.64	d26,d17,#25
+	veor	d16,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d19,d18		@ Maj(a,b,c)
+	veor	d16,d26			@ Sigma0(a)
+	vadd.i64	d20,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d16,d30
+	vshr.u64	d24,d20,#14	@ 8
+#if 8<16
+	vld1.64	{d8},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d20,#18
+#if 8>0
+	vadd.i64	d16,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d20,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d20,#50
+	vsli.64	d25,d20,#46
+	vmov	d29,d20
+	vsli.64	d26,d20,#23
+#if 8<16 && defined(__ARMEL__)
+	vrev64.8	d8,d8
+#endif
+	veor	d25,d24
+	vbsl	d29,d21,d22		@ Ch(e,f,g)
+	vshr.u64	d24,d16,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d23
+	vshr.u64	d25,d16,#34
+	vsli.64	d24,d16,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d16,#39
+	vadd.i64	d28,d8
+	vsli.64	d25,d16,#30
+	veor	d30,d16,d17
+	vsli.64	d26,d16,#25
+	veor	d23,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d18,d17		@ Maj(a,b,c)
+	veor	d23,d26			@ Sigma0(a)
+	vadd.i64	d19,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d23,d30
+	vshr.u64	d24,d19,#14	@ 9
+#if 9<16
+	vld1.64	{d9},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d19,#18
+#if 9>0
+	vadd.i64	d23,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d19,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d19,#50
+	vsli.64	d25,d19,#46
+	vmov	d29,d19
+	vsli.64	d26,d19,#23
+#if 9<16 && defined(__ARMEL__)
+	vrev64.8	d9,d9
+#endif
+	veor	d25,d24
+	vbsl	d29,d20,d21		@ Ch(e,f,g)
+	vshr.u64	d24,d23,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d22
+	vshr.u64	d25,d23,#34
+	vsli.64	d24,d23,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d23,#39
+	vadd.i64	d28,d9
+	vsli.64	d25,d23,#30
+	veor	d30,d23,d16
+	vsli.64	d26,d23,#25
+	veor	d22,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d17,d16		@ Maj(a,b,c)
+	veor	d22,d26			@ Sigma0(a)
+	vadd.i64	d18,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d22,d30
+	vshr.u64	d24,d18,#14	@ 10
+#if 10<16
+	vld1.64	{d10},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d18,#18
+#if 10>0
+	vadd.i64	d22,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d18,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d18,#50
+	vsli.64	d25,d18,#46
+	vmov	d29,d18
+	vsli.64	d26,d18,#23
+#if 10<16 && defined(__ARMEL__)
+	vrev64.8	d10,d10
+#endif
+	veor	d25,d24
+	vbsl	d29,d19,d20		@ Ch(e,f,g)
+	vshr.u64	d24,d22,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d21
+	vshr.u64	d25,d22,#34
+	vsli.64	d24,d22,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d22,#39
+	vadd.i64	d28,d10
+	vsli.64	d25,d22,#30
+	veor	d30,d22,d23
+	vsli.64	d26,d22,#25
+	veor	d21,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d16,d23		@ Maj(a,b,c)
+	veor	d21,d26			@ Sigma0(a)
+	vadd.i64	d17,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d21,d30
+	vshr.u64	d24,d17,#14	@ 11
+#if 11<16
+	vld1.64	{d11},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d17,#18
+#if 11>0
+	vadd.i64	d21,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d17,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d17,#50
+	vsli.64	d25,d17,#46
+	vmov	d29,d17
+	vsli.64	d26,d17,#23
+#if 11<16 && defined(__ARMEL__)
+	vrev64.8	d11,d11
+#endif
+	veor	d25,d24
+	vbsl	d29,d18,d19		@ Ch(e,f,g)
+	vshr.u64	d24,d21,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d20
+	vshr.u64	d25,d21,#34
+	vsli.64	d24,d21,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d21,#39
+	vadd.i64	d28,d11
+	vsli.64	d25,d21,#30
+	veor	d30,d21,d22
+	vsli.64	d26,d21,#25
+	veor	d20,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d23,d22		@ Maj(a,b,c)
+	veor	d20,d26			@ Sigma0(a)
+	vadd.i64	d16,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d20,d30
+	vshr.u64	d24,d16,#14	@ 12
+#if 12<16
+	vld1.64	{d12},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d16,#18
+#if 12>0
+	vadd.i64	d20,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d16,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d16,#50
+	vsli.64	d25,d16,#46
+	vmov	d29,d16
+	vsli.64	d26,d16,#23
+#if 12<16 && defined(__ARMEL__)
+	vrev64.8	d12,d12
+#endif
+	veor	d25,d24
+	vbsl	d29,d17,d18		@ Ch(e,f,g)
+	vshr.u64	d24,d20,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d19
+	vshr.u64	d25,d20,#34
+	vsli.64	d24,d20,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d20,#39
+	vadd.i64	d28,d12
+	vsli.64	d25,d20,#30
+	veor	d30,d20,d21
+	vsli.64	d26,d20,#25
+	veor	d19,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d22,d21		@ Maj(a,b,c)
+	veor	d19,d26			@ Sigma0(a)
+	vadd.i64	d23,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d19,d30
+	vshr.u64	d24,d23,#14	@ 13
+#if 13<16
+	vld1.64	{d13},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d23,#18
+#if 13>0
+	vadd.i64	d19,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d23,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d23,#50
+	vsli.64	d25,d23,#46
+	vmov	d29,d23
+	vsli.64	d26,d23,#23
+#if 13<16 && defined(__ARMEL__)
+	vrev64.8	d13,d13
+#endif
+	veor	d25,d24
+	vbsl	d29,d16,d17		@ Ch(e,f,g)
+	vshr.u64	d24,d19,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d18
+	vshr.u64	d25,d19,#34
+	vsli.64	d24,d19,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d19,#39
+	vadd.i64	d28,d13
+	vsli.64	d25,d19,#30
+	veor	d30,d19,d20
+	vsli.64	d26,d19,#25
+	veor	d18,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d21,d20		@ Maj(a,b,c)
+	veor	d18,d26			@ Sigma0(a)
+	vadd.i64	d22,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d18,d30
+	vshr.u64	d24,d22,#14	@ 14
+#if 14<16
+	vld1.64	{d14},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d22,#18
+#if 14>0
+	vadd.i64	d18,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d22,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d22,#50
+	vsli.64	d25,d22,#46
+	vmov	d29,d22
+	vsli.64	d26,d22,#23
+#if 14<16 && defined(__ARMEL__)
+	vrev64.8	d14,d14
+#endif
+	veor	d25,d24
+	vbsl	d29,d23,d16		@ Ch(e,f,g)
+	vshr.u64	d24,d18,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d17
+	vshr.u64	d25,d18,#34
+	vsli.64	d24,d18,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d18,#39
+	vadd.i64	d28,d14
+	vsli.64	d25,d18,#30
+	veor	d30,d18,d19
+	vsli.64	d26,d18,#25
+	veor	d17,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d20,d19		@ Maj(a,b,c)
+	veor	d17,d26			@ Sigma0(a)
+	vadd.i64	d21,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d17,d30
+	vshr.u64	d24,d21,#14	@ 15
+#if 15<16
+	vld1.64	{d15},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d21,#18
+#if 15>0
+	vadd.i64	d17,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d21,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d21,#50
+	vsli.64	d25,d21,#46
+	vmov	d29,d21
+	vsli.64	d26,d21,#23
+#if 15<16 && defined(__ARMEL__)
+	vrev64.8	d15,d15
+#endif
+	veor	d25,d24
+	vbsl	d29,d22,d23		@ Ch(e,f,g)
+	vshr.u64	d24,d17,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d16
+	vshr.u64	d25,d17,#34
+	vsli.64	d24,d17,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d17,#39
+	vadd.i64	d28,d15
+	vsli.64	d25,d17,#30
+	veor	d30,d17,d18
+	vsli.64	d26,d17,#25
+	veor	d16,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d19,d18		@ Maj(a,b,c)
+	veor	d16,d26			@ Sigma0(a)
+	vadd.i64	d20,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d16,d30
+	mov	r12,#4
+.L16_79_neon:
+	subs	r12,#1
+	vshr.u64	q12,q7,#19
+	vshr.u64	q13,q7,#61
+	vadd.i64	d16,d30			@ h+=Maj from the past
+	vshr.u64	q15,q7,#6
+	vsli.64	q12,q7,#45
+	vext.8	q14,q0,q1,#8	@ X[i+1]
+	vsli.64	q13,q7,#3
+	veor	q15,q12
+	vshr.u64	q12,q14,#1
+	veor	q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q0,q15
+	vshr.u64	q15,q14,#7
+	vsli.64	q12,q14,#63
+	vsli.64	q13,q14,#56
+	vext.8	q14,q4,q5,#8	@ X[i+9]
+	veor	q15,q12
+	vshr.u64	d24,d20,#14		@ from NEON_00_15
+	vadd.i64	q0,q14
+	vshr.u64	d25,d20,#18		@ from NEON_00_15
+	veor	q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d20,#41		@ from NEON_00_15
+	vadd.i64	q0,q15
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d20,#50
+	vsli.64	d25,d20,#46
+	vmov	d29,d20
+	vsli.64	d26,d20,#23
+#if 16<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d21,d22		@ Ch(e,f,g)
+	vshr.u64	d24,d16,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d23
+	vshr.u64	d25,d16,#34
+	vsli.64	d24,d16,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d16,#39
+	vadd.i64	d28,d0
+	vsli.64	d25,d16,#30
+	veor	d30,d16,d17
+	vsli.64	d26,d16,#25
+	veor	d23,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d18,d17		@ Maj(a,b,c)
+	veor	d23,d26			@ Sigma0(a)
+	vadd.i64	d19,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d23,d30
+	vshr.u64	d24,d19,#14	@ 17
+#if 17<16
+	vld1.64	{d1},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d19,#18
+#if 17>0
+	vadd.i64	d23,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d19,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d19,#50
+	vsli.64	d25,d19,#46
+	vmov	d29,d19
+	vsli.64	d26,d19,#23
+#if 17<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d20,d21		@ Ch(e,f,g)
+	vshr.u64	d24,d23,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d22
+	vshr.u64	d25,d23,#34
+	vsli.64	d24,d23,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d23,#39
+	vadd.i64	d28,d1
+	vsli.64	d25,d23,#30
+	veor	d30,d23,d16
+	vsli.64	d26,d23,#25
+	veor	d22,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d17,d16		@ Maj(a,b,c)
+	veor	d22,d26			@ Sigma0(a)
+	vadd.i64	d18,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d22,d30
+	vshr.u64	q12,q0,#19
+	vshr.u64	q13,q0,#61
+	vadd.i64	d22,d30			@ h+=Maj from the past
+	vshr.u64	q15,q0,#6
+	vsli.64	q12,q0,#45
+	vext.8	q14,q1,q2,#8	@ X[i+1]
+	vsli.64	q13,q0,#3
+	veor	q15,q12
+	vshr.u64	q12,q14,#1
+	veor	q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q1,q15
+	vshr.u64	q15,q14,#7
+	vsli.64	q12,q14,#63
+	vsli.64	q13,q14,#56
+	vext.8	q14,q5,q6,#8	@ X[i+9]
+	veor	q15,q12
+	vshr.u64	d24,d18,#14		@ from NEON_00_15
+	vadd.i64	q1,q14
+	vshr.u64	d25,d18,#18		@ from NEON_00_15
+	veor	q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d18,#41		@ from NEON_00_15
+	vadd.i64	q1,q15
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d18,#50
+	vsli.64	d25,d18,#46
+	vmov	d29,d18
+	vsli.64	d26,d18,#23
+#if 18<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d19,d20		@ Ch(e,f,g)
+	vshr.u64	d24,d22,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d21
+	vshr.u64	d25,d22,#34
+	vsli.64	d24,d22,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d22,#39
+	vadd.i64	d28,d2
+	vsli.64	d25,d22,#30
+	veor	d30,d22,d23
+	vsli.64	d26,d22,#25
+	veor	d21,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d16,d23		@ Maj(a,b,c)
+	veor	d21,d26			@ Sigma0(a)
+	vadd.i64	d17,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d21,d30
+	vshr.u64	d24,d17,#14	@ 19
+#if 19<16
+	vld1.64	{d3},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d17,#18
+#if 19>0
+	vadd.i64	d21,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d17,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d17,#50
+	vsli.64	d25,d17,#46
+	vmov	d29,d17
+	vsli.64	d26,d17,#23
+#if 19<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d18,d19		@ Ch(e,f,g)
+	vshr.u64	d24,d21,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d20
+	vshr.u64	d25,d21,#34
+	vsli.64	d24,d21,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d21,#39
+	vadd.i64	d28,d3
+	vsli.64	d25,d21,#30
+	veor	d30,d21,d22
+	vsli.64	d26,d21,#25
+	veor	d20,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d23,d22		@ Maj(a,b,c)
+	veor	d20,d26			@ Sigma0(a)
+	vadd.i64	d16,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d20,d30
+	vshr.u64	q12,q1,#19
+	vshr.u64	q13,q1,#61
+	vadd.i64	d20,d30			@ h+=Maj from the past
+	vshr.u64	q15,q1,#6
+	vsli.64	q12,q1,#45
+	vext.8	q14,q2,q3,#8	@ X[i+1]
+	vsli.64	q13,q1,#3
+	veor	q15,q12
+	vshr.u64	q12,q14,#1
+	veor	q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q2,q15
+	vshr.u64	q15,q14,#7
+	vsli.64	q12,q14,#63
+	vsli.64	q13,q14,#56
+	vext.8	q14,q6,q7,#8	@ X[i+9]
+	veor	q15,q12
+	vshr.u64	d24,d16,#14		@ from NEON_00_15
+	vadd.i64	q2,q14
+	vshr.u64	d25,d16,#18		@ from NEON_00_15
+	veor	q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d16,#41		@ from NEON_00_15
+	vadd.i64	q2,q15
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d16,#50
+	vsli.64	d25,d16,#46
+	vmov	d29,d16
+	vsli.64	d26,d16,#23
+#if 20<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d17,d18		@ Ch(e,f,g)
+	vshr.u64	d24,d20,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d19
+	vshr.u64	d25,d20,#34
+	vsli.64	d24,d20,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d20,#39
+	vadd.i64	d28,d4
+	vsli.64	d25,d20,#30
+	veor	d30,d20,d21
+	vsli.64	d26,d20,#25
+	veor	d19,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d22,d21		@ Maj(a,b,c)
+	veor	d19,d26			@ Sigma0(a)
+	vadd.i64	d23,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d19,d30
+	vshr.u64	d24,d23,#14	@ 21
+#if 21<16
+	vld1.64	{d5},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d23,#18
+#if 21>0
+	vadd.i64	d19,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d23,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d23,#50
+	vsli.64	d25,d23,#46
+	vmov	d29,d23
+	vsli.64	d26,d23,#23
+#if 21<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d16,d17		@ Ch(e,f,g)
+	vshr.u64	d24,d19,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d18
+	vshr.u64	d25,d19,#34
+	vsli.64	d24,d19,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d19,#39
+	vadd.i64	d28,d5
+	vsli.64	d25,d19,#30
+	veor	d30,d19,d20
+	vsli.64	d26,d19,#25
+	veor	d18,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d21,d20		@ Maj(a,b,c)
+	veor	d18,d26			@ Sigma0(a)
+	vadd.i64	d22,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d18,d30
+	vshr.u64	q12,q2,#19
+	vshr.u64	q13,q2,#61
+	vadd.i64	d18,d30			@ h+=Maj from the past
+	vshr.u64	q15,q2,#6
+	vsli.64	q12,q2,#45
+	vext.8	q14,q3,q4,#8	@ X[i+1]
+	vsli.64	q13,q2,#3
+	veor	q15,q12
+	vshr.u64	q12,q14,#1
+	veor	q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q3,q15
+	vshr.u64	q15,q14,#7
+	vsli.64	q12,q14,#63
+	vsli.64	q13,q14,#56
+	vext.8	q14,q7,q0,#8	@ X[i+9]
+	veor	q15,q12
+	vshr.u64	d24,d22,#14		@ from NEON_00_15
+	vadd.i64	q3,q14
+	vshr.u64	d25,d22,#18		@ from NEON_00_15
+	veor	q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d22,#41		@ from NEON_00_15
+	vadd.i64	q3,q15
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d22,#50
+	vsli.64	d25,d22,#46
+	vmov	d29,d22
+	vsli.64	d26,d22,#23
+#if 22<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d23,d16		@ Ch(e,f,g)
+	vshr.u64	d24,d18,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d17
+	vshr.u64	d25,d18,#34
+	vsli.64	d24,d18,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d18,#39
+	vadd.i64	d28,d6
+	vsli.64	d25,d18,#30
+	veor	d30,d18,d19
+	vsli.64	d26,d18,#25
+	veor	d17,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d20,d19		@ Maj(a,b,c)
+	veor	d17,d26			@ Sigma0(a)
+	vadd.i64	d21,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d17,d30
+	vshr.u64	d24,d21,#14	@ 23
+#if 23<16
+	vld1.64	{d7},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d21,#18
+#if 23>0
+	vadd.i64	d17,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d21,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d21,#50
+	vsli.64	d25,d21,#46
+	vmov	d29,d21
+	vsli.64	d26,d21,#23
+#if 23<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d22,d23		@ Ch(e,f,g)
+	vshr.u64	d24,d17,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d16
+	vshr.u64	d25,d17,#34
+	vsli.64	d24,d17,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d17,#39
+	vadd.i64	d28,d7
+	vsli.64	d25,d17,#30
+	veor	d30,d17,d18
+	vsli.64	d26,d17,#25
+	veor	d16,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d19,d18		@ Maj(a,b,c)
+	veor	d16,d26			@ Sigma0(a)
+	vadd.i64	d20,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d16,d30
+	vshr.u64	q12,q3,#19
+	vshr.u64	q13,q3,#61
+	vadd.i64	d16,d30			@ h+=Maj from the past
+	vshr.u64	q15,q3,#6
+	vsli.64	q12,q3,#45
+	vext.8	q14,q4,q5,#8	@ X[i+1]
+	vsli.64	q13,q3,#3
+	veor	q15,q12
+	vshr.u64	q12,q14,#1
+	veor	q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q4,q15
+	vshr.u64	q15,q14,#7
+	vsli.64	q12,q14,#63
+	vsli.64	q13,q14,#56
+	vext.8	q14,q0,q1,#8	@ X[i+9]
+	veor	q15,q12
+	vshr.u64	d24,d20,#14		@ from NEON_00_15
+	vadd.i64	q4,q14
+	vshr.u64	d25,d20,#18		@ from NEON_00_15
+	veor	q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d20,#41		@ from NEON_00_15
+	vadd.i64	q4,q15
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d20,#50
+	vsli.64	d25,d20,#46
+	vmov	d29,d20
+	vsli.64	d26,d20,#23
+#if 24<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d21,d22		@ Ch(e,f,g)
+	vshr.u64	d24,d16,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d23
+	vshr.u64	d25,d16,#34
+	vsli.64	d24,d16,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d16,#39
+	vadd.i64	d28,d8
+	vsli.64	d25,d16,#30
+	veor	d30,d16,d17
+	vsli.64	d26,d16,#25
+	veor	d23,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d18,d17		@ Maj(a,b,c)
+	veor	d23,d26			@ Sigma0(a)
+	vadd.i64	d19,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d23,d30
+	vshr.u64	d24,d19,#14	@ 25
+#if 25<16
+	vld1.64	{d9},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d19,#18
+#if 25>0
+	vadd.i64	d23,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d19,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d19,#50
+	vsli.64	d25,d19,#46
+	vmov	d29,d19
+	vsli.64	d26,d19,#23
+#if 25<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d20,d21		@ Ch(e,f,g)
+	vshr.u64	d24,d23,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d22
+	vshr.u64	d25,d23,#34
+	vsli.64	d24,d23,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d23,#39
+	vadd.i64	d28,d9
+	vsli.64	d25,d23,#30
+	veor	d30,d23,d16
+	vsli.64	d26,d23,#25
+	veor	d22,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d17,d16		@ Maj(a,b,c)
+	veor	d22,d26			@ Sigma0(a)
+	vadd.i64	d18,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d22,d30
+	vshr.u64	q12,q4,#19
+	vshr.u64	q13,q4,#61
+	vadd.i64	d22,d30			@ h+=Maj from the past
+	vshr.u64	q15,q4,#6
+	vsli.64	q12,q4,#45
+	vext.8	q14,q5,q6,#8	@ X[i+1]
+	vsli.64	q13,q4,#3
+	veor	q15,q12
+	vshr.u64	q12,q14,#1
+	veor	q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q5,q15
+	vshr.u64	q15,q14,#7
+	vsli.64	q12,q14,#63
+	vsli.64	q13,q14,#56
+	vext.8	q14,q1,q2,#8	@ X[i+9]
+	veor	q15,q12
+	vshr.u64	d24,d18,#14		@ from NEON_00_15
+	vadd.i64	q5,q14
+	vshr.u64	d25,d18,#18		@ from NEON_00_15
+	veor	q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d18,#41		@ from NEON_00_15
+	vadd.i64	q5,q15
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d18,#50
+	vsli.64	d25,d18,#46
+	vmov	d29,d18
+	vsli.64	d26,d18,#23
+#if 26<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d19,d20		@ Ch(e,f,g)
+	vshr.u64	d24,d22,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d21
+	vshr.u64	d25,d22,#34
+	vsli.64	d24,d22,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d22,#39
+	vadd.i64	d28,d10
+	vsli.64	d25,d22,#30
+	veor	d30,d22,d23
+	vsli.64	d26,d22,#25
+	veor	d21,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d16,d23		@ Maj(a,b,c)
+	veor	d21,d26			@ Sigma0(a)
+	vadd.i64	d17,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d21,d30
+	vshr.u64	d24,d17,#14	@ 27
+#if 27<16
+	vld1.64	{d11},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d17,#18
+#if 27>0
+	vadd.i64	d21,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d17,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d17,#50
+	vsli.64	d25,d17,#46
+	vmov	d29,d17
+	vsli.64	d26,d17,#23
+#if 27<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d18,d19		@ Ch(e,f,g)
+	vshr.u64	d24,d21,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d20
+	vshr.u64	d25,d21,#34
+	vsli.64	d24,d21,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d21,#39
+	vadd.i64	d28,d11
+	vsli.64	d25,d21,#30
+	veor	d30,d21,d22
+	vsli.64	d26,d21,#25
+	veor	d20,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d23,d22		@ Maj(a,b,c)
+	veor	d20,d26			@ Sigma0(a)
+	vadd.i64	d16,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d20,d30
+	vshr.u64	q12,q5,#19
+	vshr.u64	q13,q5,#61
+	vadd.i64	d20,d30			@ h+=Maj from the past
+	vshr.u64	q15,q5,#6
+	vsli.64	q12,q5,#45
+	vext.8	q14,q6,q7,#8	@ X[i+1]
+	vsli.64	q13,q5,#3
+	veor	q15,q12
+	vshr.u64	q12,q14,#1
+	veor	q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q6,q15
+	vshr.u64	q15,q14,#7
+	vsli.64	q12,q14,#63
+	vsli.64	q13,q14,#56
+	vext.8	q14,q2,q3,#8	@ X[i+9]
+	veor	q15,q12
+	vshr.u64	d24,d16,#14		@ from NEON_00_15
+	vadd.i64	q6,q14
+	vshr.u64	d25,d16,#18		@ from NEON_00_15
+	veor	q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d16,#41		@ from NEON_00_15
+	vadd.i64	q6,q15
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d16,#50
+	vsli.64	d25,d16,#46
+	vmov	d29,d16
+	vsli.64	d26,d16,#23
+#if 28<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d17,d18		@ Ch(e,f,g)
+	vshr.u64	d24,d20,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d19
+	vshr.u64	d25,d20,#34
+	vsli.64	d24,d20,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d20,#39
+	vadd.i64	d28,d12
+	vsli.64	d25,d20,#30
+	veor	d30,d20,d21
+	vsli.64	d26,d20,#25
+	veor	d19,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d22,d21		@ Maj(a,b,c)
+	veor	d19,d26			@ Sigma0(a)
+	vadd.i64	d23,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d19,d30
+	vshr.u64	d24,d23,#14	@ 29
+#if 29<16
+	vld1.64	{d13},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d23,#18
+#if 29>0
+	vadd.i64	d19,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d23,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d23,#50
+	vsli.64	d25,d23,#46
+	vmov	d29,d23
+	vsli.64	d26,d23,#23
+#if 29<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d16,d17		@ Ch(e,f,g)
+	vshr.u64	d24,d19,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d18
+	vshr.u64	d25,d19,#34
+	vsli.64	d24,d19,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d19,#39
+	vadd.i64	d28,d13
+	vsli.64	d25,d19,#30
+	veor	d30,d19,d20
+	vsli.64	d26,d19,#25
+	veor	d18,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d21,d20		@ Maj(a,b,c)
+	veor	d18,d26			@ Sigma0(a)
+	vadd.i64	d22,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d18,d30
+	vshr.u64	q12,q6,#19
+	vshr.u64	q13,q6,#61
+	vadd.i64	d18,d30			@ h+=Maj from the past
+	vshr.u64	q15,q6,#6
+	vsli.64	q12,q6,#45
+	vext.8	q14,q7,q0,#8	@ X[i+1]
+	vsli.64	q13,q6,#3
+	veor	q15,q12
+	vshr.u64	q12,q14,#1
+	veor	q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q7,q15
+	vshr.u64	q15,q14,#7
+	vsli.64	q12,q14,#63
+	vsli.64	q13,q14,#56
+	vext.8	q14,q3,q4,#8	@ X[i+9]
+	veor	q15,q12
+	vshr.u64	d24,d22,#14		@ from NEON_00_15
+	vadd.i64	q7,q14
+	vshr.u64	d25,d22,#18		@ from NEON_00_15
+	veor	q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d22,#41		@ from NEON_00_15
+	vadd.i64	q7,q15
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d22,#50
+	vsli.64	d25,d22,#46
+	vmov	d29,d22
+	vsli.64	d26,d22,#23
+#if 30<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d23,d16		@ Ch(e,f,g)
+	vshr.u64	d24,d18,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d17
+	vshr.u64	d25,d18,#34
+	vsli.64	d24,d18,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d18,#39
+	vadd.i64	d28,d14
+	vsli.64	d25,d18,#30
+	veor	d30,d18,d19
+	vsli.64	d26,d18,#25
+	veor	d17,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d20,d19		@ Maj(a,b,c)
+	veor	d17,d26			@ Sigma0(a)
+	vadd.i64	d21,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d17,d30
+	vshr.u64	d24,d21,#14	@ 31
+#if 31<16
+	vld1.64	{d15},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d21,#18
+#if 31>0
+	vadd.i64	d17,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d21,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d21,#50
+	vsli.64	d25,d21,#46
+	vmov	d29,d21
+	vsli.64	d26,d21,#23
+#if 31<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d22,d23		@ Ch(e,f,g)
+	vshr.u64	d24,d17,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d16
+	vshr.u64	d25,d17,#34
+	vsli.64	d24,d17,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d17,#39
+	vadd.i64	d28,d15
+	vsli.64	d25,d17,#30
+	veor	d30,d17,d18
+	vsli.64	d26,d17,#25
+	veor	d16,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d19,d18		@ Maj(a,b,c)
+	veor	d16,d26			@ Sigma0(a)
+	vadd.i64	d20,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d16,d30
+	bne	.L16_79_neon
+
+	vadd.i64	d16,d30		@ h+=Maj from the past
+	vldmia	r0,{d24,d25,d26,d27,d28,d29,d30,d31}	@ load context to temp
+	vadd.i64	q8,q12		@ vectorized accumulate
+	vadd.i64	q9,q13
+	vadd.i64	q10,q14
+	vadd.i64	q11,q15
+	vstmia	r0,{d16,d17,d18,d19,d20,d21,d22,d23}	@ save context
+	teq	r1,r2
+	sub	r3,#640	@ rewind K512
+	bne	.Loop_neon
+
+	VFP_ABI_POP
+	bx	lr				@ .word	0xe12fff1e
+.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
+#endif
+.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__)
diff --git a/gen/bcm/sha512-armv8-apple.S b/gen/bcm/sha512-armv8-apple.S
new file mode 100644
index 0000000..8c98e06
--- /dev/null
+++ b/gen/bcm/sha512-armv8-apple.S
@@ -0,0 +1,1596 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License").  You may not use
+// this file except in compliance with the License.  You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ====================================================================
+//
+// SHA256/512 for ARMv8.
+//
+// Performance in cycles per processed byte and improvement coefficient
+// over code generated with "default" compiler:
+//
+//		SHA256-hw	SHA256(*)	SHA512
+// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
+// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
+// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
+// Denver	2.01		10.5 (+26%)	6.70 (+8%)
+// X-Gene			20.0 (+100%)	12.8 (+300%(***))
+// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
+// Kryo		1.92		17.4 (+30%)	11.2 (+8%)
+//
+// (*)	Software SHA256 results are of lesser relevance, presented
+//	mostly for informational purposes.
+// (**)	The result is a trade-off: it's possible to improve it by
+//	10% (or by 1 cycle per round), but at the cost of 20% loss
+//	on Cortex-A53 (or by 4 cycles per round).
+// (***)	Super-impressive coefficients over gcc-generated code are
+//	indication of some compiler "pathology", most notably code
+//	generated with -mgeneral-regs-only is significantly faster
+//	and the gap is only 40-90%.
+
+#ifndef	__KERNEL__
+# include <openssl/arm_arch.h>
+#endif
+
+.text
+
+.globl	_sha512_block_data_order_nohw
+.private_extern	_sha512_block_data_order_nohw
+
+.align	6
+_sha512_block_data_order_nohw:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#4*8
+
+	ldp	x20,x21,[x0]				// load context
+	ldp	x22,x23,[x0,#2*8]
+	ldp	x24,x25,[x0,#4*8]
+	add	x2,x1,x2,lsl#7	// end of input
+	ldp	x26,x27,[x0,#6*8]
+	adrp	x30,LK512@PAGE
+	add	x30,x30,LK512@PAGEOFF
+	stp	x0,x2,[x29,#96]
+
+Loop:
+	ldp	x3,x4,[x1],#2*8
+	ldr	x19,[x30],#8			// *K++
+	eor	x28,x21,x22				// magic seed
+	str	x1,[x29,#112]
+#ifndef	__AARCH64EB__
+	rev	x3,x3			// 0
+#endif
+	ror	x16,x24,#14
+	add	x27,x27,x19			// h+=K[i]
+	eor	x6,x24,x24,ror#23
+	and	x17,x25,x24
+	bic	x19,x26,x24
+	add	x27,x27,x3			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x20,x21			// a^b, b^c in next round
+	eor	x16,x16,x6,ror#18	// Sigma1(e)
+	ror	x6,x20,#28
+	add	x27,x27,x17			// h+=Ch(e,f,g)
+	eor	x17,x20,x20,ror#5
+	add	x27,x27,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x23,x23,x27			// d+=h
+	eor	x28,x28,x21			// Maj(a,b,c)
+	eor	x17,x6,x17,ror#34	// Sigma0(a)
+	add	x27,x27,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x27,x27,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x4,x4			// 1
+#endif
+	ldp	x5,x6,[x1],#2*8
+	add	x27,x27,x17			// h+=Sigma0(a)
+	ror	x16,x23,#14
+	add	x26,x26,x28			// h+=K[i]
+	eor	x7,x23,x23,ror#23
+	and	x17,x24,x23
+	bic	x28,x25,x23
+	add	x26,x26,x4			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x27,x20			// a^b, b^c in next round
+	eor	x16,x16,x7,ror#18	// Sigma1(e)
+	ror	x7,x27,#28
+	add	x26,x26,x17			// h+=Ch(e,f,g)
+	eor	x17,x27,x27,ror#5
+	add	x26,x26,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x22,x22,x26			// d+=h
+	eor	x19,x19,x20			// Maj(a,b,c)
+	eor	x17,x7,x17,ror#34	// Sigma0(a)
+	add	x26,x26,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x26,x26,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x5,x5			// 2
+#endif
+	add	x26,x26,x17			// h+=Sigma0(a)
+	ror	x16,x22,#14
+	add	x25,x25,x19			// h+=K[i]
+	eor	x8,x22,x22,ror#23
+	and	x17,x23,x22
+	bic	x19,x24,x22
+	add	x25,x25,x5			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x26,x27			// a^b, b^c in next round
+	eor	x16,x16,x8,ror#18	// Sigma1(e)
+	ror	x8,x26,#28
+	add	x25,x25,x17			// h+=Ch(e,f,g)
+	eor	x17,x26,x26,ror#5
+	add	x25,x25,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x21,x21,x25			// d+=h
+	eor	x28,x28,x27			// Maj(a,b,c)
+	eor	x17,x8,x17,ror#34	// Sigma0(a)
+	add	x25,x25,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x25,x25,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x6,x6			// 3
+#endif
+	ldp	x7,x8,[x1],#2*8
+	add	x25,x25,x17			// h+=Sigma0(a)
+	ror	x16,x21,#14
+	add	x24,x24,x28			// h+=K[i]
+	eor	x9,x21,x21,ror#23
+	and	x17,x22,x21
+	bic	x28,x23,x21
+	add	x24,x24,x6			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x25,x26			// a^b, b^c in next round
+	eor	x16,x16,x9,ror#18	// Sigma1(e)
+	ror	x9,x25,#28
+	add	x24,x24,x17			// h+=Ch(e,f,g)
+	eor	x17,x25,x25,ror#5
+	add	x24,x24,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x20,x20,x24			// d+=h
+	eor	x19,x19,x26			// Maj(a,b,c)
+	eor	x17,x9,x17,ror#34	// Sigma0(a)
+	add	x24,x24,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x24,x24,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x7,x7			// 4
+#endif
+	add	x24,x24,x17			// h+=Sigma0(a)
+	ror	x16,x20,#14
+	add	x23,x23,x19			// h+=K[i]
+	eor	x10,x20,x20,ror#23
+	and	x17,x21,x20
+	bic	x19,x22,x20
+	add	x23,x23,x7			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x24,x25			// a^b, b^c in next round
+	eor	x16,x16,x10,ror#18	// Sigma1(e)
+	ror	x10,x24,#28
+	add	x23,x23,x17			// h+=Ch(e,f,g)
+	eor	x17,x24,x24,ror#5
+	add	x23,x23,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x27,x27,x23			// d+=h
+	eor	x28,x28,x25			// Maj(a,b,c)
+	eor	x17,x10,x17,ror#34	// Sigma0(a)
+	add	x23,x23,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x23,x23,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x8,x8			// 5
+#endif
+	ldp	x9,x10,[x1],#2*8
+	add	x23,x23,x17			// h+=Sigma0(a)
+	ror	x16,x27,#14
+	add	x22,x22,x28			// h+=K[i]
+	eor	x11,x27,x27,ror#23
+	and	x17,x20,x27
+	bic	x28,x21,x27
+	add	x22,x22,x8			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x23,x24			// a^b, b^c in next round
+	eor	x16,x16,x11,ror#18	// Sigma1(e)
+	ror	x11,x23,#28
+	add	x22,x22,x17			// h+=Ch(e,f,g)
+	eor	x17,x23,x23,ror#5
+	add	x22,x22,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x26,x26,x22			// d+=h
+	eor	x19,x19,x24			// Maj(a,b,c)
+	eor	x17,x11,x17,ror#34	// Sigma0(a)
+	add	x22,x22,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x22,x22,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x9,x9			// 6
+#endif
+	add	x22,x22,x17			// h+=Sigma0(a)
+	ror	x16,x26,#14
+	add	x21,x21,x19			// h+=K[i]
+	eor	x12,x26,x26,ror#23
+	and	x17,x27,x26
+	bic	x19,x20,x26
+	add	x21,x21,x9			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x22,x23			// a^b, b^c in next round
+	eor	x16,x16,x12,ror#18	// Sigma1(e)
+	ror	x12,x22,#28
+	add	x21,x21,x17			// h+=Ch(e,f,g)
+	eor	x17,x22,x22,ror#5
+	add	x21,x21,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x25,x25,x21			// d+=h
+	eor	x28,x28,x23			// Maj(a,b,c)
+	eor	x17,x12,x17,ror#34	// Sigma0(a)
+	add	x21,x21,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x21,x21,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x10,x10			// 7
+#endif
+	ldp	x11,x12,[x1],#2*8
+	add	x21,x21,x17			// h+=Sigma0(a)
+	ror	x16,x25,#14
+	add	x20,x20,x28			// h+=K[i]
+	eor	x13,x25,x25,ror#23
+	and	x17,x26,x25
+	bic	x28,x27,x25
+	add	x20,x20,x10			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x21,x22			// a^b, b^c in next round
+	eor	x16,x16,x13,ror#18	// Sigma1(e)
+	ror	x13,x21,#28
+	add	x20,x20,x17			// h+=Ch(e,f,g)
+	eor	x17,x21,x21,ror#5
+	add	x20,x20,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x24,x24,x20			// d+=h
+	eor	x19,x19,x22			// Maj(a,b,c)
+	eor	x17,x13,x17,ror#34	// Sigma0(a)
+	add	x20,x20,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x20,x20,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x11,x11			// 8
+#endif
+	add	x20,x20,x17			// h+=Sigma0(a)
+	ror	x16,x24,#14
+	add	x27,x27,x19			// h+=K[i]
+	eor	x14,x24,x24,ror#23
+	and	x17,x25,x24
+	bic	x19,x26,x24
+	add	x27,x27,x11			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x20,x21			// a^b, b^c in next round
+	eor	x16,x16,x14,ror#18	// Sigma1(e)
+	ror	x14,x20,#28
+	add	x27,x27,x17			// h+=Ch(e,f,g)
+	eor	x17,x20,x20,ror#5
+	add	x27,x27,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x23,x23,x27			// d+=h
+	eor	x28,x28,x21			// Maj(a,b,c)
+	eor	x17,x14,x17,ror#34	// Sigma0(a)
+	add	x27,x27,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x27,x27,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x12,x12			// 9
+#endif
+	ldp	x13,x14,[x1],#2*8
+	add	x27,x27,x17			// h+=Sigma0(a)
+	ror	x16,x23,#14
+	add	x26,x26,x28			// h+=K[i]
+	eor	x15,x23,x23,ror#23
+	and	x17,x24,x23
+	bic	x28,x25,x23
+	add	x26,x26,x12			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x27,x20			// a^b, b^c in next round
+	eor	x16,x16,x15,ror#18	// Sigma1(e)
+	ror	x15,x27,#28
+	add	x26,x26,x17			// h+=Ch(e,f,g)
+	eor	x17,x27,x27,ror#5
+	add	x26,x26,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x22,x22,x26			// d+=h
+	eor	x19,x19,x20			// Maj(a,b,c)
+	eor	x17,x15,x17,ror#34	// Sigma0(a)
+	add	x26,x26,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x26,x26,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x13,x13			// 10
+#endif
+	add	x26,x26,x17			// h+=Sigma0(a)
+	ror	x16,x22,#14
+	add	x25,x25,x19			// h+=K[i]
+	eor	x0,x22,x22,ror#23
+	and	x17,x23,x22
+	bic	x19,x24,x22
+	add	x25,x25,x13			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x26,x27			// a^b, b^c in next round
+	eor	x16,x16,x0,ror#18	// Sigma1(e)
+	ror	x0,x26,#28
+	add	x25,x25,x17			// h+=Ch(e,f,g)
+	eor	x17,x26,x26,ror#5
+	add	x25,x25,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x21,x21,x25			// d+=h
+	eor	x28,x28,x27			// Maj(a,b,c)
+	eor	x17,x0,x17,ror#34	// Sigma0(a)
+	add	x25,x25,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x25,x25,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x14,x14			// 11
+#endif
+	ldp	x15,x0,[x1],#2*8
+	add	x25,x25,x17			// h+=Sigma0(a)
+	str	x6,[sp,#24]
+	ror	x16,x21,#14
+	add	x24,x24,x28			// h+=K[i]
+	eor	x6,x21,x21,ror#23
+	and	x17,x22,x21
+	bic	x28,x23,x21
+	add	x24,x24,x14			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x25,x26			// a^b, b^c in next round
+	eor	x16,x16,x6,ror#18	// Sigma1(e)
+	ror	x6,x25,#28
+	add	x24,x24,x17			// h+=Ch(e,f,g)
+	eor	x17,x25,x25,ror#5
+	add	x24,x24,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x20,x20,x24			// d+=h
+	eor	x19,x19,x26			// Maj(a,b,c)
+	eor	x17,x6,x17,ror#34	// Sigma0(a)
+	add	x24,x24,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x24,x24,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x15,x15			// 12
+#endif
+	add	x24,x24,x17			// h+=Sigma0(a)
+	str	x7,[sp,#0]
+	ror	x16,x20,#14
+	add	x23,x23,x19			// h+=K[i]
+	eor	x7,x20,x20,ror#23
+	and	x17,x21,x20
+	bic	x19,x22,x20
+	add	x23,x23,x15			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x24,x25			// a^b, b^c in next round
+	eor	x16,x16,x7,ror#18	// Sigma1(e)
+	ror	x7,x24,#28
+	add	x23,x23,x17			// h+=Ch(e,f,g)
+	eor	x17,x24,x24,ror#5
+	add	x23,x23,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x27,x27,x23			// d+=h
+	eor	x28,x28,x25			// Maj(a,b,c)
+	eor	x17,x7,x17,ror#34	// Sigma0(a)
+	add	x23,x23,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x23,x23,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x0,x0			// 13
+#endif
+	ldp	x1,x2,[x1]
+	add	x23,x23,x17			// h+=Sigma0(a)
+	str	x8,[sp,#8]
+	ror	x16,x27,#14
+	add	x22,x22,x28			// h+=K[i]
+	eor	x8,x27,x27,ror#23
+	and	x17,x20,x27
+	bic	x28,x21,x27
+	add	x22,x22,x0			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x23,x24			// a^b, b^c in next round
+	eor	x16,x16,x8,ror#18	// Sigma1(e)
+	ror	x8,x23,#28
+	add	x22,x22,x17			// h+=Ch(e,f,g)
+	eor	x17,x23,x23,ror#5
+	add	x22,x22,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x26,x26,x22			// d+=h
+	eor	x19,x19,x24			// Maj(a,b,c)
+	eor	x17,x8,x17,ror#34	// Sigma0(a)
+	add	x22,x22,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x22,x22,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x1,x1			// 14
+#endif
+	ldr	x6,[sp,#24]
+	add	x22,x22,x17			// h+=Sigma0(a)
+	str	x9,[sp,#16]
+	ror	x16,x26,#14
+	add	x21,x21,x19			// h+=K[i]
+	eor	x9,x26,x26,ror#23
+	and	x17,x27,x26
+	bic	x19,x20,x26
+	add	x21,x21,x1			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x22,x23			// a^b, b^c in next round
+	eor	x16,x16,x9,ror#18	// Sigma1(e)
+	ror	x9,x22,#28
+	add	x21,x21,x17			// h+=Ch(e,f,g)
+	eor	x17,x22,x22,ror#5
+	add	x21,x21,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x25,x25,x21			// d+=h
+	eor	x28,x28,x23			// Maj(a,b,c)
+	eor	x17,x9,x17,ror#34	// Sigma0(a)
+	add	x21,x21,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x21,x21,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x2,x2			// 15
+#endif
+	ldr	x7,[sp,#0]
+	add	x21,x21,x17			// h+=Sigma0(a)
+	str	x10,[sp,#24]
+	ror	x16,x25,#14
+	add	x20,x20,x28			// h+=K[i]
+	ror	x9,x4,#1
+	and	x17,x26,x25
+	ror	x8,x1,#19
+	bic	x28,x27,x25
+	ror	x10,x21,#28
+	add	x20,x20,x2			// h+=X[i]
+	eor	x16,x16,x25,ror#18
+	eor	x9,x9,x4,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x21,x22			// a^b, b^c in next round
+	eor	x16,x16,x25,ror#41	// Sigma1(e)
+	eor	x10,x10,x21,ror#34
+	add	x20,x20,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x8,x8,x1,ror#61
+	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
+	add	x20,x20,x16			// h+=Sigma1(e)
+	eor	x19,x19,x22			// Maj(a,b,c)
+	eor	x17,x10,x21,ror#39	// Sigma0(a)
+	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
+	add	x3,x3,x12
+	add	x24,x24,x20			// d+=h
+	add	x20,x20,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x3,x3,x9
+	add	x20,x20,x17			// h+=Sigma0(a)
+	add	x3,x3,x8
+Loop_16_xx:
+	ldr	x8,[sp,#8]
+	str	x11,[sp,#0]
+	ror	x16,x24,#14
+	add	x27,x27,x19			// h+=K[i]
+	ror	x10,x5,#1
+	and	x17,x25,x24
+	ror	x9,x2,#19
+	bic	x19,x26,x24
+	ror	x11,x20,#28
+	add	x27,x27,x3			// h+=X[i]
+	eor	x16,x16,x24,ror#18
+	eor	x10,x10,x5,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x20,x21			// a^b, b^c in next round
+	eor	x16,x16,x24,ror#41	// Sigma1(e)
+	eor	x11,x11,x20,ror#34
+	add	x27,x27,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x9,x9,x2,ror#61
+	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
+	add	x27,x27,x16			// h+=Sigma1(e)
+	eor	x28,x28,x21			// Maj(a,b,c)
+	eor	x17,x11,x20,ror#39	// Sigma0(a)
+	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
+	add	x4,x4,x13
+	add	x23,x23,x27			// d+=h
+	add	x27,x27,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x4,x4,x10
+	add	x27,x27,x17			// h+=Sigma0(a)
+	add	x4,x4,x9
+	ldr	x9,[sp,#16]
+	str	x12,[sp,#8]
+	ror	x16,x23,#14
+	add	x26,x26,x28			// h+=K[i]
+	ror	x11,x6,#1
+	and	x17,x24,x23
+	ror	x10,x3,#19
+	bic	x28,x25,x23
+	ror	x12,x27,#28
+	add	x26,x26,x4			// h+=X[i]
+	eor	x16,x16,x23,ror#18
+	eor	x11,x11,x6,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x27,x20			// a^b, b^c in next round
+	eor	x16,x16,x23,ror#41	// Sigma1(e)
+	eor	x12,x12,x27,ror#34
+	add	x26,x26,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x10,x10,x3,ror#61
+	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
+	add	x26,x26,x16			// h+=Sigma1(e)
+	eor	x19,x19,x20			// Maj(a,b,c)
+	eor	x17,x12,x27,ror#39	// Sigma0(a)
+	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
+	add	x5,x5,x14
+	add	x22,x22,x26			// d+=h
+	add	x26,x26,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x5,x5,x11
+	add	x26,x26,x17			// h+=Sigma0(a)
+	add	x5,x5,x10
+	ldr	x10,[sp,#24]
+	str	x13,[sp,#16]
+	ror	x16,x22,#14
+	add	x25,x25,x19			// h+=K[i]
+	ror	x12,x7,#1
+	and	x17,x23,x22
+	ror	x11,x4,#19
+	bic	x19,x24,x22
+	ror	x13,x26,#28
+	add	x25,x25,x5			// h+=X[i]
+	eor	x16,x16,x22,ror#18
+	eor	x12,x12,x7,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x26,x27			// a^b, b^c in next round
+	eor	x16,x16,x22,ror#41	// Sigma1(e)
+	eor	x13,x13,x26,ror#34
+	add	x25,x25,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x11,x11,x4,ror#61
+	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
+	add	x25,x25,x16			// h+=Sigma1(e)
+	eor	x28,x28,x27			// Maj(a,b,c)
+	eor	x17,x13,x26,ror#39	// Sigma0(a)
+	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
+	add	x6,x6,x15
+	add	x21,x21,x25			// d+=h
+	add	x25,x25,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x6,x6,x12
+	add	x25,x25,x17			// h+=Sigma0(a)
+	add	x6,x6,x11
+	ldr	x11,[sp,#0]
+	str	x14,[sp,#24]
+	ror	x16,x21,#14
+	add	x24,x24,x28			// h+=K[i]
+	ror	x13,x8,#1
+	and	x17,x22,x21
+	ror	x12,x5,#19
+	bic	x28,x23,x21
+	ror	x14,x25,#28
+	add	x24,x24,x6			// h+=X[i]
+	eor	x16,x16,x21,ror#18
+	eor	x13,x13,x8,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x25,x26			// a^b, b^c in next round
+	eor	x16,x16,x21,ror#41	// Sigma1(e)
+	eor	x14,x14,x25,ror#34
+	add	x24,x24,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x12,x12,x5,ror#61
+	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
+	add	x24,x24,x16			// h+=Sigma1(e)
+	eor	x19,x19,x26			// Maj(a,b,c)
+	eor	x17,x14,x25,ror#39	// Sigma0(a)
+	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
+	add	x7,x7,x0
+	add	x20,x20,x24			// d+=h
+	add	x24,x24,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x7,x7,x13
+	add	x24,x24,x17			// h+=Sigma0(a)
+	add	x7,x7,x12
+	ldr	x12,[sp,#8]
+	str	x15,[sp,#0]
+	ror	x16,x20,#14
+	add	x23,x23,x19			// h+=K[i]
+	ror	x14,x9,#1
+	and	x17,x21,x20
+	ror	x13,x6,#19
+	bic	x19,x22,x20
+	ror	x15,x24,#28
+	add	x23,x23,x7			// h+=X[i]
+	eor	x16,x16,x20,ror#18
+	eor	x14,x14,x9,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x24,x25			// a^b, b^c in next round
+	eor	x16,x16,x20,ror#41	// Sigma1(e)
+	eor	x15,x15,x24,ror#34
+	add	x23,x23,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x13,x13,x6,ror#61
+	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
+	add	x23,x23,x16			// h+=Sigma1(e)
+	eor	x28,x28,x25			// Maj(a,b,c)
+	eor	x17,x15,x24,ror#39	// Sigma0(a)
+	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
+	add	x8,x8,x1
+	add	x27,x27,x23			// d+=h
+	add	x23,x23,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x8,x8,x14
+	add	x23,x23,x17			// h+=Sigma0(a)
+	add	x8,x8,x13
+	ldr	x13,[sp,#16]
+	str	x0,[sp,#8]
+	ror	x16,x27,#14
+	add	x22,x22,x28			// h+=K[i]
+	ror	x15,x10,#1
+	and	x17,x20,x27
+	ror	x14,x7,#19
+	bic	x28,x21,x27
+	ror	x0,x23,#28
+	add	x22,x22,x8			// h+=X[i]
+	eor	x16,x16,x27,ror#18
+	eor	x15,x15,x10,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x23,x24			// a^b, b^c in next round
+	eor	x16,x16,x27,ror#41	// Sigma1(e)
+	eor	x0,x0,x23,ror#34
+	add	x22,x22,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x14,x14,x7,ror#61
+	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
+	add	x22,x22,x16			// h+=Sigma1(e)
+	eor	x19,x19,x24			// Maj(a,b,c)
+	eor	x17,x0,x23,ror#39	// Sigma0(a)
+	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
+	add	x9,x9,x2
+	add	x26,x26,x22			// d+=h
+	add	x22,x22,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x9,x9,x15
+	add	x22,x22,x17			// h+=Sigma0(a)
+	add	x9,x9,x14
+	ldr	x14,[sp,#24]
+	str	x1,[sp,#16]
+	ror	x16,x26,#14
+	add	x21,x21,x19			// h+=K[i]
+	ror	x0,x11,#1
+	and	x17,x27,x26
+	ror	x15,x8,#19
+	bic	x19,x20,x26
+	ror	x1,x22,#28
+	add	x21,x21,x9			// h+=X[i]
+	eor	x16,x16,x26,ror#18
+	eor	x0,x0,x11,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x22,x23			// a^b, b^c in next round
+	eor	x16,x16,x26,ror#41	// Sigma1(e)
+	eor	x1,x1,x22,ror#34
+	add	x21,x21,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x15,x15,x8,ror#61
+	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
+	add	x21,x21,x16			// h+=Sigma1(e)
+	eor	x28,x28,x23			// Maj(a,b,c)
+	eor	x17,x1,x22,ror#39	// Sigma0(a)
+	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
+	add	x10,x10,x3
+	add	x25,x25,x21			// d+=h
+	add	x21,x21,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x10,x10,x0
+	add	x21,x21,x17			// h+=Sigma0(a)
+	add	x10,x10,x15
+	ldr	x15,[sp,#0]
+	str	x2,[sp,#24]
+	ror	x16,x25,#14
+	add	x20,x20,x28			// h+=K[i]
+	ror	x1,x12,#1
+	and	x17,x26,x25
+	ror	x0,x9,#19
+	bic	x28,x27,x25
+	ror	x2,x21,#28
+	add	x20,x20,x10			// h+=X[i]
+	eor	x16,x16,x25,ror#18
+	eor	x1,x1,x12,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x21,x22			// a^b, b^c in next round
+	eor	x16,x16,x25,ror#41	// Sigma1(e)
+	eor	x2,x2,x21,ror#34
+	add	x20,x20,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x0,x0,x9,ror#61
+	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
+	add	x20,x20,x16			// h+=Sigma1(e)
+	eor	x19,x19,x22			// Maj(a,b,c)
+	eor	x17,x2,x21,ror#39	// Sigma0(a)
+	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
+	add	x11,x11,x4
+	add	x24,x24,x20			// d+=h
+	add	x20,x20,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x11,x11,x1
+	add	x20,x20,x17			// h+=Sigma0(a)
+	add	x11,x11,x0
+	ldr	x0,[sp,#8]
+	str	x3,[sp,#0]
+	ror	x16,x24,#14
+	add	x27,x27,x19			// h+=K[i]
+	ror	x2,x13,#1
+	and	x17,x25,x24
+	ror	x1,x10,#19
+	bic	x19,x26,x24
+	ror	x3,x20,#28
+	add	x27,x27,x11			// h+=X[i]
+	eor	x16,x16,x24,ror#18
+	eor	x2,x2,x13,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x20,x21			// a^b, b^c in next round
+	eor	x16,x16,x24,ror#41	// Sigma1(e)
+	eor	x3,x3,x20,ror#34
+	add	x27,x27,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x1,x1,x10,ror#61
+	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
+	add	x27,x27,x16			// h+=Sigma1(e)
+	eor	x28,x28,x21			// Maj(a,b,c)
+	eor	x17,x3,x20,ror#39	// Sigma0(a)
+	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
+	add	x12,x12,x5
+	add	x23,x23,x27			// d+=h
+	add	x27,x27,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x12,x12,x2
+	add	x27,x27,x17			// h+=Sigma0(a)
+	add	x12,x12,x1
+	ldr	x1,[sp,#16]
+	str	x4,[sp,#8]
+	ror	x16,x23,#14
+	add	x26,x26,x28			// h+=K[i]
+	ror	x3,x14,#1
+	and	x17,x24,x23
+	ror	x2,x11,#19
+	bic	x28,x25,x23
+	ror	x4,x27,#28
+	add	x26,x26,x12			// h+=X[i]
+	eor	x16,x16,x23,ror#18
+	eor	x3,x3,x14,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x27,x20			// a^b, b^c in next round
+	eor	x16,x16,x23,ror#41	// Sigma1(e)
+	eor	x4,x4,x27,ror#34
+	add	x26,x26,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x2,x2,x11,ror#61
+	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
+	add	x26,x26,x16			// h+=Sigma1(e)
+	eor	x19,x19,x20			// Maj(a,b,c)
+	eor	x17,x4,x27,ror#39	// Sigma0(a)
+	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
+	add	x13,x13,x6
+	add	x22,x22,x26			// d+=h
+	add	x26,x26,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x13,x13,x3
+	add	x26,x26,x17			// h+=Sigma0(a)
+	add	x13,x13,x2
+	ldr	x2,[sp,#24]
+	str	x5,[sp,#16]
+	ror	x16,x22,#14
+	add	x25,x25,x19			// h+=K[i]
+	ror	x4,x15,#1
+	and	x17,x23,x22
+	ror	x3,x12,#19
+	bic	x19,x24,x22
+	ror	x5,x26,#28
+	add	x25,x25,x13			// h+=X[i]
+	eor	x16,x16,x22,ror#18
+	eor	x4,x4,x15,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x26,x27			// a^b, b^c in next round
+	eor	x16,x16,x22,ror#41	// Sigma1(e)
+	eor	x5,x5,x26,ror#34
+	add	x25,x25,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x3,x3,x12,ror#61
+	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
+	add	x25,x25,x16			// h+=Sigma1(e)
+	eor	x28,x28,x27			// Maj(a,b,c)
+	eor	x17,x5,x26,ror#39	// Sigma0(a)
+	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
+	add	x14,x14,x7
+	add	x21,x21,x25			// d+=h
+	add	x25,x25,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x14,x14,x4
+	add	x25,x25,x17			// h+=Sigma0(a)
+	add	x14,x14,x3
+	ldr	x3,[sp,#0]
+	str	x6,[sp,#24]
+	ror	x16,x21,#14
+	add	x24,x24,x28			// h+=K[i]
+	ror	x5,x0,#1
+	and	x17,x22,x21
+	ror	x4,x13,#19
+	bic	x28,x23,x21
+	ror	x6,x25,#28
+	add	x24,x24,x14			// h+=X[i]
+	eor	x16,x16,x21,ror#18
+	eor	x5,x5,x0,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x25,x26			// a^b, b^c in next round
+	eor	x16,x16,x21,ror#41	// Sigma1(e)
+	eor	x6,x6,x25,ror#34
+	add	x24,x24,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x4,x4,x13,ror#61
+	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
+	add	x24,x24,x16			// h+=Sigma1(e)
+	eor	x19,x19,x26			// Maj(a,b,c)
+	eor	x17,x6,x25,ror#39	// Sigma0(a)
+	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
+	add	x15,x15,x8
+	add	x20,x20,x24			// d+=h
+	add	x24,x24,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x15,x15,x5
+	add	x24,x24,x17			// h+=Sigma0(a)
+	add	x15,x15,x4
+	ldr	x4,[sp,#8]
+	str	x7,[sp,#0]
+	ror	x16,x20,#14
+	add	x23,x23,x19			// h+=K[i]
+	ror	x6,x1,#1
+	and	x17,x21,x20
+	ror	x5,x14,#19
+	bic	x19,x22,x20
+	ror	x7,x24,#28
+	add	x23,x23,x15			// h+=X[i]
+	eor	x16,x16,x20,ror#18
+	eor	x6,x6,x1,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x24,x25			// a^b, b^c in next round
+	eor	x16,x16,x20,ror#41	// Sigma1(e)
+	eor	x7,x7,x24,ror#34
+	add	x23,x23,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x5,x5,x14,ror#61
+	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
+	add	x23,x23,x16			// h+=Sigma1(e)
+	eor	x28,x28,x25			// Maj(a,b,c)
+	eor	x17,x7,x24,ror#39	// Sigma0(a)
+	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
+	add	x0,x0,x9
+	add	x27,x27,x23			// d+=h
+	add	x23,x23,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x0,x0,x6
+	add	x23,x23,x17			// h+=Sigma0(a)
+	add	x0,x0,x5
+	ldr	x5,[sp,#16]
+	str	x8,[sp,#8]
+	ror	x16,x27,#14
+	add	x22,x22,x28			// h+=K[i]
+	ror	x7,x2,#1
+	and	x17,x20,x27
+	ror	x6,x15,#19
+	bic	x28,x21,x27
+	ror	x8,x23,#28
+	add	x22,x22,x0			// h+=X[i]
+	eor	x16,x16,x27,ror#18
+	eor	x7,x7,x2,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x23,x24			// a^b, b^c in next round
+	eor	x16,x16,x27,ror#41	// Sigma1(e)
+	eor	x8,x8,x23,ror#34
+	add	x22,x22,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x6,x6,x15,ror#61
+	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
+	add	x22,x22,x16			// h+=Sigma1(e)
+	eor	x19,x19,x24			// Maj(a,b,c)
+	eor	x17,x8,x23,ror#39	// Sigma0(a)
+	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
+	add	x1,x1,x10
+	add	x26,x26,x22			// d+=h
+	add	x22,x22,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x1,x1,x7
+	add	x22,x22,x17			// h+=Sigma0(a)
+	add	x1,x1,x6
+	ldr	x6,[sp,#24]
+	str	x9,[sp,#16]
+	ror	x16,x26,#14
+	add	x21,x21,x19			// h+=K[i]
+	ror	x8,x3,#1
+	and	x17,x27,x26
+	ror	x7,x0,#19
+	bic	x19,x20,x26
+	ror	x9,x22,#28
+	add	x21,x21,x1			// h+=X[i]
+	eor	x16,x16,x26,ror#18
+	eor	x8,x8,x3,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x22,x23			// a^b, b^c in next round
+	eor	x16,x16,x26,ror#41	// Sigma1(e)
+	eor	x9,x9,x22,ror#34
+	add	x21,x21,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x7,x7,x0,ror#61
+	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
+	add	x21,x21,x16			// h+=Sigma1(e)
+	eor	x28,x28,x23			// Maj(a,b,c)
+	eor	x17,x9,x22,ror#39	// Sigma0(a)
+	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
+	add	x2,x2,x11
+	add	x25,x25,x21			// d+=h
+	add	x21,x21,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x2,x2,x8
+	add	x21,x21,x17			// h+=Sigma0(a)
+	add	x2,x2,x7
+	ldr	x7,[sp,#0]
+	str	x10,[sp,#24]
+	ror	x16,x25,#14
+	add	x20,x20,x28			// h+=K[i]
+	ror	x9,x4,#1
+	and	x17,x26,x25
+	ror	x8,x1,#19
+	bic	x28,x27,x25
+	ror	x10,x21,#28
+	add	x20,x20,x2			// h+=X[i]
+	eor	x16,x16,x25,ror#18
+	eor	x9,x9,x4,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x21,x22			// a^b, b^c in next round
+	eor	x16,x16,x25,ror#41	// Sigma1(e)
+	eor	x10,x10,x21,ror#34
+	add	x20,x20,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x8,x8,x1,ror#61
+	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
+	add	x20,x20,x16			// h+=Sigma1(e)
+	eor	x19,x19,x22			// Maj(a,b,c)
+	eor	x17,x10,x21,ror#39	// Sigma0(a)
+	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
+	add	x3,x3,x12
+	add	x24,x24,x20			// d+=h
+	add	x20,x20,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x3,x3,x9
+	add	x20,x20,x17			// h+=Sigma0(a)
+	add	x3,x3,x8
+	cbnz	x19,Loop_16_xx
+
+	ldp	x0,x2,[x29,#96]
+	ldr	x1,[x29,#112]
+	sub	x30,x30,#648		// rewind
+
+	ldp	x3,x4,[x0]
+	ldp	x5,x6,[x0,#2*8]
+	add	x1,x1,#14*8			// advance input pointer
+	ldp	x7,x8,[x0,#4*8]
+	add	x20,x20,x3
+	ldp	x9,x10,[x0,#6*8]
+	add	x21,x21,x4
+	add	x22,x22,x5
+	add	x23,x23,x6
+	stp	x20,x21,[x0]
+	add	x24,x24,x7
+	add	x25,x25,x8
+	stp	x22,x23,[x0,#2*8]
+	add	x26,x26,x9
+	add	x27,x27,x10
+	cmp	x1,x2
+	stp	x24,x25,[x0,#4*8]
+	stp	x26,x27,[x0,#6*8]
+	b.ne	Loop
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#4*8
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#128
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+.section	__TEXT,__const
+.align	6
+
+LK512:
+.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad	0x3956c25bf348b538,0x59f111f1b605d019
+.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad	0xd807aa98a3030242,0x12835b0145706fbe
+.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad	0x06ca6351e003826f,0x142929670a0e6e70
+.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad	0x81c2c92e47edaee6,0x92722c851482353b
+.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad	0xd192e819d6ef5218,0xd69906245565a910
+.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad	0x90befffa23631e28,0xa4506cebde82bde9
+.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad	0xca273eceea26619c,0xd186b8c721c0c207
+.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad	0x113f9804bef90dae,0x1b710b35131c471b
+.quad	0x28db77f523047d84,0x32caab7b40c72493
+.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.quad	0	// terminator
+
+.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+.text
+#ifndef	__KERNEL__
+.globl	_sha512_block_data_order_hw
+.private_extern	_sha512_block_data_order_hw
+
+.align	6
+_sha512_block_data_order_hw:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64	// load input
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+
+	ld1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// load context
+	adrp	x3,LK512@PAGE
+	add	x3,x3,LK512@PAGEOFF
+
+	rev64	v16.16b,v16.16b
+	rev64	v17.16b,v17.16b
+	rev64	v18.16b,v18.16b
+	rev64	v19.16b,v19.16b
+	rev64	v20.16b,v20.16b
+	rev64	v21.16b,v21.16b
+	rev64	v22.16b,v22.16b
+	rev64	v23.16b,v23.16b
+	b	Loop_hw
+
+.align	4
+Loop_hw:
+	ld1	{v24.2d},[x3],#16
+	subs	x2,x2,#1
+	sub	x4,x1,#128
+	orr	v26.16b,v0.16b,v0.16b			// offload
+	orr	v27.16b,v1.16b,v1.16b
+	orr	v28.16b,v2.16b,v2.16b
+	orr	v29.16b,v3.16b,v3.16b
+	csel	x1,x1,x4,ne			// conditional rewind
+	add	v24.2d,v24.2d,v16.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec08230	//sha512su0 v16.16b,v17.16b
+	ext	v7.16b,v20.16b,v21.16b,#8
+.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v25.2d,v25.2d,v17.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08251	//sha512su0 v17.16b,v18.16b
+	ext	v7.16b,v21.16b,v22.16b,#8
+.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	add	v24.2d,v24.2d,v18.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec08272	//sha512su0 v18.16b,v19.16b
+	ext	v7.16b,v22.16b,v23.16b,#8
+.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	add	v25.2d,v25.2d,v19.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08293	//sha512su0 v19.16b,v20.16b
+	ext	v7.16b,v23.16b,v16.16b,#8
+.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	add	v24.2d,v24.2d,v20.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
+	ext	v7.16b,v16.16b,v17.16b,#8
+.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v25.2d,v25.2d,v21.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
+	ext	v7.16b,v17.16b,v18.16b,#8
+.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v24.2d,v24.2d,v22.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
+	ext	v7.16b,v18.16b,v19.16b,#8
+.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	add	v25.2d,v25.2d,v23.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08217	//sha512su0 v23.16b,v16.16b
+	ext	v7.16b,v19.16b,v20.16b,#8
+.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	add	v24.2d,v24.2d,v16.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec08230	//sha512su0 v16.16b,v17.16b
+	ext	v7.16b,v20.16b,v21.16b,#8
+.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	add	v25.2d,v25.2d,v17.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08251	//sha512su0 v17.16b,v18.16b
+	ext	v7.16b,v21.16b,v22.16b,#8
+.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v24.2d,v24.2d,v18.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec08272	//sha512su0 v18.16b,v19.16b
+	ext	v7.16b,v22.16b,v23.16b,#8
+.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v25.2d,v25.2d,v19.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08293	//sha512su0 v19.16b,v20.16b
+	ext	v7.16b,v23.16b,v16.16b,#8
+.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	add	v24.2d,v24.2d,v20.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
+	ext	v7.16b,v16.16b,v17.16b,#8
+.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	add	v25.2d,v25.2d,v21.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
+	ext	v7.16b,v17.16b,v18.16b,#8
+.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	add	v24.2d,v24.2d,v22.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
+	ext	v7.16b,v18.16b,v19.16b,#8
+.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v25.2d,v25.2d,v23.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08217	//sha512su0 v23.16b,v16.16b
+	ext	v7.16b,v19.16b,v20.16b,#8
+.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v24.2d,v24.2d,v16.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec08230	//sha512su0 v16.16b,v17.16b
+	ext	v7.16b,v20.16b,v21.16b,#8
+.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	add	v25.2d,v25.2d,v17.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08251	//sha512su0 v17.16b,v18.16b
+	ext	v7.16b,v21.16b,v22.16b,#8
+.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	add	v24.2d,v24.2d,v18.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec08272	//sha512su0 v18.16b,v19.16b
+	ext	v7.16b,v22.16b,v23.16b,#8
+.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	add	v25.2d,v25.2d,v19.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08293	//sha512su0 v19.16b,v20.16b
+	ext	v7.16b,v23.16b,v16.16b,#8
+.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v24.2d,v24.2d,v20.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
+	ext	v7.16b,v16.16b,v17.16b,#8
+.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v25.2d,v25.2d,v21.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
+	ext	v7.16b,v17.16b,v18.16b,#8
+.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	add	v24.2d,v24.2d,v22.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
+	ext	v7.16b,v18.16b,v19.16b,#8
+.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	add	v25.2d,v25.2d,v23.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08217	//sha512su0 v23.16b,v16.16b
+	ext	v7.16b,v19.16b,v20.16b,#8
+.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	add	v24.2d,v24.2d,v16.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec08230	//sha512su0 v16.16b,v17.16b
+	ext	v7.16b,v20.16b,v21.16b,#8
+.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v25.2d,v25.2d,v17.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08251	//sha512su0 v17.16b,v18.16b
+	ext	v7.16b,v21.16b,v22.16b,#8
+.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v24.2d,v24.2d,v18.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec08272	//sha512su0 v18.16b,v19.16b
+	ext	v7.16b,v22.16b,v23.16b,#8
+.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	add	v25.2d,v25.2d,v19.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08293	//sha512su0 v19.16b,v20.16b
+	ext	v7.16b,v23.16b,v16.16b,#8
+.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	add	v24.2d,v24.2d,v20.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
+	ext	v7.16b,v16.16b,v17.16b,#8
+.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	add	v25.2d,v25.2d,v21.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
+	ext	v7.16b,v17.16b,v18.16b,#8
+.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v24.2d,v24.2d,v22.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
+	ext	v7.16b,v18.16b,v19.16b,#8
+.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v25.2d,v25.2d,v23.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08217	//sha512su0 v23.16b,v16.16b
+	ext	v7.16b,v19.16b,v20.16b,#8
+.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	ld1	{v25.2d},[x3],#16
+	add	v24.2d,v24.2d,v16.2d
+	ld1	{v16.16b},[x1],#16		// load next input
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+	rev64	v16.16b,v16.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	ld1	{v24.2d},[x3],#16
+	add	v25.2d,v25.2d,v17.2d
+	ld1	{v17.16b},[x1],#16		// load next input
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+	rev64	v17.16b,v17.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	ld1	{v25.2d},[x3],#16
+	add	v24.2d,v24.2d,v18.2d
+	ld1	{v18.16b},[x1],#16		// load next input
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+	rev64	v18.16b,v18.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	ld1	{v24.2d},[x3],#16
+	add	v25.2d,v25.2d,v19.2d
+	ld1	{v19.16b},[x1],#16		// load next input
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+	rev64	v19.16b,v19.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	ld1	{v25.2d},[x3],#16
+	add	v24.2d,v24.2d,v20.2d
+	ld1	{v20.16b},[x1],#16		// load next input
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+	rev64	v20.16b,v20.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	ld1	{v24.2d},[x3],#16
+	add	v25.2d,v25.2d,v21.2d
+	ld1	{v21.16b},[x1],#16		// load next input
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+	rev64	v21.16b,v21.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	ld1	{v25.2d},[x3],#16
+	add	v24.2d,v24.2d,v22.2d
+	ld1	{v22.16b},[x1],#16		// load next input
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+	rev64	v22.16b,v22.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	sub	x3,x3,#80*8	// rewind
+	add	v25.2d,v25.2d,v23.2d
+	ld1	{v23.16b},[x1],#16		// load next input
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+	rev64	v23.16b,v23.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v0.2d,v0.2d,v26.2d			// accumulate
+	add	v1.2d,v1.2d,v27.2d
+	add	v2.2d,v2.2d,v28.2d
+	add	v3.2d,v3.2d,v29.2d
+
+	cbnz	x2,Loop_hw
+
+	st1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// store context
+
+	ldr	x29,[sp],#16
+	ret
+
+#endif
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/gen/bcm/sha512-armv8-linux.S b/gen/bcm/sha512-armv8-linux.S
new file mode 100644
index 0000000..fd15987
--- /dev/null
+++ b/gen/bcm/sha512-armv8-linux.S
@@ -0,0 +1,1596 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License").  You may not use
+// this file except in compliance with the License.  You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ====================================================================
+//
+// SHA256/512 for ARMv8.
+//
+// Performance in cycles per processed byte and improvement coefficient
+// over code generated with "default" compiler:
+//
+//		SHA256-hw	SHA256(*)	SHA512
+// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
+// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
+// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
+// Denver	2.01		10.5 (+26%)	6.70 (+8%)
+// X-Gene			20.0 (+100%)	12.8 (+300%(***))
+// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
+// Kryo		1.92		17.4 (+30%)	11.2 (+8%)
+//
+// (*)	Software SHA256 results are of lesser relevance, presented
+//	mostly for informational purposes.
+// (**)	The result is a trade-off: it's possible to improve it by
+//	10% (or by 1 cycle per round), but at the cost of 20% loss
+//	on Cortex-A53 (or by 4 cycles per round).
+// (***)	Super-impressive coefficients over gcc-generated code are
+//	indication of some compiler "pathology", most notably code
+//	generated with -mgeneral-regs-only is significantly faster
+//	and the gap is only 40-90%.
+
+#ifndef	__KERNEL__
+# include <openssl/arm_arch.h>
+#endif
+
+.text
+
+.globl	sha512_block_data_order_nohw
+.hidden	sha512_block_data_order_nohw
+.type	sha512_block_data_order_nohw,%function
+.align	6
+sha512_block_data_order_nohw:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#4*8
+
+	ldp	x20,x21,[x0]				// load context
+	ldp	x22,x23,[x0,#2*8]
+	ldp	x24,x25,[x0,#4*8]
+	add	x2,x1,x2,lsl#7	// end of input
+	ldp	x26,x27,[x0,#6*8]
+	adrp	x30,.LK512
+	add	x30,x30,:lo12:.LK512
+	stp	x0,x2,[x29,#96]
+
+.Loop:
+	ldp	x3,x4,[x1],#2*8
+	ldr	x19,[x30],#8			// *K++
+	eor	x28,x21,x22				// magic seed
+	str	x1,[x29,#112]
+#ifndef	__AARCH64EB__
+	rev	x3,x3			// 0
+#endif
+	ror	x16,x24,#14
+	add	x27,x27,x19			// h+=K[i]
+	eor	x6,x24,x24,ror#23
+	and	x17,x25,x24
+	bic	x19,x26,x24
+	add	x27,x27,x3			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x20,x21			// a^b, b^c in next round
+	eor	x16,x16,x6,ror#18	// Sigma1(e)
+	ror	x6,x20,#28
+	add	x27,x27,x17			// h+=Ch(e,f,g)
+	eor	x17,x20,x20,ror#5
+	add	x27,x27,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x23,x23,x27			// d+=h
+	eor	x28,x28,x21			// Maj(a,b,c)
+	eor	x17,x6,x17,ror#34	// Sigma0(a)
+	add	x27,x27,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x27,x27,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x4,x4			// 1
+#endif
+	ldp	x5,x6,[x1],#2*8
+	add	x27,x27,x17			// h+=Sigma0(a)
+	ror	x16,x23,#14
+	add	x26,x26,x28			// h+=K[i]
+	eor	x7,x23,x23,ror#23
+	and	x17,x24,x23
+	bic	x28,x25,x23
+	add	x26,x26,x4			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x27,x20			// a^b, b^c in next round
+	eor	x16,x16,x7,ror#18	// Sigma1(e)
+	ror	x7,x27,#28
+	add	x26,x26,x17			// h+=Ch(e,f,g)
+	eor	x17,x27,x27,ror#5
+	add	x26,x26,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x22,x22,x26			// d+=h
+	eor	x19,x19,x20			// Maj(a,b,c)
+	eor	x17,x7,x17,ror#34	// Sigma0(a)
+	add	x26,x26,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x26,x26,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x5,x5			// 2
+#endif
+	add	x26,x26,x17			// h+=Sigma0(a)
+	ror	x16,x22,#14
+	add	x25,x25,x19			// h+=K[i]
+	eor	x8,x22,x22,ror#23
+	and	x17,x23,x22
+	bic	x19,x24,x22
+	add	x25,x25,x5			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x26,x27			// a^b, b^c in next round
+	eor	x16,x16,x8,ror#18	// Sigma1(e)
+	ror	x8,x26,#28
+	add	x25,x25,x17			// h+=Ch(e,f,g)
+	eor	x17,x26,x26,ror#5
+	add	x25,x25,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x21,x21,x25			// d+=h
+	eor	x28,x28,x27			// Maj(a,b,c)
+	eor	x17,x8,x17,ror#34	// Sigma0(a)
+	add	x25,x25,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x25,x25,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x6,x6			// 3
+#endif
+	ldp	x7,x8,[x1],#2*8
+	add	x25,x25,x17			// h+=Sigma0(a)
+	ror	x16,x21,#14
+	add	x24,x24,x28			// h+=K[i]
+	eor	x9,x21,x21,ror#23
+	and	x17,x22,x21
+	bic	x28,x23,x21
+	add	x24,x24,x6			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x25,x26			// a^b, b^c in next round
+	eor	x16,x16,x9,ror#18	// Sigma1(e)
+	ror	x9,x25,#28
+	add	x24,x24,x17			// h+=Ch(e,f,g)
+	eor	x17,x25,x25,ror#5
+	add	x24,x24,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x20,x20,x24			// d+=h
+	eor	x19,x19,x26			// Maj(a,b,c)
+	eor	x17,x9,x17,ror#34	// Sigma0(a)
+	add	x24,x24,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x24,x24,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x7,x7			// 4
+#endif
+	add	x24,x24,x17			// h+=Sigma0(a)
+	ror	x16,x20,#14
+	add	x23,x23,x19			// h+=K[i]
+	eor	x10,x20,x20,ror#23
+	and	x17,x21,x20
+	bic	x19,x22,x20
+	add	x23,x23,x7			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x24,x25			// a^b, b^c in next round
+	eor	x16,x16,x10,ror#18	// Sigma1(e)
+	ror	x10,x24,#28
+	add	x23,x23,x17			// h+=Ch(e,f,g)
+	eor	x17,x24,x24,ror#5
+	add	x23,x23,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x27,x27,x23			// d+=h
+	eor	x28,x28,x25			// Maj(a,b,c)
+	eor	x17,x10,x17,ror#34	// Sigma0(a)
+	add	x23,x23,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x23,x23,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x8,x8			// 5
+#endif
+	ldp	x9,x10,[x1],#2*8
+	add	x23,x23,x17			// h+=Sigma0(a)
+	ror	x16,x27,#14
+	add	x22,x22,x28			// h+=K[i]
+	eor	x11,x27,x27,ror#23
+	and	x17,x20,x27
+	bic	x28,x21,x27
+	add	x22,x22,x8			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x23,x24			// a^b, b^c in next round
+	eor	x16,x16,x11,ror#18	// Sigma1(e)
+	ror	x11,x23,#28
+	add	x22,x22,x17			// h+=Ch(e,f,g)
+	eor	x17,x23,x23,ror#5
+	add	x22,x22,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x26,x26,x22			// d+=h
+	eor	x19,x19,x24			// Maj(a,b,c)
+	eor	x17,x11,x17,ror#34	// Sigma0(a)
+	add	x22,x22,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x22,x22,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x9,x9			// 6
+#endif
+	add	x22,x22,x17			// h+=Sigma0(a)
+	ror	x16,x26,#14
+	add	x21,x21,x19			// h+=K[i]
+	eor	x12,x26,x26,ror#23
+	and	x17,x27,x26
+	bic	x19,x20,x26
+	add	x21,x21,x9			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x22,x23			// a^b, b^c in next round
+	eor	x16,x16,x12,ror#18	// Sigma1(e)
+	ror	x12,x22,#28
+	add	x21,x21,x17			// h+=Ch(e,f,g)
+	eor	x17,x22,x22,ror#5
+	add	x21,x21,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x25,x25,x21			// d+=h
+	eor	x28,x28,x23			// Maj(a,b,c)
+	eor	x17,x12,x17,ror#34	// Sigma0(a)
+	add	x21,x21,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x21,x21,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x10,x10			// 7
+#endif
+	ldp	x11,x12,[x1],#2*8
+	add	x21,x21,x17			// h+=Sigma0(a)
+	ror	x16,x25,#14
+	add	x20,x20,x28			// h+=K[i]
+	eor	x13,x25,x25,ror#23
+	and	x17,x26,x25
+	bic	x28,x27,x25
+	add	x20,x20,x10			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x21,x22			// a^b, b^c in next round
+	eor	x16,x16,x13,ror#18	// Sigma1(e)
+	ror	x13,x21,#28
+	add	x20,x20,x17			// h+=Ch(e,f,g)
+	eor	x17,x21,x21,ror#5
+	add	x20,x20,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x24,x24,x20			// d+=h
+	eor	x19,x19,x22			// Maj(a,b,c)
+	eor	x17,x13,x17,ror#34	// Sigma0(a)
+	add	x20,x20,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x20,x20,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x11,x11			// 8
+#endif
+	add	x20,x20,x17			// h+=Sigma0(a)
+	ror	x16,x24,#14
+	add	x27,x27,x19			// h+=K[i]
+	eor	x14,x24,x24,ror#23
+	and	x17,x25,x24
+	bic	x19,x26,x24
+	add	x27,x27,x11			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x20,x21			// a^b, b^c in next round
+	eor	x16,x16,x14,ror#18	// Sigma1(e)
+	ror	x14,x20,#28
+	add	x27,x27,x17			// h+=Ch(e,f,g)
+	eor	x17,x20,x20,ror#5
+	add	x27,x27,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x23,x23,x27			// d+=h
+	eor	x28,x28,x21			// Maj(a,b,c)
+	eor	x17,x14,x17,ror#34	// Sigma0(a)
+	add	x27,x27,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x27,x27,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x12,x12			// 9
+#endif
+	ldp	x13,x14,[x1],#2*8
+	add	x27,x27,x17			// h+=Sigma0(a)
+	ror	x16,x23,#14
+	add	x26,x26,x28			// h+=K[i]
+	eor	x15,x23,x23,ror#23
+	and	x17,x24,x23
+	bic	x28,x25,x23
+	add	x26,x26,x12			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x27,x20			// a^b, b^c in next round
+	eor	x16,x16,x15,ror#18	// Sigma1(e)
+	ror	x15,x27,#28
+	add	x26,x26,x17			// h+=Ch(e,f,g)
+	eor	x17,x27,x27,ror#5
+	add	x26,x26,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x22,x22,x26			// d+=h
+	eor	x19,x19,x20			// Maj(a,b,c)
+	eor	x17,x15,x17,ror#34	// Sigma0(a)
+	add	x26,x26,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x26,x26,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x13,x13			// 10
+#endif
+	add	x26,x26,x17			// h+=Sigma0(a)
+	ror	x16,x22,#14
+	add	x25,x25,x19			// h+=K[i]
+	eor	x0,x22,x22,ror#23
+	and	x17,x23,x22
+	bic	x19,x24,x22
+	add	x25,x25,x13			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x26,x27			// a^b, b^c in next round
+	eor	x16,x16,x0,ror#18	// Sigma1(e)
+	ror	x0,x26,#28
+	add	x25,x25,x17			// h+=Ch(e,f,g)
+	eor	x17,x26,x26,ror#5
+	add	x25,x25,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x21,x21,x25			// d+=h
+	eor	x28,x28,x27			// Maj(a,b,c)
+	eor	x17,x0,x17,ror#34	// Sigma0(a)
+	add	x25,x25,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x25,x25,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x14,x14			// 11
+#endif
+	ldp	x15,x0,[x1],#2*8
+	add	x25,x25,x17			// h+=Sigma0(a)
+	str	x6,[sp,#24]
+	ror	x16,x21,#14
+	add	x24,x24,x28			// h+=K[i]
+	eor	x6,x21,x21,ror#23
+	and	x17,x22,x21
+	bic	x28,x23,x21
+	add	x24,x24,x14			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x25,x26			// a^b, b^c in next round
+	eor	x16,x16,x6,ror#18	// Sigma1(e)
+	ror	x6,x25,#28
+	add	x24,x24,x17			// h+=Ch(e,f,g)
+	eor	x17,x25,x25,ror#5
+	add	x24,x24,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x20,x20,x24			// d+=h
+	eor	x19,x19,x26			// Maj(a,b,c)
+	eor	x17,x6,x17,ror#34	// Sigma0(a)
+	add	x24,x24,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x24,x24,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x15,x15			// 12
+#endif
+	add	x24,x24,x17			// h+=Sigma0(a)
+	str	x7,[sp,#0]
+	ror	x16,x20,#14
+	add	x23,x23,x19			// h+=K[i]
+	eor	x7,x20,x20,ror#23
+	and	x17,x21,x20
+	bic	x19,x22,x20
+	add	x23,x23,x15			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x24,x25			// a^b, b^c in next round
+	eor	x16,x16,x7,ror#18	// Sigma1(e)
+	ror	x7,x24,#28
+	add	x23,x23,x17			// h+=Ch(e,f,g)
+	eor	x17,x24,x24,ror#5
+	add	x23,x23,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x27,x27,x23			// d+=h
+	eor	x28,x28,x25			// Maj(a,b,c)
+	eor	x17,x7,x17,ror#34	// Sigma0(a)
+	add	x23,x23,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x23,x23,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x0,x0			// 13
+#endif
+	ldp	x1,x2,[x1]
+	add	x23,x23,x17			// h+=Sigma0(a)
+	str	x8,[sp,#8]
+	ror	x16,x27,#14
+	add	x22,x22,x28			// h+=K[i]
+	eor	x8,x27,x27,ror#23
+	and	x17,x20,x27
+	bic	x28,x21,x27
+	add	x22,x22,x0			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x23,x24			// a^b, b^c in next round
+	eor	x16,x16,x8,ror#18	// Sigma1(e)
+	ror	x8,x23,#28
+	add	x22,x22,x17			// h+=Ch(e,f,g)
+	eor	x17,x23,x23,ror#5
+	add	x22,x22,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x26,x26,x22			// d+=h
+	eor	x19,x19,x24			// Maj(a,b,c)
+	eor	x17,x8,x17,ror#34	// Sigma0(a)
+	add	x22,x22,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x22,x22,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x1,x1			// 14
+#endif
+	ldr	x6,[sp,#24]
+	add	x22,x22,x17			// h+=Sigma0(a)
+	str	x9,[sp,#16]
+	ror	x16,x26,#14
+	add	x21,x21,x19			// h+=K[i]
+	eor	x9,x26,x26,ror#23
+	and	x17,x27,x26
+	bic	x19,x20,x26
+	add	x21,x21,x1			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x22,x23			// a^b, b^c in next round
+	eor	x16,x16,x9,ror#18	// Sigma1(e)
+	ror	x9,x22,#28
+	add	x21,x21,x17			// h+=Ch(e,f,g)
+	eor	x17,x22,x22,ror#5
+	add	x21,x21,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x25,x25,x21			// d+=h
+	eor	x28,x28,x23			// Maj(a,b,c)
+	eor	x17,x9,x17,ror#34	// Sigma0(a)
+	add	x21,x21,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x21,x21,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x2,x2			// 15
+#endif
+	ldr	x7,[sp,#0]
+	add	x21,x21,x17			// h+=Sigma0(a)
+	str	x10,[sp,#24]
+	ror	x16,x25,#14
+	add	x20,x20,x28			// h+=K[i]
+	ror	x9,x4,#1
+	and	x17,x26,x25
+	ror	x8,x1,#19
+	bic	x28,x27,x25
+	ror	x10,x21,#28
+	add	x20,x20,x2			// h+=X[i]
+	eor	x16,x16,x25,ror#18
+	eor	x9,x9,x4,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x21,x22			// a^b, b^c in next round
+	eor	x16,x16,x25,ror#41	// Sigma1(e)
+	eor	x10,x10,x21,ror#34
+	add	x20,x20,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x8,x8,x1,ror#61
+	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
+	add	x20,x20,x16			// h+=Sigma1(e)
+	eor	x19,x19,x22			// Maj(a,b,c)
+	eor	x17,x10,x21,ror#39	// Sigma0(a)
+	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
+	add	x3,x3,x12
+	add	x24,x24,x20			// d+=h
+	add	x20,x20,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x3,x3,x9
+	add	x20,x20,x17			// h+=Sigma0(a)
+	add	x3,x3,x8
+.Loop_16_xx:
+	ldr	x8,[sp,#8]
+	str	x11,[sp,#0]
+	ror	x16,x24,#14
+	add	x27,x27,x19			// h+=K[i]
+	ror	x10,x5,#1
+	and	x17,x25,x24
+	ror	x9,x2,#19
+	bic	x19,x26,x24
+	ror	x11,x20,#28
+	add	x27,x27,x3			// h+=X[i]
+	eor	x16,x16,x24,ror#18
+	eor	x10,x10,x5,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x20,x21			// a^b, b^c in next round
+	eor	x16,x16,x24,ror#41	// Sigma1(e)
+	eor	x11,x11,x20,ror#34
+	add	x27,x27,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x9,x9,x2,ror#61
+	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
+	add	x27,x27,x16			// h+=Sigma1(e)
+	eor	x28,x28,x21			// Maj(a,b,c)
+	eor	x17,x11,x20,ror#39	// Sigma0(a)
+	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
+	add	x4,x4,x13
+	add	x23,x23,x27			// d+=h
+	add	x27,x27,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x4,x4,x10
+	add	x27,x27,x17			// h+=Sigma0(a)
+	add	x4,x4,x9
+	ldr	x9,[sp,#16]
+	str	x12,[sp,#8]
+	ror	x16,x23,#14
+	add	x26,x26,x28			// h+=K[i]
+	ror	x11,x6,#1
+	and	x17,x24,x23
+	ror	x10,x3,#19
+	bic	x28,x25,x23
+	ror	x12,x27,#28
+	add	x26,x26,x4			// h+=X[i]
+	eor	x16,x16,x23,ror#18
+	eor	x11,x11,x6,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x27,x20			// a^b, b^c in next round
+	eor	x16,x16,x23,ror#41	// Sigma1(e)
+	eor	x12,x12,x27,ror#34
+	add	x26,x26,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x10,x10,x3,ror#61
+	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
+	add	x26,x26,x16			// h+=Sigma1(e)
+	eor	x19,x19,x20			// Maj(a,b,c)
+	eor	x17,x12,x27,ror#39	// Sigma0(a)
+	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
+	add	x5,x5,x14
+	add	x22,x22,x26			// d+=h
+	add	x26,x26,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x5,x5,x11
+	add	x26,x26,x17			// h+=Sigma0(a)
+	add	x5,x5,x10
+	ldr	x10,[sp,#24]
+	str	x13,[sp,#16]
+	ror	x16,x22,#14
+	add	x25,x25,x19			// h+=K[i]
+	ror	x12,x7,#1
+	and	x17,x23,x22
+	ror	x11,x4,#19
+	bic	x19,x24,x22
+	ror	x13,x26,#28
+	add	x25,x25,x5			// h+=X[i]
+	eor	x16,x16,x22,ror#18
+	eor	x12,x12,x7,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x26,x27			// a^b, b^c in next round
+	eor	x16,x16,x22,ror#41	// Sigma1(e)
+	eor	x13,x13,x26,ror#34
+	add	x25,x25,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x11,x11,x4,ror#61
+	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
+	add	x25,x25,x16			// h+=Sigma1(e)
+	eor	x28,x28,x27			// Maj(a,b,c)
+	eor	x17,x13,x26,ror#39	// Sigma0(a)
+	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
+	add	x6,x6,x15
+	add	x21,x21,x25			// d+=h
+	add	x25,x25,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x6,x6,x12
+	add	x25,x25,x17			// h+=Sigma0(a)
+	add	x6,x6,x11
+	ldr	x11,[sp,#0]
+	str	x14,[sp,#24]
+	ror	x16,x21,#14
+	add	x24,x24,x28			// h+=K[i]
+	ror	x13,x8,#1
+	and	x17,x22,x21
+	ror	x12,x5,#19
+	bic	x28,x23,x21
+	ror	x14,x25,#28
+	add	x24,x24,x6			// h+=X[i]
+	eor	x16,x16,x21,ror#18
+	eor	x13,x13,x8,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x25,x26			// a^b, b^c in next round
+	eor	x16,x16,x21,ror#41	// Sigma1(e)
+	eor	x14,x14,x25,ror#34
+	add	x24,x24,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x12,x12,x5,ror#61
+	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
+	add	x24,x24,x16			// h+=Sigma1(e)
+	eor	x19,x19,x26			// Maj(a,b,c)
+	eor	x17,x14,x25,ror#39	// Sigma0(a)
+	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
+	add	x7,x7,x0
+	add	x20,x20,x24			// d+=h
+	add	x24,x24,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x7,x7,x13
+	add	x24,x24,x17			// h+=Sigma0(a)
+	add	x7,x7,x12
+	ldr	x12,[sp,#8]
+	str	x15,[sp,#0]
+	ror	x16,x20,#14
+	add	x23,x23,x19			// h+=K[i]
+	ror	x14,x9,#1
+	and	x17,x21,x20
+	ror	x13,x6,#19
+	bic	x19,x22,x20
+	ror	x15,x24,#28
+	add	x23,x23,x7			// h+=X[i]
+	eor	x16,x16,x20,ror#18
+	eor	x14,x14,x9,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x24,x25			// a^b, b^c in next round
+	eor	x16,x16,x20,ror#41	// Sigma1(e)
+	eor	x15,x15,x24,ror#34
+	add	x23,x23,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x13,x13,x6,ror#61
+	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
+	add	x23,x23,x16			// h+=Sigma1(e)
+	eor	x28,x28,x25			// Maj(a,b,c)
+	eor	x17,x15,x24,ror#39	// Sigma0(a)
+	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
+	add	x8,x8,x1
+	add	x27,x27,x23			// d+=h
+	add	x23,x23,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x8,x8,x14
+	add	x23,x23,x17			// h+=Sigma0(a)
+	add	x8,x8,x13
+	ldr	x13,[sp,#16]
+	str	x0,[sp,#8]
+	ror	x16,x27,#14
+	add	x22,x22,x28			// h+=K[i]
+	ror	x15,x10,#1
+	and	x17,x20,x27
+	ror	x14,x7,#19
+	bic	x28,x21,x27
+	ror	x0,x23,#28
+	add	x22,x22,x8			// h+=X[i]
+	eor	x16,x16,x27,ror#18
+	eor	x15,x15,x10,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x23,x24			// a^b, b^c in next round
+	eor	x16,x16,x27,ror#41	// Sigma1(e)
+	eor	x0,x0,x23,ror#34
+	add	x22,x22,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x14,x14,x7,ror#61
+	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
+	add	x22,x22,x16			// h+=Sigma1(e)
+	eor	x19,x19,x24			// Maj(a,b,c)
+	eor	x17,x0,x23,ror#39	// Sigma0(a)
+	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
+	add	x9,x9,x2
+	add	x26,x26,x22			// d+=h
+	add	x22,x22,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x9,x9,x15
+	add	x22,x22,x17			// h+=Sigma0(a)
+	add	x9,x9,x14
+	ldr	x14,[sp,#24]
+	str	x1,[sp,#16]
+	ror	x16,x26,#14
+	add	x21,x21,x19			// h+=K[i]
+	ror	x0,x11,#1
+	and	x17,x27,x26
+	ror	x15,x8,#19
+	bic	x19,x20,x26
+	ror	x1,x22,#28
+	add	x21,x21,x9			// h+=X[i]
+	eor	x16,x16,x26,ror#18
+	eor	x0,x0,x11,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x22,x23			// a^b, b^c in next round
+	eor	x16,x16,x26,ror#41	// Sigma1(e)
+	eor	x1,x1,x22,ror#34
+	add	x21,x21,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x15,x15,x8,ror#61
+	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
+	add	x21,x21,x16			// h+=Sigma1(e)
+	eor	x28,x28,x23			// Maj(a,b,c)
+	eor	x17,x1,x22,ror#39	// Sigma0(a)
+	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
+	add	x10,x10,x3
+	add	x25,x25,x21			// d+=h
+	add	x21,x21,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x10,x10,x0
+	add	x21,x21,x17			// h+=Sigma0(a)
+	add	x10,x10,x15
+	ldr	x15,[sp,#0]
+	str	x2,[sp,#24]
+	ror	x16,x25,#14
+	add	x20,x20,x28			// h+=K[i]
+	ror	x1,x12,#1
+	and	x17,x26,x25
+	ror	x0,x9,#19
+	bic	x28,x27,x25
+	ror	x2,x21,#28
+	add	x20,x20,x10			// h+=X[i]
+	eor	x16,x16,x25,ror#18
+	eor	x1,x1,x12,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x21,x22			// a^b, b^c in next round
+	eor	x16,x16,x25,ror#41	// Sigma1(e)
+	eor	x2,x2,x21,ror#34
+	add	x20,x20,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x0,x0,x9,ror#61
+	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
+	add	x20,x20,x16			// h+=Sigma1(e)
+	eor	x19,x19,x22			// Maj(a,b,c)
+	eor	x17,x2,x21,ror#39	// Sigma0(a)
+	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
+	add	x11,x11,x4
+	add	x24,x24,x20			// d+=h
+	add	x20,x20,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x11,x11,x1
+	add	x20,x20,x17			// h+=Sigma0(a)
+	add	x11,x11,x0
+	ldr	x0,[sp,#8]
+	str	x3,[sp,#0]
+	ror	x16,x24,#14
+	add	x27,x27,x19			// h+=K[i]
+	ror	x2,x13,#1
+	and	x17,x25,x24
+	ror	x1,x10,#19
+	bic	x19,x26,x24
+	ror	x3,x20,#28
+	add	x27,x27,x11			// h+=X[i]
+	eor	x16,x16,x24,ror#18
+	eor	x2,x2,x13,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x20,x21			// a^b, b^c in next round
+	eor	x16,x16,x24,ror#41	// Sigma1(e)
+	eor	x3,x3,x20,ror#34
+	add	x27,x27,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x1,x1,x10,ror#61
+	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
+	add	x27,x27,x16			// h+=Sigma1(e)
+	eor	x28,x28,x21			// Maj(a,b,c)
+	eor	x17,x3,x20,ror#39	// Sigma0(a)
+	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
+	add	x12,x12,x5
+	add	x23,x23,x27			// d+=h
+	add	x27,x27,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x12,x12,x2
+	add	x27,x27,x17			// h+=Sigma0(a)
+	add	x12,x12,x1
+	ldr	x1,[sp,#16]
+	str	x4,[sp,#8]
+	ror	x16,x23,#14
+	add	x26,x26,x28			// h+=K[i]
+	ror	x3,x14,#1
+	and	x17,x24,x23
+	ror	x2,x11,#19
+	bic	x28,x25,x23
+	ror	x4,x27,#28
+	add	x26,x26,x12			// h+=X[i]
+	eor	x16,x16,x23,ror#18
+	eor	x3,x3,x14,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x27,x20			// a^b, b^c in next round
+	eor	x16,x16,x23,ror#41	// Sigma1(e)
+	eor	x4,x4,x27,ror#34
+	add	x26,x26,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x2,x2,x11,ror#61
+	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
+	add	x26,x26,x16			// h+=Sigma1(e)
+	eor	x19,x19,x20			// Maj(a,b,c)
+	eor	x17,x4,x27,ror#39	// Sigma0(a)
+	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
+	add	x13,x13,x6
+	add	x22,x22,x26			// d+=h
+	add	x26,x26,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x13,x13,x3
+	add	x26,x26,x17			// h+=Sigma0(a)
+	add	x13,x13,x2
+	ldr	x2,[sp,#24]
+	str	x5,[sp,#16]
+	ror	x16,x22,#14
+	add	x25,x25,x19			// h+=K[i]
+	ror	x4,x15,#1
+	and	x17,x23,x22
+	ror	x3,x12,#19
+	bic	x19,x24,x22
+	ror	x5,x26,#28
+	add	x25,x25,x13			// h+=X[i]
+	eor	x16,x16,x22,ror#18
+	eor	x4,x4,x15,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x26,x27			// a^b, b^c in next round
+	eor	x16,x16,x22,ror#41	// Sigma1(e)
+	eor	x5,x5,x26,ror#34
+	add	x25,x25,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x3,x3,x12,ror#61
+	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
+	add	x25,x25,x16			// h+=Sigma1(e)
+	eor	x28,x28,x27			// Maj(a,b,c)
+	eor	x17,x5,x26,ror#39	// Sigma0(a)
+	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
+	add	x14,x14,x7
+	add	x21,x21,x25			// d+=h
+	add	x25,x25,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x14,x14,x4
+	add	x25,x25,x17			// h+=Sigma0(a)
+	add	x14,x14,x3
+	ldr	x3,[sp,#0]
+	str	x6,[sp,#24]
+	ror	x16,x21,#14
+	add	x24,x24,x28			// h+=K[i]
+	ror	x5,x0,#1
+	and	x17,x22,x21
+	ror	x4,x13,#19
+	bic	x28,x23,x21
+	ror	x6,x25,#28
+	add	x24,x24,x14			// h+=X[i]
+	eor	x16,x16,x21,ror#18
+	eor	x5,x5,x0,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x25,x26			// a^b, b^c in next round
+	eor	x16,x16,x21,ror#41	// Sigma1(e)
+	eor	x6,x6,x25,ror#34
+	add	x24,x24,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x4,x4,x13,ror#61
+	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
+	add	x24,x24,x16			// h+=Sigma1(e)
+	eor	x19,x19,x26			// Maj(a,b,c)
+	eor	x17,x6,x25,ror#39	// Sigma0(a)
+	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
+	add	x15,x15,x8
+	add	x20,x20,x24			// d+=h
+	add	x24,x24,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x15,x15,x5
+	add	x24,x24,x17			// h+=Sigma0(a)
+	add	x15,x15,x4
+	ldr	x4,[sp,#8]
+	str	x7,[sp,#0]
+	ror	x16,x20,#14
+	add	x23,x23,x19			// h+=K[i]
+	ror	x6,x1,#1
+	and	x17,x21,x20
+	ror	x5,x14,#19
+	bic	x19,x22,x20
+	ror	x7,x24,#28
+	add	x23,x23,x15			// h+=X[i]
+	eor	x16,x16,x20,ror#18
+	eor	x6,x6,x1,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x24,x25			// a^b, b^c in next round
+	eor	x16,x16,x20,ror#41	// Sigma1(e)
+	eor	x7,x7,x24,ror#34
+	add	x23,x23,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x5,x5,x14,ror#61
+	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
+	add	x23,x23,x16			// h+=Sigma1(e)
+	eor	x28,x28,x25			// Maj(a,b,c)
+	eor	x17,x7,x24,ror#39	// Sigma0(a)
+	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
+	add	x0,x0,x9
+	add	x27,x27,x23			// d+=h
+	add	x23,x23,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x0,x0,x6
+	add	x23,x23,x17			// h+=Sigma0(a)
+	add	x0,x0,x5
+	ldr	x5,[sp,#16]
+	str	x8,[sp,#8]
+	ror	x16,x27,#14
+	add	x22,x22,x28			// h+=K[i]
+	ror	x7,x2,#1
+	and	x17,x20,x27
+	ror	x6,x15,#19
+	bic	x28,x21,x27
+	ror	x8,x23,#28
+	add	x22,x22,x0			// h+=X[i]
+	eor	x16,x16,x27,ror#18
+	eor	x7,x7,x2,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x23,x24			// a^b, b^c in next round
+	eor	x16,x16,x27,ror#41	// Sigma1(e)
+	eor	x8,x8,x23,ror#34
+	add	x22,x22,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x6,x6,x15,ror#61
+	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
+	add	x22,x22,x16			// h+=Sigma1(e)
+	eor	x19,x19,x24			// Maj(a,b,c)
+	eor	x17,x8,x23,ror#39	// Sigma0(a)
+	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
+	add	x1,x1,x10
+	add	x26,x26,x22			// d+=h
+	add	x22,x22,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x1,x1,x7
+	add	x22,x22,x17			// h+=Sigma0(a)
+	add	x1,x1,x6
+	ldr	x6,[sp,#24]
+	str	x9,[sp,#16]
+	ror	x16,x26,#14
+	add	x21,x21,x19			// h+=K[i]
+	ror	x8,x3,#1
+	and	x17,x27,x26
+	ror	x7,x0,#19
+	bic	x19,x20,x26
+	ror	x9,x22,#28
+	add	x21,x21,x1			// h+=X[i]
+	eor	x16,x16,x26,ror#18
+	eor	x8,x8,x3,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x22,x23			// a^b, b^c in next round
+	eor	x16,x16,x26,ror#41	// Sigma1(e)
+	eor	x9,x9,x22,ror#34
+	add	x21,x21,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x7,x7,x0,ror#61
+	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
+	add	x21,x21,x16			// h+=Sigma1(e)
+	eor	x28,x28,x23			// Maj(a,b,c)
+	eor	x17,x9,x22,ror#39	// Sigma0(a)
+	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
+	add	x2,x2,x11
+	add	x25,x25,x21			// d+=h
+	add	x21,x21,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x2,x2,x8
+	add	x21,x21,x17			// h+=Sigma0(a)
+	add	x2,x2,x7
+	ldr	x7,[sp,#0]
+	str	x10,[sp,#24]
+	ror	x16,x25,#14
+	add	x20,x20,x28			// h+=K[i]
+	ror	x9,x4,#1
+	and	x17,x26,x25
+	ror	x8,x1,#19
+	bic	x28,x27,x25
+	ror	x10,x21,#28
+	add	x20,x20,x2			// h+=X[i]
+	eor	x16,x16,x25,ror#18
+	eor	x9,x9,x4,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x21,x22			// a^b, b^c in next round
+	eor	x16,x16,x25,ror#41	// Sigma1(e)
+	eor	x10,x10,x21,ror#34
+	add	x20,x20,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x8,x8,x1,ror#61
+	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
+	add	x20,x20,x16			// h+=Sigma1(e)
+	eor	x19,x19,x22			// Maj(a,b,c)
+	eor	x17,x10,x21,ror#39	// Sigma0(a)
+	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
+	add	x3,x3,x12
+	add	x24,x24,x20			// d+=h
+	add	x20,x20,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x3,x3,x9
+	add	x20,x20,x17			// h+=Sigma0(a)
+	add	x3,x3,x8
+	cbnz	x19,.Loop_16_xx
+
+	ldp	x0,x2,[x29,#96]
+	ldr	x1,[x29,#112]
+	sub	x30,x30,#648		// rewind
+
+	ldp	x3,x4,[x0]
+	ldp	x5,x6,[x0,#2*8]
+	add	x1,x1,#14*8			// advance input pointer
+	ldp	x7,x8,[x0,#4*8]
+	add	x20,x20,x3
+	ldp	x9,x10,[x0,#6*8]
+	add	x21,x21,x4
+	add	x22,x22,x5
+	add	x23,x23,x6
+	stp	x20,x21,[x0]
+	add	x24,x24,x7
+	add	x25,x25,x8
+	stp	x22,x23,[x0,#2*8]
+	add	x26,x26,x9
+	add	x27,x27,x10
+	cmp	x1,x2
+	stp	x24,x25,[x0,#4*8]
+	stp	x26,x27,[x0,#6*8]
+	b.ne	.Loop
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#4*8
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#128
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	sha512_block_data_order_nohw,.-sha512_block_data_order_nohw
+
+.section	.rodata
+.align	6
+.type	.LK512,%object
+.LK512:
+.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad	0x3956c25bf348b538,0x59f111f1b605d019
+.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad	0xd807aa98a3030242,0x12835b0145706fbe
+.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad	0x06ca6351e003826f,0x142929670a0e6e70
+.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad	0x81c2c92e47edaee6,0x92722c851482353b
+.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad	0xd192e819d6ef5218,0xd69906245565a910
+.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad	0x90befffa23631e28,0xa4506cebde82bde9
+.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad	0xca273eceea26619c,0xd186b8c721c0c207
+.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad	0x113f9804bef90dae,0x1b710b35131c471b
+.quad	0x28db77f523047d84,0x32caab7b40c72493
+.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.quad	0	// terminator
+.size	.LK512,.-.LK512
+.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+.text
+#ifndef	__KERNEL__
+.globl	sha512_block_data_order_hw
+.hidden	sha512_block_data_order_hw
+.type	sha512_block_data_order_hw,%function
+.align	6
+sha512_block_data_order_hw:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64	// load input
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+
+	ld1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// load context
+	adrp	x3,.LK512
+	add	x3,x3,:lo12:.LK512
+
+	rev64	v16.16b,v16.16b
+	rev64	v17.16b,v17.16b
+	rev64	v18.16b,v18.16b
+	rev64	v19.16b,v19.16b
+	rev64	v20.16b,v20.16b
+	rev64	v21.16b,v21.16b
+	rev64	v22.16b,v22.16b
+	rev64	v23.16b,v23.16b
+	b	.Loop_hw
+
+.align	4
+.Loop_hw:
+	ld1	{v24.2d},[x3],#16
+	subs	x2,x2,#1
+	sub	x4,x1,#128
+	orr	v26.16b,v0.16b,v0.16b			// offload
+	orr	v27.16b,v1.16b,v1.16b
+	orr	v28.16b,v2.16b,v2.16b
+	orr	v29.16b,v3.16b,v3.16b
+	csel	x1,x1,x4,ne			// conditional rewind
+	add	v24.2d,v24.2d,v16.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
+.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
+	ext	v7.16b,v20.16b,v21.16b,#8
+.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v25.2d,v25.2d,v17.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
+.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
+	ext	v7.16b,v21.16b,v22.16b,#8
+.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	add	v24.2d,v24.2d,v18.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
+.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
+	ext	v7.16b,v22.16b,v23.16b,#8
+.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	add	v25.2d,v25.2d,v19.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
+.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
+	ext	v7.16b,v23.16b,v16.16b,#8
+.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	add	v24.2d,v24.2d,v20.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
+.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
+	ext	v7.16b,v16.16b,v17.16b,#8
+.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v25.2d,v25.2d,v21.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
+.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
+	ext	v7.16b,v17.16b,v18.16b,#8
+.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v24.2d,v24.2d,v22.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
+.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
+	ext	v7.16b,v18.16b,v19.16b,#8
+.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	add	v25.2d,v25.2d,v23.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
+.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
+	ext	v7.16b,v19.16b,v20.16b,#8
+.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	add	v24.2d,v24.2d,v16.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
+.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
+	ext	v7.16b,v20.16b,v21.16b,#8
+.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	add	v25.2d,v25.2d,v17.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
+.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
+	ext	v7.16b,v21.16b,v22.16b,#8
+.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v24.2d,v24.2d,v18.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
+.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
+	ext	v7.16b,v22.16b,v23.16b,#8
+.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v25.2d,v25.2d,v19.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
+.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
+	ext	v7.16b,v23.16b,v16.16b,#8
+.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	add	v24.2d,v24.2d,v20.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
+.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
+	ext	v7.16b,v16.16b,v17.16b,#8
+.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	add	v25.2d,v25.2d,v21.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
+.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
+	ext	v7.16b,v17.16b,v18.16b,#8
+.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	add	v24.2d,v24.2d,v22.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
+.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
+	ext	v7.16b,v18.16b,v19.16b,#8
+.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v25.2d,v25.2d,v23.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
+.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
+	ext	v7.16b,v19.16b,v20.16b,#8
+.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v24.2d,v24.2d,v16.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
+.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
+	ext	v7.16b,v20.16b,v21.16b,#8
+.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	add	v25.2d,v25.2d,v17.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
+.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
+	ext	v7.16b,v21.16b,v22.16b,#8
+.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	add	v24.2d,v24.2d,v18.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
+.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
+	ext	v7.16b,v22.16b,v23.16b,#8
+.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	add	v25.2d,v25.2d,v19.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
+.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
+	ext	v7.16b,v23.16b,v16.16b,#8
+.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v24.2d,v24.2d,v20.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
+.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
+	ext	v7.16b,v16.16b,v17.16b,#8
+.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v25.2d,v25.2d,v21.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
+.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
+	ext	v7.16b,v17.16b,v18.16b,#8
+.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	add	v24.2d,v24.2d,v22.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
+.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
+	ext	v7.16b,v18.16b,v19.16b,#8
+.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	add	v25.2d,v25.2d,v23.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
+.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
+	ext	v7.16b,v19.16b,v20.16b,#8
+.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	add	v24.2d,v24.2d,v16.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
+.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
+	ext	v7.16b,v20.16b,v21.16b,#8
+.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v25.2d,v25.2d,v17.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
+.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
+	ext	v7.16b,v21.16b,v22.16b,#8
+.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v24.2d,v24.2d,v18.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
+.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
+	ext	v7.16b,v22.16b,v23.16b,#8
+.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	add	v25.2d,v25.2d,v19.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
+.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
+	ext	v7.16b,v23.16b,v16.16b,#8
+.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	add	v24.2d,v24.2d,v20.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
+.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
+	ext	v7.16b,v16.16b,v17.16b,#8
+.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	add	v25.2d,v25.2d,v21.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
+.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
+	ext	v7.16b,v17.16b,v18.16b,#8
+.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v24.2d,v24.2d,v22.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
+.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
+	ext	v7.16b,v18.16b,v19.16b,#8
+.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v25.2d,v25.2d,v23.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
+.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
+	ext	v7.16b,v19.16b,v20.16b,#8
+.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	ld1	{v25.2d},[x3],#16
+	add	v24.2d,v24.2d,v16.2d
+	ld1	{v16.16b},[x1],#16		// load next input
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
+.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+	rev64	v16.16b,v16.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	ld1	{v24.2d},[x3],#16
+	add	v25.2d,v25.2d,v17.2d
+	ld1	{v17.16b},[x1],#16		// load next input
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
+.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+	rev64	v17.16b,v17.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	ld1	{v25.2d},[x3],#16
+	add	v24.2d,v24.2d,v18.2d
+	ld1	{v18.16b},[x1],#16		// load next input
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
+.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+	rev64	v18.16b,v18.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	ld1	{v24.2d},[x3],#16
+	add	v25.2d,v25.2d,v19.2d
+	ld1	{v19.16b},[x1],#16		// load next input
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
+.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+	rev64	v19.16b,v19.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	ld1	{v25.2d},[x3],#16
+	add	v24.2d,v24.2d,v20.2d
+	ld1	{v20.16b},[x1],#16		// load next input
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
+.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+	rev64	v20.16b,v20.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	ld1	{v24.2d},[x3],#16
+	add	v25.2d,v25.2d,v21.2d
+	ld1	{v21.16b},[x1],#16		// load next input
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
+.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+	rev64	v21.16b,v21.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	ld1	{v25.2d},[x3],#16
+	add	v24.2d,v24.2d,v22.2d
+	ld1	{v22.16b},[x1],#16		// load next input
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
+.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+	rev64	v22.16b,v22.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	sub	x3,x3,#80*8	// rewind
+	add	v25.2d,v25.2d,v23.2d
+	ld1	{v23.16b},[x1],#16		// load next input
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
+.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+	rev64	v23.16b,v23.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v0.2d,v0.2d,v26.2d			// accumulate
+	add	v1.2d,v1.2d,v27.2d
+	add	v2.2d,v2.2d,v28.2d
+	add	v3.2d,v3.2d,v29.2d
+
+	cbnz	x2,.Loop_hw
+
+	st1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// store context
+
+	ldr	x29,[sp],#16
+	ret
+.size	sha512_block_data_order_hw,.-sha512_block_data_order_hw
+#endif
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/bcm/sha512-armv8-win.S b/gen/bcm/sha512-armv8-win.S
new file mode 100644
index 0000000..220f489
--- /dev/null
+++ b/gen/bcm/sha512-armv8-win.S
@@ -0,0 +1,1600 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License").  You may not use
+// this file except in compliance with the License.  You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ====================================================================
+//
+// SHA256/512 for ARMv8.
+//
+// Performance in cycles per processed byte and improvement coefficient
+// over code generated with "default" compiler:
+//
+//		SHA256-hw	SHA256(*)	SHA512
+// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
+// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
+// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
+// Denver	2.01		10.5 (+26%)	6.70 (+8%)
+// X-Gene			20.0 (+100%)	12.8 (+300%(***))
+// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
+// Kryo		1.92		17.4 (+30%)	11.2 (+8%)
+//
+// (*)	Software SHA256 results are of lesser relevance, presented
+//	mostly for informational purposes.
+// (**)	The result is a trade-off: it's possible to improve it by
+//	10% (or by 1 cycle per round), but at the cost of 20% loss
+//	on Cortex-A53 (or by 4 cycles per round).
+// (***)	Super-impressive coefficients over gcc-generated code are
+//	indication of some compiler "pathology", most notably code
+//	generated with -mgeneral-regs-only is significantly faster
+//	and the gap is only 40-90%.
+
+#ifndef	__KERNEL__
+# include <openssl/arm_arch.h>
+#endif
+
+.text
+
+.globl	sha512_block_data_order_nohw
+
+.def sha512_block_data_order_nohw
+   .type 32
+.endef
+.align	6
+sha512_block_data_order_nohw:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#4*8
+
+	ldp	x20,x21,[x0]				// load context
+	ldp	x22,x23,[x0,#2*8]
+	ldp	x24,x25,[x0,#4*8]
+	add	x2,x1,x2,lsl#7	// end of input
+	ldp	x26,x27,[x0,#6*8]
+	adrp	x30,LK512
+	add	x30,x30,:lo12:LK512
+	stp	x0,x2,[x29,#96]
+
+Loop:
+	ldp	x3,x4,[x1],#2*8
+	ldr	x19,[x30],#8			// *K++
+	eor	x28,x21,x22				// magic seed
+	str	x1,[x29,#112]
+#ifndef	__AARCH64EB__
+	rev	x3,x3			// 0
+#endif
+	ror	x16,x24,#14
+	add	x27,x27,x19			// h+=K[i]
+	eor	x6,x24,x24,ror#23
+	and	x17,x25,x24
+	bic	x19,x26,x24
+	add	x27,x27,x3			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x20,x21			// a^b, b^c in next round
+	eor	x16,x16,x6,ror#18	// Sigma1(e)
+	ror	x6,x20,#28
+	add	x27,x27,x17			// h+=Ch(e,f,g)
+	eor	x17,x20,x20,ror#5
+	add	x27,x27,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x23,x23,x27			// d+=h
+	eor	x28,x28,x21			// Maj(a,b,c)
+	eor	x17,x6,x17,ror#34	// Sigma0(a)
+	add	x27,x27,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x27,x27,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x4,x4			// 1
+#endif
+	ldp	x5,x6,[x1],#2*8
+	add	x27,x27,x17			// h+=Sigma0(a)
+	ror	x16,x23,#14
+	add	x26,x26,x28			// h+=K[i]
+	eor	x7,x23,x23,ror#23
+	and	x17,x24,x23
+	bic	x28,x25,x23
+	add	x26,x26,x4			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x27,x20			// a^b, b^c in next round
+	eor	x16,x16,x7,ror#18	// Sigma1(e)
+	ror	x7,x27,#28
+	add	x26,x26,x17			// h+=Ch(e,f,g)
+	eor	x17,x27,x27,ror#5
+	add	x26,x26,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x22,x22,x26			// d+=h
+	eor	x19,x19,x20			// Maj(a,b,c)
+	eor	x17,x7,x17,ror#34	// Sigma0(a)
+	add	x26,x26,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x26,x26,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x5,x5			// 2
+#endif
+	add	x26,x26,x17			// h+=Sigma0(a)
+	ror	x16,x22,#14
+	add	x25,x25,x19			// h+=K[i]
+	eor	x8,x22,x22,ror#23
+	and	x17,x23,x22
+	bic	x19,x24,x22
+	add	x25,x25,x5			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x26,x27			// a^b, b^c in next round
+	eor	x16,x16,x8,ror#18	// Sigma1(e)
+	ror	x8,x26,#28
+	add	x25,x25,x17			// h+=Ch(e,f,g)
+	eor	x17,x26,x26,ror#5
+	add	x25,x25,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x21,x21,x25			// d+=h
+	eor	x28,x28,x27			// Maj(a,b,c)
+	eor	x17,x8,x17,ror#34	// Sigma0(a)
+	add	x25,x25,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x25,x25,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x6,x6			// 3
+#endif
+	ldp	x7,x8,[x1],#2*8
+	add	x25,x25,x17			// h+=Sigma0(a)
+	ror	x16,x21,#14
+	add	x24,x24,x28			// h+=K[i]
+	eor	x9,x21,x21,ror#23
+	and	x17,x22,x21
+	bic	x28,x23,x21
+	add	x24,x24,x6			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x25,x26			// a^b, b^c in next round
+	eor	x16,x16,x9,ror#18	// Sigma1(e)
+	ror	x9,x25,#28
+	add	x24,x24,x17			// h+=Ch(e,f,g)
+	eor	x17,x25,x25,ror#5
+	add	x24,x24,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x20,x20,x24			// d+=h
+	eor	x19,x19,x26			// Maj(a,b,c)
+	eor	x17,x9,x17,ror#34	// Sigma0(a)
+	add	x24,x24,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x24,x24,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x7,x7			// 4
+#endif
+	add	x24,x24,x17			// h+=Sigma0(a)
+	ror	x16,x20,#14
+	add	x23,x23,x19			// h+=K[i]
+	eor	x10,x20,x20,ror#23
+	and	x17,x21,x20
+	bic	x19,x22,x20
+	add	x23,x23,x7			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x24,x25			// a^b, b^c in next round
+	eor	x16,x16,x10,ror#18	// Sigma1(e)
+	ror	x10,x24,#28
+	add	x23,x23,x17			// h+=Ch(e,f,g)
+	eor	x17,x24,x24,ror#5
+	add	x23,x23,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x27,x27,x23			// d+=h
+	eor	x28,x28,x25			// Maj(a,b,c)
+	eor	x17,x10,x17,ror#34	// Sigma0(a)
+	add	x23,x23,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x23,x23,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x8,x8			// 5
+#endif
+	ldp	x9,x10,[x1],#2*8
+	add	x23,x23,x17			// h+=Sigma0(a)
+	ror	x16,x27,#14
+	add	x22,x22,x28			// h+=K[i]
+	eor	x11,x27,x27,ror#23
+	and	x17,x20,x27
+	bic	x28,x21,x27
+	add	x22,x22,x8			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x23,x24			// a^b, b^c in next round
+	eor	x16,x16,x11,ror#18	// Sigma1(e)
+	ror	x11,x23,#28
+	add	x22,x22,x17			// h+=Ch(e,f,g)
+	eor	x17,x23,x23,ror#5
+	add	x22,x22,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x26,x26,x22			// d+=h
+	eor	x19,x19,x24			// Maj(a,b,c)
+	eor	x17,x11,x17,ror#34	// Sigma0(a)
+	add	x22,x22,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x22,x22,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x9,x9			// 6
+#endif
+	add	x22,x22,x17			// h+=Sigma0(a)
+	ror	x16,x26,#14
+	add	x21,x21,x19			// h+=K[i]
+	eor	x12,x26,x26,ror#23
+	and	x17,x27,x26
+	bic	x19,x20,x26
+	add	x21,x21,x9			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x22,x23			// a^b, b^c in next round
+	eor	x16,x16,x12,ror#18	// Sigma1(e)
+	ror	x12,x22,#28
+	add	x21,x21,x17			// h+=Ch(e,f,g)
+	eor	x17,x22,x22,ror#5
+	add	x21,x21,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x25,x25,x21			// d+=h
+	eor	x28,x28,x23			// Maj(a,b,c)
+	eor	x17,x12,x17,ror#34	// Sigma0(a)
+	add	x21,x21,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x21,x21,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x10,x10			// 7
+#endif
+	ldp	x11,x12,[x1],#2*8
+	add	x21,x21,x17			// h+=Sigma0(a)
+	ror	x16,x25,#14
+	add	x20,x20,x28			// h+=K[i]
+	eor	x13,x25,x25,ror#23
+	and	x17,x26,x25
+	bic	x28,x27,x25
+	add	x20,x20,x10			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x21,x22			// a^b, b^c in next round
+	eor	x16,x16,x13,ror#18	// Sigma1(e)
+	ror	x13,x21,#28
+	add	x20,x20,x17			// h+=Ch(e,f,g)
+	eor	x17,x21,x21,ror#5
+	add	x20,x20,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x24,x24,x20			// d+=h
+	eor	x19,x19,x22			// Maj(a,b,c)
+	eor	x17,x13,x17,ror#34	// Sigma0(a)
+	add	x20,x20,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x20,x20,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x11,x11			// 8
+#endif
+	add	x20,x20,x17			// h+=Sigma0(a)
+	ror	x16,x24,#14
+	add	x27,x27,x19			// h+=K[i]
+	eor	x14,x24,x24,ror#23
+	and	x17,x25,x24
+	bic	x19,x26,x24
+	add	x27,x27,x11			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x20,x21			// a^b, b^c in next round
+	eor	x16,x16,x14,ror#18	// Sigma1(e)
+	ror	x14,x20,#28
+	add	x27,x27,x17			// h+=Ch(e,f,g)
+	eor	x17,x20,x20,ror#5
+	add	x27,x27,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x23,x23,x27			// d+=h
+	eor	x28,x28,x21			// Maj(a,b,c)
+	eor	x17,x14,x17,ror#34	// Sigma0(a)
+	add	x27,x27,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x27,x27,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x12,x12			// 9
+#endif
+	ldp	x13,x14,[x1],#2*8
+	add	x27,x27,x17			// h+=Sigma0(a)
+	ror	x16,x23,#14
+	add	x26,x26,x28			// h+=K[i]
+	eor	x15,x23,x23,ror#23
+	and	x17,x24,x23
+	bic	x28,x25,x23
+	add	x26,x26,x12			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x27,x20			// a^b, b^c in next round
+	eor	x16,x16,x15,ror#18	// Sigma1(e)
+	ror	x15,x27,#28
+	add	x26,x26,x17			// h+=Ch(e,f,g)
+	eor	x17,x27,x27,ror#5
+	add	x26,x26,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x22,x22,x26			// d+=h
+	eor	x19,x19,x20			// Maj(a,b,c)
+	eor	x17,x15,x17,ror#34	// Sigma0(a)
+	add	x26,x26,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x26,x26,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x13,x13			// 10
+#endif
+	add	x26,x26,x17			// h+=Sigma0(a)
+	ror	x16,x22,#14
+	add	x25,x25,x19			// h+=K[i]
+	eor	x0,x22,x22,ror#23
+	and	x17,x23,x22
+	bic	x19,x24,x22
+	add	x25,x25,x13			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x26,x27			// a^b, b^c in next round
+	eor	x16,x16,x0,ror#18	// Sigma1(e)
+	ror	x0,x26,#28
+	add	x25,x25,x17			// h+=Ch(e,f,g)
+	eor	x17,x26,x26,ror#5
+	add	x25,x25,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x21,x21,x25			// d+=h
+	eor	x28,x28,x27			// Maj(a,b,c)
+	eor	x17,x0,x17,ror#34	// Sigma0(a)
+	add	x25,x25,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x25,x25,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x14,x14			// 11
+#endif
+	ldp	x15,x0,[x1],#2*8
+	add	x25,x25,x17			// h+=Sigma0(a)
+	str	x6,[sp,#24]
+	ror	x16,x21,#14
+	add	x24,x24,x28			// h+=K[i]
+	eor	x6,x21,x21,ror#23
+	and	x17,x22,x21
+	bic	x28,x23,x21
+	add	x24,x24,x14			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x25,x26			// a^b, b^c in next round
+	eor	x16,x16,x6,ror#18	// Sigma1(e)
+	ror	x6,x25,#28
+	add	x24,x24,x17			// h+=Ch(e,f,g)
+	eor	x17,x25,x25,ror#5
+	add	x24,x24,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x20,x20,x24			// d+=h
+	eor	x19,x19,x26			// Maj(a,b,c)
+	eor	x17,x6,x17,ror#34	// Sigma0(a)
+	add	x24,x24,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x24,x24,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x15,x15			// 12
+#endif
+	add	x24,x24,x17			// h+=Sigma0(a)
+	str	x7,[sp,#0]
+	ror	x16,x20,#14
+	add	x23,x23,x19			// h+=K[i]
+	eor	x7,x20,x20,ror#23
+	and	x17,x21,x20
+	bic	x19,x22,x20
+	add	x23,x23,x15			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x24,x25			// a^b, b^c in next round
+	eor	x16,x16,x7,ror#18	// Sigma1(e)
+	ror	x7,x24,#28
+	add	x23,x23,x17			// h+=Ch(e,f,g)
+	eor	x17,x24,x24,ror#5
+	add	x23,x23,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x27,x27,x23			// d+=h
+	eor	x28,x28,x25			// Maj(a,b,c)
+	eor	x17,x7,x17,ror#34	// Sigma0(a)
+	add	x23,x23,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x23,x23,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x0,x0			// 13
+#endif
+	ldp	x1,x2,[x1]
+	add	x23,x23,x17			// h+=Sigma0(a)
+	str	x8,[sp,#8]
+	ror	x16,x27,#14
+	add	x22,x22,x28			// h+=K[i]
+	eor	x8,x27,x27,ror#23
+	and	x17,x20,x27
+	bic	x28,x21,x27
+	add	x22,x22,x0			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x23,x24			// a^b, b^c in next round
+	eor	x16,x16,x8,ror#18	// Sigma1(e)
+	ror	x8,x23,#28
+	add	x22,x22,x17			// h+=Ch(e,f,g)
+	eor	x17,x23,x23,ror#5
+	add	x22,x22,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x26,x26,x22			// d+=h
+	eor	x19,x19,x24			// Maj(a,b,c)
+	eor	x17,x8,x17,ror#34	// Sigma0(a)
+	add	x22,x22,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x22,x22,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x1,x1			// 14
+#endif
+	ldr	x6,[sp,#24]
+	add	x22,x22,x17			// h+=Sigma0(a)
+	str	x9,[sp,#16]
+	ror	x16,x26,#14
+	add	x21,x21,x19			// h+=K[i]
+	eor	x9,x26,x26,ror#23
+	and	x17,x27,x26
+	bic	x19,x20,x26
+	add	x21,x21,x1			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x22,x23			// a^b, b^c in next round
+	eor	x16,x16,x9,ror#18	// Sigma1(e)
+	ror	x9,x22,#28
+	add	x21,x21,x17			// h+=Ch(e,f,g)
+	eor	x17,x22,x22,ror#5
+	add	x21,x21,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x25,x25,x21			// d+=h
+	eor	x28,x28,x23			// Maj(a,b,c)
+	eor	x17,x9,x17,ror#34	// Sigma0(a)
+	add	x21,x21,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x21,x21,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x2,x2			// 15
+#endif
+	ldr	x7,[sp,#0]
+	add	x21,x21,x17			// h+=Sigma0(a)
+	str	x10,[sp,#24]
+	ror	x16,x25,#14
+	add	x20,x20,x28			// h+=K[i]
+	ror	x9,x4,#1
+	and	x17,x26,x25
+	ror	x8,x1,#19
+	bic	x28,x27,x25
+	ror	x10,x21,#28
+	add	x20,x20,x2			// h+=X[i]
+	eor	x16,x16,x25,ror#18
+	eor	x9,x9,x4,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x21,x22			// a^b, b^c in next round
+	eor	x16,x16,x25,ror#41	// Sigma1(e)
+	eor	x10,x10,x21,ror#34
+	add	x20,x20,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x8,x8,x1,ror#61
+	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
+	add	x20,x20,x16			// h+=Sigma1(e)
+	eor	x19,x19,x22			// Maj(a,b,c)
+	eor	x17,x10,x21,ror#39	// Sigma0(a)
+	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
+	add	x3,x3,x12
+	add	x24,x24,x20			// d+=h
+	add	x20,x20,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x3,x3,x9
+	add	x20,x20,x17			// h+=Sigma0(a)
+	add	x3,x3,x8
+Loop_16_xx:
+	ldr	x8,[sp,#8]
+	str	x11,[sp,#0]
+	ror	x16,x24,#14
+	add	x27,x27,x19			// h+=K[i]
+	ror	x10,x5,#1
+	and	x17,x25,x24
+	ror	x9,x2,#19
+	bic	x19,x26,x24
+	ror	x11,x20,#28
+	add	x27,x27,x3			// h+=X[i]
+	eor	x16,x16,x24,ror#18
+	eor	x10,x10,x5,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x20,x21			// a^b, b^c in next round
+	eor	x16,x16,x24,ror#41	// Sigma1(e)
+	eor	x11,x11,x20,ror#34
+	add	x27,x27,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x9,x9,x2,ror#61
+	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
+	add	x27,x27,x16			// h+=Sigma1(e)
+	eor	x28,x28,x21			// Maj(a,b,c)
+	eor	x17,x11,x20,ror#39	// Sigma0(a)
+	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
+	add	x4,x4,x13
+	add	x23,x23,x27			// d+=h
+	add	x27,x27,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x4,x4,x10
+	add	x27,x27,x17			// h+=Sigma0(a)
+	add	x4,x4,x9
+	ldr	x9,[sp,#16]
+	str	x12,[sp,#8]
+	ror	x16,x23,#14
+	add	x26,x26,x28			// h+=K[i]
+	ror	x11,x6,#1
+	and	x17,x24,x23
+	ror	x10,x3,#19
+	bic	x28,x25,x23
+	ror	x12,x27,#28
+	add	x26,x26,x4			// h+=X[i]
+	eor	x16,x16,x23,ror#18
+	eor	x11,x11,x6,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x27,x20			// a^b, b^c in next round
+	eor	x16,x16,x23,ror#41	// Sigma1(e)
+	eor	x12,x12,x27,ror#34
+	add	x26,x26,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x10,x10,x3,ror#61
+	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
+	add	x26,x26,x16			// h+=Sigma1(e)
+	eor	x19,x19,x20			// Maj(a,b,c)
+	eor	x17,x12,x27,ror#39	// Sigma0(a)
+	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
+	add	x5,x5,x14
+	add	x22,x22,x26			// d+=h
+	add	x26,x26,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x5,x5,x11
+	add	x26,x26,x17			// h+=Sigma0(a)
+	add	x5,x5,x10
+	ldr	x10,[sp,#24]
+	str	x13,[sp,#16]
+	ror	x16,x22,#14
+	add	x25,x25,x19			// h+=K[i]
+	ror	x12,x7,#1
+	and	x17,x23,x22
+	ror	x11,x4,#19
+	bic	x19,x24,x22
+	ror	x13,x26,#28
+	add	x25,x25,x5			// h+=X[i]
+	eor	x16,x16,x22,ror#18
+	eor	x12,x12,x7,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x26,x27			// a^b, b^c in next round
+	eor	x16,x16,x22,ror#41	// Sigma1(e)
+	eor	x13,x13,x26,ror#34
+	add	x25,x25,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x11,x11,x4,ror#61
+	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
+	add	x25,x25,x16			// h+=Sigma1(e)
+	eor	x28,x28,x27			// Maj(a,b,c)
+	eor	x17,x13,x26,ror#39	// Sigma0(a)
+	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
+	add	x6,x6,x15
+	add	x21,x21,x25			// d+=h
+	add	x25,x25,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x6,x6,x12
+	add	x25,x25,x17			// h+=Sigma0(a)
+	add	x6,x6,x11
+	ldr	x11,[sp,#0]
+	str	x14,[sp,#24]
+	ror	x16,x21,#14
+	add	x24,x24,x28			// h+=K[i]
+	ror	x13,x8,#1
+	and	x17,x22,x21
+	ror	x12,x5,#19
+	bic	x28,x23,x21
+	ror	x14,x25,#28
+	add	x24,x24,x6			// h+=X[i]
+	eor	x16,x16,x21,ror#18
+	eor	x13,x13,x8,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x25,x26			// a^b, b^c in next round
+	eor	x16,x16,x21,ror#41	// Sigma1(e)
+	eor	x14,x14,x25,ror#34
+	add	x24,x24,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x12,x12,x5,ror#61
+	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
+	add	x24,x24,x16			// h+=Sigma1(e)
+	eor	x19,x19,x26			// Maj(a,b,c)
+	eor	x17,x14,x25,ror#39	// Sigma0(a)
+	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
+	add	x7,x7,x0
+	add	x20,x20,x24			// d+=h
+	add	x24,x24,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x7,x7,x13
+	add	x24,x24,x17			// h+=Sigma0(a)
+	add	x7,x7,x12
+	ldr	x12,[sp,#8]
+	str	x15,[sp,#0]
+	ror	x16,x20,#14
+	add	x23,x23,x19			// h+=K[i]
+	ror	x14,x9,#1
+	and	x17,x21,x20
+	ror	x13,x6,#19
+	bic	x19,x22,x20
+	ror	x15,x24,#28
+	add	x23,x23,x7			// h+=X[i]
+	eor	x16,x16,x20,ror#18
+	eor	x14,x14,x9,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x24,x25			// a^b, b^c in next round
+	eor	x16,x16,x20,ror#41	// Sigma1(e)
+	eor	x15,x15,x24,ror#34
+	add	x23,x23,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x13,x13,x6,ror#61
+	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
+	add	x23,x23,x16			// h+=Sigma1(e)
+	eor	x28,x28,x25			// Maj(a,b,c)
+	eor	x17,x15,x24,ror#39	// Sigma0(a)
+	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
+	add	x8,x8,x1
+	add	x27,x27,x23			// d+=h
+	add	x23,x23,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x8,x8,x14
+	add	x23,x23,x17			// h+=Sigma0(a)
+	add	x8,x8,x13
+	ldr	x13,[sp,#16]
+	str	x0,[sp,#8]
+	ror	x16,x27,#14
+	add	x22,x22,x28			// h+=K[i]
+	ror	x15,x10,#1
+	and	x17,x20,x27
+	ror	x14,x7,#19
+	bic	x28,x21,x27
+	ror	x0,x23,#28
+	add	x22,x22,x8			// h+=X[i]
+	eor	x16,x16,x27,ror#18
+	eor	x15,x15,x10,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x23,x24			// a^b, b^c in next round
+	eor	x16,x16,x27,ror#41	// Sigma1(e)
+	eor	x0,x0,x23,ror#34
+	add	x22,x22,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x14,x14,x7,ror#61
+	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
+	add	x22,x22,x16			// h+=Sigma1(e)
+	eor	x19,x19,x24			// Maj(a,b,c)
+	eor	x17,x0,x23,ror#39	// Sigma0(a)
+	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
+	add	x9,x9,x2
+	add	x26,x26,x22			// d+=h
+	add	x22,x22,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x9,x9,x15
+	add	x22,x22,x17			// h+=Sigma0(a)
+	add	x9,x9,x14
+	ldr	x14,[sp,#24]
+	str	x1,[sp,#16]
+	ror	x16,x26,#14
+	add	x21,x21,x19			// h+=K[i]
+	ror	x0,x11,#1
+	and	x17,x27,x26
+	ror	x15,x8,#19
+	bic	x19,x20,x26
+	ror	x1,x22,#28
+	add	x21,x21,x9			// h+=X[i]
+	eor	x16,x16,x26,ror#18
+	eor	x0,x0,x11,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x22,x23			// a^b, b^c in next round
+	eor	x16,x16,x26,ror#41	// Sigma1(e)
+	eor	x1,x1,x22,ror#34
+	add	x21,x21,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x15,x15,x8,ror#61
+	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
+	add	x21,x21,x16			// h+=Sigma1(e)
+	eor	x28,x28,x23			// Maj(a,b,c)
+	eor	x17,x1,x22,ror#39	// Sigma0(a)
+	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
+	add	x10,x10,x3
+	add	x25,x25,x21			// d+=h
+	add	x21,x21,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x10,x10,x0
+	add	x21,x21,x17			// h+=Sigma0(a)
+	add	x10,x10,x15
+	ldr	x15,[sp,#0]
+	str	x2,[sp,#24]
+	ror	x16,x25,#14
+	add	x20,x20,x28			// h+=K[i]
+	ror	x1,x12,#1
+	and	x17,x26,x25
+	ror	x0,x9,#19
+	bic	x28,x27,x25
+	ror	x2,x21,#28
+	add	x20,x20,x10			// h+=X[i]
+	eor	x16,x16,x25,ror#18
+	eor	x1,x1,x12,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x21,x22			// a^b, b^c in next round
+	eor	x16,x16,x25,ror#41	// Sigma1(e)
+	eor	x2,x2,x21,ror#34
+	add	x20,x20,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x0,x0,x9,ror#61
+	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
+	add	x20,x20,x16			// h+=Sigma1(e)
+	eor	x19,x19,x22			// Maj(a,b,c)
+	eor	x17,x2,x21,ror#39	// Sigma0(a)
+	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
+	add	x11,x11,x4
+	add	x24,x24,x20			// d+=h
+	add	x20,x20,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x11,x11,x1
+	add	x20,x20,x17			// h+=Sigma0(a)
+	add	x11,x11,x0
+	ldr	x0,[sp,#8]
+	str	x3,[sp,#0]
+	ror	x16,x24,#14
+	add	x27,x27,x19			// h+=K[i]
+	ror	x2,x13,#1
+	and	x17,x25,x24
+	ror	x1,x10,#19
+	bic	x19,x26,x24
+	ror	x3,x20,#28
+	add	x27,x27,x11			// h+=X[i]
+	eor	x16,x16,x24,ror#18
+	eor	x2,x2,x13,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x20,x21			// a^b, b^c in next round
+	eor	x16,x16,x24,ror#41	// Sigma1(e)
+	eor	x3,x3,x20,ror#34
+	add	x27,x27,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x1,x1,x10,ror#61
+	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
+	add	x27,x27,x16			// h+=Sigma1(e)
+	eor	x28,x28,x21			// Maj(a,b,c)
+	eor	x17,x3,x20,ror#39	// Sigma0(a)
+	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
+	add	x12,x12,x5
+	add	x23,x23,x27			// d+=h
+	add	x27,x27,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x12,x12,x2
+	add	x27,x27,x17			// h+=Sigma0(a)
+	add	x12,x12,x1
+	ldr	x1,[sp,#16]
+	str	x4,[sp,#8]
+	ror	x16,x23,#14
+	add	x26,x26,x28			// h+=K[i]
+	ror	x3,x14,#1
+	and	x17,x24,x23
+	ror	x2,x11,#19
+	bic	x28,x25,x23
+	ror	x4,x27,#28
+	add	x26,x26,x12			// h+=X[i]
+	eor	x16,x16,x23,ror#18
+	eor	x3,x3,x14,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x27,x20			// a^b, b^c in next round
+	eor	x16,x16,x23,ror#41	// Sigma1(e)
+	eor	x4,x4,x27,ror#34
+	add	x26,x26,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x2,x2,x11,ror#61
+	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
+	add	x26,x26,x16			// h+=Sigma1(e)
+	eor	x19,x19,x20			// Maj(a,b,c)
+	eor	x17,x4,x27,ror#39	// Sigma0(a)
+	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
+	add	x13,x13,x6
+	add	x22,x22,x26			// d+=h
+	add	x26,x26,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x13,x13,x3
+	add	x26,x26,x17			// h+=Sigma0(a)
+	add	x13,x13,x2
+	ldr	x2,[sp,#24]
+	str	x5,[sp,#16]
+	ror	x16,x22,#14
+	add	x25,x25,x19			// h+=K[i]
+	ror	x4,x15,#1
+	and	x17,x23,x22
+	ror	x3,x12,#19
+	bic	x19,x24,x22
+	ror	x5,x26,#28
+	add	x25,x25,x13			// h+=X[i]
+	eor	x16,x16,x22,ror#18
+	eor	x4,x4,x15,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x26,x27			// a^b, b^c in next round
+	eor	x16,x16,x22,ror#41	// Sigma1(e)
+	eor	x5,x5,x26,ror#34
+	add	x25,x25,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x3,x3,x12,ror#61
+	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
+	add	x25,x25,x16			// h+=Sigma1(e)
+	eor	x28,x28,x27			// Maj(a,b,c)
+	eor	x17,x5,x26,ror#39	// Sigma0(a)
+	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
+	add	x14,x14,x7
+	add	x21,x21,x25			// d+=h
+	add	x25,x25,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x14,x14,x4
+	add	x25,x25,x17			// h+=Sigma0(a)
+	add	x14,x14,x3
+	ldr	x3,[sp,#0]
+	str	x6,[sp,#24]
+	ror	x16,x21,#14
+	add	x24,x24,x28			// h+=K[i]
+	ror	x5,x0,#1
+	and	x17,x22,x21
+	ror	x4,x13,#19
+	bic	x28,x23,x21
+	ror	x6,x25,#28
+	add	x24,x24,x14			// h+=X[i]
+	eor	x16,x16,x21,ror#18
+	eor	x5,x5,x0,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x25,x26			// a^b, b^c in next round
+	eor	x16,x16,x21,ror#41	// Sigma1(e)
+	eor	x6,x6,x25,ror#34
+	add	x24,x24,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x4,x4,x13,ror#61
+	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
+	add	x24,x24,x16			// h+=Sigma1(e)
+	eor	x19,x19,x26			// Maj(a,b,c)
+	eor	x17,x6,x25,ror#39	// Sigma0(a)
+	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
+	add	x15,x15,x8
+	add	x20,x20,x24			// d+=h
+	add	x24,x24,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x15,x15,x5
+	add	x24,x24,x17			// h+=Sigma0(a)
+	add	x15,x15,x4
+	ldr	x4,[sp,#8]
+	str	x7,[sp,#0]
+	ror	x16,x20,#14
+	add	x23,x23,x19			// h+=K[i]
+	ror	x6,x1,#1
+	and	x17,x21,x20
+	ror	x5,x14,#19
+	bic	x19,x22,x20
+	ror	x7,x24,#28
+	add	x23,x23,x15			// h+=X[i]
+	eor	x16,x16,x20,ror#18
+	eor	x6,x6,x1,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x24,x25			// a^b, b^c in next round
+	eor	x16,x16,x20,ror#41	// Sigma1(e)
+	eor	x7,x7,x24,ror#34
+	add	x23,x23,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x5,x5,x14,ror#61
+	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
+	add	x23,x23,x16			// h+=Sigma1(e)
+	eor	x28,x28,x25			// Maj(a,b,c)
+	eor	x17,x7,x24,ror#39	// Sigma0(a)
+	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
+	add	x0,x0,x9
+	add	x27,x27,x23			// d+=h
+	add	x23,x23,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x0,x0,x6
+	add	x23,x23,x17			// h+=Sigma0(a)
+	add	x0,x0,x5
+	ldr	x5,[sp,#16]
+	str	x8,[sp,#8]
+	ror	x16,x27,#14
+	add	x22,x22,x28			// h+=K[i]
+	ror	x7,x2,#1
+	and	x17,x20,x27
+	ror	x6,x15,#19
+	bic	x28,x21,x27
+	ror	x8,x23,#28
+	add	x22,x22,x0			// h+=X[i]
+	eor	x16,x16,x27,ror#18
+	eor	x7,x7,x2,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x23,x24			// a^b, b^c in next round
+	eor	x16,x16,x27,ror#41	// Sigma1(e)
+	eor	x8,x8,x23,ror#34
+	add	x22,x22,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x6,x6,x15,ror#61
+	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
+	add	x22,x22,x16			// h+=Sigma1(e)
+	eor	x19,x19,x24			// Maj(a,b,c)
+	eor	x17,x8,x23,ror#39	// Sigma0(a)
+	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
+	add	x1,x1,x10
+	add	x26,x26,x22			// d+=h
+	add	x22,x22,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x1,x1,x7
+	add	x22,x22,x17			// h+=Sigma0(a)
+	add	x1,x1,x6
+	ldr	x6,[sp,#24]
+	str	x9,[sp,#16]
+	ror	x16,x26,#14
+	add	x21,x21,x19			// h+=K[i]
+	ror	x8,x3,#1
+	and	x17,x27,x26
+	ror	x7,x0,#19
+	bic	x19,x20,x26
+	ror	x9,x22,#28
+	add	x21,x21,x1			// h+=X[i]
+	eor	x16,x16,x26,ror#18
+	eor	x8,x8,x3,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x22,x23			// a^b, b^c in next round
+	eor	x16,x16,x26,ror#41	// Sigma1(e)
+	eor	x9,x9,x22,ror#34
+	add	x21,x21,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x7,x7,x0,ror#61
+	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
+	add	x21,x21,x16			// h+=Sigma1(e)
+	eor	x28,x28,x23			// Maj(a,b,c)
+	eor	x17,x9,x22,ror#39	// Sigma0(a)
+	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
+	add	x2,x2,x11
+	add	x25,x25,x21			// d+=h
+	add	x21,x21,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x2,x2,x8
+	add	x21,x21,x17			// h+=Sigma0(a)
+	add	x2,x2,x7
+	ldr	x7,[sp,#0]
+	str	x10,[sp,#24]
+	ror	x16,x25,#14
+	add	x20,x20,x28			// h+=K[i]
+	ror	x9,x4,#1
+	and	x17,x26,x25
+	ror	x8,x1,#19
+	bic	x28,x27,x25
+	ror	x10,x21,#28
+	add	x20,x20,x2			// h+=X[i]
+	eor	x16,x16,x25,ror#18
+	eor	x9,x9,x4,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x21,x22			// a^b, b^c in next round
+	eor	x16,x16,x25,ror#41	// Sigma1(e)
+	eor	x10,x10,x21,ror#34
+	add	x20,x20,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x8,x8,x1,ror#61
+	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
+	add	x20,x20,x16			// h+=Sigma1(e)
+	eor	x19,x19,x22			// Maj(a,b,c)
+	eor	x17,x10,x21,ror#39	// Sigma0(a)
+	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
+	add	x3,x3,x12
+	add	x24,x24,x20			// d+=h
+	add	x20,x20,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x3,x3,x9
+	add	x20,x20,x17			// h+=Sigma0(a)
+	add	x3,x3,x8
+	cbnz	x19,Loop_16_xx
+
+	ldp	x0,x2,[x29,#96]
+	ldr	x1,[x29,#112]
+	sub	x30,x30,#648		// rewind
+
+	ldp	x3,x4,[x0]
+	ldp	x5,x6,[x0,#2*8]
+	add	x1,x1,#14*8			// advance input pointer
+	ldp	x7,x8,[x0,#4*8]
+	add	x20,x20,x3
+	ldp	x9,x10,[x0,#6*8]
+	add	x21,x21,x4
+	add	x22,x22,x5
+	add	x23,x23,x6
+	stp	x20,x21,[x0]
+	add	x24,x24,x7
+	add	x25,x25,x8
+	stp	x22,x23,[x0,#2*8]
+	add	x26,x26,x9
+	add	x27,x27,x10
+	cmp	x1,x2
+	stp	x24,x25,[x0,#4*8]
+	stp	x26,x27,[x0,#6*8]
+	b.ne	Loop
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#4*8
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#128
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+.section	.rodata
+.align	6
+
+LK512:
+.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad	0x3956c25bf348b538,0x59f111f1b605d019
+.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad	0xd807aa98a3030242,0x12835b0145706fbe
+.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad	0x06ca6351e003826f,0x142929670a0e6e70
+.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad	0x81c2c92e47edaee6,0x92722c851482353b
+.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad	0xd192e819d6ef5218,0xd69906245565a910
+.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad	0x90befffa23631e28,0xa4506cebde82bde9
+.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad	0xca273eceea26619c,0xd186b8c721c0c207
+.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad	0x113f9804bef90dae,0x1b710b35131c471b
+.quad	0x28db77f523047d84,0x32caab7b40c72493
+.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.quad	0	// terminator
+
+.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+.text
+#ifndef	__KERNEL__
+.globl	sha512_block_data_order_hw
+
+.def sha512_block_data_order_hw
+   .type 32
+.endef
+.align	6
+sha512_block_data_order_hw:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64	// load input
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+
+	ld1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// load context
+	adrp	x3,LK512
+	add	x3,x3,:lo12:LK512
+
+	rev64	v16.16b,v16.16b
+	rev64	v17.16b,v17.16b
+	rev64	v18.16b,v18.16b
+	rev64	v19.16b,v19.16b
+	rev64	v20.16b,v20.16b
+	rev64	v21.16b,v21.16b
+	rev64	v22.16b,v22.16b
+	rev64	v23.16b,v23.16b
+	b	Loop_hw
+
+.align	4
+Loop_hw:
+	ld1	{v24.2d},[x3],#16
+	subs	x2,x2,#1
+	sub	x4,x1,#128
+	orr	v26.16b,v0.16b,v0.16b			// offload
+	orr	v27.16b,v1.16b,v1.16b
+	orr	v28.16b,v2.16b,v2.16b
+	orr	v29.16b,v3.16b,v3.16b
+	csel	x1,x1,x4,ne			// conditional rewind
+	add	v24.2d,v24.2d,v16.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec08230	//sha512su0 v16.16b,v17.16b
+	ext	v7.16b,v20.16b,v21.16b,#8
+.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v25.2d,v25.2d,v17.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08251	//sha512su0 v17.16b,v18.16b
+	ext	v7.16b,v21.16b,v22.16b,#8
+.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	add	v24.2d,v24.2d,v18.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec08272	//sha512su0 v18.16b,v19.16b
+	ext	v7.16b,v22.16b,v23.16b,#8
+.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	add	v25.2d,v25.2d,v19.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08293	//sha512su0 v19.16b,v20.16b
+	ext	v7.16b,v23.16b,v16.16b,#8
+.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	add	v24.2d,v24.2d,v20.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
+	ext	v7.16b,v16.16b,v17.16b,#8
+.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v25.2d,v25.2d,v21.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
+	ext	v7.16b,v17.16b,v18.16b,#8
+.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v24.2d,v24.2d,v22.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
+	ext	v7.16b,v18.16b,v19.16b,#8
+.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	add	v25.2d,v25.2d,v23.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08217	//sha512su0 v23.16b,v16.16b
+	ext	v7.16b,v19.16b,v20.16b,#8
+.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	add	v24.2d,v24.2d,v16.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec08230	//sha512su0 v16.16b,v17.16b
+	ext	v7.16b,v20.16b,v21.16b,#8
+.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	add	v25.2d,v25.2d,v17.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08251	//sha512su0 v17.16b,v18.16b
+	ext	v7.16b,v21.16b,v22.16b,#8
+.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v24.2d,v24.2d,v18.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec08272	//sha512su0 v18.16b,v19.16b
+	ext	v7.16b,v22.16b,v23.16b,#8
+.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v25.2d,v25.2d,v19.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08293	//sha512su0 v19.16b,v20.16b
+	ext	v7.16b,v23.16b,v16.16b,#8
+.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	add	v24.2d,v24.2d,v20.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
+	ext	v7.16b,v16.16b,v17.16b,#8
+.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	add	v25.2d,v25.2d,v21.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
+	ext	v7.16b,v17.16b,v18.16b,#8
+.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	add	v24.2d,v24.2d,v22.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
+	ext	v7.16b,v18.16b,v19.16b,#8
+.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v25.2d,v25.2d,v23.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08217	//sha512su0 v23.16b,v16.16b
+	ext	v7.16b,v19.16b,v20.16b,#8
+.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v24.2d,v24.2d,v16.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec08230	//sha512su0 v16.16b,v17.16b
+	ext	v7.16b,v20.16b,v21.16b,#8
+.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	add	v25.2d,v25.2d,v17.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08251	//sha512su0 v17.16b,v18.16b
+	ext	v7.16b,v21.16b,v22.16b,#8
+.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	add	v24.2d,v24.2d,v18.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec08272	//sha512su0 v18.16b,v19.16b
+	ext	v7.16b,v22.16b,v23.16b,#8
+.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	add	v25.2d,v25.2d,v19.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08293	//sha512su0 v19.16b,v20.16b
+	ext	v7.16b,v23.16b,v16.16b,#8
+.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v24.2d,v24.2d,v20.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
+	ext	v7.16b,v16.16b,v17.16b,#8
+.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v25.2d,v25.2d,v21.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
+	ext	v7.16b,v17.16b,v18.16b,#8
+.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	add	v24.2d,v24.2d,v22.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
+	ext	v7.16b,v18.16b,v19.16b,#8
+.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	add	v25.2d,v25.2d,v23.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08217	//sha512su0 v23.16b,v16.16b
+	ext	v7.16b,v19.16b,v20.16b,#8
+.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	add	v24.2d,v24.2d,v16.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec08230	//sha512su0 v16.16b,v17.16b
+	ext	v7.16b,v20.16b,v21.16b,#8
+.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v25.2d,v25.2d,v17.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08251	//sha512su0 v17.16b,v18.16b
+	ext	v7.16b,v21.16b,v22.16b,#8
+.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v24.2d,v24.2d,v18.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec08272	//sha512su0 v18.16b,v19.16b
+	ext	v7.16b,v22.16b,v23.16b,#8
+.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	add	v25.2d,v25.2d,v19.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08293	//sha512su0 v19.16b,v20.16b
+	ext	v7.16b,v23.16b,v16.16b,#8
+.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	add	v24.2d,v24.2d,v20.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
+	ext	v7.16b,v16.16b,v17.16b,#8
+.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	add	v25.2d,v25.2d,v21.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
+	ext	v7.16b,v17.16b,v18.16b,#8
+.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v24.2d,v24.2d,v22.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
+	ext	v7.16b,v18.16b,v19.16b,#8
+.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v25.2d,v25.2d,v23.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08217	//sha512su0 v23.16b,v16.16b
+	ext	v7.16b,v19.16b,v20.16b,#8
+.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	ld1	{v25.2d},[x3],#16
+	add	v24.2d,v24.2d,v16.2d
+	ld1	{v16.16b},[x1],#16		// load next input
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+	rev64	v16.16b,v16.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	ld1	{v24.2d},[x3],#16
+	add	v25.2d,v25.2d,v17.2d
+	ld1	{v17.16b},[x1],#16		// load next input
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+	rev64	v17.16b,v17.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	ld1	{v25.2d},[x3],#16
+	add	v24.2d,v24.2d,v18.2d
+	ld1	{v18.16b},[x1],#16		// load next input
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+	rev64	v18.16b,v18.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	ld1	{v24.2d},[x3],#16
+	add	v25.2d,v25.2d,v19.2d
+	ld1	{v19.16b},[x1],#16		// load next input
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+	rev64	v19.16b,v19.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	ld1	{v25.2d},[x3],#16
+	add	v24.2d,v24.2d,v20.2d
+	ld1	{v20.16b},[x1],#16		// load next input
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+	rev64	v20.16b,v20.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	ld1	{v24.2d},[x3],#16
+	add	v25.2d,v25.2d,v21.2d
+	ld1	{v21.16b},[x1],#16		// load next input
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+	rev64	v21.16b,v21.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	ld1	{v25.2d},[x3],#16
+	add	v24.2d,v24.2d,v22.2d
+	ld1	{v22.16b},[x1],#16		// load next input
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+	rev64	v22.16b,v22.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	sub	x3,x3,#80*8	// rewind
+	add	v25.2d,v25.2d,v23.2d
+	ld1	{v23.16b},[x1],#16		// load next input
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+	rev64	v23.16b,v23.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v0.2d,v0.2d,v26.2d			// accumulate
+	add	v1.2d,v1.2d,v27.2d
+	add	v2.2d,v2.2d,v28.2d
+	add	v3.2d,v3.2d,v29.2d
+
+	cbnz	x2,Loop_hw
+
+	st1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// store context
+
+	ldr	x29,[sp],#16
+	ret
+
+#endif
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/bcm/sha512-x86_64-apple.S b/gen/bcm/sha512-x86_64-apple.S
new file mode 100644
index 0000000..58f27a4
--- /dev/null
+++ b/gen/bcm/sha512-x86_64-apple.S
@@ -0,0 +1,2978 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text	
+
+.globl	_sha512_block_data_order_nohw
+.private_extern _sha512_block_data_order_nohw
+
+.p2align	4
+_sha512_block_data_order_nohw:
+
+_CET_ENDBR
+	movq	%rsp,%rax
+
+	pushq	%rbx
+
+	pushq	%rbp
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+	shlq	$4,%rdx
+	subq	$128+32,%rsp
+	leaq	(%rsi,%rdx,8),%rdx
+	andq	$-64,%rsp
+	movq	%rdi,128+0(%rsp)
+	movq	%rsi,128+8(%rsp)
+	movq	%rdx,128+16(%rsp)
+	movq	%rax,152(%rsp)
+
+L$prologue:
+
+	movq	0(%rdi),%rax
+	movq	8(%rdi),%rbx
+	movq	16(%rdi),%rcx
+	movq	24(%rdi),%rdx
+	movq	32(%rdi),%r8
+	movq	40(%rdi),%r9
+	movq	48(%rdi),%r10
+	movq	56(%rdi),%r11
+	jmp	L$loop
+
+.p2align	4
+L$loop:
+	movq	%rbx,%rdi
+	leaq	K512(%rip),%rbp
+	xorq	%rcx,%rdi
+	movq	0(%rsi),%r12
+	movq	%r8,%r13
+	movq	%rax,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%r9,%r15
+
+	xorq	%r8,%r13
+	rorq	$5,%r14
+	xorq	%r10,%r15
+
+	movq	%r12,0(%rsp)
+	xorq	%rax,%r14
+	andq	%r8,%r15
+
+	rorq	$4,%r13
+	addq	%r11,%r12
+	xorq	%r10,%r15
+
+	rorq	$6,%r14
+	xorq	%r8,%r13
+	addq	%r15,%r12
+
+	movq	%rax,%r15
+	addq	(%rbp),%r12
+	xorq	%rax,%r14
+
+	xorq	%rbx,%r15
+	rorq	$14,%r13
+	movq	%rbx,%r11
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%r11
+	addq	%r12,%rdx
+	addq	%r12,%r11
+
+	leaq	8(%rbp),%rbp
+	addq	%r14,%r11
+	movq	8(%rsi),%r12
+	movq	%rdx,%r13
+	movq	%r11,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%r8,%rdi
+
+	xorq	%rdx,%r13
+	rorq	$5,%r14
+	xorq	%r9,%rdi
+
+	movq	%r12,8(%rsp)
+	xorq	%r11,%r14
+	andq	%rdx,%rdi
+
+	rorq	$4,%r13
+	addq	%r10,%r12
+	xorq	%r9,%rdi
+
+	rorq	$6,%r14
+	xorq	%rdx,%r13
+	addq	%rdi,%r12
+
+	movq	%r11,%rdi
+	addq	(%rbp),%r12
+	xorq	%r11,%r14
+
+	xorq	%rax,%rdi
+	rorq	$14,%r13
+	movq	%rax,%r10
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%r10
+	addq	%r12,%rcx
+	addq	%r12,%r10
+
+	leaq	24(%rbp),%rbp
+	addq	%r14,%r10
+	movq	16(%rsi),%r12
+	movq	%rcx,%r13
+	movq	%r10,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%rdx,%r15
+
+	xorq	%rcx,%r13
+	rorq	$5,%r14
+	xorq	%r8,%r15
+
+	movq	%r12,16(%rsp)
+	xorq	%r10,%r14
+	andq	%rcx,%r15
+
+	rorq	$4,%r13
+	addq	%r9,%r12
+	xorq	%r8,%r15
+
+	rorq	$6,%r14
+	xorq	%rcx,%r13
+	addq	%r15,%r12
+
+	movq	%r10,%r15
+	addq	(%rbp),%r12
+	xorq	%r10,%r14
+
+	xorq	%r11,%r15
+	rorq	$14,%r13
+	movq	%r11,%r9
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%r9
+	addq	%r12,%rbx
+	addq	%r12,%r9
+
+	leaq	8(%rbp),%rbp
+	addq	%r14,%r9
+	movq	24(%rsi),%r12
+	movq	%rbx,%r13
+	movq	%r9,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%rcx,%rdi
+
+	xorq	%rbx,%r13
+	rorq	$5,%r14
+	xorq	%rdx,%rdi
+
+	movq	%r12,24(%rsp)
+	xorq	%r9,%r14
+	andq	%rbx,%rdi
+
+	rorq	$4,%r13
+	addq	%r8,%r12
+	xorq	%rdx,%rdi
+
+	rorq	$6,%r14
+	xorq	%rbx,%r13
+	addq	%rdi,%r12
+
+	movq	%r9,%rdi
+	addq	(%rbp),%r12
+	xorq	%r9,%r14
+
+	xorq	%r10,%rdi
+	rorq	$14,%r13
+	movq	%r10,%r8
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%r8
+	addq	%r12,%rax
+	addq	%r12,%r8
+
+	leaq	24(%rbp),%rbp
+	addq	%r14,%r8
+	movq	32(%rsi),%r12
+	movq	%rax,%r13
+	movq	%r8,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%rbx,%r15
+
+	xorq	%rax,%r13
+	rorq	$5,%r14
+	xorq	%rcx,%r15
+
+	movq	%r12,32(%rsp)
+	xorq	%r8,%r14
+	andq	%rax,%r15
+
+	rorq	$4,%r13
+	addq	%rdx,%r12
+	xorq	%rcx,%r15
+
+	rorq	$6,%r14
+	xorq	%rax,%r13
+	addq	%r15,%r12
+
+	movq	%r8,%r15
+	addq	(%rbp),%r12
+	xorq	%r8,%r14
+
+	xorq	%r9,%r15
+	rorq	$14,%r13
+	movq	%r9,%rdx
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%rdx
+	addq	%r12,%r11
+	addq	%r12,%rdx
+
+	leaq	8(%rbp),%rbp
+	addq	%r14,%rdx
+	movq	40(%rsi),%r12
+	movq	%r11,%r13
+	movq	%rdx,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%rax,%rdi
+
+	xorq	%r11,%r13
+	rorq	$5,%r14
+	xorq	%rbx,%rdi
+
+	movq	%r12,40(%rsp)
+	xorq	%rdx,%r14
+	andq	%r11,%rdi
+
+	rorq	$4,%r13
+	addq	%rcx,%r12
+	xorq	%rbx,%rdi
+
+	rorq	$6,%r14
+	xorq	%r11,%r13
+	addq	%rdi,%r12
+
+	movq	%rdx,%rdi
+	addq	(%rbp),%r12
+	xorq	%rdx,%r14
+
+	xorq	%r8,%rdi
+	rorq	$14,%r13
+	movq	%r8,%rcx
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%rcx
+	addq	%r12,%r10
+	addq	%r12,%rcx
+
+	leaq	24(%rbp),%rbp
+	addq	%r14,%rcx
+	movq	48(%rsi),%r12
+	movq	%r10,%r13
+	movq	%rcx,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%r11,%r15
+
+	xorq	%r10,%r13
+	rorq	$5,%r14
+	xorq	%rax,%r15
+
+	movq	%r12,48(%rsp)
+	xorq	%rcx,%r14
+	andq	%r10,%r15
+
+	rorq	$4,%r13
+	addq	%rbx,%r12
+	xorq	%rax,%r15
+
+	rorq	$6,%r14
+	xorq	%r10,%r13
+	addq	%r15,%r12
+
+	movq	%rcx,%r15
+	addq	(%rbp),%r12
+	xorq	%rcx,%r14
+
+	xorq	%rdx,%r15
+	rorq	$14,%r13
+	movq	%rdx,%rbx
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%rbx
+	addq	%r12,%r9
+	addq	%r12,%rbx
+
+	leaq	8(%rbp),%rbp
+	addq	%r14,%rbx
+	movq	56(%rsi),%r12
+	movq	%r9,%r13
+	movq	%rbx,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%r10,%rdi
+
+	xorq	%r9,%r13
+	rorq	$5,%r14
+	xorq	%r11,%rdi
+
+	movq	%r12,56(%rsp)
+	xorq	%rbx,%r14
+	andq	%r9,%rdi
+
+	rorq	$4,%r13
+	addq	%rax,%r12
+	xorq	%r11,%rdi
+
+	rorq	$6,%r14
+	xorq	%r9,%r13
+	addq	%rdi,%r12
+
+	movq	%rbx,%rdi
+	addq	(%rbp),%r12
+	xorq	%rbx,%r14
+
+	xorq	%rcx,%rdi
+	rorq	$14,%r13
+	movq	%rcx,%rax
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%rax
+	addq	%r12,%r8
+	addq	%r12,%rax
+
+	leaq	24(%rbp),%rbp
+	addq	%r14,%rax
+	movq	64(%rsi),%r12
+	movq	%r8,%r13
+	movq	%rax,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%r9,%r15
+
+	xorq	%r8,%r13
+	rorq	$5,%r14
+	xorq	%r10,%r15
+
+	movq	%r12,64(%rsp)
+	xorq	%rax,%r14
+	andq	%r8,%r15
+
+	rorq	$4,%r13
+	addq	%r11,%r12
+	xorq	%r10,%r15
+
+	rorq	$6,%r14
+	xorq	%r8,%r13
+	addq	%r15,%r12
+
+	movq	%rax,%r15
+	addq	(%rbp),%r12
+	xorq	%rax,%r14
+
+	xorq	%rbx,%r15
+	rorq	$14,%r13
+	movq	%rbx,%r11
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%r11
+	addq	%r12,%rdx
+	addq	%r12,%r11
+
+	leaq	8(%rbp),%rbp
+	addq	%r14,%r11
+	movq	72(%rsi),%r12
+	movq	%rdx,%r13
+	movq	%r11,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%r8,%rdi
+
+	xorq	%rdx,%r13
+	rorq	$5,%r14
+	xorq	%r9,%rdi
+
+	movq	%r12,72(%rsp)
+	xorq	%r11,%r14
+	andq	%rdx,%rdi
+
+	rorq	$4,%r13
+	addq	%r10,%r12
+	xorq	%r9,%rdi
+
+	rorq	$6,%r14
+	xorq	%rdx,%r13
+	addq	%rdi,%r12
+
+	movq	%r11,%rdi
+	addq	(%rbp),%r12
+	xorq	%r11,%r14
+
+	xorq	%rax,%rdi
+	rorq	$14,%r13
+	movq	%rax,%r10
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%r10
+	addq	%r12,%rcx
+	addq	%r12,%r10
+
+	leaq	24(%rbp),%rbp
+	addq	%r14,%r10
+	movq	80(%rsi),%r12
+	movq	%rcx,%r13
+	movq	%r10,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%rdx,%r15
+
+	xorq	%rcx,%r13
+	rorq	$5,%r14
+	xorq	%r8,%r15
+
+	movq	%r12,80(%rsp)
+	xorq	%r10,%r14
+	andq	%rcx,%r15
+
+	rorq	$4,%r13
+	addq	%r9,%r12
+	xorq	%r8,%r15
+
+	rorq	$6,%r14
+	xorq	%rcx,%r13
+	addq	%r15,%r12
+
+	movq	%r10,%r15
+	addq	(%rbp),%r12
+	xorq	%r10,%r14
+
+	xorq	%r11,%r15
+	rorq	$14,%r13
+	movq	%r11,%r9
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%r9
+	addq	%r12,%rbx
+	addq	%r12,%r9
+
+	leaq	8(%rbp),%rbp
+	addq	%r14,%r9
+	movq	88(%rsi),%r12
+	movq	%rbx,%r13
+	movq	%r9,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%rcx,%rdi
+
+	xorq	%rbx,%r13
+	rorq	$5,%r14
+	xorq	%rdx,%rdi
+
+	movq	%r12,88(%rsp)
+	xorq	%r9,%r14
+	andq	%rbx,%rdi
+
+	rorq	$4,%r13
+	addq	%r8,%r12
+	xorq	%rdx,%rdi
+
+	rorq	$6,%r14
+	xorq	%rbx,%r13
+	addq	%rdi,%r12
+
+	movq	%r9,%rdi
+	addq	(%rbp),%r12
+	xorq	%r9,%r14
+
+	xorq	%r10,%rdi
+	rorq	$14,%r13
+	movq	%r10,%r8
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%r8
+	addq	%r12,%rax
+	addq	%r12,%r8
+
+	leaq	24(%rbp),%rbp
+	addq	%r14,%r8
+	movq	96(%rsi),%r12
+	movq	%rax,%r13
+	movq	%r8,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%rbx,%r15
+
+	xorq	%rax,%r13
+	rorq	$5,%r14
+	xorq	%rcx,%r15
+
+	movq	%r12,96(%rsp)
+	xorq	%r8,%r14
+	andq	%rax,%r15
+
+	rorq	$4,%r13
+	addq	%rdx,%r12
+	xorq	%rcx,%r15
+
+	rorq	$6,%r14
+	xorq	%rax,%r13
+	addq	%r15,%r12
+
+	movq	%r8,%r15
+	addq	(%rbp),%r12
+	xorq	%r8,%r14
+
+	xorq	%r9,%r15
+	rorq	$14,%r13
+	movq	%r9,%rdx
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%rdx
+	addq	%r12,%r11
+	addq	%r12,%rdx
+
+	leaq	8(%rbp),%rbp
+	addq	%r14,%rdx
+	movq	104(%rsi),%r12
+	movq	%r11,%r13
+	movq	%rdx,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%rax,%rdi
+
+	xorq	%r11,%r13
+	rorq	$5,%r14
+	xorq	%rbx,%rdi
+
+	movq	%r12,104(%rsp)
+	xorq	%rdx,%r14
+	andq	%r11,%rdi
+
+	rorq	$4,%r13
+	addq	%rcx,%r12
+	xorq	%rbx,%rdi
+
+	rorq	$6,%r14
+	xorq	%r11,%r13
+	addq	%rdi,%r12
+
+	movq	%rdx,%rdi
+	addq	(%rbp),%r12
+	xorq	%rdx,%r14
+
+	xorq	%r8,%rdi
+	rorq	$14,%r13
+	movq	%r8,%rcx
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%rcx
+	addq	%r12,%r10
+	addq	%r12,%rcx
+
+	leaq	24(%rbp),%rbp
+	addq	%r14,%rcx
+	movq	112(%rsi),%r12
+	movq	%r10,%r13
+	movq	%rcx,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%r11,%r15
+
+	xorq	%r10,%r13
+	rorq	$5,%r14
+	xorq	%rax,%r15
+
+	movq	%r12,112(%rsp)
+	xorq	%rcx,%r14
+	andq	%r10,%r15
+
+	rorq	$4,%r13
+	addq	%rbx,%r12
+	xorq	%rax,%r15
+
+	rorq	$6,%r14
+	xorq	%r10,%r13
+	addq	%r15,%r12
+
+	movq	%rcx,%r15
+	addq	(%rbp),%r12
+	xorq	%rcx,%r14
+
+	xorq	%rdx,%r15
+	rorq	$14,%r13
+	movq	%rdx,%rbx
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%rbx
+	addq	%r12,%r9
+	addq	%r12,%rbx
+
+	leaq	8(%rbp),%rbp
+	addq	%r14,%rbx
+	movq	120(%rsi),%r12
+	movq	%r9,%r13
+	movq	%rbx,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%r10,%rdi
+
+	xorq	%r9,%r13
+	rorq	$5,%r14
+	xorq	%r11,%rdi
+
+	movq	%r12,120(%rsp)
+	xorq	%rbx,%r14
+	andq	%r9,%rdi
+
+	rorq	$4,%r13
+	addq	%rax,%r12
+	xorq	%r11,%rdi
+
+	rorq	$6,%r14
+	xorq	%r9,%r13
+	addq	%rdi,%r12
+
+	movq	%rbx,%rdi
+	addq	(%rbp),%r12
+	xorq	%rbx,%r14
+
+	xorq	%rcx,%rdi
+	rorq	$14,%r13
+	movq	%rcx,%rax
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%rax
+	addq	%r12,%r8
+	addq	%r12,%rax
+
+	leaq	24(%rbp),%rbp
+	jmp	L$rounds_16_xx
+.p2align	4
+L$rounds_16_xx:
+	movq	8(%rsp),%r13
+	movq	112(%rsp),%r15
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%rax
+	movq	%r15,%r14
+	rorq	$42,%r15
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%r15
+	shrq	$6,%r14
+
+	rorq	$19,%r15
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	72(%rsp),%r12
+
+	addq	0(%rsp),%r12
+	movq	%r8,%r13
+	addq	%r15,%r12
+	movq	%rax,%r14
+	rorq	$23,%r13
+	movq	%r9,%r15
+
+	xorq	%r8,%r13
+	rorq	$5,%r14
+	xorq	%r10,%r15
+
+	movq	%r12,0(%rsp)
+	xorq	%rax,%r14
+	andq	%r8,%r15
+
+	rorq	$4,%r13
+	addq	%r11,%r12
+	xorq	%r10,%r15
+
+	rorq	$6,%r14
+	xorq	%r8,%r13
+	addq	%r15,%r12
+
+	movq	%rax,%r15
+	addq	(%rbp),%r12
+	xorq	%rax,%r14
+
+	xorq	%rbx,%r15
+	rorq	$14,%r13
+	movq	%rbx,%r11
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%r11
+	addq	%r12,%rdx
+	addq	%r12,%r11
+
+	leaq	8(%rbp),%rbp
+	movq	16(%rsp),%r13
+	movq	120(%rsp),%rdi
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%r11
+	movq	%rdi,%r14
+	rorq	$42,%rdi
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
+	shrq	$6,%r14
+
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	80(%rsp),%r12
+
+	addq	8(%rsp),%r12
+	movq	%rdx,%r13
+	addq	%rdi,%r12
+	movq	%r11,%r14
+	rorq	$23,%r13
+	movq	%r8,%rdi
+
+	xorq	%rdx,%r13
+	rorq	$5,%r14
+	xorq	%r9,%rdi
+
+	movq	%r12,8(%rsp)
+	xorq	%r11,%r14
+	andq	%rdx,%rdi
+
+	rorq	$4,%r13
+	addq	%r10,%r12
+	xorq	%r9,%rdi
+
+	rorq	$6,%r14
+	xorq	%rdx,%r13
+	addq	%rdi,%r12
+
+	movq	%r11,%rdi
+	addq	(%rbp),%r12
+	xorq	%r11,%r14
+
+	xorq	%rax,%rdi
+	rorq	$14,%r13
+	movq	%rax,%r10
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%r10
+	addq	%r12,%rcx
+	addq	%r12,%r10
+
+	leaq	24(%rbp),%rbp
+	movq	24(%rsp),%r13
+	movq	0(%rsp),%r15
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%r10
+	movq	%r15,%r14
+	rorq	$42,%r15
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%r15
+	shrq	$6,%r14
+
+	rorq	$19,%r15
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	88(%rsp),%r12
+
+	addq	16(%rsp),%r12
+	movq	%rcx,%r13
+	addq	%r15,%r12
+	movq	%r10,%r14
+	rorq	$23,%r13
+	movq	%rdx,%r15
+
+	xorq	%rcx,%r13
+	rorq	$5,%r14
+	xorq	%r8,%r15
+
+	movq	%r12,16(%rsp)
+	xorq	%r10,%r14
+	andq	%rcx,%r15
+
+	rorq	$4,%r13
+	addq	%r9,%r12
+	xorq	%r8,%r15
+
+	rorq	$6,%r14
+	xorq	%rcx,%r13
+	addq	%r15,%r12
+
+	movq	%r10,%r15
+	addq	(%rbp),%r12
+	xorq	%r10,%r14
+
+	xorq	%r11,%r15
+	rorq	$14,%r13
+	movq	%r11,%r9
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%r9
+	addq	%r12,%rbx
+	addq	%r12,%r9
+
+	leaq	8(%rbp),%rbp
+	movq	32(%rsp),%r13
+	movq	8(%rsp),%rdi
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%r9
+	movq	%rdi,%r14
+	rorq	$42,%rdi
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
+	shrq	$6,%r14
+
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	96(%rsp),%r12
+
+	addq	24(%rsp),%r12
+	movq	%rbx,%r13
+	addq	%rdi,%r12
+	movq	%r9,%r14
+	rorq	$23,%r13
+	movq	%rcx,%rdi
+
+	xorq	%rbx,%r13
+	rorq	$5,%r14
+	xorq	%rdx,%rdi
+
+	movq	%r12,24(%rsp)
+	xorq	%r9,%r14
+	andq	%rbx,%rdi
+
+	rorq	$4,%r13
+	addq	%r8,%r12
+	xorq	%rdx,%rdi
+
+	rorq	$6,%r14
+	xorq	%rbx,%r13
+	addq	%rdi,%r12
+
+	movq	%r9,%rdi
+	addq	(%rbp),%r12
+	xorq	%r9,%r14
+
+	xorq	%r10,%rdi
+	rorq	$14,%r13
+	movq	%r10,%r8
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%r8
+	addq	%r12,%rax
+	addq	%r12,%r8
+
+	leaq	24(%rbp),%rbp
+	movq	40(%rsp),%r13
+	movq	16(%rsp),%r15
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%r8
+	movq	%r15,%r14
+	rorq	$42,%r15
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%r15
+	shrq	$6,%r14
+
+	rorq	$19,%r15
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	104(%rsp),%r12
+
+	addq	32(%rsp),%r12
+	movq	%rax,%r13
+	addq	%r15,%r12
+	movq	%r8,%r14
+	rorq	$23,%r13
+	movq	%rbx,%r15
+
+	xorq	%rax,%r13
+	rorq	$5,%r14
+	xorq	%rcx,%r15
+
+	movq	%r12,32(%rsp)
+	xorq	%r8,%r14
+	andq	%rax,%r15
+
+	rorq	$4,%r13
+	addq	%rdx,%r12
+	xorq	%rcx,%r15
+
+	rorq	$6,%r14
+	xorq	%rax,%r13
+	addq	%r15,%r12
+
+	movq	%r8,%r15
+	addq	(%rbp),%r12
+	xorq	%r8,%r14
+
+	xorq	%r9,%r15
+	rorq	$14,%r13
+	movq	%r9,%rdx
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%rdx
+	addq	%r12,%r11
+	addq	%r12,%rdx
+
+	leaq	8(%rbp),%rbp
+	movq	48(%rsp),%r13
+	movq	24(%rsp),%rdi
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%rdx
+	movq	%rdi,%r14
+	rorq	$42,%rdi
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
+	shrq	$6,%r14
+
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	112(%rsp),%r12
+
+	addq	40(%rsp),%r12
+	movq	%r11,%r13
+	addq	%rdi,%r12
+	movq	%rdx,%r14
+	rorq	$23,%r13
+	movq	%rax,%rdi
+
+	xorq	%r11,%r13
+	rorq	$5,%r14
+	xorq	%rbx,%rdi
+
+	movq	%r12,40(%rsp)
+	xorq	%rdx,%r14
+	andq	%r11,%rdi
+
+	rorq	$4,%r13
+	addq	%rcx,%r12
+	xorq	%rbx,%rdi
+
+	rorq	$6,%r14
+	xorq	%r11,%r13
+	addq	%rdi,%r12
+
+	movq	%rdx,%rdi
+	addq	(%rbp),%r12
+	xorq	%rdx,%r14
+
+	xorq	%r8,%rdi
+	rorq	$14,%r13
+	movq	%r8,%rcx
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%rcx
+	addq	%r12,%r10
+	addq	%r12,%rcx
+
+	leaq	24(%rbp),%rbp
+	movq	56(%rsp),%r13
+	movq	32(%rsp),%r15
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%rcx
+	movq	%r15,%r14
+	rorq	$42,%r15
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%r15
+	shrq	$6,%r14
+
+	rorq	$19,%r15
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	120(%rsp),%r12
+
+	addq	48(%rsp),%r12
+	movq	%r10,%r13
+	addq	%r15,%r12
+	movq	%rcx,%r14
+	rorq	$23,%r13
+	movq	%r11,%r15
+
+	xorq	%r10,%r13
+	rorq	$5,%r14
+	xorq	%rax,%r15
+
+	movq	%r12,48(%rsp)
+	xorq	%rcx,%r14
+	andq	%r10,%r15
+
+	rorq	$4,%r13
+	addq	%rbx,%r12
+	xorq	%rax,%r15
+
+	rorq	$6,%r14
+	xorq	%r10,%r13
+	addq	%r15,%r12
+
+	movq	%rcx,%r15
+	addq	(%rbp),%r12
+	xorq	%rcx,%r14
+
+	xorq	%rdx,%r15
+	rorq	$14,%r13
+	movq	%rdx,%rbx
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%rbx
+	addq	%r12,%r9
+	addq	%r12,%rbx
+
+	leaq	8(%rbp),%rbp
+	movq	64(%rsp),%r13
+	movq	40(%rsp),%rdi
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%rbx
+	movq	%rdi,%r14
+	rorq	$42,%rdi
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
+	shrq	$6,%r14
+
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	0(%rsp),%r12
+
+	addq	56(%rsp),%r12
+	movq	%r9,%r13
+	addq	%rdi,%r12
+	movq	%rbx,%r14
+	rorq	$23,%r13
+	movq	%r10,%rdi
+
+	xorq	%r9,%r13
+	rorq	$5,%r14
+	xorq	%r11,%rdi
+
+	movq	%r12,56(%rsp)
+	xorq	%rbx,%r14
+	andq	%r9,%rdi
+
+	rorq	$4,%r13
+	addq	%rax,%r12
+	xorq	%r11,%rdi
+
+	rorq	$6,%r14
+	xorq	%r9,%r13
+	addq	%rdi,%r12
+
+	movq	%rbx,%rdi
+	addq	(%rbp),%r12
+	xorq	%rbx,%r14
+
+	xorq	%rcx,%rdi
+	rorq	$14,%r13
+	movq	%rcx,%rax
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%rax
+	addq	%r12,%r8
+	addq	%r12,%rax
+
+	leaq	24(%rbp),%rbp
+	movq	72(%rsp),%r13
+	movq	48(%rsp),%r15
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%rax
+	movq	%r15,%r14
+	rorq	$42,%r15
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%r15
+	shrq	$6,%r14
+
+	rorq	$19,%r15
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	8(%rsp),%r12
+
+	addq	64(%rsp),%r12
+	movq	%r8,%r13
+	addq	%r15,%r12
+	movq	%rax,%r14
+	rorq	$23,%r13
+	movq	%r9,%r15
+
+	xorq	%r8,%r13
+	rorq	$5,%r14
+	xorq	%r10,%r15
+
+	movq	%r12,64(%rsp)
+	xorq	%rax,%r14
+	andq	%r8,%r15
+
+	rorq	$4,%r13
+	addq	%r11,%r12
+	xorq	%r10,%r15
+
+	rorq	$6,%r14
+	xorq	%r8,%r13
+	addq	%r15,%r12
+
+	movq	%rax,%r15
+	addq	(%rbp),%r12
+	xorq	%rax,%r14
+
+	xorq	%rbx,%r15
+	rorq	$14,%r13
+	movq	%rbx,%r11
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%r11
+	addq	%r12,%rdx
+	addq	%r12,%r11
+
+	leaq	8(%rbp),%rbp
+	movq	80(%rsp),%r13
+	movq	56(%rsp),%rdi
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%r11
+	movq	%rdi,%r14
+	rorq	$42,%rdi
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
+	shrq	$6,%r14
+
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	16(%rsp),%r12
+
+	addq	72(%rsp),%r12
+	movq	%rdx,%r13
+	addq	%rdi,%r12
+	movq	%r11,%r14
+	rorq	$23,%r13
+	movq	%r8,%rdi
+
+	xorq	%rdx,%r13
+	rorq	$5,%r14
+	xorq	%r9,%rdi
+
+	movq	%r12,72(%rsp)
+	xorq	%r11,%r14
+	andq	%rdx,%rdi
+
+	rorq	$4,%r13
+	addq	%r10,%r12
+	xorq	%r9,%rdi
+
+	rorq	$6,%r14
+	xorq	%rdx,%r13
+	addq	%rdi,%r12
+
+	movq	%r11,%rdi
+	addq	(%rbp),%r12
+	xorq	%r11,%r14
+
+	xorq	%rax,%rdi
+	rorq	$14,%r13
+	movq	%rax,%r10
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%r10
+	addq	%r12,%rcx
+	addq	%r12,%r10
+
+	leaq	24(%rbp),%rbp
+	movq	88(%rsp),%r13
+	movq	64(%rsp),%r15
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%r10
+	movq	%r15,%r14
+	rorq	$42,%r15
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%r15
+	shrq	$6,%r14
+
+	rorq	$19,%r15
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	24(%rsp),%r12
+
+	addq	80(%rsp),%r12
+	movq	%rcx,%r13
+	addq	%r15,%r12
+	movq	%r10,%r14
+	rorq	$23,%r13
+	movq	%rdx,%r15
+
+	xorq	%rcx,%r13
+	rorq	$5,%r14
+	xorq	%r8,%r15
+
+	movq	%r12,80(%rsp)
+	xorq	%r10,%r14
+	andq	%rcx,%r15
+
+	rorq	$4,%r13
+	addq	%r9,%r12
+	xorq	%r8,%r15
+
+	rorq	$6,%r14
+	xorq	%rcx,%r13
+	addq	%r15,%r12
+
+	movq	%r10,%r15
+	addq	(%rbp),%r12
+	xorq	%r10,%r14
+
+	xorq	%r11,%r15
+	rorq	$14,%r13
+	movq	%r11,%r9
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%r9
+	addq	%r12,%rbx
+	addq	%r12,%r9
+
+	leaq	8(%rbp),%rbp
+	movq	96(%rsp),%r13
+	movq	72(%rsp),%rdi
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%r9
+	movq	%rdi,%r14
+	rorq	$42,%rdi
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
+	shrq	$6,%r14
+
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	32(%rsp),%r12
+
+	addq	88(%rsp),%r12
+	movq	%rbx,%r13
+	addq	%rdi,%r12
+	movq	%r9,%r14
+	rorq	$23,%r13
+	movq	%rcx,%rdi
+
+	xorq	%rbx,%r13
+	rorq	$5,%r14
+	xorq	%rdx,%rdi
+
+	movq	%r12,88(%rsp)
+	xorq	%r9,%r14
+	andq	%rbx,%rdi
+
+	rorq	$4,%r13
+	addq	%r8,%r12
+	xorq	%rdx,%rdi
+
+	rorq	$6,%r14
+	xorq	%rbx,%r13
+	addq	%rdi,%r12
+
+	movq	%r9,%rdi
+	addq	(%rbp),%r12
+	xorq	%r9,%r14
+
+	xorq	%r10,%rdi
+	rorq	$14,%r13
+	movq	%r10,%r8
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%r8
+	addq	%r12,%rax
+	addq	%r12,%r8
+
+	leaq	24(%rbp),%rbp
+	movq	104(%rsp),%r13
+	movq	80(%rsp),%r15
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%r8
+	movq	%r15,%r14
+	rorq	$42,%r15
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%r15
+	shrq	$6,%r14
+
+	rorq	$19,%r15
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	40(%rsp),%r12
+
+	addq	96(%rsp),%r12
+	movq	%rax,%r13
+	addq	%r15,%r12
+	movq	%r8,%r14
+	rorq	$23,%r13
+	movq	%rbx,%r15
+
+	xorq	%rax,%r13
+	rorq	$5,%r14
+	xorq	%rcx,%r15
+
+	movq	%r12,96(%rsp)
+	xorq	%r8,%r14
+	andq	%rax,%r15
+
+	rorq	$4,%r13
+	addq	%rdx,%r12
+	xorq	%rcx,%r15
+
+	rorq	$6,%r14
+	xorq	%rax,%r13
+	addq	%r15,%r12
+
+	movq	%r8,%r15
+	addq	(%rbp),%r12
+	xorq	%r8,%r14
+
+	xorq	%r9,%r15
+	rorq	$14,%r13
+	movq	%r9,%rdx
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%rdx
+	addq	%r12,%r11
+	addq	%r12,%rdx
+
+	leaq	8(%rbp),%rbp
+	movq	112(%rsp),%r13
+	movq	88(%rsp),%rdi
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%rdx
+	movq	%rdi,%r14
+	rorq	$42,%rdi
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
+	shrq	$6,%r14
+
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	48(%rsp),%r12
+
+	addq	104(%rsp),%r12
+	movq	%r11,%r13
+	addq	%rdi,%r12
+	movq	%rdx,%r14
+	rorq	$23,%r13
+	movq	%rax,%rdi
+
+	xorq	%r11,%r13
+	rorq	$5,%r14
+	xorq	%rbx,%rdi
+
+	movq	%r12,104(%rsp)
+	xorq	%rdx,%r14
+	andq	%r11,%rdi
+
+	rorq	$4,%r13
+	addq	%rcx,%r12
+	xorq	%rbx,%rdi
+
+	rorq	$6,%r14
+	xorq	%r11,%r13
+	addq	%rdi,%r12
+
+	movq	%rdx,%rdi
+	addq	(%rbp),%r12
+	xorq	%rdx,%r14
+
+	xorq	%r8,%rdi
+	rorq	$14,%r13
+	movq	%r8,%rcx
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%rcx
+	addq	%r12,%r10
+	addq	%r12,%rcx
+
+	leaq	24(%rbp),%rbp
+	movq	120(%rsp),%r13
+	movq	96(%rsp),%r15
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%rcx
+	movq	%r15,%r14
+	rorq	$42,%r15
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%r15
+	shrq	$6,%r14
+
+	rorq	$19,%r15
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	56(%rsp),%r12
+
+	addq	112(%rsp),%r12
+	movq	%r10,%r13
+	addq	%r15,%r12
+	movq	%rcx,%r14
+	rorq	$23,%r13
+	movq	%r11,%r15
+
+	xorq	%r10,%r13
+	rorq	$5,%r14
+	xorq	%rax,%r15
+
+	movq	%r12,112(%rsp)
+	xorq	%rcx,%r14
+	andq	%r10,%r15
+
+	rorq	$4,%r13
+	addq	%rbx,%r12
+	xorq	%rax,%r15
+
+	rorq	$6,%r14
+	xorq	%r10,%r13
+	addq	%r15,%r12
+
+	movq	%rcx,%r15
+	addq	(%rbp),%r12
+	xorq	%rcx,%r14
+
+	xorq	%rdx,%r15
+	rorq	$14,%r13
+	movq	%rdx,%rbx
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%rbx
+	addq	%r12,%r9
+	addq	%r12,%rbx
+
+	leaq	8(%rbp),%rbp
+	movq	0(%rsp),%r13
+	movq	104(%rsp),%rdi
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%rbx
+	movq	%rdi,%r14
+	rorq	$42,%rdi
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
+	shrq	$6,%r14
+
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	64(%rsp),%r12
+
+	addq	120(%rsp),%r12
+	movq	%r9,%r13
+	addq	%rdi,%r12
+	movq	%rbx,%r14
+	rorq	$23,%r13
+	movq	%r10,%rdi
+
+	xorq	%r9,%r13
+	rorq	$5,%r14
+	xorq	%r11,%rdi
+
+	movq	%r12,120(%rsp)
+	xorq	%rbx,%r14
+	andq	%r9,%rdi
+
+	rorq	$4,%r13
+	addq	%rax,%r12
+	xorq	%r11,%rdi
+
+	rorq	$6,%r14
+	xorq	%r9,%r13
+	addq	%rdi,%r12
+
+	movq	%rbx,%rdi
+	addq	(%rbp),%r12
+	xorq	%rbx,%r14
+
+	xorq	%rcx,%rdi
+	rorq	$14,%r13
+	movq	%rcx,%rax
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%rax
+	addq	%r12,%r8
+	addq	%r12,%rax
+
+	leaq	24(%rbp),%rbp
+	cmpb	$0,7(%rbp)
+	jnz	L$rounds_16_xx
+
+	movq	128+0(%rsp),%rdi
+	addq	%r14,%rax
+	leaq	128(%rsi),%rsi
+
+	addq	0(%rdi),%rax
+	addq	8(%rdi),%rbx
+	addq	16(%rdi),%rcx
+	addq	24(%rdi),%rdx
+	addq	32(%rdi),%r8
+	addq	40(%rdi),%r9
+	addq	48(%rdi),%r10
+	addq	56(%rdi),%r11
+
+	cmpq	128+16(%rsp),%rsi
+
+	movq	%rax,0(%rdi)
+	movq	%rbx,8(%rdi)
+	movq	%rcx,16(%rdi)
+	movq	%rdx,24(%rdi)
+	movq	%r8,32(%rdi)
+	movq	%r9,40(%rdi)
+	movq	%r10,48(%rdi)
+	movq	%r11,56(%rdi)
+	jb	L$loop
+
+	movq	152(%rsp),%rsi
+
+	movq	-48(%rsi),%r15
+
+	movq	-40(%rsi),%r14
+
+	movq	-32(%rsi),%r13
+
+	movq	-24(%rsi),%r12
+
+	movq	-16(%rsi),%rbp
+
+	movq	-8(%rsi),%rbx
+
+	leaq	(%rsi),%rsp
+
+L$epilogue:
+	ret
+
+
+.section	__DATA,__const
+.p2align	6
+
+K512:
+.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad	0x3956c25bf348b538,0x59f111f1b605d019
+.quad	0x3956c25bf348b538,0x59f111f1b605d019
+.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad	0xd807aa98a3030242,0x12835b0145706fbe
+.quad	0xd807aa98a3030242,0x12835b0145706fbe
+.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad	0x06ca6351e003826f,0x142929670a0e6e70
+.quad	0x06ca6351e003826f,0x142929670a0e6e70
+.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad	0x81c2c92e47edaee6,0x92722c851482353b
+.quad	0x81c2c92e47edaee6,0x92722c851482353b
+.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad	0xd192e819d6ef5218,0xd69906245565a910
+.quad	0xd192e819d6ef5218,0xd69906245565a910
+.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad	0x90befffa23631e28,0xa4506cebde82bde9
+.quad	0x90befffa23631e28,0xa4506cebde82bde9
+.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad	0xca273eceea26619c,0xd186b8c721c0c207
+.quad	0xca273eceea26619c,0xd186b8c721c0c207
+.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad	0x113f9804bef90dae,0x1b710b35131c471b
+.quad	0x113f9804bef90dae,0x1b710b35131c471b
+.quad	0x28db77f523047d84,0x32caab7b40c72493
+.quad	0x28db77f523047d84,0x32caab7b40c72493
+.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+.quad	0x0001020304050607,0x08090a0b0c0d0e0f
+.quad	0x0001020304050607,0x08090a0b0c0d0e0f
+.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.text	
+.globl	_sha512_block_data_order_avx
+.private_extern _sha512_block_data_order_avx
+
+.p2align	6
+_sha512_block_data_order_avx:
+
+_CET_ENDBR
+	movq	%rsp,%rax
+
+	pushq	%rbx
+
+	pushq	%rbp
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+	shlq	$4,%rdx
+	subq	$160,%rsp
+	leaq	(%rsi,%rdx,8),%rdx
+	andq	$-64,%rsp
+	movq	%rdi,128+0(%rsp)
+	movq	%rsi,128+8(%rsp)
+	movq	%rdx,128+16(%rsp)
+	movq	%rax,152(%rsp)
+
+L$prologue_avx:
+
+	vzeroupper
+	movq	0(%rdi),%rax
+	movq	8(%rdi),%rbx
+	movq	16(%rdi),%rcx
+	movq	24(%rdi),%rdx
+	movq	32(%rdi),%r8
+	movq	40(%rdi),%r9
+	movq	48(%rdi),%r10
+	movq	56(%rdi),%r11
+	jmp	L$loop_avx
+.p2align	4
+L$loop_avx:
+	vmovdqa	K512+1280(%rip),%xmm11
+	vmovdqu	0(%rsi),%xmm0
+	leaq	K512+128(%rip),%rbp
+	vmovdqu	16(%rsi),%xmm1
+	vmovdqu	32(%rsi),%xmm2
+	vpshufb	%xmm11,%xmm0,%xmm0
+	vmovdqu	48(%rsi),%xmm3
+	vpshufb	%xmm11,%xmm1,%xmm1
+	vmovdqu	64(%rsi),%xmm4
+	vpshufb	%xmm11,%xmm2,%xmm2
+	vmovdqu	80(%rsi),%xmm5
+	vpshufb	%xmm11,%xmm3,%xmm3
+	vmovdqu	96(%rsi),%xmm6
+	vpshufb	%xmm11,%xmm4,%xmm4
+	vmovdqu	112(%rsi),%xmm7
+	vpshufb	%xmm11,%xmm5,%xmm5
+	vpaddq	-128(%rbp),%xmm0,%xmm8
+	vpshufb	%xmm11,%xmm6,%xmm6
+	vpaddq	-96(%rbp),%xmm1,%xmm9
+	vpshufb	%xmm11,%xmm7,%xmm7
+	vpaddq	-64(%rbp),%xmm2,%xmm10
+	vpaddq	-32(%rbp),%xmm3,%xmm11
+	vmovdqa	%xmm8,0(%rsp)
+	vpaddq	0(%rbp),%xmm4,%xmm8
+	vmovdqa	%xmm9,16(%rsp)
+	vpaddq	32(%rbp),%xmm5,%xmm9
+	vmovdqa	%xmm10,32(%rsp)
+	vpaddq	64(%rbp),%xmm6,%xmm10
+	vmovdqa	%xmm11,48(%rsp)
+	vpaddq	96(%rbp),%xmm7,%xmm11
+	vmovdqa	%xmm8,64(%rsp)
+	movq	%rax,%r14
+	vmovdqa	%xmm9,80(%rsp)
+	movq	%rbx,%rdi
+	vmovdqa	%xmm10,96(%rsp)
+	xorq	%rcx,%rdi
+	vmovdqa	%xmm11,112(%rsp)
+	movq	%r8,%r13
+	jmp	L$avx_00_47
+
+.p2align	4
+L$avx_00_47:
+	addq	$256,%rbp
+	vpalignr	$8,%xmm0,%xmm1,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rax
+	vpalignr	$8,%xmm4,%xmm5,%xmm11
+	movq	%r9,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%r8,%r13
+	xorq	%r10,%r12
+	vpaddq	%xmm11,%xmm0,%xmm0
+	shrdq	$4,%r13,%r13
+	xorq	%rax,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%r8,%r12
+	xorq	%r8,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	0(%rsp),%r11
+	movq	%rax,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%r10,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%rbx,%r15
+	addq	%r12,%r11
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%rax,%r14
+	addq	%r13,%r11
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%rbx,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm7,%xmm11
+	addq	%r11,%rdx
+	addq	%rdi,%r11
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%rdx,%r13
+	addq	%r11,%r14
+	vpsllq	$3,%xmm7,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r11
+	vpaddq	%xmm8,%xmm0,%xmm0
+	movq	%r8,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm7,%xmm9
+	xorq	%rdx,%r13
+	xorq	%r9,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%r11,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%rdx,%r12
+	xorq	%rdx,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	8(%rsp),%r10
+	movq	%r11,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%r9,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%rax,%rdi
+	addq	%r12,%r10
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm0,%xmm0
+	xorq	%r11,%r14
+	addq	%r13,%r10
+	vpaddq	-128(%rbp),%xmm0,%xmm10
+	xorq	%rax,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r10,%rcx
+	addq	%r15,%r10
+	movq	%rcx,%r13
+	addq	%r10,%r14
+	vmovdqa	%xmm10,0(%rsp)
+	vpalignr	$8,%xmm1,%xmm2,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r10
+	vpalignr	$8,%xmm5,%xmm6,%xmm11
+	movq	%rdx,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%rcx,%r13
+	xorq	%r8,%r12
+	vpaddq	%xmm11,%xmm1,%xmm1
+	shrdq	$4,%r13,%r13
+	xorq	%r10,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%rcx,%r12
+	xorq	%rcx,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	16(%rsp),%r9
+	movq	%r10,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%r8,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%r11,%r15
+	addq	%r12,%r9
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%r10,%r14
+	addq	%r13,%r9
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%r11,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm0,%xmm11
+	addq	%r9,%rbx
+	addq	%rdi,%r9
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%rbx,%r13
+	addq	%r9,%r14
+	vpsllq	$3,%xmm0,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r9
+	vpaddq	%xmm8,%xmm1,%xmm1
+	movq	%rcx,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm0,%xmm9
+	xorq	%rbx,%r13
+	xorq	%rdx,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%r9,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%rbx,%r12
+	xorq	%rbx,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	24(%rsp),%r8
+	movq	%r9,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%rdx,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%r10,%rdi
+	addq	%r12,%r8
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm1,%xmm1
+	xorq	%r9,%r14
+	addq	%r13,%r8
+	vpaddq	-96(%rbp),%xmm1,%xmm10
+	xorq	%r10,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r8,%rax
+	addq	%r15,%r8
+	movq	%rax,%r13
+	addq	%r8,%r14
+	vmovdqa	%xmm10,16(%rsp)
+	vpalignr	$8,%xmm2,%xmm3,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r8
+	vpalignr	$8,%xmm6,%xmm7,%xmm11
+	movq	%rbx,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%rax,%r13
+	xorq	%rcx,%r12
+	vpaddq	%xmm11,%xmm2,%xmm2
+	shrdq	$4,%r13,%r13
+	xorq	%r8,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%rax,%r12
+	xorq	%rax,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	32(%rsp),%rdx
+	movq	%r8,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%rcx,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%r9,%r15
+	addq	%r12,%rdx
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%r8,%r14
+	addq	%r13,%rdx
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%r9,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm1,%xmm11
+	addq	%rdx,%r11
+	addq	%rdi,%rdx
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%r11,%r13
+	addq	%rdx,%r14
+	vpsllq	$3,%xmm1,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rdx
+	vpaddq	%xmm8,%xmm2,%xmm2
+	movq	%rax,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm1,%xmm9
+	xorq	%r11,%r13
+	xorq	%rbx,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%rdx,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%r11,%r12
+	xorq	%r11,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	40(%rsp),%rcx
+	movq	%rdx,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%rbx,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%r8,%rdi
+	addq	%r12,%rcx
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm2,%xmm2
+	xorq	%rdx,%r14
+	addq	%r13,%rcx
+	vpaddq	-64(%rbp),%xmm2,%xmm10
+	xorq	%r8,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rcx,%r10
+	addq	%r15,%rcx
+	movq	%r10,%r13
+	addq	%rcx,%r14
+	vmovdqa	%xmm10,32(%rsp)
+	vpalignr	$8,%xmm3,%xmm4,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rcx
+	vpalignr	$8,%xmm7,%xmm0,%xmm11
+	movq	%r11,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%r10,%r13
+	xorq	%rax,%r12
+	vpaddq	%xmm11,%xmm3,%xmm3
+	shrdq	$4,%r13,%r13
+	xorq	%rcx,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%r10,%r12
+	xorq	%r10,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	48(%rsp),%rbx
+	movq	%rcx,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%rax,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%rdx,%r15
+	addq	%r12,%rbx
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%rcx,%r14
+	addq	%r13,%rbx
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%rdx,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm2,%xmm11
+	addq	%rbx,%r9
+	addq	%rdi,%rbx
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%r9,%r13
+	addq	%rbx,%r14
+	vpsllq	$3,%xmm2,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rbx
+	vpaddq	%xmm8,%xmm3,%xmm3
+	movq	%r10,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm2,%xmm9
+	xorq	%r9,%r13
+	xorq	%r11,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%rbx,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%r9,%r12
+	xorq	%r9,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	56(%rsp),%rax
+	movq	%rbx,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%r11,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%rcx,%rdi
+	addq	%r12,%rax
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm3,%xmm3
+	xorq	%rbx,%r14
+	addq	%r13,%rax
+	vpaddq	-32(%rbp),%xmm3,%xmm10
+	xorq	%rcx,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rax,%r8
+	addq	%r15,%rax
+	movq	%r8,%r13
+	addq	%rax,%r14
+	vmovdqa	%xmm10,48(%rsp)
+	vpalignr	$8,%xmm4,%xmm5,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rax
+	vpalignr	$8,%xmm0,%xmm1,%xmm11
+	movq	%r9,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%r8,%r13
+	xorq	%r10,%r12
+	vpaddq	%xmm11,%xmm4,%xmm4
+	shrdq	$4,%r13,%r13
+	xorq	%rax,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%r8,%r12
+	xorq	%r8,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	64(%rsp),%r11
+	movq	%rax,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%r10,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%rbx,%r15
+	addq	%r12,%r11
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%rax,%r14
+	addq	%r13,%r11
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%rbx,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm3,%xmm11
+	addq	%r11,%rdx
+	addq	%rdi,%r11
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%rdx,%r13
+	addq	%r11,%r14
+	vpsllq	$3,%xmm3,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r11
+	vpaddq	%xmm8,%xmm4,%xmm4
+	movq	%r8,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm3,%xmm9
+	xorq	%rdx,%r13
+	xorq	%r9,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%r11,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%rdx,%r12
+	xorq	%rdx,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	72(%rsp),%r10
+	movq	%r11,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%r9,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%rax,%rdi
+	addq	%r12,%r10
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm4,%xmm4
+	xorq	%r11,%r14
+	addq	%r13,%r10
+	vpaddq	0(%rbp),%xmm4,%xmm10
+	xorq	%rax,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r10,%rcx
+	addq	%r15,%r10
+	movq	%rcx,%r13
+	addq	%r10,%r14
+	vmovdqa	%xmm10,64(%rsp)
+	vpalignr	$8,%xmm5,%xmm6,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r10
+	vpalignr	$8,%xmm1,%xmm2,%xmm11
+	movq	%rdx,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%rcx,%r13
+	xorq	%r8,%r12
+	vpaddq	%xmm11,%xmm5,%xmm5
+	shrdq	$4,%r13,%r13
+	xorq	%r10,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%rcx,%r12
+	xorq	%rcx,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	80(%rsp),%r9
+	movq	%r10,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%r8,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%r11,%r15
+	addq	%r12,%r9
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%r10,%r14
+	addq	%r13,%r9
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%r11,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm4,%xmm11
+	addq	%r9,%rbx
+	addq	%rdi,%r9
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%rbx,%r13
+	addq	%r9,%r14
+	vpsllq	$3,%xmm4,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r9
+	vpaddq	%xmm8,%xmm5,%xmm5
+	movq	%rcx,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm4,%xmm9
+	xorq	%rbx,%r13
+	xorq	%rdx,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%r9,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%rbx,%r12
+	xorq	%rbx,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	88(%rsp),%r8
+	movq	%r9,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%rdx,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%r10,%rdi
+	addq	%r12,%r8
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm5,%xmm5
+	xorq	%r9,%r14
+	addq	%r13,%r8
+	vpaddq	32(%rbp),%xmm5,%xmm10
+	xorq	%r10,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r8,%rax
+	addq	%r15,%r8
+	movq	%rax,%r13
+	addq	%r8,%r14
+	vmovdqa	%xmm10,80(%rsp)
+	vpalignr	$8,%xmm6,%xmm7,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r8
+	vpalignr	$8,%xmm2,%xmm3,%xmm11
+	movq	%rbx,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%rax,%r13
+	xorq	%rcx,%r12
+	vpaddq	%xmm11,%xmm6,%xmm6
+	shrdq	$4,%r13,%r13
+	xorq	%r8,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%rax,%r12
+	xorq	%rax,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	96(%rsp),%rdx
+	movq	%r8,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%rcx,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%r9,%r15
+	addq	%r12,%rdx
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%r8,%r14
+	addq	%r13,%rdx
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%r9,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm5,%xmm11
+	addq	%rdx,%r11
+	addq	%rdi,%rdx
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%r11,%r13
+	addq	%rdx,%r14
+	vpsllq	$3,%xmm5,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rdx
+	vpaddq	%xmm8,%xmm6,%xmm6
+	movq	%rax,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm5,%xmm9
+	xorq	%r11,%r13
+	xorq	%rbx,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%rdx,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%r11,%r12
+	xorq	%r11,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	104(%rsp),%rcx
+	movq	%rdx,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%rbx,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%r8,%rdi
+	addq	%r12,%rcx
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm6,%xmm6
+	xorq	%rdx,%r14
+	addq	%r13,%rcx
+	vpaddq	64(%rbp),%xmm6,%xmm10
+	xorq	%r8,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rcx,%r10
+	addq	%r15,%rcx
+	movq	%r10,%r13
+	addq	%rcx,%r14
+	vmovdqa	%xmm10,96(%rsp)
+	vpalignr	$8,%xmm7,%xmm0,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rcx
+	vpalignr	$8,%xmm3,%xmm4,%xmm11
+	movq	%r11,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%r10,%r13
+	xorq	%rax,%r12
+	vpaddq	%xmm11,%xmm7,%xmm7
+	shrdq	$4,%r13,%r13
+	xorq	%rcx,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%r10,%r12
+	xorq	%r10,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	112(%rsp),%rbx
+	movq	%rcx,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%rax,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%rdx,%r15
+	addq	%r12,%rbx
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%rcx,%r14
+	addq	%r13,%rbx
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%rdx,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm6,%xmm11
+	addq	%rbx,%r9
+	addq	%rdi,%rbx
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%r9,%r13
+	addq	%rbx,%r14
+	vpsllq	$3,%xmm6,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rbx
+	vpaddq	%xmm8,%xmm7,%xmm7
+	movq	%r10,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm6,%xmm9
+	xorq	%r9,%r13
+	xorq	%r11,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%rbx,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%r9,%r12
+	xorq	%r9,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	120(%rsp),%rax
+	movq	%rbx,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%r11,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%rcx,%rdi
+	addq	%r12,%rax
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm7,%xmm7
+	xorq	%rbx,%r14
+	addq	%r13,%rax
+	vpaddq	96(%rbp),%xmm7,%xmm10
+	xorq	%rcx,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rax,%r8
+	addq	%r15,%rax
+	movq	%r8,%r13
+	addq	%rax,%r14
+	vmovdqa	%xmm10,112(%rsp)
+	cmpb	$0,135(%rbp)
+	jne	L$avx_00_47
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rax
+	movq	%r9,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r8,%r13
+	xorq	%r10,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rax,%r14
+	andq	%r8,%r12
+	xorq	%r8,%r13
+	addq	0(%rsp),%r11
+	movq	%rax,%r15
+	xorq	%r10,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rbx,%r15
+	addq	%r12,%r11
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%rax,%r14
+	addq	%r13,%r11
+	xorq	%rbx,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%r11,%rdx
+	addq	%rdi,%r11
+	movq	%rdx,%r13
+	addq	%r11,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r11
+	movq	%r8,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rdx,%r13
+	xorq	%r9,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r11,%r14
+	andq	%rdx,%r12
+	xorq	%rdx,%r13
+	addq	8(%rsp),%r10
+	movq	%r11,%rdi
+	xorq	%r9,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rax,%rdi
+	addq	%r12,%r10
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%r11,%r14
+	addq	%r13,%r10
+	xorq	%rax,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r10,%rcx
+	addq	%r15,%r10
+	movq	%rcx,%r13
+	addq	%r10,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r10
+	movq	%rdx,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rcx,%r13
+	xorq	%r8,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r10,%r14
+	andq	%rcx,%r12
+	xorq	%rcx,%r13
+	addq	16(%rsp),%r9
+	movq	%r10,%r15
+	xorq	%r8,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r11,%r15
+	addq	%r12,%r9
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%r10,%r14
+	addq	%r13,%r9
+	xorq	%r11,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%r9,%rbx
+	addq	%rdi,%r9
+	movq	%rbx,%r13
+	addq	%r9,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r9
+	movq	%rcx,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rbx,%r13
+	xorq	%rdx,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r9,%r14
+	andq	%rbx,%r12
+	xorq	%rbx,%r13
+	addq	24(%rsp),%r8
+	movq	%r9,%rdi
+	xorq	%rdx,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r10,%rdi
+	addq	%r12,%r8
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%r9,%r14
+	addq	%r13,%r8
+	xorq	%r10,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r8,%rax
+	addq	%r15,%r8
+	movq	%rax,%r13
+	addq	%r8,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r8
+	movq	%rbx,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rax,%r13
+	xorq	%rcx,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r8,%r14
+	andq	%rax,%r12
+	xorq	%rax,%r13
+	addq	32(%rsp),%rdx
+	movq	%r8,%r15
+	xorq	%rcx,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r9,%r15
+	addq	%r12,%rdx
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%r8,%r14
+	addq	%r13,%rdx
+	xorq	%r9,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%rdx,%r11
+	addq	%rdi,%rdx
+	movq	%r11,%r13
+	addq	%rdx,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rdx
+	movq	%rax,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r11,%r13
+	xorq	%rbx,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rdx,%r14
+	andq	%r11,%r12
+	xorq	%r11,%r13
+	addq	40(%rsp),%rcx
+	movq	%rdx,%rdi
+	xorq	%rbx,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r8,%rdi
+	addq	%r12,%rcx
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%rdx,%r14
+	addq	%r13,%rcx
+	xorq	%r8,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rcx,%r10
+	addq	%r15,%rcx
+	movq	%r10,%r13
+	addq	%rcx,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rcx
+	movq	%r11,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r10,%r13
+	xorq	%rax,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rcx,%r14
+	andq	%r10,%r12
+	xorq	%r10,%r13
+	addq	48(%rsp),%rbx
+	movq	%rcx,%r15
+	xorq	%rax,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rdx,%r15
+	addq	%r12,%rbx
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%rcx,%r14
+	addq	%r13,%rbx
+	xorq	%rdx,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%rbx,%r9
+	addq	%rdi,%rbx
+	movq	%r9,%r13
+	addq	%rbx,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rbx
+	movq	%r10,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r9,%r13
+	xorq	%r11,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rbx,%r14
+	andq	%r9,%r12
+	xorq	%r9,%r13
+	addq	56(%rsp),%rax
+	movq	%rbx,%rdi
+	xorq	%r11,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rcx,%rdi
+	addq	%r12,%rax
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%rbx,%r14
+	addq	%r13,%rax
+	xorq	%rcx,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rax,%r8
+	addq	%r15,%rax
+	movq	%r8,%r13
+	addq	%rax,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rax
+	movq	%r9,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r8,%r13
+	xorq	%r10,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rax,%r14
+	andq	%r8,%r12
+	xorq	%r8,%r13
+	addq	64(%rsp),%r11
+	movq	%rax,%r15
+	xorq	%r10,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rbx,%r15
+	addq	%r12,%r11
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%rax,%r14
+	addq	%r13,%r11
+	xorq	%rbx,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%r11,%rdx
+	addq	%rdi,%r11
+	movq	%rdx,%r13
+	addq	%r11,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r11
+	movq	%r8,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rdx,%r13
+	xorq	%r9,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r11,%r14
+	andq	%rdx,%r12
+	xorq	%rdx,%r13
+	addq	72(%rsp),%r10
+	movq	%r11,%rdi
+	xorq	%r9,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rax,%rdi
+	addq	%r12,%r10
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%r11,%r14
+	addq	%r13,%r10
+	xorq	%rax,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r10,%rcx
+	addq	%r15,%r10
+	movq	%rcx,%r13
+	addq	%r10,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r10
+	movq	%rdx,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rcx,%r13
+	xorq	%r8,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r10,%r14
+	andq	%rcx,%r12
+	xorq	%rcx,%r13
+	addq	80(%rsp),%r9
+	movq	%r10,%r15
+	xorq	%r8,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r11,%r15
+	addq	%r12,%r9
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%r10,%r14
+	addq	%r13,%r9
+	xorq	%r11,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%r9,%rbx
+	addq	%rdi,%r9
+	movq	%rbx,%r13
+	addq	%r9,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r9
+	movq	%rcx,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rbx,%r13
+	xorq	%rdx,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r9,%r14
+	andq	%rbx,%r12
+	xorq	%rbx,%r13
+	addq	88(%rsp),%r8
+	movq	%r9,%rdi
+	xorq	%rdx,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r10,%rdi
+	addq	%r12,%r8
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%r9,%r14
+	addq	%r13,%r8
+	xorq	%r10,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r8,%rax
+	addq	%r15,%r8
+	movq	%rax,%r13
+	addq	%r8,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r8
+	movq	%rbx,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rax,%r13
+	xorq	%rcx,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r8,%r14
+	andq	%rax,%r12
+	xorq	%rax,%r13
+	addq	96(%rsp),%rdx
+	movq	%r8,%r15
+	xorq	%rcx,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r9,%r15
+	addq	%r12,%rdx
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%r8,%r14
+	addq	%r13,%rdx
+	xorq	%r9,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%rdx,%r11
+	addq	%rdi,%rdx
+	movq	%r11,%r13
+	addq	%rdx,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rdx
+	movq	%rax,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r11,%r13
+	xorq	%rbx,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rdx,%r14
+	andq	%r11,%r12
+	xorq	%r11,%r13
+	addq	104(%rsp),%rcx
+	movq	%rdx,%rdi
+	xorq	%rbx,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r8,%rdi
+	addq	%r12,%rcx
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%rdx,%r14
+	addq	%r13,%rcx
+	xorq	%r8,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rcx,%r10
+	addq	%r15,%rcx
+	movq	%r10,%r13
+	addq	%rcx,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rcx
+	movq	%r11,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r10,%r13
+	xorq	%rax,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rcx,%r14
+	andq	%r10,%r12
+	xorq	%r10,%r13
+	addq	112(%rsp),%rbx
+	movq	%rcx,%r15
+	xorq	%rax,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rdx,%r15
+	addq	%r12,%rbx
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%rcx,%r14
+	addq	%r13,%rbx
+	xorq	%rdx,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%rbx,%r9
+	addq	%rdi,%rbx
+	movq	%r9,%r13
+	addq	%rbx,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rbx
+	movq	%r10,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r9,%r13
+	xorq	%r11,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rbx,%r14
+	andq	%r9,%r12
+	xorq	%r9,%r13
+	addq	120(%rsp),%rax
+	movq	%rbx,%rdi
+	xorq	%r11,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rcx,%rdi
+	addq	%r12,%rax
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%rbx,%r14
+	addq	%r13,%rax
+	xorq	%rcx,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rax,%r8
+	addq	%r15,%rax
+	movq	%r8,%r13
+	addq	%rax,%r14
+	movq	128+0(%rsp),%rdi
+	movq	%r14,%rax
+
+	addq	0(%rdi),%rax
+	leaq	128(%rsi),%rsi
+	addq	8(%rdi),%rbx
+	addq	16(%rdi),%rcx
+	addq	24(%rdi),%rdx
+	addq	32(%rdi),%r8
+	addq	40(%rdi),%r9
+	addq	48(%rdi),%r10
+	addq	56(%rdi),%r11
+
+	cmpq	128+16(%rsp),%rsi
+
+	movq	%rax,0(%rdi)
+	movq	%rbx,8(%rdi)
+	movq	%rcx,16(%rdi)
+	movq	%rdx,24(%rdi)
+	movq	%r8,32(%rdi)
+	movq	%r9,40(%rdi)
+	movq	%r10,48(%rdi)
+	movq	%r11,56(%rdi)
+	jb	L$loop_avx
+
+	movq	152(%rsp),%rsi
+
+	vzeroupper
+	movq	-48(%rsi),%r15
+
+	movq	-40(%rsi),%r14
+
+	movq	-32(%rsi),%r13
+
+	movq	-24(%rsi),%r12
+
+	movq	-16(%rsi),%rbp
+
+	movq	-8(%rsi),%rbx
+
+	leaq	(%rsi),%rsp
+
+L$epilogue_avx:
+	ret
+
+
+#endif
diff --git a/gen/bcm/sha512-x86_64-linux.S b/gen/bcm/sha512-x86_64-linux.S
new file mode 100644
index 0000000..bbef943
--- /dev/null
+++ b/gen/bcm/sha512-x86_64-linux.S
@@ -0,0 +1,2978 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text	
+
+.globl	sha512_block_data_order_nohw
+.hidden sha512_block_data_order_nohw
+.type	sha512_block_data_order_nohw,@function
+.align	16
+sha512_block_data_order_nohw:
+.cfi_startproc	
+_CET_ENDBR
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+	shlq	$4,%rdx
+	subq	$128+32,%rsp
+	leaq	(%rsi,%rdx,8),%rdx
+	andq	$-64,%rsp
+	movq	%rdi,128+0(%rsp)
+	movq	%rsi,128+8(%rsp)
+	movq	%rdx,128+16(%rsp)
+	movq	%rax,152(%rsp)
+.cfi_escape	0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
+.Lprologue:
+
+	movq	0(%rdi),%rax
+	movq	8(%rdi),%rbx
+	movq	16(%rdi),%rcx
+	movq	24(%rdi),%rdx
+	movq	32(%rdi),%r8
+	movq	40(%rdi),%r9
+	movq	48(%rdi),%r10
+	movq	56(%rdi),%r11
+	jmp	.Lloop
+
+.align	16
+.Lloop:
+	movq	%rbx,%rdi
+	leaq	K512(%rip),%rbp
+	xorq	%rcx,%rdi
+	movq	0(%rsi),%r12
+	movq	%r8,%r13
+	movq	%rax,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%r9,%r15
+
+	xorq	%r8,%r13
+	rorq	$5,%r14
+	xorq	%r10,%r15
+
+	movq	%r12,0(%rsp)
+	xorq	%rax,%r14
+	andq	%r8,%r15
+
+	rorq	$4,%r13
+	addq	%r11,%r12
+	xorq	%r10,%r15
+
+	rorq	$6,%r14
+	xorq	%r8,%r13
+	addq	%r15,%r12
+
+	movq	%rax,%r15
+	addq	(%rbp),%r12
+	xorq	%rax,%r14
+
+	xorq	%rbx,%r15
+	rorq	$14,%r13
+	movq	%rbx,%r11
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%r11
+	addq	%r12,%rdx
+	addq	%r12,%r11
+
+	leaq	8(%rbp),%rbp
+	addq	%r14,%r11
+	movq	8(%rsi),%r12
+	movq	%rdx,%r13
+	movq	%r11,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%r8,%rdi
+
+	xorq	%rdx,%r13
+	rorq	$5,%r14
+	xorq	%r9,%rdi
+
+	movq	%r12,8(%rsp)
+	xorq	%r11,%r14
+	andq	%rdx,%rdi
+
+	rorq	$4,%r13
+	addq	%r10,%r12
+	xorq	%r9,%rdi
+
+	rorq	$6,%r14
+	xorq	%rdx,%r13
+	addq	%rdi,%r12
+
+	movq	%r11,%rdi
+	addq	(%rbp),%r12
+	xorq	%r11,%r14
+
+	xorq	%rax,%rdi
+	rorq	$14,%r13
+	movq	%rax,%r10
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%r10
+	addq	%r12,%rcx
+	addq	%r12,%r10
+
+	leaq	24(%rbp),%rbp
+	addq	%r14,%r10
+	movq	16(%rsi),%r12
+	movq	%rcx,%r13
+	movq	%r10,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%rdx,%r15
+
+	xorq	%rcx,%r13
+	rorq	$5,%r14
+	xorq	%r8,%r15
+
+	movq	%r12,16(%rsp)
+	xorq	%r10,%r14
+	andq	%rcx,%r15
+
+	rorq	$4,%r13
+	addq	%r9,%r12
+	xorq	%r8,%r15
+
+	rorq	$6,%r14
+	xorq	%rcx,%r13
+	addq	%r15,%r12
+
+	movq	%r10,%r15
+	addq	(%rbp),%r12
+	xorq	%r10,%r14
+
+	xorq	%r11,%r15
+	rorq	$14,%r13
+	movq	%r11,%r9
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%r9
+	addq	%r12,%rbx
+	addq	%r12,%r9
+
+	leaq	8(%rbp),%rbp
+	addq	%r14,%r9
+	movq	24(%rsi),%r12
+	movq	%rbx,%r13
+	movq	%r9,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%rcx,%rdi
+
+	xorq	%rbx,%r13
+	rorq	$5,%r14
+	xorq	%rdx,%rdi
+
+	movq	%r12,24(%rsp)
+	xorq	%r9,%r14
+	andq	%rbx,%rdi
+
+	rorq	$4,%r13
+	addq	%r8,%r12
+	xorq	%rdx,%rdi
+
+	rorq	$6,%r14
+	xorq	%rbx,%r13
+	addq	%rdi,%r12
+
+	movq	%r9,%rdi
+	addq	(%rbp),%r12
+	xorq	%r9,%r14
+
+	xorq	%r10,%rdi
+	rorq	$14,%r13
+	movq	%r10,%r8
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%r8
+	addq	%r12,%rax
+	addq	%r12,%r8
+
+	leaq	24(%rbp),%rbp
+	addq	%r14,%r8
+	movq	32(%rsi),%r12
+	movq	%rax,%r13
+	movq	%r8,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%rbx,%r15
+
+	xorq	%rax,%r13
+	rorq	$5,%r14
+	xorq	%rcx,%r15
+
+	movq	%r12,32(%rsp)
+	xorq	%r8,%r14
+	andq	%rax,%r15
+
+	rorq	$4,%r13
+	addq	%rdx,%r12
+	xorq	%rcx,%r15
+
+	rorq	$6,%r14
+	xorq	%rax,%r13
+	addq	%r15,%r12
+
+	movq	%r8,%r15
+	addq	(%rbp),%r12
+	xorq	%r8,%r14
+
+	xorq	%r9,%r15
+	rorq	$14,%r13
+	movq	%r9,%rdx
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%rdx
+	addq	%r12,%r11
+	addq	%r12,%rdx
+
+	leaq	8(%rbp),%rbp
+	addq	%r14,%rdx
+	movq	40(%rsi),%r12
+	movq	%r11,%r13
+	movq	%rdx,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%rax,%rdi
+
+	xorq	%r11,%r13
+	rorq	$5,%r14
+	xorq	%rbx,%rdi
+
+	movq	%r12,40(%rsp)
+	xorq	%rdx,%r14
+	andq	%r11,%rdi
+
+	rorq	$4,%r13
+	addq	%rcx,%r12
+	xorq	%rbx,%rdi
+
+	rorq	$6,%r14
+	xorq	%r11,%r13
+	addq	%rdi,%r12
+
+	movq	%rdx,%rdi
+	addq	(%rbp),%r12
+	xorq	%rdx,%r14
+
+	xorq	%r8,%rdi
+	rorq	$14,%r13
+	movq	%r8,%rcx
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%rcx
+	addq	%r12,%r10
+	addq	%r12,%rcx
+
+	leaq	24(%rbp),%rbp
+	addq	%r14,%rcx
+	movq	48(%rsi),%r12
+	movq	%r10,%r13
+	movq	%rcx,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%r11,%r15
+
+	xorq	%r10,%r13
+	rorq	$5,%r14
+	xorq	%rax,%r15
+
+	movq	%r12,48(%rsp)
+	xorq	%rcx,%r14
+	andq	%r10,%r15
+
+	rorq	$4,%r13
+	addq	%rbx,%r12
+	xorq	%rax,%r15
+
+	rorq	$6,%r14
+	xorq	%r10,%r13
+	addq	%r15,%r12
+
+	movq	%rcx,%r15
+	addq	(%rbp),%r12
+	xorq	%rcx,%r14
+
+	xorq	%rdx,%r15
+	rorq	$14,%r13
+	movq	%rdx,%rbx
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%rbx
+	addq	%r12,%r9
+	addq	%r12,%rbx
+
+	leaq	8(%rbp),%rbp
+	addq	%r14,%rbx
+	movq	56(%rsi),%r12
+	movq	%r9,%r13
+	movq	%rbx,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%r10,%rdi
+
+	xorq	%r9,%r13
+	rorq	$5,%r14
+	xorq	%r11,%rdi
+
+	movq	%r12,56(%rsp)
+	xorq	%rbx,%r14
+	andq	%r9,%rdi
+
+	rorq	$4,%r13
+	addq	%rax,%r12
+	xorq	%r11,%rdi
+
+	rorq	$6,%r14
+	xorq	%r9,%r13
+	addq	%rdi,%r12
+
+	movq	%rbx,%rdi
+	addq	(%rbp),%r12
+	xorq	%rbx,%r14
+
+	xorq	%rcx,%rdi
+	rorq	$14,%r13
+	movq	%rcx,%rax
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%rax
+	addq	%r12,%r8
+	addq	%r12,%rax
+
+	leaq	24(%rbp),%rbp
+	addq	%r14,%rax
+	movq	64(%rsi),%r12
+	movq	%r8,%r13
+	movq	%rax,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%r9,%r15
+
+	xorq	%r8,%r13
+	rorq	$5,%r14
+	xorq	%r10,%r15
+
+	movq	%r12,64(%rsp)
+	xorq	%rax,%r14
+	andq	%r8,%r15
+
+	rorq	$4,%r13
+	addq	%r11,%r12
+	xorq	%r10,%r15
+
+	rorq	$6,%r14
+	xorq	%r8,%r13
+	addq	%r15,%r12
+
+	movq	%rax,%r15
+	addq	(%rbp),%r12
+	xorq	%rax,%r14
+
+	xorq	%rbx,%r15
+	rorq	$14,%r13
+	movq	%rbx,%r11
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%r11
+	addq	%r12,%rdx
+	addq	%r12,%r11
+
+	leaq	8(%rbp),%rbp
+	addq	%r14,%r11
+	movq	72(%rsi),%r12
+	movq	%rdx,%r13
+	movq	%r11,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%r8,%rdi
+
+	xorq	%rdx,%r13
+	rorq	$5,%r14
+	xorq	%r9,%rdi
+
+	movq	%r12,72(%rsp)
+	xorq	%r11,%r14
+	andq	%rdx,%rdi
+
+	rorq	$4,%r13
+	addq	%r10,%r12
+	xorq	%r9,%rdi
+
+	rorq	$6,%r14
+	xorq	%rdx,%r13
+	addq	%rdi,%r12
+
+	movq	%r11,%rdi
+	addq	(%rbp),%r12
+	xorq	%r11,%r14
+
+	xorq	%rax,%rdi
+	rorq	$14,%r13
+	movq	%rax,%r10
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%r10
+	addq	%r12,%rcx
+	addq	%r12,%r10
+
+	leaq	24(%rbp),%rbp
+	addq	%r14,%r10
+	movq	80(%rsi),%r12
+	movq	%rcx,%r13
+	movq	%r10,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%rdx,%r15
+
+	xorq	%rcx,%r13
+	rorq	$5,%r14
+	xorq	%r8,%r15
+
+	movq	%r12,80(%rsp)
+	xorq	%r10,%r14
+	andq	%rcx,%r15
+
+	rorq	$4,%r13
+	addq	%r9,%r12
+	xorq	%r8,%r15
+
+	rorq	$6,%r14
+	xorq	%rcx,%r13
+	addq	%r15,%r12
+
+	movq	%r10,%r15
+	addq	(%rbp),%r12
+	xorq	%r10,%r14
+
+	xorq	%r11,%r15
+	rorq	$14,%r13
+	movq	%r11,%r9
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%r9
+	addq	%r12,%rbx
+	addq	%r12,%r9
+
+	leaq	8(%rbp),%rbp
+	addq	%r14,%r9
+	movq	88(%rsi),%r12
+	movq	%rbx,%r13
+	movq	%r9,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%rcx,%rdi
+
+	xorq	%rbx,%r13
+	rorq	$5,%r14
+	xorq	%rdx,%rdi
+
+	movq	%r12,88(%rsp)
+	xorq	%r9,%r14
+	andq	%rbx,%rdi
+
+	rorq	$4,%r13
+	addq	%r8,%r12
+	xorq	%rdx,%rdi
+
+	rorq	$6,%r14
+	xorq	%rbx,%r13
+	addq	%rdi,%r12
+
+	movq	%r9,%rdi
+	addq	(%rbp),%r12
+	xorq	%r9,%r14
+
+	xorq	%r10,%rdi
+	rorq	$14,%r13
+	movq	%r10,%r8
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%r8
+	addq	%r12,%rax
+	addq	%r12,%r8
+
+	leaq	24(%rbp),%rbp
+	addq	%r14,%r8
+	movq	96(%rsi),%r12
+	movq	%rax,%r13
+	movq	%r8,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%rbx,%r15
+
+	xorq	%rax,%r13
+	rorq	$5,%r14
+	xorq	%rcx,%r15
+
+	movq	%r12,96(%rsp)
+	xorq	%r8,%r14
+	andq	%rax,%r15
+
+	rorq	$4,%r13
+	addq	%rdx,%r12
+	xorq	%rcx,%r15
+
+	rorq	$6,%r14
+	xorq	%rax,%r13
+	addq	%r15,%r12
+
+	movq	%r8,%r15
+	addq	(%rbp),%r12
+	xorq	%r8,%r14
+
+	xorq	%r9,%r15
+	rorq	$14,%r13
+	movq	%r9,%rdx
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%rdx
+	addq	%r12,%r11
+	addq	%r12,%rdx
+
+	leaq	8(%rbp),%rbp
+	addq	%r14,%rdx
+	movq	104(%rsi),%r12
+	movq	%r11,%r13
+	movq	%rdx,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%rax,%rdi
+
+	xorq	%r11,%r13
+	rorq	$5,%r14
+	xorq	%rbx,%rdi
+
+	movq	%r12,104(%rsp)
+	xorq	%rdx,%r14
+	andq	%r11,%rdi
+
+	rorq	$4,%r13
+	addq	%rcx,%r12
+	xorq	%rbx,%rdi
+
+	rorq	$6,%r14
+	xorq	%r11,%r13
+	addq	%rdi,%r12
+
+	movq	%rdx,%rdi
+	addq	(%rbp),%r12
+	xorq	%rdx,%r14
+
+	xorq	%r8,%rdi
+	rorq	$14,%r13
+	movq	%r8,%rcx
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%rcx
+	addq	%r12,%r10
+	addq	%r12,%rcx
+
+	leaq	24(%rbp),%rbp
+	addq	%r14,%rcx
+	movq	112(%rsi),%r12
+	movq	%r10,%r13
+	movq	%rcx,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%r11,%r15
+
+	xorq	%r10,%r13
+	rorq	$5,%r14
+	xorq	%rax,%r15
+
+	movq	%r12,112(%rsp)
+	xorq	%rcx,%r14
+	andq	%r10,%r15
+
+	rorq	$4,%r13
+	addq	%rbx,%r12
+	xorq	%rax,%r15
+
+	rorq	$6,%r14
+	xorq	%r10,%r13
+	addq	%r15,%r12
+
+	movq	%rcx,%r15
+	addq	(%rbp),%r12
+	xorq	%rcx,%r14
+
+	xorq	%rdx,%r15
+	rorq	$14,%r13
+	movq	%rdx,%rbx
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%rbx
+	addq	%r12,%r9
+	addq	%r12,%rbx
+
+	leaq	8(%rbp),%rbp
+	addq	%r14,%rbx
+	movq	120(%rsi),%r12
+	movq	%r9,%r13
+	movq	%rbx,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%r10,%rdi
+
+	xorq	%r9,%r13
+	rorq	$5,%r14
+	xorq	%r11,%rdi
+
+	movq	%r12,120(%rsp)
+	xorq	%rbx,%r14
+	andq	%r9,%rdi
+
+	rorq	$4,%r13
+	addq	%rax,%r12
+	xorq	%r11,%rdi
+
+	rorq	$6,%r14
+	xorq	%r9,%r13
+	addq	%rdi,%r12
+
+	movq	%rbx,%rdi
+	addq	(%rbp),%r12
+	xorq	%rbx,%r14
+
+	xorq	%rcx,%rdi
+	rorq	$14,%r13
+	movq	%rcx,%rax
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%rax
+	addq	%r12,%r8
+	addq	%r12,%rax
+
+	leaq	24(%rbp),%rbp
+	jmp	.Lrounds_16_xx
+.align	16
+.Lrounds_16_xx:
+	movq	8(%rsp),%r13
+	movq	112(%rsp),%r15
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%rax
+	movq	%r15,%r14
+	rorq	$42,%r15
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%r15
+	shrq	$6,%r14
+
+	rorq	$19,%r15
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	72(%rsp),%r12
+
+	addq	0(%rsp),%r12
+	movq	%r8,%r13
+	addq	%r15,%r12
+	movq	%rax,%r14
+	rorq	$23,%r13
+	movq	%r9,%r15
+
+	xorq	%r8,%r13
+	rorq	$5,%r14
+	xorq	%r10,%r15
+
+	movq	%r12,0(%rsp)
+	xorq	%rax,%r14
+	andq	%r8,%r15
+
+	rorq	$4,%r13
+	addq	%r11,%r12
+	xorq	%r10,%r15
+
+	rorq	$6,%r14
+	xorq	%r8,%r13
+	addq	%r15,%r12
+
+	movq	%rax,%r15
+	addq	(%rbp),%r12
+	xorq	%rax,%r14
+
+	xorq	%rbx,%r15
+	rorq	$14,%r13
+	movq	%rbx,%r11
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%r11
+	addq	%r12,%rdx
+	addq	%r12,%r11
+
+	leaq	8(%rbp),%rbp
+	movq	16(%rsp),%r13
+	movq	120(%rsp),%rdi
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%r11
+	movq	%rdi,%r14
+	rorq	$42,%rdi
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
+	shrq	$6,%r14
+
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	80(%rsp),%r12
+
+	addq	8(%rsp),%r12
+	movq	%rdx,%r13
+	addq	%rdi,%r12
+	movq	%r11,%r14
+	rorq	$23,%r13
+	movq	%r8,%rdi
+
+	xorq	%rdx,%r13
+	rorq	$5,%r14
+	xorq	%r9,%rdi
+
+	movq	%r12,8(%rsp)
+	xorq	%r11,%r14
+	andq	%rdx,%rdi
+
+	rorq	$4,%r13
+	addq	%r10,%r12
+	xorq	%r9,%rdi
+
+	rorq	$6,%r14
+	xorq	%rdx,%r13
+	addq	%rdi,%r12
+
+	movq	%r11,%rdi
+	addq	(%rbp),%r12
+	xorq	%r11,%r14
+
+	xorq	%rax,%rdi
+	rorq	$14,%r13
+	movq	%rax,%r10
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%r10
+	addq	%r12,%rcx
+	addq	%r12,%r10
+
+	leaq	24(%rbp),%rbp
+	movq	24(%rsp),%r13
+	movq	0(%rsp),%r15
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%r10
+	movq	%r15,%r14
+	rorq	$42,%r15
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%r15
+	shrq	$6,%r14
+
+	rorq	$19,%r15
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	88(%rsp),%r12
+
+	addq	16(%rsp),%r12
+	movq	%rcx,%r13
+	addq	%r15,%r12
+	movq	%r10,%r14
+	rorq	$23,%r13
+	movq	%rdx,%r15
+
+	xorq	%rcx,%r13
+	rorq	$5,%r14
+	xorq	%r8,%r15
+
+	movq	%r12,16(%rsp)
+	xorq	%r10,%r14
+	andq	%rcx,%r15
+
+	rorq	$4,%r13
+	addq	%r9,%r12
+	xorq	%r8,%r15
+
+	rorq	$6,%r14
+	xorq	%rcx,%r13
+	addq	%r15,%r12
+
+	movq	%r10,%r15
+	addq	(%rbp),%r12
+	xorq	%r10,%r14
+
+	xorq	%r11,%r15
+	rorq	$14,%r13
+	movq	%r11,%r9
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%r9
+	addq	%r12,%rbx
+	addq	%r12,%r9
+
+	leaq	8(%rbp),%rbp
+	movq	32(%rsp),%r13
+	movq	8(%rsp),%rdi
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%r9
+	movq	%rdi,%r14
+	rorq	$42,%rdi
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
+	shrq	$6,%r14
+
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	96(%rsp),%r12
+
+	addq	24(%rsp),%r12
+	movq	%rbx,%r13
+	addq	%rdi,%r12
+	movq	%r9,%r14
+	rorq	$23,%r13
+	movq	%rcx,%rdi
+
+	xorq	%rbx,%r13
+	rorq	$5,%r14
+	xorq	%rdx,%rdi
+
+	movq	%r12,24(%rsp)
+	xorq	%r9,%r14
+	andq	%rbx,%rdi
+
+	rorq	$4,%r13
+	addq	%r8,%r12
+	xorq	%rdx,%rdi
+
+	rorq	$6,%r14
+	xorq	%rbx,%r13
+	addq	%rdi,%r12
+
+	movq	%r9,%rdi
+	addq	(%rbp),%r12
+	xorq	%r9,%r14
+
+	xorq	%r10,%rdi
+	rorq	$14,%r13
+	movq	%r10,%r8
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%r8
+	addq	%r12,%rax
+	addq	%r12,%r8
+
+	leaq	24(%rbp),%rbp
+	movq	40(%rsp),%r13
+	movq	16(%rsp),%r15
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%r8
+	movq	%r15,%r14
+	rorq	$42,%r15
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%r15
+	shrq	$6,%r14
+
+	rorq	$19,%r15
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	104(%rsp),%r12
+
+	addq	32(%rsp),%r12
+	movq	%rax,%r13
+	addq	%r15,%r12
+	movq	%r8,%r14
+	rorq	$23,%r13
+	movq	%rbx,%r15
+
+	xorq	%rax,%r13
+	rorq	$5,%r14
+	xorq	%rcx,%r15
+
+	movq	%r12,32(%rsp)
+	xorq	%r8,%r14
+	andq	%rax,%r15
+
+	rorq	$4,%r13
+	addq	%rdx,%r12
+	xorq	%rcx,%r15
+
+	rorq	$6,%r14
+	xorq	%rax,%r13
+	addq	%r15,%r12
+
+	movq	%r8,%r15
+	addq	(%rbp),%r12
+	xorq	%r8,%r14
+
+	xorq	%r9,%r15
+	rorq	$14,%r13
+	movq	%r9,%rdx
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%rdx
+	addq	%r12,%r11
+	addq	%r12,%rdx
+
+	leaq	8(%rbp),%rbp
+	movq	48(%rsp),%r13
+	movq	24(%rsp),%rdi
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%rdx
+	movq	%rdi,%r14
+	rorq	$42,%rdi
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
+	shrq	$6,%r14
+
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	112(%rsp),%r12
+
+	addq	40(%rsp),%r12
+	movq	%r11,%r13
+	addq	%rdi,%r12
+	movq	%rdx,%r14
+	rorq	$23,%r13
+	movq	%rax,%rdi
+
+	xorq	%r11,%r13
+	rorq	$5,%r14
+	xorq	%rbx,%rdi
+
+	movq	%r12,40(%rsp)
+	xorq	%rdx,%r14
+	andq	%r11,%rdi
+
+	rorq	$4,%r13
+	addq	%rcx,%r12
+	xorq	%rbx,%rdi
+
+	rorq	$6,%r14
+	xorq	%r11,%r13
+	addq	%rdi,%r12
+
+	movq	%rdx,%rdi
+	addq	(%rbp),%r12
+	xorq	%rdx,%r14
+
+	xorq	%r8,%rdi
+	rorq	$14,%r13
+	movq	%r8,%rcx
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%rcx
+	addq	%r12,%r10
+	addq	%r12,%rcx
+
+	leaq	24(%rbp),%rbp
+	movq	56(%rsp),%r13
+	movq	32(%rsp),%r15
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%rcx
+	movq	%r15,%r14
+	rorq	$42,%r15
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%r15
+	shrq	$6,%r14
+
+	rorq	$19,%r15
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	120(%rsp),%r12
+
+	addq	48(%rsp),%r12
+	movq	%r10,%r13
+	addq	%r15,%r12
+	movq	%rcx,%r14
+	rorq	$23,%r13
+	movq	%r11,%r15
+
+	xorq	%r10,%r13
+	rorq	$5,%r14
+	xorq	%rax,%r15
+
+	movq	%r12,48(%rsp)
+	xorq	%rcx,%r14
+	andq	%r10,%r15
+
+	rorq	$4,%r13
+	addq	%rbx,%r12
+	xorq	%rax,%r15
+
+	rorq	$6,%r14
+	xorq	%r10,%r13
+	addq	%r15,%r12
+
+	movq	%rcx,%r15
+	addq	(%rbp),%r12
+	xorq	%rcx,%r14
+
+	xorq	%rdx,%r15
+	rorq	$14,%r13
+	movq	%rdx,%rbx
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%rbx
+	addq	%r12,%r9
+	addq	%r12,%rbx
+
+	leaq	8(%rbp),%rbp
+	movq	64(%rsp),%r13
+	movq	40(%rsp),%rdi
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%rbx
+	movq	%rdi,%r14
+	rorq	$42,%rdi
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
+	shrq	$6,%r14
+
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	0(%rsp),%r12
+
+	addq	56(%rsp),%r12
+	movq	%r9,%r13
+	addq	%rdi,%r12
+	movq	%rbx,%r14
+	rorq	$23,%r13
+	movq	%r10,%rdi
+
+	xorq	%r9,%r13
+	rorq	$5,%r14
+	xorq	%r11,%rdi
+
+	movq	%r12,56(%rsp)
+	xorq	%rbx,%r14
+	andq	%r9,%rdi
+
+	rorq	$4,%r13
+	addq	%rax,%r12
+	xorq	%r11,%rdi
+
+	rorq	$6,%r14
+	xorq	%r9,%r13
+	addq	%rdi,%r12
+
+	movq	%rbx,%rdi
+	addq	(%rbp),%r12
+	xorq	%rbx,%r14
+
+	xorq	%rcx,%rdi
+	rorq	$14,%r13
+	movq	%rcx,%rax
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%rax
+	addq	%r12,%r8
+	addq	%r12,%rax
+
+	leaq	24(%rbp),%rbp
+	movq	72(%rsp),%r13
+	movq	48(%rsp),%r15
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%rax
+	movq	%r15,%r14
+	rorq	$42,%r15
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%r15
+	shrq	$6,%r14
+
+	rorq	$19,%r15
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	8(%rsp),%r12
+
+	addq	64(%rsp),%r12
+	movq	%r8,%r13
+	addq	%r15,%r12
+	movq	%rax,%r14
+	rorq	$23,%r13
+	movq	%r9,%r15
+
+	xorq	%r8,%r13
+	rorq	$5,%r14
+	xorq	%r10,%r15
+
+	movq	%r12,64(%rsp)
+	xorq	%rax,%r14
+	andq	%r8,%r15
+
+	rorq	$4,%r13
+	addq	%r11,%r12
+	xorq	%r10,%r15
+
+	rorq	$6,%r14
+	xorq	%r8,%r13
+	addq	%r15,%r12
+
+	movq	%rax,%r15
+	addq	(%rbp),%r12
+	xorq	%rax,%r14
+
+	xorq	%rbx,%r15
+	rorq	$14,%r13
+	movq	%rbx,%r11
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%r11
+	addq	%r12,%rdx
+	addq	%r12,%r11
+
+	leaq	8(%rbp),%rbp
+	movq	80(%rsp),%r13
+	movq	56(%rsp),%rdi
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%r11
+	movq	%rdi,%r14
+	rorq	$42,%rdi
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
+	shrq	$6,%r14
+
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	16(%rsp),%r12
+
+	addq	72(%rsp),%r12
+	movq	%rdx,%r13
+	addq	%rdi,%r12
+	movq	%r11,%r14
+	rorq	$23,%r13
+	movq	%r8,%rdi
+
+	xorq	%rdx,%r13
+	rorq	$5,%r14
+	xorq	%r9,%rdi
+
+	movq	%r12,72(%rsp)
+	xorq	%r11,%r14
+	andq	%rdx,%rdi
+
+	rorq	$4,%r13
+	addq	%r10,%r12
+	xorq	%r9,%rdi
+
+	rorq	$6,%r14
+	xorq	%rdx,%r13
+	addq	%rdi,%r12
+
+	movq	%r11,%rdi
+	addq	(%rbp),%r12
+	xorq	%r11,%r14
+
+	xorq	%rax,%rdi
+	rorq	$14,%r13
+	movq	%rax,%r10
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%r10
+	addq	%r12,%rcx
+	addq	%r12,%r10
+
+	leaq	24(%rbp),%rbp
+	movq	88(%rsp),%r13
+	movq	64(%rsp),%r15
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%r10
+	movq	%r15,%r14
+	rorq	$42,%r15
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%r15
+	shrq	$6,%r14
+
+	rorq	$19,%r15
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	24(%rsp),%r12
+
+	addq	80(%rsp),%r12
+	movq	%rcx,%r13
+	addq	%r15,%r12
+	movq	%r10,%r14
+	rorq	$23,%r13
+	movq	%rdx,%r15
+
+	xorq	%rcx,%r13
+	rorq	$5,%r14
+	xorq	%r8,%r15
+
+	movq	%r12,80(%rsp)
+	xorq	%r10,%r14
+	andq	%rcx,%r15
+
+	rorq	$4,%r13
+	addq	%r9,%r12
+	xorq	%r8,%r15
+
+	rorq	$6,%r14
+	xorq	%rcx,%r13
+	addq	%r15,%r12
+
+	movq	%r10,%r15
+	addq	(%rbp),%r12
+	xorq	%r10,%r14
+
+	xorq	%r11,%r15
+	rorq	$14,%r13
+	movq	%r11,%r9
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%r9
+	addq	%r12,%rbx
+	addq	%r12,%r9
+
+	leaq	8(%rbp),%rbp
+	movq	96(%rsp),%r13
+	movq	72(%rsp),%rdi
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%r9
+	movq	%rdi,%r14
+	rorq	$42,%rdi
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
+	shrq	$6,%r14
+
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	32(%rsp),%r12
+
+	addq	88(%rsp),%r12
+	movq	%rbx,%r13
+	addq	%rdi,%r12
+	movq	%r9,%r14
+	rorq	$23,%r13
+	movq	%rcx,%rdi
+
+	xorq	%rbx,%r13
+	rorq	$5,%r14
+	xorq	%rdx,%rdi
+
+	movq	%r12,88(%rsp)
+	xorq	%r9,%r14
+	andq	%rbx,%rdi
+
+	rorq	$4,%r13
+	addq	%r8,%r12
+	xorq	%rdx,%rdi
+
+	rorq	$6,%r14
+	xorq	%rbx,%r13
+	addq	%rdi,%r12
+
+	movq	%r9,%rdi
+	addq	(%rbp),%r12
+	xorq	%r9,%r14
+
+	xorq	%r10,%rdi
+	rorq	$14,%r13
+	movq	%r10,%r8
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%r8
+	addq	%r12,%rax
+	addq	%r12,%r8
+
+	leaq	24(%rbp),%rbp
+	movq	104(%rsp),%r13
+	movq	80(%rsp),%r15
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%r8
+	movq	%r15,%r14
+	rorq	$42,%r15
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%r15
+	shrq	$6,%r14
+
+	rorq	$19,%r15
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	40(%rsp),%r12
+
+	addq	96(%rsp),%r12
+	movq	%rax,%r13
+	addq	%r15,%r12
+	movq	%r8,%r14
+	rorq	$23,%r13
+	movq	%rbx,%r15
+
+	xorq	%rax,%r13
+	rorq	$5,%r14
+	xorq	%rcx,%r15
+
+	movq	%r12,96(%rsp)
+	xorq	%r8,%r14
+	andq	%rax,%r15
+
+	rorq	$4,%r13
+	addq	%rdx,%r12
+	xorq	%rcx,%r15
+
+	rorq	$6,%r14
+	xorq	%rax,%r13
+	addq	%r15,%r12
+
+	movq	%r8,%r15
+	addq	(%rbp),%r12
+	xorq	%r8,%r14
+
+	xorq	%r9,%r15
+	rorq	$14,%r13
+	movq	%r9,%rdx
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%rdx
+	addq	%r12,%r11
+	addq	%r12,%rdx
+
+	leaq	8(%rbp),%rbp
+	movq	112(%rsp),%r13
+	movq	88(%rsp),%rdi
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%rdx
+	movq	%rdi,%r14
+	rorq	$42,%rdi
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
+	shrq	$6,%r14
+
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	48(%rsp),%r12
+
+	addq	104(%rsp),%r12
+	movq	%r11,%r13
+	addq	%rdi,%r12
+	movq	%rdx,%r14
+	rorq	$23,%r13
+	movq	%rax,%rdi
+
+	xorq	%r11,%r13
+	rorq	$5,%r14
+	xorq	%rbx,%rdi
+
+	movq	%r12,104(%rsp)
+	xorq	%rdx,%r14
+	andq	%r11,%rdi
+
+	rorq	$4,%r13
+	addq	%rcx,%r12
+	xorq	%rbx,%rdi
+
+	rorq	$6,%r14
+	xorq	%r11,%r13
+	addq	%rdi,%r12
+
+	movq	%rdx,%rdi
+	addq	(%rbp),%r12
+	xorq	%rdx,%r14
+
+	xorq	%r8,%rdi
+	rorq	$14,%r13
+	movq	%r8,%rcx
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%rcx
+	addq	%r12,%r10
+	addq	%r12,%rcx
+
+	leaq	24(%rbp),%rbp
+	movq	120(%rsp),%r13
+	movq	96(%rsp),%r15
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%rcx
+	movq	%r15,%r14
+	rorq	$42,%r15
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%r15
+	shrq	$6,%r14
+
+	rorq	$19,%r15
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	56(%rsp),%r12
+
+	addq	112(%rsp),%r12
+	movq	%r10,%r13
+	addq	%r15,%r12
+	movq	%rcx,%r14
+	rorq	$23,%r13
+	movq	%r11,%r15
+
+	xorq	%r10,%r13
+	rorq	$5,%r14
+	xorq	%rax,%r15
+
+	movq	%r12,112(%rsp)
+	xorq	%rcx,%r14
+	andq	%r10,%r15
+
+	rorq	$4,%r13
+	addq	%rbx,%r12
+	xorq	%rax,%r15
+
+	rorq	$6,%r14
+	xorq	%r10,%r13
+	addq	%r15,%r12
+
+	movq	%rcx,%r15
+	addq	(%rbp),%r12
+	xorq	%rcx,%r14
+
+	xorq	%rdx,%r15
+	rorq	$14,%r13
+	movq	%rdx,%rbx
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%rbx
+	addq	%r12,%r9
+	addq	%r12,%rbx
+
+	leaq	8(%rbp),%rbp
+	movq	0(%rsp),%r13
+	movq	104(%rsp),%rdi
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%rbx
+	movq	%rdi,%r14
+	rorq	$42,%rdi
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
+	shrq	$6,%r14
+
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	64(%rsp),%r12
+
+	addq	120(%rsp),%r12
+	movq	%r9,%r13
+	addq	%rdi,%r12
+	movq	%rbx,%r14
+	rorq	$23,%r13
+	movq	%r10,%rdi
+
+	xorq	%r9,%r13
+	rorq	$5,%r14
+	xorq	%r11,%rdi
+
+	movq	%r12,120(%rsp)
+	xorq	%rbx,%r14
+	andq	%r9,%rdi
+
+	rorq	$4,%r13
+	addq	%rax,%r12
+	xorq	%r11,%rdi
+
+	rorq	$6,%r14
+	xorq	%r9,%r13
+	addq	%rdi,%r12
+
+	movq	%rbx,%rdi
+	addq	(%rbp),%r12
+	xorq	%rbx,%r14
+
+	xorq	%rcx,%rdi
+	rorq	$14,%r13
+	movq	%rcx,%rax
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%rax
+	addq	%r12,%r8
+	addq	%r12,%rax
+
+	leaq	24(%rbp),%rbp
+	cmpb	$0,7(%rbp)
+	jnz	.Lrounds_16_xx
+
+	movq	128+0(%rsp),%rdi
+	addq	%r14,%rax
+	leaq	128(%rsi),%rsi
+
+	addq	0(%rdi),%rax
+	addq	8(%rdi),%rbx
+	addq	16(%rdi),%rcx
+	addq	24(%rdi),%rdx
+	addq	32(%rdi),%r8
+	addq	40(%rdi),%r9
+	addq	48(%rdi),%r10
+	addq	56(%rdi),%r11
+
+	cmpq	128+16(%rsp),%rsi
+
+	movq	%rax,0(%rdi)
+	movq	%rbx,8(%rdi)
+	movq	%rcx,16(%rdi)
+	movq	%rdx,24(%rdi)
+	movq	%r8,32(%rdi)
+	movq	%r9,40(%rdi)
+	movq	%r10,48(%rdi)
+	movq	%r11,56(%rdi)
+	jb	.Lloop
+
+	movq	152(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lepilogue:
+	ret
+.cfi_endproc	
+.size	sha512_block_data_order_nohw,.-sha512_block_data_order_nohw
+.section	.rodata
+.align	64
+.type	K512,@object
+K512:
+.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad	0x3956c25bf348b538,0x59f111f1b605d019
+.quad	0x3956c25bf348b538,0x59f111f1b605d019
+.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad	0xd807aa98a3030242,0x12835b0145706fbe
+.quad	0xd807aa98a3030242,0x12835b0145706fbe
+.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad	0x06ca6351e003826f,0x142929670a0e6e70
+.quad	0x06ca6351e003826f,0x142929670a0e6e70
+.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad	0x81c2c92e47edaee6,0x92722c851482353b
+.quad	0x81c2c92e47edaee6,0x92722c851482353b
+.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad	0xd192e819d6ef5218,0xd69906245565a910
+.quad	0xd192e819d6ef5218,0xd69906245565a910
+.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad	0x90befffa23631e28,0xa4506cebde82bde9
+.quad	0x90befffa23631e28,0xa4506cebde82bde9
+.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad	0xca273eceea26619c,0xd186b8c721c0c207
+.quad	0xca273eceea26619c,0xd186b8c721c0c207
+.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad	0x113f9804bef90dae,0x1b710b35131c471b
+.quad	0x113f9804bef90dae,0x1b710b35131c471b
+.quad	0x28db77f523047d84,0x32caab7b40c72493
+.quad	0x28db77f523047d84,0x32caab7b40c72493
+.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+.quad	0x0001020304050607,0x08090a0b0c0d0e0f
+.quad	0x0001020304050607,0x08090a0b0c0d0e0f
+.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.text	
+.globl	sha512_block_data_order_avx
+.hidden sha512_block_data_order_avx
+.type	sha512_block_data_order_avx,@function
+.align	64
+sha512_block_data_order_avx:
+.cfi_startproc	
+_CET_ENDBR
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+	shlq	$4,%rdx
+	subq	$160,%rsp
+	leaq	(%rsi,%rdx,8),%rdx
+	andq	$-64,%rsp
+	movq	%rdi,128+0(%rsp)
+	movq	%rsi,128+8(%rsp)
+	movq	%rdx,128+16(%rsp)
+	movq	%rax,152(%rsp)
+.cfi_escape	0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
+.Lprologue_avx:
+
+	vzeroupper
+	movq	0(%rdi),%rax
+	movq	8(%rdi),%rbx
+	movq	16(%rdi),%rcx
+	movq	24(%rdi),%rdx
+	movq	32(%rdi),%r8
+	movq	40(%rdi),%r9
+	movq	48(%rdi),%r10
+	movq	56(%rdi),%r11
+	jmp	.Lloop_avx
+.align	16
+.Lloop_avx:
+	vmovdqa	K512+1280(%rip),%xmm11
+	vmovdqu	0(%rsi),%xmm0
+	leaq	K512+128(%rip),%rbp
+	vmovdqu	16(%rsi),%xmm1
+	vmovdqu	32(%rsi),%xmm2
+	vpshufb	%xmm11,%xmm0,%xmm0
+	vmovdqu	48(%rsi),%xmm3
+	vpshufb	%xmm11,%xmm1,%xmm1
+	vmovdqu	64(%rsi),%xmm4
+	vpshufb	%xmm11,%xmm2,%xmm2
+	vmovdqu	80(%rsi),%xmm5
+	vpshufb	%xmm11,%xmm3,%xmm3
+	vmovdqu	96(%rsi),%xmm6
+	vpshufb	%xmm11,%xmm4,%xmm4
+	vmovdqu	112(%rsi),%xmm7
+	vpshufb	%xmm11,%xmm5,%xmm5
+	vpaddq	-128(%rbp),%xmm0,%xmm8
+	vpshufb	%xmm11,%xmm6,%xmm6
+	vpaddq	-96(%rbp),%xmm1,%xmm9
+	vpshufb	%xmm11,%xmm7,%xmm7
+	vpaddq	-64(%rbp),%xmm2,%xmm10
+	vpaddq	-32(%rbp),%xmm3,%xmm11
+	vmovdqa	%xmm8,0(%rsp)
+	vpaddq	0(%rbp),%xmm4,%xmm8
+	vmovdqa	%xmm9,16(%rsp)
+	vpaddq	32(%rbp),%xmm5,%xmm9
+	vmovdqa	%xmm10,32(%rsp)
+	vpaddq	64(%rbp),%xmm6,%xmm10
+	vmovdqa	%xmm11,48(%rsp)
+	vpaddq	96(%rbp),%xmm7,%xmm11
+	vmovdqa	%xmm8,64(%rsp)
+	movq	%rax,%r14
+	vmovdqa	%xmm9,80(%rsp)
+	movq	%rbx,%rdi
+	vmovdqa	%xmm10,96(%rsp)
+	xorq	%rcx,%rdi
+	vmovdqa	%xmm11,112(%rsp)
+	movq	%r8,%r13
+	jmp	.Lavx_00_47
+
+.align	16
+.Lavx_00_47:
+	addq	$256,%rbp
+	vpalignr	$8,%xmm0,%xmm1,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rax
+	vpalignr	$8,%xmm4,%xmm5,%xmm11
+	movq	%r9,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%r8,%r13
+	xorq	%r10,%r12
+	vpaddq	%xmm11,%xmm0,%xmm0
+	shrdq	$4,%r13,%r13
+	xorq	%rax,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%r8,%r12
+	xorq	%r8,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	0(%rsp),%r11
+	movq	%rax,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%r10,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%rbx,%r15
+	addq	%r12,%r11
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%rax,%r14
+	addq	%r13,%r11
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%rbx,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm7,%xmm11
+	addq	%r11,%rdx
+	addq	%rdi,%r11
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%rdx,%r13
+	addq	%r11,%r14
+	vpsllq	$3,%xmm7,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r11
+	vpaddq	%xmm8,%xmm0,%xmm0
+	movq	%r8,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm7,%xmm9
+	xorq	%rdx,%r13
+	xorq	%r9,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%r11,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%rdx,%r12
+	xorq	%rdx,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	8(%rsp),%r10
+	movq	%r11,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%r9,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%rax,%rdi
+	addq	%r12,%r10
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm0,%xmm0
+	xorq	%r11,%r14
+	addq	%r13,%r10
+	vpaddq	-128(%rbp),%xmm0,%xmm10
+	xorq	%rax,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r10,%rcx
+	addq	%r15,%r10
+	movq	%rcx,%r13
+	addq	%r10,%r14
+	vmovdqa	%xmm10,0(%rsp)
+	vpalignr	$8,%xmm1,%xmm2,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r10
+	vpalignr	$8,%xmm5,%xmm6,%xmm11
+	movq	%rdx,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%rcx,%r13
+	xorq	%r8,%r12
+	vpaddq	%xmm11,%xmm1,%xmm1
+	shrdq	$4,%r13,%r13
+	xorq	%r10,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%rcx,%r12
+	xorq	%rcx,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	16(%rsp),%r9
+	movq	%r10,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%r8,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%r11,%r15
+	addq	%r12,%r9
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%r10,%r14
+	addq	%r13,%r9
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%r11,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm0,%xmm11
+	addq	%r9,%rbx
+	addq	%rdi,%r9
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%rbx,%r13
+	addq	%r9,%r14
+	vpsllq	$3,%xmm0,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r9
+	vpaddq	%xmm8,%xmm1,%xmm1
+	movq	%rcx,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm0,%xmm9
+	xorq	%rbx,%r13
+	xorq	%rdx,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%r9,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%rbx,%r12
+	xorq	%rbx,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	24(%rsp),%r8
+	movq	%r9,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%rdx,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%r10,%rdi
+	addq	%r12,%r8
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm1,%xmm1
+	xorq	%r9,%r14
+	addq	%r13,%r8
+	vpaddq	-96(%rbp),%xmm1,%xmm10
+	xorq	%r10,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r8,%rax
+	addq	%r15,%r8
+	movq	%rax,%r13
+	addq	%r8,%r14
+	vmovdqa	%xmm10,16(%rsp)
+	vpalignr	$8,%xmm2,%xmm3,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r8
+	vpalignr	$8,%xmm6,%xmm7,%xmm11
+	movq	%rbx,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%rax,%r13
+	xorq	%rcx,%r12
+	vpaddq	%xmm11,%xmm2,%xmm2
+	shrdq	$4,%r13,%r13
+	xorq	%r8,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%rax,%r12
+	xorq	%rax,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	32(%rsp),%rdx
+	movq	%r8,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%rcx,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%r9,%r15
+	addq	%r12,%rdx
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%r8,%r14
+	addq	%r13,%rdx
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%r9,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm1,%xmm11
+	addq	%rdx,%r11
+	addq	%rdi,%rdx
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%r11,%r13
+	addq	%rdx,%r14
+	vpsllq	$3,%xmm1,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rdx
+	vpaddq	%xmm8,%xmm2,%xmm2
+	movq	%rax,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm1,%xmm9
+	xorq	%r11,%r13
+	xorq	%rbx,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%rdx,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%r11,%r12
+	xorq	%r11,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	40(%rsp),%rcx
+	movq	%rdx,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%rbx,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%r8,%rdi
+	addq	%r12,%rcx
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm2,%xmm2
+	xorq	%rdx,%r14
+	addq	%r13,%rcx
+	vpaddq	-64(%rbp),%xmm2,%xmm10
+	xorq	%r8,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rcx,%r10
+	addq	%r15,%rcx
+	movq	%r10,%r13
+	addq	%rcx,%r14
+	vmovdqa	%xmm10,32(%rsp)
+	vpalignr	$8,%xmm3,%xmm4,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rcx
+	vpalignr	$8,%xmm7,%xmm0,%xmm11
+	movq	%r11,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%r10,%r13
+	xorq	%rax,%r12
+	vpaddq	%xmm11,%xmm3,%xmm3
+	shrdq	$4,%r13,%r13
+	xorq	%rcx,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%r10,%r12
+	xorq	%r10,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	48(%rsp),%rbx
+	movq	%rcx,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%rax,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%rdx,%r15
+	addq	%r12,%rbx
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%rcx,%r14
+	addq	%r13,%rbx
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%rdx,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm2,%xmm11
+	addq	%rbx,%r9
+	addq	%rdi,%rbx
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%r9,%r13
+	addq	%rbx,%r14
+	vpsllq	$3,%xmm2,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rbx
+	vpaddq	%xmm8,%xmm3,%xmm3
+	movq	%r10,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm2,%xmm9
+	xorq	%r9,%r13
+	xorq	%r11,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%rbx,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%r9,%r12
+	xorq	%r9,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	56(%rsp),%rax
+	movq	%rbx,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%r11,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%rcx,%rdi
+	addq	%r12,%rax
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm3,%xmm3
+	xorq	%rbx,%r14
+	addq	%r13,%rax
+	vpaddq	-32(%rbp),%xmm3,%xmm10
+	xorq	%rcx,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rax,%r8
+	addq	%r15,%rax
+	movq	%r8,%r13
+	addq	%rax,%r14
+	vmovdqa	%xmm10,48(%rsp)
+	vpalignr	$8,%xmm4,%xmm5,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rax
+	vpalignr	$8,%xmm0,%xmm1,%xmm11
+	movq	%r9,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%r8,%r13
+	xorq	%r10,%r12
+	vpaddq	%xmm11,%xmm4,%xmm4
+	shrdq	$4,%r13,%r13
+	xorq	%rax,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%r8,%r12
+	xorq	%r8,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	64(%rsp),%r11
+	movq	%rax,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%r10,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%rbx,%r15
+	addq	%r12,%r11
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%rax,%r14
+	addq	%r13,%r11
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%rbx,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm3,%xmm11
+	addq	%r11,%rdx
+	addq	%rdi,%r11
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%rdx,%r13
+	addq	%r11,%r14
+	vpsllq	$3,%xmm3,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r11
+	vpaddq	%xmm8,%xmm4,%xmm4
+	movq	%r8,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm3,%xmm9
+	xorq	%rdx,%r13
+	xorq	%r9,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%r11,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%rdx,%r12
+	xorq	%rdx,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	72(%rsp),%r10
+	movq	%r11,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%r9,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%rax,%rdi
+	addq	%r12,%r10
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm4,%xmm4
+	xorq	%r11,%r14
+	addq	%r13,%r10
+	vpaddq	0(%rbp),%xmm4,%xmm10
+	xorq	%rax,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r10,%rcx
+	addq	%r15,%r10
+	movq	%rcx,%r13
+	addq	%r10,%r14
+	vmovdqa	%xmm10,64(%rsp)
+	vpalignr	$8,%xmm5,%xmm6,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r10
+	vpalignr	$8,%xmm1,%xmm2,%xmm11
+	movq	%rdx,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%rcx,%r13
+	xorq	%r8,%r12
+	vpaddq	%xmm11,%xmm5,%xmm5
+	shrdq	$4,%r13,%r13
+	xorq	%r10,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%rcx,%r12
+	xorq	%rcx,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	80(%rsp),%r9
+	movq	%r10,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%r8,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%r11,%r15
+	addq	%r12,%r9
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%r10,%r14
+	addq	%r13,%r9
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%r11,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm4,%xmm11
+	addq	%r9,%rbx
+	addq	%rdi,%r9
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%rbx,%r13
+	addq	%r9,%r14
+	vpsllq	$3,%xmm4,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r9
+	vpaddq	%xmm8,%xmm5,%xmm5
+	movq	%rcx,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm4,%xmm9
+	xorq	%rbx,%r13
+	xorq	%rdx,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%r9,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%rbx,%r12
+	xorq	%rbx,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	88(%rsp),%r8
+	movq	%r9,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%rdx,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%r10,%rdi
+	addq	%r12,%r8
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm5,%xmm5
+	xorq	%r9,%r14
+	addq	%r13,%r8
+	vpaddq	32(%rbp),%xmm5,%xmm10
+	xorq	%r10,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r8,%rax
+	addq	%r15,%r8
+	movq	%rax,%r13
+	addq	%r8,%r14
+	vmovdqa	%xmm10,80(%rsp)
+	vpalignr	$8,%xmm6,%xmm7,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r8
+	vpalignr	$8,%xmm2,%xmm3,%xmm11
+	movq	%rbx,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%rax,%r13
+	xorq	%rcx,%r12
+	vpaddq	%xmm11,%xmm6,%xmm6
+	shrdq	$4,%r13,%r13
+	xorq	%r8,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%rax,%r12
+	xorq	%rax,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	96(%rsp),%rdx
+	movq	%r8,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%rcx,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%r9,%r15
+	addq	%r12,%rdx
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%r8,%r14
+	addq	%r13,%rdx
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%r9,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm5,%xmm11
+	addq	%rdx,%r11
+	addq	%rdi,%rdx
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%r11,%r13
+	addq	%rdx,%r14
+	vpsllq	$3,%xmm5,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rdx
+	vpaddq	%xmm8,%xmm6,%xmm6
+	movq	%rax,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm5,%xmm9
+	xorq	%r11,%r13
+	xorq	%rbx,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%rdx,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%r11,%r12
+	xorq	%r11,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	104(%rsp),%rcx
+	movq	%rdx,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%rbx,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%r8,%rdi
+	addq	%r12,%rcx
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm6,%xmm6
+	xorq	%rdx,%r14
+	addq	%r13,%rcx
+	vpaddq	64(%rbp),%xmm6,%xmm10
+	xorq	%r8,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rcx,%r10
+	addq	%r15,%rcx
+	movq	%r10,%r13
+	addq	%rcx,%r14
+	vmovdqa	%xmm10,96(%rsp)
+	vpalignr	$8,%xmm7,%xmm0,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rcx
+	vpalignr	$8,%xmm3,%xmm4,%xmm11
+	movq	%r11,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%r10,%r13
+	xorq	%rax,%r12
+	vpaddq	%xmm11,%xmm7,%xmm7
+	shrdq	$4,%r13,%r13
+	xorq	%rcx,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%r10,%r12
+	xorq	%r10,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	112(%rsp),%rbx
+	movq	%rcx,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%rax,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%rdx,%r15
+	addq	%r12,%rbx
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%rcx,%r14
+	addq	%r13,%rbx
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%rdx,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm6,%xmm11
+	addq	%rbx,%r9
+	addq	%rdi,%rbx
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%r9,%r13
+	addq	%rbx,%r14
+	vpsllq	$3,%xmm6,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rbx
+	vpaddq	%xmm8,%xmm7,%xmm7
+	movq	%r10,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm6,%xmm9
+	xorq	%r9,%r13
+	xorq	%r11,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%rbx,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%r9,%r12
+	xorq	%r9,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	120(%rsp),%rax
+	movq	%rbx,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%r11,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%rcx,%rdi
+	addq	%r12,%rax
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm7,%xmm7
+	xorq	%rbx,%r14
+	addq	%r13,%rax
+	vpaddq	96(%rbp),%xmm7,%xmm10
+	xorq	%rcx,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rax,%r8
+	addq	%r15,%rax
+	movq	%r8,%r13
+	addq	%rax,%r14
+	vmovdqa	%xmm10,112(%rsp)
+	cmpb	$0,135(%rbp)
+	jne	.Lavx_00_47
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rax
+	movq	%r9,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r8,%r13
+	xorq	%r10,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rax,%r14
+	andq	%r8,%r12
+	xorq	%r8,%r13
+	addq	0(%rsp),%r11
+	movq	%rax,%r15
+	xorq	%r10,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rbx,%r15
+	addq	%r12,%r11
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%rax,%r14
+	addq	%r13,%r11
+	xorq	%rbx,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%r11,%rdx
+	addq	%rdi,%r11
+	movq	%rdx,%r13
+	addq	%r11,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r11
+	movq	%r8,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rdx,%r13
+	xorq	%r9,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r11,%r14
+	andq	%rdx,%r12
+	xorq	%rdx,%r13
+	addq	8(%rsp),%r10
+	movq	%r11,%rdi
+	xorq	%r9,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rax,%rdi
+	addq	%r12,%r10
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%r11,%r14
+	addq	%r13,%r10
+	xorq	%rax,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r10,%rcx
+	addq	%r15,%r10
+	movq	%rcx,%r13
+	addq	%r10,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r10
+	movq	%rdx,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rcx,%r13
+	xorq	%r8,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r10,%r14
+	andq	%rcx,%r12
+	xorq	%rcx,%r13
+	addq	16(%rsp),%r9
+	movq	%r10,%r15
+	xorq	%r8,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r11,%r15
+	addq	%r12,%r9
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%r10,%r14
+	addq	%r13,%r9
+	xorq	%r11,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%r9,%rbx
+	addq	%rdi,%r9
+	movq	%rbx,%r13
+	addq	%r9,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r9
+	movq	%rcx,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rbx,%r13
+	xorq	%rdx,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r9,%r14
+	andq	%rbx,%r12
+	xorq	%rbx,%r13
+	addq	24(%rsp),%r8
+	movq	%r9,%rdi
+	xorq	%rdx,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r10,%rdi
+	addq	%r12,%r8
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%r9,%r14
+	addq	%r13,%r8
+	xorq	%r10,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r8,%rax
+	addq	%r15,%r8
+	movq	%rax,%r13
+	addq	%r8,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r8
+	movq	%rbx,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rax,%r13
+	xorq	%rcx,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r8,%r14
+	andq	%rax,%r12
+	xorq	%rax,%r13
+	addq	32(%rsp),%rdx
+	movq	%r8,%r15
+	xorq	%rcx,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r9,%r15
+	addq	%r12,%rdx
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%r8,%r14
+	addq	%r13,%rdx
+	xorq	%r9,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%rdx,%r11
+	addq	%rdi,%rdx
+	movq	%r11,%r13
+	addq	%rdx,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rdx
+	movq	%rax,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r11,%r13
+	xorq	%rbx,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rdx,%r14
+	andq	%r11,%r12
+	xorq	%r11,%r13
+	addq	40(%rsp),%rcx
+	movq	%rdx,%rdi
+	xorq	%rbx,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r8,%rdi
+	addq	%r12,%rcx
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%rdx,%r14
+	addq	%r13,%rcx
+	xorq	%r8,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rcx,%r10
+	addq	%r15,%rcx
+	movq	%r10,%r13
+	addq	%rcx,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rcx
+	movq	%r11,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r10,%r13
+	xorq	%rax,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rcx,%r14
+	andq	%r10,%r12
+	xorq	%r10,%r13
+	addq	48(%rsp),%rbx
+	movq	%rcx,%r15
+	xorq	%rax,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rdx,%r15
+	addq	%r12,%rbx
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%rcx,%r14
+	addq	%r13,%rbx
+	xorq	%rdx,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%rbx,%r9
+	addq	%rdi,%rbx
+	movq	%r9,%r13
+	addq	%rbx,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rbx
+	movq	%r10,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r9,%r13
+	xorq	%r11,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rbx,%r14
+	andq	%r9,%r12
+	xorq	%r9,%r13
+	addq	56(%rsp),%rax
+	movq	%rbx,%rdi
+	xorq	%r11,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rcx,%rdi
+	addq	%r12,%rax
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%rbx,%r14
+	addq	%r13,%rax
+	xorq	%rcx,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rax,%r8
+	addq	%r15,%rax
+	movq	%r8,%r13
+	addq	%rax,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rax
+	movq	%r9,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r8,%r13
+	xorq	%r10,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rax,%r14
+	andq	%r8,%r12
+	xorq	%r8,%r13
+	addq	64(%rsp),%r11
+	movq	%rax,%r15
+	xorq	%r10,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rbx,%r15
+	addq	%r12,%r11
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%rax,%r14
+	addq	%r13,%r11
+	xorq	%rbx,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%r11,%rdx
+	addq	%rdi,%r11
+	movq	%rdx,%r13
+	addq	%r11,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r11
+	movq	%r8,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rdx,%r13
+	xorq	%r9,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r11,%r14
+	andq	%rdx,%r12
+	xorq	%rdx,%r13
+	addq	72(%rsp),%r10
+	movq	%r11,%rdi
+	xorq	%r9,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rax,%rdi
+	addq	%r12,%r10
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%r11,%r14
+	addq	%r13,%r10
+	xorq	%rax,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r10,%rcx
+	addq	%r15,%r10
+	movq	%rcx,%r13
+	addq	%r10,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r10
+	movq	%rdx,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rcx,%r13
+	xorq	%r8,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r10,%r14
+	andq	%rcx,%r12
+	xorq	%rcx,%r13
+	addq	80(%rsp),%r9
+	movq	%r10,%r15
+	xorq	%r8,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r11,%r15
+	addq	%r12,%r9
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%r10,%r14
+	addq	%r13,%r9
+	xorq	%r11,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%r9,%rbx
+	addq	%rdi,%r9
+	movq	%rbx,%r13
+	addq	%r9,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r9
+	movq	%rcx,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rbx,%r13
+	xorq	%rdx,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r9,%r14
+	andq	%rbx,%r12
+	xorq	%rbx,%r13
+	addq	88(%rsp),%r8
+	movq	%r9,%rdi
+	xorq	%rdx,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r10,%rdi
+	addq	%r12,%r8
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%r9,%r14
+	addq	%r13,%r8
+	xorq	%r10,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r8,%rax
+	addq	%r15,%r8
+	movq	%rax,%r13
+	addq	%r8,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r8
+	movq	%rbx,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rax,%r13
+	xorq	%rcx,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r8,%r14
+	andq	%rax,%r12
+	xorq	%rax,%r13
+	addq	96(%rsp),%rdx
+	movq	%r8,%r15
+	xorq	%rcx,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r9,%r15
+	addq	%r12,%rdx
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%r8,%r14
+	addq	%r13,%rdx
+	xorq	%r9,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%rdx,%r11
+	addq	%rdi,%rdx
+	movq	%r11,%r13
+	addq	%rdx,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rdx
+	movq	%rax,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r11,%r13
+	xorq	%rbx,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rdx,%r14
+	andq	%r11,%r12
+	xorq	%r11,%r13
+	addq	104(%rsp),%rcx
+	movq	%rdx,%rdi
+	xorq	%rbx,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r8,%rdi
+	addq	%r12,%rcx
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%rdx,%r14
+	addq	%r13,%rcx
+	xorq	%r8,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rcx,%r10
+	addq	%r15,%rcx
+	movq	%r10,%r13
+	addq	%rcx,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rcx
+	movq	%r11,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r10,%r13
+	xorq	%rax,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rcx,%r14
+	andq	%r10,%r12
+	xorq	%r10,%r13
+	addq	112(%rsp),%rbx
+	movq	%rcx,%r15
+	xorq	%rax,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rdx,%r15
+	addq	%r12,%rbx
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%rcx,%r14
+	addq	%r13,%rbx
+	xorq	%rdx,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%rbx,%r9
+	addq	%rdi,%rbx
+	movq	%r9,%r13
+	addq	%rbx,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rbx
+	movq	%r10,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r9,%r13
+	xorq	%r11,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rbx,%r14
+	andq	%r9,%r12
+	xorq	%r9,%r13
+	addq	120(%rsp),%rax
+	movq	%rbx,%rdi
+	xorq	%r11,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rcx,%rdi
+	addq	%r12,%rax
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%rbx,%r14
+	addq	%r13,%rax
+	xorq	%rcx,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rax,%r8
+	addq	%r15,%rax
+	movq	%r8,%r13
+	addq	%rax,%r14
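+	/* One block done: reload the context pointer saved in the prologue
+	 * and fold the working registers back into the hash state. The
+	 * stores below leave the flags from cmpq intact, so jb loops while
+	 * unprocessed input remains. */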
+	movq	128+0(%rsp),%rdi
+	movq	%r14,%rax
+
+	addq	0(%rdi),%rax
+	leaq	128(%rsi),%rsi
+	addq	8(%rdi),%rbx
+	addq	16(%rdi),%rcx
+	addq	24(%rdi),%rdx
+	addq	32(%rdi),%r8
+	addq	40(%rdi),%r9
+	addq	48(%rdi),%r10
+	addq	56(%rdi),%r11
+
+	cmpq	128+16(%rsp),%rsi
+
+	movq	%rax,0(%rdi)
+	movq	%rbx,8(%rdi)
+	movq	%rcx,16(%rdi)
+	movq	%rdx,24(%rdi)
+	movq	%r8,32(%rdi)
+	movq	%r9,40(%rdi)
+	movq	%r10,48(%rdi)
+	movq	%r11,56(%rdi)
+	jb	.Lloop_avx
+
+	movq	152(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	vzeroupper
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lepilogue_avx:
+	ret
+.cfi_endproc	
+.size	sha512_block_data_order_avx,.-sha512_block_data_order_avx
+#endif
diff --git a/gen/bcm/sha512-x86_64-win.asm b/gen/bcm/sha512-x86_64-win.asm
new file mode 100644
index 0000000..3b02e03
--- /dev/null
+++ b/gen/bcm/sha512-x86_64-win.asm
@@ -0,0 +1,3140 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section	.text code align=64
+
+
+global	sha512_block_data_order_nohw
+
+ALIGN	16
+sha512_block_data_order_nohw:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_sha512_block_data_order_nohw:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+_CET_ENDBR
+	mov	rax,rsp
+
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
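+	; rdx arrives as the number of 128-byte blocks; turn it into an
+	; end-of-input pointer. The 16-qword message schedule lives at rsp,
+	; with the ctx, input and end pointers plus the original rsp saved in
+	; the four qwords above it.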
+	shl	rdx,4
+	sub	rsp,16*8+4*8
+	lea	rdx,[rdx*8+rsi]
+	and	rsp,-64
+	mov	QWORD[((128+0))+rsp],rdi
+	mov	QWORD[((128+8))+rsp],rsi
+	mov	QWORD[((128+16))+rsp],rdx
+	mov	QWORD[152+rsp],rax
+
+$L$prologue:
+
+	mov	rax,QWORD[rdi]
+	mov	rbx,QWORD[8+rdi]
+	mov	rcx,QWORD[16+rdi]
+	mov	rdx,QWORD[24+rdi]
+	mov	r8,QWORD[32+rdi]
+	mov	r9,QWORD[40+rdi]
+	mov	r10,QWORD[48+rdi]
+	mov	r11,QWORD[56+rdi]
+	jmp	NEAR $L$loop
+
+ALIGN	16
+$L$loop:
+	mov	rdi,rbx
+	lea	rbp,[K512]
+	xor	rdi,rcx
+	mov	r12,QWORD[rsi]
+	mov	r13,r8
+	mov	r14,rax
+	bswap	r12
+	ror	r13,23
+	mov	r15,r9
+
+	xor	r13,r8
+	ror	r14,5
+	xor	r15,r10
+
+	mov	QWORD[rsp],r12
+	xor	r14,rax
+	and	r15,r8
+
+	ror	r13,4
+	add	r12,r11
+	xor	r15,r10
+
+	ror	r14,6
+	xor	r13,r8
+	add	r12,r15
+
+	mov	r15,rax
+	add	r12,QWORD[rbp]
+	xor	r14,rax
+
+	xor	r15,rbx
+	ror	r13,14
+	mov	r11,rbx
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	r11,rdi
+	add	rdx,r12
+	add	r11,r12
+
+	lea	rbp,[8+rbp]
+	add	r11,r14
+	mov	r12,QWORD[8+rsi]
+	mov	r13,rdx
+	mov	r14,r11
+	bswap	r12
+	ror	r13,23
+	mov	rdi,r8
+
+	xor	r13,rdx
+	ror	r14,5
+	xor	rdi,r9
+
+	mov	QWORD[8+rsp],r12
+	xor	r14,r11
+	and	rdi,rdx
+
+	ror	r13,4
+	add	r12,r10
+	xor	rdi,r9
+
+	ror	r14,6
+	xor	r13,rdx
+	add	r12,rdi
+
+	mov	rdi,r11
+	add	r12,QWORD[rbp]
+	xor	r14,r11
+
+	xor	rdi,rax
+	ror	r13,14
+	mov	r10,rax
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	r10,r15
+	add	rcx,r12
+	add	r10,r12
+
+	lea	rbp,[24+rbp]
+	add	r10,r14
+	mov	r12,QWORD[16+rsi]
+	mov	r13,rcx
+	mov	r14,r10
+	bswap	r12
+	ror	r13,23
+	mov	r15,rdx
+
+	xor	r13,rcx
+	ror	r14,5
+	xor	r15,r8
+
+	mov	QWORD[16+rsp],r12
+	xor	r14,r10
+	and	r15,rcx
+
+	ror	r13,4
+	add	r12,r9
+	xor	r15,r8
+
+	ror	r14,6
+	xor	r13,rcx
+	add	r12,r15
+
+	mov	r15,r10
+	add	r12,QWORD[rbp]
+	xor	r14,r10
+
+	xor	r15,r11
+	ror	r13,14
+	mov	r9,r11
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	r9,rdi
+	add	rbx,r12
+	add	r9,r12
+
+	lea	rbp,[8+rbp]
+	add	r9,r14
+	mov	r12,QWORD[24+rsi]
+	mov	r13,rbx
+	mov	r14,r9
+	bswap	r12
+	ror	r13,23
+	mov	rdi,rcx
+
+	xor	r13,rbx
+	ror	r14,5
+	xor	rdi,rdx
+
+	mov	QWORD[24+rsp],r12
+	xor	r14,r9
+	and	rdi,rbx
+
+	ror	r13,4
+	add	r12,r8
+	xor	rdi,rdx
+
+	ror	r14,6
+	xor	r13,rbx
+	add	r12,rdi
+
+	mov	rdi,r9
+	add	r12,QWORD[rbp]
+	xor	r14,r9
+
+	xor	rdi,r10
+	ror	r13,14
+	mov	r8,r10
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	r8,r15
+	add	rax,r12
+	add	r8,r12
+
+	lea	rbp,[24+rbp]
+	add	r8,r14
+	mov	r12,QWORD[32+rsi]
+	mov	r13,rax
+	mov	r14,r8
+	bswap	r12
+	ror	r13,23
+	mov	r15,rbx
+
+	xor	r13,rax
+	ror	r14,5
+	xor	r15,rcx
+
+	mov	QWORD[32+rsp],r12
+	xor	r14,r8
+	and	r15,rax
+
+	ror	r13,4
+	add	r12,rdx
+	xor	r15,rcx
+
+	ror	r14,6
+	xor	r13,rax
+	add	r12,r15
+
+	mov	r15,r8
+	add	r12,QWORD[rbp]
+	xor	r14,r8
+
+	xor	r15,r9
+	ror	r13,14
+	mov	rdx,r9
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	rdx,rdi
+	add	r11,r12
+	add	rdx,r12
+
+	lea	rbp,[8+rbp]
+	add	rdx,r14
+	mov	r12,QWORD[40+rsi]
+	mov	r13,r11
+	mov	r14,rdx
+	bswap	r12
+	ror	r13,23
+	mov	rdi,rax
+
+	xor	r13,r11
+	ror	r14,5
+	xor	rdi,rbx
+
+	mov	QWORD[40+rsp],r12
+	xor	r14,rdx
+	and	rdi,r11
+
+	ror	r13,4
+	add	r12,rcx
+	xor	rdi,rbx
+
+	ror	r14,6
+	xor	r13,r11
+	add	r12,rdi
+
+	mov	rdi,rdx
+	add	r12,QWORD[rbp]
+	xor	r14,rdx
+
+	xor	rdi,r8
+	ror	r13,14
+	mov	rcx,r8
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	rcx,r15
+	add	r10,r12
+	add	rcx,r12
+
+	lea	rbp,[24+rbp]
+	add	rcx,r14
+	mov	r12,QWORD[48+rsi]
+	mov	r13,r10
+	mov	r14,rcx
+	bswap	r12
+	ror	r13,23
+	mov	r15,r11
+
+	xor	r13,r10
+	ror	r14,5
+	xor	r15,rax
+
+	mov	QWORD[48+rsp],r12
+	xor	r14,rcx
+	and	r15,r10
+
+	ror	r13,4
+	add	r12,rbx
+	xor	r15,rax
+
+	ror	r14,6
+	xor	r13,r10
+	add	r12,r15
+
+	mov	r15,rcx
+	add	r12,QWORD[rbp]
+	xor	r14,rcx
+
+	xor	r15,rdx
+	ror	r13,14
+	mov	rbx,rdx
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	rbx,rdi
+	add	r9,r12
+	add	rbx,r12
+
+	lea	rbp,[8+rbp]
+	add	rbx,r14
+	mov	r12,QWORD[56+rsi]
+	mov	r13,r9
+	mov	r14,rbx
+	bswap	r12
+	ror	r13,23
+	mov	rdi,r10
+
+	xor	r13,r9
+	ror	r14,5
+	xor	rdi,r11
+
+	mov	QWORD[56+rsp],r12
+	xor	r14,rbx
+	and	rdi,r9
+
+	ror	r13,4
+	add	r12,rax
+	xor	rdi,r11
+
+	ror	r14,6
+	xor	r13,r9
+	add	r12,rdi
+
+	mov	rdi,rbx
+	add	r12,QWORD[rbp]
+	xor	r14,rbx
+
+	xor	rdi,rcx
+	ror	r13,14
+	mov	rax,rcx
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	rax,r15
+	add	r8,r12
+	add	rax,r12
+
+	lea	rbp,[24+rbp]
+	add	rax,r14
+	mov	r12,QWORD[64+rsi]
+	mov	r13,r8
+	mov	r14,rax
+	bswap	r12
+	ror	r13,23
+	mov	r15,r9
+
+	xor	r13,r8
+	ror	r14,5
+	xor	r15,r10
+
+	mov	QWORD[64+rsp],r12
+	xor	r14,rax
+	and	r15,r8
+
+	ror	r13,4
+	add	r12,r11
+	xor	r15,r10
+
+	ror	r14,6
+	xor	r13,r8
+	add	r12,r15
+
+	mov	r15,rax
+	add	r12,QWORD[rbp]
+	xor	r14,rax
+
+	xor	r15,rbx
+	ror	r13,14
+	mov	r11,rbx
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	r11,rdi
+	add	rdx,r12
+	add	r11,r12
+
+	lea	rbp,[8+rbp]
+	add	r11,r14
+	mov	r12,QWORD[72+rsi]
+	mov	r13,rdx
+	mov	r14,r11
+	bswap	r12
+	ror	r13,23
+	mov	rdi,r8
+
+	xor	r13,rdx
+	ror	r14,5
+	xor	rdi,r9
+
+	mov	QWORD[72+rsp],r12
+	xor	r14,r11
+	and	rdi,rdx
+
+	ror	r13,4
+	add	r12,r10
+	xor	rdi,r9
+
+	ror	r14,6
+	xor	r13,rdx
+	add	r12,rdi
+
+	mov	rdi,r11
+	add	r12,QWORD[rbp]
+	xor	r14,r11
+
+	xor	rdi,rax
+	ror	r13,14
+	mov	r10,rax
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	r10,r15
+	add	rcx,r12
+	add	r10,r12
+
+	lea	rbp,[24+rbp]
+	add	r10,r14
+	mov	r12,QWORD[80+rsi]
+	mov	r13,rcx
+	mov	r14,r10
+	bswap	r12
+	ror	r13,23
+	mov	r15,rdx
+
+	xor	r13,rcx
+	ror	r14,5
+	xor	r15,r8
+
+	mov	QWORD[80+rsp],r12
+	xor	r14,r10
+	and	r15,rcx
+
+	ror	r13,4
+	add	r12,r9
+	xor	r15,r8
+
+	ror	r14,6
+	xor	r13,rcx
+	add	r12,r15
+
+	mov	r15,r10
+	add	r12,QWORD[rbp]
+	xor	r14,r10
+
+	xor	r15,r11
+	ror	r13,14
+	mov	r9,r11
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	r9,rdi
+	add	rbx,r12
+	add	r9,r12
+
+	lea	rbp,[8+rbp]
+	add	r9,r14
+	mov	r12,QWORD[88+rsi]
+	mov	r13,rbx
+	mov	r14,r9
+	bswap	r12
+	ror	r13,23
+	mov	rdi,rcx
+
+	xor	r13,rbx
+	ror	r14,5
+	xor	rdi,rdx
+
+	mov	QWORD[88+rsp],r12
+	xor	r14,r9
+	and	rdi,rbx
+
+	ror	r13,4
+	add	r12,r8
+	xor	rdi,rdx
+
+	ror	r14,6
+	xor	r13,rbx
+	add	r12,rdi
+
+	mov	rdi,r9
+	add	r12,QWORD[rbp]
+	xor	r14,r9
+
+	xor	rdi,r10
+	ror	r13,14
+	mov	r8,r10
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	r8,r15
+	add	rax,r12
+	add	r8,r12
+
+	lea	rbp,[24+rbp]
+	add	r8,r14
+	mov	r12,QWORD[96+rsi]
+	mov	r13,rax
+	mov	r14,r8
+	bswap	r12
+	ror	r13,23
+	mov	r15,rbx
+
+	xor	r13,rax
+	ror	r14,5
+	xor	r15,rcx
+
+	mov	QWORD[96+rsp],r12
+	xor	r14,r8
+	and	r15,rax
+
+	ror	r13,4
+	add	r12,rdx
+	xor	r15,rcx
+
+	ror	r14,6
+	xor	r13,rax
+	add	r12,r15
+
+	mov	r15,r8
+	add	r12,QWORD[rbp]
+	xor	r14,r8
+
+	xor	r15,r9
+	ror	r13,14
+	mov	rdx,r9
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	rdx,rdi
+	add	r11,r12
+	add	rdx,r12
+
+	lea	rbp,[8+rbp]
+	add	rdx,r14
+	mov	r12,QWORD[104+rsi]
+	mov	r13,r11
+	mov	r14,rdx
+	bswap	r12
+	ror	r13,23
+	mov	rdi,rax
+
+	xor	r13,r11
+	ror	r14,5
+	xor	rdi,rbx
+
+	mov	QWORD[104+rsp],r12
+	xor	r14,rdx
+	and	rdi,r11
+
+	ror	r13,4
+	add	r12,rcx
+	xor	rdi,rbx
+
+	ror	r14,6
+	xor	r13,r11
+	add	r12,rdi
+
+	mov	rdi,rdx
+	add	r12,QWORD[rbp]
+	xor	r14,rdx
+
+	xor	rdi,r8
+	ror	r13,14
+	mov	rcx,r8
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	rcx,r15
+	add	r10,r12
+	add	rcx,r12
+
+	lea	rbp,[24+rbp]
+	add	rcx,r14
+	mov	r12,QWORD[112+rsi]
+	mov	r13,r10
+	mov	r14,rcx
+	bswap	r12
+	ror	r13,23
+	mov	r15,r11
+
+	xor	r13,r10
+	ror	r14,5
+	xor	r15,rax
+
+	mov	QWORD[112+rsp],r12
+	xor	r14,rcx
+	and	r15,r10
+
+	ror	r13,4
+	add	r12,rbx
+	xor	r15,rax
+
+	ror	r14,6
+	xor	r13,r10
+	add	r12,r15
+
+	mov	r15,rcx
+	add	r12,QWORD[rbp]
+	xor	r14,rcx
+
+	xor	r15,rdx
+	ror	r13,14
+	mov	rbx,rdx
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	rbx,rdi
+	add	r9,r12
+	add	rbx,r12
+
+	lea	rbp,[8+rbp]
+	add	rbx,r14
+	mov	r12,QWORD[120+rsi]
+	mov	r13,r9
+	mov	r14,rbx
+	bswap	r12
+	ror	r13,23
+	mov	rdi,r10
+
+	xor	r13,r9
+	ror	r14,5
+	xor	rdi,r11
+
+	mov	QWORD[120+rsp],r12
+	xor	r14,rbx
+	and	rdi,r9
+
+	ror	r13,4
+	add	r12,rax
+	xor	rdi,r11
+
+	ror	r14,6
+	xor	r13,r9
+	add	r12,rdi
+
+	mov	rdi,rbx
+	add	r12,QWORD[rbp]
+	xor	r14,rbx
+
+	xor	rdi,rcx
+	ror	r13,14
+	mov	rax,rcx
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	rax,r15
+	add	r8,r12
+	add	rax,r12
+
+	lea	rbp,[24+rbp]
+	jmp	NEAR $L$rounds_16_xx
+ALIGN	16
+$L$rounds_16_xx:
+	mov	r13,QWORD[8+rsp]
+	mov	r15,QWORD[112+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	rax,r14
+	mov	r14,r15
+	ror	r15,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	r15,r14
+	shr	r14,6
+
+	ror	r15,19
+	xor	r12,r13
+	xor	r15,r14
+	add	r12,QWORD[72+rsp]
+
+	add	r12,QWORD[rsp]
+	mov	r13,r8
+	add	r12,r15
+	mov	r14,rax
+	ror	r13,23
+	mov	r15,r9
+
+	xor	r13,r8
+	ror	r14,5
+	xor	r15,r10
+
+	mov	QWORD[rsp],r12
+	xor	r14,rax
+	and	r15,r8
+
+	ror	r13,4
+	add	r12,r11
+	xor	r15,r10
+
+	ror	r14,6
+	xor	r13,r8
+	add	r12,r15
+
+	mov	r15,rax
+	add	r12,QWORD[rbp]
+	xor	r14,rax
+
+	xor	r15,rbx
+	ror	r13,14
+	mov	r11,rbx
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	r11,rdi
+	add	rdx,r12
+	add	r11,r12
+
+	lea	rbp,[8+rbp]
+	mov	r13,QWORD[16+rsp]
+	mov	rdi,QWORD[120+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	r11,r14
+	mov	r14,rdi
+	ror	rdi,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	rdi,r14
+	shr	r14,6
+
+	ror	rdi,19
+	xor	r12,r13
+	xor	rdi,r14
+	add	r12,QWORD[80+rsp]
+
+	add	r12,QWORD[8+rsp]
+	mov	r13,rdx
+	add	r12,rdi
+	mov	r14,r11
+	ror	r13,23
+	mov	rdi,r8
+
+	xor	r13,rdx
+	ror	r14,5
+	xor	rdi,r9
+
+	mov	QWORD[8+rsp],r12
+	xor	r14,r11
+	and	rdi,rdx
+
+	ror	r13,4
+	add	r12,r10
+	xor	rdi,r9
+
+	ror	r14,6
+	xor	r13,rdx
+	add	r12,rdi
+
+	mov	rdi,r11
+	add	r12,QWORD[rbp]
+	xor	r14,r11
+
+	xor	rdi,rax
+	ror	r13,14
+	mov	r10,rax
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	r10,r15
+	add	rcx,r12
+	add	r10,r12
+
+	lea	rbp,[24+rbp]
+	mov	r13,QWORD[24+rsp]
+	mov	r15,QWORD[rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	r10,r14
+	mov	r14,r15
+	ror	r15,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	r15,r14
+	shr	r14,6
+
+	ror	r15,19
+	xor	r12,r13
+	xor	r15,r14
+	add	r12,QWORD[88+rsp]
+
+	add	r12,QWORD[16+rsp]
+	mov	r13,rcx
+	add	r12,r15
+	mov	r14,r10
+	ror	r13,23
+	mov	r15,rdx
+
+	xor	r13,rcx
+	ror	r14,5
+	xor	r15,r8
+
+	mov	QWORD[16+rsp],r12
+	xor	r14,r10
+	and	r15,rcx
+
+	ror	r13,4
+	add	r12,r9
+	xor	r15,r8
+
+	ror	r14,6
+	xor	r13,rcx
+	add	r12,r15
+
+	mov	r15,r10
+	add	r12,QWORD[rbp]
+	xor	r14,r10
+
+	xor	r15,r11
+	ror	r13,14
+	mov	r9,r11
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	r9,rdi
+	add	rbx,r12
+	add	r9,r12
+
+	lea	rbp,[8+rbp]
+	mov	r13,QWORD[32+rsp]
+	mov	rdi,QWORD[8+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	r9,r14
+	mov	r14,rdi
+	ror	rdi,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	rdi,r14
+	shr	r14,6
+
+	ror	rdi,19
+	xor	r12,r13
+	xor	rdi,r14
+	add	r12,QWORD[96+rsp]
+
+	add	r12,QWORD[24+rsp]
+	mov	r13,rbx
+	add	r12,rdi
+	mov	r14,r9
+	ror	r13,23
+	mov	rdi,rcx
+
+	xor	r13,rbx
+	ror	r14,5
+	xor	rdi,rdx
+
+	mov	QWORD[24+rsp],r12
+	xor	r14,r9
+	and	rdi,rbx
+
+	ror	r13,4
+	add	r12,r8
+	xor	rdi,rdx
+
+	ror	r14,6
+	xor	r13,rbx
+	add	r12,rdi
+
+	mov	rdi,r9
+	add	r12,QWORD[rbp]
+	xor	r14,r9
+
+	xor	rdi,r10
+	ror	r13,14
+	mov	r8,r10
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	r8,r15
+	add	rax,r12
+	add	r8,r12
+
+	lea	rbp,[24+rbp]
+	mov	r13,QWORD[40+rsp]
+	mov	r15,QWORD[16+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	r8,r14
+	mov	r14,r15
+	ror	r15,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	r15,r14
+	shr	r14,6
+
+	ror	r15,19
+	xor	r12,r13
+	xor	r15,r14
+	add	r12,QWORD[104+rsp]
+
+	add	r12,QWORD[32+rsp]
+	mov	r13,rax
+	add	r12,r15
+	mov	r14,r8
+	ror	r13,23
+	mov	r15,rbx
+
+	xor	r13,rax
+	ror	r14,5
+	xor	r15,rcx
+
+	mov	QWORD[32+rsp],r12
+	xor	r14,r8
+	and	r15,rax
+
+	ror	r13,4
+	add	r12,rdx
+	xor	r15,rcx
+
+	ror	r14,6
+	xor	r13,rax
+	add	r12,r15
+
+	mov	r15,r8
+	add	r12,QWORD[rbp]
+	xor	r14,r8
+
+	xor	r15,r9
+	ror	r13,14
+	mov	rdx,r9
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	rdx,rdi
+	add	r11,r12
+	add	rdx,r12
+
+	lea	rbp,[8+rbp]
+	mov	r13,QWORD[48+rsp]
+	mov	rdi,QWORD[24+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	rdx,r14
+	mov	r14,rdi
+	ror	rdi,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	rdi,r14
+	shr	r14,6
+
+	ror	rdi,19
+	xor	r12,r13
+	xor	rdi,r14
+	add	r12,QWORD[112+rsp]
+
+	add	r12,QWORD[40+rsp]
+	mov	r13,r11
+	add	r12,rdi
+	mov	r14,rdx
+	ror	r13,23
+	mov	rdi,rax
+
+	xor	r13,r11
+	ror	r14,5
+	xor	rdi,rbx
+
+	mov	QWORD[40+rsp],r12
+	xor	r14,rdx
+	and	rdi,r11
+
+	ror	r13,4
+	add	r12,rcx
+	xor	rdi,rbx
+
+	ror	r14,6
+	xor	r13,r11
+	add	r12,rdi
+
+	mov	rdi,rdx
+	add	r12,QWORD[rbp]
+	xor	r14,rdx
+
+	xor	rdi,r8
+	ror	r13,14
+	mov	rcx,r8
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	rcx,r15
+	add	r10,r12
+	add	rcx,r12
+
+	lea	rbp,[24+rbp]
+	mov	r13,QWORD[56+rsp]
+	mov	r15,QWORD[32+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	rcx,r14
+	mov	r14,r15
+	ror	r15,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	r15,r14
+	shr	r14,6
+
+	ror	r15,19
+	xor	r12,r13
+	xor	r15,r14
+	add	r12,QWORD[120+rsp]
+
+	add	r12,QWORD[48+rsp]
+	mov	r13,r10
+	add	r12,r15
+	mov	r14,rcx
+	ror	r13,23
+	mov	r15,r11
+
+	xor	r13,r10
+	ror	r14,5
+	xor	r15,rax
+
+	mov	QWORD[48+rsp],r12
+	xor	r14,rcx
+	and	r15,r10
+
+	ror	r13,4
+	add	r12,rbx
+	xor	r15,rax
+
+	ror	r14,6
+	xor	r13,r10
+	add	r12,r15
+
+	mov	r15,rcx
+	add	r12,QWORD[rbp]
+	xor	r14,rcx
+
+	xor	r15,rdx
+	ror	r13,14
+	mov	rbx,rdx
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	rbx,rdi
+	add	r9,r12
+	add	rbx,r12
+
+	lea	rbp,[8+rbp]
+	mov	r13,QWORD[64+rsp]
+	mov	rdi,QWORD[40+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	rbx,r14
+	mov	r14,rdi
+	ror	rdi,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	rdi,r14
+	shr	r14,6
+
+	ror	rdi,19
+	xor	r12,r13
+	xor	rdi,r14
+	add	r12,QWORD[rsp]
+
+	add	r12,QWORD[56+rsp]
+	mov	r13,r9
+	add	r12,rdi
+	mov	r14,rbx
+	ror	r13,23
+	mov	rdi,r10
+
+	xor	r13,r9
+	ror	r14,5
+	xor	rdi,r11
+
+	mov	QWORD[56+rsp],r12
+	xor	r14,rbx
+	and	rdi,r9
+
+	ror	r13,4
+	add	r12,rax
+	xor	rdi,r11
+
+	ror	r14,6
+	xor	r13,r9
+	add	r12,rdi
+
+	mov	rdi,rbx
+	add	r12,QWORD[rbp]
+	xor	r14,rbx
+
+	xor	rdi,rcx
+	ror	r13,14
+	mov	rax,rcx
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	rax,r15
+	add	r8,r12
+	add	rax,r12
+
+	lea	rbp,[24+rbp]
+	mov	r13,QWORD[72+rsp]
+	mov	r15,QWORD[48+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	rax,r14
+	mov	r14,r15
+	ror	r15,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	r15,r14
+	shr	r14,6
+
+	ror	r15,19
+	xor	r12,r13
+	xor	r15,r14
+	add	r12,QWORD[8+rsp]
+
+	add	r12,QWORD[64+rsp]
+	mov	r13,r8
+	add	r12,r15
+	mov	r14,rax
+	ror	r13,23
+	mov	r15,r9
+
+	xor	r13,r8
+	ror	r14,5
+	xor	r15,r10
+
+	mov	QWORD[64+rsp],r12
+	xor	r14,rax
+	and	r15,r8
+
+	ror	r13,4
+	add	r12,r11
+	xor	r15,r10
+
+	ror	r14,6
+	xor	r13,r8
+	add	r12,r15
+
+	mov	r15,rax
+	add	r12,QWORD[rbp]
+	xor	r14,rax
+
+	xor	r15,rbx
+	ror	r13,14
+	mov	r11,rbx
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	r11,rdi
+	add	rdx,r12
+	add	r11,r12
+
+	lea	rbp,[8+rbp]
+	mov	r13,QWORD[80+rsp]
+	mov	rdi,QWORD[56+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	r11,r14
+	mov	r14,rdi
+	ror	rdi,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	rdi,r14
+	shr	r14,6
+
+	ror	rdi,19
+	xor	r12,r13
+	xor	rdi,r14
+	add	r12,QWORD[16+rsp]
+
+	add	r12,QWORD[72+rsp]
+	mov	r13,rdx
+	add	r12,rdi
+	mov	r14,r11
+	ror	r13,23
+	mov	rdi,r8
+
+	xor	r13,rdx
+	ror	r14,5
+	xor	rdi,r9
+
+	mov	QWORD[72+rsp],r12
+	xor	r14,r11
+	and	rdi,rdx
+
+	ror	r13,4
+	add	r12,r10
+	xor	rdi,r9
+
+	ror	r14,6
+	xor	r13,rdx
+	add	r12,rdi
+
+	mov	rdi,r11
+	add	r12,QWORD[rbp]
+	xor	r14,r11
+
+	xor	rdi,rax
+	ror	r13,14
+	mov	r10,rax
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	r10,r15
+	add	rcx,r12
+	add	r10,r12
+
+	lea	rbp,[24+rbp]
+	mov	r13,QWORD[88+rsp]
+	mov	r15,QWORD[64+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	r10,r14
+	mov	r14,r15
+	ror	r15,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	r15,r14
+	shr	r14,6
+
+	ror	r15,19
+	xor	r12,r13
+	xor	r15,r14
+	add	r12,QWORD[24+rsp]
+
+	add	r12,QWORD[80+rsp]
+	mov	r13,rcx
+	add	r12,r15
+	mov	r14,r10
+	ror	r13,23
+	mov	r15,rdx
+
+	xor	r13,rcx
+	ror	r14,5
+	xor	r15,r8
+
+	mov	QWORD[80+rsp],r12
+	xor	r14,r10
+	and	r15,rcx
+
+	ror	r13,4
+	add	r12,r9
+	xor	r15,r8
+
+	ror	r14,6
+	xor	r13,rcx
+	add	r12,r15
+
+	mov	r15,r10
+	add	r12,QWORD[rbp]
+	xor	r14,r10
+
+	xor	r15,r11
+	ror	r13,14
+	mov	r9,r11
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	r9,rdi
+	add	rbx,r12
+	add	r9,r12
+
+	lea	rbp,[8+rbp]
+	mov	r13,QWORD[96+rsp]
+	mov	rdi,QWORD[72+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	r9,r14
+	mov	r14,rdi
+	ror	rdi,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	rdi,r14
+	shr	r14,6
+
+	ror	rdi,19
+	xor	r12,r13
+	xor	rdi,r14
+	add	r12,QWORD[32+rsp]
+
+	add	r12,QWORD[88+rsp]
+	mov	r13,rbx
+	add	r12,rdi
+	mov	r14,r9
+	ror	r13,23
+	mov	rdi,rcx
+
+	xor	r13,rbx
+	ror	r14,5
+	xor	rdi,rdx
+
+	mov	QWORD[88+rsp],r12
+	xor	r14,r9
+	and	rdi,rbx
+
+	ror	r13,4
+	add	r12,r8
+	xor	rdi,rdx
+
+	ror	r14,6
+	xor	r13,rbx
+	add	r12,rdi
+
+	mov	rdi,r9
+	add	r12,QWORD[rbp]
+	xor	r14,r9
+
+	xor	rdi,r10
+	ror	r13,14
+	mov	r8,r10
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	r8,r15
+	add	rax,r12
+	add	r8,r12
+
+	lea	rbp,[24+rbp]
+	mov	r13,QWORD[104+rsp]
+	mov	r15,QWORD[80+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	r8,r14
+	mov	r14,r15
+	ror	r15,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	r15,r14
+	shr	r14,6
+
+	ror	r15,19
+	xor	r12,r13
+	xor	r15,r14
+	add	r12,QWORD[40+rsp]
+
+	add	r12,QWORD[96+rsp]
+	mov	r13,rax
+	add	r12,r15
+	mov	r14,r8
+	ror	r13,23
+	mov	r15,rbx
+
+	xor	r13,rax
+	ror	r14,5
+	xor	r15,rcx
+
+	mov	QWORD[96+rsp],r12
+	xor	r14,r8
+	and	r15,rax
+
+	ror	r13,4
+	add	r12,rdx
+	xor	r15,rcx
+
+	ror	r14,6
+	xor	r13,rax
+	add	r12,r15
+
+	mov	r15,r8
+	add	r12,QWORD[rbp]
+	xor	r14,r8
+
+	xor	r15,r9
+	ror	r13,14
+	mov	rdx,r9
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	rdx,rdi
+	add	r11,r12
+	add	rdx,r12
+
+	lea	rbp,[8+rbp]
+	mov	r13,QWORD[112+rsp]
+	mov	rdi,QWORD[88+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	rdx,r14
+	mov	r14,rdi
+	ror	rdi,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	rdi,r14
+	shr	r14,6
+
+	ror	rdi,19
+	xor	r12,r13
+	xor	rdi,r14
+	add	r12,QWORD[48+rsp]
+
+	add	r12,QWORD[104+rsp]
+	mov	r13,r11
+	add	r12,rdi
+	mov	r14,rdx
+	ror	r13,23
+	mov	rdi,rax
+
+	xor	r13,r11
+	ror	r14,5
+	xor	rdi,rbx
+
+	mov	QWORD[104+rsp],r12
+	xor	r14,rdx
+	and	rdi,r11
+
+	ror	r13,4
+	add	r12,rcx
+	xor	rdi,rbx
+
+	ror	r14,6
+	xor	r13,r11
+	add	r12,rdi
+
+	mov	rdi,rdx
+	add	r12,QWORD[rbp]
+	xor	r14,rdx
+
+	xor	rdi,r8
+	ror	r13,14
+	mov	rcx,r8
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	rcx,r15
+	add	r10,r12
+	add	rcx,r12
+
+	lea	rbp,[24+rbp]
+	mov	r13,QWORD[120+rsp]
+	mov	r15,QWORD[96+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	rcx,r14
+	mov	r14,r15
+	ror	r15,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	r15,r14
+	shr	r14,6
+
+	ror	r15,19
+	xor	r12,r13
+	xor	r15,r14
+	add	r12,QWORD[56+rsp]
+
+	add	r12,QWORD[112+rsp]
+	mov	r13,r10
+	add	r12,r15
+	mov	r14,rcx
+	ror	r13,23
+	mov	r15,r11
+
+	xor	r13,r10
+	ror	r14,5
+	xor	r15,rax
+
+	mov	QWORD[112+rsp],r12
+	xor	r14,rcx
+	and	r15,r10
+
+	ror	r13,4
+	add	r12,rbx
+	xor	r15,rax
+
+	ror	r14,6
+	xor	r13,r10
+	add	r12,r15
+
+	mov	r15,rcx
+	add	r12,QWORD[rbp]
+	xor	r14,rcx
+
+	xor	r15,rdx
+	ror	r13,14
+	mov	rbx,rdx
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	rbx,rdi
+	add	r9,r12
+	add	rbx,r12
+
+	lea	rbp,[8+rbp]
+	mov	r13,QWORD[rsp]
+	mov	rdi,QWORD[104+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	rbx,r14
+	mov	r14,rdi
+	ror	rdi,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	rdi,r14
+	shr	r14,6
+
+	ror	rdi,19
+	xor	r12,r13
+	xor	rdi,r14
+	add	r12,QWORD[64+rsp]
+
+	add	r12,QWORD[120+rsp]
+	mov	r13,r9
+	add	r12,rdi
+	mov	r14,rbx
+	ror	r13,23
+	mov	rdi,r10
+
+	xor	r13,r9
+	ror	r14,5
+	xor	rdi,r11
+
+	mov	QWORD[120+rsp],r12
+	xor	r14,rbx
+	and	rdi,r9
+
+	ror	r13,4
+	add	r12,rax
+	xor	rdi,r11
+
+	ror	r14,6
+	xor	r13,r9
+	add	r12,rdi
+
+	mov	rdi,rbx
+	add	r12,QWORD[rbp]
+	xor	r14,rbx
+
+	xor	rdi,rcx
+	ror	r13,14
+	mov	rax,rcx
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	rax,r15
+	add	r8,r12
+	add	rax,r12
+
+	lea	rbp,[24+rbp]
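+	; The top byte of every K512 constant is nonzero, while the byte-swap
+	; mask stored after the table begins with 0x00, so this loop exits
+	; exactly once rbp has consumed all 80 round constants.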
+	cmp	BYTE[7+rbp],0
+	jnz	NEAR $L$rounds_16_xx
+
+	mov	rdi,QWORD[((128+0))+rsp]
+	add	rax,r14
+	lea	rsi,[128+rsi]
+
+	add	rax,QWORD[rdi]
+	add	rbx,QWORD[8+rdi]
+	add	rcx,QWORD[16+rdi]
+	add	rdx,QWORD[24+rdi]
+	add	r8,QWORD[32+rdi]
+	add	r9,QWORD[40+rdi]
+	add	r10,QWORD[48+rdi]
+	add	r11,QWORD[56+rdi]
+
+	cmp	rsi,QWORD[((128+16))+rsp]
+
+	mov	QWORD[rdi],rax
+	mov	QWORD[8+rdi],rbx
+	mov	QWORD[16+rdi],rcx
+	mov	QWORD[24+rdi],rdx
+	mov	QWORD[32+rdi],r8
+	mov	QWORD[40+rdi],r9
+	mov	QWORD[48+rdi],r10
+	mov	QWORD[56+rdi],r11
+	jb	NEAR $L$loop
+
+	mov	rsi,QWORD[152+rsp]
+
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_sha512_block_data_order_nohw:
+section	.rdata rdata align=8
+ALIGN	64
+
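+; Each round constant below appears twice. The scalar loop above skips the
+; duplicate copies by alternating lea rbp,[8+rbp] with lea rbp,[24+rbp], and
+; the AVX code reads the table with a 32-byte stride; the doubled layout lets
+; wider-vector variants of this code fetch a broadcast constant in one load.
+; The trailing duplicated qwords are the byte-swap mask used with vpshufb.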
+K512:
+	DQ	0x428a2f98d728ae22,0x7137449123ef65cd
+	DQ	0x428a2f98d728ae22,0x7137449123ef65cd
+	DQ	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	DQ	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	DQ	0x3956c25bf348b538,0x59f111f1b605d019
+	DQ	0x3956c25bf348b538,0x59f111f1b605d019
+	DQ	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	DQ	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	DQ	0xd807aa98a3030242,0x12835b0145706fbe
+	DQ	0xd807aa98a3030242,0x12835b0145706fbe
+	DQ	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	DQ	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	DQ	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	DQ	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	DQ	0x9bdc06a725c71235,0xc19bf174cf692694
+	DQ	0x9bdc06a725c71235,0xc19bf174cf692694
+	DQ	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	DQ	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	DQ	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	DQ	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	DQ	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	DQ	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	DQ	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	DQ	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	DQ	0x983e5152ee66dfab,0xa831c66d2db43210
+	DQ	0x983e5152ee66dfab,0xa831c66d2db43210
+	DQ	0xb00327c898fb213f,0xbf597fc7beef0ee4
+	DQ	0xb00327c898fb213f,0xbf597fc7beef0ee4
+	DQ	0xc6e00bf33da88fc2,0xd5a79147930aa725
+	DQ	0xc6e00bf33da88fc2,0xd5a79147930aa725
+	DQ	0x06ca6351e003826f,0x142929670a0e6e70
+	DQ	0x06ca6351e003826f,0x142929670a0e6e70
+	DQ	0x27b70a8546d22ffc,0x2e1b21385c26c926
+	DQ	0x27b70a8546d22ffc,0x2e1b21385c26c926
+	DQ	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	DQ	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	DQ	0x650a73548baf63de,0x766a0abb3c77b2a8
+	DQ	0x650a73548baf63de,0x766a0abb3c77b2a8
+	DQ	0x81c2c92e47edaee6,0x92722c851482353b
+	DQ	0x81c2c92e47edaee6,0x92722c851482353b
+	DQ	0xa2bfe8a14cf10364,0xa81a664bbc423001
+	DQ	0xa2bfe8a14cf10364,0xa81a664bbc423001
+	DQ	0xc24b8b70d0f89791,0xc76c51a30654be30
+	DQ	0xc24b8b70d0f89791,0xc76c51a30654be30
+	DQ	0xd192e819d6ef5218,0xd69906245565a910
+	DQ	0xd192e819d6ef5218,0xd69906245565a910
+	DQ	0xf40e35855771202a,0x106aa07032bbd1b8
+	DQ	0xf40e35855771202a,0x106aa07032bbd1b8
+	DQ	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	DQ	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	DQ	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	DQ	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	DQ	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	DQ	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	DQ	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	DQ	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	DQ	0x748f82ee5defb2fc,0x78a5636f43172f60
+	DQ	0x748f82ee5defb2fc,0x78a5636f43172f60
+	DQ	0x84c87814a1f0ab72,0x8cc702081a6439ec
+	DQ	0x84c87814a1f0ab72,0x8cc702081a6439ec
+	DQ	0x90befffa23631e28,0xa4506cebde82bde9
+	DQ	0x90befffa23631e28,0xa4506cebde82bde9
+	DQ	0xbef9a3f7b2c67915,0xc67178f2e372532b
+	DQ	0xbef9a3f7b2c67915,0xc67178f2e372532b
+	DQ	0xca273eceea26619c,0xd186b8c721c0c207
+	DQ	0xca273eceea26619c,0xd186b8c721c0c207
+	DQ	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	DQ	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	DQ	0x06f067aa72176fba,0x0a637dc5a2c898a6
+	DQ	0x06f067aa72176fba,0x0a637dc5a2c898a6
+	DQ	0x113f9804bef90dae,0x1b710b35131c471b
+	DQ	0x113f9804bef90dae,0x1b710b35131c471b
+	DQ	0x28db77f523047d84,0x32caab7b40c72493
+	DQ	0x28db77f523047d84,0x32caab7b40c72493
+	DQ	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	DQ	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	DQ	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	DQ	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	DQ	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+	DQ	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+	DQ	0x0001020304050607,0x08090a0b0c0d0e0f
+	DQ	0x0001020304050607,0x08090a0b0c0d0e0f
+	DB	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97
+	DB	110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54
+	DB	52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
+	DB	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
+	DB	111,114,103,62,0
+section	.text
+
+global	sha512_block_data_order_avx
+
+ALIGN	64
+sha512_block_data_order_avx:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_sha512_block_data_order_avx:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+_CET_ENDBR
+	mov	rax,rsp
+
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	shl	rdx,4
+	sub	rsp,256
+	lea	rdx,[rdx*8+rsi]
+	and	rsp,-64
+	mov	QWORD[((128+0))+rsp],rdi
+	mov	QWORD[((128+8))+rsp],rsi
+	mov	QWORD[((128+16))+rsp],rdx
+	mov	QWORD[152+rsp],rax
+
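+	; xmm6-xmm15 are non-volatile in the Win64 ABI; stash xmm6-xmm11,
+	; which this function clobbers, in the frame so the epilogue and the
+	; SEH handler can restore them.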
+	movaps	XMMWORD[(128+32)+rsp],xmm6
+	movaps	XMMWORD[(128+48)+rsp],xmm7
+	movaps	XMMWORD[(128+64)+rsp],xmm8
+	movaps	XMMWORD[(128+80)+rsp],xmm9
+	movaps	XMMWORD[(128+96)+rsp],xmm10
+	movaps	XMMWORD[(128+112)+rsp],xmm11
+$L$prologue_avx:
+
+	vzeroupper
+	mov	rax,QWORD[rdi]
+	mov	rbx,QWORD[8+rdi]
+	mov	rcx,QWORD[16+rdi]
+	mov	rdx,QWORD[24+rdi]
+	mov	r8,QWORD[32+rdi]
+	mov	r9,QWORD[40+rdi]
+	mov	r10,QWORD[48+rdi]
+	mov	r11,QWORD[56+rdi]
+	jmp	NEAR $L$loop_avx
+ALIGN	16
+$L$loop_avx:
+	vmovdqa	xmm11,XMMWORD[((K512+1280))]
+	vmovdqu	xmm0,XMMWORD[rsi]
+	lea	rbp,[((K512+128))]
+	vmovdqu	xmm1,XMMWORD[16+rsi]
+	vmovdqu	xmm2,XMMWORD[32+rsi]
+	vpshufb	xmm0,xmm0,xmm11
+	vmovdqu	xmm3,XMMWORD[48+rsi]
+	vpshufb	xmm1,xmm1,xmm11
+	vmovdqu	xmm4,XMMWORD[64+rsi]
+	vpshufb	xmm2,xmm2,xmm11
+	vmovdqu	xmm5,XMMWORD[80+rsi]
+	vpshufb	xmm3,xmm3,xmm11
+	vmovdqu	xmm6,XMMWORD[96+rsi]
+	vpshufb	xmm4,xmm4,xmm11
+	vmovdqu	xmm7,XMMWORD[112+rsi]
+	vpshufb	xmm5,xmm5,xmm11
+	vpaddq	xmm8,xmm0,XMMWORD[((-128))+rbp]
+	vpshufb	xmm6,xmm6,xmm11
+	vpaddq	xmm9,xmm1,XMMWORD[((-96))+rbp]
+	vpshufb	xmm7,xmm7,xmm11
+	vpaddq	xmm10,xmm2,XMMWORD[((-64))+rbp]
+	vpaddq	xmm11,xmm3,XMMWORD[((-32))+rbp]
+	vmovdqa	XMMWORD[rsp],xmm8
+	vpaddq	xmm8,xmm4,XMMWORD[rbp]
+	vmovdqa	XMMWORD[16+rsp],xmm9
+	vpaddq	xmm9,xmm5,XMMWORD[32+rbp]
+	vmovdqa	XMMWORD[32+rsp],xmm10
+	vpaddq	xmm10,xmm6,XMMWORD[64+rbp]
+	vmovdqa	XMMWORD[48+rsp],xmm11
+	vpaddq	xmm11,xmm7,XMMWORD[96+rbp]
+	vmovdqa	XMMWORD[64+rsp],xmm8
+	mov	r14,rax
+	vmovdqa	XMMWORD[80+rsp],xmm9
+	mov	rdi,rbx
+	vmovdqa	XMMWORD[96+rsp],xmm10
+	xor	rdi,rcx
+	vmovdqa	XMMWORD[112+rsp],xmm11
+	mov	r13,r8
+	jmp	NEAR $L$avx_00_47
+
+ALIGN	16
+$L$avx_00_47:
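+	; Each pass interleaves 16 scalar rounds with the vector message
+	; schedule: the vpsrlq/vpsllq/vpxor chains compute sigma0/sigma1 for
+	; the next 16 words while rbp advances to the next 16 round constants.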
+	add	rbp,256
+	vpalignr	xmm8,xmm1,xmm0,8
+	shrd	r13,r13,23
+	mov	rax,r14
+	vpalignr	xmm11,xmm5,xmm4,8
+	mov	r12,r9
+	shrd	r14,r14,5
+	vpsrlq	xmm10,xmm8,1
+	xor	r13,r8
+	xor	r12,r10
+	vpaddq	xmm0,xmm0,xmm11
+	shrd	r13,r13,4
+	xor	r14,rax
+	vpsrlq	xmm11,xmm8,7
+	and	r12,r8
+	xor	r13,r8
+	vpsllq	xmm9,xmm8,56
+	add	r11,QWORD[rsp]
+	mov	r15,rax
+	vpxor	xmm8,xmm11,xmm10
+	xor	r12,r10
+	shrd	r14,r14,6
+	vpsrlq	xmm10,xmm10,7
+	xor	r15,rbx
+	add	r11,r12
+	vpxor	xmm8,xmm8,xmm9
+	shrd	r13,r13,14
+	and	rdi,r15
+	vpsllq	xmm9,xmm9,7
+	xor	r14,rax
+	add	r11,r13
+	vpxor	xmm8,xmm8,xmm10
+	xor	rdi,rbx
+	shrd	r14,r14,28
+	vpsrlq	xmm11,xmm7,6
+	add	rdx,r11
+	add	r11,rdi
+	vpxor	xmm8,xmm8,xmm9
+	mov	r13,rdx
+	add	r14,r11
+	vpsllq	xmm10,xmm7,3
+	shrd	r13,r13,23
+	mov	r11,r14
+	vpaddq	xmm0,xmm0,xmm8
+	mov	r12,r8
+	shrd	r14,r14,5
+	vpsrlq	xmm9,xmm7,19
+	xor	r13,rdx
+	xor	r12,r9
+	vpxor	xmm11,xmm11,xmm10
+	shrd	r13,r13,4
+	xor	r14,r11
+	vpsllq	xmm10,xmm10,42
+	and	r12,rdx
+	xor	r13,rdx
+	vpxor	xmm11,xmm11,xmm9
+	add	r10,QWORD[8+rsp]
+	mov	rdi,r11
+	vpsrlq	xmm9,xmm9,42
+	xor	r12,r9
+	shrd	r14,r14,6
+	vpxor	xmm11,xmm11,xmm10
+	xor	rdi,rax
+	add	r10,r12
+	vpxor	xmm11,xmm11,xmm9
+	shrd	r13,r13,14
+	and	r15,rdi
+	vpaddq	xmm0,xmm0,xmm11
+	xor	r14,r11
+	add	r10,r13
+	vpaddq	xmm10,xmm0,XMMWORD[((-128))+rbp]
+	xor	r15,rax
+	shrd	r14,r14,28
+	add	rcx,r10
+	add	r10,r15
+	mov	r13,rcx
+	add	r14,r10
+	vmovdqa	XMMWORD[rsp],xmm10
+	vpalignr	xmm8,xmm2,xmm1,8
+	shrd	r13,r13,23
+	mov	r10,r14
+	vpalignr	xmm11,xmm6,xmm5,8
+	mov	r12,rdx
+	shrd	r14,r14,5
+	vpsrlq	xmm10,xmm8,1
+	xor	r13,rcx
+	xor	r12,r8
+	vpaddq	xmm1,xmm1,xmm11
+	shrd	r13,r13,4
+	xor	r14,r10
+	vpsrlq	xmm11,xmm8,7
+	and	r12,rcx
+	xor	r13,rcx
+	vpsllq	xmm9,xmm8,56
+	add	r9,QWORD[16+rsp]
+	mov	r15,r10
+	vpxor	xmm8,xmm11,xmm10
+	xor	r12,r8
+	shrd	r14,r14,6
+	vpsrlq	xmm10,xmm10,7
+	xor	r15,r11
+	add	r9,r12
+	vpxor	xmm8,xmm8,xmm9
+	shrd	r13,r13,14
+	and	rdi,r15
+	vpsllq	xmm9,xmm9,7
+	xor	r14,r10
+	add	r9,r13
+	vpxor	xmm8,xmm8,xmm10
+	xor	rdi,r11
+	shrd	r14,r14,28
+	vpsrlq	xmm11,xmm0,6
+	add	rbx,r9
+	add	r9,rdi
+	vpxor	xmm8,xmm8,xmm9
+	mov	r13,rbx
+	add	r14,r9
+	vpsllq	xmm10,xmm0,3
+	shrd	r13,r13,23
+	mov	r9,r14
+	vpaddq	xmm1,xmm1,xmm8
+	mov	r12,rcx
+	shrd	r14,r14,5
+	vpsrlq	xmm9,xmm0,19
+	xor	r13,rbx
+	xor	r12,rdx
+	vpxor	xmm11,xmm11,xmm10
+	shrd	r13,r13,4
+	xor	r14,r9
+	vpsllq	xmm10,xmm10,42
+	and	r12,rbx
+	xor	r13,rbx
+	vpxor	xmm11,xmm11,xmm9
+	add	r8,QWORD[24+rsp]
+	mov	rdi,r9
+	vpsrlq	xmm9,xmm9,42
+	xor	r12,rdx
+	shrd	r14,r14,6
+	vpxor	xmm11,xmm11,xmm10
+	xor	rdi,r10
+	add	r8,r12
+	vpxor	xmm11,xmm11,xmm9
+	shrd	r13,r13,14
+	and	r15,rdi
+	vpaddq	xmm1,xmm1,xmm11
+	xor	r14,r9
+	add	r8,r13
+	vpaddq	xmm10,xmm1,XMMWORD[((-96))+rbp]
+	xor	r15,r10
+	shrd	r14,r14,28
+	add	rax,r8
+	add	r8,r15
+	mov	r13,rax
+	add	r14,r8
+	vmovdqa	XMMWORD[16+rsp],xmm10
+	vpalignr	xmm8,xmm3,xmm2,8
+	shrd	r13,r13,23
+	mov	r8,r14
+	vpalignr	xmm11,xmm7,xmm6,8
+	mov	r12,rbx
+	shrd	r14,r14,5
+	vpsrlq	xmm10,xmm8,1
+	xor	r13,rax
+	xor	r12,rcx
+	vpaddq	xmm2,xmm2,xmm11
+	shrd	r13,r13,4
+	xor	r14,r8
+	vpsrlq	xmm11,xmm8,7
+	and	r12,rax
+	xor	r13,rax
+	vpsllq	xmm9,xmm8,56
+	add	rdx,QWORD[32+rsp]
+	mov	r15,r8
+	vpxor	xmm8,xmm11,xmm10
+	xor	r12,rcx
+	shrd	r14,r14,6
+	vpsrlq	xmm10,xmm10,7
+	xor	r15,r9
+	add	rdx,r12
+	vpxor	xmm8,xmm8,xmm9
+	shrd	r13,r13,14
+	and	rdi,r15
+	vpsllq	xmm9,xmm9,7
+	xor	r14,r8
+	add	rdx,r13
+	vpxor	xmm8,xmm8,xmm10
+	xor	rdi,r9
+	shrd	r14,r14,28
+	vpsrlq	xmm11,xmm1,6
+	add	r11,rdx
+	add	rdx,rdi
+	vpxor	xmm8,xmm8,xmm9
+	mov	r13,r11
+	add	r14,rdx
+	vpsllq	xmm10,xmm1,3
+	shrd	r13,r13,23
+	mov	rdx,r14
+	vpaddq	xmm2,xmm2,xmm8
+	mov	r12,rax
+	shrd	r14,r14,5
+	vpsrlq	xmm9,xmm1,19
+	xor	r13,r11
+	xor	r12,rbx
+	vpxor	xmm11,xmm11,xmm10
+	shrd	r13,r13,4
+	xor	r14,rdx
+	vpsllq	xmm10,xmm10,42
+	and	r12,r11
+	xor	r13,r11
+	vpxor	xmm11,xmm11,xmm9
+	add	rcx,QWORD[40+rsp]
+	mov	rdi,rdx
+	vpsrlq	xmm9,xmm9,42
+	xor	r12,rbx
+	shrd	r14,r14,6
+	vpxor	xmm11,xmm11,xmm10
+	xor	rdi,r8
+	add	rcx,r12
+	vpxor	xmm11,xmm11,xmm9
+	shrd	r13,r13,14
+	and	r15,rdi
+	vpaddq	xmm2,xmm2,xmm11
+	xor	r14,rdx
+	add	rcx,r13
+	vpaddq	xmm10,xmm2,XMMWORD[((-64))+rbp]
+	xor	r15,r8
+	shrd	r14,r14,28
+	add	r10,rcx
+	add	rcx,r15
+	mov	r13,r10
+	add	r14,rcx
+	vmovdqa	XMMWORD[32+rsp],xmm10
+	vpalignr	xmm8,xmm4,xmm3,8
+	shrd	r13,r13,23
+	mov	rcx,r14
+	vpalignr	xmm11,xmm0,xmm7,8
+	mov	r12,r11
+	shrd	r14,r14,5
+	vpsrlq	xmm10,xmm8,1
+	xor	r13,r10
+	xor	r12,rax
+	vpaddq	xmm3,xmm3,xmm11
+	shrd	r13,r13,4
+	xor	r14,rcx
+	vpsrlq	xmm11,xmm8,7
+	and	r12,r10
+	xor	r13,r10
+	vpsllq	xmm9,xmm8,56
+	add	rbx,QWORD[48+rsp]
+	mov	r15,rcx
+	vpxor	xmm8,xmm11,xmm10
+	xor	r12,rax
+	shrd	r14,r14,6
+	vpsrlq	xmm10,xmm10,7
+	xor	r15,rdx
+	add	rbx,r12
+	vpxor	xmm8,xmm8,xmm9
+	shrd	r13,r13,14
+	and	rdi,r15
+	vpsllq	xmm9,xmm9,7
+	xor	r14,rcx
+	add	rbx,r13
+	vpxor	xmm8,xmm8,xmm10
+	xor	rdi,rdx
+	shrd	r14,r14,28
+	vpsrlq	xmm11,xmm2,6
+	add	r9,rbx
+	add	rbx,rdi
+	vpxor	xmm8,xmm8,xmm9
+	mov	r13,r9
+	add	r14,rbx
+	vpsllq	xmm10,xmm2,3
+	shrd	r13,r13,23
+	mov	rbx,r14
+	vpaddq	xmm3,xmm3,xmm8
+	mov	r12,r10
+	shrd	r14,r14,5
+	vpsrlq	xmm9,xmm2,19
+	xor	r13,r9
+	xor	r12,r11
+	vpxor	xmm11,xmm11,xmm10
+	shrd	r13,r13,4
+	xor	r14,rbx
+	vpsllq	xmm10,xmm10,42
+	and	r12,r9
+	xor	r13,r9
+	vpxor	xmm11,xmm11,xmm9
+	add	rax,QWORD[56+rsp]
+	mov	rdi,rbx
+	vpsrlq	xmm9,xmm9,42
+	xor	r12,r11
+	shrd	r14,r14,6
+	vpxor	xmm11,xmm11,xmm10
+	xor	rdi,rcx
+	add	rax,r12
+	vpxor	xmm11,xmm11,xmm9
+	shrd	r13,r13,14
+	and	r15,rdi
+	vpaddq	xmm3,xmm3,xmm11
+	xor	r14,rbx
+	add	rax,r13
+	vpaddq	xmm10,xmm3,XMMWORD[((-32))+rbp]
+	xor	r15,rcx
+	shrd	r14,r14,28
+	add	r8,rax
+	add	rax,r15
+	mov	r13,r8
+	add	r14,rax
+	vmovdqa	XMMWORD[48+rsp],xmm10
+	vpalignr	xmm8,xmm5,xmm4,8
+	shrd	r13,r13,23
+	mov	rax,r14
+	vpalignr	xmm11,xmm1,xmm0,8
+	mov	r12,r9
+	shrd	r14,r14,5
+	vpsrlq	xmm10,xmm8,1
+	xor	r13,r8
+	xor	r12,r10
+	vpaddq	xmm4,xmm4,xmm11
+	shrd	r13,r13,4
+	xor	r14,rax
+	vpsrlq	xmm11,xmm8,7
+	and	r12,r8
+	xor	r13,r8
+	vpsllq	xmm9,xmm8,56
+	add	r11,QWORD[64+rsp]
+	mov	r15,rax
+	vpxor	xmm8,xmm11,xmm10
+	xor	r12,r10
+	shrd	r14,r14,6
+	vpsrlq	xmm10,xmm10,7
+	xor	r15,rbx
+	add	r11,r12
+	vpxor	xmm8,xmm8,xmm9
+	shrd	r13,r13,14
+	and	rdi,r15
+	vpsllq	xmm9,xmm9,7
+	xor	r14,rax
+	add	r11,r13
+	vpxor	xmm8,xmm8,xmm10
+	xor	rdi,rbx
+	shrd	r14,r14,28
+	vpsrlq	xmm11,xmm3,6
+	add	rdx,r11
+	add	r11,rdi
+	vpxor	xmm8,xmm8,xmm9
+	mov	r13,rdx
+	add	r14,r11
+	vpsllq	xmm10,xmm3,3
+	shrd	r13,r13,23
+	mov	r11,r14
+	vpaddq	xmm4,xmm4,xmm8
+	mov	r12,r8
+	shrd	r14,r14,5
+	vpsrlq	xmm9,xmm3,19
+	xor	r13,rdx
+	xor	r12,r9
+	vpxor	xmm11,xmm11,xmm10
+	shrd	r13,r13,4
+	xor	r14,r11
+	vpsllq	xmm10,xmm10,42
+	and	r12,rdx
+	xor	r13,rdx
+	vpxor	xmm11,xmm11,xmm9
+	add	r10,QWORD[72+rsp]
+	mov	rdi,r11
+	vpsrlq	xmm9,xmm9,42
+	xor	r12,r9
+	shrd	r14,r14,6
+	vpxor	xmm11,xmm11,xmm10
+	xor	rdi,rax
+	add	r10,r12
+	vpxor	xmm11,xmm11,xmm9
+	shrd	r13,r13,14
+	and	r15,rdi
+	vpaddq	xmm4,xmm4,xmm11
+	xor	r14,r11
+	add	r10,r13
+	vpaddq	xmm10,xmm4,XMMWORD[rbp]
+	xor	r15,rax
+	shrd	r14,r14,28
+	add	rcx,r10
+	add	r10,r15
+	mov	r13,rcx
+	add	r14,r10
+	vmovdqa	XMMWORD[64+rsp],xmm10
+	vpalignr	xmm8,xmm6,xmm5,8
+	shrd	r13,r13,23
+	mov	r10,r14
+	vpalignr	xmm11,xmm2,xmm1,8
+	mov	r12,rdx
+	shrd	r14,r14,5
+	vpsrlq	xmm10,xmm8,1
+	xor	r13,rcx
+	xor	r12,r8
+	vpaddq	xmm5,xmm5,xmm11
+	shrd	r13,r13,4
+	xor	r14,r10
+	vpsrlq	xmm11,xmm8,7
+	and	r12,rcx
+	xor	r13,rcx
+	vpsllq	xmm9,xmm8,56
+	add	r9,QWORD[80+rsp]
+	mov	r15,r10
+	vpxor	xmm8,xmm11,xmm10
+	xor	r12,r8
+	shrd	r14,r14,6
+	vpsrlq	xmm10,xmm10,7
+	xor	r15,r11
+	add	r9,r12
+	vpxor	xmm8,xmm8,xmm9
+	shrd	r13,r13,14
+	and	rdi,r15
+	vpsllq	xmm9,xmm9,7
+	xor	r14,r10
+	add	r9,r13
+	vpxor	xmm8,xmm8,xmm10
+	xor	rdi,r11
+	shrd	r14,r14,28
+	vpsrlq	xmm11,xmm4,6
+	add	rbx,r9
+	add	r9,rdi
+	vpxor	xmm8,xmm8,xmm9
+	mov	r13,rbx
+	add	r14,r9
+	vpsllq	xmm10,xmm4,3
+	shrd	r13,r13,23
+	mov	r9,r14
+	vpaddq	xmm5,xmm5,xmm8
+	mov	r12,rcx
+	shrd	r14,r14,5
+	vpsrlq	xmm9,xmm4,19
+	xor	r13,rbx
+	xor	r12,rdx
+	vpxor	xmm11,xmm11,xmm10
+	shrd	r13,r13,4
+	xor	r14,r9
+	vpsllq	xmm10,xmm10,42
+	and	r12,rbx
+	xor	r13,rbx
+	vpxor	xmm11,xmm11,xmm9
+	add	r8,QWORD[88+rsp]
+	mov	rdi,r9
+	vpsrlq	xmm9,xmm9,42
+	xor	r12,rdx
+	shrd	r14,r14,6
+	vpxor	xmm11,xmm11,xmm10
+	xor	rdi,r10
+	add	r8,r12
+	vpxor	xmm11,xmm11,xmm9
+	shrd	r13,r13,14
+	and	r15,rdi
+	vpaddq	xmm5,xmm5,xmm11
+	xor	r14,r9
+	add	r8,r13
+	vpaddq	xmm10,xmm5,XMMWORD[32+rbp]
+	xor	r15,r10
+	shrd	r14,r14,28
+	add	rax,r8
+	add	r8,r15
+	mov	r13,rax
+	add	r14,r8
+	vmovdqa	XMMWORD[80+rsp],xmm10
+	vpalignr	xmm8,xmm7,xmm6,8
+	shrd	r13,r13,23
+	mov	r8,r14
+	vpalignr	xmm11,xmm3,xmm2,8
+	mov	r12,rbx
+	shrd	r14,r14,5
+	vpsrlq	xmm10,xmm8,1
+	xor	r13,rax
+	xor	r12,rcx
+	vpaddq	xmm6,xmm6,xmm11
+	shrd	r13,r13,4
+	xor	r14,r8
+	vpsrlq	xmm11,xmm8,7
+	and	r12,rax
+	xor	r13,rax
+	vpsllq	xmm9,xmm8,56
+	add	rdx,QWORD[96+rsp]
+	mov	r15,r8
+	vpxor	xmm8,xmm11,xmm10
+	xor	r12,rcx
+	shrd	r14,r14,6
+	vpsrlq	xmm10,xmm10,7
+	xor	r15,r9
+	add	rdx,r12
+	vpxor	xmm8,xmm8,xmm9
+	shrd	r13,r13,14
+	and	rdi,r15
+	vpsllq	xmm9,xmm9,7
+	xor	r14,r8
+	add	rdx,r13
+	vpxor	xmm8,xmm8,xmm10
+	xor	rdi,r9
+	shrd	r14,r14,28
+	vpsrlq	xmm11,xmm5,6
+	add	r11,rdx
+	add	rdx,rdi
+	vpxor	xmm8,xmm8,xmm9
+	mov	r13,r11
+	add	r14,rdx
+	vpsllq	xmm10,xmm5,3
+	shrd	r13,r13,23
+	mov	rdx,r14
+	vpaddq	xmm6,xmm6,xmm8
+	mov	r12,rax
+	shrd	r14,r14,5
+	vpsrlq	xmm9,xmm5,19
+	xor	r13,r11
+	xor	r12,rbx
+	vpxor	xmm11,xmm11,xmm10
+	shrd	r13,r13,4
+	xor	r14,rdx
+	vpsllq	xmm10,xmm10,42
+	and	r12,r11
+	xor	r13,r11
+	vpxor	xmm11,xmm11,xmm9
+	add	rcx,QWORD[104+rsp]
+	mov	rdi,rdx
+	vpsrlq	xmm9,xmm9,42
+	xor	r12,rbx
+	shrd	r14,r14,6
+	vpxor	xmm11,xmm11,xmm10
+	xor	rdi,r8
+	add	rcx,r12
+	vpxor	xmm11,xmm11,xmm9
+	shrd	r13,r13,14
+	and	r15,rdi
+	vpaddq	xmm6,xmm6,xmm11
+	xor	r14,rdx
+	add	rcx,r13
+	vpaddq	xmm10,xmm6,XMMWORD[64+rbp]
+	xor	r15,r8
+	shrd	r14,r14,28
+	add	r10,rcx
+	add	rcx,r15
+	mov	r13,r10
+	add	r14,rcx
+	vmovdqa	XMMWORD[96+rsp],xmm10
+	vpalignr	xmm8,xmm0,xmm7,8
+	shrd	r13,r13,23
+	mov	rcx,r14
+	vpalignr	xmm11,xmm4,xmm3,8
+	mov	r12,r11
+	shrd	r14,r14,5
+	vpsrlq	xmm10,xmm8,1
+	xor	r13,r10
+	xor	r12,rax
+	vpaddq	xmm7,xmm7,xmm11
+	shrd	r13,r13,4
+	xor	r14,rcx
+	vpsrlq	xmm11,xmm8,7
+	and	r12,r10
+	xor	r13,r10
+	vpsllq	xmm9,xmm8,56
+	add	rbx,QWORD[112+rsp]
+	mov	r15,rcx
+	vpxor	xmm8,xmm11,xmm10
+	xor	r12,rax
+	shrd	r14,r14,6
+	vpsrlq	xmm10,xmm10,7
+	xor	r15,rdx
+	add	rbx,r12
+	vpxor	xmm8,xmm8,xmm9
+	shrd	r13,r13,14
+	and	rdi,r15
+	vpsllq	xmm9,xmm9,7
+	xor	r14,rcx
+	add	rbx,r13
+	vpxor	xmm8,xmm8,xmm10
+	xor	rdi,rdx
+	shrd	r14,r14,28
+	vpsrlq	xmm11,xmm6,6
+	add	r9,rbx
+	add	rbx,rdi
+	vpxor	xmm8,xmm8,xmm9
+	mov	r13,r9
+	add	r14,rbx
+	vpsllq	xmm10,xmm6,3
+	shrd	r13,r13,23
+	mov	rbx,r14
+	vpaddq	xmm7,xmm7,xmm8
+	mov	r12,r10
+	shrd	r14,r14,5
+	vpsrlq	xmm9,xmm6,19
+	xor	r13,r9
+	xor	r12,r11
+	vpxor	xmm11,xmm11,xmm10
+	shrd	r13,r13,4
+	xor	r14,rbx
+	vpsllq	xmm10,xmm10,42
+	and	r12,r9
+	xor	r13,r9
+	vpxor	xmm11,xmm11,xmm9
+	add	rax,QWORD[120+rsp]
+	mov	rdi,rbx
+	vpsrlq	xmm9,xmm9,42
+	xor	r12,r11
+	shrd	r14,r14,6
+	vpxor	xmm11,xmm11,xmm10
+	xor	rdi,rcx
+	add	rax,r12
+	vpxor	xmm11,xmm11,xmm9
+	shrd	r13,r13,14
+	and	r15,rdi
+	vpaddq	xmm7,xmm7,xmm11
+	xor	r14,rbx
+	add	rax,r13
+	vpaddq	xmm10,xmm7,XMMWORD[96+rbp]
+	xor	r15,rcx
+	shrd	r14,r14,28
+	add	r8,rax
+	add	rax,r15
+	mov	r13,r8
+	add	r14,rax
+	vmovdqa	XMMWORD[112+rsp],xmm10
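+	; As in the scalar loop, the zero top byte of the byte-swap mask that
+	; follows K512 signals that all 80 round constants have been consumed.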
+	cmp	BYTE[135+rbp],0
+	jne	NEAR $L$avx_00_47
+	shrd	r13,r13,23
+	mov	rax,r14
+	mov	r12,r9
+	shrd	r14,r14,5
+	xor	r13,r8
+	xor	r12,r10
+	shrd	r13,r13,4
+	xor	r14,rax
+	and	r12,r8
+	xor	r13,r8
+	add	r11,QWORD[rsp]
+	mov	r15,rax
+	xor	r12,r10
+	shrd	r14,r14,6
+	xor	r15,rbx
+	add	r11,r12
+	shrd	r13,r13,14
+	and	rdi,r15
+	xor	r14,rax
+	add	r11,r13
+	xor	rdi,rbx
+	shrd	r14,r14,28
+	add	rdx,r11
+	add	r11,rdi
+	mov	r13,rdx
+	add	r14,r11
+	shrd	r13,r13,23
+	mov	r11,r14
+	mov	r12,r8
+	shrd	r14,r14,5
+	xor	r13,rdx
+	xor	r12,r9
+	shrd	r13,r13,4
+	xor	r14,r11
+	and	r12,rdx
+	xor	r13,rdx
+	add	r10,QWORD[8+rsp]
+	mov	rdi,r11
+	xor	r12,r9
+	shrd	r14,r14,6
+	xor	rdi,rax
+	add	r10,r12
+	shrd	r13,r13,14
+	and	r15,rdi
+	xor	r14,r11
+	add	r10,r13
+	xor	r15,rax
+	shrd	r14,r14,28
+	add	rcx,r10
+	add	r10,r15
+	mov	r13,rcx
+	add	r14,r10
+	shrd	r13,r13,23
+	mov	r10,r14
+	mov	r12,rdx
+	shrd	r14,r14,5
+	xor	r13,rcx
+	xor	r12,r8
+	shrd	r13,r13,4
+	xor	r14,r10
+	and	r12,rcx
+	xor	r13,rcx
+	add	r9,QWORD[16+rsp]
+	mov	r15,r10
+	xor	r12,r8
+	shrd	r14,r14,6
+	xor	r15,r11
+	add	r9,r12
+	shrd	r13,r13,14
+	and	rdi,r15
+	xor	r14,r10
+	add	r9,r13
+	xor	rdi,r11
+	shrd	r14,r14,28
+	add	rbx,r9
+	add	r9,rdi
+	mov	r13,rbx
+	add	r14,r9
+	shrd	r13,r13,23
+	mov	r9,r14
+	mov	r12,rcx
+	shrd	r14,r14,5
+	xor	r13,rbx
+	xor	r12,rdx
+	shrd	r13,r13,4
+	xor	r14,r9
+	and	r12,rbx
+	xor	r13,rbx
+	add	r8,QWORD[24+rsp]
+	mov	rdi,r9
+	xor	r12,rdx
+	shrd	r14,r14,6
+	xor	rdi,r10
+	add	r8,r12
+	shrd	r13,r13,14
+	and	r15,rdi
+	xor	r14,r9
+	add	r8,r13
+	xor	r15,r10
+	shrd	r14,r14,28
+	add	rax,r8
+	add	r8,r15
+	mov	r13,rax
+	add	r14,r8
+	shrd	r13,r13,23
+	mov	r8,r14
+	mov	r12,rbx
+	shrd	r14,r14,5
+	xor	r13,rax
+	xor	r12,rcx
+	shrd	r13,r13,4
+	xor	r14,r8
+	and	r12,rax
+	xor	r13,rax
+	add	rdx,QWORD[32+rsp]
+	mov	r15,r8
+	xor	r12,rcx
+	shrd	r14,r14,6
+	xor	r15,r9
+	add	rdx,r12
+	shrd	r13,r13,14
+	and	rdi,r15
+	xor	r14,r8
+	add	rdx,r13
+	xor	rdi,r9
+	shrd	r14,r14,28
+	add	r11,rdx
+	add	rdx,rdi
+	mov	r13,r11
+	add	r14,rdx
+	shrd	r13,r13,23
+	mov	rdx,r14
+	mov	r12,rax
+	shrd	r14,r14,5
+	xor	r13,r11
+	xor	r12,rbx
+	shrd	r13,r13,4
+	xor	r14,rdx
+	and	r12,r11
+	xor	r13,r11
+	add	rcx,QWORD[40+rsp]
+	mov	rdi,rdx
+	xor	r12,rbx
+	shrd	r14,r14,6
+	xor	rdi,r8
+	add	rcx,r12
+	shrd	r13,r13,14
+	and	r15,rdi
+	xor	r14,rdx
+	add	rcx,r13
+	xor	r15,r8
+	shrd	r14,r14,28
+	add	r10,rcx
+	add	rcx,r15
+	mov	r13,r10
+	add	r14,rcx
+	shrd	r13,r13,23
+	mov	rcx,r14
+	mov	r12,r11
+	shrd	r14,r14,5
+	xor	r13,r10
+	xor	r12,rax
+	shrd	r13,r13,4
+	xor	r14,rcx
+	and	r12,r10
+	xor	r13,r10
+	add	rbx,QWORD[48+rsp]
+	mov	r15,rcx
+	xor	r12,rax
+	shrd	r14,r14,6
+	xor	r15,rdx
+	add	rbx,r12
+	shrd	r13,r13,14
+	and	rdi,r15
+	xor	r14,rcx
+	add	rbx,r13
+	xor	rdi,rdx
+	shrd	r14,r14,28
+	add	r9,rbx
+	add	rbx,rdi
+	mov	r13,r9
+	add	r14,rbx
+	shrd	r13,r13,23
+	mov	rbx,r14
+	mov	r12,r10
+	shrd	r14,r14,5
+	xor	r13,r9
+	xor	r12,r11
+	shrd	r13,r13,4
+	xor	r14,rbx
+	and	r12,r9
+	xor	r13,r9
+	add	rax,QWORD[56+rsp]
+	mov	rdi,rbx
+	xor	r12,r11
+	shrd	r14,r14,6
+	xor	rdi,rcx
+	add	rax,r12
+	shrd	r13,r13,14
+	and	r15,rdi
+	xor	r14,rbx
+	add	rax,r13
+	xor	r15,rcx
+	shrd	r14,r14,28
+	add	r8,rax
+	add	rax,r15
+	mov	r13,r8
+	add	r14,rax
+	shrd	r13,r13,23
+	mov	rax,r14
+	mov	r12,r9
+	shrd	r14,r14,5
+	xor	r13,r8
+	xor	r12,r10
+	shrd	r13,r13,4
+	xor	r14,rax
+	and	r12,r8
+	xor	r13,r8
+	add	r11,QWORD[64+rsp]
+	mov	r15,rax
+	xor	r12,r10
+	shrd	r14,r14,6
+	xor	r15,rbx
+	add	r11,r12
+	shrd	r13,r13,14
+	and	rdi,r15
+	xor	r14,rax
+	add	r11,r13
+	xor	rdi,rbx
+	shrd	r14,r14,28
+	add	rdx,r11
+	add	r11,rdi
+	mov	r13,rdx
+	add	r14,r11
+	shrd	r13,r13,23
+	mov	r11,r14
+	mov	r12,r8
+	shrd	r14,r14,5
+	xor	r13,rdx
+	xor	r12,r9
+	shrd	r13,r13,4
+	xor	r14,r11
+	and	r12,rdx
+	xor	r13,rdx
+	add	r10,QWORD[72+rsp]
+	mov	rdi,r11
+	xor	r12,r9
+	shrd	r14,r14,6
+	xor	rdi,rax
+	add	r10,r12
+	shrd	r13,r13,14
+	and	r15,rdi
+	xor	r14,r11
+	add	r10,r13
+	xor	r15,rax
+	shrd	r14,r14,28
+	add	rcx,r10
+	add	r10,r15
+	mov	r13,rcx
+	add	r14,r10
+	shrd	r13,r13,23
+	mov	r10,r14
+	mov	r12,rdx
+	shrd	r14,r14,5
+	xor	r13,rcx
+	xor	r12,r8
+	shrd	r13,r13,4
+	xor	r14,r10
+	and	r12,rcx
+	xor	r13,rcx
+	add	r9,QWORD[80+rsp]
+	mov	r15,r10
+	xor	r12,r8
+	shrd	r14,r14,6
+	xor	r15,r11
+	add	r9,r12
+	shrd	r13,r13,14
+	and	rdi,r15
+	xor	r14,r10
+	add	r9,r13
+	xor	rdi,r11
+	shrd	r14,r14,28
+	add	rbx,r9
+	add	r9,rdi
+	mov	r13,rbx
+	add	r14,r9
+	shrd	r13,r13,23
+	mov	r9,r14
+	mov	r12,rcx
+	shrd	r14,r14,5
+	xor	r13,rbx
+	xor	r12,rdx
+	shrd	r13,r13,4
+	xor	r14,r9
+	and	r12,rbx
+	xor	r13,rbx
+	add	r8,QWORD[88+rsp]
+	mov	rdi,r9
+	xor	r12,rdx
+	shrd	r14,r14,6
+	xor	rdi,r10
+	add	r8,r12
+	shrd	r13,r13,14
+	and	r15,rdi
+	xor	r14,r9
+	add	r8,r13
+	xor	r15,r10
+	shrd	r14,r14,28
+	add	rax,r8
+	add	r8,r15
+	mov	r13,rax
+	add	r14,r8
+	shrd	r13,r13,23
+	mov	r8,r14
+	mov	r12,rbx
+	shrd	r14,r14,5
+	xor	r13,rax
+	xor	r12,rcx
+	shrd	r13,r13,4
+	xor	r14,r8
+	and	r12,rax
+	xor	r13,rax
+	add	rdx,QWORD[96+rsp]
+	mov	r15,r8
+	xor	r12,rcx
+	shrd	r14,r14,6
+	xor	r15,r9
+	add	rdx,r12
+	shrd	r13,r13,14
+	and	rdi,r15
+	xor	r14,r8
+	add	rdx,r13
+	xor	rdi,r9
+	shrd	r14,r14,28
+	add	r11,rdx
+	add	rdx,rdi
+	mov	r13,r11
+	add	r14,rdx
+	shrd	r13,r13,23
+	mov	rdx,r14
+	mov	r12,rax
+	shrd	r14,r14,5
+	xor	r13,r11
+	xor	r12,rbx
+	shrd	r13,r13,4
+	xor	r14,rdx
+	and	r12,r11
+	xor	r13,r11
+	add	rcx,QWORD[104+rsp]
+	mov	rdi,rdx
+	xor	r12,rbx
+	shrd	r14,r14,6
+	xor	rdi,r8
+	add	rcx,r12
+	shrd	r13,r13,14
+	and	r15,rdi
+	xor	r14,rdx
+	add	rcx,r13
+	xor	r15,r8
+	shrd	r14,r14,28
+	add	r10,rcx
+	add	rcx,r15
+	mov	r13,r10
+	add	r14,rcx
+	shrd	r13,r13,23
+	mov	rcx,r14
+	mov	r12,r11
+	shrd	r14,r14,5
+	xor	r13,r10
+	xor	r12,rax
+	shrd	r13,r13,4
+	xor	r14,rcx
+	and	r12,r10
+	xor	r13,r10
+	add	rbx,QWORD[112+rsp]
+	mov	r15,rcx
+	xor	r12,rax
+	shrd	r14,r14,6
+	xor	r15,rdx
+	add	rbx,r12
+	shrd	r13,r13,14
+	and	rdi,r15
+	xor	r14,rcx
+	add	rbx,r13
+	xor	rdi,rdx
+	shrd	r14,r14,28
+	add	r9,rbx
+	add	rbx,rdi
+	mov	r13,r9
+	add	r14,rbx
+	shrd	r13,r13,23
+	mov	rbx,r14
+	mov	r12,r10
+	shrd	r14,r14,5
+	xor	r13,r9
+	xor	r12,r11
+	shrd	r13,r13,4
+	xor	r14,rbx
+	and	r12,r9
+	xor	r13,r9
+	add	rax,QWORD[120+rsp]
+	mov	rdi,rbx
+	xor	r12,r11
+	shrd	r14,r14,6
+	xor	rdi,rcx
+	add	rax,r12
+	shrd	r13,r13,14
+	and	r15,rdi
+	xor	r14,rbx
+	add	rax,r13
+	xor	r15,rcx
+	shrd	r14,r14,28
+	add	r8,rax
+	add	rax,r15
+	mov	r13,r8
+	add	r14,rax
+	mov	rdi,QWORD[((128+0))+rsp]
+	mov	rax,r14
+
+	add	rax,QWORD[rdi]
+	lea	rsi,[128+rsi]
+	add	rbx,QWORD[8+rdi]
+	add	rcx,QWORD[16+rdi]
+	add	rdx,QWORD[24+rdi]
+	add	r8,QWORD[32+rdi]
+	add	r9,QWORD[40+rdi]
+	add	r10,QWORD[48+rdi]
+	add	r11,QWORD[56+rdi]
+
+	cmp	rsi,QWORD[((128+16))+rsp]
+
+	mov	QWORD[rdi],rax
+	mov	QWORD[8+rdi],rbx
+	mov	QWORD[16+rdi],rcx
+	mov	QWORD[24+rdi],rdx
+	mov	QWORD[32+rdi],r8
+	mov	QWORD[40+rdi],r9
+	mov	QWORD[48+rdi],r10
+	mov	QWORD[56+rdi],r11
+	jb	NEAR $L$loop_avx
+
+	mov	rsi,QWORD[152+rsp]
+
+	vzeroupper
+	movaps	xmm6,XMMWORD[((128+32))+rsp]
+	movaps	xmm7,XMMWORD[((128+48))+rsp]
+	movaps	xmm8,XMMWORD[((128+64))+rsp]
+	movaps	xmm9,XMMWORD[((128+80))+rsp]
+	movaps	xmm10,XMMWORD[((128+96))+rsp]
+	movaps	xmm11,XMMWORD[((128+112))+rsp]
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$epilogue_avx:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_sha512_block_data_order_avx:
+EXTERN	__imp_RtlVirtualUnwind
+
+ALIGN	16
+se_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$in_prologue
+
+	mov	rax,QWORD[152+r8]
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$in_prologue
+	mov	rsi,rax
+	mov	rax,QWORD[((128+24))+rax]
+
+	mov	rbx,QWORD[((-8))+rax]
+	mov	rbp,QWORD[((-16))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r14,QWORD[((-40))+rax]
+	mov	r15,QWORD[((-48))+rax]
+	mov	QWORD[144+r8],rbx
+	mov	QWORD[160+r8],rbp
+	mov	QWORD[216+r8],r12
+	mov	QWORD[224+r8],r13
+	mov	QWORD[232+r8],r14
+	mov	QWORD[240+r8],r15
+
+	lea	r10,[$L$epilogue]
+	cmp	rbx,r10
+	jb	NEAR $L$in_prologue
+
+	lea	rsi,[((128+32))+rsi]
+	lea	rdi,[512+r8]
+	mov	ecx,12
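+	; DD 0xa548f3fc encodes "cld; rep movsq": copy the six saved XMM
+	; registers (12 qwords) into the CONTEXT record.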
+	DD	0xa548f3fc
+
+$L$in_prologue:
+	mov	rdi,QWORD[8+rax]
+	mov	rsi,QWORD[16+rax]
+	mov	QWORD[152+r8],rax
+	mov	QWORD[168+r8],rsi
+	mov	QWORD[176+r8],rdi
+
+	mov	rdi,QWORD[40+r9]
+	mov	rsi,r8
+	mov	ecx,154
+	DD	0xa548f3fc
+
+	mov	rsi,r9
+	xor	rcx,rcx
+	mov	rdx,QWORD[8+rsi]
+	mov	r8,QWORD[rsi]
+	mov	r9,QWORD[16+rsi]
+	mov	r10,QWORD[40+rsi]
+	lea	r11,[56+rsi]
+	lea	r12,[24+rsi]
+	mov	QWORD[32+rsp],r10
+	mov	QWORD[40+rsp],r11
+	mov	QWORD[48+rsp],r12
+	mov	QWORD[56+rsp],rcx
+	call	QWORD[__imp_RtlVirtualUnwind]
+
+	mov	eax,1
+	add	rsp,64
+	popfq
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rbp
+	pop	rbx
+	pop	rdi
+	pop	rsi
+	ret
+
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_sha512_block_data_order_nohw wrt ..imagebase
+	DD	$L$SEH_end_sha512_block_data_order_nohw wrt ..imagebase
+	DD	$L$SEH_info_sha512_block_data_order_nohw wrt ..imagebase
+	DD	$L$SEH_begin_sha512_block_data_order_avx wrt ..imagebase
+	DD	$L$SEH_end_sha512_block_data_order_avx wrt ..imagebase
+	DD	$L$SEH_info_sha512_block_data_order_avx wrt ..imagebase
+section	.xdata rdata align=8
+ALIGN	8
+$L$SEH_info_sha512_block_data_order_nohw:
+	DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+	DD	$L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase
+$L$SEH_info_sha512_block_data_order_avx:
+	DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+	DD	$L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/vpaes-armv7-linux.S b/gen/bcm/vpaes-armv7-linux.S
new file mode 100644
index 0000000..6e7898a
--- /dev/null
+++ b/gen/bcm/vpaes-armv7-linux.S
@@ -0,0 +1,1225 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
+.syntax	unified
+
+.arch	armv7-a
+.fpu	neon
+
+#if defined(__thumb2__)
+.thumb
+#else
+.code	32
+#endif
+
+.text
+
+.type	_vpaes_consts,%object
+.align	7	@ totally strategic alignment
+_vpaes_consts:
+.Lk_mc_forward:@ mc_forward
+.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad	0x080B0A0904070605, 0x000302010C0F0E0D
+.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad	0x000302010C0F0E0D, 0x080B0A0904070605
+.Lk_mc_backward:@ mc_backward
+.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad	0x020100030E0D0C0F, 0x0A09080B06050407
+.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad	0x0A09080B06050407, 0x020100030E0D0C0F
+.Lk_sr:@ sr
+.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad	0x030E09040F0A0500, 0x0B06010C07020D08
+.quad	0x0F060D040B020900, 0x070E050C030A0108
+.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
+
+@
+@ "Hot" constants
+@
+.Lk_inv:@ inv, inva
+.quad	0x0E05060F0D080180, 0x040703090A0B0C02
+.quad	0x01040A060F0B0780, 0x030D0E0C02050809
+.Lk_ipt:@ input transform (lo, hi)
+.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+.Lk_sbo:@ sbou, sbot
+.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+.Lk_sb1:@ sb1u, sb1t
+.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.Lk_sb2:@ sb2u, sb2t
+.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
+.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
+
+.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,55,32,78,69,79,78,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align	2
+.size	_vpaes_consts,.-_vpaes_consts
+.align	6
+@@
+@@  _vpaes_preheat
+@@
+@@  Fills q9-q15 as specified below.
+@@
+.type	_vpaes_preheat,%function
+.align	4
+_vpaes_preheat:
+	adr	r10, .Lk_inv
+	vmov.i8	q9, #0x0f		@ .Lk_s0F
+	vld1.64	{q10,q11}, [r10]!	@ .Lk_inv
+	add	r10, r10, #64		@ Skip .Lk_ipt, .Lk_sbo
+	vld1.64	{q12,q13}, [r10]!	@ .Lk_sb1
+	vld1.64	{q14,q15}, [r10]	@ .Lk_sb2
+	bx	lr
+
+@@
+@@  _aes_encrypt_core
+@@
+@@  AES-encrypt q0.
+@@
+@@  Inputs:
+@@     q0 = input
+@@     q9-q15 as in _vpaes_preheat
+@@    [r2] = scheduled keys
+@@
+@@  Output in q0
+@@  Clobbers  q1-q5, r8-r11
+@@  Preserves q6-q8 so you get some local vectors
+@@
+@@
+.type	_vpaes_encrypt_core,%function
+.align	4
+_vpaes_encrypt_core:
+	mov	r9, r2
+	ldr	r8, [r2,#240]		@ pull rounds
+	adr	r11, .Lk_ipt
+	@ vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
+	@ vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
+	vld1.64	{q2, q3}, [r11]
+	adr	r11, .Lk_mc_forward+16
+	vld1.64	{q5}, [r9]!		@ vmovdqu	(%r9),	%xmm5		# round0 key
+	vand	q1, q0, q9		@ vpand	%xmm9,	%xmm0,	%xmm1
+	vshr.u8	q0, q0, #4		@ vpsrlb	$4,	%xmm0,	%xmm0
+	vtbl.8	d2, {q2}, d2	@ vpshufb	%xmm1,	%xmm2,	%xmm1
+	vtbl.8	d3, {q2}, d3
+	vtbl.8	d4, {q3}, d0	@ vpshufb	%xmm0,	%xmm3,	%xmm2
+	vtbl.8	d5, {q3}, d1
+	veor	q0, q1, q5		@ vpxor	%xmm5,	%xmm1,	%xmm0
+	veor	q0, q0, q2		@ vpxor	%xmm2,	%xmm0,	%xmm0
+
+	@ .Lenc_entry ends with a bne instruction which is normally paired with
+	@ subs in .Lenc_loop.
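+	@ On this first pass the flags must already be set so that the bne at
+	@ the end of .Lenc_entry branches into .Lenc_loop; tst of the nonzero
+	@ round count clears Z.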
+	tst	r8, r8
+	b	.Lenc_entry
+
+.align	4
+.Lenc_loop:
+	@ middle of middle round
+	add	r10, r11, #0x40
+	vtbl.8	d8, {q13}, d4	@ vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
+	vtbl.8	d9, {q13}, d5
+	vld1.64	{q1}, [r11]!		@ vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
+	vtbl.8	d0, {q12}, d6	@ vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
+	vtbl.8	d1, {q12}, d7
+	veor	q4, q4, q5		@ vpxor		%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	vtbl.8	d10, {q15}, d4	@ vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
+	vtbl.8	d11, {q15}, d5
+	veor	q0, q0, q4		@ vpxor		%xmm4,	%xmm0,	%xmm0	# 0 = A
+	vtbl.8	d4, {q14}, d6	@ vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
+	vtbl.8	d5, {q14}, d7
+	vld1.64	{q4}, [r10]		@ vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
+	vtbl.8	d6, {q0}, d2	@ vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
+	vtbl.8	d7, {q0}, d3
+	veor	q2, q2, q5		@ vpxor		%xmm5,	%xmm2,	%xmm2	# 2 = 2A
+	@ Write to q5 instead of q0, so the table and destination registers do
+	@ not overlap.
+	vtbl.8	d10, {q0}, d8	@ vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
+	vtbl.8	d11, {q0}, d9
+	veor	q3, q3, q2		@ vpxor		%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
+	vtbl.8	d8, {q3}, d2	@ vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
+	vtbl.8	d9, {q3}, d3
+	@ Here we restore the original q0/q5 usage.
+	veor	q0, q5, q3		@ vpxor		%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
+	and	r11, r11, #~(1<<6)	@ and		$0x30,	%r11		# ... mod 4
+	veor	q0, q0, q4		@ vpxor		%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
+	subs	r8, r8, #1		@ nr--
+
+.Lenc_entry:
+	@ top of round
+	vand	q1, q0, q9		@ vpand		%xmm0,	%xmm9,	%xmm1   # 0 = k
+	vshr.u8	q0, q0, #4		@ vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	vtbl.8	d10, {q11}, d2	@ vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
+	vtbl.8	d11, {q11}, d3
+	veor	q1, q1, q0		@ vpxor		%xmm0,	%xmm1,	%xmm1	# 0 = j
+	vtbl.8	d6, {q10}, d0	@ vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
+	vtbl.8	d7, {q10}, d1
+	vtbl.8	d8, {q10}, d2	@ vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
+	vtbl.8	d9, {q10}, d3
+	veor	q3, q3, q5		@ vpxor		%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	veor	q4, q4, q5		@ vpxor		%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
+	vtbl.8	d4, {q10}, d6	@ vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
+	vtbl.8	d5, {q10}, d7
+	vtbl.8	d6, {q10}, d8	@ vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
+	vtbl.8	d7, {q10}, d9
+	veor	q2, q2, q1		@ vpxor		%xmm1,	%xmm2,	%xmm2  	# 2 = io
+	veor	q3, q3, q0		@ vpxor		%xmm0,	%xmm3,	%xmm3	# 3 = jo
+	vld1.64	{q5}, [r9]!		@ vmovdqu	(%r9),	%xmm5
+	bne	.Lenc_loop
+
+	@ middle of last round
+	add	r10, r11, #0x80
+
+	adr	r11, .Lk_sbo
+	@ Read to q1 instead of q4, so the vtbl.8 instruction below does not
+	@ overlap table and destination registers.
+	vld1.64	{q1}, [r11]!		@ vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou
+	vld1.64	{q0}, [r11]		@ vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
+	vtbl.8	d8, {q1}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	vtbl.8	d9, {q1}, d5
+	vld1.64	{q1}, [r10]		@ vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
+	@ Write to q2 instead of q0 below, to avoid overlapping table and
+	@ destination registers.
+	vtbl.8	d4, {q0}, d6	@ vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
+	vtbl.8	d5, {q0}, d7
+	veor	q4, q4, q5		@ vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	veor	q2, q2, q4		@ vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	@ Here we restore the original q0/q2 usage.
+	vtbl.8	d0, {q2}, d2	@ vpshufb	%xmm1,	%xmm0,	%xmm0
+	vtbl.8	d1, {q2}, d3
+	bx	lr
+.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+.globl	vpaes_encrypt
+.hidden	vpaes_encrypt
+.type	vpaes_encrypt,%function
+.align	4
+vpaes_encrypt:
+	@ _vpaes_encrypt_core uses r8-r11. Round up to r7-r11 to maintain stack
+	@ alignment.
+	stmdb	sp!, {r7,r8,r9,r10,r11,lr}
+	@ _vpaes_encrypt_core uses q4-q5 (d8-d11), which are callee-saved.
+	vstmdb	sp!, {d8,d9,d10,d11}
+
+	vld1.64	{q0}, [r0]
+	bl	_vpaes_preheat
+	bl	_vpaes_encrypt_core
+	vst1.64	{q0}, [r1]
+
+	vldmia	sp!, {d8,d9,d10,d11}
+	ldmia	sp!, {r7,r8,r9,r10,r11, pc}	@ return
+.size	vpaes_encrypt,.-vpaes_encrypt
+
+@
+@  Decryption stuff
+@
+.type	_vpaes_decrypt_consts,%object
+.align	4
+_vpaes_decrypt_consts:
+.Lk_dipt:@ decryption input transform
+.quad	0x0F505B040B545F00, 0x154A411E114E451A
+.quad	0x86E383E660056500, 0x12771772F491F194
+.Lk_dsbo:@ decryption sbox final output
+.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.Lk_dsb9:@ decryption sbox output *9*u, *9*t
+.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd:@ decryption sbox output *D*u, *D*t
+.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb:@ decryption sbox output *B*u, *B*t
+.quad	0xD022649296B44200, 0x602646F6B0F2D404
+.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe:@ decryption sbox output *E*u, *E*t
+.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+.size	_vpaes_decrypt_consts,.-_vpaes_decrypt_consts
+
+@@
+@@  Decryption core
+@@
+@@  Same API as encryption core, except it clobbers q12-q15 rather than using
+@@  the values from _vpaes_preheat. q9-q11 must still be set from
+@@  _vpaes_preheat.
+@@
+.type	_vpaes_decrypt_core,%function
+.align	4
+_vpaes_decrypt_core:
+	mov	r9, r2
+	ldr	r8, [r2,#240]		@ pull rounds
+
+	@ This function performs shuffles with various constants. The x86_64
+	@ version loads them on-demand into %xmm0-%xmm5. This does not work well
+	@ for ARMv7 because those registers are shuffle destinations. The ARMv8
+	@ version preloads those constants into registers, but ARMv7 has half
+	@ the registers to work with. Instead, we load them on-demand into
+	@ q12-q15, registers normally used for preloaded constants. This is fine
+	@ because decryption doesn't use those constants. The values are
+	@ constant, so this does not interfere with potential 2x optimizations.
+	adr	r7, .Lk_dipt
+
+	vld1.64	{q12,q13}, [r7]		@ vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
+	lsl	r11, r8, #4		@ mov		%rax,	%r11;	shl	$4, %r11
+	eor	r11, r11, #0x30		@ xor		$0x30,	%r11
+	adr	r10, .Lk_sr
+	and	r11, r11, #0x30		@ and		$0x30,	%r11
+	add	r11, r11, r10
+	adr	r10, .Lk_mc_forward+48
+
+	vld1.64	{q4}, [r9]!		@ vmovdqu	(%r9),	%xmm4		# round0 key
+	vand	q1, q0, q9		@ vpand		%xmm9,	%xmm0,	%xmm1
+	vshr.u8	q0, q0, #4		@ vpsrlb	$4,	%xmm0,	%xmm0
+	vtbl.8	d4, {q12}, d2	@ vpshufb	%xmm1,	%xmm2,	%xmm2
+	vtbl.8	d5, {q12}, d3
+	vld1.64	{q5}, [r10]		@ vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
+					@ vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
+	vtbl.8	d0, {q13}, d0	@ vpshufb	%xmm0,	%xmm1,	%xmm0
+	vtbl.8	d1, {q13}, d1
+	veor	q2, q2, q4		@ vpxor		%xmm4,	%xmm2,	%xmm2
+	veor	q0, q0, q2		@ vpxor		%xmm2,	%xmm0,	%xmm0
+
+	@ .Ldec_entry ends with a bnz instruction which is normally paired with
+	@ subs in .Ldec_loop.
+	tst	r8, r8
+	b	.Ldec_entry
+
+.align	4
+.Ldec_loop:
+@
+@  Inverse mix columns
+@
+
+	@ We load .Lk_dsb* into q12-q15 on-demand. See the comment at the top of
+	@ the function.
+	adr	r10, .Lk_dsb9
+	vld1.64	{q12,q13}, [r10]!	@ vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
+					@ vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
+	@ Load sbd* ahead of time.
+	vld1.64	{q14,q15}, [r10]!	@ vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
+					@ vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
+	vtbl.8	d8, {q12}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
+	vtbl.8	d9, {q12}, d5
+	vtbl.8	d2, {q13}, d6	@ vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
+	vtbl.8	d3, {q13}, d7
+	veor	q0, q4, q0		@ vpxor		%xmm4,	%xmm0,	%xmm0
+
+	veor	q0, q0, q1		@ vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
+
+	@ Load sbb* ahead of time.
+	vld1.64	{q12,q13}, [r10]!	@ vmovdqa	0x20(%r10),%xmm4		# 4 : sbbu
+					@ vmovdqa	0x30(%r10),%xmm1		# 0 : sbbt
+
+	vtbl.8	d8, {q14}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
+	vtbl.8	d9, {q14}, d5
+	@ Write to q1 instead of q0, so the table and destination registers do
+	@ not overlap.
+	vtbl.8	d2, {q0}, d10	@ vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	vtbl.8	d3, {q0}, d11
+	@ Here we restore the original q0/q1 usage. This instruction is
+	@ reordered from the ARMv8 version so we do not clobber the vtbl.8
+	@ below.
+	veor	q0, q1, q4		@ vpxor		%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	vtbl.8	d2, {q15}, d6	@ vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
+	vtbl.8	d3, {q15}, d7
+					@ vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
+	veor	q0, q0, q1		@ vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
+					@ vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
+
+	@ Load sbe* ahead of time.
+	vld1.64	{q14,q15}, [r10]!	@ vmovdqa	0x40(%r10),%xmm4		# 4 : sbeu
+					@ vmovdqa	0x50(%r10),%xmm1		# 0 : sbet
+
+	vtbl.8	d8, {q12}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
+	vtbl.8	d9, {q12}, d5
+	@ Write to q1 instead of q0, so the table and destination registers do
+	@ not overlap.
+	vtbl.8	d2, {q0}, d10	@ vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	vtbl.8	d3, {q0}, d11
+	@ Here we restore the original q0/q1 usage. This instruction is
+	@ reordered from the ARMv8 version so we do not clobber the vtbl.8
+	@ below.
+	veor	q0, q1, q4		@ vpxor		%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	vtbl.8	d2, {q13}, d6	@ vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
+	vtbl.8	d3, {q13}, d7
+	veor	q0, q0, q1		@ vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
+
+	vtbl.8	d8, {q14}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
+	vtbl.8	d9, {q14}, d5
+	@ Write to q1 instead of q0, so the table and destination registers do
+	@ not overlap.
+	vtbl.8	d2, {q0}, d10	@ vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	vtbl.8	d3, {q0}, d11
+	@ Here we restore the original q0/q1 usage. This instruction is
+	@ reordered from the ARMv8 version so we do not clobber the vtbl.8
+	@ below.
+	veor	q0, q1, q4		@ vpxor		%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	vtbl.8	d2, {q15}, d6	@ vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
+	vtbl.8	d3, {q15}, d7
+	vext.8	q5, q5, q5, #12		@ vpalignr 	$12,	%xmm5,	%xmm5,	%xmm5
+	veor	q0, q0, q1		@ vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	subs	r8, r8, #1		@ sub		$1,%rax			# nr--
+
+.Ldec_entry:
+	@ top of round
+	vand	q1, q0, q9		@ vpand		%xmm9,	%xmm0,	%xmm1	# 0 = k
+	vshr.u8	q0, q0, #4		@ vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	vtbl.8	d4, {q11}, d2	@ vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
+	vtbl.8	d5, {q11}, d3
+	veor	q1, q1, q0		@ vpxor		%xmm0,	%xmm1,	%xmm1	# 0 = j
+	vtbl.8	d6, {q10}, d0	@ vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
+	vtbl.8	d7, {q10}, d1
+	vtbl.8	d8, {q10}, d2	@ vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
+	vtbl.8	d9, {q10}, d3
+	veor	q3, q3, q2		@ vpxor		%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	veor	q4, q4, q2		@ vpxor		%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
+	vtbl.8	d4, {q10}, d6	@ vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
+	vtbl.8	d5, {q10}, d7
+	vtbl.8	d6, {q10}, d8	@ vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
+	vtbl.8	d7, {q10}, d9
+	veor	q2, q2, q1		@ vpxor		%xmm1,	%xmm2,	%xmm2	# 2 = io
+	veor	q3, q3, q0		@ vpxor		%xmm0,  %xmm3,	%xmm3	# 3 = jo
+	vld1.64	{q0}, [r9]!		@ vmovdqu	(%r9),	%xmm0
+	bne	.Ldec_loop
+
+	@ middle of last round
+
+	adr	r10, .Lk_dsbo
+
+	@ Write to q1 rather than q4 to avoid overlapping table and destination.
+	vld1.64	{q1}, [r10]!		@ vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
+	vtbl.8	d8, {q1}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	vtbl.8	d9, {q1}, d5
+	@ Write to q2 rather than q1 to avoid overlapping table and destination.
+	vld1.64	{q2}, [r10]		@ vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
+	vtbl.8	d2, {q2}, d6	@ vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
+	vtbl.8	d3, {q2}, d7
+	vld1.64	{q2}, [r11]		@ vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
+	veor	q4, q4, q0		@ vpxor		%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
+	@ Write to q1 rather than q0 so the table and destination registers
+	@ below do not overlap.
+	veor	q1, q1, q4		@ vpxor		%xmm4,	%xmm1,	%xmm0	# 0 = A
+	vtbl.8	d0, {q1}, d4	@ vpshufb	%xmm2,	%xmm0,	%xmm0
+	vtbl.8	d1, {q1}, d5
+	bx	lr
+.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+.globl	vpaes_decrypt
+.hidden	vpaes_decrypt
+.type	vpaes_decrypt,%function
+.align	4
+vpaes_decrypt:
+	@ _vpaes_decrypt_core uses r7-r11.
+	stmdb	sp!, {r7,r8,r9,r10,r11,lr}
+	@ _vpaes_decrypt_core uses q4-q5 (d8-d11), which are callee-saved.
+	vstmdb	sp!, {d8,d9,d10,d11}
+
+	vld1.64	{q0}, [r0]
+	bl	_vpaes_preheat
+	bl	_vpaes_decrypt_core
+	vst1.64	{q0}, [r1]
+
+	vldmia	sp!, {d8,d9,d10,d11}
+	ldmia	sp!, {r7,r8,r9,r10,r11, pc}	@ return
+.size	vpaes_decrypt,.-vpaes_decrypt
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@@                                                    @@
+@@                  AES key schedule                  @@
+@@                                                    @@
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+@ This function diverges from both x86_64 and aarch64 in which constants are
+@ pinned. x86_64 has a common preheat function for all operations. aarch64
+@ separates them because it has enough registers to pin nearly all constants.
+@ armv7 does not have enough registers, and the need for explicit loads and
+@ stores also complicates using x86_64's register allocation directly.
+@
+@ We pin some constants for convenience and leave q14 and q15 free to load
+@ others on demand.
+
+@
+@  Key schedule constants
+@
+.type	_vpaes_key_consts,%object
+.align	4
+_vpaes_key_consts:
+.Lk_dksd:@ decryption key schedule: invskew x*D
+.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb:@ decryption key schedule: invskew x*B
+.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse:@ decryption key schedule: invskew x*E + 0x63
+.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9:@ decryption key schedule: invskew x*9
+.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+.Lk_rcon:@ rcon
+.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_opt:@ output transform
+.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+.Lk_deskew:@ deskew tables: inverts the sbox's "skew"
+.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+.size	_vpaes_key_consts,.-_vpaes_key_consts
+
+.type	_vpaes_key_preheat,%function
+.align	4
+_vpaes_key_preheat:
+	adr	r11, .Lk_rcon
+	vmov.i8	q12, #0x5b			@ .Lk_s63
+	adr	r10, .Lk_inv			@ Must be aligned to 8 mod 16.
+	vmov.i8	q9, #0x0f			@ .Lk_s0F
+	vld1.64	{q10,q11}, [r10]		@ .Lk_inv
+	vld1.64	{q8}, [r11]			@ .Lk_rcon
+	bx	lr
+.size	_vpaes_key_preheat,.-_vpaes_key_preheat
+
+.type	_vpaes_schedule_core,%function
+.align	4
+_vpaes_schedule_core:
+	@ We only need to save lr, but ARM requires an 8-byte stack alignment,
+	@ so save an extra register.
+	stmdb	sp!, {r3,lr}
+
+	bl	_vpaes_key_preheat	@ load the tables
+
+	adr	r11, .Lk_ipt		@ Must be aligned to 8 mod 16.
+	vld1.64	{q0}, [r0]!		@ vmovdqu	(%rdi),	%xmm0		# load key (unaligned)
+
+	@ input transform
+	@ Use q4 here rather than q3 so .Lschedule_am_decrypting does not
+	@ overlap table and destination.
+	vmov	q4, q0			@ vmovdqa	%xmm0,	%xmm3
+	bl	_vpaes_schedule_transform
+	adr	r10, .Lk_sr		@ Must be aligned to 8 mod 16.
+	vmov	q7, q0			@ vmovdqa	%xmm0,	%xmm7
+
+	add	r8, r8, r10
+	tst	r3, r3
+	bne	.Lschedule_am_decrypting
+
+	@ encrypting, output zeroth round key after transform
+	vst1.64	{q0}, [r2]		@ vmovdqu	%xmm0,	(%rdx)
+	b	.Lschedule_go
+
+.Lschedule_am_decrypting:
+	@ decrypting, output zeroth round key after shiftrows
+	vld1.64	{q1}, [r8]		@ vmovdqa	(%r8,%r10),	%xmm1
+	vtbl.8	d6, {q4}, d2	@ vpshufb  	%xmm1,	%xmm3,	%xmm3
+	vtbl.8	d7, {q4}, d3
+	vst1.64	{q3}, [r2]		@ vmovdqu	%xmm3,	(%rdx)
+	eor	r8, r8, #0x30		@ xor	$0x30, %r8
+
+.Lschedule_go:
+	cmp	r1, #192		@ cmp	$192,	%esi
+	bhi	.Lschedule_256
+	beq	.Lschedule_192
+	@ 128: fall through
+
+@@
+@@  .schedule_128
+@@
+@@  128-bit specific part of key schedule.
+@@
+@@  This schedule is really simple, because all its parts
+@@  are accomplished by the subroutines.
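+@@
+@@  (Concretely: AES-128 uses nr+1 = 11 round keys. The zeroth is written
+@@  by _vpaes_schedule_core before .Lschedule_go, the loop below mangles
+@@  nine more, and .Lschedule_mangle_last emits the eleventh.)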
+@@
+.Lschedule_128:
+	mov	r0, #10		@ mov	$10, %esi
+
+.Loop_schedule_128:
+	bl	_vpaes_schedule_round
+	subs	r0, r0, #1		@ dec	%esi
+	beq	.Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle	@ write output
+	b	.Loop_schedule_128
+
+@@
+@@  .aes_schedule_192
+@@
+@@  192-bit specific part of key schedule.
+@@
+@@  The main body of this schedule is the same as the 128-bit
+@@  schedule, but with more smearing.  The long, high side is
+@@  stored in q7 as before, and the short, low side is in
+@@  the high bits of q6.
+@@
+@@  This schedule is somewhat nastier, however, because each
+@@  round produces 192 bits of key material, or 1.5 round keys.
+@@  Therefore, on each cycle we do 2 rounds and produce 3 round
+@@  keys.
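+@@
+@@  (Concretely: AES-192 needs nr+1 = 13 round keys. The zeroth is written
+@@  by _vpaes_schedule_core; each of the four passes through
+@@  .Loop_schedule_192 then writes three keys, the last of them via
+@@  .Lschedule_mangle_last, for 1 + 4*3 = 13.)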
+@@
+.align	4
+.Lschedule_192:
+	sub	r0, r0, #8
+	vld1.64	{q0}, [r0]			@ vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
+	bl	_vpaes_schedule_transform	@ input transform
+	vmov	q6, q0				@ vmovdqa	%xmm0,	%xmm6		# save short part
+	vmov.i8	d12, #0			@ vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
+						@ vmovhlps	%xmm4,	%xmm6,	%xmm6		# clobber low side with zeros
+	mov	r0, #4			@ mov	$4,	%esi
+
+.Loop_schedule_192:
+	bl	_vpaes_schedule_round
+	vext.8	q0, q6, q0, #8			@ vpalignr	$8,%xmm6,%xmm0,%xmm0
+	bl	_vpaes_schedule_mangle		@ save key n
+	bl	_vpaes_schedule_192_smear
+	bl	_vpaes_schedule_mangle		@ save key n+1
+	bl	_vpaes_schedule_round
+	subs	r0, r0, #1			@ dec	%esi
+	beq	.Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle		@ save key n+2
+	bl	_vpaes_schedule_192_smear
+	b	.Loop_schedule_192
+
+@@
+@@  .aes_schedule_256
+@@
+@@  256-bit specific part of key schedule.
+@@
+@@  The structure here is very similar to the 128-bit
+@@  schedule, but with an additional "low side" in
+@@  q6.  The low side's rounds are the same as the
+@@  high side's, except no rcon and no rotation.
+@@
+.align	4
+.Lschedule_256:
+	vld1.64	{q0}, [r0]			@ vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
+	bl	_vpaes_schedule_transform	@ input transform
+	mov	r0, #7			@ mov	$7, %esi
+
+.Loop_schedule_256:
+	bl	_vpaes_schedule_mangle		@ output low result
+	vmov	q6, q0				@ vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
+
+	@ high round
+	bl	_vpaes_schedule_round
+	subs	r0, r0, #1			@ dec	%esi
+	beq	.Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle
+
+	@ low round. swap xmm7 and xmm6
+	vdup.32	q0, d1[1]		@ vpshufd	$0xFF,	%xmm0,	%xmm0
+	vmov.i8	q4, #0
+	vmov	q5, q7			@ vmovdqa	%xmm7,	%xmm5
+	vmov	q7, q6			@ vmovdqa	%xmm6,	%xmm7
+	bl	_vpaes_schedule_low_round
+	vmov	q7, q5			@ vmovdqa	%xmm5,	%xmm7
+
+	b	.Loop_schedule_256
+
+@@
+@@  .aes_schedule_mangle_last
+@@
+@@  Mangler for last round of key schedule
+@@  Mangles q0
+@@    when encrypting, outputs out(q0) ^ 63
+@@    when decrypting, outputs unskew(q0)
+@@
+@@  Always called right before return... jumps to cleanup and exits
+@@
+.align	4
+.Lschedule_mangle_last:
+	@ schedule last round key from xmm0
+	adr	r11, .Lk_deskew			@ lea	.Lk_deskew(%rip),%r11	# prepare to deskew
+	tst	r3, r3
+	bne	.Lschedule_mangle_last_dec
+
+	@ encrypting
+	vld1.64	{q1}, [r8]		@ vmovdqa	(%r8,%r10),%xmm1
+	adr	r11, .Lk_opt		@ lea		.Lk_opt(%rip),	%r11		# prepare to output transform
+	add	r2, r2, #32		@ add		$32,	%rdx
+	vmov	q2, q0
+	vtbl.8	d0, {q2}, d2	@ vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute
+	vtbl.8	d1, {q2}, d3
+
+.Lschedule_mangle_last_dec:
+	sub	r2, r2, #16			@ add	$-16,	%rdx
+	veor	q0, q0, q12			@ vpxor	.Lk_s63(%rip),	%xmm0,	%xmm0
+	bl	_vpaes_schedule_transform	@ output transform
+	vst1.64	{q0}, [r2]			@ vmovdqu	%xmm0,	(%rdx)		# save last key
+
+	@ cleanup
+	veor	q0, q0, q0		@ vpxor	%xmm0,	%xmm0,	%xmm0
+	veor	q1, q1, q1		@ vpxor	%xmm1,	%xmm1,	%xmm1
+	veor	q2, q2, q2		@ vpxor	%xmm2,	%xmm2,	%xmm2
+	veor	q3, q3, q3		@ vpxor	%xmm3,	%xmm3,	%xmm3
+	veor	q4, q4, q4		@ vpxor	%xmm4,	%xmm4,	%xmm4
+	veor	q5, q5, q5		@ vpxor	%xmm5,	%xmm5,	%xmm5
+	veor	q6, q6, q6		@ vpxor	%xmm6,	%xmm6,	%xmm6
+	veor	q7, q7, q7		@ vpxor	%xmm7,	%xmm7,	%xmm7
+	ldmia	sp!, {r3,pc}		@ return
+.size	_vpaes_schedule_core,.-_vpaes_schedule_core
+
+@@
+@@  .aes_schedule_192_smear
+@@
+@@  Smear the short, low side in the 192-bit key schedule.
+@@
+@@  Inputs:
+@@    q7: high side, b  a  x  y
+@@    q6:  low side, d  c  0  0
+@@
+@@  Outputs:
+@@    q6: b+c+d  b+c  0  0
+@@    q0: b+c+d  b+c  b  a
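+@@
+@@  A Python sketch (in the style of the .Lk_opt_then_skew script below),
+@@  with vectors written as four dwords from high to low:
+@@
+@@    def smear_192(q7, q6):
+@@        b, a, _, _ = q7                    # high side
+@@        d, c, _, _ = q6                    # low side
+@@        q0 = (b ^ c ^ d, b ^ c, b, a)
+@@        q6 = (b ^ c ^ d, b ^ c, 0, 0)
+@@        return q0, q6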
+@@
+.type	_vpaes_schedule_192_smear,%function
+.align	4
+_vpaes_schedule_192_smear:
+	vmov.i8	q1, #0
+	vdup.32	q0, d15[1]
+	vshl.i64	q1, q6, #32		@ vpshufd	$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
+	vmov	d0, d15		@ vpshufd	$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
+	veor	q6, q6, q1		@ vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
+	veor	q1, q1, q1		@ vpxor	%xmm1,	%xmm1,	%xmm1
+	veor	q6, q6, q0		@ vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
+	vmov	q0, q6			@ vmovdqa	%xmm6,	%xmm0
+	vmov	d12, d2		@ vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
+	bx	lr
+.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+@@
+@@  .aes_schedule_round
+@@
+@@  Runs one main round of the key schedule on q0, q7
+@@
+@@  Specifically, runs subbytes on the high dword of q0
+@@  then rotates it by one byte and xors into the low dword of
+@@  q7.
+@@
+@@  Adds rcon from low byte of q8, then rotates q8 for
+@@  next rcon.
+@@
+@@  Smears the dwords of q7 by xoring the low into the
+@@  second low, result into third, result into highest.
+@@
+@@  Returns results in q7 = q0.
+@@  Clobbers q1-q4, r11.
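+@@
+@@  Ignoring vpaes's basis transform, this is the standard AES key
+@@  expansion step. A sketch, with a round key as four 32-bit words and
+@@  assuming subword/rotword helpers:
+@@
+@@    def expand_round(prev, rcon):
+@@        t = subword(rotword(prev[3])) ^ rcon
+@@        w0 = prev[0] ^ t              # xor into the low dword
+@@        w1 = prev[1] ^ w0             # smear: low into second low,
+@@        w2 = prev[2] ^ w1             # result into third,
+@@        w3 = prev[3] ^ w2             # result into highest
+@@        return [w0, w1, w2, w3]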
+@@
+.type	_vpaes_schedule_round,%function
+.align	4
+_vpaes_schedule_round:
+	@ extract rcon from xmm8
+	vmov.i8	q4, #0				@ vpxor		%xmm4,	%xmm4,	%xmm4
+	vext.8	q1, q8, q4, #15		@ vpalignr	$15,	%xmm8,	%xmm4,	%xmm1
+	vext.8	q8, q8, q8, #15	@ vpalignr	$15,	%xmm8,	%xmm8,	%xmm8
+	veor	q7, q7, q1			@ vpxor		%xmm1,	%xmm7,	%xmm7
+
+	@ rotate
+	vdup.32	q0, d1[1]			@ vpshufd	$0xFF,	%xmm0,	%xmm0
+	vext.8	q0, q0, q0, #1			@ vpalignr	$1,	%xmm0,	%xmm0,	%xmm0
+
+	@ fall through...
+
+	@ low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+	@ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12.
+	@ We pin other values in _vpaes_key_preheat, so load them now.
+	adr	r11, .Lk_sb1
+	vld1.64	{q14,q15}, [r11]
+
+	@ smear xmm7
+	vext.8	q1, q4, q7, #12			@ vpslldq	$4,	%xmm7,	%xmm1
+	veor	q7, q7, q1			@ vpxor	%xmm1,	%xmm7,	%xmm7
+	vext.8	q4, q4, q7, #8			@ vpslldq	$8,	%xmm7,	%xmm4
+
+	@ subbytes
+	vand	q1, q0, q9			@ vpand		%xmm9,	%xmm0,	%xmm1		# 0 = k
+	vshr.u8	q0, q0, #4			@ vpsrlb	$4,	%xmm0,	%xmm0		# 1 = i
+	veor	q7, q7, q4			@ vpxor		%xmm4,	%xmm7,	%xmm7
+	vtbl.8	d4, {q11}, d2		@ vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
+	vtbl.8	d5, {q11}, d3
+	veor	q1, q1, q0			@ vpxor		%xmm0,	%xmm1,	%xmm1		# 0 = j
+	vtbl.8	d6, {q10}, d0		@ vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
+	vtbl.8	d7, {q10}, d1
+	veor	q3, q3, q2			@ vpxor		%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
+	vtbl.8	d8, {q10}, d2		@ vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
+	vtbl.8	d9, {q10}, d3
+	veor	q7, q7, q12			@ vpxor		.Lk_s63(%rip),	%xmm7,	%xmm7
+	vtbl.8	d6, {q10}, d6		@ vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
+	vtbl.8	d7, {q10}, d7
+	veor	q4, q4, q2			@ vpxor		%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
+	vtbl.8	d4, {q10}, d8		@ vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
+	vtbl.8	d5, {q10}, d9
+	veor	q3, q3, q1			@ vpxor		%xmm1,	%xmm3,	%xmm3		# 2 = io
+	veor	q2, q2, q0			@ vpxor		%xmm0,	%xmm2,	%xmm2		# 3 = jo
+	vtbl.8	d8, {q15}, d6		@ vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
+	vtbl.8	d9, {q15}, d7
+	vtbl.8	d2, {q14}, d4		@ vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
+	vtbl.8	d3, {q14}, d5
+	veor	q1, q1, q4			@ vpxor		%xmm4,	%xmm1,	%xmm1		# 0 = sbox output
+
+	@ add in smeared stuff
+	veor	q0, q1, q7			@ vpxor	%xmm7,	%xmm1,	%xmm0
+	veor	q7, q1, q7			@ vmovdqa	%xmm0,	%xmm7
+	bx	lr
+.size	_vpaes_schedule_round,.-_vpaes_schedule_round
+
+@@
+@@  .aes_schedule_transform
+@@
+@@  Linear-transform q0 according to tables at [r11]
+@@
+@@  Requires that q9 = 0x0F0F... as in preheat
+@@  Output in q0
+@@  Clobbers q1, q2, q14, q15
+@@
+.type	_vpaes_schedule_transform,%function
+.align	4
+_vpaes_schedule_transform:
+	vld1.64	{q14,q15}, [r11]	@ vmovdqa	(%r11),	%xmm2 	# lo
+					@ vmovdqa	16(%r11),	%xmm1 # hi
+	vand	q1, q0, q9		@ vpand	%xmm9,	%xmm0,	%xmm1
+	vshr.u8	q0, q0, #4		@ vpsrlb	$4,	%xmm0,	%xmm0
+	vtbl.8	d4, {q14}, d2	@ vpshufb	%xmm1,	%xmm2,	%xmm2
+	vtbl.8	d5, {q14}, d3
+	vtbl.8	d0, {q15}, d0	@ vpshufb	%xmm0,	%xmm1,	%xmm0
+	vtbl.8	d1, {q15}, d1
+	veor	q0, q0, q2		@ vpxor	%xmm2,	%xmm0,	%xmm0
+	bx	lr
+.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+@@
+@@  .aes_schedule_mangle
+@@
+@@  Mangles q0 from (basis-transformed) standard version
+@@  to our version.
+@@
+@@  On encrypt,
+@@    xor with 0x63
+@@    multiply by circulant 0,1,1,1
+@@    apply shiftrows transform
+@@
+@@  On decrypt,
+@@    xor with 0x63
+@@    multiply by "inverse mixcolumns" circulant E,B,D,9
+@@    deskew
+@@    apply shiftrows transform
+@@
+@@
+@@  Writes out to [r2], and increments or decrements it
+@@  Keeps track of round number mod 4 in r8
+@@  Preserves q0
+@@  Clobbers q1-q5
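+@@
+@@  A rough Python model of the encrypting path, with shufb(src, idx)
+@@  standing in for vtbl.8 (helper names are illustrative only):
+@@
+@@    def mangle_enc(key, mc_forward, sr):
+@@        t = bytes(b ^ 0x5b for b in key)      # XOR .Lk_s63
+@@        m1 = shufb(t, mc_forward)             # rotate columns once
+@@        m2 = shufb(m1, mc_forward)            # twice
+@@        m3 = shufb(m2, mc_forward)            # three times
+@@        out = bytes(x ^ y ^ z for x, y, z in zip(m1, m2, m3))
+@@        return shufb(out, sr)                 # circulant 0,1,1,1, ShiftRows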
+@@
+.type	_vpaes_schedule_mangle,%function
+.align	4
+_vpaes_schedule_mangle:
+	tst	r3, r3
+	vmov	q4, q0			@ vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
+	adr	r11, .Lk_mc_forward	@ Must be aligned to 8 mod 16.
+	vld1.64	{q5}, [r11]		@ vmovdqa	.Lk_mc_forward(%rip),%xmm5
+	bne	.Lschedule_mangle_dec
+
+	@ encrypting
+	@ Write to q2 so we do not overlap table and destination below.
+	veor	q2, q0, q12		@ vpxor		.Lk_s63(%rip),	%xmm0,	%xmm4
+	add	r2, r2, #16		@ add		$16,	%rdx
+	vtbl.8	d8, {q2}, d10	@ vpshufb	%xmm5,	%xmm4,	%xmm4
+	vtbl.8	d9, {q2}, d11
+	vtbl.8	d2, {q4}, d10	@ vpshufb	%xmm5,	%xmm4,	%xmm1
+	vtbl.8	d3, {q4}, d11
+	vtbl.8	d6, {q1}, d10	@ vpshufb	%xmm5,	%xmm1,	%xmm3
+	vtbl.8	d7, {q1}, d11
+	veor	q4, q4, q1		@ vpxor		%xmm1,	%xmm4,	%xmm4
+	vld1.64	{q1}, [r8]		@ vmovdqa	(%r8,%r10),	%xmm1
+	veor	q3, q3, q4		@ vpxor		%xmm4,	%xmm3,	%xmm3
+
+	b	.Lschedule_mangle_both
+.align	4
+.Lschedule_mangle_dec:
+	@ inverse mix columns
+	adr	r11, .Lk_dksd 		@ lea		.Lk_dksd(%rip),%r11
+	vshr.u8	q1, q4, #4		@ vpsrlb	$4,	%xmm4,	%xmm1	# 1 = hi
+	vand	q4, q4, q9		@ vpand		%xmm9,	%xmm4,	%xmm4	# 4 = lo
+
+	vld1.64	{q14,q15}, [r11]! 	@ vmovdqa	0x00(%r11),	%xmm2
+					@ vmovdqa	0x10(%r11),	%xmm3
+	vtbl.8	d4, {q14}, d8	@ vpshufb	%xmm4,	%xmm2,	%xmm2
+	vtbl.8	d5, {q14}, d9
+	vtbl.8	d6, {q15}, d2	@ vpshufb	%xmm1,	%xmm3,	%xmm3
+	vtbl.8	d7, {q15}, d3
+	@ Load .Lk_dksb ahead of time.
+	vld1.64	{q14,q15}, [r11]! 	@ vmovdqa	0x20(%r11),	%xmm2
+					@ vmovdqa	0x30(%r11),	%xmm3
+	@ Write to q13 so we do not overlap table and destination.
+	veor	q13, q3, q2		@ vpxor		%xmm2,	%xmm3,	%xmm3
+	vtbl.8	d6, {q13}, d10	@ vpshufb	%xmm5,	%xmm3,	%xmm3
+	vtbl.8	d7, {q13}, d11
+
+	vtbl.8	d4, {q14}, d8	@ vpshufb	%xmm4,	%xmm2,	%xmm2
+	vtbl.8	d5, {q14}, d9
+	veor	q2, q2, q3		@ vpxor		%xmm3,	%xmm2,	%xmm2
+	vtbl.8	d6, {q15}, d2	@ vpshufb	%xmm1,	%xmm3,	%xmm3
+	vtbl.8	d7, {q15}, d3
+	@ Load .Lk_dkse ahead of time.
+	vld1.64	{q14,q15}, [r11]! 	@ vmovdqa	0x40(%r11),	%xmm2
+					@ vmovdqa	0x50(%r11),	%xmm3
+	@ Write to q13 so we do not overlap table and destination.
+	veor	q13, q3, q2		@ vpxor		%xmm2,	%xmm3,	%xmm3
+	vtbl.8	d6, {q13}, d10	@ vpshufb	%xmm5,	%xmm3,	%xmm3
+	vtbl.8	d7, {q13}, d11
+
+	vtbl.8	d4, {q14}, d8	@ vpshufb	%xmm4,	%xmm2,	%xmm2
+	vtbl.8	d5, {q14}, d9
+	veor	q2, q2, q3		@ vpxor		%xmm3,	%xmm2,	%xmm2
+	vtbl.8	d6, {q15}, d2	@ vpshufb	%xmm1,	%xmm3,	%xmm3
+	vtbl.8	d7, {q15}, d3
+	@ Load .Lk_dks9 ahead of time.
+	vld1.64	{q14,q15}, [r11]! 	@ vmovdqa	0x60(%r11),	%xmm2
+					@ vmovdqa	0x70(%r11),	%xmm4
+	@ Write to q13 so we do not overlap table and destination.
+	veor	q13, q3, q2		@ vpxor		%xmm2,	%xmm3,	%xmm3
+
+	vtbl.8	d4, {q14}, d8	@ vpshufb	%xmm4,	%xmm2,	%xmm2
+	vtbl.8	d5, {q14}, d9
+	vtbl.8	d6, {q13}, d10	@ vpshufb	%xmm5,	%xmm3,	%xmm3
+	vtbl.8	d7, {q13}, d11
+	vtbl.8	d8, {q15}, d2	@ vpshufb	%xmm1,	%xmm4,	%xmm4
+	vtbl.8	d9, {q15}, d3
+	vld1.64	{q1}, [r8]		@ vmovdqa	(%r8,%r10),	%xmm1
+	veor	q2, q2, q3		@ vpxor	%xmm3,	%xmm2,	%xmm2
+	veor	q3, q4, q2		@ vpxor	%xmm2,	%xmm4,	%xmm3
+
+	sub	r2, r2, #16		@ add	$-16,	%rdx
+
+.Lschedule_mangle_both:
+	@ Write to q2 so table and destination do not overlap.
+	vtbl.8	d4, {q3}, d2	@ vpshufb	%xmm1,	%xmm3,	%xmm3
+	vtbl.8	d5, {q3}, d3
+	add	r8, r8, #64-16		@ add	$-16,	%r8
+	and	r8, r8, #~(1<<6)	@ and	$0x30,	%r8
+	vst1.64	{q2}, [r2]		@ vmovdqu	%xmm3,	(%rdx)
+	bx	lr
+.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+.globl	vpaes_set_encrypt_key
+.hidden	vpaes_set_encrypt_key
+.type	vpaes_set_encrypt_key,%function
+.align	4
+vpaes_set_encrypt_key:
+	stmdb	sp!, {r7,r8,r9,r10,r11, lr}
+	vstmdb	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+
+	lsr	r9, r1, #5		@ shr	$5,%eax
+	add	r9, r9, #5		@ add	$5,%eax
+	str	r9, [r2,#240]		@ mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+
+	mov	r3, #0		@ mov	$0,%ecx
+	mov	r8, #0x30		@ mov	$0x30,%r8d
+	bl	_vpaes_schedule_core
+	eor	r0, r0, r0
+
+	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+	ldmia	sp!, {r7,r8,r9,r10,r11, pc}	@ return
+.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
+
+.globl	vpaes_set_decrypt_key
+.hidden	vpaes_set_decrypt_key
+.type	vpaes_set_decrypt_key,%function
+.align	4
+vpaes_set_decrypt_key:
+	stmdb	sp!, {r7,r8,r9,r10,r11, lr}
+	vstmdb	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+
+	lsr	r9, r1, #5		@ shr	$5,%eax
+	add	r9, r9, #5		@ add	$5,%eax
+	str	r9, [r2,#240]		@ mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+	lsl	r9, r9, #4		@ shl	$4,%eax
+	add	r2, r2, #16		@ lea	16(%rdx,%rax),%rdx
+	add	r2, r2, r9
+
+	mov	r3, #1		@ mov	$1,%ecx
+	lsr	r8, r1, #1		@ shr	$1,%r8d
+	and	r8, r8, #32		@ and	$32,%r8d
+	eor	r8, r8, #32		@ xor	$32,%r8d	# nbits==192?0:32
+	bl	_vpaes_schedule_core
+
+	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+	ldmia	sp!, {r7,r8,r9,r10,r11, pc}	@ return
+.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
+
+@ Additional constants for converting to bsaes.
+.type	_vpaes_convert_consts,%object
+.align	4
+_vpaes_convert_consts:
+@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear
+@ transform in the AES S-box. 0x63 is incorporated into the low half of the
+@ table. This was computed with the following script:
+@
+@   def u64s_to_u128(x, y):
+@       return x | (y << 64)
+@   def u128_to_u64s(w):
+@       return w & ((1<<64)-1), w >> 64
+@   def get_byte(w, i):
+@       return (w >> (i*8)) & 0xff
+@   def apply_table(table, b):
+@       lo = b & 0xf
+@       hi = b >> 4
+@       return get_byte(table[0], lo) ^ get_byte(table[1], hi)
+@   def opt(b):
+@       table = [
+@           u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808),
+@           u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0),
+@       ]
+@       return apply_table(table, b)
+@   def rot_byte(b, n):
+@       return 0xff & ((b << n) | (b >> (8-n)))
+@   def skew(x):
+@       return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^
+@               rot_byte(x, 4))
+@   table = [0, 0]
+@   for i in range(16):
+@       table[0] |= (skew(opt(i)) ^ 0x63) << (i*8)
+@       table[1] |= skew(opt(i<<4)) << (i*8)
+@   print("	.quad	0x%016x, 0x%016x" % u128_to_u64s(table[0]))
+@   print("	.quad	0x%016x, 0x%016x" % u128_to_u64s(table[1]))
+.Lk_opt_then_skew:
+.quad	0x9cb8436798bc4763, 0x6440bb9f6044bf9b
+.quad	0x1f30062936192f00, 0xb49bad829db284ab
+
+@ .Lk_decrypt_transform is a permutation which performs an 8-bit left-rotation
+@ followed by a byte-swap on each 32-bit word of a vector. E.g., 0x11223344
+@ becomes 0x22334411 and then 0x11443322.
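+@ It can be reproduced with a similar script:
+@
+@   perm = []
+@   for word in range(4):
+@       for i in (2, 1, 0, 3):   # rotl8 then bswap, per 32-bit word
+@           perm.append(word*4 + i)
+@   lo = sum(perm[i] << (i*8) for i in range(8))
+@   hi = sum(perm[i + 8] << (i*8) for i in range(8))
+@   print("	.quad	0x%016x, 0x%016x" % (lo, hi))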
+.Lk_decrypt_transform:
+.quad	0x0704050603000102, 0x0f0c0d0e0b08090a
+.size	_vpaes_convert_consts,.-_vpaes_convert_consts
+
+@ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes);
+.globl	vpaes_encrypt_key_to_bsaes
+.hidden	vpaes_encrypt_key_to_bsaes
+.type	vpaes_encrypt_key_to_bsaes,%function
+.align	4
+vpaes_encrypt_key_to_bsaes:
+	stmdb	sp!, {r11, lr}
+
+	@ See _vpaes_schedule_core for the key schedule logic. In particular,
+	@ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper),
+	@ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last
+	@ contain the transformations not in the bsaes representation. This
+	@ function inverts those transforms.
+	@
+	@ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
+	@ representation, which does not match the other aes_nohw_*
+	@ implementations. The ARM aes_nohw_* stores each 32-bit word
+	@ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
+	@ cost of extra REV and VREV32 operations in little-endian ARM.
+
+	vmov.i8	q9, #0x0f		@ Required by _vpaes_schedule_transform
+	adr	r2, .Lk_mc_forward	@ Must be aligned to 8 mod 16.
+	add	r3, r2, 0x90		@ .Lk_sr+0x10-.Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression)
+
+	vld1.64	{q12}, [r2]
+	vmov.i8	q10, #0x5b		@ .Lk_s63 from vpaes-x86_64
+	adr	r11, .Lk_opt		@ Must be aligned to 8 mod 16.
+	vmov.i8	q11, #0x63		@ .Lk_s63 without .Lk_ipt applied
+
+	@ vpaes stores one fewer round count than bsaes, but the number of keys
+	@ is the same.
+	ldr	r2, [r1,#240]
+	add	r2, r2, #1
+	str	r2, [r0,#240]
+
+	@ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt).
+	@ Invert this with .Lk_opt.
+	vld1.64	{q0}, [r1]!
+	bl	_vpaes_schedule_transform
+	vrev32.8	q0, q0
+	vst1.64	{q0}, [r0]!
+
+	@ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied,
+	@ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63,
+	@ multiplies by the circulant 0,1,1,1, then applies ShiftRows.
+.Loop_enc_key_to_bsaes:
+	vld1.64	{q0}, [r1]!
+
+	@ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle
+	@ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30.
+	@ We use r3 rather than r8 to avoid a callee-saved register.
+	vld1.64	{q1}, [r3]
+	vtbl.8	d4, {q0}, d2
+	vtbl.8	d5, {q0}, d3
+	add	r3, r3, #16
+	and	r3, r3, #~(1<<6)
+	vmov	q0, q2
+
+	@ Handle the last key differently.
+	subs	r2, r2, #1
+	beq	.Loop_enc_key_to_bsaes_last
+
+	@ Multiply by the circulant. This is its own inverse.
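+	@ (Over GF(2), with r the rotate-one-byte column permutation, r^4 = 1,
+	@ so (r + r^2 + r^3)^2 = r^2 + r^4 + r^6 = r^2 + 1 + r^2 = 1.)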
+	vtbl.8	d2, {q0}, d24
+	vtbl.8	d3, {q0}, d25
+	vmov	q0, q1
+	vtbl.8	d4, {q1}, d24
+	vtbl.8	d5, {q1}, d25
+	veor	q0, q0, q2
+	vtbl.8	d2, {q2}, d24
+	vtbl.8	d3, {q2}, d25
+	veor	q0, q0, q1
+
+	@ XOR and finish.
+	veor	q0, q0, q10
+	bl	_vpaes_schedule_transform
+	vrev32.8	q0, q0
+	vst1.64	{q0}, [r0]!
+	b	.Loop_enc_key_to_bsaes
+
+.Loop_enc_key_to_bsaes_last:
+	@ The final key does not have a basis transform (note
+	@ .Lschedule_mangle_last inverts the original transform). It only XORs
+	@ 0x63 and applies ShiftRows. The latter was already inverted in the
+	@ loop. Note that, because we act on the original representation, we use
+	@ q11, not q10.
+	veor	q0, q0, q11
+	vrev32.8	q0, q0
+	vst1.64	{q0}, [r0]
+
+	@ Wipe registers which contained key material.
+	veor	q0, q0, q0
+	veor	q1, q1, q1
+	veor	q2, q2, q2
+
+	ldmia	sp!, {r11, pc}	@ return
+.size	vpaes_encrypt_key_to_bsaes,.-vpaes_encrypt_key_to_bsaes
+
+@ void vpaes_decrypt_key_to_bsaes(AES_KEY *vpaes, const AES_KEY *bsaes);
+.globl	vpaes_decrypt_key_to_bsaes
+.hidden	vpaes_decrypt_key_to_bsaes
+.type	vpaes_decrypt_key_to_bsaes,%function
+.align	4
+vpaes_decrypt_key_to_bsaes:
+	stmdb	sp!, {r11, lr}
+
+	@ See _vpaes_schedule_core for the key schedule logic. Note vpaes
+	@ computes the decryption key schedule in reverse. Additionally,
+	@ aes-x86_64.pl shares some transformations, so we must only partially
+	@ invert vpaes's transformations. In general, vpaes computes in a
+	@ different basis (.Lk_ipt and .Lk_opt) and applies the inverses of
+	@ MixColumns, ShiftRows, and the affine part of the AES S-box (which is
+	@ split into a linear skew and XOR of 0x63). We undo all but MixColumns.
+	@
+	@ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
+	@ representation, which does not match the other aes_nohw_*
+	@ implementations. The ARM aes_nohw_* stores each 32-bit word
+	@ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
+	@ cost of extra REV and VREV32 operations in little-endian ARM.
+
+	adr	r2, .Lk_decrypt_transform
+	adr	r3, .Lk_sr+0x30
+	adr	r11, .Lk_opt_then_skew	@ Input to _vpaes_schedule_transform.
+	vld1.64	{q12}, [r2]	@ Reuse q12 from encryption.
+	vmov.i8	q9, #0x0f		@ Required by _vpaes_schedule_transform
+
+	@ vpaes stores one fewer round count than bsaes, but the number of keys
+	@ is the same.
+	ldr	r2, [r1,#240]
+	add	r2, r2, #1
+	str	r2, [r0,#240]
+
+	@ Undo the basis change and reapply the S-box affine transform. See
+	@ .Lschedule_mangle_last.
+	vld1.64	{q0}, [r1]!
+	bl	_vpaes_schedule_transform
+	vrev32.8	q0, q0
+	vst1.64	{q0}, [r0]!
+
+	@ See _vpaes_schedule_mangle for the transform on the middle keys. Note
+	@ it simultaneously inverts MixColumns and the S-box affine transform.
+	@ See .Lk_dksd through .Lk_dks9.
+.Loop_dec_key_to_bsaes:
+	vld1.64	{q0}, [r1]!
+
+	@ Invert the ShiftRows step (see .Lschedule_mangle_both). Note that,
+	@ because the decryption schedule is computed in reverse, walking the
+	@ keys forwards cancels the inversion, so we cycle r3 in the same
+	@ direction as .Lschedule_mangle_both. We use r3 rather than r8 to
+	@ avoid a callee-saved register.
+	vld1.64	{q1}, [r3]
+	vtbl.8	d4, {q0}, d2
+	vtbl.8	d5, {q0}, d3
+	add	r3, r3, #64-16
+	and	r3, r3, #~(1<<6)
+	vmov	q0, q2
+
+	@ Handle the last key differently.
+	subs	r2, r2, #1
+	beq	.Loop_dec_key_to_bsaes_last
+
+	@ Undo the basis change and reapply the S-box affine transform.
+	bl	_vpaes_schedule_transform
+
+	@ Rotate each word by 8 bytes (cycle the rows) and then byte-swap. We
+	@ combine the two operations in .Lk_decrypt_transform.
+	@
+	@ TODO(davidben): Where does the rotation come from?
+	vtbl.8	d2, {q0}, d24
+	vtbl.8	d3, {q0}, d25
+
+	vst1.64	{q1}, [r0]!
+	b	.Loop_dec_key_to_bsaes
+
+.Loop_dec_key_to_bsaes_last:
+	@ The final key only inverts ShiftRows (already done in the loop). See
+	@ .Lschedule_am_decrypting. Its basis is not transformed.
+	vrev32.8	q0, q0
+	vst1.64	{q0}, [r0]!
+
+	@ Wipe registers which contained key material.
+	veor	q0, q0, q0
+	veor	q1, q1, q1
+	veor	q2, q2, q2
+
+	ldmia	sp!, {r11, pc}	@ return
+.size	vpaes_decrypt_key_to_bsaes,.-vpaes_decrypt_key_to_bsaes
+.globl	vpaes_ctr32_encrypt_blocks
+.hidden	vpaes_ctr32_encrypt_blocks
+.type	vpaes_ctr32_encrypt_blocks,%function
+.align	4
+vpaes_ctr32_encrypt_blocks:
+	mov	ip, sp
+	stmdb	sp!, {r7,r8,r9,r10,r11, lr}
+	@ This function uses q4-q7 (d8-d15), which are callee-saved.
+	vstmdb	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+
+	cmp	r2, #0
+	@ r8 is passed on the stack.
+	ldr	r8, [ip]
+	beq	.Lctr32_done
+
+	@ _vpaes_encrypt_core expects the key in r2, so swap r2 and r3.
+	mov	r9, r3
+	mov	r3, r2
+	mov	r2, r9
+
+	@ Load the IV and counter portion.
+	ldr	r7, [r8, #12]
+	vld1.8	{q7}, [r8]
+
+	bl	_vpaes_preheat
+	rev	r7, r7		@ The counter is big-endian.
+
+.Lctr32_loop:
+	vmov	q0, q7
+	vld1.8	{q6}, [r0]!		@ Load input ahead of time
+	bl	_vpaes_encrypt_core
+	veor	q0, q0, q6		@ XOR input and result
+	vst1.8	{q0}, [r1]!
+	subs	r3, r3, #1
+	@ Update the counter.
+	add	r7, r7, #1
+	rev	r9, r7
+	vmov.32	d15[1], r9
+	bne	.Lctr32_loop
+
+.Lctr32_done:
+	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+	ldmia	sp!, {r7,r8,r9,r10,r11, pc}	@ return
+.size	vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
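+
+@ A Python sketch of the counter handling above (the final 32-bit word of
+@ the 16-byte IV is a big-endian counter):
+@
+@   def ctr32_blocks(iv, n):
+@       ctr = int.from_bytes(iv[12:], "big")
+@       for i in range(n):
+@           yield iv[:12] + ((ctr + i) & 0xffffffff).to_bytes(4, "big")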
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__)
diff --git a/gen/bcm/vpaes-armv8-apple.S b/gen/bcm/vpaes-armv8-apple.S
new file mode 100644
index 0000000..a108a96
--- /dev/null
+++ b/gen/bcm/vpaes-armv8-apple.S
@@ -0,0 +1,1224 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+#include <openssl/arm_arch.h>
+
+.section	__TEXT,__const
+
+
+.align	7	// totally strategic alignment
+_vpaes_consts:
+Lk_mc_forward:	//	mc_forward
+.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad	0x080B0A0904070605, 0x000302010C0F0E0D
+.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad	0x000302010C0F0E0D, 0x080B0A0904070605
+Lk_mc_backward:	//	mc_backward
+.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad	0x020100030E0D0C0F, 0x0A09080B06050407
+.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad	0x0A09080B06050407, 0x020100030E0D0C0F
+Lk_sr:	//	sr
+.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad	0x030E09040F0A0500, 0x0B06010C07020D08
+.quad	0x0F060D040B020900, 0x070E050C030A0108
+.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
+
+//
+// "Hot" constants
+//
+Lk_inv:	//	inv, inva
+.quad	0x0E05060F0D080180, 0x040703090A0B0C02
+.quad	0x01040A060F0B0780, 0x030D0E0C02050809
+Lk_ipt:	//	input transform (lo, hi)
+.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+Lk_sbo:	//	sbou, sbot
+.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+Lk_sb1:	//	sb1u, sb1t
+.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+Lk_sb2:	//	sb2u, sb2t
+.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
+.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
+
+//
+//  Decryption stuff
+//
+Lk_dipt:	//	decryption input transform
+.quad	0x0F505B040B545F00, 0x154A411E114E451A
+.quad	0x86E383E660056500, 0x12771772F491F194
+Lk_dsbo:	//	decryption sbox final output
+.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+Lk_dsb9:	//	decryption sbox output *9*u, *9*t
+.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+Lk_dsbd:	//	decryption sbox output *D*u, *D*t
+.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+Lk_dsbb:	//	decryption sbox output *B*u, *B*t
+.quad	0xD022649296B44200, 0x602646F6B0F2D404
+.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+Lk_dsbe:	//	decryption sbox output *E*u, *E*t
+.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+
+//
+//  Key schedule constants
+//
+Lk_dksd:	//	decryption key schedule: invskew x*D
+.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+Lk_dksb:	//	decryption key schedule: invskew x*B
+.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+Lk_dkse:	//	decryption key schedule: invskew x*E + 0x63
+.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+Lk_dks9:	//	decryption key schedule: invskew x*9
+.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+Lk_rcon:	//	rcon
+.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+Lk_opt:	//	output transform
+.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+Lk_deskew:	//	deskew tables: inverts the sbox's "skew"
+.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align	2
+
+.align	6
+
+.text
+##
+##  _aes_preheat
+##
+##  Fills register %r10 -> .aes_consts (so you can -fPIC)
+##  and %xmm9-%xmm15 as specified below.
+##
+
+.align	4
+_vpaes_encrypt_preheat:
+	adrp	x10, Lk_inv@PAGE
+	add	x10, x10, Lk_inv@PAGEOFF
+	movi	v17.16b, #0x0f
+	ld1	{v18.2d,v19.2d}, [x10],#32	// Lk_inv
+	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64	// Lk_ipt, Lk_sbo
+	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10]		// Lk_sb1, Lk_sb2
+	ret
+
+
+##
+##  _aes_encrypt_core
+##
+##  AES-encrypt %xmm0.
+##
+##  Inputs:
+##     %xmm0 = input
+##     %xmm9-%xmm15 as in _vpaes_preheat
+##    (%rdx) = scheduled keys
+##
+##  Output in %xmm0
+##  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
+##  Preserves %xmm6 - %xmm8 so you get some local vectors
+##
+##
+
+.align	4
+_vpaes_encrypt_core:
+	mov	x9, x2
+	ldr	w8, [x2,#240]			// pull rounds
+	adrp	x11, Lk_mc_forward@PAGE+16
+	add	x11, x11, Lk_mc_forward@PAGEOFF+16
+						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
+	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
+	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
+	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
+						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
+	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
+	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5,	%xmm1,	%xmm0
+	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
+	b	Lenc_entry
+
+.align	4
+Lenc_loop:
+	// middle of middle round
+	add	x10, x11, #0x40
+	tbl	v4.16b, {v25.16b}, v2.16b		// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
+	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# Lk_mc_forward[]
+	tbl	v0.16b, {v24.16b}, v3.16b		// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
+	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	tbl	v5.16b,	{v27.16b}, v2.16b		// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	tbl	v2.16b, {v26.16b}, v3.16b		// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
+	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# Lk_mc_backward[]
+	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
+	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
+	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
+	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
+	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
+	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
+	sub	w8, w8, #1			// nr--
+
+Lenc_entry:
+	// top of round
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	tbl	v5.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
+	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
+	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
+	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
+	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
+	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
+	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
+	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
+	cbnz	w8, Lenc_loop
+
+	// middle of last round
+	add	x10, x11, #0x80
+						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
+						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
+	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# Lk_sr[]
+	tbl	v0.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
+	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
+	ret
+
+
+.globl	_vpaes_encrypt
+.private_extern	_vpaes_encrypt
+
+.align	4
+_vpaes_encrypt:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ld1	{v7.16b}, [x0]
+	bl	_vpaes_encrypt_preheat
+	bl	_vpaes_encrypt_core
+	st1	{v0.16b}, [x1]
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+
+.align	4
+_vpaes_encrypt_2x:
+	mov	x9, x2
+	ldr	w8, [x2,#240]			// pull rounds
+	adrp	x11, Lk_mc_forward@PAGE+16
+	add	x11, x11, Lk_mc_forward@PAGEOFF+16
+						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
+	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
+	and	v1.16b,  v14.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b,  v14.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0
+	and	v9.16b,  v15.16b,  v17.16b
+	ushr	v8.16b,  v15.16b,  #4
+	tbl	v1.16b,  {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
+	tbl	v9.16b,  {v20.16b}, v9.16b
+						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
+	tbl	v2.16b,  {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
+	tbl	v10.16b, {v21.16b}, v8.16b
+	eor	v0.16b,  v1.16b,   v16.16b	// vpxor	%xmm5,	%xmm1,	%xmm0
+	eor	v8.16b,  v9.16b,   v16.16b
+	eor	v0.16b,  v0.16b,   v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
+	eor	v8.16b,  v8.16b,   v10.16b
+	b	Lenc_2x_entry
+
+.align	4
+Lenc_2x_loop:
+	// middle of middle round
+	add	x10, x11, #0x40
+	tbl	v4.16b,  {v25.16b}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
+	tbl	v12.16b, {v25.16b}, v10.16b
+	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# Lk_mc_forward[]
+	tbl	v0.16b,  {v24.16b}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
+	tbl	v8.16b,  {v24.16b}, v11.16b
+	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v12.16b, v12.16b, v16.16b
+	tbl	v5.16b,	 {v27.16b}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
+	tbl	v13.16b, {v27.16b}, v10.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	eor	v8.16b,  v8.16b,  v12.16b
+	tbl	v2.16b,  {v26.16b}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
+	tbl	v10.16b, {v26.16b}, v11.16b
+	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# Lk_mc_backward[]
+	tbl	v3.16b,  {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
+	tbl	v11.16b, {v8.16b}, v1.16b
+	eor	v2.16b,  v2.16b,  v5.16b	// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
+	eor	v10.16b, v10.16b, v13.16b
+	tbl	v0.16b,  {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
+	tbl	v8.16b,  {v8.16b}, v4.16b
+	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
+	eor	v11.16b, v11.16b, v10.16b
+	tbl	v4.16b,  {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
+	tbl	v12.16b, {v11.16b},v1.16b
+	eor	v0.16b,  v0.16b,  v3.16b	// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
+	eor	v8.16b,  v8.16b,  v11.16b
+	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
+	eor	v8.16b,  v8.16b,  v12.16b
+	sub	w8, w8, #1			// nr--
+
+Lenc_2x_entry:
+	// top of round
+	and	v1.16b,  v0.16b, v17.16b	// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
+	ushr	v0.16b,  v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	and	v9.16b,  v8.16b, v17.16b
+	ushr	v8.16b,  v8.16b, #4
+	tbl	v5.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
+	tbl	v13.16b, {v19.16b},v9.16b
+	eor	v1.16b,  v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	eor	v9.16b,  v9.16b,  v8.16b
+	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
+	tbl	v11.16b, {v18.16b},v8.16b
+	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
+	tbl	v12.16b, {v18.16b},v9.16b
+	eor	v3.16b,  v3.16b,  v5.16b	// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	eor	v11.16b, v11.16b, v13.16b
+	eor	v4.16b,  v4.16b,  v5.16b	// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
+	eor	v12.16b, v12.16b, v13.16b
+	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
+	tbl	v10.16b, {v18.16b},v11.16b
+	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
+	tbl	v11.16b, {v18.16b},v12.16b
+	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
+	eor	v10.16b, v10.16b, v9.16b
+	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
+	eor	v11.16b, v11.16b, v8.16b
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
+	cbnz	w8, Lenc_2x_loop
+
+	// middle of last round
+	add	x10, x11, #0x80
+						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
+						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
+	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	tbl	v12.16b, {v22.16b}, v10.16b
+	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# Lk_sr[]
+	tbl	v0.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
+	tbl	v8.16b,  {v23.16b}, v11.16b
+	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v12.16b, v12.16b, v16.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	eor	v8.16b,  v8.16b,  v12.16b
+	tbl	v0.16b,  {v0.16b},v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
+	tbl	v1.16b,  {v8.16b},v1.16b
+	ret
+
+
+
+.align	4
+_vpaes_decrypt_preheat:
+	adrp	x10, Lk_inv@PAGE
+	add	x10, x10, Lk_inv@PAGEOFF
+	movi	v17.16b, #0x0f
+	adrp	x11, Lk_dipt@PAGE
+	add	x11, x11, Lk_dipt@PAGEOFF
+	ld1	{v18.2d,v19.2d}, [x10],#32	// Lk_inv
+	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64	// Lk_dipt, Lk_dsbo
+	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64	// Lk_dsb9, Lk_dsbd
+	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x11]		// Lk_dsbb, Lk_dsbe
+	ret
+
+
+##
+##  Decryption core
+##
+##  Same API as encryption core.
+##
+
+.align	4
+_vpaes_decrypt_core:
+	mov	x9, x2
+	ldr	w8, [x2,#240]			// pull rounds
+
+						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
+	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
+	eor	x11, x11, #0x30			// xor		$0x30,	%r11
+	adrp	x10, Lk_sr@PAGE
+	add	x10, x10, Lk_sr@PAGEOFF
+	and	x11, x11, #0x30			// and		$0x30,	%r11
+	add	x11, x11, x10
+	adrp	x10, Lk_mc_forward@PAGE+48
+	add	x10, x10, Lk_mc_forward@PAGEOFF+48
+
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
+	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
+	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
+	ld1	{v5.2d}, [x10]			// vmovdqa	Lk_mc_forward+48(%rip), %xmm5
+						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
+	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
+	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4,	%xmm2,	%xmm2
+	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
+	b	Ldec_entry
+
+.align	4
+Ldec_loop:
+//
+//  Inverse mix columns
+//
+						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
+						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
+	tbl	v4.16b, {v24.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
+	tbl	v1.16b, {v25.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
+	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4,	%xmm0,	%xmm0
+						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
+
+	tbl	v4.16b, {v26.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
+	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v1.16b, {v27.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
+
+	tbl	v4.16b, {v28.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
+	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v1.16b, {v29.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
+
+	tbl	v4.16b, {v30.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
+	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v1.16b, {v31.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	sub	w8, w8, #1			// sub		$1,%rax			# nr--
+
+Ldec_entry:
+	// top of round
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
+	eor	v1.16b,	v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
+	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
+	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
+	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
+	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
+	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
+	cbnz	w8, Ldec_loop
+
+	// middle of last round
+						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
+	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
+	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# Lk_sr-Lk_dsbd=-0x160
+	tbl	v1.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
+	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
+	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
+	ret
+
+
+.globl	_vpaes_decrypt
+.private_extern	_vpaes_decrypt
+
+.align	4
+_vpaes_decrypt:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ld1	{v7.16b}, [x0]
+	bl	_vpaes_decrypt_preheat
+	bl	_vpaes_decrypt_core
+	st1	{v0.16b}, [x1]
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+// v14-v15 input, v0-v1 output
+
+.align	4
+_vpaes_decrypt_2x:
+	mov	x9, x2
+	ldr	w8, [x2,#240]			// pull rounds
+
+						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
+	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
+	eor	x11, x11, #0x30			// xor		$0x30,	%r11
+	adrp	x10, Lk_sr@PAGE
+	add	x10, x10, Lk_sr@PAGEOFF
+	and	x11, x11, #0x30			// and		$0x30,	%r11
+	add	x11, x11, x10
+	adrp	x10, Lk_mc_forward@PAGE+48
+	add	x10, x10, Lk_mc_forward@PAGEOFF+48
+
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
+	and	v1.16b,  v14.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b,  v14.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
+	and	v9.16b,  v15.16b, v17.16b
+	ushr	v8.16b,  v15.16b, #4
+	tbl	v2.16b,  {v20.16b},v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
+	tbl	v10.16b, {v20.16b},v9.16b
+	ld1	{v5.2d}, [x10]			// vmovdqa	Lk_mc_forward+48(%rip), %xmm5
+						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
+	tbl	v0.16b,  {v21.16b},v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
+	tbl	v8.16b,  {v21.16b},v8.16b
+	eor	v2.16b,  v2.16b,  v16.16b	// vpxor	%xmm4,	%xmm2,	%xmm2
+	eor	v10.16b, v10.16b, v16.16b
+	eor	v0.16b,  v0.16b,  v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
+	eor	v8.16b,  v8.16b,  v10.16b
+	b	Ldec_2x_entry
+
+.align	4
+Ldec_2x_loop:
+//
+//  Inverse mix columns
+//
+						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
+						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
+	tbl	v4.16b,  {v24.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
+	tbl	v12.16b, {v24.16b}, v10.16b
+	tbl	v1.16b,  {v25.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
+	tbl	v9.16b,  {v25.16b}, v11.16b
+	eor	v0.16b,  v4.16b,  v16.16b	// vpxor	%xmm4,	%xmm0,	%xmm0
+	eor	v8.16b,  v12.16b, v16.16b
+						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	eor	v8.16b,  v8.16b,  v9.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
+
+	tbl	v4.16b,  {v26.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
+	tbl	v12.16b, {v26.16b}, v10.16b
+	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v8.16b,  {v8.16b},v5.16b
+	tbl	v1.16b,  {v27.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
+	tbl	v9.16b,  {v27.16b}, v11.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	eor	v8.16b,  v8.16b,  v12.16b
+						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	eor	v8.16b,  v8.16b,  v9.16b
+						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
+
+	tbl	v4.16b,  {v28.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
+	tbl	v12.16b, {v28.16b}, v10.16b
+	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v8.16b,  {v8.16b},v5.16b
+	tbl	v1.16b,  {v29.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
+	tbl	v9.16b,  {v29.16b}, v11.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	eor	v8.16b,  v8.16b,  v12.16b
+						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	eor	v8.16b,  v8.16b,  v9.16b
+						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
+
+	tbl	v4.16b,  {v30.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
+	tbl	v12.16b, {v30.16b}, v10.16b
+	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v8.16b,  {v8.16b},v5.16b
+	tbl	v1.16b,  {v31.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
+	tbl	v9.16b,  {v31.16b}, v11.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	eor	v8.16b,  v8.16b,  v12.16b
+	ext	v5.16b,  v5.16b,  v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	eor	v8.16b,  v8.16b,  v9.16b
+	sub	w8, w8, #1			// sub		$1,%rax			# nr--
+
+Ldec_2x_entry:
+	// top of round
+	and	v1.16b,  v0.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
+	ushr	v0.16b,  v0.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	and	v9.16b,  v8.16b,  v17.16b
+	ushr	v8.16b,  v8.16b,  #4
+	tbl	v2.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
+	tbl	v10.16b, {v19.16b},v9.16b
+	eor	v1.16b,	 v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	eor	v9.16b,	 v9.16b,  v8.16b
+	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
+	tbl	v11.16b, {v18.16b},v8.16b
+	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
+	tbl	v12.16b, {v18.16b},v9.16b
+	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	eor	v11.16b, v11.16b, v10.16b
+	eor	v4.16b,  v4.16b,  v2.16b	// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
+	eor	v12.16b, v12.16b, v10.16b
+	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
+	tbl	v10.16b, {v18.16b},v11.16b
+	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
+	tbl	v11.16b, {v18.16b},v12.16b
+	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
+	eor	v10.16b, v10.16b, v9.16b
+	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
+	eor	v11.16b, v11.16b, v8.16b
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
+	cbnz	w8, Ldec_2x_loop
+
+	// middle of last round
+						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
+	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	tbl	v12.16b, {v22.16b}, v10.16b
+						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
+	tbl	v1.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
+	tbl	v9.16b,  {v23.16b}, v11.16b
+	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# Lk_sr-Lk_dsbd=-0x160
+	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v12.16b, v12.16b, v16.16b
+	eor	v0.16b,  v1.16b,  v4.16b	// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
+	eor	v8.16b,  v9.16b,  v12.16b
+	tbl	v0.16b,  {v0.16b},v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
+	tbl	v1.16b,  {v8.16b},v2.16b
+	ret
+
+########################################################
+##                                                    ##
+##                  AES key schedule                  ##
+##                                                    ##
+########################################################
+
+.align	4
+_vpaes_key_preheat:
+	adrp	x10, Lk_inv@PAGE
+	add	x10, x10, Lk_inv@PAGEOFF
+	movi	v16.16b, #0x5b			// Lk_s63
+	adrp	x11, Lk_sb1@PAGE
+	add	x11, x11, Lk_sb1@PAGEOFF
+	movi	v17.16b, #0x0f			// Lk_s0F
+	ld1	{v18.2d,v19.2d,v20.2d,v21.2d}, [x10]		// Lk_inv, Lk_ipt
+	adrp	x10, Lk_dksd@PAGE
+	add	x10, x10, Lk_dksd@PAGEOFF
+	ld1	{v22.2d,v23.2d}, [x11]		// Lk_sb1
+	adrp	x11, Lk_mc_forward@PAGE
+	add	x11, x11, Lk_mc_forward@PAGEOFF
+	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64	// Lk_dksd, Lk_dksb
+	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64	// Lk_dkse, Lk_dks9
+	ld1	{v8.2d}, [x10]			// Lk_rcon
+	ld1	{v9.2d}, [x11]			// Lk_mc_forward[0]
+	ret
+
+
+
+.align	4
+_vpaes_schedule_core:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29, x30, [sp,#-16]!
+	add	x29,sp,#0
+
+	bl	_vpaes_key_preheat		// load the tables
+
+	ld1	{v0.16b}, [x0],#16		// vmovdqu	(%rdi),	%xmm0		# load key (unaligned)
+
+	// input transform
+	mov	v3.16b, v0.16b			// vmovdqa	%xmm0,	%xmm3
+	bl	_vpaes_schedule_transform
+	mov	v7.16b, v0.16b			// vmovdqa	%xmm0,	%xmm7
+
+	adrp	x10, Lk_sr@PAGE		// lea	Lk_sr(%rip),%r10
+	add	x10, x10, Lk_sr@PAGEOFF
+
+	add	x8, x8, x10
+	cbnz	w3, Lschedule_am_decrypting
+
+	// encrypting, output zeroth round key after transform
+	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)
+	b	Lschedule_go
+
+Lschedule_am_decrypting:
+	// decrypting, output zeroth round key after shiftrows
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
+	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb  %xmm1,	%xmm3,	%xmm3
+	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
+	eor	x8, x8, #0x30			// xor	$0x30, %r8
+
+Lschedule_go:
+	cmp	w1, #192			// cmp	$192,	%esi
+	b.hi	Lschedule_256
+	b.eq	Lschedule_192
+	// 128: fall through
+
+##
+##  .schedule_128
+##
+##  128-bit specific part of key schedule.
+##
+##  This schedule is really simple, because all its parts
+##  are accomplished by the subroutines.
+##
+Lschedule_128:
+	mov	x0, #10			// mov	$10, %esi
+
+Loop_schedule_128:
+	sub	x0, x0, #1			// dec	%esi
+	bl	_vpaes_schedule_round
+	cbz	x0, Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle		// write output
+	b	Loop_schedule_128
+
+##
+##  .aes_schedule_192
+##
+##  192-bit specific part of key schedule.
+##
+##  The main body of this schedule is the same as the 128-bit
+##  schedule, but with more smearing.  The long, high side is
+##  stored in %xmm7 as before, and the short, low side is in
+##  the high bits of %xmm6.
+##
+##  This schedule is somewhat nastier, however, because each
+##  round produces 192 bits of key material, or 1.5 round keys.
+##  Therefore, on each cycle we do 2 rounds and produce 3 round
+##  keys.
+##
+.align	4
+Lschedule_192:
+	sub	x0, x0, #8
+	ld1	{v0.16b}, [x0]		// vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
+	bl	_vpaes_schedule_transform	// input transform
+	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save short part
+	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
+	ins	v6.d[0], v4.d[0]		// vmovhlps	%xmm4,	%xmm6,	%xmm6		# clobber low side with zeros
+	mov	x0, #4			// mov	$4,	%esi
+
+Loop_schedule_192:
+	sub	x0, x0, #1			// dec	%esi
+	bl	_vpaes_schedule_round
+	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr	$8,%xmm6,%xmm0,%xmm0
+	bl	_vpaes_schedule_mangle		// save key n
+	bl	_vpaes_schedule_192_smear
+	bl	_vpaes_schedule_mangle		// save key n+1
+	bl	_vpaes_schedule_round
+	cbz	x0, Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle		// save key n+2
+	bl	_vpaes_schedule_192_smear
+	b	Loop_schedule_192
+
+##
+##  .aes_schedule_256
+##
+##  256-bit specific part of key schedule.
+##
+##  The structure here is very similar to the 128-bit
+##  schedule, but with an additional "low side" in
+##  %xmm6.  The low side's rounds are the same as the
+##  high side's, except no rcon and no rotation.
+##
+.align	4
+Lschedule_256:
+	ld1	{v0.16b}, [x0]		// vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
+	bl	_vpaes_schedule_transform	// input transform
+	mov	x0, #7			// mov	$7, %esi
+
+Loop_schedule_256:
+	sub	x0, x0, #1			// dec	%esi
+	bl	_vpaes_schedule_mangle		// output low result
+	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
+
+	// high round
+	bl	_vpaes_schedule_round
+	cbz	x0, Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle
+
+	// low round. swap xmm7 and xmm6
+	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
+	movi	v4.16b, #0
+	mov	v5.16b, v7.16b			// vmovdqa	%xmm7,	%xmm5
+	mov	v7.16b, v6.16b			// vmovdqa	%xmm6,	%xmm7
+	bl	_vpaes_schedule_low_round
+	mov	v7.16b, v5.16b			// vmovdqa	%xmm5,	%xmm7
+
+	b	Loop_schedule_256
+
+##
+##  .aes_schedule_mangle_last
+##
+##  Mangler for last round of key schedule
+##  Mangles %xmm0
+##    when encrypting, outputs out(%xmm0) ^ 63
+##    when decrypting, outputs unskew(%xmm0)
+##
+##  Always called right before return... jumps to cleanup and exits
+##
+.align	4
+Lschedule_mangle_last:
+	// schedule last round key from xmm0
+	adrp	x11, Lk_deskew@PAGE	// lea	Lk_deskew(%rip),%r11	# prepare to deskew
+	add	x11, x11, Lk_deskew@PAGEOFF
+
+	cbnz	w3, Lschedule_mangle_last_dec
+
+	// encrypting
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),%xmm1
+	adrp	x11, Lk_opt@PAGE		// lea	Lk_opt(%rip),	%r11		# prepare to output transform
+	add	x11, x11, Lk_opt@PAGEOFF
+	add	x2, x2, #32			// add	$32,	%rdx
+	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute
+
+Lschedule_mangle_last_dec:
+	ld1	{v20.2d,v21.2d}, [x11]		// reload constants
+	sub	x2, x2, #16			// add	$-16,	%rdx
+	eor	v0.16b, v0.16b, v16.16b		// vpxor	Lk_s63(%rip),	%xmm0,	%xmm0
+	bl	_vpaes_schedule_transform	// output transform
+	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)		# save last key
+
+	// cleanup
+	eor	v0.16b, v0.16b, v0.16b		// vpxor	%xmm0,	%xmm0,	%xmm0
+	eor	v1.16b, v1.16b, v1.16b		// vpxor	%xmm1,	%xmm1,	%xmm1
+	eor	v2.16b, v2.16b, v2.16b		// vpxor	%xmm2,	%xmm2,	%xmm2
+	eor	v3.16b, v3.16b, v3.16b		// vpxor	%xmm3,	%xmm3,	%xmm3
+	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4,	%xmm4
+	eor	v5.16b, v5.16b, v5.16b		// vpxor	%xmm5,	%xmm5,	%xmm5
+	eor	v6.16b, v6.16b, v6.16b		// vpxor	%xmm6,	%xmm6,	%xmm6
+	eor	v7.16b, v7.16b, v7.16b		// vpxor	%xmm7,	%xmm7,	%xmm7
+	ldp	x29, x30, [sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+##
+##  .aes_schedule_192_smear
+##
+##  Smear the short, low side in the 192-bit key schedule.
+##
+##  Inputs:
+##    %xmm7: high side, b  a  x  y
+##    %xmm6:  low side, d  c  0  0
+##    %xmm13: 0
+##
+##  Outputs:
+##    %xmm6: b+c+d  b+c  0  0
+##    %xmm0: b+c+d  b+c  b  a
+##
+
+.align	4
+_vpaes_schedule_192_smear:
+	movi	v1.16b, #0
+	dup	v0.4s, v7.s[3]
+	ins	v1.s[3], v6.s[2]	// vpshufd	$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
+	ins	v0.s[0], v7.s[2]	// vpshufd	$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
+	eor	v6.16b, v6.16b, v1.16b	// vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
+	eor	v1.16b, v1.16b, v1.16b	// vpxor	%xmm1,	%xmm1,	%xmm1
+	eor	v6.16b, v6.16b, v0.16b	// vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
+	mov	v0.16b, v6.16b		// vmovdqa	%xmm6,	%xmm0
+	ins	v6.d[0], v1.d[0]	// vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
+	ret
+
+
+##
+##  .aes_schedule_round
+##
+##  Runs one main round of the key schedule on %xmm0, %xmm7
+##
+##  Specifically, runs subbytes on the high dword of %xmm0
+##  then rotates it by one byte and xors into the low dword of
+##  %xmm7.
+##
+##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+##  next rcon.
+##
+##  Smears the dwords of %xmm7 by xoring the low into the
+##  second low, result into third, result into highest.
+##
+##  Returns results in %xmm7 = %xmm0.
+##  Clobbers %xmm1-%xmm4, %r11.
+##
+
+.align	4
+_vpaes_schedule_round:
+	// extract rcon from xmm8
+	movi	v4.16b, #0			// vpxor	%xmm4,	%xmm4,	%xmm4
+	ext	v1.16b, v8.16b, v4.16b, #15	// vpalignr	$15,	%xmm8,	%xmm4,	%xmm1
+	ext	v8.16b, v8.16b, v8.16b, #15	// vpalignr	$15,	%xmm8,	%xmm8,	%xmm8
+	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
+
+	// rotate
+	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
+	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr	$1,	%xmm0,	%xmm0,	%xmm0
+
+	// fall through...
+
+	// low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+	// smear xmm7
+	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq	$4,	%xmm7,	%xmm1
+	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
+	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq	$8,	%xmm7,	%xmm4
+
+	// subbytes
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1		# 0 = k
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0		# 1 = i
+	eor	v7.16b, v7.16b, v4.16b		// vpxor	%xmm4,	%xmm7,	%xmm7
+	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
+	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1		# 0 = j
+	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
+	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
+	eor	v7.16b, v7.16b, v16.16b		// vpxor	Lk_s63(%rip),	%xmm7,	%xmm7
+	tbl	v3.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
+	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
+	tbl	v2.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
+	eor	v3.16b, v3.16b, v1.16b		// vpxor	%xmm1,	%xmm3,	%xmm3		# 2 = io
+	eor	v2.16b, v2.16b, v0.16b		// vpxor	%xmm0,	%xmm2,	%xmm2		# 3 = jo
+	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
+	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
+	eor	v1.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm1		# 0 = sbox output
+
+	// add in smeared stuff
+	eor	v0.16b, v1.16b, v7.16b		// vpxor	%xmm7,	%xmm1,	%xmm0
+	eor	v7.16b, v1.16b, v7.16b		// vmovdqa	%xmm0,	%xmm7
+	ret
+
+
+##
+##  .aes_schedule_transform
+##
+##  Linear-transform %xmm0 according to tables at (%r11)
+##
+##  Requires that %xmm9 = 0x0F0F... as in preheat
+##  Output in %xmm0
+##  Clobbers %xmm1, %xmm2
+##
+
+.align	4
+_vpaes_schedule_transform:
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
+						// vmovdqa	(%r11),	%xmm2 	# lo
+	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
+						// vmovdqa	16(%r11),	%xmm1 # hi
+	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
+	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
+	ret
+
+
+##
+##  .aes_schedule_mangle
+##
+##  Mangle xmm0 from (basis-transformed) standard version
+##  to our version.
+##
+##  On encrypt,
+##    xor with 0x63
+##    multiply by circulant 0,1,1,1
+##    apply shiftrows transform
+##
+##  On decrypt,
+##    xor with 0x63
+##    multiply by "inverse mixcolumns" circulant E,B,D,9
+##    deskew
+##    apply shiftrows transform
+##
+##
+##  Writes out to (%rdx), and increments or decrements it
+##  Keeps track of round number mod 4 in %r8
+##  Preserves xmm0
+##  Clobbers xmm1-xmm5
+##
+
+.align	4
+_vpaes_schedule_mangle:
+	mov	v4.16b, v0.16b			// vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
+						// vmovdqa	.Lk_mc_forward(%rip),%xmm5
+	cbnz	w3, Lschedule_mangle_dec
+
+	// encrypting
+	eor	v4.16b, v0.16b, v16.16b		// vpxor	Lk_s63(%rip),	%xmm0,	%xmm4
+	add	x2, x2, #16			// add	$16,	%rdx
+	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm4
+	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm1
+	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb	%xmm5,	%xmm1,	%xmm3
+	eor	v4.16b, v4.16b, v1.16b		// vpxor	%xmm1,	%xmm4,	%xmm4
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
+	eor	v3.16b, v3.16b, v4.16b		// vpxor	%xmm4,	%xmm3,	%xmm3
+
+	b	Lschedule_mangle_both
+.align	4
+Lschedule_mangle_dec:
+	// inverse mix columns
+						// lea	.Lk_dksd(%rip),%r11
+	ushr	v1.16b, v4.16b, #4		// vpsrlb	$4,	%xmm4,	%xmm1	# 1 = hi
+	and	v4.16b, v4.16b, v17.16b		// vpand	%xmm9,	%xmm4,	%xmm4	# 4 = lo
+
+						// vmovdqa	0x00(%r11),	%xmm2
+	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+						// vmovdqa	0x10(%r11),	%xmm3
+	tbl	v3.16b,	{v25.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
+	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
+
+						// vmovdqa	0x20(%r11),	%xmm2
+	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
+						// vmovdqa	0x30(%r11),	%xmm3
+	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
+	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
+
+						// vmovdqa	0x40(%r11),	%xmm2
+	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
+						// vmovdqa	0x50(%r11),	%xmm3
+	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
+
+						// vmovdqa	0x60(%r11),	%xmm2
+	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
+						// vmovdqa	0x70(%r11),	%xmm4
+	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb	%xmm1,	%xmm4,	%xmm4
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
+	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
+	eor	v3.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm3
+
+	sub	x2, x2, #16			// add	$-16,	%rdx
+
+Lschedule_mangle_both:
+	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	add	x8, x8, #48			// add	$-16,	%r8
+	and	x8, x8, #~(1<<6)		// and	$0x30,	%r8
+	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
+	ret
+
+
+.globl	_vpaes_set_encrypt_key
+.private_extern	_vpaes_set_encrypt_key
+
+.align	4
+_vpaes_set_encrypt_key:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+
+	lsr	w9, w1, #5		// shr	$5,%eax
+	add	w9, w9, #5		// add	$5,%eax
+	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+
+	mov	w3, #0		// mov	$0,%ecx
+	mov	x8, #0x30		// mov	$0x30,%r8d
+	bl	_vpaes_schedule_core
+	eor	x0, x0, x0
+
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+.globl	_vpaes_set_decrypt_key
+.private_extern	_vpaes_set_decrypt_key
+
+.align	4
+_vpaes_set_decrypt_key:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+
+	lsr	w9, w1, #5		// shr	$5,%eax
+	add	w9, w9, #5		// add	$5,%eax
+	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+	lsl	w9, w9, #4		// shl	$4,%eax
+	add	x2, x2, #16		// lea	16(%rdx,%rax),%rdx
+	add	x2, x2, x9
+
+	mov	w3, #1		// mov	$1,%ecx
+	lsr	w8, w1, #1		// shr	$1,%r8d
+	and	x8, x8, #32		// and	$32,%r8d
+	eor	x8, x8, #32		// xor	$32,%r8d	# nbits==192?0:32
+	bl	_vpaes_schedule_core
+
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.globl	_vpaes_cbc_encrypt
+.private_extern	_vpaes_cbc_encrypt
+
+.align	4
+_vpaes_cbc_encrypt:
+	AARCH64_SIGN_LINK_REGISTER
+	cbz	x2, Lcbc_abort
+	cmp	w5, #0			// check direction
+	b.eq	vpaes_cbc_decrypt
+
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	mov	x17, x2		// reassign
+	mov	x2,  x3		// reassign
+
+	ld1	{v0.16b}, [x4]	// load ivec
+	bl	_vpaes_encrypt_preheat
+	b	Lcbc_enc_loop
+
+.align	4
+Lcbc_enc_loop:
+	ld1	{v7.16b}, [x0],#16	// load input
+	eor	v7.16b, v7.16b, v0.16b	// xor with ivec
+	bl	_vpaes_encrypt_core
+	st1	{v0.16b}, [x1],#16	// save output
+	subs	x17, x17, #16
+	b.hi	Lcbc_enc_loop
+
+	st1	{v0.16b}, [x4]	// write ivec
+
+	ldp	x29,x30,[sp],#16
+Lcbc_abort:
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+
+.align	4
+vpaes_cbc_decrypt:
+	// Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
+	// only from vpaes_cbc_encrypt which has already signed the return address.
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+	stp	d10,d11,[sp,#-16]!
+	stp	d12,d13,[sp,#-16]!
+	stp	d14,d15,[sp,#-16]!
+
+	mov	x17, x2		// reassign
+	mov	x2,  x3		// reassign
+	ld1	{v6.16b}, [x4]	// load ivec
+	bl	_vpaes_decrypt_preheat
+	tst	x17, #16
+	b.eq	Lcbc_dec_loop2x
+
+	ld1	{v7.16b}, [x0], #16	// load input
+	bl	_vpaes_decrypt_core
+	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
+	orr	v6.16b, v7.16b, v7.16b	// next ivec value
+	st1	{v0.16b}, [x1], #16
+	subs	x17, x17, #16
+	b.ls	Lcbc_dec_done
+
+.align	4
+Lcbc_dec_loop2x:
+	ld1	{v14.16b,v15.16b}, [x0], #32
+	bl	_vpaes_decrypt_2x
+	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
+	eor	v1.16b, v1.16b, v14.16b
+	orr	v6.16b, v15.16b, v15.16b
+	st1	{v0.16b,v1.16b}, [x1], #32
+	subs	x17, x17, #32
+	b.hi	Lcbc_dec_loop2x
+
+Lcbc_dec_done:
+	st1	{v6.16b}, [x4]
+
+	ldp	d14,d15,[sp],#16
+	ldp	d12,d13,[sp],#16
+	ldp	d10,d11,[sp],#16
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.globl	_vpaes_ctr32_encrypt_blocks
+.private_extern	_vpaes_ctr32_encrypt_blocks
+
+.align	4
+_vpaes_ctr32_encrypt_blocks:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+	stp	d10,d11,[sp,#-16]!
+	stp	d12,d13,[sp,#-16]!
+	stp	d14,d15,[sp,#-16]!
+
+	cbz	x2, Lctr32_done
+
+	// Note, unlike the other functions, x2 here is measured in blocks,
+	// not bytes.
+	mov	x17, x2
+	mov	x2,  x3
+
+	// Load the IV and counter portion.
+	ldr	w6, [x4, #12]
+	ld1	{v7.16b}, [x4]
+
+	bl	_vpaes_encrypt_preheat
+	tst	x17, #1
+	rev	w6, w6		// The counter is big-endian.
+	b.eq	Lctr32_prep_loop
+
+	// Handle one block so the remaining block count is even for
+	// _vpaes_encrypt_2x.
+	ld1	{v6.16b}, [x0], #16	// Load input ahead of time
+	bl	_vpaes_encrypt_core
+	eor	v0.16b, v0.16b, v6.16b	// XOR input and result
+	st1	{v0.16b}, [x1], #16
+	subs	x17, x17, #1
+	// Update the counter.
+	add	w6, w6, #1
+	rev	w7, w6
+	mov	v7.s[3], w7
+	b.ls	Lctr32_done
+
+Lctr32_prep_loop:
+	// _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
+	// uses v14 and v15.
+	mov	v15.16b, v7.16b
+	mov	v14.16b, v7.16b
+	add	w6, w6, #1
+	rev	w7, w6
+	mov	v15.s[3], w7
+
+Lctr32_loop:
+	ld1	{v6.16b,v7.16b}, [x0], #32	// Load input ahead of time
+	bl	_vpaes_encrypt_2x
+	eor	v0.16b, v0.16b, v6.16b		// XOR input and result
+	eor	v1.16b, v1.16b, v7.16b		// XOR input and result (#2)
+	st1	{v0.16b,v1.16b}, [x1], #32
+	subs	x17, x17, #2
+	// Update the counter.
+	add	w7, w6, #1
+	add	w6, w6, #2
+	rev	w7, w7
+	mov	v14.s[3], w7
+	rev	w7, w6
+	mov	v15.s[3], w7
+	b.hi	Lctr32_loop
+
+Lctr32_done:
+	ldp	d14,d15,[sp],#16
+	ldp	d12,d13,[sp],#16
+	ldp	d10,d11,[sp],#16
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/gen/bcm/vpaes-armv8-linux.S b/gen/bcm/vpaes-armv8-linux.S
new file mode 100644
index 0000000..c343f00
--- /dev/null
+++ b/gen/bcm/vpaes-armv8-linux.S
@@ -0,0 +1,1224 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+.section	.rodata
+
+.type	_vpaes_consts,%object
+.align	7	// totally strategic alignment
+_vpaes_consts:
+.Lk_mc_forward:	//	mc_forward
+.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad	0x080B0A0904070605, 0x000302010C0F0E0D
+.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad	0x000302010C0F0E0D, 0x080B0A0904070605
+.Lk_mc_backward:	//	mc_backward
+.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad	0x020100030E0D0C0F, 0x0A09080B06050407
+.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad	0x0A09080B06050407, 0x020100030E0D0C0F
+.Lk_sr:	//	sr
+.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad	0x030E09040F0A0500, 0x0B06010C07020D08
+.quad	0x0F060D040B020900, 0x070E050C030A0108
+.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
+
+//
+// "Hot" constants
+//
+.Lk_inv:	//	inv, inva
+.quad	0x0E05060F0D080180, 0x040703090A0B0C02
+.quad	0x01040A060F0B0780, 0x030D0E0C02050809
+.Lk_ipt:	//	input transform (lo, hi)
+.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+.Lk_sbo:	//	sbou, sbot
+.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+.Lk_sb1:	//	sb1u, sb1t
+.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.Lk_sb2:	//	sb2u, sb2t
+.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
+.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
+
+//
+//  Decryption stuff
+//
+.Lk_dipt:	//	decryption input transform
+.quad	0x0F505B040B545F00, 0x154A411E114E451A
+.quad	0x86E383E660056500, 0x12771772F491F194
+.Lk_dsbo:	//	decryption sbox final output
+.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.Lk_dsb9:	//	decryption sbox output *9*u, *9*t
+.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd:	//	decryption sbox output *D*u, *D*t
+.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb:	//	decryption sbox output *B*u, *B*t
+.quad	0xD022649296B44200, 0x602646F6B0F2D404
+.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe:	//	decryption sbox output *E*u, *E*t
+.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+
+//
+//  Key schedule constants
+//
+.Lk_dksd:	//	decryption key schedule: invskew x*D
+.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb:	//	decryption key schedule: invskew x*B
+.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse:	//	decryption key schedule: invskew x*E + 0x63
+.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9:	//	decryption key schedule: invskew x*9
+.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+.Lk_rcon:	//	rcon
+.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_opt:	//	output transform
+.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+.Lk_deskew:	//	deskew tables: inverts the sbox's "skew"
+.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align	2
+.size	_vpaes_consts,.-_vpaes_consts
+.align	6
+
+.text
+##
+##  _aes_preheat
+##
+##  Fills register %r10 -> .aes_consts (so you can -fPIC)
+##  and %xmm9-%xmm15 as specified below.
+##
+.type	_vpaes_encrypt_preheat,%function
+.align	4
+_vpaes_encrypt_preheat:
+	adrp	x10, .Lk_inv
+	add	x10, x10, :lo12:.Lk_inv
+	movi	v17.16b, #0x0f
+	ld1	{v18.2d,v19.2d}, [x10],#32	// .Lk_inv
+	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64	// .Lk_ipt, .Lk_sbo
+	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10]		// .Lk_sb1, .Lk_sb2
+	ret
+.size	_vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
+
+##
+##  _aes_encrypt_core
+##
+##  AES-encrypt %xmm0.
+##
+##  Inputs:
+##     %xmm0 = input
+##     %xmm9-%xmm15 as in _vpaes_preheat
+##    (%rdx) = scheduled keys
+##
+##  Output in %xmm0
+##  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
+##  Preserves %xmm6 - %xmm8 so you get some local vectors
+##
+##
+.type	_vpaes_encrypt_core,%function
+.align	4
+_vpaes_encrypt_core:
+	mov	x9, x2
+	ldr	w8, [x2,#240]			// pull rounds
+	adrp	x11, .Lk_mc_forward+16
+	add	x11, x11, :lo12:.Lk_mc_forward+16
+						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
+	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
+	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
+	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
+						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
+	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
+	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5,	%xmm1,	%xmm0
+	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
+	b	.Lenc_entry
+
+.align	4
+.Lenc_loop:
+	// middle of middle round
+	add	x10, x11, #0x40
+	tbl	v4.16b, {v25.16b}, v2.16b		// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
+	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
+	tbl	v0.16b, {v24.16b}, v3.16b		// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
+	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	tbl	v5.16b,	{v27.16b}, v2.16b		// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	tbl	v2.16b, {v26.16b}, v3.16b		// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
+	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
+	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
+	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
+	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
+	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
+	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
+	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
+	sub	w8, w8, #1			// nr--
+
+.Lenc_entry:
+	// top of round
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	tbl	v5.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
+	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
+	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
+	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
+	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
+	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
+	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
+	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
+	cbnz	w8, .Lenc_loop
+
+	// middle of last round
+	add	x10, x11, #0x80
+						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
+						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
+	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
+	tbl	v0.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
+	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
+	ret
+.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+.globl	vpaes_encrypt
+.hidden	vpaes_encrypt
+.type	vpaes_encrypt,%function
+.align	4
+vpaes_encrypt:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ld1	{v7.16b}, [x0]
+	bl	_vpaes_encrypt_preheat
+	bl	_vpaes_encrypt_core
+	st1	{v0.16b}, [x1]
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	vpaes_encrypt,.-vpaes_encrypt
+
+.type	_vpaes_encrypt_2x,%function
+.align	4
+_vpaes_encrypt_2x:
+	mov	x9, x2
+	ldr	w8, [x2,#240]			// pull rounds
+	adrp	x11, .Lk_mc_forward+16
+	add	x11, x11, :lo12:.Lk_mc_forward+16
+						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
+	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
+	and	v1.16b,  v14.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b,  v14.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0
+	and	v9.16b,  v15.16b,  v17.16b
+	ushr	v8.16b,  v15.16b,  #4
+	tbl	v1.16b,  {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
+	tbl	v9.16b,  {v20.16b}, v9.16b
+						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
+	tbl	v2.16b,  {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
+	tbl	v10.16b, {v21.16b}, v8.16b
+	eor	v0.16b,  v1.16b,   v16.16b	// vpxor	%xmm5,	%xmm1,	%xmm0
+	eor	v8.16b,  v9.16b,   v16.16b
+	eor	v0.16b,  v0.16b,   v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
+	eor	v8.16b,  v8.16b,   v10.16b
+	b	.Lenc_2x_entry
+
+.align	4
+.Lenc_2x_loop:
+	// middle of middle round
+	add	x10, x11, #0x40
+	tbl	v4.16b,  {v25.16b}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
+	tbl	v12.16b, {v25.16b}, v10.16b
+	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
+	tbl	v0.16b,  {v24.16b}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
+	tbl	v8.16b,  {v24.16b}, v11.16b
+	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v12.16b, v12.16b, v16.16b
+	tbl	v5.16b,	 {v27.16b}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
+	tbl	v13.16b, {v27.16b}, v10.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	eor	v8.16b,  v8.16b,  v12.16b
+	tbl	v2.16b,  {v26.16b}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
+	tbl	v10.16b, {v26.16b}, v11.16b
+	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
+	tbl	v3.16b,  {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
+	tbl	v11.16b, {v8.16b}, v1.16b
+	eor	v2.16b,  v2.16b,  v5.16b	// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
+	eor	v10.16b, v10.16b, v13.16b
+	tbl	v0.16b,  {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
+	tbl	v8.16b,  {v8.16b}, v4.16b
+	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
+	eor	v11.16b, v11.16b, v10.16b
+	tbl	v4.16b,  {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
+	tbl	v12.16b, {v11.16b},v1.16b
+	eor	v0.16b,  v0.16b,  v3.16b	// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
+	eor	v8.16b,  v8.16b,  v11.16b
+	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
+	eor	v8.16b,  v8.16b,  v12.16b
+	sub	w8, w8, #1			// nr--
+
+.Lenc_2x_entry:
+	// top of round
+	and	v1.16b,  v0.16b, v17.16b	// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
+	ushr	v0.16b,  v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	and	v9.16b,  v8.16b, v17.16b
+	ushr	v8.16b,  v8.16b, #4
+	tbl	v5.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
+	tbl	v13.16b, {v19.16b},v9.16b
+	eor	v1.16b,  v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	eor	v9.16b,  v9.16b,  v8.16b
+	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
+	tbl	v11.16b, {v18.16b},v8.16b
+	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
+	tbl	v12.16b, {v18.16b},v9.16b
+	eor	v3.16b,  v3.16b,  v5.16b	// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	eor	v11.16b, v11.16b, v13.16b
+	eor	v4.16b,  v4.16b,  v5.16b	// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
+	eor	v12.16b, v12.16b, v13.16b
+	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
+	tbl	v10.16b, {v18.16b},v11.16b
+	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
+	tbl	v11.16b, {v18.16b},v12.16b
+	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
+	eor	v10.16b, v10.16b, v9.16b
+	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
+	eor	v11.16b, v11.16b, v8.16b
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
+	cbnz	w8, .Lenc_2x_loop
+
+	// middle of last round
+	add	x10, x11, #0x80
+						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
+						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
+	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	tbl	v12.16b, {v22.16b}, v10.16b
+	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
+	tbl	v0.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
+	tbl	v8.16b,  {v23.16b}, v11.16b
+	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v12.16b, v12.16b, v16.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	eor	v8.16b,  v8.16b,  v12.16b
+	tbl	v0.16b,  {v0.16b},v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
+	tbl	v1.16b,  {v8.16b},v1.16b
+	ret
+.size	_vpaes_encrypt_2x,.-_vpaes_encrypt_2x
+
+.type	_vpaes_decrypt_preheat,%function
+.align	4
+_vpaes_decrypt_preheat:
+	adrp	x10, .Lk_inv
+	add	x10, x10, :lo12:.Lk_inv
+	movi	v17.16b, #0x0f
+	adrp	x11, .Lk_dipt
+	add	x11, x11, :lo12:.Lk_dipt
+	ld1	{v18.2d,v19.2d}, [x10],#32	// .Lk_inv
+	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64	// .Lk_dipt, .Lk_dsbo
+	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64	// .Lk_dsb9, .Lk_dsbd
+	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x11]		// .Lk_dsbb, .Lk_dsbe
+	ret
+.size	_vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
+
+##
+##  Decryption core
+##
+##  Same API as encryption core.
+##
+.type	_vpaes_decrypt_core,%function
+.align	4
+_vpaes_decrypt_core:
+	mov	x9, x2
+	ldr	w8, [x2,#240]			// pull rounds
+
+						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
+	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
+	eor	x11, x11, #0x30			// xor		$0x30,	%r11
+	adrp	x10, .Lk_sr
+	add	x10, x10, :lo12:.Lk_sr
+	and	x11, x11, #0x30			// and		$0x30,	%r11
+	add	x11, x11, x10
+	adrp	x10, .Lk_mc_forward+48
+	add	x10, x10, :lo12:.Lk_mc_forward+48
+
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
+	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
+	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
+	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
+						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
+	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
+	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4,	%xmm2,	%xmm2
+	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
+	b	.Ldec_entry
+
+.align	4
+.Ldec_loop:
+//
+//  Inverse mix columns
+//
+						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
+						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
+	tbl	v4.16b, {v24.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
+	tbl	v1.16b, {v25.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
+	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4,	%xmm0,	%xmm0
+						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
+
+	tbl	v4.16b, {v26.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
+	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v1.16b, {v27.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
+
+	tbl	v4.16b, {v28.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
+	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v1.16b, {v29.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
+
+	tbl	v4.16b, {v30.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
+	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v1.16b, {v31.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	sub	w8, w8, #1			// sub		$1,%rax			# nr--
+
+.Ldec_entry:
+	// top of round
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
+	eor	v1.16b,	v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
+	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
+	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
+	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
+	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
+	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
+	cbnz	w8, .Ldec_loop
+
+	// middle of last round
+						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
+	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
+	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
+	tbl	v1.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
+	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
+	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
+	ret
+.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+.globl	vpaes_decrypt
+.hidden	vpaes_decrypt
+.type	vpaes_decrypt,%function
+.align	4
+vpaes_decrypt:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ld1	{v7.16b}, [x0]
+	bl	_vpaes_decrypt_preheat
+	bl	_vpaes_decrypt_core
+	st1	{v0.16b}, [x1]
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	vpaes_decrypt,.-vpaes_decrypt
+
+// v14-v15 input, v0-v1 output
+.type	_vpaes_decrypt_2x,%function
+.align	4
+_vpaes_decrypt_2x:
+	mov	x9, x2
+	ldr	w8, [x2,#240]			// pull rounds
+
+						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
+	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
+	eor	x11, x11, #0x30			// xor		$0x30,	%r11
+	adrp	x10, .Lk_sr
+	add	x10, x10, :lo12:.Lk_sr
+	and	x11, x11, #0x30			// and		$0x30,	%r11
+	add	x11, x11, x10
+	adrp	x10, .Lk_mc_forward+48
+	add	x10, x10, :lo12:.Lk_mc_forward+48
+
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
+	and	v1.16b,  v14.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b,  v14.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
+	and	v9.16b,  v15.16b, v17.16b
+	ushr	v8.16b,  v15.16b, #4
+	tbl	v2.16b,  {v20.16b},v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
+	tbl	v10.16b, {v20.16b},v9.16b
+	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
+						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
+	tbl	v0.16b,  {v21.16b},v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
+	tbl	v8.16b,  {v21.16b},v8.16b
+	eor	v2.16b,  v2.16b,  v16.16b	// vpxor	%xmm4,	%xmm2,	%xmm2
+	eor	v10.16b, v10.16b, v16.16b
+	eor	v0.16b,  v0.16b,  v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
+	eor	v8.16b,  v8.16b,  v10.16b
+	b	.Ldec_2x_entry
+
+.align	4
+.Ldec_2x_loop:
+//
+//  Inverse mix columns
+//
+						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
+						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
+	tbl	v4.16b,  {v24.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
+	tbl	v12.16b, {v24.16b}, v10.16b
+	tbl	v1.16b,  {v25.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
+	tbl	v9.16b,  {v25.16b}, v11.16b
+	eor	v0.16b,  v4.16b,  v16.16b	// vpxor	%xmm4,	%xmm0,	%xmm0
+	eor	v8.16b,  v12.16b, v16.16b
+						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	eor	v8.16b,  v8.16b,  v9.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
+
+	tbl	v4.16b,  {v26.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
+	tbl	v12.16b, {v26.16b}, v10.16b
+	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v8.16b,  {v8.16b},v5.16b
+	tbl	v1.16b,  {v27.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
+	tbl	v9.16b,  {v27.16b}, v11.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	eor	v8.16b,  v8.16b,  v12.16b
+						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	eor	v8.16b,  v8.16b,  v9.16b
+						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
+
+	tbl	v4.16b,  {v28.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
+	tbl	v12.16b, {v28.16b}, v10.16b
+	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v8.16b,  {v8.16b},v5.16b
+	tbl	v1.16b,  {v29.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
+	tbl	v9.16b,  {v29.16b}, v11.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	eor	v8.16b,  v8.16b,  v12.16b
+						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	eor	v8.16b,  v8.16b,  v9.16b
+						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
+
+	tbl	v4.16b,  {v30.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
+	tbl	v12.16b, {v30.16b}, v10.16b
+	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v8.16b,  {v8.16b},v5.16b
+	tbl	v1.16b,  {v31.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
+	tbl	v9.16b,  {v31.16b}, v11.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	eor	v8.16b,  v8.16b,  v12.16b
+	ext	v5.16b,  v5.16b,  v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	eor	v8.16b,  v8.16b,  v9.16b
+	sub	w8, w8, #1			// sub		$1,%rax			# nr--
+
+.Ldec_2x_entry:
+	// top of round
+	and	v1.16b,  v0.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
+	ushr	v0.16b,  v0.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	and	v9.16b,  v8.16b,  v17.16b
+	ushr	v8.16b,  v8.16b,  #4
+	tbl	v2.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
+	tbl	v10.16b, {v19.16b},v9.16b
+	eor	v1.16b,	 v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	eor	v9.16b,	 v9.16b,  v8.16b
+	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
+	tbl	v11.16b, {v18.16b},v8.16b
+	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
+	tbl	v12.16b, {v18.16b},v9.16b
+	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	eor	v11.16b, v11.16b, v10.16b
+	eor	v4.16b,  v4.16b,  v2.16b	// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
+	eor	v12.16b, v12.16b, v10.16b
+	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
+	tbl	v10.16b, {v18.16b},v11.16b
+	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
+	tbl	v11.16b, {v18.16b},v12.16b
+	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
+	eor	v10.16b, v10.16b, v9.16b
+	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
+	eor	v11.16b, v11.16b, v8.16b
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
+	cbnz	w8, .Ldec_2x_loop
+
+	// middle of last round
+						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
+	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	tbl	v12.16b, {v22.16b}, v10.16b
+						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
+	tbl	v1.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
+	tbl	v9.16b,  {v23.16b}, v11.16b
+	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
+	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v12.16b, v12.16b, v16.16b
+	eor	v0.16b,  v1.16b,  v4.16b	// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
+	eor	v8.16b,  v9.16b,  v12.16b
+	tbl	v0.16b,  {v0.16b},v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
+	tbl	v1.16b,  {v8.16b},v2.16b
+	ret
+.size	_vpaes_decrypt_2x,.-_vpaes_decrypt_2x
+########################################################
+##                                                    ##
+##                  AES key schedule                  ##
+##                                                    ##
+########################################################
+.type	_vpaes_key_preheat,%function
+.align	4
+_vpaes_key_preheat:
+	adrp	x10, .Lk_inv
+	add	x10, x10, :lo12:.Lk_inv
+	movi	v16.16b, #0x5b			// .Lk_s63
+	adrp	x11, .Lk_sb1
+	add	x11, x11, :lo12:.Lk_sb1
+	movi	v17.16b, #0x0f			// .Lk_s0F
+	ld1	{v18.2d,v19.2d,v20.2d,v21.2d}, [x10]		// .Lk_inv, .Lk_ipt
+	adrp	x10, .Lk_dksd
+	add	x10, x10, :lo12:.Lk_dksd
+	ld1	{v22.2d,v23.2d}, [x11]		// .Lk_sb1
+	adrp	x11, .Lk_mc_forward
+	add	x11, x11, :lo12:.Lk_mc_forward
+	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64	// .Lk_dksd, .Lk_dksb
+	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64	// .Lk_dkse, .Lk_dks9
+	ld1	{v8.2d}, [x10]			// .Lk_rcon
+	ld1	{v9.2d}, [x11]			// .Lk_mc_forward[0]
+	ret
+.size	_vpaes_key_preheat,.-_vpaes_key_preheat
+
+.type	_vpaes_schedule_core,%function
+.align	4
+_vpaes_schedule_core:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29, x30, [sp,#-16]!
+	add	x29,sp,#0
+
+	bl	_vpaes_key_preheat		// load the tables
+
+	ld1	{v0.16b}, [x0],#16		// vmovdqu	(%rdi),	%xmm0		# load key (unaligned)
+
+	// input transform
+	mov	v3.16b, v0.16b			// vmovdqa	%xmm0,	%xmm3
+	bl	_vpaes_schedule_transform
+	mov	v7.16b, v0.16b			// vmovdqa	%xmm0,	%xmm7
+
+	adrp	x10, .Lk_sr		// lea	.Lk_sr(%rip),%r10
+	add	x10, x10, :lo12:.Lk_sr
+
+	add	x8, x8, x10
+	cbnz	w3, .Lschedule_am_decrypting
+
+	// encrypting, output zeroth round key after transform
+	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)
+	b	.Lschedule_go
+
+.Lschedule_am_decrypting:
+	// decrypting, output zeroth round key after shiftrows
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
+	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb  %xmm1,	%xmm3,	%xmm3
+	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
+	eor	x8, x8, #0x30			// xor	$0x30, %r8
+
+.Lschedule_go:
+	cmp	w1, #192			// cmp	$192,	%esi
+	b.hi	.Lschedule_256
+	b.eq	.Lschedule_192
+	// 128: fall through
+
+##
+##  .schedule_128
+##
+##  128-bit specific part of key schedule.
+##
+##  This schedule is really simple, because all its parts
+##  are accomplished by the subroutines.
+##
+.Lschedule_128:
+	mov	x0, #10			// mov	$10, %esi
+
+.Loop_schedule_128:
+	sub	x0, x0, #1			// dec	%esi
+	bl	_vpaes_schedule_round
+	cbz	x0, .Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle		// write output
+	b	.Loop_schedule_128
+
+##
+##  .aes_schedule_192
+##
+##  192-bit specific part of key schedule.
+##
+##  The main body of this schedule is the same as the 128-bit
+##  schedule, but with more smearing.  The long, high side is
+##  stored in %xmm7 as before, and the short, low side is in
+##  the high bits of %xmm6.
+##
+##  This schedule is somewhat nastier, however, because each
+##  round produces 192 bits of key material, or 1.5 round keys.
+##  Therefore, on each cycle we do 2 rounds and produce 3 round
+##  keys.
+##
+.align	4
+.Lschedule_192:
+	sub	x0, x0, #8
+	ld1	{v0.16b}, [x0]		// vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
+	bl	_vpaes_schedule_transform	// input transform
+	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save short part
+	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
+	ins	v6.d[0], v4.d[0]		// vmovhlps	%xmm4,	%xmm6,	%xmm6		# clobber low side with zeros
+	mov	x0, #4			// mov	$4,	%esi
+
+.Loop_schedule_192:
+	sub	x0, x0, #1			// dec	%esi
+	bl	_vpaes_schedule_round
+	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr	$8,%xmm6,%xmm0,%xmm0
+	bl	_vpaes_schedule_mangle		// save key n
+	bl	_vpaes_schedule_192_smear
+	bl	_vpaes_schedule_mangle		// save key n+1
+	bl	_vpaes_schedule_round
+	cbz	x0, .Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle		// save key n+2
+	bl	_vpaes_schedule_192_smear
+	b	.Loop_schedule_192
+
+##
+##  .aes_schedule_256
+##
+##  256-bit specific part of key schedule.
+##
+##  The structure here is very similar to the 128-bit
+##  schedule, but with an additional "low side" in
+##  %xmm6.  The low side's rounds are the same as the
+##  high side's, except no rcon and no rotation.
+##
+.align	4
+.Lschedule_256:
+	ld1	{v0.16b}, [x0]		// vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
+	bl	_vpaes_schedule_transform	// input transform
+	mov	x0, #7			// mov	$7, %esi
+
+.Loop_schedule_256:
+	sub	x0, x0, #1			// dec	%esi
+	bl	_vpaes_schedule_mangle		// output low result
+	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
+
+	// high round
+	bl	_vpaes_schedule_round
+	cbz	x0, .Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle
+
+	// low round. swap xmm7 and xmm6
+	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
+	movi	v4.16b, #0
+	mov	v5.16b, v7.16b			// vmovdqa	%xmm7,	%xmm5
+	mov	v7.16b, v6.16b			// vmovdqa	%xmm6,	%xmm7
+	bl	_vpaes_schedule_low_round
+	mov	v7.16b, v5.16b			// vmovdqa	%xmm5,	%xmm7
+
+	b	.Loop_schedule_256
+
+##
+##  .aes_schedule_mangle_last
+##
+##  Mangler for last round of key schedule
+##  Mangles %xmm0
+##    when encrypting, outputs out(%xmm0) ^ 63
+##    when decrypting, outputs unskew(%xmm0)
+##
+##  Always called right before return... jumps to cleanup and exits
+##
+.align	4
+.Lschedule_mangle_last:
+	// schedule last round key from xmm0
+	adrp	x11, .Lk_deskew	// lea	.Lk_deskew(%rip),%r11	# prepare to deskew
+	add	x11, x11, :lo12:.Lk_deskew
+
+	cbnz	w3, .Lschedule_mangle_last_dec
+
+	// encrypting
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),%xmm1
+	adrp	x11, .Lk_opt		// lea	.Lk_opt(%rip),	%r11		# prepare to output transform
+	add	x11, x11, :lo12:.Lk_opt
+	add	x2, x2, #32			// add	$32,	%rdx
+	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute
+
+.Lschedule_mangle_last_dec:
+	ld1	{v20.2d,v21.2d}, [x11]		// reload constants
+	sub	x2, x2, #16			// add	$-16,	%rdx
+	eor	v0.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm0
+	bl	_vpaes_schedule_transform	// output transform
+	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)		# save last key
+
+	// cleanup
+	eor	v0.16b, v0.16b, v0.16b		// vpxor	%xmm0,	%xmm0,	%xmm0
+	eor	v1.16b, v1.16b, v1.16b		// vpxor	%xmm1,	%xmm1,	%xmm1
+	eor	v2.16b, v2.16b, v2.16b		// vpxor	%xmm2,	%xmm2,	%xmm2
+	eor	v3.16b, v3.16b, v3.16b		// vpxor	%xmm3,	%xmm3,	%xmm3
+	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4,	%xmm4
+	eor	v5.16b, v5.16b, v5.16b		// vpxor	%xmm5,	%xmm5,	%xmm5
+	eor	v6.16b, v6.16b, v6.16b		// vpxor	%xmm6,	%xmm6,	%xmm6
+	eor	v7.16b, v7.16b, v7.16b		// vpxor	%xmm7,	%xmm7,	%xmm7
+	ldp	x29, x30, [sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	_vpaes_schedule_core,.-_vpaes_schedule_core
+
+##
+##  .aes_schedule_192_smear
+##
+##  Smear the short, low side in the 192-bit key schedule.
+##
+##  Inputs:
+##    %xmm7: high side, b  a  x  y
+##    %xmm6:  low side, d  c  0  0
+##    %xmm13: 0
+##
+##  Outputs:
+##    %xmm6: b+c+d  b+c  0  0
+##    %xmm0: b+c+d  b+c  b  a
+##
+.type	_vpaes_schedule_192_smear,%function
+.align	4
+_vpaes_schedule_192_smear:
+	movi	v1.16b, #0
+	dup	v0.4s, v7.s[3]
+	ins	v1.s[3], v6.s[2]	// vpshufd	$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
+	ins	v0.s[0], v7.s[2]	// vpshufd	$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
+	eor	v6.16b, v6.16b, v1.16b	// vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
+	eor	v1.16b, v1.16b, v1.16b	// vpxor	%xmm1,	%xmm1,	%xmm1
+	eor	v6.16b, v6.16b, v0.16b	// vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
+	mov	v0.16b, v6.16b		// vmovdqa	%xmm6,	%xmm0
+	ins	v6.d[0], v1.d[0]	// vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
+	ret
+.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+##
+##  .aes_schedule_round
+##
+##  Runs one main round of the key schedule on %xmm0, %xmm7
+##
+##  Specifically, runs subbytes on the high dword of %xmm0
+##  then rotates it by one byte and xors into the low dword of
+##  %xmm7.
+##
+##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+##  next rcon.
+##
+##  Smears the dwords of %xmm7 by xoring the low into the
+##  second low, result into third, result into highest.
+##
+##  Returns results in %xmm7 = %xmm0.
+##  Clobbers %xmm1-%xmm4, %r11.
+##
+.type	_vpaes_schedule_round,%function
+.align	4
+_vpaes_schedule_round:
+	// extract rcon from xmm8
+	movi	v4.16b, #0			// vpxor	%xmm4,	%xmm4,	%xmm4
+	ext	v1.16b, v8.16b, v4.16b, #15	// vpalignr	$15,	%xmm8,	%xmm4,	%xmm1
+	ext	v8.16b, v8.16b, v8.16b, #15	// vpalignr	$15,	%xmm8,	%xmm8,	%xmm8
+	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
+
+	// rotate
+	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
+	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr	$1,	%xmm0,	%xmm0,	%xmm0
+
+	// fall through...
+
+	// low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+	// smear xmm7
+	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq	$4,	%xmm7,	%xmm1
+	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
+	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq	$8,	%xmm7,	%xmm4
+
+	// subbytes
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1		# 0 = k
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0		# 1 = i
+	eor	v7.16b, v7.16b, v4.16b		// vpxor	%xmm4,	%xmm7,	%xmm7
+	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
+	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1		# 0 = j
+	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
+	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
+	eor	v7.16b, v7.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm7,	%xmm7
+	tbl	v3.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
+	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
+	tbl	v2.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
+	eor	v3.16b, v3.16b, v1.16b		// vpxor	%xmm1,	%xmm3,	%xmm3		# 2 = io
+	eor	v2.16b, v2.16b, v0.16b		// vpxor	%xmm0,	%xmm2,	%xmm2		# 3 = jo
+	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
+	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
+	eor	v1.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm1		# 0 = sbox output
+
+	// add in smeared stuff
+	eor	v0.16b, v1.16b, v7.16b		// vpxor	%xmm7,	%xmm1,	%xmm0
+	eor	v7.16b, v1.16b, v7.16b		// vmovdqa	%xmm0,	%xmm7
+	ret
+.size	_vpaes_schedule_round,.-_vpaes_schedule_round
+
+##
+##  .aes_schedule_transform
+##
+##  Linear-transform %xmm0 according to tables at (%r11)
+##
+##  Requires that %xmm9 = 0x0F0F... as in preheat
+##  Output in %xmm0
+##  Clobbers %xmm1, %xmm2
+##
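+##
+##  In C-like terms (a sketch, not generator output), with the lo/hi
+##  tables preloaded into v20/v21 rather than read from (%r11):
+##
+##    for (i = 0; i < 16; i++)
+##      out[i] = lo[in[i] & 0x0f] ^ hi[in[i] >> 4];
+##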
+.type	_vpaes_schedule_transform,%function
+.align	4
+_vpaes_schedule_transform:
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
+						// vmovdqa	(%r11),	%xmm2 	# lo
+	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
+						// vmovdqa	16(%r11),	%xmm1 # hi
+	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
+	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
+	ret
+.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+##
+##  .aes_schedule_mangle
+##
+##  Mangle xmm0 from (basis-transformed) standard version
+##  to our version.
+##
+##  On encrypt,
+##    xor with 0x63
+##    multiply by circulant 0,1,1,1
+##    apply shiftrows transform
+##
+##  On decrypt,
+##    xor with 0x63
+##    multiply by "inverse mixcolumns" circulant E,B,D,9
+##    deskew
+##    apply shiftrows transform
+##
+##
+##  Writes out to (%rdx), and increments or decrements it
+##  Keeps track of round number mod 4 in %r8
+##  Preserves xmm0
+##  Clobbers xmm1-xmm5
+##
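+##
+##  On AArch64 (an annotation, not generator output) the circulant
+##  multiply is three successive tbl rotations through Lk_mc_forward
+##  (kept in v9), XORed together, and the round number mod 4 lives in
+##  x8 as a byte offset into Lk_sr: the add #48 / and #~(1<<6) pair
+##  below is -16 mod 64.
+##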
+.type	_vpaes_schedule_mangle,%function
+.align	4
+_vpaes_schedule_mangle:
+	mov	v4.16b, v0.16b			// vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
+						// vmovdqa	.Lk_mc_forward(%rip),%xmm5
+	cbnz	w3, .Lschedule_mangle_dec
+
+	// encrypting
+	eor	v4.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm4
+	add	x2, x2, #16			// add	$16,	%rdx
+	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm4
+	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm1
+	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb	%xmm5,	%xmm1,	%xmm3
+	eor	v4.16b, v4.16b, v1.16b		// vpxor	%xmm1,	%xmm4,	%xmm4
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
+	eor	v3.16b, v3.16b, v4.16b		// vpxor	%xmm4,	%xmm3,	%xmm3
+
+	b	.Lschedule_mangle_both
+.align	4
+.Lschedule_mangle_dec:
+	// inverse mix columns
+						// lea	.Lk_dksd(%rip),%r11
+	ushr	v1.16b, v4.16b, #4		// vpsrlb	$4,	%xmm4,	%xmm1	# 1 = hi
+	and	v4.16b, v4.16b, v17.16b		// vpand	%xmm9,	%xmm4,	%xmm4	# 4 = lo
+
+						// vmovdqa	0x00(%r11),	%xmm2
+	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+						// vmovdqa	0x10(%r11),	%xmm3
+	tbl	v3.16b,	{v25.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
+	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
+
+						// vmovdqa	0x20(%r11),	%xmm2
+	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
+						// vmovdqa	0x30(%r11),	%xmm3
+	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
+	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
+
+						// vmovdqa	0x40(%r11),	%xmm2
+	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
+						// vmovdqa	0x50(%r11),	%xmm3
+	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
+
+						// vmovdqa	0x60(%r11),	%xmm2
+	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
+						// vmovdqa	0x70(%r11),	%xmm4
+	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb	%xmm1,	%xmm4,	%xmm4
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
+	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
+	eor	v3.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm3
+
+	sub	x2, x2, #16			// add	$-16,	%rdx
+
+.Lschedule_mangle_both:
+	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	add	x8, x8, #48			// add	$-16,	%r8
+	and	x8, x8, #~(1<<6)		// and	$0x30,	%r8
+	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
+	ret
+.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+.globl	vpaes_set_encrypt_key
+.hidden	vpaes_set_encrypt_key
+.type	vpaes_set_encrypt_key,%function
+.align	4
+vpaes_set_encrypt_key:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+
+	lsr	w9, w1, #5		// shr	$5,%eax
+	add	w9, w9, #5		// add	$5,%eax
+	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+
+	mov	w3, #0		// mov	$0,%ecx
+	mov	x8, #0x30		// mov	$0x30,%r8d
+	bl	_vpaes_schedule_core
+	eor	x0, x0, x0
+
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
+
+.globl	vpaes_set_decrypt_key
+.hidden	vpaes_set_decrypt_key
+.type	vpaes_set_decrypt_key,%function
+.align	4
+vpaes_set_decrypt_key:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+
+	lsr	w9, w1, #5		// shr	$5,%eax
+	add	w9, w9, #5		// add	$5,%eax
+	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+	lsl	w9, w9, #4		// shl	$4,%eax
+	add	x2, x2, #16		// lea	16(%rdx,%rax),%rdx
+	add	x2, x2, x9
+
+	mov	w3, #1		// mov	$1,%ecx
+	lsr	w8, w1, #1		// shr	$1,%r8d
+	and	x8, x8, #32		// and	$32,%r8d
+	eor	x8, x8, #32		// xor	$32,%r8d	# nbits==192?0:32
+	bl	_vpaes_schedule_core
+
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
+.globl	vpaes_cbc_encrypt
+.hidden	vpaes_cbc_encrypt
+.type	vpaes_cbc_encrypt,%function
+.align	4
+vpaes_cbc_encrypt:
+	AARCH64_SIGN_LINK_REGISTER
+	cbz	x2, .Lcbc_abort
+	cmp	w5, #0			// check direction
+	b.eq	vpaes_cbc_decrypt
+
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	mov	x17, x2		// reassign
+	mov	x2,  x3		// reassign
+
+	ld1	{v0.16b}, [x4]	// load ivec
+	bl	_vpaes_encrypt_preheat
+	b	.Lcbc_enc_loop
+
+.align	4
+.Lcbc_enc_loop:
+	ld1	{v7.16b}, [x0],#16	// load input
+	eor	v7.16b, v7.16b, v0.16b	// xor with ivec
+	bl	_vpaes_encrypt_core
+	st1	{v0.16b}, [x1],#16	// save output
+	subs	x17, x17, #16
+	b.hi	.Lcbc_enc_loop
+
+	st1	{v0.16b}, [x4]	// write ivec
+
+	ldp	x29,x30,[sp],#16
+.Lcbc_abort:
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
+
+.type	vpaes_cbc_decrypt,%function
+.align	4
+vpaes_cbc_decrypt:
+	// Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
+	// only from vpaes_cbc_encrypt which has already signed the return address.
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+	stp	d10,d11,[sp,#-16]!
+	stp	d12,d13,[sp,#-16]!
+	stp	d14,d15,[sp,#-16]!
+
+	mov	x17, x2		// reassign
+	mov	x2,  x3		// reassign
+	ld1	{v6.16b}, [x4]	// load ivec
+	bl	_vpaes_decrypt_preheat
+	tst	x17, #16
+	b.eq	.Lcbc_dec_loop2x
+
+	ld1	{v7.16b}, [x0], #16	// load input
+	bl	_vpaes_decrypt_core
+	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
+	orr	v6.16b, v7.16b, v7.16b	// next ivec value
+	st1	{v0.16b}, [x1], #16
+	subs	x17, x17, #16
+	b.ls	.Lcbc_dec_done
+
+.align	4
+.Lcbc_dec_loop2x:
+	ld1	{v14.16b,v15.16b}, [x0], #32
+	bl	_vpaes_decrypt_2x
+	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
+	eor	v1.16b, v1.16b, v14.16b
+	orr	v6.16b, v15.16b, v15.16b
+	st1	{v0.16b,v1.16b}, [x1], #32
+	subs	x17, x17, #32
+	b.hi	.Lcbc_dec_loop2x
+
+.Lcbc_dec_done:
+	st1	{v6.16b}, [x4]
+
+	ldp	d14,d15,[sp],#16
+	ldp	d12,d13,[sp],#16
+	ldp	d10,d11,[sp],#16
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
+.globl	vpaes_ctr32_encrypt_blocks
+.hidden	vpaes_ctr32_encrypt_blocks
+.type	vpaes_ctr32_encrypt_blocks,%function
+.align	4
+vpaes_ctr32_encrypt_blocks:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+	stp	d10,d11,[sp,#-16]!
+	stp	d12,d13,[sp,#-16]!
+	stp	d14,d15,[sp,#-16]!
+
+	cbz	x2, .Lctr32_done
+
+	// Note, unlike the other functions, x2 here is measured in blocks,
+	// not bytes.
+	mov	x17, x2
+	mov	x2,  x3
+
+	// Load the IV and counter portion.
+	ldr	w6, [x4, #12]
+	ld1	{v7.16b}, [x4]
+
+	bl	_vpaes_encrypt_preheat
+	tst	x17, #1
+	rev	w6, w6		// The counter is big-endian.
+	b.eq	.Lctr32_prep_loop
+
+	// Handle one block so the remaining block count is even for
+	// _vpaes_encrypt_2x.
+	ld1	{v6.16b}, [x0], #16	// Load input ahead of time
+	bl	_vpaes_encrypt_core
+	eor	v0.16b, v0.16b, v6.16b	// XOR input and result
+	st1	{v0.16b}, [x1], #16
+	subs	x17, x17, #1
+	// Update the counter.
+	add	w6, w6, #1
+	rev	w7, w6
+	mov	v7.s[3], w7
+	b.ls	.Lctr32_done
+
+.Lctr32_prep_loop:
+	// _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
+	// uses v14 and v15.
+	mov	v15.16b, v7.16b
+	mov	v14.16b, v7.16b
+	add	w6, w6, #1
+	rev	w7, w6
+	mov	v15.s[3], w7
+
+.Lctr32_loop:
+	ld1	{v6.16b,v7.16b}, [x0], #32	// Load input ahead of time
+	bl	_vpaes_encrypt_2x
+	eor	v0.16b, v0.16b, v6.16b		// XOR input and result
+	eor	v1.16b, v1.16b, v7.16b		// XOR input and result (#2)
+	st1	{v0.16b,v1.16b}, [x1], #32
+	subs	x17, x17, #2
+	// Update the counter.
+	add	w7, w6, #1
+	add	w6, w6, #2
+	rev	w7, w7
+	mov	v14.s[3], w7
+	rev	w7, w6
+	mov	v15.s[3], w7
+	b.hi	.Lctr32_loop
+
+.Lctr32_done:
+	ldp	d14,d15,[sp],#16
+	ldp	d12,d13,[sp],#16
+	ldp	d10,d11,[sp],#16
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
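+// A sketch of the counter handling above (illustrative pseudocode, not
+// generator output; load_be32/bswap32 are assumed helpers): the counter
+// word sits big-endian in lane 3 of the IV vector but is incremented in
+// host order:
+//
+//   uint32_t ctr = load_be32(iv + 12);   // ldr + rev
+//   // refresh the lanes for the next block pair:
+//   v14.word[3] = bswap32(ctr + 1);
+//   v15.word[3] = bswap32(ctr + 2);
+//   ctr += 2;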
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/bcm/vpaes-armv8-win.S b/gen/bcm/vpaes-armv8-win.S
new file mode 100644
index 0000000..d399d22
--- /dev/null
+++ b/gen/bcm/vpaes-armv8-win.S
@@ -0,0 +1,1262 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+#include <openssl/arm_arch.h>
+
+.section	.rodata
+
+
+.align	7	// totally strategic alignment
+_vpaes_consts:
+Lk_mc_forward:	//	mc_forward
+.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad	0x080B0A0904070605, 0x000302010C0F0E0D
+.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad	0x000302010C0F0E0D, 0x080B0A0904070605
+Lk_mc_backward:	//	mc_backward
+.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad	0x020100030E0D0C0F, 0x0A09080B06050407
+.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad	0x0A09080B06050407, 0x020100030E0D0C0F
+Lk_sr:	//	sr
+.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad	0x030E09040F0A0500, 0x0B06010C07020D08
+.quad	0x0F060D040B020900, 0x070E050C030A0108
+.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
+
+//
+// "Hot" constants
+//
+Lk_inv:	//	inv, inva
+.quad	0x0E05060F0D080180, 0x040703090A0B0C02
+.quad	0x01040A060F0B0780, 0x030D0E0C02050809
+Lk_ipt:	//	input transform (lo, hi)
+.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+Lk_sbo:	//	sbou, sbot
+.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+Lk_sb1:	//	sb1u, sb1t
+.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+Lk_sb2:	//	sb2u, sb2t
+.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
+.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
+
+//
+//  Decryption stuff
+//
+Lk_dipt:	//	decryption input transform
+.quad	0x0F505B040B545F00, 0x154A411E114E451A
+.quad	0x86E383E660056500, 0x12771772F491F194
+Lk_dsbo:	//	decryption sbox final output
+.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+Lk_dsb9:	//	decryption sbox output *9*u, *9*t
+.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+Lk_dsbd:	//	decryption sbox output *D*u, *D*t
+.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+Lk_dsbb:	//	decryption sbox output *B*u, *B*t
+.quad	0xD022649296B44200, 0x602646F6B0F2D404
+.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+Lk_dsbe:	//	decryption sbox output *E*u, *E*t
+.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+
+//
+//  Key schedule constants
+//
+Lk_dksd:	//	decryption key schedule: invskew x*D
+.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+Lk_dksb:	//	decryption key schedule: invskew x*B
+.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+Lk_dkse:	//	decryption key schedule: invskew x*E + 0x63
+.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+Lk_dks9:	//	decryption key schedule: invskew x*9
+.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+Lk_rcon:	//	rcon
+.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+Lk_opt:	//	output transform
+.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+Lk_deskew:	//	deskew tables: inverts the sbox's "skew"
+.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align	2
+
+.align	6
+
+.text
+##
+##  _aes_preheat
+##
+##  Fills register %r10 -> .aes_consts (so you can -fPIC)
+##  and %xmm9-%xmm15 as specified below.
+##
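+##
+##  On AArch64 there is no %r10 base pointer (a note, not generator
+##  output): the constants are preloaded here into v17-v27 instead,
+##  which is why many vmovdqa comments in the code below have no
+##  matching instruction.
+##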
+.def _vpaes_encrypt_preheat
+   .type 32
+.endef
+.align	4
+_vpaes_encrypt_preheat:
+	adrp	x10, Lk_inv
+	add	x10, x10, :lo12:Lk_inv
+	movi	v17.16b, #0x0f
+	ld1	{v18.2d,v19.2d}, [x10],#32	// Lk_inv
+	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64	// Lk_ipt, Lk_sbo
+	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10]		// Lk_sb1, Lk_sb2
+	ret
+
+
+##
+##  _aes_encrypt_core
+##
+##  AES-encrypt %xmm0.
+##
+##  Inputs:
+##     %xmm0 = input
+##     %xmm9-%xmm15 as in _vpaes_preheat
+##    (%rdx) = scheduled keys
+##
+##  Output in %xmm0
+##  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
+##  Preserves %xmm6 - %xmm8 so you get some local vectors
+##
+##
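+##
+##  Shape of the loop below (a summary, not generator output): the
+##  entry block splits each byte into nibbles and runs the shared
+##  inversion lookups; each middle round then combines sb1/sb2 lookups
+##  with three tbl permutes through Lk_mc_forward / Lk_mc_backward
+##  (indexed via x11/x10) to realize mixcolumns.
+##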
+.def _vpaes_encrypt_core
+   .type 32
+.endef
+.align	4
+_vpaes_encrypt_core:
+	mov	x9, x2
+	ldr	w8, [x2,#240]			// pull rounds
+	adrp	x11, Lk_mc_forward+16
+	add	x11, x11, :lo12:Lk_mc_forward+16
+						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
+	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
+	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
+	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
+						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
+	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
+	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5,	%xmm1,	%xmm0
+	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
+	b	Lenc_entry
+
+.align	4
+Lenc_loop:
+	// middle of middle round
+	add	x10, x11, #0x40
+	tbl	v4.16b, {v25.16b}, v2.16b		// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
+	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# Lk_mc_forward[]
+	tbl	v0.16b, {v24.16b}, v3.16b		// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
+	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	tbl	v5.16b,	{v27.16b}, v2.16b		// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	tbl	v2.16b, {v26.16b}, v3.16b		// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
+	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# Lk_mc_backward[]
+	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
+	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
+	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
+	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
+	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
+	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
+	sub	w8, w8, #1			// nr--
+
+Lenc_entry:
+	// top of round
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	tbl	v5.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
+	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
+	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
+	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
+	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
+	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
+	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
+	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
+	cbnz	w8, Lenc_loop
+
+	// middle of last round
+	add	x10, x11, #0x80
+						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
+						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
+	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# Lk_sr[]
+	tbl	v0.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
+	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
+	ret
+
+
+.globl	vpaes_encrypt
+
+.def vpaes_encrypt
+   .type 32
+.endef
+.align	4
+vpaes_encrypt:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ld1	{v7.16b}, [x0]
+	bl	_vpaes_encrypt_preheat
+	bl	_vpaes_encrypt_core
+	st1	{v0.16b}, [x1]
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+.def _vpaes_encrypt_2x
+   .type 32
+.endef
+.align	4
+_vpaes_encrypt_2x:
+	mov	x9, x2
+	ldr	w8, [x2,#240]			// pull rounds
+	adrp	x11, Lk_mc_forward+16
+	add	x11, x11, :lo12:Lk_mc_forward+16
+						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
+	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
+	and	v1.16b,  v14.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b,  v14.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0
+	and	v9.16b,  v15.16b,  v17.16b
+	ushr	v8.16b,  v15.16b,  #4
+	tbl	v1.16b,  {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
+	tbl	v9.16b,  {v20.16b}, v9.16b
+						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
+	tbl	v2.16b,  {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
+	tbl	v10.16b, {v21.16b}, v8.16b
+	eor	v0.16b,  v1.16b,   v16.16b	// vpxor	%xmm5,	%xmm1,	%xmm0
+	eor	v8.16b,  v9.16b,   v16.16b
+	eor	v0.16b,  v0.16b,   v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
+	eor	v8.16b,  v8.16b,   v10.16b
+	b	Lenc_2x_entry
+
+.align	4
+Lenc_2x_loop:
+	// middle of middle round
+	add	x10, x11, #0x40
+	tbl	v4.16b,  {v25.16b}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
+	tbl	v12.16b, {v25.16b}, v10.16b
+	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# Lk_mc_forward[]
+	tbl	v0.16b,  {v24.16b}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
+	tbl	v8.16b,  {v24.16b}, v11.16b
+	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v12.16b, v12.16b, v16.16b
+	tbl	v5.16b,	 {v27.16b}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
+	tbl	v13.16b, {v27.16b}, v10.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	eor	v8.16b,  v8.16b,  v12.16b
+	tbl	v2.16b,  {v26.16b}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
+	tbl	v10.16b, {v26.16b}, v11.16b
+	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# Lk_mc_backward[]
+	tbl	v3.16b,  {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
+	tbl	v11.16b, {v8.16b}, v1.16b
+	eor	v2.16b,  v2.16b,  v5.16b	// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
+	eor	v10.16b, v10.16b, v13.16b
+	tbl	v0.16b,  {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
+	tbl	v8.16b,  {v8.16b}, v4.16b
+	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
+	eor	v11.16b, v11.16b, v10.16b
+	tbl	v4.16b,  {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
+	tbl	v12.16b, {v11.16b},v1.16b
+	eor	v0.16b,  v0.16b,  v3.16b	// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
+	eor	v8.16b,  v8.16b,  v11.16b
+	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
+	eor	v8.16b,  v8.16b,  v12.16b
+	sub	w8, w8, #1			// nr--
+
+Lenc_2x_entry:
+	// top of round
+	and	v1.16b,  v0.16b, v17.16b	// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
+	ushr	v0.16b,  v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	and	v9.16b,  v8.16b, v17.16b
+	ushr	v8.16b,  v8.16b, #4
+	tbl	v5.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
+	tbl	v13.16b, {v19.16b},v9.16b
+	eor	v1.16b,  v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	eor	v9.16b,  v9.16b,  v8.16b
+	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
+	tbl	v11.16b, {v18.16b},v8.16b
+	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
+	tbl	v12.16b, {v18.16b},v9.16b
+	eor	v3.16b,  v3.16b,  v5.16b	// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	eor	v11.16b, v11.16b, v13.16b
+	eor	v4.16b,  v4.16b,  v5.16b	// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
+	eor	v12.16b, v12.16b, v13.16b
+	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
+	tbl	v10.16b, {v18.16b},v11.16b
+	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
+	tbl	v11.16b, {v18.16b},v12.16b
+	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
+	eor	v10.16b, v10.16b, v9.16b
+	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
+	eor	v11.16b, v11.16b, v8.16b
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
+	cbnz	w8, Lenc_2x_loop
+
+	// middle of last round
+	add	x10, x11, #0x80
+						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
+						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
+	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	tbl	v12.16b, {v22.16b}, v10.16b
+	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# Lk_sr[]
+	tbl	v0.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
+	tbl	v8.16b,  {v23.16b}, v11.16b
+	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v12.16b, v12.16b, v16.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	eor	v8.16b,  v8.16b,  v12.16b
+	tbl	v0.16b,  {v0.16b},v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
+	tbl	v1.16b,  {v8.16b},v1.16b
+	ret
+
+
+.def _vpaes_decrypt_preheat
+   .type 32
+.endef
+.align	4
+_vpaes_decrypt_preheat:
+	adrp	x10, Lk_inv
+	add	x10, x10, :lo12:Lk_inv
+	movi	v17.16b, #0x0f
+	adrp	x11, Lk_dipt
+	add	x11, x11, :lo12:Lk_dipt
+	ld1	{v18.2d,v19.2d}, [x10],#32	// Lk_inv
+	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64	// Lk_dipt, Lk_dsbo
+	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64	// Lk_dsb9, Lk_dsbd
+	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x11]		// Lk_dsbb, Lk_dsbe
+	ret
+
+
+##
+##  Decryption core
+##
+##  Same API as encryption core.
+##
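+##
+##  A note on the x11 setup below (not generator output): x11 is aimed
+##  at one of the four Lk_sr rows, selected from the round count as
+##  ((rounds << 4) ^ 0x30) & 0x30, so the final tbl applies whichever
+##  shiftrows alignment the schedule ended on.
+##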
+.def _vpaes_decrypt_core
+   .type 32
+.endef
+.align	4
+_vpaes_decrypt_core:
+	mov	x9, x2
+	ldr	w8, [x2,#240]			// pull rounds
+
+						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
+	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
+	eor	x11, x11, #0x30			// xor		$0x30,	%r11
+	adrp	x10, Lk_sr
+	add	x10, x10, :lo12:Lk_sr
+	and	x11, x11, #0x30			// and		$0x30,	%r11
+	add	x11, x11, x10
+	adrp	x10, Lk_mc_forward+48
+	add	x10, x10, :lo12:Lk_mc_forward+48
+
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
+	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
+	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
+	ld1	{v5.2d}, [x10]			// vmovdqa	Lk_mc_forward+48(%rip), %xmm5
+						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
+	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
+	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4,	%xmm2,	%xmm2
+	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
+	b	Ldec_entry
+
+.align	4
+Ldec_loop:
+//
+//  Inverse mix columns
+//
+						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
+						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
+	tbl	v4.16b, {v24.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
+	tbl	v1.16b, {v25.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
+	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4,	%xmm0,	%xmm0
+						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
+
+	tbl	v4.16b, {v26.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
+	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v1.16b, {v27.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
+
+	tbl	v4.16b, {v28.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
+	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v1.16b, {v29.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
+
+	tbl	v4.16b, {v30.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
+	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v1.16b, {v31.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	sub	w8, w8, #1			// sub		$1,%rax			# nr--
+
+Ldec_entry:
+	// top of round
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
+	eor	v1.16b,	v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
+	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
+	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
+	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
+	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
+	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
+	cbnz	w8, Ldec_loop
+
+	// middle of last round
+						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
+	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
+	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# Lk_sr-Lk_dsbd=-0x160
+	tbl	v1.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
+	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
+	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
+	ret
+
+
+.globl	vpaes_decrypt
+
+.def vpaes_decrypt
+   .type 32
+.endef
+.align	4
+vpaes_decrypt:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ld1	{v7.16b}, [x0]
+	bl	_vpaes_decrypt_preheat
+	bl	_vpaes_decrypt_core
+	st1	{v0.16b}, [x1]
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+// v14-v15 input, v0-v1 output
+.def _vpaes_decrypt_2x
+   .type 32
+.endef
+.align	4
+_vpaes_decrypt_2x:
+	mov	x9, x2
+	ldr	w8, [x2,#240]			// pull rounds
+
+						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
+	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
+	eor	x11, x11, #0x30			// xor		$0x30,	%r11
+	adrp	x10, Lk_sr
+	add	x10, x10, :lo12:Lk_sr
+	and	x11, x11, #0x30			// and		$0x30,	%r11
+	add	x11, x11, x10
+	adrp	x10, Lk_mc_forward+48
+	add	x10, x10, :lo12:Lk_mc_forward+48
+
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
+	and	v1.16b,  v14.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b,  v14.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
+	and	v9.16b,  v15.16b, v17.16b
+	ushr	v8.16b,  v15.16b, #4
+	tbl	v2.16b,  {v20.16b},v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
+	tbl	v10.16b, {v20.16b},v9.16b
+	ld1	{v5.2d}, [x10]			// vmovdqa	Lk_mc_forward+48(%rip), %xmm5
+						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
+	tbl	v0.16b,  {v21.16b},v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
+	tbl	v8.16b,  {v21.16b},v8.16b
+	eor	v2.16b,  v2.16b,  v16.16b	// vpxor	%xmm4,	%xmm2,	%xmm2
+	eor	v10.16b, v10.16b, v16.16b
+	eor	v0.16b,  v0.16b,  v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
+	eor	v8.16b,  v8.16b,  v10.16b
+	b	Ldec_2x_entry
+
+.align	4
+Ldec_2x_loop:
+//
+//  Inverse mix columns
+//
+						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
+						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
+	tbl	v4.16b,  {v24.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
+	tbl	v12.16b, {v24.16b}, v10.16b
+	tbl	v1.16b,  {v25.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
+	tbl	v9.16b,  {v25.16b}, v11.16b
+	eor	v0.16b,  v4.16b,  v16.16b	// vpxor	%xmm4,	%xmm0,	%xmm0
+	eor	v8.16b,  v12.16b, v16.16b
+						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	eor	v8.16b,  v8.16b,  v9.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
+
+	tbl	v4.16b,  {v26.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
+	tbl	v12.16b, {v26.16b}, v10.16b
+	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v8.16b,  {v8.16b},v5.16b
+	tbl	v1.16b,  {v27.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
+	tbl	v9.16b,  {v27.16b}, v11.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	eor	v8.16b,  v8.16b,  v12.16b
+						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	eor	v8.16b,  v8.16b,  v9.16b
+						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
+
+	tbl	v4.16b,  {v28.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
+	tbl	v12.16b, {v28.16b}, v10.16b
+	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v8.16b,  {v8.16b},v5.16b
+	tbl	v1.16b,  {v29.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
+	tbl	v9.16b,  {v29.16b}, v11.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	eor	v8.16b,  v8.16b,  v12.16b
+						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	eor	v8.16b,  v8.16b,  v9.16b
+						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
+
+	tbl	v4.16b,  {v30.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
+	tbl	v12.16b, {v30.16b}, v10.16b
+	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v8.16b,  {v8.16b},v5.16b
+	tbl	v1.16b,  {v31.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
+	tbl	v9.16b,  {v31.16b}, v11.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	eor	v8.16b,  v8.16b,  v12.16b
+	ext	v5.16b,  v5.16b,  v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	eor	v8.16b,  v8.16b,  v9.16b
+	sub	w8, w8, #1			// sub		$1,%rax			# nr--
+
+Ldec_2x_entry:
+	// top of round
+	and	v1.16b,  v0.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
+	ushr	v0.16b,  v0.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	and	v9.16b,  v8.16b,  v17.16b
+	ushr	v8.16b,  v8.16b,  #4
+	tbl	v2.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
+	tbl	v10.16b, {v19.16b},v9.16b
+	eor	v1.16b,	 v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	eor	v9.16b,	 v9.16b,  v8.16b
+	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
+	tbl	v11.16b, {v18.16b},v8.16b
+	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
+	tbl	v12.16b, {v18.16b},v9.16b
+	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	eor	v11.16b, v11.16b, v10.16b
+	eor	v4.16b,  v4.16b,  v2.16b	// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
+	eor	v12.16b, v12.16b, v10.16b
+	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
+	tbl	v10.16b, {v18.16b},v11.16b
+	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
+	tbl	v11.16b, {v18.16b},v12.16b
+	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
+	eor	v10.16b, v10.16b, v9.16b
+	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
+	eor	v11.16b, v11.16b, v8.16b
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
+	cbnz	w8, Ldec_2x_loop
+
+	// middle of last round
+						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
+	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	tbl	v12.16b, {v22.16b}, v10.16b
+						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
+	tbl	v1.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
+	tbl	v9.16b,  {v23.16b}, v11.16b
+	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# Lk_sr-Lk_dsbd=-0x160
+	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v12.16b, v12.16b, v16.16b
+	eor	v0.16b,  v1.16b,  v4.16b	// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
+	eor	v8.16b,  v9.16b,  v12.16b
+	tbl	v0.16b,  {v0.16b},v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
+	tbl	v1.16b,  {v8.16b},v2.16b
+	ret
+
+########################################################
+##                                                    ##
+##                  AES key schedule                  ##
+##                                                    ##
+########################################################
+.def _vpaes_key_preheat
+   .type 32
+.endef
+.align	4
+_vpaes_key_preheat:
+	adrp	x10, Lk_inv
+	add	x10, x10, :lo12:Lk_inv
+	movi	v16.16b, #0x5b			// Lk_s63
+	adrp	x11, Lk_sb1
+	add	x11, x11, :lo12:Lk_sb1
+	movi	v17.16b, #0x0f			// Lk_s0F
+	ld1	{v18.2d,v19.2d,v20.2d,v21.2d}, [x10]		// Lk_inv, Lk_ipt
+	adrp	x10, Lk_dksd
+	add	x10, x10, :lo12:Lk_dksd
+	ld1	{v22.2d,v23.2d}, [x11]		// Lk_sb1
+	adrp	x11, Lk_mc_forward
+	add	x11, x11, :lo12:Lk_mc_forward
+	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64	// Lk_dksd, Lk_dksb
+	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64	// Lk_dkse, Lk_dks9
+	ld1	{v8.2d}, [x10]			// Lk_rcon
+	ld1	{v9.2d}, [x11]			// Lk_mc_forward[0]
+	ret
+
+
+.def _vpaes_schedule_core
+   .type 32
+.endef
+.align	4
+_vpaes_schedule_core:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29, x30, [sp,#-16]!
+	add	x29,sp,#0
+
+	bl	_vpaes_key_preheat		// load the tables
+
+	ld1	{v0.16b}, [x0],#16		// vmovdqu	(%rdi),	%xmm0		# load key (unaligned)
+
+	// input transform
+	mov	v3.16b, v0.16b			// vmovdqa	%xmm0,	%xmm3
+	bl	_vpaes_schedule_transform
+	mov	v7.16b, v0.16b			// vmovdqa	%xmm0,	%xmm7
+
+	adrp	x10, Lk_sr		// lea	Lk_sr(%rip),%r10
+	add	x10, x10, :lo12:Lk_sr
+
+	add	x8, x8, x10
+	cbnz	w3, Lschedule_am_decrypting
+
+	// encrypting, output zeroth round key after transform
+	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)
+	b	Lschedule_go
+
+Lschedule_am_decrypting:
+	// decrypting, output zeroth round key after shiftrows
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
+	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb  %xmm1,	%xmm3,	%xmm3
+	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
+	eor	x8, x8, #0x30			// xor	$0x30, %r8
+
+Lschedule_go:
+	cmp	w1, #192			// cmp	$192,	%esi
+	b.hi	Lschedule_256
+	b.eq	Lschedule_192
+	// 128: fall through
+
+##
+##  .schedule_128
+##
+##  128-bit specific part of key schedule.
+##
+##  This schedule is really simple, because all its parts
+##  are accomplished by the subroutines.
+##
+Lschedule_128:
+	mov	x0, #10			// mov	$10, %esi
+
+Loop_schedule_128:
+	sub	x0, x0, #1			// dec	%esi
+	bl	_vpaes_schedule_round
+	cbz	x0, Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle		// write output
+	b	Loop_schedule_128
+
+##
+##  .aes_schedule_192
+##
+##  192-bit specific part of key schedule.
+##
+##  The main body of this schedule is the same as the 128-bit
+##  schedule, but with more smearing.  The long, high side is
+##  stored in %xmm7 as before, and the short, low side is in
+##  the high bits of %xmm6.
+##
+##  This schedule is somewhat nastier, however, because each
+##  round produces 192 bits of key material, or 1.5 round keys.
+##  Therefore, on each cycle we do 2 rounds and produce 3 round
+##  keys.
+##
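+##
+##  The load below (a note, not generator output) backs x0 up by 8 and
+##  reads 16 bytes, i.e. bytes 8..23 of the 24-byte key - an overlapping
+##  read matching the x86 vmovdqu 8(%rdi); the overlapped low half is
+##  discarded when v6's low side is zeroed a few instructions later.
+##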
+.align	4
+Lschedule_192:
+	sub	x0, x0, #8
+	ld1	{v0.16b}, [x0]		// vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
+	bl	_vpaes_schedule_transform	// input transform
+	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save short part
+	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
+	ins	v6.d[0], v4.d[0]		// vmovhlps	%xmm4,	%xmm6,	%xmm6		# clobber low side with zeros
+	mov	x0, #4			// mov	$4,	%esi
+
+Loop_schedule_192:
+	sub	x0, x0, #1			// dec	%esi
+	bl	_vpaes_schedule_round
+	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr	$8,%xmm6,%xmm0,%xmm0
+	bl	_vpaes_schedule_mangle		// save key n
+	bl	_vpaes_schedule_192_smear
+	bl	_vpaes_schedule_mangle		// save key n+1
+	bl	_vpaes_schedule_round
+	cbz	x0, Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle		// save key n+2
+	bl	_vpaes_schedule_192_smear
+	b	Loop_schedule_192
+
+##
+##  .aes_schedule_256
+##
+##  256-bit specific part of key schedule.
+##
+##  The structure here is very similar to the 128-bit
+##  schedule, but with an additional "low side" in
+##  %xmm6.  The low side's rounds are the same as the
+##  high side's, except no rcon and no rotation.
+##
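+##
+##  The low round below (a note, not generator output) reuses
+##  _vpaes_schedule_low_round directly: v7, the high side, is parked in
+##  v5 and replaced with v6 around the call, then restored, so the same
+##  code runs without rcon or rotation.
+##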
+.align	4
+Lschedule_256:
+	ld1	{v0.16b}, [x0]		// vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
+	bl	_vpaes_schedule_transform	// input transform
+	mov	x0, #7			// mov	$7, %esi
+
+Loop_schedule_256:
+	sub	x0, x0, #1			// dec	%esi
+	bl	_vpaes_schedule_mangle		// output low result
+	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
+
+	// high round
+	bl	_vpaes_schedule_round
+	cbz	x0, Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle
+
+	// low round. swap xmm7 and xmm6
+	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
+	movi	v4.16b, #0
+	mov	v5.16b, v7.16b			// vmovdqa	%xmm7,	%xmm5
+	mov	v7.16b, v6.16b			// vmovdqa	%xmm6,	%xmm7
+	bl	_vpaes_schedule_low_round
+	mov	v7.16b, v5.16b			// vmovdqa	%xmm5,	%xmm7
+
+	b	Loop_schedule_256
+
+##
+##  .aes_schedule_mangle_last
+##
+##  Mangler for last round of key schedule
+##  Mangles %xmm0
+##    when encrypting, outputs out(%xmm0) ^ 63
+##    when decrypting, outputs unskew(%xmm0)
+##
+##  Always called right before return... jumps to cleanup and exits
+##
+.align	4
+Lschedule_mangle_last:
+	// schedule last round key from xmm0
+	adrp	x11, Lk_deskew	// lea	Lk_deskew(%rip),%r11	# prepare to deskew
+	add	x11, x11, :lo12:Lk_deskew
+
+	cbnz	w3, Lschedule_mangle_last_dec
+
+	// encrypting
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),%xmm1
+	adrp	x11, Lk_opt		// lea	Lk_opt(%rip),	%r11		# prepare to output transform
+	add	x11, x11, :lo12:Lk_opt
+	add	x2, x2, #32			// add	$32,	%rdx
+	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute
+
+Lschedule_mangle_last_dec:
+	ld1	{v20.2d,v21.2d}, [x11]		// reload constants
+	sub	x2, x2, #16			// add	$-16,	%rdx
+	eor	v0.16b, v0.16b, v16.16b		// vpxor	Lk_s63(%rip),	%xmm0,	%xmm0
+	bl	_vpaes_schedule_transform	// output transform
+	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)		# save last key
+
+	// cleanup
+	eor	v0.16b, v0.16b, v0.16b		// vpxor	%xmm0,	%xmm0,	%xmm0
+	eor	v1.16b, v1.16b, v1.16b		// vpxor	%xmm1,	%xmm1,	%xmm1
+	eor	v2.16b, v2.16b, v2.16b		// vpxor	%xmm2,	%xmm2,	%xmm2
+	eor	v3.16b, v3.16b, v3.16b		// vpxor	%xmm3,	%xmm3,	%xmm3
+	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4,	%xmm4
+	eor	v5.16b, v5.16b, v5.16b		// vpxor	%xmm5,	%xmm5,	%xmm5
+	eor	v6.16b, v6.16b, v6.16b		// vpxor	%xmm6,	%xmm6,	%xmm6
+	eor	v7.16b, v7.16b, v7.16b		// vpxor	%xmm7,	%xmm7,	%xmm7
+	ldp	x29, x30, [sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+##
+##  .aes_schedule_192_smear
+##
+##  Smear the short, low side in the 192-bit key schedule.
+##
+##  Inputs:
+##    %xmm7: high side, b  a  x  y
+##    %xmm6:  low side, d  c  0  0
+##    %xmm13: 0
+##
+##  Outputs:
+##    %xmm6: b+c+d  b+c  0  0
+##    %xmm0: b+c+d  b+c  b  a
+##
+.def _vpaes_schedule_192_smear
+   .type 32
+.endef
+.align	4
+_vpaes_schedule_192_smear:
+	movi	v1.16b, #0
+	dup	v0.4s, v7.s[3]
+	ins	v1.s[3], v6.s[2]	// vpshufd	$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
+	ins	v0.s[0], v7.s[2]	// vpshufd	$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
+	eor	v6.16b, v6.16b, v1.16b	// vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
+	eor	v1.16b, v1.16b, v1.16b	// vpxor	%xmm1,	%xmm1,	%xmm1
+	eor	v6.16b, v6.16b, v0.16b	// vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
+	mov	v0.16b, v6.16b		// vmovdqa	%xmm6,	%xmm0
+	ins	v6.d[0], v1.d[0]	// vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
+	ret
+
+
+##
+##  .aes_schedule_round
+##
+##  Runs one main round of the key schedule on %xmm0, %xmm7
+##
+##  Specifically, runs subbytes on the high dword of %xmm0
+##  then rotates it by one byte and xors into the low dword of
+##  %xmm7.
+##
+##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+##  next rcon.
+##
+##  Smears the dwords of %xmm7 by xoring the low into the
+##  second low, result into third, result into highest.
+##
+##  Returns results in %xmm7 = %xmm0.
+##  Clobbers %xmm1-%xmm4, %r11.
+##
+.def _vpaes_schedule_round
+   .type 32
+.endef
+.align	4
+_vpaes_schedule_round:
+	// extract rcon from xmm8
+	movi	v4.16b, #0			// vpxor	%xmm4,	%xmm4,	%xmm4
+	ext	v1.16b, v8.16b, v4.16b, #15	// vpalignr	$15,	%xmm8,	%xmm4,	%xmm1
+	ext	v8.16b, v8.16b, v8.16b, #15	// vpalignr	$15,	%xmm8,	%xmm8,	%xmm8
+	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
+
+	// rotate
+	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
+	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr	$1,	%xmm0,	%xmm0,	%xmm0
+
+	// fall through...
+
+	// low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+	// smear xmm7
+	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq	$4,	%xmm7,	%xmm1
+	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
+	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq	$8,	%xmm7,	%xmm4
+
+	// subbytes
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1		# 0 = k
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0		# 1 = i
+	eor	v7.16b, v7.16b, v4.16b		// vpxor	%xmm4,	%xmm7,	%xmm7
+	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
+	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1		# 0 = j
+	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
+	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
+	eor	v7.16b, v7.16b, v16.16b		// vpxor	Lk_s63(%rip),	%xmm7,	%xmm7
+	tbl	v3.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
+	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
+	tbl	v2.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
+	eor	v3.16b, v3.16b, v1.16b		// vpxor	%xmm1,	%xmm3,	%xmm3		# 2 = io
+	eor	v2.16b, v2.16b, v0.16b		// vpxor	%xmm0,	%xmm2,	%xmm2		# 3 = jo
+	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
+	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
+	eor	v1.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm1		# 0 = sbox output
+
+	// add in smeared stuff
+	eor	v0.16b, v1.16b, v7.16b		// vpxor	%xmm7,	%xmm1,	%xmm0
+	eor	v7.16b, v1.16b, v7.16b		// vmovdqa	%xmm0,	%xmm7
+	ret
+
+
+##
+##  .aes_schedule_transform
+##
+##  Linear-transform %xmm0 according to tables at (%r11)
+##
+##  Requires that %xmm9 = 0x0F0F... as in preheat
+##  Output in %xmm0
+##  Clobbers %xmm1, %xmm2
+##
+.def _vpaes_schedule_transform
+   .type 32
+.endef
+.align	4
+_vpaes_schedule_transform:
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
+						// vmovdqa	(%r11),	%xmm2 	# lo
+	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
+						// vmovdqa	16(%r11),	%xmm1 # hi
+	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
+	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
+	ret
+
+
+##
+##  .aes_schedule_mangle
+##
+##  Mangle xmm0 from (basis-transformed) standard version
+##  to our version.
+##
+##  On encrypt,
+##    xor with 0x63
+##    multiply by circulant 0,1,1,1
+##    apply shiftrows transform
+##
+##  On decrypt,
+##    xor with 0x63
+##    multiply by "inverse mixcolumns" circulant E,B,D,9
+##    deskew
+##    apply shiftrows transform
+##
+##
+##  Writes out to (%rdx), and increments or decrements it
+##  Keeps track of round number mod 4 in %r8
+##  Preserves xmm0
+##  Clobbers xmm1-xmm5
+##
+.def _vpaes_schedule_mangle
+   .type 32
+.endef
+.align	4
+_vpaes_schedule_mangle:
+	mov	v4.16b, v0.16b			// vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
+						// vmovdqa	.Lk_mc_forward(%rip),%xmm5
+	cbnz	w3, Lschedule_mangle_dec
+
+	// encrypting
+	eor	v4.16b, v0.16b, v16.16b		// vpxor	Lk_s63(%rip),	%xmm0,	%xmm4
+	add	x2, x2, #16			// add	$16,	%rdx
+	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm4
+	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm1
+	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb	%xmm5,	%xmm1,	%xmm3
+	eor	v4.16b, v4.16b, v1.16b		// vpxor	%xmm1,	%xmm4,	%xmm4
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
+	eor	v3.16b, v3.16b, v4.16b		// vpxor	%xmm4,	%xmm3,	%xmm3
+
+	b	Lschedule_mangle_both
+.align	4
+Lschedule_mangle_dec:
+	// inverse mix columns
+						// lea	.Lk_dksd(%rip),%r11
+	ushr	v1.16b, v4.16b, #4		// vpsrlb	$4,	%xmm4,	%xmm1	# 1 = hi
+	and	v4.16b, v4.16b, v17.16b		// vpand	%xmm9,	%xmm4,	%xmm4	# 4 = lo
+
+						// vmovdqa	0x00(%r11),	%xmm2
+	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+						// vmovdqa	0x10(%r11),	%xmm3
+	tbl	v3.16b,	{v25.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
+	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
+
+						// vmovdqa	0x20(%r11),	%xmm2
+	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
+						// vmovdqa	0x30(%r11),	%xmm3
+	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
+	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
+
+						// vmovdqa	0x40(%r11),	%xmm2
+	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
+						// vmovdqa	0x50(%r11),	%xmm3
+	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
+
+						// vmovdqa	0x60(%r11),	%xmm2
+	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
+						// vmovdqa	0x70(%r11),	%xmm4
+	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb	%xmm1,	%xmm4,	%xmm4
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
+	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
+	eor	v3.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm3
+
+	sub	x2, x2, #16			// add	$-16,	%rdx
+
+Lschedule_mangle_both:
+	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	add	x8, x8, #48			// add	$-16,	%r8
+	and	x8, x8, #~(1<<6)		// and	$0x30,	%r8
+	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
+	ret
+
+
+.globl	vpaes_set_encrypt_key
+
+.def vpaes_set_encrypt_key
+   .type 32
+.endef
+.align	4
+vpaes_set_encrypt_key:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+
+	lsr	w9, w1, #5		// shr	$5,%eax
+	add	w9, w9, #5		// add	$5,%eax
+	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+
+	mov	w3, #0		// mov	$0,%ecx
+	mov	x8, #0x30		// mov	$0x30,%r8d
+	bl	_vpaes_schedule_core
+	eor	x0, x0, x0
+
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+.globl	vpaes_set_decrypt_key
+
+.def vpaes_set_decrypt_key
+   .type 32
+.endef
+.align	4
+vpaes_set_decrypt_key:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+
+	lsr	w9, w1, #5		// shr	$5,%eax
+	add	w9, w9, #5		// add	$5,%eax
+	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+	lsl	w9, w9, #4		// shl	$4,%eax
+	add	x2, x2, #16		// lea	16(%rdx,%rax),%rdx
+	add	x2, x2, x9
+
+	mov	w3, #1		// mov	$1,%ecx
+	lsr	w8, w1, #1		// shr	$1,%r8d
+	and	x8, x8, #32		// and	$32,%r8d
+	eor	x8, x8, #32		// xor	$32,%r8d	# nbits==192?0:32
+	bl	_vpaes_schedule_core
+
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.globl	vpaes_cbc_encrypt
+
+.def vpaes_cbc_encrypt
+   .type 32
+.endef
+.align	4
+vpaes_cbc_encrypt:
+	AARCH64_SIGN_LINK_REGISTER
+	cbz	x2, Lcbc_abort
+	cmp	w5, #0			// check direction
+	b.eq	vpaes_cbc_decrypt
+
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	mov	x17, x2		// reassign
+	mov	x2,  x3		// reassign
+
+	ld1	{v0.16b}, [x4]	// load ivec
+	bl	_vpaes_encrypt_preheat
+	b	Lcbc_enc_loop
+
+.align	4
+Lcbc_enc_loop:
+	ld1	{v7.16b}, [x0],#16	// load input
+	eor	v7.16b, v7.16b, v0.16b	// xor with ivec
+	bl	_vpaes_encrypt_core
+	st1	{v0.16b}, [x1],#16	// save output
+	subs	x17, x17, #16
+	b.hi	Lcbc_enc_loop
+
+	st1	{v0.16b}, [x4]	// write ivec
+
+	ldp	x29,x30,[sp],#16
+Lcbc_abort:
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+.def vpaes_cbc_decrypt
+   .type 32
+.endef
+.align	4
+vpaes_cbc_decrypt:
+	// Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
+	// only from vpaes_cbc_encrypt which has already signed the return address.
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+	stp	d10,d11,[sp,#-16]!
+	stp	d12,d13,[sp,#-16]!
+	stp	d14,d15,[sp,#-16]!
+
+	mov	x17, x2		// reassign
+	mov	x2,  x3		// reassign
+	ld1	{v6.16b}, [x4]	// load ivec
+	bl	_vpaes_decrypt_preheat
+	tst	x17, #16
+	b.eq	Lcbc_dec_loop2x
+
+	ld1	{v7.16b}, [x0], #16	// load input
+	bl	_vpaes_decrypt_core
+	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
+	orr	v6.16b, v7.16b, v7.16b	// next ivec value
+	st1	{v0.16b}, [x1], #16
+	subs	x17, x17, #16
+	b.ls	Lcbc_dec_done
+
+.align	4
+Lcbc_dec_loop2x:
+	ld1	{v14.16b,v15.16b}, [x0], #32
+	bl	_vpaes_decrypt_2x
+	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
+	eor	v1.16b, v1.16b, v14.16b
+	orr	v6.16b, v15.16b, v15.16b
+	st1	{v0.16b,v1.16b}, [x1], #32
+	subs	x17, x17, #32
+	b.hi	Lcbc_dec_loop2x
+
+Lcbc_dec_done:
+	st1	{v6.16b}, [x4]
+
+	ldp	d14,d15,[sp],#16
+	ldp	d12,d13,[sp],#16
+	ldp	d10,d11,[sp],#16
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.globl	vpaes_ctr32_encrypt_blocks
+
+.def vpaes_ctr32_encrypt_blocks
+   .type 32
+.endef
+.align	4
+vpaes_ctr32_encrypt_blocks:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+	stp	d10,d11,[sp,#-16]!
+	stp	d12,d13,[sp,#-16]!
+	stp	d14,d15,[sp,#-16]!
+
+	cbz	x2, Lctr32_done
+
+	// Note, unlike the other functions, x2 here is measured in blocks,
+	// not bytes.
+	mov	x17, x2
+	mov	x2,  x3
+
+	// Load the IV and counter portion.
+	ldr	w6, [x4, #12]
+	ld1	{v7.16b}, [x4]
+
+	bl	_vpaes_encrypt_preheat
+	tst	x17, #1
+	rev	w6, w6		// The counter is big-endian.
+	b.eq	Lctr32_prep_loop
+
+	// Handle one block so the remaining block count is even for
+	// _vpaes_encrypt_2x.
+	ld1	{v6.16b}, [x0], #16	// Load input ahead of time
+	bl	_vpaes_encrypt_core
+	eor	v0.16b, v0.16b, v6.16b	// XOR input and result
+	st1	{v0.16b}, [x1], #16
+	subs	x17, x17, #1
+	// Update the counter.
+	add	w6, w6, #1
+	rev	w7, w6
+	mov	v7.s[3], w7
+	b.ls	Lctr32_done
+
+Lctr32_prep_loop:
+	// _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
+	// uses v14 and v15.
+	mov	v15.16b, v7.16b
+	mov	v14.16b, v7.16b
+	add	w6, w6, #1
+	rev	w7, w6
+	mov	v15.s[3], w7
+
+Lctr32_loop:
+	ld1	{v6.16b,v7.16b}, [x0], #32	// Load input ahead of time
+	bl	_vpaes_encrypt_2x
+	eor	v0.16b, v0.16b, v6.16b		// XOR input and result
+	eor	v1.16b, v1.16b, v7.16b		// XOR input and result (#2)
+	st1	{v0.16b,v1.16b}, [x1], #32
+	subs	x17, x17, #2
+	// Update the counter.
+	add	w7, w6, #1
+	add	w6, w6, #2
+	rev	w7, w7
+	mov	v14.s[3], w7
+	rev	w7, w6
+	mov	v15.s[3], w7
+	b.hi	Lctr32_loop
+
+Lctr32_done:
+	ldp	d14,d15,[sp],#16
+	ldp	d12,d13,[sp],#16
+	ldp	d10,d11,[sp],#16
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/bcm/vpaes-x86-apple.S b/gen/bcm/vpaes-x86-apple.S
new file mode 100644
index 0000000..4d2c485
--- /dev/null
+++ b/gen/bcm/vpaes-x86-apple.S
@@ -0,0 +1,680 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
+.text
+#ifdef BORINGSSL_DISPATCH_TEST
+#endif
+.align	6,0x90
+L_vpaes_consts:
+.long	218628480,235210255,168496130,67568393
+.long	252381056,17041926,33884169,51187212
+.long	252645135,252645135,252645135,252645135
+.long	1512730624,3266504856,1377990664,3401244816
+.long	830229760,1275146365,2969422977,3447763452
+.long	3411033600,2979783055,338359620,2782886510
+.long	4209124096,907596821,221174255,1006095553
+.long	191964160,3799684038,3164090317,1589111125
+.long	182528256,1777043520,2877432650,3265356744
+.long	1874708224,3503451415,3305285752,363511674
+.long	1606117888,3487855781,1093350906,2384367825
+.long	197121,67569157,134941193,202313229
+.long	67569157,134941193,202313229,197121
+.long	134941193,202313229,197121,67569157
+.long	202313229,197121,67569157,134941193
+.long	33619971,100992007,168364043,235736079
+.long	235736079,33619971,100992007,168364043
+.long	168364043,235736079,33619971,100992007
+.long	100992007,168364043,235736079,33619971
+.long	50462976,117835012,185207048,252579084
+.long	252314880,51251460,117574920,184942860
+.long	184682752,252054788,50987272,118359308
+.long	118099200,185467140,251790600,50727180
+.long	2946363062,528716217,1300004225,1881839624
+.long	1532713819,1532713819,1532713819,1532713819
+.long	3602276352,4288629033,3737020424,4153884961
+.long	1354558464,32357713,2958822624,3775749553
+.long	1201988352,132424512,1572796698,503232858
+.long	2213177600,1597421020,4103937655,675398315
+.long	2749646592,4273543773,1511898873,121693092
+.long	3040248576,1103263732,2871565598,1608280554
+.long	2236667136,2588920351,482954393,64377734
+.long	3069987328,291237287,2117370568,3650299247
+.long	533321216,3573750986,2572112006,1401264716
+.long	1339849704,2721158661,548607111,3445553514
+.long	2128193280,3054596040,2183486460,1257083700
+.long	655635200,1165381986,3923443150,2344132524
+.long	190078720,256924420,290342170,357187870
+.long	1610966272,2263057382,4103205268,309794674
+.long	2592527872,2233205587,1335446729,3402964816
+.long	3973531904,3225098121,3002836325,1918774430
+.long	3870401024,2102906079,2284471353,4117666579
+.long	617007872,1021508343,366931923,691083277
+.long	2528395776,3491914898,2968704004,1613121270
+.long	3445188352,3247741094,844474987,4093578302
+.long	651481088,1190302358,1689581232,574775300
+.long	4289380608,206939853,2555985458,2489840491
+.long	2130264064,327674451,3566485037,3349835193
+.long	2470714624,316102159,3636825756,3393945945
+.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105
+.byte	111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83
+.byte	83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117
+.byte	114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105
+.byte	118,101,114,115,105,116,121,41,0
+.align	6,0x90
+.private_extern	__vpaes_preheat
+.align	4
+__vpaes_preheat:
+	addl	(%esp),%ebp
+	movdqa	-48(%ebp),%xmm7
+	movdqa	-16(%ebp),%xmm6
+	ret
+.private_extern	__vpaes_encrypt_core
+.align	4
+__vpaes_encrypt_core:
+	movl	$16,%ecx
+	movl	240(%edx),%eax
+	movdqa	%xmm6,%xmm1
+	movdqa	(%ebp),%xmm2
+	pandn	%xmm0,%xmm1
+	pand	%xmm6,%xmm0
+	movdqu	(%edx),%xmm5
+.byte	102,15,56,0,208
+	movdqa	16(%ebp),%xmm0
+	pxor	%xmm5,%xmm2
+	psrld	$4,%xmm1
+	addl	$16,%edx
+.byte	102,15,56,0,193
+	leal	192(%ebp),%ebx
+	pxor	%xmm2,%xmm0
+	jmp	L000enc_entry
+.align	4,0x90
+L001enc_loop:
+	movdqa	32(%ebp),%xmm4
+	movdqa	48(%ebp),%xmm0
+.byte	102,15,56,0,226
+.byte	102,15,56,0,195
+	pxor	%xmm5,%xmm4
+	movdqa	64(%ebp),%xmm5
+	pxor	%xmm4,%xmm0
+	movdqa	-64(%ebx,%ecx,1),%xmm1
+.byte	102,15,56,0,234
+	movdqa	80(%ebp),%xmm2
+	movdqa	(%ebx,%ecx,1),%xmm4
+.byte	102,15,56,0,211
+	movdqa	%xmm0,%xmm3
+	pxor	%xmm5,%xmm2
+.byte	102,15,56,0,193
+	addl	$16,%edx
+	pxor	%xmm2,%xmm0
+.byte	102,15,56,0,220
+	addl	$16,%ecx
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,193
+	andl	$48,%ecx
+	subl	$1,%eax
+	pxor	%xmm3,%xmm0
+L000enc_entry:
+	movdqa	%xmm6,%xmm1
+	movdqa	-32(%ebp),%xmm5
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm6,%xmm0
+.byte	102,15,56,0,232
+	movdqa	%xmm7,%xmm3
+	pxor	%xmm1,%xmm0
+.byte	102,15,56,0,217
+	movdqa	%xmm7,%xmm4
+	pxor	%xmm5,%xmm3
+.byte	102,15,56,0,224
+	movdqa	%xmm7,%xmm2
+	pxor	%xmm5,%xmm4
+.byte	102,15,56,0,211
+	movdqa	%xmm7,%xmm3
+	pxor	%xmm0,%xmm2
+.byte	102,15,56,0,220
+	movdqu	(%edx),%xmm5
+	pxor	%xmm1,%xmm3
+	jnz	L001enc_loop
+	movdqa	96(%ebp),%xmm4
+	movdqa	112(%ebp),%xmm0
+.byte	102,15,56,0,226
+	pxor	%xmm5,%xmm4
+.byte	102,15,56,0,195
+	movdqa	64(%ebx,%ecx,1),%xmm1
+	pxor	%xmm4,%xmm0
+.byte	102,15,56,0,193
+	ret
+.private_extern	__vpaes_decrypt_core
+.align	4
+__vpaes_decrypt_core:
+	leal	608(%ebp),%ebx
+	movl	240(%edx),%eax
+	movdqa	%xmm6,%xmm1
+	movdqa	-64(%ebx),%xmm2
+	pandn	%xmm0,%xmm1
+	movl	%eax,%ecx
+	psrld	$4,%xmm1
+	movdqu	(%edx),%xmm5
+	shll	$4,%ecx
+	pand	%xmm6,%xmm0
+.byte	102,15,56,0,208
+	movdqa	-48(%ebx),%xmm0
+	xorl	$48,%ecx
+.byte	102,15,56,0,193
+	andl	$48,%ecx
+	pxor	%xmm5,%xmm2
+	movdqa	176(%ebp),%xmm5
+	pxor	%xmm2,%xmm0
+	addl	$16,%edx
+	leal	-352(%ebx,%ecx,1),%ecx
+	jmp	L002dec_entry
+.align	4,0x90
+L003dec_loop:
+	movdqa	-32(%ebx),%xmm4
+	movdqa	-16(%ebx),%xmm1
+.byte	102,15,56,0,226
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	movdqa	(%ebx),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	16(%ebx),%xmm1
+.byte	102,15,56,0,226
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	movdqa	32(%ebx),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	48(%ebx),%xmm1
+.byte	102,15,56,0,226
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	movdqa	64(%ebx),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	80(%ebx),%xmm1
+.byte	102,15,56,0,226
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	addl	$16,%edx
+.byte	102,15,58,15,237,12
+	pxor	%xmm1,%xmm0
+	subl	$1,%eax
+L002dec_entry:
+	movdqa	%xmm6,%xmm1
+	movdqa	-32(%ebp),%xmm2
+	pandn	%xmm0,%xmm1
+	pand	%xmm6,%xmm0
+	psrld	$4,%xmm1
+.byte	102,15,56,0,208
+	movdqa	%xmm7,%xmm3
+	pxor	%xmm1,%xmm0
+.byte	102,15,56,0,217
+	movdqa	%xmm7,%xmm4
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,224
+	pxor	%xmm2,%xmm4
+	movdqa	%xmm7,%xmm2
+.byte	102,15,56,0,211
+	movdqa	%xmm7,%xmm3
+	pxor	%xmm0,%xmm2
+.byte	102,15,56,0,220
+	movdqu	(%edx),%xmm0
+	pxor	%xmm1,%xmm3
+	jnz	L003dec_loop
+	movdqa	96(%ebx),%xmm4
+.byte	102,15,56,0,226
+	pxor	%xmm0,%xmm4
+	movdqa	112(%ebx),%xmm0
+	movdqa	(%ecx),%xmm2
+.byte	102,15,56,0,195
+	pxor	%xmm4,%xmm0
+.byte	102,15,56,0,194
+	ret
+.private_extern	__vpaes_schedule_core
+.align	4
+__vpaes_schedule_core:
+	addl	(%esp),%ebp
+	movdqu	(%esi),%xmm0
+	movdqa	320(%ebp),%xmm2
+	movdqa	%xmm0,%xmm3
+	leal	(%ebp),%ebx
+	movdqa	%xmm2,4(%esp)
+	call	__vpaes_schedule_transform
+	movdqa	%xmm0,%xmm7
+	testl	%edi,%edi
+	jnz	L004schedule_am_decrypting
+	movdqu	%xmm0,(%edx)
+	jmp	L005schedule_go
+L004schedule_am_decrypting:
+	movdqa	256(%ebp,%ecx,1),%xmm1
+.byte	102,15,56,0,217
+	movdqu	%xmm3,(%edx)
+	xorl	$48,%ecx
+L005schedule_go:
+	cmpl	$192,%eax
+	ja	L006schedule_256
+	je	L007schedule_192
+L008schedule_128:
+	movl	$10,%eax
+L009loop_schedule_128:
+	call	__vpaes_schedule_round
+	decl	%eax
+	jz	L010schedule_mangle_last
+	call	__vpaes_schedule_mangle
+	jmp	L009loop_schedule_128
+.align	4,0x90
+L007schedule_192:
+	movdqu	8(%esi),%xmm0
+	call	__vpaes_schedule_transform
+	movdqa	%xmm0,%xmm6
+	pxor	%xmm4,%xmm4
+	movhlps	%xmm4,%xmm6
+	movl	$4,%eax
+L011loop_schedule_192:
+	call	__vpaes_schedule_round
+.byte	102,15,58,15,198,8
+	call	__vpaes_schedule_mangle
+	call	__vpaes_schedule_192_smear
+	call	__vpaes_schedule_mangle
+	call	__vpaes_schedule_round
+	decl	%eax
+	jz	L010schedule_mangle_last
+	call	__vpaes_schedule_mangle
+	call	__vpaes_schedule_192_smear
+	jmp	L011loop_schedule_192
+.align	4,0x90
+L006schedule_256:
+	movdqu	16(%esi),%xmm0
+	call	__vpaes_schedule_transform
+	movl	$7,%eax
+L012loop_schedule_256:
+	call	__vpaes_schedule_mangle
+	movdqa	%xmm0,%xmm6
+	call	__vpaes_schedule_round
+	decl	%eax
+	jz	L010schedule_mangle_last
+	call	__vpaes_schedule_mangle
+	pshufd	$255,%xmm0,%xmm0
+	movdqa	%xmm7,20(%esp)
+	movdqa	%xmm6,%xmm7
+	call	L_vpaes_schedule_low_round
+	movdqa	20(%esp),%xmm7
+	jmp	L012loop_schedule_256
+.align	4,0x90
+L010schedule_mangle_last:
+	leal	384(%ebp),%ebx
+	testl	%edi,%edi
+	jnz	L013schedule_mangle_last_dec
+	movdqa	256(%ebp,%ecx,1),%xmm1
+.byte	102,15,56,0,193
+	leal	352(%ebp),%ebx
+	addl	$32,%edx
+L013schedule_mangle_last_dec:
+	addl	$-16,%edx
+	pxor	336(%ebp),%xmm0
+	call	__vpaes_schedule_transform
+	movdqu	%xmm0,(%edx)
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	ret
+.private_extern	__vpaes_schedule_192_smear
+.align	4
+__vpaes_schedule_192_smear:
+	pshufd	$128,%xmm6,%xmm1
+	pshufd	$254,%xmm7,%xmm0
+	pxor	%xmm1,%xmm6
+	pxor	%xmm1,%xmm1
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm6,%xmm0
+	movhlps	%xmm1,%xmm6
+	ret
+.private_extern	__vpaes_schedule_round
+.align	4
+__vpaes_schedule_round:
+	movdqa	8(%esp),%xmm2
+	pxor	%xmm1,%xmm1
+.byte	102,15,58,15,202,15
+.byte	102,15,58,15,210,15
+	pxor	%xmm1,%xmm7
+	pshufd	$255,%xmm0,%xmm0
+.byte	102,15,58,15,192,1
+	movdqa	%xmm2,8(%esp)
+L_vpaes_schedule_low_round:
+	movdqa	%xmm7,%xmm1
+	pslldq	$4,%xmm7
+	pxor	%xmm1,%xmm7
+	movdqa	%xmm7,%xmm1
+	pslldq	$8,%xmm7
+	pxor	%xmm1,%xmm7
+	pxor	336(%ebp),%xmm7
+	movdqa	-16(%ebp),%xmm4
+	movdqa	-48(%ebp),%xmm5
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm4,%xmm0
+	movdqa	-32(%ebp),%xmm2
+.byte	102,15,56,0,208
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm5,%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+	movdqa	%xmm5,%xmm4
+.byte	102,15,56,0,224
+	pxor	%xmm2,%xmm4
+	movdqa	%xmm5,%xmm2
+.byte	102,15,56,0,211
+	pxor	%xmm0,%xmm2
+	movdqa	%xmm5,%xmm3
+.byte	102,15,56,0,220
+	pxor	%xmm1,%xmm3
+	movdqa	32(%ebp),%xmm4
+.byte	102,15,56,0,226
+	movdqa	48(%ebp),%xmm0
+.byte	102,15,56,0,195
+	pxor	%xmm4,%xmm0
+	pxor	%xmm7,%xmm0
+	movdqa	%xmm0,%xmm7
+	ret
+.private_extern	__vpaes_schedule_transform
+.align	4
+__vpaes_schedule_transform:
+	movdqa	-16(%ebp),%xmm2
+	movdqa	%xmm2,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm2,%xmm0
+	movdqa	(%ebx),%xmm2
+.byte	102,15,56,0,208
+	movdqa	16(%ebx),%xmm0
+.byte	102,15,56,0,193
+	pxor	%xmm2,%xmm0
+	ret
+.private_extern	__vpaes_schedule_mangle
+.align	4
+__vpaes_schedule_mangle:
+	movdqa	%xmm0,%xmm4
+	movdqa	128(%ebp),%xmm5
+	testl	%edi,%edi
+	jnz	L014schedule_mangle_dec
+	addl	$16,%edx
+	pxor	336(%ebp),%xmm4
+.byte	102,15,56,0,229
+	movdqa	%xmm4,%xmm3
+.byte	102,15,56,0,229
+	pxor	%xmm4,%xmm3
+.byte	102,15,56,0,229
+	pxor	%xmm4,%xmm3
+	jmp	L015schedule_mangle_both
+.align	4,0x90
+L014schedule_mangle_dec:
+	movdqa	-16(%ebp),%xmm2
+	leal	416(%ebp),%esi
+	movdqa	%xmm2,%xmm1
+	pandn	%xmm4,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm2,%xmm4
+	movdqa	(%esi),%xmm2
+.byte	102,15,56,0,212
+	movdqa	16(%esi),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,221
+	movdqa	32(%esi),%xmm2
+.byte	102,15,56,0,212
+	pxor	%xmm3,%xmm2
+	movdqa	48(%esi),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,221
+	movdqa	64(%esi),%xmm2
+.byte	102,15,56,0,212
+	pxor	%xmm3,%xmm2
+	movdqa	80(%esi),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,221
+	movdqa	96(%esi),%xmm2
+.byte	102,15,56,0,212
+	pxor	%xmm3,%xmm2
+	movdqa	112(%esi),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+	addl	$-16,%edx
+L015schedule_mangle_both:
+	movdqa	256(%ebp,%ecx,1),%xmm1
+.byte	102,15,56,0,217
+	addl	$-16,%ecx
+	andl	$48,%ecx
+	movdqu	%xmm3,(%edx)
+	ret
+.globl	_vpaes_set_encrypt_key
+.private_extern	_vpaes_set_encrypt_key
+.align	4
+_vpaes_set_encrypt_key:
+L_vpaes_set_encrypt_key_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+#ifdef BORINGSSL_DISPATCH_TEST
+	pushl	%ebx
+	pushl	%edx
+	call	L016pic
+L016pic:
+	popl	%ebx
+	leal	_BORINGSSL_function_hit+5-L016pic(%ebx),%ebx
+	movl	$1,%edx
+	movb	%dl,(%ebx)
+	popl	%edx
+	popl	%ebx
+#endif
+	movl	20(%esp),%esi
+	leal	-56(%esp),%ebx
+	movl	24(%esp),%eax
+	andl	$-16,%ebx
+	movl	28(%esp),%edx
+	xchgl	%esp,%ebx
+	movl	%ebx,48(%esp)
+	movl	%eax,%ebx
+	shrl	$5,%ebx
+	addl	$5,%ebx
+	movl	%ebx,240(%edx)
+	movl	$48,%ecx
+	movl	$0,%edi
+	leal	L_vpaes_consts+0x30-L017pic_point,%ebp
+	call	__vpaes_schedule_core
+L017pic_point:
+	movl	48(%esp),%esp
+	xorl	%eax,%eax
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.globl	_vpaes_set_decrypt_key
+.private_extern	_vpaes_set_decrypt_key
+.align	4
+_vpaes_set_decrypt_key:
+L_vpaes_set_decrypt_key_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	leal	-56(%esp),%ebx
+	movl	24(%esp),%eax
+	andl	$-16,%ebx
+	movl	28(%esp),%edx
+	xchgl	%esp,%ebx
+	movl	%ebx,48(%esp)
+	movl	%eax,%ebx
+	shrl	$5,%ebx
+	addl	$5,%ebx
+	movl	%ebx,240(%edx)
+	shll	$4,%ebx
+	leal	16(%edx,%ebx,1),%edx
+	movl	$1,%edi
+	movl	%eax,%ecx
+	shrl	$1,%ecx
+	andl	$32,%ecx
+	xorl	$32,%ecx
+	leal	L_vpaes_consts+0x30-L018pic_point,%ebp
+	call	__vpaes_schedule_core
+L018pic_point:
+	movl	48(%esp),%esp
+	xorl	%eax,%eax
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.globl	_vpaes_encrypt
+.private_extern	_vpaes_encrypt
+.align	4
+_vpaes_encrypt:
+L_vpaes_encrypt_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+#ifdef BORINGSSL_DISPATCH_TEST
+	pushl	%ebx
+	pushl	%edx
+	call	L019pic
+L019pic:
+	popl	%ebx
+	leal	_BORINGSSL_function_hit+4-L019pic(%ebx),%ebx
+	movl	$1,%edx
+	movb	%dl,(%ebx)
+	popl	%edx
+	popl	%ebx
+#endif
+	leal	L_vpaes_consts+0x30-L020pic_point,%ebp
+	call	__vpaes_preheat
+L020pic_point:
+	movl	20(%esp),%esi
+	leal	-56(%esp),%ebx
+	movl	24(%esp),%edi
+	andl	$-16,%ebx
+	movl	28(%esp),%edx
+	xchgl	%esp,%ebx
+	movl	%ebx,48(%esp)
+	movdqu	(%esi),%xmm0
+	call	__vpaes_encrypt_core
+	movdqu	%xmm0,(%edi)
+	movl	48(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.globl	_vpaes_decrypt
+.private_extern	_vpaes_decrypt
+.align	4
+_vpaes_decrypt:
+L_vpaes_decrypt_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	leal	L_vpaes_consts+0x30-L021pic_point,%ebp
+	call	__vpaes_preheat
+L021pic_point:
+	movl	20(%esp),%esi
+	leal	-56(%esp),%ebx
+	movl	24(%esp),%edi
+	andl	$-16,%ebx
+	movl	28(%esp),%edx
+	xchgl	%esp,%ebx
+	movl	%ebx,48(%esp)
+	movdqu	(%esi),%xmm0
+	call	__vpaes_decrypt_core
+	movdqu	%xmm0,(%edi)
+	movl	48(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.globl	_vpaes_cbc_encrypt
+.private_extern	_vpaes_cbc_encrypt
+.align	4
+_vpaes_cbc_encrypt:
+L_vpaes_cbc_encrypt_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	32(%esp),%edx
+	subl	$16,%eax
+	jc	L022cbc_abort
+	leal	-56(%esp),%ebx
+	movl	36(%esp),%ebp
+	andl	$-16,%ebx
+	movl	40(%esp),%ecx
+	xchgl	%esp,%ebx
+	movdqu	(%ebp),%xmm1
+	subl	%esi,%edi
+	movl	%ebx,48(%esp)
+	movl	%edi,(%esp)
+	movl	%edx,4(%esp)
+	movl	%ebp,8(%esp)
+	movl	%eax,%edi
+	leal	L_vpaes_consts+0x30-L023pic_point,%ebp
+	call	__vpaes_preheat
+L023pic_point:
+	cmpl	$0,%ecx
+	je	L024cbc_dec_loop
+	jmp	L025cbc_enc_loop
+.align	4,0x90
+L025cbc_enc_loop:
+	movdqu	(%esi),%xmm0
+	pxor	%xmm1,%xmm0
+	call	__vpaes_encrypt_core
+	movl	(%esp),%ebx
+	movl	4(%esp),%edx
+	movdqa	%xmm0,%xmm1
+	movdqu	%xmm0,(%ebx,%esi,1)
+	leal	16(%esi),%esi
+	subl	$16,%edi
+	jnc	L025cbc_enc_loop
+	jmp	L026cbc_done
+.align	4,0x90
+L024cbc_dec_loop:
+	movdqu	(%esi),%xmm0
+	movdqa	%xmm1,16(%esp)
+	movdqa	%xmm0,32(%esp)
+	call	__vpaes_decrypt_core
+	movl	(%esp),%ebx
+	movl	4(%esp),%edx
+	pxor	16(%esp),%xmm0
+	movdqa	32(%esp),%xmm1
+	movdqu	%xmm0,(%ebx,%esi,1)
+	leal	16(%esi),%esi
+	subl	$16,%edi
+	jnc	L024cbc_dec_loop
+L026cbc_done:
+	movl	8(%esp),%ebx
+	movl	48(%esp),%esp
+	movdqu	%xmm1,(%ebx)
+L022cbc_abort:
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/bcm/vpaes-x86-linux.S b/gen/bcm/vpaes-x86-linux.S
new file mode 100644
index 0000000..02786a7
--- /dev/null
+++ b/gen/bcm/vpaes-x86-linux.S
@@ -0,0 +1,706 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
+.text
+#ifdef BORINGSSL_DISPATCH_TEST
+#endif
+.align	64
+.L_vpaes_consts:
+.long	218628480,235210255,168496130,67568393
+.long	252381056,17041926,33884169,51187212
+.long	252645135,252645135,252645135,252645135
+.long	1512730624,3266504856,1377990664,3401244816
+.long	830229760,1275146365,2969422977,3447763452
+.long	3411033600,2979783055,338359620,2782886510
+.long	4209124096,907596821,221174255,1006095553
+.long	191964160,3799684038,3164090317,1589111125
+.long	182528256,1777043520,2877432650,3265356744
+.long	1874708224,3503451415,3305285752,363511674
+.long	1606117888,3487855781,1093350906,2384367825
+.long	197121,67569157,134941193,202313229
+.long	67569157,134941193,202313229,197121
+.long	134941193,202313229,197121,67569157
+.long	202313229,197121,67569157,134941193
+.long	33619971,100992007,168364043,235736079
+.long	235736079,33619971,100992007,168364043
+.long	168364043,235736079,33619971,100992007
+.long	100992007,168364043,235736079,33619971
+.long	50462976,117835012,185207048,252579084
+.long	252314880,51251460,117574920,184942860
+.long	184682752,252054788,50987272,118359308
+.long	118099200,185467140,251790600,50727180
+.long	2946363062,528716217,1300004225,1881839624
+.long	1532713819,1532713819,1532713819,1532713819
+.long	3602276352,4288629033,3737020424,4153884961
+.long	1354558464,32357713,2958822624,3775749553
+.long	1201988352,132424512,1572796698,503232858
+.long	2213177600,1597421020,4103937655,675398315
+.long	2749646592,4273543773,1511898873,121693092
+.long	3040248576,1103263732,2871565598,1608280554
+.long	2236667136,2588920351,482954393,64377734
+.long	3069987328,291237287,2117370568,3650299247
+.long	533321216,3573750986,2572112006,1401264716
+.long	1339849704,2721158661,548607111,3445553514
+.long	2128193280,3054596040,2183486460,1257083700
+.long	655635200,1165381986,3923443150,2344132524
+.long	190078720,256924420,290342170,357187870
+.long	1610966272,2263057382,4103205268,309794674
+.long	2592527872,2233205587,1335446729,3402964816
+.long	3973531904,3225098121,3002836325,1918774430
+.long	3870401024,2102906079,2284471353,4117666579
+.long	617007872,1021508343,366931923,691083277
+.long	2528395776,3491914898,2968704004,1613121270
+.long	3445188352,3247741094,844474987,4093578302
+.long	651481088,1190302358,1689581232,574775300
+.long	4289380608,206939853,2555985458,2489840491
+.long	2130264064,327674451,3566485037,3349835193
+.long	2470714624,316102159,3636825756,3393945945
+.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105
+.byte	111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83
+.byte	83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117
+.byte	114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105
+.byte	118,101,114,115,105,116,121,41,0
+.align	64
+.hidden	_vpaes_preheat
+.type	_vpaes_preheat,@function
+.align	16
+_vpaes_preheat:
+	addl	(%esp),%ebp
+	movdqa	-48(%ebp),%xmm7
+	movdqa	-16(%ebp),%xmm6
+	ret
+.size	_vpaes_preheat,.-_vpaes_preheat
+.hidden	_vpaes_encrypt_core
+.type	_vpaes_encrypt_core,@function
+.align	16
+_vpaes_encrypt_core:
+	movl	$16,%ecx
+	movl	240(%edx),%eax
+	movdqa	%xmm6,%xmm1
+	movdqa	(%ebp),%xmm2
+	pandn	%xmm0,%xmm1
+	pand	%xmm6,%xmm0
+	movdqu	(%edx),%xmm5
+.byte	102,15,56,0,208
+	movdqa	16(%ebp),%xmm0
+	pxor	%xmm5,%xmm2
+	psrld	$4,%xmm1
+	addl	$16,%edx
+.byte	102,15,56,0,193
+	leal	192(%ebp),%ebx
+	pxor	%xmm2,%xmm0
+	jmp	.L000enc_entry
+.align	16
+.L001enc_loop:
+	movdqa	32(%ebp),%xmm4
+	movdqa	48(%ebp),%xmm0
+.byte	102,15,56,0,226
+.byte	102,15,56,0,195
+	pxor	%xmm5,%xmm4
+	movdqa	64(%ebp),%xmm5
+	pxor	%xmm4,%xmm0
+	movdqa	-64(%ebx,%ecx,1),%xmm1
+.byte	102,15,56,0,234
+	movdqa	80(%ebp),%xmm2
+	movdqa	(%ebx,%ecx,1),%xmm4
+.byte	102,15,56,0,211
+	movdqa	%xmm0,%xmm3
+	pxor	%xmm5,%xmm2
+.byte	102,15,56,0,193
+	addl	$16,%edx
+	pxor	%xmm2,%xmm0
+.byte	102,15,56,0,220
+	addl	$16,%ecx
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,193
+	andl	$48,%ecx
+	subl	$1,%eax
+	pxor	%xmm3,%xmm0
+.L000enc_entry:
+	movdqa	%xmm6,%xmm1
+	movdqa	-32(%ebp),%xmm5
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm6,%xmm0
+.byte	102,15,56,0,232
+	movdqa	%xmm7,%xmm3
+	pxor	%xmm1,%xmm0
+.byte	102,15,56,0,217
+	movdqa	%xmm7,%xmm4
+	pxor	%xmm5,%xmm3
+.byte	102,15,56,0,224
+	movdqa	%xmm7,%xmm2
+	pxor	%xmm5,%xmm4
+.byte	102,15,56,0,211
+	movdqa	%xmm7,%xmm3
+	pxor	%xmm0,%xmm2
+.byte	102,15,56,0,220
+	movdqu	(%edx),%xmm5
+	pxor	%xmm1,%xmm3
+	jnz	.L001enc_loop
+	movdqa	96(%ebp),%xmm4
+	movdqa	112(%ebp),%xmm0
+.byte	102,15,56,0,226
+	pxor	%xmm5,%xmm4
+.byte	102,15,56,0,195
+	movdqa	64(%ebx,%ecx,1),%xmm1
+	pxor	%xmm4,%xmm0
+.byte	102,15,56,0,193
+	ret
+.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
+.hidden	_vpaes_decrypt_core
+.type	_vpaes_decrypt_core,@function
+.align	16
+_vpaes_decrypt_core:
+	leal	608(%ebp),%ebx
+	movl	240(%edx),%eax
+	movdqa	%xmm6,%xmm1
+	movdqa	-64(%ebx),%xmm2
+	pandn	%xmm0,%xmm1
+	movl	%eax,%ecx
+	psrld	$4,%xmm1
+	movdqu	(%edx),%xmm5
+	shll	$4,%ecx
+	pand	%xmm6,%xmm0
+.byte	102,15,56,0,208
+	movdqa	-48(%ebx),%xmm0
+	xorl	$48,%ecx
+.byte	102,15,56,0,193
+	andl	$48,%ecx
+	pxor	%xmm5,%xmm2
+	movdqa	176(%ebp),%xmm5
+	pxor	%xmm2,%xmm0
+	addl	$16,%edx
+	leal	-352(%ebx,%ecx,1),%ecx
+	jmp	.L002dec_entry
+.align	16
+.L003dec_loop:
+	movdqa	-32(%ebx),%xmm4
+	movdqa	-16(%ebx),%xmm1
+.byte	102,15,56,0,226
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	movdqa	(%ebx),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	16(%ebx),%xmm1
+.byte	102,15,56,0,226
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	movdqa	32(%ebx),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	48(%ebx),%xmm1
+.byte	102,15,56,0,226
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	movdqa	64(%ebx),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	80(%ebx),%xmm1
+.byte	102,15,56,0,226
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	addl	$16,%edx
+.byte	102,15,58,15,237,12
+	pxor	%xmm1,%xmm0
+	subl	$1,%eax
+.L002dec_entry:
+	movdqa	%xmm6,%xmm1
+	movdqa	-32(%ebp),%xmm2
+	pandn	%xmm0,%xmm1
+	pand	%xmm6,%xmm0
+	psrld	$4,%xmm1
+.byte	102,15,56,0,208
+	movdqa	%xmm7,%xmm3
+	pxor	%xmm1,%xmm0
+.byte	102,15,56,0,217
+	movdqa	%xmm7,%xmm4
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,224
+	pxor	%xmm2,%xmm4
+	movdqa	%xmm7,%xmm2
+.byte	102,15,56,0,211
+	movdqa	%xmm7,%xmm3
+	pxor	%xmm0,%xmm2
+.byte	102,15,56,0,220
+	movdqu	(%edx),%xmm0
+	pxor	%xmm1,%xmm3
+	jnz	.L003dec_loop
+	movdqa	96(%ebx),%xmm4
+.byte	102,15,56,0,226
+	pxor	%xmm0,%xmm4
+	movdqa	112(%ebx),%xmm0
+	movdqa	(%ecx),%xmm2
+.byte	102,15,56,0,195
+	pxor	%xmm4,%xmm0
+.byte	102,15,56,0,194
+	ret
+.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
+.hidden	_vpaes_schedule_core
+.type	_vpaes_schedule_core,@function
+.align	16
+_vpaes_schedule_core:
+	addl	(%esp),%ebp
+	movdqu	(%esi),%xmm0
+	movdqa	320(%ebp),%xmm2
+	movdqa	%xmm0,%xmm3
+	leal	(%ebp),%ebx
+	movdqa	%xmm2,4(%esp)
+	call	_vpaes_schedule_transform
+	movdqa	%xmm0,%xmm7
+	testl	%edi,%edi
+	jnz	.L004schedule_am_decrypting
+	movdqu	%xmm0,(%edx)
+	jmp	.L005schedule_go
+.L004schedule_am_decrypting:
+	movdqa	256(%ebp,%ecx,1),%xmm1
+.byte	102,15,56,0,217
+	movdqu	%xmm3,(%edx)
+	xorl	$48,%ecx
+.L005schedule_go:
+	cmpl	$192,%eax
+	ja	.L006schedule_256
+	je	.L007schedule_192
+.L008schedule_128:
+	movl	$10,%eax
+.L009loop_schedule_128:
+	call	_vpaes_schedule_round
+	decl	%eax
+	jz	.L010schedule_mangle_last
+	call	_vpaes_schedule_mangle
+	jmp	.L009loop_schedule_128
+.align	16
+.L007schedule_192:
+	movdqu	8(%esi),%xmm0
+	call	_vpaes_schedule_transform
+	movdqa	%xmm0,%xmm6
+	pxor	%xmm4,%xmm4
+	movhlps	%xmm4,%xmm6
+	movl	$4,%eax
+.L011loop_schedule_192:
+	call	_vpaes_schedule_round
+.byte	102,15,58,15,198,8
+	call	_vpaes_schedule_mangle
+	call	_vpaes_schedule_192_smear
+	call	_vpaes_schedule_mangle
+	call	_vpaes_schedule_round
+	decl	%eax
+	jz	.L010schedule_mangle_last
+	call	_vpaes_schedule_mangle
+	call	_vpaes_schedule_192_smear
+	jmp	.L011loop_schedule_192
+.align	16
+.L006schedule_256:
+	movdqu	16(%esi),%xmm0
+	call	_vpaes_schedule_transform
+	movl	$7,%eax
+.L012loop_schedule_256:
+	call	_vpaes_schedule_mangle
+	movdqa	%xmm0,%xmm6
+	call	_vpaes_schedule_round
+	decl	%eax
+	jz	.L010schedule_mangle_last
+	call	_vpaes_schedule_mangle
+	pshufd	$255,%xmm0,%xmm0
+	movdqa	%xmm7,20(%esp)
+	movdqa	%xmm6,%xmm7
+	call	.L_vpaes_schedule_low_round
+	movdqa	20(%esp),%xmm7
+	jmp	.L012loop_schedule_256
+.align	16
+.L010schedule_mangle_last:
+	leal	384(%ebp),%ebx
+	testl	%edi,%edi
+	jnz	.L013schedule_mangle_last_dec
+	movdqa	256(%ebp,%ecx,1),%xmm1
+.byte	102,15,56,0,193
+	leal	352(%ebp),%ebx
+	addl	$32,%edx
+.L013schedule_mangle_last_dec:
+	addl	$-16,%edx
+	pxor	336(%ebp),%xmm0
+	call	_vpaes_schedule_transform
+	movdqu	%xmm0,(%edx)
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	ret
+.size	_vpaes_schedule_core,.-_vpaes_schedule_core
+.hidden	_vpaes_schedule_192_smear
+.type	_vpaes_schedule_192_smear,@function
+.align	16
+_vpaes_schedule_192_smear:
+	pshufd	$128,%xmm6,%xmm1
+	pshufd	$254,%xmm7,%xmm0
+	pxor	%xmm1,%xmm6
+	pxor	%xmm1,%xmm1
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm6,%xmm0
+	movhlps	%xmm1,%xmm6
+	ret
+.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+.hidden	_vpaes_schedule_round
+.type	_vpaes_schedule_round,@function
+.align	16
+_vpaes_schedule_round:
+	movdqa	8(%esp),%xmm2
+	pxor	%xmm1,%xmm1
+.byte	102,15,58,15,202,15
+.byte	102,15,58,15,210,15
+	pxor	%xmm1,%xmm7
+	pshufd	$255,%xmm0,%xmm0
+.byte	102,15,58,15,192,1
+	movdqa	%xmm2,8(%esp)
+.L_vpaes_schedule_low_round:
+	movdqa	%xmm7,%xmm1
+	pslldq	$4,%xmm7
+	pxor	%xmm1,%xmm7
+	movdqa	%xmm7,%xmm1
+	pslldq	$8,%xmm7
+	pxor	%xmm1,%xmm7
+	pxor	336(%ebp),%xmm7
+	movdqa	-16(%ebp),%xmm4
+	movdqa	-48(%ebp),%xmm5
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm4,%xmm0
+	movdqa	-32(%ebp),%xmm2
+.byte	102,15,56,0,208
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm5,%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+	movdqa	%xmm5,%xmm4
+.byte	102,15,56,0,224
+	pxor	%xmm2,%xmm4
+	movdqa	%xmm5,%xmm2
+.byte	102,15,56,0,211
+	pxor	%xmm0,%xmm2
+	movdqa	%xmm5,%xmm3
+.byte	102,15,56,0,220
+	pxor	%xmm1,%xmm3
+	movdqa	32(%ebp),%xmm4
+.byte	102,15,56,0,226
+	movdqa	48(%ebp),%xmm0
+.byte	102,15,56,0,195
+	pxor	%xmm4,%xmm0
+	pxor	%xmm7,%xmm0
+	movdqa	%xmm0,%xmm7
+	ret
+.size	_vpaes_schedule_round,.-_vpaes_schedule_round
+.hidden	_vpaes_schedule_transform
+.type	_vpaes_schedule_transform,@function
+.align	16
+_vpaes_schedule_transform:
+	movdqa	-16(%ebp),%xmm2
+	movdqa	%xmm2,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm2,%xmm0
+	movdqa	(%ebx),%xmm2
+.byte	102,15,56,0,208
+	movdqa	16(%ebx),%xmm0
+.byte	102,15,56,0,193
+	pxor	%xmm2,%xmm0
+	ret
+.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
+.hidden	_vpaes_schedule_mangle
+.type	_vpaes_schedule_mangle,@function
+.align	16
+_vpaes_schedule_mangle:
+	movdqa	%xmm0,%xmm4
+	movdqa	128(%ebp),%xmm5
+	testl	%edi,%edi
+	jnz	.L014schedule_mangle_dec
+	addl	$16,%edx
+	pxor	336(%ebp),%xmm4
+.byte	102,15,56,0,229
+	movdqa	%xmm4,%xmm3
+.byte	102,15,56,0,229
+	pxor	%xmm4,%xmm3
+.byte	102,15,56,0,229
+	pxor	%xmm4,%xmm3
+	jmp	.L015schedule_mangle_both
+.align	16
+.L014schedule_mangle_dec:
+	movdqa	-16(%ebp),%xmm2
+	leal	416(%ebp),%esi
+	movdqa	%xmm2,%xmm1
+	pandn	%xmm4,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm2,%xmm4
+	movdqa	(%esi),%xmm2
+.byte	102,15,56,0,212
+	movdqa	16(%esi),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,221
+	movdqa	32(%esi),%xmm2
+.byte	102,15,56,0,212
+	pxor	%xmm3,%xmm2
+	movdqa	48(%esi),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,221
+	movdqa	64(%esi),%xmm2
+.byte	102,15,56,0,212
+	pxor	%xmm3,%xmm2
+	movdqa	80(%esi),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,221
+	movdqa	96(%esi),%xmm2
+.byte	102,15,56,0,212
+	pxor	%xmm3,%xmm2
+	movdqa	112(%esi),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+	addl	$-16,%edx
+.L015schedule_mangle_both:
+	movdqa	256(%ebp,%ecx,1),%xmm1
+.byte	102,15,56,0,217
+	addl	$-16,%ecx
+	andl	$48,%ecx
+	movdqu	%xmm3,(%edx)
+	ret
+.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+.globl	vpaes_set_encrypt_key
+.hidden	vpaes_set_encrypt_key
+.type	vpaes_set_encrypt_key,@function
+.align	16
+vpaes_set_encrypt_key:
+.L_vpaes_set_encrypt_key_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+#ifdef BORINGSSL_DISPATCH_TEST
+	pushl	%ebx
+	pushl	%edx
+	call	.L016pic
+.L016pic:
+	popl	%ebx
+	leal	BORINGSSL_function_hit+5-.L016pic(%ebx),%ebx
+	movl	$1,%edx
+	movb	%dl,(%ebx)
+	popl	%edx
+	popl	%ebx
+#endif
+	movl	20(%esp),%esi
+	leal	-56(%esp),%ebx
+	movl	24(%esp),%eax
+	andl	$-16,%ebx
+	movl	28(%esp),%edx
+	xchgl	%esp,%ebx
+	movl	%ebx,48(%esp)
+	movl	%eax,%ebx
+	shrl	$5,%ebx
+	addl	$5,%ebx
+	movl	%ebx,240(%edx)
+	movl	$48,%ecx
+	movl	$0,%edi
+	leal	.L_vpaes_consts+0x30-.L017pic_point,%ebp
+	call	_vpaes_schedule_core
+.L017pic_point:
+	movl	48(%esp),%esp
+	xorl	%eax,%eax
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	vpaes_set_encrypt_key,.-.L_vpaes_set_encrypt_key_begin
+.globl	vpaes_set_decrypt_key
+.hidden	vpaes_set_decrypt_key
+.type	vpaes_set_decrypt_key,@function
+.align	16
+vpaes_set_decrypt_key:
+.L_vpaes_set_decrypt_key_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	leal	-56(%esp),%ebx
+	movl	24(%esp),%eax
+	andl	$-16,%ebx
+	movl	28(%esp),%edx
+	xchgl	%esp,%ebx
+	movl	%ebx,48(%esp)
+	movl	%eax,%ebx
+	shrl	$5,%ebx
+	addl	$5,%ebx
+	movl	%ebx,240(%edx)
+	shll	$4,%ebx
+	leal	16(%edx,%ebx,1),%edx
+	movl	$1,%edi
+	movl	%eax,%ecx
+	shrl	$1,%ecx
+	andl	$32,%ecx
+	xorl	$32,%ecx
+	leal	.L_vpaes_consts+0x30-.L018pic_point,%ebp
+	call	_vpaes_schedule_core
+.L018pic_point:
+	movl	48(%esp),%esp
+	xorl	%eax,%eax
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	vpaes_set_decrypt_key,.-.L_vpaes_set_decrypt_key_begin
+.globl	vpaes_encrypt
+.hidden	vpaes_encrypt
+.type	vpaes_encrypt,@function
+.align	16
+vpaes_encrypt:
+.L_vpaes_encrypt_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+#ifdef BORINGSSL_DISPATCH_TEST
+	pushl	%ebx
+	pushl	%edx
+	call	.L019pic
+.L019pic:
+	popl	%ebx
+	leal	BORINGSSL_function_hit+4-.L019pic(%ebx),%ebx
+	movl	$1,%edx
+	movb	%dl,(%ebx)
+	popl	%edx
+	popl	%ebx
+#endif
+	leal	.L_vpaes_consts+0x30-.L020pic_point,%ebp
+	call	_vpaes_preheat
+.L020pic_point:
+	movl	20(%esp),%esi
+	leal	-56(%esp),%ebx
+	movl	24(%esp),%edi
+	andl	$-16,%ebx
+	movl	28(%esp),%edx
+	xchgl	%esp,%ebx
+	movl	%ebx,48(%esp)
+	movdqu	(%esi),%xmm0
+	call	_vpaes_encrypt_core
+	movdqu	%xmm0,(%edi)
+	movl	48(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	vpaes_encrypt,.-.L_vpaes_encrypt_begin
+.globl	vpaes_decrypt
+.hidden	vpaes_decrypt
+.type	vpaes_decrypt,@function
+.align	16
+vpaes_decrypt:
+.L_vpaes_decrypt_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	leal	.L_vpaes_consts+0x30-.L021pic_point,%ebp
+	call	_vpaes_preheat
+.L021pic_point:
+	movl	20(%esp),%esi
+	leal	-56(%esp),%ebx
+	movl	24(%esp),%edi
+	andl	$-16,%ebx
+	movl	28(%esp),%edx
+	xchgl	%esp,%ebx
+	movl	%ebx,48(%esp)
+	movdqu	(%esi),%xmm0
+	call	_vpaes_decrypt_core
+	movdqu	%xmm0,(%edi)
+	movl	48(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	vpaes_decrypt,.-.L_vpaes_decrypt_begin
+.globl	vpaes_cbc_encrypt
+.hidden	vpaes_cbc_encrypt
+.type	vpaes_cbc_encrypt,@function
+.align	16
+vpaes_cbc_encrypt:
+.L_vpaes_cbc_encrypt_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	32(%esp),%edx
+	subl	$16,%eax
+	jc	.L022cbc_abort
+	leal	-56(%esp),%ebx
+	movl	36(%esp),%ebp
+	andl	$-16,%ebx
+	movl	40(%esp),%ecx
+	xchgl	%esp,%ebx
+	movdqu	(%ebp),%xmm1
+	subl	%esi,%edi
+	movl	%ebx,48(%esp)
+	movl	%edi,(%esp)
+	movl	%edx,4(%esp)
+	movl	%ebp,8(%esp)
+	movl	%eax,%edi
+	leal	.L_vpaes_consts+0x30-.L023pic_point,%ebp
+	call	_vpaes_preheat
+.L023pic_point:
+	cmpl	$0,%ecx
+	je	.L024cbc_dec_loop
+	jmp	.L025cbc_enc_loop
+.align	16
+.L025cbc_enc_loop:
+	movdqu	(%esi),%xmm0
+	pxor	%xmm1,%xmm0
+	call	_vpaes_encrypt_core
+	movl	(%esp),%ebx
+	movl	4(%esp),%edx
+	movdqa	%xmm0,%xmm1
+	movdqu	%xmm0,(%ebx,%esi,1)
+	leal	16(%esi),%esi
+	subl	$16,%edi
+	jnc	.L025cbc_enc_loop
+	jmp	.L026cbc_done
+.align	16
+.L024cbc_dec_loop:
+	movdqu	(%esi),%xmm0
+	movdqa	%xmm1,16(%esp)
+	movdqa	%xmm0,32(%esp)
+	call	_vpaes_decrypt_core
+	movl	(%esp),%ebx
+	movl	4(%esp),%edx
+	pxor	16(%esp),%xmm0
+	movdqa	32(%esp),%xmm1
+	movdqu	%xmm0,(%ebx,%esi,1)
+	leal	16(%esi),%esi
+	subl	$16,%edi
+	jnc	.L024cbc_dec_loop
+.L026cbc_done:
+	movl	8(%esp),%ebx
+	movl	48(%esp),%esp
+	movdqu	%xmm1,(%ebx)
+.L022cbc_abort:
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	vpaes_cbc_encrypt,.-.L_vpaes_cbc_encrypt_begin
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
diff --git a/gen/bcm/vpaes-x86-win.asm b/gen/bcm/vpaes-x86-win.asm
new file mode 100644
index 0000000..661496e
--- /dev/null
+++ b/gen/bcm/vpaes-x86-win.asm
@@ -0,0 +1,679 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__, win32
+%ifidn __OUTPUT_FORMAT__,obj
+section	code	use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+$@feat.00 equ 1
+section	.text	code align=64
+%else
+section	.text	code
+%endif
+%ifdef BORINGSSL_DISPATCH_TEST
+extern	_BORINGSSL_function_hit
+%endif
+align	64
+L$_vpaes_consts:
+dd	218628480,235210255,168496130,67568393
+dd	252381056,17041926,33884169,51187212
+dd	252645135,252645135,252645135,252645135
+dd	1512730624,3266504856,1377990664,3401244816
+dd	830229760,1275146365,2969422977,3447763452
+dd	3411033600,2979783055,338359620,2782886510
+dd	4209124096,907596821,221174255,1006095553
+dd	191964160,3799684038,3164090317,1589111125
+dd	182528256,1777043520,2877432650,3265356744
+dd	1874708224,3503451415,3305285752,363511674
+dd	1606117888,3487855781,1093350906,2384367825
+dd	197121,67569157,134941193,202313229
+dd	67569157,134941193,202313229,197121
+dd	134941193,202313229,197121,67569157
+dd	202313229,197121,67569157,134941193
+dd	33619971,100992007,168364043,235736079
+dd	235736079,33619971,100992007,168364043
+dd	168364043,235736079,33619971,100992007
+dd	100992007,168364043,235736079,33619971
+dd	50462976,117835012,185207048,252579084
+dd	252314880,51251460,117574920,184942860
+dd	184682752,252054788,50987272,118359308
+dd	118099200,185467140,251790600,50727180
+dd	2946363062,528716217,1300004225,1881839624
+dd	1532713819,1532713819,1532713819,1532713819
+dd	3602276352,4288629033,3737020424,4153884961
+dd	1354558464,32357713,2958822624,3775749553
+dd	1201988352,132424512,1572796698,503232858
+dd	2213177600,1597421020,4103937655,675398315
+dd	2749646592,4273543773,1511898873,121693092
+dd	3040248576,1103263732,2871565598,1608280554
+dd	2236667136,2588920351,482954393,64377734
+dd	3069987328,291237287,2117370568,3650299247
+dd	533321216,3573750986,2572112006,1401264716
+dd	1339849704,2721158661,548607111,3445553514
+dd	2128193280,3054596040,2183486460,1257083700
+dd	655635200,1165381986,3923443150,2344132524
+dd	190078720,256924420,290342170,357187870
+dd	1610966272,2263057382,4103205268,309794674
+dd	2592527872,2233205587,1335446729,3402964816
+dd	3973531904,3225098121,3002836325,1918774430
+dd	3870401024,2102906079,2284471353,4117666579
+dd	617007872,1021508343,366931923,691083277
+dd	2528395776,3491914898,2968704004,1613121270
+dd	3445188352,3247741094,844474987,4093578302
+dd	651481088,1190302358,1689581232,574775300
+dd	4289380608,206939853,2555985458,2489840491
+dd	2130264064,327674451,3566485037,3349835193
+dd	2470714624,316102159,3636825756,3393945945
+db	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105
+db	111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83
+db	83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117
+db	114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105
+db	118,101,114,115,105,116,121,41,0
+align	64
+align	16
+__vpaes_preheat:
+	add	ebp,DWORD [esp]
+	movdqa	xmm7,[ebp-48]
+	movdqa	xmm6,[ebp-16]
+	ret
+align	16
+__vpaes_encrypt_core:
+	mov	ecx,16
+	mov	eax,DWORD [240+edx]
+	movdqa	xmm1,xmm6
+	movdqa	xmm2,[ebp]
+	pandn	xmm1,xmm0
+	pand	xmm0,xmm6
+	movdqu	xmm5,[edx]
+db	102,15,56,0,208
+	movdqa	xmm0,[16+ebp]
+	pxor	xmm2,xmm5
+	psrld	xmm1,4
+	add	edx,16
+db	102,15,56,0,193
+	lea	ebx,[192+ebp]
+	pxor	xmm0,xmm2
+	jmp	NEAR L$000enc_entry
+align	16
+L$001enc_loop:
+	movdqa	xmm4,[32+ebp]
+	movdqa	xmm0,[48+ebp]
+db	102,15,56,0,226
+db	102,15,56,0,195
+	pxor	xmm4,xmm5
+	movdqa	xmm5,[64+ebp]
+	pxor	xmm0,xmm4
+	movdqa	xmm1,[ecx*1+ebx-64]
+db	102,15,56,0,234
+	movdqa	xmm2,[80+ebp]
+	movdqa	xmm4,[ecx*1+ebx]
+db	102,15,56,0,211
+	movdqa	xmm3,xmm0
+	pxor	xmm2,xmm5
+db	102,15,56,0,193
+	add	edx,16
+	pxor	xmm0,xmm2
+db	102,15,56,0,220
+	add	ecx,16
+	pxor	xmm3,xmm0
+db	102,15,56,0,193
+	and	ecx,48
+	sub	eax,1
+	pxor	xmm0,xmm3
+L$000enc_entry:
+	movdqa	xmm1,xmm6
+	movdqa	xmm5,[ebp-32]
+	pandn	xmm1,xmm0
+	psrld	xmm1,4
+	pand	xmm0,xmm6
+db	102,15,56,0,232
+	movdqa	xmm3,xmm7
+	pxor	xmm0,xmm1
+db	102,15,56,0,217
+	movdqa	xmm4,xmm7
+	pxor	xmm3,xmm5
+db	102,15,56,0,224
+	movdqa	xmm2,xmm7
+	pxor	xmm4,xmm5
+db	102,15,56,0,211
+	movdqa	xmm3,xmm7
+	pxor	xmm2,xmm0
+db	102,15,56,0,220
+	movdqu	xmm5,[edx]
+	pxor	xmm3,xmm1
+	jnz	NEAR L$001enc_loop
+	movdqa	xmm4,[96+ebp]
+	movdqa	xmm0,[112+ebp]
+db	102,15,56,0,226
+	pxor	xmm4,xmm5
+db	102,15,56,0,195
+	movdqa	xmm1,[64+ecx*1+ebx]
+	pxor	xmm0,xmm4
+db	102,15,56,0,193
+	ret
+align	16
+__vpaes_decrypt_core:
+	lea	ebx,[608+ebp]
+	mov	eax,DWORD [240+edx]
+	movdqa	xmm1,xmm6
+	movdqa	xmm2,[ebx-64]
+	pandn	xmm1,xmm0
+	mov	ecx,eax
+	psrld	xmm1,4
+	movdqu	xmm5,[edx]
+	shl	ecx,4
+	pand	xmm0,xmm6
+db	102,15,56,0,208
+	movdqa	xmm0,[ebx-48]
+	xor	ecx,48
+db	102,15,56,0,193
+	and	ecx,48
+	pxor	xmm2,xmm5
+	movdqa	xmm5,[176+ebp]
+	pxor	xmm0,xmm2
+	add	edx,16
+	lea	ecx,[ecx*1+ebx-352]
+	jmp	NEAR L$002dec_entry
+align	16
+L$003dec_loop:
+	movdqa	xmm4,[ebx-32]
+	movdqa	xmm1,[ebx-16]
+db	102,15,56,0,226
+db	102,15,56,0,203
+	pxor	xmm0,xmm4
+	movdqa	xmm4,[ebx]
+	pxor	xmm0,xmm1
+	movdqa	xmm1,[16+ebx]
+db	102,15,56,0,226
+db	102,15,56,0,197
+db	102,15,56,0,203
+	pxor	xmm0,xmm4
+	movdqa	xmm4,[32+ebx]
+	pxor	xmm0,xmm1
+	movdqa	xmm1,[48+ebx]
+db	102,15,56,0,226
+db	102,15,56,0,197
+db	102,15,56,0,203
+	pxor	xmm0,xmm4
+	movdqa	xmm4,[64+ebx]
+	pxor	xmm0,xmm1
+	movdqa	xmm1,[80+ebx]
+db	102,15,56,0,226
+db	102,15,56,0,197
+db	102,15,56,0,203
+	pxor	xmm0,xmm4
+	add	edx,16
+db	102,15,58,15,237,12
+	pxor	xmm0,xmm1
+	sub	eax,1
+L$002dec_entry:
+	movdqa	xmm1,xmm6
+	movdqa	xmm2,[ebp-32]
+	pandn	xmm1,xmm0
+	pand	xmm0,xmm6
+	psrld	xmm1,4
+db	102,15,56,0,208
+	movdqa	xmm3,xmm7
+	pxor	xmm0,xmm1
+db	102,15,56,0,217
+	movdqa	xmm4,xmm7
+	pxor	xmm3,xmm2
+db	102,15,56,0,224
+	pxor	xmm4,xmm2
+	movdqa	xmm2,xmm7
+db	102,15,56,0,211
+	movdqa	xmm3,xmm7
+	pxor	xmm2,xmm0
+db	102,15,56,0,220
+	movdqu	xmm0,[edx]
+	pxor	xmm3,xmm1
+	jnz	NEAR L$003dec_loop
+	movdqa	xmm4,[96+ebx]
+db	102,15,56,0,226
+	pxor	xmm4,xmm0
+	movdqa	xmm0,[112+ebx]
+	movdqa	xmm2,[ecx]
+db	102,15,56,0,195
+	pxor	xmm0,xmm4
+db	102,15,56,0,194
+	ret
+align	16
+__vpaes_schedule_core:
+	add	ebp,DWORD [esp]
+	movdqu	xmm0,[esi]
+	movdqa	xmm2,[320+ebp]
+	movdqa	xmm3,xmm0
+	lea	ebx,[ebp]
+	movdqa	[4+esp],xmm2
+	call	__vpaes_schedule_transform
+	movdqa	xmm7,xmm0
+	test	edi,edi
+	jnz	NEAR L$004schedule_am_decrypting
+	movdqu	[edx],xmm0
+	jmp	NEAR L$005schedule_go
+L$004schedule_am_decrypting:
+	movdqa	xmm1,[256+ecx*1+ebp]
+db	102,15,56,0,217
+	movdqu	[edx],xmm3
+	xor	ecx,48
+L$005schedule_go:
+	cmp	eax,192
+	ja	NEAR L$006schedule_256
+	je	NEAR L$007schedule_192
+L$008schedule_128:
+	mov	eax,10
+L$009loop_schedule_128:
+	call	__vpaes_schedule_round
+	dec	eax
+	jz	NEAR L$010schedule_mangle_last
+	call	__vpaes_schedule_mangle
+	jmp	NEAR L$009loop_schedule_128
+align	16
+L$007schedule_192:
+	movdqu	xmm0,[8+esi]
+	call	__vpaes_schedule_transform
+	movdqa	xmm6,xmm0
+	pxor	xmm4,xmm4
+	movhlps	xmm6,xmm4
+	mov	eax,4
+L$011loop_schedule_192:
+	call	__vpaes_schedule_round
+db	102,15,58,15,198,8
+	call	__vpaes_schedule_mangle
+	call	__vpaes_schedule_192_smear
+	call	__vpaes_schedule_mangle
+	call	__vpaes_schedule_round
+	dec	eax
+	jz	NEAR L$010schedule_mangle_last
+	call	__vpaes_schedule_mangle
+	call	__vpaes_schedule_192_smear
+	jmp	NEAR L$011loop_schedule_192
+align	16
+L$006schedule_256:
+	movdqu	xmm0,[16+esi]
+	call	__vpaes_schedule_transform
+	mov	eax,7
+L$012loop_schedule_256:
+	call	__vpaes_schedule_mangle
+	movdqa	xmm6,xmm0
+	call	__vpaes_schedule_round
+	dec	eax
+	jz	NEAR L$010schedule_mangle_last
+	call	__vpaes_schedule_mangle
+	pshufd	xmm0,xmm0,255
+	movdqa	[20+esp],xmm7
+	movdqa	xmm7,xmm6
+	call	L$_vpaes_schedule_low_round
+	movdqa	xmm7,[20+esp]
+	jmp	NEAR L$012loop_schedule_256
+align	16
+L$010schedule_mangle_last:
+	lea	ebx,[384+ebp]
+	test	edi,edi
+	jnz	NEAR L$013schedule_mangle_last_dec
+	movdqa	xmm1,[256+ecx*1+ebp]
+db	102,15,56,0,193
+	lea	ebx,[352+ebp]
+	add	edx,32
+L$013schedule_mangle_last_dec:
+	add	edx,-16
+	pxor	xmm0,[336+ebp]
+	call	__vpaes_schedule_transform
+	movdqu	[edx],xmm0
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	pxor	xmm6,xmm6
+	pxor	xmm7,xmm7
+	ret
+align	16
+__vpaes_schedule_192_smear:
+	pshufd	xmm1,xmm6,128
+	pshufd	xmm0,xmm7,254
+	pxor	xmm6,xmm1
+	pxor	xmm1,xmm1
+	pxor	xmm6,xmm0
+	movdqa	xmm0,xmm6
+	movhlps	xmm6,xmm1
+	ret
+align	16
+__vpaes_schedule_round:
+	movdqa	xmm2,[8+esp]
+	pxor	xmm1,xmm1
+db	102,15,58,15,202,15
+db	102,15,58,15,210,15
+	pxor	xmm7,xmm1
+	pshufd	xmm0,xmm0,255
+db	102,15,58,15,192,1
+	movdqa	[8+esp],xmm2
+L$_vpaes_schedule_low_round:
+	movdqa	xmm1,xmm7
+	pslldq	xmm7,4
+	pxor	xmm7,xmm1
+	movdqa	xmm1,xmm7
+	pslldq	xmm7,8
+	pxor	xmm7,xmm1
+	pxor	xmm7,[336+ebp]
+	movdqa	xmm4,[ebp-16]
+	movdqa	xmm5,[ebp-48]
+	movdqa	xmm1,xmm4
+	pandn	xmm1,xmm0
+	psrld	xmm1,4
+	pand	xmm0,xmm4
+	movdqa	xmm2,[ebp-32]
+db	102,15,56,0,208
+	pxor	xmm0,xmm1
+	movdqa	xmm3,xmm5
+db	102,15,56,0,217
+	pxor	xmm3,xmm2
+	movdqa	xmm4,xmm5
+db	102,15,56,0,224
+	pxor	xmm4,xmm2
+	movdqa	xmm2,xmm5
+db	102,15,56,0,211
+	pxor	xmm2,xmm0
+	movdqa	xmm3,xmm5
+db	102,15,56,0,220
+	pxor	xmm3,xmm1
+	movdqa	xmm4,[32+ebp]
+db	102,15,56,0,226
+	movdqa	xmm0,[48+ebp]
+db	102,15,56,0,195
+	pxor	xmm0,xmm4
+	pxor	xmm0,xmm7
+	movdqa	xmm7,xmm0
+	ret
+align	16
+__vpaes_schedule_transform:
+	movdqa	xmm2,[ebp-16]
+	movdqa	xmm1,xmm2
+	pandn	xmm1,xmm0
+	psrld	xmm1,4
+	pand	xmm0,xmm2
+	movdqa	xmm2,[ebx]
+db	102,15,56,0,208
+	movdqa	xmm0,[16+ebx]
+db	102,15,56,0,193
+	pxor	xmm0,xmm2
+	ret
+align	16
+__vpaes_schedule_mangle:
+	movdqa	xmm4,xmm0
+	movdqa	xmm5,[128+ebp]
+	test	edi,edi
+	jnz	NEAR L$014schedule_mangle_dec
+	add	edx,16
+	pxor	xmm4,[336+ebp]
+db	102,15,56,0,229
+	movdqa	xmm3,xmm4
+db	102,15,56,0,229
+	pxor	xmm3,xmm4
+db	102,15,56,0,229
+	pxor	xmm3,xmm4
+	jmp	NEAR L$015schedule_mangle_both
+align	16
+L$014schedule_mangle_dec:
+	movdqa	xmm2,[ebp-16]
+	lea	esi,[416+ebp]
+	movdqa	xmm1,xmm2
+	pandn	xmm1,xmm4
+	psrld	xmm1,4
+	pand	xmm4,xmm2
+	movdqa	xmm2,[esi]
+db	102,15,56,0,212
+	movdqa	xmm3,[16+esi]
+db	102,15,56,0,217
+	pxor	xmm3,xmm2
+db	102,15,56,0,221
+	movdqa	xmm2,[32+esi]
+db	102,15,56,0,212
+	pxor	xmm2,xmm3
+	movdqa	xmm3,[48+esi]
+db	102,15,56,0,217
+	pxor	xmm3,xmm2
+db	102,15,56,0,221
+	movdqa	xmm2,[64+esi]
+db	102,15,56,0,212
+	pxor	xmm2,xmm3
+	movdqa	xmm3,[80+esi]
+db	102,15,56,0,217
+	pxor	xmm3,xmm2
+db	102,15,56,0,221
+	movdqa	xmm2,[96+esi]
+db	102,15,56,0,212
+	pxor	xmm2,xmm3
+	movdqa	xmm3,[112+esi]
+db	102,15,56,0,217
+	pxor	xmm3,xmm2
+	add	edx,-16
+L$015schedule_mangle_both:
+	movdqa	xmm1,[256+ecx*1+ebp]
+db	102,15,56,0,217
+	add	ecx,-16
+	and	ecx,48
+	movdqu	[edx],xmm3
+	ret
+global	_vpaes_set_encrypt_key
+align	16
+_vpaes_set_encrypt_key:
+L$_vpaes_set_encrypt_key_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+%ifdef BORINGSSL_DISPATCH_TEST
+	push	ebx
+	push	edx
+	call	L$016pic
+L$016pic:
+	pop	ebx
+	lea	ebx,[(_BORINGSSL_function_hit+5-L$016pic)+ebx]
+	mov	edx,1
+	mov	BYTE [ebx],dl
+	pop	edx
+	pop	ebx
+%endif
+	mov	esi,DWORD [20+esp]
+	lea	ebx,[esp-56]
+	mov	eax,DWORD [24+esp]
+	and	ebx,-16
+	mov	edx,DWORD [28+esp]
+	xchg	ebx,esp
+	mov	DWORD [48+esp],ebx
+	mov	ebx,eax
+	shr	ebx,5
+	add	ebx,5
+	mov	DWORD [240+edx],ebx
+	mov	ecx,48
+	mov	edi,0
+	lea	ebp,[(L$_vpaes_consts+0x30-L$017pic_point)]
+	call	__vpaes_schedule_core
+L$017pic_point:
+	mov	esp,DWORD [48+esp]
+	xor	eax,eax
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+global	_vpaes_set_decrypt_key
+align	16
+_vpaes_set_decrypt_key:
+L$_vpaes_set_decrypt_key_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	mov	esi,DWORD [20+esp]
+	lea	ebx,[esp-56]
+	mov	eax,DWORD [24+esp]
+	and	ebx,-16
+	mov	edx,DWORD [28+esp]
+	xchg	ebx,esp
+	mov	DWORD [48+esp],ebx
+	mov	ebx,eax
+	shr	ebx,5
+	add	ebx,5
+	mov	DWORD [240+edx],ebx
+	shl	ebx,4
+	lea	edx,[16+ebx*1+edx]
+	mov	edi,1
+	mov	ecx,eax
+	shr	ecx,1
+	and	ecx,32
+	xor	ecx,32
+	lea	ebp,[(L$_vpaes_consts+0x30-L$018pic_point)]
+	call	__vpaes_schedule_core
+L$018pic_point:
+	mov	esp,DWORD [48+esp]
+	xor	eax,eax
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+global	_vpaes_encrypt
+align	16
+_vpaes_encrypt:
+L$_vpaes_encrypt_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+%ifdef BORINGSSL_DISPATCH_TEST
+	push	ebx
+	push	edx
+	call	L$019pic
+L$019pic:
+	pop	ebx
+	lea	ebx,[(_BORINGSSL_function_hit+4-L$019pic)+ebx]
+	mov	edx,1
+	mov	BYTE [ebx],dl
+	pop	edx
+	pop	ebx
+%endif
+	lea	ebp,[(L$_vpaes_consts+0x30-L$020pic_point)]
+	call	__vpaes_preheat
+L$020pic_point:
+	mov	esi,DWORD [20+esp]
+	lea	ebx,[esp-56]
+	mov	edi,DWORD [24+esp]
+	and	ebx,-16
+	mov	edx,DWORD [28+esp]
+	xchg	ebx,esp
+	mov	DWORD [48+esp],ebx
+	movdqu	xmm0,[esi]
+	call	__vpaes_encrypt_core
+	movdqu	[edi],xmm0
+	mov	esp,DWORD [48+esp]
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+global	_vpaes_decrypt
+align	16
+_vpaes_decrypt:
+L$_vpaes_decrypt_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	lea	ebp,[(L$_vpaes_consts+0x30-L$021pic_point)]
+	call	__vpaes_preheat
+L$021pic_point:
+	mov	esi,DWORD [20+esp]
+	lea	ebx,[esp-56]
+	mov	edi,DWORD [24+esp]
+	and	ebx,-16
+	mov	edx,DWORD [28+esp]
+	xchg	ebx,esp
+	mov	DWORD [48+esp],ebx
+	movdqu	xmm0,[esi]
+	call	__vpaes_decrypt_core
+	movdqu	[edi],xmm0
+	mov	esp,DWORD [48+esp]
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+global	_vpaes_cbc_encrypt
+align	16
+_vpaes_cbc_encrypt:
+L$_vpaes_cbc_encrypt_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	mov	esi,DWORD [20+esp]
+	mov	edi,DWORD [24+esp]
+	mov	eax,DWORD [28+esp]
+	mov	edx,DWORD [32+esp]
+	sub	eax,16
+	jc	NEAR L$022cbc_abort
+	lea	ebx,[esp-56]
+	mov	ebp,DWORD [36+esp]
+	and	ebx,-16
+	mov	ecx,DWORD [40+esp]
+	xchg	ebx,esp
+	movdqu	xmm1,[ebp]
+	sub	edi,esi
+	mov	DWORD [48+esp],ebx
+	mov	DWORD [esp],edi
+	mov	DWORD [4+esp],edx
+	mov	DWORD [8+esp],ebp
+	mov	edi,eax
+	lea	ebp,[(L$_vpaes_consts+0x30-L$023pic_point)]
+	call	__vpaes_preheat
+L$023pic_point:
+	cmp	ecx,0
+	je	NEAR L$024cbc_dec_loop
+	jmp	NEAR L$025cbc_enc_loop
+align	16
+L$025cbc_enc_loop:
+	movdqu	xmm0,[esi]
+	pxor	xmm0,xmm1
+	call	__vpaes_encrypt_core
+	mov	ebx,DWORD [esp]
+	mov	edx,DWORD [4+esp]
+	movdqa	xmm1,xmm0
+	movdqu	[esi*1+ebx],xmm0
+	lea	esi,[16+esi]
+	sub	edi,16
+	jnc	NEAR L$025cbc_enc_loop
+	jmp	NEAR L$026cbc_done
+align	16
+L$024cbc_dec_loop:
+	movdqu	xmm0,[esi]
+	movdqa	[16+esp],xmm1
+	movdqa	[32+esp],xmm0
+	call	__vpaes_decrypt_core
+	mov	ebx,DWORD [esp]
+	mov	edx,DWORD [4+esp]
+	pxor	xmm0,[16+esp]
+	movdqa	xmm1,[32+esp]
+	movdqu	[esi*1+ebx],xmm0
+	lea	esi,[16+esi]
+	sub	edi,16
+	jnc	NEAR L$024cbc_dec_loop
+L$026cbc_done:
+	mov	ebx,DWORD [8+esp]
+	mov	esp,DWORD [48+esp]
+	movdqu	[ebx],xmm1
+L$022cbc_abort:
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/vpaes-x86_64-apple.S b/gen/bcm/vpaes-x86_64-apple.S
new file mode 100644
index 0000000..5aea40f
--- /dev/null
+++ b/gen/bcm/vpaes-x86_64-apple.S
@@ -0,0 +1,1131 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text	
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align	4
+_vpaes_encrypt_core:
+
+	movq	%rdx,%r9
+	movq	$16,%r11
+	movl	240(%rdx),%eax
+	movdqa	%xmm9,%xmm1
+	movdqa	L$k_ipt(%rip),%xmm2
+	pandn	%xmm0,%xmm1
+	movdqu	(%r9),%xmm5
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+.byte	102,15,56,0,208
+	movdqa	L$k_ipt+16(%rip),%xmm0
+.byte	102,15,56,0,193
+	pxor	%xmm5,%xmm2
+	addq	$16,%r9
+	pxor	%xmm2,%xmm0
+	leaq	L$k_mc_backward(%rip),%r10
+	jmp	L$enc_entry
+
+.p2align	4
+L$enc_loop:
+
+	movdqa	%xmm13,%xmm4
+	movdqa	%xmm12,%xmm0
+.byte	102,15,56,0,226
+.byte	102,15,56,0,195
+	pxor	%xmm5,%xmm4
+	movdqa	%xmm15,%xmm5
+	pxor	%xmm4,%xmm0
+	movdqa	-64(%r11,%r10,1),%xmm1
+.byte	102,15,56,0,234
+	movdqa	(%r11,%r10,1),%xmm4
+	movdqa	%xmm14,%xmm2
+.byte	102,15,56,0,211
+	movdqa	%xmm0,%xmm3
+	pxor	%xmm5,%xmm2
+.byte	102,15,56,0,193
+	addq	$16,%r9
+	pxor	%xmm2,%xmm0
+.byte	102,15,56,0,220
+	addq	$16,%r11
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,193
+	andq	$0x30,%r11
+	subq	$1,%rax
+	pxor	%xmm3,%xmm0
+
+L$enc_entry:
+
+	movdqa	%xmm9,%xmm1
+	movdqa	%xmm11,%xmm5
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+.byte	102,15,56,0,232
+	movdqa	%xmm10,%xmm3
+	pxor	%xmm1,%xmm0
+.byte	102,15,56,0,217
+	movdqa	%xmm10,%xmm4
+	pxor	%xmm5,%xmm3
+.byte	102,15,56,0,224
+	movdqa	%xmm10,%xmm2
+	pxor	%xmm5,%xmm4
+.byte	102,15,56,0,211
+	movdqa	%xmm10,%xmm3
+	pxor	%xmm0,%xmm2
+.byte	102,15,56,0,220
+	movdqu	(%r9),%xmm5
+	pxor	%xmm1,%xmm3
+	jnz	L$enc_loop
+
+
+	movdqa	-96(%r10),%xmm4
+	movdqa	-80(%r10),%xmm0
+.byte	102,15,56,0,226
+	pxor	%xmm5,%xmm4
+.byte	102,15,56,0,195
+	movdqa	64(%r11,%r10,1),%xmm1
+	pxor	%xmm4,%xmm0
+.byte	102,15,56,0,193
+	ret
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align	4
+_vpaes_encrypt_core_2x:
+
+	movq	%rdx,%r9
+	movq	$16,%r11
+	movl	240(%rdx),%eax
+	movdqa	%xmm9,%xmm1
+	movdqa	%xmm9,%xmm7
+	movdqa	L$k_ipt(%rip),%xmm2
+	movdqa	%xmm2,%xmm8
+	pandn	%xmm0,%xmm1
+	pandn	%xmm6,%xmm7
+	movdqu	(%r9),%xmm5
+
+	psrld	$4,%xmm1
+	psrld	$4,%xmm7
+	pand	%xmm9,%xmm0
+	pand	%xmm9,%xmm6
+.byte	102,15,56,0,208
+.byte	102,68,15,56,0,198
+	movdqa	L$k_ipt+16(%rip),%xmm0
+	movdqa	%xmm0,%xmm6
+.byte	102,15,56,0,193
+.byte	102,15,56,0,247
+	pxor	%xmm5,%xmm2
+	pxor	%xmm5,%xmm8
+	addq	$16,%r9
+	pxor	%xmm2,%xmm0
+	pxor	%xmm8,%xmm6
+	leaq	L$k_mc_backward(%rip),%r10
+	jmp	L$enc2x_entry
+
+.p2align	4
+L$enc2x_loop:
+
+	movdqa	L$k_sb1(%rip),%xmm4
+	movdqa	L$k_sb1+16(%rip),%xmm0
+	movdqa	%xmm4,%xmm12
+	movdqa	%xmm0,%xmm6
+.byte	102,15,56,0,226
+.byte	102,69,15,56,0,224
+.byte	102,15,56,0,195
+.byte	102,65,15,56,0,243
+	pxor	%xmm5,%xmm4
+	pxor	%xmm5,%xmm12
+	movdqa	L$k_sb2(%rip),%xmm5
+	movdqa	%xmm5,%xmm13
+	pxor	%xmm4,%xmm0
+	pxor	%xmm12,%xmm6
+	movdqa	-64(%r11,%r10,1),%xmm1
+
+.byte	102,15,56,0,234
+.byte	102,69,15,56,0,232
+	movdqa	(%r11,%r10,1),%xmm4
+
+	movdqa	L$k_sb2+16(%rip),%xmm2
+	movdqa	%xmm2,%xmm8
+.byte	102,15,56,0,211
+.byte	102,69,15,56,0,195
+	movdqa	%xmm0,%xmm3
+	movdqa	%xmm6,%xmm11
+	pxor	%xmm5,%xmm2
+	pxor	%xmm13,%xmm8
+.byte	102,15,56,0,193
+.byte	102,15,56,0,241
+	addq	$16,%r9
+	pxor	%xmm2,%xmm0
+	pxor	%xmm8,%xmm6
+.byte	102,15,56,0,220
+.byte	102,68,15,56,0,220
+	addq	$16,%r11
+	pxor	%xmm0,%xmm3
+	pxor	%xmm6,%xmm11
+.byte	102,15,56,0,193
+.byte	102,15,56,0,241
+	andq	$0x30,%r11
+	subq	$1,%rax
+	pxor	%xmm3,%xmm0
+	pxor	%xmm11,%xmm6
+
+L$enc2x_entry:
+
+	movdqa	%xmm9,%xmm1
+	movdqa	%xmm9,%xmm7
+	movdqa	L$k_inv+16(%rip),%xmm5
+	movdqa	%xmm5,%xmm13
+	pandn	%xmm0,%xmm1
+	pandn	%xmm6,%xmm7
+	psrld	$4,%xmm1
+	psrld	$4,%xmm7
+	pand	%xmm9,%xmm0
+	pand	%xmm9,%xmm6
+.byte	102,15,56,0,232
+.byte	102,68,15,56,0,238
+	movdqa	%xmm10,%xmm3
+	movdqa	%xmm10,%xmm11
+	pxor	%xmm1,%xmm0
+	pxor	%xmm7,%xmm6
+.byte	102,15,56,0,217
+.byte	102,68,15,56,0,223
+	movdqa	%xmm10,%xmm4
+	movdqa	%xmm10,%xmm12
+	pxor	%xmm5,%xmm3
+	pxor	%xmm13,%xmm11
+.byte	102,15,56,0,224
+.byte	102,68,15,56,0,230
+	movdqa	%xmm10,%xmm2
+	movdqa	%xmm10,%xmm8
+	pxor	%xmm5,%xmm4
+	pxor	%xmm13,%xmm12
+.byte	102,15,56,0,211
+.byte	102,69,15,56,0,195
+	movdqa	%xmm10,%xmm3
+	movdqa	%xmm10,%xmm11
+	pxor	%xmm0,%xmm2
+	pxor	%xmm6,%xmm8
+.byte	102,15,56,0,220
+.byte	102,69,15,56,0,220
+	movdqu	(%r9),%xmm5
+
+	pxor	%xmm1,%xmm3
+	pxor	%xmm7,%xmm11
+	jnz	L$enc2x_loop
+
+
+	movdqa	-96(%r10),%xmm4
+	movdqa	-80(%r10),%xmm0
+	movdqa	%xmm4,%xmm12
+	movdqa	%xmm0,%xmm6
+.byte	102,15,56,0,226
+.byte	102,69,15,56,0,224
+	pxor	%xmm5,%xmm4
+	pxor	%xmm5,%xmm12
+.byte	102,15,56,0,195
+.byte	102,65,15,56,0,243
+	movdqa	64(%r11,%r10,1),%xmm1
+
+	pxor	%xmm4,%xmm0
+	pxor	%xmm12,%xmm6
+.byte	102,15,56,0,193
+.byte	102,15,56,0,241
+	ret
+
+
+
+
+
+
+
+
+
+.p2align	4
+_vpaes_decrypt_core:
+
+	movq	%rdx,%r9
+	movl	240(%rdx),%eax
+	movdqa	%xmm9,%xmm1
+	movdqa	L$k_dipt(%rip),%xmm2
+	pandn	%xmm0,%xmm1
+	movq	%rax,%r11
+	psrld	$4,%xmm1
+	movdqu	(%r9),%xmm5
+	shlq	$4,%r11
+	pand	%xmm9,%xmm0
+.byte	102,15,56,0,208
+	movdqa	L$k_dipt+16(%rip),%xmm0
+	xorq	$0x30,%r11
+	leaq	L$k_dsbd(%rip),%r10
+.byte	102,15,56,0,193
+	andq	$0x30,%r11
+	pxor	%xmm5,%xmm2
+	movdqa	L$k_mc_forward+48(%rip),%xmm5
+	pxor	%xmm2,%xmm0
+	addq	$16,%r9
+	addq	%r10,%r11
+	jmp	L$dec_entry
+
+.p2align	4
+L$dec_loop:
+
+
+
+	movdqa	-32(%r10),%xmm4
+	movdqa	-16(%r10),%xmm1
+.byte	102,15,56,0,226
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	movdqa	0(%r10),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	16(%r10),%xmm1
+
+.byte	102,15,56,0,226
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	movdqa	32(%r10),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	48(%r10),%xmm1
+
+.byte	102,15,56,0,226
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	movdqa	64(%r10),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	80(%r10),%xmm1
+
+.byte	102,15,56,0,226
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	addq	$16,%r9
+.byte	102,15,58,15,237,12
+	pxor	%xmm1,%xmm0
+	subq	$1,%rax
+
+L$dec_entry:
+
+	movdqa	%xmm9,%xmm1
+	pandn	%xmm0,%xmm1
+	movdqa	%xmm11,%xmm2
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+.byte	102,15,56,0,208
+	movdqa	%xmm10,%xmm3
+	pxor	%xmm1,%xmm0
+.byte	102,15,56,0,217
+	movdqa	%xmm10,%xmm4
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,224
+	pxor	%xmm2,%xmm4
+	movdqa	%xmm10,%xmm2
+.byte	102,15,56,0,211
+	movdqa	%xmm10,%xmm3
+	pxor	%xmm0,%xmm2
+.byte	102,15,56,0,220
+	movdqu	(%r9),%xmm0
+	pxor	%xmm1,%xmm3
+	jnz	L$dec_loop
+
+
+	movdqa	96(%r10),%xmm4
+.byte	102,15,56,0,226
+	pxor	%xmm0,%xmm4
+	movdqa	112(%r10),%xmm0
+	movdqa	-352(%r11),%xmm2
+.byte	102,15,56,0,195
+	pxor	%xmm4,%xmm0
+.byte	102,15,56,0,194
+	ret
+
+
+
+
+
+
+
+
+
+.p2align	4
+_vpaes_schedule_core:
+
+
+
+
+
+
+	call	_vpaes_preheat
+	movdqa	L$k_rcon(%rip),%xmm8
+	movdqu	(%rdi),%xmm0
+
+
+	movdqa	%xmm0,%xmm3
+	leaq	L$k_ipt(%rip),%r11
+	call	_vpaes_schedule_transform
+	movdqa	%xmm0,%xmm7
+
+	leaq	L$k_sr(%rip),%r10
+	testq	%rcx,%rcx
+	jnz	L$schedule_am_decrypting
+
+
+	movdqu	%xmm0,(%rdx)
+	jmp	L$schedule_go
+
+L$schedule_am_decrypting:
+
+	movdqa	(%r8,%r10,1),%xmm1
+.byte	102,15,56,0,217
+	movdqu	%xmm3,(%rdx)
+	xorq	$0x30,%r8
+
+L$schedule_go:
+	cmpl	$192,%esi
+	ja	L$schedule_256
+	je	L$schedule_192
+
+
+
+
+
+
+
+
+
+
+L$schedule_128:
+	movl	$10,%esi
+
+L$oop_schedule_128:
+	call	_vpaes_schedule_round
+	decq	%rsi
+	jz	L$schedule_mangle_last
+	call	_vpaes_schedule_mangle
+	jmp	L$oop_schedule_128
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align	4
+L$schedule_192:
+	movdqu	8(%rdi),%xmm0
+	call	_vpaes_schedule_transform
+	movdqa	%xmm0,%xmm6
+	pxor	%xmm4,%xmm4
+	movhlps	%xmm4,%xmm6
+	movl	$4,%esi
+
+L$oop_schedule_192:
+	call	_vpaes_schedule_round
+.byte	102,15,58,15,198,8
+	call	_vpaes_schedule_mangle
+	call	_vpaes_schedule_192_smear
+	call	_vpaes_schedule_mangle
+	call	_vpaes_schedule_round
+	decq	%rsi
+	jz	L$schedule_mangle_last
+	call	_vpaes_schedule_mangle
+	call	_vpaes_schedule_192_smear
+	jmp	L$oop_schedule_192
+
+
+
+
+
+
+
+
+
+
+
+.p2align	4
+L$schedule_256:
+	movdqu	16(%rdi),%xmm0
+	call	_vpaes_schedule_transform
+	movl	$7,%esi
+
+L$oop_schedule_256:
+	call	_vpaes_schedule_mangle
+	movdqa	%xmm0,%xmm6
+
+
+	call	_vpaes_schedule_round
+	decq	%rsi
+	jz	L$schedule_mangle_last
+	call	_vpaes_schedule_mangle
+
+
+	pshufd	$0xFF,%xmm0,%xmm0
+	movdqa	%xmm7,%xmm5
+	movdqa	%xmm6,%xmm7
+	call	_vpaes_schedule_low_round
+	movdqa	%xmm5,%xmm7
+
+	jmp	L$oop_schedule_256
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align	4
+L$schedule_mangle_last:
+
+	leaq	L$k_deskew(%rip),%r11
+	testq	%rcx,%rcx
+	jnz	L$schedule_mangle_last_dec
+
+
+	movdqa	(%r8,%r10,1),%xmm1
+.byte	102,15,56,0,193
+	leaq	L$k_opt(%rip),%r11
+	addq	$32,%rdx
+
+L$schedule_mangle_last_dec:
+	addq	$-16,%rdx
+	pxor	L$k_s63(%rip),%xmm0
+	call	_vpaes_schedule_transform
+	movdqu	%xmm0,(%rdx)
+
+
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	ret
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align	4
+_vpaes_schedule_192_smear:
+
+	pshufd	$0x80,%xmm6,%xmm1
+	pshufd	$0xFE,%xmm7,%xmm0
+	pxor	%xmm1,%xmm6
+	pxor	%xmm1,%xmm1
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm6,%xmm0
+	movhlps	%xmm1,%xmm6
+	ret
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align	4
+_vpaes_schedule_round:
+
+
+	pxor	%xmm1,%xmm1
+.byte	102,65,15,58,15,200,15
+.byte	102,69,15,58,15,192,15
+	pxor	%xmm1,%xmm7
+
+
+	pshufd	$0xFF,%xmm0,%xmm0
+.byte	102,15,58,15,192,1
+
+
+
+
+_vpaes_schedule_low_round:
+
+	movdqa	%xmm7,%xmm1
+	pslldq	$4,%xmm7
+	pxor	%xmm1,%xmm7
+	movdqa	%xmm7,%xmm1
+	pslldq	$8,%xmm7
+	pxor	%xmm1,%xmm7
+	pxor	L$k_s63(%rip),%xmm7
+
+
+	movdqa	%xmm9,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+	movdqa	%xmm11,%xmm2
+.byte	102,15,56,0,208
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm10,%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+	movdqa	%xmm10,%xmm4
+.byte	102,15,56,0,224
+	pxor	%xmm2,%xmm4
+	movdqa	%xmm10,%xmm2
+.byte	102,15,56,0,211
+	pxor	%xmm0,%xmm2
+	movdqa	%xmm10,%xmm3
+.byte	102,15,56,0,220
+	pxor	%xmm1,%xmm3
+	movdqa	%xmm13,%xmm4
+.byte	102,15,56,0,226
+	movdqa	%xmm12,%xmm0
+.byte	102,15,56,0,195
+	pxor	%xmm4,%xmm0
+
+
+	pxor	%xmm7,%xmm0
+	movdqa	%xmm0,%xmm7
+	ret
+
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align	4
+_vpaes_schedule_transform:
+
+	movdqa	%xmm9,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+	movdqa	(%r11),%xmm2
+.byte	102,15,56,0,208
+	movdqa	16(%r11),%xmm0
+.byte	102,15,56,0,193
+	pxor	%xmm2,%xmm0
+	ret
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align	4
+_vpaes_schedule_mangle:
+
+	movdqa	%xmm0,%xmm4
+	movdqa	L$k_mc_forward(%rip),%xmm5
+	testq	%rcx,%rcx
+	jnz	L$schedule_mangle_dec
+
+
+	addq	$16,%rdx
+	pxor	L$k_s63(%rip),%xmm4
+.byte	102,15,56,0,229
+	movdqa	%xmm4,%xmm3
+.byte	102,15,56,0,229
+	pxor	%xmm4,%xmm3
+.byte	102,15,56,0,229
+	pxor	%xmm4,%xmm3
+
+	jmp	L$schedule_mangle_both
+.p2align	4
+L$schedule_mangle_dec:
+
+	leaq	L$k_dksd(%rip),%r11
+	movdqa	%xmm9,%xmm1
+	pandn	%xmm4,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm4
+
+	movdqa	0(%r11),%xmm2
+.byte	102,15,56,0,212
+	movdqa	16(%r11),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,221
+
+	movdqa	32(%r11),%xmm2
+.byte	102,15,56,0,212
+	pxor	%xmm3,%xmm2
+	movdqa	48(%r11),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,221
+
+	movdqa	64(%r11),%xmm2
+.byte	102,15,56,0,212
+	pxor	%xmm3,%xmm2
+	movdqa	80(%r11),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,221
+
+	movdqa	96(%r11),%xmm2
+.byte	102,15,56,0,212
+	pxor	%xmm3,%xmm2
+	movdqa	112(%r11),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+
+	addq	$-16,%rdx
+
+L$schedule_mangle_both:
+	movdqa	(%r8,%r10,1),%xmm1
+.byte	102,15,56,0,217
+	addq	$-16,%r8
+	andq	$0x30,%r8
+	movdqu	%xmm3,(%rdx)
+	ret
+
+
+
+
+
+
+.globl	_vpaes_set_encrypt_key
+.private_extern _vpaes_set_encrypt_key
+
+.p2align	4
+_vpaes_set_encrypt_key:
+
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+
+	movb	$1,_BORINGSSL_function_hit+5(%rip)
+#endif
+
+	movl	%esi,%eax
+	shrl	$5,%eax
+	addl	$5,%eax
+	movl	%eax,240(%rdx)
+
+	movl	$0,%ecx
+	movl	$0x30,%r8d
+	call	_vpaes_schedule_core
+	xorl	%eax,%eax
+	ret
+
+
+
+.globl	_vpaes_set_decrypt_key
+.private_extern _vpaes_set_decrypt_key
+
+.p2align	4
+_vpaes_set_decrypt_key:
+
+_CET_ENDBR
+	movl	%esi,%eax
+	shrl	$5,%eax
+	addl	$5,%eax
+	movl	%eax,240(%rdx)
+	shll	$4,%eax
+	leaq	16(%rdx,%rax,1),%rdx
+
+	movl	$1,%ecx
+	movl	%esi,%r8d
+	shrl	$1,%r8d
+	andl	$32,%r8d
+	xorl	$32,%r8d
+	call	_vpaes_schedule_core
+	xorl	%eax,%eax
+	ret
+
+
+
+.globl	_vpaes_encrypt
+.private_extern _vpaes_encrypt
+
+.p2align	4
+_vpaes_encrypt:
+
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+
+	movb	$1,_BORINGSSL_function_hit+4(%rip)
+#endif
+	movdqu	(%rdi),%xmm0
+	call	_vpaes_preheat
+	call	_vpaes_encrypt_core
+	movdqu	%xmm0,(%rsi)
+	ret
+
+
+
+.globl	_vpaes_decrypt
+.private_extern _vpaes_decrypt
+
+.p2align	4
+_vpaes_decrypt:
+
+_CET_ENDBR
+	movdqu	(%rdi),%xmm0
+	call	_vpaes_preheat
+	call	_vpaes_decrypt_core
+	movdqu	%xmm0,(%rsi)
+	ret
+
+
+.globl	_vpaes_cbc_encrypt
+.private_extern _vpaes_cbc_encrypt
+
+.p2align	4
+_vpaes_cbc_encrypt:
+
+_CET_ENDBR
+	xchgq	%rcx,%rdx
+	subq	$16,%rcx
+	jc	L$cbc_abort
+	movdqu	(%r8),%xmm6
+	subq	%rdi,%rsi
+	call	_vpaes_preheat
+	cmpl	$0,%r9d
+	je	L$cbc_dec_loop
+	jmp	L$cbc_enc_loop
+.p2align	4
+L$cbc_enc_loop:
+	movdqu	(%rdi),%xmm0
+	pxor	%xmm6,%xmm0
+	call	_vpaes_encrypt_core
+	movdqa	%xmm0,%xmm6
+	movdqu	%xmm0,(%rsi,%rdi,1)
+	leaq	16(%rdi),%rdi
+	subq	$16,%rcx
+	jnc	L$cbc_enc_loop
+	jmp	L$cbc_done
+.p2align	4
+L$cbc_dec_loop:
+	movdqu	(%rdi),%xmm0
+	movdqa	%xmm0,%xmm7
+	call	_vpaes_decrypt_core
+	pxor	%xmm6,%xmm0
+	movdqa	%xmm7,%xmm6
+	movdqu	%xmm0,(%rsi,%rdi,1)
+	leaq	16(%rdi),%rdi
+	subq	$16,%rcx
+	jnc	L$cbc_dec_loop
+L$cbc_done:
+	movdqu	%xmm6,(%r8)
+L$cbc_abort:
+	ret
+
+
+.globl	_vpaes_ctr32_encrypt_blocks
+.private_extern _vpaes_ctr32_encrypt_blocks
+
+.p2align	4
+_vpaes_ctr32_encrypt_blocks:
+
+_CET_ENDBR
+
+	xchgq	%rcx,%rdx
+	testq	%rcx,%rcx
+	jz	L$ctr32_abort
+	movdqu	(%r8),%xmm0
+	movdqa	L$ctr_add_one(%rip),%xmm8
+	subq	%rdi,%rsi
+	call	_vpaes_preheat
+	movdqa	%xmm0,%xmm6
+	pshufb	L$rev_ctr(%rip),%xmm6
+
+	testq	$1,%rcx
+	jz	L$ctr32_prep_loop
+
+
+
+	movdqu	(%rdi),%xmm7
+	call	_vpaes_encrypt_core
+	pxor	%xmm7,%xmm0
+	paddd	%xmm8,%xmm6
+	movdqu	%xmm0,(%rsi,%rdi,1)
+	subq	$1,%rcx
+	leaq	16(%rdi),%rdi
+	jz	L$ctr32_done
+
+L$ctr32_prep_loop:
+
+
+	movdqa	%xmm6,%xmm14
+	movdqa	%xmm6,%xmm15
+	paddd	%xmm8,%xmm15
+
+L$ctr32_loop:
+	movdqa	L$rev_ctr(%rip),%xmm1
+	movdqa	%xmm14,%xmm0
+	movdqa	%xmm15,%xmm6
+.byte	102,15,56,0,193
+.byte	102,15,56,0,241
+	call	_vpaes_encrypt_core_2x
+	movdqu	(%rdi),%xmm1
+	movdqu	16(%rdi),%xmm2
+	movdqa	L$ctr_add_two(%rip),%xmm3
+	pxor	%xmm1,%xmm0
+	pxor	%xmm2,%xmm6
+	paddd	%xmm3,%xmm14
+	paddd	%xmm3,%xmm15
+	movdqu	%xmm0,(%rsi,%rdi,1)
+	movdqu	%xmm6,16(%rsi,%rdi,1)
+	subq	$2,%rcx
+	leaq	32(%rdi),%rdi
+	jnz	L$ctr32_loop
+
+L$ctr32_done:
+L$ctr32_abort:
+	ret
+
+
+
+
+
+
+
+
+
+.p2align	4
+_vpaes_preheat:
+
+	leaq	L$k_s0F(%rip),%r10
+	movdqa	-32(%r10),%xmm10
+	movdqa	-16(%r10),%xmm11
+	movdqa	0(%r10),%xmm9
+	movdqa	48(%r10),%xmm13
+	movdqa	64(%r10),%xmm12
+	movdqa	80(%r10),%xmm15
+	movdqa	96(%r10),%xmm14
+	ret
+
+
+
+
+
+
+
+
+.section	__DATA,__const
+.p2align	6
+_vpaes_consts:
+L$k_inv:
+.quad	0x0E05060F0D080180, 0x040703090A0B0C02
+.quad	0x01040A060F0B0780, 0x030D0E0C02050809
+
+L$k_s0F:
+.quad	0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
+
+L$k_ipt:
+.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+
+L$k_sb1:
+.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+L$k_sb2:
+.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
+.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
+L$k_sbo:
+.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+
+L$k_mc_forward:
+.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad	0x080B0A0904070605, 0x000302010C0F0E0D
+.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad	0x000302010C0F0E0D, 0x080B0A0904070605
+
+L$k_mc_backward:
+.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad	0x020100030E0D0C0F, 0x0A09080B06050407
+.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad	0x0A09080B06050407, 0x020100030E0D0C0F
+
+L$k_sr:
+.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad	0x030E09040F0A0500, 0x0B06010C07020D08
+.quad	0x0F060D040B020900, 0x070E050C030A0108
+.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
+
+L$k_rcon:
+.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+L$k_s63:
+.quad	0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
+
+L$k_opt:
+.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+
+L$k_deskew:
+.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+
+
+
+
+L$k_dksd:
+.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+L$k_dksb:
+.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+L$k_dkse:
+.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+L$k_dks9:
+.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+
+
+
+
+L$k_dipt:
+.quad	0x0F505B040B545F00, 0x154A411E114E451A
+.quad	0x86E383E660056500, 0x12771772F491F194
+
+L$k_dsb9:
+.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+L$k_dsbd:
+.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+L$k_dsbb:
+.quad	0xD022649296B44200, 0x602646F6B0F2D404
+.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+L$k_dsbe:
+.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+L$k_dsbo:
+.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+
+
+L$rev_ctr:
+.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
+
+
+L$ctr_add_one:
+.quad	0x0000000000000000, 0x0000000100000000
+L$ctr_add_two:
+.quad	0x0000000000000000, 0x0000000200000000
+
+.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.p2align	6
+
+.text	
+#endif
diff --git a/gen/bcm/vpaes-x86_64-linux.S b/gen/bcm/vpaes-x86_64-linux.S
new file mode 100644
index 0000000..019c638
--- /dev/null
+++ b/gen/bcm/vpaes-x86_64-linux.S
@@ -0,0 +1,1133 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text	
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type	_vpaes_encrypt_core,@function
+.align	16
+_vpaes_encrypt_core:
+.cfi_startproc	
+	movq	%rdx,%r9
+	movq	$16,%r11
+	movl	240(%rdx),%eax
+	movdqa	%xmm9,%xmm1
+	movdqa	.Lk_ipt(%rip),%xmm2
+	pandn	%xmm0,%xmm1
+	movdqu	(%r9),%xmm5
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+.byte	102,15,56,0,208
+	movdqa	.Lk_ipt+16(%rip),%xmm0
+.byte	102,15,56,0,193
+	pxor	%xmm5,%xmm2
+	addq	$16,%r9
+	pxor	%xmm2,%xmm0
+	leaq	.Lk_mc_backward(%rip),%r10
+	jmp	.Lenc_entry
+
+.align	16
+.Lenc_loop:
+
+	movdqa	%xmm13,%xmm4
+	movdqa	%xmm12,%xmm0
+.byte	102,15,56,0,226
+.byte	102,15,56,0,195
+	pxor	%xmm5,%xmm4
+	movdqa	%xmm15,%xmm5
+	pxor	%xmm4,%xmm0
+	movdqa	-64(%r11,%r10,1),%xmm1
+.byte	102,15,56,0,234
+	movdqa	(%r11,%r10,1),%xmm4
+	movdqa	%xmm14,%xmm2
+.byte	102,15,56,0,211
+	movdqa	%xmm0,%xmm3
+	pxor	%xmm5,%xmm2
+.byte	102,15,56,0,193
+	addq	$16,%r9
+	pxor	%xmm2,%xmm0
+.byte	102,15,56,0,220
+	addq	$16,%r11
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,193
+	andq	$0x30,%r11
+	subq	$1,%rax
+	pxor	%xmm3,%xmm0
+
+.Lenc_entry:
+
+	movdqa	%xmm9,%xmm1
+	movdqa	%xmm11,%xmm5
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+.byte	102,15,56,0,232
+	movdqa	%xmm10,%xmm3
+	pxor	%xmm1,%xmm0
+.byte	102,15,56,0,217
+	movdqa	%xmm10,%xmm4
+	pxor	%xmm5,%xmm3
+.byte	102,15,56,0,224
+	movdqa	%xmm10,%xmm2
+	pxor	%xmm5,%xmm4
+.byte	102,15,56,0,211
+	movdqa	%xmm10,%xmm3
+	pxor	%xmm0,%xmm2
+.byte	102,15,56,0,220
+	movdqu	(%r9),%xmm5
+	pxor	%xmm1,%xmm3
+	jnz	.Lenc_loop
+
+
+	movdqa	-96(%r10),%xmm4
+	movdqa	-80(%r10),%xmm0
+.byte	102,15,56,0,226
+	pxor	%xmm5,%xmm4
+.byte	102,15,56,0,195
+	movdqa	64(%r11,%r10,1),%xmm1
+	pxor	%xmm4,%xmm0
+.byte	102,15,56,0,193
+	ret
+.cfi_endproc	
+.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type	_vpaes_encrypt_core_2x,@function
+.align	16
+_vpaes_encrypt_core_2x:
+.cfi_startproc	
+	movq	%rdx,%r9
+	movq	$16,%r11
+	movl	240(%rdx),%eax
+	movdqa	%xmm9,%xmm1
+	movdqa	%xmm9,%xmm7
+	movdqa	.Lk_ipt(%rip),%xmm2
+	movdqa	%xmm2,%xmm8
+	pandn	%xmm0,%xmm1
+	pandn	%xmm6,%xmm7
+	movdqu	(%r9),%xmm5
+
+	psrld	$4,%xmm1
+	psrld	$4,%xmm7
+	pand	%xmm9,%xmm0
+	pand	%xmm9,%xmm6
+.byte	102,15,56,0,208
+.byte	102,68,15,56,0,198
+	movdqa	.Lk_ipt+16(%rip),%xmm0
+	movdqa	%xmm0,%xmm6
+.byte	102,15,56,0,193
+.byte	102,15,56,0,247
+	pxor	%xmm5,%xmm2
+	pxor	%xmm5,%xmm8
+	addq	$16,%r9
+	pxor	%xmm2,%xmm0
+	pxor	%xmm8,%xmm6
+	leaq	.Lk_mc_backward(%rip),%r10
+	jmp	.Lenc2x_entry
+
+.align	16
+.Lenc2x_loop:
+
+	movdqa	.Lk_sb1(%rip),%xmm4
+	movdqa	.Lk_sb1+16(%rip),%xmm0
+	movdqa	%xmm4,%xmm12
+	movdqa	%xmm0,%xmm6
+.byte	102,15,56,0,226
+.byte	102,69,15,56,0,224
+.byte	102,15,56,0,195
+.byte	102,65,15,56,0,243
+	pxor	%xmm5,%xmm4
+	pxor	%xmm5,%xmm12
+	movdqa	.Lk_sb2(%rip),%xmm5
+	movdqa	%xmm5,%xmm13
+	pxor	%xmm4,%xmm0
+	pxor	%xmm12,%xmm6
+	movdqa	-64(%r11,%r10,1),%xmm1
+
+.byte	102,15,56,0,234
+.byte	102,69,15,56,0,232
+	movdqa	(%r11,%r10,1),%xmm4
+
+	movdqa	.Lk_sb2+16(%rip),%xmm2
+	movdqa	%xmm2,%xmm8
+.byte	102,15,56,0,211
+.byte	102,69,15,56,0,195
+	movdqa	%xmm0,%xmm3
+	movdqa	%xmm6,%xmm11
+	pxor	%xmm5,%xmm2
+	pxor	%xmm13,%xmm8
+.byte	102,15,56,0,193
+.byte	102,15,56,0,241
+	addq	$16,%r9
+	pxor	%xmm2,%xmm0
+	pxor	%xmm8,%xmm6
+.byte	102,15,56,0,220
+.byte	102,68,15,56,0,220
+	addq	$16,%r11
+	pxor	%xmm0,%xmm3
+	pxor	%xmm6,%xmm11
+.byte	102,15,56,0,193
+.byte	102,15,56,0,241
+	andq	$0x30,%r11
+	subq	$1,%rax
+	pxor	%xmm3,%xmm0
+	pxor	%xmm11,%xmm6
+
+.Lenc2x_entry:
+
+	movdqa	%xmm9,%xmm1
+	movdqa	%xmm9,%xmm7
+	movdqa	.Lk_inv+16(%rip),%xmm5
+	movdqa	%xmm5,%xmm13
+	pandn	%xmm0,%xmm1
+	pandn	%xmm6,%xmm7
+	psrld	$4,%xmm1
+	psrld	$4,%xmm7
+	pand	%xmm9,%xmm0
+	pand	%xmm9,%xmm6
+.byte	102,15,56,0,232
+.byte	102,68,15,56,0,238
+	movdqa	%xmm10,%xmm3
+	movdqa	%xmm10,%xmm11
+	pxor	%xmm1,%xmm0
+	pxor	%xmm7,%xmm6
+.byte	102,15,56,0,217
+.byte	102,68,15,56,0,223
+	movdqa	%xmm10,%xmm4
+	movdqa	%xmm10,%xmm12
+	pxor	%xmm5,%xmm3
+	pxor	%xmm13,%xmm11
+.byte	102,15,56,0,224
+.byte	102,68,15,56,0,230
+	movdqa	%xmm10,%xmm2
+	movdqa	%xmm10,%xmm8
+	pxor	%xmm5,%xmm4
+	pxor	%xmm13,%xmm12
+.byte	102,15,56,0,211
+.byte	102,69,15,56,0,195
+	movdqa	%xmm10,%xmm3
+	movdqa	%xmm10,%xmm11
+	pxor	%xmm0,%xmm2
+	pxor	%xmm6,%xmm8
+.byte	102,15,56,0,220
+.byte	102,69,15,56,0,220
+	movdqu	(%r9),%xmm5
+
+	pxor	%xmm1,%xmm3
+	pxor	%xmm7,%xmm11
+	jnz	.Lenc2x_loop
+
+
+	movdqa	-96(%r10),%xmm4
+	movdqa	-80(%r10),%xmm0
+	movdqa	%xmm4,%xmm12
+	movdqa	%xmm0,%xmm6
+.byte	102,15,56,0,226
+.byte	102,69,15,56,0,224
+	pxor	%xmm5,%xmm4
+	pxor	%xmm5,%xmm12
+.byte	102,15,56,0,195
+.byte	102,65,15,56,0,243
+	movdqa	64(%r11,%r10,1),%xmm1
+
+	pxor	%xmm4,%xmm0
+	pxor	%xmm12,%xmm6
+.byte	102,15,56,0,193
+.byte	102,15,56,0,241
+	ret
+.cfi_endproc	
+.size	_vpaes_encrypt_core_2x,.-_vpaes_encrypt_core_2x
+
+
+
+
+
+
+.type	_vpaes_decrypt_core,@function
+.align	16
+_vpaes_decrypt_core:
+.cfi_startproc	
+	movq	%rdx,%r9
+	movl	240(%rdx),%eax
+	movdqa	%xmm9,%xmm1
+	movdqa	.Lk_dipt(%rip),%xmm2
+	pandn	%xmm0,%xmm1
+	movq	%rax,%r11
+	psrld	$4,%xmm1
+	movdqu	(%r9),%xmm5
+	shlq	$4,%r11
+	pand	%xmm9,%xmm0
+.byte	102,15,56,0,208
+	movdqa	.Lk_dipt+16(%rip),%xmm0
+	xorq	$0x30,%r11
+	leaq	.Lk_dsbd(%rip),%r10
+.byte	102,15,56,0,193
+	andq	$0x30,%r11
+	pxor	%xmm5,%xmm2
+	movdqa	.Lk_mc_forward+48(%rip),%xmm5
+	pxor	%xmm2,%xmm0
+	addq	$16,%r9
+	addq	%r10,%r11
+	jmp	.Ldec_entry
+
+.align	16
+.Ldec_loop:
+
+
+
+	movdqa	-32(%r10),%xmm4
+	movdqa	-16(%r10),%xmm1
+.byte	102,15,56,0,226
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	movdqa	0(%r10),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	16(%r10),%xmm1
+
+.byte	102,15,56,0,226
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	movdqa	32(%r10),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	48(%r10),%xmm1
+
+.byte	102,15,56,0,226
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	movdqa	64(%r10),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	80(%r10),%xmm1
+
+.byte	102,15,56,0,226
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	addq	$16,%r9
+.byte	102,15,58,15,237,12
+	pxor	%xmm1,%xmm0
+	subq	$1,%rax
+
+.Ldec_entry:
+
+	movdqa	%xmm9,%xmm1
+	pandn	%xmm0,%xmm1
+	movdqa	%xmm11,%xmm2
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+.byte	102,15,56,0,208
+	movdqa	%xmm10,%xmm3
+	pxor	%xmm1,%xmm0
+.byte	102,15,56,0,217
+	movdqa	%xmm10,%xmm4
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,224
+	pxor	%xmm2,%xmm4
+	movdqa	%xmm10,%xmm2
+.byte	102,15,56,0,211
+	movdqa	%xmm10,%xmm3
+	pxor	%xmm0,%xmm2
+.byte	102,15,56,0,220
+	movdqu	(%r9),%xmm0
+	pxor	%xmm1,%xmm3
+	jnz	.Ldec_loop
+
+
+	movdqa	96(%r10),%xmm4
+.byte	102,15,56,0,226
+	pxor	%xmm0,%xmm4
+	movdqa	112(%r10),%xmm0
+	movdqa	-352(%r11),%xmm2
+.byte	102,15,56,0,195
+	pxor	%xmm4,%xmm0
+.byte	102,15,56,0,194
+	ret
+.cfi_endproc	
+.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+
+
+
+
+
+.type	_vpaes_schedule_core,@function
+.align	16
+_vpaes_schedule_core:
+.cfi_startproc	
+
+
+
+
+
+	call	_vpaes_preheat
+	movdqa	.Lk_rcon(%rip),%xmm8
+	movdqu	(%rdi),%xmm0
+
+
+	movdqa	%xmm0,%xmm3
+	leaq	.Lk_ipt(%rip),%r11
+	call	_vpaes_schedule_transform
+	movdqa	%xmm0,%xmm7
+
+	leaq	.Lk_sr(%rip),%r10
+	testq	%rcx,%rcx
+	jnz	.Lschedule_am_decrypting
+
+
+	movdqu	%xmm0,(%rdx)
+	jmp	.Lschedule_go
+
+.Lschedule_am_decrypting:
+
+	movdqa	(%r8,%r10,1),%xmm1
+.byte	102,15,56,0,217
+	movdqu	%xmm3,(%rdx)
+	xorq	$0x30,%r8
+
+.Lschedule_go:
+	cmpl	$192,%esi
+	ja	.Lschedule_256
+	je	.Lschedule_192
+
+
+
+
+
+
+
+
+
+
+.Lschedule_128:
+	movl	$10,%esi
+
+.Loop_schedule_128:
+	call	_vpaes_schedule_round
+	decq	%rsi
+	jz	.Lschedule_mangle_last
+	call	_vpaes_schedule_mangle
+	jmp	.Loop_schedule_128
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.align	16
+.Lschedule_192:
+	movdqu	8(%rdi),%xmm0
+	call	_vpaes_schedule_transform
+	movdqa	%xmm0,%xmm6
+	pxor	%xmm4,%xmm4
+	movhlps	%xmm4,%xmm6
+	movl	$4,%esi
+
+.Loop_schedule_192:
+	call	_vpaes_schedule_round
+.byte	102,15,58,15,198,8
+	call	_vpaes_schedule_mangle
+	call	_vpaes_schedule_192_smear
+	call	_vpaes_schedule_mangle
+	call	_vpaes_schedule_round
+	decq	%rsi
+	jz	.Lschedule_mangle_last
+	call	_vpaes_schedule_mangle
+	call	_vpaes_schedule_192_smear
+	jmp	.Loop_schedule_192
+
+
+
+
+
+
+
+
+
+
+
+.align	16
+.Lschedule_256:
+	movdqu	16(%rdi),%xmm0
+	call	_vpaes_schedule_transform
+	movl	$7,%esi
+
+.Loop_schedule_256:
+	call	_vpaes_schedule_mangle
+	movdqa	%xmm0,%xmm6
+
+
+	call	_vpaes_schedule_round
+	decq	%rsi
+	jz	.Lschedule_mangle_last
+	call	_vpaes_schedule_mangle
+
+
+	pshufd	$0xFF,%xmm0,%xmm0
+	movdqa	%xmm7,%xmm5
+	movdqa	%xmm6,%xmm7
+	call	_vpaes_schedule_low_round
+	movdqa	%xmm5,%xmm7
+
+	jmp	.Loop_schedule_256
+
+
+
+
+
+
+
+
+
+
+
+
+.align	16
+.Lschedule_mangle_last:
+
+	leaq	.Lk_deskew(%rip),%r11
+	testq	%rcx,%rcx
+	jnz	.Lschedule_mangle_last_dec
+
+
+	movdqa	(%r8,%r10,1),%xmm1
+.byte	102,15,56,0,193
+	leaq	.Lk_opt(%rip),%r11
+	addq	$32,%rdx
+
+.Lschedule_mangle_last_dec:
+	addq	$-16,%rdx
+	pxor	.Lk_s63(%rip),%xmm0
+	call	_vpaes_schedule_transform
+	movdqu	%xmm0,(%rdx)
+
+
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	ret
+.cfi_endproc	
+.size	_vpaes_schedule_core,.-_vpaes_schedule_core
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type	_vpaes_schedule_192_smear,@function
+.align	16
+_vpaes_schedule_192_smear:
+.cfi_startproc	
+	pshufd	$0x80,%xmm6,%xmm1
+	pshufd	$0xFE,%xmm7,%xmm0
+	pxor	%xmm1,%xmm6
+	pxor	%xmm1,%xmm1
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm6,%xmm0
+	movhlps	%xmm1,%xmm6
+	ret
+.cfi_endproc	
+.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type	_vpaes_schedule_round,@function
+.align	16
+_vpaes_schedule_round:
+.cfi_startproc	
+
+	pxor	%xmm1,%xmm1
+.byte	102,65,15,58,15,200,15
+.byte	102,69,15,58,15,192,15
+	pxor	%xmm1,%xmm7
+
+
+	pshufd	$0xFF,%xmm0,%xmm0
+.byte	102,15,58,15,192,1
+
+
+
+
+_vpaes_schedule_low_round:
+
+	movdqa	%xmm7,%xmm1
+	pslldq	$4,%xmm7
+	pxor	%xmm1,%xmm7
+	movdqa	%xmm7,%xmm1
+	pslldq	$8,%xmm7
+	pxor	%xmm1,%xmm7
+	pxor	.Lk_s63(%rip),%xmm7
+
+
+	movdqa	%xmm9,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+	movdqa	%xmm11,%xmm2
+.byte	102,15,56,0,208
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm10,%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+	movdqa	%xmm10,%xmm4
+.byte	102,15,56,0,224
+	pxor	%xmm2,%xmm4
+	movdqa	%xmm10,%xmm2
+.byte	102,15,56,0,211
+	pxor	%xmm0,%xmm2
+	movdqa	%xmm10,%xmm3
+.byte	102,15,56,0,220
+	pxor	%xmm1,%xmm3
+	movdqa	%xmm13,%xmm4
+.byte	102,15,56,0,226
+	movdqa	%xmm12,%xmm0
+.byte	102,15,56,0,195
+	pxor	%xmm4,%xmm0
+
+
+	pxor	%xmm7,%xmm0
+	movdqa	%xmm0,%xmm7
+	ret
+.cfi_endproc	
+.size	_vpaes_schedule_round,.-_vpaes_schedule_round
+
+
+
+
+
+
+
+
+
+
+.type	_vpaes_schedule_transform,@function
+.align	16
+_vpaes_schedule_transform:
+.cfi_startproc	
+	movdqa	%xmm9,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+	movdqa	(%r11),%xmm2
+.byte	102,15,56,0,208
+	movdqa	16(%r11),%xmm0
+.byte	102,15,56,0,193
+	pxor	%xmm2,%xmm0
+	ret
+.cfi_endproc	
+.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type	_vpaes_schedule_mangle,@function
+.align	16
+_vpaes_schedule_mangle:
+.cfi_startproc	
+	movdqa	%xmm0,%xmm4
+	movdqa	.Lk_mc_forward(%rip),%xmm5
+	testq	%rcx,%rcx
+	jnz	.Lschedule_mangle_dec
+
+
+	addq	$16,%rdx
+	pxor	.Lk_s63(%rip),%xmm4
+.byte	102,15,56,0,229
+	movdqa	%xmm4,%xmm3
+.byte	102,15,56,0,229
+	pxor	%xmm4,%xmm3
+.byte	102,15,56,0,229
+	pxor	%xmm4,%xmm3
+
+	jmp	.Lschedule_mangle_both
+.align	16
+.Lschedule_mangle_dec:
+
+	leaq	.Lk_dksd(%rip),%r11
+	movdqa	%xmm9,%xmm1
+	pandn	%xmm4,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm4
+
+	movdqa	0(%r11),%xmm2
+.byte	102,15,56,0,212
+	movdqa	16(%r11),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,221
+
+	movdqa	32(%r11),%xmm2
+.byte	102,15,56,0,212
+	pxor	%xmm3,%xmm2
+	movdqa	48(%r11),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,221
+
+	movdqa	64(%r11),%xmm2
+.byte	102,15,56,0,212
+	pxor	%xmm3,%xmm2
+	movdqa	80(%r11),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,221
+
+	movdqa	96(%r11),%xmm2
+.byte	102,15,56,0,212
+	pxor	%xmm3,%xmm2
+	movdqa	112(%r11),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+
+	addq	$-16,%rdx
+
+.Lschedule_mangle_both:
+	movdqa	(%r8,%r10,1),%xmm1
+.byte	102,15,56,0,217
+	addq	$-16,%r8
+	andq	$0x30,%r8
+	movdqu	%xmm3,(%rdx)
+	ret
+.cfi_endproc	
+.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+
+
+
+.globl	vpaes_set_encrypt_key
+.hidden vpaes_set_encrypt_key
+.type	vpaes_set_encrypt_key,@function
+.align	16
+vpaes_set_encrypt_key:
+.cfi_startproc	
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+.extern	BORINGSSL_function_hit
+.hidden BORINGSSL_function_hit
+	movb	$1,BORINGSSL_function_hit+5(%rip)
+#endif
+
+	movl	%esi,%eax
+	shrl	$5,%eax
+	addl	$5,%eax
+	movl	%eax,240(%rdx)
+
+	movl	$0,%ecx
+	movl	$0x30,%r8d
+	call	_vpaes_schedule_core
+	xorl	%eax,%eax
+	ret
+.cfi_endproc	
+.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
+
+.globl	vpaes_set_decrypt_key
+.hidden vpaes_set_decrypt_key
+.type	vpaes_set_decrypt_key,@function
+.align	16
+vpaes_set_decrypt_key:
+.cfi_startproc	
+_CET_ENDBR
+	movl	%esi,%eax
+	shrl	$5,%eax
+	addl	$5,%eax
+	movl	%eax,240(%rdx)
+	shll	$4,%eax
+	leaq	16(%rdx,%rax,1),%rdx
+
+	movl	$1,%ecx
+	movl	%esi,%r8d
+	shrl	$1,%r8d
+	andl	$32,%r8d
+	xorl	$32,%r8d
+	call	_vpaes_schedule_core
+	xorl	%eax,%eax
+	ret
+.cfi_endproc	
+.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
+
+.globl	vpaes_encrypt
+.hidden vpaes_encrypt
+.type	vpaes_encrypt,@function
+.align	16
+vpaes_encrypt:
+.cfi_startproc	
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+.extern	BORINGSSL_function_hit
+.hidden BORINGSSL_function_hit
+	movb	$1,BORINGSSL_function_hit+4(%rip)
+#endif
+	movdqu	(%rdi),%xmm0
+	call	_vpaes_preheat
+	call	_vpaes_encrypt_core
+	movdqu	%xmm0,(%rsi)
+	ret
+.cfi_endproc	
+.size	vpaes_encrypt,.-vpaes_encrypt
+
+.globl	vpaes_decrypt
+.hidden vpaes_decrypt
+.type	vpaes_decrypt,@function
+.align	16
+vpaes_decrypt:
+.cfi_startproc	
+_CET_ENDBR
+	movdqu	(%rdi),%xmm0
+	call	_vpaes_preheat
+	call	_vpaes_decrypt_core
+	movdqu	%xmm0,(%rsi)
+	ret
+.cfi_endproc	
+.size	vpaes_decrypt,.-vpaes_decrypt
+.globl	vpaes_cbc_encrypt
+.hidden vpaes_cbc_encrypt
+.type	vpaes_cbc_encrypt,@function
+.align	16
+vpaes_cbc_encrypt:
+.cfi_startproc	
+_CET_ENDBR
+	xchgq	%rcx,%rdx
+	subq	$16,%rcx
+	jc	.Lcbc_abort
+	movdqu	(%r8),%xmm6
+	subq	%rdi,%rsi
+	call	_vpaes_preheat
+	cmpl	$0,%r9d
+	je	.Lcbc_dec_loop
+	jmp	.Lcbc_enc_loop
+.align	16
+.Lcbc_enc_loop:
+	movdqu	(%rdi),%xmm0
+	pxor	%xmm6,%xmm0
+	call	_vpaes_encrypt_core
+	movdqa	%xmm0,%xmm6
+	movdqu	%xmm0,(%rsi,%rdi,1)
+	leaq	16(%rdi),%rdi
+	subq	$16,%rcx
+	jnc	.Lcbc_enc_loop
+	jmp	.Lcbc_done
+.align	16
+.Lcbc_dec_loop:
+	movdqu	(%rdi),%xmm0
+	movdqa	%xmm0,%xmm7
+	call	_vpaes_decrypt_core
+	pxor	%xmm6,%xmm0
+	movdqa	%xmm7,%xmm6
+	movdqu	%xmm0,(%rsi,%rdi,1)
+	leaq	16(%rdi),%rdi
+	subq	$16,%rcx
+	jnc	.Lcbc_dec_loop
+.Lcbc_done:
+	movdqu	%xmm6,(%r8)
+.Lcbc_abort:
+	ret
+.cfi_endproc	
+.size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
+.globl	vpaes_ctr32_encrypt_blocks
+.hidden vpaes_ctr32_encrypt_blocks
+.type	vpaes_ctr32_encrypt_blocks,@function
+.align	16
+vpaes_ctr32_encrypt_blocks:
+.cfi_startproc	
+_CET_ENDBR
+
+	xchgq	%rcx,%rdx
+	testq	%rcx,%rcx
+	jz	.Lctr32_abort
+	movdqu	(%r8),%xmm0
+	movdqa	.Lctr_add_one(%rip),%xmm8
+	subq	%rdi,%rsi
+	call	_vpaes_preheat
+	movdqa	%xmm0,%xmm6
+	pshufb	.Lrev_ctr(%rip),%xmm6
+
+	testq	$1,%rcx
+	jz	.Lctr32_prep_loop
+
+
+
+	movdqu	(%rdi),%xmm7
+	call	_vpaes_encrypt_core
+	pxor	%xmm7,%xmm0
+	paddd	%xmm8,%xmm6
+	movdqu	%xmm0,(%rsi,%rdi,1)
+	subq	$1,%rcx
+	leaq	16(%rdi),%rdi
+	jz	.Lctr32_done
+
+.Lctr32_prep_loop:
+
+
+	movdqa	%xmm6,%xmm14
+	movdqa	%xmm6,%xmm15
+	paddd	%xmm8,%xmm15
+
+.Lctr32_loop:
+	movdqa	.Lrev_ctr(%rip),%xmm1
+	movdqa	%xmm14,%xmm0
+	movdqa	%xmm15,%xmm6
+.byte	102,15,56,0,193
+.byte	102,15,56,0,241
+	call	_vpaes_encrypt_core_2x
+	movdqu	(%rdi),%xmm1
+	movdqu	16(%rdi),%xmm2
+	movdqa	.Lctr_add_two(%rip),%xmm3
+	pxor	%xmm1,%xmm0
+	pxor	%xmm2,%xmm6
+	paddd	%xmm3,%xmm14
+	paddd	%xmm3,%xmm15
+	movdqu	%xmm0,(%rsi,%rdi,1)
+	movdqu	%xmm6,16(%rsi,%rdi,1)
+	subq	$2,%rcx
+	leaq	32(%rdi),%rdi
+	jnz	.Lctr32_loop
+
+.Lctr32_done:
+.Lctr32_abort:
+	ret
+.cfi_endproc	
+.size	vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
+
+
+
+
+
+
+.type	_vpaes_preheat,@function
+.align	16
+_vpaes_preheat:
+.cfi_startproc	
+	leaq	.Lk_s0F(%rip),%r10
+	movdqa	-32(%r10),%xmm10
+	movdqa	-16(%r10),%xmm11
+	movdqa	0(%r10),%xmm9
+	movdqa	48(%r10),%xmm13
+	movdqa	64(%r10),%xmm12
+	movdqa	80(%r10),%xmm15
+	movdqa	96(%r10),%xmm14
+	ret
+.cfi_endproc	
+.size	_vpaes_preheat,.-_vpaes_preheat
+
+
+
+
+
+.type	_vpaes_consts,@object
+.section	.rodata
+.align	64
+_vpaes_consts:
+.Lk_inv:
+.quad	0x0E05060F0D080180, 0x040703090A0B0C02
+.quad	0x01040A060F0B0780, 0x030D0E0C02050809
+
+.Lk_s0F:
+.quad	0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
+
+.Lk_ipt:
+.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+
+.Lk_sb1:
+.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.Lk_sb2:
+.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
+.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
+.Lk_sbo:
+.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+
+.Lk_mc_forward:
+.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad	0x080B0A0904070605, 0x000302010C0F0E0D
+.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad	0x000302010C0F0E0D, 0x080B0A0904070605
+
+.Lk_mc_backward:
+.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad	0x020100030E0D0C0F, 0x0A09080B06050407
+.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad	0x0A09080B06050407, 0x020100030E0D0C0F
+
+.Lk_sr:
+.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad	0x030E09040F0A0500, 0x0B06010C07020D08
+.quad	0x0F060D040B020900, 0x070E050C030A0108
+.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
+
+.Lk_rcon:
+.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_s63:
+.quad	0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
+
+.Lk_opt:
+.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+
+.Lk_deskew:
+.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+
+
+
+
+.Lk_dksd:
+.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb:
+.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse:
+.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9:
+.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+
+
+
+
+.Lk_dipt:
+.quad	0x0F505B040B545F00, 0x154A411E114E451A
+.quad	0x86E383E660056500, 0x12771772F491F194
+
+.Lk_dsb9:
+.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd:
+.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb:
+.quad	0xD022649296B44200, 0x602646F6B0F2D404
+.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe:
+.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+.Lk_dsbo:
+.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+
+
+.Lrev_ctr:
+.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
+
+
+.Lctr_add_one:
+.quad	0x0000000000000000, 0x0000000100000000
+.Lctr_add_two:
+.quad	0x0000000000000000, 0x0000000200000000
+
+.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align	64
+.size	_vpaes_consts,.-_vpaes_consts
+.text	
+#endif
diff --git a/gen/bcm/vpaes-x86_64-win.asm b/gen/bcm/vpaes-x86_64-win.asm
new file mode 100644
index 0000000..ddbfb12
--- /dev/null
+++ b/gen/bcm/vpaes-x86_64-win.asm
@@ -0,0 +1,1487 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section	.text code align=64
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN	16
+_vpaes_encrypt_core:
+
+	mov	r9,rdx
+	mov	r11,16
+	mov	eax,DWORD[240+rdx]
+	movdqa	xmm1,xmm9
+	movdqa	xmm2,XMMWORD[$L$k_ipt]
+	pandn	xmm1,xmm0
+	movdqu	xmm5,XMMWORD[r9]
+	psrld	xmm1,4
+	pand	xmm0,xmm9
+DB	102,15,56,0,208
+	movdqa	xmm0,XMMWORD[(($L$k_ipt+16))]
+DB	102,15,56,0,193
+	pxor	xmm2,xmm5
+	add	r9,16
+	pxor	xmm0,xmm2
+	lea	r10,[$L$k_mc_backward]
+	jmp	NEAR $L$enc_entry
+
+ALIGN	16
+$L$enc_loop:
+
+	movdqa	xmm4,xmm13
+	movdqa	xmm0,xmm12
+DB	102,15,56,0,226
+DB	102,15,56,0,195
+	pxor	xmm4,xmm5
+	movdqa	xmm5,xmm15
+	pxor	xmm0,xmm4
+	movdqa	xmm1,XMMWORD[((-64))+r10*1+r11]
+DB	102,15,56,0,234
+	movdqa	xmm4,XMMWORD[r10*1+r11]
+	movdqa	xmm2,xmm14
+DB	102,15,56,0,211
+	movdqa	xmm3,xmm0
+	pxor	xmm2,xmm5
+DB	102,15,56,0,193
+	add	r9,16
+	pxor	xmm0,xmm2
+DB	102,15,56,0,220
+	add	r11,16
+	pxor	xmm3,xmm0
+DB	102,15,56,0,193
+	and	r11,0x30
+	sub	rax,1
+	pxor	xmm0,xmm3
+
+$L$enc_entry:
+
+	movdqa	xmm1,xmm9
+	movdqa	xmm5,xmm11
+	pandn	xmm1,xmm0
+	psrld	xmm1,4
+	pand	xmm0,xmm9
+DB	102,15,56,0,232
+	movdqa	xmm3,xmm10
+	pxor	xmm0,xmm1
+DB	102,15,56,0,217
+	movdqa	xmm4,xmm10
+	pxor	xmm3,xmm5
+DB	102,15,56,0,224
+	movdqa	xmm2,xmm10
+	pxor	xmm4,xmm5
+DB	102,15,56,0,211
+	movdqa	xmm3,xmm10
+	pxor	xmm2,xmm0
+DB	102,15,56,0,220
+	movdqu	xmm5,XMMWORD[r9]
+	pxor	xmm3,xmm1
+	jnz	NEAR $L$enc_loop
+
+
+	movdqa	xmm4,XMMWORD[((-96))+r10]
+	movdqa	xmm0,XMMWORD[((-80))+r10]
+DB	102,15,56,0,226
+	pxor	xmm4,xmm5
+DB	102,15,56,0,195
+	movdqa	xmm1,XMMWORD[64+r10*1+r11]
+	pxor	xmm0,xmm4
+DB	102,15,56,0,193
+	ret
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN	16
+_vpaes_encrypt_core_2x:
+
+	mov	r9,rdx
+	mov	r11,16
+	mov	eax,DWORD[240+rdx]
+	movdqa	xmm1,xmm9
+	movdqa	xmm7,xmm9
+	movdqa	xmm2,XMMWORD[$L$k_ipt]
+	movdqa	xmm8,xmm2
+	pandn	xmm1,xmm0
+	pandn	xmm7,xmm6
+	movdqu	xmm5,XMMWORD[r9]
+
+	psrld	xmm1,4
+	psrld	xmm7,4
+	pand	xmm0,xmm9
+	pand	xmm6,xmm9
+DB	102,15,56,0,208
+DB	102,68,15,56,0,198
+	movdqa	xmm0,XMMWORD[(($L$k_ipt+16))]
+	movdqa	xmm6,xmm0
+DB	102,15,56,0,193
+DB	102,15,56,0,247
+	pxor	xmm2,xmm5
+	pxor	xmm8,xmm5
+	add	r9,16
+	pxor	xmm0,xmm2
+	pxor	xmm6,xmm8
+	lea	r10,[$L$k_mc_backward]
+	jmp	NEAR $L$enc2x_entry
+
+ALIGN	16
+$L$enc2x_loop:
+
+	movdqa	xmm4,XMMWORD[$L$k_sb1]
+	movdqa	xmm0,XMMWORD[(($L$k_sb1+16))]
+	movdqa	xmm12,xmm4
+	movdqa	xmm6,xmm0
+DB	102,15,56,0,226
+DB	102,69,15,56,0,224
+DB	102,15,56,0,195
+DB	102,65,15,56,0,243
+	pxor	xmm4,xmm5
+	pxor	xmm12,xmm5
+	movdqa	xmm5,XMMWORD[$L$k_sb2]
+	movdqa	xmm13,xmm5
+	pxor	xmm0,xmm4
+	pxor	xmm6,xmm12
+	movdqa	xmm1,XMMWORD[((-64))+r10*1+r11]
+
+DB	102,15,56,0,234
+DB	102,69,15,56,0,232
+	movdqa	xmm4,XMMWORD[r10*1+r11]
+
+	movdqa	xmm2,XMMWORD[(($L$k_sb2+16))]
+	movdqa	xmm8,xmm2
+DB	102,15,56,0,211
+DB	102,69,15,56,0,195
+	movdqa	xmm3,xmm0
+	movdqa	xmm11,xmm6
+	pxor	xmm2,xmm5
+	pxor	xmm8,xmm13
+DB	102,15,56,0,193
+DB	102,15,56,0,241
+	add	r9,16
+	pxor	xmm0,xmm2
+	pxor	xmm6,xmm8
+DB	102,15,56,0,220
+DB	102,68,15,56,0,220
+	add	r11,16
+	pxor	xmm3,xmm0
+	pxor	xmm11,xmm6
+DB	102,15,56,0,193
+DB	102,15,56,0,241
+	and	r11,0x30
+	sub	rax,1
+	pxor	xmm0,xmm3
+	pxor	xmm6,xmm11
+
+$L$enc2x_entry:
+
+	movdqa	xmm1,xmm9
+	movdqa	xmm7,xmm9
+	movdqa	xmm5,XMMWORD[(($L$k_inv+16))]
+	movdqa	xmm13,xmm5
+	pandn	xmm1,xmm0
+	pandn	xmm7,xmm6
+	psrld	xmm1,4
+	psrld	xmm7,4
+	pand	xmm0,xmm9
+	pand	xmm6,xmm9
+DB	102,15,56,0,232
+DB	102,68,15,56,0,238
+	movdqa	xmm3,xmm10
+	movdqa	xmm11,xmm10
+	pxor	xmm0,xmm1
+	pxor	xmm6,xmm7
+DB	102,15,56,0,217
+DB	102,68,15,56,0,223
+	movdqa	xmm4,xmm10
+	movdqa	xmm12,xmm10
+	pxor	xmm3,xmm5
+	pxor	xmm11,xmm13
+DB	102,15,56,0,224
+DB	102,68,15,56,0,230
+	movdqa	xmm2,xmm10
+	movdqa	xmm8,xmm10
+	pxor	xmm4,xmm5
+	pxor	xmm12,xmm13
+DB	102,15,56,0,211
+DB	102,69,15,56,0,195
+	movdqa	xmm3,xmm10
+	movdqa	xmm11,xmm10
+	pxor	xmm2,xmm0
+	pxor	xmm8,xmm6
+DB	102,15,56,0,220
+DB	102,69,15,56,0,220
+	movdqu	xmm5,XMMWORD[r9]
+
+	pxor	xmm3,xmm1
+	pxor	xmm11,xmm7
+	jnz	NEAR $L$enc2x_loop
+
+
+	movdqa	xmm4,XMMWORD[((-96))+r10]
+	movdqa	xmm0,XMMWORD[((-80))+r10]
+	movdqa	xmm12,xmm4
+	movdqa	xmm6,xmm0
+DB	102,15,56,0,226
+DB	102,69,15,56,0,224
+	pxor	xmm4,xmm5
+	pxor	xmm12,xmm5
+DB	102,15,56,0,195
+DB	102,65,15,56,0,243
+	movdqa	xmm1,XMMWORD[64+r10*1+r11]
+
+	pxor	xmm0,xmm4
+	pxor	xmm6,xmm12
+DB	102,15,56,0,193
+DB	102,15,56,0,241
+	ret
+
+
+
+
+
+
+
+
+
+ALIGN	16
+_vpaes_decrypt_core:
+
+	mov	r9,rdx
+	mov	eax,DWORD[240+rdx]
+	movdqa	xmm1,xmm9
+	movdqa	xmm2,XMMWORD[$L$k_dipt]
+	pandn	xmm1,xmm0
+	mov	r11,rax
+	psrld	xmm1,4
+	movdqu	xmm5,XMMWORD[r9]
+	shl	r11,4
+	pand	xmm0,xmm9
+DB	102,15,56,0,208
+	movdqa	xmm0,XMMWORD[(($L$k_dipt+16))]
+	xor	r11,0x30
+	lea	r10,[$L$k_dsbd]
+DB	102,15,56,0,193
+	and	r11,0x30
+	pxor	xmm2,xmm5
+	movdqa	xmm5,XMMWORD[(($L$k_mc_forward+48))]
+	pxor	xmm0,xmm2
+	add	r9,16
+	add	r11,r10
+	jmp	NEAR $L$dec_entry
+
+ALIGN	16
+$L$dec_loop:
+
+
+
+	movdqa	xmm4,XMMWORD[((-32))+r10]
+	movdqa	xmm1,XMMWORD[((-16))+r10]
+DB	102,15,56,0,226
+DB	102,15,56,0,203
+	pxor	xmm0,xmm4
+	movdqa	xmm4,XMMWORD[r10]
+	pxor	xmm0,xmm1
+	movdqa	xmm1,XMMWORD[16+r10]
+
+DB	102,15,56,0,226
+DB	102,15,56,0,197
+DB	102,15,56,0,203
+	pxor	xmm0,xmm4
+	movdqa	xmm4,XMMWORD[32+r10]
+	pxor	xmm0,xmm1
+	movdqa	xmm1,XMMWORD[48+r10]
+
+DB	102,15,56,0,226
+DB	102,15,56,0,197
+DB	102,15,56,0,203
+	pxor	xmm0,xmm4
+	movdqa	xmm4,XMMWORD[64+r10]
+	pxor	xmm0,xmm1
+	movdqa	xmm1,XMMWORD[80+r10]
+
+DB	102,15,56,0,226
+DB	102,15,56,0,197
+DB	102,15,56,0,203
+	pxor	xmm0,xmm4
+	add	r9,16
+DB	102,15,58,15,237,12
+	pxor	xmm0,xmm1
+	sub	rax,1
+
+$L$dec_entry:
+
+	movdqa	xmm1,xmm9
+	pandn	xmm1,xmm0
+	movdqa	xmm2,xmm11
+	psrld	xmm1,4
+	pand	xmm0,xmm9
+DB	102,15,56,0,208
+	movdqa	xmm3,xmm10
+	pxor	xmm0,xmm1
+DB	102,15,56,0,217
+	movdqa	xmm4,xmm10
+	pxor	xmm3,xmm2
+DB	102,15,56,0,224
+	pxor	xmm4,xmm2
+	movdqa	xmm2,xmm10
+DB	102,15,56,0,211
+	movdqa	xmm3,xmm10
+	pxor	xmm2,xmm0
+DB	102,15,56,0,220
+	movdqu	xmm0,XMMWORD[r9]
+	pxor	xmm3,xmm1
+	jnz	NEAR $L$dec_loop
+
+
+	movdqa	xmm4,XMMWORD[96+r10]
+DB	102,15,56,0,226
+	pxor	xmm4,xmm0
+	movdqa	xmm0,XMMWORD[112+r10]
+	movdqa	xmm2,XMMWORD[((-352))+r11]
+DB	102,15,56,0,195
+	pxor	xmm0,xmm4
+DB	102,15,56,0,194
+	ret
+
+
+
+
+
+
+
+
+
+ALIGN	16
+_vpaes_schedule_core:
+
+
+
+
+
+
+	call	_vpaes_preheat
+	movdqa	xmm8,XMMWORD[$L$k_rcon]
+	movdqu	xmm0,XMMWORD[rdi]
+
+
+	movdqa	xmm3,xmm0
+	lea	r11,[$L$k_ipt]
+	call	_vpaes_schedule_transform
+	movdqa	xmm7,xmm0
+
+	lea	r10,[$L$k_sr]
+	test	rcx,rcx
+	jnz	NEAR $L$schedule_am_decrypting
+
+
+	movdqu	XMMWORD[rdx],xmm0
+	jmp	NEAR $L$schedule_go
+
+$L$schedule_am_decrypting:
+
+	movdqa	xmm1,XMMWORD[r10*1+r8]
+DB	102,15,56,0,217
+	movdqu	XMMWORD[rdx],xmm3
+	xor	r8,0x30
+
+$L$schedule_go:
+	cmp	esi,192
+	ja	NEAR $L$schedule_256
+	je	NEAR $L$schedule_192
+
+
+
+
+
+
+
+
+
+
+$L$schedule_128:
+	mov	esi,10
+
+$L$oop_schedule_128:
+	call	_vpaes_schedule_round
+	dec	rsi
+	jz	NEAR $L$schedule_mangle_last
+	call	_vpaes_schedule_mangle
+	jmp	NEAR $L$oop_schedule_128
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN	16
+$L$schedule_192:
+	movdqu	xmm0,XMMWORD[8+rdi]
+	call	_vpaes_schedule_transform
+	movdqa	xmm6,xmm0
+	pxor	xmm4,xmm4
+	movhlps	xmm6,xmm4
+	mov	esi,4
+
+$L$oop_schedule_192:
+	call	_vpaes_schedule_round
+DB	102,15,58,15,198,8
+	call	_vpaes_schedule_mangle
+	call	_vpaes_schedule_192_smear
+	call	_vpaes_schedule_mangle
+	call	_vpaes_schedule_round
+	dec	rsi
+	jz	NEAR $L$schedule_mangle_last
+	call	_vpaes_schedule_mangle
+	call	_vpaes_schedule_192_smear
+	jmp	NEAR $L$oop_schedule_192
+
+
+
+
+
+
+
+
+
+
+
+ALIGN	16
+$L$schedule_256:
+	movdqu	xmm0,XMMWORD[16+rdi]
+	call	_vpaes_schedule_transform
+	mov	esi,7
+
+$L$oop_schedule_256:
+	call	_vpaes_schedule_mangle
+	movdqa	xmm6,xmm0
+
+
+	call	_vpaes_schedule_round
+	dec	rsi
+	jz	NEAR $L$schedule_mangle_last
+	call	_vpaes_schedule_mangle
+
+
+	pshufd	xmm0,xmm0,0xFF
+	movdqa	xmm5,xmm7
+	movdqa	xmm7,xmm6
+	call	_vpaes_schedule_low_round
+	movdqa	xmm7,xmm5
+
+	jmp	NEAR $L$oop_schedule_256
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN	16
+$L$schedule_mangle_last:
+
+	lea	r11,[$L$k_deskew]
+	test	rcx,rcx
+	jnz	NEAR $L$schedule_mangle_last_dec
+
+
+	movdqa	xmm1,XMMWORD[r10*1+r8]
+DB	102,15,56,0,193
+	lea	r11,[$L$k_opt]
+	add	rdx,32
+
+$L$schedule_mangle_last_dec:
+	add	rdx,-16
+	pxor	xmm0,XMMWORD[$L$k_s63]
+	call	_vpaes_schedule_transform
+	movdqu	XMMWORD[rdx],xmm0
+
+
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	pxor	xmm6,xmm6
+	pxor	xmm7,xmm7
+	ret
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN	16
+_vpaes_schedule_192_smear:
+
+	pshufd	xmm1,xmm6,0x80
+	pshufd	xmm0,xmm7,0xFE
+	pxor	xmm6,xmm1
+	pxor	xmm1,xmm1
+	pxor	xmm6,xmm0
+	movdqa	xmm0,xmm6
+	movhlps	xmm6,xmm1
+	ret
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN	16
+_vpaes_schedule_round:
+
+
+	pxor	xmm1,xmm1
+DB	102,65,15,58,15,200,15
+DB	102,69,15,58,15,192,15
+	pxor	xmm7,xmm1
+
+
+	pshufd	xmm0,xmm0,0xFF
+DB	102,15,58,15,192,1
+
+
+
+
+_vpaes_schedule_low_round:
+
+	movdqa	xmm1,xmm7
+	pslldq	xmm7,4
+	pxor	xmm7,xmm1
+	movdqa	xmm1,xmm7
+	pslldq	xmm7,8
+	pxor	xmm7,xmm1
+	pxor	xmm7,XMMWORD[$L$k_s63]
+
+
+	movdqa	xmm1,xmm9
+	pandn	xmm1,xmm0
+	psrld	xmm1,4
+	pand	xmm0,xmm9
+	movdqa	xmm2,xmm11
+DB	102,15,56,0,208
+	pxor	xmm0,xmm1
+	movdqa	xmm3,xmm10
+DB	102,15,56,0,217
+	pxor	xmm3,xmm2
+	movdqa	xmm4,xmm10
+DB	102,15,56,0,224
+	pxor	xmm4,xmm2
+	movdqa	xmm2,xmm10
+DB	102,15,56,0,211
+	pxor	xmm2,xmm0
+	movdqa	xmm3,xmm10
+DB	102,15,56,0,220
+	pxor	xmm3,xmm1
+	movdqa	xmm4,xmm13
+DB	102,15,56,0,226
+	movdqa	xmm0,xmm12
+DB	102,15,56,0,195
+	pxor	xmm0,xmm4
+
+
+	pxor	xmm0,xmm7
+	movdqa	xmm7,xmm0
+	ret
+
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN	16
+_vpaes_schedule_transform:
+
+	movdqa	xmm1,xmm9
+	pandn	xmm1,xmm0
+	psrld	xmm1,4
+	pand	xmm0,xmm9
+	movdqa	xmm2,XMMWORD[r11]
+DB	102,15,56,0,208
+	movdqa	xmm0,XMMWORD[16+r11]
+DB	102,15,56,0,193
+	pxor	xmm0,xmm2
+	ret
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN	16
+_vpaes_schedule_mangle:
+
+	movdqa	xmm4,xmm0
+	movdqa	xmm5,XMMWORD[$L$k_mc_forward]
+	test	rcx,rcx
+	jnz	NEAR $L$schedule_mangle_dec
+
+
+	add	rdx,16
+	pxor	xmm4,XMMWORD[$L$k_s63]
+DB	102,15,56,0,229
+	movdqa	xmm3,xmm4
+DB	102,15,56,0,229
+	pxor	xmm3,xmm4
+DB	102,15,56,0,229
+	pxor	xmm3,xmm4
+
+	jmp	NEAR $L$schedule_mangle_both
+ALIGN	16
+$L$schedule_mangle_dec:
+
+	lea	r11,[$L$k_dksd]
+	movdqa	xmm1,xmm9
+	pandn	xmm1,xmm4
+	psrld	xmm1,4
+	pand	xmm4,xmm9
+
+	movdqa	xmm2,XMMWORD[r11]
+DB	102,15,56,0,212
+	movdqa	xmm3,XMMWORD[16+r11]
+DB	102,15,56,0,217
+	pxor	xmm3,xmm2
+DB	102,15,56,0,221
+
+	movdqa	xmm2,XMMWORD[32+r11]
+DB	102,15,56,0,212
+	pxor	xmm2,xmm3
+	movdqa	xmm3,XMMWORD[48+r11]
+DB	102,15,56,0,217
+	pxor	xmm3,xmm2
+DB	102,15,56,0,221
+
+	movdqa	xmm2,XMMWORD[64+r11]
+DB	102,15,56,0,212
+	pxor	xmm2,xmm3
+	movdqa	xmm3,XMMWORD[80+r11]
+DB	102,15,56,0,217
+	pxor	xmm3,xmm2
+DB	102,15,56,0,221
+
+	movdqa	xmm2,XMMWORD[96+r11]
+DB	102,15,56,0,212
+	pxor	xmm2,xmm3
+	movdqa	xmm3,XMMWORD[112+r11]
+DB	102,15,56,0,217
+	pxor	xmm3,xmm2
+
+	add	rdx,-16
+
+$L$schedule_mangle_both:
+	movdqa	xmm1,XMMWORD[r10*1+r8]
+DB	102,15,56,0,217
+	add	r8,-16
+	and	r8,0x30
+	movdqu	XMMWORD[rdx],xmm3
+	ret
+
+
+
+
+
+
+global	vpaes_set_encrypt_key
+
+ALIGN	16
+vpaes_set_encrypt_key:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_vpaes_set_encrypt_key:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+_CET_ENDBR
+%ifdef BORINGSSL_DISPATCH_TEST
+EXTERN	BORINGSSL_function_hit
+	mov	BYTE[((BORINGSSL_function_hit+5))],1
+%endif
+
+	lea	rsp,[((-184))+rsp]
+	movaps	XMMWORD[16+rsp],xmm6
+	movaps	XMMWORD[32+rsp],xmm7
+	movaps	XMMWORD[48+rsp],xmm8
+	movaps	XMMWORD[64+rsp],xmm9
+	movaps	XMMWORD[80+rsp],xmm10
+	movaps	XMMWORD[96+rsp],xmm11
+	movaps	XMMWORD[112+rsp],xmm12
+	movaps	XMMWORD[128+rsp],xmm13
+	movaps	XMMWORD[144+rsp],xmm14
+	movaps	XMMWORD[160+rsp],xmm15
+$L$enc_key_body:
+	mov	eax,esi
+	shr	eax,5
+	add	eax,5
+	mov	DWORD[240+rdx],eax
+
+	mov	ecx,0
+	mov	r8d,0x30
+	call	_vpaes_schedule_core
+	movaps	xmm6,XMMWORD[16+rsp]
+	movaps	xmm7,XMMWORD[32+rsp]
+	movaps	xmm8,XMMWORD[48+rsp]
+	movaps	xmm9,XMMWORD[64+rsp]
+	movaps	xmm10,XMMWORD[80+rsp]
+	movaps	xmm11,XMMWORD[96+rsp]
+	movaps	xmm12,XMMWORD[112+rsp]
+	movaps	xmm13,XMMWORD[128+rsp]
+	movaps	xmm14,XMMWORD[144+rsp]
+	movaps	xmm15,XMMWORD[160+rsp]
+	lea	rsp,[184+rsp]
+$L$enc_key_epilogue:
+	xor	eax,eax
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_vpaes_set_encrypt_key:
+
+global	vpaes_set_decrypt_key
+
+ALIGN	16
+vpaes_set_decrypt_key:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_vpaes_set_decrypt_key:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+_CET_ENDBR
+	lea	rsp,[((-184))+rsp]
+	movaps	XMMWORD[16+rsp],xmm6
+	movaps	XMMWORD[32+rsp],xmm7
+	movaps	XMMWORD[48+rsp],xmm8
+	movaps	XMMWORD[64+rsp],xmm9
+	movaps	XMMWORD[80+rsp],xmm10
+	movaps	XMMWORD[96+rsp],xmm11
+	movaps	XMMWORD[112+rsp],xmm12
+	movaps	XMMWORD[128+rsp],xmm13
+	movaps	XMMWORD[144+rsp],xmm14
+	movaps	XMMWORD[160+rsp],xmm15
+$L$dec_key_body:
+	mov	eax,esi
+	shr	eax,5
+	add	eax,5
+	mov	DWORD[240+rdx],eax
+	shl	eax,4
+	lea	rdx,[16+rax*1+rdx]
+
+	mov	ecx,1
+	mov	r8d,esi
+	shr	r8d,1
+	and	r8d,32
+	xor	r8d,32
+	call	_vpaes_schedule_core
+	movaps	xmm6,XMMWORD[16+rsp]
+	movaps	xmm7,XMMWORD[32+rsp]
+	movaps	xmm8,XMMWORD[48+rsp]
+	movaps	xmm9,XMMWORD[64+rsp]
+	movaps	xmm10,XMMWORD[80+rsp]
+	movaps	xmm11,XMMWORD[96+rsp]
+	movaps	xmm12,XMMWORD[112+rsp]
+	movaps	xmm13,XMMWORD[128+rsp]
+	movaps	xmm14,XMMWORD[144+rsp]
+	movaps	xmm15,XMMWORD[160+rsp]
+	lea	rsp,[184+rsp]
+$L$dec_key_epilogue:
+	xor	eax,eax
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_vpaes_set_decrypt_key:
+
+global	vpaes_encrypt
+
+ALIGN	16
+vpaes_encrypt:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_vpaes_encrypt:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+_CET_ENDBR
+%ifdef BORINGSSL_DISPATCH_TEST
+EXTERN	BORINGSSL_function_hit
+	mov	BYTE[((BORINGSSL_function_hit+4))],1
+%endif
+	lea	rsp,[((-184))+rsp]
+	movaps	XMMWORD[16+rsp],xmm6
+	movaps	XMMWORD[32+rsp],xmm7
+	movaps	XMMWORD[48+rsp],xmm8
+	movaps	XMMWORD[64+rsp],xmm9
+	movaps	XMMWORD[80+rsp],xmm10
+	movaps	XMMWORD[96+rsp],xmm11
+	movaps	XMMWORD[112+rsp],xmm12
+	movaps	XMMWORD[128+rsp],xmm13
+	movaps	XMMWORD[144+rsp],xmm14
+	movaps	XMMWORD[160+rsp],xmm15
+$L$enc_body:
+	movdqu	xmm0,XMMWORD[rdi]
+	call	_vpaes_preheat
+	call	_vpaes_encrypt_core
+	movdqu	XMMWORD[rsi],xmm0
+	movaps	xmm6,XMMWORD[16+rsp]
+	movaps	xmm7,XMMWORD[32+rsp]
+	movaps	xmm8,XMMWORD[48+rsp]
+	movaps	xmm9,XMMWORD[64+rsp]
+	movaps	xmm10,XMMWORD[80+rsp]
+	movaps	xmm11,XMMWORD[96+rsp]
+	movaps	xmm12,XMMWORD[112+rsp]
+	movaps	xmm13,XMMWORD[128+rsp]
+	movaps	xmm14,XMMWORD[144+rsp]
+	movaps	xmm15,XMMWORD[160+rsp]
+	lea	rsp,[184+rsp]
+$L$enc_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_vpaes_encrypt:
+
+global	vpaes_decrypt
+
+ALIGN	16
+vpaes_decrypt:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_vpaes_decrypt:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+_CET_ENDBR
+	lea	rsp,[((-184))+rsp]
+	movaps	XMMWORD[16+rsp],xmm6
+	movaps	XMMWORD[32+rsp],xmm7
+	movaps	XMMWORD[48+rsp],xmm8
+	movaps	XMMWORD[64+rsp],xmm9
+	movaps	XMMWORD[80+rsp],xmm10
+	movaps	XMMWORD[96+rsp],xmm11
+	movaps	XMMWORD[112+rsp],xmm12
+	movaps	XMMWORD[128+rsp],xmm13
+	movaps	XMMWORD[144+rsp],xmm14
+	movaps	XMMWORD[160+rsp],xmm15
+$L$dec_body:
+	movdqu	xmm0,XMMWORD[rdi]
+	call	_vpaes_preheat
+	call	_vpaes_decrypt_core
+	movdqu	XMMWORD[rsi],xmm0
+	movaps	xmm6,XMMWORD[16+rsp]
+	movaps	xmm7,XMMWORD[32+rsp]
+	movaps	xmm8,XMMWORD[48+rsp]
+	movaps	xmm9,XMMWORD[64+rsp]
+	movaps	xmm10,XMMWORD[80+rsp]
+	movaps	xmm11,XMMWORD[96+rsp]
+	movaps	xmm12,XMMWORD[112+rsp]
+	movaps	xmm13,XMMWORD[128+rsp]
+	movaps	xmm14,XMMWORD[144+rsp]
+	movaps	xmm15,XMMWORD[160+rsp]
+	lea	rsp,[184+rsp]
+$L$dec_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_vpaes_decrypt:
+global	vpaes_cbc_encrypt
+
+ALIGN	16
+vpaes_cbc_encrypt:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_vpaes_cbc_encrypt:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+_CET_ENDBR
+	xchg	rdx,rcx
+	sub	rcx,16
+	jc	NEAR $L$cbc_abort
+	lea	rsp,[((-184))+rsp]
+	movaps	XMMWORD[16+rsp],xmm6
+	movaps	XMMWORD[32+rsp],xmm7
+	movaps	XMMWORD[48+rsp],xmm8
+	movaps	XMMWORD[64+rsp],xmm9
+	movaps	XMMWORD[80+rsp],xmm10
+	movaps	XMMWORD[96+rsp],xmm11
+	movaps	XMMWORD[112+rsp],xmm12
+	movaps	XMMWORD[128+rsp],xmm13
+	movaps	XMMWORD[144+rsp],xmm14
+	movaps	XMMWORD[160+rsp],xmm15
+$L$cbc_body:
+	movdqu	xmm6,XMMWORD[r8]
+	sub	rsi,rdi
+	call	_vpaes_preheat
+	cmp	r9d,0
+	je	NEAR $L$cbc_dec_loop
+	jmp	NEAR $L$cbc_enc_loop
+ALIGN	16
+$L$cbc_enc_loop:
+	movdqu	xmm0,XMMWORD[rdi]
+	pxor	xmm0,xmm6
+	call	_vpaes_encrypt_core
+	movdqa	xmm6,xmm0
+	movdqu	XMMWORD[rdi*1+rsi],xmm0
+	lea	rdi,[16+rdi]
+	sub	rcx,16
+	jnc	NEAR $L$cbc_enc_loop
+	jmp	NEAR $L$cbc_done
+ALIGN	16
+$L$cbc_dec_loop:
+	movdqu	xmm0,XMMWORD[rdi]
+	movdqa	xmm7,xmm0
+	call	_vpaes_decrypt_core
+	pxor	xmm0,xmm6
+	movdqa	xmm6,xmm7
+	movdqu	XMMWORD[rdi*1+rsi],xmm0
+	lea	rdi,[16+rdi]
+	sub	rcx,16
+	jnc	NEAR $L$cbc_dec_loop
+$L$cbc_done:
+	movdqu	XMMWORD[r8],xmm6
+	movaps	xmm6,XMMWORD[16+rsp]
+	movaps	xmm7,XMMWORD[32+rsp]
+	movaps	xmm8,XMMWORD[48+rsp]
+	movaps	xmm9,XMMWORD[64+rsp]
+	movaps	xmm10,XMMWORD[80+rsp]
+	movaps	xmm11,XMMWORD[96+rsp]
+	movaps	xmm12,XMMWORD[112+rsp]
+	movaps	xmm13,XMMWORD[128+rsp]
+	movaps	xmm14,XMMWORD[144+rsp]
+	movaps	xmm15,XMMWORD[160+rsp]
+	lea	rsp,[184+rsp]
+$L$cbc_epilogue:
+$L$cbc_abort:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_vpaes_cbc_encrypt:
+global	vpaes_ctr32_encrypt_blocks
+
+ALIGN	16
+vpaes_ctr32_encrypt_blocks:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_vpaes_ctr32_encrypt_blocks:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+
+
+
+_CET_ENDBR
+
+	xchg	rdx,rcx
+	test	rcx,rcx
+	jz	NEAR $L$ctr32_abort
+	lea	rsp,[((-184))+rsp]
+	movaps	XMMWORD[16+rsp],xmm6
+	movaps	XMMWORD[32+rsp],xmm7
+	movaps	XMMWORD[48+rsp],xmm8
+	movaps	XMMWORD[64+rsp],xmm9
+	movaps	XMMWORD[80+rsp],xmm10
+	movaps	XMMWORD[96+rsp],xmm11
+	movaps	XMMWORD[112+rsp],xmm12
+	movaps	XMMWORD[128+rsp],xmm13
+	movaps	XMMWORD[144+rsp],xmm14
+	movaps	XMMWORD[160+rsp],xmm15
+$L$ctr32_body:
+	movdqu	xmm0,XMMWORD[r8]
+	movdqa	xmm8,XMMWORD[$L$ctr_add_one]
+	sub	rsi,rdi
+	call	_vpaes_preheat
+	movdqa	xmm6,xmm0
+	pshufb	xmm6,XMMWORD[$L$rev_ctr]
+
+	test	rcx,1
+	jz	NEAR $L$ctr32_prep_loop
+
+
+
+	movdqu	xmm7,XMMWORD[rdi]
+	call	_vpaes_encrypt_core
+	pxor	xmm0,xmm7
+	paddd	xmm6,xmm8
+	movdqu	XMMWORD[rdi*1+rsi],xmm0
+	sub	rcx,1
+	lea	rdi,[16+rdi]
+	jz	NEAR $L$ctr32_done
+
+$L$ctr32_prep_loop:
+
+
+	movdqa	xmm14,xmm6
+	movdqa	xmm15,xmm6
+	paddd	xmm15,xmm8
+
+$L$ctr32_loop:
+	movdqa	xmm1,XMMWORD[$L$rev_ctr]
+	movdqa	xmm0,xmm14
+	movdqa	xmm6,xmm15
+DB	102,15,56,0,193
+DB	102,15,56,0,241
+	call	_vpaes_encrypt_core_2x
+	movdqu	xmm1,XMMWORD[rdi]
+	movdqu	xmm2,XMMWORD[16+rdi]
+	movdqa	xmm3,XMMWORD[$L$ctr_add_two]
+	pxor	xmm0,xmm1
+	pxor	xmm6,xmm2
+	paddd	xmm14,xmm3
+	paddd	xmm15,xmm3
+	movdqu	XMMWORD[rdi*1+rsi],xmm0
+	movdqu	XMMWORD[16+rdi*1+rsi],xmm6
+	sub	rcx,2
+	lea	rdi,[32+rdi]
+	jnz	NEAR $L$ctr32_loop
+
+$L$ctr32_done:
+	movaps	xmm6,XMMWORD[16+rsp]
+	movaps	xmm7,XMMWORD[32+rsp]
+	movaps	xmm8,XMMWORD[48+rsp]
+	movaps	xmm9,XMMWORD[64+rsp]
+	movaps	xmm10,XMMWORD[80+rsp]
+	movaps	xmm11,XMMWORD[96+rsp]
+	movaps	xmm12,XMMWORD[112+rsp]
+	movaps	xmm13,XMMWORD[128+rsp]
+	movaps	xmm14,XMMWORD[144+rsp]
+	movaps	xmm15,XMMWORD[160+rsp]
+	lea	rsp,[184+rsp]
+$L$ctr32_epilogue:
+$L$ctr32_abort:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_vpaes_ctr32_encrypt_blocks:
+
+
+
+
+
+
+
+ALIGN	16
+_vpaes_preheat:
+
+	lea	r10,[$L$k_s0F]
+	movdqa	xmm10,XMMWORD[((-32))+r10]
+	movdqa	xmm11,XMMWORD[((-16))+r10]
+	movdqa	xmm9,XMMWORD[r10]
+	movdqa	xmm13,XMMWORD[48+r10]
+	movdqa	xmm12,XMMWORD[64+r10]
+	movdqa	xmm15,XMMWORD[80+r10]
+	movdqa	xmm14,XMMWORD[96+r10]
+	ret
+
+
+
+
+
+
+
+
+section	.rdata rdata align=8
+ALIGN	64
+_vpaes_consts:
+$L$k_inv:
+	DQ	0x0E05060F0D080180,0x040703090A0B0C02
+	DQ	0x01040A060F0B0780,0x030D0E0C02050809
+
+$L$k_s0F:
+	DQ	0x0F0F0F0F0F0F0F0F,0x0F0F0F0F0F0F0F0F
+
+$L$k_ipt:
+	DQ	0xC2B2E8985A2A7000,0xCABAE09052227808
+	DQ	0x4C01307D317C4D00,0xCD80B1FCB0FDCC81
+
+$L$k_sb1:
+	DQ	0xB19BE18FCB503E00,0xA5DF7A6E142AF544
+	DQ	0x3618D415FAE22300,0x3BF7CCC10D2ED9EF
+$L$k_sb2:
+	DQ	0xE27A93C60B712400,0x5EB7E955BC982FCD
+	DQ	0x69EB88400AE12900,0xC2A163C8AB82234A
+$L$k_sbo:
+	DQ	0xD0D26D176FBDC700,0x15AABF7AC502A878
+	DQ	0xCFE474A55FBB6A00,0x8E1E90D1412B35FA
+
+$L$k_mc_forward:
+	DQ	0x0407060500030201,0x0C0F0E0D080B0A09
+	DQ	0x080B0A0904070605,0x000302010C0F0E0D
+	DQ	0x0C0F0E0D080B0A09,0x0407060500030201
+	DQ	0x000302010C0F0E0D,0x080B0A0904070605
+
+$L$k_mc_backward:
+	DQ	0x0605040702010003,0x0E0D0C0F0A09080B
+	DQ	0x020100030E0D0C0F,0x0A09080B06050407
+	DQ	0x0E0D0C0F0A09080B,0x0605040702010003
+	DQ	0x0A09080B06050407,0x020100030E0D0C0F
+
+$L$k_sr:
+	DQ	0x0706050403020100,0x0F0E0D0C0B0A0908
+	DQ	0x030E09040F0A0500,0x0B06010C07020D08
+	DQ	0x0F060D040B020900,0x070E050C030A0108
+	DQ	0x0B0E0104070A0D00,0x0306090C0F020508
+
+$L$k_rcon:
+	DQ	0x1F8391B9AF9DEEB6,0x702A98084D7C7D81
+
+$L$k_s63:
+	DQ	0x5B5B5B5B5B5B5B5B,0x5B5B5B5B5B5B5B5B
+
+$L$k_opt:
+	DQ	0xFF9F4929D6B66000,0xF7974121DEBE6808
+	DQ	0x01EDBD5150BCEC00,0xE10D5DB1B05C0CE0
+
+$L$k_deskew:
+	DQ	0x07E4A34047A4E300,0x1DFEB95A5DBEF91A
+	DQ	0x5F36B5DC83EA6900,0x2841C2ABF49D1E77
+
+
+
+
+
+$L$k_dksd:
+	DQ	0xFEB91A5DA3E44700,0x0740E3A45A1DBEF9
+	DQ	0x41C277F4B5368300,0x5FDC69EAAB289D1E
+$L$k_dksb:
+	DQ	0x9A4FCA1F8550D500,0x03D653861CC94C99
+	DQ	0x115BEDA7B6FC4A00,0xD993256F7E3482C8
+$L$k_dkse:
+	DQ	0xD5031CCA1FC9D600,0x53859A4C994F5086
+	DQ	0xA23196054FDC7BE8,0xCD5EF96A20B31487
+$L$k_dks9:
+	DQ	0xB6116FC87ED9A700,0x4AED933482255BFC
+	DQ	0x4576516227143300,0x8BB89FACE9DAFDCE
+
+
+
+
+
+$L$k_dipt:
+	DQ	0x0F505B040B545F00,0x154A411E114E451A
+	DQ	0x86E383E660056500,0x12771772F491F194
+
+$L$k_dsb9:
+	DQ	0x851C03539A86D600,0xCAD51F504F994CC9
+	DQ	0xC03B1789ECD74900,0x725E2C9EB2FBA565
+$L$k_dsbd:
+	DQ	0x7D57CCDFE6B1A200,0xF56E9B13882A4439
+	DQ	0x3CE2FAF724C6CB00,0x2931180D15DEEFD3
+$L$k_dsbb:
+	DQ	0xD022649296B44200,0x602646F6B0F2D404
+	DQ	0xC19498A6CD596700,0xF3FF0C3E3255AA6B
+$L$k_dsbe:
+	DQ	0x46F2929626D4D000,0x2242600464B4F6B0
+	DQ	0x0C55A6CDFFAAC100,0x9467F36B98593E32
+$L$k_dsbo:
+	DQ	0x1387EA537EF94000,0xC7AA6DB9D4943E2D
+	DQ	0x12D7560F93441D00,0xCA4B8159D8C58E9C
+
+
+$L$rev_ctr:
+	DQ	0x0706050403020100,0x0c0d0e0f0b0a0908
+
+
+$L$ctr_add_one:
+	DQ	0x0000000000000000,0x0000000100000000
+$L$ctr_add_two:
+	DQ	0x0000000000000000,0x0000000200000000
+
+	DB	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105
+	DB	111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54
+	DB	52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97
+	DB	109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32
+	DB	85,110,105,118,101,114,115,105,116,121,41,0
+ALIGN	64
+
+section	.text
+
+EXTERN	__imp_RtlVirtualUnwind
+
+ALIGN	16
+se_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$in_prologue
+
+	mov	rax,QWORD[152+r8]
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$in_prologue
+
+	lea	rsi,[16+rax]
+	lea	rdi,[512+r8]
+	mov	ecx,20
+	DD	0xa548f3fc
+	lea	rax,[184+rax]
+
+$L$in_prologue:
+	mov	rdi,QWORD[8+rax]
+	mov	rsi,QWORD[16+rax]
+	mov	QWORD[152+r8],rax
+	mov	QWORD[168+r8],rsi
+	mov	QWORD[176+r8],rdi
+
+	mov	rdi,QWORD[40+r9]
+	mov	rsi,r8
+	mov	ecx,154
+	DD	0xa548f3fc
+
+	mov	rsi,r9
+	xor	rcx,rcx
+	mov	rdx,QWORD[8+rsi]
+	mov	r8,QWORD[rsi]
+	mov	r9,QWORD[16+rsi]
+	mov	r10,QWORD[40+rsi]
+	lea	r11,[56+rsi]
+	lea	r12,[24+rsi]
+	mov	QWORD[32+rsp],r10
+	mov	QWORD[40+rsp],r11
+	mov	QWORD[48+rsp],r12
+	mov	QWORD[56+rsp],rcx
+	call	QWORD[__imp_RtlVirtualUnwind]
+
+	mov	eax,1
+	add	rsp,64
+	popfq
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rbp
+	pop	rbx
+	pop	rdi
+	pop	rsi
+	ret
+
+
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_vpaes_set_encrypt_key wrt ..imagebase
+	DD	$L$SEH_end_vpaes_set_encrypt_key wrt ..imagebase
+	DD	$L$SEH_info_vpaes_set_encrypt_key wrt ..imagebase
+
+	DD	$L$SEH_begin_vpaes_set_decrypt_key wrt ..imagebase
+	DD	$L$SEH_end_vpaes_set_decrypt_key wrt ..imagebase
+	DD	$L$SEH_info_vpaes_set_decrypt_key wrt ..imagebase
+
+	DD	$L$SEH_begin_vpaes_encrypt wrt ..imagebase
+	DD	$L$SEH_end_vpaes_encrypt wrt ..imagebase
+	DD	$L$SEH_info_vpaes_encrypt wrt ..imagebase
+
+	DD	$L$SEH_begin_vpaes_decrypt wrt ..imagebase
+	DD	$L$SEH_end_vpaes_decrypt wrt ..imagebase
+	DD	$L$SEH_info_vpaes_decrypt wrt ..imagebase
+
+	DD	$L$SEH_begin_vpaes_cbc_encrypt wrt ..imagebase
+	DD	$L$SEH_end_vpaes_cbc_encrypt wrt ..imagebase
+	DD	$L$SEH_info_vpaes_cbc_encrypt wrt ..imagebase
+
+	DD	$L$SEH_begin_vpaes_ctr32_encrypt_blocks wrt ..imagebase
+	DD	$L$SEH_end_vpaes_ctr32_encrypt_blocks wrt ..imagebase
+	DD	$L$SEH_info_vpaes_ctr32_encrypt_blocks wrt ..imagebase
+
+section	.xdata rdata align=8
+ALIGN	8
+$L$SEH_info_vpaes_set_encrypt_key:
+	DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+	DD	$L$enc_key_body wrt ..imagebase,$L$enc_key_epilogue wrt ..imagebase
+$L$SEH_info_vpaes_set_decrypt_key:
+	DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+	DD	$L$dec_key_body wrt ..imagebase,$L$dec_key_epilogue wrt ..imagebase
+$L$SEH_info_vpaes_encrypt:
+	DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+	DD	$L$enc_body wrt ..imagebase,$L$enc_epilogue wrt ..imagebase
+$L$SEH_info_vpaes_decrypt:
+	DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+	DD	$L$dec_body wrt ..imagebase,$L$dec_epilogue wrt ..imagebase
+$L$SEH_info_vpaes_cbc_encrypt:
+	DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+	DD	$L$cbc_body wrt ..imagebase,$L$cbc_epilogue wrt ..imagebase
+$L$SEH_info_vpaes_ctr32_encrypt_blocks:
+	DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+	DD	$L$ctr32_body wrt ..imagebase,$L$ctr32_epilogue wrt ..imagebase
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/x86-mont-apple.S b/gen/bcm/x86-mont-apple.S
new file mode 100644
index 0000000..f991f6c
--- /dev/null
+++ b/gen/bcm/x86-mont-apple.S
@@ -0,0 +1,484 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
+.text
+.globl	_bn_mul_mont
+.private_extern	_bn_mul_mont
+.align	4
+_bn_mul_mont:
+L_bn_mul_mont_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	xorl	%eax,%eax
+	movl	40(%esp),%edi
+	cmpl	$4,%edi
+	jl	L000just_leave
+	leal	20(%esp),%esi
+	leal	24(%esp),%edx
+	addl	$2,%edi
+	negl	%edi
+	leal	-32(%esp,%edi,4),%ebp
+	negl	%edi
+	movl	%ebp,%eax
+	subl	%edx,%eax
+	andl	$2047,%eax
+	subl	%eax,%ebp
+	xorl	%ebp,%edx
+	andl	$2048,%edx
+	xorl	$2048,%edx
+	subl	%edx,%ebp
+	andl	$-64,%ebp
+	movl	%esp,%eax
+	subl	%ebp,%eax
+	andl	$-4096,%eax
+	movl	%esp,%edx
+	leal	(%ebp,%eax,1),%esp
+	movl	(%esp),%eax
+	cmpl	%ebp,%esp
+	ja	L001page_walk
+	jmp	L002page_walk_done
+.align	4,0x90
+L001page_walk:
+	leal	-4096(%esp),%esp
+	movl	(%esp),%eax
+	cmpl	%ebp,%esp
+	ja	L001page_walk
+L002page_walk_done:
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	8(%esi),%ecx
+	movl	12(%esi),%ebp
+	movl	16(%esi),%esi
+	movl	(%esi),%esi
+	movl	%eax,4(%esp)
+	movl	%ebx,8(%esp)
+	movl	%ecx,12(%esp)
+	movl	%ebp,16(%esp)
+	movl	%esi,20(%esp)
+	leal	-3(%edi),%ebx
+	movl	%edx,24(%esp)
+	call	L003PIC_me_up
+L003PIC_me_up:
+	popl	%eax
+	movl	L_OPENSSL_ia32cap_P$non_lazy_ptr-L003PIC_me_up(%eax),%eax
+	btl	$26,(%eax)
+	jnc	L004non_sse2
+	movl	$-1,%eax
+	movd	%eax,%mm7
+	movl	8(%esp),%esi
+	movl	12(%esp),%edi
+	movl	16(%esp),%ebp
+	xorl	%edx,%edx
+	xorl	%ecx,%ecx
+	movd	(%edi),%mm4
+	movd	(%esi),%mm5
+	movd	(%ebp),%mm3
+	pmuludq	%mm4,%mm5
+	movq	%mm5,%mm2
+	movq	%mm5,%mm0
+	pand	%mm7,%mm0
+	pmuludq	20(%esp),%mm5
+	pmuludq	%mm5,%mm3
+	paddq	%mm0,%mm3
+	movd	4(%ebp),%mm1
+	movd	4(%esi),%mm0
+	psrlq	$32,%mm2
+	psrlq	$32,%mm3
+	incl	%ecx
+.align	4,0x90
+L0051st:
+	pmuludq	%mm4,%mm0
+	pmuludq	%mm5,%mm1
+	paddq	%mm0,%mm2
+	paddq	%mm1,%mm3
+	movq	%mm2,%mm0
+	pand	%mm7,%mm0
+	movd	4(%ebp,%ecx,4),%mm1
+	paddq	%mm0,%mm3
+	movd	4(%esi,%ecx,4),%mm0
+	psrlq	$32,%mm2
+	movd	%mm3,28(%esp,%ecx,4)
+	psrlq	$32,%mm3
+	leal	1(%ecx),%ecx
+	cmpl	%ebx,%ecx
+	jl	L0051st
+	pmuludq	%mm4,%mm0
+	pmuludq	%mm5,%mm1
+	paddq	%mm0,%mm2
+	paddq	%mm1,%mm3
+	movq	%mm2,%mm0
+	pand	%mm7,%mm0
+	paddq	%mm0,%mm3
+	movd	%mm3,28(%esp,%ecx,4)
+	psrlq	$32,%mm2
+	psrlq	$32,%mm3
+	paddq	%mm2,%mm3
+	movq	%mm3,32(%esp,%ebx,4)
+	incl	%edx
+L006outer:
+	xorl	%ecx,%ecx
+	movd	(%edi,%edx,4),%mm4
+	movd	(%esi),%mm5
+	movd	32(%esp),%mm6
+	movd	(%ebp),%mm3
+	pmuludq	%mm4,%mm5
+	paddq	%mm6,%mm5
+	movq	%mm5,%mm0
+	movq	%mm5,%mm2
+	pand	%mm7,%mm0
+	pmuludq	20(%esp),%mm5
+	pmuludq	%mm5,%mm3
+	paddq	%mm0,%mm3
+	movd	36(%esp),%mm6
+	movd	4(%ebp),%mm1
+	movd	4(%esi),%mm0
+	psrlq	$32,%mm2
+	psrlq	$32,%mm3
+	paddq	%mm6,%mm2
+	incl	%ecx
+	decl	%ebx
+L007inner:
+	pmuludq	%mm4,%mm0
+	pmuludq	%mm5,%mm1
+	paddq	%mm0,%mm2
+	paddq	%mm1,%mm3
+	movq	%mm2,%mm0
+	movd	36(%esp,%ecx,4),%mm6
+	pand	%mm7,%mm0
+	movd	4(%ebp,%ecx,4),%mm1
+	paddq	%mm0,%mm3
+	movd	4(%esi,%ecx,4),%mm0
+	psrlq	$32,%mm2
+	movd	%mm3,28(%esp,%ecx,4)
+	psrlq	$32,%mm3
+	paddq	%mm6,%mm2
+	decl	%ebx
+	leal	1(%ecx),%ecx
+	jnz	L007inner
+	movl	%ecx,%ebx
+	pmuludq	%mm4,%mm0
+	pmuludq	%mm5,%mm1
+	paddq	%mm0,%mm2
+	paddq	%mm1,%mm3
+	movq	%mm2,%mm0
+	pand	%mm7,%mm0
+	paddq	%mm0,%mm3
+	movd	%mm3,28(%esp,%ecx,4)
+	psrlq	$32,%mm2
+	psrlq	$32,%mm3
+	movd	36(%esp,%ebx,4),%mm6
+	paddq	%mm2,%mm3
+	paddq	%mm6,%mm3
+	movq	%mm3,32(%esp,%ebx,4)
+	leal	1(%edx),%edx
+	cmpl	%ebx,%edx
+	jle	L006outer
+	emms
+	jmp	L008common_tail
+.align	4,0x90
+L004non_sse2:
+	movl	8(%esp),%esi
+	leal	1(%ebx),%ebp
+	movl	12(%esp),%edi
+	xorl	%ecx,%ecx
+	movl	%esi,%edx
+	andl	$1,%ebp
+	subl	%edi,%edx
+	leal	4(%edi,%ebx,4),%eax
+	orl	%edx,%ebp
+	movl	(%edi),%edi
+	jz	L009bn_sqr_mont
+	movl	%eax,28(%esp)
+	movl	(%esi),%eax
+	xorl	%edx,%edx
+.align	4,0x90
+L010mull:
+	movl	%edx,%ebp
+	mull	%edi
+	addl	%eax,%ebp
+	leal	1(%ecx),%ecx
+	adcl	$0,%edx
+	movl	(%esi,%ecx,4),%eax
+	cmpl	%ebx,%ecx
+	movl	%ebp,28(%esp,%ecx,4)
+	jl	L010mull
+	movl	%edx,%ebp
+	mull	%edi
+	movl	20(%esp),%edi
+	addl	%ebp,%eax
+	movl	16(%esp),%esi
+	adcl	$0,%edx
+	imull	32(%esp),%edi
+	movl	%eax,32(%esp,%ebx,4)
+	xorl	%ecx,%ecx
+	movl	%edx,36(%esp,%ebx,4)
+	movl	%ecx,40(%esp,%ebx,4)
+	movl	(%esi),%eax
+	mull	%edi
+	addl	32(%esp),%eax
+	movl	4(%esi),%eax
+	adcl	$0,%edx
+	incl	%ecx
+	jmp	L0112ndmadd
+.align	4,0x90
+L0121stmadd:
+	movl	%edx,%ebp
+	mull	%edi
+	addl	32(%esp,%ecx,4),%ebp
+	leal	1(%ecx),%ecx
+	adcl	$0,%edx
+	addl	%eax,%ebp
+	movl	(%esi,%ecx,4),%eax
+	adcl	$0,%edx
+	cmpl	%ebx,%ecx
+	movl	%ebp,28(%esp,%ecx,4)
+	jl	L0121stmadd
+	movl	%edx,%ebp
+	mull	%edi
+	addl	32(%esp,%ebx,4),%eax
+	movl	20(%esp),%edi
+	adcl	$0,%edx
+	movl	16(%esp),%esi
+	addl	%eax,%ebp
+	adcl	$0,%edx
+	imull	32(%esp),%edi
+	xorl	%ecx,%ecx
+	addl	36(%esp,%ebx,4),%edx
+	movl	%ebp,32(%esp,%ebx,4)
+	adcl	$0,%ecx
+	movl	(%esi),%eax
+	movl	%edx,36(%esp,%ebx,4)
+	movl	%ecx,40(%esp,%ebx,4)
+	mull	%edi
+	addl	32(%esp),%eax
+	movl	4(%esi),%eax
+	adcl	$0,%edx
+	movl	$1,%ecx
+.align	4,0x90
+L0112ndmadd:
+	movl	%edx,%ebp
+	mull	%edi
+	addl	32(%esp,%ecx,4),%ebp
+	leal	1(%ecx),%ecx
+	adcl	$0,%edx
+	addl	%eax,%ebp
+	movl	(%esi,%ecx,4),%eax
+	adcl	$0,%edx
+	cmpl	%ebx,%ecx
+	movl	%ebp,24(%esp,%ecx,4)
+	jl	L0112ndmadd
+	movl	%edx,%ebp
+	mull	%edi
+	addl	32(%esp,%ebx,4),%ebp
+	adcl	$0,%edx
+	addl	%eax,%ebp
+	adcl	$0,%edx
+	movl	%ebp,28(%esp,%ebx,4)
+	xorl	%eax,%eax
+	movl	12(%esp),%ecx
+	addl	36(%esp,%ebx,4),%edx
+	adcl	40(%esp,%ebx,4),%eax
+	leal	4(%ecx),%ecx
+	movl	%edx,32(%esp,%ebx,4)
+	cmpl	28(%esp),%ecx
+	movl	%eax,36(%esp,%ebx,4)
+	je	L008common_tail
+	movl	(%ecx),%edi
+	movl	8(%esp),%esi
+	movl	%ecx,12(%esp)
+	xorl	%ecx,%ecx
+	xorl	%edx,%edx
+	movl	(%esi),%eax
+	jmp	L0121stmadd
+.align	4,0x90
+L009bn_sqr_mont:
+	movl	%ebx,(%esp)
+	movl	%ecx,12(%esp)
+	movl	%edi,%eax
+	mull	%edi
+	movl	%eax,32(%esp)
+	movl	%edx,%ebx
+	shrl	$1,%edx
+	andl	$1,%ebx
+	incl	%ecx
+.align	4,0x90
+L013sqr:
+	movl	(%esi,%ecx,4),%eax
+	movl	%edx,%ebp
+	mull	%edi
+	addl	%ebp,%eax
+	leal	1(%ecx),%ecx
+	adcl	$0,%edx
+	leal	(%ebx,%eax,2),%ebp
+	shrl	$31,%eax
+	cmpl	(%esp),%ecx
+	movl	%eax,%ebx
+	movl	%ebp,28(%esp,%ecx,4)
+	jl	L013sqr
+	movl	(%esi,%ecx,4),%eax
+	movl	%edx,%ebp
+	mull	%edi
+	addl	%ebp,%eax
+	movl	20(%esp),%edi
+	adcl	$0,%edx
+	movl	16(%esp),%esi
+	leal	(%ebx,%eax,2),%ebp
+	imull	32(%esp),%edi
+	shrl	$31,%eax
+	movl	%ebp,32(%esp,%ecx,4)
+	leal	(%eax,%edx,2),%ebp
+	movl	(%esi),%eax
+	shrl	$31,%edx
+	movl	%ebp,36(%esp,%ecx,4)
+	movl	%edx,40(%esp,%ecx,4)
+	mull	%edi
+	addl	32(%esp),%eax
+	movl	%ecx,%ebx
+	adcl	$0,%edx
+	movl	4(%esi),%eax
+	movl	$1,%ecx
+.align	4,0x90
+L0143rdmadd:
+	movl	%edx,%ebp
+	mull	%edi
+	addl	32(%esp,%ecx,4),%ebp
+	adcl	$0,%edx
+	addl	%eax,%ebp
+	movl	4(%esi,%ecx,4),%eax
+	adcl	$0,%edx
+	movl	%ebp,28(%esp,%ecx,4)
+	movl	%edx,%ebp
+	mull	%edi
+	addl	36(%esp,%ecx,4),%ebp
+	leal	2(%ecx),%ecx
+	adcl	$0,%edx
+	addl	%eax,%ebp
+	movl	(%esi,%ecx,4),%eax
+	adcl	$0,%edx
+	cmpl	%ebx,%ecx
+	movl	%ebp,24(%esp,%ecx,4)
+	jl	L0143rdmadd
+	movl	%edx,%ebp
+	mull	%edi
+	addl	32(%esp,%ebx,4),%ebp
+	adcl	$0,%edx
+	addl	%eax,%ebp
+	adcl	$0,%edx
+	movl	%ebp,28(%esp,%ebx,4)
+	movl	12(%esp),%ecx
+	xorl	%eax,%eax
+	movl	8(%esp),%esi
+	addl	36(%esp,%ebx,4),%edx
+	adcl	40(%esp,%ebx,4),%eax
+	movl	%edx,32(%esp,%ebx,4)
+	cmpl	%ebx,%ecx
+	movl	%eax,36(%esp,%ebx,4)
+	je	L008common_tail
+	movl	4(%esi,%ecx,4),%edi
+	leal	1(%ecx),%ecx
+	movl	%edi,%eax
+	movl	%ecx,12(%esp)
+	mull	%edi
+	addl	32(%esp,%ecx,4),%eax
+	adcl	$0,%edx
+	movl	%eax,32(%esp,%ecx,4)
+	xorl	%ebp,%ebp
+	cmpl	%ebx,%ecx
+	leal	1(%ecx),%ecx
+	je	L015sqrlast
+	movl	%edx,%ebx
+	shrl	$1,%edx
+	andl	$1,%ebx
+.align	4,0x90
+L016sqradd:
+	movl	(%esi,%ecx,4),%eax
+	movl	%edx,%ebp
+	mull	%edi
+	addl	%ebp,%eax
+	leal	(%eax,%eax,1),%ebp
+	adcl	$0,%edx
+	shrl	$31,%eax
+	addl	32(%esp,%ecx,4),%ebp
+	leal	1(%ecx),%ecx
+	adcl	$0,%eax
+	addl	%ebx,%ebp
+	adcl	$0,%eax
+	cmpl	(%esp),%ecx
+	movl	%ebp,28(%esp,%ecx,4)
+	movl	%eax,%ebx
+	jle	L016sqradd
+	movl	%edx,%ebp
+	addl	%edx,%edx
+	shrl	$31,%ebp
+	addl	%ebx,%edx
+	adcl	$0,%ebp
+L015sqrlast:
+	movl	20(%esp),%edi
+	movl	16(%esp),%esi
+	imull	32(%esp),%edi
+	addl	32(%esp,%ecx,4),%edx
+	movl	(%esi),%eax
+	adcl	$0,%ebp
+	movl	%edx,32(%esp,%ecx,4)
+	movl	%ebp,36(%esp,%ecx,4)
+	mull	%edi
+	addl	32(%esp),%eax
+	leal	-1(%ecx),%ebx
+	adcl	$0,%edx
+	movl	$1,%ecx
+	movl	4(%esi),%eax
+	jmp	L0143rdmadd
+.align	4,0x90
+L008common_tail:
+	movl	16(%esp),%ebp
+	movl	4(%esp),%edi
+	leal	32(%esp),%esi
+	movl	(%esi),%eax
+	movl	%ebx,%ecx
+	xorl	%edx,%edx
+.align	4,0x90
+L017sub:
+	sbbl	(%ebp,%edx,4),%eax
+	movl	%eax,(%edi,%edx,4)
+	decl	%ecx
+	movl	4(%esi,%edx,4),%eax
+	leal	1(%edx),%edx
+	jge	L017sub
+	sbbl	$0,%eax
+	movl	$-1,%edx
+	xorl	%eax,%edx
+	jmp	L018copy
+.align	4,0x90
+L018copy:
+	movl	32(%esp,%ebx,4),%esi
+	movl	(%edi,%ebx,4),%ebp
+	movl	%ecx,32(%esp,%ebx,4)
+	andl	%eax,%esi
+	andl	%edx,%ebp
+	orl	%esi,%ebp
+	movl	%ebp,(%edi,%ebx,4)
+	decl	%ebx
+	jge	L018copy
+	movl	24(%esp),%esp
+	movl	$1,%eax
+L000just_leave:
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
+.byte	112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
+.byte	54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
+.byte	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
+.byte	111,114,103,62,0
+.section __IMPORT,__pointers,non_lazy_symbol_pointers
+L_OPENSSL_ia32cap_P$non_lazy_ptr:
+.indirect_symbol	_OPENSSL_ia32cap_P
+.long	0
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/bcm/x86-mont-linux.S b/gen/bcm/x86-mont-linux.S
new file mode 100644
index 0000000..e6b4ef5
--- /dev/null
+++ b/gen/bcm/x86-mont-linux.S
@@ -0,0 +1,482 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
+.text
+.globl	bn_mul_mont
+.hidden	bn_mul_mont
+.type	bn_mul_mont,@function
+.align	16
+bn_mul_mont:
+.L_bn_mul_mont_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	xorl	%eax,%eax
+	movl	40(%esp),%edi
+	cmpl	$4,%edi
+	jl	.L000just_leave
+	leal	20(%esp),%esi
+	leal	24(%esp),%edx
+	addl	$2,%edi
+	negl	%edi
+	leal	-32(%esp,%edi,4),%ebp
+	negl	%edi
+	movl	%ebp,%eax
+	subl	%edx,%eax
+	andl	$2047,%eax
+	subl	%eax,%ebp
+	xorl	%ebp,%edx
+	andl	$2048,%edx
+	xorl	$2048,%edx
+	subl	%edx,%ebp
+	andl	$-64,%ebp
+	movl	%esp,%eax
+	subl	%ebp,%eax
+	andl	$-4096,%eax
+	movl	%esp,%edx
+	leal	(%ebp,%eax,1),%esp
+	movl	(%esp),%eax
+	cmpl	%ebp,%esp
+	ja	.L001page_walk
+	jmp	.L002page_walk_done
+.align	16
+.L001page_walk:
+	leal	-4096(%esp),%esp
+	movl	(%esp),%eax
+	cmpl	%ebp,%esp
+	ja	.L001page_walk
+.L002page_walk_done:
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	8(%esi),%ecx
+	movl	12(%esi),%ebp
+	movl	16(%esi),%esi
+	movl	(%esi),%esi
+	movl	%eax,4(%esp)
+	movl	%ebx,8(%esp)
+	movl	%ecx,12(%esp)
+	movl	%ebp,16(%esp)
+	movl	%esi,20(%esp)
+	leal	-3(%edi),%ebx
+	movl	%edx,24(%esp)
+	call	.L003PIC_me_up
+.L003PIC_me_up:
+	popl	%eax
+	leal	OPENSSL_ia32cap_P-.L003PIC_me_up(%eax),%eax
+	btl	$26,(%eax)
+	jnc	.L004non_sse2
+	movl	$-1,%eax
+	movd	%eax,%mm7
+	movl	8(%esp),%esi
+	movl	12(%esp),%edi
+	movl	16(%esp),%ebp
+	xorl	%edx,%edx
+	xorl	%ecx,%ecx
+	movd	(%edi),%mm4
+	movd	(%esi),%mm5
+	movd	(%ebp),%mm3
+	pmuludq	%mm4,%mm5
+	movq	%mm5,%mm2
+	movq	%mm5,%mm0
+	pand	%mm7,%mm0
+	pmuludq	20(%esp),%mm5
+	pmuludq	%mm5,%mm3
+	paddq	%mm0,%mm3
+	movd	4(%ebp),%mm1
+	movd	4(%esi),%mm0
+	psrlq	$32,%mm2
+	psrlq	$32,%mm3
+	incl	%ecx
+.align	16
+.L0051st:
+	pmuludq	%mm4,%mm0
+	pmuludq	%mm5,%mm1
+	paddq	%mm0,%mm2
+	paddq	%mm1,%mm3
+	movq	%mm2,%mm0
+	pand	%mm7,%mm0
+	movd	4(%ebp,%ecx,4),%mm1
+	paddq	%mm0,%mm3
+	movd	4(%esi,%ecx,4),%mm0
+	psrlq	$32,%mm2
+	movd	%mm3,28(%esp,%ecx,4)
+	psrlq	$32,%mm3
+	leal	1(%ecx),%ecx
+	cmpl	%ebx,%ecx
+	jl	.L0051st
+	pmuludq	%mm4,%mm0
+	pmuludq	%mm5,%mm1
+	paddq	%mm0,%mm2
+	paddq	%mm1,%mm3
+	movq	%mm2,%mm0
+	pand	%mm7,%mm0
+	paddq	%mm0,%mm3
+	movd	%mm3,28(%esp,%ecx,4)
+	psrlq	$32,%mm2
+	psrlq	$32,%mm3
+	paddq	%mm2,%mm3
+	movq	%mm3,32(%esp,%ebx,4)
+	incl	%edx
+.L006outer:
+	xorl	%ecx,%ecx
+	movd	(%edi,%edx,4),%mm4
+	movd	(%esi),%mm5
+	movd	32(%esp),%mm6
+	movd	(%ebp),%mm3
+	pmuludq	%mm4,%mm5
+	paddq	%mm6,%mm5
+	movq	%mm5,%mm0
+	movq	%mm5,%mm2
+	pand	%mm7,%mm0
+	pmuludq	20(%esp),%mm5
+	pmuludq	%mm5,%mm3
+	paddq	%mm0,%mm3
+	movd	36(%esp),%mm6
+	movd	4(%ebp),%mm1
+	movd	4(%esi),%mm0
+	psrlq	$32,%mm2
+	psrlq	$32,%mm3
+	paddq	%mm6,%mm2
+	incl	%ecx
+	decl	%ebx
+.L007inner:
+	pmuludq	%mm4,%mm0
+	pmuludq	%mm5,%mm1
+	paddq	%mm0,%mm2
+	paddq	%mm1,%mm3
+	movq	%mm2,%mm0
+	movd	36(%esp,%ecx,4),%mm6
+	pand	%mm7,%mm0
+	movd	4(%ebp,%ecx,4),%mm1
+	paddq	%mm0,%mm3
+	movd	4(%esi,%ecx,4),%mm0
+	psrlq	$32,%mm2
+	movd	%mm3,28(%esp,%ecx,4)
+	psrlq	$32,%mm3
+	paddq	%mm6,%mm2
+	decl	%ebx
+	leal	1(%ecx),%ecx
+	jnz	.L007inner
+	movl	%ecx,%ebx
+	pmuludq	%mm4,%mm0
+	pmuludq	%mm5,%mm1
+	paddq	%mm0,%mm2
+	paddq	%mm1,%mm3
+	movq	%mm2,%mm0
+	pand	%mm7,%mm0
+	paddq	%mm0,%mm3
+	movd	%mm3,28(%esp,%ecx,4)
+	psrlq	$32,%mm2
+	psrlq	$32,%mm3
+	movd	36(%esp,%ebx,4),%mm6
+	paddq	%mm2,%mm3
+	paddq	%mm6,%mm3
+	movq	%mm3,32(%esp,%ebx,4)
+	leal	1(%edx),%edx
+	cmpl	%ebx,%edx
+	jle	.L006outer
+	emms
+	jmp	.L008common_tail
+.align	16
+.L004non_sse2:
+	movl	8(%esp),%esi
+	leal	1(%ebx),%ebp
+	movl	12(%esp),%edi
+	xorl	%ecx,%ecx
+	movl	%esi,%edx
+	andl	$1,%ebp
+	subl	%edi,%edx
+	leal	4(%edi,%ebx,4),%eax
+	orl	%edx,%ebp
+	movl	(%edi),%edi
+	jz	.L009bn_sqr_mont
+	movl	%eax,28(%esp)
+	movl	(%esi),%eax
+	xorl	%edx,%edx
+.align	16
+.L010mull:
+	movl	%edx,%ebp
+	mull	%edi
+	addl	%eax,%ebp
+	leal	1(%ecx),%ecx
+	adcl	$0,%edx
+	movl	(%esi,%ecx,4),%eax
+	cmpl	%ebx,%ecx
+	movl	%ebp,28(%esp,%ecx,4)
+	jl	.L010mull
+	movl	%edx,%ebp
+	mull	%edi
+	movl	20(%esp),%edi
+	addl	%ebp,%eax
+	movl	16(%esp),%esi
+	adcl	$0,%edx
+	imull	32(%esp),%edi
+	movl	%eax,32(%esp,%ebx,4)
+	xorl	%ecx,%ecx
+	movl	%edx,36(%esp,%ebx,4)
+	movl	%ecx,40(%esp,%ebx,4)
+	movl	(%esi),%eax
+	mull	%edi
+	addl	32(%esp),%eax
+	movl	4(%esi),%eax
+	adcl	$0,%edx
+	incl	%ecx
+	jmp	.L0112ndmadd
+.align	16
+.L0121stmadd:
+	movl	%edx,%ebp
+	mull	%edi
+	addl	32(%esp,%ecx,4),%ebp
+	leal	1(%ecx),%ecx
+	adcl	$0,%edx
+	addl	%eax,%ebp
+	movl	(%esi,%ecx,4),%eax
+	adcl	$0,%edx
+	cmpl	%ebx,%ecx
+	movl	%ebp,28(%esp,%ecx,4)
+	jl	.L0121stmadd
+	movl	%edx,%ebp
+	mull	%edi
+	addl	32(%esp,%ebx,4),%eax
+	movl	20(%esp),%edi
+	adcl	$0,%edx
+	movl	16(%esp),%esi
+	addl	%eax,%ebp
+	adcl	$0,%edx
+	imull	32(%esp),%edi
+	xorl	%ecx,%ecx
+	addl	36(%esp,%ebx,4),%edx
+	movl	%ebp,32(%esp,%ebx,4)
+	adcl	$0,%ecx
+	movl	(%esi),%eax
+	movl	%edx,36(%esp,%ebx,4)
+	movl	%ecx,40(%esp,%ebx,4)
+	mull	%edi
+	addl	32(%esp),%eax
+	movl	4(%esi),%eax
+	adcl	$0,%edx
+	movl	$1,%ecx
+.align	16
+.L0112ndmadd:
+	movl	%edx,%ebp
+	mull	%edi
+	addl	32(%esp,%ecx,4),%ebp
+	leal	1(%ecx),%ecx
+	adcl	$0,%edx
+	addl	%eax,%ebp
+	movl	(%esi,%ecx,4),%eax
+	adcl	$0,%edx
+	cmpl	%ebx,%ecx
+	movl	%ebp,24(%esp,%ecx,4)
+	jl	.L0112ndmadd
+	movl	%edx,%ebp
+	mull	%edi
+	addl	32(%esp,%ebx,4),%ebp
+	adcl	$0,%edx
+	addl	%eax,%ebp
+	adcl	$0,%edx
+	movl	%ebp,28(%esp,%ebx,4)
+	xorl	%eax,%eax
+	movl	12(%esp),%ecx
+	addl	36(%esp,%ebx,4),%edx
+	adcl	40(%esp,%ebx,4),%eax
+	leal	4(%ecx),%ecx
+	movl	%edx,32(%esp,%ebx,4)
+	cmpl	28(%esp),%ecx
+	movl	%eax,36(%esp,%ebx,4)
+	je	.L008common_tail
+	movl	(%ecx),%edi
+	movl	8(%esp),%esi
+	movl	%ecx,12(%esp)
+	xorl	%ecx,%ecx
+	xorl	%edx,%edx
+	movl	(%esi),%eax
+	jmp	.L0121stmadd
+.align	16
+.L009bn_sqr_mont:
+	movl	%ebx,(%esp)
+	movl	%ecx,12(%esp)
+	movl	%edi,%eax
+	mull	%edi
+	movl	%eax,32(%esp)
+	movl	%edx,%ebx
+	shrl	$1,%edx
+	andl	$1,%ebx
+	incl	%ecx
+.align	16
+.L013sqr:
+	movl	(%esi,%ecx,4),%eax
+	movl	%edx,%ebp
+	mull	%edi
+	addl	%ebp,%eax
+	leal	1(%ecx),%ecx
+	adcl	$0,%edx
+	leal	(%ebx,%eax,2),%ebp
+	shrl	$31,%eax
+	cmpl	(%esp),%ecx
+	movl	%eax,%ebx
+	movl	%ebp,28(%esp,%ecx,4)
+	jl	.L013sqr
+	movl	(%esi,%ecx,4),%eax
+	movl	%edx,%ebp
+	mull	%edi
+	addl	%ebp,%eax
+	movl	20(%esp),%edi
+	adcl	$0,%edx
+	movl	16(%esp),%esi
+	leal	(%ebx,%eax,2),%ebp
+	imull	32(%esp),%edi
+	shrl	$31,%eax
+	movl	%ebp,32(%esp,%ecx,4)
+	leal	(%eax,%edx,2),%ebp
+	movl	(%esi),%eax
+	shrl	$31,%edx
+	movl	%ebp,36(%esp,%ecx,4)
+	movl	%edx,40(%esp,%ecx,4)
+	mull	%edi
+	addl	32(%esp),%eax
+	movl	%ecx,%ebx
+	adcl	$0,%edx
+	movl	4(%esi),%eax
+	movl	$1,%ecx
+.align	16
+.L0143rdmadd:
+	movl	%edx,%ebp
+	mull	%edi
+	addl	32(%esp,%ecx,4),%ebp
+	adcl	$0,%edx
+	addl	%eax,%ebp
+	movl	4(%esi,%ecx,4),%eax
+	adcl	$0,%edx
+	movl	%ebp,28(%esp,%ecx,4)
+	movl	%edx,%ebp
+	mull	%edi
+	addl	36(%esp,%ecx,4),%ebp
+	leal	2(%ecx),%ecx
+	adcl	$0,%edx
+	addl	%eax,%ebp
+	movl	(%esi,%ecx,4),%eax
+	adcl	$0,%edx
+	cmpl	%ebx,%ecx
+	movl	%ebp,24(%esp,%ecx,4)
+	jl	.L0143rdmadd
+	movl	%edx,%ebp
+	mull	%edi
+	addl	32(%esp,%ebx,4),%ebp
+	adcl	$0,%edx
+	addl	%eax,%ebp
+	adcl	$0,%edx
+	movl	%ebp,28(%esp,%ebx,4)
+	movl	12(%esp),%ecx
+	xorl	%eax,%eax
+	movl	8(%esp),%esi
+	addl	36(%esp,%ebx,4),%edx
+	adcl	40(%esp,%ebx,4),%eax
+	movl	%edx,32(%esp,%ebx,4)
+	cmpl	%ebx,%ecx
+	movl	%eax,36(%esp,%ebx,4)
+	je	.L008common_tail
+	movl	4(%esi,%ecx,4),%edi
+	leal	1(%ecx),%ecx
+	movl	%edi,%eax
+	movl	%ecx,12(%esp)
+	mull	%edi
+	addl	32(%esp,%ecx,4),%eax
+	adcl	$0,%edx
+	movl	%eax,32(%esp,%ecx,4)
+	xorl	%ebp,%ebp
+	cmpl	%ebx,%ecx
+	leal	1(%ecx),%ecx
+	je	.L015sqrlast
+	movl	%edx,%ebx
+	shrl	$1,%edx
+	andl	$1,%ebx
+.align	16
+.L016sqradd:
+	movl	(%esi,%ecx,4),%eax
+	movl	%edx,%ebp
+	mull	%edi
+	addl	%ebp,%eax
+	leal	(%eax,%eax,1),%ebp
+	adcl	$0,%edx
+	shrl	$31,%eax
+	addl	32(%esp,%ecx,4),%ebp
+	leal	1(%ecx),%ecx
+	adcl	$0,%eax
+	addl	%ebx,%ebp
+	adcl	$0,%eax
+	cmpl	(%esp),%ecx
+	movl	%ebp,28(%esp,%ecx,4)
+	movl	%eax,%ebx
+	jle	.L016sqradd
+	movl	%edx,%ebp
+	addl	%edx,%edx
+	shrl	$31,%ebp
+	addl	%ebx,%edx
+	adcl	$0,%ebp
+.L015sqrlast:
+	movl	20(%esp),%edi
+	movl	16(%esp),%esi
+	imull	32(%esp),%edi
+	addl	32(%esp,%ecx,4),%edx
+	movl	(%esi),%eax
+	adcl	$0,%ebp
+	movl	%edx,32(%esp,%ecx,4)
+	movl	%ebp,36(%esp,%ecx,4)
+	mull	%edi
+	addl	32(%esp),%eax
+	leal	-1(%ecx),%ebx
+	adcl	$0,%edx
+	movl	$1,%ecx
+	movl	4(%esi),%eax
+	jmp	.L0143rdmadd
+.align	16
+.L008common_tail:
+	movl	16(%esp),%ebp
+	movl	4(%esp),%edi
+	leal	32(%esp),%esi
+	movl	(%esi),%eax
+	movl	%ebx,%ecx
+	xorl	%edx,%edx
+.align	16
+.L017sub:
+	sbbl	(%ebp,%edx,4),%eax
+	movl	%eax,(%edi,%edx,4)
+	decl	%ecx
+	movl	4(%esi,%edx,4),%eax
+	leal	1(%edx),%edx
+	jge	.L017sub
+	sbbl	$0,%eax
+	movl	$-1,%edx
+	xorl	%eax,%edx
+	jmp	.L018copy
+.align	16
+.L018copy:
+	movl	32(%esp,%ebx,4),%esi
+	movl	(%edi,%ebx,4),%ebp
+	movl	%ecx,32(%esp,%ebx,4)
+	andl	%eax,%esi
+	andl	%edx,%ebp
+	orl	%esi,%ebp
+	movl	%ebp,(%edi,%ebx,4)
+	decl	%ebx
+	jge	.L018copy
+	movl	24(%esp),%esp
+	movl	$1,%eax
+.L000just_leave:
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	bn_mul_mont,.-.L_bn_mul_mont_begin
+.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
+.byte	112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
+.byte	54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
+.byte	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
+.byte	111,114,103,62,0
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
diff --git a/gen/bcm/x86-mont-win.asm b/gen/bcm/x86-mont-win.asm
new file mode 100644
index 0000000..cd77529
--- /dev/null
+++ b/gen/bcm/x86-mont-win.asm
@@ -0,0 +1,490 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__, win32
+%ifidn __OUTPUT_FORMAT__,obj
+section	code	use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+$@feat.00 equ 1
+section	.text	code align=64
+%else
+section	.text	code
+%endif
+;extern	_OPENSSL_ia32cap_P
+global	_bn_mul_mont
+align	16
+_bn_mul_mont:
+L$_bn_mul_mont_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	xor	eax,eax
+	mov	edi,DWORD [40+esp]
+	cmp	edi,4
+	jl	NEAR L$000just_leave
+	lea	esi,[20+esp]
+	lea	edx,[24+esp]
+	add	edi,2
+	neg	edi
+	lea	ebp,[edi*4+esp-32]
+	neg	edi
+	mov	eax,ebp
+	sub	eax,edx
+	and	eax,2047
+	sub	ebp,eax
+	xor	edx,ebp
+	and	edx,2048
+	xor	edx,2048
+	sub	ebp,edx
+	and	ebp,-64
+	mov	eax,esp
+	sub	eax,ebp
+	and	eax,-4096
+	mov	edx,esp
+	lea	esp,[eax*1+ebp]
+	mov	eax,DWORD [esp]
+	cmp	esp,ebp
+	ja	NEAR L$001page_walk
+	jmp	NEAR L$002page_walk_done
+align	16
+L$001page_walk:
+	lea	esp,[esp-4096]
+	mov	eax,DWORD [esp]
+	cmp	esp,ebp
+	ja	NEAR L$001page_walk
+L$002page_walk_done:
+	mov	eax,DWORD [esi]
+	mov	ebx,DWORD [4+esi]
+	mov	ecx,DWORD [8+esi]
+	mov	ebp,DWORD [12+esi]
+	mov	esi,DWORD [16+esi]
+	mov	esi,DWORD [esi]
+	mov	DWORD [4+esp],eax
+	mov	DWORD [8+esp],ebx
+	mov	DWORD [12+esp],ecx
+	mov	DWORD [16+esp],ebp
+	mov	DWORD [20+esp],esi
+	lea	ebx,[edi-3]
+	mov	DWORD [24+esp],edx
+	lea	eax,[_OPENSSL_ia32cap_P]
+	bt	DWORD [eax],26
+	jnc	NEAR L$003non_sse2
+	mov	eax,-1
+	movd	mm7,eax
+	mov	esi,DWORD [8+esp]
+	mov	edi,DWORD [12+esp]
+	mov	ebp,DWORD [16+esp]
+	xor	edx,edx
+	xor	ecx,ecx
+	movd	mm4,DWORD [edi]
+	movd	mm5,DWORD [esi]
+	movd	mm3,DWORD [ebp]
+	pmuludq	mm5,mm4
+	movq	mm2,mm5
+	movq	mm0,mm5
+	pand	mm0,mm7
+	pmuludq	mm5,[20+esp]
+	pmuludq	mm3,mm5
+	paddq	mm3,mm0
+	movd	mm1,DWORD [4+ebp]
+	movd	mm0,DWORD [4+esi]
+	psrlq	mm2,32
+	psrlq	mm3,32
+	inc	ecx
+align	16
+L$0041st:
+	pmuludq	mm0,mm4
+	pmuludq	mm1,mm5
+	paddq	mm2,mm0
+	paddq	mm3,mm1
+	movq	mm0,mm2
+	pand	mm0,mm7
+	movd	mm1,DWORD [4+ecx*4+ebp]
+	paddq	mm3,mm0
+	movd	mm0,DWORD [4+ecx*4+esi]
+	psrlq	mm2,32
+	movd	DWORD [28+ecx*4+esp],mm3
+	psrlq	mm3,32
+	lea	ecx,[1+ecx]
+	cmp	ecx,ebx
+	jl	NEAR L$0041st
+	pmuludq	mm0,mm4
+	pmuludq	mm1,mm5
+	paddq	mm2,mm0
+	paddq	mm3,mm1
+	movq	mm0,mm2
+	pand	mm0,mm7
+	paddq	mm3,mm0
+	movd	DWORD [28+ecx*4+esp],mm3
+	psrlq	mm2,32
+	psrlq	mm3,32
+	paddq	mm3,mm2
+	movq	[32+ebx*4+esp],mm3
+	inc	edx
+L$005outer:
+	xor	ecx,ecx
+	movd	mm4,DWORD [edx*4+edi]
+	movd	mm5,DWORD [esi]
+	movd	mm6,DWORD [32+esp]
+	movd	mm3,DWORD [ebp]
+	pmuludq	mm5,mm4
+	paddq	mm5,mm6
+	movq	mm0,mm5
+	movq	mm2,mm5
+	pand	mm0,mm7
+	pmuludq	mm5,[20+esp]
+	pmuludq	mm3,mm5
+	paddq	mm3,mm0
+	movd	mm6,DWORD [36+esp]
+	movd	mm1,DWORD [4+ebp]
+	movd	mm0,DWORD [4+esi]
+	psrlq	mm2,32
+	psrlq	mm3,32
+	paddq	mm2,mm6
+	inc	ecx
+	dec	ebx
+L$006inner:
+	pmuludq	mm0,mm4
+	pmuludq	mm1,mm5
+	paddq	mm2,mm0
+	paddq	mm3,mm1
+	movq	mm0,mm2
+	movd	mm6,DWORD [36+ecx*4+esp]
+	pand	mm0,mm7
+	movd	mm1,DWORD [4+ecx*4+ebp]
+	paddq	mm3,mm0
+	movd	mm0,DWORD [4+ecx*4+esi]
+	psrlq	mm2,32
+	movd	DWORD [28+ecx*4+esp],mm3
+	psrlq	mm3,32
+	paddq	mm2,mm6
+	dec	ebx
+	lea	ecx,[1+ecx]
+	jnz	NEAR L$006inner
+	mov	ebx,ecx
+	pmuludq	mm0,mm4
+	pmuludq	mm1,mm5
+	paddq	mm2,mm0
+	paddq	mm3,mm1
+	movq	mm0,mm2
+	pand	mm0,mm7
+	paddq	mm3,mm0
+	movd	DWORD [28+ecx*4+esp],mm3
+	psrlq	mm2,32
+	psrlq	mm3,32
+	movd	mm6,DWORD [36+ebx*4+esp]
+	paddq	mm3,mm2
+	paddq	mm3,mm6
+	movq	[32+ebx*4+esp],mm3
+	lea	edx,[1+edx]
+	cmp	edx,ebx
+	jle	NEAR L$005outer
+	emms
+	jmp	NEAR L$007common_tail
+align	16
+L$003non_sse2:
+	mov	esi,DWORD [8+esp]
+	lea	ebp,[1+ebx]
+	mov	edi,DWORD [12+esp]
+	xor	ecx,ecx
+	mov	edx,esi
+	and	ebp,1
+	sub	edx,edi
+	lea	eax,[4+ebx*4+edi]
+	or	ebp,edx
+	mov	edi,DWORD [edi]
+	jz	NEAR L$008bn_sqr_mont
+	mov	DWORD [28+esp],eax
+	mov	eax,DWORD [esi]
+	xor	edx,edx
+align	16
+L$009mull:
+	mov	ebp,edx
+	mul	edi
+	add	ebp,eax
+	lea	ecx,[1+ecx]
+	adc	edx,0
+	mov	eax,DWORD [ecx*4+esi]
+	cmp	ecx,ebx
+	mov	DWORD [28+ecx*4+esp],ebp
+	jl	NEAR L$009mull
+	mov	ebp,edx
+	mul	edi
+	mov	edi,DWORD [20+esp]
+	add	eax,ebp
+	mov	esi,DWORD [16+esp]
+	adc	edx,0
+	imul	edi,DWORD [32+esp]
+	mov	DWORD [32+ebx*4+esp],eax
+	xor	ecx,ecx
+	mov	DWORD [36+ebx*4+esp],edx
+	mov	DWORD [40+ebx*4+esp],ecx
+	mov	eax,DWORD [esi]
+	mul	edi
+	add	eax,DWORD [32+esp]
+	mov	eax,DWORD [4+esi]
+	adc	edx,0
+	inc	ecx
+	jmp	NEAR L$0102ndmadd
+align	16
+L$0111stmadd:
+	mov	ebp,edx
+	mul	edi
+	add	ebp,DWORD [32+ecx*4+esp]
+	lea	ecx,[1+ecx]
+	adc	edx,0
+	add	ebp,eax
+	mov	eax,DWORD [ecx*4+esi]
+	adc	edx,0
+	cmp	ecx,ebx
+	mov	DWORD [28+ecx*4+esp],ebp
+	jl	NEAR L$0111stmadd
+	mov	ebp,edx
+	mul	edi
+	add	eax,DWORD [32+ebx*4+esp]
+	mov	edi,DWORD [20+esp]
+	adc	edx,0
+	mov	esi,DWORD [16+esp]
+	add	ebp,eax
+	adc	edx,0
+	imul	edi,DWORD [32+esp]
+	xor	ecx,ecx
+	add	edx,DWORD [36+ebx*4+esp]
+	mov	DWORD [32+ebx*4+esp],ebp
+	adc	ecx,0
+	mov	eax,DWORD [esi]
+	mov	DWORD [36+ebx*4+esp],edx
+	mov	DWORD [40+ebx*4+esp],ecx
+	mul	edi
+	add	eax,DWORD [32+esp]
+	mov	eax,DWORD [4+esi]
+	adc	edx,0
+	mov	ecx,1
+align	16
+L$0102ndmadd:
+	mov	ebp,edx
+	mul	edi
+	add	ebp,DWORD [32+ecx*4+esp]
+	lea	ecx,[1+ecx]
+	adc	edx,0
+	add	ebp,eax
+	mov	eax,DWORD [ecx*4+esi]
+	adc	edx,0
+	cmp	ecx,ebx
+	mov	DWORD [24+ecx*4+esp],ebp
+	jl	NEAR L$0102ndmadd
+	mov	ebp,edx
+	mul	edi
+	add	ebp,DWORD [32+ebx*4+esp]
+	adc	edx,0
+	add	ebp,eax
+	adc	edx,0
+	mov	DWORD [28+ebx*4+esp],ebp
+	xor	eax,eax
+	mov	ecx,DWORD [12+esp]
+	add	edx,DWORD [36+ebx*4+esp]
+	adc	eax,DWORD [40+ebx*4+esp]
+	lea	ecx,[4+ecx]
+	mov	DWORD [32+ebx*4+esp],edx
+	cmp	ecx,DWORD [28+esp]
+	mov	DWORD [36+ebx*4+esp],eax
+	je	NEAR L$007common_tail
+	mov	edi,DWORD [ecx]
+	mov	esi,DWORD [8+esp]
+	mov	DWORD [12+esp],ecx
+	xor	ecx,ecx
+	xor	edx,edx
+	mov	eax,DWORD [esi]
+	jmp	NEAR L$0111stmadd
+align	16
+L$008bn_sqr_mont:
+	mov	DWORD [esp],ebx
+	mov	DWORD [12+esp],ecx
+	mov	eax,edi
+	mul	edi
+	mov	DWORD [32+esp],eax
+	mov	ebx,edx
+	shr	edx,1
+	and	ebx,1
+	inc	ecx
+align	16
+L$012sqr:
+	mov	eax,DWORD [ecx*4+esi]
+	mov	ebp,edx
+	mul	edi
+	add	eax,ebp
+	lea	ecx,[1+ecx]
+	adc	edx,0
+	lea	ebp,[eax*2+ebx]
+	shr	eax,31
+	cmp	ecx,DWORD [esp]
+	mov	ebx,eax
+	mov	DWORD [28+ecx*4+esp],ebp
+	jl	NEAR L$012sqr
+	mov	eax,DWORD [ecx*4+esi]
+	mov	ebp,edx
+	mul	edi
+	add	eax,ebp
+	mov	edi,DWORD [20+esp]
+	adc	edx,0
+	mov	esi,DWORD [16+esp]
+	lea	ebp,[eax*2+ebx]
+	imul	edi,DWORD [32+esp]
+	shr	eax,31
+	mov	DWORD [32+ecx*4+esp],ebp
+	lea	ebp,[edx*2+eax]
+	mov	eax,DWORD [esi]
+	shr	edx,31
+	mov	DWORD [36+ecx*4+esp],ebp
+	mov	DWORD [40+ecx*4+esp],edx
+	mul	edi
+	add	eax,DWORD [32+esp]
+	mov	ebx,ecx
+	adc	edx,0
+	mov	eax,DWORD [4+esi]
+	mov	ecx,1
+align	16
+L$0133rdmadd:
+	mov	ebp,edx
+	mul	edi
+	add	ebp,DWORD [32+ecx*4+esp]
+	adc	edx,0
+	add	ebp,eax
+	mov	eax,DWORD [4+ecx*4+esi]
+	adc	edx,0
+	mov	DWORD [28+ecx*4+esp],ebp
+	mov	ebp,edx
+	mul	edi
+	add	ebp,DWORD [36+ecx*4+esp]
+	lea	ecx,[2+ecx]
+	adc	edx,0
+	add	ebp,eax
+	mov	eax,DWORD [ecx*4+esi]
+	adc	edx,0
+	cmp	ecx,ebx
+	mov	DWORD [24+ecx*4+esp],ebp
+	jl	NEAR L$0133rdmadd
+	mov	ebp,edx
+	mul	edi
+	add	ebp,DWORD [32+ebx*4+esp]
+	adc	edx,0
+	add	ebp,eax
+	adc	edx,0
+	mov	DWORD [28+ebx*4+esp],ebp
+	mov	ecx,DWORD [12+esp]
+	xor	eax,eax
+	mov	esi,DWORD [8+esp]
+	add	edx,DWORD [36+ebx*4+esp]
+	adc	eax,DWORD [40+ebx*4+esp]
+	mov	DWORD [32+ebx*4+esp],edx
+	cmp	ecx,ebx
+	mov	DWORD [36+ebx*4+esp],eax
+	je	NEAR L$007common_tail
+	mov	edi,DWORD [4+ecx*4+esi]
+	lea	ecx,[1+ecx]
+	mov	eax,edi
+	mov	DWORD [12+esp],ecx
+	mul	edi
+	add	eax,DWORD [32+ecx*4+esp]
+	adc	edx,0
+	mov	DWORD [32+ecx*4+esp],eax
+	xor	ebp,ebp
+	cmp	ecx,ebx
+	lea	ecx,[1+ecx]
+	je	NEAR L$014sqrlast
+	mov	ebx,edx
+	shr	edx,1
+	and	ebx,1
+align	16
+L$015sqradd:
+	mov	eax,DWORD [ecx*4+esi]
+	mov	ebp,edx
+	mul	edi
+	add	eax,ebp
+	lea	ebp,[eax*1+eax]
+	adc	edx,0
+	shr	eax,31
+	add	ebp,DWORD [32+ecx*4+esp]
+	lea	ecx,[1+ecx]
+	adc	eax,0
+	add	ebp,ebx
+	adc	eax,0
+	cmp	ecx,DWORD [esp]
+	mov	DWORD [28+ecx*4+esp],ebp
+	mov	ebx,eax
+	jle	NEAR L$015sqradd
+	mov	ebp,edx
+	add	edx,edx
+	shr	ebp,31
+	add	edx,ebx
+	adc	ebp,0
+L$014sqrlast:
+	mov	edi,DWORD [20+esp]
+	mov	esi,DWORD [16+esp]
+	imul	edi,DWORD [32+esp]
+	add	edx,DWORD [32+ecx*4+esp]
+	mov	eax,DWORD [esi]
+	adc	ebp,0
+	mov	DWORD [32+ecx*4+esp],edx
+	mov	DWORD [36+ecx*4+esp],ebp
+	mul	edi
+	add	eax,DWORD [32+esp]
+	lea	ebx,[ecx-1]
+	adc	edx,0
+	mov	ecx,1
+	mov	eax,DWORD [4+esi]
+	jmp	NEAR L$0133rdmadd
+align	16
+L$007common_tail:
+	mov	ebp,DWORD [16+esp]
+	mov	edi,DWORD [4+esp]
+	lea	esi,[32+esp]
+	mov	eax,DWORD [esi]
+	mov	ecx,ebx
+	xor	edx,edx
+align	16
+L$016sub:
+	sbb	eax,DWORD [edx*4+ebp]
+	mov	DWORD [edx*4+edi],eax
+	dec	ecx
+	mov	eax,DWORD [4+edx*4+esi]
+	lea	edx,[1+edx]
+	jge	NEAR L$016sub
+	sbb	eax,0
+	mov	edx,-1
+	xor	edx,eax
+	jmp	NEAR L$017copy
+align	16
+L$017copy:
+	mov	esi,DWORD [32+ebx*4+esp]
+	mov	ebp,DWORD [ebx*4+edi]
+	mov	DWORD [32+ebx*4+esp],ecx
+	and	esi,eax
+	and	ebp,edx
+	or	ebp,esi
+	mov	DWORD [ebx*4+edi],ebp
+	dec	ebx
+	jge	NEAR L$017copy
+	mov	esp,DWORD [24+esp]
+	mov	eax,1
+L$000just_leave:
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+db	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
+db	112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
+db	54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
+db	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
+db	111,114,103,62,0
+segment	.bss
+common	_OPENSSL_ia32cap_P 16
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/x86_64-mont-apple.S b/gen/bcm/x86_64-mont-apple.S
new file mode 100644
index 0000000..4bf0c6d
--- /dev/null
+++ b/gen/bcm/x86_64-mont-apple.S
@@ -0,0 +1,1235 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text	
+
+.globl	_bn_mul_mont_nohw
+.private_extern _bn_mul_mont_nohw
+
+.p2align	4
+_bn_mul_mont_nohw:
+
+_CET_ENDBR
+	movl	%r9d,%r9d
+	movq	%rsp,%rax
+
+	pushq	%rbx
+
+	pushq	%rbp
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+
+	negq	%r9
+	movq	%rsp,%r11
+	leaq	-16(%rsp,%r9,8),%r10
+	negq	%r9
+	andq	$-1024,%r10
+
+
+
+
+
+
+
+
+
+	subq	%r10,%r11
+	andq	$-4096,%r11
+	leaq	(%r10,%r11,1),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	L$mul_page_walk
+	jmp	L$mul_page_walk_done
+
+.p2align	4
+L$mul_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	L$mul_page_walk
+L$mul_page_walk_done:
+
+	movq	%rax,8(%rsp,%r9,8)
+
+L$mul_body:
+	movq	%rdx,%r12
+	movq	(%r8),%r8
+	movq	(%r12),%rbx
+	movq	(%rsi),%rax
+
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+
+	movq	%r8,%rbp
+	mulq	%rbx
+	movq	%rax,%r10
+	movq	(%rcx),%rax
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+
+	leaq	1(%r15),%r15
+	jmp	L$1st_enter
+
+.p2align	4
+L$1st:
+	addq	%rax,%r13
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	movq	%r10,%r11
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+L$1st_enter:
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	leaq	1(%r15),%r15
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	cmpq	%r9,%r15
+	jne	L$1st
+
+	addq	%rax,%r13
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+	movq	%r10,%r11
+
+	xorq	%rdx,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r9,8)
+	movq	%rdx,(%rsp,%r9,8)
+
+	leaq	1(%r14),%r14
+	jmp	L$outer
+.p2align	4
+L$outer:
+	movq	(%r12,%r14,8),%rbx
+	xorq	%r15,%r15
+	movq	%r8,%rbp
+	movq	(%rsp),%r10
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx),%rax
+	adcq	$0,%rdx
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	8(%rsp),%r10
+	movq	%rdx,%r13
+
+	leaq	1(%r15),%r15
+	jmp	L$inner_enter
+
+.p2align	4
+L$inner:
+	addq	%rax,%r13
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	movq	(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+L$inner_enter:
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	leaq	1(%r15),%r15
+
+	mulq	%rbp
+	cmpq	%r9,%r15
+	jne	L$inner
+
+	addq	%rax,%r13
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	movq	(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	xorq	%rdx,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r9,8)
+	movq	%rdx,(%rsp,%r9,8)
+
+	leaq	1(%r14),%r14
+	cmpq	%r9,%r14
+	jb	L$outer
+
+	xorq	%r14,%r14
+	movq	(%rsp),%rax
+	movq	%r9,%r15
+
+.p2align	4
+L$sub:	sbbq	(%rcx,%r14,8),%rax
+	movq	%rax,(%rdi,%r14,8)
+	movq	8(%rsp,%r14,8),%rax
+	leaq	1(%r14),%r14
+	decq	%r15
+	jnz	L$sub
+
+	sbbq	$0,%rax
+	movq	$-1,%rbx
+	xorq	%rax,%rbx
+	xorq	%r14,%r14
+	movq	%r9,%r15
+
+L$copy:
+	movq	(%rdi,%r14,8),%rcx
+	movq	(%rsp,%r14,8),%rdx
+	andq	%rbx,%rcx
+	andq	%rax,%rdx
+	movq	%r9,(%rsp,%r14,8)
+	orq	%rcx,%rdx
+	movq	%rdx,(%rdi,%r14,8)
+	leaq	1(%r14),%r14
+	subq	$1,%r15
+	jnz	L$copy
+
+	movq	8(%rsp,%r9,8),%rsi
+
+	movq	$1,%rax
+	movq	-48(%rsi),%r15
+
+	movq	-40(%rsi),%r14
+
+	movq	-32(%rsi),%r13
+
+	movq	-24(%rsi),%r12
+
+	movq	-16(%rsi),%rbp
+
+	movq	-8(%rsi),%rbx
+
+	leaq	(%rsi),%rsp
+
+L$mul_epilogue:
+	ret
+
+
+.globl	_bn_mul4x_mont
+.private_extern _bn_mul4x_mont
+
+.p2align	4
+_bn_mul4x_mont:
+
+_CET_ENDBR
+	movl	%r9d,%r9d
+	movq	%rsp,%rax
+
+	pushq	%rbx
+
+	pushq	%rbp
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+
+	negq	%r9
+	movq	%rsp,%r11
+	leaq	-32(%rsp,%r9,8),%r10
+	negq	%r9
+	andq	$-1024,%r10
+
+	subq	%r10,%r11
+	andq	$-4096,%r11
+	leaq	(%r10,%r11,1),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	L$mul4x_page_walk
+	jmp	L$mul4x_page_walk_done
+
+L$mul4x_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	L$mul4x_page_walk
+L$mul4x_page_walk_done:
+
+	movq	%rax,8(%rsp,%r9,8)
+
+L$mul4x_body:
+	movq	%rdi,16(%rsp,%r9,8)
+	movq	%rdx,%r12
+	movq	(%r8),%r8
+	movq	(%r12),%rbx
+	movq	(%rsi),%rax
+
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+
+	movq	%r8,%rbp
+	mulq	%rbx
+	movq	%rax,%r10
+	movq	(%rcx),%rax
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	4(%r15),%r15
+	adcq	$0,%rdx
+	movq	%rdi,(%rsp)
+	movq	%rdx,%r13
+	jmp	L$1st4x
+.p2align	4
+L$1st4x:
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	leaq	4(%r15),%r15
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	-16(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-32(%rsp,%r15,8)
+	movq	%rdx,%r13
+	cmpq	%r9,%r15
+	jb	L$1st4x
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	xorq	%rdi,%rdi
+	addq	%r10,%r13
+	adcq	$0,%rdi
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdi,(%rsp,%r15,8)
+
+	leaq	1(%r14),%r14
+.p2align	2
+L$outer4x:
+	movq	(%r12,%r14,8),%rbx
+	xorq	%r15,%r15
+	movq	(%rsp),%r10
+	movq	%r8,%rbp
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx),%rax
+	adcq	$0,%rdx
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	addq	8(%rsp),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	4(%r15),%r15
+	adcq	$0,%rdx
+	movq	%rdi,(%rsp)
+	movq	%rdx,%r13
+	jmp	L$inner4x
+.p2align	4
+L$inner4x:
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-16(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-8(%rsp,%r15,8),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	8(%rsp,%r15,8),%r11
+	adcq	$0,%rdx
+	leaq	4(%r15),%r15
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	-16(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-32(%rsp,%r15,8)
+	movq	%rdx,%r13
+	cmpq	%r9,%r15
+	jb	L$inner4x
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-16(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-8(%rsp,%r15,8),%r11
+	adcq	$0,%rdx
+	leaq	1(%r14),%r14
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	xorq	%rdi,%rdi
+	addq	%r10,%r13
+	adcq	$0,%rdi
+	addq	(%rsp,%r9,8),%r13
+	adcq	$0,%rdi
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdi,(%rsp,%r15,8)
+
+	cmpq	%r9,%r14
+	jb	L$outer4x
+	movq	16(%rsp,%r9,8),%rdi
+	leaq	-4(%r9),%r15
+	movq	0(%rsp),%rax
+	movq	8(%rsp),%rdx
+	shrq	$2,%r15
+	leaq	(%rsp),%rsi
+	xorq	%r14,%r14
+
+	subq	0(%rcx),%rax
+	movq	16(%rsi),%rbx
+	movq	24(%rsi),%rbp
+	sbbq	8(%rcx),%rdx
+
+L$sub4x:
+	movq	%rax,0(%rdi,%r14,8)
+	movq	%rdx,8(%rdi,%r14,8)
+	sbbq	16(%rcx,%r14,8),%rbx
+	movq	32(%rsi,%r14,8),%rax
+	movq	40(%rsi,%r14,8),%rdx
+	sbbq	24(%rcx,%r14,8),%rbp
+	movq	%rbx,16(%rdi,%r14,8)
+	movq	%rbp,24(%rdi,%r14,8)
+	sbbq	32(%rcx,%r14,8),%rax
+	movq	48(%rsi,%r14,8),%rbx
+	movq	56(%rsi,%r14,8),%rbp
+	sbbq	40(%rcx,%r14,8),%rdx
+	leaq	4(%r14),%r14
+	decq	%r15
+	jnz	L$sub4x
+
+	movq	%rax,0(%rdi,%r14,8)
+	movq	32(%rsi,%r14,8),%rax
+	sbbq	16(%rcx,%r14,8),%rbx
+	movq	%rdx,8(%rdi,%r14,8)
+	sbbq	24(%rcx,%r14,8),%rbp
+	movq	%rbx,16(%rdi,%r14,8)
+
+	sbbq	$0,%rax
+	movq	%rbp,24(%rdi,%r14,8)
+	pxor	%xmm0,%xmm0
+.byte	102,72,15,110,224
+	pcmpeqd	%xmm5,%xmm5
+	pshufd	$0,%xmm4,%xmm4
+	movq	%r9,%r15
+	pxor	%xmm4,%xmm5
+	shrq	$2,%r15
+	xorl	%eax,%eax
+
+	jmp	L$copy4x
+.p2align	4
+L$copy4x:
+	movdqa	(%rsp,%rax,1),%xmm1
+	movdqu	(%rdi,%rax,1),%xmm2
+	pand	%xmm4,%xmm1
+	pand	%xmm5,%xmm2
+	movdqa	16(%rsp,%rax,1),%xmm3
+	movdqa	%xmm0,(%rsp,%rax,1)
+	por	%xmm2,%xmm1
+	movdqu	16(%rdi,%rax,1),%xmm2
+	movdqu	%xmm1,(%rdi,%rax,1)
+	pand	%xmm4,%xmm3
+	pand	%xmm5,%xmm2
+	movdqa	%xmm0,16(%rsp,%rax,1)
+	por	%xmm2,%xmm3
+	movdqu	%xmm3,16(%rdi,%rax,1)
+	leaq	32(%rax),%rax
+	decq	%r15
+	jnz	L$copy4x
+	movq	8(%rsp,%r9,8),%rsi
+
+	movq	$1,%rax
+	movq	-48(%rsi),%r15
+
+	movq	-40(%rsi),%r14
+
+	movq	-32(%rsi),%r13
+
+	movq	-24(%rsi),%r12
+
+	movq	-16(%rsi),%rbp
+
+	movq	-8(%rsi),%rbx
+
+	leaq	(%rsi),%rsp
+
+L$mul4x_epilogue:
+	ret
+
+
+
+
+
+.globl	_bn_sqr8x_mont
+.private_extern _bn_sqr8x_mont
+
+.p2align	5
+_bn_sqr8x_mont:
+
+_CET_ENDBR
+	movl	%r9d,%r9d
+	movq	%rsp,%rax
+
+	pushq	%rbx
+
+	pushq	%rbp
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+L$sqr8x_prologue:
+
+	movl	%r9d,%r10d
+	shll	$3,%r9d
+	shlq	$3+2,%r10
+	negq	%r9
+
+
+
+
+
+
+	leaq	-64(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
+	movq	(%r8),%r8
+	subq	%rsi,%r11
+	andq	$4095,%r11
+	cmpq	%r11,%r10
+	jb	L$sqr8x_sp_alt
+	subq	%r11,%rbp
+	leaq	-64(%rbp,%r9,2),%rbp
+	jmp	L$sqr8x_sp_done
+
+.p2align	5
+L$sqr8x_sp_alt:
+	leaq	4096-64(,%r9,2),%r10
+	leaq	-64(%rbp,%r9,2),%rbp
+	subq	%r10,%r11
+	movq	$0,%r10
+	cmovcq	%r10,%r11
+	subq	%r11,%rbp
+L$sqr8x_sp_done:
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	L$sqr8x_page_walk
+	jmp	L$sqr8x_page_walk_done
+
+.p2align	4
+L$sqr8x_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	L$sqr8x_page_walk
+L$sqr8x_page_walk_done:
+
+	movq	%r9,%r10
+	negq	%r9
+
+	movq	%r8,32(%rsp)
+	movq	%rax,40(%rsp)
+
+L$sqr8x_body:
+
+.byte	102,72,15,110,209
+	pxor	%xmm0,%xmm0
+.byte	102,72,15,110,207
+.byte	102,73,15,110,218
+	testq	%rdx,%rdx
+	jz	L$sqr8x_nox
+
+	call	_bn_sqrx8x_internal
+
+
+
+
+	leaq	(%r8,%rcx,1),%rbx
+	movq	%rcx,%r9
+	movq	%rcx,%rdx
+.byte	102,72,15,126,207
+	sarq	$3+2,%rcx
+	jmp	L$sqr8x_sub
+
+.p2align	5
+L$sqr8x_nox:
+	call	_bn_sqr8x_internal
+
+
+
+
+	leaq	(%rdi,%r9,1),%rbx
+	movq	%r9,%rcx
+	movq	%r9,%rdx
+.byte	102,72,15,126,207
+	sarq	$3+2,%rcx
+	jmp	L$sqr8x_sub
+
+.p2align	5
+L$sqr8x_sub:
+	movq	0(%rbx),%r12
+	movq	8(%rbx),%r13
+	movq	16(%rbx),%r14
+	movq	24(%rbx),%r15
+	leaq	32(%rbx),%rbx
+	sbbq	0(%rbp),%r12
+	sbbq	8(%rbp),%r13
+	sbbq	16(%rbp),%r14
+	sbbq	24(%rbp),%r15
+	leaq	32(%rbp),%rbp
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r14,16(%rdi)
+	movq	%r15,24(%rdi)
+	leaq	32(%rdi),%rdi
+	incq	%rcx
+	jnz	L$sqr8x_sub
+
+	sbbq	$0,%rax
+	leaq	(%rbx,%r9,1),%rbx
+	leaq	(%rdi,%r9,1),%rdi
+
+.byte	102,72,15,110,200
+	pxor	%xmm0,%xmm0
+	pshufd	$0,%xmm1,%xmm1
+	movq	40(%rsp),%rsi
+
+	jmp	L$sqr8x_cond_copy
+
+.p2align	5
+L$sqr8x_cond_copy:
+	movdqa	0(%rbx),%xmm2
+	movdqa	16(%rbx),%xmm3
+	leaq	32(%rbx),%rbx
+	movdqu	0(%rdi),%xmm4
+	movdqu	16(%rdi),%xmm5
+	leaq	32(%rdi),%rdi
+	movdqa	%xmm0,-32(%rbx)
+	movdqa	%xmm0,-16(%rbx)
+	movdqa	%xmm0,-32(%rbx,%rdx,1)
+	movdqa	%xmm0,-16(%rbx,%rdx,1)
+	pcmpeqd	%xmm1,%xmm0
+	pand	%xmm1,%xmm2
+	pand	%xmm1,%xmm3
+	pand	%xmm0,%xmm4
+	pand	%xmm0,%xmm5
+	pxor	%xmm0,%xmm0
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqu	%xmm4,-32(%rdi)
+	movdqu	%xmm5,-16(%rdi)
+	addq	$32,%r9
+	jnz	L$sqr8x_cond_copy
+
+	movq	$1,%rax
+	movq	-48(%rsi),%r15
+
+	movq	-40(%rsi),%r14
+
+	movq	-32(%rsi),%r13
+
+	movq	-24(%rsi),%r12
+
+	movq	-16(%rsi),%rbp
+
+	movq	-8(%rsi),%rbx
+
+	leaq	(%rsi),%rsp
+
+L$sqr8x_epilogue:
+	ret
+
+
+.globl	_bn_mulx4x_mont
+.private_extern _bn_mulx4x_mont
+
+.p2align	5
+_bn_mulx4x_mont:
+
+_CET_ENDBR
+	movq	%rsp,%rax
+
+	pushq	%rbx
+
+	pushq	%rbp
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+L$mulx4x_prologue:
+
+	shll	$3,%r9d
+	xorq	%r10,%r10
+	subq	%r9,%r10
+	movq	(%r8),%r8
+	leaq	-72(%rsp,%r10,1),%rbp
+	andq	$-128,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	L$mulx4x_page_walk
+	jmp	L$mulx4x_page_walk_done
+
+.p2align	4
+L$mulx4x_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	L$mulx4x_page_walk
+L$mulx4x_page_walk_done:
+
+	leaq	(%rdx,%r9,1),%r10
+
+
+
+
+
+
+
+
+
+
+
+
+	movq	%r9,0(%rsp)
+	shrq	$5,%r9
+	movq	%r10,16(%rsp)
+	subq	$1,%r9
+	movq	%r8,24(%rsp)
+	movq	%rdi,32(%rsp)
+	movq	%rax,40(%rsp)
+
+	movq	%r9,48(%rsp)
+	jmp	L$mulx4x_body
+
+.p2align	5
+L$mulx4x_body:
+	leaq	8(%rdx),%rdi
+	movq	(%rdx),%rdx
+	leaq	64+32(%rsp),%rbx
+	movq	%rdx,%r9
+
+	mulxq	0(%rsi),%r8,%rax
+	mulxq	8(%rsi),%r11,%r14
+	addq	%rax,%r11
+	movq	%rdi,8(%rsp)
+	mulxq	16(%rsi),%r12,%r13
+	adcq	%r14,%r12
+	adcq	$0,%r13
+
+	movq	%r8,%rdi
+	imulq	24(%rsp),%r8
+	xorq	%rbp,%rbp
+
+	mulxq	24(%rsi),%rax,%r14
+	movq	%r8,%rdx
+	leaq	32(%rsi),%rsi
+	adcxq	%rax,%r13
+	adcxq	%rbp,%r14
+
+	mulxq	0(%rcx),%rax,%r10
+	adcxq	%rax,%rdi
+	adoxq	%r11,%r10
+	mulxq	8(%rcx),%rax,%r11
+	adcxq	%rax,%r10
+	adoxq	%r12,%r11
+.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
+	movq	48(%rsp),%rdi
+	movq	%r10,-32(%rbx)
+	adcxq	%rax,%r11
+	adoxq	%r13,%r12
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	%r11,-24(%rbx)
+	adcxq	%rax,%r12
+	adoxq	%rbp,%r15
+	leaq	32(%rcx),%rcx
+	movq	%r12,-16(%rbx)
+
+	jmp	L$mulx4x_1st
+
+.p2align	5
+L$mulx4x_1st:
+	adcxq	%rbp,%r15
+	mulxq	0(%rsi),%r10,%rax
+	adcxq	%r14,%r10
+	mulxq	8(%rsi),%r11,%r14
+	adcxq	%rax,%r11
+	mulxq	16(%rsi),%r12,%rax
+	adcxq	%r14,%r12
+	mulxq	24(%rsi),%r13,%r14
+.byte	0x67,0x67
+	movq	%r8,%rdx
+	adcxq	%rax,%r13
+	adcxq	%rbp,%r14
+	leaq	32(%rsi),%rsi
+	leaq	32(%rbx),%rbx
+
+	adoxq	%r15,%r10
+	mulxq	0(%rcx),%rax,%r15
+	adcxq	%rax,%r10
+	adoxq	%r15,%r11
+	mulxq	8(%rcx),%rax,%r15
+	adcxq	%rax,%r11
+	adoxq	%r15,%r12
+	mulxq	16(%rcx),%rax,%r15
+	movq	%r10,-40(%rbx)
+	adcxq	%rax,%r12
+	movq	%r11,-32(%rbx)
+	adoxq	%r15,%r13
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	%r12,-24(%rbx)
+	adcxq	%rax,%r13
+	adoxq	%rbp,%r15
+	leaq	32(%rcx),%rcx
+	movq	%r13,-16(%rbx)
+
+	decq	%rdi
+	jnz	L$mulx4x_1st
+
+	movq	0(%rsp),%rax
+	movq	8(%rsp),%rdi
+	adcq	%rbp,%r15
+	addq	%r15,%r14
+	sbbq	%r15,%r15
+	movq	%r14,-8(%rbx)
+	jmp	L$mulx4x_outer
+
+.p2align	5
+L$mulx4x_outer:
+	movq	(%rdi),%rdx
+	leaq	8(%rdi),%rdi
+	subq	%rax,%rsi
+	movq	%r15,(%rbx)
+	leaq	64+32(%rsp),%rbx
+	subq	%rax,%rcx
+
+	mulxq	0(%rsi),%r8,%r11
+	xorl	%ebp,%ebp
+	movq	%rdx,%r9
+	mulxq	8(%rsi),%r14,%r12
+	adoxq	-32(%rbx),%r8
+	adcxq	%r14,%r11
+	mulxq	16(%rsi),%r15,%r13
+	adoxq	-24(%rbx),%r11
+	adcxq	%r15,%r12
+	adoxq	-16(%rbx),%r12
+	adcxq	%rbp,%r13
+	adoxq	%rbp,%r13
+
+	movq	%rdi,8(%rsp)
+	movq	%r8,%r15
+	imulq	24(%rsp),%r8
+	xorl	%ebp,%ebp
+
+	mulxq	24(%rsi),%rax,%r14
+	movq	%r8,%rdx
+	adcxq	%rax,%r13
+	adoxq	-8(%rbx),%r13
+	adcxq	%rbp,%r14
+	leaq	32(%rsi),%rsi
+	adoxq	%rbp,%r14
+
+	mulxq	0(%rcx),%rax,%r10
+	adcxq	%rax,%r15
+	adoxq	%r11,%r10
+	mulxq	8(%rcx),%rax,%r11
+	adcxq	%rax,%r10
+	adoxq	%r12,%r11
+	mulxq	16(%rcx),%rax,%r12
+	movq	%r10,-32(%rbx)
+	adcxq	%rax,%r11
+	adoxq	%r13,%r12
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	%r11,-24(%rbx)
+	leaq	32(%rcx),%rcx
+	adcxq	%rax,%r12
+	adoxq	%rbp,%r15
+	movq	48(%rsp),%rdi
+	movq	%r12,-16(%rbx)
+
+	jmp	L$mulx4x_inner
+
+.p2align	5
+L$mulx4x_inner:
+	mulxq	0(%rsi),%r10,%rax
+	adcxq	%rbp,%r15
+	adoxq	%r14,%r10
+	mulxq	8(%rsi),%r11,%r14
+	adcxq	0(%rbx),%r10
+	adoxq	%rax,%r11
+	mulxq	16(%rsi),%r12,%rax
+	adcxq	8(%rbx),%r11
+	adoxq	%r14,%r12
+	mulxq	24(%rsi),%r13,%r14
+	movq	%r8,%rdx
+	adcxq	16(%rbx),%r12
+	adoxq	%rax,%r13
+	adcxq	24(%rbx),%r13
+	adoxq	%rbp,%r14
+	leaq	32(%rsi),%rsi
+	leaq	32(%rbx),%rbx
+	adcxq	%rbp,%r14
+
+	adoxq	%r15,%r10
+	mulxq	0(%rcx),%rax,%r15
+	adcxq	%rax,%r10
+	adoxq	%r15,%r11
+	mulxq	8(%rcx),%rax,%r15
+	adcxq	%rax,%r11
+	adoxq	%r15,%r12
+	mulxq	16(%rcx),%rax,%r15
+	movq	%r10,-40(%rbx)
+	adcxq	%rax,%r12
+	adoxq	%r15,%r13
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	%r11,-32(%rbx)
+	movq	%r12,-24(%rbx)
+	adcxq	%rax,%r13
+	adoxq	%rbp,%r15
+	leaq	32(%rcx),%rcx
+	movq	%r13,-16(%rbx)
+
+	decq	%rdi
+	jnz	L$mulx4x_inner
+
+	movq	0(%rsp),%rax
+	movq	8(%rsp),%rdi
+	adcq	%rbp,%r15
+	subq	0(%rbx),%rbp
+	adcq	%r15,%r14
+	sbbq	%r15,%r15
+	movq	%r14,-8(%rbx)
+
+	cmpq	16(%rsp),%rdi
+	jne	L$mulx4x_outer
+
+	leaq	64(%rsp),%rbx
+	subq	%rax,%rcx
+	negq	%r15
+	movq	%rax,%rdx
+	shrq	$3+2,%rax
+	movq	32(%rsp),%rdi
+	jmp	L$mulx4x_sub
+
+.p2align	5
+L$mulx4x_sub:
+	movq	0(%rbx),%r11
+	movq	8(%rbx),%r12
+	movq	16(%rbx),%r13
+	movq	24(%rbx),%r14
+	leaq	32(%rbx),%rbx
+	sbbq	0(%rcx),%r11
+	sbbq	8(%rcx),%r12
+	sbbq	16(%rcx),%r13
+	sbbq	24(%rcx),%r14
+	leaq	32(%rcx),%rcx
+	movq	%r11,0(%rdi)
+	movq	%r12,8(%rdi)
+	movq	%r13,16(%rdi)
+	movq	%r14,24(%rdi)
+	leaq	32(%rdi),%rdi
+	decq	%rax
+	jnz	L$mulx4x_sub
+
+	sbbq	$0,%r15
+	leaq	64(%rsp),%rbx
+	subq	%rdx,%rdi
+
+.byte	102,73,15,110,207
+	pxor	%xmm0,%xmm0
+	pshufd	$0,%xmm1,%xmm1
+	movq	40(%rsp),%rsi
+
+	jmp	L$mulx4x_cond_copy
+
+.p2align	5
+L$mulx4x_cond_copy:
+	movdqa	0(%rbx),%xmm2
+	movdqa	16(%rbx),%xmm3
+	leaq	32(%rbx),%rbx
+	movdqu	0(%rdi),%xmm4
+	movdqu	16(%rdi),%xmm5
+	leaq	32(%rdi),%rdi
+	movdqa	%xmm0,-32(%rbx)
+	movdqa	%xmm0,-16(%rbx)
+	pcmpeqd	%xmm1,%xmm0
+	pand	%xmm1,%xmm2
+	pand	%xmm1,%xmm3
+	pand	%xmm0,%xmm4
+	pand	%xmm0,%xmm5
+	pxor	%xmm0,%xmm0
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqu	%xmm4,-32(%rdi)
+	movdqu	%xmm5,-16(%rdi)
+	subq	$32,%rdx
+	jnz	L$mulx4x_cond_copy
+
+	movq	%rdx,(%rbx)
+
+	movq	$1,%rax
+	movq	-48(%rsi),%r15
+
+	movq	-40(%rsi),%r14
+
+	movq	-32(%rsi),%r13
+
+	movq	-24(%rsi),%r12
+
+	movq	-16(%rsi),%rbp
+
+	movq	-8(%rsi),%rbx
+
+	leaq	(%rsi),%rsp
+
+L$mulx4x_epilogue:
+	ret
+
+
+.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.p2align	4
+#endif
diff --git a/gen/bcm/x86_64-mont-linux.S b/gen/bcm/x86_64-mont-linux.S
new file mode 100644
index 0000000..02b282d
--- /dev/null
+++ b/gen/bcm/x86_64-mont-linux.S
@@ -0,0 +1,1237 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text	
+
+.globl	bn_mul_mont_nohw
+.hidden bn_mul_mont_nohw
+.type	bn_mul_mont_nohw,@function
+.align	16
+bn_mul_mont_nohw:
+.cfi_startproc	
+_CET_ENDBR
+	movl	%r9d,%r9d
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+
+	negq	%r9
+	movq	%rsp,%r11
+	leaq	-16(%rsp,%r9,8),%r10
+	negq	%r9
+	andq	$-1024,%r10
+
+
+
+
+
+
+
+
+
+	subq	%r10,%r11
+	andq	$-4096,%r11
+	leaq	(%r10,%r11,1),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul_page_walk
+	jmp	.Lmul_page_walk_done
+
+.align	16
+.Lmul_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul_page_walk
+.Lmul_page_walk_done:
+
+	movq	%rax,8(%rsp,%r9,8)
+.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
+.Lmul_body:
+	movq	%rdx,%r12
+	movq	(%r8),%r8
+	movq	(%r12),%rbx
+	movq	(%rsi),%rax
+
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+
+	movq	%r8,%rbp
+	mulq	%rbx
+	movq	%rax,%r10
+	movq	(%rcx),%rax
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+
+	leaq	1(%r15),%r15
+	jmp	.L1st_enter
+
+.align	16
+.L1st:
+	addq	%rax,%r13
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	movq	%r10,%r11
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+.L1st_enter:
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	leaq	1(%r15),%r15
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	cmpq	%r9,%r15
+	jne	.L1st
+
+	addq	%rax,%r13
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+	movq	%r10,%r11
+
+	xorq	%rdx,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r9,8)
+	movq	%rdx,(%rsp,%r9,8)
+
+	leaq	1(%r14),%r14
+	jmp	.Louter
+.align	16
+.Louter:
+	movq	(%r12,%r14,8),%rbx
+	xorq	%r15,%r15
+	movq	%r8,%rbp
+	movq	(%rsp),%r10
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx),%rax
+	adcq	$0,%rdx
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	8(%rsp),%r10
+	movq	%rdx,%r13
+
+	leaq	1(%r15),%r15
+	jmp	.Linner_enter
+
+.align	16
+.Linner:
+	addq	%rax,%r13
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	movq	(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+.Linner_enter:
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	leaq	1(%r15),%r15
+
+	mulq	%rbp
+	cmpq	%r9,%r15
+	jne	.Linner
+
+	addq	%rax,%r13
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	movq	(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	xorq	%rdx,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r9,8)
+	movq	%rdx,(%rsp,%r9,8)
+
+	leaq	1(%r14),%r14
+	cmpq	%r9,%r14
+	jb	.Louter
+
+	xorq	%r14,%r14
+	movq	(%rsp),%rax
+	movq	%r9,%r15
+
+.align	16
+.Lsub:	sbbq	(%rcx,%r14,8),%rax
+	movq	%rax,(%rdi,%r14,8)
+	movq	8(%rsp,%r14,8),%rax
+	leaq	1(%r14),%r14
+	decq	%r15
+	jnz	.Lsub
+
+	sbbq	$0,%rax
+	movq	$-1,%rbx
+	xorq	%rax,%rbx
+	xorq	%r14,%r14
+	movq	%r9,%r15
+
+.Lcopy:
+	movq	(%rdi,%r14,8),%rcx
+	movq	(%rsp,%r14,8),%rdx
+	andq	%rbx,%rcx
+	andq	%rax,%rdx
+	movq	%r9,(%rsp,%r14,8)
+	orq	%rcx,%rdx
+	movq	%rdx,(%rdi,%r14,8)
+	leaq	1(%r14),%r14
+	subq	$1,%r15
+	jnz	.Lcopy
+
+	movq	8(%rsp,%r9,8),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	$1,%rax
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lmul_epilogue:
+	ret
+.cfi_endproc	
+.size	bn_mul_mont_nohw,.-bn_mul_mont_nohw
+.globl	bn_mul4x_mont
+.hidden bn_mul4x_mont
+.type	bn_mul4x_mont,@function
+.align	16
+bn_mul4x_mont:
+.cfi_startproc	
+_CET_ENDBR
+	movl	%r9d,%r9d
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+
+	negq	%r9
+	movq	%rsp,%r11
+	leaq	-32(%rsp,%r9,8),%r10
+	negq	%r9
+	andq	$-1024,%r10
+
+	subq	%r10,%r11
+	andq	$-4096,%r11
+	leaq	(%r10,%r11,1),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul4x_page_walk
+	jmp	.Lmul4x_page_walk_done
+
+.Lmul4x_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul4x_page_walk
+.Lmul4x_page_walk_done:
+
+	movq	%rax,8(%rsp,%r9,8)
+.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
+.Lmul4x_body:
+	movq	%rdi,16(%rsp,%r9,8)
+	movq	%rdx,%r12
+	movq	(%r8),%r8
+	movq	(%r12),%rbx
+	movq	(%rsi),%rax
+
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+
+	movq	%r8,%rbp
+	mulq	%rbx
+	movq	%rax,%r10
+	movq	(%rcx),%rax
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	4(%r15),%r15
+	adcq	$0,%rdx
+	movq	%rdi,(%rsp)
+	movq	%rdx,%r13
+	jmp	.L1st4x
+.align	16
+.L1st4x:
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	leaq	4(%r15),%r15
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	-16(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-32(%rsp,%r15,8)
+	movq	%rdx,%r13
+	cmpq	%r9,%r15
+	jb	.L1st4x
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	xorq	%rdi,%rdi
+	addq	%r10,%r13
+	adcq	$0,%rdi
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdi,(%rsp,%r15,8)
+
+	leaq	1(%r14),%r14
+.align	4
+.Louter4x:
+	movq	(%r12,%r14,8),%rbx
+	xorq	%r15,%r15
+	movq	(%rsp),%r10
+	movq	%r8,%rbp
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx),%rax
+	adcq	$0,%rdx
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	addq	8(%rsp),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	4(%r15),%r15
+	adcq	$0,%rdx
+	movq	%rdi,(%rsp)
+	movq	%rdx,%r13
+	jmp	.Linner4x
+.align	16
+.Linner4x:
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-16(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-8(%rsp,%r15,8),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	8(%rsp,%r15,8),%r11
+	adcq	$0,%rdx
+	leaq	4(%r15),%r15
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	-16(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-32(%rsp,%r15,8)
+	movq	%rdx,%r13
+	cmpq	%r9,%r15
+	jb	.Linner4x
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-16(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-8(%rsp,%r15,8),%r11
+	adcq	$0,%rdx
+	leaq	1(%r14),%r14
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	xorq	%rdi,%rdi
+	addq	%r10,%r13
+	adcq	$0,%rdi
+	addq	(%rsp,%r9,8),%r13
+	adcq	$0,%rdi
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdi,(%rsp,%r15,8)
+
+	cmpq	%r9,%r14
+	jb	.Louter4x
+	movq	16(%rsp,%r9,8),%rdi
+	leaq	-4(%r9),%r15
+	movq	0(%rsp),%rax
+	movq	8(%rsp),%rdx
+	shrq	$2,%r15
+	leaq	(%rsp),%rsi
+	xorq	%r14,%r14
+
+	subq	0(%rcx),%rax
+	movq	16(%rsi),%rbx
+	movq	24(%rsi),%rbp
+	sbbq	8(%rcx),%rdx
+
+.Lsub4x:
+	movq	%rax,0(%rdi,%r14,8)
+	movq	%rdx,8(%rdi,%r14,8)
+	sbbq	16(%rcx,%r14,8),%rbx
+	movq	32(%rsi,%r14,8),%rax
+	movq	40(%rsi,%r14,8),%rdx
+	sbbq	24(%rcx,%r14,8),%rbp
+	movq	%rbx,16(%rdi,%r14,8)
+	movq	%rbp,24(%rdi,%r14,8)
+	sbbq	32(%rcx,%r14,8),%rax
+	movq	48(%rsi,%r14,8),%rbx
+	movq	56(%rsi,%r14,8),%rbp
+	sbbq	40(%rcx,%r14,8),%rdx
+	leaq	4(%r14),%r14
+	decq	%r15
+	jnz	.Lsub4x
+
+	movq	%rax,0(%rdi,%r14,8)
+	movq	32(%rsi,%r14,8),%rax
+	sbbq	16(%rcx,%r14,8),%rbx
+	movq	%rdx,8(%rdi,%r14,8)
+	sbbq	24(%rcx,%r14,8),%rbp
+	movq	%rbx,16(%rdi,%r14,8)
+
+	sbbq	$0,%rax
+	movq	%rbp,24(%rdi,%r14,8)
+	pxor	%xmm0,%xmm0
+.byte	102,72,15,110,224
+	pcmpeqd	%xmm5,%xmm5
+	pshufd	$0,%xmm4,%xmm4
+	movq	%r9,%r15
+	pxor	%xmm4,%xmm5
+	shrq	$2,%r15
+	xorl	%eax,%eax
+
+	jmp	.Lcopy4x
+.align	16
+.Lcopy4x:
+	movdqa	(%rsp,%rax,1),%xmm1
+	movdqu	(%rdi,%rax,1),%xmm2
+	pand	%xmm4,%xmm1
+	pand	%xmm5,%xmm2
+	movdqa	16(%rsp,%rax,1),%xmm3
+	movdqa	%xmm0,(%rsp,%rax,1)
+	por	%xmm2,%xmm1
+	movdqu	16(%rdi,%rax,1),%xmm2
+	movdqu	%xmm1,(%rdi,%rax,1)
+	pand	%xmm4,%xmm3
+	pand	%xmm5,%xmm2
+	movdqa	%xmm0,16(%rsp,%rax,1)
+	por	%xmm2,%xmm3
+	movdqu	%xmm3,16(%rdi,%rax,1)
+	leaq	32(%rax),%rax
+	decq	%r15
+	jnz	.Lcopy4x
+	movq	8(%rsp,%r9,8),%rsi
+.cfi_def_cfa	%rsi, 8
+	movq	$1,%rax
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lmul4x_epilogue:
+	ret
+.cfi_endproc	
+.size	bn_mul4x_mont,.-bn_mul4x_mont
+.extern	bn_sqrx8x_internal
+.hidden bn_sqrx8x_internal
+.extern	bn_sqr8x_internal
+.hidden bn_sqr8x_internal
+
+.globl	bn_sqr8x_mont
+.hidden bn_sqr8x_mont
+.type	bn_sqr8x_mont,@function
+.align	32
+bn_sqr8x_mont:
+.cfi_startproc	
+_CET_ENDBR
+	movl	%r9d,%r9d
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+.Lsqr8x_prologue:
+
+	movl	%r9d,%r10d
+	shll	$3,%r9d
+	shlq	$3+2,%r10
+	negq	%r9
+
+
+
+
+
+
+	leaq	-64(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
+	movq	(%r8),%r8
+	subq	%rsi,%r11
+	andq	$4095,%r11
+	cmpq	%r11,%r10
+	jb	.Lsqr8x_sp_alt
+	subq	%r11,%rbp
+	leaq	-64(%rbp,%r9,2),%rbp
+	jmp	.Lsqr8x_sp_done
+
+.align	32
+.Lsqr8x_sp_alt:
+	leaq	4096-64(,%r9,2),%r10
+	leaq	-64(%rbp,%r9,2),%rbp
+	subq	%r10,%r11
+	movq	$0,%r10
+	cmovcq	%r10,%r11
+	subq	%r11,%rbp
+.Lsqr8x_sp_done:
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lsqr8x_page_walk
+	jmp	.Lsqr8x_page_walk_done
+
+.align	16
+.Lsqr8x_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lsqr8x_page_walk
+.Lsqr8x_page_walk_done:
+
+	movq	%r9,%r10
+	negq	%r9
+
+	movq	%r8,32(%rsp)
+	movq	%rax,40(%rsp)
+.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+.Lsqr8x_body:
+
+.byte	102,72,15,110,209
+	pxor	%xmm0,%xmm0
+.byte	102,72,15,110,207
+.byte	102,73,15,110,218
+	testq	%rdx,%rdx
+	jz	.Lsqr8x_nox
+
+	call	bn_sqrx8x_internal
+
+
+
+
+	leaq	(%r8,%rcx,1),%rbx
+	movq	%rcx,%r9
+	movq	%rcx,%rdx
+.byte	102,72,15,126,207
+	sarq	$3+2,%rcx
+	jmp	.Lsqr8x_sub
+
+.align	32
+.Lsqr8x_nox:
+	call	bn_sqr8x_internal
+
+
+
+
+	leaq	(%rdi,%r9,1),%rbx
+	movq	%r9,%rcx
+	movq	%r9,%rdx
+.byte	102,72,15,126,207
+	sarq	$3+2,%rcx
+	jmp	.Lsqr8x_sub
+
+.align	32
+.Lsqr8x_sub:
+	movq	0(%rbx),%r12
+	movq	8(%rbx),%r13
+	movq	16(%rbx),%r14
+	movq	24(%rbx),%r15
+	leaq	32(%rbx),%rbx
+	sbbq	0(%rbp),%r12
+	sbbq	8(%rbp),%r13
+	sbbq	16(%rbp),%r14
+	sbbq	24(%rbp),%r15
+	leaq	32(%rbp),%rbp
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r14,16(%rdi)
+	movq	%r15,24(%rdi)
+	leaq	32(%rdi),%rdi
+	incq	%rcx
+	jnz	.Lsqr8x_sub
+
+	sbbq	$0,%rax
+	leaq	(%rbx,%r9,1),%rbx
+	leaq	(%rdi,%r9,1),%rdi
+
+.byte	102,72,15,110,200
+	pxor	%xmm0,%xmm0
+	pshufd	$0,%xmm1,%xmm1
+	movq	40(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	jmp	.Lsqr8x_cond_copy
+
+.align	32
+.Lsqr8x_cond_copy:
+	movdqa	0(%rbx),%xmm2
+	movdqa	16(%rbx),%xmm3
+	leaq	32(%rbx),%rbx
+	movdqu	0(%rdi),%xmm4
+	movdqu	16(%rdi),%xmm5
+	leaq	32(%rdi),%rdi
+	movdqa	%xmm0,-32(%rbx)
+	movdqa	%xmm0,-16(%rbx)
+	movdqa	%xmm0,-32(%rbx,%rdx,1)
+	movdqa	%xmm0,-16(%rbx,%rdx,1)
+	pcmpeqd	%xmm1,%xmm0
+	pand	%xmm1,%xmm2
+	pand	%xmm1,%xmm3
+	pand	%xmm0,%xmm4
+	pand	%xmm0,%xmm5
+	pxor	%xmm0,%xmm0
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqu	%xmm4,-32(%rdi)
+	movdqu	%xmm5,-16(%rdi)
+	addq	$32,%r9
+	jnz	.Lsqr8x_cond_copy
+
+	movq	$1,%rax
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lsqr8x_epilogue:
+	ret
+.cfi_endproc	
+.size	bn_sqr8x_mont,.-bn_sqr8x_mont
+.globl	bn_mulx4x_mont
+.hidden bn_mulx4x_mont
+.type	bn_mulx4x_mont,@function
+.align	32
+bn_mulx4x_mont:
+.cfi_startproc	
+_CET_ENDBR
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+.Lmulx4x_prologue:
+
+	shll	$3,%r9d
+	xorq	%r10,%r10
+	subq	%r9,%r10
+	movq	(%r8),%r8
+	leaq	-72(%rsp,%r10,1),%rbp
+	andq	$-128,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lmulx4x_page_walk
+	jmp	.Lmulx4x_page_walk_done
+
+.align	16
+.Lmulx4x_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
+
+	leaq	(%rdx,%r9,1),%r10
+
+
+
+
+
+
+
+
+
+
+
+
+	movq	%r9,0(%rsp)
+	shrq	$5,%r9
+	movq	%r10,16(%rsp)
+	subq	$1,%r9
+	movq	%r8,24(%rsp)
+	movq	%rdi,32(%rsp)
+	movq	%rax,40(%rsp)
+.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+	movq	%r9,48(%rsp)
+	jmp	.Lmulx4x_body
+
+.align	32
+.Lmulx4x_body:
+	leaq	8(%rdx),%rdi
+	movq	(%rdx),%rdx
+	leaq	64+32(%rsp),%rbx
+	movq	%rdx,%r9
+
+	mulxq	0(%rsi),%r8,%rax
+	mulxq	8(%rsi),%r11,%r14
+	addq	%rax,%r11
+	movq	%rdi,8(%rsp)
+	mulxq	16(%rsi),%r12,%r13
+	adcq	%r14,%r12
+	adcq	$0,%r13
+
+	movq	%r8,%rdi
+	imulq	24(%rsp),%r8
+	xorq	%rbp,%rbp
+
+	mulxq	24(%rsi),%rax,%r14
+	movq	%r8,%rdx
+	leaq	32(%rsi),%rsi
+	adcxq	%rax,%r13
+	adcxq	%rbp,%r14
+
+	mulxq	0(%rcx),%rax,%r10
+	adcxq	%rax,%rdi
+	adoxq	%r11,%r10
+	mulxq	8(%rcx),%rax,%r11
+	adcxq	%rax,%r10
+	adoxq	%r12,%r11
+.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
+	movq	48(%rsp),%rdi
+	movq	%r10,-32(%rbx)
+	adcxq	%rax,%r11
+	adoxq	%r13,%r12
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	%r11,-24(%rbx)
+	adcxq	%rax,%r12
+	adoxq	%rbp,%r15
+	leaq	32(%rcx),%rcx
+	movq	%r12,-16(%rbx)
+
+	jmp	.Lmulx4x_1st
+
+.align	32
+.Lmulx4x_1st:
+	adcxq	%rbp,%r15
+	mulxq	0(%rsi),%r10,%rax
+	adcxq	%r14,%r10
+	mulxq	8(%rsi),%r11,%r14
+	adcxq	%rax,%r11
+	mulxq	16(%rsi),%r12,%rax
+	adcxq	%r14,%r12
+	mulxq	24(%rsi),%r13,%r14
+.byte	0x67,0x67
+	movq	%r8,%rdx
+	adcxq	%rax,%r13
+	adcxq	%rbp,%r14
+	leaq	32(%rsi),%rsi
+	leaq	32(%rbx),%rbx
+
+	adoxq	%r15,%r10
+	mulxq	0(%rcx),%rax,%r15
+	adcxq	%rax,%r10
+	adoxq	%r15,%r11
+	mulxq	8(%rcx),%rax,%r15
+	adcxq	%rax,%r11
+	adoxq	%r15,%r12
+	mulxq	16(%rcx),%rax,%r15
+	movq	%r10,-40(%rbx)
+	adcxq	%rax,%r12
+	movq	%r11,-32(%rbx)
+	adoxq	%r15,%r13
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	%r12,-24(%rbx)
+	adcxq	%rax,%r13
+	adoxq	%rbp,%r15
+	leaq	32(%rcx),%rcx
+	movq	%r13,-16(%rbx)
+
+	decq	%rdi
+	jnz	.Lmulx4x_1st
+
+	movq	0(%rsp),%rax
+	movq	8(%rsp),%rdi
+	adcq	%rbp,%r15
+	addq	%r15,%r14
+	sbbq	%r15,%r15
+	movq	%r14,-8(%rbx)
+	jmp	.Lmulx4x_outer
+
+.align	32
+.Lmulx4x_outer:
+	movq	(%rdi),%rdx
+	leaq	8(%rdi),%rdi
+	subq	%rax,%rsi
+	movq	%r15,(%rbx)
+	leaq	64+32(%rsp),%rbx
+	subq	%rax,%rcx
+
+	mulxq	0(%rsi),%r8,%r11
+	xorl	%ebp,%ebp
+	movq	%rdx,%r9
+	mulxq	8(%rsi),%r14,%r12
+	adoxq	-32(%rbx),%r8
+	adcxq	%r14,%r11
+	mulxq	16(%rsi),%r15,%r13
+	adoxq	-24(%rbx),%r11
+	adcxq	%r15,%r12
+	adoxq	-16(%rbx),%r12
+	adcxq	%rbp,%r13
+	adoxq	%rbp,%r13
+
+	movq	%rdi,8(%rsp)
+	movq	%r8,%r15
+	imulq	24(%rsp),%r8
+	xorl	%ebp,%ebp
+
+	mulxq	24(%rsi),%rax,%r14
+	movq	%r8,%rdx
+	adcxq	%rax,%r13
+	adoxq	-8(%rbx),%r13
+	adcxq	%rbp,%r14
+	leaq	32(%rsi),%rsi
+	adoxq	%rbp,%r14
+
+	mulxq	0(%rcx),%rax,%r10
+	adcxq	%rax,%r15
+	adoxq	%r11,%r10
+	mulxq	8(%rcx),%rax,%r11
+	adcxq	%rax,%r10
+	adoxq	%r12,%r11
+	mulxq	16(%rcx),%rax,%r12
+	movq	%r10,-32(%rbx)
+	adcxq	%rax,%r11
+	adoxq	%r13,%r12
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	%r11,-24(%rbx)
+	leaq	32(%rcx),%rcx
+	adcxq	%rax,%r12
+	adoxq	%rbp,%r15
+	movq	48(%rsp),%rdi
+	movq	%r12,-16(%rbx)
+
+	jmp	.Lmulx4x_inner
+
+.align	32
+.Lmulx4x_inner:
+	mulxq	0(%rsi),%r10,%rax
+	adcxq	%rbp,%r15
+	adoxq	%r14,%r10
+	mulxq	8(%rsi),%r11,%r14
+	adcxq	0(%rbx),%r10
+	adoxq	%rax,%r11
+	mulxq	16(%rsi),%r12,%rax
+	adcxq	8(%rbx),%r11
+	adoxq	%r14,%r12
+	mulxq	24(%rsi),%r13,%r14
+	movq	%r8,%rdx
+	adcxq	16(%rbx),%r12
+	adoxq	%rax,%r13
+	adcxq	24(%rbx),%r13
+	adoxq	%rbp,%r14
+	leaq	32(%rsi),%rsi
+	leaq	32(%rbx),%rbx
+	adcxq	%rbp,%r14
+
+	adoxq	%r15,%r10
+	mulxq	0(%rcx),%rax,%r15
+	adcxq	%rax,%r10
+	adoxq	%r15,%r11
+	mulxq	8(%rcx),%rax,%r15
+	adcxq	%rax,%r11
+	adoxq	%r15,%r12
+	mulxq	16(%rcx),%rax,%r15
+	movq	%r10,-40(%rbx)
+	adcxq	%rax,%r12
+	adoxq	%r15,%r13
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	%r11,-32(%rbx)
+	movq	%r12,-24(%rbx)
+	adcxq	%rax,%r13
+	adoxq	%rbp,%r15
+	leaq	32(%rcx),%rcx
+	movq	%r13,-16(%rbx)
+
+	decq	%rdi
+	jnz	.Lmulx4x_inner
+
+	movq	0(%rsp),%rax
+	movq	8(%rsp),%rdi
+	adcq	%rbp,%r15
+	subq	0(%rbx),%rbp
+	adcq	%r15,%r14
+	sbbq	%r15,%r15
+	movq	%r14,-8(%rbx)
+
+	cmpq	16(%rsp),%rdi
+	jne	.Lmulx4x_outer
+
+	leaq	64(%rsp),%rbx
+	subq	%rax,%rcx
+	negq	%r15
+	movq	%rax,%rdx
+	shrq	$3+2,%rax
+	movq	32(%rsp),%rdi
+	jmp	.Lmulx4x_sub
+
+.align	32
+.Lmulx4x_sub:
+	movq	0(%rbx),%r11
+	movq	8(%rbx),%r12
+	movq	16(%rbx),%r13
+	movq	24(%rbx),%r14
+	leaq	32(%rbx),%rbx
+	sbbq	0(%rcx),%r11
+	sbbq	8(%rcx),%r12
+	sbbq	16(%rcx),%r13
+	sbbq	24(%rcx),%r14
+	leaq	32(%rcx),%rcx
+	movq	%r11,0(%rdi)
+	movq	%r12,8(%rdi)
+	movq	%r13,16(%rdi)
+	movq	%r14,24(%rdi)
+	leaq	32(%rdi),%rdi
+	decq	%rax
+	jnz	.Lmulx4x_sub
+
+	sbbq	$0,%r15
+	leaq	64(%rsp),%rbx
+	subq	%rdx,%rdi
+
+.byte	102,73,15,110,207
+	pxor	%xmm0,%xmm0
+	pshufd	$0,%xmm1,%xmm1
+	movq	40(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	jmp	.Lmulx4x_cond_copy
+
+.align	32
+.Lmulx4x_cond_copy:
+	movdqa	0(%rbx),%xmm2
+	movdqa	16(%rbx),%xmm3
+	leaq	32(%rbx),%rbx
+	movdqu	0(%rdi),%xmm4
+	movdqu	16(%rdi),%xmm5
+	leaq	32(%rdi),%rdi
+	movdqa	%xmm0,-32(%rbx)
+	movdqa	%xmm0,-16(%rbx)
+	pcmpeqd	%xmm1,%xmm0
+	pand	%xmm1,%xmm2
+	pand	%xmm1,%xmm3
+	pand	%xmm0,%xmm4
+	pand	%xmm0,%xmm5
+	pxor	%xmm0,%xmm0
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqu	%xmm4,-32(%rdi)
+	movdqu	%xmm5,-16(%rdi)
+	subq	$32,%rdx
+	jnz	.Lmulx4x_cond_copy
+
+	movq	%rdx,(%rbx)
+
+	movq	$1,%rax
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lmulx4x_epilogue:
+	ret
+.cfi_endproc	
+.size	bn_mulx4x_mont,.-bn_mulx4x_mont
+.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	16
+#endif
diff --git a/gen/bcm/x86_64-mont-win.asm b/gen/bcm/x86_64-mont-win.asm
new file mode 100644
index 0000000..b0611fc
--- /dev/null
+++ b/gen/bcm/x86_64-mont-win.asm
@@ -0,0 +1,1470 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section	.text code align=64
+
+
+global	bn_mul_mont_nohw
+
+ALIGN	16
+bn_mul_mont_nohw:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_bn_mul_mont_nohw:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+_CET_ENDBR
+	mov	r9d,r9d
+	mov	rax,rsp
+
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+
+	neg	r9
+	mov	r11,rsp
+	lea	r10,[((-16))+r9*8+rsp]
+	neg	r9
+	and	r10,-1024
+
+
+
+
+
+
+
+
+
+	sub	r11,r10
+	and	r11,-4096
+	lea	rsp,[r11*1+r10]
+	mov	r11,QWORD[rsp]
+	cmp	rsp,r10
+	ja	NEAR $L$mul_page_walk
+	jmp	NEAR $L$mul_page_walk_done
+
+ALIGN	16
+$L$mul_page_walk:
+	lea	rsp,[((-4096))+rsp]
+	mov	r11,QWORD[rsp]
+	cmp	rsp,r10
+	ja	NEAR $L$mul_page_walk
+$L$mul_page_walk_done:
+
+	mov	QWORD[8+r9*8+rsp],rax
+
+$L$mul_body:
+	mov	r12,rdx
+	mov	r8,QWORD[r8]
+	mov	rbx,QWORD[r12]
+	mov	rax,QWORD[rsi]
+
+	xor	r14,r14
+	xor	r15,r15
+
+	mov	rbp,r8
+	mul	rbx
+	mov	r10,rax
+	mov	rax,QWORD[rcx]
+
+	imul	rbp,r10
+	mov	r11,rdx
+
+	mul	rbp
+	add	r10,rax
+	mov	rax,QWORD[8+rsi]
+	adc	rdx,0
+	mov	r13,rdx
+
+	lea	r15,[1+r15]
+	jmp	NEAR $L$1st_enter
+
+ALIGN	16
+$L$1st:
+	add	r13,rax
+	mov	rax,QWORD[r15*8+rsi]
+	adc	rdx,0
+	add	r13,r11
+	mov	r11,r10
+	adc	rdx,0
+	mov	QWORD[((-16))+r15*8+rsp],r13
+	mov	r13,rdx
+
+$L$1st_enter:
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[r15*8+rcx]
+	adc	rdx,0
+	lea	r15,[1+r15]
+	mov	r10,rdx
+
+	mul	rbp
+	cmp	r15,r9
+	jne	NEAR $L$1st
+
+	add	r13,rax
+	mov	rax,QWORD[rsi]
+	adc	rdx,0
+	add	r13,r11
+	adc	rdx,0
+	mov	QWORD[((-16))+r15*8+rsp],r13
+	mov	r13,rdx
+	mov	r11,r10
+
+	xor	rdx,rdx
+	add	r13,r11
+	adc	rdx,0
+	mov	QWORD[((-8))+r9*8+rsp],r13
+	mov	QWORD[r9*8+rsp],rdx
+
+	lea	r14,[1+r14]
+	jmp	NEAR $L$outer
+ALIGN	16
+$L$outer:
+	mov	rbx,QWORD[r14*8+r12]
+	xor	r15,r15
+	mov	rbp,r8
+	mov	r10,QWORD[rsp]
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[rcx]
+	adc	rdx,0
+
+	imul	rbp,r10
+	mov	r11,rdx
+
+	mul	rbp
+	add	r10,rax
+	mov	rax,QWORD[8+rsi]
+	adc	rdx,0
+	mov	r10,QWORD[8+rsp]
+	mov	r13,rdx
+
+	lea	r15,[1+r15]
+	jmp	NEAR $L$inner_enter
+
+ALIGN	16
+$L$inner:
+	add	r13,rax
+	mov	rax,QWORD[r15*8+rsi]
+	adc	rdx,0
+	add	r13,r10
+	mov	r10,QWORD[r15*8+rsp]
+	adc	rdx,0
+	mov	QWORD[((-16))+r15*8+rsp],r13
+	mov	r13,rdx
+
+$L$inner_enter:
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[r15*8+rcx]
+	adc	rdx,0
+	add	r10,r11
+	mov	r11,rdx
+	adc	r11,0
+	lea	r15,[1+r15]
+
+	mul	rbp
+	cmp	r15,r9
+	jne	NEAR $L$inner
+
+	add	r13,rax
+	mov	rax,QWORD[rsi]
+	adc	rdx,0
+	add	r13,r10
+	mov	r10,QWORD[r15*8+rsp]
+	adc	rdx,0
+	mov	QWORD[((-16))+r15*8+rsp],r13
+	mov	r13,rdx
+
+	xor	rdx,rdx
+	add	r13,r11
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-8))+r9*8+rsp],r13
+	mov	QWORD[r9*8+rsp],rdx
+
+	lea	r14,[1+r14]
+	cmp	r14,r9
+	jb	NEAR $L$outer
+
+	xor	r14,r14
+	mov	rax,QWORD[rsp]
+	mov	r15,r9
+
+ALIGN	16
+$L$sub:	sbb	rax,QWORD[r14*8+rcx]
+	mov	QWORD[r14*8+rdi],rax
+	mov	rax,QWORD[8+r14*8+rsp]
+	lea	r14,[1+r14]
+	dec	r15
+	jnz	NEAR $L$sub
+
+	sbb	rax,0
+	mov	rbx,-1
+	xor	rbx,rax
+	xor	r14,r14
+	mov	r15,r9
+
+$L$copy:
+	mov	rcx,QWORD[r14*8+rdi]
+	mov	rdx,QWORD[r14*8+rsp]
+	and	rcx,rbx
+	and	rdx,rax
+	mov	QWORD[r14*8+rsp],r9
+	or	rdx,rcx
+	mov	QWORD[r14*8+rdi],rdx
+	lea	r14,[1+r14]
+	sub	r15,1
+	jnz	NEAR $L$copy
+
+	mov	rsi,QWORD[8+r9*8+rsp]
+
+	mov	rax,1
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$mul_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_bn_mul_mont_nohw:
+global	bn_mul4x_mont
+
+ALIGN	16
+bn_mul4x_mont:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_bn_mul4x_mont:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+_CET_ENDBR
+	mov	r9d,r9d
+	mov	rax,rsp
+
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+
+	neg	r9
+	mov	r11,rsp
+	lea	r10,[((-32))+r9*8+rsp]
+	neg	r9
+	and	r10,-1024
+
+	sub	r11,r10
+	and	r11,-4096
+	lea	rsp,[r11*1+r10]
+	mov	r11,QWORD[rsp]
+	cmp	rsp,r10
+	ja	NEAR $L$mul4x_page_walk
+	jmp	NEAR $L$mul4x_page_walk_done
+
+$L$mul4x_page_walk:
+	lea	rsp,[((-4096))+rsp]
+	mov	r11,QWORD[rsp]
+	cmp	rsp,r10
+	ja	NEAR $L$mul4x_page_walk
+$L$mul4x_page_walk_done:
+
+	mov	QWORD[8+r9*8+rsp],rax
+
+$L$mul4x_body:
+	mov	QWORD[16+r9*8+rsp],rdi
+	mov	r12,rdx
+	mov	r8,QWORD[r8]
+	mov	rbx,QWORD[r12]
+	mov	rax,QWORD[rsi]
+
+	xor	r14,r14
+	xor	r15,r15
+
+	mov	rbp,r8
+	mul	rbx
+	mov	r10,rax
+	mov	rax,QWORD[rcx]
+
+	imul	rbp,r10
+	mov	r11,rdx
+
+	mul	rbp
+	add	r10,rax
+	mov	rax,QWORD[8+rsi]
+	adc	rdx,0
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[8+rcx]
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[16+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	lea	r15,[4+r15]
+	adc	rdx,0
+	mov	QWORD[rsp],rdi
+	mov	r13,rdx
+	jmp	NEAR $L$1st4x
+ALIGN	16
+$L$1st4x:
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[((-16))+r15*8+rcx]
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	rbp
+	add	r13,rax
+	mov	rax,QWORD[((-8))+r15*8+rsi]
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-24))+r15*8+rsp],r13
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[((-8))+r15*8+rcx]
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[r15*8+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	adc	rdx,0
+	mov	QWORD[((-16))+r15*8+rsp],rdi
+	mov	r13,rdx
+
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[r15*8+rcx]
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	rbp
+	add	r13,rax
+	mov	rax,QWORD[8+r15*8+rsi]
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-8))+r15*8+rsp],r13
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[8+r15*8+rcx]
+	adc	rdx,0
+	lea	r15,[4+r15]
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[((-16))+r15*8+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	adc	rdx,0
+	mov	QWORD[((-32))+r15*8+rsp],rdi
+	mov	r13,rdx
+	cmp	r15,r9
+	jb	NEAR $L$1st4x
+
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[((-16))+r15*8+rcx]
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	rbp
+	add	r13,rax
+	mov	rax,QWORD[((-8))+r15*8+rsi]
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-24))+r15*8+rsp],r13
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[((-8))+r15*8+rcx]
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[rsi]
+	adc	rdx,0
+	add	rdi,r11
+	adc	rdx,0
+	mov	QWORD[((-16))+r15*8+rsp],rdi
+	mov	r13,rdx
+
+	xor	rdi,rdi
+	add	r13,r10
+	adc	rdi,0
+	mov	QWORD[((-8))+r15*8+rsp],r13
+	mov	QWORD[r15*8+rsp],rdi
+
+	lea	r14,[1+r14]
+ALIGN	4
+$L$outer4x:
+	mov	rbx,QWORD[r14*8+r12]
+	xor	r15,r15
+	mov	r10,QWORD[rsp]
+	mov	rbp,r8
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[rcx]
+	adc	rdx,0
+
+	imul	rbp,r10
+	mov	r11,rdx
+
+	mul	rbp
+	add	r10,rax
+	mov	rax,QWORD[8+rsi]
+	adc	rdx,0
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[8+rcx]
+	adc	rdx,0
+	add	r11,QWORD[8+rsp]
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[16+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	lea	r15,[4+r15]
+	adc	rdx,0
+	mov	QWORD[rsp],rdi
+	mov	r13,rdx
+	jmp	NEAR $L$inner4x
+ALIGN	16
+$L$inner4x:
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[((-16))+r15*8+rcx]
+	adc	rdx,0
+	add	r10,QWORD[((-16))+r15*8+rsp]
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	rbp
+	add	r13,rax
+	mov	rax,QWORD[((-8))+r15*8+rsi]
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-24))+r15*8+rsp],r13
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[((-8))+r15*8+rcx]
+	adc	rdx,0
+	add	r11,QWORD[((-8))+r15*8+rsp]
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[r15*8+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	adc	rdx,0
+	mov	QWORD[((-16))+r15*8+rsp],rdi
+	mov	r13,rdx
+
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[r15*8+rcx]
+	adc	rdx,0
+	add	r10,QWORD[r15*8+rsp]
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	rbp
+	add	r13,rax
+	mov	rax,QWORD[8+r15*8+rsi]
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-8))+r15*8+rsp],r13
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[8+r15*8+rcx]
+	adc	rdx,0
+	add	r11,QWORD[8+r15*8+rsp]
+	adc	rdx,0
+	lea	r15,[4+r15]
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[((-16))+r15*8+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	adc	rdx,0
+	mov	QWORD[((-32))+r15*8+rsp],rdi
+	mov	r13,rdx
+	cmp	r15,r9
+	jb	NEAR $L$inner4x
+
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[((-16))+r15*8+rcx]
+	adc	rdx,0
+	add	r10,QWORD[((-16))+r15*8+rsp]
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	rbp
+	add	r13,rax
+	mov	rax,QWORD[((-8))+r15*8+rsi]
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-24))+r15*8+rsp],r13
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[((-8))+r15*8+rcx]
+	adc	rdx,0
+	add	r11,QWORD[((-8))+r15*8+rsp]
+	adc	rdx,0
+	lea	r14,[1+r14]
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[rsi]
+	adc	rdx,0
+	add	rdi,r11
+	adc	rdx,0
+	mov	QWORD[((-16))+r15*8+rsp],rdi
+	mov	r13,rdx
+
+	xor	rdi,rdi
+	add	r13,r10
+	adc	rdi,0
+	add	r13,QWORD[r9*8+rsp]
+	adc	rdi,0
+	mov	QWORD[((-8))+r15*8+rsp],r13
+	mov	QWORD[r15*8+rsp],rdi
+
+	cmp	r14,r9
+	jb	NEAR $L$outer4x
+	mov	rdi,QWORD[16+r9*8+rsp]
+	lea	r15,[((-4))+r9]
+	mov	rax,QWORD[rsp]
+	mov	rdx,QWORD[8+rsp]
+	shr	r15,2
+	lea	rsi,[rsp]
+	xor	r14,r14
+
+	sub	rax,QWORD[rcx]
+	mov	rbx,QWORD[16+rsi]
+	mov	rbp,QWORD[24+rsi]
+	sbb	rdx,QWORD[8+rcx]
+
+$L$sub4x:
+	mov	QWORD[r14*8+rdi],rax
+	mov	QWORD[8+r14*8+rdi],rdx
+	sbb	rbx,QWORD[16+r14*8+rcx]
+	mov	rax,QWORD[32+r14*8+rsi]
+	mov	rdx,QWORD[40+r14*8+rsi]
+	sbb	rbp,QWORD[24+r14*8+rcx]
+	mov	QWORD[16+r14*8+rdi],rbx
+	mov	QWORD[24+r14*8+rdi],rbp
+	sbb	rax,QWORD[32+r14*8+rcx]
+	mov	rbx,QWORD[48+r14*8+rsi]
+	mov	rbp,QWORD[56+r14*8+rsi]
+	sbb	rdx,QWORD[40+r14*8+rcx]
+	lea	r14,[4+r14]
+	dec	r15
+	jnz	NEAR $L$sub4x
+
+	mov	QWORD[r14*8+rdi],rax
+	mov	rax,QWORD[32+r14*8+rsi]
+	sbb	rbx,QWORD[16+r14*8+rcx]
+	mov	QWORD[8+r14*8+rdi],rdx
+	sbb	rbp,QWORD[24+r14*8+rcx]
+	mov	QWORD[16+r14*8+rdi],rbx
+
+	sbb	rax,0
+	mov	QWORD[24+r14*8+rdi],rbp
+	pxor	xmm0,xmm0
+DB	102,72,15,110,224
+	pcmpeqd	xmm5,xmm5
+	pshufd	xmm4,xmm4,0
+	mov	r15,r9
+	pxor	xmm5,xmm4
+	shr	r15,2
+	xor	eax,eax
+
+	jmp	NEAR $L$copy4x
+ALIGN	16
+$L$copy4x:
+	movdqa	xmm1,XMMWORD[rax*1+rsp]
+	movdqu	xmm2,XMMWORD[rax*1+rdi]
+	pand	xmm1,xmm4
+	pand	xmm2,xmm5
+	movdqa	xmm3,XMMWORD[16+rax*1+rsp]
+	movdqa	XMMWORD[rax*1+rsp],xmm0
+	por	xmm1,xmm2
+	movdqu	xmm2,XMMWORD[16+rax*1+rdi]
+	movdqu	XMMWORD[rax*1+rdi],xmm1
+	pand	xmm3,xmm4
+	pand	xmm2,xmm5
+	movdqa	XMMWORD[16+rax*1+rsp],xmm0
+	por	xmm3,xmm2
+	movdqu	XMMWORD[16+rax*1+rdi],xmm3
+	lea	rax,[32+rax]
+	dec	r15
+	jnz	NEAR $L$copy4x
+	mov	rsi,QWORD[8+r9*8+rsp]
+
+	mov	rax,1
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$mul4x_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_bn_mul4x_mont:
+EXTERN	bn_sqrx8x_internal
+EXTERN	bn_sqr8x_internal
+
+global	bn_sqr8x_mont
+
+ALIGN	32
+bn_sqr8x_mont:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_bn_sqr8x_mont:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+_CET_ENDBR
+	mov	r9d,r9d
+	mov	rax,rsp
+
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+$L$sqr8x_prologue:
+
+	mov	r10d,r9d
+	shl	r9d,3
+	shl	r10,3+2
+	neg	r9
+
+
+
+
+
+
+	lea	r11,[((-64))+r9*2+rsp]
+	mov	rbp,rsp
+	mov	r8,QWORD[r8]
+	sub	r11,rsi
+	and	r11,4095
+	cmp	r10,r11
+	jb	NEAR $L$sqr8x_sp_alt
+	sub	rbp,r11
+	lea	rbp,[((-64))+r9*2+rbp]
+	jmp	NEAR $L$sqr8x_sp_done
+
+ALIGN	32
+$L$sqr8x_sp_alt:
+	lea	r10,[((4096-64))+r9*2]
+	lea	rbp,[((-64))+r9*2+rbp]
+	sub	r11,r10
+	mov	r10,0
+	cmovc	r11,r10
+	sub	rbp,r11
+$L$sqr8x_sp_done:
+	and	rbp,-64
+	mov	r11,rsp
+	sub	r11,rbp
+	and	r11,-4096
+	lea	rsp,[rbp*1+r11]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$sqr8x_page_walk
+	jmp	NEAR $L$sqr8x_page_walk_done
+
+ALIGN	16
+$L$sqr8x_page_walk:
+	lea	rsp,[((-4096))+rsp]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$sqr8x_page_walk
+$L$sqr8x_page_walk_done:
+
+	mov	r10,r9
+	neg	r9
+
+	mov	QWORD[32+rsp],r8
+	mov	QWORD[40+rsp],rax
+
+$L$sqr8x_body:
+
+DB	102,72,15,110,209
+	pxor	xmm0,xmm0
+DB	102,72,15,110,207
+DB	102,73,15,110,218
+	test	rdx,rdx
+	jz	NEAR $L$sqr8x_nox
+
+	call	bn_sqrx8x_internal
+
+
+
+
+	lea	rbx,[rcx*1+r8]
+	mov	r9,rcx
+	mov	rdx,rcx
+DB	102,72,15,126,207
+	sar	rcx,3+2
+	jmp	NEAR $L$sqr8x_sub
+
+ALIGN	32
+$L$sqr8x_nox:
+	call	bn_sqr8x_internal
+
+
+
+
+	lea	rbx,[r9*1+rdi]
+	mov	rcx,r9
+	mov	rdx,r9
+DB	102,72,15,126,207
+	sar	rcx,3+2
+	jmp	NEAR $L$sqr8x_sub
+
+ALIGN	32
+$L$sqr8x_sub:
+	mov	r12,QWORD[rbx]
+	mov	r13,QWORD[8+rbx]
+	mov	r14,QWORD[16+rbx]
+	mov	r15,QWORD[24+rbx]
+	lea	rbx,[32+rbx]
+	sbb	r12,QWORD[rbp]
+	sbb	r13,QWORD[8+rbp]
+	sbb	r14,QWORD[16+rbp]
+	sbb	r15,QWORD[24+rbp]
+	lea	rbp,[32+rbp]
+	mov	QWORD[rdi],r12
+	mov	QWORD[8+rdi],r13
+	mov	QWORD[16+rdi],r14
+	mov	QWORD[24+rdi],r15
+	lea	rdi,[32+rdi]
+	inc	rcx
+	jnz	NEAR $L$sqr8x_sub
+
+	sbb	rax,0
+	lea	rbx,[r9*1+rbx]
+	lea	rdi,[r9*1+rdi]
+
+DB	102,72,15,110,200
+	pxor	xmm0,xmm0
+	pshufd	xmm1,xmm1,0
+	mov	rsi,QWORD[40+rsp]
+
+	jmp	NEAR $L$sqr8x_cond_copy
+
+ALIGN	32
+$L$sqr8x_cond_copy:
+	movdqa	xmm2,XMMWORD[rbx]
+	movdqa	xmm3,XMMWORD[16+rbx]
+	lea	rbx,[32+rbx]
+	movdqu	xmm4,XMMWORD[rdi]
+	movdqu	xmm5,XMMWORD[16+rdi]
+	lea	rdi,[32+rdi]
+	movdqa	XMMWORD[(-32)+rbx],xmm0
+	movdqa	XMMWORD[(-16)+rbx],xmm0
+	movdqa	XMMWORD[(-32)+rdx*1+rbx],xmm0
+	movdqa	XMMWORD[(-16)+rdx*1+rbx],xmm0
+	pcmpeqd	xmm0,xmm1
+	pand	xmm2,xmm1
+	pand	xmm3,xmm1
+	pand	xmm4,xmm0
+	pand	xmm5,xmm0
+	pxor	xmm0,xmm0
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqu	XMMWORD[(-32)+rdi],xmm4
+	movdqu	XMMWORD[(-16)+rdi],xmm5
+	add	r9,32
+	jnz	NEAR $L$sqr8x_cond_copy
+
+	mov	rax,1
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$sqr8x_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_bn_sqr8x_mont:
+global	bn_mulx4x_mont
+
+ALIGN	32
+bn_mulx4x_mont:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_bn_mulx4x_mont:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+_CET_ENDBR
+	mov	rax,rsp
+
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+$L$mulx4x_prologue:
+
+	shl	r9d,3
+	xor	r10,r10
+	sub	r10,r9
+	mov	r8,QWORD[r8]
+	lea	rbp,[((-72))+r10*1+rsp]
+	and	rbp,-128
+	mov	r11,rsp
+	sub	r11,rbp
+	and	r11,-4096
+	lea	rsp,[rbp*1+r11]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$mulx4x_page_walk
+	jmp	NEAR $L$mulx4x_page_walk_done
+
+ALIGN	16
+$L$mulx4x_page_walk:
+	lea	rsp,[((-4096))+rsp]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$mulx4x_page_walk
+$L$mulx4x_page_walk_done:
+
+	lea	r10,[r9*1+rdx]
+
+
+
+
+
+
+
+
+
+
+
+
+	mov	QWORD[rsp],r9
+	shr	r9,5
+	mov	QWORD[16+rsp],r10
+	sub	r9,1
+	mov	QWORD[24+rsp],r8
+	mov	QWORD[32+rsp],rdi
+	mov	QWORD[40+rsp],rax
+
+	mov	QWORD[48+rsp],r9
+	jmp	NEAR $L$mulx4x_body
+
+ALIGN	32
+$L$mulx4x_body:
+	lea	rdi,[8+rdx]
+	mov	rdx,QWORD[rdx]
+	lea	rbx,[((64+32))+rsp]
+	mov	r9,rdx
+
+	mulx	rax,r8,QWORD[rsi]
+	mulx	r14,r11,QWORD[8+rsi]
+	add	r11,rax
+	mov	QWORD[8+rsp],rdi
+	mulx	r13,r12,QWORD[16+rsi]
+	adc	r12,r14
+	adc	r13,0
+
+	mov	rdi,r8
+	imul	r8,QWORD[24+rsp]
+	xor	rbp,rbp
+
+	mulx	r14,rax,QWORD[24+rsi]
+	mov	rdx,r8
+	lea	rsi,[32+rsi]
+	adcx	r13,rax
+	adcx	r14,rbp
+
+	mulx	r10,rax,QWORD[rcx]
+	adcx	rdi,rax
+	adox	r10,r11
+	mulx	r11,rax,QWORD[8+rcx]
+	adcx	r10,rax
+	adox	r11,r12
+	DB	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
+	mov	rdi,QWORD[48+rsp]
+	mov	QWORD[((-32))+rbx],r10
+	adcx	r11,rax
+	adox	r12,r13
+	mulx	r15,rax,QWORD[24+rcx]
+	mov	rdx,r9
+	mov	QWORD[((-24))+rbx],r11
+	adcx	r12,rax
+	adox	r15,rbp
+	lea	rcx,[32+rcx]
+	mov	QWORD[((-16))+rbx],r12
+
+	jmp	NEAR $L$mulx4x_1st
+
+ALIGN	32
+$L$mulx4x_1st:
+	adcx	r15,rbp
+	mulx	rax,r10,QWORD[rsi]
+	adcx	r10,r14
+	mulx	r14,r11,QWORD[8+rsi]
+	adcx	r11,rax
+	mulx	rax,r12,QWORD[16+rsi]
+	adcx	r12,r14
+	mulx	r14,r13,QWORD[24+rsi]
+	DB	0x67,0x67
+	mov	rdx,r8
+	adcx	r13,rax
+	adcx	r14,rbp
+	lea	rsi,[32+rsi]
+	lea	rbx,[32+rbx]
+
+	adox	r10,r15
+	mulx	r15,rax,QWORD[rcx]
+	adcx	r10,rax
+	adox	r11,r15
+	mulx	r15,rax,QWORD[8+rcx]
+	adcx	r11,rax
+	adox	r12,r15
+	mulx	r15,rax,QWORD[16+rcx]
+	mov	QWORD[((-40))+rbx],r10
+	adcx	r12,rax
+	mov	QWORD[((-32))+rbx],r11
+	adox	r13,r15
+	mulx	r15,rax,QWORD[24+rcx]
+	mov	rdx,r9
+	mov	QWORD[((-24))+rbx],r12
+	adcx	r13,rax
+	adox	r15,rbp
+	lea	rcx,[32+rcx]
+	mov	QWORD[((-16))+rbx],r13
+
+	dec	rdi
+	jnz	NEAR $L$mulx4x_1st
+
+	mov	rax,QWORD[rsp]
+	mov	rdi,QWORD[8+rsp]
+	adc	r15,rbp
+	add	r14,r15
+	sbb	r15,r15
+	mov	QWORD[((-8))+rbx],r14
+	jmp	NEAR $L$mulx4x_outer
+
+ALIGN	32
+$L$mulx4x_outer:
+	mov	rdx,QWORD[rdi]
+	lea	rdi,[8+rdi]
+	sub	rsi,rax
+	mov	QWORD[rbx],r15
+	lea	rbx,[((64+32))+rsp]
+	sub	rcx,rax
+
+	mulx	r11,r8,QWORD[rsi]
+	xor	ebp,ebp
+	mov	r9,rdx
+	mulx	r12,r14,QWORD[8+rsi]
+	adox	r8,QWORD[((-32))+rbx]
+	adcx	r11,r14
+	mulx	r13,r15,QWORD[16+rsi]
+	adox	r11,QWORD[((-24))+rbx]
+	adcx	r12,r15
+	adox	r12,QWORD[((-16))+rbx]
+	adcx	r13,rbp
+	adox	r13,rbp
+
+	mov	QWORD[8+rsp],rdi
+	mov	r15,r8
+	imul	r8,QWORD[24+rsp]
+	xor	ebp,ebp
+
+	mulx	r14,rax,QWORD[24+rsi]
+	mov	rdx,r8
+	adcx	r13,rax
+	adox	r13,QWORD[((-8))+rbx]
+	adcx	r14,rbp
+	lea	rsi,[32+rsi]
+	adox	r14,rbp
+
+	mulx	r10,rax,QWORD[rcx]
+	adcx	r15,rax
+	adox	r10,r11
+	mulx	r11,rax,QWORD[8+rcx]
+	adcx	r10,rax
+	adox	r11,r12
+	mulx	r12,rax,QWORD[16+rcx]
+	mov	QWORD[((-32))+rbx],r10
+	adcx	r11,rax
+	adox	r12,r13
+	mulx	r15,rax,QWORD[24+rcx]
+	mov	rdx,r9
+	mov	QWORD[((-24))+rbx],r11
+	lea	rcx,[32+rcx]
+	adcx	r12,rax
+	adox	r15,rbp
+	mov	rdi,QWORD[48+rsp]
+	mov	QWORD[((-16))+rbx],r12
+
+	jmp	NEAR $L$mulx4x_inner
+
+ALIGN	32
+$L$mulx4x_inner:
+	mulx	rax,r10,QWORD[rsi]
+	adcx	r15,rbp
+	adox	r10,r14
+	mulx	r14,r11,QWORD[8+rsi]
+	adcx	r10,QWORD[rbx]
+	adox	r11,rax
+	mulx	rax,r12,QWORD[16+rsi]
+	adcx	r11,QWORD[8+rbx]
+	adox	r12,r14
+	mulx	r14,r13,QWORD[24+rsi]
+	mov	rdx,r8
+	adcx	r12,QWORD[16+rbx]
+	adox	r13,rax
+	adcx	r13,QWORD[24+rbx]
+	adox	r14,rbp
+	lea	rsi,[32+rsi]
+	lea	rbx,[32+rbx]
+	adcx	r14,rbp
+
+	adox	r10,r15
+	mulx	r15,rax,QWORD[rcx]
+	adcx	r10,rax
+	adox	r11,r15
+	mulx	r15,rax,QWORD[8+rcx]
+	adcx	r11,rax
+	adox	r12,r15
+	mulx	r15,rax,QWORD[16+rcx]
+	mov	QWORD[((-40))+rbx],r10
+	adcx	r12,rax
+	adox	r13,r15
+	mulx	r15,rax,QWORD[24+rcx]
+	mov	rdx,r9
+	mov	QWORD[((-32))+rbx],r11
+	mov	QWORD[((-24))+rbx],r12
+	adcx	r13,rax
+	adox	r15,rbp
+	lea	rcx,[32+rcx]
+	mov	QWORD[((-16))+rbx],r13
+
+	dec	rdi
+	jnz	NEAR $L$mulx4x_inner
+
+	mov	rax,QWORD[rsp]
+	mov	rdi,QWORD[8+rsp]
+	adc	r15,rbp
+	sub	rbp,QWORD[rbx]
+	adc	r14,r15
+	sbb	r15,r15
+	mov	QWORD[((-8))+rbx],r14
+
+	cmp	rdi,QWORD[16+rsp]
+	jne	NEAR $L$mulx4x_outer
+
+	lea	rbx,[64+rsp]
+	sub	rcx,rax
+	neg	r15
+	mov	rdx,rax
+	shr	rax,3+2
+	mov	rdi,QWORD[32+rsp]
+	jmp	NEAR $L$mulx4x_sub
+
+ALIGN	32
+$L$mulx4x_sub:
+	mov	r11,QWORD[rbx]
+	mov	r12,QWORD[8+rbx]
+	mov	r13,QWORD[16+rbx]
+	mov	r14,QWORD[24+rbx]
+	lea	rbx,[32+rbx]
+	sbb	r11,QWORD[rcx]
+	sbb	r12,QWORD[8+rcx]
+	sbb	r13,QWORD[16+rcx]
+	sbb	r14,QWORD[24+rcx]
+	lea	rcx,[32+rcx]
+	mov	QWORD[rdi],r11
+	mov	QWORD[8+rdi],r12
+	mov	QWORD[16+rdi],r13
+	mov	QWORD[24+rdi],r14
+	lea	rdi,[32+rdi]
+	dec	rax
+	jnz	NEAR $L$mulx4x_sub
+
+	sbb	r15,0
+	lea	rbx,[64+rsp]
+	sub	rdi,rdx
+
+DB	102,73,15,110,207
+	pxor	xmm0,xmm0
+	pshufd	xmm1,xmm1,0
+	mov	rsi,QWORD[40+rsp]
+
+	jmp	NEAR $L$mulx4x_cond_copy
+
+ALIGN	32
+$L$mulx4x_cond_copy:
+	movdqa	xmm2,XMMWORD[rbx]
+	movdqa	xmm3,XMMWORD[16+rbx]
+	lea	rbx,[32+rbx]
+	movdqu	xmm4,XMMWORD[rdi]
+	movdqu	xmm5,XMMWORD[16+rdi]
+	lea	rdi,[32+rdi]
+	movdqa	XMMWORD[(-32)+rbx],xmm0
+	movdqa	XMMWORD[(-16)+rbx],xmm0
+	pcmpeqd	xmm0,xmm1
+	pand	xmm2,xmm1
+	pand	xmm3,xmm1
+	pand	xmm4,xmm0
+	pand	xmm5,xmm0
+	pxor	xmm0,xmm0
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqu	XMMWORD[(-32)+rdi],xmm4
+	movdqu	XMMWORD[(-16)+rdi],xmm5
+	sub	rdx,32
+	jnz	NEAR $L$mulx4x_cond_copy
+
+	mov	QWORD[rbx],rdx
+
+	mov	rax,1
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$mulx4x_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_bn_mulx4x_mont:
+	DB	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
+	DB	112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
+	DB	54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83
+	DB	32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
+	DB	115,108,46,111,114,103,62,0
+ALIGN	16
+EXTERN	__imp_RtlVirtualUnwind
+
+ALIGN	16
+mul_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[152+r8]
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	mov	r10,QWORD[192+r8]
+	mov	rax,QWORD[8+r10*8+rax]
+
+	jmp	NEAR $L$common_pop_regs
+
+
+
+ALIGN	16
+sqr_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_pop_regs
+
+	mov	rax,QWORD[152+r8]
+
+	mov	r10d,DWORD[8+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[40+rax]
+
+$L$common_pop_regs:
+	mov	rbx,QWORD[((-8))+rax]
+	mov	rbp,QWORD[((-16))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r14,QWORD[((-40))+rax]
+	mov	r15,QWORD[((-48))+rax]
+	mov	QWORD[144+r8],rbx
+	mov	QWORD[160+r8],rbp
+	mov	QWORD[216+r8],r12
+	mov	QWORD[224+r8],r13
+	mov	QWORD[232+r8],r14
+	mov	QWORD[240+r8],r15
+
+$L$common_seh_tail:
+	mov	rdi,QWORD[8+rax]
+	mov	rsi,QWORD[16+rax]
+	mov	QWORD[152+r8],rax
+	mov	QWORD[168+r8],rsi
+	mov	QWORD[176+r8],rdi
+
+	mov	rdi,QWORD[40+r9]
+	mov	rsi,r8
+	mov	ecx,154
+	DD	0xa548f3fc
+
+	mov	rsi,r9
+	xor	rcx,rcx
+	mov	rdx,QWORD[8+rsi]
+	mov	r8,QWORD[rsi]
+	mov	r9,QWORD[16+rsi]
+	mov	r10,QWORD[40+rsi]
+	lea	r11,[56+rsi]
+	lea	r12,[24+rsi]
+	mov	QWORD[32+rsp],r10
+	mov	QWORD[40+rsp],r11
+	mov	QWORD[48+rsp],r12
+	mov	QWORD[56+rsp],rcx
+	call	QWORD[__imp_RtlVirtualUnwind]
+
+	mov	eax,1
+	add	rsp,64
+	popfq
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rbp
+	pop	rbx
+	pop	rdi
+	pop	rsi
+	ret
+
+
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_bn_mul_mont_nohw wrt ..imagebase
+	DD	$L$SEH_end_bn_mul_mont_nohw wrt ..imagebase
+	DD	$L$SEH_info_bn_mul_mont_nohw wrt ..imagebase
+
+	DD	$L$SEH_begin_bn_mul4x_mont wrt ..imagebase
+	DD	$L$SEH_end_bn_mul4x_mont wrt ..imagebase
+	DD	$L$SEH_info_bn_mul4x_mont wrt ..imagebase
+
+	DD	$L$SEH_begin_bn_sqr8x_mont wrt ..imagebase
+	DD	$L$SEH_end_bn_sqr8x_mont wrt ..imagebase
+	DD	$L$SEH_info_bn_sqr8x_mont wrt ..imagebase
+	DD	$L$SEH_begin_bn_mulx4x_mont wrt ..imagebase
+	DD	$L$SEH_end_bn_mulx4x_mont wrt ..imagebase
+	DD	$L$SEH_info_bn_mulx4x_mont wrt ..imagebase
+section	.xdata rdata align=8
+ALIGN	8
+$L$SEH_info_bn_mul_mont_nohw:
+	DB	9,0,0,0
+	DD	mul_handler wrt ..imagebase
+	DD	$L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase
+$L$SEH_info_bn_mul4x_mont:
+	DB	9,0,0,0
+	DD	mul_handler wrt ..imagebase
+	DD	$L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase
+$L$SEH_info_bn_sqr8x_mont:
+	DB	9,0,0,0
+	DD	sqr_handler wrt ..imagebase
+	DD	$L$sqr8x_prologue wrt ..imagebase,$L$sqr8x_body wrt ..imagebase,$L$sqr8x_epilogue wrt ..imagebase
+ALIGN	8
+$L$SEH_info_bn_mulx4x_mont:
+	DB	9,0,0,0
+	DD	sqr_handler wrt ..imagebase
+	DD	$L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase
+ALIGN	8
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/x86_64-mont5-apple.S b/gen/bcm/x86_64-mont5-apple.S
new file mode 100644
index 0000000..bd63d91
--- /dev/null
+++ b/gen/bcm/x86_64-mont5-apple.S
@@ -0,0 +1,3624 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text	
+
+
+
+.globl	_bn_mul_mont_gather5
+.private_extern _bn_mul_mont_gather5
+
+.p2align	6
+_bn_mul_mont_gather5:
+
+_CET_ENDBR
+	movl	%r9d,%r9d
+	movq	%rsp,%rax
+
+	testl	$7,%r9d
+	jnz	L$mul_enter
+	leaq	_OPENSSL_ia32cap_P(%rip),%r11
+	movl	8(%r11),%r11d
+	jmp	L$mul4x_enter
+
+.p2align	4
+L$mul_enter:
+	movd	8(%rsp),%xmm5
+	pushq	%rbx
+
+	pushq	%rbp
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+
+	negq	%r9
+	movq	%rsp,%r11
+	leaq	-280(%rsp,%r9,8),%r10
+	negq	%r9
+	andq	$-1024,%r10
+
+
+
+
+
+
+
+
+
+	subq	%r10,%r11
+	andq	$-4096,%r11
+	leaq	(%r10,%r11,1),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	L$mul_page_walk
+	jmp	L$mul_page_walk_done
+
+L$mul_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	L$mul_page_walk
+L$mul_page_walk_done:
+
+	leaq	L$inc(%rip),%r10
+	movq	%rax,8(%rsp,%r9,8)
+
+L$mul_body:
+
+	leaq	128(%rdx),%r12
+	movdqa	0(%r10),%xmm0
+	movdqa	16(%r10),%xmm1
+	leaq	24-112(%rsp,%r9,8),%r10
+	andq	$-16,%r10
+
+	pshufd	$0,%xmm5,%xmm5
+	movdqa	%xmm1,%xmm4
+	movdqa	%xmm1,%xmm2
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+.byte	0x67
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,112(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,128(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,144(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,160(%r10)
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,176(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,192(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,208(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,224(%r10)
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,240(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,256(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,272(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,288(%r10)
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,304(%r10)
+
+	paddd	%xmm2,%xmm3
+.byte	0x67
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,320(%r10)
+
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,336(%r10)
+	pand	64(%r12),%xmm0
+
+	pand	80(%r12),%xmm1
+	pand	96(%r12),%xmm2
+	movdqa	%xmm3,352(%r10)
+	pand	112(%r12),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	-128(%r12),%xmm4
+	movdqa	-112(%r12),%xmm5
+	movdqa	-96(%r12),%xmm2
+	pand	112(%r10),%xmm4
+	movdqa	-80(%r12),%xmm3
+	pand	128(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	144(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	160(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	-64(%r12),%xmm4
+	movdqa	-48(%r12),%xmm5
+	movdqa	-32(%r12),%xmm2
+	pand	176(%r10),%xmm4
+	movdqa	-16(%r12),%xmm3
+	pand	192(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	208(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	224(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	0(%r12),%xmm4
+	movdqa	16(%r12),%xmm5
+	movdqa	32(%r12),%xmm2
+	pand	240(%r10),%xmm4
+	movdqa	48(%r12),%xmm3
+	pand	256(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	272(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	288(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	por	%xmm1,%xmm0
+
+	pshufd	$0x4e,%xmm0,%xmm1
+	por	%xmm1,%xmm0
+	leaq	256(%r12),%r12
+.byte	102,72,15,126,195
+
+	movq	(%r8),%r8
+	movq	(%rsi),%rax
+
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+
+	movq	%r8,%rbp
+	mulq	%rbx
+	movq	%rax,%r10
+	movq	(%rcx),%rax
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+
+	leaq	1(%r15),%r15
+	jmp	L$1st_enter
+
+.p2align	4
+L$1st:
+	addq	%rax,%r13
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	movq	%r10,%r11
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+L$1st_enter:
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	leaq	1(%r15),%r15
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	cmpq	%r9,%r15
+	jne	L$1st
+
+
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r9,8)
+	movq	%rdx,%r13
+	movq	%r10,%r11
+
+	xorq	%rdx,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r9,8)
+	movq	%rdx,(%rsp,%r9,8)
+
+	leaq	1(%r14),%r14
+	jmp	L$outer
+.p2align	4
+L$outer:
+	leaq	24+128(%rsp,%r9,8),%rdx
+	andq	$-16,%rdx
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	movdqa	-128(%r12),%xmm0
+	movdqa	-112(%r12),%xmm1
+	movdqa	-96(%r12),%xmm2
+	movdqa	-80(%r12),%xmm3
+	pand	-128(%rdx),%xmm0
+	pand	-112(%rdx),%xmm1
+	por	%xmm0,%xmm4
+	pand	-96(%rdx),%xmm2
+	por	%xmm1,%xmm5
+	pand	-80(%rdx),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	-64(%r12),%xmm0
+	movdqa	-48(%r12),%xmm1
+	movdqa	-32(%r12),%xmm2
+	movdqa	-16(%r12),%xmm3
+	pand	-64(%rdx),%xmm0
+	pand	-48(%rdx),%xmm1
+	por	%xmm0,%xmm4
+	pand	-32(%rdx),%xmm2
+	por	%xmm1,%xmm5
+	pand	-16(%rdx),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	0(%r12),%xmm0
+	movdqa	16(%r12),%xmm1
+	movdqa	32(%r12),%xmm2
+	movdqa	48(%r12),%xmm3
+	pand	0(%rdx),%xmm0
+	pand	16(%rdx),%xmm1
+	por	%xmm0,%xmm4
+	pand	32(%rdx),%xmm2
+	por	%xmm1,%xmm5
+	pand	48(%rdx),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	64(%r12),%xmm0
+	movdqa	80(%r12),%xmm1
+	movdqa	96(%r12),%xmm2
+	movdqa	112(%r12),%xmm3
+	pand	64(%rdx),%xmm0
+	pand	80(%rdx),%xmm1
+	por	%xmm0,%xmm4
+	pand	96(%rdx),%xmm2
+	por	%xmm1,%xmm5
+	pand	112(%rdx),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	por	%xmm5,%xmm4
+
+	pshufd	$0x4e,%xmm4,%xmm0
+	por	%xmm4,%xmm0
+	leaq	256(%r12),%r12
+
+	movq	(%rsi),%rax
+.byte	102,72,15,126,195
+
+	xorq	%r15,%r15
+	movq	%r8,%rbp
+	movq	(%rsp),%r10
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx),%rax
+	adcq	$0,%rdx
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	8(%rsp),%r10
+	movq	%rdx,%r13
+
+	leaq	1(%r15),%r15
+	jmp	L$inner_enter
+
+.p2align	4
+L$inner:
+	addq	%rax,%r13
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	movq	(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+L$inner_enter:
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	leaq	1(%r15),%r15
+
+	mulq	%rbp
+	cmpq	%r9,%r15
+	jne	L$inner
+
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	movq	(%rsp,%r9,8),%r10
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r9,8)
+	movq	%rdx,%r13
+
+	xorq	%rdx,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r9,8)
+	movq	%rdx,(%rsp,%r9,8)
+
+	leaq	1(%r14),%r14
+	cmpq	%r9,%r14
+	jb	L$outer
+
+	xorq	%r14,%r14
+	movq	(%rsp),%rax
+	leaq	(%rsp),%rsi
+	movq	%r9,%r15
+	jmp	L$sub
+.p2align	4
+L$sub:	sbbq	(%rcx,%r14,8),%rax
+	movq	%rax,(%rdi,%r14,8)
+	movq	8(%rsi,%r14,8),%rax
+	leaq	1(%r14),%r14
+	decq	%r15
+	jnz	L$sub
+
+	sbbq	$0,%rax
+	movq	$-1,%rbx
+	xorq	%rax,%rbx
+	xorq	%r14,%r14
+	movq	%r9,%r15
+
+L$copy:
+	movq	(%rdi,%r14,8),%rcx
+	movq	(%rsp,%r14,8),%rdx
+	andq	%rbx,%rcx
+	andq	%rax,%rdx
+	movq	%r14,(%rsp,%r14,8)
+	orq	%rcx,%rdx
+	movq	%rdx,(%rdi,%r14,8)
+	leaq	1(%r14),%r14
+	subq	$1,%r15
+	jnz	L$copy
+
+	movq	8(%rsp,%r9,8),%rsi
+
+	movq	$1,%rax
+
+	movq	-48(%rsi),%r15
+
+	movq	-40(%rsi),%r14
+
+	movq	-32(%rsi),%r13
+
+	movq	-24(%rsi),%r12
+
+	movq	-16(%rsi),%rbp
+
+	movq	-8(%rsi),%rbx
+
+	leaq	(%rsi),%rsp
+
+L$mul_epilogue:
+	ret
+
+
+
+.p2align	5
+bn_mul4x_mont_gather5:
+
+.byte	0x67
+	movq	%rsp,%rax
+
+L$mul4x_enter:
+	andl	$0x80108,%r11d
+	cmpl	$0x80108,%r11d
+	je	L$mulx4x_enter
+	pushq	%rbx
+
+	pushq	%rbp
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+L$mul4x_prologue:
+
+.byte	0x67
+	shll	$3,%r9d
+	leaq	(%r9,%r9,2),%r10
+	negq	%r9
+
+
+
+
+
+
+
+
+
+
+	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
+	subq	%rdi,%r11
+	andq	$4095,%r11
+	cmpq	%r11,%r10
+	jb	L$mul4xsp_alt
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
+	jmp	L$mul4xsp_done
+
+.p2align	5
+L$mul4xsp_alt:
+	leaq	4096-320(,%r9,2),%r10
+	leaq	-320(%rbp,%r9,2),%rbp
+	subq	%r10,%r11
+	movq	$0,%r10
+	cmovcq	%r10,%r11
+	subq	%r11,%rbp
+L$mul4xsp_done:
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	L$mul4x_page_walk
+	jmp	L$mul4x_page_walk_done
+
+L$mul4x_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	L$mul4x_page_walk
+L$mul4x_page_walk_done:
+
+	negq	%r9
+
+	movq	%rax,40(%rsp)
+
+L$mul4x_body:
+
+	call	mul4x_internal
+
+	movq	40(%rsp),%rsi
+
+	movq	$1,%rax
+
+	movq	-48(%rsi),%r15
+
+	movq	-40(%rsi),%r14
+
+	movq	-32(%rsi),%r13
+
+	movq	-24(%rsi),%r12
+
+	movq	-16(%rsi),%rbp
+
+	movq	-8(%rsi),%rbx
+
+	leaq	(%rsi),%rsp
+
+L$mul4x_epilogue:
+	ret
+
+
+
+
+.p2align	5
+mul4x_internal:
+
+	shlq	$5,%r9
+	movd	8(%rax),%xmm5
+	leaq	L$inc(%rip),%rax
+	leaq	128(%rdx,%r9,1),%r13
+	shrq	$5,%r9
+	movdqa	0(%rax),%xmm0
+	movdqa	16(%rax),%xmm1
+	leaq	88-112(%rsp,%r9,1),%r10
+	leaq	128(%rdx),%r12
+
+	pshufd	$0,%xmm5,%xmm5
+	movdqa	%xmm1,%xmm4
+.byte	0x67,0x67
+	movdqa	%xmm1,%xmm2
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+.byte	0x67
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,112(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,128(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,144(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,160(%r10)
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,176(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,192(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,208(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,224(%r10)
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,240(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,256(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,272(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,288(%r10)
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,304(%r10)
+
+	paddd	%xmm2,%xmm3
+.byte	0x67
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,320(%r10)
+
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,336(%r10)
+	pand	64(%r12),%xmm0
+
+	pand	80(%r12),%xmm1
+	pand	96(%r12),%xmm2
+	movdqa	%xmm3,352(%r10)
+	pand	112(%r12),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	-128(%r12),%xmm4
+	movdqa	-112(%r12),%xmm5
+	movdqa	-96(%r12),%xmm2
+	pand	112(%r10),%xmm4
+	movdqa	-80(%r12),%xmm3
+	pand	128(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	144(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	160(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	-64(%r12),%xmm4
+	movdqa	-48(%r12),%xmm5
+	movdqa	-32(%r12),%xmm2
+	pand	176(%r10),%xmm4
+	movdqa	-16(%r12),%xmm3
+	pand	192(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	208(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	224(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	0(%r12),%xmm4
+	movdqa	16(%r12),%xmm5
+	movdqa	32(%r12),%xmm2
+	pand	240(%r10),%xmm4
+	movdqa	48(%r12),%xmm3
+	pand	256(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	272(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	288(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	por	%xmm1,%xmm0
+
+	pshufd	$0x4e,%xmm0,%xmm1
+	por	%xmm1,%xmm0
+	leaq	256(%r12),%r12
+.byte	102,72,15,126,195
+
+	movq	%r13,16+8(%rsp)
+	movq	%rdi,56+8(%rsp)
+
+	movq	(%r8),%r8
+	movq	(%rsi),%rax
+	leaq	(%rsi,%r9,1),%rsi
+	negq	%r9
+
+	movq	%r8,%rbp
+	mulq	%rbx
+	movq	%rax,%r10
+	movq	(%rcx),%rax
+
+	imulq	%r10,%rbp
+	leaq	64+8(%rsp),%r14
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi,%r9,1),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi,%r9,1),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	32(%r9),%r15
+	leaq	32(%rcx),%rcx
+	adcq	$0,%rdx
+	movq	%rdi,(%r14)
+	movq	%rdx,%r13
+	jmp	L$1st4x
+
+.p2align	5
+L$1st4x:
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx),%rax
+	leaq	32(%r14),%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,1),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%r14)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r15,1),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%r14)
+	movq	%rdx,%r13
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	0(%rcx),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	8(%rsi,%r15,1),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%r14)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi,%r15,1),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	32(%rcx),%rcx
+	adcq	$0,%rdx
+	movq	%rdi,(%r14)
+	movq	%rdx,%r13
+
+	addq	$32,%r15
+	jnz	L$1st4x
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx),%rax
+	leaq	32(%r14),%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%r14)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r9,1),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%r14)
+	movq	%rdx,%r13
+
+	leaq	(%rcx,%r9,1),%rcx
+
+	xorq	%rdi,%rdi
+	addq	%r10,%r13
+	adcq	$0,%rdi
+	movq	%r13,-8(%r14)
+
+	jmp	L$outer4x
+
+.p2align	5
+L$outer4x:
+	leaq	16+128(%r14),%rdx
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	movdqa	-128(%r12),%xmm0
+	movdqa	-112(%r12),%xmm1
+	movdqa	-96(%r12),%xmm2
+	movdqa	-80(%r12),%xmm3
+	pand	-128(%rdx),%xmm0
+	pand	-112(%rdx),%xmm1
+	por	%xmm0,%xmm4
+	pand	-96(%rdx),%xmm2
+	por	%xmm1,%xmm5
+	pand	-80(%rdx),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	-64(%r12),%xmm0
+	movdqa	-48(%r12),%xmm1
+	movdqa	-32(%r12),%xmm2
+	movdqa	-16(%r12),%xmm3
+	pand	-64(%rdx),%xmm0
+	pand	-48(%rdx),%xmm1
+	por	%xmm0,%xmm4
+	pand	-32(%rdx),%xmm2
+	por	%xmm1,%xmm5
+	pand	-16(%rdx),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	0(%r12),%xmm0
+	movdqa	16(%r12),%xmm1
+	movdqa	32(%r12),%xmm2
+	movdqa	48(%r12),%xmm3
+	pand	0(%rdx),%xmm0
+	pand	16(%rdx),%xmm1
+	por	%xmm0,%xmm4
+	pand	32(%rdx),%xmm2
+	por	%xmm1,%xmm5
+	pand	48(%rdx),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	64(%r12),%xmm0
+	movdqa	80(%r12),%xmm1
+	movdqa	96(%r12),%xmm2
+	movdqa	112(%r12),%xmm3
+	pand	64(%rdx),%xmm0
+	pand	80(%rdx),%xmm1
+	por	%xmm0,%xmm4
+	pand	96(%rdx),%xmm2
+	por	%xmm1,%xmm5
+	pand	112(%rdx),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	por	%xmm5,%xmm4
+
+	pshufd	$0x4e,%xmm4,%xmm0
+	por	%xmm4,%xmm0
+	leaq	256(%r12),%r12
+.byte	102,72,15,126,195
+
+	movq	(%r14,%r9,1),%r10
+	movq	%r8,%rbp
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx),%rax
+	adcq	$0,%rdx
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+	movq	%rdi,(%r14)
+
+	leaq	(%r14,%r9,1),%r14
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi,%r9,1),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	addq	8(%r14),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi,%r9,1),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	32(%r9),%r15
+	leaq	32(%rcx),%rcx
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+	jmp	L$inner4x
+
+.p2align	5
+L$inner4x:
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx),%rax
+	adcq	$0,%rdx
+	addq	16(%r14),%r10
+	leaq	32(%r14),%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,1),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%rdi,-32(%r14)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx),%rax
+	adcq	$0,%rdx
+	addq	-8(%r14),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r15,1),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%r13,-24(%r14)
+	movq	%rdx,%r13
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	0(%rcx),%rax
+	adcq	$0,%rdx
+	addq	(%r14),%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	8(%rsi,%r15,1),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%rdi,-16(%r14)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	addq	8(%r14),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi,%r15,1),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	32(%rcx),%rcx
+	adcq	$0,%rdx
+	movq	%r13,-8(%r14)
+	movq	%rdx,%r13
+
+	addq	$32,%r15
+	jnz	L$inner4x
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx),%rax
+	adcq	$0,%rdx
+	addq	16(%r14),%r10
+	leaq	32(%r14),%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%rdi,-32(%r14)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	movq	-8(%rcx),%rbp
+	adcq	$0,%rdx
+	addq	-8(%r14),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r9,1),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%r13,-24(%r14)
+	movq	%rdx,%r13
+
+	movq	%rdi,-16(%r14)
+	leaq	(%rcx,%r9,1),%rcx
+
+	xorq	%rdi,%rdi
+	addq	%r10,%r13
+	adcq	$0,%rdi
+	addq	(%r14),%r13
+	adcq	$0,%rdi
+	movq	%r13,-8(%r14)
+
+	cmpq	16+8(%rsp),%r12
+	jb	L$outer4x
+	xorq	%rax,%rax
+	subq	%r13,%rbp
+	adcq	%r15,%r15
+	orq	%r15,%rdi
+	subq	%rdi,%rax
+	leaq	(%r14,%r9,1),%rbx
+	movq	(%rcx),%r12
+	leaq	(%rcx),%rbp
+	movq	%r9,%rcx
+	sarq	$3+2,%rcx
+	movq	56+8(%rsp),%rdi
+	decq	%r12
+	xorq	%r10,%r10
+	movq	8(%rbp),%r13
+	movq	16(%rbp),%r14
+	movq	24(%rbp),%r15
+	jmp	L$sqr4x_sub_entry
+
+
+.globl	_bn_power5
+.private_extern _bn_power5
+
+.p2align	5
+_bn_power5:
+
+_CET_ENDBR
+	movq	%rsp,%rax
+
+	leaq	_OPENSSL_ia32cap_P(%rip),%r11
+	movl	8(%r11),%r11d
+	andl	$0x80108,%r11d
+	cmpl	$0x80108,%r11d
+	je	L$powerx5_enter
+	pushq	%rbx
+
+	pushq	%rbp
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+L$power5_prologue:
+
+	shll	$3,%r9d
+	leal	(%r9,%r9,2),%r10d
+	negq	%r9
+	movq	(%r8),%r8
+
+
+
+
+
+
+
+
+	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
+	subq	%rdi,%r11
+	andq	$4095,%r11
+	cmpq	%r11,%r10
+	jb	L$pwr_sp_alt
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
+	jmp	L$pwr_sp_done
+
+.p2align	5
+L$pwr_sp_alt:
+	leaq	4096-320(,%r9,2),%r10
+	leaq	-320(%rbp,%r9,2),%rbp
+	subq	%r10,%r11
+	movq	$0,%r10
+	cmovcq	%r10,%r11
+	subq	%r11,%rbp
+L$pwr_sp_done:
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	L$pwr_page_walk
+	jmp	L$pwr_page_walk_done
+
+L$pwr_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	L$pwr_page_walk
+L$pwr_page_walk_done:
+
+	movq	%r9,%r10
+	negq	%r9
+
+
+
+
+
+
+
+
+
+
+	movq	%r8,32(%rsp)
+	movq	%rax,40(%rsp)
+
+L$power5_body:
+.byte	102,72,15,110,207
+.byte	102,72,15,110,209
+.byte	102,73,15,110,218
+.byte	102,72,15,110,226
+
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+
+.byte	102,72,15,126,209
+.byte	102,72,15,126,226
+	movq	%rsi,%rdi
+	movq	40(%rsp),%rax
+	leaq	32(%rsp),%r8
+
+	call	mul4x_internal
+
+	movq	40(%rsp),%rsi
+
+	movq	$1,%rax
+	movq	-48(%rsi),%r15
+
+	movq	-40(%rsi),%r14
+
+	movq	-32(%rsi),%r13
+
+	movq	-24(%rsi),%r12
+
+	movq	-16(%rsi),%rbp
+
+	movq	-8(%rsi),%rbx
+
+	leaq	(%rsi),%rsp
+
+L$power5_epilogue:
+	ret
+
+
+
+.globl	_bn_sqr8x_internal
+.private_extern _bn_sqr8x_internal
+.private_extern	_bn_sqr8x_internal
+
+.p2align	5
+_bn_sqr8x_internal:
+__bn_sqr8x_internal:
+
+_CET_ENDBR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	leaq	32(%r10),%rbp
+	leaq	(%rsi,%r9,1),%rsi
+
+	movq	%r9,%rcx
+
+
+	movq	-32(%rsi,%rbp,1),%r14
+	leaq	48+8(%rsp,%r9,2),%rdi
+	movq	-24(%rsi,%rbp,1),%rax
+	leaq	-32(%rdi,%rbp,1),%rdi
+	movq	-16(%rsi,%rbp,1),%rbx
+	movq	%rax,%r15
+
+	mulq	%r14
+	movq	%rax,%r10
+	movq	%rbx,%rax
+	movq	%rdx,%r11
+	movq	%r10,-24(%rdi,%rbp,1)
+
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+	movq	%r11,-16(%rdi,%rbp,1)
+	movq	%rdx,%r10
+
+
+	movq	-8(%rsi,%rbp,1),%rbx
+	mulq	%r15
+	movq	%rax,%r12
+	movq	%rbx,%rax
+	movq	%rdx,%r13
+
+	leaq	(%rbp),%rcx
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	addq	%r12,%r10
+	adcq	$0,%r11
+	movq	%r10,-8(%rdi,%rcx,1)
+	jmp	L$sqr4x_1st
+
+.p2align	5
+L$sqr4x_1st:
+	movq	(%rsi,%rcx,1),%rbx
+	mulq	%r15
+	addq	%rax,%r13
+	movq	%rbx,%rax
+	movq	%rdx,%r12
+	adcq	$0,%r12
+
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	movq	8(%rsi,%rcx,1),%rbx
+	movq	%rdx,%r10
+	adcq	$0,%r10
+	addq	%r13,%r11
+	adcq	$0,%r10
+
+
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	movq	%r11,(%rdi,%rcx,1)
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	movq	16(%rsi,%rcx,1),%rbx
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	addq	%r12,%r10
+	adcq	$0,%r11
+
+	mulq	%r15
+	addq	%rax,%r13
+	movq	%rbx,%rax
+	movq	%r10,8(%rdi,%rcx,1)
+	movq	%rdx,%r12
+	adcq	$0,%r12
+
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	movq	24(%rsi,%rcx,1),%rbx
+	movq	%rdx,%r10
+	adcq	$0,%r10
+	addq	%r13,%r11
+	adcq	$0,%r10
+
+
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	movq	%r11,16(%rdi,%rcx,1)
+	movq	%rdx,%r13
+	adcq	$0,%r13
+	leaq	32(%rcx),%rcx
+
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	addq	%r12,%r10
+	adcq	$0,%r11
+	movq	%r10,-8(%rdi,%rcx,1)
+
+	cmpq	$0,%rcx
+	jne	L$sqr4x_1st
+
+	mulq	%r15
+	addq	%rax,%r13
+	leaq	16(%rbp),%rbp
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+
+	movq	%r13,(%rdi)
+	movq	%rdx,%r12
+	movq	%rdx,8(%rdi)
+	jmp	L$sqr4x_outer
+
+.p2align	5
+L$sqr4x_outer:
+	movq	-32(%rsi,%rbp,1),%r14
+	leaq	48+8(%rsp,%r9,2),%rdi
+	movq	-24(%rsi,%rbp,1),%rax
+	leaq	-32(%rdi,%rbp,1),%rdi
+	movq	-16(%rsi,%rbp,1),%rbx
+	movq	%rax,%r15
+
+	mulq	%r14
+	movq	-24(%rdi,%rbp,1),%r10
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+	movq	%r10,-24(%rdi,%rbp,1)
+	movq	%rdx,%r11
+
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+	addq	-16(%rdi,%rbp,1),%r11
+	movq	%rdx,%r10
+	adcq	$0,%r10
+	movq	%r11,-16(%rdi,%rbp,1)
+
+	xorq	%r12,%r12
+
+	movq	-8(%rsi,%rbp,1),%rbx
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+	addq	-8(%rdi,%rbp,1),%r12
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+	addq	%r12,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	movq	%r10,-8(%rdi,%rbp,1)
+
+	leaq	(%rbp),%rcx
+	jmp	L$sqr4x_inner
+
+.p2align	5
+L$sqr4x_inner:
+	movq	(%rsi,%rcx,1),%rbx
+	mulq	%r15
+	addq	%rax,%r13
+	movq	%rbx,%rax
+	movq	%rdx,%r12
+	adcq	$0,%r12
+	addq	(%rdi,%rcx,1),%r13
+	adcq	$0,%r12
+
+.byte	0x67
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	movq	8(%rsi,%rcx,1),%rbx
+	movq	%rdx,%r10
+	adcq	$0,%r10
+	addq	%r13,%r11
+	adcq	$0,%r10
+
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%r11,(%rdi,%rcx,1)
+	movq	%rbx,%rax
+	movq	%rdx,%r13
+	adcq	$0,%r13
+	addq	8(%rdi,%rcx,1),%r12
+	leaq	16(%rcx),%rcx
+	adcq	$0,%r13
+
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+	addq	%r12,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	movq	%r10,-8(%rdi,%rcx,1)
+
+	cmpq	$0,%rcx
+	jne	L$sqr4x_inner
+
+.byte	0x67
+	mulq	%r15
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+
+	movq	%r13,(%rdi)
+	movq	%rdx,%r12
+	movq	%rdx,8(%rdi)
+
+	addq	$16,%rbp
+	jnz	L$sqr4x_outer
+
+
+	movq	-32(%rsi),%r14
+	leaq	48+8(%rsp,%r9,2),%rdi
+	movq	-24(%rsi),%rax
+	leaq	-32(%rdi,%rbp,1),%rdi
+	movq	-16(%rsi),%rbx
+	movq	%rax,%r15
+
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	movq	%rdx,%r11
+	adcq	$0,%r11
+
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	movq	%r10,-24(%rdi)
+	movq	%rdx,%r10
+	adcq	$0,%r10
+	addq	%r13,%r11
+	movq	-8(%rsi),%rbx
+	adcq	$0,%r10
+
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	movq	%r11,-16(%rdi)
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	addq	%r12,%r10
+	adcq	$0,%r11
+	movq	%r10,-8(%rdi)
+
+	mulq	%r15
+	addq	%rax,%r13
+	movq	-16(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+
+	movq	%r13,(%rdi)
+	movq	%rdx,%r12
+	movq	%rdx,8(%rdi)
+
+	mulq	%rbx
+	addq	$16,%rbp
+	xorq	%r14,%r14
+	subq	%r9,%rbp
+	xorq	%r15,%r15
+
+	addq	%r12,%rax
+	adcq	$0,%rdx
+	movq	%rax,8(%rdi)
+	movq	%rdx,16(%rdi)
+	movq	%r15,24(%rdi)
+
+	movq	-16(%rsi,%rbp,1),%rax
+	leaq	48+8(%rsp),%rdi
+	xorq	%r10,%r10
+	movq	8(%rdi),%r11
+
+	leaq	(%r14,%r10,2),%r12
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r13
+	shrq	$63,%r11
+	orq	%r10,%r13
+	movq	16(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	24(%rdi),%r11
+	adcq	%rax,%r12
+	movq	-8(%rsi,%rbp,1),%rax
+	movq	%r12,(%rdi)
+	adcq	%rdx,%r13
+
+	leaq	(%r14,%r10,2),%rbx
+	movq	%r13,8(%rdi)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r8
+	shrq	$63,%r11
+	orq	%r10,%r8
+	movq	32(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	40(%rdi),%r11
+	adcq	%rax,%rbx
+	movq	0(%rsi,%rbp,1),%rax
+	movq	%rbx,16(%rdi)
+	adcq	%rdx,%r8
+	leaq	16(%rbp),%rbp
+	movq	%r8,24(%rdi)
+	sbbq	%r15,%r15
+	leaq	64(%rdi),%rdi
+	jmp	L$sqr4x_shift_n_add
+
+.p2align	5
+L$sqr4x_shift_n_add:
+	leaq	(%r14,%r10,2),%r12
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r13
+	shrq	$63,%r11
+	orq	%r10,%r13
+	movq	-16(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	-8(%rdi),%r11
+	adcq	%rax,%r12
+	movq	-8(%rsi,%rbp,1),%rax
+	movq	%r12,-32(%rdi)
+	adcq	%rdx,%r13
+
+	leaq	(%r14,%r10,2),%rbx
+	movq	%r13,-24(%rdi)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r8
+	shrq	$63,%r11
+	orq	%r10,%r8
+	movq	0(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	8(%rdi),%r11
+	adcq	%rax,%rbx
+	movq	0(%rsi,%rbp,1),%rax
+	movq	%rbx,-16(%rdi)
+	adcq	%rdx,%r8
+
+	leaq	(%r14,%r10,2),%r12
+	movq	%r8,-8(%rdi)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r13
+	shrq	$63,%r11
+	orq	%r10,%r13
+	movq	16(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	24(%rdi),%r11
+	adcq	%rax,%r12
+	movq	8(%rsi,%rbp,1),%rax
+	movq	%r12,0(%rdi)
+	adcq	%rdx,%r13
+
+	leaq	(%r14,%r10,2),%rbx
+	movq	%r13,8(%rdi)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r8
+	shrq	$63,%r11
+	orq	%r10,%r8
+	movq	32(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	40(%rdi),%r11
+	adcq	%rax,%rbx
+	movq	16(%rsi,%rbp,1),%rax
+	movq	%rbx,16(%rdi)
+	adcq	%rdx,%r8
+	movq	%r8,24(%rdi)
+	sbbq	%r15,%r15
+	leaq	64(%rdi),%rdi
+	addq	$32,%rbp
+	jnz	L$sqr4x_shift_n_add
+
+	leaq	(%r14,%r10,2),%r12
+.byte	0x67
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r13
+	shrq	$63,%r11
+	orq	%r10,%r13
+	movq	-16(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	-8(%rdi),%r11
+	adcq	%rax,%r12
+	movq	-8(%rsi),%rax
+	movq	%r12,-32(%rdi)
+	adcq	%rdx,%r13
+
+	leaq	(%r14,%r10,2),%rbx
+	movq	%r13,-24(%rdi)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r8
+	shrq	$63,%r11
+	orq	%r10,%r8
+	mulq	%rax
+	negq	%r15
+	adcq	%rax,%rbx
+	adcq	%rdx,%r8
+	movq	%rbx,-16(%rdi)
+	movq	%r8,-8(%rdi)
+.byte	102,72,15,126,213
+__bn_sqr8x_reduction:
+	xorq	%rax,%rax
+	leaq	(%r9,%rbp,1),%rcx
+	leaq	48+8(%rsp,%r9,2),%rdx
+	movq	%rcx,0+8(%rsp)
+	leaq	48+8(%rsp,%r9,1),%rdi
+	movq	%rdx,8+8(%rsp)
+	negq	%r9
+	jmp	L$8x_reduction_loop
+
+.p2align	5
+L$8x_reduction_loop:
+	leaq	(%rdi,%r9,1),%rdi
+.byte	0x66
+	movq	0(%rdi),%rbx
+	movq	8(%rdi),%r9
+	movq	16(%rdi),%r10
+	movq	24(%rdi),%r11
+	movq	32(%rdi),%r12
+	movq	40(%rdi),%r13
+	movq	48(%rdi),%r14
+	movq	56(%rdi),%r15
+	movq	%rax,(%rdx)
+	leaq	64(%rdi),%rdi
+
+.byte	0x67
+	movq	%rbx,%r8
+	imulq	32+8(%rsp),%rbx
+	movq	0(%rbp),%rax
+	movl	$8,%ecx
+	jmp	L$8x_reduce
+
+.p2align	5
+L$8x_reduce:
+	mulq	%rbx
+	movq	8(%rbp),%rax
+	negq	%r8
+	movq	%rdx,%r8
+	adcq	$0,%r8
+
+	mulq	%rbx
+	addq	%rax,%r9
+	movq	16(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r9,%r8
+	movq	%rbx,48-8+8(%rsp,%rcx,8)
+	movq	%rdx,%r9
+	adcq	$0,%r9
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	24(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r9
+	movq	32+8(%rsp),%rsi
+	movq	%rdx,%r10
+	adcq	$0,%r10
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	32(%rbp),%rax
+	adcq	$0,%rdx
+	imulq	%r8,%rsi
+	addq	%r11,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+
+	mulq	%rbx
+	addq	%rax,%r12
+	movq	40(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r12,%r11
+	movq	%rdx,%r12
+	adcq	$0,%r12
+
+	mulq	%rbx
+	addq	%rax,%r13
+	movq	48(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r13,%r12
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+	mulq	%rbx
+	addq	%rax,%r14
+	movq	56(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r14,%r13
+	movq	%rdx,%r14
+	adcq	$0,%r14
+
+	mulq	%rbx
+	movq	%rsi,%rbx
+	addq	%rax,%r15
+	movq	0(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r15,%r14
+	movq	%rdx,%r15
+	adcq	$0,%r15
+
+	decl	%ecx
+	jnz	L$8x_reduce
+
+	leaq	64(%rbp),%rbp
+	xorq	%rax,%rax
+	movq	8+8(%rsp),%rdx
+	cmpq	0+8(%rsp),%rbp
+	jae	L$8x_no_tail
+
+.byte	0x66
+	addq	0(%rdi),%r8
+	adcq	8(%rdi),%r9
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	sbbq	%rsi,%rsi
+
+	movq	48+56+8(%rsp),%rbx
+	movl	$8,%ecx
+	movq	0(%rbp),%rax
+	jmp	L$8x_tail
+
+.p2align	5
+L$8x_tail:
+	mulq	%rbx
+	addq	%rax,%r8
+	movq	8(%rbp),%rax
+	movq	%r8,(%rdi)
+	movq	%rdx,%r8
+	adcq	$0,%r8
+
+	mulq	%rbx
+	addq	%rax,%r9
+	movq	16(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r9,%r8
+	leaq	8(%rdi),%rdi
+	movq	%rdx,%r9
+	adcq	$0,%r9
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	24(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r9
+	movq	%rdx,%r10
+	adcq	$0,%r10
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	32(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+
+	mulq	%rbx
+	addq	%rax,%r12
+	movq	40(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r12,%r11
+	movq	%rdx,%r12
+	adcq	$0,%r12
+
+	mulq	%rbx
+	addq	%rax,%r13
+	movq	48(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r13,%r12
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+	mulq	%rbx
+	addq	%rax,%r14
+	movq	56(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r14,%r13
+	movq	%rdx,%r14
+	adcq	$0,%r14
+
+	mulq	%rbx
+	movq	48-16+8(%rsp,%rcx,8),%rbx
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	addq	%r15,%r14
+	movq	0(%rbp),%rax
+	movq	%rdx,%r15
+	adcq	$0,%r15
+
+	decl	%ecx
+	jnz	L$8x_tail
+
+	leaq	64(%rbp),%rbp
+	movq	8+8(%rsp),%rdx
+	cmpq	0+8(%rsp),%rbp
+	jae	L$8x_tail_done
+
+	movq	48+56+8(%rsp),%rbx
+	negq	%rsi
+	movq	0(%rbp),%rax
+	adcq	0(%rdi),%r8
+	adcq	8(%rdi),%r9
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	sbbq	%rsi,%rsi
+
+	movl	$8,%ecx
+	jmp	L$8x_tail
+
+.p2align	5
+L$8x_tail_done:
+	xorq	%rax,%rax
+	addq	(%rdx),%r8
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%r13
+	adcq	$0,%r14
+	adcq	$0,%r15
+	adcq	$0,%rax
+
+	negq	%rsi
+L$8x_no_tail:
+	adcq	0(%rdi),%r8
+	adcq	8(%rdi),%r9
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	adcq	$0,%rax
+	movq	-8(%rbp),%rcx
+	xorq	%rsi,%rsi
+
+.byte	102,72,15,126,213
+
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+.byte	102,73,15,126,217
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+	movq	%r14,48(%rdi)
+	movq	%r15,56(%rdi)
+	leaq	64(%rdi),%rdi
+
+	cmpq	%rdx,%rdi
+	jb	L$8x_reduction_loop
+	ret
+
+
+
+.p2align	5
+__bn_post4x_internal:
+
+	movq	0(%rbp),%r12
+	leaq	(%rdi,%r9,1),%rbx
+	movq	%r9,%rcx
+.byte	102,72,15,126,207
+	negq	%rax
+.byte	102,72,15,126,206
+	sarq	$3+2,%rcx
+	decq	%r12
+	xorq	%r10,%r10
+	movq	8(%rbp),%r13
+	movq	16(%rbp),%r14
+	movq	24(%rbp),%r15
+	jmp	L$sqr4x_sub_entry
+
+.p2align	4
+L$sqr4x_sub:
+	movq	0(%rbp),%r12
+	movq	8(%rbp),%r13
+	movq	16(%rbp),%r14
+	movq	24(%rbp),%r15
+L$sqr4x_sub_entry:
+	leaq	32(%rbp),%rbp
+	notq	%r12
+	notq	%r13
+	notq	%r14
+	notq	%r15
+	andq	%rax,%r12
+	andq	%rax,%r13
+	andq	%rax,%r14
+	andq	%rax,%r15
+
+	negq	%r10
+	adcq	0(%rbx),%r12
+	adcq	8(%rbx),%r13
+	adcq	16(%rbx),%r14
+	adcq	24(%rbx),%r15
+	movq	%r12,0(%rdi)
+	leaq	32(%rbx),%rbx
+	movq	%r13,8(%rdi)
+	sbbq	%r10,%r10
+	movq	%r14,16(%rdi)
+	movq	%r15,24(%rdi)
+	leaq	32(%rdi),%rdi
+
+	incq	%rcx
+	jnz	L$sqr4x_sub
+
+	movq	%r9,%r10
+	negq	%r9
+	ret
+
+
+
+.p2align	5
+bn_mulx4x_mont_gather5:
+
+	movq	%rsp,%rax
+
+L$mulx4x_enter:
+	pushq	%rbx
+
+	pushq	%rbp
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+L$mulx4x_prologue:
+
+	shll	$3,%r9d
+	leaq	(%r9,%r9,2),%r10
+	negq	%r9
+	movq	(%r8),%r8
+
+
+
+
+
+
+
+
+
+
+	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
+	subq	%rdi,%r11
+	andq	$4095,%r11
+	cmpq	%r11,%r10
+	jb	L$mulx4xsp_alt
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
+	jmp	L$mulx4xsp_done
+
+L$mulx4xsp_alt:
+	leaq	4096-320(,%r9,2),%r10
+	leaq	-320(%rbp,%r9,2),%rbp
+	subq	%r10,%r11
+	movq	$0,%r10
+	cmovcq	%r10,%r11
+	subq	%r11,%rbp
+L$mulx4xsp_done:
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	L$mulx4x_page_walk
+	jmp	L$mulx4x_page_walk_done
+
+L$mulx4x_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	L$mulx4x_page_walk
+L$mulx4x_page_walk_done:
+
+
+
+
+
+
+
+
+
+
+
+
+
+	movq	%r8,32(%rsp)
+	movq	%rax,40(%rsp)
+
+L$mulx4x_body:
+	call	mulx4x_internal
+
+	movq	40(%rsp),%rsi
+
+	movq	$1,%rax
+
+	movq	-48(%rsi),%r15
+
+	movq	-40(%rsi),%r14
+
+	movq	-32(%rsi),%r13
+
+	movq	-24(%rsi),%r12
+
+	movq	-16(%rsi),%rbp
+
+	movq	-8(%rsi),%rbx
+
+	leaq	(%rsi),%rsp
+
+L$mulx4x_epilogue:
+	ret
+
+
+
+
+.p2align	5
+mulx4x_internal:
+
+	movq	%r9,8(%rsp)
+	movq	%r9,%r10
+	negq	%r9
+	shlq	$5,%r9
+	negq	%r10
+	leaq	128(%rdx,%r9,1),%r13
+	shrq	$5+5,%r9
+	movd	8(%rax),%xmm5
+	subq	$1,%r9
+	leaq	L$inc(%rip),%rax
+	movq	%r13,16+8(%rsp)
+	movq	%r9,24+8(%rsp)
+	movq	%rdi,56+8(%rsp)
+	movdqa	0(%rax),%xmm0
+	movdqa	16(%rax),%xmm1
+	leaq	88-112(%rsp,%r10,1),%r10
+	leaq	128(%rdx),%rdi
+
+	pshufd	$0,%xmm5,%xmm5
+	movdqa	%xmm1,%xmm4
+.byte	0x67
+	movdqa	%xmm1,%xmm2
+.byte	0x67
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,112(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,128(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,144(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,160(%r10)
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,176(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,192(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,208(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,224(%r10)
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,240(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,256(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,272(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,288(%r10)
+	movdqa	%xmm4,%xmm3
+.byte	0x67
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,304(%r10)
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,320(%r10)
+
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,336(%r10)
+
+	pand	64(%rdi),%xmm0
+	pand	80(%rdi),%xmm1
+	pand	96(%rdi),%xmm2
+	movdqa	%xmm3,352(%r10)
+	pand	112(%rdi),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	-128(%rdi),%xmm4
+	movdqa	-112(%rdi),%xmm5
+	movdqa	-96(%rdi),%xmm2
+	pand	112(%r10),%xmm4
+	movdqa	-80(%rdi),%xmm3
+	pand	128(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	144(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	160(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	-64(%rdi),%xmm4
+	movdqa	-48(%rdi),%xmm5
+	movdqa	-32(%rdi),%xmm2
+	pand	176(%r10),%xmm4
+	movdqa	-16(%rdi),%xmm3
+	pand	192(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	208(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	224(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	0(%rdi),%xmm4
+	movdqa	16(%rdi),%xmm5
+	movdqa	32(%rdi),%xmm2
+	pand	240(%r10),%xmm4
+	movdqa	48(%rdi),%xmm3
+	pand	256(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	272(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	288(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	pxor	%xmm1,%xmm0
+
+	pshufd	$0x4e,%xmm0,%xmm1
+	por	%xmm1,%xmm0
+	leaq	256(%rdi),%rdi
+.byte	102,72,15,126,194
+	leaq	64+32+8(%rsp),%rbx
+
+	movq	%rdx,%r9
+	mulxq	0(%rsi),%r8,%rax
+	mulxq	8(%rsi),%r11,%r12
+	addq	%rax,%r11
+	mulxq	16(%rsi),%rax,%r13
+	adcq	%rax,%r12
+	adcq	$0,%r13
+	mulxq	24(%rsi),%rax,%r14
+
+	movq	%r8,%r15
+	imulq	32+8(%rsp),%r8
+	xorq	%rbp,%rbp
+	movq	%r8,%rdx
+
+	movq	%rdi,8+8(%rsp)
+
+	leaq	32(%rsi),%rsi
+	adcxq	%rax,%r13
+	adcxq	%rbp,%r14
+
+	mulxq	0(%rcx),%rax,%r10
+	adcxq	%rax,%r15
+	adoxq	%r11,%r10
+	mulxq	8(%rcx),%rax,%r11
+	adcxq	%rax,%r10
+	adoxq	%r12,%r11
+	mulxq	16(%rcx),%rax,%r12
+	movq	24+8(%rsp),%rdi
+	movq	%r10,-32(%rbx)
+	adcxq	%rax,%r11
+	adoxq	%r13,%r12
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	%r11,-24(%rbx)
+	adcxq	%rax,%r12
+	adoxq	%rbp,%r15
+	leaq	32(%rcx),%rcx
+	movq	%r12,-16(%rbx)
+	jmp	L$mulx4x_1st
+
+.p2align	5
+L$mulx4x_1st:
+	adcxq	%rbp,%r15
+	mulxq	0(%rsi),%r10,%rax
+	adcxq	%r14,%r10
+	mulxq	8(%rsi),%r11,%r14
+	adcxq	%rax,%r11
+	mulxq	16(%rsi),%r12,%rax
+	adcxq	%r14,%r12
+	mulxq	24(%rsi),%r13,%r14
+.byte	0x67,0x67
+	movq	%r8,%rdx
+	adcxq	%rax,%r13
+	adcxq	%rbp,%r14
+	leaq	32(%rsi),%rsi
+	leaq	32(%rbx),%rbx
+
+	adoxq	%r15,%r10
+	mulxq	0(%rcx),%rax,%r15
+	adcxq	%rax,%r10
+	adoxq	%r15,%r11
+	mulxq	8(%rcx),%rax,%r15
+	adcxq	%rax,%r11
+	adoxq	%r15,%r12
+	mulxq	16(%rcx),%rax,%r15
+	movq	%r10,-40(%rbx)
+	adcxq	%rax,%r12
+	movq	%r11,-32(%rbx)
+	adoxq	%r15,%r13
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	%r12,-24(%rbx)
+	adcxq	%rax,%r13
+	adoxq	%rbp,%r15
+	leaq	32(%rcx),%rcx
+	movq	%r13,-16(%rbx)
+
+	decq	%rdi
+	jnz	L$mulx4x_1st
+
+	movq	8(%rsp),%rax
+	adcq	%rbp,%r15
+	leaq	(%rsi,%rax,1),%rsi
+	addq	%r15,%r14
+	movq	8+8(%rsp),%rdi
+	adcq	%rbp,%rbp
+	movq	%r14,-8(%rbx)
+	jmp	L$mulx4x_outer
+
+.p2align	5
+L$mulx4x_outer:
+	leaq	16-256(%rbx),%r10
+	pxor	%xmm4,%xmm4
+.byte	0x67,0x67
+	pxor	%xmm5,%xmm5
+	movdqa	-128(%rdi),%xmm0
+	movdqa	-112(%rdi),%xmm1
+	movdqa	-96(%rdi),%xmm2
+	pand	256(%r10),%xmm0
+	movdqa	-80(%rdi),%xmm3
+	pand	272(%r10),%xmm1
+	por	%xmm0,%xmm4
+	pand	288(%r10),%xmm2
+	por	%xmm1,%xmm5
+	pand	304(%r10),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	-64(%rdi),%xmm0
+	movdqa	-48(%rdi),%xmm1
+	movdqa	-32(%rdi),%xmm2
+	pand	320(%r10),%xmm0
+	movdqa	-16(%rdi),%xmm3
+	pand	336(%r10),%xmm1
+	por	%xmm0,%xmm4
+	pand	352(%r10),%xmm2
+	por	%xmm1,%xmm5
+	pand	368(%r10),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	0(%rdi),%xmm0
+	movdqa	16(%rdi),%xmm1
+	movdqa	32(%rdi),%xmm2
+	pand	384(%r10),%xmm0
+	movdqa	48(%rdi),%xmm3
+	pand	400(%r10),%xmm1
+	por	%xmm0,%xmm4
+	pand	416(%r10),%xmm2
+	por	%xmm1,%xmm5
+	pand	432(%r10),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	64(%rdi),%xmm0
+	movdqa	80(%rdi),%xmm1
+	movdqa	96(%rdi),%xmm2
+	pand	448(%r10),%xmm0
+	movdqa	112(%rdi),%xmm3
+	pand	464(%r10),%xmm1
+	por	%xmm0,%xmm4
+	pand	480(%r10),%xmm2
+	por	%xmm1,%xmm5
+	pand	496(%r10),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	por	%xmm5,%xmm4
+
+	pshufd	$0x4e,%xmm4,%xmm0
+	por	%xmm4,%xmm0
+	leaq	256(%rdi),%rdi
+.byte	102,72,15,126,194
+
+	movq	%rbp,(%rbx)
+	leaq	32(%rbx,%rax,1),%rbx
+	mulxq	0(%rsi),%r8,%r11
+	xorq	%rbp,%rbp
+	movq	%rdx,%r9
+	mulxq	8(%rsi),%r14,%r12
+	adoxq	-32(%rbx),%r8
+	adcxq	%r14,%r11
+	mulxq	16(%rsi),%r15,%r13
+	adoxq	-24(%rbx),%r11
+	adcxq	%r15,%r12
+	mulxq	24(%rsi),%rdx,%r14
+	adoxq	-16(%rbx),%r12
+	adcxq	%rdx,%r13
+	leaq	(%rcx,%rax,1),%rcx
+	leaq	32(%rsi),%rsi
+	adoxq	-8(%rbx),%r13
+	adcxq	%rbp,%r14
+	adoxq	%rbp,%r14
+
+	movq	%r8,%r15
+	imulq	32+8(%rsp),%r8
+
+	movq	%r8,%rdx
+	xorq	%rbp,%rbp
+	movq	%rdi,8+8(%rsp)
+
+	mulxq	0(%rcx),%rax,%r10
+	adcxq	%rax,%r15
+	adoxq	%r11,%r10
+	mulxq	8(%rcx),%rax,%r11
+	adcxq	%rax,%r10
+	adoxq	%r12,%r11
+	mulxq	16(%rcx),%rax,%r12
+	adcxq	%rax,%r11
+	adoxq	%r13,%r12
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	24+8(%rsp),%rdi
+	movq	%r10,-32(%rbx)
+	adcxq	%rax,%r12
+	movq	%r11,-24(%rbx)
+	adoxq	%rbp,%r15
+	movq	%r12,-16(%rbx)
+	leaq	32(%rcx),%rcx
+	jmp	L$mulx4x_inner
+
+.p2align	5
+L$mulx4x_inner:
+	mulxq	0(%rsi),%r10,%rax
+	adcxq	%rbp,%r15
+	adoxq	%r14,%r10
+	mulxq	8(%rsi),%r11,%r14
+	adcxq	0(%rbx),%r10
+	adoxq	%rax,%r11
+	mulxq	16(%rsi),%r12,%rax
+	adcxq	8(%rbx),%r11
+	adoxq	%r14,%r12
+	mulxq	24(%rsi),%r13,%r14
+	movq	%r8,%rdx
+	adcxq	16(%rbx),%r12
+	adoxq	%rax,%r13
+	adcxq	24(%rbx),%r13
+	adoxq	%rbp,%r14
+	leaq	32(%rsi),%rsi
+	leaq	32(%rbx),%rbx
+	adcxq	%rbp,%r14
+
+	adoxq	%r15,%r10
+	mulxq	0(%rcx),%rax,%r15
+	adcxq	%rax,%r10
+	adoxq	%r15,%r11
+	mulxq	8(%rcx),%rax,%r15
+	adcxq	%rax,%r11
+	adoxq	%r15,%r12
+	mulxq	16(%rcx),%rax,%r15
+	movq	%r10,-40(%rbx)
+	adcxq	%rax,%r12
+	adoxq	%r15,%r13
+	movq	%r11,-32(%rbx)
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	leaq	32(%rcx),%rcx
+	movq	%r12,-24(%rbx)
+	adcxq	%rax,%r13
+	adoxq	%rbp,%r15
+	movq	%r13,-16(%rbx)
+
+	decq	%rdi
+	jnz	L$mulx4x_inner
+
+	movq	0+8(%rsp),%rax
+	adcq	%rbp,%r15
+	subq	0(%rbx),%rdi
+	movq	8+8(%rsp),%rdi
+	movq	16+8(%rsp),%r10
+	adcq	%r15,%r14
+	leaq	(%rsi,%rax,1),%rsi
+	adcq	%rbp,%rbp
+	movq	%r14,-8(%rbx)
+
+	cmpq	%r10,%rdi
+	jb	L$mulx4x_outer
+
+	movq	-8(%rcx),%r10
+	movq	%rbp,%r8
+	movq	(%rcx,%rax,1),%r12
+	leaq	(%rcx,%rax,1),%rbp
+	movq	%rax,%rcx
+	leaq	(%rbx,%rax,1),%rdi
+	xorl	%eax,%eax
+	xorq	%r15,%r15
+	subq	%r14,%r10
+	adcq	%r15,%r15
+	orq	%r15,%r8
+	sarq	$3+2,%rcx
+	subq	%r8,%rax
+	movq	56+8(%rsp),%rdx
+	decq	%r12
+	movq	8(%rbp),%r13
+	xorq	%r8,%r8
+	movq	16(%rbp),%r14
+	movq	24(%rbp),%r15
+	jmp	L$sqrx4x_sub_entry
+
+
+
+.p2align	5
+bn_powerx5:
+
+	movq	%rsp,%rax
+
+L$powerx5_enter:
+	pushq	%rbx
+
+	pushq	%rbp
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+L$powerx5_prologue:
+
+	shll	$3,%r9d
+	leaq	(%r9,%r9,2),%r10
+	negq	%r9
+	movq	(%r8),%r8
+
+
+
+
+
+
+
+
+	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
+	subq	%rdi,%r11
+	andq	$4095,%r11
+	cmpq	%r11,%r10
+	jb	L$pwrx_sp_alt
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
+	jmp	L$pwrx_sp_done
+
+.p2align	5
+L$pwrx_sp_alt:
+	leaq	4096-320(,%r9,2),%r10
+	leaq	-320(%rbp,%r9,2),%rbp
+	subq	%r10,%r11
+	movq	$0,%r10
+	cmovcq	%r10,%r11
+	subq	%r11,%rbp
+L$pwrx_sp_done:
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	L$pwrx_page_walk
+	jmp	L$pwrx_page_walk_done
+
+L$pwrx_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	L$pwrx_page_walk
+L$pwrx_page_walk_done:
+
+	movq	%r9,%r10
+	negq	%r9
+
+
+
+
+
+
+
+
+
+
+
+
+	pxor	%xmm0,%xmm0
+.byte	102,72,15,110,207
+.byte	102,72,15,110,209
+.byte	102,73,15,110,218
+.byte	102,72,15,110,226
+	movq	%r8,32(%rsp)
+	movq	%rax,40(%rsp)
+
+L$powerx5_body:
+
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+
+	movq	%r10,%r9
+	movq	%rsi,%rdi
+.byte	102,72,15,126,209
+.byte	102,72,15,126,226
+	movq	40(%rsp),%rax
+
+	call	mulx4x_internal
+
+	movq	40(%rsp),%rsi
+
+	movq	$1,%rax
+
+	movq	-48(%rsi),%r15
+
+	movq	-40(%rsi),%r14
+
+	movq	-32(%rsi),%r13
+
+	movq	-24(%rsi),%r12
+
+	movq	-16(%rsi),%rbp
+
+	movq	-8(%rsi),%rbx
+
+	leaq	(%rsi),%rsp
+
+L$powerx5_epilogue:
+	ret
+
+
+
+.globl	_bn_sqrx8x_internal
+.private_extern _bn_sqrx8x_internal
+.private_extern	_bn_sqrx8x_internal
+
+.p2align	5
+_bn_sqrx8x_internal:
+__bn_sqrx8x_internal:
+
+_CET_ENDBR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	leaq	48+8(%rsp),%rdi
+	leaq	(%rsi,%r9,1),%rbp
+	movq	%r9,0+8(%rsp)
+	movq	%rbp,8+8(%rsp)
+	jmp	L$sqr8x_zero_start
+
+.p2align	5
+.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
+L$sqrx8x_zero:
+.byte	0x3e
+	movdqa	%xmm0,0(%rdi)
+	movdqa	%xmm0,16(%rdi)
+	movdqa	%xmm0,32(%rdi)
+	movdqa	%xmm0,48(%rdi)
+L$sqr8x_zero_start:
+	movdqa	%xmm0,64(%rdi)
+	movdqa	%xmm0,80(%rdi)
+	movdqa	%xmm0,96(%rdi)
+	movdqa	%xmm0,112(%rdi)
+	leaq	128(%rdi),%rdi
+	subq	$64,%r9
+	jnz	L$sqrx8x_zero
+
+	movq	0(%rsi),%rdx
+
+	xorq	%r10,%r10
+	xorq	%r11,%r11
+	xorq	%r12,%r12
+	xorq	%r13,%r13
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+	leaq	48+8(%rsp),%rdi
+	xorq	%rbp,%rbp
+	jmp	L$sqrx8x_outer_loop
+
+.p2align	5
+L$sqrx8x_outer_loop:
+	mulxq	8(%rsi),%r8,%rax
+	adcxq	%r9,%r8
+	adoxq	%rax,%r10
+	mulxq	16(%rsi),%r9,%rax
+	adcxq	%r10,%r9
+	adoxq	%rax,%r11
+.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
+	adcxq	%r11,%r10
+	adoxq	%rax,%r12
+.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
+	adcxq	%r12,%r11
+	adoxq	%rax,%r13
+	mulxq	40(%rsi),%r12,%rax
+	adcxq	%r13,%r12
+	adoxq	%rax,%r14
+	mulxq	48(%rsi),%r13,%rax
+	adcxq	%r14,%r13
+	adoxq	%r15,%rax
+	mulxq	56(%rsi),%r14,%r15
+	movq	8(%rsi),%rdx
+	adcxq	%rax,%r14
+	adoxq	%rbp,%r15
+	adcq	64(%rdi),%r15
+	movq	%r8,8(%rdi)
+	movq	%r9,16(%rdi)
+	sbbq	%rcx,%rcx
+	xorq	%rbp,%rbp
+
+
+	mulxq	16(%rsi),%r8,%rbx
+	mulxq	24(%rsi),%r9,%rax
+	adcxq	%r10,%r8
+	adoxq	%rbx,%r9
+	mulxq	32(%rsi),%r10,%rbx
+	adcxq	%r11,%r9
+	adoxq	%rax,%r10
+.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
+	adcxq	%r12,%r10
+	adoxq	%rbx,%r11
+.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
+	adcxq	%r13,%r11
+	adoxq	%r14,%r12
+.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
+	movq	16(%rsi),%rdx
+	adcxq	%rax,%r12
+	adoxq	%rbx,%r13
+	adcxq	%r15,%r13
+	adoxq	%rbp,%r14
+	adcxq	%rbp,%r14
+
+	movq	%r8,24(%rdi)
+	movq	%r9,32(%rdi)
+
+	mulxq	24(%rsi),%r8,%rbx
+	mulxq	32(%rsi),%r9,%rax
+	adcxq	%r10,%r8
+	adoxq	%rbx,%r9
+	mulxq	40(%rsi),%r10,%rbx
+	adcxq	%r11,%r9
+	adoxq	%rax,%r10
+.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
+	adcxq	%r12,%r10
+	adoxq	%r13,%r11
+.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
+.byte	0x3e
+	movq	24(%rsi),%rdx
+	adcxq	%rbx,%r11
+	adoxq	%rax,%r12
+	adcxq	%r14,%r12
+	movq	%r8,40(%rdi)
+	movq	%r9,48(%rdi)
+	mulxq	32(%rsi),%r8,%rax
+	adoxq	%rbp,%r13
+	adcxq	%rbp,%r13
+
+	mulxq	40(%rsi),%r9,%rbx
+	adcxq	%r10,%r8
+	adoxq	%rax,%r9
+	mulxq	48(%rsi),%r10,%rax
+	adcxq	%r11,%r9
+	adoxq	%r12,%r10
+	mulxq	56(%rsi),%r11,%r12
+	movq	32(%rsi),%rdx
+	movq	40(%rsi),%r14
+	adcxq	%rbx,%r10
+	adoxq	%rax,%r11
+	movq	48(%rsi),%r15
+	adcxq	%r13,%r11
+	adoxq	%rbp,%r12
+	adcxq	%rbp,%r12
+
+	movq	%r8,56(%rdi)
+	movq	%r9,64(%rdi)
+
+	mulxq	%r14,%r9,%rax
+	movq	56(%rsi),%r8
+	adcxq	%r10,%r9
+	mulxq	%r15,%r10,%rbx
+	adoxq	%rax,%r10
+	adcxq	%r11,%r10
+	mulxq	%r8,%r11,%rax
+	movq	%r14,%rdx
+	adoxq	%rbx,%r11
+	adcxq	%r12,%r11
+
+	adcxq	%rbp,%rax
+
+	mulxq	%r15,%r14,%rbx
+	mulxq	%r8,%r12,%r13
+	movq	%r15,%rdx
+	leaq	64(%rsi),%rsi
+	adcxq	%r14,%r11
+	adoxq	%rbx,%r12
+	adcxq	%rax,%r12
+	adoxq	%rbp,%r13
+
+.byte	0x67,0x67
+	mulxq	%r8,%r8,%r14
+	adcxq	%r8,%r13
+	adcxq	%rbp,%r14
+
+	cmpq	8+8(%rsp),%rsi
+	je	L$sqrx8x_outer_break
+
+	negq	%rcx
+	movq	$-8,%rcx
+	movq	%rbp,%r15
+	movq	64(%rdi),%r8
+	adcxq	72(%rdi),%r9
+	adcxq	80(%rdi),%r10
+	adcxq	88(%rdi),%r11
+	adcq	96(%rdi),%r12
+	adcq	104(%rdi),%r13
+	adcq	112(%rdi),%r14
+	adcq	120(%rdi),%r15
+	leaq	(%rsi),%rbp
+	leaq	128(%rdi),%rdi
+	sbbq	%rax,%rax
+
+	movq	-64(%rsi),%rdx
+	movq	%rax,16+8(%rsp)
+	movq	%rdi,24+8(%rsp)
+
+
+	xorl	%eax,%eax
+	jmp	L$sqrx8x_loop
+
+.p2align	5
+L$sqrx8x_loop:
+	movq	%r8,%rbx
+	mulxq	0(%rbp),%rax,%r8
+	adcxq	%rax,%rbx
+	adoxq	%r9,%r8
+
+	mulxq	8(%rbp),%rax,%r9
+	adcxq	%rax,%r8
+	adoxq	%r10,%r9
+
+	mulxq	16(%rbp),%rax,%r10
+	adcxq	%rax,%r9
+	adoxq	%r11,%r10
+
+	mulxq	24(%rbp),%rax,%r11
+	adcxq	%rax,%r10
+	adoxq	%r12,%r11
+
+.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
+	adcxq	%rax,%r11
+	adoxq	%r13,%r12
+
+	mulxq	40(%rbp),%rax,%r13
+	adcxq	%rax,%r12
+	adoxq	%r14,%r13
+
+	mulxq	48(%rbp),%rax,%r14
+	movq	%rbx,(%rdi,%rcx,8)
+	movl	$0,%ebx
+	adcxq	%rax,%r13
+	adoxq	%r15,%r14
+
+.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
+	movq	8(%rsi,%rcx,8),%rdx
+	adcxq	%rax,%r14
+	adoxq	%rbx,%r15
+	adcxq	%rbx,%r15
+
+.byte	0x67
+	incq	%rcx
+	jnz	L$sqrx8x_loop
+
+	leaq	64(%rbp),%rbp
+	movq	$-8,%rcx
+	cmpq	8+8(%rsp),%rbp
+	je	L$sqrx8x_break
+
+	subq	16+8(%rsp),%rbx
+.byte	0x66
+	movq	-64(%rsi),%rdx
+	adcxq	0(%rdi),%r8
+	adcxq	8(%rdi),%r9
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	leaq	64(%rdi),%rdi
+.byte	0x67
+	sbbq	%rax,%rax
+	xorl	%ebx,%ebx
+	movq	%rax,16+8(%rsp)
+	jmp	L$sqrx8x_loop
+
+.p2align	5
+L$sqrx8x_break:
+	xorq	%rbp,%rbp
+	subq	16+8(%rsp),%rbx
+	adcxq	%rbp,%r8
+	movq	24+8(%rsp),%rcx
+	adcxq	%rbp,%r9
+	movq	0(%rsi),%rdx
+	adcq	$0,%r10
+	movq	%r8,0(%rdi)
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%r13
+	adcq	$0,%r14
+	adcq	$0,%r15
+	cmpq	%rcx,%rdi
+	je	L$sqrx8x_outer_loop
+
+	movq	%r9,8(%rdi)
+	movq	8(%rcx),%r9
+	movq	%r10,16(%rdi)
+	movq	16(%rcx),%r10
+	movq	%r11,24(%rdi)
+	movq	24(%rcx),%r11
+	movq	%r12,32(%rdi)
+	movq	32(%rcx),%r12
+	movq	%r13,40(%rdi)
+	movq	40(%rcx),%r13
+	movq	%r14,48(%rdi)
+	movq	48(%rcx),%r14
+	movq	%r15,56(%rdi)
+	movq	56(%rcx),%r15
+	movq	%rcx,%rdi
+	jmp	L$sqrx8x_outer_loop
+
+.p2align	5
+L$sqrx8x_outer_break:
+	movq	%r9,72(%rdi)
+.byte	102,72,15,126,217
+	movq	%r10,80(%rdi)
+	movq	%r11,88(%rdi)
+	movq	%r12,96(%rdi)
+	movq	%r13,104(%rdi)
+	movq	%r14,112(%rdi)
+	leaq	48+8(%rsp),%rdi
+	movq	(%rsi,%rcx,1),%rdx
+
+	movq	8(%rdi),%r11
+	xorq	%r10,%r10
+	movq	0+8(%rsp),%r9
+	adoxq	%r11,%r11
+	movq	16(%rdi),%r12
+	movq	24(%rdi),%r13
+
+
+.p2align	5
+L$sqrx4x_shift_n_add:
+	mulxq	%rdx,%rax,%rbx
+	adoxq	%r12,%r12
+	adcxq	%r10,%rax
+.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
+.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
+	adoxq	%r13,%r13
+	adcxq	%r11,%rbx
+	movq	40(%rdi),%r11
+	movq	%rax,0(%rdi)
+	movq	%rbx,8(%rdi)
+
+	mulxq	%rdx,%rax,%rbx
+	adoxq	%r10,%r10
+	adcxq	%r12,%rax
+	movq	16(%rsi,%rcx,1),%rdx
+	movq	48(%rdi),%r12
+	adoxq	%r11,%r11
+	adcxq	%r13,%rbx
+	movq	56(%rdi),%r13
+	movq	%rax,16(%rdi)
+	movq	%rbx,24(%rdi)
+
+	mulxq	%rdx,%rax,%rbx
+	adoxq	%r12,%r12
+	adcxq	%r10,%rax
+	movq	24(%rsi,%rcx,1),%rdx
+	leaq	32(%rcx),%rcx
+	movq	64(%rdi),%r10
+	adoxq	%r13,%r13
+	adcxq	%r11,%rbx
+	movq	72(%rdi),%r11
+	movq	%rax,32(%rdi)
+	movq	%rbx,40(%rdi)
+
+	mulxq	%rdx,%rax,%rbx
+	adoxq	%r10,%r10
+	adcxq	%r12,%rax
+	jrcxz	L$sqrx4x_shift_n_add_break
+.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
+	adoxq	%r11,%r11
+	adcxq	%r13,%rbx
+	movq	80(%rdi),%r12
+	movq	88(%rdi),%r13
+	movq	%rax,48(%rdi)
+	movq	%rbx,56(%rdi)
+	leaq	64(%rdi),%rdi
+	nop
+	jmp	L$sqrx4x_shift_n_add
+
+.p2align	5
+L$sqrx4x_shift_n_add_break:
+	adcxq	%r13,%rbx
+	movq	%rax,48(%rdi)
+	movq	%rbx,56(%rdi)
+	leaq	64(%rdi),%rdi
+.byte	102,72,15,126,213
+__bn_sqrx8x_reduction:
+	xorl	%eax,%eax
+	movq	32+8(%rsp),%rbx
+	movq	48+8(%rsp),%rdx
+	leaq	-64(%rbp,%r9,1),%rcx
+
+	movq	%rcx,0+8(%rsp)
+	movq	%rdi,8+8(%rsp)
+
+	leaq	48+8(%rsp),%rdi
+	jmp	L$sqrx8x_reduction_loop
+
+.p2align	5
+L$sqrx8x_reduction_loop:
+	movq	8(%rdi),%r9
+	movq	16(%rdi),%r10
+	movq	24(%rdi),%r11
+	movq	32(%rdi),%r12
+	movq	%rdx,%r8
+	imulq	%rbx,%rdx
+	movq	40(%rdi),%r13
+	movq	48(%rdi),%r14
+	movq	56(%rdi),%r15
+	movq	%rax,24+8(%rsp)
+
+	leaq	64(%rdi),%rdi
+	xorq	%rsi,%rsi
+	movq	$-8,%rcx
+	jmp	L$sqrx8x_reduce
+
+.p2align	5
+L$sqrx8x_reduce:
+	movq	%r8,%rbx
+	mulxq	0(%rbp),%rax,%r8
+	adcxq	%rbx,%rax
+	adoxq	%r9,%r8
+
+	mulxq	8(%rbp),%rbx,%r9
+	adcxq	%rbx,%r8
+	adoxq	%r10,%r9
+
+	mulxq	16(%rbp),%rbx,%r10
+	adcxq	%rbx,%r9
+	adoxq	%r11,%r10
+
+	mulxq	24(%rbp),%rbx,%r11
+	adcxq	%rbx,%r10
+	adoxq	%r12,%r11
+
+.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
+	movq	%rdx,%rax
+	movq	%r8,%rdx
+	adcxq	%rbx,%r11
+	adoxq	%r13,%r12
+
+	mulxq	32+8(%rsp),%rbx,%rdx
+	movq	%rax,%rdx
+	movq	%rax,64+48+8(%rsp,%rcx,8)
+
+	mulxq	40(%rbp),%rax,%r13
+	adcxq	%rax,%r12
+	adoxq	%r14,%r13
+
+	mulxq	48(%rbp),%rax,%r14
+	adcxq	%rax,%r13
+	adoxq	%r15,%r14
+
+	mulxq	56(%rbp),%rax,%r15
+	movq	%rbx,%rdx
+	adcxq	%rax,%r14
+	adoxq	%rsi,%r15
+	adcxq	%rsi,%r15
+
+.byte	0x67,0x67,0x67
+	incq	%rcx
+	jnz	L$sqrx8x_reduce
+
+	movq	%rsi,%rax
+	cmpq	0+8(%rsp),%rbp
+	jae	L$sqrx8x_no_tail
+
+	movq	48+8(%rsp),%rdx
+	addq	0(%rdi),%r8
+	leaq	64(%rbp),%rbp
+	movq	$-8,%rcx
+	adcxq	8(%rdi),%r9
+	adcxq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	leaq	64(%rdi),%rdi
+	sbbq	%rax,%rax
+
+	xorq	%rsi,%rsi
+	movq	%rax,16+8(%rsp)
+	jmp	L$sqrx8x_tail
+
+.p2align	5
+L$sqrx8x_tail:
+	movq	%r8,%rbx
+	mulxq	0(%rbp),%rax,%r8
+	adcxq	%rax,%rbx
+	adoxq	%r9,%r8
+
+	mulxq	8(%rbp),%rax,%r9
+	adcxq	%rax,%r8
+	adoxq	%r10,%r9
+
+	mulxq	16(%rbp),%rax,%r10
+	adcxq	%rax,%r9
+	adoxq	%r11,%r10
+
+	mulxq	24(%rbp),%rax,%r11
+	adcxq	%rax,%r10
+	adoxq	%r12,%r11
+
+.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
+	adcxq	%rax,%r11
+	adoxq	%r13,%r12
+
+	mulxq	40(%rbp),%rax,%r13
+	adcxq	%rax,%r12
+	adoxq	%r14,%r13
+
+	mulxq	48(%rbp),%rax,%r14
+	adcxq	%rax,%r13
+	adoxq	%r15,%r14
+
+	mulxq	56(%rbp),%rax,%r15
+	movq	72+48+8(%rsp,%rcx,8),%rdx
+	adcxq	%rax,%r14
+	adoxq	%rsi,%r15
+	movq	%rbx,(%rdi,%rcx,8)
+	movq	%r8,%rbx
+	adcxq	%rsi,%r15
+
+	incq	%rcx
+	jnz	L$sqrx8x_tail
+
+	cmpq	0+8(%rsp),%rbp
+	jae	L$sqrx8x_tail_done
+
+	subq	16+8(%rsp),%rsi
+	movq	48+8(%rsp),%rdx
+	leaq	64(%rbp),%rbp
+	adcq	0(%rdi),%r8
+	adcq	8(%rdi),%r9
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	leaq	64(%rdi),%rdi
+	sbbq	%rax,%rax
+	subq	$8,%rcx
+
+	xorq	%rsi,%rsi
+	movq	%rax,16+8(%rsp)
+	jmp	L$sqrx8x_tail
+
+.p2align	5
+L$sqrx8x_tail_done:
+	xorq	%rax,%rax
+	addq	24+8(%rsp),%r8
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%r13
+	adcq	$0,%r14
+	adcq	$0,%r15
+	adcq	$0,%rax
+
+	subq	16+8(%rsp),%rsi
+L$sqrx8x_no_tail:
+	adcq	0(%rdi),%r8
+.byte	102,72,15,126,217
+	adcq	8(%rdi),%r9
+	movq	56(%rbp),%rsi
+.byte	102,72,15,126,213
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	adcq	$0,%rax
+
+	movq	32+8(%rsp),%rbx
+	movq	64(%rdi,%rcx,1),%rdx
+
+	movq	%r8,0(%rdi)
+	leaq	64(%rdi),%r8
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+	movq	%r14,48(%rdi)
+	movq	%r15,56(%rdi)
+
+	leaq	64(%rdi,%rcx,1),%rdi
+	cmpq	8+8(%rsp),%r8
+	jb	L$sqrx8x_reduction_loop
+	ret
+
+
+.p2align	5
+
+__bn_postx4x_internal:
+
+	movq	0(%rbp),%r12
+	movq	%rcx,%r10
+	movq	%rcx,%r9
+	negq	%rax
+	sarq	$3+2,%rcx
+
+.byte	102,72,15,126,202
+.byte	102,72,15,126,206
+	decq	%r12
+	movq	8(%rbp),%r13
+	xorq	%r8,%r8
+	movq	16(%rbp),%r14
+	movq	24(%rbp),%r15
+	jmp	L$sqrx4x_sub_entry
+
+.p2align	4
+L$sqrx4x_sub:
+	movq	0(%rbp),%r12
+	movq	8(%rbp),%r13
+	movq	16(%rbp),%r14
+	movq	24(%rbp),%r15
+L$sqrx4x_sub_entry:
+	andnq	%rax,%r12,%r12
+	leaq	32(%rbp),%rbp
+	andnq	%rax,%r13,%r13
+	andnq	%rax,%r14,%r14
+	andnq	%rax,%r15,%r15
+
+	negq	%r8
+	adcq	0(%rdi),%r12
+	adcq	8(%rdi),%r13
+	adcq	16(%rdi),%r14
+	adcq	24(%rdi),%r15
+	movq	%r12,0(%rdx)
+	leaq	32(%rdi),%rdi
+	movq	%r13,8(%rdx)
+	sbbq	%r8,%r8
+	movq	%r14,16(%rdx)
+	movq	%r15,24(%rdx)
+	leaq	32(%rdx),%rdx
+
+	incq	%rcx
+	jnz	L$sqrx4x_sub
+
+	negq	%r9
+
+	ret
+
+
+.globl	_bn_scatter5
+.private_extern _bn_scatter5
+
+.p2align	4
+_bn_scatter5:
+
+_CET_ENDBR
+	cmpl	$0,%esi
+	jz	L$scatter_epilogue
+
+
+
+
+
+
+
+
+
+	leaq	(%rdx,%rcx,8),%rdx
+L$scatter:
+	movq	(%rdi),%rax
+	leaq	8(%rdi),%rdi
+	movq	%rax,(%rdx)
+	leaq	256(%rdx),%rdx
+	subl	$1,%esi
+	jnz	L$scatter
+L$scatter_epilogue:
+	ret
+
+
+
+.globl	_bn_gather5
+.private_extern _bn_gather5
+
+.p2align	5
+_bn_gather5:
+
+L$SEH_begin_bn_gather5:
+_CET_ENDBR
+
+.byte	0x4c,0x8d,0x14,0x24
+
+.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00
+	leaq	L$inc(%rip),%rax
+	andq	$-16,%rsp
+
+	movd	%ecx,%xmm5
+	movdqa	0(%rax),%xmm0
+	movdqa	16(%rax),%xmm1
+	leaq	128(%rdx),%r11
+	leaq	128(%rsp),%rax
+
+	pshufd	$0,%xmm5,%xmm5
+	movdqa	%xmm1,%xmm4
+	movdqa	%xmm1,%xmm2
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm4,%xmm3
+
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,-128(%rax)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,-112(%rax)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,-96(%rax)
+	movdqa	%xmm4,%xmm2
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,-80(%rax)
+	movdqa	%xmm4,%xmm3
+
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,-64(%rax)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,-48(%rax)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,-32(%rax)
+	movdqa	%xmm4,%xmm2
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,-16(%rax)
+	movdqa	%xmm4,%xmm3
+
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,0(%rax)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,16(%rax)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,32(%rax)
+	movdqa	%xmm4,%xmm2
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,48(%rax)
+	movdqa	%xmm4,%xmm3
+
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,64(%rax)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,80(%rax)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,96(%rax)
+	movdqa	%xmm4,%xmm2
+	movdqa	%xmm3,112(%rax)
+	jmp	L$gather
+
+.p2align	5
+L$gather:
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	movdqa	-128(%r11),%xmm0
+	movdqa	-112(%r11),%xmm1
+	movdqa	-96(%r11),%xmm2
+	pand	-128(%rax),%xmm0
+	movdqa	-80(%r11),%xmm3
+	pand	-112(%rax),%xmm1
+	por	%xmm0,%xmm4
+	pand	-96(%rax),%xmm2
+	por	%xmm1,%xmm5
+	pand	-80(%rax),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	-64(%r11),%xmm0
+	movdqa	-48(%r11),%xmm1
+	movdqa	-32(%r11),%xmm2
+	pand	-64(%rax),%xmm0
+	movdqa	-16(%r11),%xmm3
+	pand	-48(%rax),%xmm1
+	por	%xmm0,%xmm4
+	pand	-32(%rax),%xmm2
+	por	%xmm1,%xmm5
+	pand	-16(%rax),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	0(%r11),%xmm0
+	movdqa	16(%r11),%xmm1
+	movdqa	32(%r11),%xmm2
+	pand	0(%rax),%xmm0
+	movdqa	48(%r11),%xmm3
+	pand	16(%rax),%xmm1
+	por	%xmm0,%xmm4
+	pand	32(%rax),%xmm2
+	por	%xmm1,%xmm5
+	pand	48(%rax),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	64(%r11),%xmm0
+	movdqa	80(%r11),%xmm1
+	movdqa	96(%r11),%xmm2
+	pand	64(%rax),%xmm0
+	movdqa	112(%r11),%xmm3
+	pand	80(%rax),%xmm1
+	por	%xmm0,%xmm4
+	pand	96(%rax),%xmm2
+	por	%xmm1,%xmm5
+	pand	112(%rax),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	por	%xmm5,%xmm4
+	leaq	256(%r11),%r11
+
+	pshufd	$0x4e,%xmm4,%xmm0
+	por	%xmm4,%xmm0
+	movq	%xmm0,(%rdi)
+	leaq	8(%rdi),%rdi
+	subl	$1,%esi
+	jnz	L$gather
+
+	leaq	(%r10),%rsp
+
+	ret
+L$SEH_end_bn_gather5:
+
+
+.section	__DATA,__const
+.p2align	6
+L$inc:
+.long	0,0, 1,1
+.long	2,2, 2,2
+.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.text	
+#endif
diff --git a/gen/bcm/x86_64-mont5-linux.S b/gen/bcm/x86_64-mont5-linux.S
new file mode 100644
index 0000000..14ab4f7
--- /dev/null
+++ b/gen/bcm/x86_64-mont5-linux.S
@@ -0,0 +1,3625 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text	
+
+.extern	OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+
+.globl	bn_mul_mont_gather5
+.hidden bn_mul_mont_gather5
+.type	bn_mul_mont_gather5,@function
+.align	64
+bn_mul_mont_gather5:
+.cfi_startproc	
+_CET_ENDBR
+	movl	%r9d,%r9d
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+	testl	$7,%r9d
+	jnz	.Lmul_enter
+	leaq	OPENSSL_ia32cap_P(%rip),%r11
+	movl	8(%r11),%r11d
+	jmp	.Lmul4x_enter
+
+.align	16
+.Lmul_enter:
+	movd	8(%rsp),%xmm5
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+
+	negq	%r9
+	movq	%rsp,%r11
+	leaq	-280(%rsp,%r9,8),%r10
+	negq	%r9
+	andq	$-1024,%r10
+
+
+
+
+
+
+
+
+
+	subq	%r10,%r11
+	andq	$-4096,%r11
+	leaq	(%r10,%r11,1),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul_page_walk
+	jmp	.Lmul_page_walk_done
+
+.Lmul_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul_page_walk
+.Lmul_page_walk_done:
+
+	leaq	.Linc(%rip),%r10
+	movq	%rax,8(%rsp,%r9,8)
+.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
+.Lmul_body:
+
+	leaq	128(%rdx),%r12
+	movdqa	0(%r10),%xmm0
+	movdqa	16(%r10),%xmm1
+	leaq	24-112(%rsp,%r9,8),%r10
+	andq	$-16,%r10
+
+	pshufd	$0,%xmm5,%xmm5
+	movdqa	%xmm1,%xmm4
+	movdqa	%xmm1,%xmm2
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+.byte	0x67
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,112(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,128(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,144(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,160(%r10)
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,176(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,192(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,208(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,224(%r10)
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,240(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,256(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,272(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,288(%r10)
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,304(%r10)
+
+	paddd	%xmm2,%xmm3
+.byte	0x67
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,320(%r10)
+
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,336(%r10)
+	pand	64(%r12),%xmm0
+
+	pand	80(%r12),%xmm1
+	pand	96(%r12),%xmm2
+	movdqa	%xmm3,352(%r10)
+	pand	112(%r12),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	-128(%r12),%xmm4
+	movdqa	-112(%r12),%xmm5
+	movdqa	-96(%r12),%xmm2
+	pand	112(%r10),%xmm4
+	movdqa	-80(%r12),%xmm3
+	pand	128(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	144(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	160(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	-64(%r12),%xmm4
+	movdqa	-48(%r12),%xmm5
+	movdqa	-32(%r12),%xmm2
+	pand	176(%r10),%xmm4
+	movdqa	-16(%r12),%xmm3
+	pand	192(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	208(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	224(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	0(%r12),%xmm4
+	movdqa	16(%r12),%xmm5
+	movdqa	32(%r12),%xmm2
+	pand	240(%r10),%xmm4
+	movdqa	48(%r12),%xmm3
+	pand	256(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	272(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	288(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	por	%xmm1,%xmm0
+
+	pshufd	$0x4e,%xmm0,%xmm1
+	por	%xmm1,%xmm0
+	leaq	256(%r12),%r12
+.byte	102,72,15,126,195
+
+	movq	(%r8),%r8
+	movq	(%rsi),%rax
+
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+
+	movq	%r8,%rbp
+	mulq	%rbx
+	movq	%rax,%r10
+	movq	(%rcx),%rax
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+
+	leaq	1(%r15),%r15
+	jmp	.L1st_enter
+
+.align	16
+.L1st:
+	addq	%rax,%r13
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	movq	%r10,%r11
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+.L1st_enter:
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	leaq	1(%r15),%r15
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	cmpq	%r9,%r15
+	jne	.L1st
+
+
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r9,8)
+	movq	%rdx,%r13
+	movq	%r10,%r11
+
+	xorq	%rdx,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r9,8)
+	movq	%rdx,(%rsp,%r9,8)
+
+	leaq	1(%r14),%r14
+	jmp	.Louter
+.align	16
+.Louter:
+	leaq	24+128(%rsp,%r9,8),%rdx
+	andq	$-16,%rdx
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	movdqa	-128(%r12),%xmm0
+	movdqa	-112(%r12),%xmm1
+	movdqa	-96(%r12),%xmm2
+	movdqa	-80(%r12),%xmm3
+	pand	-128(%rdx),%xmm0
+	pand	-112(%rdx),%xmm1
+	por	%xmm0,%xmm4
+	pand	-96(%rdx),%xmm2
+	por	%xmm1,%xmm5
+	pand	-80(%rdx),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	-64(%r12),%xmm0
+	movdqa	-48(%r12),%xmm1
+	movdqa	-32(%r12),%xmm2
+	movdqa	-16(%r12),%xmm3
+	pand	-64(%rdx),%xmm0
+	pand	-48(%rdx),%xmm1
+	por	%xmm0,%xmm4
+	pand	-32(%rdx),%xmm2
+	por	%xmm1,%xmm5
+	pand	-16(%rdx),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	0(%r12),%xmm0
+	movdqa	16(%r12),%xmm1
+	movdqa	32(%r12),%xmm2
+	movdqa	48(%r12),%xmm3
+	pand	0(%rdx),%xmm0
+	pand	16(%rdx),%xmm1
+	por	%xmm0,%xmm4
+	pand	32(%rdx),%xmm2
+	por	%xmm1,%xmm5
+	pand	48(%rdx),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	64(%r12),%xmm0
+	movdqa	80(%r12),%xmm1
+	movdqa	96(%r12),%xmm2
+	movdqa	112(%r12),%xmm3
+	pand	64(%rdx),%xmm0
+	pand	80(%rdx),%xmm1
+	por	%xmm0,%xmm4
+	pand	96(%rdx),%xmm2
+	por	%xmm1,%xmm5
+	pand	112(%rdx),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	por	%xmm5,%xmm4
+
+	pshufd	$0x4e,%xmm4,%xmm0
+	por	%xmm4,%xmm0
+	leaq	256(%r12),%r12
+
+	movq	(%rsi),%rax
+.byte	102,72,15,126,195
+
+	xorq	%r15,%r15
+	movq	%r8,%rbp
+	movq	(%rsp),%r10
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx),%rax
+	adcq	$0,%rdx
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	8(%rsp),%r10
+	movq	%rdx,%r13
+
+	leaq	1(%r15),%r15
+	jmp	.Linner_enter
+
+.align	16
+.Linner:
+	addq	%rax,%r13
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	movq	(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+.Linner_enter:
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	leaq	1(%r15),%r15
+
+	mulq	%rbp
+	cmpq	%r9,%r15
+	jne	.Linner
+
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	movq	(%rsp,%r9,8),%r10
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r9,8)
+	movq	%rdx,%r13
+
+	xorq	%rdx,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r9,8)
+	movq	%rdx,(%rsp,%r9,8)
+
+	leaq	1(%r14),%r14
+	cmpq	%r9,%r14
+	jb	.Louter
+
+	xorq	%r14,%r14
+	movq	(%rsp),%rax
+	leaq	(%rsp),%rsi
+	movq	%r9,%r15
+	jmp	.Lsub
+.align	16
+.Lsub:	sbbq	(%rcx,%r14,8),%rax
+	movq	%rax,(%rdi,%r14,8)
+	movq	8(%rsi,%r14,8),%rax
+	leaq	1(%r14),%r14
+	decq	%r15
+	jnz	.Lsub
+
+	sbbq	$0,%rax
+	movq	$-1,%rbx
+	xorq	%rax,%rbx
+	xorq	%r14,%r14
+	movq	%r9,%r15
+
+.Lcopy:
+	movq	(%rdi,%r14,8),%rcx
+	movq	(%rsp,%r14,8),%rdx
+	andq	%rbx,%rcx
+	andq	%rax,%rdx
+	movq	%r14,(%rsp,%r14,8)
+	orq	%rcx,%rdx
+	movq	%rdx,(%rdi,%r14,8)
+	leaq	1(%r14),%r14
+	subq	$1,%r15
+	jnz	.Lcopy
+
+	movq	8(%rsp,%r9,8),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	$1,%rax
+
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lmul_epilogue:
+	ret
+.cfi_endproc	
+.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
+.type	bn_mul4x_mont_gather5,@function
+.align	32
+bn_mul4x_mont_gather5:
+.cfi_startproc	
+.byte	0x67
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+.Lmul4x_enter:
+	andl	$0x80108,%r11d
+	cmpl	$0x80108,%r11d
+	je	.Lmulx4x_enter
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+.Lmul4x_prologue:
+
+.byte	0x67
+	shll	$3,%r9d
+	leaq	(%r9,%r9,2),%r10
+	negq	%r9
+
+
+
+
+
+
+
+
+
+
+	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
+	subq	%rdi,%r11
+	andq	$4095,%r11
+	cmpq	%r11,%r10
+	jb	.Lmul4xsp_alt
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
+	jmp	.Lmul4xsp_done
+
+.align	32
+.Lmul4xsp_alt:
+	leaq	4096-320(,%r9,2),%r10
+	leaq	-320(%rbp,%r9,2),%rbp
+	subq	%r10,%r11
+	movq	$0,%r10
+	cmovcq	%r10,%r11
+	subq	%r11,%rbp
+.Lmul4xsp_done:
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lmul4x_page_walk
+	jmp	.Lmul4x_page_walk_done
+
+.Lmul4x_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lmul4x_page_walk
+.Lmul4x_page_walk_done:
+
+	negq	%r9
+
+	movq	%rax,40(%rsp)
+.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+.Lmul4x_body:
+
+	call	mul4x_internal
+
+	movq	40(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	$1,%rax
+
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lmul4x_epilogue:
+	ret
+.cfi_endproc	
+.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+
+.type	mul4x_internal,@function
+.align	32
+mul4x_internal:
+.cfi_startproc	
+	shlq	$5,%r9
+	movd	8(%rax),%xmm5
+	leaq	.Linc(%rip),%rax
+	leaq	128(%rdx,%r9,1),%r13
+	shrq	$5,%r9
+	movdqa	0(%rax),%xmm0
+	movdqa	16(%rax),%xmm1
+	leaq	88-112(%rsp,%r9,1),%r10
+	leaq	128(%rdx),%r12
+
+	pshufd	$0,%xmm5,%xmm5
+	movdqa	%xmm1,%xmm4
+.byte	0x67,0x67
+	movdqa	%xmm1,%xmm2
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+.byte	0x67
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,112(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,128(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,144(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,160(%r10)
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,176(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,192(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,208(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,224(%r10)
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,240(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,256(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,272(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,288(%r10)
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,304(%r10)
+
+	paddd	%xmm2,%xmm3
+.byte	0x67
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,320(%r10)
+
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,336(%r10)
+	pand	64(%r12),%xmm0
+
+	pand	80(%r12),%xmm1
+	pand	96(%r12),%xmm2
+	movdqa	%xmm3,352(%r10)
+	pand	112(%r12),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	-128(%r12),%xmm4
+	movdqa	-112(%r12),%xmm5
+	movdqa	-96(%r12),%xmm2
+	pand	112(%r10),%xmm4
+	movdqa	-80(%r12),%xmm3
+	pand	128(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	144(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	160(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	-64(%r12),%xmm4
+	movdqa	-48(%r12),%xmm5
+	movdqa	-32(%r12),%xmm2
+	pand	176(%r10),%xmm4
+	movdqa	-16(%r12),%xmm3
+	pand	192(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	208(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	224(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	0(%r12),%xmm4
+	movdqa	16(%r12),%xmm5
+	movdqa	32(%r12),%xmm2
+	pand	240(%r10),%xmm4
+	movdqa	48(%r12),%xmm3
+	pand	256(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	272(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	288(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	por	%xmm1,%xmm0
+
+	pshufd	$0x4e,%xmm0,%xmm1
+	por	%xmm1,%xmm0
+	leaq	256(%r12),%r12
+.byte	102,72,15,126,195
+
+	movq	%r13,16+8(%rsp)
+	movq	%rdi,56+8(%rsp)
+
+	movq	(%r8),%r8
+	movq	(%rsi),%rax
+	leaq	(%rsi,%r9,1),%rsi
+	negq	%r9
+
+	movq	%r8,%rbp
+	mulq	%rbx
+	movq	%rax,%r10
+	movq	(%rcx),%rax
+
+	imulq	%r10,%rbp
+	leaq	64+8(%rsp),%r14
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi,%r9,1),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi,%r9,1),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	32(%r9),%r15
+	leaq	32(%rcx),%rcx
+	adcq	$0,%rdx
+	movq	%rdi,(%r14)
+	movq	%rdx,%r13
+	jmp	.L1st4x
+
+.align	32
+.L1st4x:
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx),%rax
+	leaq	32(%r14),%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,1),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%r14)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r15,1),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%r14)
+	movq	%rdx,%r13
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	0(%rcx),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	8(%rsi,%r15,1),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%r14)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi,%r15,1),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	32(%rcx),%rcx
+	adcq	$0,%rdx
+	movq	%rdi,(%r14)
+	movq	%rdx,%r13
+
+	addq	$32,%r15
+	jnz	.L1st4x
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx),%rax
+	leaq	32(%r14),%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%r14)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r9,1),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%r14)
+	movq	%rdx,%r13
+
+	leaq	(%rcx,%r9,1),%rcx
+
+	xorq	%rdi,%rdi
+	addq	%r10,%r13
+	adcq	$0,%rdi
+	movq	%r13,-8(%r14)
+
+	jmp	.Louter4x
+
+.align	32
+.Louter4x:
+	leaq	16+128(%r14),%rdx
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	movdqa	-128(%r12),%xmm0
+	movdqa	-112(%r12),%xmm1
+	movdqa	-96(%r12),%xmm2
+	movdqa	-80(%r12),%xmm3
+	pand	-128(%rdx),%xmm0
+	pand	-112(%rdx),%xmm1
+	por	%xmm0,%xmm4
+	pand	-96(%rdx),%xmm2
+	por	%xmm1,%xmm5
+	pand	-80(%rdx),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	-64(%r12),%xmm0
+	movdqa	-48(%r12),%xmm1
+	movdqa	-32(%r12),%xmm2
+	movdqa	-16(%r12),%xmm3
+	pand	-64(%rdx),%xmm0
+	pand	-48(%rdx),%xmm1
+	por	%xmm0,%xmm4
+	pand	-32(%rdx),%xmm2
+	por	%xmm1,%xmm5
+	pand	-16(%rdx),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	0(%r12),%xmm0
+	movdqa	16(%r12),%xmm1
+	movdqa	32(%r12),%xmm2
+	movdqa	48(%r12),%xmm3
+	pand	0(%rdx),%xmm0
+	pand	16(%rdx),%xmm1
+	por	%xmm0,%xmm4
+	pand	32(%rdx),%xmm2
+	por	%xmm1,%xmm5
+	pand	48(%rdx),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	64(%r12),%xmm0
+	movdqa	80(%r12),%xmm1
+	movdqa	96(%r12),%xmm2
+	movdqa	112(%r12),%xmm3
+	pand	64(%rdx),%xmm0
+	pand	80(%rdx),%xmm1
+	por	%xmm0,%xmm4
+	pand	96(%rdx),%xmm2
+	por	%xmm1,%xmm5
+	pand	112(%rdx),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	por	%xmm5,%xmm4
+
+	pshufd	$0x4e,%xmm4,%xmm0
+	por	%xmm4,%xmm0
+	leaq	256(%r12),%r12
+.byte	102,72,15,126,195
+
+	movq	(%r14,%r9,1),%r10
+	movq	%r8,%rbp
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx),%rax
+	adcq	$0,%rdx
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+	movq	%rdi,(%r14)
+
+	leaq	(%r14,%r9,1),%r14
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi,%r9,1),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	addq	8(%r14),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi,%r9,1),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	32(%r9),%r15
+	leaq	32(%rcx),%rcx
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+	jmp	.Linner4x
+
+.align	32
+.Linner4x:
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx),%rax
+	adcq	$0,%rdx
+	addq	16(%r14),%r10
+	leaq	32(%r14),%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,1),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%rdi,-32(%r14)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx),%rax
+	adcq	$0,%rdx
+	addq	-8(%r14),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r15,1),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%r13,-24(%r14)
+	movq	%rdx,%r13
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	0(%rcx),%rax
+	adcq	$0,%rdx
+	addq	(%r14),%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	8(%rsi,%r15,1),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%rdi,-16(%r14)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	addq	8(%r14),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi,%r15,1),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	32(%rcx),%rcx
+	adcq	$0,%rdx
+	movq	%r13,-8(%r14)
+	movq	%rdx,%r13
+
+	addq	$32,%r15
+	jnz	.Linner4x
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx),%rax
+	adcq	$0,%rdx
+	addq	16(%r14),%r10
+	leaq	32(%r14),%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%rdi,-32(%r14)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	movq	-8(%rcx),%rbp
+	adcq	$0,%rdx
+	addq	-8(%r14),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r9,1),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%r13,-24(%r14)
+	movq	%rdx,%r13
+
+	movq	%rdi,-16(%r14)
+	leaq	(%rcx,%r9,1),%rcx
+
+	xorq	%rdi,%rdi
+	addq	%r10,%r13
+	adcq	$0,%rdi
+	addq	(%r14),%r13
+	adcq	$0,%rdi
+	movq	%r13,-8(%r14)
+
+	cmpq	16+8(%rsp),%r12
+	jb	.Louter4x
+	xorq	%rax,%rax
+	subq	%r13,%rbp
+	adcq	%r15,%r15
+	orq	%r15,%rdi
+	subq	%rdi,%rax
+	leaq	(%r14,%r9,1),%rbx
+	movq	(%rcx),%r12
+	leaq	(%rcx),%rbp
+	movq	%r9,%rcx
+	sarq	$3+2,%rcx
+	movq	56+8(%rsp),%rdi
+	decq	%r12
+	xorq	%r10,%r10
+	movq	8(%rbp),%r13
+	movq	16(%rbp),%r14
+	movq	24(%rbp),%r15
+	jmp	.Lsqr4x_sub_entry
+.cfi_endproc	
+.size	mul4x_internal,.-mul4x_internal
+.globl	bn_power5
+.hidden bn_power5
+.type	bn_power5,@function
+.align	32
+bn_power5:
+.cfi_startproc	
+_CET_ENDBR
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+	leaq	OPENSSL_ia32cap_P(%rip),%r11
+	movl	8(%r11),%r11d
+	andl	$0x80108,%r11d
+	cmpl	$0x80108,%r11d
+	je	.Lpowerx5_enter
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+.Lpower5_prologue:
+
+	shll	$3,%r9d
+	leal	(%r9,%r9,2),%r10d
+	negq	%r9
+	movq	(%r8),%r8
+
+
+
+
+
+
+
+
+	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
+	subq	%rdi,%r11
+	andq	$4095,%r11
+	cmpq	%r11,%r10
+	jb	.Lpwr_sp_alt
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
+	jmp	.Lpwr_sp_done
+
+.align	32
+.Lpwr_sp_alt:
+	leaq	4096-320(,%r9,2),%r10
+	leaq	-320(%rbp,%r9,2),%rbp
+	subq	%r10,%r11
+	movq	$0,%r10
+	cmovcq	%r10,%r11
+	subq	%r11,%rbp
+.Lpwr_sp_done:
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lpwr_page_walk
+	jmp	.Lpwr_page_walk_done
+
+.Lpwr_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lpwr_page_walk
+.Lpwr_page_walk_done:
+
+	movq	%r9,%r10
+	negq	%r9
+
+
+
+
+
+
+
+
+
+
+	movq	%r8,32(%rsp)
+	movq	%rax,40(%rsp)
+.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+.Lpower5_body:
+.byte	102,72,15,110,207
+.byte	102,72,15,110,209
+.byte	102,73,15,110,218
+.byte	102,72,15,110,226
+
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+
+.byte	102,72,15,126,209
+.byte	102,72,15,126,226
+	movq	%rsi,%rdi
+	movq	40(%rsp),%rax
+	leaq	32(%rsp),%r8
+
+	call	mul4x_internal
+
+	movq	40(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	$1,%rax
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lpower5_epilogue:
+	ret
+.cfi_endproc	
+.size	bn_power5,.-bn_power5
+
+.globl	bn_sqr8x_internal
+.hidden bn_sqr8x_internal
+.hidden	bn_sqr8x_internal
+.type	bn_sqr8x_internal,@function
+.align	32
+bn_sqr8x_internal:
+__bn_sqr8x_internal:
+.cfi_startproc	
+_CET_ENDBR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	leaq	32(%r10),%rbp
+	leaq	(%rsi,%r9,1),%rsi
+
+	movq	%r9,%rcx
+
+
+	movq	-32(%rsi,%rbp,1),%r14
+	leaq	48+8(%rsp,%r9,2),%rdi
+	movq	-24(%rsi,%rbp,1),%rax
+	leaq	-32(%rdi,%rbp,1),%rdi
+	movq	-16(%rsi,%rbp,1),%rbx
+	movq	%rax,%r15
+
+	mulq	%r14
+	movq	%rax,%r10
+	movq	%rbx,%rax
+	movq	%rdx,%r11
+	movq	%r10,-24(%rdi,%rbp,1)
+
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+	movq	%r11,-16(%rdi,%rbp,1)
+	movq	%rdx,%r10
+
+
+	movq	-8(%rsi,%rbp,1),%rbx
+	mulq	%r15
+	movq	%rax,%r12
+	movq	%rbx,%rax
+	movq	%rdx,%r13
+
+	leaq	(%rbp),%rcx
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	addq	%r12,%r10
+	adcq	$0,%r11
+	movq	%r10,-8(%rdi,%rcx,1)
+	jmp	.Lsqr4x_1st
+
+.align	32
+.Lsqr4x_1st:
+	movq	(%rsi,%rcx,1),%rbx
+	mulq	%r15
+	addq	%rax,%r13
+	movq	%rbx,%rax
+	movq	%rdx,%r12
+	adcq	$0,%r12
+
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	movq	8(%rsi,%rcx,1),%rbx
+	movq	%rdx,%r10
+	adcq	$0,%r10
+	addq	%r13,%r11
+	adcq	$0,%r10
+
+
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	movq	%r11,(%rdi,%rcx,1)
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	movq	16(%rsi,%rcx,1),%rbx
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	addq	%r12,%r10
+	adcq	$0,%r11
+
+	mulq	%r15
+	addq	%rax,%r13
+	movq	%rbx,%rax
+	movq	%r10,8(%rdi,%rcx,1)
+	movq	%rdx,%r12
+	adcq	$0,%r12
+
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	movq	24(%rsi,%rcx,1),%rbx
+	movq	%rdx,%r10
+	adcq	$0,%r10
+	addq	%r13,%r11
+	adcq	$0,%r10
+
+
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	movq	%r11,16(%rdi,%rcx,1)
+	movq	%rdx,%r13
+	adcq	$0,%r13
+	leaq	32(%rcx),%rcx
+
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	addq	%r12,%r10
+	adcq	$0,%r11
+	movq	%r10,-8(%rdi,%rcx,1)
+
+	cmpq	$0,%rcx
+	jne	.Lsqr4x_1st
+
+	mulq	%r15
+	addq	%rax,%r13
+	leaq	16(%rbp),%rbp
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+
+	movq	%r13,(%rdi)
+	movq	%rdx,%r12
+	movq	%rdx,8(%rdi)
+	jmp	.Lsqr4x_outer
+
+.align	32
+.Lsqr4x_outer:
+	movq	-32(%rsi,%rbp,1),%r14
+	leaq	48+8(%rsp,%r9,2),%rdi
+	movq	-24(%rsi,%rbp,1),%rax
+	leaq	-32(%rdi,%rbp,1),%rdi
+	movq	-16(%rsi,%rbp,1),%rbx
+	movq	%rax,%r15
+
+	mulq	%r14
+	movq	-24(%rdi,%rbp,1),%r10
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+	movq	%r10,-24(%rdi,%rbp,1)
+	movq	%rdx,%r11
+
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+	addq	-16(%rdi,%rbp,1),%r11
+	movq	%rdx,%r10
+	adcq	$0,%r10
+	movq	%r11,-16(%rdi,%rbp,1)
+
+	xorq	%r12,%r12
+
+	movq	-8(%rsi,%rbp,1),%rbx
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+	addq	-8(%rdi,%rbp,1),%r12
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+	addq	%r12,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	movq	%r10,-8(%rdi,%rbp,1)
+
+	leaq	(%rbp),%rcx
+	jmp	.Lsqr4x_inner
+
+.align	32
+.Lsqr4x_inner:
+	movq	(%rsi,%rcx,1),%rbx
+	mulq	%r15
+	addq	%rax,%r13
+	movq	%rbx,%rax
+	movq	%rdx,%r12
+	adcq	$0,%r12
+	addq	(%rdi,%rcx,1),%r13
+	adcq	$0,%r12
+
+.byte	0x67
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	movq	8(%rsi,%rcx,1),%rbx
+	movq	%rdx,%r10
+	adcq	$0,%r10
+	addq	%r13,%r11
+	adcq	$0,%r10
+
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%r11,(%rdi,%rcx,1)
+	movq	%rbx,%rax
+	movq	%rdx,%r13
+	adcq	$0,%r13
+	addq	8(%rdi,%rcx,1),%r12
+	leaq	16(%rcx),%rcx
+	adcq	$0,%r13
+
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+	addq	%r12,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	movq	%r10,-8(%rdi,%rcx,1)
+
+	cmpq	$0,%rcx
+	jne	.Lsqr4x_inner
+
+.byte	0x67
+	mulq	%r15
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+
+	movq	%r13,(%rdi)
+	movq	%rdx,%r12
+	movq	%rdx,8(%rdi)
+
+	addq	$16,%rbp
+	jnz	.Lsqr4x_outer
+
+
+	movq	-32(%rsi),%r14
+	leaq	48+8(%rsp,%r9,2),%rdi
+	movq	-24(%rsi),%rax
+	leaq	-32(%rdi,%rbp,1),%rdi
+	movq	-16(%rsi),%rbx
+	movq	%rax,%r15
+
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	movq	%rdx,%r11
+	adcq	$0,%r11
+
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	movq	%r10,-24(%rdi)
+	movq	%rdx,%r10
+	adcq	$0,%r10
+	addq	%r13,%r11
+	movq	-8(%rsi),%rbx
+	adcq	$0,%r10
+
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	movq	%r11,-16(%rdi)
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	addq	%r12,%r10
+	adcq	$0,%r11
+	movq	%r10,-8(%rdi)
+
+	mulq	%r15
+	addq	%rax,%r13
+	movq	-16(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+
+	movq	%r13,(%rdi)
+	movq	%rdx,%r12
+	movq	%rdx,8(%rdi)
+
+	mulq	%rbx
+	addq	$16,%rbp
+	xorq	%r14,%r14
+	subq	%r9,%rbp
+	xorq	%r15,%r15
+
+	addq	%r12,%rax
+	adcq	$0,%rdx
+	movq	%rax,8(%rdi)
+	movq	%rdx,16(%rdi)
+	movq	%r15,24(%rdi)
+
+	movq	-16(%rsi,%rbp,1),%rax
+	leaq	48+8(%rsp),%rdi
+	xorq	%r10,%r10
+	movq	8(%rdi),%r11
+
+	leaq	(%r14,%r10,2),%r12
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r13
+	shrq	$63,%r11
+	orq	%r10,%r13
+	movq	16(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	24(%rdi),%r11
+	adcq	%rax,%r12
+	movq	-8(%rsi,%rbp,1),%rax
+	movq	%r12,(%rdi)
+	adcq	%rdx,%r13
+
+	leaq	(%r14,%r10,2),%rbx
+	movq	%r13,8(%rdi)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r8
+	shrq	$63,%r11
+	orq	%r10,%r8
+	movq	32(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	40(%rdi),%r11
+	adcq	%rax,%rbx
+	movq	0(%rsi,%rbp,1),%rax
+	movq	%rbx,16(%rdi)
+	adcq	%rdx,%r8
+	leaq	16(%rbp),%rbp
+	movq	%r8,24(%rdi)
+	sbbq	%r15,%r15
+	leaq	64(%rdi),%rdi
+	jmp	.Lsqr4x_shift_n_add
+
+.align	32
+.Lsqr4x_shift_n_add:
+	leaq	(%r14,%r10,2),%r12
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r13
+	shrq	$63,%r11
+	orq	%r10,%r13
+	movq	-16(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	-8(%rdi),%r11
+	adcq	%rax,%r12
+	movq	-8(%rsi,%rbp,1),%rax
+	movq	%r12,-32(%rdi)
+	adcq	%rdx,%r13
+
+	leaq	(%r14,%r10,2),%rbx
+	movq	%r13,-24(%rdi)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r8
+	shrq	$63,%r11
+	orq	%r10,%r8
+	movq	0(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	8(%rdi),%r11
+	adcq	%rax,%rbx
+	movq	0(%rsi,%rbp,1),%rax
+	movq	%rbx,-16(%rdi)
+	adcq	%rdx,%r8
+
+	leaq	(%r14,%r10,2),%r12
+	movq	%r8,-8(%rdi)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r13
+	shrq	$63,%r11
+	orq	%r10,%r13
+	movq	16(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	24(%rdi),%r11
+	adcq	%rax,%r12
+	movq	8(%rsi,%rbp,1),%rax
+	movq	%r12,0(%rdi)
+	adcq	%rdx,%r13
+
+	leaq	(%r14,%r10,2),%rbx
+	movq	%r13,8(%rdi)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r8
+	shrq	$63,%r11
+	orq	%r10,%r8
+	movq	32(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	40(%rdi),%r11
+	adcq	%rax,%rbx
+	movq	16(%rsi,%rbp,1),%rax
+	movq	%rbx,16(%rdi)
+	adcq	%rdx,%r8
+	movq	%r8,24(%rdi)
+	sbbq	%r15,%r15
+	leaq	64(%rdi),%rdi
+	addq	$32,%rbp
+	jnz	.Lsqr4x_shift_n_add
+
+	leaq	(%r14,%r10,2),%r12
+.byte	0x67
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r13
+	shrq	$63,%r11
+	orq	%r10,%r13
+	movq	-16(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	-8(%rdi),%r11
+	adcq	%rax,%r12
+	movq	-8(%rsi),%rax
+	movq	%r12,-32(%rdi)
+	adcq	%rdx,%r13
+
+	leaq	(%r14,%r10,2),%rbx
+	movq	%r13,-24(%rdi)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r8
+	shrq	$63,%r11
+	orq	%r10,%r8
+	mulq	%rax
+	negq	%r15
+	adcq	%rax,%rbx
+	adcq	%rdx,%r8
+	movq	%rbx,-16(%rdi)
+	movq	%r8,-8(%rdi)
+.byte	102,72,15,126,213
+__bn_sqr8x_reduction:
+	xorq	%rax,%rax
+	leaq	(%r9,%rbp,1),%rcx
+	leaq	48+8(%rsp,%r9,2),%rdx
+	movq	%rcx,0+8(%rsp)
+	leaq	48+8(%rsp,%r9,1),%rdi
+	movq	%rdx,8+8(%rsp)
+	negq	%r9
+	jmp	.L8x_reduction_loop
+
+.align	32
+.L8x_reduction_loop:
+	leaq	(%rdi,%r9,1),%rdi
+.byte	0x66
+	movq	0(%rdi),%rbx
+	movq	8(%rdi),%r9
+	movq	16(%rdi),%r10
+	movq	24(%rdi),%r11
+	movq	32(%rdi),%r12
+	movq	40(%rdi),%r13
+	movq	48(%rdi),%r14
+	movq	56(%rdi),%r15
+	movq	%rax,(%rdx)
+	leaq	64(%rdi),%rdi
+
+.byte	0x67
+	movq	%rbx,%r8
+	imulq	32+8(%rsp),%rbx
+	movq	0(%rbp),%rax
+	movl	$8,%ecx
+	jmp	.L8x_reduce
+
+.align	32
+.L8x_reduce:
+	mulq	%rbx
+	movq	8(%rbp),%rax
+	negq	%r8
+	movq	%rdx,%r8
+	adcq	$0,%r8
+
+	mulq	%rbx
+	addq	%rax,%r9
+	movq	16(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r9,%r8
+	movq	%rbx,48-8+8(%rsp,%rcx,8)
+	movq	%rdx,%r9
+	adcq	$0,%r9
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	24(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r9
+	movq	32+8(%rsp),%rsi
+	movq	%rdx,%r10
+	adcq	$0,%r10
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	32(%rbp),%rax
+	adcq	$0,%rdx
+	imulq	%r8,%rsi
+	addq	%r11,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+
+	mulq	%rbx
+	addq	%rax,%r12
+	movq	40(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r12,%r11
+	movq	%rdx,%r12
+	adcq	$0,%r12
+
+	mulq	%rbx
+	addq	%rax,%r13
+	movq	48(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r13,%r12
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+	mulq	%rbx
+	addq	%rax,%r14
+	movq	56(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r14,%r13
+	movq	%rdx,%r14
+	adcq	$0,%r14
+
+	mulq	%rbx
+	movq	%rsi,%rbx
+	addq	%rax,%r15
+	movq	0(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r15,%r14
+	movq	%rdx,%r15
+	adcq	$0,%r15
+
+	decl	%ecx
+	jnz	.L8x_reduce
+
+	leaq	64(%rbp),%rbp
+	xorq	%rax,%rax
+	movq	8+8(%rsp),%rdx
+	cmpq	0+8(%rsp),%rbp
+	jae	.L8x_no_tail
+
+.byte	0x66
+	addq	0(%rdi),%r8
+	adcq	8(%rdi),%r9
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	sbbq	%rsi,%rsi
+
+	movq	48+56+8(%rsp),%rbx
+	movl	$8,%ecx
+	movq	0(%rbp),%rax
+	jmp	.L8x_tail
+
+.align	32
+.L8x_tail:
+	mulq	%rbx
+	addq	%rax,%r8
+	movq	8(%rbp),%rax
+	movq	%r8,(%rdi)
+	movq	%rdx,%r8
+	adcq	$0,%r8
+
+	mulq	%rbx
+	addq	%rax,%r9
+	movq	16(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r9,%r8
+	leaq	8(%rdi),%rdi
+	movq	%rdx,%r9
+	adcq	$0,%r9
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	24(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r9
+	movq	%rdx,%r10
+	adcq	$0,%r10
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	32(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+
+	mulq	%rbx
+	addq	%rax,%r12
+	movq	40(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r12,%r11
+	movq	%rdx,%r12
+	adcq	$0,%r12
+
+	mulq	%rbx
+	addq	%rax,%r13
+	movq	48(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r13,%r12
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+	mulq	%rbx
+	addq	%rax,%r14
+	movq	56(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r14,%r13
+	movq	%rdx,%r14
+	adcq	$0,%r14
+
+	mulq	%rbx
+	movq	48-16+8(%rsp,%rcx,8),%rbx
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	addq	%r15,%r14
+	movq	0(%rbp),%rax
+	movq	%rdx,%r15
+	adcq	$0,%r15
+
+	decl	%ecx
+	jnz	.L8x_tail
+
+	leaq	64(%rbp),%rbp
+	movq	8+8(%rsp),%rdx
+	cmpq	0+8(%rsp),%rbp
+	jae	.L8x_tail_done
+
+	movq	48+56+8(%rsp),%rbx
+	negq	%rsi
+	movq	0(%rbp),%rax
+	adcq	0(%rdi),%r8
+	adcq	8(%rdi),%r9
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	sbbq	%rsi,%rsi
+
+	movl	$8,%ecx
+	jmp	.L8x_tail
+
+.align	32
+.L8x_tail_done:
+	xorq	%rax,%rax
+	addq	(%rdx),%r8
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%r13
+	adcq	$0,%r14
+	adcq	$0,%r15
+	adcq	$0,%rax
+
+	negq	%rsi
+.L8x_no_tail:
+	adcq	0(%rdi),%r8
+	adcq	8(%rdi),%r9
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	adcq	$0,%rax
+	movq	-8(%rbp),%rcx
+	xorq	%rsi,%rsi
+
+.byte	102,72,15,126,213
+
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+.byte	102,73,15,126,217
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+	movq	%r14,48(%rdi)
+	movq	%r15,56(%rdi)
+	leaq	64(%rdi),%rdi
+
+	cmpq	%rdx,%rdi
+	jb	.L8x_reduction_loop
+	ret
+.cfi_endproc	
+.size	bn_sqr8x_internal,.-bn_sqr8x_internal
+.type	__bn_post4x_internal,@function
+.align	32
+__bn_post4x_internal:
+.cfi_startproc	
+	movq	0(%rbp),%r12
+	leaq	(%rdi,%r9,1),%rbx
+	movq	%r9,%rcx
+.byte	102,72,15,126,207
+	negq	%rax
+.byte	102,72,15,126,206
+	sarq	$3+2,%rcx
+	decq	%r12
+	xorq	%r10,%r10
+	movq	8(%rbp),%r13
+	movq	16(%rbp),%r14
+	movq	24(%rbp),%r15
+	jmp	.Lsqr4x_sub_entry
+
+.align	16
+.Lsqr4x_sub:
+	movq	0(%rbp),%r12
+	movq	8(%rbp),%r13
+	movq	16(%rbp),%r14
+	movq	24(%rbp),%r15
+.Lsqr4x_sub_entry:
+	leaq	32(%rbp),%rbp
+	notq	%r12
+	notq	%r13
+	notq	%r14
+	notq	%r15
+	andq	%rax,%r12
+	andq	%rax,%r13
+	andq	%rax,%r14
+	andq	%rax,%r15
+
+	negq	%r10
+	adcq	0(%rbx),%r12
+	adcq	8(%rbx),%r13
+	adcq	16(%rbx),%r14
+	adcq	24(%rbx),%r15
+	movq	%r12,0(%rdi)
+	leaq	32(%rbx),%rbx
+	movq	%r13,8(%rdi)
+	sbbq	%r10,%r10
+	movq	%r14,16(%rdi)
+	movq	%r15,24(%rdi)
+	leaq	32(%rdi),%rdi
+
+	incq	%rcx
+	jnz	.Lsqr4x_sub
+
+	movq	%r9,%r10
+	negq	%r9
+	ret
+.cfi_endproc	
+.size	__bn_post4x_internal,.-__bn_post4x_internal
+.type	bn_mulx4x_mont_gather5,@function
+.align	32
+bn_mulx4x_mont_gather5:
+.cfi_startproc	
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+.Lmulx4x_enter:
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+.Lmulx4x_prologue:
+
+	shll	$3,%r9d
+	leaq	(%r9,%r9,2),%r10
+	negq	%r9
+	movq	(%r8),%r8
+
+
+
+
+
+
+
+
+
+
+	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
+	subq	%rdi,%r11
+	andq	$4095,%r11
+	cmpq	%r11,%r10
+	jb	.Lmulx4xsp_alt
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
+	jmp	.Lmulx4xsp_done
+
+.Lmulx4xsp_alt:
+	leaq	4096-320(,%r9,2),%r10
+	leaq	-320(%rbp,%r9,2),%rbp
+	subq	%r10,%r11
+	movq	$0,%r10
+	cmovcq	%r10,%r11
+	subq	%r11,%rbp
+.Lmulx4xsp_done:
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lmulx4x_page_walk
+	jmp	.Lmulx4x_page_walk_done
+
+.Lmulx4x_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
+
+
+
+
+
+
+
+
+
+
+
+
+
+	movq	%r8,32(%rsp)
+	movq	%rax,40(%rsp)
+.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+.Lmulx4x_body:
+	call	mulx4x_internal
+
+	movq	40(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	$1,%rax
+
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lmulx4x_epilogue:
+	ret
+.cfi_endproc	
+.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
+
+.type	mulx4x_internal,@function
+.align	32
+mulx4x_internal:
+.cfi_startproc	
+	movq	%r9,8(%rsp)
+	movq	%r9,%r10
+	negq	%r9
+	shlq	$5,%r9
+	negq	%r10
+	leaq	128(%rdx,%r9,1),%r13
+	shrq	$5+5,%r9
+	movd	8(%rax),%xmm5
+	subq	$1,%r9
+	leaq	.Linc(%rip),%rax
+	movq	%r13,16+8(%rsp)
+	movq	%r9,24+8(%rsp)
+	movq	%rdi,56+8(%rsp)
+	movdqa	0(%rax),%xmm0
+	movdqa	16(%rax),%xmm1
+	leaq	88-112(%rsp,%r10,1),%r10
+	leaq	128(%rdx),%rdi
+
+	pshufd	$0,%xmm5,%xmm5
+	movdqa	%xmm1,%xmm4
+.byte	0x67
+	movdqa	%xmm1,%xmm2
+.byte	0x67
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,112(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,128(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,144(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,160(%r10)
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,176(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,192(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,208(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,224(%r10)
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,240(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,256(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,272(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,288(%r10)
+	movdqa	%xmm4,%xmm3
+.byte	0x67
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,304(%r10)
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,320(%r10)
+
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,336(%r10)
+
+	pand	64(%rdi),%xmm0
+	pand	80(%rdi),%xmm1
+	pand	96(%rdi),%xmm2
+	movdqa	%xmm3,352(%r10)
+	pand	112(%rdi),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	-128(%rdi),%xmm4
+	movdqa	-112(%rdi),%xmm5
+	movdqa	-96(%rdi),%xmm2
+	pand	112(%r10),%xmm4
+	movdqa	-80(%rdi),%xmm3
+	pand	128(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	144(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	160(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	-64(%rdi),%xmm4
+	movdqa	-48(%rdi),%xmm5
+	movdqa	-32(%rdi),%xmm2
+	pand	176(%r10),%xmm4
+	movdqa	-16(%rdi),%xmm3
+	pand	192(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	208(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	224(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	0(%rdi),%xmm4
+	movdqa	16(%rdi),%xmm5
+	movdqa	32(%rdi),%xmm2
+	pand	240(%r10),%xmm4
+	movdqa	48(%rdi),%xmm3
+	pand	256(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	272(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	288(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	pxor	%xmm1,%xmm0
+
+	pshufd	$0x4e,%xmm0,%xmm1
+	por	%xmm1,%xmm0
+	leaq	256(%rdi),%rdi
+.byte	102,72,15,126,194
+	leaq	64+32+8(%rsp),%rbx
+
+	movq	%rdx,%r9
+	mulxq	0(%rsi),%r8,%rax
+	mulxq	8(%rsi),%r11,%r12
+	addq	%rax,%r11
+	mulxq	16(%rsi),%rax,%r13
+	adcq	%rax,%r12
+	adcq	$0,%r13
+	mulxq	24(%rsi),%rax,%r14
+
+	movq	%r8,%r15
+	imulq	32+8(%rsp),%r8
+	xorq	%rbp,%rbp
+	movq	%r8,%rdx
+
+	movq	%rdi,8+8(%rsp)
+
+	leaq	32(%rsi),%rsi
+	adcxq	%rax,%r13
+	adcxq	%rbp,%r14
+
+	mulxq	0(%rcx),%rax,%r10
+	adcxq	%rax,%r15
+	adoxq	%r11,%r10
+	mulxq	8(%rcx),%rax,%r11
+	adcxq	%rax,%r10
+	adoxq	%r12,%r11
+	mulxq	16(%rcx),%rax,%r12
+	movq	24+8(%rsp),%rdi
+	movq	%r10,-32(%rbx)
+	adcxq	%rax,%r11
+	adoxq	%r13,%r12
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	%r11,-24(%rbx)
+	adcxq	%rax,%r12
+	adoxq	%rbp,%r15
+	leaq	32(%rcx),%rcx
+	movq	%r12,-16(%rbx)
+	jmp	.Lmulx4x_1st
+
+.align	32
+.Lmulx4x_1st:
+	adcxq	%rbp,%r15
+	mulxq	0(%rsi),%r10,%rax
+	adcxq	%r14,%r10
+	mulxq	8(%rsi),%r11,%r14
+	adcxq	%rax,%r11
+	mulxq	16(%rsi),%r12,%rax
+	adcxq	%r14,%r12
+	mulxq	24(%rsi),%r13,%r14
+.byte	0x67,0x67
+	movq	%r8,%rdx
+	adcxq	%rax,%r13
+	adcxq	%rbp,%r14
+	leaq	32(%rsi),%rsi
+	leaq	32(%rbx),%rbx
+
+	adoxq	%r15,%r10
+	mulxq	0(%rcx),%rax,%r15
+	adcxq	%rax,%r10
+	adoxq	%r15,%r11
+	mulxq	8(%rcx),%rax,%r15
+	adcxq	%rax,%r11
+	adoxq	%r15,%r12
+	mulxq	16(%rcx),%rax,%r15
+	movq	%r10,-40(%rbx)
+	adcxq	%rax,%r12
+	movq	%r11,-32(%rbx)
+	adoxq	%r15,%r13
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	%r12,-24(%rbx)
+	adcxq	%rax,%r13
+	adoxq	%rbp,%r15
+	leaq	32(%rcx),%rcx
+	movq	%r13,-16(%rbx)
+
+	decq	%rdi
+	jnz	.Lmulx4x_1st
+
+	movq	8(%rsp),%rax
+	adcq	%rbp,%r15
+	leaq	(%rsi,%rax,1),%rsi
+	addq	%r15,%r14
+	movq	8+8(%rsp),%rdi
+	adcq	%rbp,%rbp
+	movq	%r14,-8(%rbx)
+	jmp	.Lmulx4x_outer
+
+.align	32
+.Lmulx4x_outer:
+	leaq	16-256(%rbx),%r10
+	pxor	%xmm4,%xmm4
+.byte	0x67,0x67
+	pxor	%xmm5,%xmm5
+	movdqa	-128(%rdi),%xmm0
+	movdqa	-112(%rdi),%xmm1
+	movdqa	-96(%rdi),%xmm2
+	pand	256(%r10),%xmm0
+	movdqa	-80(%rdi),%xmm3
+	pand	272(%r10),%xmm1
+	por	%xmm0,%xmm4
+	pand	288(%r10),%xmm2
+	por	%xmm1,%xmm5
+	pand	304(%r10),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	-64(%rdi),%xmm0
+	movdqa	-48(%rdi),%xmm1
+	movdqa	-32(%rdi),%xmm2
+	pand	320(%r10),%xmm0
+	movdqa	-16(%rdi),%xmm3
+	pand	336(%r10),%xmm1
+	por	%xmm0,%xmm4
+	pand	352(%r10),%xmm2
+	por	%xmm1,%xmm5
+	pand	368(%r10),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	0(%rdi),%xmm0
+	movdqa	16(%rdi),%xmm1
+	movdqa	32(%rdi),%xmm2
+	pand	384(%r10),%xmm0
+	movdqa	48(%rdi),%xmm3
+	pand	400(%r10),%xmm1
+	por	%xmm0,%xmm4
+	pand	416(%r10),%xmm2
+	por	%xmm1,%xmm5
+	pand	432(%r10),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	64(%rdi),%xmm0
+	movdqa	80(%rdi),%xmm1
+	movdqa	96(%rdi),%xmm2
+	pand	448(%r10),%xmm0
+	movdqa	112(%rdi),%xmm3
+	pand	464(%r10),%xmm1
+	por	%xmm0,%xmm4
+	pand	480(%r10),%xmm2
+	por	%xmm1,%xmm5
+	pand	496(%r10),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	por	%xmm5,%xmm4
+
+	pshufd	$0x4e,%xmm4,%xmm0
+	por	%xmm4,%xmm0
+	leaq	256(%rdi),%rdi
+.byte	102,72,15,126,194
+
+	movq	%rbp,(%rbx)
+	leaq	32(%rbx,%rax,1),%rbx
+	mulxq	0(%rsi),%r8,%r11
+	xorq	%rbp,%rbp
+	movq	%rdx,%r9
+	mulxq	8(%rsi),%r14,%r12
+	adoxq	-32(%rbx),%r8
+	adcxq	%r14,%r11
+	mulxq	16(%rsi),%r15,%r13
+	adoxq	-24(%rbx),%r11
+	adcxq	%r15,%r12
+	mulxq	24(%rsi),%rdx,%r14
+	adoxq	-16(%rbx),%r12
+	adcxq	%rdx,%r13
+	leaq	(%rcx,%rax,1),%rcx
+	leaq	32(%rsi),%rsi
+	adoxq	-8(%rbx),%r13
+	adcxq	%rbp,%r14
+	adoxq	%rbp,%r14
+
+	movq	%r8,%r15
+	imulq	32+8(%rsp),%r8
+
+	movq	%r8,%rdx
+	xorq	%rbp,%rbp
+	movq	%rdi,8+8(%rsp)
+
+	mulxq	0(%rcx),%rax,%r10
+	adcxq	%rax,%r15
+	adoxq	%r11,%r10
+	mulxq	8(%rcx),%rax,%r11
+	adcxq	%rax,%r10
+	adoxq	%r12,%r11
+	mulxq	16(%rcx),%rax,%r12
+	adcxq	%rax,%r11
+	adoxq	%r13,%r12
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	24+8(%rsp),%rdi
+	movq	%r10,-32(%rbx)
+	adcxq	%rax,%r12
+	movq	%r11,-24(%rbx)
+	adoxq	%rbp,%r15
+	movq	%r12,-16(%rbx)
+	leaq	32(%rcx),%rcx
+	jmp	.Lmulx4x_inner
+
+.align	32
+.Lmulx4x_inner:
+	mulxq	0(%rsi),%r10,%rax
+	adcxq	%rbp,%r15
+	adoxq	%r14,%r10
+	mulxq	8(%rsi),%r11,%r14
+	adcxq	0(%rbx),%r10
+	adoxq	%rax,%r11
+	mulxq	16(%rsi),%r12,%rax
+	adcxq	8(%rbx),%r11
+	adoxq	%r14,%r12
+	mulxq	24(%rsi),%r13,%r14
+	movq	%r8,%rdx
+	adcxq	16(%rbx),%r12
+	adoxq	%rax,%r13
+	adcxq	24(%rbx),%r13
+	adoxq	%rbp,%r14
+	leaq	32(%rsi),%rsi
+	leaq	32(%rbx),%rbx
+	adcxq	%rbp,%r14
+
+	adoxq	%r15,%r10
+	mulxq	0(%rcx),%rax,%r15
+	adcxq	%rax,%r10
+	adoxq	%r15,%r11
+	mulxq	8(%rcx),%rax,%r15
+	adcxq	%rax,%r11
+	adoxq	%r15,%r12
+	mulxq	16(%rcx),%rax,%r15
+	movq	%r10,-40(%rbx)
+	adcxq	%rax,%r12
+	adoxq	%r15,%r13
+	movq	%r11,-32(%rbx)
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	leaq	32(%rcx),%rcx
+	movq	%r12,-24(%rbx)
+	adcxq	%rax,%r13
+	adoxq	%rbp,%r15
+	movq	%r13,-16(%rbx)
+
+	decq	%rdi
+	jnz	.Lmulx4x_inner
+
+	movq	0+8(%rsp),%rax
+	adcq	%rbp,%r15
+	subq	0(%rbx),%rdi
+	movq	8+8(%rsp),%rdi
+	movq	16+8(%rsp),%r10
+	adcq	%r15,%r14
+	leaq	(%rsi,%rax,1),%rsi
+	adcq	%rbp,%rbp
+	movq	%r14,-8(%rbx)
+
+	cmpq	%r10,%rdi
+	jb	.Lmulx4x_outer
+
+	movq	-8(%rcx),%r10
+	movq	%rbp,%r8
+	movq	(%rcx,%rax,1),%r12
+	leaq	(%rcx,%rax,1),%rbp
+	movq	%rax,%rcx
+	leaq	(%rbx,%rax,1),%rdi
+	xorl	%eax,%eax
+	xorq	%r15,%r15
+	subq	%r14,%r10
+	adcq	%r15,%r15
+	orq	%r15,%r8
+	sarq	$3+2,%rcx
+	subq	%r8,%rax
+	movq	56+8(%rsp),%rdx
+	decq	%r12
+	movq	8(%rbp),%r13
+	xorq	%r8,%r8
+	movq	16(%rbp),%r14
+	movq	24(%rbp),%r15
+	jmp	.Lsqrx4x_sub_entry
+.cfi_endproc	
+.size	mulx4x_internal,.-mulx4x_internal
+.type	bn_powerx5,@function
+.align	32
+bn_powerx5:
+.cfi_startproc	
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+.Lpowerx5_enter:
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+.Lpowerx5_prologue:
+
+	shll	$3,%r9d
+	leaq	(%r9,%r9,2),%r10
+	negq	%r9
+	movq	(%r8),%r8
+
+
+
+
+
+
+
+
+	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
+	subq	%rdi,%r11
+	andq	$4095,%r11
+	cmpq	%r11,%r10
+	jb	.Lpwrx_sp_alt
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
+	jmp	.Lpwrx_sp_done
+
+.align	32
+.Lpwrx_sp_alt:
+	leaq	4096-320(,%r9,2),%r10
+	leaq	-320(%rbp,%r9,2),%rbp
+	subq	%r10,%r11
+	movq	$0,%r10
+	cmovcq	%r10,%r11
+	subq	%r11,%rbp
+.Lpwrx_sp_done:
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lpwrx_page_walk
+	jmp	.Lpwrx_page_walk_done
+
+.Lpwrx_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lpwrx_page_walk
+.Lpwrx_page_walk_done:
+
+	movq	%r9,%r10
+	negq	%r9
+
+
+
+
+
+
+
+
+
+
+
+
+	pxor	%xmm0,%xmm0
+.byte	102,72,15,110,207
+.byte	102,72,15,110,209
+.byte	102,73,15,110,218
+.byte	102,72,15,110,226
+	movq	%r8,32(%rsp)
+	movq	%rax,40(%rsp)
+.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+.Lpowerx5_body:
+
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+
+	movq	%r10,%r9
+	movq	%rsi,%rdi
+.byte	102,72,15,126,209
+.byte	102,72,15,126,226
+	movq	40(%rsp),%rax
+
+	call	mulx4x_internal
+
+	movq	40(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	$1,%rax
+
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lpowerx5_epilogue:
+	ret
+.cfi_endproc	
+.size	bn_powerx5,.-bn_powerx5
+
+.globl	bn_sqrx8x_internal
+.hidden bn_sqrx8x_internal
+.hidden	bn_sqrx8x_internal
+.type	bn_sqrx8x_internal,@function
+.align	32
+bn_sqrx8x_internal:
+__bn_sqrx8x_internal:
+.cfi_startproc	
+_CET_ENDBR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	leaq	48+8(%rsp),%rdi
+	leaq	(%rsi,%r9,1),%rbp
+	movq	%r9,0+8(%rsp)
+	movq	%rbp,8+8(%rsp)
+	jmp	.Lsqr8x_zero_start
+
+.align	32
+.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
+.Lsqrx8x_zero:
+.byte	0x3e
+	movdqa	%xmm0,0(%rdi)
+	movdqa	%xmm0,16(%rdi)
+	movdqa	%xmm0,32(%rdi)
+	movdqa	%xmm0,48(%rdi)
+.Lsqr8x_zero_start:
+	movdqa	%xmm0,64(%rdi)
+	movdqa	%xmm0,80(%rdi)
+	movdqa	%xmm0,96(%rdi)
+	movdqa	%xmm0,112(%rdi)
+	leaq	128(%rdi),%rdi
+	subq	$64,%r9
+	jnz	.Lsqrx8x_zero
+
+	movq	0(%rsi),%rdx
+
+	xorq	%r10,%r10
+	xorq	%r11,%r11
+	xorq	%r12,%r12
+	xorq	%r13,%r13
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+	leaq	48+8(%rsp),%rdi
+	xorq	%rbp,%rbp
+	jmp	.Lsqrx8x_outer_loop
+
+.align	32
+.Lsqrx8x_outer_loop:
+	mulxq	8(%rsi),%r8,%rax
+	adcxq	%r9,%r8
+	adoxq	%rax,%r10
+	mulxq	16(%rsi),%r9,%rax
+	adcxq	%r10,%r9
+	adoxq	%rax,%r11
+.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
+	adcxq	%r11,%r10
+	adoxq	%rax,%r12
+.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
+	adcxq	%r12,%r11
+	adoxq	%rax,%r13
+	mulxq	40(%rsi),%r12,%rax
+	adcxq	%r13,%r12
+	adoxq	%rax,%r14
+	mulxq	48(%rsi),%r13,%rax
+	adcxq	%r14,%r13
+	adoxq	%r15,%rax
+	mulxq	56(%rsi),%r14,%r15
+	movq	8(%rsi),%rdx
+	adcxq	%rax,%r14
+	adoxq	%rbp,%r15
+	adcq	64(%rdi),%r15
+	movq	%r8,8(%rdi)
+	movq	%r9,16(%rdi)
+	sbbq	%rcx,%rcx
+	xorq	%rbp,%rbp
+
+
+	mulxq	16(%rsi),%r8,%rbx
+	mulxq	24(%rsi),%r9,%rax
+	adcxq	%r10,%r8
+	adoxq	%rbx,%r9
+	mulxq	32(%rsi),%r10,%rbx
+	adcxq	%r11,%r9
+	adoxq	%rax,%r10
+.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
+	adcxq	%r12,%r10
+	adoxq	%rbx,%r11
+.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
+	adcxq	%r13,%r11
+	adoxq	%r14,%r12
+.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
+	movq	16(%rsi),%rdx
+	adcxq	%rax,%r12
+	adoxq	%rbx,%r13
+	adcxq	%r15,%r13
+	adoxq	%rbp,%r14
+	adcxq	%rbp,%r14
+
+	movq	%r8,24(%rdi)
+	movq	%r9,32(%rdi)
+
+	mulxq	24(%rsi),%r8,%rbx
+	mulxq	32(%rsi),%r9,%rax
+	adcxq	%r10,%r8
+	adoxq	%rbx,%r9
+	mulxq	40(%rsi),%r10,%rbx
+	adcxq	%r11,%r9
+	adoxq	%rax,%r10
+.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
+	adcxq	%r12,%r10
+	adoxq	%r13,%r11
+.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
+.byte	0x3e
+	movq	24(%rsi),%rdx
+	adcxq	%rbx,%r11
+	adoxq	%rax,%r12
+	adcxq	%r14,%r12
+	movq	%r8,40(%rdi)
+	movq	%r9,48(%rdi)
+	mulxq	32(%rsi),%r8,%rax
+	adoxq	%rbp,%r13
+	adcxq	%rbp,%r13
+
+	mulxq	40(%rsi),%r9,%rbx
+	adcxq	%r10,%r8
+	adoxq	%rax,%r9
+	mulxq	48(%rsi),%r10,%rax
+	adcxq	%r11,%r9
+	adoxq	%r12,%r10
+	mulxq	56(%rsi),%r11,%r12
+	movq	32(%rsi),%rdx
+	movq	40(%rsi),%r14
+	adcxq	%rbx,%r10
+	adoxq	%rax,%r11
+	movq	48(%rsi),%r15
+	adcxq	%r13,%r11
+	adoxq	%rbp,%r12
+	adcxq	%rbp,%r12
+
+	movq	%r8,56(%rdi)
+	movq	%r9,64(%rdi)
+
+	mulxq	%r14,%r9,%rax
+	movq	56(%rsi),%r8
+	adcxq	%r10,%r9
+	mulxq	%r15,%r10,%rbx
+	adoxq	%rax,%r10
+	adcxq	%r11,%r10
+	mulxq	%r8,%r11,%rax
+	movq	%r14,%rdx
+	adoxq	%rbx,%r11
+	adcxq	%r12,%r11
+
+	adcxq	%rbp,%rax
+
+	mulxq	%r15,%r14,%rbx
+	mulxq	%r8,%r12,%r13
+	movq	%r15,%rdx
+	leaq	64(%rsi),%rsi
+	adcxq	%r14,%r11
+	adoxq	%rbx,%r12
+	adcxq	%rax,%r12
+	adoxq	%rbp,%r13
+
+.byte	0x67,0x67
+	mulxq	%r8,%r8,%r14
+	adcxq	%r8,%r13
+	adcxq	%rbp,%r14
+
+	cmpq	8+8(%rsp),%rsi
+	je	.Lsqrx8x_outer_break
+
+	negq	%rcx
+	movq	$-8,%rcx
+	movq	%rbp,%r15
+	movq	64(%rdi),%r8
+	adcxq	72(%rdi),%r9
+	adcxq	80(%rdi),%r10
+	adcxq	88(%rdi),%r11
+	adcq	96(%rdi),%r12
+	adcq	104(%rdi),%r13
+	adcq	112(%rdi),%r14
+	adcq	120(%rdi),%r15
+	leaq	(%rsi),%rbp
+	leaq	128(%rdi),%rdi
+	sbbq	%rax,%rax
+
+	movq	-64(%rsi),%rdx
+	movq	%rax,16+8(%rsp)
+	movq	%rdi,24+8(%rsp)
+
+
+	xorl	%eax,%eax
+	jmp	.Lsqrx8x_loop
+
+.align	32
+.Lsqrx8x_loop:
+	movq	%r8,%rbx
+	mulxq	0(%rbp),%rax,%r8
+	adcxq	%rax,%rbx
+	adoxq	%r9,%r8
+
+	mulxq	8(%rbp),%rax,%r9
+	adcxq	%rax,%r8
+	adoxq	%r10,%r9
+
+	mulxq	16(%rbp),%rax,%r10
+	adcxq	%rax,%r9
+	adoxq	%r11,%r10
+
+	mulxq	24(%rbp),%rax,%r11
+	adcxq	%rax,%r10
+	adoxq	%r12,%r11
+
+.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
+	adcxq	%rax,%r11
+	adoxq	%r13,%r12
+
+	mulxq	40(%rbp),%rax,%r13
+	adcxq	%rax,%r12
+	adoxq	%r14,%r13
+
+	mulxq	48(%rbp),%rax,%r14
+	movq	%rbx,(%rdi,%rcx,8)
+	movl	$0,%ebx
+	adcxq	%rax,%r13
+	adoxq	%r15,%r14
+
+.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
+	movq	8(%rsi,%rcx,8),%rdx
+	adcxq	%rax,%r14
+	adoxq	%rbx,%r15
+	adcxq	%rbx,%r15
+
+.byte	0x67
+	incq	%rcx
+	jnz	.Lsqrx8x_loop
+
+	leaq	64(%rbp),%rbp
+	movq	$-8,%rcx
+	cmpq	8+8(%rsp),%rbp
+	je	.Lsqrx8x_break
+
+	subq	16+8(%rsp),%rbx
+.byte	0x66
+	movq	-64(%rsi),%rdx
+	adcxq	0(%rdi),%r8
+	adcxq	8(%rdi),%r9
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	leaq	64(%rdi),%rdi
+.byte	0x67
+	sbbq	%rax,%rax
+	xorl	%ebx,%ebx
+	movq	%rax,16+8(%rsp)
+	jmp	.Lsqrx8x_loop
+
+.align	32
+.Lsqrx8x_break:
+	xorq	%rbp,%rbp
+	subq	16+8(%rsp),%rbx
+	adcxq	%rbp,%r8
+	movq	24+8(%rsp),%rcx
+	adcxq	%rbp,%r9
+	movq	0(%rsi),%rdx
+	adcq	$0,%r10
+	movq	%r8,0(%rdi)
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%r13
+	adcq	$0,%r14
+	adcq	$0,%r15
+	cmpq	%rcx,%rdi
+	je	.Lsqrx8x_outer_loop
+
+	movq	%r9,8(%rdi)
+	movq	8(%rcx),%r9
+	movq	%r10,16(%rdi)
+	movq	16(%rcx),%r10
+	movq	%r11,24(%rdi)
+	movq	24(%rcx),%r11
+	movq	%r12,32(%rdi)
+	movq	32(%rcx),%r12
+	movq	%r13,40(%rdi)
+	movq	40(%rcx),%r13
+	movq	%r14,48(%rdi)
+	movq	48(%rcx),%r14
+	movq	%r15,56(%rdi)
+	movq	56(%rcx),%r15
+	movq	%rcx,%rdi
+	jmp	.Lsqrx8x_outer_loop
+
+.align	32
+.Lsqrx8x_outer_break:
+	movq	%r9,72(%rdi)
+.byte	102,72,15,126,217
+	movq	%r10,80(%rdi)
+	movq	%r11,88(%rdi)
+	movq	%r12,96(%rdi)
+	movq	%r13,104(%rdi)
+	movq	%r14,112(%rdi)
+	leaq	48+8(%rsp),%rdi
+	movq	(%rsi,%rcx,1),%rdx
+
+	movq	8(%rdi),%r11
+	xorq	%r10,%r10
+	movq	0+8(%rsp),%r9
+	adoxq	%r11,%r11
+	movq	16(%rdi),%r12
+	movq	24(%rdi),%r13
+
+
+.align	32
+.Lsqrx4x_shift_n_add:
+	mulxq	%rdx,%rax,%rbx
+	adoxq	%r12,%r12
+	adcxq	%r10,%rax
+.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
+.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
+	adoxq	%r13,%r13
+	adcxq	%r11,%rbx
+	movq	40(%rdi),%r11
+	movq	%rax,0(%rdi)
+	movq	%rbx,8(%rdi)
+
+	mulxq	%rdx,%rax,%rbx
+	adoxq	%r10,%r10
+	adcxq	%r12,%rax
+	movq	16(%rsi,%rcx,1),%rdx
+	movq	48(%rdi),%r12
+	adoxq	%r11,%r11
+	adcxq	%r13,%rbx
+	movq	56(%rdi),%r13
+	movq	%rax,16(%rdi)
+	movq	%rbx,24(%rdi)
+
+	mulxq	%rdx,%rax,%rbx
+	adoxq	%r12,%r12
+	adcxq	%r10,%rax
+	movq	24(%rsi,%rcx,1),%rdx
+	leaq	32(%rcx),%rcx
+	movq	64(%rdi),%r10
+	adoxq	%r13,%r13
+	adcxq	%r11,%rbx
+	movq	72(%rdi),%r11
+	movq	%rax,32(%rdi)
+	movq	%rbx,40(%rdi)
+
+	mulxq	%rdx,%rax,%rbx
+	adoxq	%r10,%r10
+	adcxq	%r12,%rax
+	jrcxz	.Lsqrx4x_shift_n_add_break
+.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
+	adoxq	%r11,%r11
+	adcxq	%r13,%rbx
+	movq	80(%rdi),%r12
+	movq	88(%rdi),%r13
+	movq	%rax,48(%rdi)
+	movq	%rbx,56(%rdi)
+	leaq	64(%rdi),%rdi
+	nop
+	jmp	.Lsqrx4x_shift_n_add
+
+.align	32
+.Lsqrx4x_shift_n_add_break:
+	adcxq	%r13,%rbx
+	movq	%rax,48(%rdi)
+	movq	%rbx,56(%rdi)
+	leaq	64(%rdi),%rdi
+.byte	102,72,15,126,213
+__bn_sqrx8x_reduction:
+	xorl	%eax,%eax
+	movq	32+8(%rsp),%rbx
+	movq	48+8(%rsp),%rdx
+	leaq	-64(%rbp,%r9,1),%rcx
+
+	movq	%rcx,0+8(%rsp)
+	movq	%rdi,8+8(%rsp)
+
+	leaq	48+8(%rsp),%rdi
+	jmp	.Lsqrx8x_reduction_loop
+
+.align	32
+.Lsqrx8x_reduction_loop:
+	movq	8(%rdi),%r9
+	movq	16(%rdi),%r10
+	movq	24(%rdi),%r11
+	movq	32(%rdi),%r12
+	movq	%rdx,%r8
+	imulq	%rbx,%rdx
+	movq	40(%rdi),%r13
+	movq	48(%rdi),%r14
+	movq	56(%rdi),%r15
+	movq	%rax,24+8(%rsp)
+
+	leaq	64(%rdi),%rdi
+	xorq	%rsi,%rsi
+	movq	$-8,%rcx
+	jmp	.Lsqrx8x_reduce
+
+.align	32
+.Lsqrx8x_reduce:
+	movq	%r8,%rbx
+	mulxq	0(%rbp),%rax,%r8
+	adcxq	%rbx,%rax
+	adoxq	%r9,%r8
+
+	mulxq	8(%rbp),%rbx,%r9
+	adcxq	%rbx,%r8
+	adoxq	%r10,%r9
+
+	mulxq	16(%rbp),%rbx,%r10
+	adcxq	%rbx,%r9
+	adoxq	%r11,%r10
+
+	mulxq	24(%rbp),%rbx,%r11
+	adcxq	%rbx,%r10
+	adoxq	%r12,%r11
+
+.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
+	movq	%rdx,%rax
+	movq	%r8,%rdx
+	adcxq	%rbx,%r11
+	adoxq	%r13,%r12
+
+	mulxq	32+8(%rsp),%rbx,%rdx
+	movq	%rax,%rdx
+	movq	%rax,64+48+8(%rsp,%rcx,8)
+
+	mulxq	40(%rbp),%rax,%r13
+	adcxq	%rax,%r12
+	adoxq	%r14,%r13
+
+	mulxq	48(%rbp),%rax,%r14
+	adcxq	%rax,%r13
+	adoxq	%r15,%r14
+
+	mulxq	56(%rbp),%rax,%r15
+	movq	%rbx,%rdx
+	adcxq	%rax,%r14
+	adoxq	%rsi,%r15
+	adcxq	%rsi,%r15
+
+.byte	0x67,0x67,0x67
+	incq	%rcx
+	jnz	.Lsqrx8x_reduce
+
+	movq	%rsi,%rax
+	cmpq	0+8(%rsp),%rbp
+	jae	.Lsqrx8x_no_tail
+
+	movq	48+8(%rsp),%rdx
+	addq	0(%rdi),%r8
+	leaq	64(%rbp),%rbp
+	movq	$-8,%rcx
+	adcxq	8(%rdi),%r9
+	adcxq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	leaq	64(%rdi),%rdi
+	sbbq	%rax,%rax
+
+	xorq	%rsi,%rsi
+	movq	%rax,16+8(%rsp)
+	jmp	.Lsqrx8x_tail
+
+.align	32
+.Lsqrx8x_tail:
+	movq	%r8,%rbx
+	mulxq	0(%rbp),%rax,%r8
+	adcxq	%rax,%rbx
+	adoxq	%r9,%r8
+
+	mulxq	8(%rbp),%rax,%r9
+	adcxq	%rax,%r8
+	adoxq	%r10,%r9
+
+	mulxq	16(%rbp),%rax,%r10
+	adcxq	%rax,%r9
+	adoxq	%r11,%r10
+
+	mulxq	24(%rbp),%rax,%r11
+	adcxq	%rax,%r10
+	adoxq	%r12,%r11
+
+.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
+	adcxq	%rax,%r11
+	adoxq	%r13,%r12
+
+	mulxq	40(%rbp),%rax,%r13
+	adcxq	%rax,%r12
+	adoxq	%r14,%r13
+
+	mulxq	48(%rbp),%rax,%r14
+	adcxq	%rax,%r13
+	adoxq	%r15,%r14
+
+	mulxq	56(%rbp),%rax,%r15
+	movq	72+48+8(%rsp,%rcx,8),%rdx
+	adcxq	%rax,%r14
+	adoxq	%rsi,%r15
+	movq	%rbx,(%rdi,%rcx,8)
+	movq	%r8,%rbx
+	adcxq	%rsi,%r15
+
+	incq	%rcx
+	jnz	.Lsqrx8x_tail
+
+	cmpq	0+8(%rsp),%rbp
+	jae	.Lsqrx8x_tail_done
+
+	subq	16+8(%rsp),%rsi
+	movq	48+8(%rsp),%rdx
+	leaq	64(%rbp),%rbp
+	adcq	0(%rdi),%r8
+	adcq	8(%rdi),%r9
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	leaq	64(%rdi),%rdi
+	sbbq	%rax,%rax
+	subq	$8,%rcx
+
+	xorq	%rsi,%rsi
+	movq	%rax,16+8(%rsp)
+	jmp	.Lsqrx8x_tail
+
+.align	32
+.Lsqrx8x_tail_done:
+	xorq	%rax,%rax
+	addq	24+8(%rsp),%r8
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%r13
+	adcq	$0,%r14
+	adcq	$0,%r15
+	adcq	$0,%rax
+
+	subq	16+8(%rsp),%rsi
+.Lsqrx8x_no_tail:
+	adcq	0(%rdi),%r8
+.byte	102,72,15,126,217
+	adcq	8(%rdi),%r9
+	movq	56(%rbp),%rsi
+.byte	102,72,15,126,213
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	adcq	$0,%rax
+
+	movq	32+8(%rsp),%rbx
+	movq	64(%rdi,%rcx,1),%rdx
+
+	movq	%r8,0(%rdi)
+	leaq	64(%rdi),%r8
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+	movq	%r14,48(%rdi)
+	movq	%r15,56(%rdi)
+
+	leaq	64(%rdi,%rcx,1),%rdi
+	cmpq	8+8(%rsp),%r8
+	jb	.Lsqrx8x_reduction_loop
+	ret
+.cfi_endproc	
+.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
+.align	32
+.type	__bn_postx4x_internal,@function
+__bn_postx4x_internal:
+.cfi_startproc	
+	movq	0(%rbp),%r12
+	movq	%rcx,%r10
+	movq	%rcx,%r9
+	negq	%rax
+	sarq	$3+2,%rcx
+
+.byte	102,72,15,126,202
+.byte	102,72,15,126,206
+	decq	%r12
+	movq	8(%rbp),%r13
+	xorq	%r8,%r8
+	movq	16(%rbp),%r14
+	movq	24(%rbp),%r15
+	jmp	.Lsqrx4x_sub_entry
+
+.align	16
+.Lsqrx4x_sub:
+	movq	0(%rbp),%r12
+	movq	8(%rbp),%r13
+	movq	16(%rbp),%r14
+	movq	24(%rbp),%r15
+.Lsqrx4x_sub_entry:
+	andnq	%rax,%r12,%r12
+	leaq	32(%rbp),%rbp
+	andnq	%rax,%r13,%r13
+	andnq	%rax,%r14,%r14
+	andnq	%rax,%r15,%r15
+
+	negq	%r8
+	adcq	0(%rdi),%r12
+	adcq	8(%rdi),%r13
+	adcq	16(%rdi),%r14
+	adcq	24(%rdi),%r15
+	movq	%r12,0(%rdx)
+	leaq	32(%rdi),%rdi
+	movq	%r13,8(%rdx)
+	sbbq	%r8,%r8
+	movq	%r14,16(%rdx)
+	movq	%r15,24(%rdx)
+	leaq	32(%rdx),%rdx
+
+	incq	%rcx
+	jnz	.Lsqrx4x_sub
+
+	negq	%r9
+
+	ret
+.cfi_endproc	
+.size	__bn_postx4x_internal,.-__bn_postx4x_internal
+.globl	bn_scatter5
+.hidden bn_scatter5
+.type	bn_scatter5,@function
+.align	16
+bn_scatter5:
+.cfi_startproc	
+_CET_ENDBR
+	cmpl	$0,%esi
+	jz	.Lscatter_epilogue
+
+
+
+
+
+
+
+
+
+	leaq	(%rdx,%rcx,8),%rdx
+.Lscatter:
+	movq	(%rdi),%rax
+	leaq	8(%rdi),%rdi
+	movq	%rax,(%rdx)
+	leaq	256(%rdx),%rdx
+	subl	$1,%esi
+	jnz	.Lscatter
+.Lscatter_epilogue:
+	ret
+.cfi_endproc	
+.size	bn_scatter5,.-bn_scatter5
+
+.globl	bn_gather5
+.hidden bn_gather5
+.type	bn_gather5,@function
+.align	32
+bn_gather5:
+.cfi_startproc	
+.LSEH_begin_bn_gather5:
+_CET_ENDBR
+
+.byte	0x4c,0x8d,0x14,0x24
+.cfi_def_cfa_register	%r10
+.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00
+	leaq	.Linc(%rip),%rax
+	andq	$-16,%rsp
+
+	movd	%ecx,%xmm5
+	movdqa	0(%rax),%xmm0
+	movdqa	16(%rax),%xmm1
+	leaq	128(%rdx),%r11
+	leaq	128(%rsp),%rax
+
+	pshufd	$0,%xmm5,%xmm5
+	movdqa	%xmm1,%xmm4
+	movdqa	%xmm1,%xmm2
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm4,%xmm3
+
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,-128(%rax)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,-112(%rax)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,-96(%rax)
+	movdqa	%xmm4,%xmm2
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,-80(%rax)
+	movdqa	%xmm4,%xmm3
+
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,-64(%rax)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,-48(%rax)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,-32(%rax)
+	movdqa	%xmm4,%xmm2
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,-16(%rax)
+	movdqa	%xmm4,%xmm3
+
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,0(%rax)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,16(%rax)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,32(%rax)
+	movdqa	%xmm4,%xmm2
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,48(%rax)
+	movdqa	%xmm4,%xmm3
+
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,64(%rax)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,80(%rax)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,96(%rax)
+	movdqa	%xmm4,%xmm2
+	movdqa	%xmm3,112(%rax)
+	jmp	.Lgather
+
+.align	32
+.Lgather:
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	movdqa	-128(%r11),%xmm0
+	movdqa	-112(%r11),%xmm1
+	movdqa	-96(%r11),%xmm2
+	pand	-128(%rax),%xmm0
+	movdqa	-80(%r11),%xmm3
+	pand	-112(%rax),%xmm1
+	por	%xmm0,%xmm4
+	pand	-96(%rax),%xmm2
+	por	%xmm1,%xmm5
+	pand	-80(%rax),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	-64(%r11),%xmm0
+	movdqa	-48(%r11),%xmm1
+	movdqa	-32(%r11),%xmm2
+	pand	-64(%rax),%xmm0
+	movdqa	-16(%r11),%xmm3
+	pand	-48(%rax),%xmm1
+	por	%xmm0,%xmm4
+	pand	-32(%rax),%xmm2
+	por	%xmm1,%xmm5
+	pand	-16(%rax),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	0(%r11),%xmm0
+	movdqa	16(%r11),%xmm1
+	movdqa	32(%r11),%xmm2
+	pand	0(%rax),%xmm0
+	movdqa	48(%r11),%xmm3
+	pand	16(%rax),%xmm1
+	por	%xmm0,%xmm4
+	pand	32(%rax),%xmm2
+	por	%xmm1,%xmm5
+	pand	48(%rax),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	64(%r11),%xmm0
+	movdqa	80(%r11),%xmm1
+	movdqa	96(%r11),%xmm2
+	pand	64(%rax),%xmm0
+	movdqa	112(%r11),%xmm3
+	pand	80(%rax),%xmm1
+	por	%xmm0,%xmm4
+	pand	96(%rax),%xmm2
+	por	%xmm1,%xmm5
+	pand	112(%rax),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	por	%xmm5,%xmm4
+	leaq	256(%r11),%r11
+
+	pshufd	$0x4e,%xmm4,%xmm0
+	por	%xmm4,%xmm0
+	movq	%xmm0,(%rdi)
+	leaq	8(%rdi),%rdi
+	subl	$1,%esi
+	jnz	.Lgather
+
+	leaq	(%r10),%rsp
+.cfi_def_cfa_register	%rsp
+	ret
+.LSEH_end_bn_gather5:
+.cfi_endproc	
+.size	bn_gather5,.-bn_gather5
+.section	.rodata
+.align	64
+.Linc:
+.long	0,0, 1,1
+.long	2,2, 2,2
+.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.text	
+#endif
diff --git a/gen/bcm/x86_64-mont5-win.asm b/gen/bcm/x86_64-mont5-win.asm
new file mode 100644
index 0000000..46aae51
--- /dev/null
+++ b/gen/bcm/x86_64-mont5-win.asm
@@ -0,0 +1,3864 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section	.text code align=64
+
+
+EXTERN	OPENSSL_ia32cap_P
+
+global	bn_mul_mont_gather5
+
+ALIGN	64
+bn_mul_mont_gather5:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_bn_mul_mont_gather5:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+_CET_ENDBR
+	mov	r9d,r9d
+	mov	rax,rsp
+
+	test	r9d,7
+	jnz	NEAR $L$mul_enter
+	lea	r11,[OPENSSL_ia32cap_P]
+	mov	r11d,DWORD[8+r11]
+	jmp	NEAR $L$mul4x_enter
+
+ALIGN	16
+$L$mul_enter:
+	movd	xmm5,DWORD[56+rsp]
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+
+	neg	r9
+	mov	r11,rsp
+	lea	r10,[((-280))+r9*8+rsp]
+	neg	r9
+	and	r10,-1024
+
+
+
+
+
+
+
+
+
+	sub	r11,r10
+	and	r11,-4096
+	lea	rsp,[r11*1+r10]
+	mov	r11,QWORD[rsp]
+	cmp	rsp,r10
+	ja	NEAR $L$mul_page_walk
+	jmp	NEAR $L$mul_page_walk_done
+
+$L$mul_page_walk:
+	lea	rsp,[((-4096))+rsp]
+	mov	r11,QWORD[rsp]
+	cmp	rsp,r10
+	ja	NEAR $L$mul_page_walk
+$L$mul_page_walk_done:
+
+	lea	r10,[$L$inc]
+	mov	QWORD[8+r9*8+rsp],rax
+
+$L$mul_body:
+
+	lea	r12,[128+rdx]
+	movdqa	xmm0,XMMWORD[r10]
+	movdqa	xmm1,XMMWORD[16+r10]
+	lea	r10,[((24-112))+r9*8+rsp]
+	and	r10,-16
+
+	pshufd	xmm5,xmm5,0
+	movdqa	xmm4,xmm1
+	movdqa	xmm2,xmm1
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	DB	0x67
+	movdqa	xmm3,xmm4
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[112+r10],xmm0
+	movdqa	xmm0,xmm4
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[128+r10],xmm1
+	movdqa	xmm1,xmm4
+
+	paddd	xmm0,xmm3
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[144+r10],xmm2
+	movdqa	xmm2,xmm4
+
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	XMMWORD[160+r10],xmm3
+	movdqa	xmm3,xmm4
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[176+r10],xmm0
+	movdqa	xmm0,xmm4
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[192+r10],xmm1
+	movdqa	xmm1,xmm4
+
+	paddd	xmm0,xmm3
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[208+r10],xmm2
+	movdqa	xmm2,xmm4
+
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	XMMWORD[224+r10],xmm3
+	movdqa	xmm3,xmm4
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[240+r10],xmm0
+	movdqa	xmm0,xmm4
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[256+r10],xmm1
+	movdqa	xmm1,xmm4
+
+	paddd	xmm0,xmm3
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[272+r10],xmm2
+	movdqa	xmm2,xmm4
+
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	XMMWORD[288+r10],xmm3
+	movdqa	xmm3,xmm4
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[304+r10],xmm0
+
+	paddd	xmm3,xmm2
+	DB	0x67
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[320+r10],xmm1
+
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[336+r10],xmm2
+	pand	xmm0,XMMWORD[64+r12]
+
+	pand	xmm1,XMMWORD[80+r12]
+	pand	xmm2,XMMWORD[96+r12]
+	movdqa	XMMWORD[352+r10],xmm3
+	pand	xmm3,XMMWORD[112+r12]
+	por	xmm0,xmm2
+	por	xmm1,xmm3
+	movdqa	xmm4,XMMWORD[((-128))+r12]
+	movdqa	xmm5,XMMWORD[((-112))+r12]
+	movdqa	xmm2,XMMWORD[((-96))+r12]
+	pand	xmm4,XMMWORD[112+r10]
+	movdqa	xmm3,XMMWORD[((-80))+r12]
+	pand	xmm5,XMMWORD[128+r10]
+	por	xmm0,xmm4
+	pand	xmm2,XMMWORD[144+r10]
+	por	xmm1,xmm5
+	pand	xmm3,XMMWORD[160+r10]
+	por	xmm0,xmm2
+	por	xmm1,xmm3
+	movdqa	xmm4,XMMWORD[((-64))+r12]
+	movdqa	xmm5,XMMWORD[((-48))+r12]
+	movdqa	xmm2,XMMWORD[((-32))+r12]
+	pand	xmm4,XMMWORD[176+r10]
+	movdqa	xmm3,XMMWORD[((-16))+r12]
+	pand	xmm5,XMMWORD[192+r10]
+	por	xmm0,xmm4
+	pand	xmm2,XMMWORD[208+r10]
+	por	xmm1,xmm5
+	pand	xmm3,XMMWORD[224+r10]
+	por	xmm0,xmm2
+	por	xmm1,xmm3
+	movdqa	xmm4,XMMWORD[r12]
+	movdqa	xmm5,XMMWORD[16+r12]
+	movdqa	xmm2,XMMWORD[32+r12]
+	pand	xmm4,XMMWORD[240+r10]
+	movdqa	xmm3,XMMWORD[48+r12]
+	pand	xmm5,XMMWORD[256+r10]
+	por	xmm0,xmm4
+	pand	xmm2,XMMWORD[272+r10]
+	por	xmm1,xmm5
+	pand	xmm3,XMMWORD[288+r10]
+	por	xmm0,xmm2
+	por	xmm1,xmm3
+	por	xmm0,xmm1
+
+	pshufd	xmm1,xmm0,0x4e
+	por	xmm0,xmm1
+	lea	r12,[256+r12]
+DB	102,72,15,126,195
+
+	mov	r8,QWORD[r8]
+	mov	rax,QWORD[rsi]
+
+	xor	r14,r14
+	xor	r15,r15
+
+	mov	rbp,r8
+	mul	rbx
+	mov	r10,rax
+	mov	rax,QWORD[rcx]
+
+	imul	rbp,r10
+	mov	r11,rdx
+
+	mul	rbp
+	add	r10,rax
+	mov	rax,QWORD[8+rsi]
+	adc	rdx,0
+	mov	r13,rdx
+
+	lea	r15,[1+r15]
+	jmp	NEAR $L$1st_enter
+
+ALIGN	16
+$L$1st:
+	add	r13,rax
+	mov	rax,QWORD[r15*8+rsi]
+	adc	rdx,0
+	add	r13,r11
+	mov	r11,r10
+	adc	rdx,0
+	mov	QWORD[((-16))+r15*8+rsp],r13
+	mov	r13,rdx
+
+$L$1st_enter:
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[r15*8+rcx]
+	adc	rdx,0
+	lea	r15,[1+r15]
+	mov	r10,rdx
+
+	mul	rbp
+	cmp	r15,r9
+	jne	NEAR $L$1st
+
+
+	add	r13,rax
+	adc	rdx,0
+	add	r13,r11
+	adc	rdx,0
+	mov	QWORD[((-16))+r9*8+rsp],r13
+	mov	r13,rdx
+	mov	r11,r10
+
+	xor	rdx,rdx
+	add	r13,r11
+	adc	rdx,0
+	mov	QWORD[((-8))+r9*8+rsp],r13
+	mov	QWORD[r9*8+rsp],rdx
+
+	lea	r14,[1+r14]
+	jmp	NEAR $L$outer
+ALIGN	16
+$L$outer:
+	lea	rdx,[((24+128))+r9*8+rsp]
+	and	rdx,-16
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	movdqa	xmm0,XMMWORD[((-128))+r12]
+	movdqa	xmm1,XMMWORD[((-112))+r12]
+	movdqa	xmm2,XMMWORD[((-96))+r12]
+	movdqa	xmm3,XMMWORD[((-80))+r12]
+	pand	xmm0,XMMWORD[((-128))+rdx]
+	pand	xmm1,XMMWORD[((-112))+rdx]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[((-96))+rdx]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[((-80))+rdx]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqa	xmm0,XMMWORD[((-64))+r12]
+	movdqa	xmm1,XMMWORD[((-48))+r12]
+	movdqa	xmm2,XMMWORD[((-32))+r12]
+	movdqa	xmm3,XMMWORD[((-16))+r12]
+	pand	xmm0,XMMWORD[((-64))+rdx]
+	pand	xmm1,XMMWORD[((-48))+rdx]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[((-32))+rdx]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[((-16))+rdx]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqa	xmm0,XMMWORD[r12]
+	movdqa	xmm1,XMMWORD[16+r12]
+	movdqa	xmm2,XMMWORD[32+r12]
+	movdqa	xmm3,XMMWORD[48+r12]
+	pand	xmm0,XMMWORD[rdx]
+	pand	xmm1,XMMWORD[16+rdx]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[32+rdx]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[48+rdx]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqa	xmm0,XMMWORD[64+r12]
+	movdqa	xmm1,XMMWORD[80+r12]
+	movdqa	xmm2,XMMWORD[96+r12]
+	movdqa	xmm3,XMMWORD[112+r12]
+	pand	xmm0,XMMWORD[64+rdx]
+	pand	xmm1,XMMWORD[80+rdx]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[96+rdx]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[112+rdx]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	por	xmm4,xmm5
+
+	pshufd	xmm0,xmm4,0x4e
+	por	xmm0,xmm4
+	lea	r12,[256+r12]
+
+	mov	rax,QWORD[rsi]
+DB	102,72,15,126,195
+
+	xor	r15,r15
+	mov	rbp,r8
+	mov	r10,QWORD[rsp]
+
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[rcx]
+	adc	rdx,0
+
+	imul	rbp,r10
+	mov	r11,rdx
+
+	mul	rbp
+	add	r10,rax
+	mov	rax,QWORD[8+rsi]
+	adc	rdx,0
+	mov	r10,QWORD[8+rsp]
+	mov	r13,rdx
+
+	lea	r15,[1+r15]
+	jmp	NEAR $L$inner_enter
+
+ALIGN	16
+$L$inner:
+	add	r13,rax
+	mov	rax,QWORD[r15*8+rsi]
+	adc	rdx,0
+	add	r13,r10
+	mov	r10,QWORD[r15*8+rsp]
+	adc	rdx,0
+	mov	QWORD[((-16))+r15*8+rsp],r13
+	mov	r13,rdx
+
+$L$inner_enter:
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[r15*8+rcx]
+	adc	rdx,0
+	add	r10,r11
+	mov	r11,rdx
+	adc	r11,0
+	lea	r15,[1+r15]
+
+	mul	rbp
+	cmp	r15,r9
+	jne	NEAR $L$inner
+
+	add	r13,rax
+	adc	rdx,0
+	add	r13,r10
+	mov	r10,QWORD[r9*8+rsp]
+	adc	rdx,0
+	mov	QWORD[((-16))+r9*8+rsp],r13
+	mov	r13,rdx
+
+	xor	rdx,rdx
+	add	r13,r11
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-8))+r9*8+rsp],r13
+	mov	QWORD[r9*8+rsp],rdx
+
+	lea	r14,[1+r14]
+	cmp	r14,r9
+	jb	NEAR $L$outer
+
+	xor	r14,r14
+	mov	rax,QWORD[rsp]
+	lea	rsi,[rsp]
+	mov	r15,r9
+	jmp	NEAR $L$sub
+ALIGN	16
+$L$sub:	sbb	rax,QWORD[r14*8+rcx]
+	mov	QWORD[r14*8+rdi],rax
+	mov	rax,QWORD[8+r14*8+rsi]
+	lea	r14,[1+r14]
+	dec	r15
+	jnz	NEAR $L$sub
+
+	sbb	rax,0
+	mov	rbx,-1
+	xor	rbx,rax
+	xor	r14,r14
+	mov	r15,r9
+
+$L$copy:
+	mov	rcx,QWORD[r14*8+rdi]
+	mov	rdx,QWORD[r14*8+rsp]
+	and	rcx,rbx
+	and	rdx,rax
+	mov	QWORD[r14*8+rsp],r14
+	or	rdx,rcx
+	mov	QWORD[r14*8+rdi],rdx
+	lea	r14,[1+r14]
+	sub	r15,1
+	jnz	NEAR $L$copy
+
+	mov	rsi,QWORD[8+r9*8+rsp]
+
+	mov	rax,1
+
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$mul_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_bn_mul_mont_gather5:
+
+ALIGN	32
+bn_mul4x_mont_gather5:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_bn_mul4x_mont_gather5:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+	DB	0x67
+	mov	rax,rsp
+
+$L$mul4x_enter:
+	and	r11d,0x80108
+	cmp	r11d,0x80108
+	je	NEAR $L$mulx4x_enter
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+$L$mul4x_prologue:
+
+	DB	0x67
+	shl	r9d,3
+	lea	r10,[r9*2+r9]
+	neg	r9
+
+
+
+
+
+
+
+
+
+
+	lea	r11,[((-320))+r9*2+rsp]
+	mov	rbp,rsp
+	sub	r11,rdi
+	and	r11,4095
+	cmp	r10,r11
+	jb	NEAR $L$mul4xsp_alt
+	sub	rbp,r11
+	lea	rbp,[((-320))+r9*2+rbp]
+	jmp	NEAR $L$mul4xsp_done
+
+ALIGN	32
+$L$mul4xsp_alt:
+	lea	r10,[((4096-320))+r9*2]
+	lea	rbp,[((-320))+r9*2+rbp]
+	sub	r11,r10
+	mov	r10,0
+	cmovc	r11,r10
+	sub	rbp,r11
+$L$mul4xsp_done:
+	and	rbp,-64
+	mov	r11,rsp
+	sub	r11,rbp
+	and	r11,-4096
+	lea	rsp,[rbp*1+r11]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$mul4x_page_walk
+	jmp	NEAR $L$mul4x_page_walk_done
+
+$L$mul4x_page_walk:
+	lea	rsp,[((-4096))+rsp]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$mul4x_page_walk
+$L$mul4x_page_walk_done:
+
+	neg	r9
+
+	mov	QWORD[40+rsp],rax
+
+$L$mul4x_body:
+
+	call	mul4x_internal
+
+	mov	rsi,QWORD[40+rsp]
+
+	mov	rax,1
+
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$mul4x_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_bn_mul4x_mont_gather5:
+
+
+ALIGN	32
+mul4x_internal:
+
+	shl	r9,5
+	movd	xmm5,DWORD[56+rax]
+	lea	rax,[$L$inc]
+	lea	r13,[128+r9*1+rdx]
+	shr	r9,5
+	movdqa	xmm0,XMMWORD[rax]
+	movdqa	xmm1,XMMWORD[16+rax]
+	lea	r10,[((88-112))+r9*1+rsp]
+	lea	r12,[128+rdx]
+
+	pshufd	xmm5,xmm5,0
+	movdqa	xmm4,xmm1
+	DB	0x67,0x67
+	movdqa	xmm2,xmm1
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	DB	0x67
+	movdqa	xmm3,xmm4
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[112+r10],xmm0
+	movdqa	xmm0,xmm4
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[128+r10],xmm1
+	movdqa	xmm1,xmm4
+
+	paddd	xmm0,xmm3
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[144+r10],xmm2
+	movdqa	xmm2,xmm4
+
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	XMMWORD[160+r10],xmm3
+	movdqa	xmm3,xmm4
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[176+r10],xmm0
+	movdqa	xmm0,xmm4
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[192+r10],xmm1
+	movdqa	xmm1,xmm4
+
+	paddd	xmm0,xmm3
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[208+r10],xmm2
+	movdqa	xmm2,xmm4
+
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	XMMWORD[224+r10],xmm3
+	movdqa	xmm3,xmm4
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[240+r10],xmm0
+	movdqa	xmm0,xmm4
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[256+r10],xmm1
+	movdqa	xmm1,xmm4
+
+	paddd	xmm0,xmm3
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[272+r10],xmm2
+	movdqa	xmm2,xmm4
+
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	XMMWORD[288+r10],xmm3
+	movdqa	xmm3,xmm4
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[304+r10],xmm0
+
+	paddd	xmm3,xmm2
+	DB	0x67
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[320+r10],xmm1
+
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[336+r10],xmm2
+	pand	xmm0,XMMWORD[64+r12]
+
+	pand	xmm1,XMMWORD[80+r12]
+	pand	xmm2,XMMWORD[96+r12]
+	movdqa	XMMWORD[352+r10],xmm3
+	pand	xmm3,XMMWORD[112+r12]
+	por	xmm0,xmm2
+	por	xmm1,xmm3
+	movdqa	xmm4,XMMWORD[((-128))+r12]
+	movdqa	xmm5,XMMWORD[((-112))+r12]
+	movdqa	xmm2,XMMWORD[((-96))+r12]
+	pand	xmm4,XMMWORD[112+r10]
+	movdqa	xmm3,XMMWORD[((-80))+r12]
+	pand	xmm5,XMMWORD[128+r10]
+	por	xmm0,xmm4
+	pand	xmm2,XMMWORD[144+r10]
+	por	xmm1,xmm5
+	pand	xmm3,XMMWORD[160+r10]
+	por	xmm0,xmm2
+	por	xmm1,xmm3
+	movdqa	xmm4,XMMWORD[((-64))+r12]
+	movdqa	xmm5,XMMWORD[((-48))+r12]
+	movdqa	xmm2,XMMWORD[((-32))+r12]
+	pand	xmm4,XMMWORD[176+r10]
+	movdqa	xmm3,XMMWORD[((-16))+r12]
+	pand	xmm5,XMMWORD[192+r10]
+	por	xmm0,xmm4
+	pand	xmm2,XMMWORD[208+r10]
+	por	xmm1,xmm5
+	pand	xmm3,XMMWORD[224+r10]
+	por	xmm0,xmm2
+	por	xmm1,xmm3
+	movdqa	xmm4,XMMWORD[r12]
+	movdqa	xmm5,XMMWORD[16+r12]
+	movdqa	xmm2,XMMWORD[32+r12]
+	pand	xmm4,XMMWORD[240+r10]
+	movdqa	xmm3,XMMWORD[48+r12]
+	pand	xmm5,XMMWORD[256+r10]
+	por	xmm0,xmm4
+	pand	xmm2,XMMWORD[272+r10]
+	por	xmm1,xmm5
+	pand	xmm3,XMMWORD[288+r10]
+	por	xmm0,xmm2
+	por	xmm1,xmm3
+	por	xmm0,xmm1
+
+	pshufd	xmm1,xmm0,0x4e
+	por	xmm0,xmm1
+	lea	r12,[256+r12]
+DB	102,72,15,126,195
+
+	mov	QWORD[((16+8))+rsp],r13
+	mov	QWORD[((56+8))+rsp],rdi
+
+	mov	r8,QWORD[r8]
+	mov	rax,QWORD[rsi]
+	lea	rsi,[r9*1+rsi]
+	neg	r9
+
+	mov	rbp,r8
+	mul	rbx
+	mov	r10,rax
+	mov	rax,QWORD[rcx]
+
+	imul	rbp,r10
+	lea	r14,[((64+8))+rsp]
+	mov	r11,rdx
+
+	mul	rbp
+	add	r10,rax
+	mov	rax,QWORD[8+r9*1+rsi]
+	adc	rdx,0
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[8+rcx]
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[16+r9*1+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	lea	r15,[32+r9]
+	lea	rcx,[32+rcx]
+	adc	rdx,0
+	mov	QWORD[r14],rdi
+	mov	r13,rdx
+	jmp	NEAR $L$1st4x
+
+ALIGN	32
+$L$1st4x:
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[((-16))+rcx]
+	lea	r14,[32+r14]
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	rbp
+	add	r13,rax
+	mov	rax,QWORD[((-8))+r15*1+rsi]
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-24))+r14],r13
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[((-8))+rcx]
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[r15*1+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	adc	rdx,0
+	mov	QWORD[((-16))+r14],rdi
+	mov	r13,rdx
+
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[rcx]
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	rbp
+	add	r13,rax
+	mov	rax,QWORD[8+r15*1+rsi]
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-8))+r14],r13
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[8+rcx]
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[16+r15*1+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	lea	rcx,[32+rcx]
+	adc	rdx,0
+	mov	QWORD[r14],rdi
+	mov	r13,rdx
+
+	add	r15,32
+	jnz	NEAR $L$1st4x
+
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[((-16))+rcx]
+	lea	r14,[32+r14]
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	rbp
+	add	r13,rax
+	mov	rax,QWORD[((-8))+rsi]
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-24))+r14],r13
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[((-8))+rcx]
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[r9*1+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	adc	rdx,0
+	mov	QWORD[((-16))+r14],rdi
+	mov	r13,rdx
+
+	lea	rcx,[r9*1+rcx]
+
+	xor	rdi,rdi
+	add	r13,r10
+	adc	rdi,0
+	mov	QWORD[((-8))+r14],r13
+
+	jmp	NEAR $L$outer4x
+
+ALIGN	32
+$L$outer4x:
+	lea	rdx,[((16+128))+r14]
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	movdqa	xmm0,XMMWORD[((-128))+r12]
+	movdqa	xmm1,XMMWORD[((-112))+r12]
+	movdqa	xmm2,XMMWORD[((-96))+r12]
+	movdqa	xmm3,XMMWORD[((-80))+r12]
+	pand	xmm0,XMMWORD[((-128))+rdx]
+	pand	xmm1,XMMWORD[((-112))+rdx]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[((-96))+rdx]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[((-80))+rdx]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqa	xmm0,XMMWORD[((-64))+r12]
+	movdqa	xmm1,XMMWORD[((-48))+r12]
+	movdqa	xmm2,XMMWORD[((-32))+r12]
+	movdqa	xmm3,XMMWORD[((-16))+r12]
+	pand	xmm0,XMMWORD[((-64))+rdx]
+	pand	xmm1,XMMWORD[((-48))+rdx]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[((-32))+rdx]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[((-16))+rdx]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqa	xmm0,XMMWORD[r12]
+	movdqa	xmm1,XMMWORD[16+r12]
+	movdqa	xmm2,XMMWORD[32+r12]
+	movdqa	xmm3,XMMWORD[48+r12]
+	pand	xmm0,XMMWORD[rdx]
+	pand	xmm1,XMMWORD[16+rdx]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[32+rdx]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[48+rdx]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqa	xmm0,XMMWORD[64+r12]
+	movdqa	xmm1,XMMWORD[80+r12]
+	movdqa	xmm2,XMMWORD[96+r12]
+	movdqa	xmm3,XMMWORD[112+r12]
+	pand	xmm0,XMMWORD[64+rdx]
+	pand	xmm1,XMMWORD[80+rdx]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[96+rdx]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[112+rdx]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	por	xmm4,xmm5
+
+	pshufd	xmm0,xmm4,0x4e
+	por	xmm0,xmm4
+	lea	r12,[256+r12]
+DB	102,72,15,126,195
+
+	mov	r10,QWORD[r9*1+r14]
+	mov	rbp,r8
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[rcx]
+	adc	rdx,0
+
+	imul	rbp,r10
+	mov	r11,rdx
+	mov	QWORD[r14],rdi
+
+	lea	r14,[r9*1+r14]
+
+	mul	rbp
+	add	r10,rax
+	mov	rax,QWORD[8+r9*1+rsi]
+	adc	rdx,0
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[8+rcx]
+	adc	rdx,0
+	add	r11,QWORD[8+r14]
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[16+r9*1+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	lea	r15,[32+r9]
+	lea	rcx,[32+rcx]
+	adc	rdx,0
+	mov	r13,rdx
+	jmp	NEAR $L$inner4x
+
+ALIGN	32
+$L$inner4x:
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[((-16))+rcx]
+	adc	rdx,0
+	add	r10,QWORD[16+r14]
+	lea	r14,[32+r14]
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	rbp
+	add	r13,rax
+	mov	rax,QWORD[((-8))+r15*1+rsi]
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-32))+r14],rdi
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[((-8))+rcx]
+	adc	rdx,0
+	add	r11,QWORD[((-8))+r14]
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[r15*1+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	adc	rdx,0
+	mov	QWORD[((-24))+r14],r13
+	mov	r13,rdx
+
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[rcx]
+	adc	rdx,0
+	add	r10,QWORD[r14]
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	rbp
+	add	r13,rax
+	mov	rax,QWORD[8+r15*1+rsi]
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-16))+r14],rdi
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[8+rcx]
+	adc	rdx,0
+	add	r11,QWORD[8+r14]
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[16+r15*1+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	lea	rcx,[32+rcx]
+	adc	rdx,0
+	mov	QWORD[((-8))+r14],r13
+	mov	r13,rdx
+
+	add	r15,32
+	jnz	NEAR $L$inner4x
+
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[((-16))+rcx]
+	adc	rdx,0
+	add	r10,QWORD[16+r14]
+	lea	r14,[32+r14]
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	rbp
+	add	r13,rax
+	mov	rax,QWORD[((-8))+rsi]
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-32))+r14],rdi
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,rbp
+	mov	rbp,QWORD[((-8))+rcx]
+	adc	rdx,0
+	add	r11,QWORD[((-8))+r14]
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[r9*1+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	adc	rdx,0
+	mov	QWORD[((-24))+r14],r13
+	mov	r13,rdx
+
+	mov	QWORD[((-16))+r14],rdi
+	lea	rcx,[r9*1+rcx]
+
+	xor	rdi,rdi
+	add	r13,r10
+	adc	rdi,0
+	add	r13,QWORD[r14]
+	adc	rdi,0
+	mov	QWORD[((-8))+r14],r13
+
+	cmp	r12,QWORD[((16+8))+rsp]
+	jb	NEAR $L$outer4x
+	xor	rax,rax
+	sub	rbp,r13
+	adc	r15,r15
+	or	rdi,r15
+	sub	rax,rdi
+	lea	rbx,[r9*1+r14]
+	mov	r12,QWORD[rcx]
+	lea	rbp,[rcx]
+	mov	rcx,r9
+	sar	rcx,3+2
+	mov	rdi,QWORD[((56+8))+rsp]
+	dec	r12
+	xor	r10,r10
+	mov	r13,QWORD[8+rbp]
+	mov	r14,QWORD[16+rbp]
+	mov	r15,QWORD[24+rbp]
+	jmp	NEAR $L$sqr4x_sub_entry
+
+
+global	bn_power5
+
+ALIGN	32
+bn_power5:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_bn_power5:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+_CET_ENDBR
+	mov	rax,rsp
+
+	lea	r11,[OPENSSL_ia32cap_P]
+	mov	r11d,DWORD[8+r11]
+	and	r11d,0x80108
+	cmp	r11d,0x80108
+	je	NEAR $L$powerx5_enter
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+$L$power5_prologue:
+
+	shl	r9d,3
+	lea	r10d,[r9*2+r9]
+	neg	r9
+	mov	r8,QWORD[r8]
+
+
+
+
+
+
+
+
+	lea	r11,[((-320))+r9*2+rsp]
+	mov	rbp,rsp
+	sub	r11,rdi
+	and	r11,4095
+	cmp	r10,r11
+	jb	NEAR $L$pwr_sp_alt
+	sub	rbp,r11
+	lea	rbp,[((-320))+r9*2+rbp]
+	jmp	NEAR $L$pwr_sp_done
+
+ALIGN	32
+$L$pwr_sp_alt:
+	lea	r10,[((4096-320))+r9*2]
+	lea	rbp,[((-320))+r9*2+rbp]
+	sub	r11,r10
+	mov	r10,0
+	cmovc	r11,r10
+	sub	rbp,r11
+$L$pwr_sp_done:
+	and	rbp,-64
+	mov	r11,rsp
+	sub	r11,rbp
+	and	r11,-4096
+	lea	rsp,[rbp*1+r11]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$pwr_page_walk
+	jmp	NEAR $L$pwr_page_walk_done
+
+$L$pwr_page_walk:
+	lea	rsp,[((-4096))+rsp]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$pwr_page_walk
+$L$pwr_page_walk_done:
+
+	mov	r10,r9
+	neg	r9
+
+
+
+
+
+
+
+
+
+
+	mov	QWORD[32+rsp],r8
+	mov	QWORD[40+rsp],rax
+
+$L$power5_body:
+DB	102,72,15,110,207
+DB	102,72,15,110,209
+DB	102,73,15,110,218
+DB	102,72,15,110,226
+
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+
+DB	102,72,15,126,209
+DB	102,72,15,126,226
+	mov	rdi,rsi
+	mov	rax,QWORD[40+rsp]
+	lea	r8,[32+rsp]
+
+	call	mul4x_internal
+
+	mov	rsi,QWORD[40+rsp]
+
+	mov	rax,1
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$power5_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_bn_power5:
+
+global	bn_sqr8x_internal
+
+
+ALIGN	32
+bn_sqr8x_internal:
+__bn_sqr8x_internal:
+
+_CET_ENDBR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	lea	rbp,[32+r10]
+	lea	rsi,[r9*1+rsi]
+
+	mov	rcx,r9
+
+
+	mov	r14,QWORD[((-32))+rbp*1+rsi]
+	lea	rdi,[((48+8))+r9*2+rsp]
+	mov	rax,QWORD[((-24))+rbp*1+rsi]
+	lea	rdi,[((-32))+rbp*1+rdi]
+	mov	rbx,QWORD[((-16))+rbp*1+rsi]
+	mov	r15,rax
+
+	mul	r14
+	mov	r10,rax
+	mov	rax,rbx
+	mov	r11,rdx
+	mov	QWORD[((-24))+rbp*1+rdi],r10
+
+	mul	r14
+	add	r11,rax
+	mov	rax,rbx
+	adc	rdx,0
+	mov	QWORD[((-16))+rbp*1+rdi],r11
+	mov	r10,rdx
+
+
+	mov	rbx,QWORD[((-8))+rbp*1+rsi]
+	mul	r15
+	mov	r12,rax
+	mov	rax,rbx
+	mov	r13,rdx
+
+	lea	rcx,[rbp]
+	mul	r14
+	add	r10,rax
+	mov	rax,rbx
+	mov	r11,rdx
+	adc	r11,0
+	add	r10,r12
+	adc	r11,0
+	mov	QWORD[((-8))+rcx*1+rdi],r10
+	jmp	NEAR $L$sqr4x_1st
+
+ALIGN	32
+$L$sqr4x_1st:
+	mov	rbx,QWORD[rcx*1+rsi]
+	mul	r15
+	add	r13,rax
+	mov	rax,rbx
+	mov	r12,rdx
+	adc	r12,0
+
+	mul	r14
+	add	r11,rax
+	mov	rax,rbx
+	mov	rbx,QWORD[8+rcx*1+rsi]
+	mov	r10,rdx
+	adc	r10,0
+	add	r11,r13
+	adc	r10,0
+
+
+	mul	r15
+	add	r12,rax
+	mov	rax,rbx
+	mov	QWORD[rcx*1+rdi],r11
+	mov	r13,rdx
+	adc	r13,0
+
+	mul	r14
+	add	r10,rax
+	mov	rax,rbx
+	mov	rbx,QWORD[16+rcx*1+rsi]
+	mov	r11,rdx
+	adc	r11,0
+	add	r10,r12
+	adc	r11,0
+
+	mul	r15
+	add	r13,rax
+	mov	rax,rbx
+	mov	QWORD[8+rcx*1+rdi],r10
+	mov	r12,rdx
+	adc	r12,0
+
+	mul	r14
+	add	r11,rax
+	mov	rax,rbx
+	mov	rbx,QWORD[24+rcx*1+rsi]
+	mov	r10,rdx
+	adc	r10,0
+	add	r11,r13
+	adc	r10,0
+
+
+	mul	r15
+	add	r12,rax
+	mov	rax,rbx
+	mov	QWORD[16+rcx*1+rdi],r11
+	mov	r13,rdx
+	adc	r13,0
+	lea	rcx,[32+rcx]
+
+	mul	r14
+	add	r10,rax
+	mov	rax,rbx
+	mov	r11,rdx
+	adc	r11,0
+	add	r10,r12
+	adc	r11,0
+	mov	QWORD[((-8))+rcx*1+rdi],r10
+
+	cmp	rcx,0
+	jne	NEAR $L$sqr4x_1st
+
+	mul	r15
+	add	r13,rax
+	lea	rbp,[16+rbp]
+	adc	rdx,0
+	add	r13,r11
+	adc	rdx,0
+
+	mov	QWORD[rdi],r13
+	mov	r12,rdx
+	mov	QWORD[8+rdi],rdx
+	jmp	NEAR $L$sqr4x_outer
+
+ALIGN	32
+$L$sqr4x_outer:
+	mov	r14,QWORD[((-32))+rbp*1+rsi]
+	lea	rdi,[((48+8))+r9*2+rsp]
+	mov	rax,QWORD[((-24))+rbp*1+rsi]
+	lea	rdi,[((-32))+rbp*1+rdi]
+	mov	rbx,QWORD[((-16))+rbp*1+rsi]
+	mov	r15,rax
+
+	mul	r14
+	mov	r10,QWORD[((-24))+rbp*1+rdi]
+	add	r10,rax
+	mov	rax,rbx
+	adc	rdx,0
+	mov	QWORD[((-24))+rbp*1+rdi],r10
+	mov	r11,rdx
+
+	mul	r14
+	add	r11,rax
+	mov	rax,rbx
+	adc	rdx,0
+	add	r11,QWORD[((-16))+rbp*1+rdi]
+	mov	r10,rdx
+	adc	r10,0
+	mov	QWORD[((-16))+rbp*1+rdi],r11
+
+	xor	r12,r12
+
+	mov	rbx,QWORD[((-8))+rbp*1+rsi]
+	mul	r15
+	add	r12,rax
+	mov	rax,rbx
+	adc	rdx,0
+	add	r12,QWORD[((-8))+rbp*1+rdi]
+	mov	r13,rdx
+	adc	r13,0
+
+	mul	r14
+	add	r10,rax
+	mov	rax,rbx
+	adc	rdx,0
+	add	r10,r12
+	mov	r11,rdx
+	adc	r11,0
+	mov	QWORD[((-8))+rbp*1+rdi],r10
+
+	lea	rcx,[rbp]
+	jmp	NEAR $L$sqr4x_inner
+
+ALIGN	32
+$L$sqr4x_inner:
+	mov	rbx,QWORD[rcx*1+rsi]
+	mul	r15
+	add	r13,rax
+	mov	rax,rbx
+	mov	r12,rdx
+	adc	r12,0
+	add	r13,QWORD[rcx*1+rdi]
+	adc	r12,0
+
+	DB	0x67
+	mul	r14
+	add	r11,rax
+	mov	rax,rbx
+	mov	rbx,QWORD[8+rcx*1+rsi]
+	mov	r10,rdx
+	adc	r10,0
+	add	r11,r13
+	adc	r10,0
+
+	mul	r15
+	add	r12,rax
+	mov	QWORD[rcx*1+rdi],r11
+	mov	rax,rbx
+	mov	r13,rdx
+	adc	r13,0
+	add	r12,QWORD[8+rcx*1+rdi]
+	lea	rcx,[16+rcx]
+	adc	r13,0
+
+	mul	r14
+	add	r10,rax
+	mov	rax,rbx
+	adc	rdx,0
+	add	r10,r12
+	mov	r11,rdx
+	adc	r11,0
+	mov	QWORD[((-8))+rcx*1+rdi],r10
+
+	cmp	rcx,0
+	jne	NEAR $L$sqr4x_inner
+
+	DB	0x67
+	mul	r15
+	add	r13,rax
+	adc	rdx,0
+	add	r13,r11
+	adc	rdx,0
+
+	mov	QWORD[rdi],r13
+	mov	r12,rdx
+	mov	QWORD[8+rdi],rdx
+
+	add	rbp,16
+	jnz	NEAR $L$sqr4x_outer
+
+
+	mov	r14,QWORD[((-32))+rsi]
+	lea	rdi,[((48+8))+r9*2+rsp]
+	mov	rax,QWORD[((-24))+rsi]
+	lea	rdi,[((-32))+rbp*1+rdi]
+	mov	rbx,QWORD[((-16))+rsi]
+	mov	r15,rax
+
+	mul	r14
+	add	r10,rax
+	mov	rax,rbx
+	mov	r11,rdx
+	adc	r11,0
+
+	mul	r14
+	add	r11,rax
+	mov	rax,rbx
+	mov	QWORD[((-24))+rdi],r10
+	mov	r10,rdx
+	adc	r10,0
+	add	r11,r13
+	mov	rbx,QWORD[((-8))+rsi]
+	adc	r10,0
+
+	mul	r15
+	add	r12,rax
+	mov	rax,rbx
+	mov	QWORD[((-16))+rdi],r11
+	mov	r13,rdx
+	adc	r13,0
+
+	mul	r14
+	add	r10,rax
+	mov	rax,rbx
+	mov	r11,rdx
+	adc	r11,0
+	add	r10,r12
+	adc	r11,0
+	mov	QWORD[((-8))+rdi],r10
+
+	mul	r15
+	add	r13,rax
+	mov	rax,QWORD[((-16))+rsi]
+	adc	rdx,0
+	add	r13,r11
+	adc	rdx,0
+
+	mov	QWORD[rdi],r13
+	mov	r12,rdx
+	mov	QWORD[8+rdi],rdx
+
+	mul	rbx
+	add	rbp,16
+	xor	r14,r14
+	sub	rbp,r9
+	xor	r15,r15
+
+	add	rax,r12
+	adc	rdx,0
+	mov	QWORD[8+rdi],rax
+	mov	QWORD[16+rdi],rdx
+	mov	QWORD[24+rdi],r15
+
+	mov	rax,QWORD[((-16))+rbp*1+rsi]
+	lea	rdi,[((48+8))+rsp]
+	xor	r10,r10
+	mov	r11,QWORD[8+rdi]
+
+	lea	r12,[r10*2+r14]
+	shr	r10,63
+	lea	r13,[r11*2+rcx]
+	shr	r11,63
+	or	r13,r10
+	mov	r10,QWORD[16+rdi]
+	mov	r14,r11
+	mul	rax
+	neg	r15
+	mov	r11,QWORD[24+rdi]
+	adc	r12,rax
+	mov	rax,QWORD[((-8))+rbp*1+rsi]
+	mov	QWORD[rdi],r12
+	adc	r13,rdx
+
+	lea	rbx,[r10*2+r14]
+	mov	QWORD[8+rdi],r13
+	sbb	r15,r15
+	shr	r10,63
+	lea	r8,[r11*2+rcx]
+	shr	r11,63
+	or	r8,r10
+	mov	r10,QWORD[32+rdi]
+	mov	r14,r11
+	mul	rax
+	neg	r15
+	mov	r11,QWORD[40+rdi]
+	adc	rbx,rax
+	mov	rax,QWORD[rbp*1+rsi]
+	mov	QWORD[16+rdi],rbx
+	adc	r8,rdx
+	lea	rbp,[16+rbp]
+	mov	QWORD[24+rdi],r8
+	sbb	r15,r15
+	lea	rdi,[64+rdi]
+	jmp	NEAR $L$sqr4x_shift_n_add
+
+ALIGN	32
+$L$sqr4x_shift_n_add:
+	lea	r12,[r10*2+r14]
+	shr	r10,63
+	lea	r13,[r11*2+rcx]
+	shr	r11,63
+	or	r13,r10
+	mov	r10,QWORD[((-16))+rdi]
+	mov	r14,r11
+	mul	rax
+	neg	r15
+	mov	r11,QWORD[((-8))+rdi]
+	adc	r12,rax
+	mov	rax,QWORD[((-8))+rbp*1+rsi]
+	mov	QWORD[((-32))+rdi],r12
+	adc	r13,rdx
+
+	lea	rbx,[r10*2+r14]
+	mov	QWORD[((-24))+rdi],r13
+	sbb	r15,r15
+	shr	r10,63
+	lea	r8,[r11*2+rcx]
+	shr	r11,63
+	or	r8,r10
+	mov	r10,QWORD[rdi]
+	mov	r14,r11
+	mul	rax
+	neg	r15
+	mov	r11,QWORD[8+rdi]
+	adc	rbx,rax
+	mov	rax,QWORD[rbp*1+rsi]
+	mov	QWORD[((-16))+rdi],rbx
+	adc	r8,rdx
+
+	lea	r12,[r10*2+r14]
+	mov	QWORD[((-8))+rdi],r8
+	sbb	r15,r15
+	shr	r10,63
+	lea	r13,[r11*2+rcx]
+	shr	r11,63
+	or	r13,r10
+	mov	r10,QWORD[16+rdi]
+	mov	r14,r11
+	mul	rax
+	neg	r15
+	mov	r11,QWORD[24+rdi]
+	adc	r12,rax
+	mov	rax,QWORD[8+rbp*1+rsi]
+	mov	QWORD[rdi],r12
+	adc	r13,rdx
+
+	lea	rbx,[r10*2+r14]
+	mov	QWORD[8+rdi],r13
+	sbb	r15,r15
+	shr	r10,63
+	lea	r8,[r11*2+rcx]
+	shr	r11,63
+	or	r8,r10
+	mov	r10,QWORD[32+rdi]
+	mov	r14,r11
+	mul	rax
+	neg	r15
+	mov	r11,QWORD[40+rdi]
+	adc	rbx,rax
+	mov	rax,QWORD[16+rbp*1+rsi]
+	mov	QWORD[16+rdi],rbx
+	adc	r8,rdx
+	mov	QWORD[24+rdi],r8
+	sbb	r15,r15
+	lea	rdi,[64+rdi]
+	add	rbp,32
+	jnz	NEAR $L$sqr4x_shift_n_add
+
+	lea	r12,[r10*2+r14]
+	DB	0x67
+	shr	r10,63
+	lea	r13,[r11*2+rcx]
+	shr	r11,63
+	or	r13,r10
+	mov	r10,QWORD[((-16))+rdi]
+	mov	r14,r11
+	mul	rax
+	neg	r15
+	mov	r11,QWORD[((-8))+rdi]
+	adc	r12,rax
+	mov	rax,QWORD[((-8))+rsi]
+	mov	QWORD[((-32))+rdi],r12
+	adc	r13,rdx
+
+	lea	rbx,[r10*2+r14]
+	mov	QWORD[((-24))+rdi],r13
+	sbb	r15,r15
+	shr	r10,63
+	lea	r8,[r11*2+rcx]
+	shr	r11,63
+	or	r8,r10
+	mul	rax
+	neg	r15
+	adc	rbx,rax
+	adc	r8,rdx
+	mov	QWORD[((-16))+rdi],rbx
+	mov	QWORD[((-8))+rdi],r8
+DB	102,72,15,126,213
+__bn_sqr8x_reduction:
+	xor	rax,rax
+	lea	rcx,[rbp*1+r9]
+	lea	rdx,[((48+8))+r9*2+rsp]
+	mov	QWORD[((0+8))+rsp],rcx
+	lea	rdi,[((48+8))+r9*1+rsp]
+	mov	QWORD[((8+8))+rsp],rdx
+	neg	r9
+	jmp	NEAR $L$8x_reduction_loop
+
+ALIGN	32
+$L$8x_reduction_loop:
+	lea	rdi,[r9*1+rdi]
+	DB	0x66
+	mov	rbx,QWORD[rdi]
+	mov	r9,QWORD[8+rdi]
+	mov	r10,QWORD[16+rdi]
+	mov	r11,QWORD[24+rdi]
+	mov	r12,QWORD[32+rdi]
+	mov	r13,QWORD[40+rdi]
+	mov	r14,QWORD[48+rdi]
+	mov	r15,QWORD[56+rdi]
+	mov	QWORD[rdx],rax
+	lea	rdi,[64+rdi]
+
+	DB	0x67
+	mov	r8,rbx
+	imul	rbx,QWORD[((32+8))+rsp]
+	mov	rax,QWORD[rbp]
+	mov	ecx,8
+	jmp	NEAR $L$8x_reduce
+
+ALIGN	32
+$L$8x_reduce:
+	mul	rbx
+	mov	rax,QWORD[8+rbp]
+	neg	r8
+	mov	r8,rdx
+	adc	r8,0
+
+	mul	rbx
+	add	r9,rax
+	mov	rax,QWORD[16+rbp]
+	adc	rdx,0
+	add	r8,r9
+	mov	QWORD[((48-8+8))+rcx*8+rsp],rbx
+	mov	r9,rdx
+	adc	r9,0
+
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[24+rbp]
+	adc	rdx,0
+	add	r9,r10
+	mov	rsi,QWORD[((32+8))+rsp]
+	mov	r10,rdx
+	adc	r10,0
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[32+rbp]
+	adc	rdx,0
+	imul	rsi,r8
+	add	r10,r11
+	mov	r11,rdx
+	adc	r11,0
+
+	mul	rbx
+	add	r12,rax
+	mov	rax,QWORD[40+rbp]
+	adc	rdx,0
+	add	r11,r12
+	mov	r12,rdx
+	adc	r12,0
+
+	mul	rbx
+	add	r13,rax
+	mov	rax,QWORD[48+rbp]
+	adc	rdx,0
+	add	r12,r13
+	mov	r13,rdx
+	adc	r13,0
+
+	mul	rbx
+	add	r14,rax
+	mov	rax,QWORD[56+rbp]
+	adc	rdx,0
+	add	r13,r14
+	mov	r14,rdx
+	adc	r14,0
+
+	mul	rbx
+	mov	rbx,rsi
+	add	r15,rax
+	mov	rax,QWORD[rbp]
+	adc	rdx,0
+	add	r14,r15
+	mov	r15,rdx
+	adc	r15,0
+
+	dec	ecx
+	jnz	NEAR $L$8x_reduce
+
+	lea	rbp,[64+rbp]
+	xor	rax,rax
+	mov	rdx,QWORD[((8+8))+rsp]
+	cmp	rbp,QWORD[((0+8))+rsp]
+	jae	NEAR $L$8x_no_tail
+
+	DB	0x66
+	add	r8,QWORD[rdi]
+	adc	r9,QWORD[8+rdi]
+	adc	r10,QWORD[16+rdi]
+	adc	r11,QWORD[24+rdi]
+	adc	r12,QWORD[32+rdi]
+	adc	r13,QWORD[40+rdi]
+	adc	r14,QWORD[48+rdi]
+	adc	r15,QWORD[56+rdi]
+	sbb	rsi,rsi
+
+	mov	rbx,QWORD[((48+56+8))+rsp]
+	mov	ecx,8
+	mov	rax,QWORD[rbp]
+	jmp	NEAR $L$8x_tail
+
+ALIGN	32
+$L$8x_tail:
+	mul	rbx
+	add	r8,rax
+	mov	rax,QWORD[8+rbp]
+	mov	QWORD[rdi],r8
+	mov	r8,rdx
+	adc	r8,0
+
+	mul	rbx
+	add	r9,rax
+	mov	rax,QWORD[16+rbp]
+	adc	rdx,0
+	add	r8,r9
+	lea	rdi,[8+rdi]
+	mov	r9,rdx
+	adc	r9,0
+
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[24+rbp]
+	adc	rdx,0
+	add	r9,r10
+	mov	r10,rdx
+	adc	r10,0
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[32+rbp]
+	adc	rdx,0
+	add	r10,r11
+	mov	r11,rdx
+	adc	r11,0
+
+	mul	rbx
+	add	r12,rax
+	mov	rax,QWORD[40+rbp]
+	adc	rdx,0
+	add	r11,r12
+	mov	r12,rdx
+	adc	r12,0
+
+	mul	rbx
+	add	r13,rax
+	mov	rax,QWORD[48+rbp]
+	adc	rdx,0
+	add	r12,r13
+	mov	r13,rdx
+	adc	r13,0
+
+	mul	rbx
+	add	r14,rax
+	mov	rax,QWORD[56+rbp]
+	adc	rdx,0
+	add	r13,r14
+	mov	r14,rdx
+	adc	r14,0
+
+	mul	rbx
+	mov	rbx,QWORD[((48-16+8))+rcx*8+rsp]
+	add	r15,rax
+	adc	rdx,0
+	add	r14,r15
+	mov	rax,QWORD[rbp]
+	mov	r15,rdx
+	adc	r15,0
+
+	dec	ecx
+	jnz	NEAR $L$8x_tail
+
+	lea	rbp,[64+rbp]
+	mov	rdx,QWORD[((8+8))+rsp]
+	cmp	rbp,QWORD[((0+8))+rsp]
+	jae	NEAR $L$8x_tail_done
+
+	mov	rbx,QWORD[((48+56+8))+rsp]
+	neg	rsi
+	mov	rax,QWORD[rbp]
+	adc	r8,QWORD[rdi]
+	adc	r9,QWORD[8+rdi]
+	adc	r10,QWORD[16+rdi]
+	adc	r11,QWORD[24+rdi]
+	adc	r12,QWORD[32+rdi]
+	adc	r13,QWORD[40+rdi]
+	adc	r14,QWORD[48+rdi]
+	adc	r15,QWORD[56+rdi]
+	sbb	rsi,rsi
+
+	mov	ecx,8
+	jmp	NEAR $L$8x_tail
+
+ALIGN	32
+$L$8x_tail_done:
+	xor	rax,rax
+	add	r8,QWORD[rdx]
+	adc	r9,0
+	adc	r10,0
+	adc	r11,0
+	adc	r12,0
+	adc	r13,0
+	adc	r14,0
+	adc	r15,0
+	adc	rax,0
+
+	neg	rsi
+$L$8x_no_tail:
+	adc	r8,QWORD[rdi]
+	adc	r9,QWORD[8+rdi]
+	adc	r10,QWORD[16+rdi]
+	adc	r11,QWORD[24+rdi]
+	adc	r12,QWORD[32+rdi]
+	adc	r13,QWORD[40+rdi]
+	adc	r14,QWORD[48+rdi]
+	adc	r15,QWORD[56+rdi]
+	adc	rax,0
+	mov	rcx,QWORD[((-8))+rbp]
+	xor	rsi,rsi
+
+DB	102,72,15,126,213
+
+	mov	QWORD[rdi],r8
+	mov	QWORD[8+rdi],r9
+DB	102,73,15,126,217
+	mov	QWORD[16+rdi],r10
+	mov	QWORD[24+rdi],r11
+	mov	QWORD[32+rdi],r12
+	mov	QWORD[40+rdi],r13
+	mov	QWORD[48+rdi],r14
+	mov	QWORD[56+rdi],r15
+	lea	rdi,[64+rdi]
+
+	cmp	rdi,rdx
+	jb	NEAR $L$8x_reduction_loop
+	ret
+
+
+
+ALIGN	32
+__bn_post4x_internal:
+
+	mov	r12,QWORD[rbp]
+	lea	rbx,[r9*1+rdi]
+	mov	rcx,r9
+DB	102,72,15,126,207
+	neg	rax
+DB	102,72,15,126,206
+	sar	rcx,3+2
+	dec	r12
+	xor	r10,r10
+	mov	r13,QWORD[8+rbp]
+	mov	r14,QWORD[16+rbp]
+	mov	r15,QWORD[24+rbp]
+	jmp	NEAR $L$sqr4x_sub_entry
+
+ALIGN	16
+$L$sqr4x_sub:
+	mov	r12,QWORD[rbp]
+	mov	r13,QWORD[8+rbp]
+	mov	r14,QWORD[16+rbp]
+	mov	r15,QWORD[24+rbp]
+$L$sqr4x_sub_entry:
+	lea	rbp,[32+rbp]
+	not	r12
+	not	r13
+	not	r14
+	not	r15
+	and	r12,rax
+	and	r13,rax
+	and	r14,rax
+	and	r15,rax
+
+	neg	r10
+	adc	r12,QWORD[rbx]
+	adc	r13,QWORD[8+rbx]
+	adc	r14,QWORD[16+rbx]
+	adc	r15,QWORD[24+rbx]
+	mov	QWORD[rdi],r12
+	lea	rbx,[32+rbx]
+	mov	QWORD[8+rdi],r13
+	sbb	r10,r10
+	mov	QWORD[16+rdi],r14
+	mov	QWORD[24+rdi],r15
+	lea	rdi,[32+rdi]
+
+	inc	rcx
+	jnz	NEAR $L$sqr4x_sub
+
+	mov	r10,r9
+	neg	r9
+	ret
+
+
+
+ALIGN	32
+bn_mulx4x_mont_gather5:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_bn_mulx4x_mont_gather5:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+	mov	rax,rsp
+
+$L$mulx4x_enter:
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+$L$mulx4x_prologue:
+
+	shl	r9d,3
+	lea	r10,[r9*2+r9]
+	neg	r9
+	mov	r8,QWORD[r8]
+
+
+
+
+
+
+
+
+
+
+	lea	r11,[((-320))+r9*2+rsp]
+	mov	rbp,rsp
+	sub	r11,rdi
+	and	r11,4095
+	cmp	r10,r11
+	jb	NEAR $L$mulx4xsp_alt
+	sub	rbp,r11
+	lea	rbp,[((-320))+r9*2+rbp]
+	jmp	NEAR $L$mulx4xsp_done
+
+$L$mulx4xsp_alt:
+	lea	r10,[((4096-320))+r9*2]
+	lea	rbp,[((-320))+r9*2+rbp]
+	sub	r11,r10
+	mov	r10,0
+	cmovc	r11,r10
+	sub	rbp,r11
+$L$mulx4xsp_done:
+	and	rbp,-64
+	mov	r11,rsp
+	sub	r11,rbp
+	and	r11,-4096
+	lea	rsp,[rbp*1+r11]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$mulx4x_page_walk
+	jmp	NEAR $L$mulx4x_page_walk_done
+
+$L$mulx4x_page_walk:
+	lea	rsp,[((-4096))+rsp]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$mulx4x_page_walk
+$L$mulx4x_page_walk_done:
+
+
+
+
+
+
+
+
+
+
+
+
+
+	mov	QWORD[32+rsp],r8
+	mov	QWORD[40+rsp],rax
+
+$L$mulx4x_body:
+	call	mulx4x_internal
+
+	mov	rsi,QWORD[40+rsp]
+
+	mov	rax,1
+
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$mulx4x_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_bn_mulx4x_mont_gather5:
+
+
+ALIGN	32
+mulx4x_internal:
+
+	mov	QWORD[8+rsp],r9
+	mov	r10,r9
+	neg	r9
+	shl	r9,5
+	neg	r10
+	lea	r13,[128+r9*1+rdx]
+	shr	r9,5+5
+	movd	xmm5,DWORD[56+rax]
+	sub	r9,1
+	lea	rax,[$L$inc]
+	mov	QWORD[((16+8))+rsp],r13
+	mov	QWORD[((24+8))+rsp],r9
+	mov	QWORD[((56+8))+rsp],rdi
+	movdqa	xmm0,XMMWORD[rax]
+	movdqa	xmm1,XMMWORD[16+rax]
+	lea	r10,[((88-112))+r10*1+rsp]
+	lea	rdi,[128+rdx]
+
+	pshufd	xmm5,xmm5,0
+	movdqa	xmm4,xmm1
+	DB	0x67
+	movdqa	xmm2,xmm1
+	DB	0x67
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	xmm3,xmm4
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[112+r10],xmm0
+	movdqa	xmm0,xmm4
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[128+r10],xmm1
+	movdqa	xmm1,xmm4
+
+	paddd	xmm0,xmm3
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[144+r10],xmm2
+	movdqa	xmm2,xmm4
+
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	XMMWORD[160+r10],xmm3
+	movdqa	xmm3,xmm4
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[176+r10],xmm0
+	movdqa	xmm0,xmm4
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[192+r10],xmm1
+	movdqa	xmm1,xmm4
+
+	paddd	xmm0,xmm3
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[208+r10],xmm2
+	movdqa	xmm2,xmm4
+
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	XMMWORD[224+r10],xmm3
+	movdqa	xmm3,xmm4
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[240+r10],xmm0
+	movdqa	xmm0,xmm4
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[256+r10],xmm1
+	movdqa	xmm1,xmm4
+
+	paddd	xmm0,xmm3
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[272+r10],xmm2
+	movdqa	xmm2,xmm4
+
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	XMMWORD[288+r10],xmm3
+	movdqa	xmm3,xmm4
+	DB	0x67
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[304+r10],xmm0
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[320+r10],xmm1
+
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[336+r10],xmm2
+
+	pand	xmm0,XMMWORD[64+rdi]
+	pand	xmm1,XMMWORD[80+rdi]
+	pand	xmm2,XMMWORD[96+rdi]
+	movdqa	XMMWORD[352+r10],xmm3
+	pand	xmm3,XMMWORD[112+rdi]
+	por	xmm0,xmm2
+	por	xmm1,xmm3
+	movdqa	xmm4,XMMWORD[((-128))+rdi]
+	movdqa	xmm5,XMMWORD[((-112))+rdi]
+	movdqa	xmm2,XMMWORD[((-96))+rdi]
+	pand	xmm4,XMMWORD[112+r10]
+	movdqa	xmm3,XMMWORD[((-80))+rdi]
+	pand	xmm5,XMMWORD[128+r10]
+	por	xmm0,xmm4
+	pand	xmm2,XMMWORD[144+r10]
+	por	xmm1,xmm5
+	pand	xmm3,XMMWORD[160+r10]
+	por	xmm0,xmm2
+	por	xmm1,xmm3
+	movdqa	xmm4,XMMWORD[((-64))+rdi]
+	movdqa	xmm5,XMMWORD[((-48))+rdi]
+	movdqa	xmm2,XMMWORD[((-32))+rdi]
+	pand	xmm4,XMMWORD[176+r10]
+	movdqa	xmm3,XMMWORD[((-16))+rdi]
+	pand	xmm5,XMMWORD[192+r10]
+	por	xmm0,xmm4
+	pand	xmm2,XMMWORD[208+r10]
+	por	xmm1,xmm5
+	pand	xmm3,XMMWORD[224+r10]
+	por	xmm0,xmm2
+	por	xmm1,xmm3
+	movdqa	xmm4,XMMWORD[rdi]
+	movdqa	xmm5,XMMWORD[16+rdi]
+	movdqa	xmm2,XMMWORD[32+rdi]
+	pand	xmm4,XMMWORD[240+r10]
+	movdqa	xmm3,XMMWORD[48+rdi]
+	pand	xmm5,XMMWORD[256+r10]
+	por	xmm0,xmm4
+	pand	xmm2,XMMWORD[272+r10]
+	por	xmm1,xmm5
+	pand	xmm3,XMMWORD[288+r10]
+	por	xmm0,xmm2
+	por	xmm1,xmm3
+	pxor	xmm0,xmm1
+
+	pshufd	xmm1,xmm0,0x4e
+	por	xmm0,xmm1
+	lea	rdi,[256+rdi]
+DB	102,72,15,126,194
+	lea	rbx,[((64+32+8))+rsp]
+
+	mov	r9,rdx
+	mulx	rax,r8,QWORD[rsi]
+	mulx	r12,r11,QWORD[8+rsi]
+	add	r11,rax
+	mulx	r13,rax,QWORD[16+rsi]
+	adc	r12,rax
+	adc	r13,0
+	mulx	r14,rax,QWORD[24+rsi]
+
+	mov	r15,r8
+	imul	r8,QWORD[((32+8))+rsp]
+	xor	rbp,rbp
+	mov	rdx,r8
+
+	mov	QWORD[((8+8))+rsp],rdi
+
+	lea	rsi,[32+rsi]
+	adcx	r13,rax
+	adcx	r14,rbp
+
+	mulx	r10,rax,QWORD[rcx]
+	adcx	r15,rax
+	adox	r10,r11
+	mulx	r11,rax,QWORD[8+rcx]
+	adcx	r10,rax
+	adox	r11,r12
+	mulx	r12,rax,QWORD[16+rcx]
+	mov	rdi,QWORD[((24+8))+rsp]
+	mov	QWORD[((-32))+rbx],r10
+	adcx	r11,rax
+	adox	r12,r13
+	mulx	r15,rax,QWORD[24+rcx]
+	mov	rdx,r9
+	mov	QWORD[((-24))+rbx],r11
+	adcx	r12,rax
+	adox	r15,rbp
+	lea	rcx,[32+rcx]
+	mov	QWORD[((-16))+rbx],r12
+	jmp	NEAR $L$mulx4x_1st
+
+ALIGN	32
+$L$mulx4x_1st:
+	adcx	r15,rbp
+	mulx	rax,r10,QWORD[rsi]
+	adcx	r10,r14
+	mulx	r14,r11,QWORD[8+rsi]
+	adcx	r11,rax
+	mulx	rax,r12,QWORD[16+rsi]
+	adcx	r12,r14
+	mulx	r14,r13,QWORD[24+rsi]
+	DB	0x67,0x67
+	mov	rdx,r8
+	adcx	r13,rax
+	adcx	r14,rbp
+	lea	rsi,[32+rsi]
+	lea	rbx,[32+rbx]
+
+	adox	r10,r15
+	mulx	r15,rax,QWORD[rcx]
+	adcx	r10,rax
+	adox	r11,r15
+	mulx	r15,rax,QWORD[8+rcx]
+	adcx	r11,rax
+	adox	r12,r15
+	mulx	r15,rax,QWORD[16+rcx]
+	mov	QWORD[((-40))+rbx],r10
+	adcx	r12,rax
+	mov	QWORD[((-32))+rbx],r11
+	adox	r13,r15
+	mulx	r15,rax,QWORD[24+rcx]
+	mov	rdx,r9
+	mov	QWORD[((-24))+rbx],r12
+	adcx	r13,rax
+	adox	r15,rbp
+	lea	rcx,[32+rcx]
+	mov	QWORD[((-16))+rbx],r13
+
+	dec	rdi
+	jnz	NEAR $L$mulx4x_1st
+
+	mov	rax,QWORD[8+rsp]
+	adc	r15,rbp
+	lea	rsi,[rax*1+rsi]
+	add	r14,r15
+	mov	rdi,QWORD[((8+8))+rsp]
+	adc	rbp,rbp
+	mov	QWORD[((-8))+rbx],r14
+	jmp	NEAR $L$mulx4x_outer
+
+ALIGN	32
+$L$mulx4x_outer:
+	lea	r10,[((16-256))+rbx]
+	pxor	xmm4,xmm4
+	DB	0x67,0x67
+	pxor	xmm5,xmm5
+	movdqa	xmm0,XMMWORD[((-128))+rdi]
+	movdqa	xmm1,XMMWORD[((-112))+rdi]
+	movdqa	xmm2,XMMWORD[((-96))+rdi]
+	pand	xmm0,XMMWORD[256+r10]
+	movdqa	xmm3,XMMWORD[((-80))+rdi]
+	pand	xmm1,XMMWORD[272+r10]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[288+r10]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[304+r10]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqa	xmm0,XMMWORD[((-64))+rdi]
+	movdqa	xmm1,XMMWORD[((-48))+rdi]
+	movdqa	xmm2,XMMWORD[((-32))+rdi]
+	pand	xmm0,XMMWORD[320+r10]
+	movdqa	xmm3,XMMWORD[((-16))+rdi]
+	pand	xmm1,XMMWORD[336+r10]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[352+r10]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[368+r10]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqa	xmm0,XMMWORD[rdi]
+	movdqa	xmm1,XMMWORD[16+rdi]
+	movdqa	xmm2,XMMWORD[32+rdi]
+	pand	xmm0,XMMWORD[384+r10]
+	movdqa	xmm3,XMMWORD[48+rdi]
+	pand	xmm1,XMMWORD[400+r10]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[416+r10]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[432+r10]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqa	xmm0,XMMWORD[64+rdi]
+	movdqa	xmm1,XMMWORD[80+rdi]
+	movdqa	xmm2,XMMWORD[96+rdi]
+	pand	xmm0,XMMWORD[448+r10]
+	movdqa	xmm3,XMMWORD[112+rdi]
+	pand	xmm1,XMMWORD[464+r10]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[480+r10]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[496+r10]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	por	xmm4,xmm5
+
+	pshufd	xmm0,xmm4,0x4e
+	por	xmm0,xmm4
+	lea	rdi,[256+rdi]
+DB	102,72,15,126,194
+
+	mov	QWORD[rbx],rbp
+	lea	rbx,[32+rax*1+rbx]
+	mulx	r11,r8,QWORD[rsi]
+	xor	rbp,rbp
+	mov	r9,rdx
+	mulx	r12,r14,QWORD[8+rsi]
+	adox	r8,QWORD[((-32))+rbx]
+	adcx	r11,r14
+	mulx	r13,r15,QWORD[16+rsi]
+	adox	r11,QWORD[((-24))+rbx]
+	adcx	r12,r15
+	mulx	r14,rdx,QWORD[24+rsi]
+	adox	r12,QWORD[((-16))+rbx]
+	adcx	r13,rdx
+	lea	rcx,[rax*1+rcx]
+	lea	rsi,[32+rsi]
+	adox	r13,QWORD[((-8))+rbx]
+	adcx	r14,rbp
+	adox	r14,rbp
+
+	mov	r15,r8
+	imul	r8,QWORD[((32+8))+rsp]
+
+	mov	rdx,r8
+	xor	rbp,rbp
+	mov	QWORD[((8+8))+rsp],rdi
+
+	mulx	r10,rax,QWORD[rcx]
+	adcx	r15,rax
+	adox	r10,r11
+	mulx	r11,rax,QWORD[8+rcx]
+	adcx	r10,rax
+	adox	r11,r12
+	mulx	r12,rax,QWORD[16+rcx]
+	adcx	r11,rax
+	adox	r12,r13
+	mulx	r15,rax,QWORD[24+rcx]
+	mov	rdx,r9
+	mov	rdi,QWORD[((24+8))+rsp]
+	mov	QWORD[((-32))+rbx],r10
+	adcx	r12,rax
+	mov	QWORD[((-24))+rbx],r11
+	adox	r15,rbp
+	mov	QWORD[((-16))+rbx],r12
+	lea	rcx,[32+rcx]
+	jmp	NEAR $L$mulx4x_inner
+
+ALIGN	32
+$L$mulx4x_inner:
+	mulx	rax,r10,QWORD[rsi]
+	adcx	r15,rbp
+	adox	r10,r14
+	mulx	r14,r11,QWORD[8+rsi]
+	adcx	r10,QWORD[rbx]
+	adox	r11,rax
+	mulx	rax,r12,QWORD[16+rsi]
+	adcx	r11,QWORD[8+rbx]
+	adox	r12,r14
+	mulx	r14,r13,QWORD[24+rsi]
+	mov	rdx,r8
+	adcx	r12,QWORD[16+rbx]
+	adox	r13,rax
+	adcx	r13,QWORD[24+rbx]
+	adox	r14,rbp
+	lea	rsi,[32+rsi]
+	lea	rbx,[32+rbx]
+	adcx	r14,rbp
+
+	adox	r10,r15
+	mulx	r15,rax,QWORD[rcx]
+	adcx	r10,rax
+	adox	r11,r15
+	mulx	r15,rax,QWORD[8+rcx]
+	adcx	r11,rax
+	adox	r12,r15
+	mulx	r15,rax,QWORD[16+rcx]
+	mov	QWORD[((-40))+rbx],r10
+	adcx	r12,rax
+	adox	r13,r15
+	mov	QWORD[((-32))+rbx],r11
+	mulx	r15,rax,QWORD[24+rcx]
+	mov	rdx,r9
+	lea	rcx,[32+rcx]
+	mov	QWORD[((-24))+rbx],r12
+	adcx	r13,rax
+	adox	r15,rbp
+	mov	QWORD[((-16))+rbx],r13
+
+	dec	rdi
+	jnz	NEAR $L$mulx4x_inner
+
+	mov	rax,QWORD[((0+8))+rsp]
+	adc	r15,rbp
+	sub	rdi,QWORD[rbx]
+	mov	rdi,QWORD[((8+8))+rsp]
+	mov	r10,QWORD[((16+8))+rsp]
+	adc	r14,r15
+	lea	rsi,[rax*1+rsi]
+	adc	rbp,rbp
+	mov	QWORD[((-8))+rbx],r14
+
+	cmp	rdi,r10
+	jb	NEAR $L$mulx4x_outer
+
+	mov	r10,QWORD[((-8))+rcx]
+	mov	r8,rbp
+	mov	r12,QWORD[rax*1+rcx]
+	lea	rbp,[rax*1+rcx]
+	mov	rcx,rax
+	lea	rdi,[rax*1+rbx]
+	xor	eax,eax
+	xor	r15,r15
+	sub	r10,r14
+	adc	r15,r15
+	or	r8,r15
+	sar	rcx,3+2
+	sub	rax,r8
+	mov	rdx,QWORD[((56+8))+rsp]
+	dec	r12
+	mov	r13,QWORD[8+rbp]
+	xor	r8,r8
+	mov	r14,QWORD[16+rbp]
+	mov	r15,QWORD[24+rbp]
+	jmp	NEAR $L$sqrx4x_sub_entry
+
+
+
+ALIGN	32
+bn_powerx5:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_bn_powerx5:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+	mov	rax,rsp
+
+$L$powerx5_enter:
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+$L$powerx5_prologue:
+
+	shl	r9d,3
+	lea	r10,[r9*2+r9]
+	neg	r9
+	mov	r8,QWORD[r8]
+
+
+
+
+
+
+
+
+	lea	r11,[((-320))+r9*2+rsp]
+	mov	rbp,rsp
+	sub	r11,rdi
+	and	r11,4095
+	cmp	r10,r11
+	jb	NEAR $L$pwrx_sp_alt
+	sub	rbp,r11
+	lea	rbp,[((-320))+r9*2+rbp]
+	jmp	NEAR $L$pwrx_sp_done
+
+ALIGN	32
+$L$pwrx_sp_alt:
+	lea	r10,[((4096-320))+r9*2]
+	lea	rbp,[((-320))+r9*2+rbp]
+	sub	r11,r10
+	mov	r10,0
+	cmovc	r11,r10
+	sub	rbp,r11
+$L$pwrx_sp_done:
+	and	rbp,-64
+	mov	r11,rsp
+	sub	r11,rbp
+	and	r11,-4096
+	lea	rsp,[rbp*1+r11]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$pwrx_page_walk
+	jmp	NEAR $L$pwrx_page_walk_done
+
+$L$pwrx_page_walk:
+	lea	rsp,[((-4096))+rsp]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$pwrx_page_walk
+$L$pwrx_page_walk_done:
+
+	mov	r10,r9
+	neg	r9
+
+
+
+
+
+
+
+
+
+
+
+
+	pxor	xmm0,xmm0
+DB	102,72,15,110,207
+DB	102,72,15,110,209
+DB	102,73,15,110,218
+DB	102,72,15,110,226
+	mov	QWORD[32+rsp],r8
+	mov	QWORD[40+rsp],rax
+
+$L$powerx5_body:
+
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+
+	mov	r9,r10
+	mov	rdi,rsi
+DB	102,72,15,126,209
+DB	102,72,15,126,226
+	mov	rax,QWORD[40+rsp]
+
+	call	mulx4x_internal
+
+	mov	rsi,QWORD[40+rsp]
+
+	mov	rax,1
+
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$powerx5_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_bn_powerx5:
+
+global	bn_sqrx8x_internal
+
+
+ALIGN	32
+bn_sqrx8x_internal:
+__bn_sqrx8x_internal:
+
+_CET_ENDBR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	lea	rdi,[((48+8))+rsp]
+	lea	rbp,[r9*1+rsi]
+	mov	QWORD[((0+8))+rsp],r9
+	mov	QWORD[((8+8))+rsp],rbp
+	jmp	NEAR $L$sqr8x_zero_start
+
+ALIGN	32
+	DB	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
+$L$sqrx8x_zero:
+	DB	0x3e
+	movdqa	XMMWORD[rdi],xmm0
+	movdqa	XMMWORD[16+rdi],xmm0
+	movdqa	XMMWORD[32+rdi],xmm0
+	movdqa	XMMWORD[48+rdi],xmm0
+$L$sqr8x_zero_start:
+	movdqa	XMMWORD[64+rdi],xmm0
+	movdqa	XMMWORD[80+rdi],xmm0
+	movdqa	XMMWORD[96+rdi],xmm0
+	movdqa	XMMWORD[112+rdi],xmm0
+	lea	rdi,[128+rdi]
+	sub	r9,64
+	jnz	NEAR $L$sqrx8x_zero
+
+	mov	rdx,QWORD[rsi]
+
+	xor	r10,r10
+	xor	r11,r11
+	xor	r12,r12
+	xor	r13,r13
+	xor	r14,r14
+	xor	r15,r15
+	lea	rdi,[((48+8))+rsp]
+	xor	rbp,rbp
+	jmp	NEAR $L$sqrx8x_outer_loop
+
+ALIGN	32
+$L$sqrx8x_outer_loop:
+	mulx	rax,r8,QWORD[8+rsi]
+	adcx	r8,r9
+	adox	r10,rax
+	mulx	rax,r9,QWORD[16+rsi]
+	adcx	r9,r10
+	adox	r11,rax
+	DB	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
+	adcx	r10,r11
+	adox	r12,rax
+	DB	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
+	adcx	r11,r12
+	adox	r13,rax
+	mulx	rax,r12,QWORD[40+rsi]
+	adcx	r12,r13
+	adox	r14,rax
+	mulx	rax,r13,QWORD[48+rsi]
+	adcx	r13,r14
+	adox	rax,r15
+	mulx	r15,r14,QWORD[56+rsi]
+	mov	rdx,QWORD[8+rsi]
+	adcx	r14,rax
+	adox	r15,rbp
+	adc	r15,QWORD[64+rdi]
+	mov	QWORD[8+rdi],r8
+	mov	QWORD[16+rdi],r9
+	sbb	rcx,rcx
+	xor	rbp,rbp
+
+
+	mulx	rbx,r8,QWORD[16+rsi]
+	mulx	rax,r9,QWORD[24+rsi]
+	adcx	r8,r10
+	adox	r9,rbx
+	mulx	rbx,r10,QWORD[32+rsi]
+	adcx	r9,r11
+	adox	r10,rax
+	DB	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
+	adcx	r10,r12
+	adox	r11,rbx
+	DB	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
+	adcx	r11,r13
+	adox	r12,r14
+	DB	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
+	mov	rdx,QWORD[16+rsi]
+	adcx	r12,rax
+	adox	r13,rbx
+	adcx	r13,r15
+	adox	r14,rbp
+	adcx	r14,rbp
+
+	mov	QWORD[24+rdi],r8
+	mov	QWORD[32+rdi],r9
+
+	mulx	rbx,r8,QWORD[24+rsi]
+	mulx	rax,r9,QWORD[32+rsi]
+	adcx	r8,r10
+	adox	r9,rbx
+	mulx	rbx,r10,QWORD[40+rsi]
+	adcx	r9,r11
+	adox	r10,rax
+	DB	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
+	adcx	r10,r12
+	adox	r11,r13
+	DB	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
+	DB	0x3e
+	mov	rdx,QWORD[24+rsi]
+	adcx	r11,rbx
+	adox	r12,rax
+	adcx	r12,r14
+	mov	QWORD[40+rdi],r8
+	mov	QWORD[48+rdi],r9
+	mulx	rax,r8,QWORD[32+rsi]
+	adox	r13,rbp
+	adcx	r13,rbp
+
+	mulx	rbx,r9,QWORD[40+rsi]
+	adcx	r8,r10
+	adox	r9,rax
+	mulx	rax,r10,QWORD[48+rsi]
+	adcx	r9,r11
+	adox	r10,r12
+	mulx	r12,r11,QWORD[56+rsi]
+	mov	rdx,QWORD[32+rsi]
+	mov	r14,QWORD[40+rsi]
+	adcx	r10,rbx
+	adox	r11,rax
+	mov	r15,QWORD[48+rsi]
+	adcx	r11,r13
+	adox	r12,rbp
+	adcx	r12,rbp
+
+	mov	QWORD[56+rdi],r8
+	mov	QWORD[64+rdi],r9
+
+	mulx	rax,r9,r14
+	mov	r8,QWORD[56+rsi]
+	adcx	r9,r10
+	mulx	rbx,r10,r15
+	adox	r10,rax
+	adcx	r10,r11
+	mulx	rax,r11,r8
+	mov	rdx,r14
+	adox	r11,rbx
+	adcx	r11,r12
+
+	adcx	rax,rbp
+
+	mulx	rbx,r14,r15
+	mulx	r13,r12,r8
+	mov	rdx,r15
+	lea	rsi,[64+rsi]
+	adcx	r11,r14
+	adox	r12,rbx
+	adcx	r12,rax
+	adox	r13,rbp
+
+	DB	0x67,0x67
+	mulx	r14,r8,r8
+	adcx	r13,r8
+	adcx	r14,rbp
+
+	cmp	rsi,QWORD[((8+8))+rsp]
+	je	NEAR $L$sqrx8x_outer_break
+
+	neg	rcx
+	mov	rcx,-8
+	mov	r15,rbp
+	mov	r8,QWORD[64+rdi]
+	adcx	r9,QWORD[72+rdi]
+	adcx	r10,QWORD[80+rdi]
+	adcx	r11,QWORD[88+rdi]
+	adc	r12,QWORD[96+rdi]
+	adc	r13,QWORD[104+rdi]
+	adc	r14,QWORD[112+rdi]
+	adc	r15,QWORD[120+rdi]
+	lea	rbp,[rsi]
+	lea	rdi,[128+rdi]
+	sbb	rax,rax
+
+	mov	rdx,QWORD[((-64))+rsi]
+	mov	QWORD[((16+8))+rsp],rax
+	mov	QWORD[((24+8))+rsp],rdi
+
+
+	xor	eax,eax
+	jmp	NEAR $L$sqrx8x_loop
+
+ALIGN	32
+$L$sqrx8x_loop:
+	mov	rbx,r8
+	mulx	r8,rax,QWORD[rbp]
+	adcx	rbx,rax
+	adox	r8,r9
+
+	mulx	r9,rax,QWORD[8+rbp]
+	adcx	r8,rax
+	adox	r9,r10
+
+	mulx	r10,rax,QWORD[16+rbp]
+	adcx	r9,rax
+	adox	r10,r11
+
+	mulx	r11,rax,QWORD[24+rbp]
+	adcx	r10,rax
+	adox	r11,r12
+
+	DB	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
+	adcx	r11,rax
+	adox	r12,r13
+
+	mulx	r13,rax,QWORD[40+rbp]
+	adcx	r12,rax
+	adox	r13,r14
+
+	mulx	r14,rax,QWORD[48+rbp]
+	mov	QWORD[rcx*8+rdi],rbx
+	mov	ebx,0
+	adcx	r13,rax
+	adox	r14,r15
+
+	DB	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
+	mov	rdx,QWORD[8+rcx*8+rsi]
+	adcx	r14,rax
+	adox	r15,rbx
+	adcx	r15,rbx
+
+	DB	0x67
+	inc	rcx
+	jnz	NEAR $L$sqrx8x_loop
+
+	lea	rbp,[64+rbp]
+	mov	rcx,-8
+	cmp	rbp,QWORD[((8+8))+rsp]
+	je	NEAR $L$sqrx8x_break
+
+	sub	rbx,QWORD[((16+8))+rsp]
+	DB	0x66
+	mov	rdx,QWORD[((-64))+rsi]
+	adcx	r8,QWORD[rdi]
+	adcx	r9,QWORD[8+rdi]
+	adc	r10,QWORD[16+rdi]
+	adc	r11,QWORD[24+rdi]
+	adc	r12,QWORD[32+rdi]
+	adc	r13,QWORD[40+rdi]
+	adc	r14,QWORD[48+rdi]
+	adc	r15,QWORD[56+rdi]
+	lea	rdi,[64+rdi]
+	DB	0x67
+	sbb	rax,rax
+	xor	ebx,ebx
+	mov	QWORD[((16+8))+rsp],rax
+	jmp	NEAR $L$sqrx8x_loop
+
+ALIGN	32
+$L$sqrx8x_break:
+	xor	rbp,rbp
+	sub	rbx,QWORD[((16+8))+rsp]
+	adcx	r8,rbp
+	mov	rcx,QWORD[((24+8))+rsp]
+	adcx	r9,rbp
+	mov	rdx,QWORD[rsi]
+	adc	r10,0
+	mov	QWORD[rdi],r8
+	adc	r11,0
+	adc	r12,0
+	adc	r13,0
+	adc	r14,0
+	adc	r15,0
+	cmp	rdi,rcx
+	je	NEAR $L$sqrx8x_outer_loop
+
+	mov	QWORD[8+rdi],r9
+	mov	r9,QWORD[8+rcx]
+	mov	QWORD[16+rdi],r10
+	mov	r10,QWORD[16+rcx]
+	mov	QWORD[24+rdi],r11
+	mov	r11,QWORD[24+rcx]
+	mov	QWORD[32+rdi],r12
+	mov	r12,QWORD[32+rcx]
+	mov	QWORD[40+rdi],r13
+	mov	r13,QWORD[40+rcx]
+	mov	QWORD[48+rdi],r14
+	mov	r14,QWORD[48+rcx]
+	mov	QWORD[56+rdi],r15
+	mov	r15,QWORD[56+rcx]
+	mov	rdi,rcx
+	jmp	NEAR $L$sqrx8x_outer_loop
+
+ALIGN	32
+$L$sqrx8x_outer_break:
+	mov	QWORD[72+rdi],r9
+DB	102,72,15,126,217
+	mov	QWORD[80+rdi],r10
+	mov	QWORD[88+rdi],r11
+	mov	QWORD[96+rdi],r12
+	mov	QWORD[104+rdi],r13
+	mov	QWORD[112+rdi],r14
+	lea	rdi,[((48+8))+rsp]
+	mov	rdx,QWORD[rcx*1+rsi]
+
+	mov	r11,QWORD[8+rdi]
+	xor	r10,r10
+	mov	r9,QWORD[((0+8))+rsp]
+	adox	r11,r11
+	mov	r12,QWORD[16+rdi]
+	mov	r13,QWORD[24+rdi]
+
+
+ALIGN	32
+$L$sqrx4x_shift_n_add:
+	mulx	rbx,rax,rdx
+	adox	r12,r12
+	adcx	rax,r10
+	DB	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
+	DB	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
+	adox	r13,r13
+	adcx	rbx,r11
+	mov	r11,QWORD[40+rdi]
+	mov	QWORD[rdi],rax
+	mov	QWORD[8+rdi],rbx
+
+	mulx	rbx,rax,rdx
+	adox	r10,r10
+	adcx	rax,r12
+	mov	rdx,QWORD[16+rcx*1+rsi]
+	mov	r12,QWORD[48+rdi]
+	adox	r11,r11
+	adcx	rbx,r13
+	mov	r13,QWORD[56+rdi]
+	mov	QWORD[16+rdi],rax
+	mov	QWORD[24+rdi],rbx
+
+	mulx	rbx,rax,rdx
+	adox	r12,r12
+	adcx	rax,r10
+	mov	rdx,QWORD[24+rcx*1+rsi]
+	lea	rcx,[32+rcx]
+	mov	r10,QWORD[64+rdi]
+	adox	r13,r13
+	adcx	rbx,r11
+	mov	r11,QWORD[72+rdi]
+	mov	QWORD[32+rdi],rax
+	mov	QWORD[40+rdi],rbx
+
+	mulx	rbx,rax,rdx
+	adox	r10,r10
+	adcx	rax,r12
+	jrcxz	$L$sqrx4x_shift_n_add_break
+	DB	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
+	adox	r11,r11
+	adcx	rbx,r13
+	mov	r12,QWORD[80+rdi]
+	mov	r13,QWORD[88+rdi]
+	mov	QWORD[48+rdi],rax
+	mov	QWORD[56+rdi],rbx
+	lea	rdi,[64+rdi]
+	nop
+	jmp	NEAR $L$sqrx4x_shift_n_add
+
+ALIGN	32
+$L$sqrx4x_shift_n_add_break:
+	adcx	rbx,r13
+	mov	QWORD[48+rdi],rax
+	mov	QWORD[56+rdi],rbx
+	lea	rdi,[64+rdi]
+DB	102,72,15,126,213
+__bn_sqrx8x_reduction:
+	xor	eax,eax
+	mov	rbx,QWORD[((32+8))+rsp]
+	mov	rdx,QWORD[((48+8))+rsp]
+	lea	rcx,[((-64))+r9*1+rbp]
+
+	mov	QWORD[((0+8))+rsp],rcx
+	mov	QWORD[((8+8))+rsp],rdi
+
+	lea	rdi,[((48+8))+rsp]
+	jmp	NEAR $L$sqrx8x_reduction_loop
+
+ALIGN	32
+$L$sqrx8x_reduction_loop:
+	mov	r9,QWORD[8+rdi]
+	mov	r10,QWORD[16+rdi]
+	mov	r11,QWORD[24+rdi]
+	mov	r12,QWORD[32+rdi]
+	mov	r8,rdx
+	imul	rdx,rbx
+	mov	r13,QWORD[40+rdi]
+	mov	r14,QWORD[48+rdi]
+	mov	r15,QWORD[56+rdi]
+	mov	QWORD[((24+8))+rsp],rax
+
+	lea	rdi,[64+rdi]
+	xor	rsi,rsi
+	mov	rcx,-8
+	jmp	NEAR $L$sqrx8x_reduce
+
+ALIGN	32
+$L$sqrx8x_reduce:
+	mov	rbx,r8
+	mulx	r8,rax,QWORD[rbp]
+	adcx	rax,rbx
+	adox	r8,r9
+
+	mulx	r9,rbx,QWORD[8+rbp]
+	adcx	r8,rbx
+	adox	r9,r10
+
+	mulx	r10,rbx,QWORD[16+rbp]
+	adcx	r9,rbx
+	adox	r10,r11
+
+	mulx	r11,rbx,QWORD[24+rbp]
+	adcx	r10,rbx
+	adox	r11,r12
+
+	DB	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
+	mov	rax,rdx
+	mov	rdx,r8
+	adcx	r11,rbx
+	adox	r12,r13
+
+	mulx	rdx,rbx,QWORD[((32+8))+rsp]
+	mov	rdx,rax
+	mov	QWORD[((64+48+8))+rcx*8+rsp],rax
+
+	mulx	r13,rax,QWORD[40+rbp]
+	adcx	r12,rax
+	adox	r13,r14
+
+	mulx	r14,rax,QWORD[48+rbp]
+	adcx	r13,rax
+	adox	r14,r15
+
+	mulx	r15,rax,QWORD[56+rbp]
+	mov	rdx,rbx
+	adcx	r14,rax
+	adox	r15,rsi
+	adcx	r15,rsi
+
+	DB	0x67,0x67,0x67
+	inc	rcx
+	jnz	NEAR $L$sqrx8x_reduce
+
+	mov	rax,rsi
+	cmp	rbp,QWORD[((0+8))+rsp]
+	jae	NEAR $L$sqrx8x_no_tail
+
+	mov	rdx,QWORD[((48+8))+rsp]
+	add	r8,QWORD[rdi]
+	lea	rbp,[64+rbp]
+	mov	rcx,-8
+	adcx	r9,QWORD[8+rdi]
+	adcx	r10,QWORD[16+rdi]
+	adc	r11,QWORD[24+rdi]
+	adc	r12,QWORD[32+rdi]
+	adc	r13,QWORD[40+rdi]
+	adc	r14,QWORD[48+rdi]
+	adc	r15,QWORD[56+rdi]
+	lea	rdi,[64+rdi]
+	sbb	rax,rax
+
+	xor	rsi,rsi
+	mov	QWORD[((16+8))+rsp],rax
+	jmp	NEAR $L$sqrx8x_tail
+
+ALIGN	32
+$L$sqrx8x_tail:
+	mov	rbx,r8
+	mulx	r8,rax,QWORD[rbp]
+	adcx	rbx,rax
+	adox	r8,r9
+
+	mulx	r9,rax,QWORD[8+rbp]
+	adcx	r8,rax
+	adox	r9,r10
+
+	mulx	r10,rax,QWORD[16+rbp]
+	adcx	r9,rax
+	adox	r10,r11
+
+	mulx	r11,rax,QWORD[24+rbp]
+	adcx	r10,rax
+	adox	r11,r12
+
+	DB	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
+	adcx	r11,rax
+	adox	r12,r13
+
+	mulx	r13,rax,QWORD[40+rbp]
+	adcx	r12,rax
+	adox	r13,r14
+
+	mulx	r14,rax,QWORD[48+rbp]
+	adcx	r13,rax
+	adox	r14,r15
+
+	mulx	r15,rax,QWORD[56+rbp]
+	mov	rdx,QWORD[((72+48+8))+rcx*8+rsp]
+	adcx	r14,rax
+	adox	r15,rsi
+	mov	QWORD[rcx*8+rdi],rbx
+	mov	rbx,r8
+	adcx	r15,rsi
+
+	inc	rcx
+	jnz	NEAR $L$sqrx8x_tail
+
+	cmp	rbp,QWORD[((0+8))+rsp]
+	jae	NEAR $L$sqrx8x_tail_done
+
+	sub	rsi,QWORD[((16+8))+rsp]
+	mov	rdx,QWORD[((48+8))+rsp]
+	lea	rbp,[64+rbp]
+	adc	r8,QWORD[rdi]
+	adc	r9,QWORD[8+rdi]
+	adc	r10,QWORD[16+rdi]
+	adc	r11,QWORD[24+rdi]
+	adc	r12,QWORD[32+rdi]
+	adc	r13,QWORD[40+rdi]
+	adc	r14,QWORD[48+rdi]
+	adc	r15,QWORD[56+rdi]
+	lea	rdi,[64+rdi]
+	sbb	rax,rax
+	sub	rcx,8
+
+	xor	rsi,rsi
+	mov	QWORD[((16+8))+rsp],rax
+	jmp	NEAR $L$sqrx8x_tail
+
+ALIGN	32
+$L$sqrx8x_tail_done:
+	xor	rax,rax
+	add	r8,QWORD[((24+8))+rsp]
+	adc	r9,0
+	adc	r10,0
+	adc	r11,0
+	adc	r12,0
+	adc	r13,0
+	adc	r14,0
+	adc	r15,0
+	adc	rax,0
+
+	sub	rsi,QWORD[((16+8))+rsp]
+$L$sqrx8x_no_tail:
+	adc	r8,QWORD[rdi]
+DB	102,72,15,126,217
+	adc	r9,QWORD[8+rdi]
+	mov	rsi,QWORD[56+rbp]
+DB	102,72,15,126,213
+	adc	r10,QWORD[16+rdi]
+	adc	r11,QWORD[24+rdi]
+	adc	r12,QWORD[32+rdi]
+	adc	r13,QWORD[40+rdi]
+	adc	r14,QWORD[48+rdi]
+	adc	r15,QWORD[56+rdi]
+	adc	rax,0
+
+	mov	rbx,QWORD[((32+8))+rsp]
+	mov	rdx,QWORD[64+rcx*1+rdi]
+
+	mov	QWORD[rdi],r8
+	lea	r8,[64+rdi]
+	mov	QWORD[8+rdi],r9
+	mov	QWORD[16+rdi],r10
+	mov	QWORD[24+rdi],r11
+	mov	QWORD[32+rdi],r12
+	mov	QWORD[40+rdi],r13
+	mov	QWORD[48+rdi],r14
+	mov	QWORD[56+rdi],r15
+
+	lea	rdi,[64+rcx*1+rdi]
+	cmp	r8,QWORD[((8+8))+rsp]
+	jb	NEAR $L$sqrx8x_reduction_loop
+	ret
+
+
+ALIGN	32
+
+__bn_postx4x_internal:
+
+	mov	r12,QWORD[rbp]
+	mov	r10,rcx
+	mov	r9,rcx
+	neg	rax
+	sar	rcx,3+2
+
+DB	102,72,15,126,202
+DB	102,72,15,126,206
+	dec	r12
+	mov	r13,QWORD[8+rbp]
+	xor	r8,r8
+	mov	r14,QWORD[16+rbp]
+	mov	r15,QWORD[24+rbp]
+	jmp	NEAR $L$sqrx4x_sub_entry
+
+ALIGN	16
+$L$sqrx4x_sub:
+	mov	r12,QWORD[rbp]
+	mov	r13,QWORD[8+rbp]
+	mov	r14,QWORD[16+rbp]
+	mov	r15,QWORD[24+rbp]
+$L$sqrx4x_sub_entry:
+	andn	r12,r12,rax
+	lea	rbp,[32+rbp]
+	andn	r13,r13,rax
+	andn	r14,r14,rax
+	andn	r15,r15,rax
+
+	neg	r8
+	adc	r12,QWORD[rdi]
+	adc	r13,QWORD[8+rdi]
+	adc	r14,QWORD[16+rdi]
+	adc	r15,QWORD[24+rdi]
+	mov	QWORD[rdx],r12
+	lea	rdi,[32+rdi]
+	mov	QWORD[8+rdx],r13
+	sbb	r8,r8
+	mov	QWORD[16+rdx],r14
+	mov	QWORD[24+rdx],r15
+	lea	rdx,[32+rdx]
+
+	inc	rcx
+	jnz	NEAR $L$sqrx4x_sub
+
+	neg	r9
+
+	ret
+
+
+global	bn_scatter5
+
+ALIGN	16
+bn_scatter5:
+
+_CET_ENDBR
+	cmp	edx,0
+	jz	NEAR $L$scatter_epilogue
+
+
+
+
+
+
+
+
+
+	lea	r8,[r9*8+r8]
+$L$scatter:
+	mov	rax,QWORD[rcx]
+	lea	rcx,[8+rcx]
+	mov	QWORD[r8],rax
+	lea	r8,[256+r8]
+	sub	edx,1
+	jnz	NEAR $L$scatter
+$L$scatter_epilogue:
+	ret
+
+
+
+global	bn_gather5
+
+ALIGN	32
+bn_gather5:
+
+$L$SEH_begin_bn_gather5:
+_CET_ENDBR
+
+	DB	0x4c,0x8d,0x14,0x24
+
+	DB	0x48,0x81,0xec,0x08,0x01,0x00,0x00
+	lea	rax,[$L$inc]
+	and	rsp,-16
+
+	movd	xmm5,r9d
+	movdqa	xmm0,XMMWORD[rax]
+	movdqa	xmm1,XMMWORD[16+rax]
+	lea	r11,[128+r8]
+	lea	rax,[128+rsp]
+
+	pshufd	xmm5,xmm5,0
+	movdqa	xmm4,xmm1
+	movdqa	xmm2,xmm1
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	xmm3,xmm4
+
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[(-128)+rax],xmm0
+	movdqa	xmm0,xmm4
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[(-112)+rax],xmm1
+	movdqa	xmm1,xmm4
+
+	paddd	xmm0,xmm3
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[(-96)+rax],xmm2
+	movdqa	xmm2,xmm4
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	XMMWORD[(-80)+rax],xmm3
+	movdqa	xmm3,xmm4
+
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[(-64)+rax],xmm0
+	movdqa	xmm0,xmm4
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[(-48)+rax],xmm1
+	movdqa	xmm1,xmm4
+
+	paddd	xmm0,xmm3
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[(-32)+rax],xmm2
+	movdqa	xmm2,xmm4
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	XMMWORD[(-16)+rax],xmm3
+	movdqa	xmm3,xmm4
+
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[rax],xmm0
+	movdqa	xmm0,xmm4
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[16+rax],xmm1
+	movdqa	xmm1,xmm4
+
+	paddd	xmm0,xmm3
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[32+rax],xmm2
+	movdqa	xmm2,xmm4
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	XMMWORD[48+rax],xmm3
+	movdqa	xmm3,xmm4
+
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[64+rax],xmm0
+	movdqa	xmm0,xmm4
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[80+rax],xmm1
+	movdqa	xmm1,xmm4
+
+	paddd	xmm0,xmm3
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[96+rax],xmm2
+	movdqa	xmm2,xmm4
+	movdqa	XMMWORD[112+rax],xmm3
+	jmp	NEAR $L$gather
+
+ALIGN	32
+$L$gather:
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	movdqa	xmm0,XMMWORD[((-128))+r11]
+	movdqa	xmm1,XMMWORD[((-112))+r11]
+	movdqa	xmm2,XMMWORD[((-96))+r11]
+	pand	xmm0,XMMWORD[((-128))+rax]
+	movdqa	xmm3,XMMWORD[((-80))+r11]
+	pand	xmm1,XMMWORD[((-112))+rax]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[((-96))+rax]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[((-80))+rax]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqa	xmm0,XMMWORD[((-64))+r11]
+	movdqa	xmm1,XMMWORD[((-48))+r11]
+	movdqa	xmm2,XMMWORD[((-32))+r11]
+	pand	xmm0,XMMWORD[((-64))+rax]
+	movdqa	xmm3,XMMWORD[((-16))+r11]
+	pand	xmm1,XMMWORD[((-48))+rax]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[((-32))+rax]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[((-16))+rax]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqa	xmm0,XMMWORD[r11]
+	movdqa	xmm1,XMMWORD[16+r11]
+	movdqa	xmm2,XMMWORD[32+r11]
+	pand	xmm0,XMMWORD[rax]
+	movdqa	xmm3,XMMWORD[48+r11]
+	pand	xmm1,XMMWORD[16+rax]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[32+rax]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[48+rax]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqa	xmm0,XMMWORD[64+r11]
+	movdqa	xmm1,XMMWORD[80+r11]
+	movdqa	xmm2,XMMWORD[96+r11]
+	pand	xmm0,XMMWORD[64+rax]
+	movdqa	xmm3,XMMWORD[112+r11]
+	pand	xmm1,XMMWORD[80+rax]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[96+rax]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[112+rax]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	por	xmm4,xmm5
+	lea	r11,[256+r11]
+
+	pshufd	xmm0,xmm4,0x4e
+	por	xmm0,xmm4
+	movq	QWORD[rcx],xmm0
+	lea	rcx,[8+rcx]
+	sub	edx,1
+	jnz	NEAR $L$gather
+
+	lea	rsp,[r10]
+
+	ret
+$L$SEH_end_bn_gather5:
+
+
+section	.rdata rdata align=8
+ALIGN	64
+$L$inc:
+	DD	0,0,1,1
+	DD	2,2,2,2
+	DB	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
+	DB	112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115
+	DB	99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111
+	DB	114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79
+	DB	71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111
+	DB	112,101,110,115,115,108,46,111,114,103,62,0
+section	.text
+
+EXTERN	__imp_RtlVirtualUnwind
+
+ALIGN	16
+mul_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_pop_regs
+
+	mov	rax,QWORD[152+r8]
+
+	mov	r10d,DWORD[8+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	lea	r10,[$L$mul_epilogue]
+	cmp	rbx,r10
+	ja	NEAR $L$body_40
+
+	mov	r10,QWORD[192+r8]
+	mov	rax,QWORD[8+r10*8+rax]
+
+	jmp	NEAR $L$common_pop_regs
+
+$L$body_40:
+	mov	rax,QWORD[40+rax]
+$L$common_pop_regs:
+	mov	rbx,QWORD[((-8))+rax]
+	mov	rbp,QWORD[((-16))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r14,QWORD[((-40))+rax]
+	mov	r15,QWORD[((-48))+rax]
+	mov	QWORD[144+r8],rbx
+	mov	QWORD[160+r8],rbp
+	mov	QWORD[216+r8],r12
+	mov	QWORD[224+r8],r13
+	mov	QWORD[232+r8],r14
+	mov	QWORD[240+r8],r15
+
+$L$common_seh_tail:
+	mov	rdi,QWORD[8+rax]
+	mov	rsi,QWORD[16+rax]
+	mov	QWORD[152+r8],rax
+	mov	QWORD[168+r8],rsi
+	mov	QWORD[176+r8],rdi
+
+	mov	rdi,QWORD[40+r9]
+	mov	rsi,r8
+	mov	ecx,154
+	DD	0xa548f3fc
+
+	mov	rsi,r9
+	xor	rcx,rcx
+	mov	rdx,QWORD[8+rsi]
+	mov	r8,QWORD[rsi]
+	mov	r9,QWORD[16+rsi]
+	mov	r10,QWORD[40+rsi]
+	lea	r11,[56+rsi]
+	lea	r12,[24+rsi]
+	mov	QWORD[32+rsp],r10
+	mov	QWORD[40+rsp],r11
+	mov	QWORD[48+rsp],r12
+	mov	QWORD[56+rsp],rcx
+	call	QWORD[__imp_RtlVirtualUnwind]
+
+	mov	eax,1
+	add	rsp,64
+	popfq
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rbp
+	pop	rbx
+	pop	rdi
+	pop	rsi
+	ret
+
+
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_bn_mul_mont_gather5 wrt ..imagebase
+	DD	$L$SEH_end_bn_mul_mont_gather5 wrt ..imagebase
+	DD	$L$SEH_info_bn_mul_mont_gather5 wrt ..imagebase
+
+	DD	$L$SEH_begin_bn_mul4x_mont_gather5 wrt ..imagebase
+	DD	$L$SEH_end_bn_mul4x_mont_gather5 wrt ..imagebase
+	DD	$L$SEH_info_bn_mul4x_mont_gather5 wrt ..imagebase
+
+	DD	$L$SEH_begin_bn_power5 wrt ..imagebase
+	DD	$L$SEH_end_bn_power5 wrt ..imagebase
+	DD	$L$SEH_info_bn_power5 wrt ..imagebase
+	DD	$L$SEH_begin_bn_mulx4x_mont_gather5 wrt ..imagebase
+	DD	$L$SEH_end_bn_mulx4x_mont_gather5 wrt ..imagebase
+	DD	$L$SEH_info_bn_mulx4x_mont_gather5 wrt ..imagebase
+
+	DD	$L$SEH_begin_bn_powerx5 wrt ..imagebase
+	DD	$L$SEH_end_bn_powerx5 wrt ..imagebase
+	DD	$L$SEH_info_bn_powerx5 wrt ..imagebase
+	DD	$L$SEH_begin_bn_gather5 wrt ..imagebase
+	DD	$L$SEH_end_bn_gather5 wrt ..imagebase
+	DD	$L$SEH_info_bn_gather5 wrt ..imagebase
+
+section	.xdata rdata align=8
+ALIGN	8
+$L$SEH_info_bn_mul_mont_gather5:
+	DB	9,0,0,0
+	DD	mul_handler wrt ..imagebase
+	DD	$L$mul_body wrt ..imagebase,$L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase
+ALIGN	8
+$L$SEH_info_bn_mul4x_mont_gather5:
+	DB	9,0,0,0
+	DD	mul_handler wrt ..imagebase
+	DD	$L$mul4x_prologue wrt ..imagebase,$L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase
+ALIGN	8
+$L$SEH_info_bn_power5:
+	DB	9,0,0,0
+	DD	mul_handler wrt ..imagebase
+	DD	$L$power5_prologue wrt ..imagebase,$L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase
+ALIGN	8
+$L$SEH_info_bn_mulx4x_mont_gather5:
+	DB	9,0,0,0
+	DD	mul_handler wrt ..imagebase
+	DD	$L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase
+ALIGN	8
+$L$SEH_info_bn_powerx5:
+	DB	9,0,0,0
+	DD	mul_handler wrt ..imagebase
+	DD	$L$powerx5_prologue wrt ..imagebase,$L$powerx5_body wrt ..imagebase,$L$powerx5_epilogue wrt ..imagebase
+ALIGN	8
+$L$SEH_info_bn_gather5:
+	DB	0x01,0x0b,0x03,0x0a
+	DB	0x0b,0x01,0x21,0x00
+	DB	0x04,0xa3,0x00,0x00
+ALIGN	8
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/crypto/aes128gcmsiv-x86_64-apple.S b/gen/crypto/aes128gcmsiv-x86_64-apple.S
new file mode 100644
index 0000000..81e2f07
--- /dev/null
+++ b/gen/crypto/aes128gcmsiv-x86_64-apple.S
@@ -0,0 +1,3081 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.section	__DATA,__const
+
+.p2align	4
+one:
+.quad	1,0
+two:
+.quad	2,0
+three:
+.quad	3,0
+four:
+.quad	4,0
+five:
+.quad	5,0
+six:
+.quad	6,0
+seven:
+.quad	7,0
+eight:
+.quad	8,0
+
+OR_MASK:
+.long	0x00000000,0x00000000,0x00000000,0x80000000
+poly:
+.quad	0x1, 0xc200000000000000
+mask:
+.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+con1:
+.long	1,1,1,1
+con2:
+.long	0x1b,0x1b,0x1b,0x1b
+con3:
+.byte	-1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7
+and_mask:
+.long	0,0xffffffff, 0xffffffff, 0xffffffff
+.text	
+
+.p2align	4
+GFMUL:
+
+	vpclmulqdq	$0x00,%xmm1,%xmm0,%xmm2
+	vpclmulqdq	$0x11,%xmm1,%xmm0,%xmm5
+	vpclmulqdq	$0x10,%xmm1,%xmm0,%xmm3
+	vpclmulqdq	$0x01,%xmm1,%xmm0,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$8,%xmm3,%xmm4
+	vpsrldq	$8,%xmm3,%xmm3
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpxor	%xmm3,%xmm5,%xmm5
+
+	vpclmulqdq	$0x10,poly(%rip),%xmm2,%xmm3
+	vpshufd	$78,%xmm2,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm2
+
+	vpclmulqdq	$0x10,poly(%rip),%xmm2,%xmm3
+	vpshufd	$78,%xmm2,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm2
+
+	vpxor	%xmm5,%xmm2,%xmm0
+	ret
+
+
+.globl	_aesgcmsiv_htable_init
+.private_extern _aesgcmsiv_htable_init
+
+.p2align	4
+_aesgcmsiv_htable_init:
+
+_CET_ENDBR
+	vmovdqa	(%rsi),%xmm0
+	vmovdqa	%xmm0,%xmm1
+	vmovdqa	%xmm0,(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,16(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,32(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,48(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,64(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,80(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,96(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,112(%rdi)
+	ret
+
+
+.globl	_aesgcmsiv_htable6_init
+.private_extern _aesgcmsiv_htable6_init
+
+.p2align	4
+_aesgcmsiv_htable6_init:
+
+_CET_ENDBR
+	vmovdqa	(%rsi),%xmm0
+	vmovdqa	%xmm0,%xmm1
+	vmovdqa	%xmm0,(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,16(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,32(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,48(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,64(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,80(%rdi)
+	ret
+
+
+.globl	_aesgcmsiv_htable_polyval
+.private_extern _aesgcmsiv_htable_polyval
+
+.p2align	4
+_aesgcmsiv_htable_polyval:
+
+_CET_ENDBR
+	testq	%rdx,%rdx
+	jnz	L$htable_polyval_start
+	ret
+
+L$htable_polyval_start:
+	vzeroall
+
+
+
+	movq	%rdx,%r11
+	andq	$127,%r11
+
+	jz	L$htable_polyval_no_prefix
+
+	vpxor	%xmm9,%xmm9,%xmm9
+	vmovdqa	(%rcx),%xmm1
+	subq	%r11,%rdx
+
+	subq	$16,%r11
+
+
+	vmovdqu	(%rsi),%xmm0
+	vpxor	%xmm1,%xmm0,%xmm0
+
+	vpclmulqdq	$0x01,(%rdi,%r11,1),%xmm0,%xmm5
+	vpclmulqdq	$0x00,(%rdi,%r11,1),%xmm0,%xmm3
+	vpclmulqdq	$0x11,(%rdi,%r11,1),%xmm0,%xmm4
+	vpclmulqdq	$0x10,(%rdi,%r11,1),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+	leaq	16(%rsi),%rsi
+	testq	%r11,%r11
+	jnz	L$htable_polyval_prefix_loop
+	jmp	L$htable_polyval_prefix_complete
+
+
+.p2align	6
+L$htable_polyval_prefix_loop:
+	subq	$16,%r11
+
+	vmovdqu	(%rsi),%xmm0
+
+	vpclmulqdq	$0x00,(%rdi,%r11,1),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm3,%xmm3
+	vpclmulqdq	$0x11,(%rdi,%r11,1),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm4,%xmm4
+	vpclmulqdq	$0x01,(%rdi,%r11,1),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x10,(%rdi,%r11,1),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+	testq	%r11,%r11
+
+	leaq	16(%rsi),%rsi
+
+	jnz	L$htable_polyval_prefix_loop
+
+L$htable_polyval_prefix_complete:
+	vpsrldq	$8,%xmm5,%xmm6
+	vpslldq	$8,%xmm5,%xmm5
+
+	vpxor	%xmm6,%xmm4,%xmm9
+	vpxor	%xmm5,%xmm3,%xmm1
+
+	jmp	L$htable_polyval_main_loop
+
+L$htable_polyval_no_prefix:
+
+
+
+
+	vpxor	%xmm1,%xmm1,%xmm1
+	vmovdqa	(%rcx),%xmm9
+
+.p2align	6
+L$htable_polyval_main_loop:
+	subq	$0x80,%rdx
+	jb	L$htable_polyval_out
+
+	vmovdqu	112(%rsi),%xmm0
+
+	vpclmulqdq	$0x01,(%rdi),%xmm0,%xmm5
+	vpclmulqdq	$0x00,(%rdi),%xmm0,%xmm3
+	vpclmulqdq	$0x11,(%rdi),%xmm0,%xmm4
+	vpclmulqdq	$0x10,(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+
+	vmovdqu	96(%rsi),%xmm0
+	vpclmulqdq	$0x01,16(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x00,16(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm3,%xmm3
+	vpclmulqdq	$0x11,16(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm4,%xmm4
+	vpclmulqdq	$0x10,16(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+
+
+	vmovdqu	80(%rsi),%xmm0
+
+	vpclmulqdq	$0x10,poly(%rip),%xmm1,%xmm7
+	vpalignr	$8,%xmm1,%xmm1,%xmm1
+
+	vpclmulqdq	$0x01,32(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x00,32(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm3,%xmm3
+	vpclmulqdq	$0x11,32(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm4,%xmm4
+	vpclmulqdq	$0x10,32(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+
+	vpxor	%xmm7,%xmm1,%xmm1
+
+	vmovdqu	64(%rsi),%xmm0
+
+	vpclmulqdq	$0x01,48(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x00,48(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm3,%xmm3
+	vpclmulqdq	$0x11,48(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm4,%xmm4
+	vpclmulqdq	$0x10,48(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+
+	vmovdqu	48(%rsi),%xmm0
+
+	vpclmulqdq	$0x10,poly(%rip),%xmm1,%xmm7
+	vpalignr	$8,%xmm1,%xmm1,%xmm1
+
+	vpclmulqdq	$0x01,64(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x00,64(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm3,%xmm3
+	vpclmulqdq	$0x11,64(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm4,%xmm4
+	vpclmulqdq	$0x10,64(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+
+	vpxor	%xmm7,%xmm1,%xmm1
+
+	vmovdqu	32(%rsi),%xmm0
+
+	vpclmulqdq	$0x01,80(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x00,80(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm3,%xmm3
+	vpclmulqdq	$0x11,80(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm4,%xmm4
+	vpclmulqdq	$0x10,80(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+
+	vpxor	%xmm9,%xmm1,%xmm1
+
+	vmovdqu	16(%rsi),%xmm0
+
+	vpclmulqdq	$0x01,96(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x00,96(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm3,%xmm3
+	vpclmulqdq	$0x11,96(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm4,%xmm4
+	vpclmulqdq	$0x10,96(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+
+	vmovdqu	0(%rsi),%xmm0
+	vpxor	%xmm1,%xmm0,%xmm0
+
+	vpclmulqdq	$0x01,112(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x00,112(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm3,%xmm3
+	vpclmulqdq	$0x11,112(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm4,%xmm4
+	vpclmulqdq	$0x10,112(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+
+	vpsrldq	$8,%xmm5,%xmm6
+	vpslldq	$8,%xmm5,%xmm5
+
+	vpxor	%xmm6,%xmm4,%xmm9
+	vpxor	%xmm5,%xmm3,%xmm1
+
+	leaq	128(%rsi),%rsi
+	jmp	L$htable_polyval_main_loop
+
+
+
+L$htable_polyval_out:
+	vpclmulqdq	$0x10,poly(%rip),%xmm1,%xmm6
+	vpalignr	$8,%xmm1,%xmm1,%xmm1
+	vpxor	%xmm6,%xmm1,%xmm1
+
+	vpclmulqdq	$0x10,poly(%rip),%xmm1,%xmm6
+	vpalignr	$8,%xmm1,%xmm1,%xmm1
+	vpxor	%xmm6,%xmm1,%xmm1
+	vpxor	%xmm9,%xmm1,%xmm1
+
+	vmovdqu	%xmm1,(%rcx)
+	vzeroupper
+	ret
+
+
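+/* _aesgcmsiv_polyval_horner: Horner-style POLYVAL update. For each of the
+   %rcx 16-byte blocks at (%rdx), the accumulator at (%rdi) is XORed with
+   the block and multiplied (GFMUL) by the key loaded from (%rsi). */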
+.globl	_aesgcmsiv_polyval_horner
+.private_extern _aesgcmsiv_polyval_horner
+
+.p2align	4
+_aesgcmsiv_polyval_horner:
+
+_CET_ENDBR
+	testq	%rcx,%rcx
+	jnz	L$polyval_horner_start
+	ret
+
+L$polyval_horner_start:
+
+
+
+	xorq	%r10,%r10
+	shlq	$4,%rcx
+
+	vmovdqa	(%rsi),%xmm1
+	vmovdqa	(%rdi),%xmm0
+
+L$polyval_horner_loop:
+	vpxor	(%rdx,%r10,1),%xmm0,%xmm0
+	call	GFMUL
+
+	addq	$16,%r10
+	cmpq	%r10,%rcx
+	jne	L$polyval_horner_loop
+
+
+	vmovdqa	%xmm0,(%rdi)
+	ret
+
+
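+/* _aes128gcmsiv_aes_ks: expands the 16-byte AES-128 key at (%rdi) into the
+   eleven round keys stored at (%rsi)..160(%rsi). */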
+.globl	_aes128gcmsiv_aes_ks
+.private_extern _aes128gcmsiv_aes_ks
+
+.p2align	4
+_aes128gcmsiv_aes_ks:
+
+_CET_ENDBR
+	vmovdqu	(%rdi),%xmm1
+	vmovdqa	%xmm1,(%rsi)
+
+	vmovdqa	con1(%rip),%xmm0
+	vmovdqa	mask(%rip),%xmm15
+
+	movq	$8,%rax
+
+L$ks128_loop:
+	addq	$16,%rsi
+	subq	$1,%rax
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpslldq	$4,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpslldq	$4,%xmm3,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpslldq	$4,%xmm3,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vmovdqa	%xmm1,(%rsi)
+	jne	L$ks128_loop
+
+	vmovdqa	con2(%rip),%xmm0
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpslldq	$4,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpslldq	$4,%xmm3,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpslldq	$4,%xmm3,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vmovdqa	%xmm1,16(%rsi)
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslldq	$4,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpslldq	$4,%xmm3,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpslldq	$4,%xmm3,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vmovdqa	%xmm1,32(%rsi)
+	ret
+
+
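+/* _aes256gcmsiv_aes_ks: expands the 32-byte AES-256 key at (%rdi) into the
+   fifteen round keys stored at (%rsi)..224(%rsi), two per loop iteration. */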
+.globl	_aes256gcmsiv_aes_ks
+.private_extern _aes256gcmsiv_aes_ks
+
+.p2align	4
+_aes256gcmsiv_aes_ks:
+
+_CET_ENDBR
+	vmovdqu	(%rdi),%xmm1
+	vmovdqu	16(%rdi),%xmm3
+	vmovdqa	%xmm1,(%rsi)
+	vmovdqa	%xmm3,16(%rsi)
+	vmovdqa	con1(%rip),%xmm0
+	vmovdqa	mask(%rip),%xmm15
+	vpxor	%xmm14,%xmm14,%xmm14
+	movq	$6,%rax
+
+L$ks256_loop:
+	addq	$32,%rsi
+	subq	$1,%rax
+	vpshufb	%xmm15,%xmm3,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vmovdqa	%xmm1,(%rsi)
+	vpshufd	$0xff,%xmm1,%xmm2
+	vaesenclast	%xmm14,%xmm2,%xmm2
+	vpsllq	$32,%xmm3,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpshufb	con3(%rip),%xmm3,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpxor	%xmm2,%xmm3,%xmm3
+	vmovdqa	%xmm3,16(%rsi)
+	jne	L$ks256_loop
+
+	vpshufb	%xmm15,%xmm3,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpsllq	$32,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vmovdqa	%xmm1,32(%rsi)
+	ret
+
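+/* _aes128gcmsiv_aes_ks_enc_x1: expands the AES-128 key at (%rcx) into the
+   schedule at (%rdx) and, interleaved with that, encrypts the single block
+   at (%rdi), writing the ciphertext to (%rsi). */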
+.globl	_aes128gcmsiv_aes_ks_enc_x1
+.private_extern _aes128gcmsiv_aes_ks_enc_x1
+
+.p2align	4
+_aes128gcmsiv_aes_ks_enc_x1:
+
+_CET_ENDBR
+	vmovdqa	(%rcx),%xmm1
+	vmovdqa	0(%rdi),%xmm4
+
+	vmovdqa	%xmm1,(%rdx)
+	vpxor	%xmm1,%xmm4,%xmm4
+
+	vmovdqa	con1(%rip),%xmm0
+	vmovdqa	mask(%rip),%xmm15
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,16(%rdx)
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,32(%rdx)
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,48(%rdx)
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,64(%rdx)
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,80(%rdx)
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,96(%rdx)
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,112(%rdx)
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,128(%rdx)
+
+
+	vmovdqa	con2(%rip),%xmm0
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,144(%rdx)
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenclast	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,160(%rdx)
+
+
+	vmovdqa	%xmm4,0(%rsi)
+	ret
+
+
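+/* _aes128gcmsiv_kdf: derives key material from the (masked) nonce block at
+   (%rdi): four consecutive counter blocks are encrypted in parallel under
+   the AES-128 schedule at (%rdx), and the 64 bytes of output are written
+   to (%rsi). */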
+.globl	_aes128gcmsiv_kdf
+.private_extern _aes128gcmsiv_kdf
+
+.p2align	4
+_aes128gcmsiv_kdf:
+
+_CET_ENDBR
+
+
+
+
+	vmovdqa	(%rdx),%xmm1
+	vmovdqa	0(%rdi),%xmm9
+	vmovdqa	and_mask(%rip),%xmm12
+	vmovdqa	one(%rip),%xmm13
+	vpshufd	$0x90,%xmm9,%xmm9
+	vpand	%xmm12,%xmm9,%xmm9
+	vpaddd	%xmm13,%xmm9,%xmm10
+	vpaddd	%xmm13,%xmm10,%xmm11
+	vpaddd	%xmm13,%xmm11,%xmm12
+
+	vpxor	%xmm1,%xmm9,%xmm9
+	vpxor	%xmm1,%xmm10,%xmm10
+	vpxor	%xmm1,%xmm11,%xmm11
+	vpxor	%xmm1,%xmm12,%xmm12
+
+	vmovdqa	16(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+
+	vmovdqa	32(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm9,%xmm9
+	vaesenc	%xmm2,%xmm10,%xmm10
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+
+	vmovdqa	48(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+
+	vmovdqa	64(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm9,%xmm9
+	vaesenc	%xmm2,%xmm10,%xmm10
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+
+	vmovdqa	80(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+
+	vmovdqa	96(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm9,%xmm9
+	vaesenc	%xmm2,%xmm10,%xmm10
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+
+	vmovdqa	112(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+
+	vmovdqa	128(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm9,%xmm9
+	vaesenc	%xmm2,%xmm10,%xmm10
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+
+	vmovdqa	144(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+
+	vmovdqa	160(%rdx),%xmm2
+	vaesenclast	%xmm2,%xmm9,%xmm9
+	vaesenclast	%xmm2,%xmm10,%xmm10
+	vaesenclast	%xmm2,%xmm11,%xmm11
+	vaesenclast	%xmm2,%xmm12,%xmm12
+
+
+	vmovdqa	%xmm9,0(%rsi)
+	vmovdqa	%xmm10,16(%rsi)
+	vmovdqa	%xmm11,32(%rsi)
+	vmovdqa	%xmm12,48(%rsi)
+	ret
+
+
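+/* _aes128gcmsiv_enc_msg_x4: CTR-mode encryption, four blocks per iteration.
+   %rdi = input, %rsi = output, %rdx = initial counter block (high bit set
+   via OR_MASK), %rcx = AES-128 schedule, %r8 = length in bytes. Leftover
+   blocks (%r8/16 mod 4) are handled one at a time in loop2. */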
+.globl	_aes128gcmsiv_enc_msg_x4
+.private_extern _aes128gcmsiv_enc_msg_x4
+
+.p2align	4
+_aes128gcmsiv_enc_msg_x4:
+
+_CET_ENDBR
+	testq	%r8,%r8
+	jnz	L$128_enc_msg_x4_start
+	ret
+
+L$128_enc_msg_x4_start:
+	pushq	%r12
+
+	pushq	%r13
+
+
+	shrq	$4,%r8
+	movq	%r8,%r10
+	shlq	$62,%r10
+	shrq	$62,%r10
+
+
+	vmovdqa	(%rdx),%xmm15
+	vpor	OR_MASK(%rip),%xmm15,%xmm15
+
+	vmovdqu	four(%rip),%xmm4
+	vmovdqa	%xmm15,%xmm0
+	vpaddd	one(%rip),%xmm15,%xmm1
+	vpaddd	two(%rip),%xmm15,%xmm2
+	vpaddd	three(%rip),%xmm15,%xmm3
+
+	shrq	$2,%r8
+	je	L$128_enc_msg_x4_check_remainder
+
+	subq	$64,%rsi
+	subq	$64,%rdi
+
+L$128_enc_msg_x4_loop1:
+	addq	$64,%rsi
+	addq	$64,%rdi
+
+	vmovdqa	%xmm0,%xmm5
+	vmovdqa	%xmm1,%xmm6
+	vmovdqa	%xmm2,%xmm7
+	vmovdqa	%xmm3,%xmm8
+
+	vpxor	(%rcx),%xmm5,%xmm5
+	vpxor	(%rcx),%xmm6,%xmm6
+	vpxor	(%rcx),%xmm7,%xmm7
+	vpxor	(%rcx),%xmm8,%xmm8
+
+	vmovdqu	16(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vpaddd	%xmm4,%xmm0,%xmm0
+	vmovdqu	32(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vpaddd	%xmm4,%xmm1,%xmm1
+	vmovdqu	48(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vpaddd	%xmm4,%xmm2,%xmm2
+	vmovdqu	64(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vpaddd	%xmm4,%xmm3,%xmm3
+
+	vmovdqu	80(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	96(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	112(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	128(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	144(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	160(%rcx),%xmm12
+	vaesenclast	%xmm12,%xmm5,%xmm5
+	vaesenclast	%xmm12,%xmm6,%xmm6
+	vaesenclast	%xmm12,%xmm7,%xmm7
+	vaesenclast	%xmm12,%xmm8,%xmm8
+
+
+
+	vpxor	0(%rdi),%xmm5,%xmm5
+	vpxor	16(%rdi),%xmm6,%xmm6
+	vpxor	32(%rdi),%xmm7,%xmm7
+	vpxor	48(%rdi),%xmm8,%xmm8
+
+	subq	$1,%r8
+
+	vmovdqu	%xmm5,0(%rsi)
+	vmovdqu	%xmm6,16(%rsi)
+	vmovdqu	%xmm7,32(%rsi)
+	vmovdqu	%xmm8,48(%rsi)
+
+	jne	L$128_enc_msg_x4_loop1
+
+	addq	$64,%rsi
+	addq	$64,%rdi
+
+L$128_enc_msg_x4_check_remainder:
+	cmpq	$0,%r10
+	je	L$128_enc_msg_x4_out
+
+L$128_enc_msg_x4_loop2:
+
+
+	vmovdqa	%xmm0,%xmm5
+	vpaddd	one(%rip),%xmm0,%xmm0
+
+	vpxor	(%rcx),%xmm5,%xmm5
+	vaesenc	16(%rcx),%xmm5,%xmm5
+	vaesenc	32(%rcx),%xmm5,%xmm5
+	vaesenc	48(%rcx),%xmm5,%xmm5
+	vaesenc	64(%rcx),%xmm5,%xmm5
+	vaesenc	80(%rcx),%xmm5,%xmm5
+	vaesenc	96(%rcx),%xmm5,%xmm5
+	vaesenc	112(%rcx),%xmm5,%xmm5
+	vaesenc	128(%rcx),%xmm5,%xmm5
+	vaesenc	144(%rcx),%xmm5,%xmm5
+	vaesenclast	160(%rcx),%xmm5,%xmm5
+
+
+	vpxor	(%rdi),%xmm5,%xmm5
+	vmovdqu	%xmm5,(%rsi)
+
+	addq	$16,%rdi
+	addq	$16,%rsi
+
+	subq	$1,%r10
+	jne	L$128_enc_msg_x4_loop2
+
+L$128_enc_msg_x4_out:
+	popq	%r13
+
+	popq	%r12
+
+	ret
+
+
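+/* _aes128gcmsiv_enc_msg_x8: as _aes128gcmsiv_enc_msg_x4, but eight blocks
+   per iteration; the eighth counter lane is spilled to a 64-byte-aligned
+   stack slot since only %xmm0-%xmm15 are available. */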
+.globl	_aes128gcmsiv_enc_msg_x8
+.private_extern _aes128gcmsiv_enc_msg_x8
+
+.p2align	4
+_aes128gcmsiv_enc_msg_x8:
+
+_CET_ENDBR
+	testq	%r8,%r8
+	jnz	L$128_enc_msg_x8_start
+	ret
+
+L$128_enc_msg_x8_start:
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%rbp
+
+	movq	%rsp,%rbp
+
+
+
+	subq	$128,%rsp
+	andq	$-64,%rsp
+
+	shrq	$4,%r8
+	movq	%r8,%r10
+	shlq	$61,%r10
+	shrq	$61,%r10
+
+
+	vmovdqu	(%rdx),%xmm1
+	vpor	OR_MASK(%rip),%xmm1,%xmm1
+
+
+	vpaddd	seven(%rip),%xmm1,%xmm0
+	vmovdqu	%xmm0,(%rsp)
+	vpaddd	one(%rip),%xmm1,%xmm9
+	vpaddd	two(%rip),%xmm1,%xmm10
+	vpaddd	three(%rip),%xmm1,%xmm11
+	vpaddd	four(%rip),%xmm1,%xmm12
+	vpaddd	five(%rip),%xmm1,%xmm13
+	vpaddd	six(%rip),%xmm1,%xmm14
+	vmovdqa	%xmm1,%xmm0
+
+	shrq	$3,%r8
+	je	L$128_enc_msg_x8_check_remainder
+
+	subq	$128,%rsi
+	subq	$128,%rdi
+
+L$128_enc_msg_x8_loop1:
+	addq	$128,%rsi
+	addq	$128,%rdi
+
+	vmovdqa	%xmm0,%xmm1
+	vmovdqa	%xmm9,%xmm2
+	vmovdqa	%xmm10,%xmm3
+	vmovdqa	%xmm11,%xmm4
+	vmovdqa	%xmm12,%xmm5
+	vmovdqa	%xmm13,%xmm6
+	vmovdqa	%xmm14,%xmm7
+
+	vmovdqu	(%rsp),%xmm8
+
+	vpxor	(%rcx),%xmm1,%xmm1
+	vpxor	(%rcx),%xmm2,%xmm2
+	vpxor	(%rcx),%xmm3,%xmm3
+	vpxor	(%rcx),%xmm4,%xmm4
+	vpxor	(%rcx),%xmm5,%xmm5
+	vpxor	(%rcx),%xmm6,%xmm6
+	vpxor	(%rcx),%xmm7,%xmm7
+	vpxor	(%rcx),%xmm8,%xmm8
+
+	vmovdqu	16(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	(%rsp),%xmm14
+	vpaddd	eight(%rip),%xmm14,%xmm14
+	vmovdqu	%xmm14,(%rsp)
+	vmovdqu	32(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpsubd	one(%rip),%xmm14,%xmm14
+	vmovdqu	48(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm0,%xmm0
+	vmovdqu	64(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm9,%xmm9
+	vmovdqu	80(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm10,%xmm10
+	vmovdqu	96(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm11,%xmm11
+	vmovdqu	112(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm12,%xmm12
+	vmovdqu	128(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm13,%xmm13
+	vmovdqu	144(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	160(%rcx),%xmm15
+	vaesenclast	%xmm15,%xmm1,%xmm1
+	vaesenclast	%xmm15,%xmm2,%xmm2
+	vaesenclast	%xmm15,%xmm3,%xmm3
+	vaesenclast	%xmm15,%xmm4,%xmm4
+	vaesenclast	%xmm15,%xmm5,%xmm5
+	vaesenclast	%xmm15,%xmm6,%xmm6
+	vaesenclast	%xmm15,%xmm7,%xmm7
+	vaesenclast	%xmm15,%xmm8,%xmm8
+
+
+
+	vpxor	0(%rdi),%xmm1,%xmm1
+	vpxor	16(%rdi),%xmm2,%xmm2
+	vpxor	32(%rdi),%xmm3,%xmm3
+	vpxor	48(%rdi),%xmm4,%xmm4
+	vpxor	64(%rdi),%xmm5,%xmm5
+	vpxor	80(%rdi),%xmm6,%xmm6
+	vpxor	96(%rdi),%xmm7,%xmm7
+	vpxor	112(%rdi),%xmm8,%xmm8
+
+	decq	%r8
+
+	vmovdqu	%xmm1,0(%rsi)
+	vmovdqu	%xmm2,16(%rsi)
+	vmovdqu	%xmm3,32(%rsi)
+	vmovdqu	%xmm4,48(%rsi)
+	vmovdqu	%xmm5,64(%rsi)
+	vmovdqu	%xmm6,80(%rsi)
+	vmovdqu	%xmm7,96(%rsi)
+	vmovdqu	%xmm8,112(%rsi)
+
+	jne	L$128_enc_msg_x8_loop1
+
+	addq	$128,%rsi
+	addq	$128,%rdi
+
+L$128_enc_msg_x8_check_remainder:
+	cmpq	$0,%r10
+	je	L$128_enc_msg_x8_out
+
+L$128_enc_msg_x8_loop2:
+
+
+	vmovdqa	%xmm0,%xmm1
+	vpaddd	one(%rip),%xmm0,%xmm0
+
+	vpxor	(%rcx),%xmm1,%xmm1
+	vaesenc	16(%rcx),%xmm1,%xmm1
+	vaesenc	32(%rcx),%xmm1,%xmm1
+	vaesenc	48(%rcx),%xmm1,%xmm1
+	vaesenc	64(%rcx),%xmm1,%xmm1
+	vaesenc	80(%rcx),%xmm1,%xmm1
+	vaesenc	96(%rcx),%xmm1,%xmm1
+	vaesenc	112(%rcx),%xmm1,%xmm1
+	vaesenc	128(%rcx),%xmm1,%xmm1
+	vaesenc	144(%rcx),%xmm1,%xmm1
+	vaesenclast	160(%rcx),%xmm1,%xmm1
+
+
+	vpxor	(%rdi),%xmm1,%xmm1
+
+	vmovdqu	%xmm1,(%rsi)
+
+	addq	$16,%rdi
+	addq	$16,%rsi
+
+	decq	%r10
+	jne	L$128_enc_msg_x8_loop2
+
+L$128_enc_msg_x8_out:
+	movq	%rbp,%rsp
+
+	popq	%rbp
+
+	popq	%r13
+
+	popq	%r12
+
+	ret
+
+
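+/* _aes128gcmsiv_dec: CTR-decrypts %r9 bytes (truncated to a multiple of 16)
+   from (%rdi) to (%rsi) while folding the resulting plaintext into POLYVAL.
+   %rdx points at the 16-byte POLYVAL state, with the initial counter block
+   at 16(%rdx) and scratch beyond; %rcx points at the power table and %r8 at
+   the AES-128 schedule. The main loop handles six blocks per iteration. */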
+.globl	_aes128gcmsiv_dec
+.private_extern _aes128gcmsiv_dec
+
+.p2align	4
+_aes128gcmsiv_dec:
+
+_CET_ENDBR
+	testq	$~15,%r9
+	jnz	L$128_dec_start
+	ret
+
+L$128_dec_start:
+	vzeroupper
+	vmovdqa	(%rdx),%xmm0
+
+
+	vmovdqu	16(%rdx),%xmm15
+	vpor	OR_MASK(%rip),%xmm15,%xmm15
+	movq	%rdx,%rax
+
+	leaq	32(%rax),%rax
+	leaq	32(%rcx),%rcx
+
+	andq	$~15,%r9
+
+
+	cmpq	$96,%r9
+	jb	L$128_dec_loop2
+
+
+	subq	$96,%r9
+	vmovdqa	%xmm15,%xmm7
+	vpaddd	one(%rip),%xmm7,%xmm8
+	vpaddd	two(%rip),%xmm7,%xmm9
+	vpaddd	one(%rip),%xmm9,%xmm10
+	vpaddd	two(%rip),%xmm9,%xmm11
+	vpaddd	one(%rip),%xmm11,%xmm12
+	vpaddd	two(%rip),%xmm11,%xmm15
+
+	vpxor	(%r8),%xmm7,%xmm7
+	vpxor	(%r8),%xmm8,%xmm8
+	vpxor	(%r8),%xmm9,%xmm9
+	vpxor	(%r8),%xmm10,%xmm10
+	vpxor	(%r8),%xmm11,%xmm11
+	vpxor	(%r8),%xmm12,%xmm12
+
+	vmovdqu	16(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	32(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	48(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	64(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	80(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	96(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	112(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	128(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	144(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	160(%r8),%xmm4
+	vaesenclast	%xmm4,%xmm7,%xmm7
+	vaesenclast	%xmm4,%xmm8,%xmm8
+	vaesenclast	%xmm4,%xmm9,%xmm9
+	vaesenclast	%xmm4,%xmm10,%xmm10
+	vaesenclast	%xmm4,%xmm11,%xmm11
+	vaesenclast	%xmm4,%xmm12,%xmm12
+
+
+	vpxor	0(%rdi),%xmm7,%xmm7
+	vpxor	16(%rdi),%xmm8,%xmm8
+	vpxor	32(%rdi),%xmm9,%xmm9
+	vpxor	48(%rdi),%xmm10,%xmm10
+	vpxor	64(%rdi),%xmm11,%xmm11
+	vpxor	80(%rdi),%xmm12,%xmm12
+
+	vmovdqu	%xmm7,0(%rsi)
+	vmovdqu	%xmm8,16(%rsi)
+	vmovdqu	%xmm9,32(%rsi)
+	vmovdqu	%xmm10,48(%rsi)
+	vmovdqu	%xmm11,64(%rsi)
+	vmovdqu	%xmm12,80(%rsi)
+
+	addq	$96,%rdi
+	addq	$96,%rsi
+	jmp	L$128_dec_loop1
+
+
+.p2align	6
+L$128_dec_loop1:
+	cmpq	$96,%r9
+	jb	L$128_dec_finish_96
+	subq	$96,%r9
+
+	vmovdqa	%xmm12,%xmm6
+	vmovdqa	%xmm11,16-32(%rax)
+	vmovdqa	%xmm10,32-32(%rax)
+	vmovdqa	%xmm9,48-32(%rax)
+	vmovdqa	%xmm8,64-32(%rax)
+	vmovdqa	%xmm7,80-32(%rax)
+
+	vmovdqa	%xmm15,%xmm7
+	vpaddd	one(%rip),%xmm7,%xmm8
+	vpaddd	two(%rip),%xmm7,%xmm9
+	vpaddd	one(%rip),%xmm9,%xmm10
+	vpaddd	two(%rip),%xmm9,%xmm11
+	vpaddd	one(%rip),%xmm11,%xmm12
+	vpaddd	two(%rip),%xmm11,%xmm15
+
+	vmovdqa	(%r8),%xmm4
+	vpxor	%xmm4,%xmm7,%xmm7
+	vpxor	%xmm4,%xmm8,%xmm8
+	vpxor	%xmm4,%xmm9,%xmm9
+	vpxor	%xmm4,%xmm10,%xmm10
+	vpxor	%xmm4,%xmm11,%xmm11
+	vpxor	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	0-32(%rcx),%xmm4
+	vpclmulqdq	$0x11,%xmm4,%xmm6,%xmm2
+	vpclmulqdq	$0x00,%xmm4,%xmm6,%xmm3
+	vpclmulqdq	$0x01,%xmm4,%xmm6,%xmm1
+	vpclmulqdq	$0x10,%xmm4,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	16(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	-16(%rax),%xmm6
+	vmovdqu	-16(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	32(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	0(%rax),%xmm6
+	vmovdqu	0(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	48(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	16(%rax),%xmm6
+	vmovdqu	16(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	64(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	32(%rax),%xmm6
+	vmovdqu	32(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	80(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	96(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	112(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+
+	vmovdqa	80-32(%rax),%xmm6
+	vpxor	%xmm0,%xmm6,%xmm6
+	vmovdqu	80-32(%rcx),%xmm5
+
+	vpclmulqdq	$0x01,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x10,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	128(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+
+	vpsrldq	$8,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm5
+	vpslldq	$8,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm0
+
+	vmovdqa	poly(%rip),%xmm3
+
+	vmovdqu	144(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	160(%r8),%xmm6
+	vpalignr	$8,%xmm0,%xmm0,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm2,%xmm0
+
+	vpxor	0(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm7,%xmm7
+	vpxor	16(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm8,%xmm8
+	vpxor	32(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm9,%xmm9
+	vpxor	48(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm10,%xmm10
+	vpxor	64(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm11,%xmm11
+	vpxor	80(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm12,%xmm12
+
+	vpalignr	$8,%xmm0,%xmm0,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm2,%xmm0
+
+	vmovdqu	%xmm7,0(%rsi)
+	vmovdqu	%xmm8,16(%rsi)
+	vmovdqu	%xmm9,32(%rsi)
+	vmovdqu	%xmm10,48(%rsi)
+	vmovdqu	%xmm11,64(%rsi)
+	vmovdqu	%xmm12,80(%rsi)
+
+	vpxor	%xmm5,%xmm0,%xmm0
+
+	leaq	96(%rdi),%rdi
+	leaq	96(%rsi),%rsi
+	jmp	L$128_dec_loop1
+
+L$128_dec_finish_96:
+	vmovdqa	%xmm12,%xmm6
+	vmovdqa	%xmm11,16-32(%rax)
+	vmovdqa	%xmm10,32-32(%rax)
+	vmovdqa	%xmm9,48-32(%rax)
+	vmovdqa	%xmm8,64-32(%rax)
+	vmovdqa	%xmm7,80-32(%rax)
+
+	vmovdqu	0-32(%rcx),%xmm4
+	vpclmulqdq	$0x10,%xmm4,%xmm6,%xmm1
+	vpclmulqdq	$0x11,%xmm4,%xmm6,%xmm2
+	vpclmulqdq	$0x00,%xmm4,%xmm6,%xmm3
+	vpclmulqdq	$0x01,%xmm4,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	-16(%rax),%xmm6
+	vmovdqu	-16(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	0(%rax),%xmm6
+	vmovdqu	0(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	16(%rax),%xmm6
+	vmovdqu	16(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	32(%rax),%xmm6
+	vmovdqu	32(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	80-32(%rax),%xmm6
+	vpxor	%xmm0,%xmm6,%xmm6
+	vmovdqu	80-32(%rcx),%xmm5
+	vpclmulqdq	$0x11,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x10,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x01,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vpsrldq	$8,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm5
+	vpslldq	$8,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm0
+
+	vmovdqa	poly(%rip),%xmm3
+
+	vpalignr	$8,%xmm0,%xmm0,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm2,%xmm0
+
+	vpalignr	$8,%xmm0,%xmm0,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm2,%xmm0
+
+	vpxor	%xmm5,%xmm0,%xmm0
+
+L$128_dec_loop2:
+
+
+
+	cmpq	$16,%r9
+	jb	L$128_dec_out
+	subq	$16,%r9
+
+	vmovdqa	%xmm15,%xmm2
+	vpaddd	one(%rip),%xmm15,%xmm15
+
+	vpxor	0(%r8),%xmm2,%xmm2
+	vaesenc	16(%r8),%xmm2,%xmm2
+	vaesenc	32(%r8),%xmm2,%xmm2
+	vaesenc	48(%r8),%xmm2,%xmm2
+	vaesenc	64(%r8),%xmm2,%xmm2
+	vaesenc	80(%r8),%xmm2,%xmm2
+	vaesenc	96(%r8),%xmm2,%xmm2
+	vaesenc	112(%r8),%xmm2,%xmm2
+	vaesenc	128(%r8),%xmm2,%xmm2
+	vaesenc	144(%r8),%xmm2,%xmm2
+	vaesenclast	160(%r8),%xmm2,%xmm2
+	vpxor	(%rdi),%xmm2,%xmm2
+	vmovdqu	%xmm2,(%rsi)
+	addq	$16,%rdi
+	addq	$16,%rsi
+
+	vpxor	%xmm2,%xmm0,%xmm0
+	vmovdqa	-32(%rcx),%xmm1
+	call	GFMUL
+
+	jmp	L$128_dec_loop2
+
+L$128_dec_out:
+	vmovdqu	%xmm0,(%rdx)
+	ret
+
+
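+/* _aes128gcmsiv_ecb_enc_block: encrypts the single block at (%rdi) with the
+   AES-128 schedule at (%rdx) and stores the result at (%rsi). */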
+.globl	_aes128gcmsiv_ecb_enc_block
+.private_extern _aes128gcmsiv_ecb_enc_block
+
+.p2align	4
+_aes128gcmsiv_ecb_enc_block:
+
+_CET_ENDBR
+	vmovdqa	(%rdi),%xmm1
+
+	vpxor	(%rdx),%xmm1,%xmm1
+	vaesenc	16(%rdx),%xmm1,%xmm1
+	vaesenc	32(%rdx),%xmm1,%xmm1
+	vaesenc	48(%rdx),%xmm1,%xmm1
+	vaesenc	64(%rdx),%xmm1,%xmm1
+	vaesenc	80(%rdx),%xmm1,%xmm1
+	vaesenc	96(%rdx),%xmm1,%xmm1
+	vaesenc	112(%rdx),%xmm1,%xmm1
+	vaesenc	128(%rdx),%xmm1,%xmm1
+	vaesenc	144(%rdx),%xmm1,%xmm1
+	vaesenclast	160(%rdx),%xmm1,%xmm1
+
+	vmovdqa	%xmm1,(%rsi)
+
+	ret
+
+
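+/* _aes256gcmsiv_aes_ks_enc_x1: AES-256 variant of
+   _aes128gcmsiv_aes_ks_enc_x1: expands the 32-byte key at (%rcx) into the
+   fifteen-round-key schedule at (%rdx) while encrypting the block at
+   (%rdi) into (%rsi). */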
+.globl	_aes256gcmsiv_aes_ks_enc_x1
+.private_extern _aes256gcmsiv_aes_ks_enc_x1
+
+.p2align	4
+_aes256gcmsiv_aes_ks_enc_x1:
+
+_CET_ENDBR
+	vmovdqa	con1(%rip),%xmm0
+	vmovdqa	mask(%rip),%xmm15
+	vmovdqa	(%rdi),%xmm8
+	vmovdqa	(%rcx),%xmm1
+	vmovdqa	16(%rcx),%xmm3
+	vpxor	%xmm1,%xmm8,%xmm8
+	vaesenc	%xmm3,%xmm8,%xmm8
+	vmovdqu	%xmm1,(%rdx)
+	vmovdqu	%xmm3,16(%rdx)
+	vpxor	%xmm14,%xmm14,%xmm14
+
+	vpshufb	%xmm15,%xmm3,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpslldq	$4,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vaesenc	%xmm1,%xmm8,%xmm8
+	vmovdqu	%xmm1,32(%rdx)
+
+	vpshufd	$0xff,%xmm1,%xmm2
+	vaesenclast	%xmm14,%xmm2,%xmm2
+	vpslldq	$4,%xmm3,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpxor	%xmm2,%xmm3,%xmm3
+	vaesenc	%xmm3,%xmm8,%xmm8
+	vmovdqu	%xmm3,48(%rdx)
+
+	vpshufb	%xmm15,%xmm3,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpslldq	$4,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vaesenc	%xmm1,%xmm8,%xmm8
+	vmovdqu	%xmm1,64(%rdx)
+
+	vpshufd	$0xff,%xmm1,%xmm2
+	vaesenclast	%xmm14,%xmm2,%xmm2
+	vpslldq	$4,%xmm3,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpxor	%xmm2,%xmm3,%xmm3
+	vaesenc	%xmm3,%xmm8,%xmm8
+	vmovdqu	%xmm3,80(%rdx)
+
+	vpshufb	%xmm15,%xmm3,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpslldq	$4,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vaesenc	%xmm1,%xmm8,%xmm8
+	vmovdqu	%xmm1,96(%rdx)
+
+	vpshufd	$0xff,%xmm1,%xmm2
+	vaesenclast	%xmm14,%xmm2,%xmm2
+	vpslldq	$4,%xmm3,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpxor	%xmm2,%xmm3,%xmm3
+	vaesenc	%xmm3,%xmm8,%xmm8
+	vmovdqu	%xmm3,112(%rdx)
+
+	vpshufb	%xmm15,%xmm3,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpslldq	$4,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vaesenc	%xmm1,%xmm8,%xmm8
+	vmovdqu	%xmm1,128(%rdx)
+
+	vpshufd	$0xff,%xmm1,%xmm2
+	vaesenclast	%xmm14,%xmm2,%xmm2
+	vpslldq	$4,%xmm3,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpxor	%xmm2,%xmm3,%xmm3
+	vaesenc	%xmm3,%xmm8,%xmm8
+	vmovdqu	%xmm3,144(%rdx)
+
+	vpshufb	%xmm15,%xmm3,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpslldq	$4,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vaesenc	%xmm1,%xmm8,%xmm8
+	vmovdqu	%xmm1,160(%rdx)
+
+	vpshufd	$0xff,%xmm1,%xmm2
+	vaesenclast	%xmm14,%xmm2,%xmm2
+	vpslldq	$4,%xmm3,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpxor	%xmm2,%xmm3,%xmm3
+	vaesenc	%xmm3,%xmm8,%xmm8
+	vmovdqu	%xmm3,176(%rdx)
+
+	vpshufb	%xmm15,%xmm3,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpslldq	$4,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vaesenc	%xmm1,%xmm8,%xmm8
+	vmovdqu	%xmm1,192(%rdx)
+
+	vpshufd	$0xff,%xmm1,%xmm2
+	vaesenclast	%xmm14,%xmm2,%xmm2
+	vpslldq	$4,%xmm3,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpxor	%xmm2,%xmm3,%xmm3
+	vaesenc	%xmm3,%xmm8,%xmm8
+	vmovdqu	%xmm3,208(%rdx)
+
+	vpshufb	%xmm15,%xmm3,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslldq	$4,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vaesenclast	%xmm1,%xmm8,%xmm8
+	vmovdqu	%xmm1,224(%rdx)
+
+	vmovdqa	%xmm8,(%rsi)
+	ret
+
+
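+/* _aes256gcmsiv_ecb_enc_block: single-block AES-256 encryption (fourteen
+   rounds): input at (%rdi), schedule at (%rdx), output at (%rsi). */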
+.globl	_aes256gcmsiv_ecb_enc_block
+.private_extern _aes256gcmsiv_ecb_enc_block
+
+.p2align	4
+_aes256gcmsiv_ecb_enc_block:
+
+_CET_ENDBR
+	vmovdqa	(%rdi),%xmm1
+	vpxor	(%rdx),%xmm1,%xmm1
+	vaesenc	16(%rdx),%xmm1,%xmm1
+	vaesenc	32(%rdx),%xmm1,%xmm1
+	vaesenc	48(%rdx),%xmm1,%xmm1
+	vaesenc	64(%rdx),%xmm1,%xmm1
+	vaesenc	80(%rdx),%xmm1,%xmm1
+	vaesenc	96(%rdx),%xmm1,%xmm1
+	vaesenc	112(%rdx),%xmm1,%xmm1
+	vaesenc	128(%rdx),%xmm1,%xmm1
+	vaesenc	144(%rdx),%xmm1,%xmm1
+	vaesenc	160(%rdx),%xmm1,%xmm1
+	vaesenc	176(%rdx),%xmm1,%xmm1
+	vaesenc	192(%rdx),%xmm1,%xmm1
+	vaesenc	208(%rdx),%xmm1,%xmm1
+	vaesenclast	224(%rdx),%xmm1,%xmm1
+	vmovdqa	%xmm1,(%rsi)
+	ret
+
+
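+/* _aes256gcmsiv_enc_msg_x4: AES-256 (fourteen-round) version of
+   _aes128gcmsiv_enc_msg_x4; note it first rounds a ragged %r8 up to a
+   whole number of blocks before splitting the work into groups of four. */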
+.globl	_aes256gcmsiv_enc_msg_x4
+.private_extern _aes256gcmsiv_enc_msg_x4
+
+.p2align	4
+_aes256gcmsiv_enc_msg_x4:
+
+_CET_ENDBR
+	testq	%r8,%r8
+	jnz	L$256_enc_msg_x4_start
+	ret
+
+L$256_enc_msg_x4_start:
+	movq	%r8,%r10
+	shrq	$4,%r8
+	shlq	$60,%r10
+	jz	L$256_enc_msg_x4_start2
+	addq	$1,%r8
+
+L$256_enc_msg_x4_start2:
+	movq	%r8,%r10
+	shlq	$62,%r10
+	shrq	$62,%r10
+
+
+	vmovdqa	(%rdx),%xmm15
+	vpor	OR_MASK(%rip),%xmm15,%xmm15
+
+	vmovdqa	four(%rip),%xmm4
+	vmovdqa	%xmm15,%xmm0
+	vpaddd	one(%rip),%xmm15,%xmm1
+	vpaddd	two(%rip),%xmm15,%xmm2
+	vpaddd	three(%rip),%xmm15,%xmm3
+
+	shrq	$2,%r8
+	je	L$256_enc_msg_x4_check_remainder
+
+	subq	$64,%rsi
+	subq	$64,%rdi
+
+L$256_enc_msg_x4_loop1:
+	addq	$64,%rsi
+	addq	$64,%rdi
+
+	vmovdqa	%xmm0,%xmm5
+	vmovdqa	%xmm1,%xmm6
+	vmovdqa	%xmm2,%xmm7
+	vmovdqa	%xmm3,%xmm8
+
+	vpxor	(%rcx),%xmm5,%xmm5
+	vpxor	(%rcx),%xmm6,%xmm6
+	vpxor	(%rcx),%xmm7,%xmm7
+	vpxor	(%rcx),%xmm8,%xmm8
+
+	vmovdqu	16(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vpaddd	%xmm4,%xmm0,%xmm0
+	vmovdqu	32(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vpaddd	%xmm4,%xmm1,%xmm1
+	vmovdqu	48(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vpaddd	%xmm4,%xmm2,%xmm2
+	vmovdqu	64(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vpaddd	%xmm4,%xmm3,%xmm3
+
+	vmovdqu	80(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	96(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	112(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	128(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	144(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	160(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	176(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	192(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	208(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	224(%rcx),%xmm12
+	vaesenclast	%xmm12,%xmm5,%xmm5
+	vaesenclast	%xmm12,%xmm6,%xmm6
+	vaesenclast	%xmm12,%xmm7,%xmm7
+	vaesenclast	%xmm12,%xmm8,%xmm8
+
+
+
+	vpxor	0(%rdi),%xmm5,%xmm5
+	vpxor	16(%rdi),%xmm6,%xmm6
+	vpxor	32(%rdi),%xmm7,%xmm7
+	vpxor	48(%rdi),%xmm8,%xmm8
+
+	subq	$1,%r8
+
+	vmovdqu	%xmm5,0(%rsi)
+	vmovdqu	%xmm6,16(%rsi)
+	vmovdqu	%xmm7,32(%rsi)
+	vmovdqu	%xmm8,48(%rsi)
+
+	jne	L$256_enc_msg_x4_loop1
+
+	addq	$64,%rsi
+	addq	$64,%rdi
+
+L$256_enc_msg_x4_check_remainder:
+	cmpq	$0,%r10
+	je	L$256_enc_msg_x4_out
+
+L$256_enc_msg_x4_loop2:
+
+
+
+	vmovdqa	%xmm0,%xmm5
+	vpaddd	one(%rip),%xmm0,%xmm0
+	vpxor	(%rcx),%xmm5,%xmm5
+	vaesenc	16(%rcx),%xmm5,%xmm5
+	vaesenc	32(%rcx),%xmm5,%xmm5
+	vaesenc	48(%rcx),%xmm5,%xmm5
+	vaesenc	64(%rcx),%xmm5,%xmm5
+	vaesenc	80(%rcx),%xmm5,%xmm5
+	vaesenc	96(%rcx),%xmm5,%xmm5
+	vaesenc	112(%rcx),%xmm5,%xmm5
+	vaesenc	128(%rcx),%xmm5,%xmm5
+	vaesenc	144(%rcx),%xmm5,%xmm5
+	vaesenc	160(%rcx),%xmm5,%xmm5
+	vaesenc	176(%rcx),%xmm5,%xmm5
+	vaesenc	192(%rcx),%xmm5,%xmm5
+	vaesenc	208(%rcx),%xmm5,%xmm5
+	vaesenclast	224(%rcx),%xmm5,%xmm5
+
+
+	vpxor	(%rdi),%xmm5,%xmm5
+
+	vmovdqu	%xmm5,(%rsi)
+
+	addq	$16,%rdi
+	addq	$16,%rsi
+
+	subq	$1,%r10
+	jne	L$256_enc_msg_x4_loop2
+
+L$256_enc_msg_x4_out:
+	ret
+
+
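+/* _aes256gcmsiv_enc_msg_x8: eight-wide AES-256 CTR encryption; the extra
+   counter lane is kept in a 64-byte-aligned slot carved just below %rsp
+   rather than in the frame the 128-bit version sets up. */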
+.globl	_aes256gcmsiv_enc_msg_x8
+.private_extern _aes256gcmsiv_enc_msg_x8
+
+.p2align	4
+_aes256gcmsiv_enc_msg_x8:
+
+_CET_ENDBR
+	testq	%r8,%r8
+	jnz	L$256_enc_msg_x8_start
+	ret
+
+L$256_enc_msg_x8_start:
+
+	movq	%rsp,%r11
+	subq	$16,%r11
+	andq	$-64,%r11
+
+	movq	%r8,%r10
+	shrq	$4,%r8
+	shlq	$60,%r10
+	jz	L$256_enc_msg_x8_start2
+	addq	$1,%r8
+
+L$256_enc_msg_x8_start2:
+	movq	%r8,%r10
+	shlq	$61,%r10
+	shrq	$61,%r10
+
+
+	vmovdqa	(%rdx),%xmm1
+	vpor	OR_MASK(%rip),%xmm1,%xmm1
+
+
+	vpaddd	seven(%rip),%xmm1,%xmm0
+	vmovdqa	%xmm0,(%r11)
+	vpaddd	one(%rip),%xmm1,%xmm9
+	vpaddd	two(%rip),%xmm1,%xmm10
+	vpaddd	three(%rip),%xmm1,%xmm11
+	vpaddd	four(%rip),%xmm1,%xmm12
+	vpaddd	five(%rip),%xmm1,%xmm13
+	vpaddd	six(%rip),%xmm1,%xmm14
+	vmovdqa	%xmm1,%xmm0
+
+	shrq	$3,%r8
+	jz	L$256_enc_msg_x8_check_remainder
+
+	subq	$128,%rsi
+	subq	$128,%rdi
+
+L$256_enc_msg_x8_loop1:
+	addq	$128,%rsi
+	addq	$128,%rdi
+
+	vmovdqa	%xmm0,%xmm1
+	vmovdqa	%xmm9,%xmm2
+	vmovdqa	%xmm10,%xmm3
+	vmovdqa	%xmm11,%xmm4
+	vmovdqa	%xmm12,%xmm5
+	vmovdqa	%xmm13,%xmm6
+	vmovdqa	%xmm14,%xmm7
+
+	vmovdqa	(%r11),%xmm8
+
+	vpxor	(%rcx),%xmm1,%xmm1
+	vpxor	(%rcx),%xmm2,%xmm2
+	vpxor	(%rcx),%xmm3,%xmm3
+	vpxor	(%rcx),%xmm4,%xmm4
+	vpxor	(%rcx),%xmm5,%xmm5
+	vpxor	(%rcx),%xmm6,%xmm6
+	vpxor	(%rcx),%xmm7,%xmm7
+	vpxor	(%rcx),%xmm8,%xmm8
+
+	vmovdqu	16(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vmovdqa	(%r11),%xmm14
+	vpaddd	eight(%rip),%xmm14,%xmm14
+	vmovdqa	%xmm14,(%r11)
+	vmovdqu	32(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpsubd	one(%rip),%xmm14,%xmm14
+	vmovdqu	48(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm0,%xmm0
+	vmovdqu	64(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm9,%xmm9
+	vmovdqu	80(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm10,%xmm10
+	vmovdqu	96(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm11,%xmm11
+	vmovdqu	112(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm12,%xmm12
+	vmovdqu	128(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm13,%xmm13
+	vmovdqu	144(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	160(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	176(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	192(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	208(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	224(%rcx),%xmm15
+	vaesenclast	%xmm15,%xmm1,%xmm1
+	vaesenclast	%xmm15,%xmm2,%xmm2
+	vaesenclast	%xmm15,%xmm3,%xmm3
+	vaesenclast	%xmm15,%xmm4,%xmm4
+	vaesenclast	%xmm15,%xmm5,%xmm5
+	vaesenclast	%xmm15,%xmm6,%xmm6
+	vaesenclast	%xmm15,%xmm7,%xmm7
+	vaesenclast	%xmm15,%xmm8,%xmm8
+
+
+
+	vpxor	0(%rdi),%xmm1,%xmm1
+	vpxor	16(%rdi),%xmm2,%xmm2
+	vpxor	32(%rdi),%xmm3,%xmm3
+	vpxor	48(%rdi),%xmm4,%xmm4
+	vpxor	64(%rdi),%xmm5,%xmm5
+	vpxor	80(%rdi),%xmm6,%xmm6
+	vpxor	96(%rdi),%xmm7,%xmm7
+	vpxor	112(%rdi),%xmm8,%xmm8
+
+	subq	$1,%r8
+
+	vmovdqu	%xmm1,0(%rsi)
+	vmovdqu	%xmm2,16(%rsi)
+	vmovdqu	%xmm3,32(%rsi)
+	vmovdqu	%xmm4,48(%rsi)
+	vmovdqu	%xmm5,64(%rsi)
+	vmovdqu	%xmm6,80(%rsi)
+	vmovdqu	%xmm7,96(%rsi)
+	vmovdqu	%xmm8,112(%rsi)
+
+	jne	L$256_enc_msg_x8_loop1
+
+	addq	$128,%rsi
+	addq	$128,%rdi
+
+L$256_enc_msg_x8_check_remainder:
+	cmpq	$0,%r10
+	je	L$256_enc_msg_x8_out
+
+L$256_enc_msg_x8_loop2:
+
+
+	vmovdqa	%xmm0,%xmm1
+	vpaddd	one(%rip),%xmm0,%xmm0
+
+	vpxor	(%rcx),%xmm1,%xmm1
+	vaesenc	16(%rcx),%xmm1,%xmm1
+	vaesenc	32(%rcx),%xmm1,%xmm1
+	vaesenc	48(%rcx),%xmm1,%xmm1
+	vaesenc	64(%rcx),%xmm1,%xmm1
+	vaesenc	80(%rcx),%xmm1,%xmm1
+	vaesenc	96(%rcx),%xmm1,%xmm1
+	vaesenc	112(%rcx),%xmm1,%xmm1
+	vaesenc	128(%rcx),%xmm1,%xmm1
+	vaesenc	144(%rcx),%xmm1,%xmm1
+	vaesenc	160(%rcx),%xmm1,%xmm1
+	vaesenc	176(%rcx),%xmm1,%xmm1
+	vaesenc	192(%rcx),%xmm1,%xmm1
+	vaesenc	208(%rcx),%xmm1,%xmm1
+	vaesenclast	224(%rcx),%xmm1,%xmm1
+
+
+	vpxor	(%rdi),%xmm1,%xmm1
+
+	vmovdqu	%xmm1,(%rsi)
+
+	addq	$16,%rdi
+	addq	$16,%rsi
+	subq	$1,%r10
+	jnz	L$256_enc_msg_x8_loop2
+
+L$256_enc_msg_x8_out:
+	ret
+
+
+
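+/* _aes256gcmsiv_dec: AES-256 (fourteen-round) version of _aes128gcmsiv_dec,
+   with the same register convention: %rdi/%rsi in/out, %r9 length, %rdx
+   POLYVAL state + counter, %rcx power table, %r8 key schedule. */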
+.globl	_aes256gcmsiv_dec
+.private_extern _aes256gcmsiv_dec
+
+.p2align	4
+_aes256gcmsiv_dec:
+
+_CET_ENDBR
+	testq	$~15,%r9
+	jnz	L$256_dec_start
+	ret
+
+L$256_dec_start:
+	vzeroupper
+	vmovdqa	(%rdx),%xmm0
+
+
+	vmovdqu	16(%rdx),%xmm15
+	vpor	OR_MASK(%rip),%xmm15,%xmm15
+	movq	%rdx,%rax
+
+	leaq	32(%rax),%rax
+	leaq	32(%rcx),%rcx
+
+	andq	$~15,%r9
+
+
+	cmpq	$96,%r9
+	jb	L$256_dec_loop2
+
+
+	subq	$96,%r9
+	vmovdqa	%xmm15,%xmm7
+	vpaddd	one(%rip),%xmm7,%xmm8
+	vpaddd	two(%rip),%xmm7,%xmm9
+	vpaddd	one(%rip),%xmm9,%xmm10
+	vpaddd	two(%rip),%xmm9,%xmm11
+	vpaddd	one(%rip),%xmm11,%xmm12
+	vpaddd	two(%rip),%xmm11,%xmm15
+
+	vpxor	(%r8),%xmm7,%xmm7
+	vpxor	(%r8),%xmm8,%xmm8
+	vpxor	(%r8),%xmm9,%xmm9
+	vpxor	(%r8),%xmm10,%xmm10
+	vpxor	(%r8),%xmm11,%xmm11
+	vpxor	(%r8),%xmm12,%xmm12
+
+	vmovdqu	16(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	32(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	48(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	64(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	80(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	96(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	112(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	128(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	144(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	160(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	176(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	192(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	208(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	224(%r8),%xmm4
+	vaesenclast	%xmm4,%xmm7,%xmm7
+	vaesenclast	%xmm4,%xmm8,%xmm8
+	vaesenclast	%xmm4,%xmm9,%xmm9
+	vaesenclast	%xmm4,%xmm10,%xmm10
+	vaesenclast	%xmm4,%xmm11,%xmm11
+	vaesenclast	%xmm4,%xmm12,%xmm12
+
+
+	vpxor	0(%rdi),%xmm7,%xmm7
+	vpxor	16(%rdi),%xmm8,%xmm8
+	vpxor	32(%rdi),%xmm9,%xmm9
+	vpxor	48(%rdi),%xmm10,%xmm10
+	vpxor	64(%rdi),%xmm11,%xmm11
+	vpxor	80(%rdi),%xmm12,%xmm12
+
+	vmovdqu	%xmm7,0(%rsi)
+	vmovdqu	%xmm8,16(%rsi)
+	vmovdqu	%xmm9,32(%rsi)
+	vmovdqu	%xmm10,48(%rsi)
+	vmovdqu	%xmm11,64(%rsi)
+	vmovdqu	%xmm12,80(%rsi)
+
+	addq	$96,%rdi
+	addq	$96,%rsi
+	jmp	L$256_dec_loop1
+
+
+.p2align	6
+L$256_dec_loop1:
+	cmpq	$96,%r9
+	jb	L$256_dec_finish_96
+	subq	$96,%r9
+
+	vmovdqa	%xmm12,%xmm6
+	vmovdqa	%xmm11,16-32(%rax)
+	vmovdqa	%xmm10,32-32(%rax)
+	vmovdqa	%xmm9,48-32(%rax)
+	vmovdqa	%xmm8,64-32(%rax)
+	vmovdqa	%xmm7,80-32(%rax)
+
+	vmovdqa	%xmm15,%xmm7
+	vpaddd	one(%rip),%xmm7,%xmm8
+	vpaddd	two(%rip),%xmm7,%xmm9
+	vpaddd	one(%rip),%xmm9,%xmm10
+	vpaddd	two(%rip),%xmm9,%xmm11
+	vpaddd	one(%rip),%xmm11,%xmm12
+	vpaddd	two(%rip),%xmm11,%xmm15
+
+	vmovdqa	(%r8),%xmm4
+	vpxor	%xmm4,%xmm7,%xmm7
+	vpxor	%xmm4,%xmm8,%xmm8
+	vpxor	%xmm4,%xmm9,%xmm9
+	vpxor	%xmm4,%xmm10,%xmm10
+	vpxor	%xmm4,%xmm11,%xmm11
+	vpxor	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	0-32(%rcx),%xmm4
+	vpclmulqdq	$0x11,%xmm4,%xmm6,%xmm2
+	vpclmulqdq	$0x00,%xmm4,%xmm6,%xmm3
+	vpclmulqdq	$0x01,%xmm4,%xmm6,%xmm1
+	vpclmulqdq	$0x10,%xmm4,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	16(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	-16(%rax),%xmm6
+	vmovdqu	-16(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	32(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	0(%rax),%xmm6
+	vmovdqu	0(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	48(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	16(%rax),%xmm6
+	vmovdqu	16(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	64(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	32(%rax),%xmm6
+	vmovdqu	32(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	80(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	96(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	112(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+
+	vmovdqa	80-32(%rax),%xmm6
+	vpxor	%xmm0,%xmm6,%xmm6
+	vmovdqu	80-32(%rcx),%xmm5
+
+	vpclmulqdq	$0x01,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x10,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	128(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+
+	vpsrldq	$8,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm5
+	vpslldq	$8,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm0
+
+	vmovdqa	poly(%rip),%xmm3
+
+	vmovdqu	144(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	160(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	176(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	192(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	208(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	224(%r8),%xmm6
+	vpalignr	$8,%xmm0,%xmm0,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm2,%xmm0
+
+	vpxor	0(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm7,%xmm7
+	vpxor	16(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm8,%xmm8
+	vpxor	32(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm9,%xmm9
+	vpxor	48(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm10,%xmm10
+	vpxor	64(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm11,%xmm11
+	vpxor	80(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm12,%xmm12
+
+	vpalignr	$8,%xmm0,%xmm0,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm2,%xmm0
+
+	vmovdqu	%xmm7,0(%rsi)
+	vmovdqu	%xmm8,16(%rsi)
+	vmovdqu	%xmm9,32(%rsi)
+	vmovdqu	%xmm10,48(%rsi)
+	vmovdqu	%xmm11,64(%rsi)
+	vmovdqu	%xmm12,80(%rsi)
+
+	vpxor	%xmm5,%xmm0,%xmm0
+
+	leaq	96(%rdi),%rdi
+	leaq	96(%rsi),%rsi
+	jmp	L$256_dec_loop1
+
+L$256_dec_finish_96:
+	vmovdqa	%xmm12,%xmm6
+	vmovdqa	%xmm11,16-32(%rax)
+	vmovdqa	%xmm10,32-32(%rax)
+	vmovdqa	%xmm9,48-32(%rax)
+	vmovdqa	%xmm8,64-32(%rax)
+	vmovdqa	%xmm7,80-32(%rax)
+
+	vmovdqu	0-32(%rcx),%xmm4
+	vpclmulqdq	$0x10,%xmm4,%xmm6,%xmm1
+	vpclmulqdq	$0x11,%xmm4,%xmm6,%xmm2
+	vpclmulqdq	$0x00,%xmm4,%xmm6,%xmm3
+	vpclmulqdq	$0x01,%xmm4,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	-16(%rax),%xmm6
+	vmovdqu	-16(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	0(%rax),%xmm6
+	vmovdqu	0(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	16(%rax),%xmm6
+	vmovdqu	16(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	32(%rax),%xmm6
+	vmovdqu	32(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	80-32(%rax),%xmm6
+	vpxor	%xmm0,%xmm6,%xmm6
+	vmovdqu	80-32(%rcx),%xmm5
+	vpclmulqdq	$0x11,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x10,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x01,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vpsrldq	$8,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm5
+	vpslldq	$8,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm0
+
+	vmovdqa	poly(%rip),%xmm3
+
+	vpalignr	$8,%xmm0,%xmm0,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm2,%xmm0
+
+	vpalignr	$8,%xmm0,%xmm0,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm2,%xmm0
+
+	vpxor	%xmm5,%xmm0,%xmm0
+
+L$256_dec_loop2:
+
+
+
+	cmpq	$16,%r9
+	jb	L$256_dec_out
+	subq	$16,%r9
+
+	vmovdqa	%xmm15,%xmm2
+	vpaddd	one(%rip),%xmm15,%xmm15
+
+	vpxor	0(%r8),%xmm2,%xmm2
+	vaesenc	16(%r8),%xmm2,%xmm2
+	vaesenc	32(%r8),%xmm2,%xmm2
+	vaesenc	48(%r8),%xmm2,%xmm2
+	vaesenc	64(%r8),%xmm2,%xmm2
+	vaesenc	80(%r8),%xmm2,%xmm2
+	vaesenc	96(%r8),%xmm2,%xmm2
+	vaesenc	112(%r8),%xmm2,%xmm2
+	vaesenc	128(%r8),%xmm2,%xmm2
+	vaesenc	144(%r8),%xmm2,%xmm2
+	vaesenc	160(%r8),%xmm2,%xmm2
+	vaesenc	176(%r8),%xmm2,%xmm2
+	vaesenc	192(%r8),%xmm2,%xmm2
+	vaesenc	208(%r8),%xmm2,%xmm2
+	vaesenclast	224(%r8),%xmm2,%xmm2
+	vpxor	(%rdi),%xmm2,%xmm2
+	vmovdqu	%xmm2,(%rsi)
+	addq	$16,%rdi
+	addq	$16,%rsi
+
+	vpxor	%xmm2,%xmm0,%xmm0
+	vmovdqa	-32(%rcx),%xmm1
+	call	GFMUL
+
+	jmp	L$256_dec_loop2
+
+L$256_dec_out:
+	vmovdqu	%xmm0,(%rdx)
+	ret
+
+
+.globl	_aes256gcmsiv_kdf
+.private_extern _aes256gcmsiv_kdf
+
+.p2align	4
+_aes256gcmsiv_kdf:
+
+_CET_ENDBR
+
+
+
+
+	vmovdqa	(%rdx),%xmm1
+	vmovdqa	0(%rdi),%xmm4
+	vmovdqa	and_mask(%rip),%xmm11
+	vmovdqa	one(%rip),%xmm8
+	vpshufd	$0x90,%xmm4,%xmm4
+	vpand	%xmm11,%xmm4,%xmm4
+	vpaddd	%xmm8,%xmm4,%xmm6
+	vpaddd	%xmm8,%xmm6,%xmm7
+	vpaddd	%xmm8,%xmm7,%xmm11
+	vpaddd	%xmm8,%xmm11,%xmm12
+	vpaddd	%xmm8,%xmm12,%xmm13
+
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpxor	%xmm1,%xmm6,%xmm6
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm1,%xmm11,%xmm11
+	vpxor	%xmm1,%xmm12,%xmm12
+	vpxor	%xmm1,%xmm13,%xmm13
+
+	vmovdqa	16(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vaesenc	%xmm1,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+
+	vmovdqa	32(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm4,%xmm4
+	vaesenc	%xmm2,%xmm6,%xmm6
+	vaesenc	%xmm2,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vaesenc	%xmm2,%xmm13,%xmm13
+
+	vmovdqa	48(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vaesenc	%xmm1,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+
+	vmovdqa	64(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm4,%xmm4
+	vaesenc	%xmm2,%xmm6,%xmm6
+	vaesenc	%xmm2,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vaesenc	%xmm2,%xmm13,%xmm13
+
+	vmovdqa	80(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vaesenc	%xmm1,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+
+	vmovdqa	96(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm4,%xmm4
+	vaesenc	%xmm2,%xmm6,%xmm6
+	vaesenc	%xmm2,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vaesenc	%xmm2,%xmm13,%xmm13
+
+	vmovdqa	112(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vaesenc	%xmm1,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+
+	vmovdqa	128(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm4,%xmm4
+	vaesenc	%xmm2,%xmm6,%xmm6
+	vaesenc	%xmm2,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vaesenc	%xmm2,%xmm13,%xmm13
+
+	vmovdqa	144(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vaesenc	%xmm1,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+
+	vmovdqa	160(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm4,%xmm4
+	vaesenc	%xmm2,%xmm6,%xmm6
+	vaesenc	%xmm2,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vaesenc	%xmm2,%xmm13,%xmm13
+
+	vmovdqa	176(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vaesenc	%xmm1,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+
+	vmovdqa	192(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm4,%xmm4
+	vaesenc	%xmm2,%xmm6,%xmm6
+	vaesenc	%xmm2,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vaesenc	%xmm2,%xmm13,%xmm13
+
+	vmovdqa	208(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vaesenc	%xmm1,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+
+	vmovdqa	224(%rdx),%xmm2
+	vaesenclast	%xmm2,%xmm4,%xmm4
+	vaesenclast	%xmm2,%xmm6,%xmm6
+	vaesenclast	%xmm2,%xmm7,%xmm7
+	vaesenclast	%xmm2,%xmm11,%xmm11
+	vaesenclast	%xmm2,%xmm12,%xmm12
+	vaesenclast	%xmm2,%xmm13,%xmm13
+
+
+	vmovdqa	%xmm4,0(%rsi)
+	vmovdqa	%xmm6,16(%rsi)
+	vmovdqa	%xmm7,32(%rsi)
+	vmovdqa	%xmm11,48(%rsi)
+	vmovdqa	%xmm12,64(%rsi)
+	vmovdqa	%xmm13,80(%rsi)
+	ret
+
+
+#endif
diff --git a/gen/crypto/aes128gcmsiv-x86_64-linux.S b/gen/crypto/aes128gcmsiv-x86_64-linux.S
new file mode 100644
index 0000000..a8de4a9
--- /dev/null
+++ b/gen/crypto/aes128gcmsiv-x86_64-linux.S
@@ -0,0 +1,3091 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.section	.rodata
+
+.align	16
+one:
+.quad	1,0
+two:
+.quad	2,0
+three:
+.quad	3,0
+four:
+.quad	4,0
+five:
+.quad	5,0
+six:
+.quad	6,0
+seven:
+.quad	7,0
+eight:
+.quad	8,0
+
+OR_MASK:
+.long	0x00000000,0x00000000,0x00000000,0x80000000
+poly:
+.quad	0x1, 0xc200000000000000
+mask:
+.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+con1:
+.long	1,1,1,1
+con2:
+.long	0x1b,0x1b,0x1b,0x1b
+con3:
+.byte	-1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7
+and_mask:
+.long	0,0xffffffff, 0xffffffff, 0xffffffff
+.text	
+.type	GFMUL,@function
+.align	16
+GFMUL:
+.cfi_startproc	
+	vpclmulqdq	$0x00,%xmm1,%xmm0,%xmm2
+	vpclmulqdq	$0x11,%xmm1,%xmm0,%xmm5
+	vpclmulqdq	$0x10,%xmm1,%xmm0,%xmm3
+	vpclmulqdq	$0x01,%xmm1,%xmm0,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$8,%xmm3,%xmm4
+	vpsrldq	$8,%xmm3,%xmm3
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpxor	%xmm3,%xmm5,%xmm5
+
+	vpclmulqdq	$0x10,poly(%rip),%xmm2,%xmm3
+	vpshufd	$78,%xmm2,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm2
+
+	vpclmulqdq	$0x10,poly(%rip),%xmm2,%xmm3
+	vpshufd	$78,%xmm2,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm2
+
+	vpxor	%xmm5,%xmm2,%xmm0
+	ret
+.cfi_endproc	
+.size	GFMUL, .-GFMUL
+.globl	aesgcmsiv_htable_init
+.hidden aesgcmsiv_htable_init
+.type	aesgcmsiv_htable_init,@function
+.align	16
+aesgcmsiv_htable_init:
+.cfi_startproc	
+_CET_ENDBR
+	vmovdqa	(%rsi),%xmm0
+	vmovdqa	%xmm0,%xmm1
+	vmovdqa	%xmm0,(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,16(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,32(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,48(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,64(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,80(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,96(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,112(%rdi)
+	ret
+.cfi_endproc	
+.size	aesgcmsiv_htable_init, .-aesgcmsiv_htable_init
+.globl	aesgcmsiv_htable6_init
+.hidden aesgcmsiv_htable6_init
+.type	aesgcmsiv_htable6_init,@function
+.align	16
+aesgcmsiv_htable6_init:
+.cfi_startproc	
+_CET_ENDBR
+	vmovdqa	(%rsi),%xmm0
+	vmovdqa	%xmm0,%xmm1
+	vmovdqa	%xmm0,(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,16(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,32(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,48(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,64(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,80(%rdi)
+	ret
+.cfi_endproc	
+.size	aesgcmsiv_htable6_init, .-aesgcmsiv_htable6_init
+.globl	aesgcmsiv_htable_polyval
+.hidden aesgcmsiv_htable_polyval
+.type	aesgcmsiv_htable_polyval,@function
+.align	16
+aesgcmsiv_htable_polyval:
+.cfi_startproc	
+_CET_ENDBR
+	testq	%rdx,%rdx
+	jnz	.Lhtable_polyval_start
+	ret
+
+.Lhtable_polyval_start:
+	vzeroall
+
+
+
+	movq	%rdx,%r11
+	andq	$127,%r11
+
+	jz	.Lhtable_polyval_no_prefix
+
+	vpxor	%xmm9,%xmm9,%xmm9
+	vmovdqa	(%rcx),%xmm1
+	subq	%r11,%rdx
+
+	subq	$16,%r11
+
+
+	vmovdqu	(%rsi),%xmm0
+	vpxor	%xmm1,%xmm0,%xmm0
+
+	vpclmulqdq	$0x01,(%rdi,%r11,1),%xmm0,%xmm5
+	vpclmulqdq	$0x00,(%rdi,%r11,1),%xmm0,%xmm3
+	vpclmulqdq	$0x11,(%rdi,%r11,1),%xmm0,%xmm4
+	vpclmulqdq	$0x10,(%rdi,%r11,1),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+	leaq	16(%rsi),%rsi
+	testq	%r11,%r11
+	jnz	.Lhtable_polyval_prefix_loop
+	jmp	.Lhtable_polyval_prefix_complete
+
+
+.align	64
+.Lhtable_polyval_prefix_loop:
+	subq	$16,%r11
+
+	vmovdqu	(%rsi),%xmm0
+
+	vpclmulqdq	$0x00,(%rdi,%r11,1),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm3,%xmm3
+	vpclmulqdq	$0x11,(%rdi,%r11,1),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm4,%xmm4
+	vpclmulqdq	$0x01,(%rdi,%r11,1),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x10,(%rdi,%r11,1),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+	testq	%r11,%r11
+
+	leaq	16(%rsi),%rsi
+
+	jnz	.Lhtable_polyval_prefix_loop
+
+.Lhtable_polyval_prefix_complete:
+	vpsrldq	$8,%xmm5,%xmm6
+	vpslldq	$8,%xmm5,%xmm5
+
+	vpxor	%xmm6,%xmm4,%xmm9
+	vpxor	%xmm5,%xmm3,%xmm1
+
+	jmp	.Lhtable_polyval_main_loop
+
+.Lhtable_polyval_no_prefix:
+
+
+
+
+	vpxor	%xmm1,%xmm1,%xmm1
+	vmovdqa	(%rcx),%xmm9
+
+.align	64
+.Lhtable_polyval_main_loop:
+	subq	$0x80,%rdx
+	jb	.Lhtable_polyval_out
+
+	vmovdqu	112(%rsi),%xmm0
+
+	vpclmulqdq	$0x01,(%rdi),%xmm0,%xmm5
+	vpclmulqdq	$0x00,(%rdi),%xmm0,%xmm3
+	vpclmulqdq	$0x11,(%rdi),%xmm0,%xmm4
+	vpclmulqdq	$0x10,(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+
+	vmovdqu	96(%rsi),%xmm0
+	vpclmulqdq	$0x01,16(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x00,16(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm3,%xmm3
+	vpclmulqdq	$0x11,16(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm4,%xmm4
+	vpclmulqdq	$0x10,16(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+
+
+	vmovdqu	80(%rsi),%xmm0
+
+	vpclmulqdq	$0x10,poly(%rip),%xmm1,%xmm7
+	vpalignr	$8,%xmm1,%xmm1,%xmm1
+
+	vpclmulqdq	$0x01,32(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x00,32(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm3,%xmm3
+	vpclmulqdq	$0x11,32(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm4,%xmm4
+	vpclmulqdq	$0x10,32(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+
+	vpxor	%xmm7,%xmm1,%xmm1
+
+	vmovdqu	64(%rsi),%xmm0
+
+	vpclmulqdq	$0x01,48(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x00,48(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm3,%xmm3
+	vpclmulqdq	$0x11,48(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm4,%xmm4
+	vpclmulqdq	$0x10,48(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+
+	vmovdqu	48(%rsi),%xmm0
+
+	vpclmulqdq	$0x10,poly(%rip),%xmm1,%xmm7
+	vpalignr	$8,%xmm1,%xmm1,%xmm1
+
+	vpclmulqdq	$0x01,64(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x00,64(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm3,%xmm3
+	vpclmulqdq	$0x11,64(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm4,%xmm4
+	vpclmulqdq	$0x10,64(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+
+	vpxor	%xmm7,%xmm1,%xmm1
+
+	vmovdqu	32(%rsi),%xmm0
+
+	vpclmulqdq	$0x01,80(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x00,80(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm3,%xmm3
+	vpclmulqdq	$0x11,80(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm4,%xmm4
+	vpclmulqdq	$0x10,80(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+
+	vpxor	%xmm9,%xmm1,%xmm1
+
+	vmovdqu	16(%rsi),%xmm0
+
+	vpclmulqdq	$0x01,96(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x00,96(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm3,%xmm3
+	vpclmulqdq	$0x11,96(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm4,%xmm4
+	vpclmulqdq	$0x10,96(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+
+	vmovdqu	0(%rsi),%xmm0
+	vpxor	%xmm1,%xmm0,%xmm0
+
+	vpclmulqdq	$0x01,112(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x00,112(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm3,%xmm3
+	vpclmulqdq	$0x11,112(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm4,%xmm4
+	vpclmulqdq	$0x10,112(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+
+	vpsrldq	$8,%xmm5,%xmm6
+	vpslldq	$8,%xmm5,%xmm5
+
+	vpxor	%xmm6,%xmm4,%xmm9
+	vpxor	%xmm5,%xmm3,%xmm1
+
+	leaq	128(%rsi),%rsi
+	jmp	.Lhtable_polyval_main_loop
+
+
+
+.Lhtable_polyval_out:
+	vpclmulqdq	$0x10,poly(%rip),%xmm1,%xmm6
+	vpalignr	$8,%xmm1,%xmm1,%xmm1
+	vpxor	%xmm6,%xmm1,%xmm1
+
+	vpclmulqdq	$0x10,poly(%rip),%xmm1,%xmm6
+	vpalignr	$8,%xmm1,%xmm1,%xmm1
+	vpxor	%xmm6,%xmm1,%xmm1
+	vpxor	%xmm9,%xmm1,%xmm1
+
+	vmovdqu	%xmm1,(%rcx)
+	vzeroupper
+	ret
+.cfi_endproc	
+.size	aesgcmsiv_htable_polyval,.-aesgcmsiv_htable_polyval
+.globl	aesgcmsiv_polyval_horner
+.hidden aesgcmsiv_polyval_horner
+.type	aesgcmsiv_polyval_horner,@function
+.align	16
+aesgcmsiv_polyval_horner:
+.cfi_startproc	
+_CET_ENDBR
+	testq	%rcx,%rcx
+	jnz	.Lpolyval_horner_start
+	ret
+
+.Lpolyval_horner_start:
+
+
+
+	xorq	%r10,%r10
+	shlq	$4,%rcx
+
+	vmovdqa	(%rsi),%xmm1
+	vmovdqa	(%rdi),%xmm0
+
+.Lpolyval_horner_loop:
+	vpxor	(%rdx,%r10,1),%xmm0,%xmm0
+	call	GFMUL
+
+	addq	$16,%r10
+	cmpq	%r10,%rcx
+	jne	.Lpolyval_horner_loop
+
+
+	vmovdqa	%xmm0,(%rdi)
+	ret
+.cfi_endproc	
+.size	aesgcmsiv_polyval_horner,.-aesgcmsiv_polyval_horner
+.globl	aes128gcmsiv_aes_ks
+.hidden aes128gcmsiv_aes_ks
+.type	aes128gcmsiv_aes_ks,@function
+.align	16
+aes128gcmsiv_aes_ks:
+.cfi_startproc	
+_CET_ENDBR
+	vmovdqu	(%rdi),%xmm1
+	vmovdqa	%xmm1,(%rsi)
+
+	vmovdqa	con1(%rip),%xmm0
+	vmovdqa	mask(%rip),%xmm15
+
+	movq	$8,%rax
+
+.Lks128_loop:
+	addq	$16,%rsi
+	subq	$1,%rax
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpslldq	$4,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpslldq	$4,%xmm3,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpslldq	$4,%xmm3,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vmovdqa	%xmm1,(%rsi)
+	jne	.Lks128_loop
+
+	vmovdqa	con2(%rip),%xmm0
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpslldq	$4,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpslldq	$4,%xmm3,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpslldq	$4,%xmm3,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vmovdqa	%xmm1,16(%rsi)
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslldq	$4,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpslldq	$4,%xmm3,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpslldq	$4,%xmm3,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vmovdqa	%xmm1,32(%rsi)
+	ret
+.cfi_endproc	
+.size	aes128gcmsiv_aes_ks,.-aes128gcmsiv_aes_ks
+.globl	aes256gcmsiv_aes_ks
+.hidden aes256gcmsiv_aes_ks
+.type	aes256gcmsiv_aes_ks,@function
+.align	16
+aes256gcmsiv_aes_ks:
+.cfi_startproc	
+_CET_ENDBR
+	vmovdqu	(%rdi),%xmm1
+	vmovdqu	16(%rdi),%xmm3
+	vmovdqa	%xmm1,(%rsi)
+	vmovdqa	%xmm3,16(%rsi)
+	vmovdqa	con1(%rip),%xmm0
+	vmovdqa	mask(%rip),%xmm15
+	vpxor	%xmm14,%xmm14,%xmm14
+	movq	$6,%rax
+
+.Lks256_loop:
+	addq	$32,%rsi
+	subq	$1,%rax
+	vpshufb	%xmm15,%xmm3,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vmovdqa	%xmm1,(%rsi)
+	vpshufd	$0xff,%xmm1,%xmm2
+	vaesenclast	%xmm14,%xmm2,%xmm2
+	vpsllq	$32,%xmm3,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpshufb	con3(%rip),%xmm3,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpxor	%xmm2,%xmm3,%xmm3
+	vmovdqa	%xmm3,16(%rsi)
+	jne	.Lks256_loop
+
+	vpshufb	%xmm15,%xmm3,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpsllq	$32,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vmovdqa	%xmm1,32(%rsi)
+	ret
+.cfi_endproc	
+.globl	aes128gcmsiv_aes_ks_enc_x1
+.hidden aes128gcmsiv_aes_ks_enc_x1
+.type	aes128gcmsiv_aes_ks_enc_x1,@function
+.align	16
+aes128gcmsiv_aes_ks_enc_x1:
+.cfi_startproc	
+_CET_ENDBR
+	vmovdqa	(%rcx),%xmm1
+	vmovdqa	0(%rdi),%xmm4
+
+	vmovdqa	%xmm1,(%rdx)
+	vpxor	%xmm1,%xmm4,%xmm4
+
+	vmovdqa	con1(%rip),%xmm0
+	vmovdqa	mask(%rip),%xmm15
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,16(%rdx)
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,32(%rdx)
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,48(%rdx)
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,64(%rdx)
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,80(%rdx)
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,96(%rdx)
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,112(%rdx)
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,128(%rdx)
+
+
+	vmovdqa	con2(%rip),%xmm0
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,144(%rdx)
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenclast	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,160(%rdx)
+
+
+	vmovdqa	%xmm4,0(%rsi)
+	ret
+.cfi_endproc	
+.size	aes128gcmsiv_aes_ks_enc_x1,.-aes128gcmsiv_aes_ks_enc_x1
+.globl	aes128gcmsiv_kdf
+.hidden aes128gcmsiv_kdf
+.type	aes128gcmsiv_kdf,@function
+.align	16
+aes128gcmsiv_kdf:
+.cfi_startproc	
+_CET_ENDBR
+
+
+
+
+	vmovdqa	(%rdx),%xmm1
+	vmovdqa	0(%rdi),%xmm9
+	vmovdqa	and_mask(%rip),%xmm12
+	vmovdqa	one(%rip),%xmm13
+	vpshufd	$0x90,%xmm9,%xmm9
+	vpand	%xmm12,%xmm9,%xmm9
+	vpaddd	%xmm13,%xmm9,%xmm10
+	vpaddd	%xmm13,%xmm10,%xmm11
+	vpaddd	%xmm13,%xmm11,%xmm12
+
+	vpxor	%xmm1,%xmm9,%xmm9
+	vpxor	%xmm1,%xmm10,%xmm10
+	vpxor	%xmm1,%xmm11,%xmm11
+	vpxor	%xmm1,%xmm12,%xmm12
+
+	vmovdqa	16(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+
+	vmovdqa	32(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm9,%xmm9
+	vaesenc	%xmm2,%xmm10,%xmm10
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+
+	vmovdqa	48(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+
+	vmovdqa	64(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm9,%xmm9
+	vaesenc	%xmm2,%xmm10,%xmm10
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+
+	vmovdqa	80(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+
+	vmovdqa	96(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm9,%xmm9
+	vaesenc	%xmm2,%xmm10,%xmm10
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+
+	vmovdqa	112(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+
+	vmovdqa	128(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm9,%xmm9
+	vaesenc	%xmm2,%xmm10,%xmm10
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+
+	vmovdqa	144(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+
+	vmovdqa	160(%rdx),%xmm2
+	vaesenclast	%xmm2,%xmm9,%xmm9
+	vaesenclast	%xmm2,%xmm10,%xmm10
+	vaesenclast	%xmm2,%xmm11,%xmm11
+	vaesenclast	%xmm2,%xmm12,%xmm12
+
+
+	vmovdqa	%xmm9,0(%rsi)
+	vmovdqa	%xmm10,16(%rsi)
+	vmovdqa	%xmm11,32(%rsi)
+	vmovdqa	%xmm12,48(%rsi)
+	ret
+.cfi_endproc	
+.size	aes128gcmsiv_kdf,.-aes128gcmsiv_kdf
+.globl	aes128gcmsiv_enc_msg_x4
+.hidden aes128gcmsiv_enc_msg_x4
+.type	aes128gcmsiv_enc_msg_x4,@function
+.align	16
+aes128gcmsiv_enc_msg_x4:
+.cfi_startproc	
+_CET_ENDBR
+	testq	%r8,%r8
+	jnz	.L128_enc_msg_x4_start
+	ret
+
+.L128_enc_msg_x4_start:
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-16
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-24
+
+	shrq	$4,%r8
+	movq	%r8,%r10
+	shlq	$62,%r10
+	shrq	$62,%r10
+
+
+	vmovdqa	(%rdx),%xmm15
+	vpor	OR_MASK(%rip),%xmm15,%xmm15
+
+	vmovdqu	four(%rip),%xmm4
+	vmovdqa	%xmm15,%xmm0
+	vpaddd	one(%rip),%xmm15,%xmm1
+	vpaddd	two(%rip),%xmm15,%xmm2
+	vpaddd	three(%rip),%xmm15,%xmm3
+
+	shrq	$2,%r8
+	je	.L128_enc_msg_x4_check_remainder
+
+	subq	$64,%rsi
+	subq	$64,%rdi
+
+.L128_enc_msg_x4_loop1:
+	addq	$64,%rsi
+	addq	$64,%rdi
+
+	vmovdqa	%xmm0,%xmm5
+	vmovdqa	%xmm1,%xmm6
+	vmovdqa	%xmm2,%xmm7
+	vmovdqa	%xmm3,%xmm8
+
+	vpxor	(%rcx),%xmm5,%xmm5
+	vpxor	(%rcx),%xmm6,%xmm6
+	vpxor	(%rcx),%xmm7,%xmm7
+	vpxor	(%rcx),%xmm8,%xmm8
+
+	vmovdqu	16(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vpaddd	%xmm4,%xmm0,%xmm0
+	vmovdqu	32(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vpaddd	%xmm4,%xmm1,%xmm1
+	vmovdqu	48(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vpaddd	%xmm4,%xmm2,%xmm2
+	vmovdqu	64(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vpaddd	%xmm4,%xmm3,%xmm3
+
+	vmovdqu	80(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	96(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	112(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	128(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	144(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	160(%rcx),%xmm12
+	vaesenclast	%xmm12,%xmm5,%xmm5
+	vaesenclast	%xmm12,%xmm6,%xmm6
+	vaesenclast	%xmm12,%xmm7,%xmm7
+	vaesenclast	%xmm12,%xmm8,%xmm8
+
+
+
+	vpxor	0(%rdi),%xmm5,%xmm5
+	vpxor	16(%rdi),%xmm6,%xmm6
+	vpxor	32(%rdi),%xmm7,%xmm7
+	vpxor	48(%rdi),%xmm8,%xmm8
+
+	subq	$1,%r8
+
+	vmovdqu	%xmm5,0(%rsi)
+	vmovdqu	%xmm6,16(%rsi)
+	vmovdqu	%xmm7,32(%rsi)
+	vmovdqu	%xmm8,48(%rsi)
+
+	jne	.L128_enc_msg_x4_loop1
+
+	addq	$64,%rsi
+	addq	$64,%rdi
+
+.L128_enc_msg_x4_check_remainder:
+	cmpq	$0,%r10
+	je	.L128_enc_msg_x4_out
+
+.L128_enc_msg_x4_loop2:
+
+
+	vmovdqa	%xmm0,%xmm5
+	vpaddd	one(%rip),%xmm0,%xmm0
+
+	vpxor	(%rcx),%xmm5,%xmm5
+	vaesenc	16(%rcx),%xmm5,%xmm5
+	vaesenc	32(%rcx),%xmm5,%xmm5
+	vaesenc	48(%rcx),%xmm5,%xmm5
+	vaesenc	64(%rcx),%xmm5,%xmm5
+	vaesenc	80(%rcx),%xmm5,%xmm5
+	vaesenc	96(%rcx),%xmm5,%xmm5
+	vaesenc	112(%rcx),%xmm5,%xmm5
+	vaesenc	128(%rcx),%xmm5,%xmm5
+	vaesenc	144(%rcx),%xmm5,%xmm5
+	vaesenclast	160(%rcx),%xmm5,%xmm5
+
+
+	vpxor	(%rdi),%xmm5,%xmm5
+	vmovdqu	%xmm5,(%rsi)
+
+	addq	$16,%rdi
+	addq	$16,%rsi
+
+	subq	$1,%r10
+	jne	.L128_enc_msg_x4_loop2
+
+.L128_enc_msg_x4_out:
+	popq	%r13
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r13
+	popq	%r12
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r12
+	ret
+.cfi_endproc	
+.size	aes128gcmsiv_enc_msg_x4,.-aes128gcmsiv_enc_msg_x4
+.globl	aes128gcmsiv_enc_msg_x8
+.hidden aes128gcmsiv_enc_msg_x8
+.type	aes128gcmsiv_enc_msg_x8,@function
+.align	16
+aes128gcmsiv_enc_msg_x8:
+.cfi_startproc	
+_CET_ENDBR
+	testq	%r8,%r8
+	jnz	.L128_enc_msg_x8_start
+	ret
+
+.L128_enc_msg_x8_start:
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-16
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-24
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-32
+	movq	%rsp,%rbp
+.cfi_def_cfa_register	rbp
+
+
+	subq	$128,%rsp
+	andq	$-64,%rsp
+
+	shrq	$4,%r8
+	movq	%r8,%r10
+	shlq	$61,%r10
+	shrq	$61,%r10
+
+
+	vmovdqu	(%rdx),%xmm1
+	vpor	OR_MASK(%rip),%xmm1,%xmm1
+
+
+	vpaddd	seven(%rip),%xmm1,%xmm0
+	vmovdqu	%xmm0,(%rsp)
+	vpaddd	one(%rip),%xmm1,%xmm9
+	vpaddd	two(%rip),%xmm1,%xmm10
+	vpaddd	three(%rip),%xmm1,%xmm11
+	vpaddd	four(%rip),%xmm1,%xmm12
+	vpaddd	five(%rip),%xmm1,%xmm13
+	vpaddd	six(%rip),%xmm1,%xmm14
+	vmovdqa	%xmm1,%xmm0
+
+	shrq	$3,%r8
+	je	.L128_enc_msg_x8_check_remainder
+
+	subq	$128,%rsi
+	subq	$128,%rdi
+
+.L128_enc_msg_x8_loop1:
+	addq	$128,%rsi
+	addq	$128,%rdi
+
+	vmovdqa	%xmm0,%xmm1
+	vmovdqa	%xmm9,%xmm2
+	vmovdqa	%xmm10,%xmm3
+	vmovdqa	%xmm11,%xmm4
+	vmovdqa	%xmm12,%xmm5
+	vmovdqa	%xmm13,%xmm6
+	vmovdqa	%xmm14,%xmm7
+
+	vmovdqu	(%rsp),%xmm8
+
+	vpxor	(%rcx),%xmm1,%xmm1
+	vpxor	(%rcx),%xmm2,%xmm2
+	vpxor	(%rcx),%xmm3,%xmm3
+	vpxor	(%rcx),%xmm4,%xmm4
+	vpxor	(%rcx),%xmm5,%xmm5
+	vpxor	(%rcx),%xmm6,%xmm6
+	vpxor	(%rcx),%xmm7,%xmm7
+	vpxor	(%rcx),%xmm8,%xmm8
+
+	vmovdqu	16(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	(%rsp),%xmm14
+	vpaddd	eight(%rip),%xmm14,%xmm14
+	vmovdqu	%xmm14,(%rsp)
+	vmovdqu	32(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpsubd	one(%rip),%xmm14,%xmm14
+	vmovdqu	48(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm0,%xmm0
+	vmovdqu	64(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm9,%xmm9
+	vmovdqu	80(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm10,%xmm10
+	vmovdqu	96(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm11,%xmm11
+	vmovdqu	112(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm12,%xmm12
+	vmovdqu	128(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm13,%xmm13
+	vmovdqu	144(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	160(%rcx),%xmm15
+	vaesenclast	%xmm15,%xmm1,%xmm1
+	vaesenclast	%xmm15,%xmm2,%xmm2
+	vaesenclast	%xmm15,%xmm3,%xmm3
+	vaesenclast	%xmm15,%xmm4,%xmm4
+	vaesenclast	%xmm15,%xmm5,%xmm5
+	vaesenclast	%xmm15,%xmm6,%xmm6
+	vaesenclast	%xmm15,%xmm7,%xmm7
+	vaesenclast	%xmm15,%xmm8,%xmm8
+
+
+
+	vpxor	0(%rdi),%xmm1,%xmm1
+	vpxor	16(%rdi),%xmm2,%xmm2
+	vpxor	32(%rdi),%xmm3,%xmm3
+	vpxor	48(%rdi),%xmm4,%xmm4
+	vpxor	64(%rdi),%xmm5,%xmm5
+	vpxor	80(%rdi),%xmm6,%xmm6
+	vpxor	96(%rdi),%xmm7,%xmm7
+	vpxor	112(%rdi),%xmm8,%xmm8
+
+	decq	%r8
+
+	vmovdqu	%xmm1,0(%rsi)
+	vmovdqu	%xmm2,16(%rsi)
+	vmovdqu	%xmm3,32(%rsi)
+	vmovdqu	%xmm4,48(%rsi)
+	vmovdqu	%xmm5,64(%rsi)
+	vmovdqu	%xmm6,80(%rsi)
+	vmovdqu	%xmm7,96(%rsi)
+	vmovdqu	%xmm8,112(%rsi)
+
+	jne	.L128_enc_msg_x8_loop1
+
+	addq	$128,%rsi
+	addq	$128,%rdi
+
+.L128_enc_msg_x8_check_remainder:
+	cmpq	$0,%r10
+	je	.L128_enc_msg_x8_out
+
+.L128_enc_msg_x8_loop2:
+
+
+	vmovdqa	%xmm0,%xmm1
+	vpaddd	one(%rip),%xmm0,%xmm0
+
+	vpxor	(%rcx),%xmm1,%xmm1
+	vaesenc	16(%rcx),%xmm1,%xmm1
+	vaesenc	32(%rcx),%xmm1,%xmm1
+	vaesenc	48(%rcx),%xmm1,%xmm1
+	vaesenc	64(%rcx),%xmm1,%xmm1
+	vaesenc	80(%rcx),%xmm1,%xmm1
+	vaesenc	96(%rcx),%xmm1,%xmm1
+	vaesenc	112(%rcx),%xmm1,%xmm1
+	vaesenc	128(%rcx),%xmm1,%xmm1
+	vaesenc	144(%rcx),%xmm1,%xmm1
+	vaesenclast	160(%rcx),%xmm1,%xmm1
+
+
+	vpxor	(%rdi),%xmm1,%xmm1
+
+	vmovdqu	%xmm1,(%rsi)
+
+	addq	$16,%rdi
+	addq	$16,%rsi
+
+	decq	%r10
+	jne	.L128_enc_msg_x8_loop2
+
+.L128_enc_msg_x8_out:
+	movq	%rbp,%rsp
+.cfi_def_cfa_register	%rsp
+	popq	%rbp
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%rbp
+	popq	%r13
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r13
+	popq	%r12
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r12
+	ret
+.cfi_endproc	
+.size	aes128gcmsiv_enc_msg_x8,.-aes128gcmsiv_enc_msg_x8
+.globl	aes128gcmsiv_dec
+.hidden aes128gcmsiv_dec
+.type	aes128gcmsiv_dec,@function
+.align	16
+aes128gcmsiv_dec:
+.cfi_startproc	
+_CET_ENDBR
+	testq	$~15,%r9
+	jnz	.L128_dec_start
+	ret
+
+.L128_dec_start:
+	vzeroupper
+	vmovdqa	(%rdx),%xmm0
+
+
+	vmovdqu	16(%rdx),%xmm15
+	vpor	OR_MASK(%rip),%xmm15,%xmm15
+	movq	%rdx,%rax
+
+	leaq	32(%rax),%rax
+	leaq	32(%rcx),%rcx
+
+	andq	$~15,%r9
+
+
+	cmpq	$96,%r9
+	jb	.L128_dec_loop2
+
+
+	subq	$96,%r9
+	vmovdqa	%xmm15,%xmm7
+	vpaddd	one(%rip),%xmm7,%xmm8
+	vpaddd	two(%rip),%xmm7,%xmm9
+	vpaddd	one(%rip),%xmm9,%xmm10
+	vpaddd	two(%rip),%xmm9,%xmm11
+	vpaddd	one(%rip),%xmm11,%xmm12
+	vpaddd	two(%rip),%xmm11,%xmm15
+
+	vpxor	(%r8),%xmm7,%xmm7
+	vpxor	(%r8),%xmm8,%xmm8
+	vpxor	(%r8),%xmm9,%xmm9
+	vpxor	(%r8),%xmm10,%xmm10
+	vpxor	(%r8),%xmm11,%xmm11
+	vpxor	(%r8),%xmm12,%xmm12
+
+	vmovdqu	16(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	32(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	48(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	64(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	80(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	96(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	112(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	128(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	144(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	160(%r8),%xmm4
+	vaesenclast	%xmm4,%xmm7,%xmm7
+	vaesenclast	%xmm4,%xmm8,%xmm8
+	vaesenclast	%xmm4,%xmm9,%xmm9
+	vaesenclast	%xmm4,%xmm10,%xmm10
+	vaesenclast	%xmm4,%xmm11,%xmm11
+	vaesenclast	%xmm4,%xmm12,%xmm12
+
+
+	vpxor	0(%rdi),%xmm7,%xmm7
+	vpxor	16(%rdi),%xmm8,%xmm8
+	vpxor	32(%rdi),%xmm9,%xmm9
+	vpxor	48(%rdi),%xmm10,%xmm10
+	vpxor	64(%rdi),%xmm11,%xmm11
+	vpxor	80(%rdi),%xmm12,%xmm12
+
+	vmovdqu	%xmm7,0(%rsi)
+	vmovdqu	%xmm8,16(%rsi)
+	vmovdqu	%xmm9,32(%rsi)
+	vmovdqu	%xmm10,48(%rsi)
+	vmovdqu	%xmm11,64(%rsi)
+	vmovdqu	%xmm12,80(%rsi)
+
+	addq	$96,%rdi
+	addq	$96,%rsi
+	jmp	.L128_dec_loop1
+
+
+.align	64
+.L128_dec_loop1:
+	cmpq	$96,%r9
+	jb	.L128_dec_finish_96
+	subq	$96,%r9
+
+	vmovdqa	%xmm12,%xmm6
+	vmovdqa	%xmm11,16-32(%rax)
+	vmovdqa	%xmm10,32-32(%rax)
+	vmovdqa	%xmm9,48-32(%rax)
+	vmovdqa	%xmm8,64-32(%rax)
+	vmovdqa	%xmm7,80-32(%rax)
+
+	vmovdqa	%xmm15,%xmm7
+	vpaddd	one(%rip),%xmm7,%xmm8
+	vpaddd	two(%rip),%xmm7,%xmm9
+	vpaddd	one(%rip),%xmm9,%xmm10
+	vpaddd	two(%rip),%xmm9,%xmm11
+	vpaddd	one(%rip),%xmm11,%xmm12
+	vpaddd	two(%rip),%xmm11,%xmm15
+
+	vmovdqa	(%r8),%xmm4
+	vpxor	%xmm4,%xmm7,%xmm7
+	vpxor	%xmm4,%xmm8,%xmm8
+	vpxor	%xmm4,%xmm9,%xmm9
+	vpxor	%xmm4,%xmm10,%xmm10
+	vpxor	%xmm4,%xmm11,%xmm11
+	vpxor	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	0-32(%rcx),%xmm4
+	vpclmulqdq	$0x11,%xmm4,%xmm6,%xmm2
+	vpclmulqdq	$0x00,%xmm4,%xmm6,%xmm3
+	vpclmulqdq	$0x01,%xmm4,%xmm6,%xmm1
+	vpclmulqdq	$0x10,%xmm4,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	16(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	-16(%rax),%xmm6
+	vmovdqu	-16(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	32(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	0(%rax),%xmm6
+	vmovdqu	0(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	48(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	16(%rax),%xmm6
+	vmovdqu	16(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	64(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	32(%rax),%xmm6
+	vmovdqu	32(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	80(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	96(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	112(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+
+	vmovdqa	80-32(%rax),%xmm6
+	vpxor	%xmm0,%xmm6,%xmm6
+	vmovdqu	80-32(%rcx),%xmm5
+
+	vpclmulqdq	$0x01,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x10,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	128(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+
+	vpsrldq	$8,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm5
+	vpslldq	$8,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm0
+
+	vmovdqa	poly(%rip),%xmm3
+
+	vmovdqu	144(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	160(%r8),%xmm6
+	vpalignr	$8,%xmm0,%xmm0,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm2,%xmm0
+
+	vpxor	0(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm7,%xmm7
+	vpxor	16(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm8,%xmm8
+	vpxor	32(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm9,%xmm9
+	vpxor	48(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm10,%xmm10
+	vpxor	64(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm11,%xmm11
+	vpxor	80(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm12,%xmm12
+
+	vpalignr	$8,%xmm0,%xmm0,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm2,%xmm0
+
+	vmovdqu	%xmm7,0(%rsi)
+	vmovdqu	%xmm8,16(%rsi)
+	vmovdqu	%xmm9,32(%rsi)
+	vmovdqu	%xmm10,48(%rsi)
+	vmovdqu	%xmm11,64(%rsi)
+	vmovdqu	%xmm12,80(%rsi)
+
+	vpxor	%xmm5,%xmm0,%xmm0
+
+	leaq	96(%rdi),%rdi
+	leaq	96(%rsi),%rsi
+	jmp	.L128_dec_loop1
+
+.L128_dec_finish_96:
+	vmovdqa	%xmm12,%xmm6
+	vmovdqa	%xmm11,16-32(%rax)
+	vmovdqa	%xmm10,32-32(%rax)
+	vmovdqa	%xmm9,48-32(%rax)
+	vmovdqa	%xmm8,64-32(%rax)
+	vmovdqa	%xmm7,80-32(%rax)
+
+	vmovdqu	0-32(%rcx),%xmm4
+	vpclmulqdq	$0x10,%xmm4,%xmm6,%xmm1
+	vpclmulqdq	$0x11,%xmm4,%xmm6,%xmm2
+	vpclmulqdq	$0x00,%xmm4,%xmm6,%xmm3
+	vpclmulqdq	$0x01,%xmm4,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	-16(%rax),%xmm6
+	vmovdqu	-16(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	0(%rax),%xmm6
+	vmovdqu	0(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	16(%rax),%xmm6
+	vmovdqu	16(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	32(%rax),%xmm6
+	vmovdqu	32(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	80-32(%rax),%xmm6
+	vpxor	%xmm0,%xmm6,%xmm6
+	vmovdqu	80-32(%rcx),%xmm5
+	vpclmulqdq	$0x11,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x10,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x01,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vpsrldq	$8,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm5
+	vpslldq	$8,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm0
+
+	vmovdqa	poly(%rip),%xmm3
+
+	vpalignr	$8,%xmm0,%xmm0,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm2,%xmm0
+
+	vpalignr	$8,%xmm0,%xmm0,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm2,%xmm0
+
+	vpxor	%xmm5,%xmm0,%xmm0
+
+.L128_dec_loop2:
+
+
+
+	cmpq	$16,%r9
+	jb	.L128_dec_out
+	subq	$16,%r9
+
+	vmovdqa	%xmm15,%xmm2
+	vpaddd	one(%rip),%xmm15,%xmm15
+
+	vpxor	0(%r8),%xmm2,%xmm2
+	vaesenc	16(%r8),%xmm2,%xmm2
+	vaesenc	32(%r8),%xmm2,%xmm2
+	vaesenc	48(%r8),%xmm2,%xmm2
+	vaesenc	64(%r8),%xmm2,%xmm2
+	vaesenc	80(%r8),%xmm2,%xmm2
+	vaesenc	96(%r8),%xmm2,%xmm2
+	vaesenc	112(%r8),%xmm2,%xmm2
+	vaesenc	128(%r8),%xmm2,%xmm2
+	vaesenc	144(%r8),%xmm2,%xmm2
+	vaesenclast	160(%r8),%xmm2,%xmm2
+	vpxor	(%rdi),%xmm2,%xmm2
+	vmovdqu	%xmm2,(%rsi)
+	addq	$16,%rdi
+	addq	$16,%rsi
+
+	vpxor	%xmm2,%xmm0,%xmm0
+	vmovdqa	-32(%rcx),%xmm1
+	call	GFMUL
+
+	jmp	.L128_dec_loop2
+
+.L128_dec_out:
+	vmovdqu	%xmm0,(%rdx)
+	ret
+.cfi_endproc	
+.size	aes128gcmsiv_dec, .-aes128gcmsiv_dec
+.globl	aes128gcmsiv_ecb_enc_block
+.hidden aes128gcmsiv_ecb_enc_block
+.type	aes128gcmsiv_ecb_enc_block,@function
+.align	16
+aes128gcmsiv_ecb_enc_block:
+.cfi_startproc	
+_CET_ENDBR
+	vmovdqa	(%rdi),%xmm1
+
+	vpxor	(%rdx),%xmm1,%xmm1
+	vaesenc	16(%rdx),%xmm1,%xmm1
+	vaesenc	32(%rdx),%xmm1,%xmm1
+	vaesenc	48(%rdx),%xmm1,%xmm1
+	vaesenc	64(%rdx),%xmm1,%xmm1
+	vaesenc	80(%rdx),%xmm1,%xmm1
+	vaesenc	96(%rdx),%xmm1,%xmm1
+	vaesenc	112(%rdx),%xmm1,%xmm1
+	vaesenc	128(%rdx),%xmm1,%xmm1
+	vaesenc	144(%rdx),%xmm1,%xmm1
+	vaesenclast	160(%rdx),%xmm1,%xmm1
+
+	vmovdqa	%xmm1,(%rsi)
+
+	ret
+.cfi_endproc	
+.size	aes128gcmsiv_ecb_enc_block,.-aes128gcmsiv_ecb_enc_block
+.globl	aes256gcmsiv_aes_ks_enc_x1
+.hidden aes256gcmsiv_aes_ks_enc_x1
+.type	aes256gcmsiv_aes_ks_enc_x1,@function
+.align	16
+aes256gcmsiv_aes_ks_enc_x1:
+.cfi_startproc	
+_CET_ENDBR
+	vmovdqa	con1(%rip),%xmm0
+	vmovdqa	mask(%rip),%xmm15
+	vmovdqa	(%rdi),%xmm8
+	vmovdqa	(%rcx),%xmm1
+	vmovdqa	16(%rcx),%xmm3
+	vpxor	%xmm1,%xmm8,%xmm8
+	vaesenc	%xmm3,%xmm8,%xmm8
+	vmovdqu	%xmm1,(%rdx)
+	vmovdqu	%xmm3,16(%rdx)
+	vpxor	%xmm14,%xmm14,%xmm14
+
+	vpshufb	%xmm15,%xmm3,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpslldq	$4,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vaesenc	%xmm1,%xmm8,%xmm8
+	vmovdqu	%xmm1,32(%rdx)
+
+	vpshufd	$0xff,%xmm1,%xmm2
+	vaesenclast	%xmm14,%xmm2,%xmm2
+	vpslldq	$4,%xmm3,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpxor	%xmm2,%xmm3,%xmm3
+	vaesenc	%xmm3,%xmm8,%xmm8
+	vmovdqu	%xmm3,48(%rdx)
+
+	vpshufb	%xmm15,%xmm3,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpslldq	$4,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vaesenc	%xmm1,%xmm8,%xmm8
+	vmovdqu	%xmm1,64(%rdx)
+
+	vpshufd	$0xff,%xmm1,%xmm2
+	vaesenclast	%xmm14,%xmm2,%xmm2
+	vpslldq	$4,%xmm3,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpxor	%xmm2,%xmm3,%xmm3
+	vaesenc	%xmm3,%xmm8,%xmm8
+	vmovdqu	%xmm3,80(%rdx)
+
+	vpshufb	%xmm15,%xmm3,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpslldq	$4,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vaesenc	%xmm1,%xmm8,%xmm8
+	vmovdqu	%xmm1,96(%rdx)
+
+	vpshufd	$0xff,%xmm1,%xmm2
+	vaesenclast	%xmm14,%xmm2,%xmm2
+	vpslldq	$4,%xmm3,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpxor	%xmm2,%xmm3,%xmm3
+	vaesenc	%xmm3,%xmm8,%xmm8
+	vmovdqu	%xmm3,112(%rdx)
+
+	vpshufb	%xmm15,%xmm3,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpslldq	$4,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vaesenc	%xmm1,%xmm8,%xmm8
+	vmovdqu	%xmm1,128(%rdx)
+
+	vpshufd	$0xff,%xmm1,%xmm2
+	vaesenclast	%xmm14,%xmm2,%xmm2
+	vpslldq	$4,%xmm3,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpxor	%xmm2,%xmm3,%xmm3
+	vaesenc	%xmm3,%xmm8,%xmm8
+	vmovdqu	%xmm3,144(%rdx)
+
+	vpshufb	%xmm15,%xmm3,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpslldq	$4,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vaesenc	%xmm1,%xmm8,%xmm8
+	vmovdqu	%xmm1,160(%rdx)
+
+	vpshufd	$0xff,%xmm1,%xmm2
+	vaesenclast	%xmm14,%xmm2,%xmm2
+	vpslldq	$4,%xmm3,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpxor	%xmm2,%xmm3,%xmm3
+	vaesenc	%xmm3,%xmm8,%xmm8
+	vmovdqu	%xmm3,176(%rdx)
+
+	vpshufb	%xmm15,%xmm3,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpslldq	$4,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vaesenc	%xmm1,%xmm8,%xmm8
+	vmovdqu	%xmm1,192(%rdx)
+
+	vpshufd	$0xff,%xmm1,%xmm2
+	vaesenclast	%xmm14,%xmm2,%xmm2
+	vpslldq	$4,%xmm3,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpxor	%xmm2,%xmm3,%xmm3
+	vaesenc	%xmm3,%xmm8,%xmm8
+	vmovdqu	%xmm3,208(%rdx)
+
+	vpshufb	%xmm15,%xmm3,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslldq	$4,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vaesenclast	%xmm1,%xmm8,%xmm8
+	vmovdqu	%xmm1,224(%rdx)
+
+	vmovdqa	%xmm8,(%rsi)
+	ret
+.cfi_endproc	
+.size	aes256gcmsiv_aes_ks_enc_x1,.-aes256gcmsiv_aes_ks_enc_x1
+.globl	aes256gcmsiv_ecb_enc_block
+.hidden aes256gcmsiv_ecb_enc_block
+.type	aes256gcmsiv_ecb_enc_block,@function
+.align	16
+aes256gcmsiv_ecb_enc_block:
+.cfi_startproc	
+_CET_ENDBR
+	vmovdqa	(%rdi),%xmm1
+	vpxor	(%rdx),%xmm1,%xmm1
+	vaesenc	16(%rdx),%xmm1,%xmm1
+	vaesenc	32(%rdx),%xmm1,%xmm1
+	vaesenc	48(%rdx),%xmm1,%xmm1
+	vaesenc	64(%rdx),%xmm1,%xmm1
+	vaesenc	80(%rdx),%xmm1,%xmm1
+	vaesenc	96(%rdx),%xmm1,%xmm1
+	vaesenc	112(%rdx),%xmm1,%xmm1
+	vaesenc	128(%rdx),%xmm1,%xmm1
+	vaesenc	144(%rdx),%xmm1,%xmm1
+	vaesenc	160(%rdx),%xmm1,%xmm1
+	vaesenc	176(%rdx),%xmm1,%xmm1
+	vaesenc	192(%rdx),%xmm1,%xmm1
+	vaesenc	208(%rdx),%xmm1,%xmm1
+	vaesenclast	224(%rdx),%xmm1,%xmm1
+	vmovdqa	%xmm1,(%rsi)
+	ret
+.cfi_endproc	
+.size	aes256gcmsiv_ecb_enc_block,.-aes256gcmsiv_ecb_enc_block
+.globl	aes256gcmsiv_enc_msg_x4
+.hidden aes256gcmsiv_enc_msg_x4
+.type	aes256gcmsiv_enc_msg_x4,@function
+.align	16
+aes256gcmsiv_enc_msg_x4:
+.cfi_startproc	
+_CET_ENDBR
+	testq	%r8,%r8
+	jnz	.L256_enc_msg_x4_start
+	ret
+
+.L256_enc_msg_x4_start:
+	movq	%r8,%r10
+	shrq	$4,%r8
+	shlq	$60,%r10
+	jz	.L256_enc_msg_x4_start2
+	addq	$1,%r8
+
+.L256_enc_msg_x4_start2:
+	movq	%r8,%r10
+	shlq	$62,%r10
+	shrq	$62,%r10
+
+
+	vmovdqa	(%rdx),%xmm15
+	vpor	OR_MASK(%rip),%xmm15,%xmm15
+
+	vmovdqa	four(%rip),%xmm4
+	vmovdqa	%xmm15,%xmm0
+	vpaddd	one(%rip),%xmm15,%xmm1
+	vpaddd	two(%rip),%xmm15,%xmm2
+	vpaddd	three(%rip),%xmm15,%xmm3
+
+	shrq	$2,%r8
+	je	.L256_enc_msg_x4_check_remainder
+
+	subq	$64,%rsi
+	subq	$64,%rdi
+
+.L256_enc_msg_x4_loop1:
+	addq	$64,%rsi
+	addq	$64,%rdi
+
+	vmovdqa	%xmm0,%xmm5
+	vmovdqa	%xmm1,%xmm6
+	vmovdqa	%xmm2,%xmm7
+	vmovdqa	%xmm3,%xmm8
+
+	vpxor	(%rcx),%xmm5,%xmm5
+	vpxor	(%rcx),%xmm6,%xmm6
+	vpxor	(%rcx),%xmm7,%xmm7
+	vpxor	(%rcx),%xmm8,%xmm8
+
+	vmovdqu	16(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vpaddd	%xmm4,%xmm0,%xmm0
+	vmovdqu	32(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vpaddd	%xmm4,%xmm1,%xmm1
+	vmovdqu	48(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vpaddd	%xmm4,%xmm2,%xmm2
+	vmovdqu	64(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vpaddd	%xmm4,%xmm3,%xmm3
+
+	vmovdqu	80(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	96(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	112(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	128(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	144(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	160(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	176(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	192(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	208(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	224(%rcx),%xmm12
+	vaesenclast	%xmm12,%xmm5,%xmm5
+	vaesenclast	%xmm12,%xmm6,%xmm6
+	vaesenclast	%xmm12,%xmm7,%xmm7
+	vaesenclast	%xmm12,%xmm8,%xmm8
+
+
+
+	vpxor	0(%rdi),%xmm5,%xmm5
+	vpxor	16(%rdi),%xmm6,%xmm6
+	vpxor	32(%rdi),%xmm7,%xmm7
+	vpxor	48(%rdi),%xmm8,%xmm8
+
+	subq	$1,%r8
+
+	vmovdqu	%xmm5,0(%rsi)
+	vmovdqu	%xmm6,16(%rsi)
+	vmovdqu	%xmm7,32(%rsi)
+	vmovdqu	%xmm8,48(%rsi)
+
+	jne	.L256_enc_msg_x4_loop1
+
+	addq	$64,%rsi
+	addq	$64,%rdi
+
+.L256_enc_msg_x4_check_remainder:
+	cmpq	$0,%r10
+	je	.L256_enc_msg_x4_out
+
+.L256_enc_msg_x4_loop2:
+
+
+
+	vmovdqa	%xmm0,%xmm5
+	vpaddd	one(%rip),%xmm0,%xmm0
+	vpxor	(%rcx),%xmm5,%xmm5
+	vaesenc	16(%rcx),%xmm5,%xmm5
+	vaesenc	32(%rcx),%xmm5,%xmm5
+	vaesenc	48(%rcx),%xmm5,%xmm5
+	vaesenc	64(%rcx),%xmm5,%xmm5
+	vaesenc	80(%rcx),%xmm5,%xmm5
+	vaesenc	96(%rcx),%xmm5,%xmm5
+	vaesenc	112(%rcx),%xmm5,%xmm5
+	vaesenc	128(%rcx),%xmm5,%xmm5
+	vaesenc	144(%rcx),%xmm5,%xmm5
+	vaesenc	160(%rcx),%xmm5,%xmm5
+	vaesenc	176(%rcx),%xmm5,%xmm5
+	vaesenc	192(%rcx),%xmm5,%xmm5
+	vaesenc	208(%rcx),%xmm5,%xmm5
+	vaesenclast	224(%rcx),%xmm5,%xmm5
+
+
+	vpxor	(%rdi),%xmm5,%xmm5
+
+	vmovdqu	%xmm5,(%rsi)
+
+	addq	$16,%rdi
+	addq	$16,%rsi
+
+	subq	$1,%r10
+	jne	.L256_enc_msg_x4_loop2
+
+.L256_enc_msg_x4_out:
+	ret
+.cfi_endproc	
+.size	aes256gcmsiv_enc_msg_x4,.-aes256gcmsiv_enc_msg_x4
+.globl	aes256gcmsiv_enc_msg_x8
+.hidden aes256gcmsiv_enc_msg_x8
+.type	aes256gcmsiv_enc_msg_x8,@function
+.align	16
+aes256gcmsiv_enc_msg_x8:
+.cfi_startproc	
+_CET_ENDBR
+	testq	%r8,%r8
+	jnz	.L256_enc_msg_x8_start
+	ret
+
+.L256_enc_msg_x8_start:
+
+	movq	%rsp,%r11
+	subq	$16,%r11
+	andq	$-64,%r11
+
+	movq	%r8,%r10
+	shrq	$4,%r8
+	shlq	$60,%r10
+	jz	.L256_enc_msg_x8_start2
+	addq	$1,%r8
+
+.L256_enc_msg_x8_start2:
+	movq	%r8,%r10
+	shlq	$61,%r10
+	shrq	$61,%r10
+
+
+	vmovdqa	(%rdx),%xmm1
+	vpor	OR_MASK(%rip),%xmm1,%xmm1
+
+
+	vpaddd	seven(%rip),%xmm1,%xmm0
+	vmovdqa	%xmm0,(%r11)
+	vpaddd	one(%rip),%xmm1,%xmm9
+	vpaddd	two(%rip),%xmm1,%xmm10
+	vpaddd	three(%rip),%xmm1,%xmm11
+	vpaddd	four(%rip),%xmm1,%xmm12
+	vpaddd	five(%rip),%xmm1,%xmm13
+	vpaddd	six(%rip),%xmm1,%xmm14
+	vmovdqa	%xmm1,%xmm0
+
+	shrq	$3,%r8
+	jz	.L256_enc_msg_x8_check_remainder
+
+	subq	$128,%rsi
+	subq	$128,%rdi
+
+.L256_enc_msg_x8_loop1:
+	addq	$128,%rsi
+	addq	$128,%rdi
+
+	vmovdqa	%xmm0,%xmm1
+	vmovdqa	%xmm9,%xmm2
+	vmovdqa	%xmm10,%xmm3
+	vmovdqa	%xmm11,%xmm4
+	vmovdqa	%xmm12,%xmm5
+	vmovdqa	%xmm13,%xmm6
+	vmovdqa	%xmm14,%xmm7
+
+	vmovdqa	(%r11),%xmm8
+
+	vpxor	(%rcx),%xmm1,%xmm1
+	vpxor	(%rcx),%xmm2,%xmm2
+	vpxor	(%rcx),%xmm3,%xmm3
+	vpxor	(%rcx),%xmm4,%xmm4
+	vpxor	(%rcx),%xmm5,%xmm5
+	vpxor	(%rcx),%xmm6,%xmm6
+	vpxor	(%rcx),%xmm7,%xmm7
+	vpxor	(%rcx),%xmm8,%xmm8
+
+	vmovdqu	16(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vmovdqa	(%r11),%xmm14
+	vpaddd	eight(%rip),%xmm14,%xmm14
+	vmovdqa	%xmm14,(%r11)
+	vmovdqu	32(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpsubd	one(%rip),%xmm14,%xmm14
+	vmovdqu	48(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm0,%xmm0
+	vmovdqu	64(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm9,%xmm9
+	vmovdqu	80(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm10,%xmm10
+	vmovdqu	96(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm11,%xmm11
+	vmovdqu	112(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm12,%xmm12
+	vmovdqu	128(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm13,%xmm13
+	vmovdqu	144(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	160(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	176(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	192(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	208(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	224(%rcx),%xmm15
+	vaesenclast	%xmm15,%xmm1,%xmm1
+	vaesenclast	%xmm15,%xmm2,%xmm2
+	vaesenclast	%xmm15,%xmm3,%xmm3
+	vaesenclast	%xmm15,%xmm4,%xmm4
+	vaesenclast	%xmm15,%xmm5,%xmm5
+	vaesenclast	%xmm15,%xmm6,%xmm6
+	vaesenclast	%xmm15,%xmm7,%xmm7
+	vaesenclast	%xmm15,%xmm8,%xmm8
+
+
+
+	vpxor	0(%rdi),%xmm1,%xmm1
+	vpxor	16(%rdi),%xmm2,%xmm2
+	vpxor	32(%rdi),%xmm3,%xmm3
+	vpxor	48(%rdi),%xmm4,%xmm4
+	vpxor	64(%rdi),%xmm5,%xmm5
+	vpxor	80(%rdi),%xmm6,%xmm6
+	vpxor	96(%rdi),%xmm7,%xmm7
+	vpxor	112(%rdi),%xmm8,%xmm8
+
+	subq	$1,%r8
+
+	vmovdqu	%xmm1,0(%rsi)
+	vmovdqu	%xmm2,16(%rsi)
+	vmovdqu	%xmm3,32(%rsi)
+	vmovdqu	%xmm4,48(%rsi)
+	vmovdqu	%xmm5,64(%rsi)
+	vmovdqu	%xmm6,80(%rsi)
+	vmovdqu	%xmm7,96(%rsi)
+	vmovdqu	%xmm8,112(%rsi)
+
+	jne	.L256_enc_msg_x8_loop1
+
+	addq	$128,%rsi
+	addq	$128,%rdi
+
+.L256_enc_msg_x8_check_remainder:
+	cmpq	$0,%r10
+	je	.L256_enc_msg_x8_out
+
+.L256_enc_msg_x8_loop2:
+
+
+	vmovdqa	%xmm0,%xmm1
+	vpaddd	one(%rip),%xmm0,%xmm0
+
+	vpxor	(%rcx),%xmm1,%xmm1
+	vaesenc	16(%rcx),%xmm1,%xmm1
+	vaesenc	32(%rcx),%xmm1,%xmm1
+	vaesenc	48(%rcx),%xmm1,%xmm1
+	vaesenc	64(%rcx),%xmm1,%xmm1
+	vaesenc	80(%rcx),%xmm1,%xmm1
+	vaesenc	96(%rcx),%xmm1,%xmm1
+	vaesenc	112(%rcx),%xmm1,%xmm1
+	vaesenc	128(%rcx),%xmm1,%xmm1
+	vaesenc	144(%rcx),%xmm1,%xmm1
+	vaesenc	160(%rcx),%xmm1,%xmm1
+	vaesenc	176(%rcx),%xmm1,%xmm1
+	vaesenc	192(%rcx),%xmm1,%xmm1
+	vaesenc	208(%rcx),%xmm1,%xmm1
+	vaesenclast	224(%rcx),%xmm1,%xmm1
+
+
+	vpxor	(%rdi),%xmm1,%xmm1
+
+	vmovdqu	%xmm1,(%rsi)
+
+	addq	$16,%rdi
+	addq	$16,%rsi
+	subq	$1,%r10
+	jnz	.L256_enc_msg_x8_loop2
+
+.L256_enc_msg_x8_out:
+	ret
+
+.cfi_endproc	
+.size	aes256gcmsiv_enc_msg_x8,.-aes256gcmsiv_enc_msg_x8
+.globl	aes256gcmsiv_dec
+.hidden aes256gcmsiv_dec
+.type	aes256gcmsiv_dec,@function
+.align	16
+aes256gcmsiv_dec:
+.cfi_startproc	
+_CET_ENDBR
+	testq	$~15,%r9
+	jnz	.L256_dec_start
+	ret
+
+.L256_dec_start:
+	vzeroupper
+	vmovdqa	(%rdx),%xmm0
+
+
+	vmovdqu	16(%rdx),%xmm15
+	vpor	OR_MASK(%rip),%xmm15,%xmm15
+	movq	%rdx,%rax
+
+	leaq	32(%rax),%rax
+	leaq	32(%rcx),%rcx
+
+	andq	$~15,%r9
+
+
+	cmpq	$96,%r9
+	jb	.L256_dec_loop2
+
+
+	subq	$96,%r9
+	vmovdqa	%xmm15,%xmm7
+	vpaddd	one(%rip),%xmm7,%xmm8
+	vpaddd	two(%rip),%xmm7,%xmm9
+	vpaddd	one(%rip),%xmm9,%xmm10
+	vpaddd	two(%rip),%xmm9,%xmm11
+	vpaddd	one(%rip),%xmm11,%xmm12
+	vpaddd	two(%rip),%xmm11,%xmm15
+
+	vpxor	(%r8),%xmm7,%xmm7
+	vpxor	(%r8),%xmm8,%xmm8
+	vpxor	(%r8),%xmm9,%xmm9
+	vpxor	(%r8),%xmm10,%xmm10
+	vpxor	(%r8),%xmm11,%xmm11
+	vpxor	(%r8),%xmm12,%xmm12
+
+	vmovdqu	16(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	32(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	48(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	64(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	80(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	96(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	112(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	128(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	144(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	160(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	176(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	192(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	208(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	224(%r8),%xmm4
+	vaesenclast	%xmm4,%xmm7,%xmm7
+	vaesenclast	%xmm4,%xmm8,%xmm8
+	vaesenclast	%xmm4,%xmm9,%xmm9
+	vaesenclast	%xmm4,%xmm10,%xmm10
+	vaesenclast	%xmm4,%xmm11,%xmm11
+	vaesenclast	%xmm4,%xmm12,%xmm12
+
+
+	vpxor	0(%rdi),%xmm7,%xmm7
+	vpxor	16(%rdi),%xmm8,%xmm8
+	vpxor	32(%rdi),%xmm9,%xmm9
+	vpxor	48(%rdi),%xmm10,%xmm10
+	vpxor	64(%rdi),%xmm11,%xmm11
+	vpxor	80(%rdi),%xmm12,%xmm12
+
+	vmovdqu	%xmm7,0(%rsi)
+	vmovdqu	%xmm8,16(%rsi)
+	vmovdqu	%xmm9,32(%rsi)
+	vmovdqu	%xmm10,48(%rsi)
+	vmovdqu	%xmm11,64(%rsi)
+	vmovdqu	%xmm12,80(%rsi)
+
+	addq	$96,%rdi
+	addq	$96,%rsi
+	jmp	.L256_dec_loop1
+
+
+.align	64
+.L256_dec_loop1:
+	cmpq	$96,%r9
+	jb	.L256_dec_finish_96
+	subq	$96,%r9
+
+	vmovdqa	%xmm12,%xmm6
+	vmovdqa	%xmm11,16-32(%rax)
+	vmovdqa	%xmm10,32-32(%rax)
+	vmovdqa	%xmm9,48-32(%rax)
+	vmovdqa	%xmm8,64-32(%rax)
+	vmovdqa	%xmm7,80-32(%rax)
+
+	vmovdqa	%xmm15,%xmm7
+	vpaddd	one(%rip),%xmm7,%xmm8
+	vpaddd	two(%rip),%xmm7,%xmm9
+	vpaddd	one(%rip),%xmm9,%xmm10
+	vpaddd	two(%rip),%xmm9,%xmm11
+	vpaddd	one(%rip),%xmm11,%xmm12
+	vpaddd	two(%rip),%xmm11,%xmm15
+
+	vmovdqa	(%r8),%xmm4
+	vpxor	%xmm4,%xmm7,%xmm7
+	vpxor	%xmm4,%xmm8,%xmm8
+	vpxor	%xmm4,%xmm9,%xmm9
+	vpxor	%xmm4,%xmm10,%xmm10
+	vpxor	%xmm4,%xmm11,%xmm11
+	vpxor	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	0-32(%rcx),%xmm4
+	vpclmulqdq	$0x11,%xmm4,%xmm6,%xmm2
+	vpclmulqdq	$0x00,%xmm4,%xmm6,%xmm3
+	vpclmulqdq	$0x01,%xmm4,%xmm6,%xmm1
+	vpclmulqdq	$0x10,%xmm4,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	16(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	-16(%rax),%xmm6
+	vmovdqu	-16(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	32(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	0(%rax),%xmm6
+	vmovdqu	0(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	48(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	16(%rax),%xmm6
+	vmovdqu	16(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	64(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	32(%rax),%xmm6
+	vmovdqu	32(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	80(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	96(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	112(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+
+	vmovdqa	80-32(%rax),%xmm6
+	vpxor	%xmm0,%xmm6,%xmm6
+	vmovdqu	80-32(%rcx),%xmm5
+
+	vpclmulqdq	$0x01,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x10,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	128(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+
+	vpsrldq	$8,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm5
+	vpslldq	$8,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm0
+
+	vmovdqa	poly(%rip),%xmm3
+
+	vmovdqu	144(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	160(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	176(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	192(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	208(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	224(%r8),%xmm6
+	vpalignr	$8,%xmm0,%xmm0,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm2,%xmm0
+
+	vpxor	0(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm7,%xmm7
+	vpxor	16(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm8,%xmm8
+	vpxor	32(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm9,%xmm9
+	vpxor	48(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm10,%xmm10
+	vpxor	64(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm11,%xmm11
+	vpxor	80(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm12,%xmm12
+
+	vpalignr	$8,%xmm0,%xmm0,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm2,%xmm0
+
+	vmovdqu	%xmm7,0(%rsi)
+	vmovdqu	%xmm8,16(%rsi)
+	vmovdqu	%xmm9,32(%rsi)
+	vmovdqu	%xmm10,48(%rsi)
+	vmovdqu	%xmm11,64(%rsi)
+	vmovdqu	%xmm12,80(%rsi)
+
+	vpxor	%xmm5,%xmm0,%xmm0
+
+	leaq	96(%rdi),%rdi
+	leaq	96(%rsi),%rsi
+	jmp	.L256_dec_loop1
+
+.L256_dec_finish_96:
+	vmovdqa	%xmm12,%xmm6
+	vmovdqa	%xmm11,16-32(%rax)
+	vmovdqa	%xmm10,32-32(%rax)
+	vmovdqa	%xmm9,48-32(%rax)
+	vmovdqa	%xmm8,64-32(%rax)
+	vmovdqa	%xmm7,80-32(%rax)
+
+	vmovdqu	0-32(%rcx),%xmm4
+	vpclmulqdq	$0x10,%xmm4,%xmm6,%xmm1
+	vpclmulqdq	$0x11,%xmm4,%xmm6,%xmm2
+	vpclmulqdq	$0x00,%xmm4,%xmm6,%xmm3
+	vpclmulqdq	$0x01,%xmm4,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	-16(%rax),%xmm6
+	vmovdqu	-16(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	0(%rax),%xmm6
+	vmovdqu	0(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	16(%rax),%xmm6
+	vmovdqu	16(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	32(%rax),%xmm6
+	vmovdqu	32(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	80-32(%rax),%xmm6
+	vpxor	%xmm0,%xmm6,%xmm6
+	vmovdqu	80-32(%rcx),%xmm5
+	vpclmulqdq	$0x11,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x10,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x01,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vpsrldq	$8,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm5
+	vpslldq	$8,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm0
+
+	vmovdqa	poly(%rip),%xmm3
+
+	vpalignr	$8,%xmm0,%xmm0,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm2,%xmm0
+
+	vpalignr	$8,%xmm0,%xmm0,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm2,%xmm0
+
+	vpxor	%xmm5,%xmm0,%xmm0
+
+.L256_dec_loop2:
+
+
+
+	cmpq	$16,%r9
+	jb	.L256_dec_out
+	subq	$16,%r9
+
+	vmovdqa	%xmm15,%xmm2
+	vpaddd	one(%rip),%xmm15,%xmm15
+
+	vpxor	0(%r8),%xmm2,%xmm2
+	vaesenc	16(%r8),%xmm2,%xmm2
+	vaesenc	32(%r8),%xmm2,%xmm2
+	vaesenc	48(%r8),%xmm2,%xmm2
+	vaesenc	64(%r8),%xmm2,%xmm2
+	vaesenc	80(%r8),%xmm2,%xmm2
+	vaesenc	96(%r8),%xmm2,%xmm2
+	vaesenc	112(%r8),%xmm2,%xmm2
+	vaesenc	128(%r8),%xmm2,%xmm2
+	vaesenc	144(%r8),%xmm2,%xmm2
+	vaesenc	160(%r8),%xmm2,%xmm2
+	vaesenc	176(%r8),%xmm2,%xmm2
+	vaesenc	192(%r8),%xmm2,%xmm2
+	vaesenc	208(%r8),%xmm2,%xmm2
+	vaesenclast	224(%r8),%xmm2,%xmm2
+	vpxor	(%rdi),%xmm2,%xmm2
+	vmovdqu	%xmm2,(%rsi)
+	addq	$16,%rdi
+	addq	$16,%rsi
+
+	vpxor	%xmm2,%xmm0,%xmm0
+	vmovdqa	-32(%rcx),%xmm1
+	call	GFMUL
+
+	jmp	.L256_dec_loop2
+
+.L256_dec_out:
+	vmovdqu	%xmm0,(%rdx)
+	ret
+.cfi_endproc	
+.size	aes256gcmsiv_dec, .-aes256gcmsiv_dec
+.globl	aes256gcmsiv_kdf
+.hidden aes256gcmsiv_kdf
+.type	aes256gcmsiv_kdf,@function
+.align	16
+aes256gcmsiv_kdf:
+.cfi_startproc	
+_CET_ENDBR
+
+
+
+
+	vmovdqa	(%rdx),%xmm1
+	vmovdqa	0(%rdi),%xmm4
+	vmovdqa	and_mask(%rip),%xmm11
+	vmovdqa	one(%rip),%xmm8
+	vpshufd	$0x90,%xmm4,%xmm4
+	vpand	%xmm11,%xmm4,%xmm4
+	vpaddd	%xmm8,%xmm4,%xmm6
+	vpaddd	%xmm8,%xmm6,%xmm7
+	vpaddd	%xmm8,%xmm7,%xmm11
+	vpaddd	%xmm8,%xmm11,%xmm12
+	vpaddd	%xmm8,%xmm12,%xmm13
+
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpxor	%xmm1,%xmm6,%xmm6
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm1,%xmm11,%xmm11
+	vpxor	%xmm1,%xmm12,%xmm12
+	vpxor	%xmm1,%xmm13,%xmm13
+
+	vmovdqa	16(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vaesenc	%xmm1,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+
+	vmovdqa	32(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm4,%xmm4
+	vaesenc	%xmm2,%xmm6,%xmm6
+	vaesenc	%xmm2,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vaesenc	%xmm2,%xmm13,%xmm13
+
+	vmovdqa	48(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vaesenc	%xmm1,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+
+	vmovdqa	64(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm4,%xmm4
+	vaesenc	%xmm2,%xmm6,%xmm6
+	vaesenc	%xmm2,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vaesenc	%xmm2,%xmm13,%xmm13
+
+	vmovdqa	80(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vaesenc	%xmm1,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+
+	vmovdqa	96(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm4,%xmm4
+	vaesenc	%xmm2,%xmm6,%xmm6
+	vaesenc	%xmm2,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vaesenc	%xmm2,%xmm13,%xmm13
+
+	vmovdqa	112(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vaesenc	%xmm1,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+
+	vmovdqa	128(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm4,%xmm4
+	vaesenc	%xmm2,%xmm6,%xmm6
+	vaesenc	%xmm2,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vaesenc	%xmm2,%xmm13,%xmm13
+
+	vmovdqa	144(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vaesenc	%xmm1,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+
+	vmovdqa	160(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm4,%xmm4
+	vaesenc	%xmm2,%xmm6,%xmm6
+	vaesenc	%xmm2,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vaesenc	%xmm2,%xmm13,%xmm13
+
+	vmovdqa	176(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vaesenc	%xmm1,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+
+	vmovdqa	192(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm4,%xmm4
+	vaesenc	%xmm2,%xmm6,%xmm6
+	vaesenc	%xmm2,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vaesenc	%xmm2,%xmm13,%xmm13
+
+	vmovdqa	208(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vaesenc	%xmm1,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+
+	vmovdqa	224(%rdx),%xmm2
+	vaesenclast	%xmm2,%xmm4,%xmm4
+	vaesenclast	%xmm2,%xmm6,%xmm6
+	vaesenclast	%xmm2,%xmm7,%xmm7
+	vaesenclast	%xmm2,%xmm11,%xmm11
+	vaesenclast	%xmm2,%xmm12,%xmm12
+	vaesenclast	%xmm2,%xmm13,%xmm13
+
+
+	vmovdqa	%xmm4,0(%rsi)
+	vmovdqa	%xmm6,16(%rsi)
+	vmovdqa	%xmm7,32(%rsi)
+	vmovdqa	%xmm11,48(%rsi)
+	vmovdqa	%xmm12,64(%rsi)
+	vmovdqa	%xmm13,80(%rsi)
+	ret
+.cfi_endproc	
+.size	aes256gcmsiv_kdf, .-aes256gcmsiv_kdf
+#endif
diff --git a/gen/crypto/aes128gcmsiv-x86_64-win.asm b/gen/crypto/aes128gcmsiv-x86_64-win.asm
new file mode 100644
index 0000000..6691a2d
--- /dev/null
+++ b/gen/crypto/aes128gcmsiv-x86_64-win.asm
@@ -0,0 +1,3302 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section	.rdata rdata align=8
+
+ALIGN	16
+one:
+	DQ	1,0
+two:
+	DQ	2,0
+three:
+	DQ	3,0
+four:
+	DQ	4,0
+five:
+	DQ	5,0
+six:
+	DQ	6,0
+seven:
+	DQ	7,0
+eight:
+	DQ	8,0
+
+OR_MASK:
+	DD	0x00000000,0x00000000,0x00000000,0x80000000
+poly:
+	DQ	0x1,0xc200000000000000
+mask:
+	DD	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+con1:
+	DD	1,1,1,1
+con2:
+	DD	0x1b,0x1b,0x1b,0x1b
+con3:
+	DB	-1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7
+and_mask:
+	DD	0,0xffffffff,0xffffffff,0xffffffff
+section	.text code align=64
+
+
+ALIGN	16
+GFMUL:
+
+	vpclmulqdq	xmm2,xmm0,xmm1,0x00
+	vpclmulqdq	xmm5,xmm0,xmm1,0x11
+	vpclmulqdq	xmm3,xmm0,xmm1,0x10
+	vpclmulqdq	xmm4,xmm0,xmm1,0x01
+	vpxor	xmm3,xmm3,xmm4
+	vpslldq	xmm4,xmm3,8
+	vpsrldq	xmm3,xmm3,8
+	vpxor	xmm2,xmm2,xmm4
+	vpxor	xmm5,xmm5,xmm3
+
+	vpclmulqdq	xmm3,xmm2,XMMWORD[poly],0x10
+	vpshufd	xmm4,xmm2,78
+	vpxor	xmm2,xmm3,xmm4
+
+	vpclmulqdq	xmm3,xmm2,XMMWORD[poly],0x10
+	vpshufd	xmm4,xmm2,78
+	vpxor	xmm2,xmm3,xmm4
+
+	vpxor	xmm0,xmm2,xmm5
+	ret
+
+
+global	aesgcmsiv_htable_init
+
+ALIGN	16
+aesgcmsiv_htable_init:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_aesgcmsiv_htable_init:
+	mov	rdi,rcx
+	mov	rsi,rdx
+
+
+
+_CET_ENDBR
+	vmovdqa	xmm0,XMMWORD[rsi]
+	vmovdqa	xmm1,xmm0
+	vmovdqa	XMMWORD[rdi],xmm0
+	call	GFMUL
+	vmovdqa	XMMWORD[16+rdi],xmm0
+	call	GFMUL
+	vmovdqa	XMMWORD[32+rdi],xmm0
+	call	GFMUL
+	vmovdqa	XMMWORD[48+rdi],xmm0
+	call	GFMUL
+	vmovdqa	XMMWORD[64+rdi],xmm0
+	call	GFMUL
+	vmovdqa	XMMWORD[80+rdi],xmm0
+	call	GFMUL
+	vmovdqa	XMMWORD[96+rdi],xmm0
+	call	GFMUL
+	vmovdqa	XMMWORD[112+rdi],xmm0
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_aesgcmsiv_htable_init:
+global	aesgcmsiv_htable6_init
+
+ALIGN	16
+aesgcmsiv_htable6_init:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_aesgcmsiv_htable6_init:
+	mov	rdi,rcx
+	mov	rsi,rdx
+
+
+
+_CET_ENDBR
+	vmovdqa	xmm0,XMMWORD[rsi]
+	vmovdqa	xmm1,xmm0
+	vmovdqa	XMMWORD[rdi],xmm0
+	call	GFMUL
+	vmovdqa	XMMWORD[16+rdi],xmm0
+	call	GFMUL
+	vmovdqa	XMMWORD[32+rdi],xmm0
+	call	GFMUL
+	vmovdqa	XMMWORD[48+rdi],xmm0
+	call	GFMUL
+	vmovdqa	XMMWORD[64+rdi],xmm0
+	call	GFMUL
+	vmovdqa	XMMWORD[80+rdi],xmm0
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_aesgcmsiv_htable6_init:
+global	aesgcmsiv_htable_polyval
+
+ALIGN	16
+aesgcmsiv_htable_polyval:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_aesgcmsiv_htable_polyval:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+
+
+
+_CET_ENDBR
+	test	rdx,rdx
+	jnz	NEAR $L$htable_polyval_start
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$htable_polyval_start:
+	vzeroall
+
+
+
+	mov	r11,rdx
+	and	r11,127
+
+	jz	NEAR $L$htable_polyval_no_prefix
+
+	vpxor	xmm9,xmm9,xmm9
+	vmovdqa	xmm1,XMMWORD[rcx]
+	sub	rdx,r11
+
+	sub	r11,16
+
+
+	vmovdqu	xmm0,XMMWORD[rsi]
+	vpxor	xmm0,xmm0,xmm1
+
+	vpclmulqdq	xmm5,xmm0,XMMWORD[r11*1+rdi],0x01
+	vpclmulqdq	xmm3,xmm0,XMMWORD[r11*1+rdi],0x00
+	vpclmulqdq	xmm4,xmm0,XMMWORD[r11*1+rdi],0x11
+	vpclmulqdq	xmm6,xmm0,XMMWORD[r11*1+rdi],0x10
+	vpxor	xmm5,xmm5,xmm6
+
+	lea	rsi,[16+rsi]
+	test	r11,r11
+	jnz	NEAR $L$htable_polyval_prefix_loop
+	jmp	NEAR $L$htable_polyval_prefix_complete
+
+
+ALIGN	64
+$L$htable_polyval_prefix_loop:
+	sub	r11,16
+
+	vmovdqu	xmm0,XMMWORD[rsi]
+
+	vpclmulqdq	xmm6,xmm0,XMMWORD[r11*1+rdi],0x00
+	vpxor	xmm3,xmm3,xmm6
+	vpclmulqdq	xmm6,xmm0,XMMWORD[r11*1+rdi],0x11
+	vpxor	xmm4,xmm4,xmm6
+	vpclmulqdq	xmm6,xmm0,XMMWORD[r11*1+rdi],0x01
+	vpxor	xmm5,xmm5,xmm6
+	vpclmulqdq	xmm6,xmm0,XMMWORD[r11*1+rdi],0x10
+	vpxor	xmm5,xmm5,xmm6
+
+	test	r11,r11
+
+	lea	rsi,[16+rsi]
+
+	jnz	NEAR $L$htable_polyval_prefix_loop
+
+$L$htable_polyval_prefix_complete:
+	vpsrldq	xmm6,xmm5,8
+	vpslldq	xmm5,xmm5,8
+
+	vpxor	xmm9,xmm4,xmm6
+	vpxor	xmm1,xmm3,xmm5
+
+	jmp	NEAR $L$htable_polyval_main_loop
+
+$L$htable_polyval_no_prefix:
+
+
+
+
+	vpxor	xmm1,xmm1,xmm1
+	vmovdqa	xmm9,XMMWORD[rcx]
+
+ALIGN	64
+$L$htable_polyval_main_loop:
+	sub	rdx,0x80
+	jb	NEAR $L$htable_polyval_out
+
+	vmovdqu	xmm0,XMMWORD[112+rsi]
+
+	vpclmulqdq	xmm5,xmm0,XMMWORD[rdi],0x01
+	vpclmulqdq	xmm3,xmm0,XMMWORD[rdi],0x00
+	vpclmulqdq	xmm4,xmm0,XMMWORD[rdi],0x11
+	vpclmulqdq	xmm6,xmm0,XMMWORD[rdi],0x10
+	vpxor	xmm5,xmm5,xmm6
+
+
+	vmovdqu	xmm0,XMMWORD[96+rsi]
+	vpclmulqdq	xmm6,xmm0,XMMWORD[16+rdi],0x01
+	vpxor	xmm5,xmm5,xmm6
+	vpclmulqdq	xmm6,xmm0,XMMWORD[16+rdi],0x00
+	vpxor	xmm3,xmm3,xmm6
+	vpclmulqdq	xmm6,xmm0,XMMWORD[16+rdi],0x11
+	vpxor	xmm4,xmm4,xmm6
+	vpclmulqdq	xmm6,xmm0,XMMWORD[16+rdi],0x10
+	vpxor	xmm5,xmm5,xmm6
+
+
+
+	vmovdqu	xmm0,XMMWORD[80+rsi]
+
+	vpclmulqdq	xmm7,xmm1,XMMWORD[poly],0x10
+	vpalignr	xmm1,xmm1,xmm1,8
+
+	vpclmulqdq	xmm6,xmm0,XMMWORD[32+rdi],0x01
+	vpxor	xmm5,xmm5,xmm6
+	vpclmulqdq	xmm6,xmm0,XMMWORD[32+rdi],0x00
+	vpxor	xmm3,xmm3,xmm6
+	vpclmulqdq	xmm6,xmm0,XMMWORD[32+rdi],0x11
+	vpxor	xmm4,xmm4,xmm6
+	vpclmulqdq	xmm6,xmm0,XMMWORD[32+rdi],0x10
+	vpxor	xmm5,xmm5,xmm6
+
+
+	vpxor	xmm1,xmm1,xmm7
+
+	vmovdqu	xmm0,XMMWORD[64+rsi]
+
+	vpclmulqdq	xmm6,xmm0,XMMWORD[48+rdi],0x01
+	vpxor	xmm5,xmm5,xmm6
+	vpclmulqdq	xmm6,xmm0,XMMWORD[48+rdi],0x00
+	vpxor	xmm3,xmm3,xmm6
+	vpclmulqdq	xmm6,xmm0,XMMWORD[48+rdi],0x11
+	vpxor	xmm4,xmm4,xmm6
+	vpclmulqdq	xmm6,xmm0,XMMWORD[48+rdi],0x10
+	vpxor	xmm5,xmm5,xmm6
+
+
+	vmovdqu	xmm0,XMMWORD[48+rsi]
+
+	vpclmulqdq	xmm7,xmm1,XMMWORD[poly],0x10
+	vpalignr	xmm1,xmm1,xmm1,8
+
+	vpclmulqdq	xmm6,xmm0,XMMWORD[64+rdi],0x01
+	vpxor	xmm5,xmm5,xmm6
+	vpclmulqdq	xmm6,xmm0,XMMWORD[64+rdi],0x00
+	vpxor	xmm3,xmm3,xmm6
+	vpclmulqdq	xmm6,xmm0,XMMWORD[64+rdi],0x11
+	vpxor	xmm4,xmm4,xmm6
+	vpclmulqdq	xmm6,xmm0,XMMWORD[64+rdi],0x10
+	vpxor	xmm5,xmm5,xmm6
+
+
+	vpxor	xmm1,xmm1,xmm7
+
+	vmovdqu	xmm0,XMMWORD[32+rsi]
+
+	vpclmulqdq	xmm6,xmm0,XMMWORD[80+rdi],0x01
+	vpxor	xmm5,xmm5,xmm6
+	vpclmulqdq	xmm6,xmm0,XMMWORD[80+rdi],0x00
+	vpxor	xmm3,xmm3,xmm6
+	vpclmulqdq	xmm6,xmm0,XMMWORD[80+rdi],0x11
+	vpxor	xmm4,xmm4,xmm6
+	vpclmulqdq	xmm6,xmm0,XMMWORD[80+rdi],0x10
+	vpxor	xmm5,xmm5,xmm6
+
+
+	vpxor	xmm1,xmm1,xmm9
+
+	vmovdqu	xmm0,XMMWORD[16+rsi]
+
+	vpclmulqdq	xmm6,xmm0,XMMWORD[96+rdi],0x01
+	vpxor	xmm5,xmm5,xmm6
+	vpclmulqdq	xmm6,xmm0,XMMWORD[96+rdi],0x00
+	vpxor	xmm3,xmm3,xmm6
+	vpclmulqdq	xmm6,xmm0,XMMWORD[96+rdi],0x11
+	vpxor	xmm4,xmm4,xmm6
+	vpclmulqdq	xmm6,xmm0,XMMWORD[96+rdi],0x10
+	vpxor	xmm5,xmm5,xmm6
+
+
+	vmovdqu	xmm0,XMMWORD[rsi]
+	vpxor	xmm0,xmm0,xmm1
+
+	vpclmulqdq	xmm6,xmm0,XMMWORD[112+rdi],0x01
+	vpxor	xmm5,xmm5,xmm6
+	vpclmulqdq	xmm6,xmm0,XMMWORD[112+rdi],0x00
+	vpxor	xmm3,xmm3,xmm6
+	vpclmulqdq	xmm6,xmm0,XMMWORD[112+rdi],0x11
+	vpxor	xmm4,xmm4,xmm6
+	vpclmulqdq	xmm6,xmm0,XMMWORD[112+rdi],0x10
+	vpxor	xmm5,xmm5,xmm6
+
+
+	vpsrldq	xmm6,xmm5,8
+	vpslldq	xmm5,xmm5,8
+
+	vpxor	xmm9,xmm4,xmm6
+	vpxor	xmm1,xmm3,xmm5
+
+	lea	rsi,[128+rsi]
+	jmp	NEAR $L$htable_polyval_main_loop
+
+
+
+$L$htable_polyval_out:
+	vpclmulqdq	xmm6,xmm1,XMMWORD[poly],0x10
+	vpalignr	xmm1,xmm1,xmm1,8
+	vpxor	xmm1,xmm1,xmm6
+
+	vpclmulqdq	xmm6,xmm1,XMMWORD[poly],0x10
+	vpalignr	xmm1,xmm1,xmm1,8
+	vpxor	xmm1,xmm1,xmm6
+	vpxor	xmm1,xmm1,xmm9
+
+	vmovdqu	XMMWORD[rcx],xmm1
+	vzeroupper
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_aesgcmsiv_htable_polyval:
+global	aesgcmsiv_polyval_horner
+
+ALIGN	16
+aesgcmsiv_polyval_horner:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_aesgcmsiv_polyval_horner:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+
+
+
+_CET_ENDBR
+	test	rcx,rcx
+	jnz	NEAR $L$polyval_horner_start
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$polyval_horner_start:
+
+
+
+	xor	r10,r10
+	shl	rcx,4
+
+	vmovdqa	xmm1,XMMWORD[rsi]
+	vmovdqa	xmm0,XMMWORD[rdi]
+
+$L$polyval_horner_loop:
+	vpxor	xmm0,xmm0,XMMWORD[r10*1+rdx]
+	call	GFMUL
+
+	add	r10,16
+	cmp	rcx,r10
+	jne	NEAR $L$polyval_horner_loop
+
+
+	vmovdqa	XMMWORD[rdi],xmm0
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_aesgcmsiv_polyval_horner:
+global	aes128gcmsiv_aes_ks
+
+ALIGN	16
+aes128gcmsiv_aes_ks:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_aes128gcmsiv_aes_ks:
+	mov	rdi,rcx
+	mov	rsi,rdx
+
+
+
+_CET_ENDBR
+	vmovdqu	xmm1,XMMWORD[rdi]
+	vmovdqa	XMMWORD[rsi],xmm1
+
+	vmovdqa	xmm0,XMMWORD[con1]
+	vmovdqa	xmm15,XMMWORD[mask]
+
+	mov	rax,8
+
+$L$ks128_loop:
+	add	rsi,16
+	sub	rax,1
+	vpshufb	xmm2,xmm1,xmm15
+	vaesenclast	xmm2,xmm2,xmm0
+	vpslld	xmm0,xmm0,1
+	vpslldq	xmm3,xmm1,4
+	vpxor	xmm1,xmm1,xmm3
+	vpslldq	xmm3,xmm3,4
+	vpxor	xmm1,xmm1,xmm3
+	vpslldq	xmm3,xmm3,4
+	vpxor	xmm1,xmm1,xmm3
+	vpxor	xmm1,xmm1,xmm2
+	vmovdqa	XMMWORD[rsi],xmm1
+	jne	NEAR $L$ks128_loop
+
+	vmovdqa	xmm0,XMMWORD[con2]
+	vpshufb	xmm2,xmm1,xmm15
+	vaesenclast	xmm2,xmm2,xmm0
+	vpslld	xmm0,xmm0,1
+	vpslldq	xmm3,xmm1,4
+	vpxor	xmm1,xmm1,xmm3
+	vpslldq	xmm3,xmm3,4
+	vpxor	xmm1,xmm1,xmm3
+	vpslldq	xmm3,xmm3,4
+	vpxor	xmm1,xmm1,xmm3
+	vpxor	xmm1,xmm1,xmm2
+	vmovdqa	XMMWORD[16+rsi],xmm1
+
+	vpshufb	xmm2,xmm1,xmm15
+	vaesenclast	xmm2,xmm2,xmm0
+	vpslldq	xmm3,xmm1,4
+	vpxor	xmm1,xmm1,xmm3
+	vpslldq	xmm3,xmm3,4
+	vpxor	xmm1,xmm1,xmm3
+	vpslldq	xmm3,xmm3,4
+	vpxor	xmm1,xmm1,xmm3
+	vpxor	xmm1,xmm1,xmm2
+	vmovdqa	XMMWORD[32+rsi],xmm1
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_aes128gcmsiv_aes_ks:
+global	aes256gcmsiv_aes_ks
+
+ALIGN	16
+aes256gcmsiv_aes_ks:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_aes256gcmsiv_aes_ks:
+	mov	rdi,rcx
+	mov	rsi,rdx
+
+
+
+_CET_ENDBR
+	vmovdqu	xmm1,XMMWORD[rdi]
+	vmovdqu	xmm3,XMMWORD[16+rdi]
+	vmovdqa	XMMWORD[rsi],xmm1
+	vmovdqa	XMMWORD[16+rsi],xmm3
+	vmovdqa	xmm0,XMMWORD[con1]
+	vmovdqa	xmm15,XMMWORD[mask]
+	vpxor	xmm14,xmm14,xmm14
+	mov	rax,6
+
+$L$ks256_loop:
+	add	rsi,32
+	sub	rax,1
+	vpshufb	xmm2,xmm3,xmm15
+	vaesenclast	xmm2,xmm2,xmm0
+	vpslld	xmm0,xmm0,1
+	vpsllq	xmm4,xmm1,32
+	vpxor	xmm1,xmm1,xmm4
+	vpshufb	xmm4,xmm1,XMMWORD[con3]
+	vpxor	xmm1,xmm1,xmm4
+	vpxor	xmm1,xmm1,xmm2
+	vmovdqa	XMMWORD[rsi],xmm1
+	vpshufd	xmm2,xmm1,0xff
+	vaesenclast	xmm2,xmm2,xmm14
+	vpsllq	xmm4,xmm3,32
+	vpxor	xmm3,xmm3,xmm4
+	vpshufb	xmm4,xmm3,XMMWORD[con3]
+	vpxor	xmm3,xmm3,xmm4
+	vpxor	xmm3,xmm3,xmm2
+	vmovdqa	XMMWORD[16+rsi],xmm3
+	jne	NEAR $L$ks256_loop
+
+	vpshufb	xmm2,xmm3,xmm15
+	vaesenclast	xmm2,xmm2,xmm0
+	vpsllq	xmm4,xmm1,32
+	vpxor	xmm1,xmm1,xmm4
+	vpshufb	xmm4,xmm1,XMMWORD[con3]
+	vpxor	xmm1,xmm1,xmm4
+	vpxor	xmm1,xmm1,xmm2
+	vmovdqa	XMMWORD[32+rsi],xmm1
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+global	aes128gcmsiv_aes_ks_enc_x1
+
+ALIGN	16
+aes128gcmsiv_aes_ks_enc_x1:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_aes128gcmsiv_aes_ks_enc_x1:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+
+
+
+_CET_ENDBR
+	vmovdqa	xmm1,XMMWORD[rcx]
+	vmovdqa	xmm4,XMMWORD[rdi]
+
+	vmovdqa	XMMWORD[rdx],xmm1
+	vpxor	xmm4,xmm4,xmm1
+
+	vmovdqa	xmm0,XMMWORD[con1]
+	vmovdqa	xmm15,XMMWORD[mask]
+
+	vpshufb	xmm2,xmm1,xmm15
+	vaesenclast	xmm2,xmm2,xmm0
+	vpslld	xmm0,xmm0,1
+	vpsllq	xmm3,xmm1,32
+	vpxor	xmm1,xmm1,xmm3
+	vpshufb	xmm3,xmm1,XMMWORD[con3]
+	vpxor	xmm1,xmm1,xmm3
+	vpxor	xmm1,xmm1,xmm2
+
+	vaesenc	xmm4,xmm4,xmm1
+	vmovdqa	XMMWORD[16+rdx],xmm1
+
+	vpshufb	xmm2,xmm1,xmm15
+	vaesenclast	xmm2,xmm2,xmm0
+	vpslld	xmm0,xmm0,1
+	vpsllq	xmm3,xmm1,32
+	vpxor	xmm1,xmm1,xmm3
+	vpshufb	xmm3,xmm1,XMMWORD[con3]
+	vpxor	xmm1,xmm1,xmm3
+	vpxor	xmm1,xmm1,xmm2
+
+	vaesenc	xmm4,xmm4,xmm1
+	vmovdqa	XMMWORD[32+rdx],xmm1
+
+	vpshufb	xmm2,xmm1,xmm15
+	vaesenclast	xmm2,xmm2,xmm0
+	vpslld	xmm0,xmm0,1
+	vpsllq	xmm3,xmm1,32
+	vpxor	xmm1,xmm1,xmm3
+	vpshufb	xmm3,xmm1,XMMWORD[con3]
+	vpxor	xmm1,xmm1,xmm3
+	vpxor	xmm1,xmm1,xmm2
+
+	vaesenc	xmm4,xmm4,xmm1
+	vmovdqa	XMMWORD[48+rdx],xmm1
+
+	vpshufb	xmm2,xmm1,xmm15
+	vaesenclast	xmm2,xmm2,xmm0
+	vpslld	xmm0,xmm0,1
+	vpsllq	xmm3,xmm1,32
+	vpxor	xmm1,xmm1,xmm3
+	vpshufb	xmm3,xmm1,XMMWORD[con3]
+	vpxor	xmm1,xmm1,xmm3
+	vpxor	xmm1,xmm1,xmm2
+
+	vaesenc	xmm4,xmm4,xmm1
+	vmovdqa	XMMWORD[64+rdx],xmm1
+
+	vpshufb	xmm2,xmm1,xmm15
+	vaesenclast	xmm2,xmm2,xmm0
+	vpslld	xmm0,xmm0,1
+	vpsllq	xmm3,xmm1,32
+	vpxor	xmm1,xmm1,xmm3
+	vpshufb	xmm3,xmm1,XMMWORD[con3]
+	vpxor	xmm1,xmm1,xmm3
+	vpxor	xmm1,xmm1,xmm2
+
+	vaesenc	xmm4,xmm4,xmm1
+	vmovdqa	XMMWORD[80+rdx],xmm1
+
+	vpshufb	xmm2,xmm1,xmm15
+	vaesenclast	xmm2,xmm2,xmm0
+	vpslld	xmm0,xmm0,1
+	vpsllq	xmm3,xmm1,32
+	vpxor	xmm1,xmm1,xmm3
+	vpshufb	xmm3,xmm1,XMMWORD[con3]
+	vpxor	xmm1,xmm1,xmm3
+	vpxor	xmm1,xmm1,xmm2
+
+	vaesenc	xmm4,xmm4,xmm1
+	vmovdqa	XMMWORD[96+rdx],xmm1
+
+	vpshufb	xmm2,xmm1,xmm15
+	vaesenclast	xmm2,xmm2,xmm0
+	vpslld	xmm0,xmm0,1
+	vpsllq	xmm3,xmm1,32
+	vpxor	xmm1,xmm1,xmm3
+	vpshufb	xmm3,xmm1,XMMWORD[con3]
+	vpxor	xmm1,xmm1,xmm3
+	vpxor	xmm1,xmm1,xmm2
+
+	vaesenc	xmm4,xmm4,xmm1
+	vmovdqa	XMMWORD[112+rdx],xmm1
+
+	vpshufb	xmm2,xmm1,xmm15
+	vaesenclast	xmm2,xmm2,xmm0
+	vpslld	xmm0,xmm0,1
+	vpsllq	xmm3,xmm1,32
+	vpxor	xmm1,xmm1,xmm3
+	vpshufb	xmm3,xmm1,XMMWORD[con3]
+	vpxor	xmm1,xmm1,xmm3
+	vpxor	xmm1,xmm1,xmm2
+
+	vaesenc	xmm4,xmm4,xmm1
+	vmovdqa	XMMWORD[128+rdx],xmm1
+
+
+	vmovdqa	xmm0,XMMWORD[con2]
+
+	vpshufb	xmm2,xmm1,xmm15
+	vaesenclast	xmm2,xmm2,xmm0
+	vpslld	xmm0,xmm0,1
+	vpsllq	xmm3,xmm1,32
+	vpxor	xmm1,xmm1,xmm3
+	vpshufb	xmm3,xmm1,XMMWORD[con3]
+	vpxor	xmm1,xmm1,xmm3
+	vpxor	xmm1,xmm1,xmm2
+
+	vaesenc	xmm4,xmm4,xmm1
+	vmovdqa	XMMWORD[144+rdx],xmm1
+
+	vpshufb	xmm2,xmm1,xmm15
+	vaesenclast	xmm2,xmm2,xmm0
+	vpsllq	xmm3,xmm1,32
+	vpxor	xmm1,xmm1,xmm3
+	vpshufb	xmm3,xmm1,XMMWORD[con3]
+	vpxor	xmm1,xmm1,xmm3
+	vpxor	xmm1,xmm1,xmm2
+
+	vaesenclast	xmm4,xmm4,xmm1
+	vmovdqa	XMMWORD[160+rdx],xmm1
+
+
+	vmovdqa	XMMWORD[rsi],xmm4
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_aes128gcmsiv_aes_ks_enc_x1:
+global	aes128gcmsiv_kdf
+
+ALIGN	16
+aes128gcmsiv_kdf:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_aes128gcmsiv_kdf:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+_CET_ENDBR
+
+
+
+
+	vmovdqa	xmm1,XMMWORD[rdx]
+	vmovdqa	xmm9,XMMWORD[rdi]
+	vmovdqa	xmm12,XMMWORD[and_mask]
+	vmovdqa	xmm13,XMMWORD[one]
+	vpshufd	xmm9,xmm9,0x90
+	vpand	xmm9,xmm9,xmm12
+	vpaddd	xmm10,xmm9,xmm13
+	vpaddd	xmm11,xmm10,xmm13
+	vpaddd	xmm12,xmm11,xmm13
+
+	vpxor	xmm9,xmm9,xmm1
+	vpxor	xmm10,xmm10,xmm1
+	vpxor	xmm11,xmm11,xmm1
+	vpxor	xmm12,xmm12,xmm1
+
+	vmovdqa	xmm1,XMMWORD[16+rdx]
+	vaesenc	xmm9,xmm9,xmm1
+	vaesenc	xmm10,xmm10,xmm1
+	vaesenc	xmm11,xmm11,xmm1
+	vaesenc	xmm12,xmm12,xmm1
+
+	vmovdqa	xmm2,XMMWORD[32+rdx]
+	vaesenc	xmm9,xmm9,xmm2
+	vaesenc	xmm10,xmm10,xmm2
+	vaesenc	xmm11,xmm11,xmm2
+	vaesenc	xmm12,xmm12,xmm2
+
+	vmovdqa	xmm1,XMMWORD[48+rdx]
+	vaesenc	xmm9,xmm9,xmm1
+	vaesenc	xmm10,xmm10,xmm1
+	vaesenc	xmm11,xmm11,xmm1
+	vaesenc	xmm12,xmm12,xmm1
+
+	vmovdqa	xmm2,XMMWORD[64+rdx]
+	vaesenc	xmm9,xmm9,xmm2
+	vaesenc	xmm10,xmm10,xmm2
+	vaesenc	xmm11,xmm11,xmm2
+	vaesenc	xmm12,xmm12,xmm2
+
+	vmovdqa	xmm1,XMMWORD[80+rdx]
+	vaesenc	xmm9,xmm9,xmm1
+	vaesenc	xmm10,xmm10,xmm1
+	vaesenc	xmm11,xmm11,xmm1
+	vaesenc	xmm12,xmm12,xmm1
+
+	vmovdqa	xmm2,XMMWORD[96+rdx]
+	vaesenc	xmm9,xmm9,xmm2
+	vaesenc	xmm10,xmm10,xmm2
+	vaesenc	xmm11,xmm11,xmm2
+	vaesenc	xmm12,xmm12,xmm2
+
+	vmovdqa	xmm1,XMMWORD[112+rdx]
+	vaesenc	xmm9,xmm9,xmm1
+	vaesenc	xmm10,xmm10,xmm1
+	vaesenc	xmm11,xmm11,xmm1
+	vaesenc	xmm12,xmm12,xmm1
+
+	vmovdqa	xmm2,XMMWORD[128+rdx]
+	vaesenc	xmm9,xmm9,xmm2
+	vaesenc	xmm10,xmm10,xmm2
+	vaesenc	xmm11,xmm11,xmm2
+	vaesenc	xmm12,xmm12,xmm2
+
+	vmovdqa	xmm1,XMMWORD[144+rdx]
+	vaesenc	xmm9,xmm9,xmm1
+	vaesenc	xmm10,xmm10,xmm1
+	vaesenc	xmm11,xmm11,xmm1
+	vaesenc	xmm12,xmm12,xmm1
+
+	vmovdqa	xmm2,XMMWORD[160+rdx]
+	vaesenclast	xmm9,xmm9,xmm2
+	vaesenclast	xmm10,xmm10,xmm2
+	vaesenclast	xmm11,xmm11,xmm2
+	vaesenclast	xmm12,xmm12,xmm2
+
+
+	vmovdqa	XMMWORD[rsi],xmm9
+	vmovdqa	XMMWORD[16+rsi],xmm10
+	vmovdqa	XMMWORD[32+rsi],xmm11
+	vmovdqa	XMMWORD[48+rsi],xmm12
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_aes128gcmsiv_kdf:
+global	aes128gcmsiv_enc_msg_x4
+
+ALIGN	16
+aes128gcmsiv_enc_msg_x4:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_aes128gcmsiv_enc_msg_x4:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+
+
+
+_CET_ENDBR
+	test	r8,r8
+	jnz	NEAR $L$128_enc_msg_x4_start
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$128_enc_msg_x4_start:
+	push	r12
+
+	push	r13
+
+
+	shr	r8,4
+	mov	r10,r8
+	shl	r10,62
+	shr	r10,62
+
+
+	vmovdqa	xmm15,XMMWORD[rdx]
+	vpor	xmm15,xmm15,XMMWORD[OR_MASK]
+
+	vmovdqu	xmm4,XMMWORD[four]
+	vmovdqa	xmm0,xmm15
+	vpaddd	xmm1,xmm15,XMMWORD[one]
+	vpaddd	xmm2,xmm15,XMMWORD[two]
+	vpaddd	xmm3,xmm15,XMMWORD[three]
+
+	shr	r8,2
+	je	NEAR $L$128_enc_msg_x4_check_remainder
+
+	sub	rsi,64
+	sub	rdi,64
+
+$L$128_enc_msg_x4_loop1:
+	add	rsi,64
+	add	rdi,64
+
+	vmovdqa	xmm5,xmm0
+	vmovdqa	xmm6,xmm1
+	vmovdqa	xmm7,xmm2
+	vmovdqa	xmm8,xmm3
+
+	vpxor	xmm5,xmm5,XMMWORD[rcx]
+	vpxor	xmm6,xmm6,XMMWORD[rcx]
+	vpxor	xmm7,xmm7,XMMWORD[rcx]
+	vpxor	xmm8,xmm8,XMMWORD[rcx]
+
+	vmovdqu	xmm12,XMMWORD[16+rcx]
+	vaesenc	xmm5,xmm5,xmm12
+	vaesenc	xmm6,xmm6,xmm12
+	vaesenc	xmm7,xmm7,xmm12
+	vaesenc	xmm8,xmm8,xmm12
+
+	vpaddd	xmm0,xmm0,xmm4
+	vmovdqu	xmm12,XMMWORD[32+rcx]
+	vaesenc	xmm5,xmm5,xmm12
+	vaesenc	xmm6,xmm6,xmm12
+	vaesenc	xmm7,xmm7,xmm12
+	vaesenc	xmm8,xmm8,xmm12
+
+	vpaddd	xmm1,xmm1,xmm4
+	vmovdqu	xmm12,XMMWORD[48+rcx]
+	vaesenc	xmm5,xmm5,xmm12
+	vaesenc	xmm6,xmm6,xmm12
+	vaesenc	xmm7,xmm7,xmm12
+	vaesenc	xmm8,xmm8,xmm12
+
+	vpaddd	xmm2,xmm2,xmm4
+	vmovdqu	xmm12,XMMWORD[64+rcx]
+	vaesenc	xmm5,xmm5,xmm12
+	vaesenc	xmm6,xmm6,xmm12
+	vaesenc	xmm7,xmm7,xmm12
+	vaesenc	xmm8,xmm8,xmm12
+
+	vpaddd	xmm3,xmm3,xmm4
+
+	vmovdqu	xmm12,XMMWORD[80+rcx]
+	vaesenc	xmm5,xmm5,xmm12
+	vaesenc	xmm6,xmm6,xmm12
+	vaesenc	xmm7,xmm7,xmm12
+	vaesenc	xmm8,xmm8,xmm12
+
+	vmovdqu	xmm12,XMMWORD[96+rcx]
+	vaesenc	xmm5,xmm5,xmm12
+	vaesenc	xmm6,xmm6,xmm12
+	vaesenc	xmm7,xmm7,xmm12
+	vaesenc	xmm8,xmm8,xmm12
+
+	vmovdqu	xmm12,XMMWORD[112+rcx]
+	vaesenc	xmm5,xmm5,xmm12
+	vaesenc	xmm6,xmm6,xmm12
+	vaesenc	xmm7,xmm7,xmm12
+	vaesenc	xmm8,xmm8,xmm12
+
+	vmovdqu	xmm12,XMMWORD[128+rcx]
+	vaesenc	xmm5,xmm5,xmm12
+	vaesenc	xmm6,xmm6,xmm12
+	vaesenc	xmm7,xmm7,xmm12
+	vaesenc	xmm8,xmm8,xmm12
+
+	vmovdqu	xmm12,XMMWORD[144+rcx]
+	vaesenc	xmm5,xmm5,xmm12
+	vaesenc	xmm6,xmm6,xmm12
+	vaesenc	xmm7,xmm7,xmm12
+	vaesenc	xmm8,xmm8,xmm12
+
+	vmovdqu	xmm12,XMMWORD[160+rcx]
+	vaesenclast	xmm5,xmm5,xmm12
+	vaesenclast	xmm6,xmm6,xmm12
+	vaesenclast	xmm7,xmm7,xmm12
+	vaesenclast	xmm8,xmm8,xmm12
+
+
+
+	vpxor	xmm5,xmm5,XMMWORD[rdi]
+	vpxor	xmm6,xmm6,XMMWORD[16+rdi]
+	vpxor	xmm7,xmm7,XMMWORD[32+rdi]
+	vpxor	xmm8,xmm8,XMMWORD[48+rdi]
+
+	sub	r8,1
+
+	vmovdqu	XMMWORD[rsi],xmm5
+	vmovdqu	XMMWORD[16+rsi],xmm6
+	vmovdqu	XMMWORD[32+rsi],xmm7
+	vmovdqu	XMMWORD[48+rsi],xmm8
+
+	jne	NEAR $L$128_enc_msg_x4_loop1
+
+	add	rsi,64
+	add	rdi,64
+
+$L$128_enc_msg_x4_check_remainder:
+	cmp	r10,0
+	je	NEAR $L$128_enc_msg_x4_out
+
+$L$128_enc_msg_x4_loop2:
+
+
+	vmovdqa	xmm5,xmm0
+	vpaddd	xmm0,xmm0,XMMWORD[one]
+
+	vpxor	xmm5,xmm5,XMMWORD[rcx]
+	vaesenc	xmm5,xmm5,XMMWORD[16+rcx]
+	vaesenc	xmm5,xmm5,XMMWORD[32+rcx]
+	vaesenc	xmm5,xmm5,XMMWORD[48+rcx]
+	vaesenc	xmm5,xmm5,XMMWORD[64+rcx]
+	vaesenc	xmm5,xmm5,XMMWORD[80+rcx]
+	vaesenc	xmm5,xmm5,XMMWORD[96+rcx]
+	vaesenc	xmm5,xmm5,XMMWORD[112+rcx]
+	vaesenc	xmm5,xmm5,XMMWORD[128+rcx]
+	vaesenc	xmm5,xmm5,XMMWORD[144+rcx]
+	vaesenclast	xmm5,xmm5,XMMWORD[160+rcx]
+
+
+	vpxor	xmm5,xmm5,XMMWORD[rdi]
+	vmovdqu	XMMWORD[rsi],xmm5
+
+	add	rdi,16
+	add	rsi,16
+
+	sub	r10,1
+	jne	NEAR $L$128_enc_msg_x4_loop2
+
+$L$128_enc_msg_x4_out:
+	pop	r13
+
+	pop	r12
+
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_aes128gcmsiv_enc_msg_x4:
+global	aes128gcmsiv_enc_msg_x8
+
+ALIGN	16
+aes128gcmsiv_enc_msg_x8:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_aes128gcmsiv_enc_msg_x8:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+
+
+
+_CET_ENDBR
+	test	r8,r8
+	jnz	NEAR $L$128_enc_msg_x8_start
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$128_enc_msg_x8_start:
+	push	r12
+
+	push	r13
+
+	push	rbp
+
+	mov	rbp,rsp
+
+
+
+	sub	rsp,128
+	and	rsp,-64
+
+	shr	r8,4
+	mov	r10,r8
+	shl	r10,61
+	shr	r10,61
+
+
+	vmovdqu	xmm1,XMMWORD[rdx]
+	vpor	xmm1,xmm1,XMMWORD[OR_MASK]
+
+
+	vpaddd	xmm0,xmm1,XMMWORD[seven]
+	vmovdqu	XMMWORD[rsp],xmm0
+	vpaddd	xmm9,xmm1,XMMWORD[one]
+	vpaddd	xmm10,xmm1,XMMWORD[two]
+	vpaddd	xmm11,xmm1,XMMWORD[three]
+	vpaddd	xmm12,xmm1,XMMWORD[four]
+	vpaddd	xmm13,xmm1,XMMWORD[five]
+	vpaddd	xmm14,xmm1,XMMWORD[six]
+	vmovdqa	xmm0,xmm1
+
+	shr	r8,3
+	je	NEAR $L$128_enc_msg_x8_check_remainder
+
+	sub	rsi,128
+	sub	rdi,128
+
+$L$128_enc_msg_x8_loop1:
+	add	rsi,128
+	add	rdi,128
+
+	vmovdqa	xmm1,xmm0
+	vmovdqa	xmm2,xmm9
+	vmovdqa	xmm3,xmm10
+	vmovdqa	xmm4,xmm11
+	vmovdqa	xmm5,xmm12
+	vmovdqa	xmm6,xmm13
+	vmovdqa	xmm7,xmm14
+
+	vmovdqu	xmm8,XMMWORD[rsp]
+
+	vpxor	xmm1,xmm1,XMMWORD[rcx]
+	vpxor	xmm2,xmm2,XMMWORD[rcx]
+	vpxor	xmm3,xmm3,XMMWORD[rcx]
+	vpxor	xmm4,xmm4,XMMWORD[rcx]
+	vpxor	xmm5,xmm5,XMMWORD[rcx]
+	vpxor	xmm6,xmm6,XMMWORD[rcx]
+	vpxor	xmm7,xmm7,XMMWORD[rcx]
+	vpxor	xmm8,xmm8,XMMWORD[rcx]
+
+	vmovdqu	xmm15,XMMWORD[16+rcx]
+	vaesenc	xmm1,xmm1,xmm15
+	vaesenc	xmm2,xmm2,xmm15
+	vaesenc	xmm3,xmm3,xmm15
+	vaesenc	xmm4,xmm4,xmm15
+	vaesenc	xmm5,xmm5,xmm15
+	vaesenc	xmm6,xmm6,xmm15
+	vaesenc	xmm7,xmm7,xmm15
+	vaesenc	xmm8,xmm8,xmm15
+
+	vmovdqu	xmm14,XMMWORD[rsp]
+	vpaddd	xmm14,xmm14,XMMWORD[eight]
+	vmovdqu	XMMWORD[rsp],xmm14
+	vmovdqu	xmm15,XMMWORD[32+rcx]
+	vaesenc	xmm1,xmm1,xmm15
+	vaesenc	xmm2,xmm2,xmm15
+	vaesenc	xmm3,xmm3,xmm15
+	vaesenc	xmm4,xmm4,xmm15
+	vaesenc	xmm5,xmm5,xmm15
+	vaesenc	xmm6,xmm6,xmm15
+	vaesenc	xmm7,xmm7,xmm15
+	vaesenc	xmm8,xmm8,xmm15
+
+	vpsubd	xmm14,xmm14,XMMWORD[one]
+	vmovdqu	xmm15,XMMWORD[48+rcx]
+	vaesenc	xmm1,xmm1,xmm15
+	vaesenc	xmm2,xmm2,xmm15
+	vaesenc	xmm3,xmm3,xmm15
+	vaesenc	xmm4,xmm4,xmm15
+	vaesenc	xmm5,xmm5,xmm15
+	vaesenc	xmm6,xmm6,xmm15
+	vaesenc	xmm7,xmm7,xmm15
+	vaesenc	xmm8,xmm8,xmm15
+
+	vpaddd	xmm0,xmm0,XMMWORD[eight]
+	vmovdqu	xmm15,XMMWORD[64+rcx]
+	vaesenc	xmm1,xmm1,xmm15
+	vaesenc	xmm2,xmm2,xmm15
+	vaesenc	xmm3,xmm3,xmm15
+	vaesenc	xmm4,xmm4,xmm15
+	vaesenc	xmm5,xmm5,xmm15
+	vaesenc	xmm6,xmm6,xmm15
+	vaesenc	xmm7,xmm7,xmm15
+	vaesenc	xmm8,xmm8,xmm15
+
+	vpaddd	xmm9,xmm9,XMMWORD[eight]
+	vmovdqu	xmm15,XMMWORD[80+rcx]
+	vaesenc	xmm1,xmm1,xmm15
+	vaesenc	xmm2,xmm2,xmm15
+	vaesenc	xmm3,xmm3,xmm15
+	vaesenc	xmm4,xmm4,xmm15
+	vaesenc	xmm5,xmm5,xmm15
+	vaesenc	xmm6,xmm6,xmm15
+	vaesenc	xmm7,xmm7,xmm15
+	vaesenc	xmm8,xmm8,xmm15
+
+	vpaddd	xmm10,xmm10,XMMWORD[eight]
+	vmovdqu	xmm15,XMMWORD[96+rcx]
+	vaesenc	xmm1,xmm1,xmm15
+	vaesenc	xmm2,xmm2,xmm15
+	vaesenc	xmm3,xmm3,xmm15
+	vaesenc	xmm4,xmm4,xmm15
+	vaesenc	xmm5,xmm5,xmm15
+	vaesenc	xmm6,xmm6,xmm15
+	vaesenc	xmm7,xmm7,xmm15
+	vaesenc	xmm8,xmm8,xmm15
+
+	vpaddd	xmm11,xmm11,XMMWORD[eight]
+	vmovdqu	xmm15,XMMWORD[112+rcx]
+	vaesenc	xmm1,xmm1,xmm15
+	vaesenc	xmm2,xmm2,xmm15
+	vaesenc	xmm3,xmm3,xmm15
+	vaesenc	xmm4,xmm4,xmm15
+	vaesenc	xmm5,xmm5,xmm15
+	vaesenc	xmm6,xmm6,xmm15
+	vaesenc	xmm7,xmm7,xmm15
+	vaesenc	xmm8,xmm8,xmm15
+
+	vpaddd	xmm12,xmm12,XMMWORD[eight]
+	vmovdqu	xmm15,XMMWORD[128+rcx]
+	vaesenc	xmm1,xmm1,xmm15
+	vaesenc	xmm2,xmm2,xmm15
+	vaesenc	xmm3,xmm3,xmm15
+	vaesenc	xmm4,xmm4,xmm15
+	vaesenc	xmm5,xmm5,xmm15
+	vaesenc	xmm6,xmm6,xmm15
+	vaesenc	xmm7,xmm7,xmm15
+	vaesenc	xmm8,xmm8,xmm15
+
+	vpaddd	xmm13,xmm13,XMMWORD[eight]
+	vmovdqu	xmm15,XMMWORD[144+rcx]
+	vaesenc	xmm1,xmm1,xmm15
+	vaesenc	xmm2,xmm2,xmm15
+	vaesenc	xmm3,xmm3,xmm15
+	vaesenc	xmm4,xmm4,xmm15
+	vaesenc	xmm5,xmm5,xmm15
+	vaesenc	xmm6,xmm6,xmm15
+	vaesenc	xmm7,xmm7,xmm15
+	vaesenc	xmm8,xmm8,xmm15
+
+	vmovdqu	xmm15,XMMWORD[160+rcx]
+	vaesenclast	xmm1,xmm1,xmm15
+	vaesenclast	xmm2,xmm2,xmm15
+	vaesenclast	xmm3,xmm3,xmm15
+	vaesenclast	xmm4,xmm4,xmm15
+	vaesenclast	xmm5,xmm5,xmm15
+	vaesenclast	xmm6,xmm6,xmm15
+	vaesenclast	xmm7,xmm7,xmm15
+	vaesenclast	xmm8,xmm8,xmm15
+
+
+
+	vpxor	xmm1,xmm1,XMMWORD[rdi]
+	vpxor	xmm2,xmm2,XMMWORD[16+rdi]
+	vpxor	xmm3,xmm3,XMMWORD[32+rdi]
+	vpxor	xmm4,xmm4,XMMWORD[48+rdi]
+	vpxor	xmm5,xmm5,XMMWORD[64+rdi]
+	vpxor	xmm6,xmm6,XMMWORD[80+rdi]
+	vpxor	xmm7,xmm7,XMMWORD[96+rdi]
+	vpxor	xmm8,xmm8,XMMWORD[112+rdi]
+
+	dec	r8
+
+	vmovdqu	XMMWORD[rsi],xmm1
+	vmovdqu	XMMWORD[16+rsi],xmm2
+	vmovdqu	XMMWORD[32+rsi],xmm3
+	vmovdqu	XMMWORD[48+rsi],xmm4
+	vmovdqu	XMMWORD[64+rsi],xmm5
+	vmovdqu	XMMWORD[80+rsi],xmm6
+	vmovdqu	XMMWORD[96+rsi],xmm7
+	vmovdqu	XMMWORD[112+rsi],xmm8
+
+	jne	NEAR $L$128_enc_msg_x8_loop1
+
+	add	rsi,128
+	add	rdi,128
+
+$L$128_enc_msg_x8_check_remainder:
+	cmp	r10,0
+	je	NEAR $L$128_enc_msg_x8_out
+
+$L$128_enc_msg_x8_loop2:
+
+
+	vmovdqa	xmm1,xmm0
+	vpaddd	xmm0,xmm0,XMMWORD[one]
+
+	vpxor	xmm1,xmm1,XMMWORD[rcx]
+	vaesenc	xmm1,xmm1,XMMWORD[16+rcx]
+	vaesenc	xmm1,xmm1,XMMWORD[32+rcx]
+	vaesenc	xmm1,xmm1,XMMWORD[48+rcx]
+	vaesenc	xmm1,xmm1,XMMWORD[64+rcx]
+	vaesenc	xmm1,xmm1,XMMWORD[80+rcx]
+	vaesenc	xmm1,xmm1,XMMWORD[96+rcx]
+	vaesenc	xmm1,xmm1,XMMWORD[112+rcx]
+	vaesenc	xmm1,xmm1,XMMWORD[128+rcx]
+	vaesenc	xmm1,xmm1,XMMWORD[144+rcx]
+	vaesenclast	xmm1,xmm1,XMMWORD[160+rcx]
+
+
+	vpxor	xmm1,xmm1,XMMWORD[rdi]
+
+	vmovdqu	XMMWORD[rsi],xmm1
+
+	add	rdi,16
+	add	rsi,16
+
+	dec	r10
+	jne	NEAR $L$128_enc_msg_x8_loop2
+
+$L$128_enc_msg_x8_out:
+	mov	rsp,rbp
+
+	pop	rbp
+
+	pop	r13
+
+	pop	r12
+
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_aes128gcmsiv_enc_msg_x8:
+global	aes128gcmsiv_dec
+
+ALIGN	16
+aes128gcmsiv_dec:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_aes128gcmsiv_dec:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+_CET_ENDBR
+	test	r9,~15
+	jnz	NEAR $L$128_dec_start
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$128_dec_start:
+	vzeroupper
+	vmovdqa	xmm0,XMMWORD[rdx]
+
+
+	vmovdqu	xmm15,XMMWORD[16+rdx]
+	vpor	xmm15,xmm15,XMMWORD[OR_MASK]
+	mov	rax,rdx
+
+	lea	rax,[32+rax]
+	lea	rcx,[32+rcx]
+
+	and	r9,~15
+
+
+	cmp	r9,96
+	jb	NEAR $L$128_dec_loop2
+
+
+	sub	r9,96
+	vmovdqa	xmm7,xmm15
+	vpaddd	xmm8,xmm7,XMMWORD[one]
+	vpaddd	xmm9,xmm7,XMMWORD[two]
+	vpaddd	xmm10,xmm9,XMMWORD[one]
+	vpaddd	xmm11,xmm9,XMMWORD[two]
+	vpaddd	xmm12,xmm11,XMMWORD[one]
+	vpaddd	xmm15,xmm11,XMMWORD[two]
+
+	vpxor	xmm7,xmm7,XMMWORD[r8]
+	vpxor	xmm8,xmm8,XMMWORD[r8]
+	vpxor	xmm9,xmm9,XMMWORD[r8]
+	vpxor	xmm10,xmm10,XMMWORD[r8]
+	vpxor	xmm11,xmm11,XMMWORD[r8]
+	vpxor	xmm12,xmm12,XMMWORD[r8]
+
+	vmovdqu	xmm4,XMMWORD[16+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[32+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[48+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[64+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[80+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[96+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[112+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[128+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[144+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[160+r8]
+	vaesenclast	xmm7,xmm7,xmm4
+	vaesenclast	xmm8,xmm8,xmm4
+	vaesenclast	xmm9,xmm9,xmm4
+	vaesenclast	xmm10,xmm10,xmm4
+	vaesenclast	xmm11,xmm11,xmm4
+	vaesenclast	xmm12,xmm12,xmm4
+
+
+	vpxor	xmm7,xmm7,XMMWORD[rdi]
+	vpxor	xmm8,xmm8,XMMWORD[16+rdi]
+	vpxor	xmm9,xmm9,XMMWORD[32+rdi]
+	vpxor	xmm10,xmm10,XMMWORD[48+rdi]
+	vpxor	xmm11,xmm11,XMMWORD[64+rdi]
+	vpxor	xmm12,xmm12,XMMWORD[80+rdi]
+
+	vmovdqu	XMMWORD[rsi],xmm7
+	vmovdqu	XMMWORD[16+rsi],xmm8
+	vmovdqu	XMMWORD[32+rsi],xmm9
+	vmovdqu	XMMWORD[48+rsi],xmm10
+	vmovdqu	XMMWORD[64+rsi],xmm11
+	vmovdqu	XMMWORD[80+rsi],xmm12
+
+	add	rdi,96
+	add	rsi,96
+	jmp	NEAR $L$128_dec_loop1
+
+
+ALIGN	64
+$L$128_dec_loop1:
+	cmp	r9,96
+	jb	NEAR $L$128_dec_finish_96
+	sub	r9,96
+
+	vmovdqa	xmm6,xmm12
+	vmovdqa	XMMWORD[(16-32)+rax],xmm11
+	vmovdqa	XMMWORD[(32-32)+rax],xmm10
+	vmovdqa	XMMWORD[(48-32)+rax],xmm9
+	vmovdqa	XMMWORD[(64-32)+rax],xmm8
+	vmovdqa	XMMWORD[(80-32)+rax],xmm7
+
+	vmovdqa	xmm7,xmm15
+	vpaddd	xmm8,xmm7,XMMWORD[one]
+	vpaddd	xmm9,xmm7,XMMWORD[two]
+	vpaddd	xmm10,xmm9,XMMWORD[one]
+	vpaddd	xmm11,xmm9,XMMWORD[two]
+	vpaddd	xmm12,xmm11,XMMWORD[one]
+	vpaddd	xmm15,xmm11,XMMWORD[two]
+
+	vmovdqa	xmm4,XMMWORD[r8]
+	vpxor	xmm7,xmm7,xmm4
+	vpxor	xmm8,xmm8,xmm4
+	vpxor	xmm9,xmm9,xmm4
+	vpxor	xmm10,xmm10,xmm4
+	vpxor	xmm11,xmm11,xmm4
+	vpxor	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[((0-32))+rcx]
+	vpclmulqdq	xmm2,xmm6,xmm4,0x11
+	vpclmulqdq	xmm3,xmm6,xmm4,0x00
+	vpclmulqdq	xmm1,xmm6,xmm4,0x01
+	vpclmulqdq	xmm4,xmm6,xmm4,0x10
+	vpxor	xmm1,xmm1,xmm4
+
+	vmovdqu	xmm4,XMMWORD[16+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm6,XMMWORD[((-16))+rax]
+	vmovdqu	xmm13,XMMWORD[((-16))+rcx]
+
+	vpclmulqdq	xmm4,xmm6,xmm13,0x10
+	vpxor	xmm1,xmm1,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x11
+	vpxor	xmm2,xmm2,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x00
+	vpxor	xmm3,xmm3,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x01
+	vpxor	xmm1,xmm1,xmm4
+
+
+	vmovdqu	xmm4,XMMWORD[32+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm6,XMMWORD[rax]
+	vmovdqu	xmm13,XMMWORD[rcx]
+
+	vpclmulqdq	xmm4,xmm6,xmm13,0x10
+	vpxor	xmm1,xmm1,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x11
+	vpxor	xmm2,xmm2,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x00
+	vpxor	xmm3,xmm3,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x01
+	vpxor	xmm1,xmm1,xmm4
+
+
+	vmovdqu	xmm4,XMMWORD[48+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm6,XMMWORD[16+rax]
+	vmovdqu	xmm13,XMMWORD[16+rcx]
+
+	vpclmulqdq	xmm4,xmm6,xmm13,0x10
+	vpxor	xmm1,xmm1,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x11
+	vpxor	xmm2,xmm2,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x00
+	vpxor	xmm3,xmm3,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x01
+	vpxor	xmm1,xmm1,xmm4
+
+
+	vmovdqu	xmm4,XMMWORD[64+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm6,XMMWORD[32+rax]
+	vmovdqu	xmm13,XMMWORD[32+rcx]
+
+	vpclmulqdq	xmm4,xmm6,xmm13,0x10
+	vpxor	xmm1,xmm1,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x11
+	vpxor	xmm2,xmm2,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x00
+	vpxor	xmm3,xmm3,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x01
+	vpxor	xmm1,xmm1,xmm4
+
+
+	vmovdqu	xmm4,XMMWORD[80+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[96+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[112+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+
+	vmovdqa	xmm6,XMMWORD[((80-32))+rax]
+	vpxor	xmm6,xmm6,xmm0
+	vmovdqu	xmm5,XMMWORD[((80-32))+rcx]
+
+	vpclmulqdq	xmm4,xmm6,xmm5,0x01
+	vpxor	xmm1,xmm1,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm5,0x11
+	vpxor	xmm2,xmm2,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm5,0x00
+	vpxor	xmm3,xmm3,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm5,0x10
+	vpxor	xmm1,xmm1,xmm4
+
+	vmovdqu	xmm4,XMMWORD[128+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+
+	vpsrldq	xmm4,xmm1,8
+	vpxor	xmm5,xmm2,xmm4
+	vpslldq	xmm4,xmm1,8
+	vpxor	xmm0,xmm3,xmm4
+
+	vmovdqa	xmm3,XMMWORD[poly]
+
+	vmovdqu	xmm4,XMMWORD[144+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm6,XMMWORD[160+r8]
+	vpalignr	xmm2,xmm0,xmm0,8
+	vpclmulqdq	xmm0,xmm0,xmm3,0x10
+	vpxor	xmm0,xmm2,xmm0
+
+	vpxor	xmm4,xmm6,XMMWORD[rdi]
+	vaesenclast	xmm7,xmm7,xmm4
+	vpxor	xmm4,xmm6,XMMWORD[16+rdi]
+	vaesenclast	xmm8,xmm8,xmm4
+	vpxor	xmm4,xmm6,XMMWORD[32+rdi]
+	vaesenclast	xmm9,xmm9,xmm4
+	vpxor	xmm4,xmm6,XMMWORD[48+rdi]
+	vaesenclast	xmm10,xmm10,xmm4
+	vpxor	xmm4,xmm6,XMMWORD[64+rdi]
+	vaesenclast	xmm11,xmm11,xmm4
+	vpxor	xmm4,xmm6,XMMWORD[80+rdi]
+	vaesenclast	xmm12,xmm12,xmm4
+
+	vpalignr	xmm2,xmm0,xmm0,8
+	vpclmulqdq	xmm0,xmm0,xmm3,0x10
+	vpxor	xmm0,xmm2,xmm0
+
+	vmovdqu	XMMWORD[rsi],xmm7
+	vmovdqu	XMMWORD[16+rsi],xmm8
+	vmovdqu	XMMWORD[32+rsi],xmm9
+	vmovdqu	XMMWORD[48+rsi],xmm10
+	vmovdqu	XMMWORD[64+rsi],xmm11
+	vmovdqu	XMMWORD[80+rsi],xmm12
+
+	vpxor	xmm0,xmm0,xmm5
+
+	lea	rdi,[96+rdi]
+	lea	rsi,[96+rsi]
+	jmp	NEAR $L$128_dec_loop1
+
+$L$128_dec_finish_96:
+	vmovdqa	xmm6,xmm12
+	vmovdqa	XMMWORD[(16-32)+rax],xmm11
+	vmovdqa	XMMWORD[(32-32)+rax],xmm10
+	vmovdqa	XMMWORD[(48-32)+rax],xmm9
+	vmovdqa	XMMWORD[(64-32)+rax],xmm8
+	vmovdqa	XMMWORD[(80-32)+rax],xmm7
+
+	vmovdqu	xmm4,XMMWORD[((0-32))+rcx]
+	vpclmulqdq	xmm1,xmm6,xmm4,0x10
+	vpclmulqdq	xmm2,xmm6,xmm4,0x11
+	vpclmulqdq	xmm3,xmm6,xmm4,0x00
+	vpclmulqdq	xmm4,xmm6,xmm4,0x01
+	vpxor	xmm1,xmm1,xmm4
+
+	vmovdqu	xmm6,XMMWORD[((-16))+rax]
+	vmovdqu	xmm13,XMMWORD[((-16))+rcx]
+
+	vpclmulqdq	xmm4,xmm6,xmm13,0x10
+	vpxor	xmm1,xmm1,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x11
+	vpxor	xmm2,xmm2,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x00
+	vpxor	xmm3,xmm3,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x01
+	vpxor	xmm1,xmm1,xmm4
+
+	vmovdqu	xmm6,XMMWORD[rax]
+	vmovdqu	xmm13,XMMWORD[rcx]
+
+	vpclmulqdq	xmm4,xmm6,xmm13,0x10
+	vpxor	xmm1,xmm1,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x11
+	vpxor	xmm2,xmm2,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x00
+	vpxor	xmm3,xmm3,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x01
+	vpxor	xmm1,xmm1,xmm4
+
+	vmovdqu	xmm6,XMMWORD[16+rax]
+	vmovdqu	xmm13,XMMWORD[16+rcx]
+
+	vpclmulqdq	xmm4,xmm6,xmm13,0x10
+	vpxor	xmm1,xmm1,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x11
+	vpxor	xmm2,xmm2,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x00
+	vpxor	xmm3,xmm3,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x01
+	vpxor	xmm1,xmm1,xmm4
+
+	vmovdqu	xmm6,XMMWORD[32+rax]
+	vmovdqu	xmm13,XMMWORD[32+rcx]
+
+	vpclmulqdq	xmm4,xmm6,xmm13,0x10
+	vpxor	xmm1,xmm1,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x11
+	vpxor	xmm2,xmm2,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x00
+	vpxor	xmm3,xmm3,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x01
+	vpxor	xmm1,xmm1,xmm4
+
+
+	vmovdqu	xmm6,XMMWORD[((80-32))+rax]
+	vpxor	xmm6,xmm6,xmm0
+	vmovdqu	xmm5,XMMWORD[((80-32))+rcx]
+	vpclmulqdq	xmm4,xmm6,xmm5,0x11
+	vpxor	xmm2,xmm2,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm5,0x00
+	vpxor	xmm3,xmm3,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm5,0x10
+	vpxor	xmm1,xmm1,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm5,0x01
+	vpxor	xmm1,xmm1,xmm4
+
+	vpsrldq	xmm4,xmm1,8
+	vpxor	xmm5,xmm2,xmm4
+	vpslldq	xmm4,xmm1,8
+	vpxor	xmm0,xmm3,xmm4
+
+	vmovdqa	xmm3,XMMWORD[poly]
+
+	vpalignr	xmm2,xmm0,xmm0,8
+	vpclmulqdq	xmm0,xmm0,xmm3,0x10
+	vpxor	xmm0,xmm2,xmm0
+
+	vpalignr	xmm2,xmm0,xmm0,8
+	vpclmulqdq	xmm0,xmm0,xmm3,0x10
+	vpxor	xmm0,xmm2,xmm0
+
+	vpxor	xmm0,xmm0,xmm5
+
+$L$128_dec_loop2:
+
+
+
+	cmp	r9,16
+	jb	NEAR $L$128_dec_out
+	sub	r9,16
+
+	vmovdqa	xmm2,xmm15
+	vpaddd	xmm15,xmm15,XMMWORD[one]
+
+	vpxor	xmm2,xmm2,XMMWORD[r8]
+	vaesenc	xmm2,xmm2,XMMWORD[16+r8]
+	vaesenc	xmm2,xmm2,XMMWORD[32+r8]
+	vaesenc	xmm2,xmm2,XMMWORD[48+r8]
+	vaesenc	xmm2,xmm2,XMMWORD[64+r8]
+	vaesenc	xmm2,xmm2,XMMWORD[80+r8]
+	vaesenc	xmm2,xmm2,XMMWORD[96+r8]
+	vaesenc	xmm2,xmm2,XMMWORD[112+r8]
+	vaesenc	xmm2,xmm2,XMMWORD[128+r8]
+	vaesenc	xmm2,xmm2,XMMWORD[144+r8]
+	vaesenclast	xmm2,xmm2,XMMWORD[160+r8]
+	vpxor	xmm2,xmm2,XMMWORD[rdi]
+	vmovdqu	XMMWORD[rsi],xmm2
+	add	rdi,16
+	add	rsi,16
+
+	vpxor	xmm0,xmm0,xmm2
+	vmovdqa	xmm1,XMMWORD[((-32))+rcx]
+	call	GFMUL
+
+	jmp	NEAR $L$128_dec_loop2
+
+$L$128_dec_out:
+	vmovdqu	XMMWORD[rdx],xmm0
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_aes128gcmsiv_dec:
+global	aes128gcmsiv_ecb_enc_block
+
+ALIGN	16
+aes128gcmsiv_ecb_enc_block:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_aes128gcmsiv_ecb_enc_block:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+_CET_ENDBR
+	vmovdqa	xmm1,XMMWORD[rdi]
+
+	vpxor	xmm1,xmm1,XMMWORD[rdx]
+	vaesenc	xmm1,xmm1,XMMWORD[16+rdx]
+	vaesenc	xmm1,xmm1,XMMWORD[32+rdx]
+	vaesenc	xmm1,xmm1,XMMWORD[48+rdx]
+	vaesenc	xmm1,xmm1,XMMWORD[64+rdx]
+	vaesenc	xmm1,xmm1,XMMWORD[80+rdx]
+	vaesenc	xmm1,xmm1,XMMWORD[96+rdx]
+	vaesenc	xmm1,xmm1,XMMWORD[112+rdx]
+	vaesenc	xmm1,xmm1,XMMWORD[128+rdx]
+	vaesenc	xmm1,xmm1,XMMWORD[144+rdx]
+	vaesenclast	xmm1,xmm1,XMMWORD[160+rdx]
+
+	vmovdqa	XMMWORD[rsi],xmm1
+
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_aes128gcmsiv_ecb_enc_block:
+global	aes256gcmsiv_aes_ks_enc_x1
+
+ALIGN	16
+aes256gcmsiv_aes_ks_enc_x1:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_aes256gcmsiv_aes_ks_enc_x1:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+
+
+
+_CET_ENDBR
+	vmovdqa	xmm0,XMMWORD[con1]
+	vmovdqa	xmm15,XMMWORD[mask]
+	vmovdqa	xmm8,XMMWORD[rdi]
+	vmovdqa	xmm1,XMMWORD[rcx]
+	vmovdqa	xmm3,XMMWORD[16+rcx]
+	vpxor	xmm8,xmm8,xmm1
+	vaesenc	xmm8,xmm8,xmm3
+	vmovdqu	XMMWORD[rdx],xmm1
+	vmovdqu	XMMWORD[16+rdx],xmm3
+	vpxor	xmm14,xmm14,xmm14
+
+	vpshufb	xmm2,xmm3,xmm15
+	vaesenclast	xmm2,xmm2,xmm0
+	vpslld	xmm0,xmm0,1
+	vpslldq	xmm4,xmm1,4
+	vpxor	xmm1,xmm1,xmm4
+	vpslldq	xmm4,xmm4,4
+	vpxor	xmm1,xmm1,xmm4
+	vpslldq	xmm4,xmm4,4
+	vpxor	xmm1,xmm1,xmm4
+	vpxor	xmm1,xmm1,xmm2
+	vaesenc	xmm8,xmm8,xmm1
+	vmovdqu	XMMWORD[32+rdx],xmm1
+
+	vpshufd	xmm2,xmm1,0xff
+	vaesenclast	xmm2,xmm2,xmm14
+	vpslldq	xmm4,xmm3,4
+	vpxor	xmm3,xmm3,xmm4
+	vpslldq	xmm4,xmm4,4
+	vpxor	xmm3,xmm3,xmm4
+	vpslldq	xmm4,xmm4,4
+	vpxor	xmm3,xmm3,xmm4
+	vpxor	xmm3,xmm3,xmm2
+	vaesenc	xmm8,xmm8,xmm3
+	vmovdqu	XMMWORD[48+rdx],xmm3
+
+	vpshufb	xmm2,xmm3,xmm15
+	vaesenclast	xmm2,xmm2,xmm0
+	vpslld	xmm0,xmm0,1
+	vpslldq	xmm4,xmm1,4
+	vpxor	xmm1,xmm1,xmm4
+	vpslldq	xmm4,xmm4,4
+	vpxor	xmm1,xmm1,xmm4
+	vpslldq	xmm4,xmm4,4
+	vpxor	xmm1,xmm1,xmm4
+	vpxor	xmm1,xmm1,xmm2
+	vaesenc	xmm8,xmm8,xmm1
+	vmovdqu	XMMWORD[64+rdx],xmm1
+
+	vpshufd	xmm2,xmm1,0xff
+	vaesenclast	xmm2,xmm2,xmm14
+	vpslldq	xmm4,xmm3,4
+	vpxor	xmm3,xmm3,xmm4
+	vpslldq	xmm4,xmm4,4
+	vpxor	xmm3,xmm3,xmm4
+	vpslldq	xmm4,xmm4,4
+	vpxor	xmm3,xmm3,xmm4
+	vpxor	xmm3,xmm3,xmm2
+	vaesenc	xmm8,xmm8,xmm3
+	vmovdqu	XMMWORD[80+rdx],xmm3
+
+	vpshufb	xmm2,xmm3,xmm15
+	vaesenclast	xmm2,xmm2,xmm0
+	vpslld	xmm0,xmm0,1
+	vpslldq	xmm4,xmm1,4
+	vpxor	xmm1,xmm1,xmm4
+	vpslldq	xmm4,xmm4,4
+	vpxor	xmm1,xmm1,xmm4
+	vpslldq	xmm4,xmm4,4
+	vpxor	xmm1,xmm1,xmm4
+	vpxor	xmm1,xmm1,xmm2
+	vaesenc	xmm8,xmm8,xmm1
+	vmovdqu	XMMWORD[96+rdx],xmm1
+
+	vpshufd	xmm2,xmm1,0xff
+	vaesenclast	xmm2,xmm2,xmm14
+	vpslldq	xmm4,xmm3,4
+	vpxor	xmm3,xmm3,xmm4
+	vpslldq	xmm4,xmm4,4
+	vpxor	xmm3,xmm3,xmm4
+	vpslldq	xmm4,xmm4,4
+	vpxor	xmm3,xmm3,xmm4
+	vpxor	xmm3,xmm3,xmm2
+	vaesenc	xmm8,xmm8,xmm3
+	vmovdqu	XMMWORD[112+rdx],xmm3
+
+	vpshufb	xmm2,xmm3,xmm15
+	vaesenclast	xmm2,xmm2,xmm0
+	vpslld	xmm0,xmm0,1
+	vpslldq	xmm4,xmm1,4
+	vpxor	xmm1,xmm1,xmm4
+	vpslldq	xmm4,xmm4,4
+	vpxor	xmm1,xmm1,xmm4
+	vpslldq	xmm4,xmm4,4
+	vpxor	xmm1,xmm1,xmm4
+	vpxor	xmm1,xmm1,xmm2
+	vaesenc	xmm8,xmm8,xmm1
+	vmovdqu	XMMWORD[128+rdx],xmm1
+
+	vpshufd	xmm2,xmm1,0xff
+	vaesenclast	xmm2,xmm2,xmm14
+	vpslldq	xmm4,xmm3,4
+	vpxor	xmm3,xmm3,xmm4
+	vpslldq	xmm4,xmm4,4
+	vpxor	xmm3,xmm3,xmm4
+	vpslldq	xmm4,xmm4,4
+	vpxor	xmm3,xmm3,xmm4
+	vpxor	xmm3,xmm3,xmm2
+	vaesenc	xmm8,xmm8,xmm3
+	vmovdqu	XMMWORD[144+rdx],xmm3
+
+	vpshufb	xmm2,xmm3,xmm15
+	vaesenclast	xmm2,xmm2,xmm0
+	vpslld	xmm0,xmm0,1
+	vpslldq	xmm4,xmm1,4
+	vpxor	xmm1,xmm1,xmm4
+	vpslldq	xmm4,xmm4,4
+	vpxor	xmm1,xmm1,xmm4
+	vpslldq	xmm4,xmm4,4
+	vpxor	xmm1,xmm1,xmm4
+	vpxor	xmm1,xmm1,xmm2
+	vaesenc	xmm8,xmm8,xmm1
+	vmovdqu	XMMWORD[160+rdx],xmm1
+
+	vpshufd	xmm2,xmm1,0xff
+	vaesenclast	xmm2,xmm2,xmm14
+	vpslldq	xmm4,xmm3,4
+	vpxor	xmm3,xmm3,xmm4
+	vpslldq	xmm4,xmm4,4
+	vpxor	xmm3,xmm3,xmm4
+	vpslldq	xmm4,xmm4,4
+	vpxor	xmm3,xmm3,xmm4
+	vpxor	xmm3,xmm3,xmm2
+	vaesenc	xmm8,xmm8,xmm3
+	vmovdqu	XMMWORD[176+rdx],xmm3
+
+	vpshufb	xmm2,xmm3,xmm15
+	vaesenclast	xmm2,xmm2,xmm0
+	vpslld	xmm0,xmm0,1
+	vpslldq	xmm4,xmm1,4
+	vpxor	xmm1,xmm1,xmm4
+	vpslldq	xmm4,xmm4,4
+	vpxor	xmm1,xmm1,xmm4
+	vpslldq	xmm4,xmm4,4
+	vpxor	xmm1,xmm1,xmm4
+	vpxor	xmm1,xmm1,xmm2
+	vaesenc	xmm8,xmm8,xmm1
+	vmovdqu	XMMWORD[192+rdx],xmm1
+
+	vpshufd	xmm2,xmm1,0xff
+	vaesenclast	xmm2,xmm2,xmm14
+	vpslldq	xmm4,xmm3,4
+	vpxor	xmm3,xmm3,xmm4
+	vpslldq	xmm4,xmm4,4
+	vpxor	xmm3,xmm3,xmm4
+	vpslldq	xmm4,xmm4,4
+	vpxor	xmm3,xmm3,xmm4
+	vpxor	xmm3,xmm3,xmm2
+	vaesenc	xmm8,xmm8,xmm3
+	vmovdqu	XMMWORD[208+rdx],xmm3
+
+	vpshufb	xmm2,xmm3,xmm15
+	vaesenclast	xmm2,xmm2,xmm0
+	vpslldq	xmm4,xmm1,4
+	vpxor	xmm1,xmm1,xmm4
+	vpslldq	xmm4,xmm4,4
+	vpxor	xmm1,xmm1,xmm4
+	vpslldq	xmm4,xmm4,4
+	vpxor	xmm1,xmm1,xmm4
+	vpxor	xmm1,xmm1,xmm2
+	vaesenclast	xmm8,xmm8,xmm1
+	vmovdqu	XMMWORD[224+rdx],xmm1
+
+	vmovdqa	XMMWORD[rsi],xmm8
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_aes256gcmsiv_aes_ks_enc_x1:
+global	aes256gcmsiv_ecb_enc_block
+
+ALIGN	16
+aes256gcmsiv_ecb_enc_block:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_aes256gcmsiv_ecb_enc_block:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+_CET_ENDBR
+	vmovdqa	xmm1,XMMWORD[rdi]
+	vpxor	xmm1,xmm1,XMMWORD[rdx]
+	vaesenc	xmm1,xmm1,XMMWORD[16+rdx]
+	vaesenc	xmm1,xmm1,XMMWORD[32+rdx]
+	vaesenc	xmm1,xmm1,XMMWORD[48+rdx]
+	vaesenc	xmm1,xmm1,XMMWORD[64+rdx]
+	vaesenc	xmm1,xmm1,XMMWORD[80+rdx]
+	vaesenc	xmm1,xmm1,XMMWORD[96+rdx]
+	vaesenc	xmm1,xmm1,XMMWORD[112+rdx]
+	vaesenc	xmm1,xmm1,XMMWORD[128+rdx]
+	vaesenc	xmm1,xmm1,XMMWORD[144+rdx]
+	vaesenc	xmm1,xmm1,XMMWORD[160+rdx]
+	vaesenc	xmm1,xmm1,XMMWORD[176+rdx]
+	vaesenc	xmm1,xmm1,XMMWORD[192+rdx]
+	vaesenc	xmm1,xmm1,XMMWORD[208+rdx]
+	vaesenclast	xmm1,xmm1,XMMWORD[224+rdx]
+	vmovdqa	XMMWORD[rsi],xmm1
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_aes256gcmsiv_ecb_enc_block:
+global	aes256gcmsiv_enc_msg_x4
+
+ALIGN	16
+aes256gcmsiv_enc_msg_x4:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_aes256gcmsiv_enc_msg_x4:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+
+
+
+_CET_ENDBR
+	test	r8,r8
+	jnz	NEAR $L$256_enc_msg_x4_start
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$256_enc_msg_x4_start:
+	mov	r10,r8
+	shr	r8,4
+	shl	r10,60
+	jz	NEAR $L$256_enc_msg_x4_start2
+	add	r8,1
+
+$L$256_enc_msg_x4_start2:
+	mov	r10,r8
+	shl	r10,62
+	shr	r10,62
+
+
+	vmovdqa	xmm15,XMMWORD[rdx]
+	vpor	xmm15,xmm15,XMMWORD[OR_MASK]
+
+	vmovdqa	xmm4,XMMWORD[four]
+	vmovdqa	xmm0,xmm15
+	vpaddd	xmm1,xmm15,XMMWORD[one]
+	vpaddd	xmm2,xmm15,XMMWORD[two]
+	vpaddd	xmm3,xmm15,XMMWORD[three]
+
+	shr	r8,2
+	je	NEAR $L$256_enc_msg_x4_check_remainder
+
+	sub	rsi,64
+	sub	rdi,64
+
+$L$256_enc_msg_x4_loop1:
+	add	rsi,64
+	add	rdi,64
+
+	vmovdqa	xmm5,xmm0
+	vmovdqa	xmm6,xmm1
+	vmovdqa	xmm7,xmm2
+	vmovdqa	xmm8,xmm3
+
+	vpxor	xmm5,xmm5,XMMWORD[rcx]
+	vpxor	xmm6,xmm6,XMMWORD[rcx]
+	vpxor	xmm7,xmm7,XMMWORD[rcx]
+	vpxor	xmm8,xmm8,XMMWORD[rcx]
+
+	vmovdqu	xmm12,XMMWORD[16+rcx]
+	vaesenc	xmm5,xmm5,xmm12
+	vaesenc	xmm6,xmm6,xmm12
+	vaesenc	xmm7,xmm7,xmm12
+	vaesenc	xmm8,xmm8,xmm12
+
+	vpaddd	xmm0,xmm0,xmm4
+	vmovdqu	xmm12,XMMWORD[32+rcx]
+	vaesenc	xmm5,xmm5,xmm12
+	vaesenc	xmm6,xmm6,xmm12
+	vaesenc	xmm7,xmm7,xmm12
+	vaesenc	xmm8,xmm8,xmm12
+
+	vpaddd	xmm1,xmm1,xmm4
+	vmovdqu	xmm12,XMMWORD[48+rcx]
+	vaesenc	xmm5,xmm5,xmm12
+	vaesenc	xmm6,xmm6,xmm12
+	vaesenc	xmm7,xmm7,xmm12
+	vaesenc	xmm8,xmm8,xmm12
+
+	vpaddd	xmm2,xmm2,xmm4
+	vmovdqu	xmm12,XMMWORD[64+rcx]
+	vaesenc	xmm5,xmm5,xmm12
+	vaesenc	xmm6,xmm6,xmm12
+	vaesenc	xmm7,xmm7,xmm12
+	vaesenc	xmm8,xmm8,xmm12
+
+	vpaddd	xmm3,xmm3,xmm4
+
+	vmovdqu	xmm12,XMMWORD[80+rcx]
+	vaesenc	xmm5,xmm5,xmm12
+	vaesenc	xmm6,xmm6,xmm12
+	vaesenc	xmm7,xmm7,xmm12
+	vaesenc	xmm8,xmm8,xmm12
+
+	vmovdqu	xmm12,XMMWORD[96+rcx]
+	vaesenc	xmm5,xmm5,xmm12
+	vaesenc	xmm6,xmm6,xmm12
+	vaesenc	xmm7,xmm7,xmm12
+	vaesenc	xmm8,xmm8,xmm12
+
+	vmovdqu	xmm12,XMMWORD[112+rcx]
+	vaesenc	xmm5,xmm5,xmm12
+	vaesenc	xmm6,xmm6,xmm12
+	vaesenc	xmm7,xmm7,xmm12
+	vaesenc	xmm8,xmm8,xmm12
+
+	vmovdqu	xmm12,XMMWORD[128+rcx]
+	vaesenc	xmm5,xmm5,xmm12
+	vaesenc	xmm6,xmm6,xmm12
+	vaesenc	xmm7,xmm7,xmm12
+	vaesenc	xmm8,xmm8,xmm12
+
+	vmovdqu	xmm12,XMMWORD[144+rcx]
+	vaesenc	xmm5,xmm5,xmm12
+	vaesenc	xmm6,xmm6,xmm12
+	vaesenc	xmm7,xmm7,xmm12
+	vaesenc	xmm8,xmm8,xmm12
+
+	vmovdqu	xmm12,XMMWORD[160+rcx]
+	vaesenc	xmm5,xmm5,xmm12
+	vaesenc	xmm6,xmm6,xmm12
+	vaesenc	xmm7,xmm7,xmm12
+	vaesenc	xmm8,xmm8,xmm12
+
+	vmovdqu	xmm12,XMMWORD[176+rcx]
+	vaesenc	xmm5,xmm5,xmm12
+	vaesenc	xmm6,xmm6,xmm12
+	vaesenc	xmm7,xmm7,xmm12
+	vaesenc	xmm8,xmm8,xmm12
+
+	vmovdqu	xmm12,XMMWORD[192+rcx]
+	vaesenc	xmm5,xmm5,xmm12
+	vaesenc	xmm6,xmm6,xmm12
+	vaesenc	xmm7,xmm7,xmm12
+	vaesenc	xmm8,xmm8,xmm12
+
+	vmovdqu	xmm12,XMMWORD[208+rcx]
+	vaesenc	xmm5,xmm5,xmm12
+	vaesenc	xmm6,xmm6,xmm12
+	vaesenc	xmm7,xmm7,xmm12
+	vaesenc	xmm8,xmm8,xmm12
+
+	vmovdqu	xmm12,XMMWORD[224+rcx]
+	vaesenclast	xmm5,xmm5,xmm12
+	vaesenclast	xmm6,xmm6,xmm12
+	vaesenclast	xmm7,xmm7,xmm12
+	vaesenclast	xmm8,xmm8,xmm12
+
+
+
+	vpxor	xmm5,xmm5,XMMWORD[rdi]
+	vpxor	xmm6,xmm6,XMMWORD[16+rdi]
+	vpxor	xmm7,xmm7,XMMWORD[32+rdi]
+	vpxor	xmm8,xmm8,XMMWORD[48+rdi]
+
+	sub	r8,1
+
+	vmovdqu	XMMWORD[rsi],xmm5
+	vmovdqu	XMMWORD[16+rsi],xmm6
+	vmovdqu	XMMWORD[32+rsi],xmm7
+	vmovdqu	XMMWORD[48+rsi],xmm8
+
+	jne	NEAR $L$256_enc_msg_x4_loop1
+
+	add	rsi,64
+	add	rdi,64
+
+$L$256_enc_msg_x4_check_remainder:
+	cmp	r10,0
+	je	NEAR $L$256_enc_msg_x4_out
+
+$L$256_enc_msg_x4_loop2:
+
+
+
+	vmovdqa	xmm5,xmm0
+	vpaddd	xmm0,xmm0,XMMWORD[one]
+	vpxor	xmm5,xmm5,XMMWORD[rcx]
+	vaesenc	xmm5,xmm5,XMMWORD[16+rcx]
+	vaesenc	xmm5,xmm5,XMMWORD[32+rcx]
+	vaesenc	xmm5,xmm5,XMMWORD[48+rcx]
+	vaesenc	xmm5,xmm5,XMMWORD[64+rcx]
+	vaesenc	xmm5,xmm5,XMMWORD[80+rcx]
+	vaesenc	xmm5,xmm5,XMMWORD[96+rcx]
+	vaesenc	xmm5,xmm5,XMMWORD[112+rcx]
+	vaesenc	xmm5,xmm5,XMMWORD[128+rcx]
+	vaesenc	xmm5,xmm5,XMMWORD[144+rcx]
+	vaesenc	xmm5,xmm5,XMMWORD[160+rcx]
+	vaesenc	xmm5,xmm5,XMMWORD[176+rcx]
+	vaesenc	xmm5,xmm5,XMMWORD[192+rcx]
+	vaesenc	xmm5,xmm5,XMMWORD[208+rcx]
+	vaesenclast	xmm5,xmm5,XMMWORD[224+rcx]
+
+
+	vpxor	xmm5,xmm5,XMMWORD[rdi]
+
+	vmovdqu	XMMWORD[rsi],xmm5
+
+	add	rdi,16
+	add	rsi,16
+
+	sub	r10,1
+	jne	NEAR $L$256_enc_msg_x4_loop2
+
+$L$256_enc_msg_x4_out:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_aes256gcmsiv_enc_msg_x4:
+global	aes256gcmsiv_enc_msg_x8
+
+ALIGN	16
+aes256gcmsiv_enc_msg_x8:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_aes256gcmsiv_enc_msg_x8:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+
+
+
+_CET_ENDBR
+	test	r8,r8
+	jnz	NEAR $L$256_enc_msg_x8_start
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$256_enc_msg_x8_start:
+
+	mov	r11,rsp
+	sub	r11,16
+	and	r11,-64
+
+	mov	r10,r8
+	shr	r8,4
+	shl	r10,60
+	jz	NEAR $L$256_enc_msg_x8_start2
+	add	r8,1
+
+$L$256_enc_msg_x8_start2:
+	mov	r10,r8
+	shl	r10,61
+	shr	r10,61
+
+
+	vmovdqa	xmm1,XMMWORD[rdx]
+	vpor	xmm1,xmm1,XMMWORD[OR_MASK]
+
+
+	vpaddd	xmm0,xmm1,XMMWORD[seven]
+	vmovdqa	XMMWORD[r11],xmm0
+	vpaddd	xmm9,xmm1,XMMWORD[one]
+	vpaddd	xmm10,xmm1,XMMWORD[two]
+	vpaddd	xmm11,xmm1,XMMWORD[three]
+	vpaddd	xmm12,xmm1,XMMWORD[four]
+	vpaddd	xmm13,xmm1,XMMWORD[five]
+	vpaddd	xmm14,xmm1,XMMWORD[six]
+	vmovdqa	xmm0,xmm1
+
+	shr	r8,3
+	jz	NEAR $L$256_enc_msg_x8_check_remainder
+
+	sub	rsi,128
+	sub	rdi,128
+
+$L$256_enc_msg_x8_loop1:
+	add	rsi,128
+	add	rdi,128
+
+	vmovdqa	xmm1,xmm0
+	vmovdqa	xmm2,xmm9
+	vmovdqa	xmm3,xmm10
+	vmovdqa	xmm4,xmm11
+	vmovdqa	xmm5,xmm12
+	vmovdqa	xmm6,xmm13
+	vmovdqa	xmm7,xmm14
+
+	vmovdqa	xmm8,XMMWORD[r11]
+
+	vpxor	xmm1,xmm1,XMMWORD[rcx]
+	vpxor	xmm2,xmm2,XMMWORD[rcx]
+	vpxor	xmm3,xmm3,XMMWORD[rcx]
+	vpxor	xmm4,xmm4,XMMWORD[rcx]
+	vpxor	xmm5,xmm5,XMMWORD[rcx]
+	vpxor	xmm6,xmm6,XMMWORD[rcx]
+	vpxor	xmm7,xmm7,XMMWORD[rcx]
+	vpxor	xmm8,xmm8,XMMWORD[rcx]
+
+	vmovdqu	xmm15,XMMWORD[16+rcx]
+	vaesenc	xmm1,xmm1,xmm15
+	vaesenc	xmm2,xmm2,xmm15
+	vaesenc	xmm3,xmm3,xmm15
+	vaesenc	xmm4,xmm4,xmm15
+	vaesenc	xmm5,xmm5,xmm15
+	vaesenc	xmm6,xmm6,xmm15
+	vaesenc	xmm7,xmm7,xmm15
+	vaesenc	xmm8,xmm8,xmm15
+
+	vmovdqa	xmm14,XMMWORD[r11]
+	vpaddd	xmm14,xmm14,XMMWORD[eight]
+	vmovdqa	XMMWORD[r11],xmm14
+	vmovdqu	xmm15,XMMWORD[32+rcx]
+	vaesenc	xmm1,xmm1,xmm15
+	vaesenc	xmm2,xmm2,xmm15
+	vaesenc	xmm3,xmm3,xmm15
+	vaesenc	xmm4,xmm4,xmm15
+	vaesenc	xmm5,xmm5,xmm15
+	vaesenc	xmm6,xmm6,xmm15
+	vaesenc	xmm7,xmm7,xmm15
+	vaesenc	xmm8,xmm8,xmm15
+
+	vpsubd	xmm14,xmm14,XMMWORD[one]
+	vmovdqu	xmm15,XMMWORD[48+rcx]
+	vaesenc	xmm1,xmm1,xmm15
+	vaesenc	xmm2,xmm2,xmm15
+	vaesenc	xmm3,xmm3,xmm15
+	vaesenc	xmm4,xmm4,xmm15
+	vaesenc	xmm5,xmm5,xmm15
+	vaesenc	xmm6,xmm6,xmm15
+	vaesenc	xmm7,xmm7,xmm15
+	vaesenc	xmm8,xmm8,xmm15
+
+	vpaddd	xmm0,xmm0,XMMWORD[eight]
+	vmovdqu	xmm15,XMMWORD[64+rcx]
+	vaesenc	xmm1,xmm1,xmm15
+	vaesenc	xmm2,xmm2,xmm15
+	vaesenc	xmm3,xmm3,xmm15
+	vaesenc	xmm4,xmm4,xmm15
+	vaesenc	xmm5,xmm5,xmm15
+	vaesenc	xmm6,xmm6,xmm15
+	vaesenc	xmm7,xmm7,xmm15
+	vaesenc	xmm8,xmm8,xmm15
+
+	vpaddd	xmm9,xmm9,XMMWORD[eight]
+	vmovdqu	xmm15,XMMWORD[80+rcx]
+	vaesenc	xmm1,xmm1,xmm15
+	vaesenc	xmm2,xmm2,xmm15
+	vaesenc	xmm3,xmm3,xmm15
+	vaesenc	xmm4,xmm4,xmm15
+	vaesenc	xmm5,xmm5,xmm15
+	vaesenc	xmm6,xmm6,xmm15
+	vaesenc	xmm7,xmm7,xmm15
+	vaesenc	xmm8,xmm8,xmm15
+
+	vpaddd	xmm10,xmm10,XMMWORD[eight]
+	vmovdqu	xmm15,XMMWORD[96+rcx]
+	vaesenc	xmm1,xmm1,xmm15
+	vaesenc	xmm2,xmm2,xmm15
+	vaesenc	xmm3,xmm3,xmm15
+	vaesenc	xmm4,xmm4,xmm15
+	vaesenc	xmm5,xmm5,xmm15
+	vaesenc	xmm6,xmm6,xmm15
+	vaesenc	xmm7,xmm7,xmm15
+	vaesenc	xmm8,xmm8,xmm15
+
+	vpaddd	xmm11,xmm11,XMMWORD[eight]
+	vmovdqu	xmm15,XMMWORD[112+rcx]
+	vaesenc	xmm1,xmm1,xmm15
+	vaesenc	xmm2,xmm2,xmm15
+	vaesenc	xmm3,xmm3,xmm15
+	vaesenc	xmm4,xmm4,xmm15
+	vaesenc	xmm5,xmm5,xmm15
+	vaesenc	xmm6,xmm6,xmm15
+	vaesenc	xmm7,xmm7,xmm15
+	vaesenc	xmm8,xmm8,xmm15
+
+	vpaddd	xmm12,xmm12,XMMWORD[eight]
+	vmovdqu	xmm15,XMMWORD[128+rcx]
+	vaesenc	xmm1,xmm1,xmm15
+	vaesenc	xmm2,xmm2,xmm15
+	vaesenc	xmm3,xmm3,xmm15
+	vaesenc	xmm4,xmm4,xmm15
+	vaesenc	xmm5,xmm5,xmm15
+	vaesenc	xmm6,xmm6,xmm15
+	vaesenc	xmm7,xmm7,xmm15
+	vaesenc	xmm8,xmm8,xmm15
+
+	vpaddd	xmm13,xmm13,XMMWORD[eight]
+	vmovdqu	xmm15,XMMWORD[144+rcx]
+	vaesenc	xmm1,xmm1,xmm15
+	vaesenc	xmm2,xmm2,xmm15
+	vaesenc	xmm3,xmm3,xmm15
+	vaesenc	xmm4,xmm4,xmm15
+	vaesenc	xmm5,xmm5,xmm15
+	vaesenc	xmm6,xmm6,xmm15
+	vaesenc	xmm7,xmm7,xmm15
+	vaesenc	xmm8,xmm8,xmm15
+
+	vmovdqu	xmm15,XMMWORD[160+rcx]
+	vaesenc	xmm1,xmm1,xmm15
+	vaesenc	xmm2,xmm2,xmm15
+	vaesenc	xmm3,xmm3,xmm15
+	vaesenc	xmm4,xmm4,xmm15
+	vaesenc	xmm5,xmm5,xmm15
+	vaesenc	xmm6,xmm6,xmm15
+	vaesenc	xmm7,xmm7,xmm15
+	vaesenc	xmm8,xmm8,xmm15
+
+	vmovdqu	xmm15,XMMWORD[176+rcx]
+	vaesenc	xmm1,xmm1,xmm15
+	vaesenc	xmm2,xmm2,xmm15
+	vaesenc	xmm3,xmm3,xmm15
+	vaesenc	xmm4,xmm4,xmm15
+	vaesenc	xmm5,xmm5,xmm15
+	vaesenc	xmm6,xmm6,xmm15
+	vaesenc	xmm7,xmm7,xmm15
+	vaesenc	xmm8,xmm8,xmm15
+
+	vmovdqu	xmm15,XMMWORD[192+rcx]
+	vaesenc	xmm1,xmm1,xmm15
+	vaesenc	xmm2,xmm2,xmm15
+	vaesenc	xmm3,xmm3,xmm15
+	vaesenc	xmm4,xmm4,xmm15
+	vaesenc	xmm5,xmm5,xmm15
+	vaesenc	xmm6,xmm6,xmm15
+	vaesenc	xmm7,xmm7,xmm15
+	vaesenc	xmm8,xmm8,xmm15
+
+	vmovdqu	xmm15,XMMWORD[208+rcx]
+	vaesenc	xmm1,xmm1,xmm15
+	vaesenc	xmm2,xmm2,xmm15
+	vaesenc	xmm3,xmm3,xmm15
+	vaesenc	xmm4,xmm4,xmm15
+	vaesenc	xmm5,xmm5,xmm15
+	vaesenc	xmm6,xmm6,xmm15
+	vaesenc	xmm7,xmm7,xmm15
+	vaesenc	xmm8,xmm8,xmm15
+
+	vmovdqu	xmm15,XMMWORD[224+rcx]
+	vaesenclast	xmm1,xmm1,xmm15
+	vaesenclast	xmm2,xmm2,xmm15
+	vaesenclast	xmm3,xmm3,xmm15
+	vaesenclast	xmm4,xmm4,xmm15
+	vaesenclast	xmm5,xmm5,xmm15
+	vaesenclast	xmm6,xmm6,xmm15
+	vaesenclast	xmm7,xmm7,xmm15
+	vaesenclast	xmm8,xmm8,xmm15
+
+
+
+	vpxor	xmm1,xmm1,XMMWORD[rdi]
+	vpxor	xmm2,xmm2,XMMWORD[16+rdi]
+	vpxor	xmm3,xmm3,XMMWORD[32+rdi]
+	vpxor	xmm4,xmm4,XMMWORD[48+rdi]
+	vpxor	xmm5,xmm5,XMMWORD[64+rdi]
+	vpxor	xmm6,xmm6,XMMWORD[80+rdi]
+	vpxor	xmm7,xmm7,XMMWORD[96+rdi]
+	vpxor	xmm8,xmm8,XMMWORD[112+rdi]
+
+	sub	r8,1
+
+	vmovdqu	XMMWORD[rsi],xmm1
+	vmovdqu	XMMWORD[16+rsi],xmm2
+	vmovdqu	XMMWORD[32+rsi],xmm3
+	vmovdqu	XMMWORD[48+rsi],xmm4
+	vmovdqu	XMMWORD[64+rsi],xmm5
+	vmovdqu	XMMWORD[80+rsi],xmm6
+	vmovdqu	XMMWORD[96+rsi],xmm7
+	vmovdqu	XMMWORD[112+rsi],xmm8
+
+	jne	NEAR $L$256_enc_msg_x8_loop1
+
+	add	rsi,128
+	add	rdi,128
+
+$L$256_enc_msg_x8_check_remainder:
+	cmp	r10,0
+	je	NEAR $L$256_enc_msg_x8_out
+
+$L$256_enc_msg_x8_loop2:
+
+
+	vmovdqa	xmm1,xmm0
+	vpaddd	xmm0,xmm0,XMMWORD[one]
+
+	vpxor	xmm1,xmm1,XMMWORD[rcx]
+	vaesenc	xmm1,xmm1,XMMWORD[16+rcx]
+	vaesenc	xmm1,xmm1,XMMWORD[32+rcx]
+	vaesenc	xmm1,xmm1,XMMWORD[48+rcx]
+	vaesenc	xmm1,xmm1,XMMWORD[64+rcx]
+	vaesenc	xmm1,xmm1,XMMWORD[80+rcx]
+	vaesenc	xmm1,xmm1,XMMWORD[96+rcx]
+	vaesenc	xmm1,xmm1,XMMWORD[112+rcx]
+	vaesenc	xmm1,xmm1,XMMWORD[128+rcx]
+	vaesenc	xmm1,xmm1,XMMWORD[144+rcx]
+	vaesenc	xmm1,xmm1,XMMWORD[160+rcx]
+	vaesenc	xmm1,xmm1,XMMWORD[176+rcx]
+	vaesenc	xmm1,xmm1,XMMWORD[192+rcx]
+	vaesenc	xmm1,xmm1,XMMWORD[208+rcx]
+	vaesenclast	xmm1,xmm1,XMMWORD[224+rcx]
+
+
+	vpxor	xmm1,xmm1,XMMWORD[rdi]
+
+	vmovdqu	XMMWORD[rsi],xmm1
+
+	add	rdi,16
+	add	rsi,16
+	sub	r10,1
+	jnz	NEAR $L$256_enc_msg_x8_loop2
+
+$L$256_enc_msg_x8_out:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+
+$L$SEH_end_aes256gcmsiv_enc_msg_x8:
+global	aes256gcmsiv_dec
+
+ALIGN	16
+aes256gcmsiv_dec:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_aes256gcmsiv_dec:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+_CET_ENDBR
+	test	r9,~15
+	jnz	NEAR $L$256_dec_start
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$256_dec_start:
+	vzeroupper
+	vmovdqa	xmm0,XMMWORD[rdx]
+
+
+	vmovdqu	xmm15,XMMWORD[16+rdx]
+	vpor	xmm15,xmm15,XMMWORD[OR_MASK]
+	mov	rax,rdx
+
+	lea	rax,[32+rax]
+	lea	rcx,[32+rcx]
+
+	and	r9,~15
+
+
+	cmp	r9,96
+	jb	NEAR $L$256_dec_loop2
+
+
+	sub	r9,96
+	vmovdqa	xmm7,xmm15
+	vpaddd	xmm8,xmm7,XMMWORD[one]
+	vpaddd	xmm9,xmm7,XMMWORD[two]
+	vpaddd	xmm10,xmm9,XMMWORD[one]
+	vpaddd	xmm11,xmm9,XMMWORD[two]
+	vpaddd	xmm12,xmm11,XMMWORD[one]
+	vpaddd	xmm15,xmm11,XMMWORD[two]
+
+	vpxor	xmm7,xmm7,XMMWORD[r8]
+	vpxor	xmm8,xmm8,XMMWORD[r8]
+	vpxor	xmm9,xmm9,XMMWORD[r8]
+	vpxor	xmm10,xmm10,XMMWORD[r8]
+	vpxor	xmm11,xmm11,XMMWORD[r8]
+	vpxor	xmm12,xmm12,XMMWORD[r8]
+
+	vmovdqu	xmm4,XMMWORD[16+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[32+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[48+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[64+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[80+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[96+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[112+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[128+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[144+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[160+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[176+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[192+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[208+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[224+r8]
+	vaesenclast	xmm7,xmm7,xmm4
+	vaesenclast	xmm8,xmm8,xmm4
+	vaesenclast	xmm9,xmm9,xmm4
+	vaesenclast	xmm10,xmm10,xmm4
+	vaesenclast	xmm11,xmm11,xmm4
+	vaesenclast	xmm12,xmm12,xmm4
+
+
+	vpxor	xmm7,xmm7,XMMWORD[rdi]
+	vpxor	xmm8,xmm8,XMMWORD[16+rdi]
+	vpxor	xmm9,xmm9,XMMWORD[32+rdi]
+	vpxor	xmm10,xmm10,XMMWORD[48+rdi]
+	vpxor	xmm11,xmm11,XMMWORD[64+rdi]
+	vpxor	xmm12,xmm12,XMMWORD[80+rdi]
+
+	vmovdqu	XMMWORD[rsi],xmm7
+	vmovdqu	XMMWORD[16+rsi],xmm8
+	vmovdqu	XMMWORD[32+rsi],xmm9
+	vmovdqu	XMMWORD[48+rsi],xmm10
+	vmovdqu	XMMWORD[64+rsi],xmm11
+	vmovdqu	XMMWORD[80+rsi],xmm12
+
+	add	rdi,96
+	add	rsi,96
+	jmp	NEAR $L$256_dec_loop1
+
+
+ALIGN	64
+$L$256_dec_loop1:
+	cmp	r9,96
+	jb	NEAR $L$256_dec_finish_96
+	sub	r9,96
+
+	vmovdqa	xmm6,xmm12
+	vmovdqa	XMMWORD[(16-32)+rax],xmm11
+	vmovdqa	XMMWORD[(32-32)+rax],xmm10
+	vmovdqa	XMMWORD[(48-32)+rax],xmm9
+	vmovdqa	XMMWORD[(64-32)+rax],xmm8
+	vmovdqa	XMMWORD[(80-32)+rax],xmm7
+
+	vmovdqa	xmm7,xmm15
+	vpaddd	xmm8,xmm7,XMMWORD[one]
+	vpaddd	xmm9,xmm7,XMMWORD[two]
+	vpaddd	xmm10,xmm9,XMMWORD[one]
+	vpaddd	xmm11,xmm9,XMMWORD[two]
+	vpaddd	xmm12,xmm11,XMMWORD[one]
+	vpaddd	xmm15,xmm11,XMMWORD[two]
+
+	vmovdqa	xmm4,XMMWORD[r8]
+	vpxor	xmm7,xmm7,xmm4
+	vpxor	xmm8,xmm8,xmm4
+	vpxor	xmm9,xmm9,xmm4
+	vpxor	xmm10,xmm10,xmm4
+	vpxor	xmm11,xmm11,xmm4
+	vpxor	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[((0-32))+rcx]
+	vpclmulqdq	xmm2,xmm6,xmm4,0x11
+	vpclmulqdq	xmm3,xmm6,xmm4,0x00
+	vpclmulqdq	xmm1,xmm6,xmm4,0x01
+	vpclmulqdq	xmm4,xmm6,xmm4,0x10
+	vpxor	xmm1,xmm1,xmm4
+
+	vmovdqu	xmm4,XMMWORD[16+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm6,XMMWORD[((-16))+rax]
+	vmovdqu	xmm13,XMMWORD[((-16))+rcx]
+
+	vpclmulqdq	xmm4,xmm6,xmm13,0x10
+	vpxor	xmm1,xmm1,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x11
+	vpxor	xmm2,xmm2,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x00
+	vpxor	xmm3,xmm3,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x01
+	vpxor	xmm1,xmm1,xmm4
+
+
+	vmovdqu	xmm4,XMMWORD[32+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm6,XMMWORD[rax]
+	vmovdqu	xmm13,XMMWORD[rcx]
+
+	vpclmulqdq	xmm4,xmm6,xmm13,0x10
+	vpxor	xmm1,xmm1,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x11
+	vpxor	xmm2,xmm2,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x00
+	vpxor	xmm3,xmm3,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x01
+	vpxor	xmm1,xmm1,xmm4
+
+
+	vmovdqu	xmm4,XMMWORD[48+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm6,XMMWORD[16+rax]
+	vmovdqu	xmm13,XMMWORD[16+rcx]
+
+	vpclmulqdq	xmm4,xmm6,xmm13,0x10
+	vpxor	xmm1,xmm1,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x11
+	vpxor	xmm2,xmm2,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x00
+	vpxor	xmm3,xmm3,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x01
+	vpxor	xmm1,xmm1,xmm4
+
+
+	vmovdqu	xmm4,XMMWORD[64+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm6,XMMWORD[32+rax]
+	vmovdqu	xmm13,XMMWORD[32+rcx]
+
+	vpclmulqdq	xmm4,xmm6,xmm13,0x10
+	vpxor	xmm1,xmm1,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x11
+	vpxor	xmm2,xmm2,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x00
+	vpxor	xmm3,xmm3,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x01
+	vpxor	xmm1,xmm1,xmm4
+
+
+	vmovdqu	xmm4,XMMWORD[80+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[96+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[112+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+
+	vmovdqa	xmm6,XMMWORD[((80-32))+rax]
+	vpxor	xmm6,xmm6,xmm0
+	vmovdqu	xmm5,XMMWORD[((80-32))+rcx]
+
+	vpclmulqdq	xmm4,xmm6,xmm5,0x01
+	vpxor	xmm1,xmm1,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm5,0x11
+	vpxor	xmm2,xmm2,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm5,0x00
+	vpxor	xmm3,xmm3,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm5,0x10
+	vpxor	xmm1,xmm1,xmm4
+
+	vmovdqu	xmm4,XMMWORD[128+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+
+	vpsrldq	xmm4,xmm1,8
+	vpxor	xmm5,xmm2,xmm4
+	vpslldq	xmm4,xmm1,8
+	vpxor	xmm0,xmm3,xmm4
+
+	vmovdqa	xmm3,XMMWORD[poly]
+
+	vmovdqu	xmm4,XMMWORD[144+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[160+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[176+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[192+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm4,XMMWORD[208+r8]
+	vaesenc	xmm7,xmm7,xmm4
+	vaesenc	xmm8,xmm8,xmm4
+	vaesenc	xmm9,xmm9,xmm4
+	vaesenc	xmm10,xmm10,xmm4
+	vaesenc	xmm11,xmm11,xmm4
+	vaesenc	xmm12,xmm12,xmm4
+
+	vmovdqu	xmm6,XMMWORD[224+r8]
+	vpalignr	xmm2,xmm0,xmm0,8
+	vpclmulqdq	xmm0,xmm0,xmm3,0x10
+	vpxor	xmm0,xmm2,xmm0
+
+	vpxor	xmm4,xmm6,XMMWORD[rdi]
+	vaesenclast	xmm7,xmm7,xmm4
+	vpxor	xmm4,xmm6,XMMWORD[16+rdi]
+	vaesenclast	xmm8,xmm8,xmm4
+	vpxor	xmm4,xmm6,XMMWORD[32+rdi]
+	vaesenclast	xmm9,xmm9,xmm4
+	vpxor	xmm4,xmm6,XMMWORD[48+rdi]
+	vaesenclast	xmm10,xmm10,xmm4
+	vpxor	xmm4,xmm6,XMMWORD[64+rdi]
+	vaesenclast	xmm11,xmm11,xmm4
+	vpxor	xmm4,xmm6,XMMWORD[80+rdi]
+	vaesenclast	xmm12,xmm12,xmm4
+
+	vpalignr	xmm2,xmm0,xmm0,8
+	vpclmulqdq	xmm0,xmm0,xmm3,0x10
+	vpxor	xmm0,xmm2,xmm0
+
+	vmovdqu	XMMWORD[rsi],xmm7
+	vmovdqu	XMMWORD[16+rsi],xmm8
+	vmovdqu	XMMWORD[32+rsi],xmm9
+	vmovdqu	XMMWORD[48+rsi],xmm10
+	vmovdqu	XMMWORD[64+rsi],xmm11
+	vmovdqu	XMMWORD[80+rsi],xmm12
+
+	vpxor	xmm0,xmm0,xmm5
+
+	lea	rdi,[96+rdi]
+	lea	rsi,[96+rsi]
+	jmp	NEAR $L$256_dec_loop1
+
+$L$256_dec_finish_96:
+	vmovdqa	xmm6,xmm12
+	vmovdqa	XMMWORD[(16-32)+rax],xmm11
+	vmovdqa	XMMWORD[(32-32)+rax],xmm10
+	vmovdqa	XMMWORD[(48-32)+rax],xmm9
+	vmovdqa	XMMWORD[(64-32)+rax],xmm8
+	vmovdqa	XMMWORD[(80-32)+rax],xmm7
+
+	vmovdqu	xmm4,XMMWORD[((0-32))+rcx]
+	vpclmulqdq	xmm1,xmm6,xmm4,0x10
+	vpclmulqdq	xmm2,xmm6,xmm4,0x11
+	vpclmulqdq	xmm3,xmm6,xmm4,0x00
+	vpclmulqdq	xmm4,xmm6,xmm4,0x01
+	vpxor	xmm1,xmm1,xmm4
+
+	vmovdqu	xmm6,XMMWORD[((-16))+rax]
+	vmovdqu	xmm13,XMMWORD[((-16))+rcx]
+
+	vpclmulqdq	xmm4,xmm6,xmm13,0x10
+	vpxor	xmm1,xmm1,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x11
+	vpxor	xmm2,xmm2,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x00
+	vpxor	xmm3,xmm3,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x01
+	vpxor	xmm1,xmm1,xmm4
+
+	vmovdqu	xmm6,XMMWORD[rax]
+	vmovdqu	xmm13,XMMWORD[rcx]
+
+	vpclmulqdq	xmm4,xmm6,xmm13,0x10
+	vpxor	xmm1,xmm1,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x11
+	vpxor	xmm2,xmm2,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x00
+	vpxor	xmm3,xmm3,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x01
+	vpxor	xmm1,xmm1,xmm4
+
+	vmovdqu	xmm6,XMMWORD[16+rax]
+	vmovdqu	xmm13,XMMWORD[16+rcx]
+
+	vpclmulqdq	xmm4,xmm6,xmm13,0x10
+	vpxor	xmm1,xmm1,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x11
+	vpxor	xmm2,xmm2,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x00
+	vpxor	xmm3,xmm3,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x01
+	vpxor	xmm1,xmm1,xmm4
+
+	vmovdqu	xmm6,XMMWORD[32+rax]
+	vmovdqu	xmm13,XMMWORD[32+rcx]
+
+	vpclmulqdq	xmm4,xmm6,xmm13,0x10
+	vpxor	xmm1,xmm1,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x11
+	vpxor	xmm2,xmm2,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x00
+	vpxor	xmm3,xmm3,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm13,0x01
+	vpxor	xmm1,xmm1,xmm4
+
+
+	vmovdqu	xmm6,XMMWORD[((80-32))+rax]
+	vpxor	xmm6,xmm6,xmm0
+	vmovdqu	xmm5,XMMWORD[((80-32))+rcx]
+	vpclmulqdq	xmm4,xmm6,xmm5,0x11
+	vpxor	xmm2,xmm2,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm5,0x00
+	vpxor	xmm3,xmm3,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm5,0x10
+	vpxor	xmm1,xmm1,xmm4
+	vpclmulqdq	xmm4,xmm6,xmm5,0x01
+	vpxor	xmm1,xmm1,xmm4
+
+	vpsrldq	xmm4,xmm1,8
+	vpxor	xmm5,xmm2,xmm4
+	vpslldq	xmm4,xmm1,8
+	vpxor	xmm0,xmm3,xmm4
+
+	vmovdqa	xmm3,XMMWORD[poly]
+
+	vpalignr	xmm2,xmm0,xmm0,8
+	vpclmulqdq	xmm0,xmm0,xmm3,0x10
+	vpxor	xmm0,xmm2,xmm0
+
+	vpalignr	xmm2,xmm0,xmm0,8
+	vpclmulqdq	xmm0,xmm0,xmm3,0x10
+	vpxor	xmm0,xmm2,xmm0
+
+	vpxor	xmm0,xmm0,xmm5
+
+$L$256_dec_loop2:
+
+
+
+	cmp	r9,16
+	jb	NEAR $L$256_dec_out
+	sub	r9,16
+
+	vmovdqa	xmm2,xmm15
+	vpaddd	xmm15,xmm15,XMMWORD[one]
+
+	vpxor	xmm2,xmm2,XMMWORD[r8]
+	vaesenc	xmm2,xmm2,XMMWORD[16+r8]
+	vaesenc	xmm2,xmm2,XMMWORD[32+r8]
+	vaesenc	xmm2,xmm2,XMMWORD[48+r8]
+	vaesenc	xmm2,xmm2,XMMWORD[64+r8]
+	vaesenc	xmm2,xmm2,XMMWORD[80+r8]
+	vaesenc	xmm2,xmm2,XMMWORD[96+r8]
+	vaesenc	xmm2,xmm2,XMMWORD[112+r8]
+	vaesenc	xmm2,xmm2,XMMWORD[128+r8]
+	vaesenc	xmm2,xmm2,XMMWORD[144+r8]
+	vaesenc	xmm2,xmm2,XMMWORD[160+r8]
+	vaesenc	xmm2,xmm2,XMMWORD[176+r8]
+	vaesenc	xmm2,xmm2,XMMWORD[192+r8]
+	vaesenc	xmm2,xmm2,XMMWORD[208+r8]
+	vaesenclast	xmm2,xmm2,XMMWORD[224+r8]
+	vpxor	xmm2,xmm2,XMMWORD[rdi]
+	vmovdqu	XMMWORD[rsi],xmm2
+	add	rdi,16
+	add	rsi,16
+
+	vpxor	xmm0,xmm0,xmm2
+	vmovdqa	xmm1,XMMWORD[((-32))+rcx]
+	call	GFMUL
+
+	jmp	NEAR $L$256_dec_loop2
+
+$L$256_dec_out:
+	vmovdqu	XMMWORD[rdx],xmm0
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_aes256gcmsiv_dec:
+global	aes256gcmsiv_kdf
+
+ALIGN	16
+aes256gcmsiv_kdf:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_aes256gcmsiv_kdf:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+_CET_ENDBR
+
+
+
+
+	vmovdqa	xmm1,XMMWORD[rdx]
+	vmovdqa	xmm4,XMMWORD[rdi]
+	vmovdqa	xmm11,XMMWORD[and_mask]
+	vmovdqa	xmm8,XMMWORD[one]
+	vpshufd	xmm4,xmm4,0x90
+	vpand	xmm4,xmm4,xmm11
+	vpaddd	xmm6,xmm4,xmm8
+	vpaddd	xmm7,xmm6,xmm8
+	vpaddd	xmm11,xmm7,xmm8
+	vpaddd	xmm12,xmm11,xmm8
+	vpaddd	xmm13,xmm12,xmm8
+
+	vpxor	xmm4,xmm4,xmm1
+	vpxor	xmm6,xmm6,xmm1
+	vpxor	xmm7,xmm7,xmm1
+	vpxor	xmm11,xmm11,xmm1
+	vpxor	xmm12,xmm12,xmm1
+	vpxor	xmm13,xmm13,xmm1
+
+	vmovdqa	xmm1,XMMWORD[16+rdx]
+	vaesenc	xmm4,xmm4,xmm1
+	vaesenc	xmm6,xmm6,xmm1
+	vaesenc	xmm7,xmm7,xmm1
+	vaesenc	xmm11,xmm11,xmm1
+	vaesenc	xmm12,xmm12,xmm1
+	vaesenc	xmm13,xmm13,xmm1
+
+	vmovdqa	xmm2,XMMWORD[32+rdx]
+	vaesenc	xmm4,xmm4,xmm2
+	vaesenc	xmm6,xmm6,xmm2
+	vaesenc	xmm7,xmm7,xmm2
+	vaesenc	xmm11,xmm11,xmm2
+	vaesenc	xmm12,xmm12,xmm2
+	vaesenc	xmm13,xmm13,xmm2
+
+	vmovdqa	xmm1,XMMWORD[48+rdx]
+	vaesenc	xmm4,xmm4,xmm1
+	vaesenc	xmm6,xmm6,xmm1
+	vaesenc	xmm7,xmm7,xmm1
+	vaesenc	xmm11,xmm11,xmm1
+	vaesenc	xmm12,xmm12,xmm1
+	vaesenc	xmm13,xmm13,xmm1
+
+	vmovdqa	xmm2,XMMWORD[64+rdx]
+	vaesenc	xmm4,xmm4,xmm2
+	vaesenc	xmm6,xmm6,xmm2
+	vaesenc	xmm7,xmm7,xmm2
+	vaesenc	xmm11,xmm11,xmm2
+	vaesenc	xmm12,xmm12,xmm2
+	vaesenc	xmm13,xmm13,xmm2
+
+	vmovdqa	xmm1,XMMWORD[80+rdx]
+	vaesenc	xmm4,xmm4,xmm1
+	vaesenc	xmm6,xmm6,xmm1
+	vaesenc	xmm7,xmm7,xmm1
+	vaesenc	xmm11,xmm11,xmm1
+	vaesenc	xmm12,xmm12,xmm1
+	vaesenc	xmm13,xmm13,xmm1
+
+	vmovdqa	xmm2,XMMWORD[96+rdx]
+	vaesenc	xmm4,xmm4,xmm2
+	vaesenc	xmm6,xmm6,xmm2
+	vaesenc	xmm7,xmm7,xmm2
+	vaesenc	xmm11,xmm11,xmm2
+	vaesenc	xmm12,xmm12,xmm2
+	vaesenc	xmm13,xmm13,xmm2
+
+	vmovdqa	xmm1,XMMWORD[112+rdx]
+	vaesenc	xmm4,xmm4,xmm1
+	vaesenc	xmm6,xmm6,xmm1
+	vaesenc	xmm7,xmm7,xmm1
+	vaesenc	xmm11,xmm11,xmm1
+	vaesenc	xmm12,xmm12,xmm1
+	vaesenc	xmm13,xmm13,xmm1
+
+	vmovdqa	xmm2,XMMWORD[128+rdx]
+	vaesenc	xmm4,xmm4,xmm2
+	vaesenc	xmm6,xmm6,xmm2
+	vaesenc	xmm7,xmm7,xmm2
+	vaesenc	xmm11,xmm11,xmm2
+	vaesenc	xmm12,xmm12,xmm2
+	vaesenc	xmm13,xmm13,xmm2
+
+	vmovdqa	xmm1,XMMWORD[144+rdx]
+	vaesenc	xmm4,xmm4,xmm1
+	vaesenc	xmm6,xmm6,xmm1
+	vaesenc	xmm7,xmm7,xmm1
+	vaesenc	xmm11,xmm11,xmm1
+	vaesenc	xmm12,xmm12,xmm1
+	vaesenc	xmm13,xmm13,xmm1
+
+	vmovdqa	xmm2,XMMWORD[160+rdx]
+	vaesenc	xmm4,xmm4,xmm2
+	vaesenc	xmm6,xmm6,xmm2
+	vaesenc	xmm7,xmm7,xmm2
+	vaesenc	xmm11,xmm11,xmm2
+	vaesenc	xmm12,xmm12,xmm2
+	vaesenc	xmm13,xmm13,xmm2
+
+	vmovdqa	xmm1,XMMWORD[176+rdx]
+	vaesenc	xmm4,xmm4,xmm1
+	vaesenc	xmm6,xmm6,xmm1
+	vaesenc	xmm7,xmm7,xmm1
+	vaesenc	xmm11,xmm11,xmm1
+	vaesenc	xmm12,xmm12,xmm1
+	vaesenc	xmm13,xmm13,xmm1
+
+	vmovdqa	xmm2,XMMWORD[192+rdx]
+	vaesenc	xmm4,xmm4,xmm2
+	vaesenc	xmm6,xmm6,xmm2
+	vaesenc	xmm7,xmm7,xmm2
+	vaesenc	xmm11,xmm11,xmm2
+	vaesenc	xmm12,xmm12,xmm2
+	vaesenc	xmm13,xmm13,xmm2
+
+	vmovdqa	xmm1,XMMWORD[208+rdx]
+	vaesenc	xmm4,xmm4,xmm1
+	vaesenc	xmm6,xmm6,xmm1
+	vaesenc	xmm7,xmm7,xmm1
+	vaesenc	xmm11,xmm11,xmm1
+	vaesenc	xmm12,xmm12,xmm1
+	vaesenc	xmm13,xmm13,xmm1
+
+	vmovdqa	xmm2,XMMWORD[224+rdx]
+	vaesenclast	xmm4,xmm4,xmm2
+	vaesenclast	xmm6,xmm6,xmm2
+	vaesenclast	xmm7,xmm7,xmm2
+	vaesenclast	xmm11,xmm11,xmm2
+	vaesenclast	xmm12,xmm12,xmm2
+	vaesenclast	xmm13,xmm13,xmm2
+
+
+	vmovdqa	XMMWORD[rsi],xmm4
+	vmovdqa	XMMWORD[16+rsi],xmm6
+	vmovdqa	XMMWORD[32+rsi],xmm7
+	vmovdqa	XMMWORD[48+rsi],xmm11
+	vmovdqa	XMMWORD[64+rsi],xmm12
+	vmovdqa	XMMWORD[80+rsi],xmm13
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_aes256gcmsiv_kdf:
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/crypto/chacha-armv4-linux.S b/gen/crypto/chacha-armv4-linux.S
new file mode 100644
index 0000000..2255dd2
--- /dev/null
+++ b/gen/crypto/chacha-armv4-linux.S
@@ -0,0 +1,1451 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
+.arch	armv7-a
+
+.text
+#if defined(__thumb2__) || defined(__clang__)
+.syntax	unified
+#endif
+#if defined(__thumb2__)
+.thumb
+#else
+.code	32
+#endif
+
+#if defined(__thumb2__) || defined(__clang__)
+#define ldrhsb	ldrbhs
+#endif
+
+.align	5
+.Lsigma:
+.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
+.Lone:
+.long	1,0,0,0
+
+.globl	ChaCha20_ctr32_nohw
+.hidden	ChaCha20_ctr32_nohw
+.type	ChaCha20_ctr32_nohw,%function
+.align	5
+ChaCha20_ctr32_nohw:
+	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
+	stmdb	sp!,{r0,r1,r2,r4-r11,lr}
+	adr	r14,.Lsigma
+	ldmia	r12,{r4,r5,r6,r7}		@ load counter and nonce
+	sub	sp,sp,#4*(16)		@ off-load area
+	stmdb	sp!,{r4,r5,r6,r7}		@ copy counter and nonce
+	ldmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}		@ load key
+	ldmia	r14,{r0,r1,r2,r3}		@ load sigma
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}		@ copy key
+	stmdb	sp!,{r0,r1,r2,r3}		@ copy sigma
+	str	r10,[sp,#4*(16+10)]	@ off-load "rx"
+	str	r11,[sp,#4*(16+11)]	@ off-load "rx"
+	b	.Loop_outer_enter
+
+.align	4
+.Loop_outer:
+	ldmia	sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}		@ load key material
+	str	r11,[sp,#4*(32+2)]	@ save len
+	str	r12,  [sp,#4*(32+1)]	@ save inp
+	str	r14,  [sp,#4*(32+0)]	@ save out
+.Loop_outer_enter:
+	ldr	r11, [sp,#4*(15)]
+	ldr	r12,[sp,#4*(12)]	@ modulo-scheduled load
+	ldr	r10, [sp,#4*(13)]
+	ldr	r14,[sp,#4*(14)]
+	str	r11, [sp,#4*(16+15)]
+	mov	r11,#10
+	b	.Loop
+
+.align	4
+.Loop:
+	subs	r11,r11,#1
+	add	r0,r0,r4
+	mov	r12,r12,ror#16
+	add	r1,r1,r5
+	mov	r10,r10,ror#16
+	eor	r12,r12,r0,ror#16
+	eor	r10,r10,r1,ror#16
+	add	r8,r8,r12
+	mov	r4,r4,ror#20
+	add	r9,r9,r10
+	mov	r5,r5,ror#20
+	eor	r4,r4,r8,ror#20
+	eor	r5,r5,r9,ror#20
+	add	r0,r0,r4
+	mov	r12,r12,ror#24
+	add	r1,r1,r5
+	mov	r10,r10,ror#24
+	eor	r12,r12,r0,ror#24
+	eor	r10,r10,r1,ror#24
+	add	r8,r8,r12
+	mov	r4,r4,ror#25
+	add	r9,r9,r10
+	mov	r5,r5,ror#25
+	str	r10,[sp,#4*(16+13)]
+	ldr	r10,[sp,#4*(16+15)]
+	eor	r4,r4,r8,ror#25
+	eor	r5,r5,r9,ror#25
+	str	r8,[sp,#4*(16+8)]
+	ldr	r8,[sp,#4*(16+10)]
+	add	r2,r2,r6
+	mov	r14,r14,ror#16
+	str	r9,[sp,#4*(16+9)]
+	ldr	r9,[sp,#4*(16+11)]
+	add	r3,r3,r7
+	mov	r10,r10,ror#16
+	eor	r14,r14,r2,ror#16
+	eor	r10,r10,r3,ror#16
+	add	r8,r8,r14
+	mov	r6,r6,ror#20
+	add	r9,r9,r10
+	mov	r7,r7,ror#20
+	eor	r6,r6,r8,ror#20
+	eor	r7,r7,r9,ror#20
+	add	r2,r2,r6
+	mov	r14,r14,ror#24
+	add	r3,r3,r7
+	mov	r10,r10,ror#24
+	eor	r14,r14,r2,ror#24
+	eor	r10,r10,r3,ror#24
+	add	r8,r8,r14
+	mov	r6,r6,ror#25
+	add	r9,r9,r10
+	mov	r7,r7,ror#25
+	eor	r6,r6,r8,ror#25
+	eor	r7,r7,r9,ror#25
+	add	r0,r0,r5
+	mov	r10,r10,ror#16
+	add	r1,r1,r6
+	mov	r12,r12,ror#16
+	eor	r10,r10,r0,ror#16
+	eor	r12,r12,r1,ror#16
+	add	r8,r8,r10
+	mov	r5,r5,ror#20
+	add	r9,r9,r12
+	mov	r6,r6,ror#20
+	eor	r5,r5,r8,ror#20
+	eor	r6,r6,r9,ror#20
+	add	r0,r0,r5
+	mov	r10,r10,ror#24
+	add	r1,r1,r6
+	mov	r12,r12,ror#24
+	eor	r10,r10,r0,ror#24
+	eor	r12,r12,r1,ror#24
+	add	r8,r8,r10
+	mov	r5,r5,ror#25
+	str	r10,[sp,#4*(16+15)]
+	ldr	r10,[sp,#4*(16+13)]
+	add	r9,r9,r12
+	mov	r6,r6,ror#25
+	eor	r5,r5,r8,ror#25
+	eor	r6,r6,r9,ror#25
+	str	r8,[sp,#4*(16+10)]
+	ldr	r8,[sp,#4*(16+8)]
+	add	r2,r2,r7
+	mov	r10,r10,ror#16
+	str	r9,[sp,#4*(16+11)]
+	ldr	r9,[sp,#4*(16+9)]
+	add	r3,r3,r4
+	mov	r14,r14,ror#16
+	eor	r10,r10,r2,ror#16
+	eor	r14,r14,r3,ror#16
+	add	r8,r8,r10
+	mov	r7,r7,ror#20
+	add	r9,r9,r14
+	mov	r4,r4,ror#20
+	eor	r7,r7,r8,ror#20
+	eor	r4,r4,r9,ror#20
+	add	r2,r2,r7
+	mov	r10,r10,ror#24
+	add	r3,r3,r4
+	mov	r14,r14,ror#24
+	eor	r10,r10,r2,ror#24
+	eor	r14,r14,r3,ror#24
+	add	r8,r8,r10
+	mov	r7,r7,ror#25
+	add	r9,r9,r14
+	mov	r4,r4,ror#25
+	eor	r7,r7,r8,ror#25
+	eor	r4,r4,r9,ror#25
+	bne	.Loop
+
+	ldr	r11,[sp,#4*(32+2)]	@ load len
+
+	str	r8, [sp,#4*(16+8)]	@ modulo-scheduled store
+	str	r9, [sp,#4*(16+9)]
+	str	r12,[sp,#4*(16+12)]
+	str	r10, [sp,#4*(16+13)]
+	str	r14,[sp,#4*(16+14)]
+
+	@ at this point we have first half of 512-bit result in
+	@ rx and second half at sp+4*(16+8)
+
+	cmp	r11,#64		@ done yet?
+#ifdef	__thumb2__
+	itete	lo
+#endif
+	addlo	r12,sp,#4*(0)		@ shortcut or ...
+	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
+	addlo	r14,sp,#4*(0)		@ shortcut or ...
+	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out
+
+	ldr	r8,[sp,#4*(0)]	@ load key material
+	ldr	r9,[sp,#4*(1)]
+
+#if __ARM_ARCH>=6 || !defined(__ARMEB__)
+# if __ARM_ARCH<7
+	orr	r10,r12,r14
+	tst	r10,#3		@ are input and output aligned?
+	ldr	r10,[sp,#4*(2)]
+	bne	.Lunaligned
+	cmp	r11,#64		@ restore flags
+# else
+	ldr	r10,[sp,#4*(2)]
+# endif
+	ldr	r11,[sp,#4*(3)]
+
+	add	r0,r0,r8	@ accumulate key material
+	add	r1,r1,r9
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r8,[r12],#16		@ load input
+	ldrhs	r9,[r12,#-12]
+
+	add	r2,r2,r10
+	add	r3,r3,r11
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r10,[r12,#-8]
+	ldrhs	r11,[r12,#-4]
+# if __ARM_ARCH>=6 && defined(__ARMEB__)
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+# endif
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r0,r0,r8	@ xor with input
+	eorhs	r1,r1,r9
+	add	r8,sp,#4*(4)
+	str	r0,[r14],#16		@ store output
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r2,r2,r10
+	eorhs	r3,r3,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+	str	r1,[r14,#-12]
+	str	r2,[r14,#-8]
+	str	r3,[r14,#-4]
+
+	add	r4,r4,r8	@ accumulate key material
+	add	r5,r5,r9
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r8,[r12],#16		@ load input
+	ldrhs	r9,[r12,#-12]
+	add	r6,r6,r10
+	add	r7,r7,r11
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r10,[r12,#-8]
+	ldrhs	r11,[r12,#-4]
+# if __ARM_ARCH>=6 && defined(__ARMEB__)
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+# endif
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r4,r4,r8
+	eorhs	r5,r5,r9
+	add	r8,sp,#4*(8)
+	str	r4,[r14],#16		@ store output
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r6,r6,r10
+	eorhs	r7,r7,r11
+	str	r5,[r14,#-12]
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+	str	r6,[r14,#-8]
+	add	r0,sp,#4*(16+8)
+	str	r7,[r14,#-4]
+
+	ldmia	r0,{r0,r1,r2,r3,r4,r5,r6,r7}	@ load second half
+
+	add	r0,r0,r8	@ accumulate key material
+	add	r1,r1,r9
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r8,[r12],#16		@ load input
+	ldrhs	r9,[r12,#-12]
+# ifdef	__thumb2__
+	itt	hi
+# endif
+	strhi	r10,[sp,#4*(16+10)]	@ copy "rx" while at it
+	strhi	r11,[sp,#4*(16+11)]	@ copy "rx" while at it
+	add	r2,r2,r10
+	add	r3,r3,r11
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r10,[r12,#-8]
+	ldrhs	r11,[r12,#-4]
+# if __ARM_ARCH>=6 && defined(__ARMEB__)
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+# endif
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r0,r0,r8
+	eorhs	r1,r1,r9
+	add	r8,sp,#4*(12)
+	str	r0,[r14],#16		@ store output
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r2,r2,r10
+	eorhs	r3,r3,r11
+	str	r1,[r14,#-12]
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+	str	r2,[r14,#-8]
+	str	r3,[r14,#-4]
+
+	add	r4,r4,r8	@ accumulate key material
+	add	r5,r5,r9
+# ifdef	__thumb2__
+	itt	hi
+# endif
+	addhi	r8,r8,#1		@ next counter value
+	strhi	r8,[sp,#4*(12)]	@ save next counter value
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r8,[r12],#16		@ load input
+	ldrhs	r9,[r12,#-12]
+	add	r6,r6,r10
+	add	r7,r7,r11
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r10,[r12,#-8]
+	ldrhs	r11,[r12,#-4]
+# if __ARM_ARCH>=6 && defined(__ARMEB__)
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+# endif
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r4,r4,r8
+	eorhs	r5,r5,r9
+# ifdef	__thumb2__
+	it	ne
+# endif
+	ldrne	r8,[sp,#4*(32+2)]	@ re-load len
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r6,r6,r10
+	eorhs	r7,r7,r11
+	str	r4,[r14],#16		@ store output
+	str	r5,[r14,#-12]
+# ifdef	__thumb2__
+	it	hs
+# endif
+	subhs	r11,r8,#64		@ len-=64
+	str	r6,[r14,#-8]
+	str	r7,[r14,#-4]
+	bhi	.Loop_outer
+
+	beq	.Ldone
+# if __ARM_ARCH<7
+	b	.Ltail
+
+.align	4
+.Lunaligned:@ unaligned endian-neutral path
+	cmp	r11,#64		@ restore flags
+# endif
+#endif
+#if __ARM_ARCH<7
+	ldr	r11,[sp,#4*(3)]
+	add	r0,r0,r8		@ accumulate key material
+	add	r1,r1,r9
+	add	r2,r2,r10
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r8,r8,r8		@ zero or ...
+	ldrhsb	r8,[r12],#16			@ ... load input
+	eorlo	r9,r9,r9
+	ldrhsb	r9,[r12,#-12]
+
+	add	r3,r3,r11
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r10,r10,r10
+	ldrhsb	r10,[r12,#-8]
+	eorlo	r11,r11,r11
+	ldrhsb	r11,[r12,#-4]
+
+	eor	r0,r8,r0		@ xor with input (or zero)
+	eor	r1,r9,r1
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-15]		@ load more input
+	ldrhsb	r9,[r12,#-11]
+	eor	r2,r10,r2
+	strb	r0,[r14],#16		@ store output
+	eor	r3,r11,r3
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-7]
+	ldrhsb	r11,[r12,#-3]
+	strb	r1,[r14,#-12]
+	eor	r0,r8,r0,lsr#8
+	strb	r2,[r14,#-8]
+	eor	r1,r9,r1,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-14]		@ load more input
+	ldrhsb	r9,[r12,#-10]
+	strb	r3,[r14,#-4]
+	eor	r2,r10,r2,lsr#8
+	strb	r0,[r14,#-15]
+	eor	r3,r11,r3,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-6]
+	ldrhsb	r11,[r12,#-2]
+	strb	r1,[r14,#-11]
+	eor	r0,r8,r0,lsr#8
+	strb	r2,[r14,#-7]
+	eor	r1,r9,r1,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-13]		@ load more input
+	ldrhsb	r9,[r12,#-9]
+	strb	r3,[r14,#-3]
+	eor	r2,r10,r2,lsr#8
+	strb	r0,[r14,#-14]
+	eor	r3,r11,r3,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-5]
+	ldrhsb	r11,[r12,#-1]
+	strb	r1,[r14,#-10]
+	strb	r2,[r14,#-6]
+	eor	r0,r8,r0,lsr#8
+	strb	r3,[r14,#-2]
+	eor	r1,r9,r1,lsr#8
+	strb	r0,[r14,#-13]
+	eor	r2,r10,r2,lsr#8
+	strb	r1,[r14,#-9]
+	eor	r3,r11,r3,lsr#8
+	strb	r2,[r14,#-5]
+	strb	r3,[r14,#-1]
+	add	r8,sp,#4*(4+0)
+	ldmia	r8,{r8,r9,r10,r11}		@ load key material
+	add	r0,sp,#4*(16+8)
+	add	r4,r4,r8		@ accumulate key material
+	add	r5,r5,r9
+	add	r6,r6,r10
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r8,r8,r8		@ zero or ...
+	ldrhsb	r8,[r12],#16			@ ... load input
+	eorlo	r9,r9,r9
+	ldrhsb	r9,[r12,#-12]
+
+	add	r7,r7,r11
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r10,r10,r10
+	ldrhsb	r10,[r12,#-8]
+	eorlo	r11,r11,r11
+	ldrhsb	r11,[r12,#-4]
+
+	eor	r4,r8,r4		@ xor with input (or zero)
+	eor	r5,r9,r5
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-15]		@ load more input
+	ldrhsb	r9,[r12,#-11]
+	eor	r6,r10,r6
+	strb	r4,[r14],#16		@ store output
+	eor	r7,r11,r7
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-7]
+	ldrhsb	r11,[r12,#-3]
+	strb	r5,[r14,#-12]
+	eor	r4,r8,r4,lsr#8
+	strb	r6,[r14,#-8]
+	eor	r5,r9,r5,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-14]		@ load more input
+	ldrhsb	r9,[r12,#-10]
+	strb	r7,[r14,#-4]
+	eor	r6,r10,r6,lsr#8
+	strb	r4,[r14,#-15]
+	eor	r7,r11,r7,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-6]
+	ldrhsb	r11,[r12,#-2]
+	strb	r5,[r14,#-11]
+	eor	r4,r8,r4,lsr#8
+	strb	r6,[r14,#-7]
+	eor	r5,r9,r5,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-13]		@ load more input
+	ldrhsb	r9,[r12,#-9]
+	strb	r7,[r14,#-3]
+	eor	r6,r10,r6,lsr#8
+	strb	r4,[r14,#-14]
+	eor	r7,r11,r7,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-5]
+	ldrhsb	r11,[r12,#-1]
+	strb	r5,[r14,#-10]
+	strb	r6,[r14,#-6]
+	eor	r4,r8,r4,lsr#8
+	strb	r7,[r14,#-2]
+	eor	r5,r9,r5,lsr#8
+	strb	r4,[r14,#-13]
+	eor	r6,r10,r6,lsr#8
+	strb	r5,[r14,#-9]
+	eor	r7,r11,r7,lsr#8
+	strb	r6,[r14,#-5]
+	strb	r7,[r14,#-1]
+	add	r8,sp,#4*(4+4)
+	ldmia	r8,{r8,r9,r10,r11}		@ load key material
+	ldmia	r0,{r0,r1,r2,r3,r4,r5,r6,r7}		@ load second half
+# ifdef	__thumb2__
+	itt	hi
+# endif
+	strhi	r10,[sp,#4*(16+10)]		@ copy "rx"
+	strhi	r11,[sp,#4*(16+11)]		@ copy "rx"
+	add	r0,r0,r8		@ accumulate key material
+	add	r1,r1,r9
+	add	r2,r2,r10
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r8,r8,r8		@ zero or ...
+	ldrhsb	r8,[r12],#16			@ ... load input
+	eorlo	r9,r9,r9
+	ldrhsb	r9,[r12,#-12]
+
+	add	r3,r3,r11
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r10,r10,r10
+	ldrhsb	r10,[r12,#-8]
+	eorlo	r11,r11,r11
+	ldrhsb	r11,[r12,#-4]
+
+	eor	r0,r8,r0		@ xor with input (or zero)
+	eor	r1,r9,r1
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-15]		@ load more input
+	ldrhsb	r9,[r12,#-11]
+	eor	r2,r10,r2
+	strb	r0,[r14],#16		@ store output
+	eor	r3,r11,r3
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-7]
+	ldrhsb	r11,[r12,#-3]
+	strb	r1,[r14,#-12]
+	eor	r0,r8,r0,lsr#8
+	strb	r2,[r14,#-8]
+	eor	r1,r9,r1,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-14]		@ load more input
+	ldrhsb	r9,[r12,#-10]
+	strb	r3,[r14,#-4]
+	eor	r2,r10,r2,lsr#8
+	strb	r0,[r14,#-15]
+	eor	r3,r11,r3,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-6]
+	ldrhsb	r11,[r12,#-2]
+	strb	r1,[r14,#-11]
+	eor	r0,r8,r0,lsr#8
+	strb	r2,[r14,#-7]
+	eor	r1,r9,r1,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-13]		@ load more input
+	ldrhsb	r9,[r12,#-9]
+	strb	r3,[r14,#-3]
+	eor	r2,r10,r2,lsr#8
+	strb	r0,[r14,#-14]
+	eor	r3,r11,r3,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-5]
+	ldrhsb	r11,[r12,#-1]
+	strb	r1,[r14,#-10]
+	strb	r2,[r14,#-6]
+	eor	r0,r8,r0,lsr#8
+	strb	r3,[r14,#-2]
+	eor	r1,r9,r1,lsr#8
+	strb	r0,[r14,#-13]
+	eor	r2,r10,r2,lsr#8
+	strb	r1,[r14,#-9]
+	eor	r3,r11,r3,lsr#8
+	strb	r2,[r14,#-5]
+	strb	r3,[r14,#-1]
+	add	r8,sp,#4*(4+8)
+	ldmia	r8,{r8,r9,r10,r11}		@ load key material
+	add	r4,r4,r8		@ accumulate key material
+# ifdef	__thumb2__
+	itt	hi
+# endif
+	addhi	r8,r8,#1			@ next counter value
+	strhi	r8,[sp,#4*(12)]		@ save next counter value
+	add	r5,r5,r9
+	add	r6,r6,r10
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r8,r8,r8		@ zero or ...
+	ldrhsb	r8,[r12],#16			@ ... load input
+	eorlo	r9,r9,r9
+	ldrhsb	r9,[r12,#-12]
+
+	add	r7,r7,r11
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r10,r10,r10
+	ldrhsb	r10,[r12,#-8]
+	eorlo	r11,r11,r11
+	ldrhsb	r11,[r12,#-4]
+
+	eor	r4,r8,r4		@ xor with input (or zero)
+	eor	r5,r9,r5
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-15]		@ load more input
+	ldrhsb	r9,[r12,#-11]
+	eor	r6,r10,r6
+	strb	r4,[r14],#16		@ store output
+	eor	r7,r11,r7
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-7]
+	ldrhsb	r11,[r12,#-3]
+	strb	r5,[r14,#-12]
+	eor	r4,r8,r4,lsr#8
+	strb	r6,[r14,#-8]
+	eor	r5,r9,r5,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-14]		@ load more input
+	ldrhsb	r9,[r12,#-10]
+	strb	r7,[r14,#-4]
+	eor	r6,r10,r6,lsr#8
+	strb	r4,[r14,#-15]
+	eor	r7,r11,r7,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-6]
+	ldrhsb	r11,[r12,#-2]
+	strb	r5,[r14,#-11]
+	eor	r4,r8,r4,lsr#8
+	strb	r6,[r14,#-7]
+	eor	r5,r9,r5,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-13]		@ load more input
+	ldrhsb	r9,[r12,#-9]
+	strb	r7,[r14,#-3]
+	eor	r6,r10,r6,lsr#8
+	strb	r4,[r14,#-14]
+	eor	r7,r11,r7,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-5]
+	ldrhsb	r11,[r12,#-1]
+	strb	r5,[r14,#-10]
+	strb	r6,[r14,#-6]
+	eor	r4,r8,r4,lsr#8
+	strb	r7,[r14,#-2]
+	eor	r5,r9,r5,lsr#8
+	strb	r4,[r14,#-13]
+	eor	r6,r10,r6,lsr#8
+	strb	r5,[r14,#-9]
+	eor	r7,r11,r7,lsr#8
+	strb	r6,[r14,#-5]
+	strb	r7,[r14,#-1]
+# ifdef	__thumb2__
+	it	ne
+# endif
+	ldrne	r8,[sp,#4*(32+2)]		@ re-load len
+# ifdef	__thumb2__
+	it	hs
+# endif
+	subhs	r11,r8,#64			@ len-=64
+	bhi	.Loop_outer
+
+	beq	.Ldone
+#endif
+
+.Ltail:
+	ldr	r12,[sp,#4*(32+1)]	@ load inp
+	add	r9,sp,#4*(0)
+	ldr	r14,[sp,#4*(32+0)]	@ load out
+
+.Loop_tail:
+	ldrb	r10,[r9],#1	@ read buffer on stack
+	ldrb	r11,[r12],#1		@ read input
+	subs	r8,r8,#1
+	eor	r11,r11,r10
+	strb	r11,[r14],#1		@ store output
+	bne	.Loop_tail
+
+.Ldone:
+	add	sp,sp,#4*(32+3)
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
+.size	ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.globl	ChaCha20_ctr32_neon
+.hidden	ChaCha20_ctr32_neon
+.type	ChaCha20_ctr32_neon,%function
+.align	5
+ChaCha20_ctr32_neon:
+	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
+	stmdb	sp!,{r0,r1,r2,r4-r11,lr}
+	adr	r14,.Lsigma
+	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ ABI spec says so
+	stmdb	sp!,{r0,r1,r2,r3}
+
+	vld1.32	{q1,q2},[r3]		@ load key
+	ldmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}		@ load key
+
+	sub	sp,sp,#4*(16+16)
+	vld1.32	{q3},[r12]		@ load counter and nonce
+	add	r12,sp,#4*8
+	ldmia	r14,{r0,r1,r2,r3}		@ load sigma
+	vld1.32	{q0},[r14]!		@ load sigma
+	vld1.32	{q12},[r14]		@ one
+	vst1.32	{q2,q3},[r12]		@ copy 1/2key|counter|nonce
+	vst1.32	{q0,q1},[sp]		@ copy sigma|1/2key
+
+	str	r10,[sp,#4*(16+10)]	@ off-load "rx"
+	str	r11,[sp,#4*(16+11)]	@ off-load "rx"
+	vshl.i32	d26,d24,#1	@ two
+	vstr	d24,[sp,#4*(16+0)]
+	vshl.i32	d28,d24,#2	@ four
+	vstr	d26,[sp,#4*(16+2)]
+	vmov	q4,q0
+	vstr	d28,[sp,#4*(16+4)]
+	vmov	q8,q0
+	vmov	q5,q1
+	vmov	q9,q1
+	b	.Loop_neon_enter
+
+.align	4
+.Loop_neon_outer:
+	ldmia	sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}		@ load key material
+	cmp	r11,#64*2		@ if len<=64*2
+	bls	.Lbreak_neon		@ switch to integer-only
+	vmov	q4,q0
+	str	r11,[sp,#4*(32+2)]	@ save len
+	vmov	q8,q0
+	str	r12,  [sp,#4*(32+1)]	@ save inp
+	vmov	q5,q1
+	str	r14,  [sp,#4*(32+0)]	@ save out
+	vmov	q9,q1
+.Loop_neon_enter:
+	ldr	r11, [sp,#4*(15)]
+	vadd.i32	q7,q3,q12		@ counter+1
+	ldr	r12,[sp,#4*(12)]	@ modulo-scheduled load
+	vmov	q6,q2
+	ldr	r10, [sp,#4*(13)]
+	vmov	q10,q2
+	ldr	r14,[sp,#4*(14)]
+	vadd.i32	q11,q7,q12		@ counter+2
+	str	r11, [sp,#4*(16+15)]
+	mov	r11,#10
+	add	r12,r12,#3	@ counter+3
+	b	.Loop_neon
+
+.align	4
+.Loop_neon:
+	subs	r11,r11,#1
+	vadd.i32	q0,q0,q1
+	add	r0,r0,r4
+	vadd.i32	q4,q4,q5
+	mov	r12,r12,ror#16
+	vadd.i32	q8,q8,q9
+	add	r1,r1,r5
+	veor	q3,q3,q0
+	mov	r10,r10,ror#16
+	veor	q7,q7,q4
+	eor	r12,r12,r0,ror#16
+	veor	q11,q11,q8
+	eor	r10,r10,r1,ror#16
+	vrev32.16	q3,q3
+	add	r8,r8,r12
+	vrev32.16	q7,q7
+	mov	r4,r4,ror#20
+	vrev32.16	q11,q11
+	add	r9,r9,r10
+	vadd.i32	q2,q2,q3
+	mov	r5,r5,ror#20
+	vadd.i32	q6,q6,q7
+	eor	r4,r4,r8,ror#20
+	vadd.i32	q10,q10,q11
+	eor	r5,r5,r9,ror#20
+	veor	q12,q1,q2
+	add	r0,r0,r4
+	veor	q13,q5,q6
+	mov	r12,r12,ror#24
+	veor	q14,q9,q10
+	add	r1,r1,r5
+	vshr.u32	q1,q12,#20
+	mov	r10,r10,ror#24
+	vshr.u32	q5,q13,#20
+	eor	r12,r12,r0,ror#24
+	vshr.u32	q9,q14,#20
+	eor	r10,r10,r1,ror#24
+	vsli.32	q1,q12,#12
+	add	r8,r8,r12
+	vsli.32	q5,q13,#12
+	mov	r4,r4,ror#25
+	vsli.32	q9,q14,#12
+	add	r9,r9,r10
+	vadd.i32	q0,q0,q1
+	mov	r5,r5,ror#25
+	vadd.i32	q4,q4,q5
+	str	r10,[sp,#4*(16+13)]
+	vadd.i32	q8,q8,q9
+	ldr	r10,[sp,#4*(16+15)]
+	veor	q12,q3,q0
+	eor	r4,r4,r8,ror#25
+	veor	q13,q7,q4
+	eor	r5,r5,r9,ror#25
+	veor	q14,q11,q8
+	str	r8,[sp,#4*(16+8)]
+	vshr.u32	q3,q12,#24
+	ldr	r8,[sp,#4*(16+10)]
+	vshr.u32	q7,q13,#24
+	add	r2,r2,r6
+	vshr.u32	q11,q14,#24
+	mov	r14,r14,ror#16
+	vsli.32	q3,q12,#8
+	str	r9,[sp,#4*(16+9)]
+	vsli.32	q7,q13,#8
+	ldr	r9,[sp,#4*(16+11)]
+	vsli.32	q11,q14,#8
+	add	r3,r3,r7
+	vadd.i32	q2,q2,q3
+	mov	r10,r10,ror#16
+	vadd.i32	q6,q6,q7
+	eor	r14,r14,r2,ror#16
+	vadd.i32	q10,q10,q11
+	eor	r10,r10,r3,ror#16
+	veor	q12,q1,q2
+	add	r8,r8,r14
+	veor	q13,q5,q6
+	mov	r6,r6,ror#20
+	veor	q14,q9,q10
+	add	r9,r9,r10
+	vshr.u32	q1,q12,#25
+	mov	r7,r7,ror#20
+	vshr.u32	q5,q13,#25
+	eor	r6,r6,r8,ror#20
+	vshr.u32	q9,q14,#25
+	eor	r7,r7,r9,ror#20
+	vsli.32	q1,q12,#7
+	add	r2,r2,r6
+	vsli.32	q5,q13,#7
+	mov	r14,r14,ror#24
+	vsli.32	q9,q14,#7
+	add	r3,r3,r7
+	vext.8	q2,q2,q2,#8
+	mov	r10,r10,ror#24
+	vext.8	q6,q6,q6,#8
+	eor	r14,r14,r2,ror#24
+	vext.8	q10,q10,q10,#8
+	eor	r10,r10,r3,ror#24
+	vext.8	q1,q1,q1,#4
+	add	r8,r8,r14
+	vext.8	q5,q5,q5,#4
+	mov	r6,r6,ror#25
+	vext.8	q9,q9,q9,#4
+	add	r9,r9,r10
+	vext.8	q3,q3,q3,#12
+	mov	r7,r7,ror#25
+	vext.8	q7,q7,q7,#12
+	eor	r6,r6,r8,ror#25
+	vext.8	q11,q11,q11,#12
+	eor	r7,r7,r9,ror#25
+	vadd.i32	q0,q0,q1
+	add	r0,r0,r5
+	vadd.i32	q4,q4,q5
+	mov	r10,r10,ror#16
+	vadd.i32	q8,q8,q9
+	add	r1,r1,r6
+	veor	q3,q3,q0
+	mov	r12,r12,ror#16
+	veor	q7,q7,q4
+	eor	r10,r10,r0,ror#16
+	veor	q11,q11,q8
+	eor	r12,r12,r1,ror#16
+	vrev32.16	q3,q3
+	add	r8,r8,r10
+	vrev32.16	q7,q7
+	mov	r5,r5,ror#20
+	vrev32.16	q11,q11
+	add	r9,r9,r12
+	vadd.i32	q2,q2,q3
+	mov	r6,r6,ror#20
+	vadd.i32	q6,q6,q7
+	eor	r5,r5,r8,ror#20
+	vadd.i32	q10,q10,q11
+	eor	r6,r6,r9,ror#20
+	veor	q12,q1,q2
+	add	r0,r0,r5
+	veor	q13,q5,q6
+	mov	r10,r10,ror#24
+	veor	q14,q9,q10
+	add	r1,r1,r6
+	vshr.u32	q1,q12,#20
+	mov	r12,r12,ror#24
+	vshr.u32	q5,q13,#20
+	eor	r10,r10,r0,ror#24
+	vshr.u32	q9,q14,#20
+	eor	r12,r12,r1,ror#24
+	vsli.32	q1,q12,#12
+	add	r8,r8,r10
+	vsli.32	q5,q13,#12
+	mov	r5,r5,ror#25
+	vsli.32	q9,q14,#12
+	str	r10,[sp,#4*(16+15)]
+	vadd.i32	q0,q0,q1
+	ldr	r10,[sp,#4*(16+13)]
+	vadd.i32	q4,q4,q5
+	add	r9,r9,r12
+	vadd.i32	q8,q8,q9
+	mov	r6,r6,ror#25
+	veor	q12,q3,q0
+	eor	r5,r5,r8,ror#25
+	veor	q13,q7,q4
+	eor	r6,r6,r9,ror#25
+	veor	q14,q11,q8
+	str	r8,[sp,#4*(16+10)]
+	vshr.u32	q3,q12,#24
+	ldr	r8,[sp,#4*(16+8)]
+	vshr.u32	q7,q13,#24
+	add	r2,r2,r7
+	vshr.u32	q11,q14,#24
+	mov	r10,r10,ror#16
+	vsli.32	q3,q12,#8
+	str	r9,[sp,#4*(16+11)]
+	vsli.32	q7,q13,#8
+	ldr	r9,[sp,#4*(16+9)]
+	vsli.32	q11,q14,#8
+	add	r3,r3,r4
+	vadd.i32	q2,q2,q3
+	mov	r14,r14,ror#16
+	vadd.i32	q6,q6,q7
+	eor	r10,r10,r2,ror#16
+	vadd.i32	q10,q10,q11
+	eor	r14,r14,r3,ror#16
+	veor	q12,q1,q2
+	add	r8,r8,r10
+	veor	q13,q5,q6
+	mov	r7,r7,ror#20
+	veor	q14,q9,q10
+	add	r9,r9,r14
+	vshr.u32	q1,q12,#25
+	mov	r4,r4,ror#20
+	vshr.u32	q5,q13,#25
+	eor	r7,r7,r8,ror#20
+	vshr.u32	q9,q14,#25
+	eor	r4,r4,r9,ror#20
+	vsli.32	q1,q12,#7
+	add	r2,r2,r7
+	vsli.32	q5,q13,#7
+	mov	r10,r10,ror#24
+	vsli.32	q9,q14,#7
+	add	r3,r3,r4
+	vext.8	q2,q2,q2,#8
+	mov	r14,r14,ror#24
+	vext.8	q6,q6,q6,#8
+	eor	r10,r10,r2,ror#24
+	vext.8	q10,q10,q10,#8
+	eor	r14,r14,r3,ror#24
+	vext.8	q1,q1,q1,#12
+	add	r8,r8,r10
+	vext.8	q5,q5,q5,#12
+	mov	r7,r7,ror#25
+	vext.8	q9,q9,q9,#12
+	add	r9,r9,r14
+	vext.8	q3,q3,q3,#4
+	mov	r4,r4,ror#25
+	vext.8	q7,q7,q7,#4
+	eor	r7,r7,r8,ror#25
+	vext.8	q11,q11,q11,#4
+	eor	r4,r4,r9,ror#25
+	bne	.Loop_neon
+
+	add	r11,sp,#32
+	vld1.32	{q12,q13},[sp]		@ load key material
+	vld1.32	{q14,q15},[r11]
+
+	ldr	r11,[sp,#4*(32+2)]	@ load len
+
+	str	r8, [sp,#4*(16+8)]	@ modulo-scheduled store
+	str	r9, [sp,#4*(16+9)]
+	str	r12,[sp,#4*(16+12)]
+	str	r10, [sp,#4*(16+13)]
+	str	r14,[sp,#4*(16+14)]
+
+	@ at this point we have first half of 512-bit result in
+	@ rx and second half at sp+4*(16+8)
+
+	ldr	r12,[sp,#4*(32+1)]	@ load inp
+	ldr	r14,[sp,#4*(32+0)]	@ load out
+
+	vadd.i32	q0,q0,q12		@ accumulate key material
+	vadd.i32	q4,q4,q12
+	vadd.i32	q8,q8,q12
+	vldr	d24,[sp,#4*(16+0)]	@ one
+
+	vadd.i32	q1,q1,q13
+	vadd.i32	q5,q5,q13
+	vadd.i32	q9,q9,q13
+	vldr	d26,[sp,#4*(16+2)]	@ two
+
+	vadd.i32	q2,q2,q14
+	vadd.i32	q6,q6,q14
+	vadd.i32	q10,q10,q14
+	vadd.i32	d14,d14,d24	@ counter+1
+	vadd.i32	d22,d22,d26	@ counter+2
+
+	vadd.i32	q3,q3,q15
+	vadd.i32	q7,q7,q15
+	vadd.i32	q11,q11,q15
+
+	cmp	r11,#64*4
+	blo	.Ltail_neon
+
+	vld1.8	{q12,q13},[r12]!	@ load input
+	mov	r11,sp
+	vld1.8	{q14,q15},[r12]!
+	veor	q0,q0,q12		@ xor with input
+	veor	q1,q1,q13
+	vld1.8	{q12,q13},[r12]!
+	veor	q2,q2,q14
+	veor	q3,q3,q15
+	vld1.8	{q14,q15},[r12]!
+
+	veor	q4,q4,q12
+	vst1.8	{q0,q1},[r14]!	@ store output
+	veor	q5,q5,q13
+	vld1.8	{q12,q13},[r12]!
+	veor	q6,q6,q14
+	vst1.8	{q2,q3},[r14]!
+	veor	q7,q7,q15
+	vld1.8	{q14,q15},[r12]!
+
+	veor	q8,q8,q12
+	vld1.32	{q0,q1},[r11]!	@ load for next iteration
+	veor	d25,d25,d25
+	vldr	d24,[sp,#4*(16+4)]	@ four
+	veor	q9,q9,q13
+	vld1.32	{q2,q3},[r11]
+	veor	q10,q10,q14
+	vst1.8	{q4,q5},[r14]!
+	veor	q11,q11,q15
+	vst1.8	{q6,q7},[r14]!
+
+	vadd.i32	d6,d6,d24	@ next counter value
+	vldr	d24,[sp,#4*(16+0)]	@ one
+
+	ldmia	sp,{r8,r9,r10,r11}	@ load key material
+	add	r0,r0,r8	@ accumulate key material
+	ldr	r8,[r12],#16		@ load input
+	vst1.8	{q8,q9},[r14]!
+	add	r1,r1,r9
+	ldr	r9,[r12,#-12]
+	vst1.8	{q10,q11},[r14]!
+	add	r2,r2,r10
+	ldr	r10,[r12,#-8]
+	add	r3,r3,r11
+	ldr	r11,[r12,#-4]
+# ifdef	__ARMEB__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+# endif
+	eor	r0,r0,r8	@ xor with input
+	add	r8,sp,#4*(4)
+	eor	r1,r1,r9
+	str	r0,[r14],#16		@ store output
+	eor	r2,r2,r10
+	str	r1,[r14,#-12]
+	eor	r3,r3,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+	str	r2,[r14,#-8]
+	str	r3,[r14,#-4]
+
+	add	r4,r4,r8	@ accumulate key material
+	ldr	r8,[r12],#16		@ load input
+	add	r5,r5,r9
+	ldr	r9,[r12,#-12]
+	add	r6,r6,r10
+	ldr	r10,[r12,#-8]
+	add	r7,r7,r11
+	ldr	r11,[r12,#-4]
+# ifdef	__ARMEB__
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+# endif
+	eor	r4,r4,r8
+	add	r8,sp,#4*(8)
+	eor	r5,r5,r9
+	str	r4,[r14],#16		@ store output
+	eor	r6,r6,r10
+	str	r5,[r14,#-12]
+	eor	r7,r7,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+	str	r6,[r14,#-8]
+	add	r0,sp,#4*(16+8)
+	str	r7,[r14,#-4]
+
+	ldmia	r0,{r0,r1,r2,r3,r4,r5,r6,r7}	@ load second half
+
+	add	r0,r0,r8	@ accumulate key material
+	ldr	r8,[r12],#16		@ load input
+	add	r1,r1,r9
+	ldr	r9,[r12,#-12]
+# ifdef	__thumb2__
+	it	hi
+# endif
+	strhi	r10,[sp,#4*(16+10)]	@ copy "rx" while at it
+	add	r2,r2,r10
+	ldr	r10,[r12,#-8]
+# ifdef	__thumb2__
+	it	hi
+# endif
+	strhi	r11,[sp,#4*(16+11)]	@ copy "rx" while at it
+	add	r3,r3,r11
+	ldr	r11,[r12,#-4]
+# ifdef	__ARMEB__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+# endif
+	eor	r0,r0,r8
+	add	r8,sp,#4*(12)
+	eor	r1,r1,r9
+	str	r0,[r14],#16		@ store output
+	eor	r2,r2,r10
+	str	r1,[r14,#-12]
+	eor	r3,r3,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+	str	r2,[r14,#-8]
+	str	r3,[r14,#-4]
+
+	add	r4,r4,r8	@ accumulate key material
+	add	r8,r8,#4		@ next counter value
+	add	r5,r5,r9
+	str	r8,[sp,#4*(12)]	@ save next counter value
+	ldr	r8,[r12],#16		@ load input
+	add	r6,r6,r10
+	add	r4,r4,#3		@ counter+3
+	ldr	r9,[r12,#-12]
+	add	r7,r7,r11
+	ldr	r10,[r12,#-8]
+	ldr	r11,[r12,#-4]
+# ifdef	__ARMEB__
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+# endif
+	eor	r4,r4,r8
+# ifdef	__thumb2__
+	it	hi
+# endif
+	ldrhi	r8,[sp,#4*(32+2)]	@ re-load len
+	eor	r5,r5,r9
+	eor	r6,r6,r10
+	str	r4,[r14],#16		@ store output
+	eor	r7,r7,r11
+	str	r5,[r14,#-12]
+	sub	r11,r8,#64*4	@ len-=64*4
+	str	r6,[r14,#-8]
+	str	r7,[r14,#-4]
+	bhi	.Loop_neon_outer
+
+	b	.Ldone_neon
+
+.align	4
+.Lbreak_neon:
+	@ harmonize NEON and integer-only stack frames: load data
+	@ from NEON frame, but save to integer-only one; distance
+	@ between the two is 4*(32+4+16-32)=4*(20).
+
+	str	r11, [sp,#4*(20+32+2)]	@ save len
+	add	r11,sp,#4*(32+4)
+	str	r12,   [sp,#4*(20+32+1)]	@ save inp
+	str	r14,   [sp,#4*(20+32+0)]	@ save out
+
+	ldr	r12,[sp,#4*(16+10)]
+	ldr	r14,[sp,#4*(16+11)]
+	vldmia	r11,{d8,d9,d10,d11,d12,d13,d14,d15}			@ fulfill ABI requirement
+	str	r12,[sp,#4*(20+16+10)]	@ copy "rx"
+	str	r14,[sp,#4*(20+16+11)]	@ copy "rx"
+
+	ldr	r11, [sp,#4*(15)]
+	ldr	r12,[sp,#4*(12)]		@ modulo-scheduled load
+	ldr	r10, [sp,#4*(13)]
+	ldr	r14,[sp,#4*(14)]
+	str	r11, [sp,#4*(20+16+15)]
+	add	r11,sp,#4*(20)
+	vst1.32	{q0,q1},[r11]!		@ copy key
+	add	sp,sp,#4*(20)			@ switch frame
+	vst1.32	{q2,q3},[r11]
+	mov	r11,#10
+	b	.Loop				@ go integer-only
+
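+	@ Fewer than 256 bytes remain: emit any whole 64-byte blocks,
+	@ then XOR the final partial block byte by byte via a stack copy.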
+.align	4
+.Ltail_neon:
+	cmp	r11,#64*3
+	bhs	.L192_or_more_neon
+	cmp	r11,#64*2
+	bhs	.L128_or_more_neon
+	cmp	r11,#64*1
+	bhs	.L64_or_more_neon
+
+	add	r8,sp,#4*(8)
+	vst1.8	{q0,q1},[sp]
+	add	r10,sp,#4*(0)
+	vst1.8	{q2,q3},[r8]
+	b	.Loop_tail_neon
+
+.align	4
+.L64_or_more_neon:
+	vld1.8	{q12,q13},[r12]!
+	vld1.8	{q14,q15},[r12]!
+	veor	q0,q0,q12
+	veor	q1,q1,q13
+	veor	q2,q2,q14
+	veor	q3,q3,q15
+	vst1.8	{q0,q1},[r14]!
+	vst1.8	{q2,q3},[r14]!
+
+	beq	.Ldone_neon
+
+	add	r8,sp,#4*(8)
+	vst1.8	{q4,q5},[sp]
+	add	r10,sp,#4*(0)
+	vst1.8	{q6,q7},[r8]
+	sub	r11,r11,#64*1	@ len-=64*1
+	b	.Loop_tail_neon
+
+.align	4
+.L128_or_more_neon:
+	vld1.8	{q12,q13},[r12]!
+	vld1.8	{q14,q15},[r12]!
+	veor	q0,q0,q12
+	veor	q1,q1,q13
+	vld1.8	{q12,q13},[r12]!
+	veor	q2,q2,q14
+	veor	q3,q3,q15
+	vld1.8	{q14,q15},[r12]!
+
+	veor	q4,q4,q12
+	veor	q5,q5,q13
+	vst1.8	{q0,q1},[r14]!
+	veor	q6,q6,q14
+	vst1.8	{q2,q3},[r14]!
+	veor	q7,q7,q15
+	vst1.8	{q4,q5},[r14]!
+	vst1.8	{q6,q7},[r14]!
+
+	beq	.Ldone_neon
+
+	add	r8,sp,#4*(8)
+	vst1.8	{q8,q9},[sp]
+	add	r10,sp,#4*(0)
+	vst1.8	{q10,q11},[r8]
+	sub	r11,r11,#64*2	@ len-=64*2
+	b	.Loop_tail_neon
+
+.align	4
+.L192_or_more_neon:
+	vld1.8	{q12,q13},[r12]!
+	vld1.8	{q14,q15},[r12]!
+	veor	q0,q0,q12
+	veor	q1,q1,q13
+	vld1.8	{q12,q13},[r12]!
+	veor	q2,q2,q14
+	veor	q3,q3,q15
+	vld1.8	{q14,q15},[r12]!
+
+	veor	q4,q4,q12
+	veor	q5,q5,q13
+	vld1.8	{q12,q13},[r12]!
+	veor	q6,q6,q14
+	vst1.8	{q0,q1},[r14]!
+	veor	q7,q7,q15
+	vld1.8	{q14,q15},[r12]!
+
+	veor	q8,q8,q12
+	vst1.8	{q2,q3},[r14]!
+	veor	q9,q9,q13
+	vst1.8	{q4,q5},[r14]!
+	veor	q10,q10,q14
+	vst1.8	{q6,q7},[r14]!
+	veor	q11,q11,q15
+	vst1.8	{q8,q9},[r14]!
+	vst1.8	{q10,q11},[r14]!
+
+	beq	.Ldone_neon
+
+	ldmia	sp,{r8,r9,r10,r11}	@ load key material
+	add	r0,r0,r8	@ accumulate key material
+	add	r8,sp,#4*(4)
+	add	r1,r1,r9
+	add	r2,r2,r10
+	add	r3,r3,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+
+	add	r4,r4,r8	@ accumulate key material
+	add	r8,sp,#4*(8)
+	add	r5,r5,r9
+	add	r6,r6,r10
+	add	r7,r7,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+# ifdef	__ARMEB__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+# endif
+	stmia	sp,{r0,r1,r2,r3,r4,r5,r6,r7}
+	add	r0,sp,#4*(16+8)
+
+	ldmia	r0,{r0,r1,r2,r3,r4,r5,r6,r7}	@ load second half
+
+	add	r0,r0,r8	@ accumulate key material
+	add	r8,sp,#4*(12)
+	add	r1,r1,r9
+	add	r2,r2,r10
+	add	r3,r3,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+
+	add	r4,r4,r8	@ accumulate key material
+	add	r8,sp,#4*(8)
+	add	r5,r5,r9
+	add	r4,r4,#3		@ counter+3
+	add	r6,r6,r10
+	add	r7,r7,r11
+	ldr	r11,[sp,#4*(32+2)]	@ re-load len
+# ifdef	__ARMEB__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+# endif
+	stmia	r8,{r0,r1,r2,r3,r4,r5,r6,r7}
+	add	r10,sp,#4*(0)
+	sub	r11,r11,#64*3	@ len-=64*3
+
+.Loop_tail_neon:
+	ldrb	r8,[r10],#1	@ read buffer on stack
+	ldrb	r9,[r12],#1		@ read input
+	subs	r11,r11,#1
+	eor	r8,r8,r9
+	strb	r8,[r14],#1		@ store output
+	bne	.Loop_tail_neon
+
+.Ldone_neon:
+	add	sp,sp,#4*(32+4)
+	vldmia	sp,{d8,d9,d10,d11,d12,d13,d14,d15}
+	add	sp,sp,#4*(16+3)
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
+.size	ChaCha20_ctr32_neon,.-ChaCha20_ctr32_neon
+#endif
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__)
diff --git a/gen/crypto/chacha-armv8-apple.S b/gen/crypto/chacha-armv8-apple.S
new file mode 100644
index 0000000..3807631
--- /dev/null
+++ b/gen/crypto/chacha-armv8-apple.S
@@ -0,0 +1,1968 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+#include <openssl/arm_arch.h>
+
+.section	__TEXT,__const
+
+.align	5
+Lsigma:
+.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
+Lone:
+.long	1,0,0,0
+.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+
+.text
+
+.globl	_ChaCha20_ctr32_nohw
+.private_extern	_ChaCha20_ctr32_nohw
+
+.align	5
+_ChaCha20_ctr32_nohw:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+
+	adrp	x5,Lsigma@PAGE
+	add	x5,x5,Lsigma@PAGEOFF
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#64
+
+	ldp	x22,x23,[x5]		// load sigma
+	ldp	x24,x25,[x3]		// load key
+	ldp	x26,x27,[x3,#16]
+	ldp	x28,x30,[x4]		// load counter
+#ifdef	__AARCH64EB__
+	ror	x24,x24,#32
+	ror	x25,x25,#32
+	ror	x26,x26,#32
+	ror	x27,x27,#32
+	ror	x28,x28,#32
+	ror	x30,x30,#32
+#endif
+
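+// Scalar path: one 64-byte block per iteration. The packed 64-bit
+// state pairs in x22-x28 and x30 are unpacked into w5-w21 for the
+// rounds (x18, the platform register, is skipped).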
+Loop_outer:
+	mov	w5,w22			// unpack key block
+	lsr	x6,x22,#32
+	mov	w7,w23
+	lsr	x8,x23,#32
+	mov	w9,w24
+	lsr	x10,x24,#32
+	mov	w11,w25
+	lsr	x12,x25,#32
+	mov	w13,w26
+	lsr	x14,x26,#32
+	mov	w15,w27
+	lsr	x16,x27,#32
+	mov	w17,w28
+	lsr	x19,x28,#32
+	mov	w20,w30
+	lsr	x21,x30,#32
+
+	mov	x4,#10
+	subs	x2,x2,#64
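+	// ten double rounds (20 ChaCha20 rounds) per block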
+Loop:
+	sub	x4,x4,#1
+	add	w5,w5,w9
+	add	w6,w6,w10
+	add	w7,w7,w11
+	add	w8,w8,w12
+	eor	w17,w17,w5
+	eor	w19,w19,w6
+	eor	w20,w20,w7
+	eor	w21,w21,w8
+	ror	w17,w17,#16
+	ror	w19,w19,#16
+	ror	w20,w20,#16
+	ror	w21,w21,#16
+	add	w13,w13,w17
+	add	w14,w14,w19
+	add	w15,w15,w20
+	add	w16,w16,w21
+	eor	w9,w9,w13
+	eor	w10,w10,w14
+	eor	w11,w11,w15
+	eor	w12,w12,w16
+	ror	w9,w9,#20
+	ror	w10,w10,#20
+	ror	w11,w11,#20
+	ror	w12,w12,#20
+	add	w5,w5,w9
+	add	w6,w6,w10
+	add	w7,w7,w11
+	add	w8,w8,w12
+	eor	w17,w17,w5
+	eor	w19,w19,w6
+	eor	w20,w20,w7
+	eor	w21,w21,w8
+	ror	w17,w17,#24
+	ror	w19,w19,#24
+	ror	w20,w20,#24
+	ror	w21,w21,#24
+	add	w13,w13,w17
+	add	w14,w14,w19
+	add	w15,w15,w20
+	add	w16,w16,w21
+	eor	w9,w9,w13
+	eor	w10,w10,w14
+	eor	w11,w11,w15
+	eor	w12,w12,w16
+	ror	w9,w9,#25
+	ror	w10,w10,#25
+	ror	w11,w11,#25
+	ror	w12,w12,#25
+	add	w5,w5,w10
+	add	w6,w6,w11
+	add	w7,w7,w12
+	add	w8,w8,w9
+	eor	w21,w21,w5
+	eor	w17,w17,w6
+	eor	w19,w19,w7
+	eor	w20,w20,w8
+	ror	w21,w21,#16
+	ror	w17,w17,#16
+	ror	w19,w19,#16
+	ror	w20,w20,#16
+	add	w15,w15,w21
+	add	w16,w16,w17
+	add	w13,w13,w19
+	add	w14,w14,w20
+	eor	w10,w10,w15
+	eor	w11,w11,w16
+	eor	w12,w12,w13
+	eor	w9,w9,w14
+	ror	w10,w10,#20
+	ror	w11,w11,#20
+	ror	w12,w12,#20
+	ror	w9,w9,#20
+	add	w5,w5,w10
+	add	w6,w6,w11
+	add	w7,w7,w12
+	add	w8,w8,w9
+	eor	w21,w21,w5
+	eor	w17,w17,w6
+	eor	w19,w19,w7
+	eor	w20,w20,w8
+	ror	w21,w21,#24
+	ror	w17,w17,#24
+	ror	w19,w19,#24
+	ror	w20,w20,#24
+	add	w15,w15,w21
+	add	w16,w16,w17
+	add	w13,w13,w19
+	add	w14,w14,w20
+	eor	w10,w10,w15
+	eor	w11,w11,w16
+	eor	w12,w12,w13
+	eor	w9,w9,w14
+	ror	w10,w10,#25
+	ror	w11,w11,#25
+	ror	w12,w12,#25
+	ror	w9,w9,#25
+	cbnz	x4,Loop
+
+	add	w5,w5,w22		// accumulate key block
+	add	x6,x6,x22,lsr#32
+	add	w7,w7,w23
+	add	x8,x8,x23,lsr#32
+	add	w9,w9,w24
+	add	x10,x10,x24,lsr#32
+	add	w11,w11,w25
+	add	x12,x12,x25,lsr#32
+	add	w13,w13,w26
+	add	x14,x14,x26,lsr#32
+	add	w15,w15,w27
+	add	x16,x16,x27,lsr#32
+	add	w17,w17,w28
+	add	x19,x19,x28,lsr#32
+	add	w20,w20,w30
+	add	x21,x21,x30,lsr#32
+
+	b.lo	Ltail
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__AARCH64EB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	x15,x15,x16
+	eor	x17,x17,x19
+	eor	x20,x20,x21
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#1			// increment counter
+	stp	x9,x11,[x0,#16]
+	stp	x13,x15,[x0,#32]
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+
+	b.hi	Loop_outer
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
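+// Fewer than 64 bytes remain: spill the keystream block to the
+// stack, XOR byte by byte, then wipe the stack copy.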
+.align	4
+Ltail:
+	add	x2,x2,#64
+Less_than_64:
+	sub	x0,x0,#1
+	add	x1,x1,x2
+	add	x0,x0,x2
+	add	x4,sp,x2
+	neg	x2,x2
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+#ifdef	__AARCH64EB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	stp	x5,x7,[sp,#0]
+	stp	x9,x11,[sp,#16]
+	stp	x13,x15,[sp,#32]
+	stp	x17,x20,[sp,#48]
+
+Loop_tail:
+	ldrb	w10,[x1,x2]
+	ldrb	w11,[x4,x2]
+	add	x2,x2,#1
+	eor	w10,w10,w11
+	strb	w10,[x0,x2]
+	cbnz	x2,Loop_tail
+
+	stp	xzr,xzr,[sp,#0]
+	stp	xzr,xzr,[sp,#16]
+	stp	xzr,xzr,[sp,#32]
+	stp	xzr,xzr,[sp,#48]
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+.globl	_ChaCha20_ctr32_neon
+.private_extern	_ChaCha20_ctr32_neon
+
+.align	5
+_ChaCha20_ctr32_neon:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+
+	adrp	x5,Lsigma@PAGE
+	add	x5,x5,Lsigma@PAGEOFF
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	cmp	x2,#512
+	b.hs	L512_or_more_neon
+
+	sub	sp,sp,#64
+
+	ldp	x22,x23,[x5]		// load sigma
+	ld1	{v24.4s},[x5],#16
+	ldp	x24,x25,[x3]		// load key
+	ldp	x26,x27,[x3,#16]
+	ld1	{v25.4s,v26.4s},[x3]
+	ldp	x28,x30,[x4]		// load counter
+	ld1	{v27.4s},[x4]
+	ld1	{v31.4s},[x5]
+#ifdef	__AARCH64EB__
+	rev64	v24.4s,v24.4s
+	ror	x24,x24,#32
+	ror	x25,x25,#32
+	ror	x26,x26,#32
+	ror	x27,x27,#32
+	ror	x28,x28,#32
+	ror	x30,x30,#32
+#endif
+	add	v27.4s,v27.4s,v31.4s		// += 1
+	add	v28.4s,v27.4s,v31.4s
+	add	v29.4s,v28.4s,v31.4s
+	shl	v31.4s,v31.4s,#2			// 1 -> 4
+
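+// NEON main loop: 256 bytes (four blocks) per iteration. Three
+// blocks sit in vector registers (v0-v7 and v16-v19), a fourth on
+// the scalar unit, with scalar and vector instructions interleaved
+// to keep both pipelines busy.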
+Loop_outer_neon:
+	mov	w5,w22			// unpack key block
+	lsr	x6,x22,#32
+	mov	v0.16b,v24.16b
+	mov	w7,w23
+	lsr	x8,x23,#32
+	mov	v4.16b,v24.16b
+	mov	w9,w24
+	lsr	x10,x24,#32
+	mov	v16.16b,v24.16b
+	mov	w11,w25
+	mov	v1.16b,v25.16b
+	lsr	x12,x25,#32
+	mov	v5.16b,v25.16b
+	mov	w13,w26
+	mov	v17.16b,v25.16b
+	lsr	x14,x26,#32
+	mov	v3.16b,v27.16b
+	mov	w15,w27
+	mov	v7.16b,v28.16b
+	lsr	x16,x27,#32
+	mov	v19.16b,v29.16b
+	mov	w17,w28
+	mov	v2.16b,v26.16b
+	lsr	x19,x28,#32
+	mov	v6.16b,v26.16b
+	mov	w20,w30
+	mov	v18.16b,v26.16b
+	lsr	x21,x30,#32
+
+	mov	x4,#10
+	subs	x2,x2,#256
+Loop_neon:
+	sub	x4,x4,#1
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v16.4s,v16.4s,v17.4s
+	add	w7,w7,w11
+	eor	v3.16b,v3.16b,v0.16b
+	add	w8,w8,w12
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w17,w17,w5
+	eor	v19.16b,v19.16b,v16.16b
+	eor	w19,w19,w6
+	rev32	v3.8h,v3.8h
+	eor	w20,w20,w7
+	rev32	v7.8h,v7.8h
+	eor	w21,w21,w8
+	rev32	v19.8h,v19.8h
+	ror	w17,w17,#16
+	add	v2.4s,v2.4s,v3.4s
+	ror	w19,w19,#16
+	add	v6.4s,v6.4s,v7.4s
+	ror	w20,w20,#16
+	add	v18.4s,v18.4s,v19.4s
+	ror	w21,w21,#16
+	eor	v20.16b,v1.16b,v2.16b
+	add	w13,w13,w17
+	eor	v21.16b,v5.16b,v6.16b
+	add	w14,w14,w19
+	eor	v22.16b,v17.16b,v18.16b
+	add	w15,w15,w20
+	ushr	v1.4s,v20.4s,#20
+	add	w16,w16,w21
+	ushr	v5.4s,v21.4s,#20
+	eor	w9,w9,w13
+	ushr	v17.4s,v22.4s,#20
+	eor	w10,w10,w14
+	sli	v1.4s,v20.4s,#12
+	eor	w11,w11,w15
+	sli	v5.4s,v21.4s,#12
+	eor	w12,w12,w16
+	sli	v17.4s,v22.4s,#12
+	ror	w9,w9,#20
+	add	v0.4s,v0.4s,v1.4s
+	ror	w10,w10,#20
+	add	v4.4s,v4.4s,v5.4s
+	ror	w11,w11,#20
+	add	v16.4s,v16.4s,v17.4s
+	ror	w12,w12,#20
+	eor	v20.16b,v3.16b,v0.16b
+	add	w5,w5,w9
+	eor	v21.16b,v7.16b,v4.16b
+	add	w6,w6,w10
+	eor	v22.16b,v19.16b,v16.16b
+	add	w7,w7,w11
+	ushr	v3.4s,v20.4s,#24
+	add	w8,w8,w12
+	ushr	v7.4s,v21.4s,#24
+	eor	w17,w17,w5
+	ushr	v19.4s,v22.4s,#24
+	eor	w19,w19,w6
+	sli	v3.4s,v20.4s,#8
+	eor	w20,w20,w7
+	sli	v7.4s,v21.4s,#8
+	eor	w21,w21,w8
+	sli	v19.4s,v22.4s,#8
+	ror	w17,w17,#24
+	add	v2.4s,v2.4s,v3.4s
+	ror	w19,w19,#24
+	add	v6.4s,v6.4s,v7.4s
+	ror	w20,w20,#24
+	add	v18.4s,v18.4s,v19.4s
+	ror	w21,w21,#24
+	eor	v20.16b,v1.16b,v2.16b
+	add	w13,w13,w17
+	eor	v21.16b,v5.16b,v6.16b
+	add	w14,w14,w19
+	eor	v22.16b,v17.16b,v18.16b
+	add	w15,w15,w20
+	ushr	v1.4s,v20.4s,#25
+	add	w16,w16,w21
+	ushr	v5.4s,v21.4s,#25
+	eor	w9,w9,w13
+	ushr	v17.4s,v22.4s,#25
+	eor	w10,w10,w14
+	sli	v1.4s,v20.4s,#7
+	eor	w11,w11,w15
+	sli	v5.4s,v21.4s,#7
+	eor	w12,w12,w16
+	sli	v17.4s,v22.4s,#7
+	ror	w9,w9,#25
+	ext	v2.16b,v2.16b,v2.16b,#8
+	ror	w10,w10,#25
+	ext	v6.16b,v6.16b,v6.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v3.16b,v3.16b,v3.16b,#12
+	ext	v7.16b,v7.16b,v7.16b,#12
+	ext	v19.16b,v19.16b,v19.16b,#12
+	ext	v1.16b,v1.16b,v1.16b,#4
+	ext	v5.16b,v5.16b,v5.16b,#4
+	ext	v17.16b,v17.16b,v17.16b,#4
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w10
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w11
+	add	v16.4s,v16.4s,v17.4s
+	add	w7,w7,w12
+	eor	v3.16b,v3.16b,v0.16b
+	add	w8,w8,w9
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w5
+	eor	v19.16b,v19.16b,v16.16b
+	eor	w17,w17,w6
+	rev32	v3.8h,v3.8h
+	eor	w19,w19,w7
+	rev32	v7.8h,v7.8h
+	eor	w20,w20,w8
+	rev32	v19.8h,v19.8h
+	ror	w21,w21,#16
+	add	v2.4s,v2.4s,v3.4s
+	ror	w17,w17,#16
+	add	v6.4s,v6.4s,v7.4s
+	ror	w19,w19,#16
+	add	v18.4s,v18.4s,v19.4s
+	ror	w20,w20,#16
+	eor	v20.16b,v1.16b,v2.16b
+	add	w15,w15,w21
+	eor	v21.16b,v5.16b,v6.16b
+	add	w16,w16,w17
+	eor	v22.16b,v17.16b,v18.16b
+	add	w13,w13,w19
+	ushr	v1.4s,v20.4s,#20
+	add	w14,w14,w20
+	ushr	v5.4s,v21.4s,#20
+	eor	w10,w10,w15
+	ushr	v17.4s,v22.4s,#20
+	eor	w11,w11,w16
+	sli	v1.4s,v20.4s,#12
+	eor	w12,w12,w13
+	sli	v5.4s,v21.4s,#12
+	eor	w9,w9,w14
+	sli	v17.4s,v22.4s,#12
+	ror	w10,w10,#20
+	add	v0.4s,v0.4s,v1.4s
+	ror	w11,w11,#20
+	add	v4.4s,v4.4s,v5.4s
+	ror	w12,w12,#20
+	add	v16.4s,v16.4s,v17.4s
+	ror	w9,w9,#20
+	eor	v20.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v21.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v22.16b,v19.16b,v16.16b
+	add	w7,w7,w12
+	ushr	v3.4s,v20.4s,#24
+	add	w8,w8,w9
+	ushr	v7.4s,v21.4s,#24
+	eor	w21,w21,w5
+	ushr	v19.4s,v22.4s,#24
+	eor	w17,w17,w6
+	sli	v3.4s,v20.4s,#8
+	eor	w19,w19,w7
+	sli	v7.4s,v21.4s,#8
+	eor	w20,w20,w8
+	sli	v19.4s,v22.4s,#8
+	ror	w21,w21,#24
+	add	v2.4s,v2.4s,v3.4s
+	ror	w17,w17,#24
+	add	v6.4s,v6.4s,v7.4s
+	ror	w19,w19,#24
+	add	v18.4s,v18.4s,v19.4s
+	ror	w20,w20,#24
+	eor	v20.16b,v1.16b,v2.16b
+	add	w15,w15,w21
+	eor	v21.16b,v5.16b,v6.16b
+	add	w16,w16,w17
+	eor	v22.16b,v17.16b,v18.16b
+	add	w13,w13,w19
+	ushr	v1.4s,v20.4s,#25
+	add	w14,w14,w20
+	ushr	v5.4s,v21.4s,#25
+	eor	w10,w10,w15
+	ushr	v17.4s,v22.4s,#25
+	eor	w11,w11,w16
+	sli	v1.4s,v20.4s,#7
+	eor	w12,w12,w13
+	sli	v5.4s,v21.4s,#7
+	eor	w9,w9,w14
+	sli	v17.4s,v22.4s,#7
+	ror	w10,w10,#25
+	ext	v2.16b,v2.16b,v2.16b,#8
+	ror	w11,w11,#25
+	ext	v6.16b,v6.16b,v6.16b,#8
+	ror	w12,w12,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#4
+	ext	v7.16b,v7.16b,v7.16b,#4
+	ext	v19.16b,v19.16b,v19.16b,#4
+	ext	v1.16b,v1.16b,v1.16b,#12
+	ext	v5.16b,v5.16b,v5.16b,#12
+	ext	v17.16b,v17.16b,v17.16b,#12
+	cbnz	x4,Loop_neon
+
+	add	w5,w5,w22		// accumulate key block
+	add	v0.4s,v0.4s,v24.4s
+	add	x6,x6,x22,lsr#32
+	add	v4.4s,v4.4s,v24.4s
+	add	w7,w7,w23
+	add	v16.4s,v16.4s,v24.4s
+	add	x8,x8,x23,lsr#32
+	add	v2.4s,v2.4s,v26.4s
+	add	w9,w9,w24
+	add	v6.4s,v6.4s,v26.4s
+	add	x10,x10,x24,lsr#32
+	add	v18.4s,v18.4s,v26.4s
+	add	w11,w11,w25
+	add	v3.4s,v3.4s,v27.4s
+	add	x12,x12,x25,lsr#32
+	add	w13,w13,w26
+	add	v7.4s,v7.4s,v28.4s
+	add	x14,x14,x26,lsr#32
+	add	w15,w15,w27
+	add	v19.4s,v19.4s,v29.4s
+	add	x16,x16,x27,lsr#32
+	add	w17,w17,w28
+	add	v1.4s,v1.4s,v25.4s
+	add	x19,x19,x28,lsr#32
+	add	w20,w20,w30
+	add	v5.4s,v5.4s,v25.4s
+	add	x21,x21,x30,lsr#32
+	add	v17.4s,v17.4s,v25.4s
+
+	b.lo	Ltail_neon
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__AARCH64EB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	v0.16b,v0.16b,v20.16b
+	eor	x15,x15,x16
+	eor	v1.16b,v1.16b,v21.16b
+	eor	x17,x17,x19
+	eor	v2.16b,v2.16b,v22.16b
+	eor	x20,x20,x21
+	eor	v3.16b,v3.16b,v23.16b
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#4			// increment counter
+	stp	x9,x11,[x0,#16]
+	add	v27.4s,v27.4s,v31.4s		// += 4
+	stp	x13,x15,[x0,#32]
+	add	v28.4s,v28.4s,v31.4s
+	stp	x17,x20,[x0,#48]
+	add	v29.4s,v29.4s,v31.4s
+	add	x0,x0,#64
+
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+
+	eor	v4.16b,v4.16b,v20.16b
+	eor	v5.16b,v5.16b,v21.16b
+	eor	v6.16b,v6.16b,v22.16b
+	eor	v7.16b,v7.16b,v23.16b
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+	eor	v16.16b,v16.16b,v0.16b
+	eor	v17.16b,v17.16b,v1.16b
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v19.16b,v19.16b,v3.16b
+	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+	b.hi	Loop_outer_neon
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
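+// Fewer than 256 bytes remain: store any whole 64-byte blocks from
+// the scalar and vector results, then XOR the final partial block
+// byte by byte via the stack.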
+Ltail_neon:
+	add	x2,x2,#256
+	cmp	x2,#64
+	b.lo	Less_than_64
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__AARCH64EB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	x15,x15,x16
+	eor	x17,x17,x19
+	eor	x20,x20,x21
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#4			// increment counter
+	stp	x9,x11,[x0,#16]
+	stp	x13,x15,[x0,#32]
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+	b.eq	Ldone_neon
+	sub	x2,x2,#64
+	cmp	x2,#64
+	b.lo	Less_than_128
+
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+	eor	v0.16b,v0.16b,v20.16b
+	eor	v1.16b,v1.16b,v21.16b
+	eor	v2.16b,v2.16b,v22.16b
+	eor	v3.16b,v3.16b,v23.16b
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+	b.eq	Ldone_neon
+	sub	x2,x2,#64
+	cmp	x2,#64
+	b.lo	Less_than_192
+
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+	eor	v4.16b,v4.16b,v20.16b
+	eor	v5.16b,v5.16b,v21.16b
+	eor	v6.16b,v6.16b,v22.16b
+	eor	v7.16b,v7.16b,v23.16b
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+	b.eq	Ldone_neon
+	sub	x2,x2,#64
+
+	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
+	b	Last_neon
+
+Less_than_128:
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
+	b	Last_neon
+Less_than_192:
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
+	b	Last_neon
+
+.align	4
+Last_neon:
+	sub	x0,x0,#1
+	add	x1,x1,x2
+	add	x0,x0,x2
+	add	x4,sp,x2
+	neg	x2,x2
+
+Loop_tail_neon:
+	ldrb	w10,[x1,x2]
+	ldrb	w11,[x4,x2]
+	add	x2,x2,#1
+	eor	w10,w10,w11
+	strb	w10,[x0,x2]
+	cbnz	x2,Loop_tail_neon
+
+	stp	xzr,xzr,[sp,#0]
+	stp	xzr,xzr,[sp,#16]
+	stp	xzr,xzr,[sp,#32]
+	stp	xzr,xzr,[sp,#48]
+
+Ldone_neon:
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
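+// Large-input path (512 bytes or more): eight blocks per iteration,
+// six in vector registers (v0-v23) and two on the scalar unit,
+// computed in the upper/lower half loops below.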
+.align	5
+ChaCha20_512_neon:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+
+	adrp	x5,Lsigma@PAGE
+	add	x5,x5,Lsigma@PAGEOFF
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+L512_or_more_neon:
+	sub	sp,sp,#128+64
+
+	ldp	x22,x23,[x5]		// load sigma
+	ld1	{v24.4s},[x5],#16
+	ldp	x24,x25,[x3]		// load key
+	ldp	x26,x27,[x3,#16]
+	ld1	{v25.4s,v26.4s},[x3]
+	ldp	x28,x30,[x4]		// load counter
+	ld1	{v27.4s},[x4]
+	ld1	{v31.4s},[x5]
+#ifdef	__AARCH64EB__
+	rev64	v24.4s,v24.4s
+	ror	x24,x24,#32
+	ror	x25,x25,#32
+	ror	x26,x26,#32
+	ror	x27,x27,#32
+	ror	x28,x28,#32
+	ror	x30,x30,#32
+#endif
+	add	v27.4s,v27.4s,v31.4s		// += 1
+	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
+	add	v27.4s,v27.4s,v31.4s		// not typo
+	str	q26,[sp,#32]
+	add	v28.4s,v27.4s,v31.4s
+	add	v29.4s,v28.4s,v31.4s
+	add	v30.4s,v29.4s,v31.4s
+	shl	v31.4s,v31.4s,#2			// 1 -> 4
+
+	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
+	stp	d10,d11,[sp,#128+16]
+	stp	d12,d13,[sp,#128+32]
+	stp	d14,d15,[sp,#128+48]
+
+	sub	x2,x2,#512			// not typo
+
+Loop_outer_512_neon:
+	mov	v0.16b,v24.16b
+	mov	v4.16b,v24.16b
+	mov	v8.16b,v24.16b
+	mov	v12.16b,v24.16b
+	mov	v16.16b,v24.16b
+	mov	v20.16b,v24.16b
+	mov	v1.16b,v25.16b
+	mov	w5,w22			// unpack key block
+	mov	v5.16b,v25.16b
+	lsr	x6,x22,#32
+	mov	v9.16b,v25.16b
+	mov	w7,w23
+	mov	v13.16b,v25.16b
+	lsr	x8,x23,#32
+	mov	v17.16b,v25.16b
+	mov	w9,w24
+	mov	v21.16b,v25.16b
+	lsr	x10,x24,#32
+	mov	v3.16b,v27.16b
+	mov	w11,w25
+	mov	v7.16b,v28.16b
+	lsr	x12,x25,#32
+	mov	v11.16b,v29.16b
+	mov	w13,w26
+	mov	v15.16b,v30.16b
+	lsr	x14,x26,#32
+	mov	v2.16b,v26.16b
+	mov	w15,w27
+	mov	v6.16b,v26.16b
+	lsr	x16,x27,#32
+	add	v19.4s,v3.4s,v31.4s			// +4
+	mov	w17,w28
+	add	v23.4s,v7.4s,v31.4s			// +4
+	lsr	x19,x28,#32
+	mov	v10.16b,v26.16b
+	mov	w20,w30
+	mov	v14.16b,v26.16b
+	lsr	x21,x30,#32
+	mov	v18.16b,v26.16b
+	stp	q27,q28,[sp,#48]		// off-load key block, variable part
+	mov	v22.16b,v26.16b
+	str	q29,[sp,#80]
+
+	mov	x4,#5
+	subs	x2,x2,#512
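+// Upper half: the first scalar block's ten double rounds interleaved
+// with the first five of the vector blocks' ten.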
+Loop_upper_neon:
+	sub	x4,x4,#1
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#12
+	ext	v7.16b,v7.16b,v7.16b,#12
+	ext	v11.16b,v11.16b,v11.16b,#12
+	ext	v15.16b,v15.16b,v15.16b,#12
+	ext	v19.16b,v19.16b,v19.16b,#12
+	ext	v23.16b,v23.16b,v23.16b,#12
+	ext	v1.16b,v1.16b,v1.16b,#4
+	ext	v5.16b,v5.16b,v5.16b,#4
+	ext	v9.16b,v9.16b,v9.16b,#4
+	ext	v13.16b,v13.16b,v13.16b,#4
+	ext	v17.16b,v17.16b,v17.16b,#4
+	ext	v21.16b,v21.16b,v21.16b,#4
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#4
+	ext	v7.16b,v7.16b,v7.16b,#4
+	ext	v11.16b,v11.16b,v11.16b,#4
+	ext	v15.16b,v15.16b,v15.16b,#4
+	ext	v19.16b,v19.16b,v19.16b,#4
+	ext	v23.16b,v23.16b,v23.16b,#4
+	ext	v1.16b,v1.16b,v1.16b,#12
+	ext	v5.16b,v5.16b,v5.16b,#12
+	ext	v9.16b,v9.16b,v9.16b,#12
+	ext	v13.16b,v13.16b,v13.16b,#12
+	ext	v17.16b,v17.16b,v17.16b,#12
+	ext	v21.16b,v21.16b,v21.16b,#12
+	cbnz	x4,Loop_upper_neon
+
+	add	w5,w5,w22		// accumulate key block
+	add	x6,x6,x22,lsr#32
+	add	w7,w7,w23
+	add	x8,x8,x23,lsr#32
+	add	w9,w9,w24
+	add	x10,x10,x24,lsr#32
+	add	w11,w11,w25
+	add	x12,x12,x25,lsr#32
+	add	w13,w13,w26
+	add	x14,x14,x26,lsr#32
+	add	w15,w15,w27
+	add	x16,x16,x27,lsr#32
+	add	w17,w17,w28
+	add	x19,x19,x28,lsr#32
+	add	w20,w20,w30
+	add	x21,x21,x30,lsr#32
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__AARCH64EB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	x15,x15,x16
+	eor	x17,x17,x19
+	eor	x20,x20,x21
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#1			// increment counter
+	mov	w5,w22			// unpack key block
+	lsr	x6,x22,#32
+	stp	x9,x11,[x0,#16]
+	mov	w7,w23
+	lsr	x8,x23,#32
+	stp	x13,x15,[x0,#32]
+	mov	w9,w24
+	lsr	x10,x24,#32
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+	mov	w11,w25
+	lsr	x12,x25,#32
+	mov	w13,w26
+	lsr	x14,x26,#32
+	mov	w15,w27
+	lsr	x16,x27,#32
+	mov	w17,w28
+	lsr	x19,x28,#32
+	mov	w20,w30
+	lsr	x21,x30,#32
+
+	mov	x4,#5
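+// Lower half: the second scalar block alongside the vector blocks'
+// remaining five double rounds.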
+Loop_lower_neon:
+	sub	x4,x4,#1
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#12
+	ext	v7.16b,v7.16b,v7.16b,#12
+	ext	v11.16b,v11.16b,v11.16b,#12
+	ext	v15.16b,v15.16b,v15.16b,#12
+	ext	v19.16b,v19.16b,v19.16b,#12
+	ext	v23.16b,v23.16b,v23.16b,#12
+	ext	v1.16b,v1.16b,v1.16b,#4
+	ext	v5.16b,v5.16b,v5.16b,#4
+	ext	v9.16b,v9.16b,v9.16b,#4
+	ext	v13.16b,v13.16b,v13.16b,#4
+	ext	v17.16b,v17.16b,v17.16b,#4
+	ext	v21.16b,v21.16b,v21.16b,#4
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#4
+	ext	v7.16b,v7.16b,v7.16b,#4
+	ext	v11.16b,v11.16b,v11.16b,#4
+	ext	v15.16b,v15.16b,v15.16b,#4
+	ext	v19.16b,v19.16b,v19.16b,#4
+	ext	v23.16b,v23.16b,v23.16b,#4
+	ext	v1.16b,v1.16b,v1.16b,#12
+	ext	v5.16b,v5.16b,v5.16b,#12
+	ext	v9.16b,v9.16b,v9.16b,#12
+	ext	v13.16b,v13.16b,v13.16b,#12
+	ext	v17.16b,v17.16b,v17.16b,#12
+	ext	v21.16b,v21.16b,v21.16b,#12
+	cbnz	x4,Loop_lower_neon
+
+	add	w5,w5,w22		// accumulate key block
+	ldp	q24,q25,[sp,#0]
+	add	x6,x6,x22,lsr#32
+	ldp	q26,q27,[sp,#32]
+	add	w7,w7,w23
+	ldp	q28,q29,[sp,#64]
+	add	x8,x8,x23,lsr#32
+	add	v0.4s,v0.4s,v24.4s
+	add	w9,w9,w24
+	add	v4.4s,v4.4s,v24.4s
+	add	x10,x10,x24,lsr#32
+	add	v8.4s,v8.4s,v24.4s
+	add	w11,w11,w25
+	add	v12.4s,v12.4s,v24.4s
+	add	x12,x12,x25,lsr#32
+	add	v16.4s,v16.4s,v24.4s
+	add	w13,w13,w26
+	add	v20.4s,v20.4s,v24.4s
+	add	x14,x14,x26,lsr#32
+	add	v2.4s,v2.4s,v26.4s
+	add	w15,w15,w27
+	add	v6.4s,v6.4s,v26.4s
+	add	x16,x16,x27,lsr#32
+	add	v10.4s,v10.4s,v26.4s
+	add	w17,w17,w28
+	add	v14.4s,v14.4s,v26.4s
+	add	x19,x19,x28,lsr#32
+	add	v18.4s,v18.4s,v26.4s
+	add	w20,w20,w30
+	add	v22.4s,v22.4s,v26.4s
+	add	x21,x21,x30,lsr#32
+	add	v19.4s,v19.4s,v31.4s			// +4
+	add	x5,x5,x6,lsl#32	// pack
+	add	v23.4s,v23.4s,v31.4s			// +4
+	add	x7,x7,x8,lsl#32
+	add	v3.4s,v3.4s,v27.4s
+	ldp	x6,x8,[x1,#0]		// load input
+	add	v7.4s,v7.4s,v28.4s
+	add	x9,x9,x10,lsl#32
+	add	v11.4s,v11.4s,v29.4s
+	add	x11,x11,x12,lsl#32
+	add	v15.4s,v15.4s,v30.4s
+	ldp	x10,x12,[x1,#16]
+	add	v19.4s,v19.4s,v27.4s
+	add	x13,x13,x14,lsl#32
+	add	v23.4s,v23.4s,v28.4s
+	add	x15,x15,x16,lsl#32
+	add	v1.4s,v1.4s,v25.4s
+	ldp	x14,x16,[x1,#32]
+	add	v5.4s,v5.4s,v25.4s
+	add	x17,x17,x19,lsl#32
+	add	v9.4s,v9.4s,v25.4s
+	add	x20,x20,x21,lsl#32
+	add	v13.4s,v13.4s,v25.4s
+	ldp	x19,x21,[x1,#48]
+	add	v17.4s,v17.4s,v25.4s
+	add	x1,x1,#64
+	add	v21.4s,v21.4s,v25.4s
+
+#ifdef	__AARCH64EB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	v0.16b,v0.16b,v24.16b
+	eor	x15,x15,x16
+	eor	v1.16b,v1.16b,v25.16b
+	eor	x17,x17,x19
+	eor	v2.16b,v2.16b,v26.16b
+	eor	x20,x20,x21
+	eor	v3.16b,v3.16b,v27.16b
+	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#7			// increment counter
+	stp	x9,x11,[x0,#16]
+	stp	x13,x15,[x0,#32]
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+
+	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+	eor	v4.16b,v4.16b,v24.16b
+	eor	v5.16b,v5.16b,v25.16b
+	eor	v6.16b,v6.16b,v26.16b
+	eor	v7.16b,v7.16b,v27.16b
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+	eor	v8.16b,v8.16b,v0.16b
+	ldp	q24,q25,[sp,#0]
+	eor	v9.16b,v9.16b,v1.16b
+	ldp	q26,q27,[sp,#32]
+	eor	v10.16b,v10.16b,v2.16b
+	eor	v11.16b,v11.16b,v3.16b
+	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
+
+	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
+	eor	v12.16b,v12.16b,v4.16b
+	eor	v13.16b,v13.16b,v5.16b
+	eor	v14.16b,v14.16b,v6.16b
+	eor	v15.16b,v15.16b,v7.16b
+	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
+
+	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
+	eor	v16.16b,v16.16b,v8.16b
+	eor	v17.16b,v17.16b,v9.16b
+	eor	v18.16b,v18.16b,v10.16b
+	eor	v19.16b,v19.16b,v11.16b
+	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+	shl	v0.4s,v31.4s,#1			// 4 -> 8
+	eor	v20.16b,v20.16b,v12.16b
+	eor	v21.16b,v21.16b,v13.16b
+	eor	v22.16b,v22.16b,v14.16b
+	eor	v23.16b,v23.16b,v15.16b
+	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
+
+	add	v27.4s,v27.4s,v0.4s			// += 8
+	add	v28.4s,v28.4s,v0.4s
+	add	v29.4s,v29.4s,v0.4s
+	add	v30.4s,v30.4s,v0.4s
+
+	b.hs	Loop_outer_512_neon
+
+	adds	x2,x2,#512
+	ushr	v0.4s,v31.4s,#2			// 4 -> 1
+
+	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
+	ldp	d10,d11,[sp,#128+16]
+	ldp	d12,d13,[sp,#128+32]
+	ldp	d14,d15,[sp,#128+48]
+
+	stp	q24,q31,[sp,#0]		// wipe off-load area
+	stp	q24,q31,[sp,#32]
+	stp	q24,q31,[sp,#64]
+
+	b.eq	Ldone_512_neon
+
+	cmp	x2,#192
+	sub	v27.4s,v27.4s,v0.4s			// -= 1
+	sub	v28.4s,v28.4s,v0.4s
+	sub	v29.4s,v29.4s,v0.4s
+	add	sp,sp,#128
+	b.hs	Loop_outer_neon
+
+	eor	v25.16b,v25.16b,v25.16b
+	eor	v26.16b,v26.16b,v26.16b
+	eor	v27.16b,v27.16b,v27.16b
+	eor	v28.16b,v28.16b,v28.16b
+	eor	v29.16b,v29.16b,v29.16b
+	eor	v30.16b,v30.16b,v30.16b
+	b	Loop_outer
+
+Ldone_512_neon:
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#128+64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/gen/crypto/chacha-armv8-linux.S b/gen/crypto/chacha-armv8-linux.S
new file mode 100644
index 0000000..55fa583
--- /dev/null
+++ b/gen/crypto/chacha-armv8-linux.S
@@ -0,0 +1,1968 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+.section	.rodata
+
+.align	5
+.Lsigma:
+.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
+.Lone:
+.long	1,0,0,0
+.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+
+.text
+
+.globl	ChaCha20_ctr32_nohw
+.hidden	ChaCha20_ctr32_nohw
+.type	ChaCha20_ctr32_nohw,%function
+.align	5
+ChaCha20_ctr32_nohw:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+
+	adrp	x5,.Lsigma
+	add	x5,x5,:lo12:.Lsigma
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#64
+
+	ldp	x22,x23,[x5]		// load sigma
+	ldp	x24,x25,[x3]		// load key
+	ldp	x26,x27,[x3,#16]
+	ldp	x28,x30,[x4]		// load counter
+#ifdef	__AARCH64EB__
+	ror	x24,x24,#32
+	ror	x25,x25,#32
+	ror	x26,x26,#32
+	ror	x27,x27,#32
+	ror	x28,x28,#32
+	ror	x30,x30,#32
+#endif
+
+.Loop_outer:
+	mov	w5,w22			// unpack key block
+	lsr	x6,x22,#32
+	mov	w7,w23
+	lsr	x8,x23,#32
+	mov	w9,w24
+	lsr	x10,x24,#32
+	mov	w11,w25
+	lsr	x12,x25,#32
+	mov	w13,w26
+	lsr	x14,x26,#32
+	mov	w15,w27
+	lsr	x16,x27,#32
+	mov	w17,w28
+	lsr	x19,x28,#32
+	mov	w20,w30
+	lsr	x21,x30,#32
+
+	mov	x4,#10
+	subs	x2,x2,#64
+.Loop:
+	sub	x4,x4,#1
+	add	w5,w5,w9
+	add	w6,w6,w10
+	add	w7,w7,w11
+	add	w8,w8,w12
+	eor	w17,w17,w5
+	eor	w19,w19,w6
+	eor	w20,w20,w7
+	eor	w21,w21,w8
+	ror	w17,w17,#16
+	ror	w19,w19,#16
+	ror	w20,w20,#16
+	ror	w21,w21,#16
+	add	w13,w13,w17
+	add	w14,w14,w19
+	add	w15,w15,w20
+	add	w16,w16,w21
+	eor	w9,w9,w13
+	eor	w10,w10,w14
+	eor	w11,w11,w15
+	eor	w12,w12,w16
+	ror	w9,w9,#20
+	ror	w10,w10,#20
+	ror	w11,w11,#20
+	ror	w12,w12,#20
+	add	w5,w5,w9
+	add	w6,w6,w10
+	add	w7,w7,w11
+	add	w8,w8,w12
+	eor	w17,w17,w5
+	eor	w19,w19,w6
+	eor	w20,w20,w7
+	eor	w21,w21,w8
+	ror	w17,w17,#24
+	ror	w19,w19,#24
+	ror	w20,w20,#24
+	ror	w21,w21,#24
+	add	w13,w13,w17
+	add	w14,w14,w19
+	add	w15,w15,w20
+	add	w16,w16,w21
+	eor	w9,w9,w13
+	eor	w10,w10,w14
+	eor	w11,w11,w15
+	eor	w12,w12,w16
+	ror	w9,w9,#25
+	ror	w10,w10,#25
+	ror	w11,w11,#25
+	ror	w12,w12,#25
+	add	w5,w5,w10
+	add	w6,w6,w11
+	add	w7,w7,w12
+	add	w8,w8,w9
+	eor	w21,w21,w5
+	eor	w17,w17,w6
+	eor	w19,w19,w7
+	eor	w20,w20,w8
+	ror	w21,w21,#16
+	ror	w17,w17,#16
+	ror	w19,w19,#16
+	ror	w20,w20,#16
+	add	w15,w15,w21
+	add	w16,w16,w17
+	add	w13,w13,w19
+	add	w14,w14,w20
+	eor	w10,w10,w15
+	eor	w11,w11,w16
+	eor	w12,w12,w13
+	eor	w9,w9,w14
+	ror	w10,w10,#20
+	ror	w11,w11,#20
+	ror	w12,w12,#20
+	ror	w9,w9,#20
+	add	w5,w5,w10
+	add	w6,w6,w11
+	add	w7,w7,w12
+	add	w8,w8,w9
+	eor	w21,w21,w5
+	eor	w17,w17,w6
+	eor	w19,w19,w7
+	eor	w20,w20,w8
+	ror	w21,w21,#24
+	ror	w17,w17,#24
+	ror	w19,w19,#24
+	ror	w20,w20,#24
+	add	w15,w15,w21
+	add	w16,w16,w17
+	add	w13,w13,w19
+	add	w14,w14,w20
+	eor	w10,w10,w15
+	eor	w11,w11,w16
+	eor	w12,w12,w13
+	eor	w9,w9,w14
+	ror	w10,w10,#25
+	ror	w11,w11,#25
+	ror	w12,w12,#25
+	ror	w9,w9,#25
+	cbnz	x4,.Loop
+
+	add	w5,w5,w22		// accumulate key block
+	add	x6,x6,x22,lsr#32
+	add	w7,w7,w23
+	add	x8,x8,x23,lsr#32
+	add	w9,w9,w24
+	add	x10,x10,x24,lsr#32
+	add	w11,w11,w25
+	add	x12,x12,x25,lsr#32
+	add	w13,w13,w26
+	add	x14,x14,x26,lsr#32
+	add	w15,w15,w27
+	add	x16,x16,x27,lsr#32
+	add	w17,w17,w28
+	add	x19,x19,x28,lsr#32
+	add	w20,w20,w30
+	add	x21,x21,x30,lsr#32
+
+	b.lo	.Ltail
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__AARCH64EB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	x15,x15,x16
+	eor	x17,x17,x19
+	eor	x20,x20,x21
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#1			// increment counter
+	stp	x9,x11,[x0,#16]
+	stp	x13,x15,[x0,#32]
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+
+	b.hi	.Loop_outer
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.align	4
+.Ltail:
+	add	x2,x2,#64
+.Less_than_64:
+	sub	x0,x0,#1
+	add	x1,x1,x2
+	add	x0,x0,x2
+	add	x4,sp,x2
+	neg	x2,x2
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+#ifdef	__AARCH64EB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	stp	x5,x7,[sp,#0]
+	stp	x9,x11,[sp,#16]
+	stp	x13,x15,[sp,#32]
+	stp	x17,x20,[sp,#48]
+
+.Loop_tail:
+	ldrb	w10,[x1,x2]
+	ldrb	w11,[x4,x2]
+	add	x2,x2,#1
+	eor	w10,w10,w11
+	strb	w10,[x0,x2]
+	cbnz	x2,.Loop_tail
+
+	stp	xzr,xzr,[sp,#0]
+	stp	xzr,xzr,[sp,#16]
+	stp	xzr,xzr,[sp,#32]
+	stp	xzr,xzr,[sp,#48]
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw
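
For orientation, the scalar routine above is the standard ChaCha20 block function from RFC 8439: the state is unpacked into sixteen 32-bit words, run through ten iterations of four column plus four diagonal quarter rounds (`mov x4,#10` ... `cbnz x4,.Loop`), and the key block is re-added before being XORed with the input. The right-rotations by 16, 20, 24 and 25 in the loop are the quarter round's left-rotations by 16, 12, 8 and 7. A minimal C sketch of the quarter round, for reference only (this is not part of the generated file):

```c
#include <stdint.h>

// Rotate left by n bits. ChaCha20's left-rotations by 16, 12, 8 and 7
// appear in the assembly above as right-rotations by 16, 20, 24 and 25.
static inline uint32_t rotl32(uint32_t x, unsigned n) {
  return (x << n) | (x >> (32 - n));
}

// One ChaCha20 quarter round (RFC 8439, section 2.1). The .Loop body
// above runs this over four columns, then four diagonals, per iteration.
static void quarter_round(uint32_t *a, uint32_t *b, uint32_t *c,
                          uint32_t *d) {
  *a += *b; *d ^= *a; *d = rotl32(*d, 16);
  *c += *d; *b ^= *c; *b = rotl32(*b, 12);
  *a += *b; *d ^= *a; *d = rotl32(*d, 8);
  *c += *d; *b ^= *c; *b = rotl32(*b, 7);
}
```

The `.Ltail` path handles a final partial block by writing one whole keystream block to the stack, XORing it into the input byte by byte in `.Loop_tail`, and then wiping the stack copy with `stp xzr,xzr` stores.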
+
+.globl	ChaCha20_ctr32_neon
+.hidden	ChaCha20_ctr32_neon
+.type	ChaCha20_ctr32_neon,%function
+.align	5
+ChaCha20_ctr32_neon:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+
+	adrp	x5,.Lsigma
+	add	x5,x5,:lo12:.Lsigma
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	cmp	x2,#512
+	b.hs	.L512_or_more_neon
+
+	sub	sp,sp,#64
+
+	ldp	x22,x23,[x5]		// load sigma
+	ld1	{v24.4s},[x5],#16
+	ldp	x24,x25,[x3]		// load key
+	ldp	x26,x27,[x3,#16]
+	ld1	{v25.4s,v26.4s},[x3]
+	ldp	x28,x30,[x4]		// load counter
+	ld1	{v27.4s},[x4]
+	ld1	{v31.4s},[x5]
+#ifdef	__AARCH64EB__
+	rev64	v24.4s,v24.4s
+	ror	x24,x24,#32
+	ror	x25,x25,#32
+	ror	x26,x26,#32
+	ror	x27,x27,#32
+	ror	x28,x28,#32
+	ror	x30,x30,#32
+#endif
+	add	v27.4s,v27.4s,v31.4s		// += 1
+	add	v28.4s,v27.4s,v31.4s
+	add	v29.4s,v28.4s,v31.4s
+	shl	v31.4s,v31.4s,#2			// 1 -> 4
+
+.Loop_outer_neon:
+	mov	w5,w22			// unpack key block
+	lsr	x6,x22,#32
+	mov	v0.16b,v24.16b
+	mov	w7,w23
+	lsr	x8,x23,#32
+	mov	v4.16b,v24.16b
+	mov	w9,w24
+	lsr	x10,x24,#32
+	mov	v16.16b,v24.16b
+	mov	w11,w25
+	mov	v1.16b,v25.16b
+	lsr	x12,x25,#32
+	mov	v5.16b,v25.16b
+	mov	w13,w26
+	mov	v17.16b,v25.16b
+	lsr	x14,x26,#32
+	mov	v3.16b,v27.16b
+	mov	w15,w27
+	mov	v7.16b,v28.16b
+	lsr	x16,x27,#32
+	mov	v19.16b,v29.16b
+	mov	w17,w28
+	mov	v2.16b,v26.16b
+	lsr	x19,x28,#32
+	mov	v6.16b,v26.16b
+	mov	w20,w30
+	mov	v18.16b,v26.16b
+	lsr	x21,x30,#32
+
+	mov	x4,#10
+	subs	x2,x2,#256
+.Loop_neon:
+	sub	x4,x4,#1
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v16.4s,v16.4s,v17.4s
+	add	w7,w7,w11
+	eor	v3.16b,v3.16b,v0.16b
+	add	w8,w8,w12
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w17,w17,w5
+	eor	v19.16b,v19.16b,v16.16b
+	eor	w19,w19,w6
+	rev32	v3.8h,v3.8h
+	eor	w20,w20,w7
+	rev32	v7.8h,v7.8h
+	eor	w21,w21,w8
+	rev32	v19.8h,v19.8h
+	ror	w17,w17,#16
+	add	v2.4s,v2.4s,v3.4s
+	ror	w19,w19,#16
+	add	v6.4s,v6.4s,v7.4s
+	ror	w20,w20,#16
+	add	v18.4s,v18.4s,v19.4s
+	ror	w21,w21,#16
+	eor	v20.16b,v1.16b,v2.16b
+	add	w13,w13,w17
+	eor	v21.16b,v5.16b,v6.16b
+	add	w14,w14,w19
+	eor	v22.16b,v17.16b,v18.16b
+	add	w15,w15,w20
+	ushr	v1.4s,v20.4s,#20
+	add	w16,w16,w21
+	ushr	v5.4s,v21.4s,#20
+	eor	w9,w9,w13
+	ushr	v17.4s,v22.4s,#20
+	eor	w10,w10,w14
+	sli	v1.4s,v20.4s,#12
+	eor	w11,w11,w15
+	sli	v5.4s,v21.4s,#12
+	eor	w12,w12,w16
+	sli	v17.4s,v22.4s,#12
+	ror	w9,w9,#20
+	add	v0.4s,v0.4s,v1.4s
+	ror	w10,w10,#20
+	add	v4.4s,v4.4s,v5.4s
+	ror	w11,w11,#20
+	add	v16.4s,v16.4s,v17.4s
+	ror	w12,w12,#20
+	eor	v20.16b,v3.16b,v0.16b
+	add	w5,w5,w9
+	eor	v21.16b,v7.16b,v4.16b
+	add	w6,w6,w10
+	eor	v22.16b,v19.16b,v16.16b
+	add	w7,w7,w11
+	ushr	v3.4s,v20.4s,#24
+	add	w8,w8,w12
+	ushr	v7.4s,v21.4s,#24
+	eor	w17,w17,w5
+	ushr	v19.4s,v22.4s,#24
+	eor	w19,w19,w6
+	sli	v3.4s,v20.4s,#8
+	eor	w20,w20,w7
+	sli	v7.4s,v21.4s,#8
+	eor	w21,w21,w8
+	sli	v19.4s,v22.4s,#8
+	ror	w17,w17,#24
+	add	v2.4s,v2.4s,v3.4s
+	ror	w19,w19,#24
+	add	v6.4s,v6.4s,v7.4s
+	ror	w20,w20,#24
+	add	v18.4s,v18.4s,v19.4s
+	ror	w21,w21,#24
+	eor	v20.16b,v1.16b,v2.16b
+	add	w13,w13,w17
+	eor	v21.16b,v5.16b,v6.16b
+	add	w14,w14,w19
+	eor	v22.16b,v17.16b,v18.16b
+	add	w15,w15,w20
+	ushr	v1.4s,v20.4s,#25
+	add	w16,w16,w21
+	ushr	v5.4s,v21.4s,#25
+	eor	w9,w9,w13
+	ushr	v17.4s,v22.4s,#25
+	eor	w10,w10,w14
+	sli	v1.4s,v20.4s,#7
+	eor	w11,w11,w15
+	sli	v5.4s,v21.4s,#7
+	eor	w12,w12,w16
+	sli	v17.4s,v22.4s,#7
+	ror	w9,w9,#25
+	ext	v2.16b,v2.16b,v2.16b,#8
+	ror	w10,w10,#25
+	ext	v6.16b,v6.16b,v6.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v3.16b,v3.16b,v3.16b,#12
+	ext	v7.16b,v7.16b,v7.16b,#12
+	ext	v19.16b,v19.16b,v19.16b,#12
+	ext	v1.16b,v1.16b,v1.16b,#4
+	ext	v5.16b,v5.16b,v5.16b,#4
+	ext	v17.16b,v17.16b,v17.16b,#4
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w10
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w11
+	add	v16.4s,v16.4s,v17.4s
+	add	w7,w7,w12
+	eor	v3.16b,v3.16b,v0.16b
+	add	w8,w8,w9
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w5
+	eor	v19.16b,v19.16b,v16.16b
+	eor	w17,w17,w6
+	rev32	v3.8h,v3.8h
+	eor	w19,w19,w7
+	rev32	v7.8h,v7.8h
+	eor	w20,w20,w8
+	rev32	v19.8h,v19.8h
+	ror	w21,w21,#16
+	add	v2.4s,v2.4s,v3.4s
+	ror	w17,w17,#16
+	add	v6.4s,v6.4s,v7.4s
+	ror	w19,w19,#16
+	add	v18.4s,v18.4s,v19.4s
+	ror	w20,w20,#16
+	eor	v20.16b,v1.16b,v2.16b
+	add	w15,w15,w21
+	eor	v21.16b,v5.16b,v6.16b
+	add	w16,w16,w17
+	eor	v22.16b,v17.16b,v18.16b
+	add	w13,w13,w19
+	ushr	v1.4s,v20.4s,#20
+	add	w14,w14,w20
+	ushr	v5.4s,v21.4s,#20
+	eor	w10,w10,w15
+	ushr	v17.4s,v22.4s,#20
+	eor	w11,w11,w16
+	sli	v1.4s,v20.4s,#12
+	eor	w12,w12,w13
+	sli	v5.4s,v21.4s,#12
+	eor	w9,w9,w14
+	sli	v17.4s,v22.4s,#12
+	ror	w10,w10,#20
+	add	v0.4s,v0.4s,v1.4s
+	ror	w11,w11,#20
+	add	v4.4s,v4.4s,v5.4s
+	ror	w12,w12,#20
+	add	v16.4s,v16.4s,v17.4s
+	ror	w9,w9,#20
+	eor	v20.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v21.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v22.16b,v19.16b,v16.16b
+	add	w7,w7,w12
+	ushr	v3.4s,v20.4s,#24
+	add	w8,w8,w9
+	ushr	v7.4s,v21.4s,#24
+	eor	w21,w21,w5
+	ushr	v19.4s,v22.4s,#24
+	eor	w17,w17,w6
+	sli	v3.4s,v20.4s,#8
+	eor	w19,w19,w7
+	sli	v7.4s,v21.4s,#8
+	eor	w20,w20,w8
+	sli	v19.4s,v22.4s,#8
+	ror	w21,w21,#24
+	add	v2.4s,v2.4s,v3.4s
+	ror	w17,w17,#24
+	add	v6.4s,v6.4s,v7.4s
+	ror	w19,w19,#24
+	add	v18.4s,v18.4s,v19.4s
+	ror	w20,w20,#24
+	eor	v20.16b,v1.16b,v2.16b
+	add	w15,w15,w21
+	eor	v21.16b,v5.16b,v6.16b
+	add	w16,w16,w17
+	eor	v22.16b,v17.16b,v18.16b
+	add	w13,w13,w19
+	ushr	v1.4s,v20.4s,#25
+	add	w14,w14,w20
+	ushr	v5.4s,v21.4s,#25
+	eor	w10,w10,w15
+	ushr	v17.4s,v22.4s,#25
+	eor	w11,w11,w16
+	sli	v1.4s,v20.4s,#7
+	eor	w12,w12,w13
+	sli	v5.4s,v21.4s,#7
+	eor	w9,w9,w14
+	sli	v17.4s,v22.4s,#7
+	ror	w10,w10,#25
+	ext	v2.16b,v2.16b,v2.16b,#8
+	ror	w11,w11,#25
+	ext	v6.16b,v6.16b,v6.16b,#8
+	ror	w12,w12,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#4
+	ext	v7.16b,v7.16b,v7.16b,#4
+	ext	v19.16b,v19.16b,v19.16b,#4
+	ext	v1.16b,v1.16b,v1.16b,#12
+	ext	v5.16b,v5.16b,v5.16b,#12
+	ext	v17.16b,v17.16b,v17.16b,#12
+	cbnz	x4,.Loop_neon
+
+	add	w5,w5,w22		// accumulate key block
+	add	v0.4s,v0.4s,v24.4s
+	add	x6,x6,x22,lsr#32
+	add	v4.4s,v4.4s,v24.4s
+	add	w7,w7,w23
+	add	v16.4s,v16.4s,v24.4s
+	add	x8,x8,x23,lsr#32
+	add	v2.4s,v2.4s,v26.4s
+	add	w9,w9,w24
+	add	v6.4s,v6.4s,v26.4s
+	add	x10,x10,x24,lsr#32
+	add	v18.4s,v18.4s,v26.4s
+	add	w11,w11,w25
+	add	v3.4s,v3.4s,v27.4s
+	add	x12,x12,x25,lsr#32
+	add	w13,w13,w26
+	add	v7.4s,v7.4s,v28.4s
+	add	x14,x14,x26,lsr#32
+	add	w15,w15,w27
+	add	v19.4s,v19.4s,v29.4s
+	add	x16,x16,x27,lsr#32
+	add	w17,w17,w28
+	add	v1.4s,v1.4s,v25.4s
+	add	x19,x19,x28,lsr#32
+	add	w20,w20,w30
+	add	v5.4s,v5.4s,v25.4s
+	add	x21,x21,x30,lsr#32
+	add	v17.4s,v17.4s,v25.4s
+
+	b.lo	.Ltail_neon
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__AARCH64EB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	v0.16b,v0.16b,v20.16b
+	eor	x15,x15,x16
+	eor	v1.16b,v1.16b,v21.16b
+	eor	x17,x17,x19
+	eor	v2.16b,v2.16b,v22.16b
+	eor	x20,x20,x21
+	eor	v3.16b,v3.16b,v23.16b
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#4			// increment counter
+	stp	x9,x11,[x0,#16]
+	add	v27.4s,v27.4s,v31.4s		// += 4
+	stp	x13,x15,[x0,#32]
+	add	v28.4s,v28.4s,v31.4s
+	stp	x17,x20,[x0,#48]
+	add	v29.4s,v29.4s,v31.4s
+	add	x0,x0,#64
+
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+
+	eor	v4.16b,v4.16b,v20.16b
+	eor	v5.16b,v5.16b,v21.16b
+	eor	v6.16b,v6.16b,v22.16b
+	eor	v7.16b,v7.16b,v23.16b
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+	eor	v16.16b,v16.16b,v0.16b
+	eor	v17.16b,v17.16b,v1.16b
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v19.16b,v19.16b,v3.16b
+	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+	b.hi	.Loop_outer_neon
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.Ltail_neon:
+	add	x2,x2,#256
+	cmp	x2,#64
+	b.lo	.Less_than_64
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__AARCH64EB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	x15,x15,x16
+	eor	x17,x17,x19
+	eor	x20,x20,x21
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#4			// increment counter
+	stp	x9,x11,[x0,#16]
+	stp	x13,x15,[x0,#32]
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+	b.eq	.Ldone_neon
+	sub	x2,x2,#64
+	cmp	x2,#64
+	b.lo	.Less_than_128
+
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+	eor	v0.16b,v0.16b,v20.16b
+	eor	v1.16b,v1.16b,v21.16b
+	eor	v2.16b,v2.16b,v22.16b
+	eor	v3.16b,v3.16b,v23.16b
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+	b.eq	.Ldone_neon
+	sub	x2,x2,#64
+	cmp	x2,#64
+	b.lo	.Less_than_192
+
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+	eor	v4.16b,v4.16b,v20.16b
+	eor	v5.16b,v5.16b,v21.16b
+	eor	v6.16b,v6.16b,v22.16b
+	eor	v7.16b,v7.16b,v23.16b
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+	b.eq	.Ldone_neon
+	sub	x2,x2,#64
+
+	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
+	b	.Last_neon
+
+.Less_than_128:
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
+	b	.Last_neon
+.Less_than_192:
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
+	b	.Last_neon
+
+.align	4
+.Last_neon:
+	sub	x0,x0,#1
+	add	x1,x1,x2
+	add	x0,x0,x2
+	add	x4,sp,x2
+	neg	x2,x2
+
+.Loop_tail_neon:
+	ldrb	w10,[x1,x2]
+	ldrb	w11,[x4,x2]
+	add	x2,x2,#1
+	eor	w10,w10,w11
+	strb	w10,[x0,x2]
+	cbnz	x2,.Loop_tail_neon
+
+	stp	xzr,xzr,[sp,#0]
+	stp	xzr,xzr,[sp,#16]
+	stp	xzr,xzr,[sp,#32]
+	stp	xzr,xzr,[sp,#48]
+
+.Ldone_neon:
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ChaCha20_ctr32_neon,.-ChaCha20_ctr32_neon
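
The NEON variant above interleaves one block computed in general-purpose registers with three computed in vector registers, so the counter advances by 4 per outer iteration (`add x28,x28,#4` alongside `add v27.4s,v27.4s,v31.4s`). AArch64 NEON has no 32-bit vector rotate instruction, so the generated code synthesizes one: rotate-by-16 is a half-word reversal (`rev32 v.8h`), while the 12-, 8- and 7-bit rotates pair an unsigned shift right (`ushr`) with a shift-left-and-insert (`sli`). A short sketch of the same trick with NEON intrinsics, assuming `<arm_neon.h>`:

```c
#include <arm_neon.h>

// Rotate each 32-bit lane left by 16: a half-word swap, matching the
// rev32 v.8h instructions in the assembly above.
static inline uint32x4_t rotl_16(uint32x4_t x) {
  return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)));
}

// Rotate each 32-bit lane left by 12: ushr by 20 then sli by 12, i.e.
// (x >> 20) with (x << 12) inserted over the high bits. The 8- and
// 7-bit rotates in the assembly use the same ushr/sli pairing.
static inline uint32x4_t rotl_12(uint32x4_t x) {
  return vsliq_n_u32(vshrq_n_u32(x, 20), x, 12);
}
```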
+.type	ChaCha20_512_neon,%function
+.align	5
+ChaCha20_512_neon:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+
+	adrp	x5,.Lsigma
+	add	x5,x5,:lo12:.Lsigma
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+.L512_or_more_neon:
+	sub	sp,sp,#128+64
+
+	ldp	x22,x23,[x5]		// load sigma
+	ld1	{v24.4s},[x5],#16
+	ldp	x24,x25,[x3]		// load key
+	ldp	x26,x27,[x3,#16]
+	ld1	{v25.4s,v26.4s},[x3]
+	ldp	x28,x30,[x4]		// load counter
+	ld1	{v27.4s},[x4]
+	ld1	{v31.4s},[x5]
+#ifdef	__AARCH64EB__
+	rev64	v24.4s,v24.4s
+	ror	x24,x24,#32
+	ror	x25,x25,#32
+	ror	x26,x26,#32
+	ror	x27,x27,#32
+	ror	x28,x28,#32
+	ror	x30,x30,#32
+#endif
+	add	v27.4s,v27.4s,v31.4s		// += 1
+	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
+	add	v27.4s,v27.4s,v31.4s		// not typo
+	str	q26,[sp,#32]
+	add	v28.4s,v27.4s,v31.4s
+	add	v29.4s,v28.4s,v31.4s
+	add	v30.4s,v29.4s,v31.4s
+	shl	v31.4s,v31.4s,#2			// 1 -> 4
+
+	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
+	stp	d10,d11,[sp,#128+16]
+	stp	d12,d13,[sp,#128+32]
+	stp	d14,d15,[sp,#128+48]
+
+	sub	x2,x2,#512			// not typo
+
+.Loop_outer_512_neon:
+	mov	v0.16b,v24.16b
+	mov	v4.16b,v24.16b
+	mov	v8.16b,v24.16b
+	mov	v12.16b,v24.16b
+	mov	v16.16b,v24.16b
+	mov	v20.16b,v24.16b
+	mov	v1.16b,v25.16b
+	mov	w5,w22			// unpack key block
+	mov	v5.16b,v25.16b
+	lsr	x6,x22,#32
+	mov	v9.16b,v25.16b
+	mov	w7,w23
+	mov	v13.16b,v25.16b
+	lsr	x8,x23,#32
+	mov	v17.16b,v25.16b
+	mov	w9,w24
+	mov	v21.16b,v25.16b
+	lsr	x10,x24,#32
+	mov	v3.16b,v27.16b
+	mov	w11,w25
+	mov	v7.16b,v28.16b
+	lsr	x12,x25,#32
+	mov	v11.16b,v29.16b
+	mov	w13,w26
+	mov	v15.16b,v30.16b
+	lsr	x14,x26,#32
+	mov	v2.16b,v26.16b
+	mov	w15,w27
+	mov	v6.16b,v26.16b
+	lsr	x16,x27,#32
+	add	v19.4s,v3.4s,v31.4s			// +4
+	mov	w17,w28
+	add	v23.4s,v7.4s,v31.4s			// +4
+	lsr	x19,x28,#32
+	mov	v10.16b,v26.16b
+	mov	w20,w30
+	mov	v14.16b,v26.16b
+	lsr	x21,x30,#32
+	mov	v18.16b,v26.16b
+	stp	q27,q28,[sp,#48]		// off-load key block, variable part
+	mov	v22.16b,v26.16b
+	str	q29,[sp,#80]
+
+	mov	x4,#5
+	subs	x2,x2,#512
+.Loop_upper_neon:
+	sub	x4,x4,#1
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#12
+	ext	v7.16b,v7.16b,v7.16b,#12
+	ext	v11.16b,v11.16b,v11.16b,#12
+	ext	v15.16b,v15.16b,v15.16b,#12
+	ext	v19.16b,v19.16b,v19.16b,#12
+	ext	v23.16b,v23.16b,v23.16b,#12
+	ext	v1.16b,v1.16b,v1.16b,#4
+	ext	v5.16b,v5.16b,v5.16b,#4
+	ext	v9.16b,v9.16b,v9.16b,#4
+	ext	v13.16b,v13.16b,v13.16b,#4
+	ext	v17.16b,v17.16b,v17.16b,#4
+	ext	v21.16b,v21.16b,v21.16b,#4
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#4
+	ext	v7.16b,v7.16b,v7.16b,#4
+	ext	v11.16b,v11.16b,v11.16b,#4
+	ext	v15.16b,v15.16b,v15.16b,#4
+	ext	v19.16b,v19.16b,v19.16b,#4
+	ext	v23.16b,v23.16b,v23.16b,#4
+	ext	v1.16b,v1.16b,v1.16b,#12
+	ext	v5.16b,v5.16b,v5.16b,#12
+	ext	v9.16b,v9.16b,v9.16b,#12
+	ext	v13.16b,v13.16b,v13.16b,#12
+	ext	v17.16b,v17.16b,v17.16b,#12
+	ext	v21.16b,v21.16b,v21.16b,#12
+	cbnz	x4,.Loop_upper_neon
+
+	add	w5,w5,w22		// accumulate key block
+	add	x6,x6,x22,lsr#32
+	add	w7,w7,w23
+	add	x8,x8,x23,lsr#32
+	add	w9,w9,w24
+	add	x10,x10,x24,lsr#32
+	add	w11,w11,w25
+	add	x12,x12,x25,lsr#32
+	add	w13,w13,w26
+	add	x14,x14,x26,lsr#32
+	add	w15,w15,w27
+	add	x16,x16,x27,lsr#32
+	add	w17,w17,w28
+	add	x19,x19,x28,lsr#32
+	add	w20,w20,w30
+	add	x21,x21,x30,lsr#32
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__AARCH64EB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	x15,x15,x16
+	eor	x17,x17,x19
+	eor	x20,x20,x21
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#1			// increment counter
+	mov	w5,w22			// unpack key block
+	lsr	x6,x22,#32
+	stp	x9,x11,[x0,#16]
+	mov	w7,w23
+	lsr	x8,x23,#32
+	stp	x13,x15,[x0,#32]
+	mov	w9,w24
+	lsr	x10,x24,#32
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+	mov	w11,w25
+	lsr	x12,x25,#32
+	mov	w13,w26
+	lsr	x14,x26,#32
+	mov	w15,w27
+	lsr	x16,x27,#32
+	mov	w17,w28
+	lsr	x19,x28,#32
+	mov	w20,w30
+	lsr	x21,x30,#32
+
+	mov	x4,#5
+.Loop_lower_neon:
+	sub	x4,x4,#1
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#12
+	ext	v7.16b,v7.16b,v7.16b,#12
+	ext	v11.16b,v11.16b,v11.16b,#12
+	ext	v15.16b,v15.16b,v15.16b,#12
+	ext	v19.16b,v19.16b,v19.16b,#12
+	ext	v23.16b,v23.16b,v23.16b,#12
+	ext	v1.16b,v1.16b,v1.16b,#4
+	ext	v5.16b,v5.16b,v5.16b,#4
+	ext	v9.16b,v9.16b,v9.16b,#4
+	ext	v13.16b,v13.16b,v13.16b,#4
+	ext	v17.16b,v17.16b,v17.16b,#4
+	ext	v21.16b,v21.16b,v21.16b,#4
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#4
+	ext	v7.16b,v7.16b,v7.16b,#4
+	ext	v11.16b,v11.16b,v11.16b,#4
+	ext	v15.16b,v15.16b,v15.16b,#4
+	ext	v19.16b,v19.16b,v19.16b,#4
+	ext	v23.16b,v23.16b,v23.16b,#4
+	ext	v1.16b,v1.16b,v1.16b,#12
+	ext	v5.16b,v5.16b,v5.16b,#12
+	ext	v9.16b,v9.16b,v9.16b,#12
+	ext	v13.16b,v13.16b,v13.16b,#12
+	ext	v17.16b,v17.16b,v17.16b,#12
+	ext	v21.16b,v21.16b,v21.16b,#12
+	cbnz	x4,.Loop_lower_neon
+
+	add	w5,w5,w22		// accumulate key block
+	ldp	q24,q25,[sp,#0]
+	add	x6,x6,x22,lsr#32
+	ldp	q26,q27,[sp,#32]
+	add	w7,w7,w23
+	ldp	q28,q29,[sp,#64]
+	add	x8,x8,x23,lsr#32
+	add	v0.4s,v0.4s,v24.4s
+	add	w9,w9,w24
+	add	v4.4s,v4.4s,v24.4s
+	add	x10,x10,x24,lsr#32
+	add	v8.4s,v8.4s,v24.4s
+	add	w11,w11,w25
+	add	v12.4s,v12.4s,v24.4s
+	add	x12,x12,x25,lsr#32
+	add	v16.4s,v16.4s,v24.4s
+	add	w13,w13,w26
+	add	v20.4s,v20.4s,v24.4s
+	add	x14,x14,x26,lsr#32
+	add	v2.4s,v2.4s,v26.4s
+	add	w15,w15,w27
+	add	v6.4s,v6.4s,v26.4s
+	add	x16,x16,x27,lsr#32
+	add	v10.4s,v10.4s,v26.4s
+	add	w17,w17,w28
+	add	v14.4s,v14.4s,v26.4s
+	add	x19,x19,x28,lsr#32
+	add	v18.4s,v18.4s,v26.4s
+	add	w20,w20,w30
+	add	v22.4s,v22.4s,v26.4s
+	add	x21,x21,x30,lsr#32
+	add	v19.4s,v19.4s,v31.4s			// +4
+	add	x5,x5,x6,lsl#32	// pack
+	add	v23.4s,v23.4s,v31.4s			// +4
+	add	x7,x7,x8,lsl#32
+	add	v3.4s,v3.4s,v27.4s
+	ldp	x6,x8,[x1,#0]		// load input
+	add	v7.4s,v7.4s,v28.4s
+	add	x9,x9,x10,lsl#32
+	add	v11.4s,v11.4s,v29.4s
+	add	x11,x11,x12,lsl#32
+	add	v15.4s,v15.4s,v30.4s
+	ldp	x10,x12,[x1,#16]
+	add	v19.4s,v19.4s,v27.4s
+	add	x13,x13,x14,lsl#32
+	add	v23.4s,v23.4s,v28.4s
+	add	x15,x15,x16,lsl#32
+	add	v1.4s,v1.4s,v25.4s
+	ldp	x14,x16,[x1,#32]
+	add	v5.4s,v5.4s,v25.4s
+	add	x17,x17,x19,lsl#32
+	add	v9.4s,v9.4s,v25.4s
+	add	x20,x20,x21,lsl#32
+	add	v13.4s,v13.4s,v25.4s
+	ldp	x19,x21,[x1,#48]
+	add	v17.4s,v17.4s,v25.4s
+	add	x1,x1,#64
+	add	v21.4s,v21.4s,v25.4s
+
+#ifdef	__AARCH64EB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	v0.16b,v0.16b,v24.16b
+	eor	x15,x15,x16
+	eor	v1.16b,v1.16b,v25.16b
+	eor	x17,x17,x19
+	eor	v2.16b,v2.16b,v26.16b
+	eor	x20,x20,x21
+	eor	v3.16b,v3.16b,v27.16b
+	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#7			// increment counter
+	stp	x9,x11,[x0,#16]
+	stp	x13,x15,[x0,#32]
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+
+	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+	eor	v4.16b,v4.16b,v24.16b
+	eor	v5.16b,v5.16b,v25.16b
+	eor	v6.16b,v6.16b,v26.16b
+	eor	v7.16b,v7.16b,v27.16b
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+	eor	v8.16b,v8.16b,v0.16b
+	ldp	q24,q25,[sp,#0]
+	eor	v9.16b,v9.16b,v1.16b
+	ldp	q26,q27,[sp,#32]
+	eor	v10.16b,v10.16b,v2.16b
+	eor	v11.16b,v11.16b,v3.16b
+	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
+
+	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
+	eor	v12.16b,v12.16b,v4.16b
+	eor	v13.16b,v13.16b,v5.16b
+	eor	v14.16b,v14.16b,v6.16b
+	eor	v15.16b,v15.16b,v7.16b
+	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
+
+	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
+	eor	v16.16b,v16.16b,v8.16b
+	eor	v17.16b,v17.16b,v9.16b
+	eor	v18.16b,v18.16b,v10.16b
+	eor	v19.16b,v19.16b,v11.16b
+	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+	shl	v0.4s,v31.4s,#1			// 4 -> 8
+	eor	v20.16b,v20.16b,v12.16b
+	eor	v21.16b,v21.16b,v13.16b
+	eor	v22.16b,v22.16b,v14.16b
+	eor	v23.16b,v23.16b,v15.16b
+	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
+
+	add	v27.4s,v27.4s,v0.4s			// += 8
+	add	v28.4s,v28.4s,v0.4s
+	add	v29.4s,v29.4s,v0.4s
+	add	v30.4s,v30.4s,v0.4s
+
+	b.hs	.Loop_outer_512_neon
+
+	adds	x2,x2,#512
+	ushr	v0.4s,v31.4s,#2			// 4 -> 1
+
+	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
+	ldp	d10,d11,[sp,#128+16]
+	ldp	d12,d13,[sp,#128+32]
+	ldp	d14,d15,[sp,#128+48]
+
+	stp	q24,q31,[sp,#0]		// wipe off-load area
+	stp	q24,q31,[sp,#32]
+	stp	q24,q31,[sp,#64]
+
+	b.eq	.Ldone_512_neon
+
+	cmp	x2,#192
+	sub	v27.4s,v27.4s,v0.4s			// -= 1
+	sub	v28.4s,v28.4s,v0.4s
+	sub	v29.4s,v29.4s,v0.4s
+	add	sp,sp,#128
+	b.hs	.Loop_outer_neon
+
+	eor	v25.16b,v25.16b,v25.16b
+	eor	v26.16b,v26.16b,v26.16b
+	eor	v27.16b,v27.16b,v27.16b
+	eor	v28.16b,v28.16b,v28.16b
+	eor	v29.16b,v29.16b,v29.16b
+	eor	v30.16b,v30.16b,v30.16b
+	b	.Loop_outer
+
+.Ldone_512_neon:
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#128+64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ChaCha20_512_neon,.-ChaCha20_512_neon
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
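
Both entry points take the same arguments, visible in the register comments above (x0 = out, x1 = in, x2 = len, x3 = 256-bit key, x4 = counter/nonce block), and the NEON entry falls through to `ChaCha20_512_neon` for inputs of 512 bytes or more (`cmp x2,#512; b.hs .L512_or_more_neon`). A hedged sketch of a C-level dispatcher; the prototypes below are inferred from the register usage, and the real caller in BoringSSL's chacha code may gate the choice differently:

```c
#include <stddef.h>
#include <stdint.h>

// Assumed prototypes matching the assembly's register usage (x0..x4);
// the actual declarations live in BoringSSL's internal headers.
void ChaCha20_ctr32_nohw(uint8_t *out, const uint8_t *in, size_t in_len,
                         const uint32_t key[8], const uint32_t counter[4]);
void ChaCha20_ctr32_neon(uint8_t *out, const uint8_t *in, size_t in_len,
                         const uint32_t key[8], const uint32_t counter[4]);

int CRYPTO_is_NEON_capable(void);

// Hypothetical dispatcher: prefer the NEON path when the CPU supports
// it. The NEON entry itself selects its 512-byte bulk path internally.
static void chacha20_ctr32(uint8_t *out, const uint8_t *in, size_t in_len,
                           const uint32_t key[8],
                           const uint32_t counter[4]) {
  if (CRYPTO_is_NEON_capable()) {
    ChaCha20_ctr32_neon(out, in, in_len, key, counter);
  } else {
    ChaCha20_ctr32_nohw(out, in, in_len, key, counter);
  }
}
```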
diff --git a/gen/crypto/chacha-armv8-win.S b/gen/crypto/chacha-armv8-win.S
new file mode 100644
index 0000000..851ef4d
--- /dev/null
+++ b/gen/crypto/chacha-armv8-win.S
@@ -0,0 +1,1974 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+#include <openssl/arm_arch.h>
+
+.section	.rodata
+
+.align	5
+Lsigma:
+.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
+Lone:
+.long	1,0,0,0
+.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+
+.text
+
+.globl	ChaCha20_ctr32_nohw
+
+.def ChaCha20_ctr32_nohw
+   .type 32
+.endef
+.align	5
+ChaCha20_ctr32_nohw:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+
+	adrp	x5,Lsigma
+	add	x5,x5,:lo12:Lsigma
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#64
+
+	ldp	x22,x23,[x5]		// load sigma
+	ldp	x24,x25,[x3]		// load key
+	ldp	x26,x27,[x3,#16]
+	ldp	x28,x30,[x4]		// load counter
+#ifdef	__AARCH64EB__
+	ror	x24,x24,#32
+	ror	x25,x25,#32
+	ror	x26,x26,#32
+	ror	x27,x27,#32
+	ror	x28,x28,#32
+	ror	x30,x30,#32
+#endif
+
+Loop_outer:
+	mov	w5,w22			// unpack key block
+	lsr	x6,x22,#32
+	mov	w7,w23
+	lsr	x8,x23,#32
+	mov	w9,w24
+	lsr	x10,x24,#32
+	mov	w11,w25
+	lsr	x12,x25,#32
+	mov	w13,w26
+	lsr	x14,x26,#32
+	mov	w15,w27
+	lsr	x16,x27,#32
+	mov	w17,w28
+	lsr	x19,x28,#32
+	mov	w20,w30
+	lsr	x21,x30,#32
+
+	mov	x4,#10
+	subs	x2,x2,#64
+Loop:
+	sub	x4,x4,#1
+	add	w5,w5,w9
+	add	w6,w6,w10
+	add	w7,w7,w11
+	add	w8,w8,w12
+	eor	w17,w17,w5
+	eor	w19,w19,w6
+	eor	w20,w20,w7
+	eor	w21,w21,w8
+	ror	w17,w17,#16
+	ror	w19,w19,#16
+	ror	w20,w20,#16
+	ror	w21,w21,#16
+	add	w13,w13,w17
+	add	w14,w14,w19
+	add	w15,w15,w20
+	add	w16,w16,w21
+	eor	w9,w9,w13
+	eor	w10,w10,w14
+	eor	w11,w11,w15
+	eor	w12,w12,w16
+	ror	w9,w9,#20
+	ror	w10,w10,#20
+	ror	w11,w11,#20
+	ror	w12,w12,#20
+	add	w5,w5,w9
+	add	w6,w6,w10
+	add	w7,w7,w11
+	add	w8,w8,w12
+	eor	w17,w17,w5
+	eor	w19,w19,w6
+	eor	w20,w20,w7
+	eor	w21,w21,w8
+	ror	w17,w17,#24
+	ror	w19,w19,#24
+	ror	w20,w20,#24
+	ror	w21,w21,#24
+	add	w13,w13,w17
+	add	w14,w14,w19
+	add	w15,w15,w20
+	add	w16,w16,w21
+	eor	w9,w9,w13
+	eor	w10,w10,w14
+	eor	w11,w11,w15
+	eor	w12,w12,w16
+	ror	w9,w9,#25
+	ror	w10,w10,#25
+	ror	w11,w11,#25
+	ror	w12,w12,#25
+	add	w5,w5,w10
+	add	w6,w6,w11
+	add	w7,w7,w12
+	add	w8,w8,w9
+	eor	w21,w21,w5
+	eor	w17,w17,w6
+	eor	w19,w19,w7
+	eor	w20,w20,w8
+	ror	w21,w21,#16
+	ror	w17,w17,#16
+	ror	w19,w19,#16
+	ror	w20,w20,#16
+	add	w15,w15,w21
+	add	w16,w16,w17
+	add	w13,w13,w19
+	add	w14,w14,w20
+	eor	w10,w10,w15
+	eor	w11,w11,w16
+	eor	w12,w12,w13
+	eor	w9,w9,w14
+	ror	w10,w10,#20
+	ror	w11,w11,#20
+	ror	w12,w12,#20
+	ror	w9,w9,#20
+	add	w5,w5,w10
+	add	w6,w6,w11
+	add	w7,w7,w12
+	add	w8,w8,w9
+	eor	w21,w21,w5
+	eor	w17,w17,w6
+	eor	w19,w19,w7
+	eor	w20,w20,w8
+	ror	w21,w21,#24
+	ror	w17,w17,#24
+	ror	w19,w19,#24
+	ror	w20,w20,#24
+	add	w15,w15,w21
+	add	w16,w16,w17
+	add	w13,w13,w19
+	add	w14,w14,w20
+	eor	w10,w10,w15
+	eor	w11,w11,w16
+	eor	w12,w12,w13
+	eor	w9,w9,w14
+	ror	w10,w10,#25
+	ror	w11,w11,#25
+	ror	w12,w12,#25
+	ror	w9,w9,#25
+	cbnz	x4,Loop
+
+	add	w5,w5,w22		// accumulate key block
+	add	x6,x6,x22,lsr#32
+	add	w7,w7,w23
+	add	x8,x8,x23,lsr#32
+	add	w9,w9,w24
+	add	x10,x10,x24,lsr#32
+	add	w11,w11,w25
+	add	x12,x12,x25,lsr#32
+	add	w13,w13,w26
+	add	x14,x14,x26,lsr#32
+	add	w15,w15,w27
+	add	x16,x16,x27,lsr#32
+	add	w17,w17,w28
+	add	x19,x19,x28,lsr#32
+	add	w20,w20,w30
+	add	x21,x21,x30,lsr#32
+
+	b.lo	Ltail
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__AARCH64EB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	x15,x15,x16
+	eor	x17,x17,x19
+	eor	x20,x20,x21
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#1			// increment counter
+	stp	x9,x11,[x0,#16]
+	stp	x13,x15,[x0,#32]
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+
+	b.hi	Loop_outer
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.align	4
+Ltail:
+	add	x2,x2,#64
+Less_than_64:
+	sub	x0,x0,#1
+	add	x1,x1,x2
+	add	x0,x0,x2
+	add	x4,sp,x2
+	neg	x2,x2
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+#ifdef	__AARCH64EB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	stp	x5,x7,[sp,#0]
+	stp	x9,x11,[sp,#16]
+	stp	x13,x15,[sp,#32]
+	stp	x17,x20,[sp,#48]
+
+Loop_tail:
+	ldrb	w10,[x1,x2]
+	ldrb	w11,[x4,x2]
+	add	x2,x2,#1
+	eor	w10,w10,w11
+	strb	w10,[x0,x2]
+	cbnz	x2,Loop_tail
+
+	stp	xzr,xzr,[sp,#0]
+	stp	xzr,xzr,[sp,#16]
+	stp	xzr,xzr,[sp,#32]
+	stp	xzr,xzr,[sp,#48]
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+.globl	ChaCha20_ctr32_neon
+
+.def ChaCha20_ctr32_neon
+   .type 32
+.endef
+.align	5
+ChaCha20_ctr32_neon:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+
+	adrp	x5,Lsigma
+	add	x5,x5,:lo12:Lsigma
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	cmp	x2,#512
+	b.hs	L512_or_more_neon
+
+	sub	sp,sp,#64
+
+	ldp	x22,x23,[x5]		// load sigma
+	ld1	{v24.4s},[x5],#16
+	ldp	x24,x25,[x3]		// load key
+	ldp	x26,x27,[x3,#16]
+	ld1	{v25.4s,v26.4s},[x3]
+	ldp	x28,x30,[x4]		// load counter
+	ld1	{v27.4s},[x4]
+	ld1	{v31.4s},[x5]
+#ifdef	__AARCH64EB__
+	rev64	v24.4s,v24.4s
+	ror	x24,x24,#32
+	ror	x25,x25,#32
+	ror	x26,x26,#32
+	ror	x27,x27,#32
+	ror	x28,x28,#32
+	ror	x30,x30,#32
+#endif
+	add	v27.4s,v27.4s,v31.4s		// += 1
+	add	v28.4s,v27.4s,v31.4s
+	add	v29.4s,v28.4s,v31.4s
+	shl	v31.4s,v31.4s,#2			// 1 -> 4
+
+Loop_outer_neon:
+	mov	w5,w22			// unpack key block
+	lsr	x6,x22,#32
+	mov	v0.16b,v24.16b
+	mov	w7,w23
+	lsr	x8,x23,#32
+	mov	v4.16b,v24.16b
+	mov	w9,w24
+	lsr	x10,x24,#32
+	mov	v16.16b,v24.16b
+	mov	w11,w25
+	mov	v1.16b,v25.16b
+	lsr	x12,x25,#32
+	mov	v5.16b,v25.16b
+	mov	w13,w26
+	mov	v17.16b,v25.16b
+	lsr	x14,x26,#32
+	mov	v3.16b,v27.16b
+	mov	w15,w27
+	mov	v7.16b,v28.16b
+	lsr	x16,x27,#32
+	mov	v19.16b,v29.16b
+	mov	w17,w28
+	mov	v2.16b,v26.16b
+	lsr	x19,x28,#32
+	mov	v6.16b,v26.16b
+	mov	w20,w30
+	mov	v18.16b,v26.16b
+	lsr	x21,x30,#32
+
+	mov	x4,#10
+	subs	x2,x2,#256
+Loop_neon:
+	sub	x4,x4,#1
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v16.4s,v16.4s,v17.4s
+	add	w7,w7,w11
+	eor	v3.16b,v3.16b,v0.16b
+	add	w8,w8,w12
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w17,w17,w5
+	eor	v19.16b,v19.16b,v16.16b
+	eor	w19,w19,w6
+	rev32	v3.8h,v3.8h
+	eor	w20,w20,w7
+	rev32	v7.8h,v7.8h
+	eor	w21,w21,w8
+	rev32	v19.8h,v19.8h
+	ror	w17,w17,#16
+	add	v2.4s,v2.4s,v3.4s
+	ror	w19,w19,#16
+	add	v6.4s,v6.4s,v7.4s
+	ror	w20,w20,#16
+	add	v18.4s,v18.4s,v19.4s
+	ror	w21,w21,#16
+	eor	v20.16b,v1.16b,v2.16b
+	add	w13,w13,w17
+	eor	v21.16b,v5.16b,v6.16b
+	add	w14,w14,w19
+	eor	v22.16b,v17.16b,v18.16b
+	add	w15,w15,w20
+	ushr	v1.4s,v20.4s,#20
+	add	w16,w16,w21
+	ushr	v5.4s,v21.4s,#20
+	eor	w9,w9,w13
+	ushr	v17.4s,v22.4s,#20
+	eor	w10,w10,w14
+	sli	v1.4s,v20.4s,#12
+	eor	w11,w11,w15
+	sli	v5.4s,v21.4s,#12
+	eor	w12,w12,w16
+	sli	v17.4s,v22.4s,#12
+	ror	w9,w9,#20
+	add	v0.4s,v0.4s,v1.4s
+	ror	w10,w10,#20
+	add	v4.4s,v4.4s,v5.4s
+	ror	w11,w11,#20
+	add	v16.4s,v16.4s,v17.4s
+	ror	w12,w12,#20
+	eor	v20.16b,v3.16b,v0.16b
+	add	w5,w5,w9
+	eor	v21.16b,v7.16b,v4.16b
+	add	w6,w6,w10
+	eor	v22.16b,v19.16b,v16.16b
+	add	w7,w7,w11
+	ushr	v3.4s,v20.4s,#24
+	add	w8,w8,w12
+	ushr	v7.4s,v21.4s,#24
+	eor	w17,w17,w5
+	ushr	v19.4s,v22.4s,#24
+	eor	w19,w19,w6
+	sli	v3.4s,v20.4s,#8
+	eor	w20,w20,w7
+	sli	v7.4s,v21.4s,#8
+	eor	w21,w21,w8
+	sli	v19.4s,v22.4s,#8
+	ror	w17,w17,#24
+	add	v2.4s,v2.4s,v3.4s
+	ror	w19,w19,#24
+	add	v6.4s,v6.4s,v7.4s
+	ror	w20,w20,#24
+	add	v18.4s,v18.4s,v19.4s
+	ror	w21,w21,#24
+	eor	v20.16b,v1.16b,v2.16b
+	add	w13,w13,w17
+	eor	v21.16b,v5.16b,v6.16b
+	add	w14,w14,w19
+	eor	v22.16b,v17.16b,v18.16b
+	add	w15,w15,w20
+	ushr	v1.4s,v20.4s,#25
+	add	w16,w16,w21
+	ushr	v5.4s,v21.4s,#25
+	eor	w9,w9,w13
+	ushr	v17.4s,v22.4s,#25
+	eor	w10,w10,w14
+	sli	v1.4s,v20.4s,#7
+	eor	w11,w11,w15
+	sli	v5.4s,v21.4s,#7
+	eor	w12,w12,w16
+	sli	v17.4s,v22.4s,#7
+	ror	w9,w9,#25
+	ext	v2.16b,v2.16b,v2.16b,#8
+	ror	w10,w10,#25
+	ext	v6.16b,v6.16b,v6.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v3.16b,v3.16b,v3.16b,#12
+	ext	v7.16b,v7.16b,v7.16b,#12
+	ext	v19.16b,v19.16b,v19.16b,#12
+	ext	v1.16b,v1.16b,v1.16b,#4
+	ext	v5.16b,v5.16b,v5.16b,#4
+	ext	v17.16b,v17.16b,v17.16b,#4
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w10
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w11
+	add	v16.4s,v16.4s,v17.4s
+	add	w7,w7,w12
+	eor	v3.16b,v3.16b,v0.16b
+	add	w8,w8,w9
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w5
+	eor	v19.16b,v19.16b,v16.16b
+	eor	w17,w17,w6
+	rev32	v3.8h,v3.8h
+	eor	w19,w19,w7
+	rev32	v7.8h,v7.8h
+	eor	w20,w20,w8
+	rev32	v19.8h,v19.8h
+	ror	w21,w21,#16
+	add	v2.4s,v2.4s,v3.4s
+	ror	w17,w17,#16
+	add	v6.4s,v6.4s,v7.4s
+	ror	w19,w19,#16
+	add	v18.4s,v18.4s,v19.4s
+	ror	w20,w20,#16
+	eor	v20.16b,v1.16b,v2.16b
+	add	w15,w15,w21
+	eor	v21.16b,v5.16b,v6.16b
+	add	w16,w16,w17
+	eor	v22.16b,v17.16b,v18.16b
+	add	w13,w13,w19
+	ushr	v1.4s,v20.4s,#20
+	add	w14,w14,w20
+	ushr	v5.4s,v21.4s,#20
+	eor	w10,w10,w15
+	ushr	v17.4s,v22.4s,#20
+	eor	w11,w11,w16
+	sli	v1.4s,v20.4s,#12
+	eor	w12,w12,w13
+	sli	v5.4s,v21.4s,#12
+	eor	w9,w9,w14
+	sli	v17.4s,v22.4s,#12
+	ror	w10,w10,#20
+	add	v0.4s,v0.4s,v1.4s
+	ror	w11,w11,#20
+	add	v4.4s,v4.4s,v5.4s
+	ror	w12,w12,#20
+	add	v16.4s,v16.4s,v17.4s
+	ror	w9,w9,#20
+	eor	v20.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v21.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v22.16b,v19.16b,v16.16b
+	add	w7,w7,w12
+	ushr	v3.4s,v20.4s,#24
+	add	w8,w8,w9
+	ushr	v7.4s,v21.4s,#24
+	eor	w21,w21,w5
+	ushr	v19.4s,v22.4s,#24
+	eor	w17,w17,w6
+	sli	v3.4s,v20.4s,#8
+	eor	w19,w19,w7
+	sli	v7.4s,v21.4s,#8
+	eor	w20,w20,w8
+	sli	v19.4s,v22.4s,#8
+	ror	w21,w21,#24
+	add	v2.4s,v2.4s,v3.4s
+	ror	w17,w17,#24
+	add	v6.4s,v6.4s,v7.4s
+	ror	w19,w19,#24
+	add	v18.4s,v18.4s,v19.4s
+	ror	w20,w20,#24
+	eor	v20.16b,v1.16b,v2.16b
+	add	w15,w15,w21
+	eor	v21.16b,v5.16b,v6.16b
+	add	w16,w16,w17
+	eor	v22.16b,v17.16b,v18.16b
+	add	w13,w13,w19
+	ushr	v1.4s,v20.4s,#25
+	add	w14,w14,w20
+	ushr	v5.4s,v21.4s,#25
+	eor	w10,w10,w15
+	ushr	v17.4s,v22.4s,#25
+	eor	w11,w11,w16
+	sli	v1.4s,v20.4s,#7
+	eor	w12,w12,w13
+	sli	v5.4s,v21.4s,#7
+	eor	w9,w9,w14
+	sli	v17.4s,v22.4s,#7
+	ror	w10,w10,#25
+	ext	v2.16b,v2.16b,v2.16b,#8
+	ror	w11,w11,#25
+	ext	v6.16b,v6.16b,v6.16b,#8
+	ror	w12,w12,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#4
+	ext	v7.16b,v7.16b,v7.16b,#4
+	ext	v19.16b,v19.16b,v19.16b,#4
+	ext	v1.16b,v1.16b,v1.16b,#12
+	ext	v5.16b,v5.16b,v5.16b,#12
+	ext	v17.16b,v17.16b,v17.16b,#12
+	cbnz	x4,Loop_neon
+
+	add	w5,w5,w22		// accumulate key block
+	add	v0.4s,v0.4s,v24.4s
+	add	x6,x6,x22,lsr#32
+	add	v4.4s,v4.4s,v24.4s
+	add	w7,w7,w23
+	add	v16.4s,v16.4s,v24.4s
+	add	x8,x8,x23,lsr#32
+	add	v2.4s,v2.4s,v26.4s
+	add	w9,w9,w24
+	add	v6.4s,v6.4s,v26.4s
+	add	x10,x10,x24,lsr#32
+	add	v18.4s,v18.4s,v26.4s
+	add	w11,w11,w25
+	add	v3.4s,v3.4s,v27.4s
+	add	x12,x12,x25,lsr#32
+	add	w13,w13,w26
+	add	v7.4s,v7.4s,v28.4s
+	add	x14,x14,x26,lsr#32
+	add	w15,w15,w27
+	add	v19.4s,v19.4s,v29.4s
+	add	x16,x16,x27,lsr#32
+	add	w17,w17,w28
+	add	v1.4s,v1.4s,v25.4s
+	add	x19,x19,x28,lsr#32
+	add	w20,w20,w30
+	add	v5.4s,v5.4s,v25.4s
+	add	x21,x21,x30,lsr#32
+	add	v17.4s,v17.4s,v25.4s
+
+	b.lo	Ltail_neon
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__AARCH64EB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	v0.16b,v0.16b,v20.16b
+	eor	x15,x15,x16
+	eor	v1.16b,v1.16b,v21.16b
+	eor	x17,x17,x19
+	eor	v2.16b,v2.16b,v22.16b
+	eor	x20,x20,x21
+	eor	v3.16b,v3.16b,v23.16b
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#4			// increment counter
+	stp	x9,x11,[x0,#16]
+	add	v27.4s,v27.4s,v31.4s		// += 4
+	stp	x13,x15,[x0,#32]
+	add	v28.4s,v28.4s,v31.4s
+	stp	x17,x20,[x0,#48]
+	add	v29.4s,v29.4s,v31.4s
+	add	x0,x0,#64
+
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+
+	eor	v4.16b,v4.16b,v20.16b
+	eor	v5.16b,v5.16b,v21.16b
+	eor	v6.16b,v6.16b,v22.16b
+	eor	v7.16b,v7.16b,v23.16b
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+	eor	v16.16b,v16.16b,v0.16b
+	eor	v17.16b,v17.16b,v1.16b
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v19.16b,v19.16b,v3.16b
+	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+	b.hi	Loop_outer_neon
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+Ltail_neon:
+	add	x2,x2,#256
+	cmp	x2,#64
+	b.lo	Less_than_64
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__AARCH64EB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	x15,x15,x16
+	eor	x17,x17,x19
+	eor	x20,x20,x21
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#4			// increment counter
+	stp	x9,x11,[x0,#16]
+	stp	x13,x15,[x0,#32]
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+	b.eq	Ldone_neon
+	sub	x2,x2,#64
+	cmp	x2,#64
+	b.lo	Less_than_128
+
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+	eor	v0.16b,v0.16b,v20.16b
+	eor	v1.16b,v1.16b,v21.16b
+	eor	v2.16b,v2.16b,v22.16b
+	eor	v3.16b,v3.16b,v23.16b
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+	b.eq	Ldone_neon
+	sub	x2,x2,#64
+	cmp	x2,#64
+	b.lo	Less_than_192
+
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+	eor	v4.16b,v4.16b,v20.16b
+	eor	v5.16b,v5.16b,v21.16b
+	eor	v6.16b,v6.16b,v22.16b
+	eor	v7.16b,v7.16b,v23.16b
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+	b.eq	Ldone_neon
+	sub	x2,x2,#64
+
+	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
+	b	Last_neon
+
+Less_than_128:
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
+	b	Last_neon
+Less_than_192:
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
+	b	Last_neon
+
+.align	4
+Last_neon:
+	sub	x0,x0,#1
+	add	x1,x1,x2
+	add	x0,x0,x2
+	add	x4,sp,x2
+	neg	x2,x2
+
+Loop_tail_neon:
+	ldrb	w10,[x1,x2]
+	ldrb	w11,[x4,x2]
+	add	x2,x2,#1
+	eor	w10,w10,w11
+	strb	w10,[x0,x2]
+	cbnz	x2,Loop_tail_neon
+
+	stp	xzr,xzr,[sp,#0]
+	stp	xzr,xzr,[sp,#16]
+	stp	xzr,xzr,[sp,#32]
+	stp	xzr,xzr,[sp,#48]
+
+Ldone_neon:
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.def ChaCha20_512_neon
+   .type 32
+.endef
+.align	5
+ChaCha20_512_neon:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+
+	adrp	x5,Lsigma
+	add	x5,x5,:lo12:Lsigma
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+L512_or_more_neon:
+	sub	sp,sp,#128+64
+
+	ldp	x22,x23,[x5]		// load sigma
+	ld1	{v24.4s},[x5],#16
+	ldp	x24,x25,[x3]		// load key
+	ldp	x26,x27,[x3,#16]
+	ld1	{v25.4s,v26.4s},[x3]
+	ldp	x28,x30,[x4]		// load counter
+	ld1	{v27.4s},[x4]
+	ld1	{v31.4s},[x5]
+#ifdef	__AARCH64EB__
+	rev64	v24.4s,v24.4s
+	ror	x24,x24,#32
+	ror	x25,x25,#32
+	ror	x26,x26,#32
+	ror	x27,x27,#32
+	ror	x28,x28,#32
+	ror	x30,x30,#32
+#endif
+	add	v27.4s,v27.4s,v31.4s		// += 1
+	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
+	add	v27.4s,v27.4s,v31.4s		// not typo
+	str	q26,[sp,#32]
+	add	v28.4s,v27.4s,v31.4s
+	add	v29.4s,v28.4s,v31.4s
+	add	v30.4s,v29.4s,v31.4s
+	shl	v31.4s,v31.4s,#2			// 1 -> 4
+
+	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
+	stp	d10,d11,[sp,#128+16]
+	stp	d12,d13,[sp,#128+32]
+	stp	d14,d15,[sp,#128+48]
+
+	sub	x2,x2,#512			// not typo
+
+Loop_outer_512_neon:
+	mov	v0.16b,v24.16b
+	mov	v4.16b,v24.16b
+	mov	v8.16b,v24.16b
+	mov	v12.16b,v24.16b
+	mov	v16.16b,v24.16b
+	mov	v20.16b,v24.16b
+	mov	v1.16b,v25.16b
+	mov	w5,w22			// unpack key block
+	mov	v5.16b,v25.16b
+	lsr	x6,x22,#32
+	mov	v9.16b,v25.16b
+	mov	w7,w23
+	mov	v13.16b,v25.16b
+	lsr	x8,x23,#32
+	mov	v17.16b,v25.16b
+	mov	w9,w24
+	mov	v21.16b,v25.16b
+	lsr	x10,x24,#32
+	mov	v3.16b,v27.16b
+	mov	w11,w25
+	mov	v7.16b,v28.16b
+	lsr	x12,x25,#32
+	mov	v11.16b,v29.16b
+	mov	w13,w26
+	mov	v15.16b,v30.16b
+	lsr	x14,x26,#32
+	mov	v2.16b,v26.16b
+	mov	w15,w27
+	mov	v6.16b,v26.16b
+	lsr	x16,x27,#32
+	add	v19.4s,v3.4s,v31.4s			// +4
+	mov	w17,w28
+	add	v23.4s,v7.4s,v31.4s			// +4
+	lsr	x19,x28,#32
+	mov	v10.16b,v26.16b
+	mov	w20,w30
+	mov	v14.16b,v26.16b
+	lsr	x21,x30,#32
+	mov	v18.16b,v26.16b
+	stp	q27,q28,[sp,#48]		// off-load key block, variable part
+	mov	v22.16b,v26.16b
+	str	q29,[sp,#80]
+
+	mov	x4,#5
+	subs	x2,x2,#512
+Loop_upper_neon:
+	sub	x4,x4,#1
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#12
+	ext	v7.16b,v7.16b,v7.16b,#12
+	ext	v11.16b,v11.16b,v11.16b,#12
+	ext	v15.16b,v15.16b,v15.16b,#12
+	ext	v19.16b,v19.16b,v19.16b,#12
+	ext	v23.16b,v23.16b,v23.16b,#12
+	ext	v1.16b,v1.16b,v1.16b,#4
+	ext	v5.16b,v5.16b,v5.16b,#4
+	ext	v9.16b,v9.16b,v9.16b,#4
+	ext	v13.16b,v13.16b,v13.16b,#4
+	ext	v17.16b,v17.16b,v17.16b,#4
+	ext	v21.16b,v21.16b,v21.16b,#4
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#4
+	ext	v7.16b,v7.16b,v7.16b,#4
+	ext	v11.16b,v11.16b,v11.16b,#4
+	ext	v15.16b,v15.16b,v15.16b,#4
+	ext	v19.16b,v19.16b,v19.16b,#4
+	ext	v23.16b,v23.16b,v23.16b,#4
+	ext	v1.16b,v1.16b,v1.16b,#12
+	ext	v5.16b,v5.16b,v5.16b,#12
+	ext	v9.16b,v9.16b,v9.16b,#12
+	ext	v13.16b,v13.16b,v13.16b,#12
+	ext	v17.16b,v17.16b,v17.16b,#12
+	ext	v21.16b,v21.16b,v21.16b,#12
+	cbnz	x4,Loop_upper_neon
+
+	add	w5,w5,w22		// accumulate key block
+	add	x6,x6,x22,lsr#32
+	add	w7,w7,w23
+	add	x8,x8,x23,lsr#32
+	add	w9,w9,w24
+	add	x10,x10,x24,lsr#32
+	add	w11,w11,w25
+	add	x12,x12,x25,lsr#32
+	add	w13,w13,w26
+	add	x14,x14,x26,lsr#32
+	add	w15,w15,w27
+	add	x16,x16,x27,lsr#32
+	add	w17,w17,w28
+	add	x19,x19,x28,lsr#32
+	add	w20,w20,w30
+	add	x21,x21,x30,lsr#32
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__AARCH64EB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	x15,x15,x16
+	eor	x17,x17,x19
+	eor	x20,x20,x21
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#1			// increment counter
+	mov	w5,w22			// unpack key block
+	lsr	x6,x22,#32
+	stp	x9,x11,[x0,#16]
+	mov	w7,w23
+	lsr	x8,x23,#32
+	stp	x13,x15,[x0,#32]
+	mov	w9,w24
+	lsr	x10,x24,#32
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+	mov	w11,w25
+	lsr	x12,x25,#32
+	mov	w13,w26
+	lsr	x14,x26,#32
+	mov	w15,w27
+	lsr	x16,x27,#32
+	mov	w17,w28
+	lsr	x19,x28,#32
+	mov	w20,w30
+	lsr	x21,x30,#32
+
+	mov	x4,#5
+Loop_lower_neon:
+	sub	x4,x4,#1
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#12
+	ext	v7.16b,v7.16b,v7.16b,#12
+	ext	v11.16b,v11.16b,v11.16b,#12
+	ext	v15.16b,v15.16b,v15.16b,#12
+	ext	v19.16b,v19.16b,v19.16b,#12
+	ext	v23.16b,v23.16b,v23.16b,#12
+	ext	v1.16b,v1.16b,v1.16b,#4
+	ext	v5.16b,v5.16b,v5.16b,#4
+	ext	v9.16b,v9.16b,v9.16b,#4
+	ext	v13.16b,v13.16b,v13.16b,#4
+	ext	v17.16b,v17.16b,v17.16b,#4
+	ext	v21.16b,v21.16b,v21.16b,#4
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#4
+	ext	v7.16b,v7.16b,v7.16b,#4
+	ext	v11.16b,v11.16b,v11.16b,#4
+	ext	v15.16b,v15.16b,v15.16b,#4
+	ext	v19.16b,v19.16b,v19.16b,#4
+	ext	v23.16b,v23.16b,v23.16b,#4
+	ext	v1.16b,v1.16b,v1.16b,#12
+	ext	v5.16b,v5.16b,v5.16b,#12
+	ext	v9.16b,v9.16b,v9.16b,#12
+	ext	v13.16b,v13.16b,v13.16b,#12
+	ext	v17.16b,v17.16b,v17.16b,#12
+	ext	v21.16b,v21.16b,v21.16b,#12
+	cbnz	x4,Loop_lower_neon
+
+	add	w5,w5,w22		// accumulate key block
+	ldp	q24,q25,[sp,#0]
+	add	x6,x6,x22,lsr#32
+	ldp	q26,q27,[sp,#32]
+	add	w7,w7,w23
+	ldp	q28,q29,[sp,#64]
+	add	x8,x8,x23,lsr#32
+	add	v0.4s,v0.4s,v24.4s
+	add	w9,w9,w24
+	add	v4.4s,v4.4s,v24.4s
+	add	x10,x10,x24,lsr#32
+	add	v8.4s,v8.4s,v24.4s
+	add	w11,w11,w25
+	add	v12.4s,v12.4s,v24.4s
+	add	x12,x12,x25,lsr#32
+	add	v16.4s,v16.4s,v24.4s
+	add	w13,w13,w26
+	add	v20.4s,v20.4s,v24.4s
+	add	x14,x14,x26,lsr#32
+	add	v2.4s,v2.4s,v26.4s
+	add	w15,w15,w27
+	add	v6.4s,v6.4s,v26.4s
+	add	x16,x16,x27,lsr#32
+	add	v10.4s,v10.4s,v26.4s
+	add	w17,w17,w28
+	add	v14.4s,v14.4s,v26.4s
+	add	x19,x19,x28,lsr#32
+	add	v18.4s,v18.4s,v26.4s
+	add	w20,w20,w30
+	add	v22.4s,v22.4s,v26.4s
+	add	x21,x21,x30,lsr#32
+	add	v19.4s,v19.4s,v31.4s			// +4
+	add	x5,x5,x6,lsl#32	// pack
+	add	v23.4s,v23.4s,v31.4s			// +4
+	add	x7,x7,x8,lsl#32
+	add	v3.4s,v3.4s,v27.4s
+	ldp	x6,x8,[x1,#0]		// load input
+	add	v7.4s,v7.4s,v28.4s
+	add	x9,x9,x10,lsl#32
+	add	v11.4s,v11.4s,v29.4s
+	add	x11,x11,x12,lsl#32
+	add	v15.4s,v15.4s,v30.4s
+	ldp	x10,x12,[x1,#16]
+	add	v19.4s,v19.4s,v27.4s
+	add	x13,x13,x14,lsl#32
+	add	v23.4s,v23.4s,v28.4s
+	add	x15,x15,x16,lsl#32
+	add	v1.4s,v1.4s,v25.4s
+	ldp	x14,x16,[x1,#32]
+	add	v5.4s,v5.4s,v25.4s
+	add	x17,x17,x19,lsl#32
+	add	v9.4s,v9.4s,v25.4s
+	add	x20,x20,x21,lsl#32
+	add	v13.4s,v13.4s,v25.4s
+	ldp	x19,x21,[x1,#48]
+	add	v17.4s,v17.4s,v25.4s
+	add	x1,x1,#64
+	add	v21.4s,v21.4s,v25.4s
+
+#ifdef	__AARCH64EB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	v0.16b,v0.16b,v24.16b
+	eor	x15,x15,x16
+	eor	v1.16b,v1.16b,v25.16b
+	eor	x17,x17,x19
+	eor	v2.16b,v2.16b,v26.16b
+	eor	x20,x20,x21
+	eor	v3.16b,v3.16b,v27.16b
+	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#7			// increment counter
+	stp	x9,x11,[x0,#16]
+	stp	x13,x15,[x0,#32]
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+
+	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+	eor	v4.16b,v4.16b,v24.16b
+	eor	v5.16b,v5.16b,v25.16b
+	eor	v6.16b,v6.16b,v26.16b
+	eor	v7.16b,v7.16b,v27.16b
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+	eor	v8.16b,v8.16b,v0.16b
+	ldp	q24,q25,[sp,#0]
+	eor	v9.16b,v9.16b,v1.16b
+	ldp	q26,q27,[sp,#32]
+	eor	v10.16b,v10.16b,v2.16b
+	eor	v11.16b,v11.16b,v3.16b
+	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
+
+	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
+	eor	v12.16b,v12.16b,v4.16b
+	eor	v13.16b,v13.16b,v5.16b
+	eor	v14.16b,v14.16b,v6.16b
+	eor	v15.16b,v15.16b,v7.16b
+	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
+
+	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
+	eor	v16.16b,v16.16b,v8.16b
+	eor	v17.16b,v17.16b,v9.16b
+	eor	v18.16b,v18.16b,v10.16b
+	eor	v19.16b,v19.16b,v11.16b
+	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+	shl	v0.4s,v31.4s,#1			// 4 -> 8
+	eor	v20.16b,v20.16b,v12.16b
+	eor	v21.16b,v21.16b,v13.16b
+	eor	v22.16b,v22.16b,v14.16b
+	eor	v23.16b,v23.16b,v15.16b
+	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
+
+	add	v27.4s,v27.4s,v0.4s			// += 8
+	add	v28.4s,v28.4s,v0.4s
+	add	v29.4s,v29.4s,v0.4s
+	add	v30.4s,v30.4s,v0.4s
+
+	b.hs	Loop_outer_512_neon
+
+	adds	x2,x2,#512
+	ushr	v0.4s,v31.4s,#2			// 4 -> 1
+
+	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
+	ldp	d10,d11,[sp,#128+16]
+	ldp	d12,d13,[sp,#128+32]
+	ldp	d14,d15,[sp,#128+48]
+
+	stp	q24,q31,[sp,#0]		// wipe off-load area
+	stp	q24,q31,[sp,#32]
+	stp	q24,q31,[sp,#64]
+
+	b.eq	Ldone_512_neon
+
+	cmp	x2,#192
+	sub	v27.4s,v27.4s,v0.4s			// -= 1
+	sub	v28.4s,v28.4s,v0.4s
+	sub	v29.4s,v29.4s,v0.4s
+	add	sp,sp,#128
+	b.hs	Loop_outer_neon
+
+	eor	v25.16b,v25.16b,v25.16b
+	eor	v26.16b,v26.16b,v26.16b
+	eor	v27.16b,v27.16b,v27.16b
+	eor	v28.16b,v28.16b,v28.16b
+	eor	v29.16b,v29.16b,v29.16b
+	eor	v30.16b,v30.16b,v30.16b
+	b	Loop_outer
+
+Ldone_512_neon:
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#128+64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/crypto/chacha-x86-apple.S b/gen/crypto/chacha-x86-apple.S
new file mode 100644
index 0000000..48293da
--- /dev/null
+++ b/gen/crypto/chacha-x86-apple.S
@@ -0,0 +1,957 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
+.text
+.globl	_ChaCha20_ctr32_nohw
+.private_extern	_ChaCha20_ctr32_nohw
+.align	4
+_ChaCha20_ctr32_nohw:
+L_ChaCha20_ctr32_nohw_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	32(%esp),%esi
+	movl	36(%esp),%edi
+	subl	$132,%esp
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	8(%esi),%ecx
+	movl	12(%esi),%edx
+	movl	%eax,80(%esp)
+	movl	%ebx,84(%esp)
+	movl	%ecx,88(%esp)
+	movl	%edx,92(%esp)
+	movl	16(%esi),%eax
+	movl	20(%esi),%ebx
+	movl	24(%esi),%ecx
+	movl	28(%esi),%edx
+	movl	%eax,96(%esp)
+	movl	%ebx,100(%esp)
+	movl	%ecx,104(%esp)
+	movl	%edx,108(%esp)
+	movl	(%edi),%eax
+	movl	4(%edi),%ebx
+	movl	8(%edi),%ecx
+	movl	12(%edi),%edx
+	subl	$1,%eax
+	movl	%eax,112(%esp)
+	movl	%ebx,116(%esp)
+	movl	%ecx,120(%esp)
+	movl	%edx,124(%esp)
+	jmp	L000entry
+.align	4,0x90
+L001outer_loop:
+	movl	%ebx,156(%esp)
+	movl	%eax,152(%esp)
+	movl	%ecx,160(%esp)
+L000entry:
+	movl	$1634760805,%eax
+	movl	$857760878,4(%esp)
+	movl	$2036477234,8(%esp)
+	movl	$1797285236,12(%esp)
+	movl	84(%esp),%ebx
+	movl	88(%esp),%ebp
+	movl	104(%esp),%ecx
+	movl	108(%esp),%esi
+	movl	116(%esp),%edx
+	movl	120(%esp),%edi
+	movl	%ebx,20(%esp)
+	movl	%ebp,24(%esp)
+	movl	%ecx,40(%esp)
+	movl	%esi,44(%esp)
+	movl	%edx,52(%esp)
+	movl	%edi,56(%esp)
+	movl	92(%esp),%ebx
+	movl	124(%esp),%edi
+	movl	112(%esp),%edx
+	movl	80(%esp),%ebp
+	movl	96(%esp),%ecx
+	movl	100(%esp),%esi
+	addl	$1,%edx
+	movl	%ebx,28(%esp)
+	movl	%edi,60(%esp)
+	movl	%edx,112(%esp)
+	movl	$10,%ebx
+	jmp	L002loop
+.align	4,0x90
+L002loop:
+	addl	%ebp,%eax
+	movl	%ebx,128(%esp)
+	movl	%ebp,%ebx
+	xorl	%eax,%edx
+	roll	$16,%edx
+	addl	%edx,%ecx
+	xorl	%ecx,%ebx
+	movl	52(%esp),%edi
+	roll	$12,%ebx
+	movl	20(%esp),%ebp
+	addl	%ebx,%eax
+	xorl	%eax,%edx
+	movl	%eax,(%esp)
+	roll	$8,%edx
+	movl	4(%esp),%eax
+	addl	%edx,%ecx
+	movl	%edx,48(%esp)
+	xorl	%ecx,%ebx
+	addl	%ebp,%eax
+	roll	$7,%ebx
+	xorl	%eax,%edi
+	movl	%ecx,32(%esp)
+	roll	$16,%edi
+	movl	%ebx,16(%esp)
+	addl	%edi,%esi
+	movl	40(%esp),%ecx
+	xorl	%esi,%ebp
+	movl	56(%esp),%edx
+	roll	$12,%ebp
+	movl	24(%esp),%ebx
+	addl	%ebp,%eax
+	xorl	%eax,%edi
+	movl	%eax,4(%esp)
+	roll	$8,%edi
+	movl	8(%esp),%eax
+	addl	%edi,%esi
+	movl	%edi,52(%esp)
+	xorl	%esi,%ebp
+	addl	%ebx,%eax
+	roll	$7,%ebp
+	xorl	%eax,%edx
+	movl	%esi,36(%esp)
+	roll	$16,%edx
+	movl	%ebp,20(%esp)
+	addl	%edx,%ecx
+	movl	44(%esp),%esi
+	xorl	%ecx,%ebx
+	movl	60(%esp),%edi
+	roll	$12,%ebx
+	movl	28(%esp),%ebp
+	addl	%ebx,%eax
+	xorl	%eax,%edx
+	movl	%eax,8(%esp)
+	roll	$8,%edx
+	movl	12(%esp),%eax
+	addl	%edx,%ecx
+	movl	%edx,56(%esp)
+	xorl	%ecx,%ebx
+	addl	%ebp,%eax
+	roll	$7,%ebx
+	xorl	%eax,%edi
+	roll	$16,%edi
+	movl	%ebx,24(%esp)
+	addl	%edi,%esi
+	xorl	%esi,%ebp
+	roll	$12,%ebp
+	movl	20(%esp),%ebx
+	addl	%ebp,%eax
+	xorl	%eax,%edi
+	movl	%eax,12(%esp)
+	roll	$8,%edi
+	movl	(%esp),%eax
+	addl	%edi,%esi
+	movl	%edi,%edx
+	xorl	%esi,%ebp
+	addl	%ebx,%eax
+	roll	$7,%ebp
+	xorl	%eax,%edx
+	roll	$16,%edx
+	movl	%ebp,28(%esp)
+	addl	%edx,%ecx
+	xorl	%ecx,%ebx
+	movl	48(%esp),%edi
+	roll	$12,%ebx
+	movl	24(%esp),%ebp
+	addl	%ebx,%eax
+	xorl	%eax,%edx
+	movl	%eax,(%esp)
+	roll	$8,%edx
+	movl	4(%esp),%eax
+	addl	%edx,%ecx
+	movl	%edx,60(%esp)
+	xorl	%ecx,%ebx
+	addl	%ebp,%eax
+	roll	$7,%ebx
+	xorl	%eax,%edi
+	movl	%ecx,40(%esp)
+	roll	$16,%edi
+	movl	%ebx,20(%esp)
+	addl	%edi,%esi
+	movl	32(%esp),%ecx
+	xorl	%esi,%ebp
+	movl	52(%esp),%edx
+	roll	$12,%ebp
+	movl	28(%esp),%ebx
+	addl	%ebp,%eax
+	xorl	%eax,%edi
+	movl	%eax,4(%esp)
+	roll	$8,%edi
+	movl	8(%esp),%eax
+	addl	%edi,%esi
+	movl	%edi,48(%esp)
+	xorl	%esi,%ebp
+	addl	%ebx,%eax
+	roll	$7,%ebp
+	xorl	%eax,%edx
+	movl	%esi,44(%esp)
+	roll	$16,%edx
+	movl	%ebp,24(%esp)
+	addl	%edx,%ecx
+	movl	36(%esp),%esi
+	xorl	%ecx,%ebx
+	movl	56(%esp),%edi
+	roll	$12,%ebx
+	movl	16(%esp),%ebp
+	addl	%ebx,%eax
+	xorl	%eax,%edx
+	movl	%eax,8(%esp)
+	roll	$8,%edx
+	movl	12(%esp),%eax
+	addl	%edx,%ecx
+	movl	%edx,52(%esp)
+	xorl	%ecx,%ebx
+	addl	%ebp,%eax
+	roll	$7,%ebx
+	xorl	%eax,%edi
+	roll	$16,%edi
+	movl	%ebx,28(%esp)
+	addl	%edi,%esi
+	xorl	%esi,%ebp
+	movl	48(%esp),%edx
+	roll	$12,%ebp
+	movl	128(%esp),%ebx
+	addl	%ebp,%eax
+	xorl	%eax,%edi
+	movl	%eax,12(%esp)
+	roll	$8,%edi
+	movl	(%esp),%eax
+	addl	%edi,%esi
+	movl	%edi,56(%esp)
+	xorl	%esi,%ebp
+	roll	$7,%ebp
+	decl	%ebx
+	jnz	L002loop
+	movl	160(%esp),%ebx
+	addl	$1634760805,%eax
+	addl	80(%esp),%ebp
+	addl	96(%esp),%ecx
+	addl	100(%esp),%esi
+	cmpl	$64,%ebx
+	jb	L003tail
+	movl	156(%esp),%ebx
+	addl	112(%esp),%edx
+	addl	120(%esp),%edi
+	xorl	(%ebx),%eax
+	xorl	16(%ebx),%ebp
+	movl	%eax,(%esp)
+	movl	152(%esp),%eax
+	xorl	32(%ebx),%ecx
+	xorl	36(%ebx),%esi
+	xorl	48(%ebx),%edx
+	xorl	56(%ebx),%edi
+	movl	%ebp,16(%eax)
+	movl	%ecx,32(%eax)
+	movl	%esi,36(%eax)
+	movl	%edx,48(%eax)
+	movl	%edi,56(%eax)
+	movl	4(%esp),%ebp
+	movl	8(%esp),%ecx
+	movl	12(%esp),%esi
+	movl	20(%esp),%edx
+	movl	24(%esp),%edi
+	addl	$857760878,%ebp
+	addl	$2036477234,%ecx
+	addl	$1797285236,%esi
+	addl	84(%esp),%edx
+	addl	88(%esp),%edi
+	xorl	4(%ebx),%ebp
+	xorl	8(%ebx),%ecx
+	xorl	12(%ebx),%esi
+	xorl	20(%ebx),%edx
+	xorl	24(%ebx),%edi
+	movl	%ebp,4(%eax)
+	movl	%ecx,8(%eax)
+	movl	%esi,12(%eax)
+	movl	%edx,20(%eax)
+	movl	%edi,24(%eax)
+	movl	28(%esp),%ebp
+	movl	40(%esp),%ecx
+	movl	44(%esp),%esi
+	movl	52(%esp),%edx
+	movl	60(%esp),%edi
+	addl	92(%esp),%ebp
+	addl	104(%esp),%ecx
+	addl	108(%esp),%esi
+	addl	116(%esp),%edx
+	addl	124(%esp),%edi
+	xorl	28(%ebx),%ebp
+	xorl	40(%ebx),%ecx
+	xorl	44(%ebx),%esi
+	xorl	52(%ebx),%edx
+	xorl	60(%ebx),%edi
+	leal	64(%ebx),%ebx
+	movl	%ebp,28(%eax)
+	movl	(%esp),%ebp
+	movl	%ecx,40(%eax)
+	movl	160(%esp),%ecx
+	movl	%esi,44(%eax)
+	movl	%edx,52(%eax)
+	movl	%edi,60(%eax)
+	movl	%ebp,(%eax)
+	leal	64(%eax),%eax
+	subl	$64,%ecx
+	jnz	L001outer_loop
+	jmp	L004done
+L003tail:
+	addl	112(%esp),%edx
+	addl	120(%esp),%edi
+	movl	%eax,(%esp)
+	movl	%ebp,16(%esp)
+	movl	%ecx,32(%esp)
+	movl	%esi,36(%esp)
+	movl	%edx,48(%esp)
+	movl	%edi,56(%esp)
+	movl	4(%esp),%ebp
+	movl	8(%esp),%ecx
+	movl	12(%esp),%esi
+	movl	20(%esp),%edx
+	movl	24(%esp),%edi
+	addl	$857760878,%ebp
+	addl	$2036477234,%ecx
+	addl	$1797285236,%esi
+	addl	84(%esp),%edx
+	addl	88(%esp),%edi
+	movl	%ebp,4(%esp)
+	movl	%ecx,8(%esp)
+	movl	%esi,12(%esp)
+	movl	%edx,20(%esp)
+	movl	%edi,24(%esp)
+	movl	28(%esp),%ebp
+	movl	40(%esp),%ecx
+	movl	44(%esp),%esi
+	movl	52(%esp),%edx
+	movl	60(%esp),%edi
+	addl	92(%esp),%ebp
+	addl	104(%esp),%ecx
+	addl	108(%esp),%esi
+	addl	116(%esp),%edx
+	addl	124(%esp),%edi
+	movl	%ebp,28(%esp)
+	movl	156(%esp),%ebp
+	movl	%ecx,40(%esp)
+	movl	152(%esp),%ecx
+	movl	%esi,44(%esp)
+	xorl	%esi,%esi
+	movl	%edx,52(%esp)
+	movl	%edi,60(%esp)
+	xorl	%eax,%eax
+	xorl	%edx,%edx
+L005tail_loop:
+	movb	(%esi,%ebp,1),%al
+	movb	(%esp,%esi,1),%dl
+	leal	1(%esi),%esi
+	xorb	%dl,%al
+	movb	%al,-1(%ecx,%esi,1)
+	decl	%ebx
+	jnz	L005tail_loop
+L004done:
+	addl	$132,%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.globl	_ChaCha20_ctr32_ssse3
+.private_extern	_ChaCha20_ctr32_ssse3
+.align	4
+_ChaCha20_ctr32_ssse3:
+L_ChaCha20_ctr32_ssse3_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	call	Lpic_point
+Lpic_point:
+	popl	%eax
+	movl	20(%esp),%edi
+	movl	24(%esp),%esi
+	movl	28(%esp),%ecx
+	movl	32(%esp),%edx
+	movl	36(%esp),%ebx
+	movl	%esp,%ebp
+	subl	$524,%esp
+	andl	$-64,%esp
+	movl	%ebp,512(%esp)
+	leal	Lssse3_data-Lpic_point(%eax),%eax
+	movdqu	(%ebx),%xmm3
+	cmpl	$256,%ecx
+	jb	L0061x
+	movl	%edx,516(%esp)
+	movl	%ebx,520(%esp)
+	subl	$256,%ecx
+	leal	384(%esp),%ebp
+	movdqu	(%edx),%xmm7
+	pshufd	$0,%xmm3,%xmm0
+	pshufd	$85,%xmm3,%xmm1
+	pshufd	$170,%xmm3,%xmm2
+	pshufd	$255,%xmm3,%xmm3
+	paddd	48(%eax),%xmm0
+	pshufd	$0,%xmm7,%xmm4
+	pshufd	$85,%xmm7,%xmm5
+	psubd	64(%eax),%xmm0
+	pshufd	$170,%xmm7,%xmm6
+	pshufd	$255,%xmm7,%xmm7
+	movdqa	%xmm0,64(%ebp)
+	movdqa	%xmm1,80(%ebp)
+	movdqa	%xmm2,96(%ebp)
+	movdqa	%xmm3,112(%ebp)
+	movdqu	16(%edx),%xmm3
+	movdqa	%xmm4,-64(%ebp)
+	movdqa	%xmm5,-48(%ebp)
+	movdqa	%xmm6,-32(%ebp)
+	movdqa	%xmm7,-16(%ebp)
+	movdqa	32(%eax),%xmm7
+	leal	128(%esp),%ebx
+	pshufd	$0,%xmm3,%xmm0
+	pshufd	$85,%xmm3,%xmm1
+	pshufd	$170,%xmm3,%xmm2
+	pshufd	$255,%xmm3,%xmm3
+	pshufd	$0,%xmm7,%xmm4
+	pshufd	$85,%xmm7,%xmm5
+	pshufd	$170,%xmm7,%xmm6
+	pshufd	$255,%xmm7,%xmm7
+	movdqa	%xmm0,(%ebp)
+	movdqa	%xmm1,16(%ebp)
+	movdqa	%xmm2,32(%ebp)
+	movdqa	%xmm3,48(%ebp)
+	movdqa	%xmm4,-128(%ebp)
+	movdqa	%xmm5,-112(%ebp)
+	movdqa	%xmm6,-96(%ebp)
+	movdqa	%xmm7,-80(%ebp)
+	leal	128(%esi),%esi
+	leal	128(%edi),%edi
+	jmp	L007outer_loop
+.align	4,0x90
+L007outer_loop:
+	movdqa	-112(%ebp),%xmm1
+	movdqa	-96(%ebp),%xmm2
+	movdqa	-80(%ebp),%xmm3
+	movdqa	-48(%ebp),%xmm5
+	movdqa	-32(%ebp),%xmm6
+	movdqa	-16(%ebp),%xmm7
+	movdqa	%xmm1,-112(%ebx)
+	movdqa	%xmm2,-96(%ebx)
+	movdqa	%xmm3,-80(%ebx)
+	movdqa	%xmm5,-48(%ebx)
+	movdqa	%xmm6,-32(%ebx)
+	movdqa	%xmm7,-16(%ebx)
+	movdqa	32(%ebp),%xmm2
+	movdqa	48(%ebp),%xmm3
+	movdqa	64(%ebp),%xmm4
+	movdqa	80(%ebp),%xmm5
+	movdqa	96(%ebp),%xmm6
+	movdqa	112(%ebp),%xmm7
+	paddd	64(%eax),%xmm4
+	movdqa	%xmm2,32(%ebx)
+	movdqa	%xmm3,48(%ebx)
+	movdqa	%xmm4,64(%ebx)
+	movdqa	%xmm5,80(%ebx)
+	movdqa	%xmm6,96(%ebx)
+	movdqa	%xmm7,112(%ebx)
+	movdqa	%xmm4,64(%ebp)
+	movdqa	-128(%ebp),%xmm0
+	movdqa	%xmm4,%xmm6
+	movdqa	-64(%ebp),%xmm3
+	movdqa	(%ebp),%xmm4
+	movdqa	16(%ebp),%xmm5
+	movl	$10,%edx
+	nop
+.align	4,0x90
+L008loop:
+	paddd	%xmm3,%xmm0
+	movdqa	%xmm3,%xmm2
+	pxor	%xmm0,%xmm6
+	pshufb	(%eax),%xmm6
+	paddd	%xmm6,%xmm4
+	pxor	%xmm4,%xmm2
+	movdqa	-48(%ebx),%xmm3
+	movdqa	%xmm2,%xmm1
+	pslld	$12,%xmm2
+	psrld	$20,%xmm1
+	por	%xmm1,%xmm2
+	movdqa	-112(%ebx),%xmm1
+	paddd	%xmm2,%xmm0
+	movdqa	80(%ebx),%xmm7
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm0,-128(%ebx)
+	pshufb	16(%eax),%xmm6
+	paddd	%xmm6,%xmm4
+	movdqa	%xmm6,64(%ebx)
+	pxor	%xmm4,%xmm2
+	paddd	%xmm3,%xmm1
+	movdqa	%xmm2,%xmm0
+	pslld	$7,%xmm2
+	psrld	$25,%xmm0
+	pxor	%xmm1,%xmm7
+	por	%xmm0,%xmm2
+	movdqa	%xmm4,(%ebx)
+	pshufb	(%eax),%xmm7
+	movdqa	%xmm2,-64(%ebx)
+	paddd	%xmm7,%xmm5
+	movdqa	32(%ebx),%xmm4
+	pxor	%xmm5,%xmm3
+	movdqa	-32(%ebx),%xmm2
+	movdqa	%xmm3,%xmm0
+	pslld	$12,%xmm3
+	psrld	$20,%xmm0
+	por	%xmm0,%xmm3
+	movdqa	-96(%ebx),%xmm0
+	paddd	%xmm3,%xmm1
+	movdqa	96(%ebx),%xmm6
+	pxor	%xmm1,%xmm7
+	movdqa	%xmm1,-112(%ebx)
+	pshufb	16(%eax),%xmm7
+	paddd	%xmm7,%xmm5
+	movdqa	%xmm7,80(%ebx)
+	pxor	%xmm5,%xmm3
+	paddd	%xmm2,%xmm0
+	movdqa	%xmm3,%xmm1
+	pslld	$7,%xmm3
+	psrld	$25,%xmm1
+	pxor	%xmm0,%xmm6
+	por	%xmm1,%xmm3
+	movdqa	%xmm5,16(%ebx)
+	pshufb	(%eax),%xmm6
+	movdqa	%xmm3,-48(%ebx)
+	paddd	%xmm6,%xmm4
+	movdqa	48(%ebx),%xmm5
+	pxor	%xmm4,%xmm2
+	movdqa	-16(%ebx),%xmm3
+	movdqa	%xmm2,%xmm1
+	pslld	$12,%xmm2
+	psrld	$20,%xmm1
+	por	%xmm1,%xmm2
+	movdqa	-80(%ebx),%xmm1
+	paddd	%xmm2,%xmm0
+	movdqa	112(%ebx),%xmm7
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm0,-96(%ebx)
+	pshufb	16(%eax),%xmm6
+	paddd	%xmm6,%xmm4
+	movdqa	%xmm6,96(%ebx)
+	pxor	%xmm4,%xmm2
+	paddd	%xmm3,%xmm1
+	movdqa	%xmm2,%xmm0
+	pslld	$7,%xmm2
+	psrld	$25,%xmm0
+	pxor	%xmm1,%xmm7
+	por	%xmm0,%xmm2
+	pshufb	(%eax),%xmm7
+	movdqa	%xmm2,-32(%ebx)
+	paddd	%xmm7,%xmm5
+	pxor	%xmm5,%xmm3
+	movdqa	-48(%ebx),%xmm2
+	movdqa	%xmm3,%xmm0
+	pslld	$12,%xmm3
+	psrld	$20,%xmm0
+	por	%xmm0,%xmm3
+	movdqa	-128(%ebx),%xmm0
+	paddd	%xmm3,%xmm1
+	pxor	%xmm1,%xmm7
+	movdqa	%xmm1,-80(%ebx)
+	pshufb	16(%eax),%xmm7
+	paddd	%xmm7,%xmm5
+	movdqa	%xmm7,%xmm6
+	pxor	%xmm5,%xmm3
+	paddd	%xmm2,%xmm0
+	movdqa	%xmm3,%xmm1
+	pslld	$7,%xmm3
+	psrld	$25,%xmm1
+	pxor	%xmm0,%xmm6
+	por	%xmm1,%xmm3
+	pshufb	(%eax),%xmm6
+	movdqa	%xmm3,-16(%ebx)
+	paddd	%xmm6,%xmm4
+	pxor	%xmm4,%xmm2
+	movdqa	-32(%ebx),%xmm3
+	movdqa	%xmm2,%xmm1
+	pslld	$12,%xmm2
+	psrld	$20,%xmm1
+	por	%xmm1,%xmm2
+	movdqa	-112(%ebx),%xmm1
+	paddd	%xmm2,%xmm0
+	movdqa	64(%ebx),%xmm7
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm0,-128(%ebx)
+	pshufb	16(%eax),%xmm6
+	paddd	%xmm6,%xmm4
+	movdqa	%xmm6,112(%ebx)
+	pxor	%xmm4,%xmm2
+	paddd	%xmm3,%xmm1
+	movdqa	%xmm2,%xmm0
+	pslld	$7,%xmm2
+	psrld	$25,%xmm0
+	pxor	%xmm1,%xmm7
+	por	%xmm0,%xmm2
+	movdqa	%xmm4,32(%ebx)
+	pshufb	(%eax),%xmm7
+	movdqa	%xmm2,-48(%ebx)
+	paddd	%xmm7,%xmm5
+	movdqa	(%ebx),%xmm4
+	pxor	%xmm5,%xmm3
+	movdqa	-16(%ebx),%xmm2
+	movdqa	%xmm3,%xmm0
+	pslld	$12,%xmm3
+	psrld	$20,%xmm0
+	por	%xmm0,%xmm3
+	movdqa	-96(%ebx),%xmm0
+	paddd	%xmm3,%xmm1
+	movdqa	80(%ebx),%xmm6
+	pxor	%xmm1,%xmm7
+	movdqa	%xmm1,-112(%ebx)
+	pshufb	16(%eax),%xmm7
+	paddd	%xmm7,%xmm5
+	movdqa	%xmm7,64(%ebx)
+	pxor	%xmm5,%xmm3
+	paddd	%xmm2,%xmm0
+	movdqa	%xmm3,%xmm1
+	pslld	$7,%xmm3
+	psrld	$25,%xmm1
+	pxor	%xmm0,%xmm6
+	por	%xmm1,%xmm3
+	movdqa	%xmm5,48(%ebx)
+	pshufb	(%eax),%xmm6
+	movdqa	%xmm3,-32(%ebx)
+	paddd	%xmm6,%xmm4
+	movdqa	16(%ebx),%xmm5
+	pxor	%xmm4,%xmm2
+	movdqa	-64(%ebx),%xmm3
+	movdqa	%xmm2,%xmm1
+	pslld	$12,%xmm2
+	psrld	$20,%xmm1
+	por	%xmm1,%xmm2
+	movdqa	-80(%ebx),%xmm1
+	paddd	%xmm2,%xmm0
+	movdqa	96(%ebx),%xmm7
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm0,-96(%ebx)
+	pshufb	16(%eax),%xmm6
+	paddd	%xmm6,%xmm4
+	movdqa	%xmm6,80(%ebx)
+	pxor	%xmm4,%xmm2
+	paddd	%xmm3,%xmm1
+	movdqa	%xmm2,%xmm0
+	pslld	$7,%xmm2
+	psrld	$25,%xmm0
+	pxor	%xmm1,%xmm7
+	por	%xmm0,%xmm2
+	pshufb	(%eax),%xmm7
+	movdqa	%xmm2,-16(%ebx)
+	paddd	%xmm7,%xmm5
+	pxor	%xmm5,%xmm3
+	movdqa	%xmm3,%xmm0
+	pslld	$12,%xmm3
+	psrld	$20,%xmm0
+	por	%xmm0,%xmm3
+	movdqa	-128(%ebx),%xmm0
+	paddd	%xmm3,%xmm1
+	movdqa	64(%ebx),%xmm6
+	pxor	%xmm1,%xmm7
+	movdqa	%xmm1,-80(%ebx)
+	pshufb	16(%eax),%xmm7
+	paddd	%xmm7,%xmm5
+	movdqa	%xmm7,96(%ebx)
+	pxor	%xmm5,%xmm3
+	movdqa	%xmm3,%xmm1
+	pslld	$7,%xmm3
+	psrld	$25,%xmm1
+	por	%xmm1,%xmm3
+	decl	%edx
+	jnz	L008loop
+	movdqa	%xmm3,-64(%ebx)
+	movdqa	%xmm4,(%ebx)
+	movdqa	%xmm5,16(%ebx)
+	movdqa	%xmm6,64(%ebx)
+	movdqa	%xmm7,96(%ebx)
+	movdqa	-112(%ebx),%xmm1
+	movdqa	-96(%ebx),%xmm2
+	movdqa	-80(%ebx),%xmm3
+	paddd	-128(%ebp),%xmm0
+	paddd	-112(%ebp),%xmm1
+	paddd	-96(%ebp),%xmm2
+	paddd	-80(%ebp),%xmm3
+	movdqa	%xmm0,%xmm6
+	punpckldq	%xmm1,%xmm0
+	movdqa	%xmm2,%xmm7
+	punpckldq	%xmm3,%xmm2
+	punpckhdq	%xmm1,%xmm6
+	punpckhdq	%xmm3,%xmm7
+	movdqa	%xmm0,%xmm1
+	punpcklqdq	%xmm2,%xmm0
+	movdqa	%xmm6,%xmm3
+	punpcklqdq	%xmm7,%xmm6
+	punpckhqdq	%xmm2,%xmm1
+	punpckhqdq	%xmm7,%xmm3
+	movdqu	-128(%esi),%xmm4
+	movdqu	-64(%esi),%xmm5
+	movdqu	(%esi),%xmm2
+	movdqu	64(%esi),%xmm7
+	leal	16(%esi),%esi
+	pxor	%xmm0,%xmm4
+	movdqa	-64(%ebx),%xmm0
+	pxor	%xmm1,%xmm5
+	movdqa	-48(%ebx),%xmm1
+	pxor	%xmm2,%xmm6
+	movdqa	-32(%ebx),%xmm2
+	pxor	%xmm3,%xmm7
+	movdqa	-16(%ebx),%xmm3
+	movdqu	%xmm4,-128(%edi)
+	movdqu	%xmm5,-64(%edi)
+	movdqu	%xmm6,(%edi)
+	movdqu	%xmm7,64(%edi)
+	leal	16(%edi),%edi
+	paddd	-64(%ebp),%xmm0
+	paddd	-48(%ebp),%xmm1
+	paddd	-32(%ebp),%xmm2
+	paddd	-16(%ebp),%xmm3
+	movdqa	%xmm0,%xmm6
+	punpckldq	%xmm1,%xmm0
+	movdqa	%xmm2,%xmm7
+	punpckldq	%xmm3,%xmm2
+	punpckhdq	%xmm1,%xmm6
+	punpckhdq	%xmm3,%xmm7
+	movdqa	%xmm0,%xmm1
+	punpcklqdq	%xmm2,%xmm0
+	movdqa	%xmm6,%xmm3
+	punpcklqdq	%xmm7,%xmm6
+	punpckhqdq	%xmm2,%xmm1
+	punpckhqdq	%xmm7,%xmm3
+	movdqu	-128(%esi),%xmm4
+	movdqu	-64(%esi),%xmm5
+	movdqu	(%esi),%xmm2
+	movdqu	64(%esi),%xmm7
+	leal	16(%esi),%esi
+	pxor	%xmm0,%xmm4
+	movdqa	(%ebx),%xmm0
+	pxor	%xmm1,%xmm5
+	movdqa	16(%ebx),%xmm1
+	pxor	%xmm2,%xmm6
+	movdqa	32(%ebx),%xmm2
+	pxor	%xmm3,%xmm7
+	movdqa	48(%ebx),%xmm3
+	movdqu	%xmm4,-128(%edi)
+	movdqu	%xmm5,-64(%edi)
+	movdqu	%xmm6,(%edi)
+	movdqu	%xmm7,64(%edi)
+	leal	16(%edi),%edi
+	paddd	(%ebp),%xmm0
+	paddd	16(%ebp),%xmm1
+	paddd	32(%ebp),%xmm2
+	paddd	48(%ebp),%xmm3
+	movdqa	%xmm0,%xmm6
+	punpckldq	%xmm1,%xmm0
+	movdqa	%xmm2,%xmm7
+	punpckldq	%xmm3,%xmm2
+	punpckhdq	%xmm1,%xmm6
+	punpckhdq	%xmm3,%xmm7
+	movdqa	%xmm0,%xmm1
+	punpcklqdq	%xmm2,%xmm0
+	movdqa	%xmm6,%xmm3
+	punpcklqdq	%xmm7,%xmm6
+	punpckhqdq	%xmm2,%xmm1
+	punpckhqdq	%xmm7,%xmm3
+	movdqu	-128(%esi),%xmm4
+	movdqu	-64(%esi),%xmm5
+	movdqu	(%esi),%xmm2
+	movdqu	64(%esi),%xmm7
+	leal	16(%esi),%esi
+	pxor	%xmm0,%xmm4
+	movdqa	64(%ebx),%xmm0
+	pxor	%xmm1,%xmm5
+	movdqa	80(%ebx),%xmm1
+	pxor	%xmm2,%xmm6
+	movdqa	96(%ebx),%xmm2
+	pxor	%xmm3,%xmm7
+	movdqa	112(%ebx),%xmm3
+	movdqu	%xmm4,-128(%edi)
+	movdqu	%xmm5,-64(%edi)
+	movdqu	%xmm6,(%edi)
+	movdqu	%xmm7,64(%edi)
+	leal	16(%edi),%edi
+	paddd	64(%ebp),%xmm0
+	paddd	80(%ebp),%xmm1
+	paddd	96(%ebp),%xmm2
+	paddd	112(%ebp),%xmm3
+	movdqa	%xmm0,%xmm6
+	punpckldq	%xmm1,%xmm0
+	movdqa	%xmm2,%xmm7
+	punpckldq	%xmm3,%xmm2
+	punpckhdq	%xmm1,%xmm6
+	punpckhdq	%xmm3,%xmm7
+	movdqa	%xmm0,%xmm1
+	punpcklqdq	%xmm2,%xmm0
+	movdqa	%xmm6,%xmm3
+	punpcklqdq	%xmm7,%xmm6
+	punpckhqdq	%xmm2,%xmm1
+	punpckhqdq	%xmm7,%xmm3
+	movdqu	-128(%esi),%xmm4
+	movdqu	-64(%esi),%xmm5
+	movdqu	(%esi),%xmm2
+	movdqu	64(%esi),%xmm7
+	leal	208(%esi),%esi
+	pxor	%xmm0,%xmm4
+	pxor	%xmm1,%xmm5
+	pxor	%xmm2,%xmm6
+	pxor	%xmm3,%xmm7
+	movdqu	%xmm4,-128(%edi)
+	movdqu	%xmm5,-64(%edi)
+	movdqu	%xmm6,(%edi)
+	movdqu	%xmm7,64(%edi)
+	leal	208(%edi),%edi
+	subl	$256,%ecx
+	jnc	L007outer_loop
+	addl	$256,%ecx
+	jz	L009done
+	movl	520(%esp),%ebx
+	leal	-128(%esi),%esi
+	movl	516(%esp),%edx
+	leal	-128(%edi),%edi
+	movd	64(%ebp),%xmm2
+	movdqu	(%ebx),%xmm3
+	paddd	96(%eax),%xmm2
+	pand	112(%eax),%xmm3
+	por	%xmm2,%xmm3
+L0061x:
+	movdqa	32(%eax),%xmm0
+	movdqu	(%edx),%xmm1
+	movdqu	16(%edx),%xmm2
+	movdqa	(%eax),%xmm6
+	movdqa	16(%eax),%xmm7
+	movl	%ebp,48(%esp)
+	movdqa	%xmm0,(%esp)
+	movdqa	%xmm1,16(%esp)
+	movdqa	%xmm2,32(%esp)
+	movdqa	%xmm3,48(%esp)
+	movl	$10,%edx
+	jmp	L010loop1x
+.align	4,0x90
+L011outer1x:
+	movdqa	80(%eax),%xmm3
+	movdqa	(%esp),%xmm0
+	movdqa	16(%esp),%xmm1
+	movdqa	32(%esp),%xmm2
+	paddd	48(%esp),%xmm3
+	movl	$10,%edx
+	movdqa	%xmm3,48(%esp)
+	jmp	L010loop1x
+.align	4,0x90
+L010loop1x:
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,222
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$20,%xmm1
+	pslld	$12,%xmm4
+	por	%xmm4,%xmm1
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,223
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$25,%xmm1
+	pslld	$7,%xmm4
+	por	%xmm4,%xmm1
+	pshufd	$78,%xmm2,%xmm2
+	pshufd	$57,%xmm1,%xmm1
+	pshufd	$147,%xmm3,%xmm3
+	nop
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,222
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$20,%xmm1
+	pslld	$12,%xmm4
+	por	%xmm4,%xmm1
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,223
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$25,%xmm1
+	pslld	$7,%xmm4
+	por	%xmm4,%xmm1
+	pshufd	$78,%xmm2,%xmm2
+	pshufd	$147,%xmm1,%xmm1
+	pshufd	$57,%xmm3,%xmm3
+	decl	%edx
+	jnz	L010loop1x
+	paddd	(%esp),%xmm0
+	paddd	16(%esp),%xmm1
+	paddd	32(%esp),%xmm2
+	paddd	48(%esp),%xmm3
+	cmpl	$64,%ecx
+	jb	L012tail
+	movdqu	(%esi),%xmm4
+	movdqu	16(%esi),%xmm5
+	pxor	%xmm4,%xmm0
+	movdqu	32(%esi),%xmm4
+	pxor	%xmm5,%xmm1
+	movdqu	48(%esi),%xmm5
+	pxor	%xmm4,%xmm2
+	pxor	%xmm5,%xmm3
+	leal	64(%esi),%esi
+	movdqu	%xmm0,(%edi)
+	movdqu	%xmm1,16(%edi)
+	movdqu	%xmm2,32(%edi)
+	movdqu	%xmm3,48(%edi)
+	leal	64(%edi),%edi
+	subl	$64,%ecx
+	jnz	L011outer1x
+	jmp	L009done
+L012tail:
+	movdqa	%xmm0,(%esp)
+	movdqa	%xmm1,16(%esp)
+	movdqa	%xmm2,32(%esp)
+	movdqa	%xmm3,48(%esp)
+	xorl	%eax,%eax
+	xorl	%edx,%edx
+	xorl	%ebp,%ebp
+L013tail_loop:
+	movb	(%esp,%ebp,1),%al
+	movb	(%esi,%ebp,1),%dl
+	leal	1(%ebp),%ebp
+	xorb	%dl,%al
+	movb	%al,-1(%edi,%ebp,1)
+	decl	%ecx
+	jnz	L013tail_loop
+L009done:
+	movl	512(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	6,0x90
+Lssse3_data:
+.byte	2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
+.byte	3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
+.long	1634760805,857760878,2036477234,1797285236
+.long	0,1,2,3
+.long	4,4,4,4
+.long	1,0,0,0
+.long	4,0,0,0
+.long	0,-1,-1,-1
+.align	6,0x90
+.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
+.byte	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
+.byte	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
+.byte	114,103,62,0
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/crypto/chacha-x86-linux.S b/gen/crypto/chacha-x86-linux.S
new file mode 100644
index 0000000..566fbb4
--- /dev/null
+++ b/gen/crypto/chacha-x86-linux.S
@@ -0,0 +1,961 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
+.text
+.globl	ChaCha20_ctr32_nohw
+.hidden	ChaCha20_ctr32_nohw
+.type	ChaCha20_ctr32_nohw,@function
+.align	16
+ChaCha20_ctr32_nohw:
+.L_ChaCha20_ctr32_nohw_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	32(%esp),%esi
+	movl	36(%esp),%edi
+	subl	$132,%esp
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	8(%esi),%ecx
+	movl	12(%esi),%edx
+	movl	%eax,80(%esp)
+	movl	%ebx,84(%esp)
+	movl	%ecx,88(%esp)
+	movl	%edx,92(%esp)
+	movl	16(%esi),%eax
+	movl	20(%esi),%ebx
+	movl	24(%esi),%ecx
+	movl	28(%esi),%edx
+	movl	%eax,96(%esp)
+	movl	%ebx,100(%esp)
+	movl	%ecx,104(%esp)
+	movl	%edx,108(%esp)
+	movl	(%edi),%eax
+	movl	4(%edi),%ebx
+	movl	8(%edi),%ecx
+	movl	12(%edi),%edx
+	subl	$1,%eax
+	movl	%eax,112(%esp)
+	movl	%ebx,116(%esp)
+	movl	%ecx,120(%esp)
+	movl	%edx,124(%esp)
+	jmp	.L000entry
+.align	16
+.L001outer_loop:
+	movl	%ebx,156(%esp)
+	movl	%eax,152(%esp)
+	movl	%ecx,160(%esp)
+.L000entry:
+	movl	$1634760805,%eax
+	movl	$857760878,4(%esp)
+	movl	$2036477234,8(%esp)
+	movl	$1797285236,12(%esp)
+	movl	84(%esp),%ebx
+	movl	88(%esp),%ebp
+	movl	104(%esp),%ecx
+	movl	108(%esp),%esi
+	movl	116(%esp),%edx
+	movl	120(%esp),%edi
+	movl	%ebx,20(%esp)
+	movl	%ebp,24(%esp)
+	movl	%ecx,40(%esp)
+	movl	%esi,44(%esp)
+	movl	%edx,52(%esp)
+	movl	%edi,56(%esp)
+	movl	92(%esp),%ebx
+	movl	124(%esp),%edi
+	movl	112(%esp),%edx
+	movl	80(%esp),%ebp
+	movl	96(%esp),%ecx
+	movl	100(%esp),%esi
+	addl	$1,%edx
+	movl	%ebx,28(%esp)
+	movl	%edi,60(%esp)
+	movl	%edx,112(%esp)
+	movl	$10,%ebx
+	jmp	.L002loop
+.align	16
+.L002loop:
+	addl	%ebp,%eax
+	movl	%ebx,128(%esp)
+	movl	%ebp,%ebx
+	xorl	%eax,%edx
+	roll	$16,%edx
+	addl	%edx,%ecx
+	xorl	%ecx,%ebx
+	movl	52(%esp),%edi
+	roll	$12,%ebx
+	movl	20(%esp),%ebp
+	addl	%ebx,%eax
+	xorl	%eax,%edx
+	movl	%eax,(%esp)
+	roll	$8,%edx
+	movl	4(%esp),%eax
+	addl	%edx,%ecx
+	movl	%edx,48(%esp)
+	xorl	%ecx,%ebx
+	addl	%ebp,%eax
+	roll	$7,%ebx
+	xorl	%eax,%edi
+	movl	%ecx,32(%esp)
+	roll	$16,%edi
+	movl	%ebx,16(%esp)
+	addl	%edi,%esi
+	movl	40(%esp),%ecx
+	xorl	%esi,%ebp
+	movl	56(%esp),%edx
+	roll	$12,%ebp
+	movl	24(%esp),%ebx
+	addl	%ebp,%eax
+	xorl	%eax,%edi
+	movl	%eax,4(%esp)
+	roll	$8,%edi
+	movl	8(%esp),%eax
+	addl	%edi,%esi
+	movl	%edi,52(%esp)
+	xorl	%esi,%ebp
+	addl	%ebx,%eax
+	roll	$7,%ebp
+	xorl	%eax,%edx
+	movl	%esi,36(%esp)
+	roll	$16,%edx
+	movl	%ebp,20(%esp)
+	addl	%edx,%ecx
+	movl	44(%esp),%esi
+	xorl	%ecx,%ebx
+	movl	60(%esp),%edi
+	roll	$12,%ebx
+	movl	28(%esp),%ebp
+	addl	%ebx,%eax
+	xorl	%eax,%edx
+	movl	%eax,8(%esp)
+	roll	$8,%edx
+	movl	12(%esp),%eax
+	addl	%edx,%ecx
+	movl	%edx,56(%esp)
+	xorl	%ecx,%ebx
+	addl	%ebp,%eax
+	roll	$7,%ebx
+	xorl	%eax,%edi
+	roll	$16,%edi
+	movl	%ebx,24(%esp)
+	addl	%edi,%esi
+	xorl	%esi,%ebp
+	roll	$12,%ebp
+	movl	20(%esp),%ebx
+	addl	%ebp,%eax
+	xorl	%eax,%edi
+	movl	%eax,12(%esp)
+	roll	$8,%edi
+	movl	(%esp),%eax
+	addl	%edi,%esi
+	movl	%edi,%edx
+	xorl	%esi,%ebp
+	addl	%ebx,%eax
+	roll	$7,%ebp
+	xorl	%eax,%edx
+	roll	$16,%edx
+	movl	%ebp,28(%esp)
+	addl	%edx,%ecx
+	xorl	%ecx,%ebx
+	movl	48(%esp),%edi
+	roll	$12,%ebx
+	movl	24(%esp),%ebp
+	addl	%ebx,%eax
+	xorl	%eax,%edx
+	movl	%eax,(%esp)
+	roll	$8,%edx
+	movl	4(%esp),%eax
+	addl	%edx,%ecx
+	movl	%edx,60(%esp)
+	xorl	%ecx,%ebx
+	addl	%ebp,%eax
+	roll	$7,%ebx
+	xorl	%eax,%edi
+	movl	%ecx,40(%esp)
+	roll	$16,%edi
+	movl	%ebx,20(%esp)
+	addl	%edi,%esi
+	movl	32(%esp),%ecx
+	xorl	%esi,%ebp
+	movl	52(%esp),%edx
+	roll	$12,%ebp
+	movl	28(%esp),%ebx
+	addl	%ebp,%eax
+	xorl	%eax,%edi
+	movl	%eax,4(%esp)
+	roll	$8,%edi
+	movl	8(%esp),%eax
+	addl	%edi,%esi
+	movl	%edi,48(%esp)
+	xorl	%esi,%ebp
+	addl	%ebx,%eax
+	roll	$7,%ebp
+	xorl	%eax,%edx
+	movl	%esi,44(%esp)
+	roll	$16,%edx
+	movl	%ebp,24(%esp)
+	addl	%edx,%ecx
+	movl	36(%esp),%esi
+	xorl	%ecx,%ebx
+	movl	56(%esp),%edi
+	roll	$12,%ebx
+	movl	16(%esp),%ebp
+	addl	%ebx,%eax
+	xorl	%eax,%edx
+	movl	%eax,8(%esp)
+	roll	$8,%edx
+	movl	12(%esp),%eax
+	addl	%edx,%ecx
+	movl	%edx,52(%esp)
+	xorl	%ecx,%ebx
+	addl	%ebp,%eax
+	roll	$7,%ebx
+	xorl	%eax,%edi
+	roll	$16,%edi
+	movl	%ebx,28(%esp)
+	addl	%edi,%esi
+	xorl	%esi,%ebp
+	movl	48(%esp),%edx
+	roll	$12,%ebp
+	movl	128(%esp),%ebx
+	addl	%ebp,%eax
+	xorl	%eax,%edi
+	movl	%eax,12(%esp)
+	roll	$8,%edi
+	movl	(%esp),%eax
+	addl	%edi,%esi
+	movl	%edi,56(%esp)
+	xorl	%esi,%ebp
+	roll	$7,%ebp
+	decl	%ebx
+	jnz	.L002loop
+	movl	160(%esp),%ebx
+	addl	$1634760805,%eax
+	addl	80(%esp),%ebp
+	addl	96(%esp),%ecx
+	addl	100(%esp),%esi
+	cmpl	$64,%ebx
+	jb	.L003tail
+	movl	156(%esp),%ebx
+	addl	112(%esp),%edx
+	addl	120(%esp),%edi
+	xorl	(%ebx),%eax
+	xorl	16(%ebx),%ebp
+	movl	%eax,(%esp)
+	movl	152(%esp),%eax
+	xorl	32(%ebx),%ecx
+	xorl	36(%ebx),%esi
+	xorl	48(%ebx),%edx
+	xorl	56(%ebx),%edi
+	movl	%ebp,16(%eax)
+	movl	%ecx,32(%eax)
+	movl	%esi,36(%eax)
+	movl	%edx,48(%eax)
+	movl	%edi,56(%eax)
+	movl	4(%esp),%ebp
+	movl	8(%esp),%ecx
+	movl	12(%esp),%esi
+	movl	20(%esp),%edx
+	movl	24(%esp),%edi
+	addl	$857760878,%ebp
+	addl	$2036477234,%ecx
+	addl	$1797285236,%esi
+	addl	84(%esp),%edx
+	addl	88(%esp),%edi
+	xorl	4(%ebx),%ebp
+	xorl	8(%ebx),%ecx
+	xorl	12(%ebx),%esi
+	xorl	20(%ebx),%edx
+	xorl	24(%ebx),%edi
+	movl	%ebp,4(%eax)
+	movl	%ecx,8(%eax)
+	movl	%esi,12(%eax)
+	movl	%edx,20(%eax)
+	movl	%edi,24(%eax)
+	movl	28(%esp),%ebp
+	movl	40(%esp),%ecx
+	movl	44(%esp),%esi
+	movl	52(%esp),%edx
+	movl	60(%esp),%edi
+	addl	92(%esp),%ebp
+	addl	104(%esp),%ecx
+	addl	108(%esp),%esi
+	addl	116(%esp),%edx
+	addl	124(%esp),%edi
+	xorl	28(%ebx),%ebp
+	xorl	40(%ebx),%ecx
+	xorl	44(%ebx),%esi
+	xorl	52(%ebx),%edx
+	xorl	60(%ebx),%edi
+	leal	64(%ebx),%ebx
+	movl	%ebp,28(%eax)
+	movl	(%esp),%ebp
+	movl	%ecx,40(%eax)
+	movl	160(%esp),%ecx
+	movl	%esi,44(%eax)
+	movl	%edx,52(%eax)
+	movl	%edi,60(%eax)
+	movl	%ebp,(%eax)
+	leal	64(%eax),%eax
+	subl	$64,%ecx
+	jnz	.L001outer_loop
+	jmp	.L004done
+.L003tail:
+	addl	112(%esp),%edx
+	addl	120(%esp),%edi
+	movl	%eax,(%esp)
+	movl	%ebp,16(%esp)
+	movl	%ecx,32(%esp)
+	movl	%esi,36(%esp)
+	movl	%edx,48(%esp)
+	movl	%edi,56(%esp)
+	movl	4(%esp),%ebp
+	movl	8(%esp),%ecx
+	movl	12(%esp),%esi
+	movl	20(%esp),%edx
+	movl	24(%esp),%edi
+	addl	$857760878,%ebp
+	addl	$2036477234,%ecx
+	addl	$1797285236,%esi
+	addl	84(%esp),%edx
+	addl	88(%esp),%edi
+	movl	%ebp,4(%esp)
+	movl	%ecx,8(%esp)
+	movl	%esi,12(%esp)
+	movl	%edx,20(%esp)
+	movl	%edi,24(%esp)
+	movl	28(%esp),%ebp
+	movl	40(%esp),%ecx
+	movl	44(%esp),%esi
+	movl	52(%esp),%edx
+	movl	60(%esp),%edi
+	addl	92(%esp),%ebp
+	addl	104(%esp),%ecx
+	addl	108(%esp),%esi
+	addl	116(%esp),%edx
+	addl	124(%esp),%edi
+	movl	%ebp,28(%esp)
+	movl	156(%esp),%ebp
+	movl	%ecx,40(%esp)
+	movl	152(%esp),%ecx
+	movl	%esi,44(%esp)
+	xorl	%esi,%esi
+	movl	%edx,52(%esp)
+	movl	%edi,60(%esp)
+	xorl	%eax,%eax
+	xorl	%edx,%edx
+.L005tail_loop:
+	movb	(%esi,%ebp,1),%al
+	movb	(%esp,%esi,1),%dl
+	leal	1(%esi),%esi
+	xorb	%dl,%al
+	movb	%al,-1(%ecx,%esi,1)
+	decl	%ebx
+	jnz	.L005tail_loop
+.L004done:
+	addl	$132,%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	ChaCha20_ctr32_nohw,.-.L_ChaCha20_ctr32_nohw_begin
+.globl	ChaCha20_ctr32_ssse3
+.hidden	ChaCha20_ctr32_ssse3
+.type	ChaCha20_ctr32_ssse3,@function
+.align	16
+ChaCha20_ctr32_ssse3:
+.L_ChaCha20_ctr32_ssse3_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	call	.Lpic_point
+.Lpic_point:
+	popl	%eax
+	movl	20(%esp),%edi
+	movl	24(%esp),%esi
+	movl	28(%esp),%ecx
+	movl	32(%esp),%edx
+	movl	36(%esp),%ebx
+	movl	%esp,%ebp
+	subl	$524,%esp
+	andl	$-64,%esp
+	movl	%ebp,512(%esp)
+	leal	.Lssse3_data-.Lpic_point(%eax),%eax
+	movdqu	(%ebx),%xmm3
+	cmpl	$256,%ecx
+	jb	.L0061x
+	movl	%edx,516(%esp)
+	movl	%ebx,520(%esp)
+	subl	$256,%ecx
+	leal	384(%esp),%ebp
+	movdqu	(%edx),%xmm7
+	pshufd	$0,%xmm3,%xmm0
+	pshufd	$85,%xmm3,%xmm1
+	pshufd	$170,%xmm3,%xmm2
+	pshufd	$255,%xmm3,%xmm3
+	paddd	48(%eax),%xmm0
+	pshufd	$0,%xmm7,%xmm4
+	pshufd	$85,%xmm7,%xmm5
+	psubd	64(%eax),%xmm0
+	pshufd	$170,%xmm7,%xmm6
+	pshufd	$255,%xmm7,%xmm7
+	movdqa	%xmm0,64(%ebp)
+	movdqa	%xmm1,80(%ebp)
+	movdqa	%xmm2,96(%ebp)
+	movdqa	%xmm3,112(%ebp)
+	movdqu	16(%edx),%xmm3
+	movdqa	%xmm4,-64(%ebp)
+	movdqa	%xmm5,-48(%ebp)
+	movdqa	%xmm6,-32(%ebp)
+	movdqa	%xmm7,-16(%ebp)
+	movdqa	32(%eax),%xmm7
+	leal	128(%esp),%ebx
+	pshufd	$0,%xmm3,%xmm0
+	pshufd	$85,%xmm3,%xmm1
+	pshufd	$170,%xmm3,%xmm2
+	pshufd	$255,%xmm3,%xmm3
+	pshufd	$0,%xmm7,%xmm4
+	pshufd	$85,%xmm7,%xmm5
+	pshufd	$170,%xmm7,%xmm6
+	pshufd	$255,%xmm7,%xmm7
+	movdqa	%xmm0,(%ebp)
+	movdqa	%xmm1,16(%ebp)
+	movdqa	%xmm2,32(%ebp)
+	movdqa	%xmm3,48(%ebp)
+	movdqa	%xmm4,-128(%ebp)
+	movdqa	%xmm5,-112(%ebp)
+	movdqa	%xmm6,-96(%ebp)
+	movdqa	%xmm7,-80(%ebp)
+	leal	128(%esi),%esi
+	leal	128(%edi),%edi
+	jmp	.L007outer_loop
+.align	16
+.L007outer_loop:
+	movdqa	-112(%ebp),%xmm1
+	movdqa	-96(%ebp),%xmm2
+	movdqa	-80(%ebp),%xmm3
+	movdqa	-48(%ebp),%xmm5
+	movdqa	-32(%ebp),%xmm6
+	movdqa	-16(%ebp),%xmm7
+	movdqa	%xmm1,-112(%ebx)
+	movdqa	%xmm2,-96(%ebx)
+	movdqa	%xmm3,-80(%ebx)
+	movdqa	%xmm5,-48(%ebx)
+	movdqa	%xmm6,-32(%ebx)
+	movdqa	%xmm7,-16(%ebx)
+	movdqa	32(%ebp),%xmm2
+	movdqa	48(%ebp),%xmm3
+	movdqa	64(%ebp),%xmm4
+	movdqa	80(%ebp),%xmm5
+	movdqa	96(%ebp),%xmm6
+	movdqa	112(%ebp),%xmm7
+	paddd	64(%eax),%xmm4
+	movdqa	%xmm2,32(%ebx)
+	movdqa	%xmm3,48(%ebx)
+	movdqa	%xmm4,64(%ebx)
+	movdqa	%xmm5,80(%ebx)
+	movdqa	%xmm6,96(%ebx)
+	movdqa	%xmm7,112(%ebx)
+	movdqa	%xmm4,64(%ebp)
+	movdqa	-128(%ebp),%xmm0
+	movdqa	%xmm4,%xmm6
+	movdqa	-64(%ebp),%xmm3
+	movdqa	(%ebp),%xmm4
+	movdqa	16(%ebp),%xmm5
+	movl	$10,%edx
+	nop
+.align	16
+.L008loop:
+	paddd	%xmm3,%xmm0
+	movdqa	%xmm3,%xmm2
+	pxor	%xmm0,%xmm6
+	pshufb	(%eax),%xmm6
+	paddd	%xmm6,%xmm4
+	pxor	%xmm4,%xmm2
+	movdqa	-48(%ebx),%xmm3
+	movdqa	%xmm2,%xmm1
+	pslld	$12,%xmm2
+	psrld	$20,%xmm1
+	por	%xmm1,%xmm2
+	movdqa	-112(%ebx),%xmm1
+	paddd	%xmm2,%xmm0
+	movdqa	80(%ebx),%xmm7
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm0,-128(%ebx)
+	pshufb	16(%eax),%xmm6
+	paddd	%xmm6,%xmm4
+	movdqa	%xmm6,64(%ebx)
+	pxor	%xmm4,%xmm2
+	paddd	%xmm3,%xmm1
+	movdqa	%xmm2,%xmm0
+	pslld	$7,%xmm2
+	psrld	$25,%xmm0
+	pxor	%xmm1,%xmm7
+	por	%xmm0,%xmm2
+	movdqa	%xmm4,(%ebx)
+	pshufb	(%eax),%xmm7
+	movdqa	%xmm2,-64(%ebx)
+	paddd	%xmm7,%xmm5
+	movdqa	32(%ebx),%xmm4
+	pxor	%xmm5,%xmm3
+	movdqa	-32(%ebx),%xmm2
+	movdqa	%xmm3,%xmm0
+	pslld	$12,%xmm3
+	psrld	$20,%xmm0
+	por	%xmm0,%xmm3
+	movdqa	-96(%ebx),%xmm0
+	paddd	%xmm3,%xmm1
+	movdqa	96(%ebx),%xmm6
+	pxor	%xmm1,%xmm7
+	movdqa	%xmm1,-112(%ebx)
+	pshufb	16(%eax),%xmm7
+	paddd	%xmm7,%xmm5
+	movdqa	%xmm7,80(%ebx)
+	pxor	%xmm5,%xmm3
+	paddd	%xmm2,%xmm0
+	movdqa	%xmm3,%xmm1
+	pslld	$7,%xmm3
+	psrld	$25,%xmm1
+	pxor	%xmm0,%xmm6
+	por	%xmm1,%xmm3
+	movdqa	%xmm5,16(%ebx)
+	pshufb	(%eax),%xmm6
+	movdqa	%xmm3,-48(%ebx)
+	paddd	%xmm6,%xmm4
+	movdqa	48(%ebx),%xmm5
+	pxor	%xmm4,%xmm2
+	movdqa	-16(%ebx),%xmm3
+	movdqa	%xmm2,%xmm1
+	pslld	$12,%xmm2
+	psrld	$20,%xmm1
+	por	%xmm1,%xmm2
+	movdqa	-80(%ebx),%xmm1
+	paddd	%xmm2,%xmm0
+	movdqa	112(%ebx),%xmm7
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm0,-96(%ebx)
+	pshufb	16(%eax),%xmm6
+	paddd	%xmm6,%xmm4
+	movdqa	%xmm6,96(%ebx)
+	pxor	%xmm4,%xmm2
+	paddd	%xmm3,%xmm1
+	movdqa	%xmm2,%xmm0
+	pslld	$7,%xmm2
+	psrld	$25,%xmm0
+	pxor	%xmm1,%xmm7
+	por	%xmm0,%xmm2
+	pshufb	(%eax),%xmm7
+	movdqa	%xmm2,-32(%ebx)
+	paddd	%xmm7,%xmm5
+	pxor	%xmm5,%xmm3
+	movdqa	-48(%ebx),%xmm2
+	movdqa	%xmm3,%xmm0
+	pslld	$12,%xmm3
+	psrld	$20,%xmm0
+	por	%xmm0,%xmm3
+	movdqa	-128(%ebx),%xmm0
+	paddd	%xmm3,%xmm1
+	pxor	%xmm1,%xmm7
+	movdqa	%xmm1,-80(%ebx)
+	pshufb	16(%eax),%xmm7
+	paddd	%xmm7,%xmm5
+	movdqa	%xmm7,%xmm6
+	pxor	%xmm5,%xmm3
+	paddd	%xmm2,%xmm0
+	movdqa	%xmm3,%xmm1
+	pslld	$7,%xmm3
+	psrld	$25,%xmm1
+	pxor	%xmm0,%xmm6
+	por	%xmm1,%xmm3
+	pshufb	(%eax),%xmm6
+	movdqa	%xmm3,-16(%ebx)
+	paddd	%xmm6,%xmm4
+	pxor	%xmm4,%xmm2
+	movdqa	-32(%ebx),%xmm3
+	movdqa	%xmm2,%xmm1
+	pslld	$12,%xmm2
+	psrld	$20,%xmm1
+	por	%xmm1,%xmm2
+	movdqa	-112(%ebx),%xmm1
+	paddd	%xmm2,%xmm0
+	movdqa	64(%ebx),%xmm7
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm0,-128(%ebx)
+	pshufb	16(%eax),%xmm6
+	paddd	%xmm6,%xmm4
+	movdqa	%xmm6,112(%ebx)
+	pxor	%xmm4,%xmm2
+	paddd	%xmm3,%xmm1
+	movdqa	%xmm2,%xmm0
+	pslld	$7,%xmm2
+	psrld	$25,%xmm0
+	pxor	%xmm1,%xmm7
+	por	%xmm0,%xmm2
+	movdqa	%xmm4,32(%ebx)
+	pshufb	(%eax),%xmm7
+	movdqa	%xmm2,-48(%ebx)
+	paddd	%xmm7,%xmm5
+	movdqa	(%ebx),%xmm4
+	pxor	%xmm5,%xmm3
+	movdqa	-16(%ebx),%xmm2
+	movdqa	%xmm3,%xmm0
+	pslld	$12,%xmm3
+	psrld	$20,%xmm0
+	por	%xmm0,%xmm3
+	movdqa	-96(%ebx),%xmm0
+	paddd	%xmm3,%xmm1
+	movdqa	80(%ebx),%xmm6
+	pxor	%xmm1,%xmm7
+	movdqa	%xmm1,-112(%ebx)
+	pshufb	16(%eax),%xmm7
+	paddd	%xmm7,%xmm5
+	movdqa	%xmm7,64(%ebx)
+	pxor	%xmm5,%xmm3
+	paddd	%xmm2,%xmm0
+	movdqa	%xmm3,%xmm1
+	pslld	$7,%xmm3
+	psrld	$25,%xmm1
+	pxor	%xmm0,%xmm6
+	por	%xmm1,%xmm3
+	movdqa	%xmm5,48(%ebx)
+	pshufb	(%eax),%xmm6
+	movdqa	%xmm3,-32(%ebx)
+	paddd	%xmm6,%xmm4
+	movdqa	16(%ebx),%xmm5
+	pxor	%xmm4,%xmm2
+	movdqa	-64(%ebx),%xmm3
+	movdqa	%xmm2,%xmm1
+	pslld	$12,%xmm2
+	psrld	$20,%xmm1
+	por	%xmm1,%xmm2
+	movdqa	-80(%ebx),%xmm1
+	paddd	%xmm2,%xmm0
+	movdqa	96(%ebx),%xmm7
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm0,-96(%ebx)
+	pshufb	16(%eax),%xmm6
+	paddd	%xmm6,%xmm4
+	movdqa	%xmm6,80(%ebx)
+	pxor	%xmm4,%xmm2
+	paddd	%xmm3,%xmm1
+	movdqa	%xmm2,%xmm0
+	pslld	$7,%xmm2
+	psrld	$25,%xmm0
+	pxor	%xmm1,%xmm7
+	por	%xmm0,%xmm2
+	pshufb	(%eax),%xmm7
+	movdqa	%xmm2,-16(%ebx)
+	paddd	%xmm7,%xmm5
+	pxor	%xmm5,%xmm3
+	movdqa	%xmm3,%xmm0
+	pslld	$12,%xmm3
+	psrld	$20,%xmm0
+	por	%xmm0,%xmm3
+	movdqa	-128(%ebx),%xmm0
+	paddd	%xmm3,%xmm1
+	movdqa	64(%ebx),%xmm6
+	pxor	%xmm1,%xmm7
+	movdqa	%xmm1,-80(%ebx)
+	pshufb	16(%eax),%xmm7
+	paddd	%xmm7,%xmm5
+	movdqa	%xmm7,96(%ebx)
+	pxor	%xmm5,%xmm3
+	movdqa	%xmm3,%xmm1
+	pslld	$7,%xmm3
+	psrld	$25,%xmm1
+	por	%xmm1,%xmm3
+	decl	%edx
+	jnz	.L008loop
+	movdqa	%xmm3,-64(%ebx)
+	movdqa	%xmm4,(%ebx)
+	movdqa	%xmm5,16(%ebx)
+	movdqa	%xmm6,64(%ebx)
+	movdqa	%xmm7,96(%ebx)
+	movdqa	-112(%ebx),%xmm1
+	movdqa	-96(%ebx),%xmm2
+	movdqa	-80(%ebx),%xmm3
+	paddd	-128(%ebp),%xmm0
+	paddd	-112(%ebp),%xmm1
+	paddd	-96(%ebp),%xmm2
+	paddd	-80(%ebp),%xmm3
+	movdqa	%xmm0,%xmm6
+	punpckldq	%xmm1,%xmm0
+	movdqa	%xmm2,%xmm7
+	punpckldq	%xmm3,%xmm2
+	punpckhdq	%xmm1,%xmm6
+	punpckhdq	%xmm3,%xmm7
+	movdqa	%xmm0,%xmm1
+	punpcklqdq	%xmm2,%xmm0
+	movdqa	%xmm6,%xmm3
+	punpcklqdq	%xmm7,%xmm6
+	punpckhqdq	%xmm2,%xmm1
+	punpckhqdq	%xmm7,%xmm3
+	movdqu	-128(%esi),%xmm4
+	movdqu	-64(%esi),%xmm5
+	movdqu	(%esi),%xmm2
+	movdqu	64(%esi),%xmm7
+	leal	16(%esi),%esi
+	pxor	%xmm0,%xmm4
+	movdqa	-64(%ebx),%xmm0
+	pxor	%xmm1,%xmm5
+	movdqa	-48(%ebx),%xmm1
+	pxor	%xmm2,%xmm6
+	movdqa	-32(%ebx),%xmm2
+	pxor	%xmm3,%xmm7
+	movdqa	-16(%ebx),%xmm3
+	movdqu	%xmm4,-128(%edi)
+	movdqu	%xmm5,-64(%edi)
+	movdqu	%xmm6,(%edi)
+	movdqu	%xmm7,64(%edi)
+	leal	16(%edi),%edi
+	paddd	-64(%ebp),%xmm0
+	paddd	-48(%ebp),%xmm1
+	paddd	-32(%ebp),%xmm2
+	paddd	-16(%ebp),%xmm3
+	movdqa	%xmm0,%xmm6
+	punpckldq	%xmm1,%xmm0
+	movdqa	%xmm2,%xmm7
+	punpckldq	%xmm3,%xmm2
+	punpckhdq	%xmm1,%xmm6
+	punpckhdq	%xmm3,%xmm7
+	movdqa	%xmm0,%xmm1
+	punpcklqdq	%xmm2,%xmm0
+	movdqa	%xmm6,%xmm3
+	punpcklqdq	%xmm7,%xmm6
+	punpckhqdq	%xmm2,%xmm1
+	punpckhqdq	%xmm7,%xmm3
+	movdqu	-128(%esi),%xmm4
+	movdqu	-64(%esi),%xmm5
+	movdqu	(%esi),%xmm2
+	movdqu	64(%esi),%xmm7
+	leal	16(%esi),%esi
+	pxor	%xmm0,%xmm4
+	movdqa	(%ebx),%xmm0
+	pxor	%xmm1,%xmm5
+	movdqa	16(%ebx),%xmm1
+	pxor	%xmm2,%xmm6
+	movdqa	32(%ebx),%xmm2
+	pxor	%xmm3,%xmm7
+	movdqa	48(%ebx),%xmm3
+	movdqu	%xmm4,-128(%edi)
+	movdqu	%xmm5,-64(%edi)
+	movdqu	%xmm6,(%edi)
+	movdqu	%xmm7,64(%edi)
+	leal	16(%edi),%edi
+	paddd	(%ebp),%xmm0
+	paddd	16(%ebp),%xmm1
+	paddd	32(%ebp),%xmm2
+	paddd	48(%ebp),%xmm3
+	movdqa	%xmm0,%xmm6
+	punpckldq	%xmm1,%xmm0
+	movdqa	%xmm2,%xmm7
+	punpckldq	%xmm3,%xmm2
+	punpckhdq	%xmm1,%xmm6
+	punpckhdq	%xmm3,%xmm7
+	movdqa	%xmm0,%xmm1
+	punpcklqdq	%xmm2,%xmm0
+	movdqa	%xmm6,%xmm3
+	punpcklqdq	%xmm7,%xmm6
+	punpckhqdq	%xmm2,%xmm1
+	punpckhqdq	%xmm7,%xmm3
+	movdqu	-128(%esi),%xmm4
+	movdqu	-64(%esi),%xmm5
+	movdqu	(%esi),%xmm2
+	movdqu	64(%esi),%xmm7
+	leal	16(%esi),%esi
+	pxor	%xmm0,%xmm4
+	movdqa	64(%ebx),%xmm0
+	pxor	%xmm1,%xmm5
+	movdqa	80(%ebx),%xmm1
+	pxor	%xmm2,%xmm6
+	movdqa	96(%ebx),%xmm2
+	pxor	%xmm3,%xmm7
+	movdqa	112(%ebx),%xmm3
+	movdqu	%xmm4,-128(%edi)
+	movdqu	%xmm5,-64(%edi)
+	movdqu	%xmm6,(%edi)
+	movdqu	%xmm7,64(%edi)
+	leal	16(%edi),%edi
+	paddd	64(%ebp),%xmm0
+	paddd	80(%ebp),%xmm1
+	paddd	96(%ebp),%xmm2
+	paddd	112(%ebp),%xmm3
+	movdqa	%xmm0,%xmm6
+	punpckldq	%xmm1,%xmm0
+	movdqa	%xmm2,%xmm7
+	punpckldq	%xmm3,%xmm2
+	punpckhdq	%xmm1,%xmm6
+	punpckhdq	%xmm3,%xmm7
+	movdqa	%xmm0,%xmm1
+	punpcklqdq	%xmm2,%xmm0
+	movdqa	%xmm6,%xmm3
+	punpcklqdq	%xmm7,%xmm6
+	punpckhqdq	%xmm2,%xmm1
+	punpckhqdq	%xmm7,%xmm3
+	movdqu	-128(%esi),%xmm4
+	movdqu	-64(%esi),%xmm5
+	movdqu	(%esi),%xmm2
+	movdqu	64(%esi),%xmm7
+	leal	208(%esi),%esi
+	pxor	%xmm0,%xmm4
+	pxor	%xmm1,%xmm5
+	pxor	%xmm2,%xmm6
+	pxor	%xmm3,%xmm7
+	movdqu	%xmm4,-128(%edi)
+	movdqu	%xmm5,-64(%edi)
+	movdqu	%xmm6,(%edi)
+	movdqu	%xmm7,64(%edi)
+	leal	208(%edi),%edi
+	subl	$256,%ecx
+	jnc	.L007outer_loop
+	addl	$256,%ecx
+	jz	.L009done
+	movl	520(%esp),%ebx
+	leal	-128(%esi),%esi
+	movl	516(%esp),%edx
+	leal	-128(%edi),%edi
+	movd	64(%ebp),%xmm2
+	movdqu	(%ebx),%xmm3
+	paddd	96(%eax),%xmm2
+	pand	112(%eax),%xmm3
+	por	%xmm2,%xmm3
+.L0061x:
+	movdqa	32(%eax),%xmm0
+	movdqu	(%edx),%xmm1
+	movdqu	16(%edx),%xmm2
+	movdqa	(%eax),%xmm6
+	movdqa	16(%eax),%xmm7
+	movl	%ebp,48(%esp)
+	movdqa	%xmm0,(%esp)
+	movdqa	%xmm1,16(%esp)
+	movdqa	%xmm2,32(%esp)
+	movdqa	%xmm3,48(%esp)
+	movl	$10,%edx
+	jmp	.L010loop1x
+.align	16
+.L011outer1x:
+	movdqa	80(%eax),%xmm3
+	movdqa	(%esp),%xmm0
+	movdqa	16(%esp),%xmm1
+	movdqa	32(%esp),%xmm2
+	paddd	48(%esp),%xmm3
+	movl	$10,%edx
+	movdqa	%xmm3,48(%esp)
+	jmp	.L010loop1x
+.align	16
+.L010loop1x:
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,222
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$20,%xmm1
+	pslld	$12,%xmm4
+	por	%xmm4,%xmm1
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,223
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$25,%xmm1
+	pslld	$7,%xmm4
+	por	%xmm4,%xmm1
+	pshufd	$78,%xmm2,%xmm2
+	pshufd	$57,%xmm1,%xmm1
+	pshufd	$147,%xmm3,%xmm3
+	nop
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,222
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$20,%xmm1
+	pslld	$12,%xmm4
+	por	%xmm4,%xmm1
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,223
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$25,%xmm1
+	pslld	$7,%xmm4
+	por	%xmm4,%xmm1
+	pshufd	$78,%xmm2,%xmm2
+	pshufd	$147,%xmm1,%xmm1
+	pshufd	$57,%xmm3,%xmm3
+	decl	%edx
+	jnz	.L010loop1x
+	paddd	(%esp),%xmm0
+	paddd	16(%esp),%xmm1
+	paddd	32(%esp),%xmm2
+	paddd	48(%esp),%xmm3
+	cmpl	$64,%ecx
+	jb	.L012tail
+	movdqu	(%esi),%xmm4
+	movdqu	16(%esi),%xmm5
+	pxor	%xmm4,%xmm0
+	movdqu	32(%esi),%xmm4
+	pxor	%xmm5,%xmm1
+	movdqu	48(%esi),%xmm5
+	pxor	%xmm4,%xmm2
+	pxor	%xmm5,%xmm3
+	leal	64(%esi),%esi
+	movdqu	%xmm0,(%edi)
+	movdqu	%xmm1,16(%edi)
+	movdqu	%xmm2,32(%edi)
+	movdqu	%xmm3,48(%edi)
+	leal	64(%edi),%edi
+	subl	$64,%ecx
+	jnz	.L011outer1x
+	jmp	.L009done
+.L012tail:
+	movdqa	%xmm0,(%esp)
+	movdqa	%xmm1,16(%esp)
+	movdqa	%xmm2,32(%esp)
+	movdqa	%xmm3,48(%esp)
+	xorl	%eax,%eax
+	xorl	%edx,%edx
+	xorl	%ebp,%ebp
+.L013tail_loop:
+	movb	(%esp,%ebp,1),%al
+	movb	(%esi,%ebp,1),%dl
+	leal	1(%ebp),%ebp
+	xorb	%dl,%al
+	movb	%al,-1(%edi,%ebp,1)
+	decl	%ecx
+	jnz	.L013tail_loop
+.L009done:
+	movl	512(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	ChaCha20_ctr32_ssse3,.-.L_ChaCha20_ctr32_ssse3_begin
+.align	64
+.Lssse3_data:
+.byte	2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
+.byte	3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
+.long	1634760805,857760878,2036477234,1797285236
+.long	0,1,2,3
+.long	4,4,4,4
+.long	1,0,0,0
+.long	4,0,0,0
+.long	0,-1,-1,-1
+.align	64
+.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
+.byte	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
+.byte	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
+.byte	114,103,62,0
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
diff --git a/gen/crypto/chacha-x86-win.asm b/gen/crypto/chacha-x86-win.asm
new file mode 100644
index 0000000..d709da0
--- /dev/null
+++ b/gen/crypto/chacha-x86-win.asm
@@ -0,0 +1,966 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__, win32
+%ifidn __OUTPUT_FORMAT__,obj
+section	code	use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+$@feat.00 equ 1
+section	.text	code align=64
+%else
+section	.text	code
+%endif
+global	_ChaCha20_ctr32_nohw
+align	16
+_ChaCha20_ctr32_nohw:
+L$_ChaCha20_ctr32_nohw_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	mov	esi,DWORD [32+esp]
+	mov	edi,DWORD [36+esp]
+	sub	esp,132
+	mov	eax,DWORD [esi]
+	mov	ebx,DWORD [4+esi]
+	mov	ecx,DWORD [8+esi]
+	mov	edx,DWORD [12+esi]
+	mov	DWORD [80+esp],eax
+	mov	DWORD [84+esp],ebx
+	mov	DWORD [88+esp],ecx
+	mov	DWORD [92+esp],edx
+	mov	eax,DWORD [16+esi]
+	mov	ebx,DWORD [20+esi]
+	mov	ecx,DWORD [24+esi]
+	mov	edx,DWORD [28+esi]
+	mov	DWORD [96+esp],eax
+	mov	DWORD [100+esp],ebx
+	mov	DWORD [104+esp],ecx
+	mov	DWORD [108+esp],edx
+	mov	eax,DWORD [edi]
+	mov	ebx,DWORD [4+edi]
+	mov	ecx,DWORD [8+edi]
+	mov	edx,DWORD [12+edi]
+	sub	eax,1
+	mov	DWORD [112+esp],eax
+	mov	DWORD [116+esp],ebx
+	mov	DWORD [120+esp],ecx
+	mov	DWORD [124+esp],edx
+	jmp	NEAR L$000entry
+align	16
+L$001outer_loop:
+	mov	DWORD [156+esp],ebx
+	mov	DWORD [152+esp],eax
+	mov	DWORD [160+esp],ecx
+L$000entry:
+	mov	eax,1634760805
+	mov	DWORD [4+esp],857760878
+	mov	DWORD [8+esp],2036477234
+	mov	DWORD [12+esp],1797285236
+	mov	ebx,DWORD [84+esp]
+	mov	ebp,DWORD [88+esp]
+	mov	ecx,DWORD [104+esp]
+	mov	esi,DWORD [108+esp]
+	mov	edx,DWORD [116+esp]
+	mov	edi,DWORD [120+esp]
+	mov	DWORD [20+esp],ebx
+	mov	DWORD [24+esp],ebp
+	mov	DWORD [40+esp],ecx
+	mov	DWORD [44+esp],esi
+	mov	DWORD [52+esp],edx
+	mov	DWORD [56+esp],edi
+	mov	ebx,DWORD [92+esp]
+	mov	edi,DWORD [124+esp]
+	mov	edx,DWORD [112+esp]
+	mov	ebp,DWORD [80+esp]
+	mov	ecx,DWORD [96+esp]
+	mov	esi,DWORD [100+esp]
+	add	edx,1
+	mov	DWORD [28+esp],ebx
+	mov	DWORD [60+esp],edi
+	mov	DWORD [112+esp],edx
+	mov	ebx,10
+	jmp	NEAR L$002loop
+align	16
+L$002loop:
+	add	eax,ebp
+	mov	DWORD [128+esp],ebx
+	mov	ebx,ebp
+	xor	edx,eax
+	rol	edx,16
+	add	ecx,edx
+	xor	ebx,ecx
+	mov	edi,DWORD [52+esp]
+	rol	ebx,12
+	mov	ebp,DWORD [20+esp]
+	add	eax,ebx
+	xor	edx,eax
+	mov	DWORD [esp],eax
+	rol	edx,8
+	mov	eax,DWORD [4+esp]
+	add	ecx,edx
+	mov	DWORD [48+esp],edx
+	xor	ebx,ecx
+	add	eax,ebp
+	rol	ebx,7
+	xor	edi,eax
+	mov	DWORD [32+esp],ecx
+	rol	edi,16
+	mov	DWORD [16+esp],ebx
+	add	esi,edi
+	mov	ecx,DWORD [40+esp]
+	xor	ebp,esi
+	mov	edx,DWORD [56+esp]
+	rol	ebp,12
+	mov	ebx,DWORD [24+esp]
+	add	eax,ebp
+	xor	edi,eax
+	mov	DWORD [4+esp],eax
+	rol	edi,8
+	mov	eax,DWORD [8+esp]
+	add	esi,edi
+	mov	DWORD [52+esp],edi
+	xor	ebp,esi
+	add	eax,ebx
+	rol	ebp,7
+	xor	edx,eax
+	mov	DWORD [36+esp],esi
+	rol	edx,16
+	mov	DWORD [20+esp],ebp
+	add	ecx,edx
+	mov	esi,DWORD [44+esp]
+	xor	ebx,ecx
+	mov	edi,DWORD [60+esp]
+	rol	ebx,12
+	mov	ebp,DWORD [28+esp]
+	add	eax,ebx
+	xor	edx,eax
+	mov	DWORD [8+esp],eax
+	rol	edx,8
+	mov	eax,DWORD [12+esp]
+	add	ecx,edx
+	mov	DWORD [56+esp],edx
+	xor	ebx,ecx
+	add	eax,ebp
+	rol	ebx,7
+	xor	edi,eax
+	rol	edi,16
+	mov	DWORD [24+esp],ebx
+	add	esi,edi
+	xor	ebp,esi
+	rol	ebp,12
+	mov	ebx,DWORD [20+esp]
+	add	eax,ebp
+	xor	edi,eax
+	mov	DWORD [12+esp],eax
+	rol	edi,8
+	mov	eax,DWORD [esp]
+	add	esi,edi
+	mov	edx,edi
+	xor	ebp,esi
+	add	eax,ebx
+	rol	ebp,7
+	xor	edx,eax
+	rol	edx,16
+	mov	DWORD [28+esp],ebp
+	add	ecx,edx
+	xor	ebx,ecx
+	mov	edi,DWORD [48+esp]
+	rol	ebx,12
+	mov	ebp,DWORD [24+esp]
+	add	eax,ebx
+	xor	edx,eax
+	mov	DWORD [esp],eax
+	rol	edx,8
+	mov	eax,DWORD [4+esp]
+	add	ecx,edx
+	mov	DWORD [60+esp],edx
+	xor	ebx,ecx
+	add	eax,ebp
+	rol	ebx,7
+	xor	edi,eax
+	mov	DWORD [40+esp],ecx
+	rol	edi,16
+	mov	DWORD [20+esp],ebx
+	add	esi,edi
+	mov	ecx,DWORD [32+esp]
+	xor	ebp,esi
+	mov	edx,DWORD [52+esp]
+	rol	ebp,12
+	mov	ebx,DWORD [28+esp]
+	add	eax,ebp
+	xor	edi,eax
+	mov	DWORD [4+esp],eax
+	rol	edi,8
+	mov	eax,DWORD [8+esp]
+	add	esi,edi
+	mov	DWORD [48+esp],edi
+	xor	ebp,esi
+	add	eax,ebx
+	rol	ebp,7
+	xor	edx,eax
+	mov	DWORD [44+esp],esi
+	rol	edx,16
+	mov	DWORD [24+esp],ebp
+	add	ecx,edx
+	mov	esi,DWORD [36+esp]
+	xor	ebx,ecx
+	mov	edi,DWORD [56+esp]
+	rol	ebx,12
+	mov	ebp,DWORD [16+esp]
+	add	eax,ebx
+	xor	edx,eax
+	mov	DWORD [8+esp],eax
+	rol	edx,8
+	mov	eax,DWORD [12+esp]
+	add	ecx,edx
+	mov	DWORD [52+esp],edx
+	xor	ebx,ecx
+	add	eax,ebp
+	rol	ebx,7
+	xor	edi,eax
+	rol	edi,16
+	mov	DWORD [28+esp],ebx
+	add	esi,edi
+	xor	ebp,esi
+	mov	edx,DWORD [48+esp]
+	rol	ebp,12
+	mov	ebx,DWORD [128+esp]
+	add	eax,ebp
+	xor	edi,eax
+	mov	DWORD [12+esp],eax
+	rol	edi,8
+	mov	eax,DWORD [esp]
+	add	esi,edi
+	mov	DWORD [56+esp],edi
+	xor	ebp,esi
+	rol	ebp,7
+	dec	ebx
+	jnz	NEAR L$002loop
+	mov	ebx,DWORD [160+esp]
+	add	eax,1634760805
+	add	ebp,DWORD [80+esp]
+	add	ecx,DWORD [96+esp]
+	add	esi,DWORD [100+esp]
+	cmp	ebx,64
+	jb	NEAR L$003tail
+	mov	ebx,DWORD [156+esp]
+	add	edx,DWORD [112+esp]
+	add	edi,DWORD [120+esp]
+	xor	eax,DWORD [ebx]
+	xor	ebp,DWORD [16+ebx]
+	mov	DWORD [esp],eax
+	mov	eax,DWORD [152+esp]
+	xor	ecx,DWORD [32+ebx]
+	xor	esi,DWORD [36+ebx]
+	xor	edx,DWORD [48+ebx]
+	xor	edi,DWORD [56+ebx]
+	mov	DWORD [16+eax],ebp
+	mov	DWORD [32+eax],ecx
+	mov	DWORD [36+eax],esi
+	mov	DWORD [48+eax],edx
+	mov	DWORD [56+eax],edi
+	mov	ebp,DWORD [4+esp]
+	mov	ecx,DWORD [8+esp]
+	mov	esi,DWORD [12+esp]
+	mov	edx,DWORD [20+esp]
+	mov	edi,DWORD [24+esp]
+	add	ebp,857760878
+	add	ecx,2036477234
+	add	esi,1797285236
+	add	edx,DWORD [84+esp]
+	add	edi,DWORD [88+esp]
+	xor	ebp,DWORD [4+ebx]
+	xor	ecx,DWORD [8+ebx]
+	xor	esi,DWORD [12+ebx]
+	xor	edx,DWORD [20+ebx]
+	xor	edi,DWORD [24+ebx]
+	mov	DWORD [4+eax],ebp
+	mov	DWORD [8+eax],ecx
+	mov	DWORD [12+eax],esi
+	mov	DWORD [20+eax],edx
+	mov	DWORD [24+eax],edi
+	mov	ebp,DWORD [28+esp]
+	mov	ecx,DWORD [40+esp]
+	mov	esi,DWORD [44+esp]
+	mov	edx,DWORD [52+esp]
+	mov	edi,DWORD [60+esp]
+	add	ebp,DWORD [92+esp]
+	add	ecx,DWORD [104+esp]
+	add	esi,DWORD [108+esp]
+	add	edx,DWORD [116+esp]
+	add	edi,DWORD [124+esp]
+	xor	ebp,DWORD [28+ebx]
+	xor	ecx,DWORD [40+ebx]
+	xor	esi,DWORD [44+ebx]
+	xor	edx,DWORD [52+ebx]
+	xor	edi,DWORD [60+ebx]
+	lea	ebx,[64+ebx]
+	mov	DWORD [28+eax],ebp
+	mov	ebp,DWORD [esp]
+	mov	DWORD [40+eax],ecx
+	mov	ecx,DWORD [160+esp]
+	mov	DWORD [44+eax],esi
+	mov	DWORD [52+eax],edx
+	mov	DWORD [60+eax],edi
+	mov	DWORD [eax],ebp
+	lea	eax,[64+eax]
+	sub	ecx,64
+	jnz	NEAR L$001outer_loop
+	jmp	NEAR L$004done
+L$003tail:
+	add	edx,DWORD [112+esp]
+	add	edi,DWORD [120+esp]
+	mov	DWORD [esp],eax
+	mov	DWORD [16+esp],ebp
+	mov	DWORD [32+esp],ecx
+	mov	DWORD [36+esp],esi
+	mov	DWORD [48+esp],edx
+	mov	DWORD [56+esp],edi
+	mov	ebp,DWORD [4+esp]
+	mov	ecx,DWORD [8+esp]
+	mov	esi,DWORD [12+esp]
+	mov	edx,DWORD [20+esp]
+	mov	edi,DWORD [24+esp]
+	add	ebp,857760878
+	add	ecx,2036477234
+	add	esi,1797285236
+	add	edx,DWORD [84+esp]
+	add	edi,DWORD [88+esp]
+	mov	DWORD [4+esp],ebp
+	mov	DWORD [8+esp],ecx
+	mov	DWORD [12+esp],esi
+	mov	DWORD [20+esp],edx
+	mov	DWORD [24+esp],edi
+	mov	ebp,DWORD [28+esp]
+	mov	ecx,DWORD [40+esp]
+	mov	esi,DWORD [44+esp]
+	mov	edx,DWORD [52+esp]
+	mov	edi,DWORD [60+esp]
+	add	ebp,DWORD [92+esp]
+	add	ecx,DWORD [104+esp]
+	add	esi,DWORD [108+esp]
+	add	edx,DWORD [116+esp]
+	add	edi,DWORD [124+esp]
+	mov	DWORD [28+esp],ebp
+	mov	ebp,DWORD [156+esp]
+	mov	DWORD [40+esp],ecx
+	mov	ecx,DWORD [152+esp]
+	mov	DWORD [44+esp],esi
+	xor	esi,esi
+	mov	DWORD [52+esp],edx
+	mov	DWORD [60+esp],edi
+	xor	eax,eax
+	xor	edx,edx
+L$005tail_loop:
+	mov	al,BYTE [ebp*1+esi]
+	mov	dl,BYTE [esi*1+esp]
+	lea	esi,[1+esi]
+	xor	al,dl
+	mov	BYTE [esi*1+ecx-1],al
+	dec	ebx
+	jnz	NEAR L$005tail_loop
+L$004done:
+	add	esp,132
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+global	_ChaCha20_ctr32_ssse3
+align	16
+_ChaCha20_ctr32_ssse3:
+L$_ChaCha20_ctr32_ssse3_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	call	L$pic_point
+L$pic_point:
+	pop	eax
+	mov	edi,DWORD [20+esp]
+	mov	esi,DWORD [24+esp]
+	mov	ecx,DWORD [28+esp]
+	mov	edx,DWORD [32+esp]
+	mov	ebx,DWORD [36+esp]
+	mov	ebp,esp
+	sub	esp,524
+	and	esp,-64
+	mov	DWORD [512+esp],ebp
+	lea	eax,[(L$ssse3_data-L$pic_point)+eax]
+	movdqu	xmm3,[ebx]
+	cmp	ecx,256
+	jb	NEAR L$0061x
+	mov	DWORD [516+esp],edx
+	mov	DWORD [520+esp],ebx
+	sub	ecx,256
+	lea	ebp,[384+esp]
+	movdqu	xmm7,[edx]
+	pshufd	xmm0,xmm3,0
+	pshufd	xmm1,xmm3,85
+	pshufd	xmm2,xmm3,170
+	pshufd	xmm3,xmm3,255
+	paddd	xmm0,[48+eax]
+	pshufd	xmm4,xmm7,0
+	pshufd	xmm5,xmm7,85
+	psubd	xmm0,[64+eax]
+	pshufd	xmm6,xmm7,170
+	pshufd	xmm7,xmm7,255
+	movdqa	[64+ebp],xmm0
+	movdqa	[80+ebp],xmm1
+	movdqa	[96+ebp],xmm2
+	movdqa	[112+ebp],xmm3
+	movdqu	xmm3,[16+edx]
+	movdqa	[ebp-64],xmm4
+	movdqa	[ebp-48],xmm5
+	movdqa	[ebp-32],xmm6
+	movdqa	[ebp-16],xmm7
+	movdqa	xmm7,[32+eax]
+	lea	ebx,[128+esp]
+	pshufd	xmm0,xmm3,0
+	pshufd	xmm1,xmm3,85
+	pshufd	xmm2,xmm3,170
+	pshufd	xmm3,xmm3,255
+	pshufd	xmm4,xmm7,0
+	pshufd	xmm5,xmm7,85
+	pshufd	xmm6,xmm7,170
+	pshufd	xmm7,xmm7,255
+	movdqa	[ebp],xmm0
+	movdqa	[16+ebp],xmm1
+	movdqa	[32+ebp],xmm2
+	movdqa	[48+ebp],xmm3
+	movdqa	[ebp-128],xmm4
+	movdqa	[ebp-112],xmm5
+	movdqa	[ebp-96],xmm6
+	movdqa	[ebp-80],xmm7
+	lea	esi,[128+esi]
+	lea	edi,[128+edi]
+	jmp	NEAR L$007outer_loop
+align	16
+L$007outer_loop:
+	movdqa	xmm1,[ebp-112]
+	movdqa	xmm2,[ebp-96]
+	movdqa	xmm3,[ebp-80]
+	movdqa	xmm5,[ebp-48]
+	movdqa	xmm6,[ebp-32]
+	movdqa	xmm7,[ebp-16]
+	movdqa	[ebx-112],xmm1
+	movdqa	[ebx-96],xmm2
+	movdqa	[ebx-80],xmm3
+	movdqa	[ebx-48],xmm5
+	movdqa	[ebx-32],xmm6
+	movdqa	[ebx-16],xmm7
+	movdqa	xmm2,[32+ebp]
+	movdqa	xmm3,[48+ebp]
+	movdqa	xmm4,[64+ebp]
+	movdqa	xmm5,[80+ebp]
+	movdqa	xmm6,[96+ebp]
+	movdqa	xmm7,[112+ebp]
+	paddd	xmm4,[64+eax]
+	movdqa	[32+ebx],xmm2
+	movdqa	[48+ebx],xmm3
+	movdqa	[64+ebx],xmm4
+	movdqa	[80+ebx],xmm5
+	movdqa	[96+ebx],xmm6
+	movdqa	[112+ebx],xmm7
+	movdqa	[64+ebp],xmm4
+	movdqa	xmm0,[ebp-128]
+	movdqa	xmm6,xmm4
+	movdqa	xmm3,[ebp-64]
+	movdqa	xmm4,[ebp]
+	movdqa	xmm5,[16+ebp]
+	mov	edx,10
+	nop
+align	16
+L$008loop:
+	paddd	xmm0,xmm3
+	movdqa	xmm2,xmm3
+	pxor	xmm6,xmm0
+	pshufb	xmm6,[eax]
+	paddd	xmm4,xmm6
+	pxor	xmm2,xmm4
+	movdqa	xmm3,[ebx-48]
+	movdqa	xmm1,xmm2
+	pslld	xmm2,12
+	psrld	xmm1,20
+	por	xmm2,xmm1
+	movdqa	xmm1,[ebx-112]
+	paddd	xmm0,xmm2
+	movdqa	xmm7,[80+ebx]
+	pxor	xmm6,xmm0
+	movdqa	[ebx-128],xmm0
+	pshufb	xmm6,[16+eax]
+	paddd	xmm4,xmm6
+	movdqa	[64+ebx],xmm6
+	pxor	xmm2,xmm4
+	paddd	xmm1,xmm3
+	movdqa	xmm0,xmm2
+	pslld	xmm2,7
+	psrld	xmm0,25
+	pxor	xmm7,xmm1
+	por	xmm2,xmm0
+	movdqa	[ebx],xmm4
+	pshufb	xmm7,[eax]
+	movdqa	[ebx-64],xmm2
+	paddd	xmm5,xmm7
+	movdqa	xmm4,[32+ebx]
+	pxor	xmm3,xmm5
+	movdqa	xmm2,[ebx-32]
+	movdqa	xmm0,xmm3
+	pslld	xmm3,12
+	psrld	xmm0,20
+	por	xmm3,xmm0
+	movdqa	xmm0,[ebx-96]
+	paddd	xmm1,xmm3
+	movdqa	xmm6,[96+ebx]
+	pxor	xmm7,xmm1
+	movdqa	[ebx-112],xmm1
+	pshufb	xmm7,[16+eax]
+	paddd	xmm5,xmm7
+	movdqa	[80+ebx],xmm7
+	pxor	xmm3,xmm5
+	paddd	xmm0,xmm2
+	movdqa	xmm1,xmm3
+	pslld	xmm3,7
+	psrld	xmm1,25
+	pxor	xmm6,xmm0
+	por	xmm3,xmm1
+	movdqa	[16+ebx],xmm5
+	pshufb	xmm6,[eax]
+	movdqa	[ebx-48],xmm3
+	paddd	xmm4,xmm6
+	movdqa	xmm5,[48+ebx]
+	pxor	xmm2,xmm4
+	movdqa	xmm3,[ebx-16]
+	movdqa	xmm1,xmm2
+	pslld	xmm2,12
+	psrld	xmm1,20
+	por	xmm2,xmm1
+	movdqa	xmm1,[ebx-80]
+	paddd	xmm0,xmm2
+	movdqa	xmm7,[112+ebx]
+	pxor	xmm6,xmm0
+	movdqa	[ebx-96],xmm0
+	pshufb	xmm6,[16+eax]
+	paddd	xmm4,xmm6
+	movdqa	[96+ebx],xmm6
+	pxor	xmm2,xmm4
+	paddd	xmm1,xmm3
+	movdqa	xmm0,xmm2
+	pslld	xmm2,7
+	psrld	xmm0,25
+	pxor	xmm7,xmm1
+	por	xmm2,xmm0
+	pshufb	xmm7,[eax]
+	movdqa	[ebx-32],xmm2
+	paddd	xmm5,xmm7
+	pxor	xmm3,xmm5
+	movdqa	xmm2,[ebx-48]
+	movdqa	xmm0,xmm3
+	pslld	xmm3,12
+	psrld	xmm0,20
+	por	xmm3,xmm0
+	movdqa	xmm0,[ebx-128]
+	paddd	xmm1,xmm3
+	pxor	xmm7,xmm1
+	movdqa	[ebx-80],xmm1
+	pshufb	xmm7,[16+eax]
+	paddd	xmm5,xmm7
+	movdqa	xmm6,xmm7
+	pxor	xmm3,xmm5
+	paddd	xmm0,xmm2
+	movdqa	xmm1,xmm3
+	pslld	xmm3,7
+	psrld	xmm1,25
+	pxor	xmm6,xmm0
+	por	xmm3,xmm1
+	pshufb	xmm6,[eax]
+	movdqa	[ebx-16],xmm3
+	paddd	xmm4,xmm6
+	pxor	xmm2,xmm4
+	movdqa	xmm3,[ebx-32]
+	movdqa	xmm1,xmm2
+	pslld	xmm2,12
+	psrld	xmm1,20
+	por	xmm2,xmm1
+	movdqa	xmm1,[ebx-112]
+	paddd	xmm0,xmm2
+	movdqa	xmm7,[64+ebx]
+	pxor	xmm6,xmm0
+	movdqa	[ebx-128],xmm0
+	pshufb	xmm6,[16+eax]
+	paddd	xmm4,xmm6
+	movdqa	[112+ebx],xmm6
+	pxor	xmm2,xmm4
+	paddd	xmm1,xmm3
+	movdqa	xmm0,xmm2
+	pslld	xmm2,7
+	psrld	xmm0,25
+	pxor	xmm7,xmm1
+	por	xmm2,xmm0
+	movdqa	[32+ebx],xmm4
+	pshufb	xmm7,[eax]
+	movdqa	[ebx-48],xmm2
+	paddd	xmm5,xmm7
+	movdqa	xmm4,[ebx]
+	pxor	xmm3,xmm5
+	movdqa	xmm2,[ebx-16]
+	movdqa	xmm0,xmm3
+	pslld	xmm3,12
+	psrld	xmm0,20
+	por	xmm3,xmm0
+	movdqa	xmm0,[ebx-96]
+	paddd	xmm1,xmm3
+	movdqa	xmm6,[80+ebx]
+	pxor	xmm7,xmm1
+	movdqa	[ebx-112],xmm1
+	pshufb	xmm7,[16+eax]
+	paddd	xmm5,xmm7
+	movdqa	[64+ebx],xmm7
+	pxor	xmm3,xmm5
+	paddd	xmm0,xmm2
+	movdqa	xmm1,xmm3
+	pslld	xmm3,7
+	psrld	xmm1,25
+	pxor	xmm6,xmm0
+	por	xmm3,xmm1
+	movdqa	[48+ebx],xmm5
+	pshufb	xmm6,[eax]
+	movdqa	[ebx-32],xmm3
+	paddd	xmm4,xmm6
+	movdqa	xmm5,[16+ebx]
+	pxor	xmm2,xmm4
+	movdqa	xmm3,[ebx-64]
+	movdqa	xmm1,xmm2
+	pslld	xmm2,12
+	psrld	xmm1,20
+	por	xmm2,xmm1
+	movdqa	xmm1,[ebx-80]
+	paddd	xmm0,xmm2
+	movdqa	xmm7,[96+ebx]
+	pxor	xmm6,xmm0
+	movdqa	[ebx-96],xmm0
+	pshufb	xmm6,[16+eax]
+	paddd	xmm4,xmm6
+	movdqa	[80+ebx],xmm6
+	pxor	xmm2,xmm4
+	paddd	xmm1,xmm3
+	movdqa	xmm0,xmm2
+	pslld	xmm2,7
+	psrld	xmm0,25
+	pxor	xmm7,xmm1
+	por	xmm2,xmm0
+	pshufb	xmm7,[eax]
+	movdqa	[ebx-16],xmm2
+	paddd	xmm5,xmm7
+	pxor	xmm3,xmm5
+	movdqa	xmm0,xmm3
+	pslld	xmm3,12
+	psrld	xmm0,20
+	por	xmm3,xmm0
+	movdqa	xmm0,[ebx-128]
+	paddd	xmm1,xmm3
+	movdqa	xmm6,[64+ebx]
+	pxor	xmm7,xmm1
+	movdqa	[ebx-80],xmm1
+	pshufb	xmm7,[16+eax]
+	paddd	xmm5,xmm7
+	movdqa	[96+ebx],xmm7
+	pxor	xmm3,xmm5
+	movdqa	xmm1,xmm3
+	pslld	xmm3,7
+	psrld	xmm1,25
+	por	xmm3,xmm1
+	dec	edx
+	jnz	NEAR L$008loop
+	movdqa	[ebx-64],xmm3
+	movdqa	[ebx],xmm4
+	movdqa	[16+ebx],xmm5
+	movdqa	[64+ebx],xmm6
+	movdqa	[96+ebx],xmm7
+	movdqa	xmm1,[ebx-112]
+	movdqa	xmm2,[ebx-96]
+	movdqa	xmm3,[ebx-80]
+	paddd	xmm0,[ebp-128]
+	paddd	xmm1,[ebp-112]
+	paddd	xmm2,[ebp-96]
+	paddd	xmm3,[ebp-80]
+	movdqa	xmm6,xmm0
+	punpckldq	xmm0,xmm1
+	movdqa	xmm7,xmm2
+	punpckldq	xmm2,xmm3
+	punpckhdq	xmm6,xmm1
+	punpckhdq	xmm7,xmm3
+	movdqa	xmm1,xmm0
+	punpcklqdq	xmm0,xmm2
+	movdqa	xmm3,xmm6
+	punpcklqdq	xmm6,xmm7
+	punpckhqdq	xmm1,xmm2
+	punpckhqdq	xmm3,xmm7
+	movdqu	xmm4,[esi-128]
+	movdqu	xmm5,[esi-64]
+	movdqu	xmm2,[esi]
+	movdqu	xmm7,[64+esi]
+	lea	esi,[16+esi]
+	pxor	xmm4,xmm0
+	movdqa	xmm0,[ebx-64]
+	pxor	xmm5,xmm1
+	movdqa	xmm1,[ebx-48]
+	pxor	xmm6,xmm2
+	movdqa	xmm2,[ebx-32]
+	pxor	xmm7,xmm3
+	movdqa	xmm3,[ebx-16]
+	movdqu	[edi-128],xmm4
+	movdqu	[edi-64],xmm5
+	movdqu	[edi],xmm6
+	movdqu	[64+edi],xmm7
+	lea	edi,[16+edi]
+	paddd	xmm0,[ebp-64]
+	paddd	xmm1,[ebp-48]
+	paddd	xmm2,[ebp-32]
+	paddd	xmm3,[ebp-16]
+	movdqa	xmm6,xmm0
+	punpckldq	xmm0,xmm1
+	movdqa	xmm7,xmm2
+	punpckldq	xmm2,xmm3
+	punpckhdq	xmm6,xmm1
+	punpckhdq	xmm7,xmm3
+	movdqa	xmm1,xmm0
+	punpcklqdq	xmm0,xmm2
+	movdqa	xmm3,xmm6
+	punpcklqdq	xmm6,xmm7
+	punpckhqdq	xmm1,xmm2
+	punpckhqdq	xmm3,xmm7
+	movdqu	xmm4,[esi-128]
+	movdqu	xmm5,[esi-64]
+	movdqu	xmm2,[esi]
+	movdqu	xmm7,[64+esi]
+	lea	esi,[16+esi]
+	pxor	xmm4,xmm0
+	movdqa	xmm0,[ebx]
+	pxor	xmm5,xmm1
+	movdqa	xmm1,[16+ebx]
+	pxor	xmm6,xmm2
+	movdqa	xmm2,[32+ebx]
+	pxor	xmm7,xmm3
+	movdqa	xmm3,[48+ebx]
+	movdqu	[edi-128],xmm4
+	movdqu	[edi-64],xmm5
+	movdqu	[edi],xmm6
+	movdqu	[64+edi],xmm7
+	lea	edi,[16+edi]
+	paddd	xmm0,[ebp]
+	paddd	xmm1,[16+ebp]
+	paddd	xmm2,[32+ebp]
+	paddd	xmm3,[48+ebp]
+	movdqa	xmm6,xmm0
+	punpckldq	xmm0,xmm1
+	movdqa	xmm7,xmm2
+	punpckldq	xmm2,xmm3
+	punpckhdq	xmm6,xmm1
+	punpckhdq	xmm7,xmm3
+	movdqa	xmm1,xmm0
+	punpcklqdq	xmm0,xmm2
+	movdqa	xmm3,xmm6
+	punpcklqdq	xmm6,xmm7
+	punpckhqdq	xmm1,xmm2
+	punpckhqdq	xmm3,xmm7
+	movdqu	xmm4,[esi-128]
+	movdqu	xmm5,[esi-64]
+	movdqu	xmm2,[esi]
+	movdqu	xmm7,[64+esi]
+	lea	esi,[16+esi]
+	pxor	xmm4,xmm0
+	movdqa	xmm0,[64+ebx]
+	pxor	xmm5,xmm1
+	movdqa	xmm1,[80+ebx]
+	pxor	xmm6,xmm2
+	movdqa	xmm2,[96+ebx]
+	pxor	xmm7,xmm3
+	movdqa	xmm3,[112+ebx]
+	movdqu	[edi-128],xmm4
+	movdqu	[edi-64],xmm5
+	movdqu	[edi],xmm6
+	movdqu	[64+edi],xmm7
+	lea	edi,[16+edi]
+	paddd	xmm0,[64+ebp]
+	paddd	xmm1,[80+ebp]
+	paddd	xmm2,[96+ebp]
+	paddd	xmm3,[112+ebp]
+	movdqa	xmm6,xmm0
+	punpckldq	xmm0,xmm1
+	movdqa	xmm7,xmm2
+	punpckldq	xmm2,xmm3
+	punpckhdq	xmm6,xmm1
+	punpckhdq	xmm7,xmm3
+	movdqa	xmm1,xmm0
+	punpcklqdq	xmm0,xmm2
+	movdqa	xmm3,xmm6
+	punpcklqdq	xmm6,xmm7
+	punpckhqdq	xmm1,xmm2
+	punpckhqdq	xmm3,xmm7
+	movdqu	xmm4,[esi-128]
+	movdqu	xmm5,[esi-64]
+	movdqu	xmm2,[esi]
+	movdqu	xmm7,[64+esi]
+	lea	esi,[208+esi]
+	pxor	xmm4,xmm0
+	pxor	xmm5,xmm1
+	pxor	xmm6,xmm2
+	pxor	xmm7,xmm3
+	movdqu	[edi-128],xmm4
+	movdqu	[edi-64],xmm5
+	movdqu	[edi],xmm6
+	movdqu	[64+edi],xmm7
+	lea	edi,[208+edi]
+	sub	ecx,256
+	jnc	NEAR L$007outer_loop
+	add	ecx,256
+	jz	NEAR L$009done
+	mov	ebx,DWORD [520+esp]
+	lea	esi,[esi-128]
+	mov	edx,DWORD [516+esp]
+	lea	edi,[edi-128]
+	movd	xmm2,DWORD [64+ebp]
+	movdqu	xmm3,[ebx]
+	paddd	xmm2,[96+eax]
+	pand	xmm3,[112+eax]
+	por	xmm3,xmm2
+L$0061x:
+	movdqa	xmm0,[32+eax]
+	movdqu	xmm1,[edx]
+	movdqu	xmm2,[16+edx]
+	movdqa	xmm6,[eax]
+	movdqa	xmm7,[16+eax]
+	mov	DWORD [48+esp],ebp
+	movdqa	[esp],xmm0
+	movdqa	[16+esp],xmm1
+	movdqa	[32+esp],xmm2
+	movdqa	[48+esp],xmm3
+	mov	edx,10
+	jmp	NEAR L$010loop1x
+align	16
+L$011outer1x:
+	movdqa	xmm3,[80+eax]
+	movdqa	xmm0,[esp]
+	movdqa	xmm1,[16+esp]
+	movdqa	xmm2,[32+esp]
+	paddd	xmm3,[48+esp]
+	mov	edx,10
+	movdqa	[48+esp],xmm3
+	jmp	NEAR L$010loop1x
+align	16
+L$010loop1x:
+	paddd	xmm0,xmm1
+	pxor	xmm3,xmm0
+db	102,15,56,0,222
+	paddd	xmm2,xmm3
+	pxor	xmm1,xmm2
+	movdqa	xmm4,xmm1
+	psrld	xmm1,20
+	pslld	xmm4,12
+	por	xmm1,xmm4
+	paddd	xmm0,xmm1
+	pxor	xmm3,xmm0
+db	102,15,56,0,223
+	paddd	xmm2,xmm3
+	pxor	xmm1,xmm2
+	movdqa	xmm4,xmm1
+	psrld	xmm1,25
+	pslld	xmm4,7
+	por	xmm1,xmm4
+	pshufd	xmm2,xmm2,78
+	pshufd	xmm1,xmm1,57
+	pshufd	xmm3,xmm3,147
+	nop
+	paddd	xmm0,xmm1
+	pxor	xmm3,xmm0
+db	102,15,56,0,222
+	paddd	xmm2,xmm3
+	pxor	xmm1,xmm2
+	movdqa	xmm4,xmm1
+	psrld	xmm1,20
+	pslld	xmm4,12
+	por	xmm1,xmm4
+	paddd	xmm0,xmm1
+	pxor	xmm3,xmm0
+db	102,15,56,0,223
+	paddd	xmm2,xmm3
+	pxor	xmm1,xmm2
+	movdqa	xmm4,xmm1
+	psrld	xmm1,25
+	pslld	xmm4,7
+	por	xmm1,xmm4
+	pshufd	xmm2,xmm2,78
+	pshufd	xmm1,xmm1,147
+	pshufd	xmm3,xmm3,57
+	dec	edx
+	jnz	NEAR L$010loop1x
+	paddd	xmm0,[esp]
+	paddd	xmm1,[16+esp]
+	paddd	xmm2,[32+esp]
+	paddd	xmm3,[48+esp]
+	cmp	ecx,64
+	jb	NEAR L$012tail
+	movdqu	xmm4,[esi]
+	movdqu	xmm5,[16+esi]
+	pxor	xmm0,xmm4
+	movdqu	xmm4,[32+esi]
+	pxor	xmm1,xmm5
+	movdqu	xmm5,[48+esi]
+	pxor	xmm2,xmm4
+	pxor	xmm3,xmm5
+	lea	esi,[64+esi]
+	movdqu	[edi],xmm0
+	movdqu	[16+edi],xmm1
+	movdqu	[32+edi],xmm2
+	movdqu	[48+edi],xmm3
+	lea	edi,[64+edi]
+	sub	ecx,64
+	jnz	NEAR L$011outer1x
+	jmp	NEAR L$009done
+L$012tail:
+	movdqa	[esp],xmm0
+	movdqa	[16+esp],xmm1
+	movdqa	[32+esp],xmm2
+	movdqa	[48+esp],xmm3
+	xor	eax,eax
+	xor	edx,edx
+	xor	ebp,ebp
+L$013tail_loop:
+	mov	al,BYTE [ebp*1+esp]
+	mov	dl,BYTE [ebp*1+esi]
+	lea	ebp,[1+ebp]
+	xor	al,dl
+	mov	BYTE [ebp*1+edi-1],al
+	dec	ecx
+	jnz	NEAR L$013tail_loop
+L$009done:
+	mov	esp,DWORD [512+esp]
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+align	64
+L$ssse3_data:
+db	2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
+db	3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
+dd	1634760805,857760878,2036477234,1797285236
+dd	0,1,2,3
+dd	4,4,4,4
+dd	1,0,0,0
+dd	4,0,0,0
+dd	0,-1,-1,-1
+align	64
+db	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
+db	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
+db	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
+db	114,103,62,0
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/crypto/chacha-x86_64-apple.S b/gen/crypto/chacha-x86_64-apple.S
new file mode 100644
index 0000000..a5e1207
--- /dev/null
+++ b/gen/crypto/chacha-x86_64-apple.S
@@ -0,0 +1,1604 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text	
+
+.section	__DATA,__const
+.p2align	6
+L$zero:
+.long	0,0,0,0
+L$one:
+.long	1,0,0,0
+L$inc:
+.long	0,1,2,3
+L$four:
+.long	4,4,4,4
+L$incy:
+.long	0,2,4,6,1,3,5,7
+L$eight:
+.long	8,8,8,8,8,8,8,8
+L$rot16:
+.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
+L$rot24:
+.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
+L$sigma:
+.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
+.p2align	6
+L$zeroz:
+.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
+L$fourz:
+.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
+L$incz:
+.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+L$sixteen:
+.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
+.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.text	
+.globl	_ChaCha20_ctr32_nohw
+.private_extern _ChaCha20_ctr32_nohw
+
+.p2align	6
+_ChaCha20_ctr32_nohw:
+
+_CET_ENDBR
+	pushq	%rbx
+
+	pushq	%rbp
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+	subq	$64+24,%rsp
+
+L$ctr32_body:
+
+
+	movdqu	(%rcx),%xmm1
+	movdqu	16(%rcx),%xmm2
+	movdqu	(%r8),%xmm3
+	movdqa	L$one(%rip),%xmm4
+
+
+	movdqa	%xmm1,16(%rsp)
+	movdqa	%xmm2,32(%rsp)
+	movdqa	%xmm3,48(%rsp)
+	movq	%rdx,%rbp
+	jmp	L$oop_outer
+
+.p2align	5
+L$oop_outer:
+	movl	$0x61707865,%eax
+	movl	$0x3320646e,%ebx
+	movl	$0x79622d32,%ecx
+	movl	$0x6b206574,%edx
+	movl	16(%rsp),%r8d
+	movl	20(%rsp),%r9d
+	movl	24(%rsp),%r10d
+	movl	28(%rsp),%r11d
+	movd	%xmm3,%r12d
+	movl	52(%rsp),%r13d
+	movl	56(%rsp),%r14d
+	movl	60(%rsp),%r15d
+
+	movq	%rbp,64+0(%rsp)
+	movl	$10,%ebp
+	movq	%rsi,64+8(%rsp)
+.byte	102,72,15,126,214
+	movq	%rdi,64+16(%rsp)
+	movq	%rsi,%rdi
+	shrq	$32,%rdi
+	jmp	L$oop
+
+.p2align	5
+L$oop:
+	addl	%r8d,%eax
+	xorl	%eax,%r12d
+	roll	$16,%r12d
+	addl	%r9d,%ebx
+	xorl	%ebx,%r13d
+	roll	$16,%r13d
+	addl	%r12d,%esi
+	xorl	%esi,%r8d
+	roll	$12,%r8d
+	addl	%r13d,%edi
+	xorl	%edi,%r9d
+	roll	$12,%r9d
+	addl	%r8d,%eax
+	xorl	%eax,%r12d
+	roll	$8,%r12d
+	addl	%r9d,%ebx
+	xorl	%ebx,%r13d
+	roll	$8,%r13d
+	addl	%r12d,%esi
+	xorl	%esi,%r8d
+	roll	$7,%r8d
+	addl	%r13d,%edi
+	xorl	%edi,%r9d
+	roll	$7,%r9d
+	movl	%esi,32(%rsp)
+	movl	%edi,36(%rsp)
+	movl	40(%rsp),%esi
+	movl	44(%rsp),%edi
+	addl	%r10d,%ecx
+	xorl	%ecx,%r14d
+	roll	$16,%r14d
+	addl	%r11d,%edx
+	xorl	%edx,%r15d
+	roll	$16,%r15d
+	addl	%r14d,%esi
+	xorl	%esi,%r10d
+	roll	$12,%r10d
+	addl	%r15d,%edi
+	xorl	%edi,%r11d
+	roll	$12,%r11d
+	addl	%r10d,%ecx
+	xorl	%ecx,%r14d
+	roll	$8,%r14d
+	addl	%r11d,%edx
+	xorl	%edx,%r15d
+	roll	$8,%r15d
+	addl	%r14d,%esi
+	xorl	%esi,%r10d
+	roll	$7,%r10d
+	addl	%r15d,%edi
+	xorl	%edi,%r11d
+	roll	$7,%r11d
+	addl	%r9d,%eax
+	xorl	%eax,%r15d
+	roll	$16,%r15d
+	addl	%r10d,%ebx
+	xorl	%ebx,%r12d
+	roll	$16,%r12d
+	addl	%r15d,%esi
+	xorl	%esi,%r9d
+	roll	$12,%r9d
+	addl	%r12d,%edi
+	xorl	%edi,%r10d
+	roll	$12,%r10d
+	addl	%r9d,%eax
+	xorl	%eax,%r15d
+	roll	$8,%r15d
+	addl	%r10d,%ebx
+	xorl	%ebx,%r12d
+	roll	$8,%r12d
+	addl	%r15d,%esi
+	xorl	%esi,%r9d
+	roll	$7,%r9d
+	addl	%r12d,%edi
+	xorl	%edi,%r10d
+	roll	$7,%r10d
+	movl	%esi,40(%rsp)
+	movl	%edi,44(%rsp)
+	movl	32(%rsp),%esi
+	movl	36(%rsp),%edi
+	addl	%r11d,%ecx
+	xorl	%ecx,%r13d
+	roll	$16,%r13d
+	addl	%r8d,%edx
+	xorl	%edx,%r14d
+	roll	$16,%r14d
+	addl	%r13d,%esi
+	xorl	%esi,%r11d
+	roll	$12,%r11d
+	addl	%r14d,%edi
+	xorl	%edi,%r8d
+	roll	$12,%r8d
+	addl	%r11d,%ecx
+	xorl	%ecx,%r13d
+	roll	$8,%r13d
+	addl	%r8d,%edx
+	xorl	%edx,%r14d
+	roll	$8,%r14d
+	addl	%r13d,%esi
+	xorl	%esi,%r11d
+	roll	$7,%r11d
+	addl	%r14d,%edi
+	xorl	%edi,%r8d
+	roll	$7,%r8d
+	decl	%ebp
+	jnz	L$oop
+	movl	%edi,36(%rsp)
+	movl	%esi,32(%rsp)
+	movq	64(%rsp),%rbp
+	movdqa	%xmm2,%xmm1
+	movq	64+8(%rsp),%rsi
+	paddd	%xmm4,%xmm3
+	movq	64+16(%rsp),%rdi
+
+	addl	$0x61707865,%eax
+	addl	$0x3320646e,%ebx
+	addl	$0x79622d32,%ecx
+	addl	$0x6b206574,%edx
+	addl	16(%rsp),%r8d
+	addl	20(%rsp),%r9d
+	addl	24(%rsp),%r10d
+	addl	28(%rsp),%r11d
+	addl	48(%rsp),%r12d
+	addl	52(%rsp),%r13d
+	addl	56(%rsp),%r14d
+	addl	60(%rsp),%r15d
+	paddd	32(%rsp),%xmm1
+
+	cmpq	$64,%rbp
+	jb	L$tail
+
+	xorl	0(%rsi),%eax
+	xorl	4(%rsi),%ebx
+	xorl	8(%rsi),%ecx
+	xorl	12(%rsi),%edx
+	xorl	16(%rsi),%r8d
+	xorl	20(%rsi),%r9d
+	xorl	24(%rsi),%r10d
+	xorl	28(%rsi),%r11d
+	movdqu	32(%rsi),%xmm0
+	xorl	48(%rsi),%r12d
+	xorl	52(%rsi),%r13d
+	xorl	56(%rsi),%r14d
+	xorl	60(%rsi),%r15d
+	leaq	64(%rsi),%rsi
+	pxor	%xmm1,%xmm0
+
+	movdqa	%xmm2,32(%rsp)
+	movd	%xmm3,48(%rsp)
+
+	movl	%eax,0(%rdi)
+	movl	%ebx,4(%rdi)
+	movl	%ecx,8(%rdi)
+	movl	%edx,12(%rdi)
+	movl	%r8d,16(%rdi)
+	movl	%r9d,20(%rdi)
+	movl	%r10d,24(%rdi)
+	movl	%r11d,28(%rdi)
+	movdqu	%xmm0,32(%rdi)
+	movl	%r12d,48(%rdi)
+	movl	%r13d,52(%rdi)
+	movl	%r14d,56(%rdi)
+	movl	%r15d,60(%rdi)
+	leaq	64(%rdi),%rdi
+
+	subq	$64,%rbp
+	jnz	L$oop_outer
+
+	jmp	L$done
+
+.p2align	4
+L$tail:
+	movl	%eax,0(%rsp)
+	movl	%ebx,4(%rsp)
+	xorq	%rbx,%rbx
+	movl	%ecx,8(%rsp)
+	movl	%edx,12(%rsp)
+	movl	%r8d,16(%rsp)
+	movl	%r9d,20(%rsp)
+	movl	%r10d,24(%rsp)
+	movl	%r11d,28(%rsp)
+	movdqa	%xmm1,32(%rsp)
+	movl	%r12d,48(%rsp)
+	movl	%r13d,52(%rsp)
+	movl	%r14d,56(%rsp)
+	movl	%r15d,60(%rsp)
+
+L$oop_tail:
+	movzbl	(%rsi,%rbx,1),%eax
+	movzbl	(%rsp,%rbx,1),%edx
+	leaq	1(%rbx),%rbx
+	xorl	%edx,%eax
+	movb	%al,-1(%rdi,%rbx,1)
+	decq	%rbp
+	jnz	L$oop_tail
+
+L$done:
+	leaq	64+24+48(%rsp),%rsi
+	movq	-48(%rsi),%r15
+
+	movq	-40(%rsi),%r14
+
+	movq	-32(%rsi),%r13
+
+	movq	-24(%rsi),%r12
+
+	movq	-16(%rsi),%rbp
+
+	movq	-8(%rsi),%rbx
+
+	leaq	(%rsi),%rsp
+
+L$no_data:
+	ret
+
+
+.globl	_ChaCha20_ctr32_ssse3
+.private_extern _ChaCha20_ctr32_ssse3
+
+.p2align	5
+_ChaCha20_ctr32_ssse3:
+
+_CET_ENDBR
+	movq	%rsp,%r9
+
+	subq	$64+8,%rsp
+	movdqa	L$sigma(%rip),%xmm0
+	movdqu	(%rcx),%xmm1
+	movdqu	16(%rcx),%xmm2
+	movdqu	(%r8),%xmm3
+	movdqa	L$rot16(%rip),%xmm6
+	movdqa	L$rot24(%rip),%xmm7
+
+	movdqa	%xmm0,0(%rsp)
+	movdqa	%xmm1,16(%rsp)
+	movdqa	%xmm2,32(%rsp)
+	movdqa	%xmm3,48(%rsp)
+	movq	$10,%r8
+	jmp	L$oop_ssse3
+
+.p2align	5
+L$oop_outer_ssse3:
+	movdqa	L$one(%rip),%xmm3
+	movdqa	0(%rsp),%xmm0
+	movdqa	16(%rsp),%xmm1
+	movdqa	32(%rsp),%xmm2
+	paddd	48(%rsp),%xmm3
+	movq	$10,%r8
+	movdqa	%xmm3,48(%rsp)
+	jmp	L$oop_ssse3
+
+.p2align	5
+L$oop_ssse3:
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,222
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$20,%xmm1
+	pslld	$12,%xmm4
+	por	%xmm4,%xmm1
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,223
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$25,%xmm1
+	pslld	$7,%xmm4
+	por	%xmm4,%xmm1
+	pshufd	$78,%xmm2,%xmm2
+	pshufd	$57,%xmm1,%xmm1
+	pshufd	$147,%xmm3,%xmm3
+	nop
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,222
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$20,%xmm1
+	pslld	$12,%xmm4
+	por	%xmm4,%xmm1
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,223
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$25,%xmm1
+	pslld	$7,%xmm4
+	por	%xmm4,%xmm1
+	pshufd	$78,%xmm2,%xmm2
+	pshufd	$147,%xmm1,%xmm1
+	pshufd	$57,%xmm3,%xmm3
+	decq	%r8
+	jnz	L$oop_ssse3
+	paddd	0(%rsp),%xmm0
+	paddd	16(%rsp),%xmm1
+	paddd	32(%rsp),%xmm2
+	paddd	48(%rsp),%xmm3
+
+	cmpq	$64,%rdx
+	jb	L$tail_ssse3
+
+	movdqu	0(%rsi),%xmm4
+	movdqu	16(%rsi),%xmm5
+	pxor	%xmm4,%xmm0
+	movdqu	32(%rsi),%xmm4
+	pxor	%xmm5,%xmm1
+	movdqu	48(%rsi),%xmm5
+	leaq	64(%rsi),%rsi
+	pxor	%xmm4,%xmm2
+	pxor	%xmm5,%xmm3
+
+	movdqu	%xmm0,0(%rdi)
+	movdqu	%xmm1,16(%rdi)
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm3,48(%rdi)
+	leaq	64(%rdi),%rdi
+
+	subq	$64,%rdx
+	jnz	L$oop_outer_ssse3
+
+	jmp	L$done_ssse3
+
+.p2align	4
+L$tail_ssse3:
+	movdqa	%xmm0,0(%rsp)
+	movdqa	%xmm1,16(%rsp)
+	movdqa	%xmm2,32(%rsp)
+	movdqa	%xmm3,48(%rsp)
+	xorq	%r8,%r8
+
+L$oop_tail_ssse3:
+	movzbl	(%rsi,%r8,1),%eax
+	movzbl	(%rsp,%r8,1),%ecx
+	leaq	1(%r8),%r8
+	xorl	%ecx,%eax
+	movb	%al,-1(%rdi,%r8,1)
+	decq	%rdx
+	jnz	L$oop_tail_ssse3
+
+L$done_ssse3:
+	leaq	(%r9),%rsp
+
+L$ssse3_epilogue:
+	ret
+
+
+.globl	_ChaCha20_ctr32_ssse3_4x
+.private_extern _ChaCha20_ctr32_ssse3_4x
+
+.p2align	5
+_ChaCha20_ctr32_ssse3_4x:
+
+_CET_ENDBR
+	movq	%rsp,%r9
+
+	movq	%r10,%r11
+	subq	$0x140+8,%rsp
+	movdqa	L$sigma(%rip),%xmm11
+	movdqu	(%rcx),%xmm15
+	movdqu	16(%rcx),%xmm7
+	movdqu	(%r8),%xmm3
+	leaq	256(%rsp),%rcx
+	leaq	L$rot16(%rip),%r10
+	leaq	L$rot24(%rip),%r11
+
+	pshufd	$0x00,%xmm11,%xmm8
+	pshufd	$0x55,%xmm11,%xmm9
+	movdqa	%xmm8,64(%rsp)
+	pshufd	$0xaa,%xmm11,%xmm10
+	movdqa	%xmm9,80(%rsp)
+	pshufd	$0xff,%xmm11,%xmm11
+	movdqa	%xmm10,96(%rsp)
+	movdqa	%xmm11,112(%rsp)
+
+	pshufd	$0x00,%xmm15,%xmm12
+	pshufd	$0x55,%xmm15,%xmm13
+	movdqa	%xmm12,128-256(%rcx)
+	pshufd	$0xaa,%xmm15,%xmm14
+	movdqa	%xmm13,144-256(%rcx)
+	pshufd	$0xff,%xmm15,%xmm15
+	movdqa	%xmm14,160-256(%rcx)
+	movdqa	%xmm15,176-256(%rcx)
+
+	pshufd	$0x00,%xmm7,%xmm4
+	pshufd	$0x55,%xmm7,%xmm5
+	movdqa	%xmm4,192-256(%rcx)
+	pshufd	$0xaa,%xmm7,%xmm6
+	movdqa	%xmm5,208-256(%rcx)
+	pshufd	$0xff,%xmm7,%xmm7
+	movdqa	%xmm6,224-256(%rcx)
+	movdqa	%xmm7,240-256(%rcx)
+
+	pshufd	$0x00,%xmm3,%xmm0
+	pshufd	$0x55,%xmm3,%xmm1
+	paddd	L$inc(%rip),%xmm0
+	pshufd	$0xaa,%xmm3,%xmm2
+	movdqa	%xmm1,272-256(%rcx)
+	pshufd	$0xff,%xmm3,%xmm3
+	movdqa	%xmm2,288-256(%rcx)
+	movdqa	%xmm3,304-256(%rcx)
+
+	jmp	L$oop_enter4x
+
+.p2align	5
+L$oop_outer4x:
+	movdqa	64(%rsp),%xmm8
+	movdqa	80(%rsp),%xmm9
+	movdqa	96(%rsp),%xmm10
+	movdqa	112(%rsp),%xmm11
+	movdqa	128-256(%rcx),%xmm12
+	movdqa	144-256(%rcx),%xmm13
+	movdqa	160-256(%rcx),%xmm14
+	movdqa	176-256(%rcx),%xmm15
+	movdqa	192-256(%rcx),%xmm4
+	movdqa	208-256(%rcx),%xmm5
+	movdqa	224-256(%rcx),%xmm6
+	movdqa	240-256(%rcx),%xmm7
+	movdqa	256-256(%rcx),%xmm0
+	movdqa	272-256(%rcx),%xmm1
+	movdqa	288-256(%rcx),%xmm2
+	movdqa	304-256(%rcx),%xmm3
+	paddd	L$four(%rip),%xmm0
+
+L$oop_enter4x:
+	movdqa	%xmm6,32(%rsp)
+	movdqa	%xmm7,48(%rsp)
+	movdqa	(%r10),%xmm7
+	movl	$10,%eax
+	movdqa	%xmm0,256-256(%rcx)
+	jmp	L$oop4x
+
+.p2align	5
+L$oop4x:
+	paddd	%xmm12,%xmm8
+	paddd	%xmm13,%xmm9
+	pxor	%xmm8,%xmm0
+	pxor	%xmm9,%xmm1
+.byte	102,15,56,0,199
+.byte	102,15,56,0,207
+	paddd	%xmm0,%xmm4
+	paddd	%xmm1,%xmm5
+	pxor	%xmm4,%xmm12
+	pxor	%xmm5,%xmm13
+	movdqa	%xmm12,%xmm6
+	pslld	$12,%xmm12
+	psrld	$20,%xmm6
+	movdqa	%xmm13,%xmm7
+	pslld	$12,%xmm13
+	por	%xmm6,%xmm12
+	psrld	$20,%xmm7
+	movdqa	(%r11),%xmm6
+	por	%xmm7,%xmm13
+	paddd	%xmm12,%xmm8
+	paddd	%xmm13,%xmm9
+	pxor	%xmm8,%xmm0
+	pxor	%xmm9,%xmm1
+.byte	102,15,56,0,198
+.byte	102,15,56,0,206
+	paddd	%xmm0,%xmm4
+	paddd	%xmm1,%xmm5
+	pxor	%xmm4,%xmm12
+	pxor	%xmm5,%xmm13
+	movdqa	%xmm12,%xmm7
+	pslld	$7,%xmm12
+	psrld	$25,%xmm7
+	movdqa	%xmm13,%xmm6
+	pslld	$7,%xmm13
+	por	%xmm7,%xmm12
+	psrld	$25,%xmm6
+	movdqa	(%r10),%xmm7
+	por	%xmm6,%xmm13
+	movdqa	%xmm4,0(%rsp)
+	movdqa	%xmm5,16(%rsp)
+	movdqa	32(%rsp),%xmm4
+	movdqa	48(%rsp),%xmm5
+	paddd	%xmm14,%xmm10
+	paddd	%xmm15,%xmm11
+	pxor	%xmm10,%xmm2
+	pxor	%xmm11,%xmm3
+.byte	102,15,56,0,215
+.byte	102,15,56,0,223
+	paddd	%xmm2,%xmm4
+	paddd	%xmm3,%xmm5
+	pxor	%xmm4,%xmm14
+	pxor	%xmm5,%xmm15
+	movdqa	%xmm14,%xmm6
+	pslld	$12,%xmm14
+	psrld	$20,%xmm6
+	movdqa	%xmm15,%xmm7
+	pslld	$12,%xmm15
+	por	%xmm6,%xmm14
+	psrld	$20,%xmm7
+	movdqa	(%r11),%xmm6
+	por	%xmm7,%xmm15
+	paddd	%xmm14,%xmm10
+	paddd	%xmm15,%xmm11
+	pxor	%xmm10,%xmm2
+	pxor	%xmm11,%xmm3
+.byte	102,15,56,0,214
+.byte	102,15,56,0,222
+	paddd	%xmm2,%xmm4
+	paddd	%xmm3,%xmm5
+	pxor	%xmm4,%xmm14
+	pxor	%xmm5,%xmm15
+	movdqa	%xmm14,%xmm7
+	pslld	$7,%xmm14
+	psrld	$25,%xmm7
+	movdqa	%xmm15,%xmm6
+	pslld	$7,%xmm15
+	por	%xmm7,%xmm14
+	psrld	$25,%xmm6
+	movdqa	(%r10),%xmm7
+	por	%xmm6,%xmm15
+	paddd	%xmm13,%xmm8
+	paddd	%xmm14,%xmm9
+	pxor	%xmm8,%xmm3
+	pxor	%xmm9,%xmm0
+.byte	102,15,56,0,223
+.byte	102,15,56,0,199
+	paddd	%xmm3,%xmm4
+	paddd	%xmm0,%xmm5
+	pxor	%xmm4,%xmm13
+	pxor	%xmm5,%xmm14
+	movdqa	%xmm13,%xmm6
+	pslld	$12,%xmm13
+	psrld	$20,%xmm6
+	movdqa	%xmm14,%xmm7
+	pslld	$12,%xmm14
+	por	%xmm6,%xmm13
+	psrld	$20,%xmm7
+	movdqa	(%r11),%xmm6
+	por	%xmm7,%xmm14
+	paddd	%xmm13,%xmm8
+	paddd	%xmm14,%xmm9
+	pxor	%xmm8,%xmm3
+	pxor	%xmm9,%xmm0
+.byte	102,15,56,0,222
+.byte	102,15,56,0,198
+	paddd	%xmm3,%xmm4
+	paddd	%xmm0,%xmm5
+	pxor	%xmm4,%xmm13
+	pxor	%xmm5,%xmm14
+	movdqa	%xmm13,%xmm7
+	pslld	$7,%xmm13
+	psrld	$25,%xmm7
+	movdqa	%xmm14,%xmm6
+	pslld	$7,%xmm14
+	por	%xmm7,%xmm13
+	psrld	$25,%xmm6
+	movdqa	(%r10),%xmm7
+	por	%xmm6,%xmm14
+	movdqa	%xmm4,32(%rsp)
+	movdqa	%xmm5,48(%rsp)
+	movdqa	0(%rsp),%xmm4
+	movdqa	16(%rsp),%xmm5
+	paddd	%xmm15,%xmm10
+	paddd	%xmm12,%xmm11
+	pxor	%xmm10,%xmm1
+	pxor	%xmm11,%xmm2
+.byte	102,15,56,0,207
+.byte	102,15,56,0,215
+	paddd	%xmm1,%xmm4
+	paddd	%xmm2,%xmm5
+	pxor	%xmm4,%xmm15
+	pxor	%xmm5,%xmm12
+	movdqa	%xmm15,%xmm6
+	pslld	$12,%xmm15
+	psrld	$20,%xmm6
+	movdqa	%xmm12,%xmm7
+	pslld	$12,%xmm12
+	por	%xmm6,%xmm15
+	psrld	$20,%xmm7
+	movdqa	(%r11),%xmm6
+	por	%xmm7,%xmm12
+	paddd	%xmm15,%xmm10
+	paddd	%xmm12,%xmm11
+	pxor	%xmm10,%xmm1
+	pxor	%xmm11,%xmm2
+.byte	102,15,56,0,206
+.byte	102,15,56,0,214
+	paddd	%xmm1,%xmm4
+	paddd	%xmm2,%xmm5
+	pxor	%xmm4,%xmm15
+	pxor	%xmm5,%xmm12
+	movdqa	%xmm15,%xmm7
+	pslld	$7,%xmm15
+	psrld	$25,%xmm7
+	movdqa	%xmm12,%xmm6
+	pslld	$7,%xmm12
+	por	%xmm7,%xmm15
+	psrld	$25,%xmm6
+	movdqa	(%r10),%xmm7
+	por	%xmm6,%xmm12
+	decl	%eax
+	jnz	L$oop4x
+
+	paddd	64(%rsp),%xmm8
+	paddd	80(%rsp),%xmm9
+	paddd	96(%rsp),%xmm10
+	paddd	112(%rsp),%xmm11
+
+	movdqa	%xmm8,%xmm6
+	punpckldq	%xmm9,%xmm8
+	movdqa	%xmm10,%xmm7
+	punpckldq	%xmm11,%xmm10
+	punpckhdq	%xmm9,%xmm6
+	punpckhdq	%xmm11,%xmm7
+	movdqa	%xmm8,%xmm9
+	punpcklqdq	%xmm10,%xmm8
+	movdqa	%xmm6,%xmm11
+	punpcklqdq	%xmm7,%xmm6
+	punpckhqdq	%xmm10,%xmm9
+	punpckhqdq	%xmm7,%xmm11
+	paddd	128-256(%rcx),%xmm12
+	paddd	144-256(%rcx),%xmm13
+	paddd	160-256(%rcx),%xmm14
+	paddd	176-256(%rcx),%xmm15
+
+	movdqa	%xmm8,0(%rsp)
+	movdqa	%xmm9,16(%rsp)
+	movdqa	32(%rsp),%xmm8
+	movdqa	48(%rsp),%xmm9
+
+	movdqa	%xmm12,%xmm10
+	punpckldq	%xmm13,%xmm12
+	movdqa	%xmm14,%xmm7
+	punpckldq	%xmm15,%xmm14
+	punpckhdq	%xmm13,%xmm10
+	punpckhdq	%xmm15,%xmm7
+	movdqa	%xmm12,%xmm13
+	punpcklqdq	%xmm14,%xmm12
+	movdqa	%xmm10,%xmm15
+	punpcklqdq	%xmm7,%xmm10
+	punpckhqdq	%xmm14,%xmm13
+	punpckhqdq	%xmm7,%xmm15
+	paddd	192-256(%rcx),%xmm4
+	paddd	208-256(%rcx),%xmm5
+	paddd	224-256(%rcx),%xmm8
+	paddd	240-256(%rcx),%xmm9
+
+	movdqa	%xmm6,32(%rsp)
+	movdqa	%xmm11,48(%rsp)
+
+	movdqa	%xmm4,%xmm14
+	punpckldq	%xmm5,%xmm4
+	movdqa	%xmm8,%xmm7
+	punpckldq	%xmm9,%xmm8
+	punpckhdq	%xmm5,%xmm14
+	punpckhdq	%xmm9,%xmm7
+	movdqa	%xmm4,%xmm5
+	punpcklqdq	%xmm8,%xmm4
+	movdqa	%xmm14,%xmm9
+	punpcklqdq	%xmm7,%xmm14
+	punpckhqdq	%xmm8,%xmm5
+	punpckhqdq	%xmm7,%xmm9
+	paddd	256-256(%rcx),%xmm0
+	paddd	272-256(%rcx),%xmm1
+	paddd	288-256(%rcx),%xmm2
+	paddd	304-256(%rcx),%xmm3
+
+	movdqa	%xmm0,%xmm8
+	punpckldq	%xmm1,%xmm0
+	movdqa	%xmm2,%xmm7
+	punpckldq	%xmm3,%xmm2
+	punpckhdq	%xmm1,%xmm8
+	punpckhdq	%xmm3,%xmm7
+	movdqa	%xmm0,%xmm1
+	punpcklqdq	%xmm2,%xmm0
+	movdqa	%xmm8,%xmm3
+	punpcklqdq	%xmm7,%xmm8
+	punpckhqdq	%xmm2,%xmm1
+	punpckhqdq	%xmm7,%xmm3
+	cmpq	$256,%rdx
+	jb	L$tail4x
+
+	movdqu	0(%rsi),%xmm6
+	movdqu	16(%rsi),%xmm11
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm7
+	pxor	0(%rsp),%xmm6
+	pxor	%xmm12,%xmm11
+	pxor	%xmm4,%xmm2
+	pxor	%xmm0,%xmm7
+
+	movdqu	%xmm6,0(%rdi)
+	movdqu	64(%rsi),%xmm6
+	movdqu	%xmm11,16(%rdi)
+	movdqu	80(%rsi),%xmm11
+	movdqu	%xmm2,32(%rdi)
+	movdqu	96(%rsi),%xmm2
+	movdqu	%xmm7,48(%rdi)
+	movdqu	112(%rsi),%xmm7
+	leaq	128(%rsi),%rsi
+	pxor	16(%rsp),%xmm6
+	pxor	%xmm13,%xmm11
+	pxor	%xmm5,%xmm2
+	pxor	%xmm1,%xmm7
+
+	movdqu	%xmm6,64(%rdi)
+	movdqu	0(%rsi),%xmm6
+	movdqu	%xmm11,80(%rdi)
+	movdqu	16(%rsi),%xmm11
+	movdqu	%xmm2,96(%rdi)
+	movdqu	32(%rsi),%xmm2
+	movdqu	%xmm7,112(%rdi)
+	leaq	128(%rdi),%rdi
+	movdqu	48(%rsi),%xmm7
+	pxor	32(%rsp),%xmm6
+	pxor	%xmm10,%xmm11
+	pxor	%xmm14,%xmm2
+	pxor	%xmm8,%xmm7
+
+	movdqu	%xmm6,0(%rdi)
+	movdqu	64(%rsi),%xmm6
+	movdqu	%xmm11,16(%rdi)
+	movdqu	80(%rsi),%xmm11
+	movdqu	%xmm2,32(%rdi)
+	movdqu	96(%rsi),%xmm2
+	movdqu	%xmm7,48(%rdi)
+	movdqu	112(%rsi),%xmm7
+	leaq	128(%rsi),%rsi
+	pxor	48(%rsp),%xmm6
+	pxor	%xmm15,%xmm11
+	pxor	%xmm9,%xmm2
+	pxor	%xmm3,%xmm7
+	movdqu	%xmm6,64(%rdi)
+	movdqu	%xmm11,80(%rdi)
+	movdqu	%xmm2,96(%rdi)
+	movdqu	%xmm7,112(%rdi)
+	leaq	128(%rdi),%rdi
+
+	subq	$256,%rdx
+	jnz	L$oop_outer4x
+
+	jmp	L$done4x
+
+L$tail4x:
+	cmpq	$192,%rdx
+	jae	L$192_or_more4x
+	cmpq	$128,%rdx
+	jae	L$128_or_more4x
+	cmpq	$64,%rdx
+	jae	L$64_or_more4x
+
+
+	xorq	%r10,%r10
+
+	movdqa	%xmm12,16(%rsp)
+	movdqa	%xmm4,32(%rsp)
+	movdqa	%xmm0,48(%rsp)
+	jmp	L$oop_tail4x
+
+.p2align	5
+L$64_or_more4x:
+	movdqu	0(%rsi),%xmm6
+	movdqu	16(%rsi),%xmm11
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm7
+	pxor	0(%rsp),%xmm6
+	pxor	%xmm12,%xmm11
+	pxor	%xmm4,%xmm2
+	pxor	%xmm0,%xmm7
+	movdqu	%xmm6,0(%rdi)
+	movdqu	%xmm11,16(%rdi)
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm7,48(%rdi)
+	je	L$done4x
+
+	movdqa	16(%rsp),%xmm6
+	leaq	64(%rsi),%rsi
+	xorq	%r10,%r10
+	movdqa	%xmm6,0(%rsp)
+	movdqa	%xmm13,16(%rsp)
+	leaq	64(%rdi),%rdi
+	movdqa	%xmm5,32(%rsp)
+	subq	$64,%rdx
+	movdqa	%xmm1,48(%rsp)
+	jmp	L$oop_tail4x
+
+.p2align	5
+L$128_or_more4x:
+	movdqu	0(%rsi),%xmm6
+	movdqu	16(%rsi),%xmm11
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm7
+	pxor	0(%rsp),%xmm6
+	pxor	%xmm12,%xmm11
+	pxor	%xmm4,%xmm2
+	pxor	%xmm0,%xmm7
+
+	movdqu	%xmm6,0(%rdi)
+	movdqu	64(%rsi),%xmm6
+	movdqu	%xmm11,16(%rdi)
+	movdqu	80(%rsi),%xmm11
+	movdqu	%xmm2,32(%rdi)
+	movdqu	96(%rsi),%xmm2
+	movdqu	%xmm7,48(%rdi)
+	movdqu	112(%rsi),%xmm7
+	pxor	16(%rsp),%xmm6
+	pxor	%xmm13,%xmm11
+	pxor	%xmm5,%xmm2
+	pxor	%xmm1,%xmm7
+	movdqu	%xmm6,64(%rdi)
+	movdqu	%xmm11,80(%rdi)
+	movdqu	%xmm2,96(%rdi)
+	movdqu	%xmm7,112(%rdi)
+	je	L$done4x
+
+	movdqa	32(%rsp),%xmm6
+	leaq	128(%rsi),%rsi
+	xorq	%r10,%r10
+	movdqa	%xmm6,0(%rsp)
+	movdqa	%xmm10,16(%rsp)
+	leaq	128(%rdi),%rdi
+	movdqa	%xmm14,32(%rsp)
+	subq	$128,%rdx
+	movdqa	%xmm8,48(%rsp)
+	jmp	L$oop_tail4x
+
+.p2align	5
+L$192_or_more4x:
+	movdqu	0(%rsi),%xmm6
+	movdqu	16(%rsi),%xmm11
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm7
+	pxor	0(%rsp),%xmm6
+	pxor	%xmm12,%xmm11
+	pxor	%xmm4,%xmm2
+	pxor	%xmm0,%xmm7
+
+	movdqu	%xmm6,0(%rdi)
+	movdqu	64(%rsi),%xmm6
+	movdqu	%xmm11,16(%rdi)
+	movdqu	80(%rsi),%xmm11
+	movdqu	%xmm2,32(%rdi)
+	movdqu	96(%rsi),%xmm2
+	movdqu	%xmm7,48(%rdi)
+	movdqu	112(%rsi),%xmm7
+	leaq	128(%rsi),%rsi
+	pxor	16(%rsp),%xmm6
+	pxor	%xmm13,%xmm11
+	pxor	%xmm5,%xmm2
+	pxor	%xmm1,%xmm7
+
+	movdqu	%xmm6,64(%rdi)
+	movdqu	0(%rsi),%xmm6
+	movdqu	%xmm11,80(%rdi)
+	movdqu	16(%rsi),%xmm11
+	movdqu	%xmm2,96(%rdi)
+	movdqu	32(%rsi),%xmm2
+	movdqu	%xmm7,112(%rdi)
+	leaq	128(%rdi),%rdi
+	movdqu	48(%rsi),%xmm7
+	pxor	32(%rsp),%xmm6
+	pxor	%xmm10,%xmm11
+	pxor	%xmm14,%xmm2
+	pxor	%xmm8,%xmm7
+	movdqu	%xmm6,0(%rdi)
+	movdqu	%xmm11,16(%rdi)
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm7,48(%rdi)
+	je	L$done4x
+
+	movdqa	48(%rsp),%xmm6
+	leaq	64(%rsi),%rsi
+	xorq	%r10,%r10
+	movdqa	%xmm6,0(%rsp)
+	movdqa	%xmm15,16(%rsp)
+	leaq	64(%rdi),%rdi
+	movdqa	%xmm9,32(%rsp)
+	subq	$192,%rdx
+	movdqa	%xmm3,48(%rsp)
+
+L$oop_tail4x:
+	movzbl	(%rsi,%r10,1),%eax
+	movzbl	(%rsp,%r10,1),%ecx
+	leaq	1(%r10),%r10
+	xorl	%ecx,%eax
+	movb	%al,-1(%rdi,%r10,1)
+	decq	%rdx
+	jnz	L$oop_tail4x
+
+L$done4x:
+	leaq	(%r9),%rsp
+
+L$4x_epilogue:
+	ret
+
+
+.globl	_ChaCha20_ctr32_avx2
+.private_extern _ChaCha20_ctr32_avx2
+
+.p2align	5
+_ChaCha20_ctr32_avx2:
+
+_CET_ENDBR
+	movq	%rsp,%r9
+
+	subq	$0x280+8,%rsp
+	andq	$-32,%rsp
+	vzeroupper
+
+
+
+
+
+
+
+
+
+
+	vbroadcasti128	L$sigma(%rip),%ymm11
+	vbroadcasti128	(%rcx),%ymm3
+	vbroadcasti128	16(%rcx),%ymm15
+	vbroadcasti128	(%r8),%ymm7
+	leaq	256(%rsp),%rcx
+	leaq	512(%rsp),%rax
+	leaq	L$rot16(%rip),%r10
+	leaq	L$rot24(%rip),%r11
+
+	vpshufd	$0x00,%ymm11,%ymm8
+	vpshufd	$0x55,%ymm11,%ymm9
+	vmovdqa	%ymm8,128-256(%rcx)
+	vpshufd	$0xaa,%ymm11,%ymm10
+	vmovdqa	%ymm9,160-256(%rcx)
+	vpshufd	$0xff,%ymm11,%ymm11
+	vmovdqa	%ymm10,192-256(%rcx)
+	vmovdqa	%ymm11,224-256(%rcx)
+
+	vpshufd	$0x00,%ymm3,%ymm0
+	vpshufd	$0x55,%ymm3,%ymm1
+	vmovdqa	%ymm0,256-256(%rcx)
+	vpshufd	$0xaa,%ymm3,%ymm2
+	vmovdqa	%ymm1,288-256(%rcx)
+	vpshufd	$0xff,%ymm3,%ymm3
+	vmovdqa	%ymm2,320-256(%rcx)
+	vmovdqa	%ymm3,352-256(%rcx)
+
+	vpshufd	$0x00,%ymm15,%ymm12
+	vpshufd	$0x55,%ymm15,%ymm13
+	vmovdqa	%ymm12,384-512(%rax)
+	vpshufd	$0xaa,%ymm15,%ymm14
+	vmovdqa	%ymm13,416-512(%rax)
+	vpshufd	$0xff,%ymm15,%ymm15
+	vmovdqa	%ymm14,448-512(%rax)
+	vmovdqa	%ymm15,480-512(%rax)
+
+	vpshufd	$0x00,%ymm7,%ymm4
+	vpshufd	$0x55,%ymm7,%ymm5
+	vpaddd	L$incy(%rip),%ymm4,%ymm4
+	vpshufd	$0xaa,%ymm7,%ymm6
+	vmovdqa	%ymm5,544-512(%rax)
+	vpshufd	$0xff,%ymm7,%ymm7
+	vmovdqa	%ymm6,576-512(%rax)
+	vmovdqa	%ymm7,608-512(%rax)
+
+	jmp	L$oop_enter8x
+
+.p2align	5
+L$oop_outer8x:
+	vmovdqa	128-256(%rcx),%ymm8
+	vmovdqa	160-256(%rcx),%ymm9
+	vmovdqa	192-256(%rcx),%ymm10
+	vmovdqa	224-256(%rcx),%ymm11
+	vmovdqa	256-256(%rcx),%ymm0
+	vmovdqa	288-256(%rcx),%ymm1
+	vmovdqa	320-256(%rcx),%ymm2
+	vmovdqa	352-256(%rcx),%ymm3
+	vmovdqa	384-512(%rax),%ymm12
+	vmovdqa	416-512(%rax),%ymm13
+	vmovdqa	448-512(%rax),%ymm14
+	vmovdqa	480-512(%rax),%ymm15
+	vmovdqa	512-512(%rax),%ymm4
+	vmovdqa	544-512(%rax),%ymm5
+	vmovdqa	576-512(%rax),%ymm6
+	vmovdqa	608-512(%rax),%ymm7
+	vpaddd	L$eight(%rip),%ymm4,%ymm4
+
+L$oop_enter8x:
+	vmovdqa	%ymm14,64(%rsp)
+	vmovdqa	%ymm15,96(%rsp)
+	vbroadcasti128	(%r10),%ymm15
+	vmovdqa	%ymm4,512-512(%rax)
+	movl	$10,%eax
+	jmp	L$oop8x
+
+.p2align	5
+L$oop8x:
+	vpaddd	%ymm0,%ymm8,%ymm8
+	vpxor	%ymm4,%ymm8,%ymm4
+	vpshufb	%ymm15,%ymm4,%ymm4
+	vpaddd	%ymm1,%ymm9,%ymm9
+	vpxor	%ymm5,%ymm9,%ymm5
+	vpshufb	%ymm15,%ymm5,%ymm5
+	vpaddd	%ymm4,%ymm12,%ymm12
+	vpxor	%ymm0,%ymm12,%ymm0
+	vpslld	$12,%ymm0,%ymm14
+	vpsrld	$20,%ymm0,%ymm0
+	vpor	%ymm0,%ymm14,%ymm0
+	vbroadcasti128	(%r11),%ymm14
+	vpaddd	%ymm5,%ymm13,%ymm13
+	vpxor	%ymm1,%ymm13,%ymm1
+	vpslld	$12,%ymm1,%ymm15
+	vpsrld	$20,%ymm1,%ymm1
+	vpor	%ymm1,%ymm15,%ymm1
+	vpaddd	%ymm0,%ymm8,%ymm8
+	vpxor	%ymm4,%ymm8,%ymm4
+	vpshufb	%ymm14,%ymm4,%ymm4
+	vpaddd	%ymm1,%ymm9,%ymm9
+	vpxor	%ymm5,%ymm9,%ymm5
+	vpshufb	%ymm14,%ymm5,%ymm5
+	vpaddd	%ymm4,%ymm12,%ymm12
+	vpxor	%ymm0,%ymm12,%ymm0
+	vpslld	$7,%ymm0,%ymm15
+	vpsrld	$25,%ymm0,%ymm0
+	vpor	%ymm0,%ymm15,%ymm0
+	vbroadcasti128	(%r10),%ymm15
+	vpaddd	%ymm5,%ymm13,%ymm13
+	vpxor	%ymm1,%ymm13,%ymm1
+	vpslld	$7,%ymm1,%ymm14
+	vpsrld	$25,%ymm1,%ymm1
+	vpor	%ymm1,%ymm14,%ymm1
+	vmovdqa	%ymm12,0(%rsp)
+	vmovdqa	%ymm13,32(%rsp)
+	vmovdqa	64(%rsp),%ymm12
+	vmovdqa	96(%rsp),%ymm13
+	vpaddd	%ymm2,%ymm10,%ymm10
+	vpxor	%ymm6,%ymm10,%ymm6
+	vpshufb	%ymm15,%ymm6,%ymm6
+	vpaddd	%ymm3,%ymm11,%ymm11
+	vpxor	%ymm7,%ymm11,%ymm7
+	vpshufb	%ymm15,%ymm7,%ymm7
+	vpaddd	%ymm6,%ymm12,%ymm12
+	vpxor	%ymm2,%ymm12,%ymm2
+	vpslld	$12,%ymm2,%ymm14
+	vpsrld	$20,%ymm2,%ymm2
+	vpor	%ymm2,%ymm14,%ymm2
+	vbroadcasti128	(%r11),%ymm14
+	vpaddd	%ymm7,%ymm13,%ymm13
+	vpxor	%ymm3,%ymm13,%ymm3
+	vpslld	$12,%ymm3,%ymm15
+	vpsrld	$20,%ymm3,%ymm3
+	vpor	%ymm3,%ymm15,%ymm3
+	vpaddd	%ymm2,%ymm10,%ymm10
+	vpxor	%ymm6,%ymm10,%ymm6
+	vpshufb	%ymm14,%ymm6,%ymm6
+	vpaddd	%ymm3,%ymm11,%ymm11
+	vpxor	%ymm7,%ymm11,%ymm7
+	vpshufb	%ymm14,%ymm7,%ymm7
+	vpaddd	%ymm6,%ymm12,%ymm12
+	vpxor	%ymm2,%ymm12,%ymm2
+	vpslld	$7,%ymm2,%ymm15
+	vpsrld	$25,%ymm2,%ymm2
+	vpor	%ymm2,%ymm15,%ymm2
+	vbroadcasti128	(%r10),%ymm15
+	vpaddd	%ymm7,%ymm13,%ymm13
+	vpxor	%ymm3,%ymm13,%ymm3
+	vpslld	$7,%ymm3,%ymm14
+	vpsrld	$25,%ymm3,%ymm3
+	vpor	%ymm3,%ymm14,%ymm3
+	vpaddd	%ymm1,%ymm8,%ymm8
+	vpxor	%ymm7,%ymm8,%ymm7
+	vpshufb	%ymm15,%ymm7,%ymm7
+	vpaddd	%ymm2,%ymm9,%ymm9
+	vpxor	%ymm4,%ymm9,%ymm4
+	vpshufb	%ymm15,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm12,%ymm12
+	vpxor	%ymm1,%ymm12,%ymm1
+	vpslld	$12,%ymm1,%ymm14
+	vpsrld	$20,%ymm1,%ymm1
+	vpor	%ymm1,%ymm14,%ymm1
+	vbroadcasti128	(%r11),%ymm14
+	vpaddd	%ymm4,%ymm13,%ymm13
+	vpxor	%ymm2,%ymm13,%ymm2
+	vpslld	$12,%ymm2,%ymm15
+	vpsrld	$20,%ymm2,%ymm2
+	vpor	%ymm2,%ymm15,%ymm2
+	vpaddd	%ymm1,%ymm8,%ymm8
+	vpxor	%ymm7,%ymm8,%ymm7
+	vpshufb	%ymm14,%ymm7,%ymm7
+	vpaddd	%ymm2,%ymm9,%ymm9
+	vpxor	%ymm4,%ymm9,%ymm4
+	vpshufb	%ymm14,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm12,%ymm12
+	vpxor	%ymm1,%ymm12,%ymm1
+	vpslld	$7,%ymm1,%ymm15
+	vpsrld	$25,%ymm1,%ymm1
+	vpor	%ymm1,%ymm15,%ymm1
+	vbroadcasti128	(%r10),%ymm15
+	vpaddd	%ymm4,%ymm13,%ymm13
+	vpxor	%ymm2,%ymm13,%ymm2
+	vpslld	$7,%ymm2,%ymm14
+	vpsrld	$25,%ymm2,%ymm2
+	vpor	%ymm2,%ymm14,%ymm2
+	vmovdqa	%ymm12,64(%rsp)
+	vmovdqa	%ymm13,96(%rsp)
+	vmovdqa	0(%rsp),%ymm12
+	vmovdqa	32(%rsp),%ymm13
+	vpaddd	%ymm3,%ymm10,%ymm10
+	vpxor	%ymm5,%ymm10,%ymm5
+	vpshufb	%ymm15,%ymm5,%ymm5
+	vpaddd	%ymm0,%ymm11,%ymm11
+	vpxor	%ymm6,%ymm11,%ymm6
+	vpshufb	%ymm15,%ymm6,%ymm6
+	vpaddd	%ymm5,%ymm12,%ymm12
+	vpxor	%ymm3,%ymm12,%ymm3
+	vpslld	$12,%ymm3,%ymm14
+	vpsrld	$20,%ymm3,%ymm3
+	vpor	%ymm3,%ymm14,%ymm3
+	vbroadcasti128	(%r11),%ymm14
+	vpaddd	%ymm6,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm13,%ymm0
+	vpslld	$12,%ymm0,%ymm15
+	vpsrld	$20,%ymm0,%ymm0
+	vpor	%ymm0,%ymm15,%ymm0
+	vpaddd	%ymm3,%ymm10,%ymm10
+	vpxor	%ymm5,%ymm10,%ymm5
+	vpshufb	%ymm14,%ymm5,%ymm5
+	vpaddd	%ymm0,%ymm11,%ymm11
+	vpxor	%ymm6,%ymm11,%ymm6
+	vpshufb	%ymm14,%ymm6,%ymm6
+	vpaddd	%ymm5,%ymm12,%ymm12
+	vpxor	%ymm3,%ymm12,%ymm3
+	vpslld	$7,%ymm3,%ymm15
+	vpsrld	$25,%ymm3,%ymm3
+	vpor	%ymm3,%ymm15,%ymm3
+	vbroadcasti128	(%r10),%ymm15
+	vpaddd	%ymm6,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm13,%ymm0
+	vpslld	$7,%ymm0,%ymm14
+	vpsrld	$25,%ymm0,%ymm0
+	vpor	%ymm0,%ymm14,%ymm0
+	decl	%eax
+	jnz	L$oop8x
+
+	leaq	512(%rsp),%rax
+	vpaddd	128-256(%rcx),%ymm8,%ymm8
+	vpaddd	160-256(%rcx),%ymm9,%ymm9
+	vpaddd	192-256(%rcx),%ymm10,%ymm10
+	vpaddd	224-256(%rcx),%ymm11,%ymm11
+
+	vpunpckldq	%ymm9,%ymm8,%ymm14
+	vpunpckldq	%ymm11,%ymm10,%ymm15
+	vpunpckhdq	%ymm9,%ymm8,%ymm8
+	vpunpckhdq	%ymm11,%ymm10,%ymm10
+	vpunpcklqdq	%ymm15,%ymm14,%ymm9
+	vpunpckhqdq	%ymm15,%ymm14,%ymm14
+	vpunpcklqdq	%ymm10,%ymm8,%ymm11
+	vpunpckhqdq	%ymm10,%ymm8,%ymm8
+	vpaddd	256-256(%rcx),%ymm0,%ymm0
+	vpaddd	288-256(%rcx),%ymm1,%ymm1
+	vpaddd	320-256(%rcx),%ymm2,%ymm2
+	vpaddd	352-256(%rcx),%ymm3,%ymm3
+
+	vpunpckldq	%ymm1,%ymm0,%ymm10
+	vpunpckldq	%ymm3,%ymm2,%ymm15
+	vpunpckhdq	%ymm1,%ymm0,%ymm0
+	vpunpckhdq	%ymm3,%ymm2,%ymm2
+	vpunpcklqdq	%ymm15,%ymm10,%ymm1
+	vpunpckhqdq	%ymm15,%ymm10,%ymm10
+	vpunpcklqdq	%ymm2,%ymm0,%ymm3
+	vpunpckhqdq	%ymm2,%ymm0,%ymm0
+	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
+	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
+	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
+	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
+	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
+	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
+	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
+	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
+	vmovdqa	%ymm15,0(%rsp)
+	vmovdqa	%ymm9,32(%rsp)
+	vmovdqa	64(%rsp),%ymm15
+	vmovdqa	96(%rsp),%ymm9
+
+	vpaddd	384-512(%rax),%ymm12,%ymm12
+	vpaddd	416-512(%rax),%ymm13,%ymm13
+	vpaddd	448-512(%rax),%ymm15,%ymm15
+	vpaddd	480-512(%rax),%ymm9,%ymm9
+
+	vpunpckldq	%ymm13,%ymm12,%ymm2
+	vpunpckldq	%ymm9,%ymm15,%ymm8
+	vpunpckhdq	%ymm13,%ymm12,%ymm12
+	vpunpckhdq	%ymm9,%ymm15,%ymm15
+	vpunpcklqdq	%ymm8,%ymm2,%ymm13
+	vpunpckhqdq	%ymm8,%ymm2,%ymm2
+	vpunpcklqdq	%ymm15,%ymm12,%ymm9
+	vpunpckhqdq	%ymm15,%ymm12,%ymm12
+	vpaddd	512-512(%rax),%ymm4,%ymm4
+	vpaddd	544-512(%rax),%ymm5,%ymm5
+	vpaddd	576-512(%rax),%ymm6,%ymm6
+	vpaddd	608-512(%rax),%ymm7,%ymm7
+
+	vpunpckldq	%ymm5,%ymm4,%ymm15
+	vpunpckldq	%ymm7,%ymm6,%ymm8
+	vpunpckhdq	%ymm5,%ymm4,%ymm4
+	vpunpckhdq	%ymm7,%ymm6,%ymm6
+	vpunpcklqdq	%ymm8,%ymm15,%ymm5
+	vpunpckhqdq	%ymm8,%ymm15,%ymm15
+	vpunpcklqdq	%ymm6,%ymm4,%ymm7
+	vpunpckhqdq	%ymm6,%ymm4,%ymm4
+	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
+	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
+	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
+	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
+	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
+	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
+	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
+	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
+	vmovdqa	0(%rsp),%ymm6
+	vmovdqa	32(%rsp),%ymm12
+
+	cmpq	$512,%rdx
+	jb	L$tail8x
+
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	leaq	128(%rsi),%rsi
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	leaq	128(%rdi),%rdi
+
+	vpxor	0(%rsi),%ymm12,%ymm12
+	vpxor	32(%rsi),%ymm13,%ymm13
+	vpxor	64(%rsi),%ymm10,%ymm10
+	vpxor	96(%rsi),%ymm15,%ymm15
+	leaq	128(%rsi),%rsi
+	vmovdqu	%ymm12,0(%rdi)
+	vmovdqu	%ymm13,32(%rdi)
+	vmovdqu	%ymm10,64(%rdi)
+	vmovdqu	%ymm15,96(%rdi)
+	leaq	128(%rdi),%rdi
+
+	vpxor	0(%rsi),%ymm14,%ymm14
+	vpxor	32(%rsi),%ymm2,%ymm2
+	vpxor	64(%rsi),%ymm3,%ymm3
+	vpxor	96(%rsi),%ymm7,%ymm7
+	leaq	128(%rsi),%rsi
+	vmovdqu	%ymm14,0(%rdi)
+	vmovdqu	%ymm2,32(%rdi)
+	vmovdqu	%ymm3,64(%rdi)
+	vmovdqu	%ymm7,96(%rdi)
+	leaq	128(%rdi),%rdi
+
+	vpxor	0(%rsi),%ymm11,%ymm11
+	vpxor	32(%rsi),%ymm9,%ymm9
+	vpxor	64(%rsi),%ymm0,%ymm0
+	vpxor	96(%rsi),%ymm4,%ymm4
+	leaq	128(%rsi),%rsi
+	vmovdqu	%ymm11,0(%rdi)
+	vmovdqu	%ymm9,32(%rdi)
+	vmovdqu	%ymm0,64(%rdi)
+	vmovdqu	%ymm4,96(%rdi)
+	leaq	128(%rdi),%rdi
+
+	subq	$512,%rdx
+	jnz	L$oop_outer8x
+
+	jmp	L$done8x
+
+L$tail8x:
+	cmpq	$448,%rdx
+	jae	L$448_or_more8x
+	cmpq	$384,%rdx
+	jae	L$384_or_more8x
+	cmpq	$320,%rdx
+	jae	L$320_or_more8x
+	cmpq	$256,%rdx
+	jae	L$256_or_more8x
+	cmpq	$192,%rdx
+	jae	L$192_or_more8x
+	cmpq	$128,%rdx
+	jae	L$128_or_more8x
+	cmpq	$64,%rdx
+	jae	L$64_or_more8x
+
+	xorq	%r10,%r10
+	vmovdqa	%ymm6,0(%rsp)
+	vmovdqa	%ymm8,32(%rsp)
+	jmp	L$oop_tail8x
+
+.p2align	5
+L$64_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	je	L$done8x
+
+	leaq	64(%rsi),%rsi
+	xorq	%r10,%r10
+	vmovdqa	%ymm1,0(%rsp)
+	leaq	64(%rdi),%rdi
+	subq	$64,%rdx
+	vmovdqa	%ymm5,32(%rsp)
+	jmp	L$oop_tail8x
+
+.p2align	5
+L$128_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	je	L$done8x
+
+	leaq	128(%rsi),%rsi
+	xorq	%r10,%r10
+	vmovdqa	%ymm12,0(%rsp)
+	leaq	128(%rdi),%rdi
+	subq	$128,%rdx
+	vmovdqa	%ymm13,32(%rsp)
+	jmp	L$oop_tail8x
+
+.p2align	5
+L$192_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vpxor	128(%rsi),%ymm12,%ymm12
+	vpxor	160(%rsi),%ymm13,%ymm13
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	vmovdqu	%ymm12,128(%rdi)
+	vmovdqu	%ymm13,160(%rdi)
+	je	L$done8x
+
+	leaq	192(%rsi),%rsi
+	xorq	%r10,%r10
+	vmovdqa	%ymm10,0(%rsp)
+	leaq	192(%rdi),%rdi
+	subq	$192,%rdx
+	vmovdqa	%ymm15,32(%rsp)
+	jmp	L$oop_tail8x
+
+.p2align	5
+L$256_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vpxor	128(%rsi),%ymm12,%ymm12
+	vpxor	160(%rsi),%ymm13,%ymm13
+	vpxor	192(%rsi),%ymm10,%ymm10
+	vpxor	224(%rsi),%ymm15,%ymm15
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	vmovdqu	%ymm12,128(%rdi)
+	vmovdqu	%ymm13,160(%rdi)
+	vmovdqu	%ymm10,192(%rdi)
+	vmovdqu	%ymm15,224(%rdi)
+	je	L$done8x
+
+	leaq	256(%rsi),%rsi
+	xorq	%r10,%r10
+	vmovdqa	%ymm14,0(%rsp)
+	leaq	256(%rdi),%rdi
+	subq	$256,%rdx
+	vmovdqa	%ymm2,32(%rsp)
+	jmp	L$oop_tail8x
+
+.p2align	5
+L$320_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vpxor	128(%rsi),%ymm12,%ymm12
+	vpxor	160(%rsi),%ymm13,%ymm13
+	vpxor	192(%rsi),%ymm10,%ymm10
+	vpxor	224(%rsi),%ymm15,%ymm15
+	vpxor	256(%rsi),%ymm14,%ymm14
+	vpxor	288(%rsi),%ymm2,%ymm2
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	vmovdqu	%ymm12,128(%rdi)
+	vmovdqu	%ymm13,160(%rdi)
+	vmovdqu	%ymm10,192(%rdi)
+	vmovdqu	%ymm15,224(%rdi)
+	vmovdqu	%ymm14,256(%rdi)
+	vmovdqu	%ymm2,288(%rdi)
+	je	L$done8x
+
+	leaq	320(%rsi),%rsi
+	xorq	%r10,%r10
+	vmovdqa	%ymm3,0(%rsp)
+	leaq	320(%rdi),%rdi
+	subq	$320,%rdx
+	vmovdqa	%ymm7,32(%rsp)
+	jmp	L$oop_tail8x
+
+.p2align	5
+L$384_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vpxor	128(%rsi),%ymm12,%ymm12
+	vpxor	160(%rsi),%ymm13,%ymm13
+	vpxor	192(%rsi),%ymm10,%ymm10
+	vpxor	224(%rsi),%ymm15,%ymm15
+	vpxor	256(%rsi),%ymm14,%ymm14
+	vpxor	288(%rsi),%ymm2,%ymm2
+	vpxor	320(%rsi),%ymm3,%ymm3
+	vpxor	352(%rsi),%ymm7,%ymm7
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	vmovdqu	%ymm12,128(%rdi)
+	vmovdqu	%ymm13,160(%rdi)
+	vmovdqu	%ymm10,192(%rdi)
+	vmovdqu	%ymm15,224(%rdi)
+	vmovdqu	%ymm14,256(%rdi)
+	vmovdqu	%ymm2,288(%rdi)
+	vmovdqu	%ymm3,320(%rdi)
+	vmovdqu	%ymm7,352(%rdi)
+	je	L$done8x
+
+	leaq	384(%rsi),%rsi
+	xorq	%r10,%r10
+	vmovdqa	%ymm11,0(%rsp)
+	leaq	384(%rdi),%rdi
+	subq	$384,%rdx
+	vmovdqa	%ymm9,32(%rsp)
+	jmp	L$oop_tail8x
+
+.p2align	5
+L$448_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vpxor	128(%rsi),%ymm12,%ymm12
+	vpxor	160(%rsi),%ymm13,%ymm13
+	vpxor	192(%rsi),%ymm10,%ymm10
+	vpxor	224(%rsi),%ymm15,%ymm15
+	vpxor	256(%rsi),%ymm14,%ymm14
+	vpxor	288(%rsi),%ymm2,%ymm2
+	vpxor	320(%rsi),%ymm3,%ymm3
+	vpxor	352(%rsi),%ymm7,%ymm7
+	vpxor	384(%rsi),%ymm11,%ymm11
+	vpxor	416(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	vmovdqu	%ymm12,128(%rdi)
+	vmovdqu	%ymm13,160(%rdi)
+	vmovdqu	%ymm10,192(%rdi)
+	vmovdqu	%ymm15,224(%rdi)
+	vmovdqu	%ymm14,256(%rdi)
+	vmovdqu	%ymm2,288(%rdi)
+	vmovdqu	%ymm3,320(%rdi)
+	vmovdqu	%ymm7,352(%rdi)
+	vmovdqu	%ymm11,384(%rdi)
+	vmovdqu	%ymm9,416(%rdi)
+	je	L$done8x
+
+	leaq	448(%rsi),%rsi
+	xorq	%r10,%r10
+	vmovdqa	%ymm0,0(%rsp)
+	leaq	448(%rdi),%rdi
+	subq	$448,%rdx
+	vmovdqa	%ymm4,32(%rsp)
+
+L$oop_tail8x:
+	movzbl	(%rsi,%r10,1),%eax
+	movzbl	(%rsp,%r10,1),%ecx
+	leaq	1(%r10),%r10
+	xorl	%ecx,%eax
+	movb	%al,-1(%rdi,%r10,1)
+	decq	%rdx
+	jnz	L$oop_tail8x
+
+L$done8x:
+	vzeroall
+	leaq	(%r9),%rsp
+
+L$8x_epilogue:
+	ret
+
+
+#endif
diff --git a/gen/crypto/chacha-x86_64-linux.S b/gen/crypto/chacha-x86_64-linux.S
new file mode 100644
index 0000000..9dbf7d1
--- /dev/null
+++ b/gen/crypto/chacha-x86_64-linux.S
@@ -0,0 +1,1610 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text	
+
+.section	.rodata
+.align	64
+.Lzero:
+.long	0,0,0,0
+.Lone:
+.long	1,0,0,0
+.Linc:
+.long	0,1,2,3
+.Lfour:
+.long	4,4,4,4
+.Lincy:
+.long	0,2,4,6,1,3,5,7
+.Leight:
+.long	8,8,8,8,8,8,8,8
+.Lrot16:
+.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
+.Lrot24:
+.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
+.Lsigma:
+.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
+.align	64
+.Lzeroz:
+.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
+.Lfourz:
+.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
+.Lincz:
+.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+.Lsixteen:
+.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
+.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.text	
+.globl	ChaCha20_ctr32_nohw
+.hidden ChaCha20_ctr32_nohw
+.type	ChaCha20_ctr32_nohw,@function
+.align	64
+ChaCha20_ctr32_nohw:
+.cfi_startproc	
+_CET_ENDBR
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	rbx,-16
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	rbp,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	r15,-56
+	subq	$64+24,%rsp
+.cfi_adjust_cfa_offset	88
+.Lctr32_body:
+
+
+	movdqu	(%rcx),%xmm1
+	movdqu	16(%rcx),%xmm2
+	movdqu	(%r8),%xmm3
+	movdqa	.Lone(%rip),%xmm4
+
+
+	movdqa	%xmm1,16(%rsp)
+	movdqa	%xmm2,32(%rsp)
+	movdqa	%xmm3,48(%rsp)
+	movq	%rdx,%rbp
+	jmp	.Loop_outer
+
+.align	32
+.Loop_outer:
+	movl	$0x61707865,%eax
+	movl	$0x3320646e,%ebx
+	movl	$0x79622d32,%ecx
+	movl	$0x6b206574,%edx
+	movl	16(%rsp),%r8d
+	movl	20(%rsp),%r9d
+	movl	24(%rsp),%r10d
+	movl	28(%rsp),%r11d
+	movd	%xmm3,%r12d
+	movl	52(%rsp),%r13d
+	movl	56(%rsp),%r14d
+	movl	60(%rsp),%r15d
+
+	movq	%rbp,64+0(%rsp)
+	movl	$10,%ebp
+	movq	%rsi,64+8(%rsp)
+.byte	102,72,15,126,214
+	movq	%rdi,64+16(%rsp)
+	movq	%rsi,%rdi
+	shrq	$32,%rdi
+	jmp	.Loop
+
+.align	32
+.Loop:
+	addl	%r8d,%eax
+	xorl	%eax,%r12d
+	roll	$16,%r12d
+	addl	%r9d,%ebx
+	xorl	%ebx,%r13d
+	roll	$16,%r13d
+	addl	%r12d,%esi
+	xorl	%esi,%r8d
+	roll	$12,%r8d
+	addl	%r13d,%edi
+	xorl	%edi,%r9d
+	roll	$12,%r9d
+	addl	%r8d,%eax
+	xorl	%eax,%r12d
+	roll	$8,%r12d
+	addl	%r9d,%ebx
+	xorl	%ebx,%r13d
+	roll	$8,%r13d
+	addl	%r12d,%esi
+	xorl	%esi,%r8d
+	roll	$7,%r8d
+	addl	%r13d,%edi
+	xorl	%edi,%r9d
+	roll	$7,%r9d
+	movl	%esi,32(%rsp)
+	movl	%edi,36(%rsp)
+	movl	40(%rsp),%esi
+	movl	44(%rsp),%edi
+	addl	%r10d,%ecx
+	xorl	%ecx,%r14d
+	roll	$16,%r14d
+	addl	%r11d,%edx
+	xorl	%edx,%r15d
+	roll	$16,%r15d
+	addl	%r14d,%esi
+	xorl	%esi,%r10d
+	roll	$12,%r10d
+	addl	%r15d,%edi
+	xorl	%edi,%r11d
+	roll	$12,%r11d
+	addl	%r10d,%ecx
+	xorl	%ecx,%r14d
+	roll	$8,%r14d
+	addl	%r11d,%edx
+	xorl	%edx,%r15d
+	roll	$8,%r15d
+	addl	%r14d,%esi
+	xorl	%esi,%r10d
+	roll	$7,%r10d
+	addl	%r15d,%edi
+	xorl	%edi,%r11d
+	roll	$7,%r11d
+	addl	%r9d,%eax
+	xorl	%eax,%r15d
+	roll	$16,%r15d
+	addl	%r10d,%ebx
+	xorl	%ebx,%r12d
+	roll	$16,%r12d
+	addl	%r15d,%esi
+	xorl	%esi,%r9d
+	roll	$12,%r9d
+	addl	%r12d,%edi
+	xorl	%edi,%r10d
+	roll	$12,%r10d
+	addl	%r9d,%eax
+	xorl	%eax,%r15d
+	roll	$8,%r15d
+	addl	%r10d,%ebx
+	xorl	%ebx,%r12d
+	roll	$8,%r12d
+	addl	%r15d,%esi
+	xorl	%esi,%r9d
+	roll	$7,%r9d
+	addl	%r12d,%edi
+	xorl	%edi,%r10d
+	roll	$7,%r10d
+	movl	%esi,40(%rsp)
+	movl	%edi,44(%rsp)
+	movl	32(%rsp),%esi
+	movl	36(%rsp),%edi
+	addl	%r11d,%ecx
+	xorl	%ecx,%r13d
+	roll	$16,%r13d
+	addl	%r8d,%edx
+	xorl	%edx,%r14d
+	roll	$16,%r14d
+	addl	%r13d,%esi
+	xorl	%esi,%r11d
+	roll	$12,%r11d
+	addl	%r14d,%edi
+	xorl	%edi,%r8d
+	roll	$12,%r8d
+	addl	%r11d,%ecx
+	xorl	%ecx,%r13d
+	roll	$8,%r13d
+	addl	%r8d,%edx
+	xorl	%edx,%r14d
+	roll	$8,%r14d
+	addl	%r13d,%esi
+	xorl	%esi,%r11d
+	roll	$7,%r11d
+	addl	%r14d,%edi
+	xorl	%edi,%r8d
+	roll	$7,%r8d
+	decl	%ebp
+	jnz	.Loop
+	movl	%edi,36(%rsp)
+	movl	%esi,32(%rsp)
+	movq	64(%rsp),%rbp
+	movdqa	%xmm2,%xmm1
+	movq	64+8(%rsp),%rsi
+	paddd	%xmm4,%xmm3
+	movq	64+16(%rsp),%rdi
+
+	addl	$0x61707865,%eax
+	addl	$0x3320646e,%ebx
+	addl	$0x79622d32,%ecx
+	addl	$0x6b206574,%edx
+	addl	16(%rsp),%r8d
+	addl	20(%rsp),%r9d
+	addl	24(%rsp),%r10d
+	addl	28(%rsp),%r11d
+	addl	48(%rsp),%r12d
+	addl	52(%rsp),%r13d
+	addl	56(%rsp),%r14d
+	addl	60(%rsp),%r15d
+	paddd	32(%rsp),%xmm1
+
+	cmpq	$64,%rbp
+	jb	.Ltail
+
+	xorl	0(%rsi),%eax
+	xorl	4(%rsi),%ebx
+	xorl	8(%rsi),%ecx
+	xorl	12(%rsi),%edx
+	xorl	16(%rsi),%r8d
+	xorl	20(%rsi),%r9d
+	xorl	24(%rsi),%r10d
+	xorl	28(%rsi),%r11d
+	movdqu	32(%rsi),%xmm0
+	xorl	48(%rsi),%r12d
+	xorl	52(%rsi),%r13d
+	xorl	56(%rsi),%r14d
+	xorl	60(%rsi),%r15d
+	leaq	64(%rsi),%rsi
+	pxor	%xmm1,%xmm0
+
+	movdqa	%xmm2,32(%rsp)
+	movd	%xmm3,48(%rsp)
+
+	movl	%eax,0(%rdi)
+	movl	%ebx,4(%rdi)
+	movl	%ecx,8(%rdi)
+	movl	%edx,12(%rdi)
+	movl	%r8d,16(%rdi)
+	movl	%r9d,20(%rdi)
+	movl	%r10d,24(%rdi)
+	movl	%r11d,28(%rdi)
+	movdqu	%xmm0,32(%rdi)
+	movl	%r12d,48(%rdi)
+	movl	%r13d,52(%rdi)
+	movl	%r14d,56(%rdi)
+	movl	%r15d,60(%rdi)
+	leaq	64(%rdi),%rdi
+
+	subq	$64,%rbp
+	jnz	.Loop_outer
+
+	jmp	.Ldone
+
+.align	16
+.Ltail:
+	movl	%eax,0(%rsp)
+	movl	%ebx,4(%rsp)
+	xorq	%rbx,%rbx
+	movl	%ecx,8(%rsp)
+	movl	%edx,12(%rsp)
+	movl	%r8d,16(%rsp)
+	movl	%r9d,20(%rsp)
+	movl	%r10d,24(%rsp)
+	movl	%r11d,28(%rsp)
+	movdqa	%xmm1,32(%rsp)
+	movl	%r12d,48(%rsp)
+	movl	%r13d,52(%rsp)
+	movl	%r14d,56(%rsp)
+	movl	%r15d,60(%rsp)
+
+.Loop_tail:
+	movzbl	(%rsi,%rbx,1),%eax
+	movzbl	(%rsp,%rbx,1),%edx
+	leaq	1(%rbx),%rbx
+	xorl	%edx,%eax
+	movb	%al,-1(%rdi,%rbx,1)
+	decq	%rbp
+	jnz	.Loop_tail
+
+.Ldone:
+	leaq	64+24+48(%rsp),%rsi
+	movq	-48(%rsi),%r15
+.cfi_restore	r15
+	movq	-40(%rsi),%r14
+.cfi_restore	r14
+	movq	-32(%rsi),%r13
+.cfi_restore	r13
+	movq	-24(%rsi),%r12
+.cfi_restore	r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	rbx
+	leaq	(%rsi),%rsp
+.cfi_adjust_cfa_offset	-136
+.Lno_data:
+	ret
+.cfi_endproc	
+.size	ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw
+.globl	ChaCha20_ctr32_ssse3
+.hidden ChaCha20_ctr32_ssse3
+.type	ChaCha20_ctr32_ssse3,@function
+.align	32
+ChaCha20_ctr32_ssse3:
+.cfi_startproc	
+_CET_ENDBR
+	movq	%rsp,%r9
+.cfi_def_cfa_register	r9
+	subq	$64+8,%rsp
+	movdqa	.Lsigma(%rip),%xmm0
+	movdqu	(%rcx),%xmm1
+	movdqu	16(%rcx),%xmm2
+	movdqu	(%r8),%xmm3
+	movdqa	.Lrot16(%rip),%xmm6
+	movdqa	.Lrot24(%rip),%xmm7
+
+	movdqa	%xmm0,0(%rsp)
+	movdqa	%xmm1,16(%rsp)
+	movdqa	%xmm2,32(%rsp)
+	movdqa	%xmm3,48(%rsp)
+	movq	$10,%r8
+	jmp	.Loop_ssse3
+
+.align	32
+.Loop_outer_ssse3:
+	movdqa	.Lone(%rip),%xmm3
+	movdqa	0(%rsp),%xmm0
+	movdqa	16(%rsp),%xmm1
+	movdqa	32(%rsp),%xmm2
+	paddd	48(%rsp),%xmm3
+	movq	$10,%r8
+	movdqa	%xmm3,48(%rsp)
+	jmp	.Loop_ssse3
+
+.align	32
+.Loop_ssse3:
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,222
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$20,%xmm1
+	pslld	$12,%xmm4
+	por	%xmm4,%xmm1
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,223
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$25,%xmm1
+	pslld	$7,%xmm4
+	por	%xmm4,%xmm1
+	pshufd	$78,%xmm2,%xmm2
+	pshufd	$57,%xmm1,%xmm1
+	pshufd	$147,%xmm3,%xmm3
+	nop
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,222
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$20,%xmm1
+	pslld	$12,%xmm4
+	por	%xmm4,%xmm1
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,223
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$25,%xmm1
+	pslld	$7,%xmm4
+	por	%xmm4,%xmm1
+	pshufd	$78,%xmm2,%xmm2
+	pshufd	$147,%xmm1,%xmm1
+	pshufd	$57,%xmm3,%xmm3
+	decq	%r8
+	jnz	.Loop_ssse3
+	paddd	0(%rsp),%xmm0
+	paddd	16(%rsp),%xmm1
+	paddd	32(%rsp),%xmm2
+	paddd	48(%rsp),%xmm3
+
+	cmpq	$64,%rdx
+	jb	.Ltail_ssse3
+
+	movdqu	0(%rsi),%xmm4
+	movdqu	16(%rsi),%xmm5
+	pxor	%xmm4,%xmm0
+	movdqu	32(%rsi),%xmm4
+	pxor	%xmm5,%xmm1
+	movdqu	48(%rsi),%xmm5
+	leaq	64(%rsi),%rsi
+	pxor	%xmm4,%xmm2
+	pxor	%xmm5,%xmm3
+
+	movdqu	%xmm0,0(%rdi)
+	movdqu	%xmm1,16(%rdi)
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm3,48(%rdi)
+	leaq	64(%rdi),%rdi
+
+	subq	$64,%rdx
+	jnz	.Loop_outer_ssse3
+
+	jmp	.Ldone_ssse3
+
+.align	16
+.Ltail_ssse3:
+	movdqa	%xmm0,0(%rsp)
+	movdqa	%xmm1,16(%rsp)
+	movdqa	%xmm2,32(%rsp)
+	movdqa	%xmm3,48(%rsp)
+	xorq	%r8,%r8
+
+.Loop_tail_ssse3:
+	movzbl	(%rsi,%r8,1),%eax
+	movzbl	(%rsp,%r8,1),%ecx
+	leaq	1(%r8),%r8
+	xorl	%ecx,%eax
+	movb	%al,-1(%rdi,%r8,1)
+	decq	%rdx
+	jnz	.Loop_tail_ssse3
+
+.Ldone_ssse3:
+	leaq	(%r9),%rsp
+.cfi_def_cfa_register	rsp
+.Lssse3_epilogue:
+	ret
+.cfi_endproc	
+.size	ChaCha20_ctr32_ssse3,.-ChaCha20_ctr32_ssse3
+.globl	ChaCha20_ctr32_ssse3_4x
+.hidden ChaCha20_ctr32_ssse3_4x
+.type	ChaCha20_ctr32_ssse3_4x,@function
+.align	32
+ChaCha20_ctr32_ssse3_4x:
+.cfi_startproc	
+_CET_ENDBR
+	movq	%rsp,%r9
+.cfi_def_cfa_register	r9
+	movq	%r10,%r11
+	subq	$0x140+8,%rsp
+	movdqa	.Lsigma(%rip),%xmm11
+	movdqu	(%rcx),%xmm15
+	movdqu	16(%rcx),%xmm7
+	movdqu	(%r8),%xmm3
+	leaq	256(%rsp),%rcx
+	leaq	.Lrot16(%rip),%r10
+	leaq	.Lrot24(%rip),%r11
+
+	pshufd	$0x00,%xmm11,%xmm8
+	pshufd	$0x55,%xmm11,%xmm9
+	movdqa	%xmm8,64(%rsp)
+	pshufd	$0xaa,%xmm11,%xmm10
+	movdqa	%xmm9,80(%rsp)
+	pshufd	$0xff,%xmm11,%xmm11
+	movdqa	%xmm10,96(%rsp)
+	movdqa	%xmm11,112(%rsp)
+
+	pshufd	$0x00,%xmm15,%xmm12
+	pshufd	$0x55,%xmm15,%xmm13
+	movdqa	%xmm12,128-256(%rcx)
+	pshufd	$0xaa,%xmm15,%xmm14
+	movdqa	%xmm13,144-256(%rcx)
+	pshufd	$0xff,%xmm15,%xmm15
+	movdqa	%xmm14,160-256(%rcx)
+	movdqa	%xmm15,176-256(%rcx)
+
+	pshufd	$0x00,%xmm7,%xmm4
+	pshufd	$0x55,%xmm7,%xmm5
+	movdqa	%xmm4,192-256(%rcx)
+	pshufd	$0xaa,%xmm7,%xmm6
+	movdqa	%xmm5,208-256(%rcx)
+	pshufd	$0xff,%xmm7,%xmm7
+	movdqa	%xmm6,224-256(%rcx)
+	movdqa	%xmm7,240-256(%rcx)
+
+	pshufd	$0x00,%xmm3,%xmm0
+	pshufd	$0x55,%xmm3,%xmm1
+	paddd	.Linc(%rip),%xmm0
+	pshufd	$0xaa,%xmm3,%xmm2
+	movdqa	%xmm1,272-256(%rcx)
+	pshufd	$0xff,%xmm3,%xmm3
+	movdqa	%xmm2,288-256(%rcx)
+	movdqa	%xmm3,304-256(%rcx)
+
+	jmp	.Loop_enter4x
+
+.align	32
+.Loop_outer4x:
+	movdqa	64(%rsp),%xmm8
+	movdqa	80(%rsp),%xmm9
+	movdqa	96(%rsp),%xmm10
+	movdqa	112(%rsp),%xmm11
+	movdqa	128-256(%rcx),%xmm12
+	movdqa	144-256(%rcx),%xmm13
+	movdqa	160-256(%rcx),%xmm14
+	movdqa	176-256(%rcx),%xmm15
+	movdqa	192-256(%rcx),%xmm4
+	movdqa	208-256(%rcx),%xmm5
+	movdqa	224-256(%rcx),%xmm6
+	movdqa	240-256(%rcx),%xmm7
+	movdqa	256-256(%rcx),%xmm0
+	movdqa	272-256(%rcx),%xmm1
+	movdqa	288-256(%rcx),%xmm2
+	movdqa	304-256(%rcx),%xmm3
+	paddd	.Lfour(%rip),%xmm0
+
+.Loop_enter4x:
+	movdqa	%xmm6,32(%rsp)
+	movdqa	%xmm7,48(%rsp)
+	movdqa	(%r10),%xmm7
+	movl	$10,%eax
+	movdqa	%xmm0,256-256(%rcx)
+	jmp	.Loop4x
+
+.align	32
+.Loop4x:
+	paddd	%xmm12,%xmm8
+	paddd	%xmm13,%xmm9
+	pxor	%xmm8,%xmm0
+	pxor	%xmm9,%xmm1
+.byte	102,15,56,0,199
+.byte	102,15,56,0,207
+	paddd	%xmm0,%xmm4
+	paddd	%xmm1,%xmm5
+	pxor	%xmm4,%xmm12
+	pxor	%xmm5,%xmm13
+	movdqa	%xmm12,%xmm6
+	pslld	$12,%xmm12
+	psrld	$20,%xmm6
+	movdqa	%xmm13,%xmm7
+	pslld	$12,%xmm13
+	por	%xmm6,%xmm12
+	psrld	$20,%xmm7
+	movdqa	(%r11),%xmm6
+	por	%xmm7,%xmm13
+	paddd	%xmm12,%xmm8
+	paddd	%xmm13,%xmm9
+	pxor	%xmm8,%xmm0
+	pxor	%xmm9,%xmm1
+.byte	102,15,56,0,198
+.byte	102,15,56,0,206
+	paddd	%xmm0,%xmm4
+	paddd	%xmm1,%xmm5
+	pxor	%xmm4,%xmm12
+	pxor	%xmm5,%xmm13
+	movdqa	%xmm12,%xmm7
+	pslld	$7,%xmm12
+	psrld	$25,%xmm7
+	movdqa	%xmm13,%xmm6
+	pslld	$7,%xmm13
+	por	%xmm7,%xmm12
+	psrld	$25,%xmm6
+	movdqa	(%r10),%xmm7
+	por	%xmm6,%xmm13
+	movdqa	%xmm4,0(%rsp)
+	movdqa	%xmm5,16(%rsp)
+	movdqa	32(%rsp),%xmm4
+	movdqa	48(%rsp),%xmm5
+	paddd	%xmm14,%xmm10
+	paddd	%xmm15,%xmm11
+	pxor	%xmm10,%xmm2
+	pxor	%xmm11,%xmm3
+.byte	102,15,56,0,215
+.byte	102,15,56,0,223
+	paddd	%xmm2,%xmm4
+	paddd	%xmm3,%xmm5
+	pxor	%xmm4,%xmm14
+	pxor	%xmm5,%xmm15
+	movdqa	%xmm14,%xmm6
+	pslld	$12,%xmm14
+	psrld	$20,%xmm6
+	movdqa	%xmm15,%xmm7
+	pslld	$12,%xmm15
+	por	%xmm6,%xmm14
+	psrld	$20,%xmm7
+	movdqa	(%r11),%xmm6
+	por	%xmm7,%xmm15
+	paddd	%xmm14,%xmm10
+	paddd	%xmm15,%xmm11
+	pxor	%xmm10,%xmm2
+	pxor	%xmm11,%xmm3
+.byte	102,15,56,0,214
+.byte	102,15,56,0,222
+	paddd	%xmm2,%xmm4
+	paddd	%xmm3,%xmm5
+	pxor	%xmm4,%xmm14
+	pxor	%xmm5,%xmm15
+	movdqa	%xmm14,%xmm7
+	pslld	$7,%xmm14
+	psrld	$25,%xmm7
+	movdqa	%xmm15,%xmm6
+	pslld	$7,%xmm15
+	por	%xmm7,%xmm14
+	psrld	$25,%xmm6
+	movdqa	(%r10),%xmm7
+	por	%xmm6,%xmm15
+	paddd	%xmm13,%xmm8
+	paddd	%xmm14,%xmm9
+	pxor	%xmm8,%xmm3
+	pxor	%xmm9,%xmm0
+.byte	102,15,56,0,223
+.byte	102,15,56,0,199
+	paddd	%xmm3,%xmm4
+	paddd	%xmm0,%xmm5
+	pxor	%xmm4,%xmm13
+	pxor	%xmm5,%xmm14
+	movdqa	%xmm13,%xmm6
+	pslld	$12,%xmm13
+	psrld	$20,%xmm6
+	movdqa	%xmm14,%xmm7
+	pslld	$12,%xmm14
+	por	%xmm6,%xmm13
+	psrld	$20,%xmm7
+	movdqa	(%r11),%xmm6
+	por	%xmm7,%xmm14
+	paddd	%xmm13,%xmm8
+	paddd	%xmm14,%xmm9
+	pxor	%xmm8,%xmm3
+	pxor	%xmm9,%xmm0
+.byte	102,15,56,0,222
+.byte	102,15,56,0,198
+	paddd	%xmm3,%xmm4
+	paddd	%xmm0,%xmm5
+	pxor	%xmm4,%xmm13
+	pxor	%xmm5,%xmm14
+	movdqa	%xmm13,%xmm7
+	pslld	$7,%xmm13
+	psrld	$25,%xmm7
+	movdqa	%xmm14,%xmm6
+	pslld	$7,%xmm14
+	por	%xmm7,%xmm13
+	psrld	$25,%xmm6
+	movdqa	(%r10),%xmm7
+	por	%xmm6,%xmm14
+	movdqa	%xmm4,32(%rsp)
+	movdqa	%xmm5,48(%rsp)
+	movdqa	0(%rsp),%xmm4
+	movdqa	16(%rsp),%xmm5
+	paddd	%xmm15,%xmm10
+	paddd	%xmm12,%xmm11
+	pxor	%xmm10,%xmm1
+	pxor	%xmm11,%xmm2
+.byte	102,15,56,0,207
+.byte	102,15,56,0,215
+	paddd	%xmm1,%xmm4
+	paddd	%xmm2,%xmm5
+	pxor	%xmm4,%xmm15
+	pxor	%xmm5,%xmm12
+	movdqa	%xmm15,%xmm6
+	pslld	$12,%xmm15
+	psrld	$20,%xmm6
+	movdqa	%xmm12,%xmm7
+	pslld	$12,%xmm12
+	por	%xmm6,%xmm15
+	psrld	$20,%xmm7
+	movdqa	(%r11),%xmm6
+	por	%xmm7,%xmm12
+	paddd	%xmm15,%xmm10
+	paddd	%xmm12,%xmm11
+	pxor	%xmm10,%xmm1
+	pxor	%xmm11,%xmm2
+.byte	102,15,56,0,206
+.byte	102,15,56,0,214
+	paddd	%xmm1,%xmm4
+	paddd	%xmm2,%xmm5
+	pxor	%xmm4,%xmm15
+	pxor	%xmm5,%xmm12
+	movdqa	%xmm15,%xmm7
+	pslld	$7,%xmm15
+	psrld	$25,%xmm7
+	movdqa	%xmm12,%xmm6
+	pslld	$7,%xmm12
+	por	%xmm7,%xmm15
+	psrld	$25,%xmm6
+	movdqa	(%r10),%xmm7
+	por	%xmm6,%xmm12
+	decl	%eax
+	jnz	.Loop4x
+
+	paddd	64(%rsp),%xmm8
+	paddd	80(%rsp),%xmm9
+	paddd	96(%rsp),%xmm10
+	paddd	112(%rsp),%xmm11
+
+	movdqa	%xmm8,%xmm6
+	punpckldq	%xmm9,%xmm8
+	movdqa	%xmm10,%xmm7
+	punpckldq	%xmm11,%xmm10
+	punpckhdq	%xmm9,%xmm6
+	punpckhdq	%xmm11,%xmm7
+	movdqa	%xmm8,%xmm9
+	punpcklqdq	%xmm10,%xmm8
+	movdqa	%xmm6,%xmm11
+	punpcklqdq	%xmm7,%xmm6
+	punpckhqdq	%xmm10,%xmm9
+	punpckhqdq	%xmm7,%xmm11
+	paddd	128-256(%rcx),%xmm12
+	paddd	144-256(%rcx),%xmm13
+	paddd	160-256(%rcx),%xmm14
+	paddd	176-256(%rcx),%xmm15
+
+	movdqa	%xmm8,0(%rsp)
+	movdqa	%xmm9,16(%rsp)
+	movdqa	32(%rsp),%xmm8
+	movdqa	48(%rsp),%xmm9
+
+	movdqa	%xmm12,%xmm10
+	punpckldq	%xmm13,%xmm12
+	movdqa	%xmm14,%xmm7
+	punpckldq	%xmm15,%xmm14
+	punpckhdq	%xmm13,%xmm10
+	punpckhdq	%xmm15,%xmm7
+	movdqa	%xmm12,%xmm13
+	punpcklqdq	%xmm14,%xmm12
+	movdqa	%xmm10,%xmm15
+	punpcklqdq	%xmm7,%xmm10
+	punpckhqdq	%xmm14,%xmm13
+	punpckhqdq	%xmm7,%xmm15
+	paddd	192-256(%rcx),%xmm4
+	paddd	208-256(%rcx),%xmm5
+	paddd	224-256(%rcx),%xmm8
+	paddd	240-256(%rcx),%xmm9
+
+	movdqa	%xmm6,32(%rsp)
+	movdqa	%xmm11,48(%rsp)
+
+	movdqa	%xmm4,%xmm14
+	punpckldq	%xmm5,%xmm4
+	movdqa	%xmm8,%xmm7
+	punpckldq	%xmm9,%xmm8
+	punpckhdq	%xmm5,%xmm14
+	punpckhdq	%xmm9,%xmm7
+	movdqa	%xmm4,%xmm5
+	punpcklqdq	%xmm8,%xmm4
+	movdqa	%xmm14,%xmm9
+	punpcklqdq	%xmm7,%xmm14
+	punpckhqdq	%xmm8,%xmm5
+	punpckhqdq	%xmm7,%xmm9
+	paddd	256-256(%rcx),%xmm0
+	paddd	272-256(%rcx),%xmm1
+	paddd	288-256(%rcx),%xmm2
+	paddd	304-256(%rcx),%xmm3
+
+	movdqa	%xmm0,%xmm8
+	punpckldq	%xmm1,%xmm0
+	movdqa	%xmm2,%xmm7
+	punpckldq	%xmm3,%xmm2
+	punpckhdq	%xmm1,%xmm8
+	punpckhdq	%xmm3,%xmm7
+	movdqa	%xmm0,%xmm1
+	punpcklqdq	%xmm2,%xmm0
+	movdqa	%xmm8,%xmm3
+	punpcklqdq	%xmm7,%xmm8
+	punpckhqdq	%xmm2,%xmm1
+	punpckhqdq	%xmm7,%xmm3
+	cmpq	$256,%rdx
+	jb	.Ltail4x
+
+	movdqu	0(%rsi),%xmm6
+	movdqu	16(%rsi),%xmm11
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm7
+	pxor	0(%rsp),%xmm6
+	pxor	%xmm12,%xmm11
+	pxor	%xmm4,%xmm2
+	pxor	%xmm0,%xmm7
+
+	movdqu	%xmm6,0(%rdi)
+	movdqu	64(%rsi),%xmm6
+	movdqu	%xmm11,16(%rdi)
+	movdqu	80(%rsi),%xmm11
+	movdqu	%xmm2,32(%rdi)
+	movdqu	96(%rsi),%xmm2
+	movdqu	%xmm7,48(%rdi)
+	movdqu	112(%rsi),%xmm7
+	leaq	128(%rsi),%rsi
+	pxor	16(%rsp),%xmm6
+	pxor	%xmm13,%xmm11
+	pxor	%xmm5,%xmm2
+	pxor	%xmm1,%xmm7
+
+	movdqu	%xmm6,64(%rdi)
+	movdqu	0(%rsi),%xmm6
+	movdqu	%xmm11,80(%rdi)
+	movdqu	16(%rsi),%xmm11
+	movdqu	%xmm2,96(%rdi)
+	movdqu	32(%rsi),%xmm2
+	movdqu	%xmm7,112(%rdi)
+	leaq	128(%rdi),%rdi
+	movdqu	48(%rsi),%xmm7
+	pxor	32(%rsp),%xmm6
+	pxor	%xmm10,%xmm11
+	pxor	%xmm14,%xmm2
+	pxor	%xmm8,%xmm7
+
+	movdqu	%xmm6,0(%rdi)
+	movdqu	64(%rsi),%xmm6
+	movdqu	%xmm11,16(%rdi)
+	movdqu	80(%rsi),%xmm11
+	movdqu	%xmm2,32(%rdi)
+	movdqu	96(%rsi),%xmm2
+	movdqu	%xmm7,48(%rdi)
+	movdqu	112(%rsi),%xmm7
+	leaq	128(%rsi),%rsi
+	pxor	48(%rsp),%xmm6
+	pxor	%xmm15,%xmm11
+	pxor	%xmm9,%xmm2
+	pxor	%xmm3,%xmm7
+	movdqu	%xmm6,64(%rdi)
+	movdqu	%xmm11,80(%rdi)
+	movdqu	%xmm2,96(%rdi)
+	movdqu	%xmm7,112(%rdi)
+	leaq	128(%rdi),%rdi
+
+	subq	$256,%rdx
+	jnz	.Loop_outer4x
+
+	jmp	.Ldone4x
+
+.Ltail4x:
+	cmpq	$192,%rdx
+	jae	.L192_or_more4x
+	cmpq	$128,%rdx
+	jae	.L128_or_more4x
+	cmpq	$64,%rdx
+	jae	.L64_or_more4x
+
+
+	xorq	%r10,%r10
+
+	movdqa	%xmm12,16(%rsp)
+	movdqa	%xmm4,32(%rsp)
+	movdqa	%xmm0,48(%rsp)
+	jmp	.Loop_tail4x
+
+.align	32
+.L64_or_more4x:
+	movdqu	0(%rsi),%xmm6
+	movdqu	16(%rsi),%xmm11
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm7
+	pxor	0(%rsp),%xmm6
+	pxor	%xmm12,%xmm11
+	pxor	%xmm4,%xmm2
+	pxor	%xmm0,%xmm7
+	movdqu	%xmm6,0(%rdi)
+	movdqu	%xmm11,16(%rdi)
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm7,48(%rdi)
+	je	.Ldone4x
+
+	movdqa	16(%rsp),%xmm6
+	leaq	64(%rsi),%rsi
+	xorq	%r10,%r10
+	movdqa	%xmm6,0(%rsp)
+	movdqa	%xmm13,16(%rsp)
+	leaq	64(%rdi),%rdi
+	movdqa	%xmm5,32(%rsp)
+	subq	$64,%rdx
+	movdqa	%xmm1,48(%rsp)
+	jmp	.Loop_tail4x
+
+.align	32
+.L128_or_more4x:
+	movdqu	0(%rsi),%xmm6
+	movdqu	16(%rsi),%xmm11
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm7
+	pxor	0(%rsp),%xmm6
+	pxor	%xmm12,%xmm11
+	pxor	%xmm4,%xmm2
+	pxor	%xmm0,%xmm7
+
+	movdqu	%xmm6,0(%rdi)
+	movdqu	64(%rsi),%xmm6
+	movdqu	%xmm11,16(%rdi)
+	movdqu	80(%rsi),%xmm11
+	movdqu	%xmm2,32(%rdi)
+	movdqu	96(%rsi),%xmm2
+	movdqu	%xmm7,48(%rdi)
+	movdqu	112(%rsi),%xmm7
+	pxor	16(%rsp),%xmm6
+	pxor	%xmm13,%xmm11
+	pxor	%xmm5,%xmm2
+	pxor	%xmm1,%xmm7
+	movdqu	%xmm6,64(%rdi)
+	movdqu	%xmm11,80(%rdi)
+	movdqu	%xmm2,96(%rdi)
+	movdqu	%xmm7,112(%rdi)
+	je	.Ldone4x
+
+	movdqa	32(%rsp),%xmm6
+	leaq	128(%rsi),%rsi
+	xorq	%r10,%r10
+	movdqa	%xmm6,0(%rsp)
+	movdqa	%xmm10,16(%rsp)
+	leaq	128(%rdi),%rdi
+	movdqa	%xmm14,32(%rsp)
+	subq	$128,%rdx
+	movdqa	%xmm8,48(%rsp)
+	jmp	.Loop_tail4x
+
+.align	32
+.L192_or_more4x:
+	movdqu	0(%rsi),%xmm6
+	movdqu	16(%rsi),%xmm11
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm7
+	pxor	0(%rsp),%xmm6
+	pxor	%xmm12,%xmm11
+	pxor	%xmm4,%xmm2
+	pxor	%xmm0,%xmm7
+
+	movdqu	%xmm6,0(%rdi)
+	movdqu	64(%rsi),%xmm6
+	movdqu	%xmm11,16(%rdi)
+	movdqu	80(%rsi),%xmm11
+	movdqu	%xmm2,32(%rdi)
+	movdqu	96(%rsi),%xmm2
+	movdqu	%xmm7,48(%rdi)
+	movdqu	112(%rsi),%xmm7
+	leaq	128(%rsi),%rsi
+	pxor	16(%rsp),%xmm6
+	pxor	%xmm13,%xmm11
+	pxor	%xmm5,%xmm2
+	pxor	%xmm1,%xmm7
+
+	movdqu	%xmm6,64(%rdi)
+	movdqu	0(%rsi),%xmm6
+	movdqu	%xmm11,80(%rdi)
+	movdqu	16(%rsi),%xmm11
+	movdqu	%xmm2,96(%rdi)
+	movdqu	32(%rsi),%xmm2
+	movdqu	%xmm7,112(%rdi)
+	leaq	128(%rdi),%rdi
+	movdqu	48(%rsi),%xmm7
+	pxor	32(%rsp),%xmm6
+	pxor	%xmm10,%xmm11
+	pxor	%xmm14,%xmm2
+	pxor	%xmm8,%xmm7
+	movdqu	%xmm6,0(%rdi)
+	movdqu	%xmm11,16(%rdi)
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm7,48(%rdi)
+	je	.Ldone4x
+
+	movdqa	48(%rsp),%xmm6
+	leaq	64(%rsi),%rsi
+	xorq	%r10,%r10
+	movdqa	%xmm6,0(%rsp)
+	movdqa	%xmm15,16(%rsp)
+	leaq	64(%rdi),%rdi
+	movdqa	%xmm9,32(%rsp)
+	subq	$192,%rdx
+	movdqa	%xmm3,48(%rsp)
+
+.Loop_tail4x:
+	movzbl	(%rsi,%r10,1),%eax
+	movzbl	(%rsp,%r10,1),%ecx
+	leaq	1(%r10),%r10
+	xorl	%ecx,%eax
+	movb	%al,-1(%rdi,%r10,1)
+	decq	%rdx
+	jnz	.Loop_tail4x
+
+.Ldone4x:
+	leaq	(%r9),%rsp
+.cfi_def_cfa_register	rsp
+.L4x_epilogue:
+	ret
+.cfi_endproc	
+.size	ChaCha20_ctr32_ssse3_4x,.-ChaCha20_ctr32_ssse3_4x
+.globl	ChaCha20_ctr32_avx2
+.hidden ChaCha20_ctr32_avx2
+.type	ChaCha20_ctr32_avx2,@function
+.align	32
+ChaCha20_ctr32_avx2:
+.cfi_startproc	
+_CET_ENDBR
+	movq	%rsp,%r9
+.cfi_def_cfa_register	r9
+	subq	$0x280+8,%rsp
+	andq	$-32,%rsp
+	vzeroupper
+
+
+
+
+
+
+
+
+
+
+	vbroadcasti128	.Lsigma(%rip),%ymm11
+	vbroadcasti128	(%rcx),%ymm3
+	vbroadcasti128	16(%rcx),%ymm15
+	vbroadcasti128	(%r8),%ymm7
+	leaq	256(%rsp),%rcx
+	leaq	512(%rsp),%rax
+	leaq	.Lrot16(%rip),%r10
+	leaq	.Lrot24(%rip),%r11
+
+	vpshufd	$0x00,%ymm11,%ymm8
+	vpshufd	$0x55,%ymm11,%ymm9
+	vmovdqa	%ymm8,128-256(%rcx)
+	vpshufd	$0xaa,%ymm11,%ymm10
+	vmovdqa	%ymm9,160-256(%rcx)
+	vpshufd	$0xff,%ymm11,%ymm11
+	vmovdqa	%ymm10,192-256(%rcx)
+	vmovdqa	%ymm11,224-256(%rcx)
+
+	vpshufd	$0x00,%ymm3,%ymm0
+	vpshufd	$0x55,%ymm3,%ymm1
+	vmovdqa	%ymm0,256-256(%rcx)
+	vpshufd	$0xaa,%ymm3,%ymm2
+	vmovdqa	%ymm1,288-256(%rcx)
+	vpshufd	$0xff,%ymm3,%ymm3
+	vmovdqa	%ymm2,320-256(%rcx)
+	vmovdqa	%ymm3,352-256(%rcx)
+
+	vpshufd	$0x00,%ymm15,%ymm12
+	vpshufd	$0x55,%ymm15,%ymm13
+	vmovdqa	%ymm12,384-512(%rax)
+	vpshufd	$0xaa,%ymm15,%ymm14
+	vmovdqa	%ymm13,416-512(%rax)
+	vpshufd	$0xff,%ymm15,%ymm15
+	vmovdqa	%ymm14,448-512(%rax)
+	vmovdqa	%ymm15,480-512(%rax)
+
+	vpshufd	$0x00,%ymm7,%ymm4
+	vpshufd	$0x55,%ymm7,%ymm5
+	vpaddd	.Lincy(%rip),%ymm4,%ymm4
+	vpshufd	$0xaa,%ymm7,%ymm6
+	vmovdqa	%ymm5,544-512(%rax)
+	vpshufd	$0xff,%ymm7,%ymm7
+	vmovdqa	%ymm6,576-512(%rax)
+	vmovdqa	%ymm7,608-512(%rax)
+
+	jmp	.Loop_enter8x
+
+.align	32
+.Loop_outer8x:
+	vmovdqa	128-256(%rcx),%ymm8
+	vmovdqa	160-256(%rcx),%ymm9
+	vmovdqa	192-256(%rcx),%ymm10
+	vmovdqa	224-256(%rcx),%ymm11
+	vmovdqa	256-256(%rcx),%ymm0
+	vmovdqa	288-256(%rcx),%ymm1
+	vmovdqa	320-256(%rcx),%ymm2
+	vmovdqa	352-256(%rcx),%ymm3
+	vmovdqa	384-512(%rax),%ymm12
+	vmovdqa	416-512(%rax),%ymm13
+	vmovdqa	448-512(%rax),%ymm14
+	vmovdqa	480-512(%rax),%ymm15
+	vmovdqa	512-512(%rax),%ymm4
+	vmovdqa	544-512(%rax),%ymm5
+	vmovdqa	576-512(%rax),%ymm6
+	vmovdqa	608-512(%rax),%ymm7
+	vpaddd	.Leight(%rip),%ymm4,%ymm4
+
+.Loop_enter8x:
+	vmovdqa	%ymm14,64(%rsp)
+	vmovdqa	%ymm15,96(%rsp)
+	vbroadcasti128	(%r10),%ymm15
+	vmovdqa	%ymm4,512-512(%rax)
+	movl	$10,%eax
+	jmp	.Loop8x
+
+.align	32
+.Loop8x:
+	vpaddd	%ymm0,%ymm8,%ymm8
+	vpxor	%ymm4,%ymm8,%ymm4
+	vpshufb	%ymm15,%ymm4,%ymm4
+	vpaddd	%ymm1,%ymm9,%ymm9
+	vpxor	%ymm5,%ymm9,%ymm5
+	vpshufb	%ymm15,%ymm5,%ymm5
+	vpaddd	%ymm4,%ymm12,%ymm12
+	vpxor	%ymm0,%ymm12,%ymm0
+	vpslld	$12,%ymm0,%ymm14
+	vpsrld	$20,%ymm0,%ymm0
+	vpor	%ymm0,%ymm14,%ymm0
+	vbroadcasti128	(%r11),%ymm14
+	vpaddd	%ymm5,%ymm13,%ymm13
+	vpxor	%ymm1,%ymm13,%ymm1
+	vpslld	$12,%ymm1,%ymm15
+	vpsrld	$20,%ymm1,%ymm1
+	vpor	%ymm1,%ymm15,%ymm1
+	vpaddd	%ymm0,%ymm8,%ymm8
+	vpxor	%ymm4,%ymm8,%ymm4
+	vpshufb	%ymm14,%ymm4,%ymm4
+	vpaddd	%ymm1,%ymm9,%ymm9
+	vpxor	%ymm5,%ymm9,%ymm5
+	vpshufb	%ymm14,%ymm5,%ymm5
+	vpaddd	%ymm4,%ymm12,%ymm12
+	vpxor	%ymm0,%ymm12,%ymm0
+	vpslld	$7,%ymm0,%ymm15
+	vpsrld	$25,%ymm0,%ymm0
+	vpor	%ymm0,%ymm15,%ymm0
+	vbroadcasti128	(%r10),%ymm15
+	vpaddd	%ymm5,%ymm13,%ymm13
+	vpxor	%ymm1,%ymm13,%ymm1
+	vpslld	$7,%ymm1,%ymm14
+	vpsrld	$25,%ymm1,%ymm1
+	vpor	%ymm1,%ymm14,%ymm1
+	vmovdqa	%ymm12,0(%rsp)
+	vmovdqa	%ymm13,32(%rsp)
+	vmovdqa	64(%rsp),%ymm12
+	vmovdqa	96(%rsp),%ymm13
+	vpaddd	%ymm2,%ymm10,%ymm10
+	vpxor	%ymm6,%ymm10,%ymm6
+	vpshufb	%ymm15,%ymm6,%ymm6
+	vpaddd	%ymm3,%ymm11,%ymm11
+	vpxor	%ymm7,%ymm11,%ymm7
+	vpshufb	%ymm15,%ymm7,%ymm7
+	vpaddd	%ymm6,%ymm12,%ymm12
+	vpxor	%ymm2,%ymm12,%ymm2
+	vpslld	$12,%ymm2,%ymm14
+	vpsrld	$20,%ymm2,%ymm2
+	vpor	%ymm2,%ymm14,%ymm2
+	vbroadcasti128	(%r11),%ymm14
+	vpaddd	%ymm7,%ymm13,%ymm13
+	vpxor	%ymm3,%ymm13,%ymm3
+	vpslld	$12,%ymm3,%ymm15
+	vpsrld	$20,%ymm3,%ymm3
+	vpor	%ymm3,%ymm15,%ymm3
+	vpaddd	%ymm2,%ymm10,%ymm10
+	vpxor	%ymm6,%ymm10,%ymm6
+	vpshufb	%ymm14,%ymm6,%ymm6
+	vpaddd	%ymm3,%ymm11,%ymm11
+	vpxor	%ymm7,%ymm11,%ymm7
+	vpshufb	%ymm14,%ymm7,%ymm7
+	vpaddd	%ymm6,%ymm12,%ymm12
+	vpxor	%ymm2,%ymm12,%ymm2
+	vpslld	$7,%ymm2,%ymm15
+	vpsrld	$25,%ymm2,%ymm2
+	vpor	%ymm2,%ymm15,%ymm2
+	vbroadcasti128	(%r10),%ymm15
+	vpaddd	%ymm7,%ymm13,%ymm13
+	vpxor	%ymm3,%ymm13,%ymm3
+	vpslld	$7,%ymm3,%ymm14
+	vpsrld	$25,%ymm3,%ymm3
+	vpor	%ymm3,%ymm14,%ymm3
+	vpaddd	%ymm1,%ymm8,%ymm8
+	vpxor	%ymm7,%ymm8,%ymm7
+	vpshufb	%ymm15,%ymm7,%ymm7
+	vpaddd	%ymm2,%ymm9,%ymm9
+	vpxor	%ymm4,%ymm9,%ymm4
+	vpshufb	%ymm15,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm12,%ymm12
+	vpxor	%ymm1,%ymm12,%ymm1
+	vpslld	$12,%ymm1,%ymm14
+	vpsrld	$20,%ymm1,%ymm1
+	vpor	%ymm1,%ymm14,%ymm1
+	vbroadcasti128	(%r11),%ymm14
+	vpaddd	%ymm4,%ymm13,%ymm13
+	vpxor	%ymm2,%ymm13,%ymm2
+	vpslld	$12,%ymm2,%ymm15
+	vpsrld	$20,%ymm2,%ymm2
+	vpor	%ymm2,%ymm15,%ymm2
+	vpaddd	%ymm1,%ymm8,%ymm8
+	vpxor	%ymm7,%ymm8,%ymm7
+	vpshufb	%ymm14,%ymm7,%ymm7
+	vpaddd	%ymm2,%ymm9,%ymm9
+	vpxor	%ymm4,%ymm9,%ymm4
+	vpshufb	%ymm14,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm12,%ymm12
+	vpxor	%ymm1,%ymm12,%ymm1
+	vpslld	$7,%ymm1,%ymm15
+	vpsrld	$25,%ymm1,%ymm1
+	vpor	%ymm1,%ymm15,%ymm1
+	vbroadcasti128	(%r10),%ymm15
+	vpaddd	%ymm4,%ymm13,%ymm13
+	vpxor	%ymm2,%ymm13,%ymm2
+	vpslld	$7,%ymm2,%ymm14
+	vpsrld	$25,%ymm2,%ymm2
+	vpor	%ymm2,%ymm14,%ymm2
+	vmovdqa	%ymm12,64(%rsp)
+	vmovdqa	%ymm13,96(%rsp)
+	vmovdqa	0(%rsp),%ymm12
+	vmovdqa	32(%rsp),%ymm13
+	vpaddd	%ymm3,%ymm10,%ymm10
+	vpxor	%ymm5,%ymm10,%ymm5
+	vpshufb	%ymm15,%ymm5,%ymm5
+	vpaddd	%ymm0,%ymm11,%ymm11
+	vpxor	%ymm6,%ymm11,%ymm6
+	vpshufb	%ymm15,%ymm6,%ymm6
+	vpaddd	%ymm5,%ymm12,%ymm12
+	vpxor	%ymm3,%ymm12,%ymm3
+	vpslld	$12,%ymm3,%ymm14
+	vpsrld	$20,%ymm3,%ymm3
+	vpor	%ymm3,%ymm14,%ymm3
+	vbroadcasti128	(%r11),%ymm14
+	vpaddd	%ymm6,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm13,%ymm0
+	vpslld	$12,%ymm0,%ymm15
+	vpsrld	$20,%ymm0,%ymm0
+	vpor	%ymm0,%ymm15,%ymm0
+	vpaddd	%ymm3,%ymm10,%ymm10
+	vpxor	%ymm5,%ymm10,%ymm5
+	vpshufb	%ymm14,%ymm5,%ymm5
+	vpaddd	%ymm0,%ymm11,%ymm11
+	vpxor	%ymm6,%ymm11,%ymm6
+	vpshufb	%ymm14,%ymm6,%ymm6
+	vpaddd	%ymm5,%ymm12,%ymm12
+	vpxor	%ymm3,%ymm12,%ymm3
+	vpslld	$7,%ymm3,%ymm15
+	vpsrld	$25,%ymm3,%ymm3
+	vpor	%ymm3,%ymm15,%ymm3
+	vbroadcasti128	(%r10),%ymm15
+	vpaddd	%ymm6,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm13,%ymm0
+	vpslld	$7,%ymm0,%ymm14
+	vpsrld	$25,%ymm0,%ymm0
+	vpor	%ymm0,%ymm14,%ymm0
+	decl	%eax
+	jnz	.Loop8x
+
+	leaq	512(%rsp),%rax
+	vpaddd	128-256(%rcx),%ymm8,%ymm8
+	vpaddd	160-256(%rcx),%ymm9,%ymm9
+	vpaddd	192-256(%rcx),%ymm10,%ymm10
+	vpaddd	224-256(%rcx),%ymm11,%ymm11
+
+	vpunpckldq	%ymm9,%ymm8,%ymm14
+	vpunpckldq	%ymm11,%ymm10,%ymm15
+	vpunpckhdq	%ymm9,%ymm8,%ymm8
+	vpunpckhdq	%ymm11,%ymm10,%ymm10
+	vpunpcklqdq	%ymm15,%ymm14,%ymm9
+	vpunpckhqdq	%ymm15,%ymm14,%ymm14
+	vpunpcklqdq	%ymm10,%ymm8,%ymm11
+	vpunpckhqdq	%ymm10,%ymm8,%ymm8
+	vpaddd	256-256(%rcx),%ymm0,%ymm0
+	vpaddd	288-256(%rcx),%ymm1,%ymm1
+	vpaddd	320-256(%rcx),%ymm2,%ymm2
+	vpaddd	352-256(%rcx),%ymm3,%ymm3
+
+	vpunpckldq	%ymm1,%ymm0,%ymm10
+	vpunpckldq	%ymm3,%ymm2,%ymm15
+	vpunpckhdq	%ymm1,%ymm0,%ymm0
+	vpunpckhdq	%ymm3,%ymm2,%ymm2
+	vpunpcklqdq	%ymm15,%ymm10,%ymm1
+	vpunpckhqdq	%ymm15,%ymm10,%ymm10
+	vpunpcklqdq	%ymm2,%ymm0,%ymm3
+	vpunpckhqdq	%ymm2,%ymm0,%ymm0
+	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
+	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
+	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
+	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
+	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
+	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
+	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
+	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
+	vmovdqa	%ymm15,0(%rsp)
+	vmovdqa	%ymm9,32(%rsp)
+	vmovdqa	64(%rsp),%ymm15
+	vmovdqa	96(%rsp),%ymm9
+
+	vpaddd	384-512(%rax),%ymm12,%ymm12
+	vpaddd	416-512(%rax),%ymm13,%ymm13
+	vpaddd	448-512(%rax),%ymm15,%ymm15
+	vpaddd	480-512(%rax),%ymm9,%ymm9
+
+	vpunpckldq	%ymm13,%ymm12,%ymm2
+	vpunpckldq	%ymm9,%ymm15,%ymm8
+	vpunpckhdq	%ymm13,%ymm12,%ymm12
+	vpunpckhdq	%ymm9,%ymm15,%ymm15
+	vpunpcklqdq	%ymm8,%ymm2,%ymm13
+	vpunpckhqdq	%ymm8,%ymm2,%ymm2
+	vpunpcklqdq	%ymm15,%ymm12,%ymm9
+	vpunpckhqdq	%ymm15,%ymm12,%ymm12
+	vpaddd	512-512(%rax),%ymm4,%ymm4
+	vpaddd	544-512(%rax),%ymm5,%ymm5
+	vpaddd	576-512(%rax),%ymm6,%ymm6
+	vpaddd	608-512(%rax),%ymm7,%ymm7
+
+	vpunpckldq	%ymm5,%ymm4,%ymm15
+	vpunpckldq	%ymm7,%ymm6,%ymm8
+	vpunpckhdq	%ymm5,%ymm4,%ymm4
+	vpunpckhdq	%ymm7,%ymm6,%ymm6
+	vpunpcklqdq	%ymm8,%ymm15,%ymm5
+	vpunpckhqdq	%ymm8,%ymm15,%ymm15
+	vpunpcklqdq	%ymm6,%ymm4,%ymm7
+	vpunpckhqdq	%ymm6,%ymm4,%ymm4
+	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
+	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
+	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
+	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
+	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
+	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
+	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
+	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
+	vmovdqa	0(%rsp),%ymm6
+	vmovdqa	32(%rsp),%ymm12
+
+	cmpq	$512,%rdx
+	jb	.Ltail8x
+
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	leaq	128(%rsi),%rsi
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	leaq	128(%rdi),%rdi
+
+	vpxor	0(%rsi),%ymm12,%ymm12
+	vpxor	32(%rsi),%ymm13,%ymm13
+	vpxor	64(%rsi),%ymm10,%ymm10
+	vpxor	96(%rsi),%ymm15,%ymm15
+	leaq	128(%rsi),%rsi
+	vmovdqu	%ymm12,0(%rdi)
+	vmovdqu	%ymm13,32(%rdi)
+	vmovdqu	%ymm10,64(%rdi)
+	vmovdqu	%ymm15,96(%rdi)
+	leaq	128(%rdi),%rdi
+
+	vpxor	0(%rsi),%ymm14,%ymm14
+	vpxor	32(%rsi),%ymm2,%ymm2
+	vpxor	64(%rsi),%ymm3,%ymm3
+	vpxor	96(%rsi),%ymm7,%ymm7
+	leaq	128(%rsi),%rsi
+	vmovdqu	%ymm14,0(%rdi)
+	vmovdqu	%ymm2,32(%rdi)
+	vmovdqu	%ymm3,64(%rdi)
+	vmovdqu	%ymm7,96(%rdi)
+	leaq	128(%rdi),%rdi
+
+	vpxor	0(%rsi),%ymm11,%ymm11
+	vpxor	32(%rsi),%ymm9,%ymm9
+	vpxor	64(%rsi),%ymm0,%ymm0
+	vpxor	96(%rsi),%ymm4,%ymm4
+	leaq	128(%rsi),%rsi
+	vmovdqu	%ymm11,0(%rdi)
+	vmovdqu	%ymm9,32(%rdi)
+	vmovdqu	%ymm0,64(%rdi)
+	vmovdqu	%ymm4,96(%rdi)
+	leaq	128(%rdi),%rdi
+
+	subq	$512,%rdx
+	jnz	.Loop_outer8x
+
+	jmp	.Ldone8x
+
+.Ltail8x:
+	cmpq	$448,%rdx
+	jae	.L448_or_more8x
+	cmpq	$384,%rdx
+	jae	.L384_or_more8x
+	cmpq	$320,%rdx
+	jae	.L320_or_more8x
+	cmpq	$256,%rdx
+	jae	.L256_or_more8x
+	cmpq	$192,%rdx
+	jae	.L192_or_more8x
+	cmpq	$128,%rdx
+	jae	.L128_or_more8x
+	cmpq	$64,%rdx
+	jae	.L64_or_more8x
+
+	xorq	%r10,%r10
+	vmovdqa	%ymm6,0(%rsp)
+	vmovdqa	%ymm8,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L64_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	je	.Ldone8x
+
+	leaq	64(%rsi),%rsi
+	xorq	%r10,%r10
+	vmovdqa	%ymm1,0(%rsp)
+	leaq	64(%rdi),%rdi
+	subq	$64,%rdx
+	vmovdqa	%ymm5,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L128_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	je	.Ldone8x
+
+	leaq	128(%rsi),%rsi
+	xorq	%r10,%r10
+	vmovdqa	%ymm12,0(%rsp)
+	leaq	128(%rdi),%rdi
+	subq	$128,%rdx
+	vmovdqa	%ymm13,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L192_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vpxor	128(%rsi),%ymm12,%ymm12
+	vpxor	160(%rsi),%ymm13,%ymm13
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	vmovdqu	%ymm12,128(%rdi)
+	vmovdqu	%ymm13,160(%rdi)
+	je	.Ldone8x
+
+	leaq	192(%rsi),%rsi
+	xorq	%r10,%r10
+	vmovdqa	%ymm10,0(%rsp)
+	leaq	192(%rdi),%rdi
+	subq	$192,%rdx
+	vmovdqa	%ymm15,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L256_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vpxor	128(%rsi),%ymm12,%ymm12
+	vpxor	160(%rsi),%ymm13,%ymm13
+	vpxor	192(%rsi),%ymm10,%ymm10
+	vpxor	224(%rsi),%ymm15,%ymm15
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	vmovdqu	%ymm12,128(%rdi)
+	vmovdqu	%ymm13,160(%rdi)
+	vmovdqu	%ymm10,192(%rdi)
+	vmovdqu	%ymm15,224(%rdi)
+	je	.Ldone8x
+
+	leaq	256(%rsi),%rsi
+	xorq	%r10,%r10
+	vmovdqa	%ymm14,0(%rsp)
+	leaq	256(%rdi),%rdi
+	subq	$256,%rdx
+	vmovdqa	%ymm2,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L320_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vpxor	128(%rsi),%ymm12,%ymm12
+	vpxor	160(%rsi),%ymm13,%ymm13
+	vpxor	192(%rsi),%ymm10,%ymm10
+	vpxor	224(%rsi),%ymm15,%ymm15
+	vpxor	256(%rsi),%ymm14,%ymm14
+	vpxor	288(%rsi),%ymm2,%ymm2
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	vmovdqu	%ymm12,128(%rdi)
+	vmovdqu	%ymm13,160(%rdi)
+	vmovdqu	%ymm10,192(%rdi)
+	vmovdqu	%ymm15,224(%rdi)
+	vmovdqu	%ymm14,256(%rdi)
+	vmovdqu	%ymm2,288(%rdi)
+	je	.Ldone8x
+
+	leaq	320(%rsi),%rsi
+	xorq	%r10,%r10
+	vmovdqa	%ymm3,0(%rsp)
+	leaq	320(%rdi),%rdi
+	subq	$320,%rdx
+	vmovdqa	%ymm7,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L384_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vpxor	128(%rsi),%ymm12,%ymm12
+	vpxor	160(%rsi),%ymm13,%ymm13
+	vpxor	192(%rsi),%ymm10,%ymm10
+	vpxor	224(%rsi),%ymm15,%ymm15
+	vpxor	256(%rsi),%ymm14,%ymm14
+	vpxor	288(%rsi),%ymm2,%ymm2
+	vpxor	320(%rsi),%ymm3,%ymm3
+	vpxor	352(%rsi),%ymm7,%ymm7
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	vmovdqu	%ymm12,128(%rdi)
+	vmovdqu	%ymm13,160(%rdi)
+	vmovdqu	%ymm10,192(%rdi)
+	vmovdqu	%ymm15,224(%rdi)
+	vmovdqu	%ymm14,256(%rdi)
+	vmovdqu	%ymm2,288(%rdi)
+	vmovdqu	%ymm3,320(%rdi)
+	vmovdqu	%ymm7,352(%rdi)
+	je	.Ldone8x
+
+	leaq	384(%rsi),%rsi
+	xorq	%r10,%r10
+	vmovdqa	%ymm11,0(%rsp)
+	leaq	384(%rdi),%rdi
+	subq	$384,%rdx
+	vmovdqa	%ymm9,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L448_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vpxor	128(%rsi),%ymm12,%ymm12
+	vpxor	160(%rsi),%ymm13,%ymm13
+	vpxor	192(%rsi),%ymm10,%ymm10
+	vpxor	224(%rsi),%ymm15,%ymm15
+	vpxor	256(%rsi),%ymm14,%ymm14
+	vpxor	288(%rsi),%ymm2,%ymm2
+	vpxor	320(%rsi),%ymm3,%ymm3
+	vpxor	352(%rsi),%ymm7,%ymm7
+	vpxor	384(%rsi),%ymm11,%ymm11
+	vpxor	416(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	vmovdqu	%ymm12,128(%rdi)
+	vmovdqu	%ymm13,160(%rdi)
+	vmovdqu	%ymm10,192(%rdi)
+	vmovdqu	%ymm15,224(%rdi)
+	vmovdqu	%ymm14,256(%rdi)
+	vmovdqu	%ymm2,288(%rdi)
+	vmovdqu	%ymm3,320(%rdi)
+	vmovdqu	%ymm7,352(%rdi)
+	vmovdqu	%ymm11,384(%rdi)
+	vmovdqu	%ymm9,416(%rdi)
+	je	.Ldone8x
+
+	leaq	448(%rsi),%rsi
+	xorq	%r10,%r10
+	vmovdqa	%ymm0,0(%rsp)
+	leaq	448(%rdi),%rdi
+	subq	$448,%rdx
+	vmovdqa	%ymm4,32(%rsp)
+
+.Loop_tail8x:
+	movzbl	(%rsi,%r10,1),%eax
+	movzbl	(%rsp,%r10,1),%ecx
+	leaq	1(%r10),%r10
+	xorl	%ecx,%eax
+	movb	%al,-1(%rdi,%r10,1)
+	decq	%rdx
+	jnz	.Loop_tail8x
+
+.Ldone8x:
+	vzeroall
+	leaq	(%r9),%rsp
+.cfi_def_cfa_register	rsp
+.L8x_epilogue:
+	ret
+.cfi_endproc	
+.size	ChaCha20_ctr32_avx2,.-ChaCha20_ctr32_avx2
+#endif
diff --git a/gen/crypto/chacha-x86_64-win.asm b/gen/crypto/chacha-x86_64-win.asm
new file mode 100644
index 0000000..14f2395
--- /dev/null
+++ b/gen/crypto/chacha-x86_64-win.asm
@@ -0,0 +1,1916 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section	.text code align=64
+
+
+section	.rdata rdata align=8
+ALIGN	64
+$L$zero:
+	DD	0,0,0,0
+$L$one:
+	DD	1,0,0,0
+$L$inc:
+	DD	0,1,2,3
+$L$four:
+	DD	4,4,4,4
+$L$incy:
+	DD	0,2,4,6,1,3,5,7
+$L$eight:
+	DD	8,8,8,8,8,8,8,8
+$L$rot16:
+	DB	0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd
+$L$rot24:
+	DB	0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe
+$L$sigma:
+	DB	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107
+	DB	0
+ALIGN	64
+$L$zeroz:
+	DD	0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0
+$L$fourz:
+	DD	4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0
+$L$incz:
+	DD	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+$L$sixteen:
+	DD	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
+	DB	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
+	DB	95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32
+	DB	98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115
+	DB	108,46,111,114,103,62,0
+section	.text
+
+global	ChaCha20_ctr32_nohw
+
+ALIGN	64
+ChaCha20_ctr32_nohw:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_ChaCha20_ctr32_nohw:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+
+
+
+_CET_ENDBR
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	sub	rsp,64+24
+
+$L$ctr32_body:
+
+
+	movdqu	xmm1,XMMWORD[rcx]
+	movdqu	xmm2,XMMWORD[16+rcx]
+	movdqu	xmm3,XMMWORD[r8]
+	movdqa	xmm4,XMMWORD[$L$one]
+
+
+	movdqa	XMMWORD[16+rsp],xmm1
+	movdqa	XMMWORD[32+rsp],xmm2
+	movdqa	XMMWORD[48+rsp],xmm3
+	mov	rbp,rdx
+	jmp	NEAR $L$oop_outer
+
+ALIGN	32
+$L$oop_outer:
+	mov	eax,0x61707865
+	mov	ebx,0x3320646e
+	mov	ecx,0x79622d32
+	mov	edx,0x6b206574
+	mov	r8d,DWORD[16+rsp]
+	mov	r9d,DWORD[20+rsp]
+	mov	r10d,DWORD[24+rsp]
+	mov	r11d,DWORD[28+rsp]
+	movd	r12d,xmm3
+	mov	r13d,DWORD[52+rsp]
+	mov	r14d,DWORD[56+rsp]
+	mov	r15d,DWORD[60+rsp]
+
+	mov	QWORD[((64+0))+rsp],rbp
+	mov	ebp,10
+	mov	QWORD[((64+8))+rsp],rsi
+DB	102,72,15,126,214
+	mov	QWORD[((64+16))+rsp],rdi
+	mov	rdi,rsi
+	shr	rdi,32
+	jmp	NEAR $L$oop
+
+ALIGN	32
+$L$oop:
+	add	eax,r8d
+	xor	r12d,eax
+	rol	r12d,16
+	add	ebx,r9d
+	xor	r13d,ebx
+	rol	r13d,16
+	add	esi,r12d
+	xor	r8d,esi
+	rol	r8d,12
+	add	edi,r13d
+	xor	r9d,edi
+	rol	r9d,12
+	add	eax,r8d
+	xor	r12d,eax
+	rol	r12d,8
+	add	ebx,r9d
+	xor	r13d,ebx
+	rol	r13d,8
+	add	esi,r12d
+	xor	r8d,esi
+	rol	r8d,7
+	add	edi,r13d
+	xor	r9d,edi
+	rol	r9d,7
+	mov	DWORD[32+rsp],esi
+	mov	DWORD[36+rsp],edi
+	mov	esi,DWORD[40+rsp]
+	mov	edi,DWORD[44+rsp]
+	add	ecx,r10d
+	xor	r14d,ecx
+	rol	r14d,16
+	add	edx,r11d
+	xor	r15d,edx
+	rol	r15d,16
+	add	esi,r14d
+	xor	r10d,esi
+	rol	r10d,12
+	add	edi,r15d
+	xor	r11d,edi
+	rol	r11d,12
+	add	ecx,r10d
+	xor	r14d,ecx
+	rol	r14d,8
+	add	edx,r11d
+	xor	r15d,edx
+	rol	r15d,8
+	add	esi,r14d
+	xor	r10d,esi
+	rol	r10d,7
+	add	edi,r15d
+	xor	r11d,edi
+	rol	r11d,7
+	add	eax,r9d
+	xor	r15d,eax
+	rol	r15d,16
+	add	ebx,r10d
+	xor	r12d,ebx
+	rol	r12d,16
+	add	esi,r15d
+	xor	r9d,esi
+	rol	r9d,12
+	add	edi,r12d
+	xor	r10d,edi
+	rol	r10d,12
+	add	eax,r9d
+	xor	r15d,eax
+	rol	r15d,8
+	add	ebx,r10d
+	xor	r12d,ebx
+	rol	r12d,8
+	add	esi,r15d
+	xor	r9d,esi
+	rol	r9d,7
+	add	edi,r12d
+	xor	r10d,edi
+	rol	r10d,7
+	mov	DWORD[40+rsp],esi
+	mov	DWORD[44+rsp],edi
+	mov	esi,DWORD[32+rsp]
+	mov	edi,DWORD[36+rsp]
+	add	ecx,r11d
+	xor	r13d,ecx
+	rol	r13d,16
+	add	edx,r8d
+	xor	r14d,edx
+	rol	r14d,16
+	add	esi,r13d
+	xor	r11d,esi
+	rol	r11d,12
+	add	edi,r14d
+	xor	r8d,edi
+	rol	r8d,12
+	add	ecx,r11d
+	xor	r13d,ecx
+	rol	r13d,8
+	add	edx,r8d
+	xor	r14d,edx
+	rol	r14d,8
+	add	esi,r13d
+	xor	r11d,esi
+	rol	r11d,7
+	add	edi,r14d
+	xor	r8d,edi
+	rol	r8d,7
+	dec	ebp
+	jnz	NEAR $L$oop
+	mov	DWORD[36+rsp],edi
+	mov	DWORD[32+rsp],esi
+	mov	rbp,QWORD[64+rsp]
+	movdqa	xmm1,xmm2
+	mov	rsi,QWORD[((64+8))+rsp]
+	paddd	xmm3,xmm4
+	mov	rdi,QWORD[((64+16))+rsp]
+
+	add	eax,0x61707865
+	add	ebx,0x3320646e
+	add	ecx,0x79622d32
+	add	edx,0x6b206574
+	add	r8d,DWORD[16+rsp]
+	add	r9d,DWORD[20+rsp]
+	add	r10d,DWORD[24+rsp]
+	add	r11d,DWORD[28+rsp]
+	add	r12d,DWORD[48+rsp]
+	add	r13d,DWORD[52+rsp]
+	add	r14d,DWORD[56+rsp]
+	add	r15d,DWORD[60+rsp]
+	paddd	xmm1,XMMWORD[32+rsp]
+
+	cmp	rbp,64
+	jb	NEAR $L$tail
+
+	xor	eax,DWORD[rsi]
+	xor	ebx,DWORD[4+rsi]
+	xor	ecx,DWORD[8+rsi]
+	xor	edx,DWORD[12+rsi]
+	xor	r8d,DWORD[16+rsi]
+	xor	r9d,DWORD[20+rsi]
+	xor	r10d,DWORD[24+rsi]
+	xor	r11d,DWORD[28+rsi]
+	movdqu	xmm0,XMMWORD[32+rsi]
+	xor	r12d,DWORD[48+rsi]
+	xor	r13d,DWORD[52+rsi]
+	xor	r14d,DWORD[56+rsi]
+	xor	r15d,DWORD[60+rsi]
+	lea	rsi,[64+rsi]
+	pxor	xmm0,xmm1
+
+	movdqa	XMMWORD[32+rsp],xmm2
+	movd	DWORD[48+rsp],xmm3
+
+	mov	DWORD[rdi],eax
+	mov	DWORD[4+rdi],ebx
+	mov	DWORD[8+rdi],ecx
+	mov	DWORD[12+rdi],edx
+	mov	DWORD[16+rdi],r8d
+	mov	DWORD[20+rdi],r9d
+	mov	DWORD[24+rdi],r10d
+	mov	DWORD[28+rdi],r11d
+	movdqu	XMMWORD[32+rdi],xmm0
+	mov	DWORD[48+rdi],r12d
+	mov	DWORD[52+rdi],r13d
+	mov	DWORD[56+rdi],r14d
+	mov	DWORD[60+rdi],r15d
+	lea	rdi,[64+rdi]
+
+	sub	rbp,64
+	jnz	NEAR $L$oop_outer
+
+	jmp	NEAR $L$done
+
+ALIGN	16
+$L$tail:
+	mov	DWORD[rsp],eax
+	mov	DWORD[4+rsp],ebx
+	xor	rbx,rbx
+	mov	DWORD[8+rsp],ecx
+	mov	DWORD[12+rsp],edx
+	mov	DWORD[16+rsp],r8d
+	mov	DWORD[20+rsp],r9d
+	mov	DWORD[24+rsp],r10d
+	mov	DWORD[28+rsp],r11d
+	movdqa	XMMWORD[32+rsp],xmm1
+	mov	DWORD[48+rsp],r12d
+	mov	DWORD[52+rsp],r13d
+	mov	DWORD[56+rsp],r14d
+	mov	DWORD[60+rsp],r15d
+
+$L$oop_tail:
+	movzx	eax,BYTE[rbx*1+rsi]
+	movzx	edx,BYTE[rbx*1+rsp]
+	lea	rbx,[1+rbx]
+	xor	eax,edx
+	mov	BYTE[((-1))+rbx*1+rdi],al
+	dec	rbp
+	jnz	NEAR $L$oop_tail
+
+$L$done:
+	lea	rsi,[((64+24+48))+rsp]
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$no_data:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_ChaCha20_ctr32_nohw:
+global	ChaCha20_ctr32_ssse3
+
+ALIGN	32
+ChaCha20_ctr32_ssse3:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_ChaCha20_ctr32_ssse3:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+
+
+
+_CET_ENDBR
+	mov	r9,rsp
+
+	sub	rsp,64+40
+	movaps	XMMWORD[(-40)+r9],xmm6
+	movaps	XMMWORD[(-24)+r9],xmm7
+$L$ssse3_body:
+	movdqa	xmm0,XMMWORD[$L$sigma]
+	movdqu	xmm1,XMMWORD[rcx]
+	movdqu	xmm2,XMMWORD[16+rcx]
+	movdqu	xmm3,XMMWORD[r8]
+	movdqa	xmm6,XMMWORD[$L$rot16]
+	movdqa	xmm7,XMMWORD[$L$rot24]
+
+	movdqa	XMMWORD[rsp],xmm0
+	movdqa	XMMWORD[16+rsp],xmm1
+	movdqa	XMMWORD[32+rsp],xmm2
+	movdqa	XMMWORD[48+rsp],xmm3
+	mov	r8,10
+	jmp	NEAR $L$oop_ssse3
+
+ALIGN	32
+$L$oop_outer_ssse3:
+	movdqa	xmm3,XMMWORD[$L$one]
+	movdqa	xmm0,XMMWORD[rsp]
+	movdqa	xmm1,XMMWORD[16+rsp]
+	movdqa	xmm2,XMMWORD[32+rsp]
+	paddd	xmm3,XMMWORD[48+rsp]
+	mov	r8,10
+	movdqa	XMMWORD[48+rsp],xmm3
+	jmp	NEAR $L$oop_ssse3
+
+ALIGN	32
+$L$oop_ssse3:
+	paddd	xmm0,xmm1
+	pxor	xmm3,xmm0
+DB	102,15,56,0,222
+	paddd	xmm2,xmm3
+	pxor	xmm1,xmm2
+	movdqa	xmm4,xmm1
+	psrld	xmm1,20
+	pslld	xmm4,12
+	por	xmm1,xmm4
+	paddd	xmm0,xmm1
+	pxor	xmm3,xmm0
+DB	102,15,56,0,223
+	paddd	xmm2,xmm3
+	pxor	xmm1,xmm2
+	movdqa	xmm4,xmm1
+	psrld	xmm1,25
+	pslld	xmm4,7
+	por	xmm1,xmm4
+	pshufd	xmm2,xmm2,78
+	pshufd	xmm1,xmm1,57
+	pshufd	xmm3,xmm3,147
+	nop
+	paddd	xmm0,xmm1
+	pxor	xmm3,xmm0
+DB	102,15,56,0,222
+	paddd	xmm2,xmm3
+	pxor	xmm1,xmm2
+	movdqa	xmm4,xmm1
+	psrld	xmm1,20
+	pslld	xmm4,12
+	por	xmm1,xmm4
+	paddd	xmm0,xmm1
+	pxor	xmm3,xmm0
+DB	102,15,56,0,223
+	paddd	xmm2,xmm3
+	pxor	xmm1,xmm2
+	movdqa	xmm4,xmm1
+	psrld	xmm1,25
+	pslld	xmm4,7
+	por	xmm1,xmm4
+	pshufd	xmm2,xmm2,78
+	pshufd	xmm1,xmm1,147
+	pshufd	xmm3,xmm3,57
+	dec	r8
+	jnz	NEAR $L$oop_ssse3
+	paddd	xmm0,XMMWORD[rsp]
+	paddd	xmm1,XMMWORD[16+rsp]
+	paddd	xmm2,XMMWORD[32+rsp]
+	paddd	xmm3,XMMWORD[48+rsp]
+
+	cmp	rdx,64
+	jb	NEAR $L$tail_ssse3
+
+	movdqu	xmm4,XMMWORD[rsi]
+	movdqu	xmm5,XMMWORD[16+rsi]
+	pxor	xmm0,xmm4
+	movdqu	xmm4,XMMWORD[32+rsi]
+	pxor	xmm1,xmm5
+	movdqu	xmm5,XMMWORD[48+rsi]
+	lea	rsi,[64+rsi]
+	pxor	xmm2,xmm4
+	pxor	xmm3,xmm5
+
+	movdqu	XMMWORD[rdi],xmm0
+	movdqu	XMMWORD[16+rdi],xmm1
+	movdqu	XMMWORD[32+rdi],xmm2
+	movdqu	XMMWORD[48+rdi],xmm3
+	lea	rdi,[64+rdi]
+
+	sub	rdx,64
+	jnz	NEAR $L$oop_outer_ssse3
+
+	jmp	NEAR $L$done_ssse3
+
+ALIGN	16
+$L$tail_ssse3:
+	movdqa	XMMWORD[rsp],xmm0
+	movdqa	XMMWORD[16+rsp],xmm1
+	movdqa	XMMWORD[32+rsp],xmm2
+	movdqa	XMMWORD[48+rsp],xmm3
+	xor	r8,r8
+
+$L$oop_tail_ssse3:
+	movzx	eax,BYTE[r8*1+rsi]
+	movzx	ecx,BYTE[r8*1+rsp]
+	lea	r8,[1+r8]
+	xor	eax,ecx
+	mov	BYTE[((-1))+r8*1+rdi],al
+	dec	rdx
+	jnz	NEAR $L$oop_tail_ssse3
+
+$L$done_ssse3:
+	movaps	xmm6,XMMWORD[((-40))+r9]
+	movaps	xmm7,XMMWORD[((-24))+r9]
+	lea	rsp,[r9]
+
+$L$ssse3_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_ChaCha20_ctr32_ssse3:
+global	ChaCha20_ctr32_ssse3_4x
+
+ALIGN	32
+ChaCha20_ctr32_ssse3_4x:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_ChaCha20_ctr32_ssse3_4x:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+
+
+
+_CET_ENDBR
+	mov	r9,rsp
+
+	mov	r11,r10
+	sub	rsp,0x140+168
+	movaps	XMMWORD[(-168)+r9],xmm6
+	movaps	XMMWORD[(-152)+r9],xmm7
+	movaps	XMMWORD[(-136)+r9],xmm8
+	movaps	XMMWORD[(-120)+r9],xmm9
+	movaps	XMMWORD[(-104)+r9],xmm10
+	movaps	XMMWORD[(-88)+r9],xmm11
+	movaps	XMMWORD[(-72)+r9],xmm12
+	movaps	XMMWORD[(-56)+r9],xmm13
+	movaps	XMMWORD[(-40)+r9],xmm14
+	movaps	XMMWORD[(-24)+r9],xmm15
+$L$4x_body:
+	movdqa	xmm11,XMMWORD[$L$sigma]
+	movdqu	xmm15,XMMWORD[rcx]
+	movdqu	xmm7,XMMWORD[16+rcx]
+	movdqu	xmm3,XMMWORD[r8]
+	lea	rcx,[256+rsp]
+	lea	r10,[$L$rot16]
+	lea	r11,[$L$rot24]
+
+	pshufd	xmm8,xmm11,0x00
+	pshufd	xmm9,xmm11,0x55
+	movdqa	XMMWORD[64+rsp],xmm8
+	pshufd	xmm10,xmm11,0xaa
+	movdqa	XMMWORD[80+rsp],xmm9
+	pshufd	xmm11,xmm11,0xff
+	movdqa	XMMWORD[96+rsp],xmm10
+	movdqa	XMMWORD[112+rsp],xmm11
+
+	pshufd	xmm12,xmm15,0x00
+	pshufd	xmm13,xmm15,0x55
+	movdqa	XMMWORD[(128-256)+rcx],xmm12
+	pshufd	xmm14,xmm15,0xaa
+	movdqa	XMMWORD[(144-256)+rcx],xmm13
+	pshufd	xmm15,xmm15,0xff
+	movdqa	XMMWORD[(160-256)+rcx],xmm14
+	movdqa	XMMWORD[(176-256)+rcx],xmm15
+
+	pshufd	xmm4,xmm7,0x00
+	pshufd	xmm5,xmm7,0x55
+	movdqa	XMMWORD[(192-256)+rcx],xmm4
+	pshufd	xmm6,xmm7,0xaa
+	movdqa	XMMWORD[(208-256)+rcx],xmm5
+	pshufd	xmm7,xmm7,0xff
+	movdqa	XMMWORD[(224-256)+rcx],xmm6
+	movdqa	XMMWORD[(240-256)+rcx],xmm7
+
+	pshufd	xmm0,xmm3,0x00
+	pshufd	xmm1,xmm3,0x55
+	paddd	xmm0,XMMWORD[$L$inc]
+	pshufd	xmm2,xmm3,0xaa
+	movdqa	XMMWORD[(272-256)+rcx],xmm1
+	pshufd	xmm3,xmm3,0xff
+	movdqa	XMMWORD[(288-256)+rcx],xmm2
+	movdqa	XMMWORD[(304-256)+rcx],xmm3
+
+	jmp	NEAR $L$oop_enter4x
+
+ALIGN	32
+$L$oop_outer4x:
+	movdqa	xmm8,XMMWORD[64+rsp]
+	movdqa	xmm9,XMMWORD[80+rsp]
+	movdqa	xmm10,XMMWORD[96+rsp]
+	movdqa	xmm11,XMMWORD[112+rsp]
+	movdqa	xmm12,XMMWORD[((128-256))+rcx]
+	movdqa	xmm13,XMMWORD[((144-256))+rcx]
+	movdqa	xmm14,XMMWORD[((160-256))+rcx]
+	movdqa	xmm15,XMMWORD[((176-256))+rcx]
+	movdqa	xmm4,XMMWORD[((192-256))+rcx]
+	movdqa	xmm5,XMMWORD[((208-256))+rcx]
+	movdqa	xmm6,XMMWORD[((224-256))+rcx]
+	movdqa	xmm7,XMMWORD[((240-256))+rcx]
+	movdqa	xmm0,XMMWORD[((256-256))+rcx]
+	movdqa	xmm1,XMMWORD[((272-256))+rcx]
+	movdqa	xmm2,XMMWORD[((288-256))+rcx]
+	movdqa	xmm3,XMMWORD[((304-256))+rcx]
+	paddd	xmm0,XMMWORD[$L$four]
+
+$L$oop_enter4x:
+	movdqa	XMMWORD[32+rsp],xmm6
+	movdqa	XMMWORD[48+rsp],xmm7
+	movdqa	xmm7,XMMWORD[r10]
+	mov	eax,10
+	movdqa	XMMWORD[(256-256)+rcx],xmm0
+	jmp	NEAR $L$oop4x
+
+ALIGN	32
+$L$oop4x:
+	paddd	xmm8,xmm12
+	paddd	xmm9,xmm13
+	pxor	xmm0,xmm8
+	pxor	xmm1,xmm9
+DB	102,15,56,0,199
+DB	102,15,56,0,207
+	paddd	xmm4,xmm0
+	paddd	xmm5,xmm1
+	pxor	xmm12,xmm4
+	pxor	xmm13,xmm5
+	movdqa	xmm6,xmm12
+	pslld	xmm12,12
+	psrld	xmm6,20
+	movdqa	xmm7,xmm13
+	pslld	xmm13,12
+	por	xmm12,xmm6
+	psrld	xmm7,20
+	movdqa	xmm6,XMMWORD[r11]
+	por	xmm13,xmm7
+	paddd	xmm8,xmm12
+	paddd	xmm9,xmm13
+	pxor	xmm0,xmm8
+	pxor	xmm1,xmm9
+DB	102,15,56,0,198
+DB	102,15,56,0,206
+	paddd	xmm4,xmm0
+	paddd	xmm5,xmm1
+	pxor	xmm12,xmm4
+	pxor	xmm13,xmm5
+	movdqa	xmm7,xmm12
+	pslld	xmm12,7
+	psrld	xmm7,25
+	movdqa	xmm6,xmm13
+	pslld	xmm13,7
+	por	xmm12,xmm7
+	psrld	xmm6,25
+	movdqa	xmm7,XMMWORD[r10]
+	por	xmm13,xmm6
+	movdqa	XMMWORD[rsp],xmm4
+	movdqa	XMMWORD[16+rsp],xmm5
+	movdqa	xmm4,XMMWORD[32+rsp]
+	movdqa	xmm5,XMMWORD[48+rsp]
+	paddd	xmm10,xmm14
+	paddd	xmm11,xmm15
+	pxor	xmm2,xmm10
+	pxor	xmm3,xmm11
+DB	102,15,56,0,215
+DB	102,15,56,0,223
+	paddd	xmm4,xmm2
+	paddd	xmm5,xmm3
+	pxor	xmm14,xmm4
+	pxor	xmm15,xmm5
+	movdqa	xmm6,xmm14
+	pslld	xmm14,12
+	psrld	xmm6,20
+	movdqa	xmm7,xmm15
+	pslld	xmm15,12
+	por	xmm14,xmm6
+	psrld	xmm7,20
+	movdqa	xmm6,XMMWORD[r11]
+	por	xmm15,xmm7
+	paddd	xmm10,xmm14
+	paddd	xmm11,xmm15
+	pxor	xmm2,xmm10
+	pxor	xmm3,xmm11
+DB	102,15,56,0,214
+DB	102,15,56,0,222
+	paddd	xmm4,xmm2
+	paddd	xmm5,xmm3
+	pxor	xmm14,xmm4
+	pxor	xmm15,xmm5
+	movdqa	xmm7,xmm14
+	pslld	xmm14,7
+	psrld	xmm7,25
+	movdqa	xmm6,xmm15
+	pslld	xmm15,7
+	por	xmm14,xmm7
+	psrld	xmm6,25
+	movdqa	xmm7,XMMWORD[r10]
+	por	xmm15,xmm6
+	paddd	xmm8,xmm13
+	paddd	xmm9,xmm14
+	pxor	xmm3,xmm8
+	pxor	xmm0,xmm9
+DB	102,15,56,0,223
+DB	102,15,56,0,199
+	paddd	xmm4,xmm3
+	paddd	xmm5,xmm0
+	pxor	xmm13,xmm4
+	pxor	xmm14,xmm5
+	movdqa	xmm6,xmm13
+	pslld	xmm13,12
+	psrld	xmm6,20
+	movdqa	xmm7,xmm14
+	pslld	xmm14,12
+	por	xmm13,xmm6
+	psrld	xmm7,20
+	movdqa	xmm6,XMMWORD[r11]
+	por	xmm14,xmm7
+	paddd	xmm8,xmm13
+	paddd	xmm9,xmm14
+	pxor	xmm3,xmm8
+	pxor	xmm0,xmm9
+DB	102,15,56,0,222
+DB	102,15,56,0,198
+	paddd	xmm4,xmm3
+	paddd	xmm5,xmm0
+	pxor	xmm13,xmm4
+	pxor	xmm14,xmm5
+	movdqa	xmm7,xmm13
+	pslld	xmm13,7
+	psrld	xmm7,25
+	movdqa	xmm6,xmm14
+	pslld	xmm14,7
+	por	xmm13,xmm7
+	psrld	xmm6,25
+	movdqa	xmm7,XMMWORD[r10]
+	por	xmm14,xmm6
+	movdqa	XMMWORD[32+rsp],xmm4
+	movdqa	XMMWORD[48+rsp],xmm5
+	movdqa	xmm4,XMMWORD[rsp]
+	movdqa	xmm5,XMMWORD[16+rsp]
+	paddd	xmm10,xmm15
+	paddd	xmm11,xmm12
+	pxor	xmm1,xmm10
+	pxor	xmm2,xmm11
+DB	102,15,56,0,207
+DB	102,15,56,0,215
+	paddd	xmm4,xmm1
+	paddd	xmm5,xmm2
+	pxor	xmm15,xmm4
+	pxor	xmm12,xmm5
+	movdqa	xmm6,xmm15
+	pslld	xmm15,12
+	psrld	xmm6,20
+	movdqa	xmm7,xmm12
+	pslld	xmm12,12
+	por	xmm15,xmm6
+	psrld	xmm7,20
+	movdqa	xmm6,XMMWORD[r11]
+	por	xmm12,xmm7
+	paddd	xmm10,xmm15
+	paddd	xmm11,xmm12
+	pxor	xmm1,xmm10
+	pxor	xmm2,xmm11
+DB	102,15,56,0,206
+DB	102,15,56,0,214
+	paddd	xmm4,xmm1
+	paddd	xmm5,xmm2
+	pxor	xmm15,xmm4
+	pxor	xmm12,xmm5
+	movdqa	xmm7,xmm15
+	pslld	xmm15,7
+	psrld	xmm7,25
+	movdqa	xmm6,xmm12
+	pslld	xmm12,7
+	por	xmm15,xmm7
+	psrld	xmm6,25
+	movdqa	xmm7,XMMWORD[r10]
+	por	xmm12,xmm6
+	dec	eax
+	jnz	NEAR $L$oop4x
+
+	paddd	xmm8,XMMWORD[64+rsp]
+	paddd	xmm9,XMMWORD[80+rsp]
+	paddd	xmm10,XMMWORD[96+rsp]
+	paddd	xmm11,XMMWORD[112+rsp]
+
+	movdqa	xmm6,xmm8
+	punpckldq	xmm8,xmm9
+	movdqa	xmm7,xmm10
+	punpckldq	xmm10,xmm11
+	punpckhdq	xmm6,xmm9
+	punpckhdq	xmm7,xmm11
+	movdqa	xmm9,xmm8
+	punpcklqdq	xmm8,xmm10
+	movdqa	xmm11,xmm6
+	punpcklqdq	xmm6,xmm7
+	punpckhqdq	xmm9,xmm10
+	punpckhqdq	xmm11,xmm7
+	paddd	xmm12,XMMWORD[((128-256))+rcx]
+	paddd	xmm13,XMMWORD[((144-256))+rcx]
+	paddd	xmm14,XMMWORD[((160-256))+rcx]
+	paddd	xmm15,XMMWORD[((176-256))+rcx]
+
+	movdqa	XMMWORD[rsp],xmm8
+	movdqa	XMMWORD[16+rsp],xmm9
+	movdqa	xmm8,XMMWORD[32+rsp]
+	movdqa	xmm9,XMMWORD[48+rsp]
+
+	movdqa	xmm10,xmm12
+	punpckldq	xmm12,xmm13
+	movdqa	xmm7,xmm14
+	punpckldq	xmm14,xmm15
+	punpckhdq	xmm10,xmm13
+	punpckhdq	xmm7,xmm15
+	movdqa	xmm13,xmm12
+	punpcklqdq	xmm12,xmm14
+	movdqa	xmm15,xmm10
+	punpcklqdq	xmm10,xmm7
+	punpckhqdq	xmm13,xmm14
+	punpckhqdq	xmm15,xmm7
+	paddd	xmm4,XMMWORD[((192-256))+rcx]
+	paddd	xmm5,XMMWORD[((208-256))+rcx]
+	paddd	xmm8,XMMWORD[((224-256))+rcx]
+	paddd	xmm9,XMMWORD[((240-256))+rcx]
+
+	movdqa	XMMWORD[32+rsp],xmm6
+	movdqa	XMMWORD[48+rsp],xmm11
+
+	movdqa	xmm14,xmm4
+	punpckldq	xmm4,xmm5
+	movdqa	xmm7,xmm8
+	punpckldq	xmm8,xmm9
+	punpckhdq	xmm14,xmm5
+	punpckhdq	xmm7,xmm9
+	movdqa	xmm5,xmm4
+	punpcklqdq	xmm4,xmm8
+	movdqa	xmm9,xmm14
+	punpcklqdq	xmm14,xmm7
+	punpckhqdq	xmm5,xmm8
+	punpckhqdq	xmm9,xmm7
+	paddd	xmm0,XMMWORD[((256-256))+rcx]
+	paddd	xmm1,XMMWORD[((272-256))+rcx]
+	paddd	xmm2,XMMWORD[((288-256))+rcx]
+	paddd	xmm3,XMMWORD[((304-256))+rcx]
+
+	movdqa	xmm8,xmm0
+	punpckldq	xmm0,xmm1
+	movdqa	xmm7,xmm2
+	punpckldq	xmm2,xmm3
+	punpckhdq	xmm8,xmm1
+	punpckhdq	xmm7,xmm3
+	movdqa	xmm1,xmm0
+	punpcklqdq	xmm0,xmm2
+	movdqa	xmm3,xmm8
+	punpcklqdq	xmm8,xmm7
+	punpckhqdq	xmm1,xmm2
+	punpckhqdq	xmm3,xmm7
+	cmp	rdx,64*4
+	jb	NEAR $L$tail4x
+
+	movdqu	xmm6,XMMWORD[rsi]
+	movdqu	xmm11,XMMWORD[16+rsi]
+	movdqu	xmm2,XMMWORD[32+rsi]
+	movdqu	xmm7,XMMWORD[48+rsi]
+	pxor	xmm6,XMMWORD[rsp]
+	pxor	xmm11,xmm12
+	pxor	xmm2,xmm4
+	pxor	xmm7,xmm0
+
+	movdqu	XMMWORD[rdi],xmm6
+	movdqu	xmm6,XMMWORD[64+rsi]
+	movdqu	XMMWORD[16+rdi],xmm11
+	movdqu	xmm11,XMMWORD[80+rsi]
+	movdqu	XMMWORD[32+rdi],xmm2
+	movdqu	xmm2,XMMWORD[96+rsi]
+	movdqu	XMMWORD[48+rdi],xmm7
+	movdqu	xmm7,XMMWORD[112+rsi]
+	lea	rsi,[128+rsi]
+	pxor	xmm6,XMMWORD[16+rsp]
+	pxor	xmm11,xmm13
+	pxor	xmm2,xmm5
+	pxor	xmm7,xmm1
+
+	movdqu	XMMWORD[64+rdi],xmm6
+	movdqu	xmm6,XMMWORD[rsi]
+	movdqu	XMMWORD[80+rdi],xmm11
+	movdqu	xmm11,XMMWORD[16+rsi]
+	movdqu	XMMWORD[96+rdi],xmm2
+	movdqu	xmm2,XMMWORD[32+rsi]
+	movdqu	XMMWORD[112+rdi],xmm7
+	lea	rdi,[128+rdi]
+	movdqu	xmm7,XMMWORD[48+rsi]
+	pxor	xmm6,XMMWORD[32+rsp]
+	pxor	xmm11,xmm10
+	pxor	xmm2,xmm14
+	pxor	xmm7,xmm8
+
+	movdqu	XMMWORD[rdi],xmm6
+	movdqu	xmm6,XMMWORD[64+rsi]
+	movdqu	XMMWORD[16+rdi],xmm11
+	movdqu	xmm11,XMMWORD[80+rsi]
+	movdqu	XMMWORD[32+rdi],xmm2
+	movdqu	xmm2,XMMWORD[96+rsi]
+	movdqu	XMMWORD[48+rdi],xmm7
+	movdqu	xmm7,XMMWORD[112+rsi]
+	lea	rsi,[128+rsi]
+	pxor	xmm6,XMMWORD[48+rsp]
+	pxor	xmm11,xmm15
+	pxor	xmm2,xmm9
+	pxor	xmm7,xmm3
+	movdqu	XMMWORD[64+rdi],xmm6
+	movdqu	XMMWORD[80+rdi],xmm11
+	movdqu	XMMWORD[96+rdi],xmm2
+	movdqu	XMMWORD[112+rdi],xmm7
+	lea	rdi,[128+rdi]
+
+	sub	rdx,64*4
+	jnz	NEAR $L$oop_outer4x
+
+	jmp	NEAR $L$done4x
+
+$L$tail4x:
+	cmp	rdx,192
+	jae	NEAR $L$192_or_more4x
+	cmp	rdx,128
+	jae	NEAR $L$128_or_more4x
+	cmp	rdx,64
+	jae	NEAR $L$64_or_more4x
+
+
+	xor	r10,r10
+
+	movdqa	XMMWORD[16+rsp],xmm12
+	movdqa	XMMWORD[32+rsp],xmm4
+	movdqa	XMMWORD[48+rsp],xmm0
+	jmp	NEAR $L$oop_tail4x
+
+ALIGN	32
+$L$64_or_more4x:
+	movdqu	xmm6,XMMWORD[rsi]
+	movdqu	xmm11,XMMWORD[16+rsi]
+	movdqu	xmm2,XMMWORD[32+rsi]
+	movdqu	xmm7,XMMWORD[48+rsi]
+	pxor	xmm6,XMMWORD[rsp]
+	pxor	xmm11,xmm12
+	pxor	xmm2,xmm4
+	pxor	xmm7,xmm0
+	movdqu	XMMWORD[rdi],xmm6
+	movdqu	XMMWORD[16+rdi],xmm11
+	movdqu	XMMWORD[32+rdi],xmm2
+	movdqu	XMMWORD[48+rdi],xmm7
+	je	NEAR $L$done4x
+
+	movdqa	xmm6,XMMWORD[16+rsp]
+	lea	rsi,[64+rsi]
+	xor	r10,r10
+	movdqa	XMMWORD[rsp],xmm6
+	movdqa	XMMWORD[16+rsp],xmm13
+	lea	rdi,[64+rdi]
+	movdqa	XMMWORD[32+rsp],xmm5
+	sub	rdx,64
+	movdqa	XMMWORD[48+rsp],xmm1
+	jmp	NEAR $L$oop_tail4x
+
+ALIGN	32
+$L$128_or_more4x:
+	movdqu	xmm6,XMMWORD[rsi]
+	movdqu	xmm11,XMMWORD[16+rsi]
+	movdqu	xmm2,XMMWORD[32+rsi]
+	movdqu	xmm7,XMMWORD[48+rsi]
+	pxor	xmm6,XMMWORD[rsp]
+	pxor	xmm11,xmm12
+	pxor	xmm2,xmm4
+	pxor	xmm7,xmm0
+
+	movdqu	XMMWORD[rdi],xmm6
+	movdqu	xmm6,XMMWORD[64+rsi]
+	movdqu	XMMWORD[16+rdi],xmm11
+	movdqu	xmm11,XMMWORD[80+rsi]
+	movdqu	XMMWORD[32+rdi],xmm2
+	movdqu	xmm2,XMMWORD[96+rsi]
+	movdqu	XMMWORD[48+rdi],xmm7
+	movdqu	xmm7,XMMWORD[112+rsi]
+	pxor	xmm6,XMMWORD[16+rsp]
+	pxor	xmm11,xmm13
+	pxor	xmm2,xmm5
+	pxor	xmm7,xmm1
+	movdqu	XMMWORD[64+rdi],xmm6
+	movdqu	XMMWORD[80+rdi],xmm11
+	movdqu	XMMWORD[96+rdi],xmm2
+	movdqu	XMMWORD[112+rdi],xmm7
+	je	NEAR $L$done4x
+
+	movdqa	xmm6,XMMWORD[32+rsp]
+	lea	rsi,[128+rsi]
+	xor	r10,r10
+	movdqa	XMMWORD[rsp],xmm6
+	movdqa	XMMWORD[16+rsp],xmm10
+	lea	rdi,[128+rdi]
+	movdqa	XMMWORD[32+rsp],xmm14
+	sub	rdx,128
+	movdqa	XMMWORD[48+rsp],xmm8
+	jmp	NEAR $L$oop_tail4x
+
+ALIGN	32
+$L$192_or_more4x:
+	movdqu	xmm6,XMMWORD[rsi]
+	movdqu	xmm11,XMMWORD[16+rsi]
+	movdqu	xmm2,XMMWORD[32+rsi]
+	movdqu	xmm7,XMMWORD[48+rsi]
+	pxor	xmm6,XMMWORD[rsp]
+	pxor	xmm11,xmm12
+	pxor	xmm2,xmm4
+	pxor	xmm7,xmm0
+
+	movdqu	XMMWORD[rdi],xmm6
+	movdqu	xmm6,XMMWORD[64+rsi]
+	movdqu	XMMWORD[16+rdi],xmm11
+	movdqu	xmm11,XMMWORD[80+rsi]
+	movdqu	XMMWORD[32+rdi],xmm2
+	movdqu	xmm2,XMMWORD[96+rsi]
+	movdqu	XMMWORD[48+rdi],xmm7
+	movdqu	xmm7,XMMWORD[112+rsi]
+	lea	rsi,[128+rsi]
+	pxor	xmm6,XMMWORD[16+rsp]
+	pxor	xmm11,xmm13
+	pxor	xmm2,xmm5
+	pxor	xmm7,xmm1
+
+	movdqu	XMMWORD[64+rdi],xmm6
+	movdqu	xmm6,XMMWORD[rsi]
+	movdqu	XMMWORD[80+rdi],xmm11
+	movdqu	xmm11,XMMWORD[16+rsi]
+	movdqu	XMMWORD[96+rdi],xmm2
+	movdqu	xmm2,XMMWORD[32+rsi]
+	movdqu	XMMWORD[112+rdi],xmm7
+	lea	rdi,[128+rdi]
+	movdqu	xmm7,XMMWORD[48+rsi]
+	pxor	xmm6,XMMWORD[32+rsp]
+	pxor	xmm11,xmm10
+	pxor	xmm2,xmm14
+	pxor	xmm7,xmm8
+	movdqu	XMMWORD[rdi],xmm6
+	movdqu	XMMWORD[16+rdi],xmm11
+	movdqu	XMMWORD[32+rdi],xmm2
+	movdqu	XMMWORD[48+rdi],xmm7
+	je	NEAR $L$done4x
+
+	movdqa	xmm6,XMMWORD[48+rsp]
+	lea	rsi,[64+rsi]
+	xor	r10,r10
+	movdqa	XMMWORD[rsp],xmm6
+	movdqa	XMMWORD[16+rsp],xmm15
+	lea	rdi,[64+rdi]
+	movdqa	XMMWORD[32+rsp],xmm9
+	sub	rdx,192
+	movdqa	XMMWORD[48+rsp],xmm3
+
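+; Byte-by-byte tail: the remaining keystream has been staged on the stack, so
+; each iteration XORs one input byte against one keystream byte and stores it.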
+$L$oop_tail4x:
+	movzx	eax,BYTE[r10*1+rsi]
+	movzx	ecx,BYTE[r10*1+rsp]
+	lea	r10,[1+r10]
+	xor	eax,ecx
+	mov	BYTE[((-1))+r10*1+rdi],al
+	dec	rdx
+	jnz	NEAR $L$oop_tail4x
+
+$L$done4x:
+	movaps	xmm6,XMMWORD[((-168))+r9]
+	movaps	xmm7,XMMWORD[((-152))+r9]
+	movaps	xmm8,XMMWORD[((-136))+r9]
+	movaps	xmm9,XMMWORD[((-120))+r9]
+	movaps	xmm10,XMMWORD[((-104))+r9]
+	movaps	xmm11,XMMWORD[((-88))+r9]
+	movaps	xmm12,XMMWORD[((-72))+r9]
+	movaps	xmm13,XMMWORD[((-56))+r9]
+	movaps	xmm14,XMMWORD[((-40))+r9]
+	movaps	xmm15,XMMWORD[((-24))+r9]
+	lea	rsp,[r9]
+
+$L$4x_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_ChaCha20_ctr32_ssse3_4x:
+global	ChaCha20_ctr32_avx2
+
+ALIGN	32
+ChaCha20_ctr32_avx2:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_ChaCha20_ctr32_avx2:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+
+
+
+_CET_ENDBR
+	mov	r9,rsp
+
+	sub	rsp,0x280+168
+	and	rsp,-32
+	movaps	XMMWORD[(-168)+r9],xmm6
+	movaps	XMMWORD[(-152)+r9],xmm7
+	movaps	XMMWORD[(-136)+r9],xmm8
+	movaps	XMMWORD[(-120)+r9],xmm9
+	movaps	XMMWORD[(-104)+r9],xmm10
+	movaps	XMMWORD[(-88)+r9],xmm11
+	movaps	XMMWORD[(-72)+r9],xmm12
+	movaps	XMMWORD[(-56)+r9],xmm13
+	movaps	XMMWORD[(-40)+r9],xmm14
+	movaps	XMMWORD[(-24)+r9],xmm15
+$L$8x_body:
+	vzeroupper
+
+
+
+
+
+
+
+
+
+
+	vbroadcasti128	ymm11,XMMWORD[$L$sigma]
+	vbroadcasti128	ymm3,XMMWORD[rcx]
+	vbroadcasti128	ymm15,XMMWORD[16+rcx]
+	vbroadcasti128	ymm7,XMMWORD[r8]
+	lea	rcx,[256+rsp]
+	lea	rax,[512+rsp]
+	lea	r10,[$L$rot16]
+	lea	r11,[$L$rot24]
+
+	vpshufd	ymm8,ymm11,0x00
+	vpshufd	ymm9,ymm11,0x55
+	vmovdqa	YMMWORD[(128-256)+rcx],ymm8
+	vpshufd	ymm10,ymm11,0xaa
+	vmovdqa	YMMWORD[(160-256)+rcx],ymm9
+	vpshufd	ymm11,ymm11,0xff
+	vmovdqa	YMMWORD[(192-256)+rcx],ymm10
+	vmovdqa	YMMWORD[(224-256)+rcx],ymm11
+
+	vpshufd	ymm0,ymm3,0x00
+	vpshufd	ymm1,ymm3,0x55
+	vmovdqa	YMMWORD[(256-256)+rcx],ymm0
+	vpshufd	ymm2,ymm3,0xaa
+	vmovdqa	YMMWORD[(288-256)+rcx],ymm1
+	vpshufd	ymm3,ymm3,0xff
+	vmovdqa	YMMWORD[(320-256)+rcx],ymm2
+	vmovdqa	YMMWORD[(352-256)+rcx],ymm3
+
+	vpshufd	ymm12,ymm15,0x00
+	vpshufd	ymm13,ymm15,0x55
+	vmovdqa	YMMWORD[(384-512)+rax],ymm12
+	vpshufd	ymm14,ymm15,0xaa
+	vmovdqa	YMMWORD[(416-512)+rax],ymm13
+	vpshufd	ymm15,ymm15,0xff
+	vmovdqa	YMMWORD[(448-512)+rax],ymm14
+	vmovdqa	YMMWORD[(480-512)+rax],ymm15
+
+	vpshufd	ymm4,ymm7,0x00
+	vpshufd	ymm5,ymm7,0x55
+	vpaddd	ymm4,ymm4,YMMWORD[$L$incy]
+	vpshufd	ymm6,ymm7,0xaa
+	vmovdqa	YMMWORD[(544-512)+rax],ymm5
+	vpshufd	ymm7,ymm7,0xff
+	vmovdqa	YMMWORD[(576-512)+rax],ymm6
+	vmovdqa	YMMWORD[(608-512)+rax],ymm7
+
+	jmp	NEAR $L$oop_enter8x
+
+ALIGN	32
+$L$oop_outer8x:
+	vmovdqa	ymm8,YMMWORD[((128-256))+rcx]
+	vmovdqa	ymm9,YMMWORD[((160-256))+rcx]
+	vmovdqa	ymm10,YMMWORD[((192-256))+rcx]
+	vmovdqa	ymm11,YMMWORD[((224-256))+rcx]
+	vmovdqa	ymm0,YMMWORD[((256-256))+rcx]
+	vmovdqa	ymm1,YMMWORD[((288-256))+rcx]
+	vmovdqa	ymm2,YMMWORD[((320-256))+rcx]
+	vmovdqa	ymm3,YMMWORD[((352-256))+rcx]
+	vmovdqa	ymm12,YMMWORD[((384-512))+rax]
+	vmovdqa	ymm13,YMMWORD[((416-512))+rax]
+	vmovdqa	ymm14,YMMWORD[((448-512))+rax]
+	vmovdqa	ymm15,YMMWORD[((480-512))+rax]
+	vmovdqa	ymm4,YMMWORD[((512-512))+rax]
+	vmovdqa	ymm5,YMMWORD[((544-512))+rax]
+	vmovdqa	ymm6,YMMWORD[((576-512))+rax]
+	vmovdqa	ymm7,YMMWORD[((608-512))+rax]
+	vpaddd	ymm4,ymm4,YMMWORD[$L$eight]
+
+$L$oop_enter8x:
+	vmovdqa	YMMWORD[64+rsp],ymm14
+	vmovdqa	YMMWORD[96+rsp],ymm15
+	vbroadcasti128	ymm15,XMMWORD[r10]
+	vmovdqa	YMMWORD[(512-512)+rax],ymm4
+	mov	eax,10
+	jmp	NEAR $L$oop8x
+
+ALIGN	32
+$L$oop8x:
+	vpaddd	ymm8,ymm8,ymm0
+	vpxor	ymm4,ymm8,ymm4
+	vpshufb	ymm4,ymm4,ymm15
+	vpaddd	ymm9,ymm9,ymm1
+	vpxor	ymm5,ymm9,ymm5
+	vpshufb	ymm5,ymm5,ymm15
+	vpaddd	ymm12,ymm12,ymm4
+	vpxor	ymm0,ymm12,ymm0
+	vpslld	ymm14,ymm0,12
+	vpsrld	ymm0,ymm0,20
+	vpor	ymm0,ymm14,ymm0
+	vbroadcasti128	ymm14,XMMWORD[r11]
+	vpaddd	ymm13,ymm13,ymm5
+	vpxor	ymm1,ymm13,ymm1
+	vpslld	ymm15,ymm1,12
+	vpsrld	ymm1,ymm1,20
+	vpor	ymm1,ymm15,ymm1
+	vpaddd	ymm8,ymm8,ymm0
+	vpxor	ymm4,ymm8,ymm4
+	vpshufb	ymm4,ymm4,ymm14
+	vpaddd	ymm9,ymm9,ymm1
+	vpxor	ymm5,ymm9,ymm5
+	vpshufb	ymm5,ymm5,ymm14
+	vpaddd	ymm12,ymm12,ymm4
+	vpxor	ymm0,ymm12,ymm0
+	vpslld	ymm15,ymm0,7
+	vpsrld	ymm0,ymm0,25
+	vpor	ymm0,ymm15,ymm0
+	vbroadcasti128	ymm15,XMMWORD[r10]
+	vpaddd	ymm13,ymm13,ymm5
+	vpxor	ymm1,ymm13,ymm1
+	vpslld	ymm14,ymm1,7
+	vpsrld	ymm1,ymm1,25
+	vpor	ymm1,ymm14,ymm1
+	vmovdqa	YMMWORD[rsp],ymm12
+	vmovdqa	YMMWORD[32+rsp],ymm13
+	vmovdqa	ymm12,YMMWORD[64+rsp]
+	vmovdqa	ymm13,YMMWORD[96+rsp]
+	vpaddd	ymm10,ymm10,ymm2
+	vpxor	ymm6,ymm10,ymm6
+	vpshufb	ymm6,ymm6,ymm15
+	vpaddd	ymm11,ymm11,ymm3
+	vpxor	ymm7,ymm11,ymm7
+	vpshufb	ymm7,ymm7,ymm15
+	vpaddd	ymm12,ymm12,ymm6
+	vpxor	ymm2,ymm12,ymm2
+	vpslld	ymm14,ymm2,12
+	vpsrld	ymm2,ymm2,20
+	vpor	ymm2,ymm14,ymm2
+	vbroadcasti128	ymm14,XMMWORD[r11]
+	vpaddd	ymm13,ymm13,ymm7
+	vpxor	ymm3,ymm13,ymm3
+	vpslld	ymm15,ymm3,12
+	vpsrld	ymm3,ymm3,20
+	vpor	ymm3,ymm15,ymm3
+	vpaddd	ymm10,ymm10,ymm2
+	vpxor	ymm6,ymm10,ymm6
+	vpshufb	ymm6,ymm6,ymm14
+	vpaddd	ymm11,ymm11,ymm3
+	vpxor	ymm7,ymm11,ymm7
+	vpshufb	ymm7,ymm7,ymm14
+	vpaddd	ymm12,ymm12,ymm6
+	vpxor	ymm2,ymm12,ymm2
+	vpslld	ymm15,ymm2,7
+	vpsrld	ymm2,ymm2,25
+	vpor	ymm2,ymm15,ymm2
+	vbroadcasti128	ymm15,XMMWORD[r10]
+	vpaddd	ymm13,ymm13,ymm7
+	vpxor	ymm3,ymm13,ymm3
+	vpslld	ymm14,ymm3,7
+	vpsrld	ymm3,ymm3,25
+	vpor	ymm3,ymm14,ymm3
+	vpaddd	ymm8,ymm8,ymm1
+	vpxor	ymm7,ymm8,ymm7
+	vpshufb	ymm7,ymm7,ymm15
+	vpaddd	ymm9,ymm9,ymm2
+	vpxor	ymm4,ymm9,ymm4
+	vpshufb	ymm4,ymm4,ymm15
+	vpaddd	ymm12,ymm12,ymm7
+	vpxor	ymm1,ymm12,ymm1
+	vpslld	ymm14,ymm1,12
+	vpsrld	ymm1,ymm1,20
+	vpor	ymm1,ymm14,ymm1
+	vbroadcasti128	ymm14,XMMWORD[r11]
+	vpaddd	ymm13,ymm13,ymm4
+	vpxor	ymm2,ymm13,ymm2
+	vpslld	ymm15,ymm2,12
+	vpsrld	ymm2,ymm2,20
+	vpor	ymm2,ymm15,ymm2
+	vpaddd	ymm8,ymm8,ymm1
+	vpxor	ymm7,ymm8,ymm7
+	vpshufb	ymm7,ymm7,ymm14
+	vpaddd	ymm9,ymm9,ymm2
+	vpxor	ymm4,ymm9,ymm4
+	vpshufb	ymm4,ymm4,ymm14
+	vpaddd	ymm12,ymm12,ymm7
+	vpxor	ymm1,ymm12,ymm1
+	vpslld	ymm15,ymm1,7
+	vpsrld	ymm1,ymm1,25
+	vpor	ymm1,ymm15,ymm1
+	vbroadcasti128	ymm15,XMMWORD[r10]
+	vpaddd	ymm13,ymm13,ymm4
+	vpxor	ymm2,ymm13,ymm2
+	vpslld	ymm14,ymm2,7
+	vpsrld	ymm2,ymm2,25
+	vpor	ymm2,ymm14,ymm2
+	vmovdqa	YMMWORD[64+rsp],ymm12
+	vmovdqa	YMMWORD[96+rsp],ymm13
+	vmovdqa	ymm12,YMMWORD[rsp]
+	vmovdqa	ymm13,YMMWORD[32+rsp]
+	vpaddd	ymm10,ymm10,ymm3
+	vpxor	ymm5,ymm10,ymm5
+	vpshufb	ymm5,ymm5,ymm15
+	vpaddd	ymm11,ymm11,ymm0
+	vpxor	ymm6,ymm11,ymm6
+	vpshufb	ymm6,ymm6,ymm15
+	vpaddd	ymm12,ymm12,ymm5
+	vpxor	ymm3,ymm12,ymm3
+	vpslld	ymm14,ymm3,12
+	vpsrld	ymm3,ymm3,20
+	vpor	ymm3,ymm14,ymm3
+	vbroadcasti128	ymm14,XMMWORD[r11]
+	vpaddd	ymm13,ymm13,ymm6
+	vpxor	ymm0,ymm13,ymm0
+	vpslld	ymm15,ymm0,12
+	vpsrld	ymm0,ymm0,20
+	vpor	ymm0,ymm15,ymm0
+	vpaddd	ymm10,ymm10,ymm3
+	vpxor	ymm5,ymm10,ymm5
+	vpshufb	ymm5,ymm5,ymm14
+	vpaddd	ymm11,ymm11,ymm0
+	vpxor	ymm6,ymm11,ymm6
+	vpshufb	ymm6,ymm6,ymm14
+	vpaddd	ymm12,ymm12,ymm5
+	vpxor	ymm3,ymm12,ymm3
+	vpslld	ymm15,ymm3,7
+	vpsrld	ymm3,ymm3,25
+	vpor	ymm3,ymm15,ymm3
+	vbroadcasti128	ymm15,XMMWORD[r10]
+	vpaddd	ymm13,ymm13,ymm6
+	vpxor	ymm0,ymm13,ymm0
+	vpslld	ymm14,ymm0,7
+	vpsrld	ymm0,ymm0,25
+	vpor	ymm0,ymm14,ymm0
+	dec	eax
+	jnz	NEAR $L$oop8x
+
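+; Round loop complete: add the input state back into the eight word-sliced
+; registers, then transpose (vpunpck*, vperm2i128) back to per-block order.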
+	lea	rax,[512+rsp]
+	vpaddd	ymm8,ymm8,YMMWORD[((128-256))+rcx]
+	vpaddd	ymm9,ymm9,YMMWORD[((160-256))+rcx]
+	vpaddd	ymm10,ymm10,YMMWORD[((192-256))+rcx]
+	vpaddd	ymm11,ymm11,YMMWORD[((224-256))+rcx]
+
+	vpunpckldq	ymm14,ymm8,ymm9
+	vpunpckldq	ymm15,ymm10,ymm11
+	vpunpckhdq	ymm8,ymm8,ymm9
+	vpunpckhdq	ymm10,ymm10,ymm11
+	vpunpcklqdq	ymm9,ymm14,ymm15
+	vpunpckhqdq	ymm14,ymm14,ymm15
+	vpunpcklqdq	ymm11,ymm8,ymm10
+	vpunpckhqdq	ymm8,ymm8,ymm10
+	vpaddd	ymm0,ymm0,YMMWORD[((256-256))+rcx]
+	vpaddd	ymm1,ymm1,YMMWORD[((288-256))+rcx]
+	vpaddd	ymm2,ymm2,YMMWORD[((320-256))+rcx]
+	vpaddd	ymm3,ymm3,YMMWORD[((352-256))+rcx]
+
+	vpunpckldq	ymm10,ymm0,ymm1
+	vpunpckldq	ymm15,ymm2,ymm3
+	vpunpckhdq	ymm0,ymm0,ymm1
+	vpunpckhdq	ymm2,ymm2,ymm3
+	vpunpcklqdq	ymm1,ymm10,ymm15
+	vpunpckhqdq	ymm10,ymm10,ymm15
+	vpunpcklqdq	ymm3,ymm0,ymm2
+	vpunpckhqdq	ymm0,ymm0,ymm2
+	vperm2i128	ymm15,ymm9,ymm1,0x20
+	vperm2i128	ymm1,ymm9,ymm1,0x31
+	vperm2i128	ymm9,ymm14,ymm10,0x20
+	vperm2i128	ymm10,ymm14,ymm10,0x31
+	vperm2i128	ymm14,ymm11,ymm3,0x20
+	vperm2i128	ymm3,ymm11,ymm3,0x31
+	vperm2i128	ymm11,ymm8,ymm0,0x20
+	vperm2i128	ymm0,ymm8,ymm0,0x31
+	vmovdqa	YMMWORD[rsp],ymm15
+	vmovdqa	YMMWORD[32+rsp],ymm9
+	vmovdqa	ymm15,YMMWORD[64+rsp]
+	vmovdqa	ymm9,YMMWORD[96+rsp]
+
+	vpaddd	ymm12,ymm12,YMMWORD[((384-512))+rax]
+	vpaddd	ymm13,ymm13,YMMWORD[((416-512))+rax]
+	vpaddd	ymm15,ymm15,YMMWORD[((448-512))+rax]
+	vpaddd	ymm9,ymm9,YMMWORD[((480-512))+rax]
+
+	vpunpckldq	ymm2,ymm12,ymm13
+	vpunpckldq	ymm8,ymm15,ymm9
+	vpunpckhdq	ymm12,ymm12,ymm13
+	vpunpckhdq	ymm15,ymm15,ymm9
+	vpunpcklqdq	ymm13,ymm2,ymm8
+	vpunpckhqdq	ymm2,ymm2,ymm8
+	vpunpcklqdq	ymm9,ymm12,ymm15
+	vpunpckhqdq	ymm12,ymm12,ymm15
+	vpaddd	ymm4,ymm4,YMMWORD[((512-512))+rax]
+	vpaddd	ymm5,ymm5,YMMWORD[((544-512))+rax]
+	vpaddd	ymm6,ymm6,YMMWORD[((576-512))+rax]
+	vpaddd	ymm7,ymm7,YMMWORD[((608-512))+rax]
+
+	vpunpckldq	ymm15,ymm4,ymm5
+	vpunpckldq	ymm8,ymm6,ymm7
+	vpunpckhdq	ymm4,ymm4,ymm5
+	vpunpckhdq	ymm6,ymm6,ymm7
+	vpunpcklqdq	ymm5,ymm15,ymm8
+	vpunpckhqdq	ymm15,ymm15,ymm8
+	vpunpcklqdq	ymm7,ymm4,ymm6
+	vpunpckhqdq	ymm4,ymm4,ymm6
+	vperm2i128	ymm8,ymm13,ymm5,0x20
+	vperm2i128	ymm5,ymm13,ymm5,0x31
+	vperm2i128	ymm13,ymm2,ymm15,0x20
+	vperm2i128	ymm15,ymm2,ymm15,0x31
+	vperm2i128	ymm2,ymm9,ymm7,0x20
+	vperm2i128	ymm7,ymm9,ymm7,0x31
+	vperm2i128	ymm9,ymm12,ymm4,0x20
+	vperm2i128	ymm4,ymm12,ymm4,0x31
+	vmovdqa	ymm6,YMMWORD[rsp]
+	vmovdqa	ymm12,YMMWORD[32+rsp]
+
+	cmp	rdx,64*8
+	jb	NEAR $L$tail8x
+
+	vpxor	ymm6,ymm6,YMMWORD[rsi]
+	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
+	lea	rsi,[128+rsi]
+	vmovdqu	YMMWORD[rdi],ymm6
+	vmovdqu	YMMWORD[32+rdi],ymm8
+	vmovdqu	YMMWORD[64+rdi],ymm1
+	vmovdqu	YMMWORD[96+rdi],ymm5
+	lea	rdi,[128+rdi]
+
+	vpxor	ymm12,ymm12,YMMWORD[rsi]
+	vpxor	ymm13,ymm13,YMMWORD[32+rsi]
+	vpxor	ymm10,ymm10,YMMWORD[64+rsi]
+	vpxor	ymm15,ymm15,YMMWORD[96+rsi]
+	lea	rsi,[128+rsi]
+	vmovdqu	YMMWORD[rdi],ymm12
+	vmovdqu	YMMWORD[32+rdi],ymm13
+	vmovdqu	YMMWORD[64+rdi],ymm10
+	vmovdqu	YMMWORD[96+rdi],ymm15
+	lea	rdi,[128+rdi]
+
+	vpxor	ymm14,ymm14,YMMWORD[rsi]
+	vpxor	ymm2,ymm2,YMMWORD[32+rsi]
+	vpxor	ymm3,ymm3,YMMWORD[64+rsi]
+	vpxor	ymm7,ymm7,YMMWORD[96+rsi]
+	lea	rsi,[128+rsi]
+	vmovdqu	YMMWORD[rdi],ymm14
+	vmovdqu	YMMWORD[32+rdi],ymm2
+	vmovdqu	YMMWORD[64+rdi],ymm3
+	vmovdqu	YMMWORD[96+rdi],ymm7
+	lea	rdi,[128+rdi]
+
+	vpxor	ymm11,ymm11,YMMWORD[rsi]
+	vpxor	ymm9,ymm9,YMMWORD[32+rsi]
+	vpxor	ymm0,ymm0,YMMWORD[64+rsi]
+	vpxor	ymm4,ymm4,YMMWORD[96+rsi]
+	lea	rsi,[128+rsi]
+	vmovdqu	YMMWORD[rdi],ymm11
+	vmovdqu	YMMWORD[32+rdi],ymm9
+	vmovdqu	YMMWORD[64+rdi],ymm0
+	vmovdqu	YMMWORD[96+rdi],ymm4
+	lea	rdi,[128+rdi]
+
+	sub	rdx,64*8
+	jnz	NEAR $L$oop_outer8x
+
+	jmp	NEAR $L$done8x
+
+$L$tail8x:
+	cmp	rdx,448
+	jae	NEAR $L$448_or_more8x
+	cmp	rdx,384
+	jae	NEAR $L$384_or_more8x
+	cmp	rdx,320
+	jae	NEAR $L$320_or_more8x
+	cmp	rdx,256
+	jae	NEAR $L$256_or_more8x
+	cmp	rdx,192
+	jae	NEAR $L$192_or_more8x
+	cmp	rdx,128
+	jae	NEAR $L$128_or_more8x
+	cmp	rdx,64
+	jae	NEAR $L$64_or_more8x
+
+	xor	r10,r10
+	vmovdqa	YMMWORD[rsp],ymm6
+	vmovdqa	YMMWORD[32+rsp],ymm8
+	jmp	NEAR $L$oop_tail8x
+
+ALIGN	32
+$L$64_or_more8x:
+	vpxor	ymm6,ymm6,YMMWORD[rsi]
+	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
+	vmovdqu	YMMWORD[rdi],ymm6
+	vmovdqu	YMMWORD[32+rdi],ymm8
+	je	NEAR $L$done8x
+
+	lea	rsi,[64+rsi]
+	xor	r10,r10
+	vmovdqa	YMMWORD[rsp],ymm1
+	lea	rdi,[64+rdi]
+	sub	rdx,64
+	vmovdqa	YMMWORD[32+rsp],ymm5
+	jmp	NEAR $L$oop_tail8x
+
+ALIGN	32
+$L$128_or_more8x:
+	vpxor	ymm6,ymm6,YMMWORD[rsi]
+	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
+	vmovdqu	YMMWORD[rdi],ymm6
+	vmovdqu	YMMWORD[32+rdi],ymm8
+	vmovdqu	YMMWORD[64+rdi],ymm1
+	vmovdqu	YMMWORD[96+rdi],ymm5
+	je	NEAR $L$done8x
+
+	lea	rsi,[128+rsi]
+	xor	r10,r10
+	vmovdqa	YMMWORD[rsp],ymm12
+	lea	rdi,[128+rdi]
+	sub	rdx,128
+	vmovdqa	YMMWORD[32+rsp],ymm13
+	jmp	NEAR $L$oop_tail8x
+
+ALIGN	32
+$L$192_or_more8x:
+	vpxor	ymm6,ymm6,YMMWORD[rsi]
+	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
+	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
+	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
+	vmovdqu	YMMWORD[rdi],ymm6
+	vmovdqu	YMMWORD[32+rdi],ymm8
+	vmovdqu	YMMWORD[64+rdi],ymm1
+	vmovdqu	YMMWORD[96+rdi],ymm5
+	vmovdqu	YMMWORD[128+rdi],ymm12
+	vmovdqu	YMMWORD[160+rdi],ymm13
+	je	NEAR $L$done8x
+
+	lea	rsi,[192+rsi]
+	xor	r10,r10
+	vmovdqa	YMMWORD[rsp],ymm10
+	lea	rdi,[192+rdi]
+	sub	rdx,192
+	vmovdqa	YMMWORD[32+rsp],ymm15
+	jmp	NEAR $L$oop_tail8x
+
+ALIGN	32
+$L$256_or_more8x:
+	vpxor	ymm6,ymm6,YMMWORD[rsi]
+	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
+	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
+	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
+	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
+	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
+	vmovdqu	YMMWORD[rdi],ymm6
+	vmovdqu	YMMWORD[32+rdi],ymm8
+	vmovdqu	YMMWORD[64+rdi],ymm1
+	vmovdqu	YMMWORD[96+rdi],ymm5
+	vmovdqu	YMMWORD[128+rdi],ymm12
+	vmovdqu	YMMWORD[160+rdi],ymm13
+	vmovdqu	YMMWORD[192+rdi],ymm10
+	vmovdqu	YMMWORD[224+rdi],ymm15
+	je	NEAR $L$done8x
+
+	lea	rsi,[256+rsi]
+	xor	r10,r10
+	vmovdqa	YMMWORD[rsp],ymm14
+	lea	rdi,[256+rdi]
+	sub	rdx,256
+	vmovdqa	YMMWORD[32+rsp],ymm2
+	jmp	NEAR $L$oop_tail8x
+
+ALIGN	32
+$L$320_or_more8x:
+	vpxor	ymm6,ymm6,YMMWORD[rsi]
+	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
+	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
+	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
+	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
+	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
+	vpxor	ymm14,ymm14,YMMWORD[256+rsi]
+	vpxor	ymm2,ymm2,YMMWORD[288+rsi]
+	vmovdqu	YMMWORD[rdi],ymm6
+	vmovdqu	YMMWORD[32+rdi],ymm8
+	vmovdqu	YMMWORD[64+rdi],ymm1
+	vmovdqu	YMMWORD[96+rdi],ymm5
+	vmovdqu	YMMWORD[128+rdi],ymm12
+	vmovdqu	YMMWORD[160+rdi],ymm13
+	vmovdqu	YMMWORD[192+rdi],ymm10
+	vmovdqu	YMMWORD[224+rdi],ymm15
+	vmovdqu	YMMWORD[256+rdi],ymm14
+	vmovdqu	YMMWORD[288+rdi],ymm2
+	je	NEAR $L$done8x
+
+	lea	rsi,[320+rsi]
+	xor	r10,r10
+	vmovdqa	YMMWORD[rsp],ymm3
+	lea	rdi,[320+rdi]
+	sub	rdx,320
+	vmovdqa	YMMWORD[32+rsp],ymm7
+	jmp	NEAR $L$oop_tail8x
+
+ALIGN	32
+$L$384_or_more8x:
+	vpxor	ymm6,ymm6,YMMWORD[rsi]
+	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
+	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
+	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
+	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
+	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
+	vpxor	ymm14,ymm14,YMMWORD[256+rsi]
+	vpxor	ymm2,ymm2,YMMWORD[288+rsi]
+	vpxor	ymm3,ymm3,YMMWORD[320+rsi]
+	vpxor	ymm7,ymm7,YMMWORD[352+rsi]
+	vmovdqu	YMMWORD[rdi],ymm6
+	vmovdqu	YMMWORD[32+rdi],ymm8
+	vmovdqu	YMMWORD[64+rdi],ymm1
+	vmovdqu	YMMWORD[96+rdi],ymm5
+	vmovdqu	YMMWORD[128+rdi],ymm12
+	vmovdqu	YMMWORD[160+rdi],ymm13
+	vmovdqu	YMMWORD[192+rdi],ymm10
+	vmovdqu	YMMWORD[224+rdi],ymm15
+	vmovdqu	YMMWORD[256+rdi],ymm14
+	vmovdqu	YMMWORD[288+rdi],ymm2
+	vmovdqu	YMMWORD[320+rdi],ymm3
+	vmovdqu	YMMWORD[352+rdi],ymm7
+	je	NEAR $L$done8x
+
+	lea	rsi,[384+rsi]
+	xor	r10,r10
+	vmovdqa	YMMWORD[rsp],ymm11
+	lea	rdi,[384+rdi]
+	sub	rdx,384
+	vmovdqa	YMMWORD[32+rsp],ymm9
+	jmp	NEAR $L$oop_tail8x
+
+ALIGN	32
+$L$448_or_more8x:
+	vpxor	ymm6,ymm6,YMMWORD[rsi]
+	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
+	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
+	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
+	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
+	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
+	vpxor	ymm14,ymm14,YMMWORD[256+rsi]
+	vpxor	ymm2,ymm2,YMMWORD[288+rsi]
+	vpxor	ymm3,ymm3,YMMWORD[320+rsi]
+	vpxor	ymm7,ymm7,YMMWORD[352+rsi]
+	vpxor	ymm11,ymm11,YMMWORD[384+rsi]
+	vpxor	ymm9,ymm9,YMMWORD[416+rsi]
+	vmovdqu	YMMWORD[rdi],ymm6
+	vmovdqu	YMMWORD[32+rdi],ymm8
+	vmovdqu	YMMWORD[64+rdi],ymm1
+	vmovdqu	YMMWORD[96+rdi],ymm5
+	vmovdqu	YMMWORD[128+rdi],ymm12
+	vmovdqu	YMMWORD[160+rdi],ymm13
+	vmovdqu	YMMWORD[192+rdi],ymm10
+	vmovdqu	YMMWORD[224+rdi],ymm15
+	vmovdqu	YMMWORD[256+rdi],ymm14
+	vmovdqu	YMMWORD[288+rdi],ymm2
+	vmovdqu	YMMWORD[320+rdi],ymm3
+	vmovdqu	YMMWORD[352+rdi],ymm7
+	vmovdqu	YMMWORD[384+rdi],ymm11
+	vmovdqu	YMMWORD[416+rdi],ymm9
+	je	NEAR $L$done8x
+
+	lea	rsi,[448+rsi]
+	xor	r10,r10
+	vmovdqa	YMMWORD[rsp],ymm0
+	lea	rdi,[448+rdi]
+	sub	rdx,448
+	vmovdqa	YMMWORD[32+rsp],ymm4
+
+$L$oop_tail8x:
+	movzx	eax,BYTE[r10*1+rsi]
+	movzx	ecx,BYTE[r10*1+rsp]
+	lea	r10,[1+r10]
+	xor	eax,ecx
+	mov	BYTE[((-1))+r10*1+rdi],al
+	dec	rdx
+	jnz	NEAR $L$oop_tail8x
+
+$L$done8x:
+	vzeroall
+	movaps	xmm6,XMMWORD[((-168))+r9]
+	movaps	xmm7,XMMWORD[((-152))+r9]
+	movaps	xmm8,XMMWORD[((-136))+r9]
+	movaps	xmm9,XMMWORD[((-120))+r9]
+	movaps	xmm10,XMMWORD[((-104))+r9]
+	movaps	xmm11,XMMWORD[((-88))+r9]
+	movaps	xmm12,XMMWORD[((-72))+r9]
+	movaps	xmm13,XMMWORD[((-56))+r9]
+	movaps	xmm14,XMMWORD[((-40))+r9]
+	movaps	xmm15,XMMWORD[((-24))+r9]
+	lea	rsp,[r9]
+
+$L$8x_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$SEH_end_ChaCha20_ctr32_avx2:
+EXTERN	__imp_RtlVirtualUnwind
+
+ALIGN	16
+se_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	lea	r10,[$L$ctr32_body]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[152+r8]
+
+	lea	r10,[$L$no_data]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	lea	rax,[((64+24+48))+rax]
+
+	mov	rbx,QWORD[((-8))+rax]
+	mov	rbp,QWORD[((-16))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r14,QWORD[((-40))+rax]
+	mov	r15,QWORD[((-48))+rax]
+	mov	QWORD[144+r8],rbx
+	mov	QWORD[160+r8],rbp
+	mov	QWORD[216+r8],r12
+	mov	QWORD[224+r8],r13
+	mov	QWORD[232+r8],r14
+	mov	QWORD[240+r8],r15
+
+$L$common_seh_tail:
+	mov	rdi,QWORD[8+rax]
+	mov	rsi,QWORD[16+rax]
+	mov	QWORD[152+r8],rax
+	mov	QWORD[168+r8],rsi
+	mov	QWORD[176+r8],rdi
+
+	mov	rdi,QWORD[40+r9]
+	mov	rsi,r8
+	mov	ecx,154
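+	; 0xa548f3fc encodes "cld; rep movsq", copying the 154-quadword
+	; (1232-byte) CONTEXT record from rsi to rdi.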
+	DD	0xa548f3fc
+
+	mov	rsi,r9
+	xor	rcx,rcx
+	mov	rdx,QWORD[8+rsi]
+	mov	r8,QWORD[rsi]
+	mov	r9,QWORD[16+rsi]
+	mov	r10,QWORD[40+rsi]
+	lea	r11,[56+rsi]
+	lea	r12,[24+rsi]
+	mov	QWORD[32+rsp],r10
+	mov	QWORD[40+rsp],r11
+	mov	QWORD[48+rsp],r12
+	mov	QWORD[56+rsp],rcx
+	call	QWORD[__imp_RtlVirtualUnwind]
+
+	mov	eax,1
+	add	rsp,64
+	popfq
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rbp
+	pop	rbx
+	pop	rdi
+	pop	rsi
+	ret
+
+
+
+ALIGN	16
+ssse3_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[192+r8]
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	lea	rsi,[((-40))+rax]
+	lea	rdi,[512+r8]
+	mov	ecx,4
+	DD	0xa548f3fc
+
+	jmp	NEAR $L$common_seh_tail
+
+
+
+ALIGN	16
+full_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[192+r8]
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	lea	rsi,[((-168))+rax]
+	lea	rdi,[512+r8]
+	mov	ecx,20
+	DD	0xa548f3fc
+
+	jmp	NEAR $L$common_seh_tail
+
+
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_ChaCha20_ctr32_nohw wrt ..imagebase
+	DD	$L$SEH_end_ChaCha20_ctr32_nohw wrt ..imagebase
+	DD	$L$SEH_info_ChaCha20_ctr32_nohw wrt ..imagebase
+
+	DD	$L$SEH_begin_ChaCha20_ctr32_ssse3 wrt ..imagebase
+	DD	$L$SEH_end_ChaCha20_ctr32_ssse3 wrt ..imagebase
+	DD	$L$SEH_info_ChaCha20_ctr32_ssse3 wrt ..imagebase
+
+	DD	$L$SEH_begin_ChaCha20_ctr32_ssse3_4x wrt ..imagebase
+	DD	$L$SEH_end_ChaCha20_ctr32_ssse3_4x wrt ..imagebase
+	DD	$L$SEH_info_ChaCha20_ctr32_ssse3_4x wrt ..imagebase
+	DD	$L$SEH_begin_ChaCha20_ctr32_avx2 wrt ..imagebase
+	DD	$L$SEH_end_ChaCha20_ctr32_avx2 wrt ..imagebase
+	DD	$L$SEH_info_ChaCha20_ctr32_avx2 wrt ..imagebase
+section	.xdata rdata align=8
+ALIGN	8
+$L$SEH_info_ChaCha20_ctr32_nohw:
+	DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+
+$L$SEH_info_ChaCha20_ctr32_ssse3:
+	DB	9,0,0,0
+	DD	ssse3_handler wrt ..imagebase
+	DD	$L$ssse3_body wrt ..imagebase,$L$ssse3_epilogue wrt ..imagebase
+
+$L$SEH_info_ChaCha20_ctr32_ssse3_4x:
+	DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$4x_body wrt ..imagebase,$L$4x_epilogue wrt ..imagebase
+$L$SEH_info_ChaCha20_ctr32_avx2:
+	DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$8x_body wrt ..imagebase,$L$8x_epilogue wrt ..imagebase
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/crypto/chacha20_poly1305_armv8-apple.S b/gen/crypto/chacha20_poly1305_armv8-apple.S
new file mode 100644
index 0000000..04a1e22
--- /dev/null
+++ b/gen/crypto/chacha20_poly1305_armv8-apple.S
@@ -0,0 +1,3009 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+#include <openssl/arm_arch.h>
+.section	__TEXT,__const
+
+.align	7
+Lchacha20_consts:
+.byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+Linc:
+.long	1,2,3,4
+Lrol8:
+.byte	3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+Lclamp:
+.quad	0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
+
+.text
+
+
+.align	6
+Lpoly_hash_ad_internal:
+.cfi_startproc
+	cbnz	x4, Lpoly_hash_intro
+	ret
+
+Lpoly_hash_intro:
+	cmp	x4, #16
+	b.lt	Lpoly_hash_ad_tail
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
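+    // One Poly1305 step follows: with p = 2^130 - 5, it computes
+    // [acc2:acc1:acc0] = [acc2:acc1:acc0] * [r1:r0] mod p. Since
+    // 2^130 = 5 (mod p), the product bits at 2^130 and above, c, are
+    // folded back in as 5*c = 4*c + c.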
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	sub	x4, x4, #16
+	b	Lpoly_hash_ad_internal
+
+Lpoly_hash_ad_tail:
+	cbz	x4, Lpoly_hash_ad_ret
+
+	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the AAD
+	sub	x4, x4, #1
+
+Lpoly_hash_tail_16_compose:
+	ext	v20.16b, v20.16b, v20.16b, #15
+	ldrb	w11, [x3, x4]
+	mov	v20.b[0], w11
+	subs	x4, x4, #1
+	b.ge	Lpoly_hash_tail_16_compose
+	mov	x11, v20.d[0]
+	mov	x12, v20.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+
+Lpoly_hash_ad_ret:
+	ret
+.cfi_endproc
+
+
+/////////////////////////////////
+//
+// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data);
+//
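+// Per AAPCS64 the arguments arrive in registers x0-x5, in the order listed
+// above.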
+.globl	_chacha20_poly1305_seal
+.private_extern	_chacha20_poly1305_seal
+
+.align	6
+_chacha20_poly1305_seal:
+	AARCH64_SIGN_LINK_REGISTER
+.cfi_startproc
+	stp	x29, x30, [sp, #-80]!
+.cfi_def_cfa_offset	80
+.cfi_offset	w30, -72
+.cfi_offset	w29, -80
+	mov	x29, sp
+    // We probably could do .cfi_def_cfa w29, 80 at this point, but since
+    // we don't actually use the frame pointer like that, it's probably not
+    // worth bothering.
+	stp	d8, d9, [sp, #16]
+	stp	d10, d11, [sp, #32]
+	stp	d12, d13, [sp, #48]
+	stp	d14, d15, [sp, #64]
+.cfi_offset	b15, -8
+.cfi_offset	b14, -16
+.cfi_offset	b13, -24
+.cfi_offset	b12, -32
+.cfi_offset	b11, -40
+.cfi_offset	b10, -48
+.cfi_offset	b9, -56
+.cfi_offset	b8, -64
+
+	adrp	x11, Lchacha20_consts@PAGE
+	add	x11, x11, Lchacha20_consts@PAGEOFF
+
+	ld1	{v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
+	ld1	{v28.16b - v30.16b}, [x5]
+
+	mov	x15, #1 // Prepare the Poly1305 state
+	mov	x8, #0
+	mov	x9, #0
+	mov	x10, #0
+
+	ldr	x12, [x5, #56]   // The total cipher text length includes extra_in_len
+	add	x12, x12, x2
+	mov	v31.d[0], x4  // Store the input and aad lengths
+	mov	v31.d[1], x12
+
+	cmp	x2, #128
+	b.le	Lseal_128 // Optimization for smaller buffers
+
+    // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext,
+    // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically,
+    // the fifth block (A4-D4) horizontally.
+	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
+	mov	v4.16b, v24.16b
+
+	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
+	mov	v9.16b, v28.16b
+
+	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
+	mov	v14.16b, v29.16b
+
+	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
+	add	v15.4s, v15.4s, v25.4s
+	mov	v19.16b, v30.16b
+
+	sub	x5, x5, #32
+
+	mov	x6, #10
+
+.align	5
+Lseal_init_rounds:
+	add	v0.4s, v0.4s, v5.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	add	v3.4s, v3.4s, v8.4s
+	add	v4.4s, v4.4s, v9.4s
+
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	eor	v18.16b, v18.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+	rev32	v18.8h, v18.8h
+	rev32	v19.8h, v19.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	add	v13.4s, v13.4s, v18.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v5.16b, v5.16b, v10.16b
+	eor	v6.16b, v6.16b, v11.16b
+	eor	v7.16b, v7.16b, v12.16b
+	eor	v8.16b, v8.16b, v13.16b
+	eor	v9.16b, v9.16b, v14.16b
+
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	ushr	v5.4s, v6.4s, #20
+	sli	v5.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+	ushr	v7.4s, v8.4s, #20
+	sli	v7.4s, v8.4s, #12
+	ushr	v8.4s, v9.4s, #20
+	sli	v8.4s, v9.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v5.4s
+	add	v2.4s, v2.4s, v6.4s
+	add	v3.4s, v3.4s, v7.4s
+	add	v4.4s, v4.4s, v8.4s
+
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	eor	v18.16b, v18.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+	tbl	v18.16b, {v18.16b}, v26.16b
+	tbl	v19.16b, {v19.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	add	v13.4s, v13.4s, v18.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v20.16b, v20.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v6.16b, v6.16b, v12.16b
+	eor	v7.16b, v7.16b, v13.16b
+	eor	v8.16b, v8.16b, v14.16b
+
+	ushr	v9.4s, v8.4s, #25
+	sli	v9.4s, v8.4s, #7
+	ushr	v8.4s, v7.4s, #25
+	sli	v8.4s, v7.4s, #7
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v5.4s, #25
+	sli	v6.4s, v5.4s, #7
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+
+	ext	v9.16b, v9.16b, v9.16b, #4
+	ext	v14.16b, v14.16b, v14.16b, #8
+	ext	v19.16b, v19.16b, v19.16b, #12
+	add	v0.4s, v0.4s, v6.4s
+	add	v1.4s, v1.4s, v7.4s
+	add	v2.4s, v2.4s, v8.4s
+	add	v3.4s, v3.4s, v5.4s
+	add	v4.4s, v4.4s, v9.4s
+
+	eor	v18.16b, v18.16b, v0.16b
+	eor	v15.16b, v15.16b, v1.16b
+	eor	v16.16b, v16.16b, v2.16b
+	eor	v17.16b, v17.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	rev32	v18.8h, v18.8h
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+	rev32	v19.8h, v19.8h
+
+	add	v12.4s, v12.4s, v18.4s
+	add	v13.4s, v13.4s, v15.4s
+	add	v10.4s, v10.4s, v16.4s
+	add	v11.4s, v11.4s, v17.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v6.16b, v6.16b, v12.16b
+	eor	v7.16b, v7.16b, v13.16b
+	eor	v8.16b, v8.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v9.16b, v9.16b, v14.16b
+
+	ushr	v20.4s, v6.4s, #20
+	sli	v20.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+	ushr	v7.4s, v8.4s, #20
+	sli	v7.4s, v8.4s, #12
+	ushr	v8.4s, v5.4s, #20
+	sli	v8.4s, v5.4s, #12
+	ushr	v5.4s, v9.4s, #20
+	sli	v5.4s, v9.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	add	v3.4s, v3.4s, v8.4s
+	add	v4.4s, v4.4s, v5.4s
+
+	eor	v18.16b, v18.16b, v0.16b
+	eor	v15.16b, v15.16b, v1.16b
+	eor	v16.16b, v16.16b, v2.16b
+	eor	v17.16b, v17.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	tbl	v18.16b, {v18.16b}, v26.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+	tbl	v19.16b, {v19.16b}, v26.16b
+
+	add	v12.4s, v12.4s, v18.4s
+	add	v13.4s, v13.4s, v15.4s
+	add	v10.4s, v10.4s, v16.4s
+	add	v11.4s, v11.4s, v17.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v20.16b, v20.16b, v12.16b
+	eor	v6.16b, v6.16b, v13.16b
+	eor	v7.16b, v7.16b, v10.16b
+	eor	v8.16b, v8.16b, v11.16b
+	eor	v5.16b, v5.16b, v14.16b
+
+	ushr	v9.4s, v5.4s, #25
+	sli	v9.4s, v5.4s, #7
+	ushr	v5.4s, v8.4s, #25
+	sli	v5.4s, v8.4s, #7
+	ushr	v8.4s, v7.4s, #25
+	sli	v8.4s, v7.4s, #7
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v20.4s, #25
+	sli	v6.4s, v20.4s, #7
+
+	ext	v9.16b, v9.16b, v9.16b, #12
+	ext	v14.16b, v14.16b, v14.16b, #8
+	ext	v19.16b, v19.16b, v19.16b, #4
+	subs	x6, x6, #1
+	b.hi	Lseal_init_rounds
+
+	add	v15.4s, v15.4s, v25.4s
+	mov	x11, #4
+	dup	v20.4s, w11
+	add	v25.4s, v25.4s, v20.4s
+
+	zip1	v20.4s, v0.4s, v1.4s
+	zip2	v21.4s, v0.4s, v1.4s
+	zip1	v22.4s, v2.4s, v3.4s
+	zip2	v23.4s, v2.4s, v3.4s
+
+	zip1	v0.2d, v20.2d, v22.2d
+	zip2	v1.2d, v20.2d, v22.2d
+	zip1	v2.2d, v21.2d, v23.2d
+	zip2	v3.2d, v21.2d, v23.2d
+
+	zip1	v20.4s, v5.4s, v6.4s
+	zip2	v21.4s, v5.4s, v6.4s
+	zip1	v22.4s, v7.4s, v8.4s
+	zip2	v23.4s, v7.4s, v8.4s
+
+	zip1	v5.2d, v20.2d, v22.2d
+	zip2	v6.2d, v20.2d, v22.2d
+	zip1	v7.2d, v21.2d, v23.2d
+	zip2	v8.2d, v21.2d, v23.2d
+
+	zip1	v20.4s, v10.4s, v11.4s
+	zip2	v21.4s, v10.4s, v11.4s
+	zip1	v22.4s, v12.4s, v13.4s
+	zip2	v23.4s, v12.4s, v13.4s
+
+	zip1	v10.2d, v20.2d, v22.2d
+	zip2	v11.2d, v20.2d, v22.2d
+	zip1	v12.2d, v21.2d, v23.2d
+	zip2	v13.2d, v21.2d, v23.2d
+
+	zip1	v20.4s, v15.4s, v16.4s
+	zip2	v21.4s, v15.4s, v16.4s
+	zip1	v22.4s, v17.4s, v18.4s
+	zip2	v23.4s, v17.4s, v18.4s
+
+	zip1	v15.2d, v20.2d, v22.2d
+	zip2	v16.2d, v20.2d, v22.2d
+	zip1	v17.2d, v21.2d, v23.2d
+	zip2	v18.2d, v21.2d, v23.2d
+
+	add	v4.4s, v4.4s, v24.4s
+	add	v9.4s, v9.4s, v28.4s
+	and	v4.16b, v4.16b, v27.16b
+
+	add	v0.4s, v0.4s, v24.4s
+	add	v5.4s, v5.4s, v28.4s
+	add	v10.4s, v10.4s, v29.4s
+	add	v15.4s, v15.4s, v30.4s
+
+	add	v1.4s, v1.4s, v24.4s
+	add	v6.4s, v6.4s, v28.4s
+	add	v11.4s, v11.4s, v29.4s
+	add	v16.4s, v16.4s, v30.4s
+
+	add	v2.4s, v2.4s, v24.4s
+	add	v7.4s, v7.4s, v28.4s
+	add	v12.4s, v12.4s, v29.4s
+	add	v17.4s, v17.4s, v30.4s
+
+	add	v3.4s, v3.4s, v24.4s
+	add	v8.4s, v8.4s, v28.4s
+	add	v13.4s, v13.4s, v29.4s
+	add	v18.4s, v18.4s, v30.4s
+
+	mov	x16, v4.d[0] // Move the R key to GPRs
+	mov	x17, v4.d[1]
+	mov	v27.16b, v9.16b // Store the S key
+
+	bl	Lpoly_hash_ad_internal
+
+	mov	x3, x0
+	cmp	x2, #256
+	b.le	Lseal_tail
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v0.16b
+	eor	v21.16b, v21.16b, v5.16b
+	eor	v22.16b, v22.16b, v10.16b
+	eor	v23.16b, v23.16b, v15.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v1.16b
+	eor	v21.16b, v21.16b, v6.16b
+	eor	v22.16b, v22.16b, v11.16b
+	eor	v23.16b, v23.16b, v16.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v2.16b
+	eor	v21.16b, v21.16b, v7.16b
+	eor	v22.16b, v22.16b, v12.16b
+	eor	v23.16b, v23.16b, v17.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v3.16b
+	eor	v21.16b, v21.16b, v8.16b
+	eor	v22.16b, v22.16b, v13.16b
+	eor	v23.16b, v23.16b, v18.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	sub	x2, x2, #256
+
+	mov	x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds
+	mov	x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256
+
+Lseal_main_loop:
+	adrp	x11, Lchacha20_consts@PAGE
+	add	x11, x11, Lchacha20_consts@PAGEOFF
+
+	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
+	mov	v4.16b, v24.16b
+
+	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
+	mov	v9.16b, v28.16b
+
+	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
+	mov	v14.16b, v29.16b
+
+	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
+	add	v15.4s, v15.4s, v25.4s
+	mov	v19.16b, v30.16b
+
+	eor	v20.16b, v20.16b, v20.16b //zero
+	not	v21.16b, v20.16b // -1
+	sub	v21.4s, v25.4s, v21.4s // Add +1
+	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
+	add	v19.4s, v19.4s, v20.4s
+
+	sub	x5, x5, #32
+.align	5
+Lseal_main_loop_rounds:
+	add	v0.4s, v0.4s, v5.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	add	v3.4s, v3.4s, v8.4s
+	add	v4.4s, v4.4s, v9.4s
+
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	eor	v18.16b, v18.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+	rev32	v18.8h, v18.8h
+	rev32	v19.8h, v19.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	add	v13.4s, v13.4s, v18.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v5.16b, v5.16b, v10.16b
+	eor	v6.16b, v6.16b, v11.16b
+	eor	v7.16b, v7.16b, v12.16b
+	eor	v8.16b, v8.16b, v13.16b
+	eor	v9.16b, v9.16b, v14.16b
+
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	ushr	v5.4s, v6.4s, #20
+	sli	v5.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+	ushr	v7.4s, v8.4s, #20
+	sli	v7.4s, v8.4s, #12
+	ushr	v8.4s, v9.4s, #20
+	sli	v8.4s, v9.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v5.4s
+	add	v2.4s, v2.4s, v6.4s
+	add	v3.4s, v3.4s, v7.4s
+	add	v4.4s, v4.4s, v8.4s
+
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	eor	v18.16b, v18.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+	tbl	v18.16b, {v18.16b}, v26.16b
+	tbl	v19.16b, {v19.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	add	v13.4s, v13.4s, v18.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v20.16b, v20.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v6.16b, v6.16b, v12.16b
+	eor	v7.16b, v7.16b, v13.16b
+	eor	v8.16b, v8.16b, v14.16b
+
+	ushr	v9.4s, v8.4s, #25
+	sli	v9.4s, v8.4s, #7
+	ushr	v8.4s, v7.4s, #25
+	sli	v8.4s, v7.4s, #7
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v5.4s, #25
+	sli	v6.4s, v5.4s, #7
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+
+	ext	v9.16b, v9.16b, v9.16b, #4
+	ext	v14.16b, v14.16b, v14.16b, #8
+	ext	v19.16b, v19.16b, v19.16b, #12
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	add	v0.4s, v0.4s, v6.4s
+	add	v1.4s, v1.4s, v7.4s
+	add	v2.4s, v2.4s, v8.4s
+	add	v3.4s, v3.4s, v5.4s
+	add	v4.4s, v4.4s, v9.4s
+
+	eor	v18.16b, v18.16b, v0.16b
+	eor	v15.16b, v15.16b, v1.16b
+	eor	v16.16b, v16.16b, v2.16b
+	eor	v17.16b, v17.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	rev32	v18.8h, v18.8h
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+	rev32	v19.8h, v19.8h
+
+	add	v12.4s, v12.4s, v18.4s
+	add	v13.4s, v13.4s, v15.4s
+	add	v10.4s, v10.4s, v16.4s
+	add	v11.4s, v11.4s, v17.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v6.16b, v6.16b, v12.16b
+	eor	v7.16b, v7.16b, v13.16b
+	eor	v8.16b, v8.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v9.16b, v9.16b, v14.16b
+
+	ushr	v20.4s, v6.4s, #20
+	sli	v20.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+	ushr	v7.4s, v8.4s, #20
+	sli	v7.4s, v8.4s, #12
+	ushr	v8.4s, v5.4s, #20
+	sli	v8.4s, v5.4s, #12
+	ushr	v5.4s, v9.4s, #20
+	sli	v5.4s, v9.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	add	v3.4s, v3.4s, v8.4s
+	add	v4.4s, v4.4s, v5.4s
+
+	eor	v18.16b, v18.16b, v0.16b
+	eor	v15.16b, v15.16b, v1.16b
+	eor	v16.16b, v16.16b, v2.16b
+	eor	v17.16b, v17.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	tbl	v18.16b, {v18.16b}, v26.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+	tbl	v19.16b, {v19.16b}, v26.16b
+
+	add	v12.4s, v12.4s, v18.4s
+	add	v13.4s, v13.4s, v15.4s
+	add	v10.4s, v10.4s, v16.4s
+	add	v11.4s, v11.4s, v17.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v20.16b, v20.16b, v12.16b
+	eor	v6.16b, v6.16b, v13.16b
+	eor	v7.16b, v7.16b, v10.16b
+	eor	v8.16b, v8.16b, v11.16b
+	eor	v5.16b, v5.16b, v14.16b
+
+	ushr	v9.4s, v5.4s, #25
+	sli	v9.4s, v5.4s, #7
+	ushr	v5.4s, v8.4s, #25
+	sli	v5.4s, v8.4s, #7
+	ushr	v8.4s, v7.4s, #25
+	sli	v8.4s, v7.4s, #7
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v20.4s, #25
+	sli	v6.4s, v20.4s, #7
+
+	ext	v9.16b, v9.16b, v9.16b, #12
+	ext	v14.16b, v14.16b, v14.16b, #8
+	ext	v19.16b, v19.16b, v19.16b, #4
+	subs	x6, x6, #1
+	b.ge	Lseal_main_loop_rounds
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	subs	x7, x7, #1
+	b.gt	Lseal_main_loop_rounds
+
+	eor	v20.16b, v20.16b, v20.16b //zero
+	not	v21.16b, v20.16b // -1
+	sub	v21.4s, v25.4s, v21.4s // Add +1
+	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
+	add	v19.4s, v19.4s, v20.4s
+
+	add	v15.4s, v15.4s, v25.4s
+	mov	x11, #5
+	dup	v20.4s, w11
+	add	v25.4s, v25.4s, v20.4s
+
+	zip1	v20.4s, v0.4s, v1.4s
+	zip2	v21.4s, v0.4s, v1.4s
+	zip1	v22.4s, v2.4s, v3.4s
+	zip2	v23.4s, v2.4s, v3.4s
+
+	zip1	v0.2d, v20.2d, v22.2d
+	zip2	v1.2d, v20.2d, v22.2d
+	zip1	v2.2d, v21.2d, v23.2d
+	zip2	v3.2d, v21.2d, v23.2d
+
+	zip1	v20.4s, v5.4s, v6.4s
+	zip2	v21.4s, v5.4s, v6.4s
+	zip1	v22.4s, v7.4s, v8.4s
+	zip2	v23.4s, v7.4s, v8.4s
+
+	zip1	v5.2d, v20.2d, v22.2d
+	zip2	v6.2d, v20.2d, v22.2d
+	zip1	v7.2d, v21.2d, v23.2d
+	zip2	v8.2d, v21.2d, v23.2d
+
+	zip1	v20.4s, v10.4s, v11.4s
+	zip2	v21.4s, v10.4s, v11.4s
+	zip1	v22.4s, v12.4s, v13.4s
+	zip2	v23.4s, v12.4s, v13.4s
+
+	zip1	v10.2d, v20.2d, v22.2d
+	zip2	v11.2d, v20.2d, v22.2d
+	zip1	v12.2d, v21.2d, v23.2d
+	zip2	v13.2d, v21.2d, v23.2d
+
+	zip1	v20.4s, v15.4s, v16.4s
+	zip2	v21.4s, v15.4s, v16.4s
+	zip1	v22.4s, v17.4s, v18.4s
+	zip2	v23.4s, v17.4s, v18.4s
+
+	zip1	v15.2d, v20.2d, v22.2d
+	zip2	v16.2d, v20.2d, v22.2d
+	zip1	v17.2d, v21.2d, v23.2d
+	zip2	v18.2d, v21.2d, v23.2d
+
+	add	v0.4s, v0.4s, v24.4s
+	add	v5.4s, v5.4s, v28.4s
+	add	v10.4s, v10.4s, v29.4s
+	add	v15.4s, v15.4s, v30.4s
+
+	add	v1.4s, v1.4s, v24.4s
+	add	v6.4s, v6.4s, v28.4s
+	add	v11.4s, v11.4s, v29.4s
+	add	v16.4s, v16.4s, v30.4s
+
+	add	v2.4s, v2.4s, v24.4s
+	add	v7.4s, v7.4s, v28.4s
+	add	v12.4s, v12.4s, v29.4s
+	add	v17.4s, v17.4s, v30.4s
+
+	add	v3.4s, v3.4s, v24.4s
+	add	v8.4s, v8.4s, v28.4s
+	add	v13.4s, v13.4s, v29.4s
+	add	v18.4s, v18.4s, v30.4s
+
+	add	v4.4s, v4.4s, v24.4s
+	add	v9.4s, v9.4s, v28.4s
+	add	v14.4s, v14.4s, v29.4s
+	add	v19.4s, v19.4s, v30.4s
+
+	cmp	x2, #320
+	b.le	Lseal_tail
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v0.16b
+	eor	v21.16b, v21.16b, v5.16b
+	eor	v22.16b, v22.16b, v10.16b
+	eor	v23.16b, v23.16b, v15.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v1.16b
+	eor	v21.16b, v21.16b, v6.16b
+	eor	v22.16b, v22.16b, v11.16b
+	eor	v23.16b, v23.16b, v16.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v2.16b
+	eor	v21.16b, v21.16b, v7.16b
+	eor	v22.16b, v22.16b, v12.16b
+	eor	v23.16b, v23.16b, v17.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v3.16b
+	eor	v21.16b, v21.16b, v8.16b
+	eor	v22.16b, v22.16b, v13.16b
+	eor	v23.16b, v23.16b, v18.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v4.16b
+	eor	v21.16b, v21.16b, v9.16b
+	eor	v22.16b, v22.16b, v14.16b
+	eor	v23.16b, v23.16b, v19.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	sub	x2, x2, #320
+
+	mov	x6, #0
+	mov	x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration
+
+	b	Lseal_main_loop
+
+Lseal_tail:
+    // This part of the function handles the storage and authentication of the last [0,320) bytes
+    // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data.
+	cmp	x2, #64
+	b.lt	Lseal_tail_64
+
+    // Store and authenticate 64B blocks per iteration
+	ld1	{v20.16b - v23.16b}, [x1], #64
+
+	eor	v20.16b, v20.16b, v0.16b
+	eor	v21.16b, v21.16b, v5.16b
+	eor	v22.16b, v22.16b, v10.16b
+	eor	v23.16b, v23.16b, v15.16b
+	mov	x11, v20.d[0]
+	mov	x12, v20.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	mov	x11, v21.d[0]
+	mov	x12, v21.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	mov	x11, v22.d[0]
+	mov	x12, v22.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	mov	x11, v23.d[0]
+	mov	x12, v23.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	st1	{v20.16b - v23.16b}, [x0], #64
+	sub	x2, x2, #64
+
+    // Shift the state left by 64 bytes for the next iteration of the loop
+	mov	v0.16b, v1.16b
+	mov	v5.16b, v6.16b
+	mov	v10.16b, v11.16b
+	mov	v15.16b, v16.16b
+
+	mov	v1.16b, v2.16b
+	mov	v6.16b, v7.16b
+	mov	v11.16b, v12.16b
+	mov	v16.16b, v17.16b
+
+	mov	v2.16b, v3.16b
+	mov	v7.16b, v8.16b
+	mov	v12.16b, v13.16b
+	mov	v17.16b, v18.16b
+
+	mov	v3.16b, v4.16b
+	mov	v8.16b, v9.16b
+	mov	v13.16b, v14.16b
+	mov	v18.16b, v19.16b
+
+	b	Lseal_tail
+
+Lseal_tail_64:
+	ldp	x3, x4, [x5, #48] // extra_in_len and extra_in_ptr
+
+    // Here we handle the last [0,64) bytes of plaintext
+	cmp	x2, #16
+	b.lt	Lseal_tail_16
+    // Each iteration encrypts and authenticates a 16B block
+	ld1	{v20.16b}, [x1], #16
+	eor	v20.16b, v20.16b, v0.16b
+	mov	x11, v20.d[0]
+	mov	x12, v20.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	st1	{v20.16b}, [x0], #16
+
+	sub	x2, x2, #16
+
+    // Shift the state left by 16 bytes for the next iteration of the loop
+	mov	v0.16b, v5.16b
+	mov	v5.16b, v10.16b
+	mov	v10.16b, v15.16b
+
+	b	Lseal_tail_64
+
+Lseal_tail_16:
+    // Here we handle the last [0,16) bytes of ciphertext that require a padded block
+	cbz	x2, Lseal_hash_extra
+
+	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra_in
+	eor	v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes
+	not	v22.16b, v20.16b
+
+	mov	x6, x2
+	add	x1, x1, x2
+
+	cbz	x4, Lseal_tail_16_compose // No extra data to pad with, zero padding
+
+	mov	x7, #16          // We need to load some extra_in first for padding
+	sub	x7, x7, x2
+	cmp	x4, x7
+	csel	x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register
+	mov	x12, x7
+	add	x3, x3, x7
+	sub	x4, x4, x7
+
+Lseal_tail16_compose_extra_in:
+	ext	v20.16b, v20.16b, v20.16b, #15
+	ldrb	w11, [x3, #-1]!
+	mov	v20.b[0], w11
+	subs	x7, x7, #1
+	b.gt	Lseal_tail16_compose_extra_in
+
+	add	x3, x3, x12
+
+Lseal_tail_16_compose:
+	ext	v20.16b, v20.16b, v20.16b, #15
+	ldrb	w11, [x1, #-1]!
+	mov	v20.b[0], w11
+	ext	v21.16b, v22.16b, v21.16b, #15
+	subs	x2, x2, #1
+	b.gt	Lseal_tail_16_compose
+
+	and	v0.16b, v0.16b, v21.16b
+	eor	v20.16b, v20.16b, v0.16b
+	mov	v21.16b, v20.16b
+
+Lseal_tail_16_store:
+	umov	w11, v20.b[0]
+	strb	w11, [x0], #1
+	ext	v20.16b, v20.16b, v20.16b, #1
+	subs	x6, x6, #1
+	b.gt	Lseal_tail_16_store
+
+    // Hash in the final ct block concatenated with extra_in
+	mov	x11, v21.d[0]
+	mov	x12, v21.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+
+Lseal_hash_extra:
+	cbz	x4, Lseal_finalize
+
+Lseal_hash_extra_loop:
+	cmp	x4, #16
+	b.lt	Lseal_hash_extra_tail
+	ld1	{v20.16b}, [x3], #16
+	mov	x11, v20.d[0]
+	mov	x12, v20.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	sub	x4, x4, #16
+	b	Lseal_hash_extra_loop
+
+Lseal_hash_extra_tail:
+	cbz	x4, Lseal_finalize
+	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext
+	add	x3, x3, x4
+
+Lseal_hash_extra_load:
+	ext	v20.16b, v20.16b, v20.16b, #15
+	ldrb	w11, [x3, #-1]!
+	mov	v20.b[0], w11
+	subs	x4, x4, #1
+	b.gt	Lseal_hash_extra_load
+
+    // Hash in the final padded extra_in block
+	mov	x11, v20.d[0]
+	mov	x12, v20.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+
+Lseal_finalize:
+	mov	x11, v31.d[0]
+	mov	x12, v31.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+    // Final reduction step
+	sub	x12, xzr, x15
+	orr	x13, xzr, #3
+	subs	x11, x8, #-5
+	sbcs	x12, x9, x12
+	sbcs	x13, x10, x13
+	csel	x8, x11, x8, cs
+	csel	x9, x12, x9, cs
+	csel	x10, x13, x10, cs
+	mov	x11, v27.d[0]
+	mov	x12, v27.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+
+	stp	x8, x9, [x5]
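+    // In effect (sketch): the csel block above conditionally subtracts
+    // p = 2^130 - 5, keeping the result only if the subtraction produced no
+    // borrow, which fully reduces acc mod p; the tag stored here is then
+    //   tag = (acc + s) mod 2^128   // s in v27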
+
+	ldp	d8, d9, [sp, #16]
+	ldp	d10, d11, [sp, #32]
+	ldp	d12, d13, [sp, #48]
+	ldp	d14, d15, [sp, #64]
+.cfi_restore	b15
+.cfi_restore	b14
+.cfi_restore	b13
+.cfi_restore	b12
+.cfi_restore	b11
+.cfi_restore	b10
+.cfi_restore	b9
+.cfi_restore	b8
+	ldp	x29, x30, [sp], 80
+.cfi_restore	w29
+.cfi_restore	w30
+.cfi_def_cfa_offset	0
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+Lseal_128:
+    // On some architectures preparing 5 blocks for small buffers is wasteful
+	eor	v25.16b, v25.16b, v25.16b
+	mov	x11, #1
+	mov	v25.s[0], w11
+	mov	v0.16b, v24.16b
+	mov	v1.16b, v24.16b
+	mov	v2.16b, v24.16b
+	mov	v5.16b, v28.16b
+	mov	v6.16b, v28.16b
+	mov	v7.16b, v28.16b
+	mov	v10.16b, v29.16b
+	mov	v11.16b, v29.16b
+	mov	v12.16b, v29.16b
+	mov	v17.16b, v30.16b
+	add	v15.4s, v17.4s, v25.4s
+	add	v16.4s, v15.4s, v25.4s
+
+	mov	x6, #10
+
+Lseal_128_rounds:
+	add	v0.4s, v0.4s, v5.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v5.16b, v5.16b, v10.16b
+	eor	v6.16b, v6.16b, v11.16b
+	eor	v7.16b, v7.16b, v12.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	ushr	v5.4s, v6.4s, #20
+	sli	v5.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
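+    // The ushr+sli pairs implement a lane-wise 32-bit rotate left by 12,
+    // e.g. (sketch) v20 = (v5 >> 20) | (v5 << 12); rev32 above is the
+    // rotate by 16 and tbl with the ROL8 table in v26 is the rotate by 8,
+    // both cheaper than a shift pair.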
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v5.4s
+	add	v2.4s, v2.4s, v6.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v20.16b, v20.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v6.16b, v6.16b, v12.16b
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v5.4s, #25
+	sli	v6.4s, v5.4s, #7
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+
+	ext	v5.16b, v5.16b, v5.16b, #4
+	ext	v6.16b, v6.16b, v6.16b, #4
+	ext	v7.16b, v7.16b, v7.16b, #4
+
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v11.16b, v11.16b, v11.16b, #8
+	ext	v12.16b, v12.16b, v12.16b, #8
+
+	ext	v15.16b, v15.16b, v15.16b, #12
+	ext	v16.16b, v16.16b, v16.16b, #12
+	ext	v17.16b, v17.16b, v17.16b, #12
+	add	v0.4s, v0.4s, v5.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v5.16b, v5.16b, v10.16b
+	eor	v6.16b, v6.16b, v11.16b
+	eor	v7.16b, v7.16b, v12.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	ushr	v5.4s, v6.4s, #20
+	sli	v5.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v5.4s
+	add	v2.4s, v2.4s, v6.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v20.16b, v20.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v6.16b, v6.16b, v12.16b
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v5.4s, #25
+	sli	v6.4s, v5.4s, #7
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+
+	ext	v5.16b, v5.16b, v5.16b, #12
+	ext	v6.16b, v6.16b, v6.16b, #12
+	ext	v7.16b, v7.16b, v7.16b, #12
+
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v11.16b, v11.16b, v11.16b, #8
+	ext	v12.16b, v12.16b, v12.16b, #8
+
+	ext	v15.16b, v15.16b, v15.16b, #4
+	ext	v16.16b, v16.16b, v16.16b, #4
+	ext	v17.16b, v17.16b, v17.16b, #4
+	subs	x6, x6, #1
+	b.hi	Lseal_128_rounds
+
+	add	v0.4s, v0.4s, v24.4s
+	add	v1.4s, v1.4s, v24.4s
+	add	v2.4s, v2.4s, v24.4s
+
+	add	v5.4s, v5.4s, v28.4s
+	add	v6.4s, v6.4s, v28.4s
+	add	v7.4s, v7.4s, v28.4s
+
+    // Only the first 32 bytes of the third block (counter = 0) are needed,
+    // so skip updating v12 and v17.
+	add	v10.4s, v10.4s, v29.4s
+	add	v11.4s, v11.4s, v29.4s
+
+	add	v30.4s, v30.4s, v25.4s
+	add	v15.4s, v15.4s, v30.4s
+	add	v30.4s, v30.4s, v25.4s
+	add	v16.4s, v16.4s, v30.4s
+
+	and	v2.16b, v2.16b, v27.16b
+	mov	x16, v2.d[0] // Move the R key to GPRs
+	mov	x17, v2.d[1]
+	mov	v27.16b, v7.16b // Store the S key
+
+	bl	Lpoly_hash_ad_internal
+	b	Lseal_tail
+.cfi_endproc
+
+
+/////////////////////////////////
+//
+// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data);
+//
+.globl	_chacha20_poly1305_open
+.private_extern	_chacha20_poly1305_open
+
+.align	6
+_chacha20_poly1305_open:
+	AARCH64_SIGN_LINK_REGISTER
+.cfi_startproc
+	stp	x29, x30, [sp, #-80]!
+.cfi_def_cfa_offset	80
+.cfi_offset	w30, -72
+.cfi_offset	w29, -80
+	mov	x29, sp
+    // We probably could do .cfi_def_cfa w29, 80 at this point, but since
+    // we don't actually use the frame pointer like that, it's probably not
+    // worth bothering.
+	stp	d8, d9, [sp, #16]
+	stp	d10, d11, [sp, #32]
+	stp	d12, d13, [sp, #48]
+	stp	d14, d15, [sp, #64]
+.cfi_offset	b15, -8
+.cfi_offset	b14, -16
+.cfi_offset	b13, -24
+.cfi_offset	b12, -32
+.cfi_offset	b11, -40
+.cfi_offset	b10, -48
+.cfi_offset	b9, -56
+.cfi_offset	b8, -64
+
+	adrp	x11, Lchacha20_consts@PAGE
+	add	x11, x11, Lchacha20_consts@PAGEOFF
+
+	ld1	{v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
+	ld1	{v28.16b - v30.16b}, [x5]
+
+	mov	x15, #1 // Prepare the Poly1305 state
+	mov	x8, #0
+	mov	x9, #0
+	mov	x10, #0
+
+	mov	v31.d[0], x4  // Store the input and aad lengths
+	mov	v31.d[1], x2
+
+	cmp	x2, #128
+	b.le	Lopen_128 // Optimization for smaller buffers
+
+    // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys
+	mov	v0.16b, v24.16b
+	mov	v5.16b, v28.16b
+	mov	v10.16b, v29.16b
+	mov	v15.16b, v30.16b
+
+	mov	x6, #10
+
+.align	5
+Lopen_init_rounds:
+	add	v0.4s, v0.4s, v5.4s
+	eor	v15.16b, v15.16b, v0.16b
+	rev32	v15.8h, v15.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v5.16b, v5.16b, v10.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	add	v0.4s, v0.4s, v20.4s
+	eor	v15.16b, v15.16b, v0.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v20.16b, v20.16b, v10.16b
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+	ext	v5.16b, v5.16b, v5.16b, #4
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v15.16b, v15.16b, v15.16b, #12
+	add	v0.4s, v0.4s, v5.4s
+	eor	v15.16b, v15.16b, v0.16b
+	rev32	v15.8h, v15.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v5.16b, v5.16b, v10.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	add	v0.4s, v0.4s, v20.4s
+	eor	v15.16b, v15.16b, v0.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v20.16b, v20.16b, v10.16b
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+	ext	v5.16b, v5.16b, v5.16b, #12
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v15.16b, v15.16b, v15.16b, #4
+	subs	x6, x6, #1
+	b.hi	Lopen_init_rounds
+
+	add	v0.4s, v0.4s, v24.4s
+	add	v5.4s, v5.4s, v28.4s
+
+	and	v0.16b, v0.16b, v27.16b
+	mov	x16, v0.d[0] // Move the R key to GPRs
+	mov	x17, v0.d[1]
+	mov	v27.16b, v5.16b // Store the S key
+
+	bl	Lpoly_hash_ad_internal
+
+Lopen_ad_done:
+	mov	x3, x1
+
+// Each iteration of the loop hashes 320 bytes and prepares the stream for 320 bytes
+Lopen_main_loop:
+
+	cmp	x2, #192
+	b.lt	Lopen_tail
+
+	adrp	x11, Lchacha20_consts@PAGE
+	add	x11, x11, Lchacha20_consts@PAGEOFF
+
+	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
+	mov	v4.16b, v24.16b
+
+	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
+	mov	v9.16b, v28.16b
+
+	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
+	mov	v14.16b, v29.16b
+
+	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
+	sub	x5, x5, #32
+	add	v15.4s, v15.4s, v25.4s
+	mov	v19.16b, v30.16b
+
+	eor	v20.16b, v20.16b, v20.16b //zero
+	not	v21.16b, v20.16b // -1
+	sub	v21.4s, v25.4s, v21.4s // Add +1
+	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
+	add	v19.4s, v19.4s, v20.4s
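+    // Net effect of the four instructions above (sketch):
+    //   v20 = {INC[3] + 1, 0, 0, 0}
+    // so only the counter lane of the fifth, horizontally-laid-out block
+    // (v19) is advanced past the four vertically-computed blocks.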
+
+	lsr	x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12
+	sub	x4, x4, #10
+
+	mov	x7, #10
+	subs	x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash
+	csel	x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full
+
+	cbz	x7, Lopen_main_loop_rounds_short
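+    // Loop shape (sketch): each pass below is one ChaCha20 double-round over
+    // all five blocks, hashing 32 bytes of ciphertext along the way (16 at
+    // the top, 16 mid-round); entering at Lopen_main_loop_rounds_short skips
+    // the leading 16, and x7/x6 split the 10 double-rounds between the two
+    // entry points so only the available ciphertext is absorbed.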
+
+.align	5
+Lopen_main_loop_rounds:
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+Lopen_main_loop_rounds_short:
+	add	v0.4s, v0.4s, v5.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	add	v3.4s, v3.4s, v8.4s
+	add	v4.4s, v4.4s, v9.4s
+
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	eor	v18.16b, v18.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+	rev32	v18.8h, v18.8h
+	rev32	v19.8h, v19.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	add	v13.4s, v13.4s, v18.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v5.16b, v5.16b, v10.16b
+	eor	v6.16b, v6.16b, v11.16b
+	eor	v7.16b, v7.16b, v12.16b
+	eor	v8.16b, v8.16b, v13.16b
+	eor	v9.16b, v9.16b, v14.16b
+
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	ushr	v5.4s, v6.4s, #20
+	sli	v5.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+	ushr	v7.4s, v8.4s, #20
+	sli	v7.4s, v8.4s, #12
+	ushr	v8.4s, v9.4s, #20
+	sli	v8.4s, v9.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v5.4s
+	add	v2.4s, v2.4s, v6.4s
+	add	v3.4s, v3.4s, v7.4s
+	add	v4.4s, v4.4s, v8.4s
+
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	eor	v18.16b, v18.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+	tbl	v18.16b, {v18.16b}, v26.16b
+	tbl	v19.16b, {v19.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	add	v13.4s, v13.4s, v18.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v20.16b, v20.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v6.16b, v6.16b, v12.16b
+	eor	v7.16b, v7.16b, v13.16b
+	eor	v8.16b, v8.16b, v14.16b
+
+	ushr	v9.4s, v8.4s, #25
+	sli	v9.4s, v8.4s, #7
+	ushr	v8.4s, v7.4s, #25
+	sli	v8.4s, v7.4s, #7
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v5.4s, #25
+	sli	v6.4s, v5.4s, #7
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+
+	ext	v9.16b, v9.16b, v9.16b, #4
+	ext	v14.16b, v14.16b, v14.16b, #8
+	ext	v19.16b, v19.16b, v19.16b, #12
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	add	v0.4s, v0.4s, v6.4s
+	add	v1.4s, v1.4s, v7.4s
+	add	v2.4s, v2.4s, v8.4s
+	add	v3.4s, v3.4s, v5.4s
+	add	v4.4s, v4.4s, v9.4s
+
+	eor	v18.16b, v18.16b, v0.16b
+	eor	v15.16b, v15.16b, v1.16b
+	eor	v16.16b, v16.16b, v2.16b
+	eor	v17.16b, v17.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	rev32	v18.8h, v18.8h
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+	rev32	v19.8h, v19.8h
+
+	add	v12.4s, v12.4s, v18.4s
+	add	v13.4s, v13.4s, v15.4s
+	add	v10.4s, v10.4s, v16.4s
+	add	v11.4s, v11.4s, v17.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v6.16b, v6.16b, v12.16b
+	eor	v7.16b, v7.16b, v13.16b
+	eor	v8.16b, v8.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v9.16b, v9.16b, v14.16b
+
+	ushr	v20.4s, v6.4s, #20
+	sli	v20.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+	ushr	v7.4s, v8.4s, #20
+	sli	v7.4s, v8.4s, #12
+	ushr	v8.4s, v5.4s, #20
+	sli	v8.4s, v5.4s, #12
+	ushr	v5.4s, v9.4s, #20
+	sli	v5.4s, v9.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	add	v3.4s, v3.4s, v8.4s
+	add	v4.4s, v4.4s, v5.4s
+
+	eor	v18.16b, v18.16b, v0.16b
+	eor	v15.16b, v15.16b, v1.16b
+	eor	v16.16b, v16.16b, v2.16b
+	eor	v17.16b, v17.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	tbl	v18.16b, {v18.16b}, v26.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+	tbl	v19.16b, {v19.16b}, v26.16b
+
+	add	v12.4s, v12.4s, v18.4s
+	add	v13.4s, v13.4s, v15.4s
+	add	v10.4s, v10.4s, v16.4s
+	add	v11.4s, v11.4s, v17.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v20.16b, v20.16b, v12.16b
+	eor	v6.16b, v6.16b, v13.16b
+	eor	v7.16b, v7.16b, v10.16b
+	eor	v8.16b, v8.16b, v11.16b
+	eor	v5.16b, v5.16b, v14.16b
+
+	ushr	v9.4s, v5.4s, #25
+	sli	v9.4s, v5.4s, #7
+	ushr	v5.4s, v8.4s, #25
+	sli	v5.4s, v8.4s, #7
+	ushr	v8.4s, v7.4s, #25
+	sli	v8.4s, v7.4s, #7
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v20.4s, #25
+	sli	v6.4s, v20.4s, #7
+
+	ext	v9.16b, v9.16b, v9.16b, #12
+	ext	v14.16b, v14.16b, v14.16b, #8
+	ext	v19.16b, v19.16b, v19.16b, #4
+	subs	x7, x7, #1
+	b.gt	Lopen_main_loop_rounds
+	subs	x6, x6, #1
+	b.ge	Lopen_main_loop_rounds_short
+
+	eor	v20.16b, v20.16b, v20.16b //zero
+	not	v21.16b, v20.16b // -1
+	sub	v21.4s, v25.4s, v21.4s // Add +1
+	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
+	add	v19.4s, v19.4s, v20.4s
+
+	add	v15.4s, v15.4s, v25.4s
+	mov	x11, #5
+	dup	v20.4s, w11
+	add	v25.4s, v25.4s, v20.4s
+
+	zip1	v20.4s, v0.4s, v1.4s
+	zip2	v21.4s, v0.4s, v1.4s
+	zip1	v22.4s, v2.4s, v3.4s
+	zip2	v23.4s, v2.4s, v3.4s
+
+	zip1	v0.2d, v20.2d, v22.2d
+	zip2	v1.2d, v20.2d, v22.2d
+	zip1	v2.2d, v21.2d, v23.2d
+	zip2	v3.2d, v21.2d, v23.2d
+
+	zip1	v20.4s, v5.4s, v6.4s
+	zip2	v21.4s, v5.4s, v6.4s
+	zip1	v22.4s, v7.4s, v8.4s
+	zip2	v23.4s, v7.4s, v8.4s
+
+	zip1	v5.2d, v20.2d, v22.2d
+	zip2	v6.2d, v20.2d, v22.2d
+	zip1	v7.2d, v21.2d, v23.2d
+	zip2	v8.2d, v21.2d, v23.2d
+
+	zip1	v20.4s, v10.4s, v11.4s
+	zip2	v21.4s, v10.4s, v11.4s
+	zip1	v22.4s, v12.4s, v13.4s
+	zip2	v23.4s, v12.4s, v13.4s
+
+	zip1	v10.2d, v20.2d, v22.2d
+	zip2	v11.2d, v20.2d, v22.2d
+	zip1	v12.2d, v21.2d, v23.2d
+	zip2	v13.2d, v21.2d, v23.2d
+
+	zip1	v20.4s, v15.4s, v16.4s
+	zip2	v21.4s, v15.4s, v16.4s
+	zip1	v22.4s, v17.4s, v18.4s
+	zip2	v23.4s, v17.4s, v18.4s
+
+	zip1	v15.2d, v20.2d, v22.2d
+	zip2	v16.2d, v20.2d, v22.2d
+	zip1	v17.2d, v21.2d, v23.2d
+	zip2	v18.2d, v21.2d, v23.2d
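+    // The zip1/zip2 cascade above is a 4x4 transpose of 32-bit words per
+    // group of four registers (sketch):
+    //   t0 = zip1(a,b); t1 = zip2(a,b); t2 = zip1(c,d); t3 = zip2(c,d)
+    //   a = zip1.2d(t0,t2); b = zip2.2d(t0,t2)
+    //   c = zip1.2d(t1,t3); d = zip2.2d(t1,t3)
+    // turning the vertically-computed state (lane i of a..d = block i) back
+    // into one contiguous 64-byte keystream block per register group.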
+
+	add	v0.4s, v0.4s, v24.4s
+	add	v5.4s, v5.4s, v28.4s
+	add	v10.4s, v10.4s, v29.4s
+	add	v15.4s, v15.4s, v30.4s
+
+	add	v1.4s, v1.4s, v24.4s
+	add	v6.4s, v6.4s, v28.4s
+	add	v11.4s, v11.4s, v29.4s
+	add	v16.4s, v16.4s, v30.4s
+
+	add	v2.4s, v2.4s, v24.4s
+	add	v7.4s, v7.4s, v28.4s
+	add	v12.4s, v12.4s, v29.4s
+	add	v17.4s, v17.4s, v30.4s
+
+	add	v3.4s, v3.4s, v24.4s
+	add	v8.4s, v8.4s, v28.4s
+	add	v13.4s, v13.4s, v29.4s
+	add	v18.4s, v18.4s, v30.4s
+
+	add	v4.4s, v4.4s, v24.4s
+	add	v9.4s, v9.4s, v28.4s
+	add	v14.4s, v14.4s, v29.4s
+	add	v19.4s, v19.4s, v30.4s
+
+    // We can always safely store 192 bytes
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v0.16b
+	eor	v21.16b, v21.16b, v5.16b
+	eor	v22.16b, v22.16b, v10.16b
+	eor	v23.16b, v23.16b, v15.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v1.16b
+	eor	v21.16b, v21.16b, v6.16b
+	eor	v22.16b, v22.16b, v11.16b
+	eor	v23.16b, v23.16b, v16.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v2.16b
+	eor	v21.16b, v21.16b, v7.16b
+	eor	v22.16b, v22.16b, v12.16b
+	eor	v23.16b, v23.16b, v17.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	sub	x2, x2, #192
+
+	mov	v0.16b, v3.16b
+	mov	v5.16b, v8.16b
+	mov	v10.16b, v13.16b
+	mov	v15.16b, v18.16b
+
+	cmp	x2, #64
+	b.lt	Lopen_tail_64_store
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v3.16b
+	eor	v21.16b, v21.16b, v8.16b
+	eor	v22.16b, v22.16b, v13.16b
+	eor	v23.16b, v23.16b, v18.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	sub	x2, x2, #64
+
+	mov	v0.16b, v4.16b
+	mov	v5.16b, v9.16b
+	mov	v10.16b, v14.16b
+	mov	v15.16b, v19.16b
+
+	cmp	x2, #64
+	b.lt	Lopen_tail_64_store
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v4.16b
+	eor	v21.16b, v21.16b, v9.16b
+	eor	v22.16b, v22.16b, v14.16b
+	eor	v23.16b, v23.16b, v19.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	sub	x2, x2, #64
+	b	Lopen_main_loop
+
+Lopen_tail:
+
+	cbz	x2, Lopen_finalize
+
+	lsr	x4, x2, #4 // How many whole blocks we have to hash
+
+	cmp	x2, #64
+	b.le	Lopen_tail_64
+	cmp	x2, #128
+	b.le	Lopen_tail_128
+
+Lopen_tail_192:
+    // We need three more blocks
+	mov	v0.16b, v24.16b
+	mov	v1.16b, v24.16b
+	mov	v2.16b, v24.16b
+	mov	v5.16b, v28.16b
+	mov	v6.16b, v28.16b
+	mov	v7.16b, v28.16b
+	mov	v10.16b, v29.16b
+	mov	v11.16b, v29.16b
+	mov	v12.16b, v29.16b
+	mov	v15.16b, v30.16b
+	mov	v16.16b, v30.16b
+	mov	v17.16b, v30.16b
+	eor	v23.16b, v23.16b, v23.16b
+	eor	v21.16b, v21.16b, v21.16b
+	ins	v23.s[0], v25.s[0]
+	ins	v21.d[0], x15
+
+	add	v22.4s, v23.4s, v21.4s
+	add	v21.4s, v22.4s, v21.4s
+
+	add	v15.4s, v15.4s, v21.4s
+	add	v16.4s, v16.4s, v23.4s
+	add	v17.4s, v17.4s, v22.4s
+
+	mov	x7, #10
+	subs	x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash
+	csel	x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing
+	sub	x4, x4, x7
+
+	cbz	x7, Lopen_tail_192_rounds_no_hash
+
+Lopen_tail_192_rounds:
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+Lopen_tail_192_rounds_no_hash:
+	add	v0.4s, v0.4s, v5.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v5.16b, v5.16b, v10.16b
+	eor	v6.16b, v6.16b, v11.16b
+	eor	v7.16b, v7.16b, v12.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	ushr	v5.4s, v6.4s, #20
+	sli	v5.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v5.4s
+	add	v2.4s, v2.4s, v6.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v20.16b, v20.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v6.16b, v6.16b, v12.16b
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v5.4s, #25
+	sli	v6.4s, v5.4s, #7
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+
+	ext	v5.16b, v5.16b, v5.16b, #4
+	ext	v6.16b, v6.16b, v6.16b, #4
+	ext	v7.16b, v7.16b, v7.16b, #4
+
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v11.16b, v11.16b, v11.16b, #8
+	ext	v12.16b, v12.16b, v12.16b, #8
+
+	ext	v15.16b, v15.16b, v15.16b, #12
+	ext	v16.16b, v16.16b, v16.16b, #12
+	ext	v17.16b, v17.16b, v17.16b, #12
+	add	v0.4s, v0.4s, v5.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v5.16b, v5.16b, v10.16b
+	eor	v6.16b, v6.16b, v11.16b
+	eor	v7.16b, v7.16b, v12.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	ushr	v5.4s, v6.4s, #20
+	sli	v5.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v5.4s
+	add	v2.4s, v2.4s, v6.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v20.16b, v20.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v6.16b, v6.16b, v12.16b
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v5.4s, #25
+	sli	v6.4s, v5.4s, #7
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+
+	ext	v5.16b, v5.16b, v5.16b, #12
+	ext	v6.16b, v6.16b, v6.16b, #12
+	ext	v7.16b, v7.16b, v7.16b, #12
+
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v11.16b, v11.16b, v11.16b, #8
+	ext	v12.16b, v12.16b, v12.16b, #8
+
+	ext	v15.16b, v15.16b, v15.16b, #4
+	ext	v16.16b, v16.16b, v16.16b, #4
+	ext	v17.16b, v17.16b, v17.16b, #4
+	subs	x7, x7, #1
+	b.gt	Lopen_tail_192_rounds
+	subs	x6, x6, #1
+	b.ge	Lopen_tail_192_rounds_no_hash
+
+    // We hashed 160 bytes at most during the rounds; up to 32 bytes may still be left
+Lopen_tail_192_hash:
+	cbz	x4, Lopen_tail_192_hash_done
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	sub	x4, x4, #1
+	b	Lopen_tail_192_hash
+
+Lopen_tail_192_hash_done:
+
+	add	v0.4s, v0.4s, v24.4s
+	add	v1.4s, v1.4s, v24.4s
+	add	v2.4s, v2.4s, v24.4s
+	add	v5.4s, v5.4s, v28.4s
+	add	v6.4s, v6.4s, v28.4s
+	add	v7.4s, v7.4s, v28.4s
+	add	v10.4s, v10.4s, v29.4s
+	add	v11.4s, v11.4s, v29.4s
+	add	v12.4s, v12.4s, v29.4s
+	add	v15.4s, v15.4s, v30.4s
+	add	v16.4s, v16.4s, v30.4s
+	add	v17.4s, v17.4s, v30.4s
+
+	add	v15.4s, v15.4s, v21.4s
+	add	v16.4s, v16.4s, v23.4s
+	add	v17.4s, v17.4s, v22.4s
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+
+	eor	v20.16b, v20.16b, v1.16b
+	eor	v21.16b, v21.16b, v6.16b
+	eor	v22.16b, v22.16b, v11.16b
+	eor	v23.16b, v23.16b, v16.16b
+
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+
+	eor	v20.16b, v20.16b, v2.16b
+	eor	v21.16b, v21.16b, v7.16b
+	eor	v22.16b, v22.16b, v12.16b
+	eor	v23.16b, v23.16b, v17.16b
+
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	sub	x2, x2, #128
+	b	Lopen_tail_64_store
+
+Lopen_tail_128:
+    // We need two more blocks
+	mov	v0.16b, v24.16b
+	mov	v1.16b, v24.16b
+	mov	v5.16b, v28.16b
+	mov	v6.16b, v28.16b
+	mov	v10.16b, v29.16b
+	mov	v11.16b, v29.16b
+	mov	v15.16b, v30.16b
+	mov	v16.16b, v30.16b
+	eor	v23.16b, v23.16b, v23.16b
+	eor	v22.16b, v22.16b, v22.16b
+	ins	v23.s[0], v25.s[0]
+	ins	v22.d[0], x15
+	add	v22.4s, v22.4s, v23.4s
+
+	add	v15.4s, v15.4s, v22.4s
+	add	v16.4s, v16.4s, v23.4s
+
+	mov	x6, #10
+	sub	x6, x6, x4
+
+Lopen_tail_128_rounds:
+	add	v0.4s, v0.4s, v5.4s
+	eor	v15.16b, v15.16b, v0.16b
+	rev32	v15.8h, v15.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v5.16b, v5.16b, v10.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	add	v0.4s, v0.4s, v20.4s
+	eor	v15.16b, v15.16b, v0.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v20.16b, v20.16b, v10.16b
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+	ext	v5.16b, v5.16b, v5.16b, #4
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v15.16b, v15.16b, v15.16b, #12
+	add	v1.4s, v1.4s, v6.4s
+	eor	v16.16b, v16.16b, v1.16b
+	rev32	v16.8h, v16.8h
+
+	add	v11.4s, v11.4s, v16.4s
+	eor	v6.16b, v6.16b, v11.16b
+	ushr	v20.4s, v6.4s, #20
+	sli	v20.4s, v6.4s, #12
+	add	v1.4s, v1.4s, v20.4s
+	eor	v16.16b, v16.16b, v1.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+
+	add	v11.4s, v11.4s, v16.4s
+	eor	v20.16b, v20.16b, v11.16b
+	ushr	v6.4s, v20.4s, #25
+	sli	v6.4s, v20.4s, #7
+	ext	v6.16b, v6.16b, v6.16b, #4
+	ext	v11.16b, v11.16b, v11.16b, #8
+	ext	v16.16b, v16.16b, v16.16b, #12
+	add	v0.4s, v0.4s, v5.4s
+	eor	v15.16b, v15.16b, v0.16b
+	rev32	v15.8h, v15.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v5.16b, v5.16b, v10.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	add	v0.4s, v0.4s, v20.4s
+	eor	v15.16b, v15.16b, v0.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v20.16b, v20.16b, v10.16b
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+	ext	v5.16b, v5.16b, v5.16b, #12
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v15.16b, v15.16b, v15.16b, #4
+	add	v1.4s, v1.4s, v6.4s
+	eor	v16.16b, v16.16b, v1.16b
+	rev32	v16.8h, v16.8h
+
+	add	v11.4s, v11.4s, v16.4s
+	eor	v6.16b, v6.16b, v11.16b
+	ushr	v20.4s, v6.4s, #20
+	sli	v20.4s, v6.4s, #12
+	add	v1.4s, v1.4s, v20.4s
+	eor	v16.16b, v16.16b, v1.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+
+	add	v11.4s, v11.4s, v16.4s
+	eor	v20.16b, v20.16b, v11.16b
+	ushr	v6.4s, v20.4s, #25
+	sli	v6.4s, v20.4s, #7
+	ext	v6.16b, v6.16b, v6.16b, #12
+	ext	v11.16b, v11.16b, v11.16b, #8
+	ext	v16.16b, v16.16b, v16.16b, #4
+	subs	x6, x6, #1
+	b.gt	Lopen_tail_128_rounds
+	cbz	x4, Lopen_tail_128_rounds_done
+	subs	x4, x4, #1
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	b	Lopen_tail_128_rounds
+
+Lopen_tail_128_rounds_done:
+	add	v0.4s, v0.4s, v24.4s
+	add	v1.4s, v1.4s, v24.4s
+	add	v5.4s, v5.4s, v28.4s
+	add	v6.4s, v6.4s, v28.4s
+	add	v10.4s, v10.4s, v29.4s
+	add	v11.4s, v11.4s, v29.4s
+	add	v15.4s, v15.4s, v30.4s
+	add	v16.4s, v16.4s, v30.4s
+	add	v15.4s, v15.4s, v22.4s
+	add	v16.4s, v16.4s, v23.4s
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+
+	eor	v20.16b, v20.16b, v1.16b
+	eor	v21.16b, v21.16b, v6.16b
+	eor	v22.16b, v22.16b, v11.16b
+	eor	v23.16b, v23.16b, v16.16b
+
+	st1	{v20.16b - v23.16b}, [x0], #64
+	sub	x2, x2, #64
+
+	b	Lopen_tail_64_store
+
+Lopen_tail_64:
+    // We just need a single block
+	mov	v0.16b, v24.16b
+	mov	v5.16b, v28.16b
+	mov	v10.16b, v29.16b
+	mov	v15.16b, v30.16b
+	eor	v23.16b, v23.16b, v23.16b
+	ins	v23.s[0], v25.s[0]
+	add	v15.4s, v15.4s, v23.4s
+
+	mov	x6, #10
+	sub	x6, x6, x4
+
+Lopen_tail_64_rounds:
+	add	v0.4s, v0.4s, v5.4s
+	eor	v15.16b, v15.16b, v0.16b
+	rev32	v15.8h, v15.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v5.16b, v5.16b, v10.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	add	v0.4s, v0.4s, v20.4s
+	eor	v15.16b, v15.16b, v0.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v20.16b, v20.16b, v10.16b
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+	ext	v5.16b, v5.16b, v5.16b, #4
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v15.16b, v15.16b, v15.16b, #12
+	add	v0.4s, v0.4s, v5.4s
+	eor	v15.16b, v15.16b, v0.16b
+	rev32	v15.8h, v15.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v5.16b, v5.16b, v10.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	add	v0.4s, v0.4s, v20.4s
+	eor	v15.16b, v15.16b, v0.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v20.16b, v20.16b, v10.16b
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+	ext	v5.16b, v5.16b, v5.16b, #12
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v15.16b, v15.16b, v15.16b, #4
+	subs	x6, x6, #1
+	b.gt	Lopen_tail_64_rounds
+	cbz	x4, Lopen_tail_64_rounds_done
+	subs	x4, x4, #1
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	b	Lopen_tail_64_rounds
+
+Lopen_tail_64_rounds_done:
+	add	v0.4s, v0.4s, v24.4s
+	add	v5.4s, v5.4s, v28.4s
+	add	v10.4s, v10.4s, v29.4s
+	add	v15.4s, v15.4s, v30.4s
+	add	v15.4s, v15.4s, v23.4s
+
+Lopen_tail_64_store:
+	cmp	x2, #16
+	b.lt	Lopen_tail_16
+
+	ld1	{v20.16b}, [x1], #16
+	eor	v20.16b, v20.16b, v0.16b
+	st1	{v20.16b}, [x0], #16
+	mov	v0.16b, v5.16b
+	mov	v5.16b, v10.16b
+	mov	v10.16b, v15.16b
+	sub	x2, x2, #16
+	b	Lopen_tail_64_store
+
+Lopen_tail_16:
+    // Here we handle the last [0,16) bytes that require a padded block
+	cbz	x2, Lopen_finalize
+
+	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext
+	eor	v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask
+	not	v22.16b, v20.16b
+
+	add	x7, x1, x2
+	mov	x6, x2
+
+Lopen_tail_16_compose:
+	ext	v20.16b, v20.16b, v20.16b, #15
+	ldrb	w11, [x7, #-1]!
+	mov	v20.b[0], w11
+	ext	v21.16b, v22.16b, v21.16b, #15
+	subs	x2, x2, #1
+	b.gt	Lopen_tail_16_compose
+
+	and	v20.16b, v20.16b, v21.16b
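+    // Sketch: v20 now holds the last n ciphertext bytes (n = x6) in its low
+    // n bytes and v21 has 0xff in exactly those positions, so the AND keeps
+    // only loaded ciphertext before the block is hashed zero-padded below.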
+    // Hash in the final padded block
+	mov	x11, v20.d[0]
+	mov	x12, v20.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	eor	v20.16b, v20.16b, v0.16b
+
+Lopen_tail_16_store:
+	umov	w11, v20.b[0]
+	strb	w11, [x0], #1
+	ext	v20.16b, v20.16b, v20.16b, #1
+	subs	x6, x6, #1
+	b.gt	Lopen_tail_16_store
+
+Lopen_finalize:
+	mov	x11, v31.d[0]
+	mov	x12, v31.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+    // Final reduction step
+	sub	x12, xzr, x15
+	orr	x13, xzr, #3
+	subs	x11, x8, #-5
+	sbcs	x12, x9, x12
+	sbcs	x13, x10, x13
+	csel	x8, x11, x8, cs
+	csel	x9, x12, x9, cs
+	csel	x10, x13, x10, cs
+	mov	x11, v27.d[0]
+	mov	x12, v27.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+
+	stp	x8, x9, [x5]
+
+	ldp	d8, d9, [sp, #16]
+	ldp	d10, d11, [sp, #32]
+	ldp	d12, d13, [sp, #48]
+	ldp	d14, d15, [sp, #64]
+.cfi_restore	b15
+.cfi_restore	b14
+.cfi_restore	b13
+.cfi_restore	b12
+.cfi_restore	b11
+.cfi_restore	b10
+.cfi_restore	b9
+.cfi_restore	b8
+	ldp	x29, x30, [sp], 80
+.cfi_restore	w29
+.cfi_restore	w30
+.cfi_def_cfa_offset	0
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+Lopen_128:
+    // On some architectures preparing 5 blocks for small buffers is wasteful
+	eor	v25.16b, v25.16b, v25.16b
+	mov	x11, #1
+	mov	v25.s[0], w11
+	mov	v0.16b, v24.16b
+	mov	v1.16b, v24.16b
+	mov	v2.16b, v24.16b
+	mov	v5.16b, v28.16b
+	mov	v6.16b, v28.16b
+	mov	v7.16b, v28.16b
+	mov	v10.16b, v29.16b
+	mov	v11.16b, v29.16b
+	mov	v12.16b, v29.16b
+	mov	v17.16b, v30.16b
+	add	v15.4s, v17.4s, v25.4s
+	add	v16.4s, v15.4s, v25.4s
+
+	mov	x6, #10
+
+Lopen_128_rounds:
+	add	v0.4s, v0.4s, v5.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v5.16b, v5.16b, v10.16b
+	eor	v6.16b, v6.16b, v11.16b
+	eor	v7.16b, v7.16b, v12.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	ushr	v5.4s, v6.4s, #20
+	sli	v5.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v5.4s
+	add	v2.4s, v2.4s, v6.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v20.16b, v20.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v6.16b, v6.16b, v12.16b
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v5.4s, #25
+	sli	v6.4s, v5.4s, #7
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+
+	ext	v5.16b, v5.16b, v5.16b, #4
+	ext	v6.16b, v6.16b, v6.16b, #4
+	ext	v7.16b, v7.16b, v7.16b, #4
+
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v11.16b, v11.16b, v11.16b, #8
+	ext	v12.16b, v12.16b, v12.16b, #8
+
+	ext	v15.16b, v15.16b, v15.16b, #12
+	ext	v16.16b, v16.16b, v16.16b, #12
+	ext	v17.16b, v17.16b, v17.16b, #12
+	add	v0.4s, v0.4s, v5.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v5.16b, v5.16b, v10.16b
+	eor	v6.16b, v6.16b, v11.16b
+	eor	v7.16b, v7.16b, v12.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	ushr	v5.4s, v6.4s, #20
+	sli	v5.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v5.4s
+	add	v2.4s, v2.4s, v6.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v20.16b, v20.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v6.16b, v6.16b, v12.16b
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v5.4s, #25
+	sli	v6.4s, v5.4s, #7
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+
+	ext	v5.16b, v5.16b, v5.16b, #12
+	ext	v6.16b, v6.16b, v6.16b, #12
+	ext	v7.16b, v7.16b, v7.16b, #12
+
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v11.16b, v11.16b, v11.16b, #8
+	ext	v12.16b, v12.16b, v12.16b, #8
+
+	ext	v15.16b, v15.16b, v15.16b, #4
+	ext	v16.16b, v16.16b, v16.16b, #4
+	ext	v17.16b, v17.16b, v17.16b, #4
+	subs	x6, x6, #1
+	b.hi	Lopen_128_rounds
+
+	add	v0.4s, v0.4s, v24.4s
+	add	v1.4s, v1.4s, v24.4s
+	add	v2.4s, v2.4s, v24.4s
+
+	add	v5.4s, v5.4s, v28.4s
+	add	v6.4s, v6.4s, v28.4s
+	add	v7.4s, v7.4s, v28.4s
+
+	add	v10.4s, v10.4s, v29.4s
+	add	v11.4s, v11.4s, v29.4s
+
+	add	v30.4s, v30.4s, v25.4s
+	add	v15.4s, v15.4s, v30.4s
+	add	v30.4s, v30.4s, v25.4s
+	add	v16.4s, v16.4s, v30.4s
+
+	and	v2.16b, v2.16b, v27.16b
+	mov	x16, v2.d[0] // Move the R key to GPRs
+	mov	x17, v2.d[1]
+	mov	v27.16b, v7.16b // Store the S key
+
+	bl	Lpoly_hash_ad_internal
+
+Lopen_128_store:
+	cmp	x2, #64
+	b.lt	Lopen_128_store_64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+
+	mov	x11, v20.d[0]
+	mov	x12, v20.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	mov	x11, v21.d[0]
+	mov	x12, v21.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	mov	x11, v22.d[0]
+	mov	x12, v22.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	mov	x11, v23.d[0]
+	mov	x12, v23.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+
+	eor	v20.16b, v20.16b, v0.16b
+	eor	v21.16b, v21.16b, v5.16b
+	eor	v22.16b, v22.16b, v10.16b
+	eor	v23.16b, v23.16b, v15.16b
+
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	sub	x2, x2, #64
+
+	mov	v0.16b, v1.16b
+	mov	v5.16b, v6.16b
+	mov	v10.16b, v11.16b
+	mov	v15.16b, v16.16b
+
+Lopen_128_store_64:
+
+	lsr	x4, x2, #4
+	mov	x3, x1
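+    // x4 = whole 16-byte ciphertext blocks left to hash; absorb them all,
+    // then fall through to the byte-wise tail via Lopen_tail_64_store.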
+
+Lopen_128_hash_64:
+	cbz	x4, Lopen_tail_64_store
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	sub	x4, x4, #1
+	b	Lopen_128_hash_64
+.cfi_endproc
+
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/gen/crypto/chacha20_poly1305_armv8-linux.S b/gen/crypto/chacha20_poly1305_armv8-linux.S
new file mode 100644
index 0000000..7d2db8d
--- /dev/null
+++ b/gen/crypto/chacha20_poly1305_armv8-linux.S
@@ -0,0 +1,3009 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+.section	.rodata
+
+.align	7
+.Lchacha20_consts:
+.byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+.Linc:
+.long	1,2,3,4
+.Lrol8:
+.byte	3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+.Lclamp:
+.quad	0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
+
+.text
+
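+// Absorbs the additional data into the Poly1305 state: whole 16-byte blocks
+// first, then one zero-padded block for any remainder. Register contract
+// (a sketch inferred from the callers, not stated in the source): x3 = ad,
+// x4 = ad_len, clamped key r in x17:x16, accumulator in x10:x9:x8, x15 = 1.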
+.type	.Lpoly_hash_ad_internal,%function
+.align	6
+.Lpoly_hash_ad_internal:
+.cfi_startproc
+	cbnz	x4, .Lpoly_hash_intro
+	ret
+
+.Lpoly_hash_intro:
+	cmp	x4, #16
+	b.lt	.Lpoly_hash_ad_tail
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	sub	x4, x4, #16
+	b	.Lpoly_hash_ad_internal
+
+.Lpoly_hash_ad_tail:
+	cbz	x4, .Lpoly_hash_ad_ret
+
+	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the AAD
+	sub	x4, x4, #1
+
+.Lpoly_hash_tail_16_compose:
+	ext	v20.16b, v20.16b, v20.16b, #15
+	ldrb	w11, [x3, x4]
+	mov	v20.b[0], w11
+	subs	x4, x4, #1
+	b.ge	.Lpoly_hash_tail_16_compose
+	mov	x11, v20.d[0]
+	mov	x12, v20.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+
+.Lpoly_hash_ad_ret:
+	ret
+.cfi_endproc
+.size	.Lpoly_hash_ad_internal, .-.Lpoly_hash_ad_internal
+
+/////////////////////////////////
+//
+// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data);
+//
+.globl	chacha20_poly1305_seal
+.hidden	chacha20_poly1305_seal
+.type	chacha20_poly1305_seal,%function
+.align	6
+chacha20_poly1305_seal:
+	AARCH64_SIGN_LINK_REGISTER
+.cfi_startproc
+	stp	x29, x30, [sp, #-80]!
+.cfi_def_cfa_offset	80
+.cfi_offset	w30, -72
+.cfi_offset	w29, -80
+	mov	x29, sp
+    // We probably could do .cfi_def_cfa w29, 80 at this point, but since
+    // we don't actually use the frame pointer like that, it's probably not
+    // worth bothering.
+	stp	d8, d9, [sp, #16]
+	stp	d10, d11, [sp, #32]
+	stp	d12, d13, [sp, #48]
+	stp	d14, d15, [sp, #64]
+.cfi_offset	b15, -8
+.cfi_offset	b14, -16
+.cfi_offset	b13, -24
+.cfi_offset	b12, -32
+.cfi_offset	b11, -40
+.cfi_offset	b10, -48
+.cfi_offset	b9, -56
+.cfi_offset	b8, -64
+
+	adrp	x11, .Lchacha20_consts
+	add	x11, x11, :lo12:.Lchacha20_consts
+
+	ld1	{v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
+	ld1	{v28.16b - v30.16b}, [x5]
+
+	mov	x15, #1 // Prepare the Poly1305 state
+	mov	x8, #0
+	mov	x9, #0
+	mov	x10, #0
+
+	ldr	x12, [x5, #56]   // The total cipher text length includes extra_in_len
+	add	x12, x12, x2
+	mov	v31.d[0], x4  // Store the input and aad lengths
+	mov	v31.d[1], x12
+
+	cmp	x2, #128
+	b.le	.Lseal_128 // Optimization for smaller buffers
+
+    // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext,
+    // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically,
+    // the fifth block (A4-D4) horizontally.
+	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
+	mov	v4.16b, v24.16b
+
+	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
+	mov	v9.16b, v28.16b
+
+	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
+	mov	v14.16b, v29.16b
+
+	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
+	add	v15.4s, v15.4s, v25.4s
+	mov	v19.16b, v30.16b
+
+	sub	x5, x5, #32
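+    // Vertical register layout (sketch): lane i of v0..v3 holds state words
+    // 0-3 of block i, v5..v8 words 4-7, v10..v13 words 8-11 and v15..v18
+    // words 12-15 (ld4r replicates each loaded word across a register);
+    // v4/v9/v14/v19 carry the fifth block in the normal horizontal layout.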
+
+	mov	x6, #10
+
+.align	5
+.Lseal_init_rounds:
+	add	v0.4s, v0.4s, v5.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	add	v3.4s, v3.4s, v8.4s
+	add	v4.4s, v4.4s, v9.4s
+
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	eor	v18.16b, v18.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+	rev32	v18.8h, v18.8h
+	rev32	v19.8h, v19.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	add	v13.4s, v13.4s, v18.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v5.16b, v5.16b, v10.16b
+	eor	v6.16b, v6.16b, v11.16b
+	eor	v7.16b, v7.16b, v12.16b
+	eor	v8.16b, v8.16b, v13.16b
+	eor	v9.16b, v9.16b, v14.16b
+
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	ushr	v5.4s, v6.4s, #20
+	sli	v5.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+	ushr	v7.4s, v8.4s, #20
+	sli	v7.4s, v8.4s, #12
+	ushr	v8.4s, v9.4s, #20
+	sli	v8.4s, v9.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v5.4s
+	add	v2.4s, v2.4s, v6.4s
+	add	v3.4s, v3.4s, v7.4s
+	add	v4.4s, v4.4s, v8.4s
+
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	eor	v18.16b, v18.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+	tbl	v18.16b, {v18.16b}, v26.16b
+	tbl	v19.16b, {v19.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	add	v13.4s, v13.4s, v18.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v20.16b, v20.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v6.16b, v6.16b, v12.16b
+	eor	v7.16b, v7.16b, v13.16b
+	eor	v8.16b, v8.16b, v14.16b
+
+	ushr	v9.4s, v8.4s, #25
+	sli	v9.4s, v8.4s, #7
+	ushr	v8.4s, v7.4s, #25
+	sli	v8.4s, v7.4s, #7
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v5.4s, #25
+	sli	v6.4s, v5.4s, #7
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+
+	ext	v9.16b, v9.16b, v9.16b, #4
+	ext	v14.16b, v14.16b, v14.16b, #8
+	ext	v19.16b, v19.16b, v19.16b, #12
+	add	v0.4s, v0.4s, v6.4s
+	add	v1.4s, v1.4s, v7.4s
+	add	v2.4s, v2.4s, v8.4s
+	add	v3.4s, v3.4s, v5.4s
+	add	v4.4s, v4.4s, v9.4s
+
+	eor	v18.16b, v18.16b, v0.16b
+	eor	v15.16b, v15.16b, v1.16b
+	eor	v16.16b, v16.16b, v2.16b
+	eor	v17.16b, v17.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	rev32	v18.8h, v18.8h
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+	rev32	v19.8h, v19.8h
+
+	add	v12.4s, v12.4s, v18.4s
+	add	v13.4s, v13.4s, v15.4s
+	add	v10.4s, v10.4s, v16.4s
+	add	v11.4s, v11.4s, v17.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v6.16b, v6.16b, v12.16b
+	eor	v7.16b, v7.16b, v13.16b
+	eor	v8.16b, v8.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v9.16b, v9.16b, v14.16b
+
+	ushr	v20.4s, v6.4s, #20
+	sli	v20.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+	ushr	v7.4s, v8.4s, #20
+	sli	v7.4s, v8.4s, #12
+	ushr	v8.4s, v5.4s, #20
+	sli	v8.4s, v5.4s, #12
+	ushr	v5.4s, v9.4s, #20
+	sli	v5.4s, v9.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	add	v3.4s, v3.4s, v8.4s
+	add	v4.4s, v4.4s, v5.4s
+
+	eor	v18.16b, v18.16b, v0.16b
+	eor	v15.16b, v15.16b, v1.16b
+	eor	v16.16b, v16.16b, v2.16b
+	eor	v17.16b, v17.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	tbl	v18.16b, {v18.16b}, v26.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+	tbl	v19.16b, {v19.16b}, v26.16b
+
+	add	v12.4s, v12.4s, v18.4s
+	add	v13.4s, v13.4s, v15.4s
+	add	v10.4s, v10.4s, v16.4s
+	add	v11.4s, v11.4s, v17.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v20.16b, v20.16b, v12.16b
+	eor	v6.16b, v6.16b, v13.16b
+	eor	v7.16b, v7.16b, v10.16b
+	eor	v8.16b, v8.16b, v11.16b
+	eor	v5.16b, v5.16b, v14.16b
+
+	ushr	v9.4s, v5.4s, #25
+	sli	v9.4s, v5.4s, #7
+	ushr	v5.4s, v8.4s, #25
+	sli	v5.4s, v8.4s, #7
+	ushr	v8.4s, v7.4s, #25
+	sli	v8.4s, v7.4s, #7
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v20.4s, #25
+	sli	v6.4s, v20.4s, #7
+
+	ext	v9.16b, v9.16b, v9.16b, #12
+	ext	v14.16b, v14.16b, v14.16b, #8
+	ext	v19.16b, v19.16b, v19.16b, #4
+	subs	x6, x6, #1
+	b.hi	.Lseal_init_rounds
+
+	add	v15.4s, v15.4s, v25.4s
+	mov	x11, #4
+	dup	v20.4s, w11
+	add	v25.4s, v25.4s, v20.4s
+
+	zip1	v20.4s, v0.4s, v1.4s
+	zip2	v21.4s, v0.4s, v1.4s
+	zip1	v22.4s, v2.4s, v3.4s
+	zip2	v23.4s, v2.4s, v3.4s
+
+	zip1	v0.2d, v20.2d, v22.2d
+	zip2	v1.2d, v20.2d, v22.2d
+	zip1	v2.2d, v21.2d, v23.2d
+	zip2	v3.2d, v21.2d, v23.2d
+
+	zip1	v20.4s, v5.4s, v6.4s
+	zip2	v21.4s, v5.4s, v6.4s
+	zip1	v22.4s, v7.4s, v8.4s
+	zip2	v23.4s, v7.4s, v8.4s
+
+	zip1	v5.2d, v20.2d, v22.2d
+	zip2	v6.2d, v20.2d, v22.2d
+	zip1	v7.2d, v21.2d, v23.2d
+	zip2	v8.2d, v21.2d, v23.2d
+
+	zip1	v20.4s, v10.4s, v11.4s
+	zip2	v21.4s, v10.4s, v11.4s
+	zip1	v22.4s, v12.4s, v13.4s
+	zip2	v23.4s, v12.4s, v13.4s
+
+	zip1	v10.2d, v20.2d, v22.2d
+	zip2	v11.2d, v20.2d, v22.2d
+	zip1	v12.2d, v21.2d, v23.2d
+	zip2	v13.2d, v21.2d, v23.2d
+
+	zip1	v20.4s, v15.4s, v16.4s
+	zip2	v21.4s, v15.4s, v16.4s
+	zip1	v22.4s, v17.4s, v18.4s
+	zip2	v23.4s, v17.4s, v18.4s
+
+	zip1	v15.2d, v20.2d, v22.2d
+	zip2	v16.2d, v20.2d, v22.2d
+	zip1	v17.2d, v21.2d, v23.2d
+	zip2	v18.2d, v21.2d, v23.2d
+
+	add	v4.4s, v4.4s, v24.4s
+	add	v9.4s, v9.4s, v28.4s
+	and	v4.16b, v4.16b, v27.16b
+
+	add	v0.4s, v0.4s, v24.4s
+	add	v5.4s, v5.4s, v28.4s
+	add	v10.4s, v10.4s, v29.4s
+	add	v15.4s, v15.4s, v30.4s
+
+	add	v1.4s, v1.4s, v24.4s
+	add	v6.4s, v6.4s, v28.4s
+	add	v11.4s, v11.4s, v29.4s
+	add	v16.4s, v16.4s, v30.4s
+
+	add	v2.4s, v2.4s, v24.4s
+	add	v7.4s, v7.4s, v28.4s
+	add	v12.4s, v12.4s, v29.4s
+	add	v17.4s, v17.4s, v30.4s
+
+	add	v3.4s, v3.4s, v24.4s
+	add	v8.4s, v8.4s, v28.4s
+	add	v13.4s, v13.4s, v29.4s
+	add	v18.4s, v18.4s, v30.4s
+
+	mov	x16, v4.d[0] // Move the R key to GPRs
+	mov	x17, v4.d[1]
+	mov	v27.16b, v9.16b // Store the S key
+
+	bl	.Lpoly_hash_ad_internal
+
+	mov	x3, x0
+	cmp	x2, #256
+	b.le	.Lseal_tail
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v0.16b
+	eor	v21.16b, v21.16b, v5.16b
+	eor	v22.16b, v22.16b, v10.16b
+	eor	v23.16b, v23.16b, v15.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v1.16b
+	eor	v21.16b, v21.16b, v6.16b
+	eor	v22.16b, v22.16b, v11.16b
+	eor	v23.16b, v23.16b, v16.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v2.16b
+	eor	v21.16b, v21.16b, v7.16b
+	eor	v22.16b, v22.16b, v12.16b
+	eor	v23.16b, v23.16b, v17.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v3.16b
+	eor	v21.16b, v21.16b, v8.16b
+	eor	v22.16b, v22.16b, v13.16b
+	eor	v23.16b, v23.16b, v18.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	sub	x2, x2, #256
+
+	mov	x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds
+	mov	x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256
+
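+    // Main seal loop: each iteration prepares five ChaCha20 blocks (320 bytes
+    // of key stream) while hashing ciphertext from the previous iteration,
+    // then encrypts and stores the next 320 bytes.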
+.Lseal_main_loop:
+	adrp	x11, .Lchacha20_consts
+	add	x11, x11, :lo12:.Lchacha20_consts
+
+	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
+	mov	v4.16b, v24.16b
+
+	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
+	mov	v9.16b, v28.16b
+
+	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
+	mov	v14.16b, v29.16b
+
+	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
+	add	v15.4s, v15.4s, v25.4s
+	mov	v19.16b, v30.16b
+
+	eor	v20.16b, v20.16b, v20.16b //zero
+	not	v21.16b, v20.16b // -1
+	sub	v21.4s, v25.4s, v21.4s // Add +1
+	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
+	add	v19.4s, v19.4s, v20.4s
+
+	sub	x5, x5, #32
+.align	5
+.Lseal_main_loop_rounds:
+	add	v0.4s, v0.4s, v5.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	add	v3.4s, v3.4s, v8.4s
+	add	v4.4s, v4.4s, v9.4s
+
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	eor	v18.16b, v18.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+	rev32	v18.8h, v18.8h
+	rev32	v19.8h, v19.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	add	v13.4s, v13.4s, v18.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v5.16b, v5.16b, v10.16b
+	eor	v6.16b, v6.16b, v11.16b
+	eor	v7.16b, v7.16b, v12.16b
+	eor	v8.16b, v8.16b, v13.16b
+	eor	v9.16b, v9.16b, v14.16b
+
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	ushr	v5.4s, v6.4s, #20
+	sli	v5.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+	ushr	v7.4s, v8.4s, #20
+	sli	v7.4s, v8.4s, #12
+	ushr	v8.4s, v9.4s, #20
+	sli	v8.4s, v9.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v5.4s
+	add	v2.4s, v2.4s, v6.4s
+	add	v3.4s, v3.4s, v7.4s
+	add	v4.4s, v4.4s, v8.4s
+
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	eor	v18.16b, v18.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+	tbl	v18.16b, {v18.16b}, v26.16b
+	tbl	v19.16b, {v19.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	add	v13.4s, v13.4s, v18.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v20.16b, v20.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v6.16b, v6.16b, v12.16b
+	eor	v7.16b, v7.16b, v13.16b
+	eor	v8.16b, v8.16b, v14.16b
+
+	ushr	v9.4s, v8.4s, #25
+	sli	v9.4s, v8.4s, #7
+	ushr	v8.4s, v7.4s, #25
+	sli	v8.4s, v7.4s, #7
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v5.4s, #25
+	sli	v6.4s, v5.4s, #7
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+
+	ext	v9.16b, v9.16b, v9.16b, #4
+	ext	v14.16b, v14.16b, v14.16b, #8
+	ext	v19.16b, v19.16b, v19.16b, #12
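+    // Interleave one Poly1305 block between the ChaCha20 rounds. In
+    // pseudocode, with p = 2^130 - 5 and the accumulator acc in [x10:x9:x8]:
+    //   acc += block | (1 << 128)
+    //   acc  = (acc * r) mod p   (partially reduced; acc2 stays small)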
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	add	v0.4s, v0.4s, v6.4s
+	add	v1.4s, v1.4s, v7.4s
+	add	v2.4s, v2.4s, v8.4s
+	add	v3.4s, v3.4s, v5.4s
+	add	v4.4s, v4.4s, v9.4s
+
+	eor	v18.16b, v18.16b, v0.16b
+	eor	v15.16b, v15.16b, v1.16b
+	eor	v16.16b, v16.16b, v2.16b
+	eor	v17.16b, v17.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	rev32	v18.8h, v18.8h
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+	rev32	v19.8h, v19.8h
+
+	add	v12.4s, v12.4s, v18.4s
+	add	v13.4s, v13.4s, v15.4s
+	add	v10.4s, v10.4s, v16.4s
+	add	v11.4s, v11.4s, v17.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v6.16b, v6.16b, v12.16b
+	eor	v7.16b, v7.16b, v13.16b
+	eor	v8.16b, v8.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v9.16b, v9.16b, v14.16b
+
+	ushr	v20.4s, v6.4s, #20
+	sli	v20.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+	ushr	v7.4s, v8.4s, #20
+	sli	v7.4s, v8.4s, #12
+	ushr	v8.4s, v5.4s, #20
+	sli	v8.4s, v5.4s, #12
+	ushr	v5.4s, v9.4s, #20
+	sli	v5.4s, v9.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	add	v3.4s, v3.4s, v8.4s
+	add	v4.4s, v4.4s, v5.4s
+
+	eor	v18.16b, v18.16b, v0.16b
+	eor	v15.16b, v15.16b, v1.16b
+	eor	v16.16b, v16.16b, v2.16b
+	eor	v17.16b, v17.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	tbl	v18.16b, {v18.16b}, v26.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+	tbl	v19.16b, {v19.16b}, v26.16b
+
+	add	v12.4s, v12.4s, v18.4s
+	add	v13.4s, v13.4s, v15.4s
+	add	v10.4s, v10.4s, v16.4s
+	add	v11.4s, v11.4s, v17.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v20.16b, v20.16b, v12.16b
+	eor	v6.16b, v6.16b, v13.16b
+	eor	v7.16b, v7.16b, v10.16b
+	eor	v8.16b, v8.16b, v11.16b
+	eor	v5.16b, v5.16b, v14.16b
+
+	ushr	v9.4s, v5.4s, #25
+	sli	v9.4s, v5.4s, #7
+	ushr	v5.4s, v8.4s, #25
+	sli	v5.4s, v8.4s, #7
+	ushr	v8.4s, v7.4s, #25
+	sli	v8.4s, v7.4s, #7
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v20.4s, #25
+	sli	v6.4s, v20.4s, #7
+
+	ext	v9.16b, v9.16b, v9.16b, #12
+	ext	v14.16b, v14.16b, v14.16b, #8
+	ext	v19.16b, v19.16b, v19.16b, #4
+	subs	x6, x6, #1
+	b.ge	.Lseal_main_loop_rounds
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	subs	x7, x7, #1
+	b.gt	.Lseal_main_loop_rounds
+
+	eor	v20.16b, v20.16b, v20.16b //zero
+	not	v21.16b, v20.16b // -1
+	sub	v21.4s, v25.4s, v21.4s // Add +1
+	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
+	add	v19.4s, v19.4s, v20.4s
+
+	add	v15.4s, v15.4s, v25.4s
+	mov	x11, #5
+	dup	v20.4s, w11
+	add	v25.4s, v25.4s, v20.4s
+
+	zip1	v20.4s, v0.4s, v1.4s
+	zip2	v21.4s, v0.4s, v1.4s
+	zip1	v22.4s, v2.4s, v3.4s
+	zip2	v23.4s, v2.4s, v3.4s
+
+	zip1	v0.2d, v20.2d, v22.2d
+	zip2	v1.2d, v20.2d, v22.2d
+	zip1	v2.2d, v21.2d, v23.2d
+	zip2	v3.2d, v21.2d, v23.2d
+
+	zip1	v20.4s, v5.4s, v6.4s
+	zip2	v21.4s, v5.4s, v6.4s
+	zip1	v22.4s, v7.4s, v8.4s
+	zip2	v23.4s, v7.4s, v8.4s
+
+	zip1	v5.2d, v20.2d, v22.2d
+	zip2	v6.2d, v20.2d, v22.2d
+	zip1	v7.2d, v21.2d, v23.2d
+	zip2	v8.2d, v21.2d, v23.2d
+
+	zip1	v20.4s, v10.4s, v11.4s
+	zip2	v21.4s, v10.4s, v11.4s
+	zip1	v22.4s, v12.4s, v13.4s
+	zip2	v23.4s, v12.4s, v13.4s
+
+	zip1	v10.2d, v20.2d, v22.2d
+	zip2	v11.2d, v20.2d, v22.2d
+	zip1	v12.2d, v21.2d, v23.2d
+	zip2	v13.2d, v21.2d, v23.2d
+
+	zip1	v20.4s, v15.4s, v16.4s
+	zip2	v21.4s, v15.4s, v16.4s
+	zip1	v22.4s, v17.4s, v18.4s
+	zip2	v23.4s, v17.4s, v18.4s
+
+	zip1	v15.2d, v20.2d, v22.2d
+	zip2	v16.2d, v20.2d, v22.2d
+	zip1	v17.2d, v21.2d, v23.2d
+	zip2	v18.2d, v21.2d, v23.2d
+
+	add	v0.4s, v0.4s, v24.4s
+	add	v5.4s, v5.4s, v28.4s
+	add	v10.4s, v10.4s, v29.4s
+	add	v15.4s, v15.4s, v30.4s
+
+	add	v1.4s, v1.4s, v24.4s
+	add	v6.4s, v6.4s, v28.4s
+	add	v11.4s, v11.4s, v29.4s
+	add	v16.4s, v16.4s, v30.4s
+
+	add	v2.4s, v2.4s, v24.4s
+	add	v7.4s, v7.4s, v28.4s
+	add	v12.4s, v12.4s, v29.4s
+	add	v17.4s, v17.4s, v30.4s
+
+	add	v3.4s, v3.4s, v24.4s
+	add	v8.4s, v8.4s, v28.4s
+	add	v13.4s, v13.4s, v29.4s
+	add	v18.4s, v18.4s, v30.4s
+
+	add	v4.4s, v4.4s, v24.4s
+	add	v9.4s, v9.4s, v28.4s
+	add	v14.4s, v14.4s, v29.4s
+	add	v19.4s, v19.4s, v30.4s
+
+	cmp	x2, #320
+	b.le	.Lseal_tail
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v0.16b
+	eor	v21.16b, v21.16b, v5.16b
+	eor	v22.16b, v22.16b, v10.16b
+	eor	v23.16b, v23.16b, v15.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v1.16b
+	eor	v21.16b, v21.16b, v6.16b
+	eor	v22.16b, v22.16b, v11.16b
+	eor	v23.16b, v23.16b, v16.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v2.16b
+	eor	v21.16b, v21.16b, v7.16b
+	eor	v22.16b, v22.16b, v12.16b
+	eor	v23.16b, v23.16b, v17.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v3.16b
+	eor	v21.16b, v21.16b, v8.16b
+	eor	v22.16b, v22.16b, v13.16b
+	eor	v23.16b, v23.16b, v18.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v4.16b
+	eor	v21.16b, v21.16b, v9.16b
+	eor	v22.16b, v22.16b, v14.16b
+	eor	v23.16b, v23.16b, v19.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	sub	x2, x2, #320
+
+	mov	x6, #0
+	mov	x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration
+
+	b	.Lseal_main_loop
+
+.Lseal_tail:
+    // This part of the function handles the storage and authentication of the last [0,320) bytes
+    // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data.
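+    // First drain whole 64-byte chunks, then 16-byte blocks, and finally
+    // compose a padded block for the remaining [0,16) bytes and extra_in.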
+	cmp	x2, #64
+	b.lt	.Lseal_tail_64
+
+    // Store and authenticate 64B blocks per iteration
+	ld1	{v20.16b - v23.16b}, [x1], #64
+
+	eor	v20.16b, v20.16b, v0.16b
+	eor	v21.16b, v21.16b, v5.16b
+	eor	v22.16b, v22.16b, v10.16b
+	eor	v23.16b, v23.16b, v15.16b
+	mov	x11, v20.d[0]
+	mov	x12, v20.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	mov	x11, v21.d[0]
+	mov	x12, v21.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	mov	x11, v22.d[0]
+	mov	x12, v22.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	mov	x11, v23.d[0]
+	mov	x12, v23.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	st1	{v20.16b - v23.16b}, [x0], #64
+	sub	x2, x2, #64
+
+    // Shift the state left by 64 bytes for the next iteration of the loop
+	mov	v0.16b, v1.16b
+	mov	v5.16b, v6.16b
+	mov	v10.16b, v11.16b
+	mov	v15.16b, v16.16b
+
+	mov	v1.16b, v2.16b
+	mov	v6.16b, v7.16b
+	mov	v11.16b, v12.16b
+	mov	v16.16b, v17.16b
+
+	mov	v2.16b, v3.16b
+	mov	v7.16b, v8.16b
+	mov	v12.16b, v13.16b
+	mov	v17.16b, v18.16b
+
+	mov	v3.16b, v4.16b
+	mov	v8.16b, v9.16b
+	mov	v13.16b, v14.16b
+	mov	v18.16b, v19.16b
+
+	b	.Lseal_tail
+
+.Lseal_tail_64:
+	ldp	x3, x4, [x5, #48] // x3 = extra_in ptr, x4 = extra_in_len
+
+    // Here we handle the last [0,64) bytes of plaintext
+	cmp	x2, #16
+	b.lt	.Lseal_tail_16
+    // Each iteration encrypts and authenticates a 16B block
+	ld1	{v20.16b}, [x1], #16
+	eor	v20.16b, v20.16b, v0.16b
+	mov	x11, v20.d[0]
+	mov	x12, v20.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	st1	{v20.16b}, [x0], #16
+
+	sub	x2, x2, #16
+
+    // Shift the state left by 16 bytes for the next iteration of the loop
+	mov	v0.16b, v5.16b
+	mov	v5.16b, v10.16b
+	mov	v10.16b, v15.16b
+
+	b	.Lseal_tail_64
+
+.Lseal_tail_16:
+    // Here we handle the last [0,16) bytes of ciphertext that require a padded block
+	cbz	x2, .Lseal_hash_extra
+
+	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in
+	eor	v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes
+	not	v22.16b, v20.16b
+
+	mov	x6, x2
+	add	x1, x1, x2
+
+	cbz	x4, .Lseal_tail_16_compose // No extra_in to pad with; use zero padding
+
+	mov	x7, #16          // We need to load some extra_in first for padding
+	sub	x7, x7, x2
+	cmp	x4, x7
+	csel	x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register
+	mov	x12, x7
+	add	x3, x3, x7
+	sub	x4, x4, x7
+
+.Lseal_tail16_compose_extra_in:
+	ext	v20.16b, v20.16b, v20.16b, #15
+	ldrb	w11, [x3, #-1]!
+	mov	v20.b[0], w11
+	subs	x7, x7, #1
+	b.gt	.Lseal_tail16_compose_extra_in
+
+	add	x3, x3, x12
+
+.Lseal_tail_16_compose:
+	ext	v20.16b, v20.16b, v20.16b, #15
+	ldrb	w11, [x1, #-1]!
+	mov	v20.b[0], w11
+	ext	v21.16b, v22.16b, v21.16b, #15
+	subs	x2, x2, #1
+	b.gt	.Lseal_tail_16_compose
+
+	and	v0.16b, v0.16b, v21.16b
+	eor	v20.16b, v20.16b, v0.16b
+	mov	v21.16b, v20.16b
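+    // v20 now holds the final ciphertext bytes padded with extra_in, and a
+    // copy is kept in v21 to be hashed after the store loop.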
+
+.Lseal_tail_16_store:
+	umov	w11, v20.b[0]
+	strb	w11, [x0], #1
+	ext	v20.16b, v20.16b, v20.16b, #1
+	subs	x6, x6, #1
+	b.gt	.Lseal_tail_16_store
+
+    // Hash in the final ct block concatenated with extra_in
+	mov	x11, v21.d[0]
+	mov	x12, v21.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+
+.Lseal_hash_extra:
+	cbz	x4, .Lseal_finalize
+
+.Lseal_hash_extra_loop:
+	cmp	x4, #16
+	b.lt	.Lseal_hash_extra_tail
+	ld1	{v20.16b}, [x3], #16
+	mov	x11, v20.d[0]
+	mov	x12, v20.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	sub	x4, x4, #16
+	b	.Lseal_hash_extra_loop
+
+.Lseal_hash_extra_tail:
+	cbz	x4, .Lseal_finalize
+	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra_in bytes
+	add	x3, x3, x4
+
+.Lseal_hash_extra_load:
+	ext	v20.16b, v20.16b, v20.16b, #15
+	ldrb	w11, [x3, #-1]!
+	mov	v20.b[0], w11
+	subs	x4, x4, #1
+	b.gt	.Lseal_hash_extra_load
+
+    // Hash in the final padded extra_in block
+	mov	x11, v20.d[0]
+	mov	x12, v20.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+
+.Lseal_finalize:
+	mov	x11, v31.d[0]
+	mov	x12, v31.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+    // Final reduction step
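+    // Reduce with a single conditional subtraction of p = 2^130 - 5: compute
+    // acc - p and keep it only if no borrow occurs (acc >= p), then add the
+    // S key saved in v27 to form the tag.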
+	sub	x12, xzr, x15
+	orr	x13, xzr, #3
+	subs	x11, x8, #-5
+	sbcs	x12, x9, x12
+	sbcs	x13, x10, x13
+	csel	x8, x11, x8, cs
+	csel	x9, x12, x9, cs
+	csel	x10, x13, x10, cs
+	mov	x11, v27.d[0]
+	mov	x12, v27.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+
+	stp	x8, x9, [x5]
+
+	ldp	d8, d9, [sp, #16]
+	ldp	d10, d11, [sp, #32]
+	ldp	d12, d13, [sp, #48]
+	ldp	d14, d15, [sp, #64]
+.cfi_restore	b15
+.cfi_restore	b14
+.cfi_restore	b13
+.cfi_restore	b12
+.cfi_restore	b11
+.cfi_restore	b10
+.cfi_restore	b9
+.cfi_restore	b8
+	ldp	x29, x30, [sp], 80
+.cfi_restore	w29
+.cfi_restore	w30
+.cfi_def_cfa_offset	0
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.Lseal_128:
+    // On some architectures preparing 5 blocks for small buffers is wasteful
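+    // Instead, prepare only three: the block with counter 0 supplies the
+    // Poly1305 R and S keys and the other two encrypt up to 128 bytes.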
+	eor	v25.16b, v25.16b, v25.16b
+	mov	x11, #1
+	mov	v25.s[0], w11
+	mov	v0.16b, v24.16b
+	mov	v1.16b, v24.16b
+	mov	v2.16b, v24.16b
+	mov	v5.16b, v28.16b
+	mov	v6.16b, v28.16b
+	mov	v7.16b, v28.16b
+	mov	v10.16b, v29.16b
+	mov	v11.16b, v29.16b
+	mov	v12.16b, v29.16b
+	mov	v17.16b, v30.16b
+	add	v15.4s, v17.4s, v25.4s
+	add	v16.4s, v15.4s, v25.4s
+
+	mov	x6, #10
+
+.Lseal_128_rounds:
+	add	v0.4s, v0.4s, v5.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v5.16b, v5.16b, v10.16b
+	eor	v6.16b, v6.16b, v11.16b
+	eor	v7.16b, v7.16b, v12.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	ushr	v5.4s, v6.4s, #20
+	sli	v5.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v5.4s
+	add	v2.4s, v2.4s, v6.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v20.16b, v20.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v6.16b, v6.16b, v12.16b
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v5.4s, #25
+	sli	v6.4s, v5.4s, #7
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+
+	ext	v5.16b, v5.16b, v5.16b, #4
+	ext	v6.16b, v6.16b, v6.16b, #4
+	ext	v7.16b, v7.16b, v7.16b, #4
+
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v11.16b, v11.16b, v11.16b, #8
+	ext	v12.16b, v12.16b, v12.16b, #8
+
+	ext	v15.16b, v15.16b, v15.16b, #12
+	ext	v16.16b, v16.16b, v16.16b, #12
+	ext	v17.16b, v17.16b, v17.16b, #12
+	add	v0.4s, v0.4s, v5.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v5.16b, v5.16b, v10.16b
+	eor	v6.16b, v6.16b, v11.16b
+	eor	v7.16b, v7.16b, v12.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	ushr	v5.4s, v6.4s, #20
+	sli	v5.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v5.4s
+	add	v2.4s, v2.4s, v6.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v20.16b, v20.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v6.16b, v6.16b, v12.16b
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v5.4s, #25
+	sli	v6.4s, v5.4s, #7
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+
+	ext	v5.16b, v5.16b, v5.16b, #12
+	ext	v6.16b, v6.16b, v6.16b, #12
+	ext	v7.16b, v7.16b, v7.16b, #12
+
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v11.16b, v11.16b, v11.16b, #8
+	ext	v12.16b, v12.16b, v12.16b, #8
+
+	ext	v15.16b, v15.16b, v15.16b, #4
+	ext	v16.16b, v16.16b, v16.16b, #4
+	ext	v17.16b, v17.16b, v17.16b, #4
+	subs	x6, x6, #1
+	b.hi	.Lseal_128_rounds
+
+	add	v0.4s, v0.4s, v24.4s
+	add	v1.4s, v1.4s, v24.4s
+	add	v2.4s, v2.4s, v24.4s
+
+	add	v5.4s, v5.4s, v28.4s
+	add	v6.4s, v6.4s, v28.4s
+	add	v7.4s, v7.4s, v28.4s
+
+    // Only the first 32 bytes of the third block (counter = 0) are needed,
+    // so skip updating v12 and v17.
+	add	v10.4s, v10.4s, v29.4s
+	add	v11.4s, v11.4s, v29.4s
+
+	add	v30.4s, v30.4s, v25.4s
+	add	v15.4s, v15.4s, v30.4s
+	add	v30.4s, v30.4s, v25.4s
+	add	v16.4s, v16.4s, v30.4s
+
+	and	v2.16b, v2.16b, v27.16b
+	mov	x16, v2.d[0] // Move the R key to GPRs
+	mov	x17, v2.d[1]
+	mov	v27.16b, v7.16b // Store the S key
+
+	bl	.Lpoly_hash_ad_internal
+	b	.Lseal_tail
+.cfi_endproc
+.size	chacha20_poly1305_seal,.-chacha20_poly1305_seal
+
+/////////////////////////////////
+//
+// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data);
+//
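+// On entry (AAPCS64): x0 = pt, x1 = ct, x2 = len_in, x3 = ad, x4 = len_ad,
+// x5 = aead_data.
+//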
+.globl	chacha20_poly1305_open
+.hidden	chacha20_poly1305_open
+.type	chacha20_poly1305_open,%function
+.align	6
+chacha20_poly1305_open:
+	AARCH64_SIGN_LINK_REGISTER
+.cfi_startproc
+	stp	x29, x30, [sp, #-80]!
+.cfi_def_cfa_offset	80
+.cfi_offset	w30, -72
+.cfi_offset	w29, -80
+	mov	x29, sp
+    // We probably could do .cfi_def_cfa w29, 80 at this point, but since
+    // we don't actually use the frame pointer like that, it's probably not
+    // worth bothering.
+	stp	d8, d9, [sp, #16]
+	stp	d10, d11, [sp, #32]
+	stp	d12, d13, [sp, #48]
+	stp	d14, d15, [sp, #64]
+.cfi_offset	b15, -8
+.cfi_offset	b14, -16
+.cfi_offset	b13, -24
+.cfi_offset	b12, -32
+.cfi_offset	b11, -40
+.cfi_offset	b10, -48
+.cfi_offset	b9, -56
+.cfi_offset	b8, -64
+
+	adrp	x11, .Lchacha20_consts
+	add	x11, x11, :lo12:.Lchacha20_consts
+
+	ld1	{v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
+	ld1	{v28.16b - v30.16b}, [x5]
+
+	mov	x15, #1 // Prepare the Poly1305 state
+	mov	x8, #0
+	mov	x9, #0
+	mov	x10, #0
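+    // x8-x10 hold the accumulator limbs (acc0, acc1, acc2); x15 is the
+    // constant 1 used to pad every full 16-byte block at bit 128.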
+
+	mov	v31.d[0], x4  // Store the input and aad lengths
+	mov	v31.d[1], x2
+
+	cmp	x2, #128
+	b.le	.Lopen_128 // Optimization for smaller buffers
+
+    // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys
+	mov	v0.16b, v24.16b
+	mov	v5.16b, v28.16b
+	mov	v10.16b, v29.16b
+	mov	v15.16b, v30.16b
+
+	mov	x6, #10
+
+.align	5
+.Lopen_init_rounds:
+	add	v0.4s, v0.4s, v5.4s
+	eor	v15.16b, v15.16b, v0.16b
+	rev32	v15.8h, v15.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v5.16b, v5.16b, v10.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	add	v0.4s, v0.4s, v20.4s
+	eor	v15.16b, v15.16b, v0.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v20.16b, v20.16b, v10.16b
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+	ext	v5.16b, v5.16b, v5.16b, #4
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v15.16b, v15.16b, v15.16b, #12
+	add	v0.4s, v0.4s, v5.4s
+	eor	v15.16b, v15.16b, v0.16b
+	rev32	v15.8h, v15.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v5.16b, v5.16b, v10.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	add	v0.4s, v0.4s, v20.4s
+	eor	v15.16b, v15.16b, v0.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v20.16b, v20.16b, v10.16b
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+	ext	v5.16b, v5.16b, v5.16b, #12
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v15.16b, v15.16b, v15.16b, #4
+	subs	x6, x6, #1
+	b.hi	.Lopen_init_rounds
+
+	add	v0.4s, v0.4s, v24.4s
+	add	v5.4s, v5.4s, v28.4s
+
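+    // As in seal, v27 holds the clamp mask here before being reused for S.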
+	and	v0.16b, v0.16b, v27.16b
+	mov	x16, v0.d[0] // Move the R key to GPRs
+	mov	x17, v0.d[1]
+	mov	v27.16b, v5.16b // Store the S key
+
+	bl	.Lpoly_hash_ad_internal
+
+.Lopen_ad_done:
+	mov	x3, x1
+
+// Each iteration of the loop hashes 320 bytes and prepares the stream for 320 bytes
+.Lopen_main_loop:
+
+	cmp	x2, #192
+	b.lt	.Lopen_tail
+
+	adrp	x11, .Lchacha20_consts
+	add	x11, x11, :lo12:.Lchacha20_consts
+
+	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
+	mov	v4.16b, v24.16b
+
+	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
+	mov	v9.16b, v28.16b
+
+	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
+	mov	v14.16b, v29.16b
+
+	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
+	sub	x5, x5, #32
+	add	v15.4s, v15.4s, v25.4s
+	mov	v19.16b, v30.16b
+
+	eor	v20.16b, v20.16b, v20.16b //zero
+	not	v21.16b, v20.16b // -1
+	sub	v21.4s, v25.4s, v21.4s // Add +1
+	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
+	add	v19.4s, v19.4s, v20.4s
+
+	lsr	x4, x2, #4 // How many whole blocks we have to hash; always at least 12 here
+	sub	x4, x4, #10
+
+	mov	x7, #10
+	subs	x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash
+	csel	x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full
+
+	cbz	x7, .Lopen_main_loop_rounds_short
+
+.align	5
+.Lopen_main_loop_rounds:
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+.Lopen_main_loop_rounds_short:
+	add	v0.4s, v0.4s, v5.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	add	v3.4s, v3.4s, v8.4s
+	add	v4.4s, v4.4s, v9.4s
+
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	eor	v18.16b, v18.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+	rev32	v18.8h, v18.8h
+	rev32	v19.8h, v19.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	add	v13.4s, v13.4s, v18.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v5.16b, v5.16b, v10.16b
+	eor	v6.16b, v6.16b, v11.16b
+	eor	v7.16b, v7.16b, v12.16b
+	eor	v8.16b, v8.16b, v13.16b
+	eor	v9.16b, v9.16b, v14.16b
+
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	ushr	v5.4s, v6.4s, #20
+	sli	v5.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+	ushr	v7.4s, v8.4s, #20
+	sli	v7.4s, v8.4s, #12
+	ushr	v8.4s, v9.4s, #20
+	sli	v8.4s, v9.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v5.4s
+	add	v2.4s, v2.4s, v6.4s
+	add	v3.4s, v3.4s, v7.4s
+	add	v4.4s, v4.4s, v8.4s
+
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	eor	v18.16b, v18.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+	tbl	v18.16b, {v18.16b}, v26.16b
+	tbl	v19.16b, {v19.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	add	v13.4s, v13.4s, v18.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v20.16b, v20.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v6.16b, v6.16b, v12.16b
+	eor	v7.16b, v7.16b, v13.16b
+	eor	v8.16b, v8.16b, v14.16b
+
+	ushr	v9.4s, v8.4s, #25
+	sli	v9.4s, v8.4s, #7
+	ushr	v8.4s, v7.4s, #25
+	sli	v8.4s, v7.4s, #7
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v5.4s, #25
+	sli	v6.4s, v5.4s, #7
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+
+	ext	v9.16b, v9.16b, v9.16b, #4
+	ext	v14.16b, v14.16b, v14.16b, #8
+	ext	v19.16b, v19.16b, v19.16b, #12
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	add	v0.4s, v0.4s, v6.4s
+	add	v1.4s, v1.4s, v7.4s
+	add	v2.4s, v2.4s, v8.4s
+	add	v3.4s, v3.4s, v5.4s
+	add	v4.4s, v4.4s, v9.4s
+
+	eor	v18.16b, v18.16b, v0.16b
+	eor	v15.16b, v15.16b, v1.16b
+	eor	v16.16b, v16.16b, v2.16b
+	eor	v17.16b, v17.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	rev32	v18.8h, v18.8h
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+	rev32	v19.8h, v19.8h
+
+	add	v12.4s, v12.4s, v18.4s
+	add	v13.4s, v13.4s, v15.4s
+	add	v10.4s, v10.4s, v16.4s
+	add	v11.4s, v11.4s, v17.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v6.16b, v6.16b, v12.16b
+	eor	v7.16b, v7.16b, v13.16b
+	eor	v8.16b, v8.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v9.16b, v9.16b, v14.16b
+
+	ushr	v20.4s, v6.4s, #20
+	sli	v20.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+	ushr	v7.4s, v8.4s, #20
+	sli	v7.4s, v8.4s, #12
+	ushr	v8.4s, v5.4s, #20
+	sli	v8.4s, v5.4s, #12
+	ushr	v5.4s, v9.4s, #20
+	sli	v5.4s, v9.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	add	v3.4s, v3.4s, v8.4s
+	add	v4.4s, v4.4s, v5.4s
+
+	eor	v18.16b, v18.16b, v0.16b
+	eor	v15.16b, v15.16b, v1.16b
+	eor	v16.16b, v16.16b, v2.16b
+	eor	v17.16b, v17.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	tbl	v18.16b, {v18.16b}, v26.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+	tbl	v19.16b, {v19.16b}, v26.16b
+
+	add	v12.4s, v12.4s, v18.4s
+	add	v13.4s, v13.4s, v15.4s
+	add	v10.4s, v10.4s, v16.4s
+	add	v11.4s, v11.4s, v17.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v20.16b, v20.16b, v12.16b
+	eor	v6.16b, v6.16b, v13.16b
+	eor	v7.16b, v7.16b, v10.16b
+	eor	v8.16b, v8.16b, v11.16b
+	eor	v5.16b, v5.16b, v14.16b
+
+	ushr	v9.4s, v5.4s, #25
+	sli	v9.4s, v5.4s, #7
+	ushr	v5.4s, v8.4s, #25
+	sli	v5.4s, v8.4s, #7
+	ushr	v8.4s, v7.4s, #25
+	sli	v8.4s, v7.4s, #7
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v20.4s, #25
+	sli	v6.4s, v20.4s, #7
+
+	ext	v9.16b, v9.16b, v9.16b, #12
+	ext	v14.16b, v14.16b, v14.16b, #8
+	ext	v19.16b, v19.16b, v19.16b, #4
+	subs	x7, x7, #1
+	b.gt	.Lopen_main_loop_rounds
+	subs	x6, x6, #1
+	b.ge	.Lopen_main_loop_rounds_short
+
+	eor	v20.16b, v20.16b, v20.16b //zero
+	not	v21.16b, v20.16b // -1
+	sub	v21.4s, v25.4s, v21.4s // Add +1
+	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
+	add	v19.4s, v19.4s, v20.4s
+
+	add	v15.4s, v15.4s, v25.4s
+	mov	x11, #5
+	dup	v20.4s, w11
+	add	v25.4s, v25.4s, v20.4s
+
+	zip1	v20.4s, v0.4s, v1.4s
+	zip2	v21.4s, v0.4s, v1.4s
+	zip1	v22.4s, v2.4s, v3.4s
+	zip2	v23.4s, v2.4s, v3.4s
+
+	zip1	v0.2d, v20.2d, v22.2d
+	zip2	v1.2d, v20.2d, v22.2d
+	zip1	v2.2d, v21.2d, v23.2d
+	zip2	v3.2d, v21.2d, v23.2d
+
+	zip1	v20.4s, v5.4s, v6.4s
+	zip2	v21.4s, v5.4s, v6.4s
+	zip1	v22.4s, v7.4s, v8.4s
+	zip2	v23.4s, v7.4s, v8.4s
+
+	zip1	v5.2d, v20.2d, v22.2d
+	zip2	v6.2d, v20.2d, v22.2d
+	zip1	v7.2d, v21.2d, v23.2d
+	zip2	v8.2d, v21.2d, v23.2d
+
+	zip1	v20.4s, v10.4s, v11.4s
+	zip2	v21.4s, v10.4s, v11.4s
+	zip1	v22.4s, v12.4s, v13.4s
+	zip2	v23.4s, v12.4s, v13.4s
+
+	zip1	v10.2d, v20.2d, v22.2d
+	zip2	v11.2d, v20.2d, v22.2d
+	zip1	v12.2d, v21.2d, v23.2d
+	zip2	v13.2d, v21.2d, v23.2d
+
+	zip1	v20.4s, v15.4s, v16.4s
+	zip2	v21.4s, v15.4s, v16.4s
+	zip1	v22.4s, v17.4s, v18.4s
+	zip2	v23.4s, v17.4s, v18.4s
+
+	zip1	v15.2d, v20.2d, v22.2d
+	zip2	v16.2d, v20.2d, v22.2d
+	zip1	v17.2d, v21.2d, v23.2d
+	zip2	v18.2d, v21.2d, v23.2d
+
+	add	v0.4s, v0.4s, v24.4s
+	add	v5.4s, v5.4s, v28.4s
+	add	v10.4s, v10.4s, v29.4s
+	add	v15.4s, v15.4s, v30.4s
+
+	add	v1.4s, v1.4s, v24.4s
+	add	v6.4s, v6.4s, v28.4s
+	add	v11.4s, v11.4s, v29.4s
+	add	v16.4s, v16.4s, v30.4s
+
+	add	v2.4s, v2.4s, v24.4s
+	add	v7.4s, v7.4s, v28.4s
+	add	v12.4s, v12.4s, v29.4s
+	add	v17.4s, v17.4s, v30.4s
+
+	add	v3.4s, v3.4s, v24.4s
+	add	v8.4s, v8.4s, v28.4s
+	add	v13.4s, v13.4s, v29.4s
+	add	v18.4s, v18.4s, v30.4s
+
+	add	v4.4s, v4.4s, v24.4s
+	add	v9.4s, v9.4s, v28.4s
+	add	v14.4s, v14.4s, v29.4s
+	add	v19.4s, v19.4s, v30.4s
+
+    // We can always safely store 192 bytes
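+    // (the main loop is only entered with at least 192 bytes remaining); the
+    // fourth and fifth blocks are stored below only if enough input is left.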
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v0.16b
+	eor	v21.16b, v21.16b, v5.16b
+	eor	v22.16b, v22.16b, v10.16b
+	eor	v23.16b, v23.16b, v15.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v1.16b
+	eor	v21.16b, v21.16b, v6.16b
+	eor	v22.16b, v22.16b, v11.16b
+	eor	v23.16b, v23.16b, v16.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v2.16b
+	eor	v21.16b, v21.16b, v7.16b
+	eor	v22.16b, v22.16b, v12.16b
+	eor	v23.16b, v23.16b, v17.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	sub	x2, x2, #192
+
+	mov	v0.16b, v3.16b
+	mov	v5.16b, v8.16b
+	mov	v10.16b, v13.16b
+	mov	v15.16b, v18.16b
+
+	cmp	x2, #64
+	b.lt	.Lopen_tail_64_store
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v3.16b
+	eor	v21.16b, v21.16b, v8.16b
+	eor	v22.16b, v22.16b, v13.16b
+	eor	v23.16b, v23.16b, v18.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	sub	x2, x2, #64
+
+	mov	v0.16b, v4.16b
+	mov	v5.16b, v9.16b
+	mov	v10.16b, v14.16b
+	mov	v15.16b, v19.16b
+
+	cmp	x2, #64
+	b.lt	.Lopen_tail_64_store
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v4.16b
+	eor	v21.16b, v21.16b, v9.16b
+	eor	v22.16b, v22.16b, v14.16b
+	eor	v23.16b, v23.16b, v19.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	sub	x2, x2, #64
+	b	.Lopen_main_loop
+
+.Lopen_tail:
+
+	cbz	x2, .Lopen_finalize
+
+	lsr	x4, x2, #4 // How many whole blocks we have to hash
+
+	cmp	x2, #64
+	b.le	.Lopen_tail_64
+	cmp	x2, #128
+	b.le	.Lopen_tail_128
+
+.Lopen_tail_192:
+     // We need three more blocks
+	mov	v0.16b, v24.16b
+	mov	v1.16b, v24.16b
+	mov	v2.16b, v24.16b
+	mov	v5.16b, v28.16b
+	mov	v6.16b, v28.16b
+	mov	v7.16b, v28.16b
+	mov	v10.16b, v29.16b
+	mov	v11.16b, v29.16b
+	mov	v12.16b, v29.16b
+	mov	v15.16b, v30.16b
+	mov	v16.16b, v30.16b
+	mov	v17.16b, v30.16b
+	eor	v23.16b, v23.16b, v23.16b
+	eor	v21.16b, v21.16b, v21.16b
+	ins	v23.s[0], v25.s[0]
+	ins	v21.d[0], x15
+
+	add	v22.4s, v23.4s, v21.4s
+	add	v21.4s, v22.4s, v21.4s
+
+	add	v15.4s, v15.4s, v21.4s
+	add	v16.4s, v16.4s, v23.4s
+	add	v17.4s, v17.4s, v22.4s
+
+	mov	x7, #10
+	subs	x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash
+	csel	x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing
+	sub	x4, x4, x7
+
+	cbz	x7, .Lopen_tail_192_rounds_no_hash
+
+.Lopen_tail_192_rounds:
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+.Lopen_tail_192_rounds_no_hash:
+	add	v0.4s, v0.4s, v5.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v5.16b, v5.16b, v10.16b
+	eor	v6.16b, v6.16b, v11.16b
+	eor	v7.16b, v7.16b, v12.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	ushr	v5.4s, v6.4s, #20
+	sli	v5.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v5.4s
+	add	v2.4s, v2.4s, v6.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v20.16b, v20.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v6.16b, v6.16b, v12.16b
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v5.4s, #25
+	sli	v6.4s, v5.4s, #7
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+
+	ext	v5.16b, v5.16b, v5.16b, #4
+	ext	v6.16b, v6.16b, v6.16b, #4
+	ext	v7.16b, v7.16b, v7.16b, #4
+
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v11.16b, v11.16b, v11.16b, #8
+	ext	v12.16b, v12.16b, v12.16b, #8
+
+	ext	v15.16b, v15.16b, v15.16b, #12
+	ext	v16.16b, v16.16b, v16.16b, #12
+	ext	v17.16b, v17.16b, v17.16b, #12
+	add	v0.4s, v0.4s, v5.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v5.16b, v5.16b, v10.16b
+	eor	v6.16b, v6.16b, v11.16b
+	eor	v7.16b, v7.16b, v12.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	ushr	v5.4s, v6.4s, #20
+	sli	v5.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v5.4s
+	add	v2.4s, v2.4s, v6.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v20.16b, v20.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v6.16b, v6.16b, v12.16b
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v5.4s, #25
+	sli	v6.4s, v5.4s, #7
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+
+	ext	v5.16b, v5.16b, v5.16b, #12
+	ext	v6.16b, v6.16b, v6.16b, #12
+	ext	v7.16b, v7.16b, v7.16b, #12
+
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v11.16b, v11.16b, v11.16b, #8
+	ext	v12.16b, v12.16b, v12.16b, #8
+
+	ext	v15.16b, v15.16b, v15.16b, #4
+	ext	v16.16b, v16.16b, v16.16b, #4
+	ext	v17.16b, v17.16b, v17.16b, #4
+	subs	x7, x7, #1
+	b.gt	.Lopen_tail_192_rounds
+	subs	x6, x6, #1
+	b.ge	.Lopen_tail_192_rounds_no_hash
+
+    // We hashed at most 160 bytes; up to 32 bytes may still be left
+.Lopen_tail_192_hash:
+	cbz	x4, .Lopen_tail_192_hash_done
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	sub	x4, x4, #1
+	b	.Lopen_tail_192_hash
+
+.Lopen_tail_192_hash_done:
+
+	add	v0.4s, v0.4s, v24.4s
+	add	v1.4s, v1.4s, v24.4s
+	add	v2.4s, v2.4s, v24.4s
+	add	v5.4s, v5.4s, v28.4s
+	add	v6.4s, v6.4s, v28.4s
+	add	v7.4s, v7.4s, v28.4s
+	add	v10.4s, v10.4s, v29.4s
+	add	v11.4s, v11.4s, v29.4s
+	add	v12.4s, v12.4s, v29.4s
+	add	v15.4s, v15.4s, v30.4s
+	add	v16.4s, v16.4s, v30.4s
+	add	v17.4s, v17.4s, v30.4s
+
+	add	v15.4s, v15.4s, v21.4s
+	add	v16.4s, v16.4s, v23.4s
+	add	v17.4s, v17.4s, v22.4s
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+
+	eor	v20.16b, v20.16b, v1.16b
+	eor	v21.16b, v21.16b, v6.16b
+	eor	v22.16b, v22.16b, v11.16b
+	eor	v23.16b, v23.16b, v16.16b
+
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+
+	eor	v20.16b, v20.16b, v2.16b
+	eor	v21.16b, v21.16b, v7.16b
+	eor	v22.16b, v22.16b, v12.16b
+	eor	v23.16b, v23.16b, v17.16b
+
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	sub	x2, x2, #128
+	b	.Lopen_tail_64_store
+
+.Lopen_tail_128:
+     // We need two more blocks
+	mov	v0.16b, v24.16b
+	mov	v1.16b, v24.16b
+	mov	v5.16b, v28.16b
+	mov	v6.16b, v28.16b
+	mov	v10.16b, v29.16b
+	mov	v11.16b, v29.16b
+	mov	v15.16b, v30.16b
+	mov	v16.16b, v30.16b
+	eor	v23.16b, v23.16b, v23.16b
+	eor	v22.16b, v22.16b, v22.16b
+	ins	v23.s[0], v25.s[0]
+	ins	v22.d[0], x15
+	add	v22.4s, v22.4s, v23.4s
+
+	add	v15.4s, v15.4s, v22.4s
+	add	v16.4s, v16.4s, v23.4s
+
+	mov	x6, #10
+	sub	x6, x6, x4
+
+.Lopen_tail_128_rounds:
+	add	v0.4s, v0.4s, v5.4s
+	eor	v15.16b, v15.16b, v0.16b
+	rev32	v15.8h, v15.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v5.16b, v5.16b, v10.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	add	v0.4s, v0.4s, v20.4s
+	eor	v15.16b, v15.16b, v0.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v20.16b, v20.16b, v10.16b
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+	ext	v5.16b, v5.16b, v5.16b, #4
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v15.16b, v15.16b, v15.16b, #12
+	add	v1.4s, v1.4s, v6.4s
+	eor	v16.16b, v16.16b, v1.16b
+	rev32	v16.8h, v16.8h
+
+	add	v11.4s, v11.4s, v16.4s
+	eor	v6.16b, v6.16b, v11.16b
+	ushr	v20.4s, v6.4s, #20
+	sli	v20.4s, v6.4s, #12
+	add	v1.4s, v1.4s, v20.4s
+	eor	v16.16b, v16.16b, v1.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+
+	add	v11.4s, v11.4s, v16.4s
+	eor	v20.16b, v20.16b, v11.16b
+	ushr	v6.4s, v20.4s, #25
+	sli	v6.4s, v20.4s, #7
+	ext	v6.16b, v6.16b, v6.16b, #4
+	ext	v11.16b, v11.16b, v11.16b, #8
+	ext	v16.16b, v16.16b, v16.16b, #12
+	add	v0.4s, v0.4s, v5.4s
+	eor	v15.16b, v15.16b, v0.16b
+	rev32	v15.8h, v15.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v5.16b, v5.16b, v10.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	add	v0.4s, v0.4s, v20.4s
+	eor	v15.16b, v15.16b, v0.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v20.16b, v20.16b, v10.16b
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+	ext	v5.16b, v5.16b, v5.16b, #12
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v15.16b, v15.16b, v15.16b, #4
+	add	v1.4s, v1.4s, v6.4s
+	eor	v16.16b, v16.16b, v1.16b
+	rev32	v16.8h, v16.8h
+
+	add	v11.4s, v11.4s, v16.4s
+	eor	v6.16b, v6.16b, v11.16b
+	ushr	v20.4s, v6.4s, #20
+	sli	v20.4s, v6.4s, #12
+	add	v1.4s, v1.4s, v20.4s
+	eor	v16.16b, v16.16b, v1.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+
+	add	v11.4s, v11.4s, v16.4s
+	eor	v20.16b, v20.16b, v11.16b
+	ushr	v6.4s, v20.4s, #25
+	sli	v6.4s, v20.4s, #7
+	ext	v6.16b, v6.16b, v6.16b, #12
+	ext	v11.16b, v11.16b, v11.16b, #8
+	ext	v16.16b, v16.16b, v16.16b, #4
+	subs	x6, x6, #1
+	b.gt	.Lopen_tail_128_rounds
+	cbz	x4, .Lopen_tail_128_rounds_done
+	subs	x4, x4, #1
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	b	.Lopen_tail_128_rounds
+
+.Lopen_tail_128_rounds_done:
+	add	v0.4s, v0.4s, v24.4s
+	add	v1.4s, v1.4s, v24.4s
+	add	v5.4s, v5.4s, v28.4s
+	add	v6.4s, v6.4s, v28.4s
+	add	v10.4s, v10.4s, v29.4s
+	add	v11.4s, v11.4s, v29.4s
+	add	v15.4s, v15.4s, v30.4s
+	add	v16.4s, v16.4s, v30.4s
+	add	v15.4s, v15.4s, v22.4s
+	add	v16.4s, v16.4s, v23.4s
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+
+	eor	v20.16b, v20.16b, v1.16b
+	eor	v21.16b, v21.16b, v6.16b
+	eor	v22.16b, v22.16b, v11.16b
+	eor	v23.16b, v23.16b, v16.16b
+
+	st1	{v20.16b - v23.16b}, [x0], #64
+	sub	x2, x2, #64
+
+	b	.Lopen_tail_64_store
+
+.Lopen_tail_64:
+    // We just need a single block
+	mov	v0.16b, v24.16b
+	mov	v5.16b, v28.16b
+	mov	v10.16b, v29.16b
+	mov	v15.16b, v30.16b
+	eor	v23.16b, v23.16b, v23.16b
+	ins	v23.s[0], v25.s[0]
+	add	v15.4s, v15.4s, v23.4s
+
+	mov	x6, #10
+	sub	x6, x6, x4
+
+.Lopen_tail_64_rounds:
+	add	v0.4s, v0.4s, v5.4s
+	eor	v15.16b, v15.16b, v0.16b
+	rev32	v15.8h, v15.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v5.16b, v5.16b, v10.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	add	v0.4s, v0.4s, v20.4s
+	eor	v15.16b, v15.16b, v0.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v20.16b, v20.16b, v10.16b
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+	ext	v5.16b, v5.16b, v5.16b, #4
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v15.16b, v15.16b, v15.16b, #12
+	add	v0.4s, v0.4s, v5.4s
+	eor	v15.16b, v15.16b, v0.16b
+	rev32	v15.8h, v15.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v5.16b, v5.16b, v10.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	add	v0.4s, v0.4s, v20.4s
+	eor	v15.16b, v15.16b, v0.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v20.16b, v20.16b, v10.16b
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+	ext	v5.16b, v5.16b, v5.16b, #12
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v15.16b, v15.16b, v15.16b, #4
+	subs	x6, x6, #1
+	b.gt	.Lopen_tail_64_rounds
+	cbz	x4, .Lopen_tail_64_rounds_done
+	subs	x4, x4, #1
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	b	.Lopen_tail_64_rounds
+
+.Lopen_tail_64_rounds_done:
+	add	v0.4s, v0.4s, v24.4s
+	add	v5.4s, v5.4s, v28.4s
+	add	v10.4s, v10.4s, v29.4s
+	add	v15.4s, v15.4s, v30.4s
+	add	v15.4s, v15.4s, v23.4s
+
+.Lopen_tail_64_store:
+	cmp	x2, #16
+	b.lt	.Lopen_tail_16
+
+	ld1	{v20.16b}, [x1], #16
+	eor	v20.16b, v20.16b, v0.16b
+	st1	{v20.16b}, [x0], #16
+	mov	v0.16b, v5.16b
+	mov	v5.16b, v10.16b
+	mov	v10.16b, v15.16b
+	sub	x2, x2, #16
+	b	.Lopen_tail_64_store
+
+.Lopen_tail_16:
+    // Here we handle the last [0,16) bytes that require a padded block
+	cbz	x2, .Lopen_finalize
+
+	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext
+	eor	v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask
+	not	v22.16b, v20.16b
+
+	add	x7, x1, x2
+	mov	x6, x2
+
+.Lopen_tail_16_compose:
+	ext	v20.16b, v20.16b, v20.16b, #15
+	ldrb	w11, [x7, #-1]!
+	mov	v20.b[0], w11
+	ext	v21.16b, v22.16b, v21.16b, #15
+	subs	x2, x2, #1
+	b.gt	.Lopen_tail_16_compose
+
+	and	v20.16b, v20.16b, v21.16b
+    // Hash in the final padded block
+	mov	x11, v20.d[0]
+	mov	x12, v20.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	eor	v20.16b, v20.16b, v0.16b
+
+.Lopen_tail_16_store:
+	umov	w11, v20.b[0]
+	strb	w11, [x0], #1
+	ext	v20.16b, v20.16b, v20.16b, #1
+	subs	x6, x6, #1
+	b.gt	.Lopen_tail_16_store
+
+.Lopen_finalize:
+	mov	x11, v31.d[0]
+	mov	x12, v31.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+    // Final reduction step
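+    // A sketch of what follows: conditionally subtract p = 2^130 - 5 (the
+    // csel instructions keep the subtracted value only when no borrow
+    // occurred), then add the s half of the one-time key (saved in v27) and
+    // store the 16-byte tag through x5.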
+	sub	x12, xzr, x15
+	orr	x13, xzr, #3
+	subs	x11, x8, #-5
+	sbcs	x12, x9, x12
+	sbcs	x13, x10, x13
+	csel	x8, x11, x8, cs
+	csel	x9, x12, x9, cs
+	csel	x10, x13, x10, cs
+	mov	x11, v27.d[0]
+	mov	x12, v27.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+
+	stp	x8, x9, [x5]
+
+	ldp	d8, d9, [sp, #16]
+	ldp	d10, d11, [sp, #32]
+	ldp	d12, d13, [sp, #48]
+	ldp	d14, d15, [sp, #64]
+.cfi_restore	b15
+.cfi_restore	b14
+.cfi_restore	b13
+.cfi_restore	b12
+.cfi_restore	b11
+.cfi_restore	b10
+.cfi_restore	b9
+.cfi_restore	b8
+	ldp	x29, x30, [sp], 80
+.cfi_restore	w29
+.cfi_restore	w30
+.cfi_def_cfa_offset	0
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.Lopen_128:
+    // On some architectures preparing 5 blocks for small buffers is wasteful
+	eor	v25.16b, v25.16b, v25.16b
+	mov	x11, #1
+	mov	v25.s[0], w11
+	mov	v0.16b, v24.16b
+	mov	v1.16b, v24.16b
+	mov	v2.16b, v24.16b
+	mov	v5.16b, v28.16b
+	mov	v6.16b, v28.16b
+	mov	v7.16b, v28.16b
+	mov	v10.16b, v29.16b
+	mov	v11.16b, v29.16b
+	mov	v12.16b, v29.16b
+	mov	v17.16b, v30.16b
+	add	v15.4s, v17.4s, v25.4s
+	add	v16.4s, v15.4s, v25.4s
+
+	mov	x6, #10
+
+.Lopen_128_rounds:
+	add	v0.4s, v0.4s, v5.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v5.16b, v5.16b, v10.16b
+	eor	v6.16b, v6.16b, v11.16b
+	eor	v7.16b, v7.16b, v12.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	ushr	v5.4s, v6.4s, #20
+	sli	v5.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v5.4s
+	add	v2.4s, v2.4s, v6.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v20.16b, v20.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v6.16b, v6.16b, v12.16b
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v5.4s, #25
+	sli	v6.4s, v5.4s, #7
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+
+	ext	v5.16b, v5.16b, v5.16b, #4
+	ext	v6.16b, v6.16b, v6.16b, #4
+	ext	v7.16b, v7.16b, v7.16b, #4
+
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v11.16b, v11.16b, v11.16b, #8
+	ext	v12.16b, v12.16b, v12.16b, #8
+
+	ext	v15.16b, v15.16b, v15.16b, #12
+	ext	v16.16b, v16.16b, v16.16b, #12
+	ext	v17.16b, v17.16b, v17.16b, #12
+	add	v0.4s, v0.4s, v5.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v5.16b, v5.16b, v10.16b
+	eor	v6.16b, v6.16b, v11.16b
+	eor	v7.16b, v7.16b, v12.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	ushr	v5.4s, v6.4s, #20
+	sli	v5.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v5.4s
+	add	v2.4s, v2.4s, v6.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v20.16b, v20.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v6.16b, v6.16b, v12.16b
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v5.4s, #25
+	sli	v6.4s, v5.4s, #7
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+
+	ext	v5.16b, v5.16b, v5.16b, #12
+	ext	v6.16b, v6.16b, v6.16b, #12
+	ext	v7.16b, v7.16b, v7.16b, #12
+
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v11.16b, v11.16b, v11.16b, #8
+	ext	v12.16b, v12.16b, v12.16b, #8
+
+	ext	v15.16b, v15.16b, v15.16b, #4
+	ext	v16.16b, v16.16b, v16.16b, #4
+	ext	v17.16b, v17.16b, v17.16b, #4
+	subs	x6, x6, #1
+	b.hi	.Lopen_128_rounds
+
+	add	v0.4s, v0.4s, v24.4s
+	add	v1.4s, v1.4s, v24.4s
+	add	v2.4s, v2.4s, v24.4s
+
+	add	v5.4s, v5.4s, v28.4s
+	add	v6.4s, v6.4s, v28.4s
+	add	v7.4s, v7.4s, v28.4s
+
+	add	v10.4s, v10.4s, v29.4s
+	add	v11.4s, v11.4s, v29.4s
+
+	add	v30.4s, v30.4s, v25.4s
+	add	v15.4s, v15.4s, v30.4s
+	add	v30.4s, v30.4s, v25.4s
+	add	v16.4s, v16.4s, v30.4s
+
+	and	v2.16b, v2.16b, v27.16b
+	mov	x16, v2.d[0] // Move the R key to GPRs
+	mov	x17, v2.d[1]
+	mov	v27.16b, v7.16b // Store the S key
+
+	bl	.Lpoly_hash_ad_internal
+
+.Lopen_128_store:
+	cmp	x2, #64
+	b.lt	.Lopen_128_store_64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+
+	mov	x11, v20.d[0]
+	mov	x12, v20.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	mov	x11, v21.d[0]
+	mov	x12, v21.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	mov	x11, v22.d[0]
+	mov	x12, v22.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	mov	x11, v23.d[0]
+	mov	x12, v23.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+
+	eor	v20.16b, v20.16b, v0.16b
+	eor	v21.16b, v21.16b, v5.16b
+	eor	v22.16b, v22.16b, v10.16b
+	eor	v23.16b, v23.16b, v15.16b
+
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	sub	x2, x2, #64
+
+	mov	v0.16b, v1.16b
+	mov	v5.16b, v6.16b
+	mov	v10.16b, v11.16b
+	mov	v15.16b, v16.16b
+
+.Lopen_128_store_64:
+
+	lsr	x4, x2, #4
+	mov	x3, x1
+
+.Lopen_128_hash_64:
+	cbz	x4, .Lopen_tail_64_store
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	sub	x4, x4, #1
+	b	.Lopen_128_hash_64
+.cfi_endproc
+.size	chacha20_poly1305_open,.-chacha20_poly1305_open
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/crypto/chacha20_poly1305_armv8-win.S b/gen/crypto/chacha20_poly1305_armv8-win.S
new file mode 100644
index 0000000..3314f2c
--- /dev/null
+++ b/gen/crypto/chacha20_poly1305_armv8-win.S
@@ -0,0 +1,3015 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+#include <openssl/arm_arch.h>
+.section	.rodata
+
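+// Lchacha20_consts is the ChaCha20 "expand 32-byte k" constant, Linc holds
+// per-block counter increments, Lrol8 is a tbl shuffle table implementing a
+// rotate-left-by-8 of each 32-bit lane, and Lclamp is the RFC 8439 Poly1305
+// r-key clamping mask.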
+.align	7
+Lchacha20_consts:
+.byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+Linc:
+.long	1,2,3,4
+Lrol8:
+.byte	3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+Lclamp:
+.quad	0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
+
+.text
+
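+// Lpoly_hash_ad_internal absorbs the additional data into the Poly1305 state:
+// x3 holds the AD pointer and x4 its length. Whole 16-byte blocks are hashed
+// directly and any remainder is zero-padded. Throughout this file the
+// accumulator lives in x8-x10 and the clamped r key in x16-x17.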
+.def Lpoly_hash_ad_internal
+   .type 32
+.endef
+.align	6
+Lpoly_hash_ad_internal:
+.cfi_startproc
+	cbnz	x4, Lpoly_hash_intro
+	ret
+
+Lpoly_hash_intro:
+	cmp	x4, #16
+	b.lt	Lpoly_hash_ad_tail
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	sub	x4, x4, #16
+	b	Lpoly_hash_ad_internal
+
+Lpoly_hash_ad_tail:
+	cbz	x4, Lpoly_hash_ad_ret
+
+	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the AAD
+	sub	x4, x4, #1
+
+Lpoly_hash_tail_16_compose:
+	ext	v20.16b, v20.16b, v20.16b, #15
+	ldrb	w11, [x3, x4]
+	mov	v20.b[0], w11
+	subs	x4, x4, #1
+	b.ge	Lpoly_hash_tail_16_compose
+	mov	x11, v20.d[0]
+	mov	x12, v20.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+
+Lpoly_hash_ad_ret:
+	ret
+.cfi_endproc
+
+
+/////////////////////////////////
+//
+// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data);
+//
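+// As used below: x0 is the destination buffer, x1 the source, x2 the source
+// length, x3/x4 the AD pointer and length, and x5 the key/state structure
+// from which the keys are loaded and into which the tag is written.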
+.globl	chacha20_poly1305_seal
+
+.def chacha20_poly1305_seal
+   .type 32
+.endef
+.align	6
+chacha20_poly1305_seal:
+	AARCH64_SIGN_LINK_REGISTER
+.cfi_startproc
+	stp	x29, x30, [sp, #-80]!
+.cfi_def_cfa_offset	80
+.cfi_offset	w30, -72
+.cfi_offset	w29, -80
+	mov	x29, sp
+    // We probably could do .cfi_def_cfa w29, 80 at this point, but since
+    // we don't actually use the frame pointer like that, it's probably not
+    // worth bothering.
+	stp	d8, d9, [sp, #16]
+	stp	d10, d11, [sp, #32]
+	stp	d12, d13, [sp, #48]
+	stp	d14, d15, [sp, #64]
+.cfi_offset	b15, -8
+.cfi_offset	b14, -16
+.cfi_offset	b13, -24
+.cfi_offset	b12, -32
+.cfi_offset	b11, -40
+.cfi_offset	b10, -48
+.cfi_offset	b9, -56
+.cfi_offset	b8, -64
+
+	adrp	x11, Lchacha20_consts
+	add	x11, x11, :lo12:Lchacha20_consts
+
+	ld1	{v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
+	ld1	{v28.16b - v30.16b}, [x5]
+
+	mov	x15, #1 // Prepare the Poly1305 state
+	mov	x8, #0
+	mov	x9, #0
+	mov	x10, #0
+
+	ldr	x12, [x5, #56]   // The total cipher text length includes extra_in_len
+	add	x12, x12, x2
+	mov	v31.d[0], x4  // Store the input and aad lengths
+	mov	v31.d[1], x12
+
+	cmp	x2, #128
+	b.le	Lseal_128 // Optimization for smaller buffers
+
+    // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext,
+    // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically,
+    // the fifth block (A4-D4) horizontally.
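+    // ld4r splats each 32-bit word of the constant/key/counter material
+    // across all four lanes of a register, which gives the transposed
+    // (vertical) layout for the four parallel blocks.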
+	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
+	mov	v4.16b, v24.16b
+
+	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
+	mov	v9.16b, v28.16b
+
+	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
+	mov	v14.16b, v29.16b
+
+	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
+	add	v15.4s, v15.4s, v25.4s
+	mov	v19.16b, v30.16b
+
+	sub	x5, x5, #32
+
+	mov	x6, #10
+
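+    // In the rounds below, the four ChaCha20 rotations are implemented as:
+    // rotate-16 via rev32 (halfword swap within each 32-bit lane), rotates
+    // by 12 and 7 via ushr+sli pairs, and rotate-8 via tbl with the Lrol8
+    // shuffle held in v26.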
+.align	5
+Lseal_init_rounds:
+	add	v0.4s, v0.4s, v5.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	add	v3.4s, v3.4s, v8.4s
+	add	v4.4s, v4.4s, v9.4s
+
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	eor	v18.16b, v18.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+	rev32	v18.8h, v18.8h
+	rev32	v19.8h, v19.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	add	v13.4s, v13.4s, v18.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v5.16b, v5.16b, v10.16b
+	eor	v6.16b, v6.16b, v11.16b
+	eor	v7.16b, v7.16b, v12.16b
+	eor	v8.16b, v8.16b, v13.16b
+	eor	v9.16b, v9.16b, v14.16b
+
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	ushr	v5.4s, v6.4s, #20
+	sli	v5.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+	ushr	v7.4s, v8.4s, #20
+	sli	v7.4s, v8.4s, #12
+	ushr	v8.4s, v9.4s, #20
+	sli	v8.4s, v9.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v5.4s
+	add	v2.4s, v2.4s, v6.4s
+	add	v3.4s, v3.4s, v7.4s
+	add	v4.4s, v4.4s, v8.4s
+
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	eor	v18.16b, v18.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+	tbl	v18.16b, {v18.16b}, v26.16b
+	tbl	v19.16b, {v19.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	add	v13.4s, v13.4s, v18.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v20.16b, v20.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v6.16b, v6.16b, v12.16b
+	eor	v7.16b, v7.16b, v13.16b
+	eor	v8.16b, v8.16b, v14.16b
+
+	ushr	v9.4s, v8.4s, #25
+	sli	v9.4s, v8.4s, #7
+	ushr	v8.4s, v7.4s, #25
+	sli	v8.4s, v7.4s, #7
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v5.4s, #25
+	sli	v6.4s, v5.4s, #7
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+
+	ext	v9.16b, v9.16b, v9.16b, #4
+	ext	v14.16b, v14.16b, v14.16b, #8
+	ext	v19.16b, v19.16b, v19.16b, #12
+	add	v0.4s, v0.4s, v6.4s
+	add	v1.4s, v1.4s, v7.4s
+	add	v2.4s, v2.4s, v8.4s
+	add	v3.4s, v3.4s, v5.4s
+	add	v4.4s, v4.4s, v9.4s
+
+	eor	v18.16b, v18.16b, v0.16b
+	eor	v15.16b, v15.16b, v1.16b
+	eor	v16.16b, v16.16b, v2.16b
+	eor	v17.16b, v17.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	rev32	v18.8h, v18.8h
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+	rev32	v19.8h, v19.8h
+
+	add	v12.4s, v12.4s, v18.4s
+	add	v13.4s, v13.4s, v15.4s
+	add	v10.4s, v10.4s, v16.4s
+	add	v11.4s, v11.4s, v17.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v6.16b, v6.16b, v12.16b
+	eor	v7.16b, v7.16b, v13.16b
+	eor	v8.16b, v8.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v9.16b, v9.16b, v14.16b
+
+	ushr	v20.4s, v6.4s, #20
+	sli	v20.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+	ushr	v7.4s, v8.4s, #20
+	sli	v7.4s, v8.4s, #12
+	ushr	v8.4s, v5.4s, #20
+	sli	v8.4s, v5.4s, #12
+	ushr	v5.4s, v9.4s, #20
+	sli	v5.4s, v9.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	add	v3.4s, v3.4s, v8.4s
+	add	v4.4s, v4.4s, v5.4s
+
+	eor	v18.16b, v18.16b, v0.16b
+	eor	v15.16b, v15.16b, v1.16b
+	eor	v16.16b, v16.16b, v2.16b
+	eor	v17.16b, v17.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	tbl	v18.16b, {v18.16b}, v26.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+	tbl	v19.16b, {v19.16b}, v26.16b
+
+	add	v12.4s, v12.4s, v18.4s
+	add	v13.4s, v13.4s, v15.4s
+	add	v10.4s, v10.4s, v16.4s
+	add	v11.4s, v11.4s, v17.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v20.16b, v20.16b, v12.16b
+	eor	v6.16b, v6.16b, v13.16b
+	eor	v7.16b, v7.16b, v10.16b
+	eor	v8.16b, v8.16b, v11.16b
+	eor	v5.16b, v5.16b, v14.16b
+
+	ushr	v9.4s, v5.4s, #25
+	sli	v9.4s, v5.4s, #7
+	ushr	v5.4s, v8.4s, #25
+	sli	v5.4s, v8.4s, #7
+	ushr	v8.4s, v7.4s, #25
+	sli	v8.4s, v7.4s, #7
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v20.4s, #25
+	sli	v6.4s, v20.4s, #7
+
+	ext	v9.16b, v9.16b, v9.16b, #12
+	ext	v14.16b, v14.16b, v14.16b, #8
+	ext	v19.16b, v19.16b, v19.16b, #4
+	subs	x6, x6, #1
+	b.hi	Lseal_init_rounds
+
+	add	v15.4s, v15.4s, v25.4s
+	mov	x11, #4
+	dup	v20.4s, w11
+	add	v25.4s, v25.4s, v20.4s
+
+	zip1	v20.4s, v0.4s, v1.4s
+	zip2	v21.4s, v0.4s, v1.4s
+	zip1	v22.4s, v2.4s, v3.4s
+	zip2	v23.4s, v2.4s, v3.4s
+
+	zip1	v0.2d, v20.2d, v22.2d
+	zip2	v1.2d, v20.2d, v22.2d
+	zip1	v2.2d, v21.2d, v23.2d
+	zip2	v3.2d, v21.2d, v23.2d
+
+	zip1	v20.4s, v5.4s, v6.4s
+	zip2	v21.4s, v5.4s, v6.4s
+	zip1	v22.4s, v7.4s, v8.4s
+	zip2	v23.4s, v7.4s, v8.4s
+
+	zip1	v5.2d, v20.2d, v22.2d
+	zip2	v6.2d, v20.2d, v22.2d
+	zip1	v7.2d, v21.2d, v23.2d
+	zip2	v8.2d, v21.2d, v23.2d
+
+	zip1	v20.4s, v10.4s, v11.4s
+	zip2	v21.4s, v10.4s, v11.4s
+	zip1	v22.4s, v12.4s, v13.4s
+	zip2	v23.4s, v12.4s, v13.4s
+
+	zip1	v10.2d, v20.2d, v22.2d
+	zip2	v11.2d, v20.2d, v22.2d
+	zip1	v12.2d, v21.2d, v23.2d
+	zip2	v13.2d, v21.2d, v23.2d
+
+	zip1	v20.4s, v15.4s, v16.4s
+	zip2	v21.4s, v15.4s, v16.4s
+	zip1	v22.4s, v17.4s, v18.4s
+	zip2	v23.4s, v17.4s, v18.4s
+
+	zip1	v15.2d, v20.2d, v22.2d
+	zip2	v16.2d, v20.2d, v22.2d
+	zip1	v17.2d, v21.2d, v23.2d
+	zip2	v18.2d, v21.2d, v23.2d
+
+	add	v4.4s, v4.4s, v24.4s
+	add	v9.4s, v9.4s, v28.4s
+	and	v4.16b, v4.16b, v27.16b
+
+	add	v0.4s, v0.4s, v24.4s
+	add	v5.4s, v5.4s, v28.4s
+	add	v10.4s, v10.4s, v29.4s
+	add	v15.4s, v15.4s, v30.4s
+
+	add	v1.4s, v1.4s, v24.4s
+	add	v6.4s, v6.4s, v28.4s
+	add	v11.4s, v11.4s, v29.4s
+	add	v16.4s, v16.4s, v30.4s
+
+	add	v2.4s, v2.4s, v24.4s
+	add	v7.4s, v7.4s, v28.4s
+	add	v12.4s, v12.4s, v29.4s
+	add	v17.4s, v17.4s, v30.4s
+
+	add	v3.4s, v3.4s, v24.4s
+	add	v8.4s, v8.4s, v28.4s
+	add	v13.4s, v13.4s, v29.4s
+	add	v18.4s, v18.4s, v30.4s
+
+	mov	x16, v4.d[0] // Move the R key to GPRs
+	mov	x17, v4.d[1]
+	mov	v27.16b, v9.16b // Store the S key
+
+	bl	Lpoly_hash_ad_internal
+
+	mov	x3, x0
+	cmp	x2, #256
+	b.le	Lseal_tail
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v0.16b
+	eor	v21.16b, v21.16b, v5.16b
+	eor	v22.16b, v22.16b, v10.16b
+	eor	v23.16b, v23.16b, v15.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v1.16b
+	eor	v21.16b, v21.16b, v6.16b
+	eor	v22.16b, v22.16b, v11.16b
+	eor	v23.16b, v23.16b, v16.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v2.16b
+	eor	v21.16b, v21.16b, v7.16b
+	eor	v22.16b, v22.16b, v12.16b
+	eor	v23.16b, v23.16b, v17.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v3.16b
+	eor	v21.16b, v21.16b, v8.16b
+	eor	v22.16b, v22.16b, v13.16b
+	eor	v23.16b, v23.16b, v18.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	sub	x2, x2, #256
+
+	mov	x6, #4 // In the first run of the loop we need to hash 256 bytes; we hash one block for each of the first 4 rounds
+	mov	x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256
+
+Lseal_main_loop:
+	adrp	x11, Lchacha20_consts
+	add	x11, x11, :lo12:Lchacha20_consts
+
+	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
+	mov	v4.16b, v24.16b
+
+	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
+	mov	v9.16b, v28.16b
+
+	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
+	mov	v14.16b, v29.16b
+
+	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
+	add	v15.4s, v15.4s, v25.4s
+	mov	v19.16b, v30.16b
+
+	eor	v20.16b, v20.16b, v20.16b //zero
+	not	v21.16b, v20.16b // -1
+	sub	v21.4s, v25.4s, v21.4s // Add +1
+	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
+	add	v19.4s, v19.4s, v20.4s
+
+	sub	x5, x5, #32
+.align	5
+Lseal_main_loop_rounds:
+	add	v0.4s, v0.4s, v5.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	add	v3.4s, v3.4s, v8.4s
+	add	v4.4s, v4.4s, v9.4s
+
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	eor	v18.16b, v18.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+	rev32	v18.8h, v18.8h
+	rev32	v19.8h, v19.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	add	v13.4s, v13.4s, v18.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v5.16b, v5.16b, v10.16b
+	eor	v6.16b, v6.16b, v11.16b
+	eor	v7.16b, v7.16b, v12.16b
+	eor	v8.16b, v8.16b, v13.16b
+	eor	v9.16b, v9.16b, v14.16b
+
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	ushr	v5.4s, v6.4s, #20
+	sli	v5.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+	ushr	v7.4s, v8.4s, #20
+	sli	v7.4s, v8.4s, #12
+	ushr	v8.4s, v9.4s, #20
+	sli	v8.4s, v9.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v5.4s
+	add	v2.4s, v2.4s, v6.4s
+	add	v3.4s, v3.4s, v7.4s
+	add	v4.4s, v4.4s, v8.4s
+
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	eor	v18.16b, v18.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+	tbl	v18.16b, {v18.16b}, v26.16b
+	tbl	v19.16b, {v19.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	add	v13.4s, v13.4s, v18.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v20.16b, v20.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v6.16b, v6.16b, v12.16b
+	eor	v7.16b, v7.16b, v13.16b
+	eor	v8.16b, v8.16b, v14.16b
+
+	ushr	v9.4s, v8.4s, #25
+	sli	v9.4s, v8.4s, #7
+	ushr	v8.4s, v7.4s, #25
+	sli	v8.4s, v7.4s, #7
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v5.4s, #25
+	sli	v6.4s, v5.4s, #7
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+
+	ext	v9.16b, v9.16b, v9.16b, #4
+	ext	v14.16b, v14.16b, v14.16b, #8
+	ext	v19.16b, v19.16b, v19.16b, #12
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	add	v0.4s, v0.4s, v6.4s
+	add	v1.4s, v1.4s, v7.4s
+	add	v2.4s, v2.4s, v8.4s
+	add	v3.4s, v3.4s, v5.4s
+	add	v4.4s, v4.4s, v9.4s
+
+	eor	v18.16b, v18.16b, v0.16b
+	eor	v15.16b, v15.16b, v1.16b
+	eor	v16.16b, v16.16b, v2.16b
+	eor	v17.16b, v17.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	rev32	v18.8h, v18.8h
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+	rev32	v19.8h, v19.8h
+
+	add	v12.4s, v12.4s, v18.4s
+	add	v13.4s, v13.4s, v15.4s
+	add	v10.4s, v10.4s, v16.4s
+	add	v11.4s, v11.4s, v17.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v6.16b, v6.16b, v12.16b
+	eor	v7.16b, v7.16b, v13.16b
+	eor	v8.16b, v8.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v9.16b, v9.16b, v14.16b
+
+	ushr	v20.4s, v6.4s, #20
+	sli	v20.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+	ushr	v7.4s, v8.4s, #20
+	sli	v7.4s, v8.4s, #12
+	ushr	v8.4s, v5.4s, #20
+	sli	v8.4s, v5.4s, #12
+	ushr	v5.4s, v9.4s, #20
+	sli	v5.4s, v9.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	add	v3.4s, v3.4s, v8.4s
+	add	v4.4s, v4.4s, v5.4s
+
+	eor	v18.16b, v18.16b, v0.16b
+	eor	v15.16b, v15.16b, v1.16b
+	eor	v16.16b, v16.16b, v2.16b
+	eor	v17.16b, v17.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	tbl	v18.16b, {v18.16b}, v26.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+	tbl	v19.16b, {v19.16b}, v26.16b
+
+	add	v12.4s, v12.4s, v18.4s
+	add	v13.4s, v13.4s, v15.4s
+	add	v10.4s, v10.4s, v16.4s
+	add	v11.4s, v11.4s, v17.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v20.16b, v20.16b, v12.16b
+	eor	v6.16b, v6.16b, v13.16b
+	eor	v7.16b, v7.16b, v10.16b
+	eor	v8.16b, v8.16b, v11.16b
+	eor	v5.16b, v5.16b, v14.16b
+
+	ushr	v9.4s, v5.4s, #25
+	sli	v9.4s, v5.4s, #7
+	ushr	v5.4s, v8.4s, #25
+	sli	v5.4s, v8.4s, #7
+	ushr	v8.4s, v7.4s, #25
+	sli	v8.4s, v7.4s, #7
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v20.4s, #25
+	sli	v6.4s, v20.4s, #7
+
+	ext	v9.16b, v9.16b, v9.16b, #12
+	ext	v14.16b, v14.16b, v14.16b, #8
+	ext	v19.16b, v19.16b, v19.16b, #4
+	subs	x6, x6, #1
+	b.ge	Lseal_main_loop_rounds
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	subs	x7, x7, #1
+	b.gt	Lseal_main_loop_rounds
+
+	eor	v20.16b, v20.16b, v20.16b //zero
+	not	v21.16b, v20.16b // -1
+	sub	v21.4s, v25.4s, v21.4s // Add +1
+	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
+	add	v19.4s, v19.4s, v20.4s
+
+	add	v15.4s, v15.4s, v25.4s
+	mov	x11, #5
+	dup	v20.4s, w11
+	add	v25.4s, v25.4s, v20.4s
+
+	zip1	v20.4s, v0.4s, v1.4s
+	zip2	v21.4s, v0.4s, v1.4s
+	zip1	v22.4s, v2.4s, v3.4s
+	zip2	v23.4s, v2.4s, v3.4s
+
+	zip1	v0.2d, v20.2d, v22.2d
+	zip2	v1.2d, v20.2d, v22.2d
+	zip1	v2.2d, v21.2d, v23.2d
+	zip2	v3.2d, v21.2d, v23.2d
+
+	zip1	v20.4s, v5.4s, v6.4s
+	zip2	v21.4s, v5.4s, v6.4s
+	zip1	v22.4s, v7.4s, v8.4s
+	zip2	v23.4s, v7.4s, v8.4s
+
+	zip1	v5.2d, v20.2d, v22.2d
+	zip2	v6.2d, v20.2d, v22.2d
+	zip1	v7.2d, v21.2d, v23.2d
+	zip2	v8.2d, v21.2d, v23.2d
+
+	zip1	v20.4s, v10.4s, v11.4s
+	zip2	v21.4s, v10.4s, v11.4s
+	zip1	v22.4s, v12.4s, v13.4s
+	zip2	v23.4s, v12.4s, v13.4s
+
+	zip1	v10.2d, v20.2d, v22.2d
+	zip2	v11.2d, v20.2d, v22.2d
+	zip1	v12.2d, v21.2d, v23.2d
+	zip2	v13.2d, v21.2d, v23.2d
+
+	zip1	v20.4s, v15.4s, v16.4s
+	zip2	v21.4s, v15.4s, v16.4s
+	zip1	v22.4s, v17.4s, v18.4s
+	zip2	v23.4s, v17.4s, v18.4s
+
+	zip1	v15.2d, v20.2d, v22.2d
+	zip2	v16.2d, v20.2d, v22.2d
+	zip1	v17.2d, v21.2d, v23.2d
+	zip2	v18.2d, v21.2d, v23.2d
+
+	add	v0.4s, v0.4s, v24.4s
+	add	v5.4s, v5.4s, v28.4s
+	add	v10.4s, v10.4s, v29.4s
+	add	v15.4s, v15.4s, v30.4s
+
+	add	v1.4s, v1.4s, v24.4s
+	add	v6.4s, v6.4s, v28.4s
+	add	v11.4s, v11.4s, v29.4s
+	add	v16.4s, v16.4s, v30.4s
+
+	add	v2.4s, v2.4s, v24.4s
+	add	v7.4s, v7.4s, v28.4s
+	add	v12.4s, v12.4s, v29.4s
+	add	v17.4s, v17.4s, v30.4s
+
+	add	v3.4s, v3.4s, v24.4s
+	add	v8.4s, v8.4s, v28.4s
+	add	v13.4s, v13.4s, v29.4s
+	add	v18.4s, v18.4s, v30.4s
+
+	add	v4.4s, v4.4s, v24.4s
+	add	v9.4s, v9.4s, v28.4s
+	add	v14.4s, v14.4s, v29.4s
+	add	v19.4s, v19.4s, v30.4s
+
+	cmp	x2, #320
+	b.le	Lseal_tail
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v0.16b
+	eor	v21.16b, v21.16b, v5.16b
+	eor	v22.16b, v22.16b, v10.16b
+	eor	v23.16b, v23.16b, v15.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v1.16b
+	eor	v21.16b, v21.16b, v6.16b
+	eor	v22.16b, v22.16b, v11.16b
+	eor	v23.16b, v23.16b, v16.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v2.16b
+	eor	v21.16b, v21.16b, v7.16b
+	eor	v22.16b, v22.16b, v12.16b
+	eor	v23.16b, v23.16b, v17.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v3.16b
+	eor	v21.16b, v21.16b, v8.16b
+	eor	v22.16b, v22.16b, v13.16b
+	eor	v23.16b, v23.16b, v18.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v4.16b
+	eor	v21.16b, v21.16b, v9.16b
+	eor	v22.16b, v22.16b, v14.16b
+	eor	v23.16b, v23.16b, v19.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	sub	x2, x2, #320
+
+	mov	x6, #0
+	mov	x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration
+
+	b	Lseal_main_loop
+
+Lseal_tail:
+    // This part of the function handles the storage and authentication of the last [0,320) bytes
+    // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data.
+	cmp	x2, #64
+	b.lt	Lseal_tail_64
+
+    // Store and authenticate 64B blocks per iteration
+	ld1	{v20.16b - v23.16b}, [x1], #64
+
+	eor	v20.16b, v20.16b, v0.16b
+	eor	v21.16b, v21.16b, v5.16b
+	eor	v22.16b, v22.16b, v10.16b
+	eor	v23.16b, v23.16b, v15.16b
+	mov	x11, v20.d[0]
+	mov	x12, v20.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	mov	x11, v21.d[0]
+	mov	x12, v21.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	mov	x11, v22.d[0]
+	mov	x12, v22.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	mov	x11, v23.d[0]
+	mov	x12, v23.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	st1	{v20.16b - v23.16b}, [x0], #64
+	sub	x2, x2, #64
+
+    // Shift the state left by 64 bytes for the next iteration of the loop
+	mov	v0.16b, v1.16b
+	mov	v5.16b, v6.16b
+	mov	v10.16b, v11.16b
+	mov	v15.16b, v16.16b
+
+	mov	v1.16b, v2.16b
+	mov	v6.16b, v7.16b
+	mov	v11.16b, v12.16b
+	mov	v16.16b, v17.16b
+
+	mov	v2.16b, v3.16b
+	mov	v7.16b, v8.16b
+	mov	v12.16b, v13.16b
+	mov	v17.16b, v18.16b
+
+	mov	v3.16b, v4.16b
+	mov	v8.16b, v9.16b
+	mov	v13.16b, v14.16b
+	mov	v18.16b, v19.16b
+
+	b	Lseal_tail
+
+Lseal_tail_64:
+	ldp	x3, x4, [x5, #48] // extra_in_ptr and extra_in_len
+
+    // Here we handle the last [0,64) bytes of plaintext
+	cmp	x2, #16
+	b.lt	Lseal_tail_16
+    // Each iteration encrypts and authenticates a 16B block
+	ld1	{v20.16b}, [x1], #16
+	eor	v20.16b, v20.16b, v0.16b
+	mov	x11, v20.d[0]
+	mov	x12, v20.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	st1	{v20.16b}, [x0], #16
+
+	sub	x2, x2, #16
+
+    // Shift the state left by 16 bytes for the next iteration of the loop
+	mov	v0.16b, v5.16b
+	mov	v5.16b, v10.16b
+	mov	v10.16b, v15.16b
+
+	b	Lseal_tail_64
+
+Lseal_tail_16:
+    // Here we handle the last [0,16) bytes of ciphertext that require a padded block
+	cbz	x2, Lseal_hash_extra
+
+	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in
+	eor	v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes
+	not	v22.16b, v20.16b
+
+	mov	x6, x2
+	add	x1, x1, x2
+
+	cbz	x4, Lseal_tail_16_compose // No extra data to pad with, zero padding
+
+	mov	x7, #16          // We need to load some extra_in first for padding
+	sub	x7, x7, x2
+	cmp	x4, x7
+	csel	x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register
+	mov	x12, x7
+	add	x3, x3, x7
+	sub	x4, x4, x7
+
+Lseal_tail16_compose_extra_in:
+	ext	v20.16b, v20.16b, v20.16b, #15
+	ldrb	w11, [x3, #-1]!
+	mov	v20.b[0], w11
+	subs	x7, x7, #1
+	b.gt	Lseal_tail16_compose_extra_in
+
+	add	x3, x3, x12
+
+Lseal_tail_16_compose:
+	ext	v20.16b, v20.16b, v20.16b, #15
+	ldrb	w11, [x1, #-1]!
+	mov	v20.b[0], w11
+	ext	v21.16b, v22.16b, v21.16b, #15
+	subs	x2, x2, #1
+	b.gt	Lseal_tail_16_compose
+
+	and	v0.16b, v0.16b, v21.16b
+	eor	v20.16b, v20.16b, v0.16b
+	mov	v21.16b, v20.16b
+
+Lseal_tail_16_store:
+	umov	w11, v20.b[0]
+	strb	w11, [x0], #1
+	ext	v20.16b, v20.16b, v20.16b, #1
+	subs	x6, x6, #1
+	b.gt	Lseal_tail_16_store
+
+    // Hash in the final ct block concatenated with extra_in
+	mov	x11, v21.d[0]
+	mov	x12, v21.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+
+Lseal_hash_extra:
+	cbz	x4, Lseal_finalize
+
+Lseal_hash_extra_loop:
+	cmp	x4, #16
+	b.lt	Lseal_hash_extra_tail
+	ld1	{v20.16b}, [x3], #16
+	mov	x11, v20.d[0]
+	mov	x12, v20.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	sub	x4, x4, #16
+	b	Lseal_hash_extra_loop
+
+Lseal_hash_extra_tail:
+	cbz	x4, Lseal_finalize
+	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext
+	add	x3, x3, x4
+
+Lseal_hash_extra_load:
+	ext	v20.16b, v20.16b, v20.16b, #15
+	ldrb	w11, [x3, #-1]!
+	mov	v20.b[0], w11
+	subs	x4, x4, #1
+	b.gt	Lseal_hash_extra_load
+
+    // Hash in the final padded extra_in block
+	mov	x11, v20.d[0]
+	mov	x12, v20.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+
+Lseal_finalize:
+	mov	x11, v31.d[0]
+	mov	x12, v31.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+    // Final reduction step
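+    // A sketch of what follows: conditionally subtract p = 2^130 - 5 (the
+    // csel instructions keep the subtracted value only when no borrow
+    // occurred), then add the s half of the one-time key (saved in v27) and
+    // store the 16-byte tag through x5.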
+	sub	x12, xzr, x15
+	orr	x13, xzr, #3
+	subs	x11, x8, #-5
+	sbcs	x12, x9, x12
+	sbcs	x13, x10, x13
+	csel	x8, x11, x8, cs
+	csel	x9, x12, x9, cs
+	csel	x10, x13, x10, cs
+	mov	x11, v27.d[0]
+	mov	x12, v27.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+
+	stp	x8, x9, [x5]
+
+	ldp	d8, d9, [sp, #16]
+	ldp	d10, d11, [sp, #32]
+	ldp	d12, d13, [sp, #48]
+	ldp	d14, d15, [sp, #64]
+.cfi_restore	b15
+.cfi_restore	b14
+.cfi_restore	b13
+.cfi_restore	b12
+.cfi_restore	b11
+.cfi_restore	b10
+.cfi_restore	b9
+.cfi_restore	b8
+	ldp	x29, x30, [sp], 80
+.cfi_restore	w29
+.cfi_restore	w30
+.cfi_def_cfa_offset	0
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+Lseal_128:
+    // On some architectures preparing 5 blocks for small buffers is wasteful
+	eor	v25.16b, v25.16b, v25.16b
+	mov	x11, #1
+	mov	v25.s[0], w11
+	mov	v0.16b, v24.16b
+	mov	v1.16b, v24.16b
+	mov	v2.16b, v24.16b
+	mov	v5.16b, v28.16b
+	mov	v6.16b, v28.16b
+	mov	v7.16b, v28.16b
+	mov	v10.16b, v29.16b
+	mov	v11.16b, v29.16b
+	mov	v12.16b, v29.16b
+	mov	v17.16b, v30.16b
+	add	v15.4s, v17.4s, v25.4s
+	add	v16.4s, v15.4s, v25.4s
+
+	mov	x6, #10
+
+Lseal_128_rounds:
+	add	v0.4s, v0.4s, v5.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v5.16b, v5.16b, v10.16b
+	eor	v6.16b, v6.16b, v11.16b
+	eor	v7.16b, v7.16b, v12.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	ushr	v5.4s, v6.4s, #20
+	sli	v5.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v5.4s
+	add	v2.4s, v2.4s, v6.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v20.16b, v20.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v6.16b, v6.16b, v12.16b
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v5.4s, #25
+	sli	v6.4s, v5.4s, #7
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+
+	ext	v5.16b, v5.16b, v5.16b, #4
+	ext	v6.16b, v6.16b, v6.16b, #4
+	ext	v7.16b, v7.16b, v7.16b, #4
+
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v11.16b, v11.16b, v11.16b, #8
+	ext	v12.16b, v12.16b, v12.16b, #8
+
+	ext	v15.16b, v15.16b, v15.16b, #12
+	ext	v16.16b, v16.16b, v16.16b, #12
+	ext	v17.16b, v17.16b, v17.16b, #12
+	add	v0.4s, v0.4s, v5.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v5.16b, v5.16b, v10.16b
+	eor	v6.16b, v6.16b, v11.16b
+	eor	v7.16b, v7.16b, v12.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	ushr	v5.4s, v6.4s, #20
+	sli	v5.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v5.4s
+	add	v2.4s, v2.4s, v6.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v20.16b, v20.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v6.16b, v6.16b, v12.16b
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v5.4s, #25
+	sli	v6.4s, v5.4s, #7
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+
+	ext	v5.16b, v5.16b, v5.16b, #12
+	ext	v6.16b, v6.16b, v6.16b, #12
+	ext	v7.16b, v7.16b, v7.16b, #12
+
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v11.16b, v11.16b, v11.16b, #8
+	ext	v12.16b, v12.16b, v12.16b, #8
+
+	ext	v15.16b, v15.16b, v15.16b, #4
+	ext	v16.16b, v16.16b, v16.16b, #4
+	ext	v17.16b, v17.16b, v17.16b, #4
+	subs	x6, x6, #1
+	b.hi	Lseal_128_rounds
+
+	add	v0.4s, v0.4s, v24.4s
+	add	v1.4s, v1.4s, v24.4s
+	add	v2.4s, v2.4s, v24.4s
+
+	add	v5.4s, v5.4s, v28.4s
+	add	v6.4s, v6.4s, v28.4s
+	add	v7.4s, v7.4s, v28.4s
+
+    // Only the first 32 bytes of the third block (counter = 0) are needed,
+    // so skip updating v12 and v17.
+	add	v10.4s, v10.4s, v29.4s
+	add	v11.4s, v11.4s, v29.4s
+
+	add	v30.4s, v30.4s, v25.4s
+	add	v15.4s, v15.4s, v30.4s
+	add	v30.4s, v30.4s, v25.4s
+	add	v16.4s, v16.4s, v30.4s
+
+	and	v2.16b, v2.16b, v27.16b
+	mov	x16, v2.d[0] // Move the R key to GPRs
+	mov	x17, v2.d[1]
+	mov	v27.16b, v7.16b // Store the S key
+
+	bl	Lpoly_hash_ad_internal
+	b	Lseal_tail
+.cfi_endproc
+
+
+/////////////////////////////////
+//
+// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data);
+//
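+// As used below: x0 is the destination (plaintext) buffer and x1 the
+// ciphertext source; the Poly1305 hash reads the ciphertext (via x3) before
+// it is decrypted.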
+.globl	chacha20_poly1305_open
+
+.def chacha20_poly1305_open
+   .type 32
+.endef
+.align	6
+chacha20_poly1305_open:
+	AARCH64_SIGN_LINK_REGISTER
+.cfi_startproc
+	stp	x29, x30, [sp, #-80]!
+.cfi_def_cfa_offset	80
+.cfi_offset	w30, -72
+.cfi_offset	w29, -80
+	mov	x29, sp
+    // We probably could do .cfi_def_cfa w29, 80 at this point, but since
+    // we don't actually use the frame pointer like that, it's probably not
+    // worth bothering.
+	stp	d8, d9, [sp, #16]
+	stp	d10, d11, [sp, #32]
+	stp	d12, d13, [sp, #48]
+	stp	d14, d15, [sp, #64]
+.cfi_offset	b15, -8
+.cfi_offset	b14, -16
+.cfi_offset	b13, -24
+.cfi_offset	b12, -32
+.cfi_offset	b11, -40
+.cfi_offset	b10, -48
+.cfi_offset	b9, -56
+.cfi_offset	b8, -64
+
+	adrp	x11, Lchacha20_consts
+	add	x11, x11, :lo12:Lchacha20_consts
+
+	ld1	{v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
+	ld1	{v28.16b - v30.16b}, [x5]
+
+	mov	x15, #1 // Prepare the Poly1305 state
+	mov	x8, #0
+	mov	x9, #0
+	mov	x10, #0
+
+	mov	v31.d[0], x4  // Store the input and aad lengths
+	mov	v31.d[1], x2
+
+	cmp	x2, #128
+	b.le	Lopen_128 // Optimization for smaller buffers
+
+    // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys
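+    // (Per RFC 8439, the first 32 bytes of the block-0 key stream form the
+    // one-time Poly1305 key: r is clamped below, s is kept in v27.)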
+	mov	v0.16b, v24.16b
+	mov	v5.16b, v28.16b
+	mov	v10.16b, v29.16b
+	mov	v15.16b, v30.16b
+
+	mov	x6, #10
+
+.align	5
+Lopen_init_rounds:
+	add	v0.4s, v0.4s, v5.4s
+	eor	v15.16b, v15.16b, v0.16b
+	rev32	v15.8h, v15.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v5.16b, v5.16b, v10.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	add	v0.4s, v0.4s, v20.4s
+	eor	v15.16b, v15.16b, v0.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v20.16b, v20.16b, v10.16b
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+	ext	v5.16b, v5.16b, v5.16b, #4
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v15.16b, v15.16b, v15.16b, #12
+	add	v0.4s, v0.4s, v5.4s
+	eor	v15.16b, v15.16b, v0.16b
+	rev32	v15.8h, v15.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v5.16b, v5.16b, v10.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	add	v0.4s, v0.4s, v20.4s
+	eor	v15.16b, v15.16b, v0.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v20.16b, v20.16b, v10.16b
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+	ext	v5.16b, v5.16b, v5.16b, #12
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v15.16b, v15.16b, v15.16b, #4
+	subs	x6, x6, #1
+	b.hi	Lopen_init_rounds
+
+	add	v0.4s, v0.4s, v24.4s
+	add	v5.4s, v5.4s, v28.4s
+
+	and	v0.16b, v0.16b, v27.16b
+	mov	x16, v0.d[0] // Move the R key to GPRs
+	mov	x17, v0.d[1]
+	mov	v27.16b, v5.16b // Store the S key
+
+	bl	Lpoly_hash_ad_internal
+
+Lopen_ad_done:
+	mov	x3, x1
+
+// Each iteration of the loop hashes 320 bytes and prepares a 320-byte key stream
+Lopen_main_loop:
+
+	cmp	x2, #192
+	b.lt	Lopen_tail
+
+	adrp	x11, Lchacha20_consts
+	add	x11, x11, :lo12:Lchacha20_consts
+
+	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
+	mov	v4.16b, v24.16b
+
+	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
+	mov	v9.16b, v28.16b
+
+	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
+	mov	v14.16b, v29.16b
+
+	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
+	sub	x5, x5, #32
+	add	v15.4s, v15.4s, v25.4s
+	mov	v19.16b, v30.16b
+
+	eor	v20.16b, v20.16b, v20.16b //zero
+	not	v21.16b, v20.16b // -1
+	sub	v21.4s, v25.4s, v21.4s // Add +1
+	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
+	add	v19.4s, v19.4s, v20.4s
+
+	lsr	x4, x2, #4 // How many whole blocks we have to hash; this will always be at least 12
+	sub	x4, x4, #10
+
+	mov	x7, #10
+	subs	x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash
+	csel	x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full
+
+	cbz	x7, Lopen_main_loop_rounds_short
+
+.align	5
+Lopen_main_loop_rounds:
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+Lopen_main_loop_rounds_short:
+	add	v0.4s, v0.4s, v5.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	add	v3.4s, v3.4s, v8.4s
+	add	v4.4s, v4.4s, v9.4s
+
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	eor	v18.16b, v18.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+	rev32	v18.8h, v18.8h
+	rev32	v19.8h, v19.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	add	v13.4s, v13.4s, v18.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v5.16b, v5.16b, v10.16b
+	eor	v6.16b, v6.16b, v11.16b
+	eor	v7.16b, v7.16b, v12.16b
+	eor	v8.16b, v8.16b, v13.16b
+	eor	v9.16b, v9.16b, v14.16b
+
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	ushr	v5.4s, v6.4s, #20
+	sli	v5.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+	ushr	v7.4s, v8.4s, #20
+	sli	v7.4s, v8.4s, #12
+	ushr	v8.4s, v9.4s, #20
+	sli	v8.4s, v9.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v5.4s
+	add	v2.4s, v2.4s, v6.4s
+	add	v3.4s, v3.4s, v7.4s
+	add	v4.4s, v4.4s, v8.4s
+
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	eor	v18.16b, v18.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+	tbl	v18.16b, {v18.16b}, v26.16b
+	tbl	v19.16b, {v19.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	add	v13.4s, v13.4s, v18.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v20.16b, v20.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v6.16b, v6.16b, v12.16b
+	eor	v7.16b, v7.16b, v13.16b
+	eor	v8.16b, v8.16b, v14.16b
+
+	ushr	v9.4s, v8.4s, #25
+	sli	v9.4s, v8.4s, #7
+	ushr	v8.4s, v7.4s, #25
+	sli	v8.4s, v7.4s, #7
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v5.4s, #25
+	sli	v6.4s, v5.4s, #7
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+
+	ext	v9.16b, v9.16b, v9.16b, #4
+	ext	v14.16b, v14.16b, v14.16b, #8
+	ext	v19.16b, v19.16b, v19.16b, #12
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	add	v0.4s, v0.4s, v6.4s
+	add	v1.4s, v1.4s, v7.4s
+	add	v2.4s, v2.4s, v8.4s
+	add	v3.4s, v3.4s, v5.4s
+	add	v4.4s, v4.4s, v9.4s
+
+	eor	v18.16b, v18.16b, v0.16b
+	eor	v15.16b, v15.16b, v1.16b
+	eor	v16.16b, v16.16b, v2.16b
+	eor	v17.16b, v17.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	rev32	v18.8h, v18.8h
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+	rev32	v19.8h, v19.8h
+
+	add	v12.4s, v12.4s, v18.4s
+	add	v13.4s, v13.4s, v15.4s
+	add	v10.4s, v10.4s, v16.4s
+	add	v11.4s, v11.4s, v17.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v6.16b, v6.16b, v12.16b
+	eor	v7.16b, v7.16b, v13.16b
+	eor	v8.16b, v8.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v9.16b, v9.16b, v14.16b
+
+	ushr	v20.4s, v6.4s, #20
+	sli	v20.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+	ushr	v7.4s, v8.4s, #20
+	sli	v7.4s, v8.4s, #12
+	ushr	v8.4s, v5.4s, #20
+	sli	v8.4s, v5.4s, #12
+	ushr	v5.4s, v9.4s, #20
+	sli	v5.4s, v9.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	add	v3.4s, v3.4s, v8.4s
+	add	v4.4s, v4.4s, v5.4s
+
+	eor	v18.16b, v18.16b, v0.16b
+	eor	v15.16b, v15.16b, v1.16b
+	eor	v16.16b, v16.16b, v2.16b
+	eor	v17.16b, v17.16b, v3.16b
+	eor	v19.16b, v19.16b, v4.16b
+
+	tbl	v18.16b, {v18.16b}, v26.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+	tbl	v19.16b, {v19.16b}, v26.16b
+
+	add	v12.4s, v12.4s, v18.4s
+	add	v13.4s, v13.4s, v15.4s
+	add	v10.4s, v10.4s, v16.4s
+	add	v11.4s, v11.4s, v17.4s
+	add	v14.4s, v14.4s, v19.4s
+
+	eor	v20.16b, v20.16b, v12.16b
+	eor	v6.16b, v6.16b, v13.16b
+	eor	v7.16b, v7.16b, v10.16b
+	eor	v8.16b, v8.16b, v11.16b
+	eor	v5.16b, v5.16b, v14.16b
+
+	ushr	v9.4s, v5.4s, #25
+	sli	v9.4s, v5.4s, #7
+	ushr	v5.4s, v8.4s, #25
+	sli	v5.4s, v8.4s, #7
+	ushr	v8.4s, v7.4s, #25
+	sli	v8.4s, v7.4s, #7
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v20.4s, #25
+	sli	v6.4s, v20.4s, #7
+
+	ext	v9.16b, v9.16b, v9.16b, #12
+	ext	v14.16b, v14.16b, v14.16b, #8
+	ext	v19.16b, v19.16b, v19.16b, #4
+	subs	x7, x7, #1
+	b.gt	Lopen_main_loop_rounds
+	subs	x6, x6, #1
+	b.ge	Lopen_main_loop_rounds_short
+
+	eor	v20.16b, v20.16b, v20.16b //zero
+	not	v21.16b, v20.16b // -1
+	sub	v21.4s, v25.4s, v21.4s // Add +1
+	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
+	add	v19.4s, v19.4s, v20.4s
+
+	add	v15.4s, v15.4s, v25.4s
+	mov	x11, #5
+	dup	v20.4s, w11
+	add	v25.4s, v25.4s, v20.4s
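+    // v25 appears to hold the per-lane counter offsets for blocks 0-3; the
+    // fifth block's counter (lane 0 of v19) was bumped by v25.s[3]+1 just
+    // above, and v25 itself advances by 5 here for the next five-block batch.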
+
+	zip1	v20.4s, v0.4s, v1.4s
+	zip2	v21.4s, v0.4s, v1.4s
+	zip1	v22.4s, v2.4s, v3.4s
+	zip2	v23.4s, v2.4s, v3.4s
+
+	zip1	v0.2d, v20.2d, v22.2d
+	zip2	v1.2d, v20.2d, v22.2d
+	zip1	v2.2d, v21.2d, v23.2d
+	zip2	v3.2d, v21.2d, v23.2d
+
+	zip1	v20.4s, v5.4s, v6.4s
+	zip2	v21.4s, v5.4s, v6.4s
+	zip1	v22.4s, v7.4s, v8.4s
+	zip2	v23.4s, v7.4s, v8.4s
+
+	zip1	v5.2d, v20.2d, v22.2d
+	zip2	v6.2d, v20.2d, v22.2d
+	zip1	v7.2d, v21.2d, v23.2d
+	zip2	v8.2d, v21.2d, v23.2d
+
+	zip1	v20.4s, v10.4s, v11.4s
+	zip2	v21.4s, v10.4s, v11.4s
+	zip1	v22.4s, v12.4s, v13.4s
+	zip2	v23.4s, v12.4s, v13.4s
+
+	zip1	v10.2d, v20.2d, v22.2d
+	zip2	v11.2d, v20.2d, v22.2d
+	zip1	v12.2d, v21.2d, v23.2d
+	zip2	v13.2d, v21.2d, v23.2d
+
+	zip1	v20.4s, v15.4s, v16.4s
+	zip2	v21.4s, v15.4s, v16.4s
+	zip1	v22.4s, v17.4s, v18.4s
+	zip2	v23.4s, v17.4s, v18.4s
+
+	zip1	v15.2d, v20.2d, v22.2d
+	zip2	v16.2d, v20.2d, v22.2d
+	zip1	v17.2d, v21.2d, v23.2d
+	zip2	v18.2d, v21.2d, v23.2d
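+    // The zip1/zip2 ladders above are 4x4 transposes of 32-bit lanes. During
+    // the rounds, blocks 0-3 are computed SIMD-style, one state word per
+    // register with one block per lane, while the fifth block (v4/v9/v14/v19)
+    // is kept row-per-register. Transposing regroups blocks 0-3 so that, for
+    // example, v0/v5/v10/v15 hold the four rows of the first block, matching
+    // the row-wise feed-forward additions below.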
+
+	add	v0.4s, v0.4s, v24.4s
+	add	v5.4s, v5.4s, v28.4s
+	add	v10.4s, v10.4s, v29.4s
+	add	v15.4s, v15.4s, v30.4s
+
+	add	v1.4s, v1.4s, v24.4s
+	add	v6.4s, v6.4s, v28.4s
+	add	v11.4s, v11.4s, v29.4s
+	add	v16.4s, v16.4s, v30.4s
+
+	add	v2.4s, v2.4s, v24.4s
+	add	v7.4s, v7.4s, v28.4s
+	add	v12.4s, v12.4s, v29.4s
+	add	v17.4s, v17.4s, v30.4s
+
+	add	v3.4s, v3.4s, v24.4s
+	add	v8.4s, v8.4s, v28.4s
+	add	v13.4s, v13.4s, v29.4s
+	add	v18.4s, v18.4s, v30.4s
+
+	add	v4.4s, v4.4s, v24.4s
+	add	v9.4s, v9.4s, v28.4s
+	add	v14.4s, v14.4s, v29.4s
+	add	v19.4s, v19.4s, v30.4s
+
+    // We can always safely store 192 bytes
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v0.16b
+	eor	v21.16b, v21.16b, v5.16b
+	eor	v22.16b, v22.16b, v10.16b
+	eor	v23.16b, v23.16b, v15.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v1.16b
+	eor	v21.16b, v21.16b, v6.16b
+	eor	v22.16b, v22.16b, v11.16b
+	eor	v23.16b, v23.16b, v16.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v2.16b
+	eor	v21.16b, v21.16b, v7.16b
+	eor	v22.16b, v22.16b, v12.16b
+	eor	v23.16b, v23.16b, v17.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	sub	x2, x2, #192
+
+	mov	v0.16b, v3.16b
+	mov	v5.16b, v8.16b
+	mov	v10.16b, v13.16b
+	mov	v15.16b, v18.16b
+
+	cmp	x2, #64
+	b.lt	Lopen_tail_64_store
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v3.16b
+	eor	v21.16b, v21.16b, v8.16b
+	eor	v22.16b, v22.16b, v13.16b
+	eor	v23.16b, v23.16b, v18.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	sub	x2, x2, #64
+
+	mov	v0.16b, v4.16b
+	mov	v5.16b, v9.16b
+	mov	v10.16b, v14.16b
+	mov	v15.16b, v19.16b
+
+	cmp	x2, #64
+	b.lt	Lopen_tail_64_store
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+	eor	v20.16b, v20.16b, v4.16b
+	eor	v21.16b, v21.16b, v9.16b
+	eor	v22.16b, v22.16b, v14.16b
+	eor	v23.16b, v23.16b, v19.16b
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	sub	x2, x2, #64
+	b	Lopen_main_loop
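+    // The main loop appears to be entered only while more than 192 bytes
+    // remain (smaller tails go through Lopen_tail), so the first three blocks
+    // are stored unconditionally; the fourth and fifth blocks are stored only
+    // while at least 64 bytes are left, and otherwise the prepared keystream
+    // sits in v0/v5/v10/v15 for Lopen_tail_64_store to consume.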
+
+Lopen_tail:
+
+	cbz	x2, Lopen_finalize
+
+	lsr	x4, x2, #4 // How many whole blocks we have to hash
+
+	cmp	x2, #64
+	b.le	Lopen_tail_64
+	cmp	x2, #128
+	b.le	Lopen_tail_128
+
+Lopen_tail_192:
+     // We need three more blocks
+	mov	v0.16b, v24.16b
+	mov	v1.16b, v24.16b
+	mov	v2.16b, v24.16b
+	mov	v5.16b, v28.16b
+	mov	v6.16b, v28.16b
+	mov	v7.16b, v28.16b
+	mov	v10.16b, v29.16b
+	mov	v11.16b, v29.16b
+	mov	v12.16b, v29.16b
+	mov	v15.16b, v30.16b
+	mov	v16.16b, v30.16b
+	mov	v17.16b, v30.16b
+	eor	v23.16b, v23.16b, v23.16b
+	eor	v21.16b, v21.16b, v21.16b
+	ins	v23.s[0], v25.s[0]
+	ins	v21.d[0], x15
+
+	add	v22.4s, v23.4s, v21.4s
+	add	v21.4s, v22.4s, v21.4s
+
+	add	v15.4s, v15.4s, v21.4s
+	add	v16.4s, v16.4s, v23.4s
+	add	v17.4s, v17.4s, v22.4s
+
+	mov	x7, #10
+	subs	x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash
+	csel	x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing
+	sub	x4, x4, x7
+
+	cbz	x7, Lopen_tail_192_rounds_no_hash
+
+Lopen_tail_192_rounds:
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+Lopen_tail_192_rounds_no_hash:
+	add	v0.4s, v0.4s, v5.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v5.16b, v5.16b, v10.16b
+	eor	v6.16b, v6.16b, v11.16b
+	eor	v7.16b, v7.16b, v12.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	ushr	v5.4s, v6.4s, #20
+	sli	v5.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v5.4s
+	add	v2.4s, v2.4s, v6.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v20.16b, v20.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v6.16b, v6.16b, v12.16b
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v5.4s, #25
+	sli	v6.4s, v5.4s, #7
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+
+	ext	v5.16b, v5.16b, v5.16b, #4
+	ext	v6.16b, v6.16b, v6.16b, #4
+	ext	v7.16b, v7.16b, v7.16b, #4
+
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v11.16b, v11.16b, v11.16b, #8
+	ext	v12.16b, v12.16b, v12.16b, #8
+
+	ext	v15.16b, v15.16b, v15.16b, #12
+	ext	v16.16b, v16.16b, v16.16b, #12
+	ext	v17.16b, v17.16b, v17.16b, #12
+	add	v0.4s, v0.4s, v5.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v5.16b, v5.16b, v10.16b
+	eor	v6.16b, v6.16b, v11.16b
+	eor	v7.16b, v7.16b, v12.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	ushr	v5.4s, v6.4s, #20
+	sli	v5.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v5.4s
+	add	v2.4s, v2.4s, v6.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v20.16b, v20.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v6.16b, v6.16b, v12.16b
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v5.4s, #25
+	sli	v6.4s, v5.4s, #7
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+
+	ext	v5.16b, v5.16b, v5.16b, #12
+	ext	v6.16b, v6.16b, v6.16b, #12
+	ext	v7.16b, v7.16b, v7.16b, #12
+
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v11.16b, v11.16b, v11.16b, #8
+	ext	v12.16b, v12.16b, v12.16b, #8
+
+	ext	v15.16b, v15.16b, v15.16b, #4
+	ext	v16.16b, v16.16b, v16.16b, #4
+	ext	v17.16b, v17.16b, v17.16b, #4
+	subs	x7, x7, #1
+	b.gt	Lopen_tail_192_rounds
+	subs	x6, x6, #1
+	b.ge	Lopen_tail_192_rounds_no_hash
+
+    // We hashed at most 160 bytes; up to 32 bytes may still be left
+Lopen_tail_192_hash:
+	cbz	x4, Lopen_tail_192_hash_done
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	sub	x4, x4, #1
+	b	Lopen_tail_192_hash
+
+Lopen_tail_192_hash_done:
+
+	add	v0.4s, v0.4s, v24.4s
+	add	v1.4s, v1.4s, v24.4s
+	add	v2.4s, v2.4s, v24.4s
+	add	v5.4s, v5.4s, v28.4s
+	add	v6.4s, v6.4s, v28.4s
+	add	v7.4s, v7.4s, v28.4s
+	add	v10.4s, v10.4s, v29.4s
+	add	v11.4s, v11.4s, v29.4s
+	add	v12.4s, v12.4s, v29.4s
+	add	v15.4s, v15.4s, v30.4s
+	add	v16.4s, v16.4s, v30.4s
+	add	v17.4s, v17.4s, v30.4s
+
+	add	v15.4s, v15.4s, v21.4s
+	add	v16.4s, v16.4s, v23.4s
+	add	v17.4s, v17.4s, v22.4s
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+
+	eor	v20.16b, v20.16b, v1.16b
+	eor	v21.16b, v21.16b, v6.16b
+	eor	v22.16b, v22.16b, v11.16b
+	eor	v23.16b, v23.16b, v16.16b
+
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+
+	eor	v20.16b, v20.16b, v2.16b
+	eor	v21.16b, v21.16b, v7.16b
+	eor	v22.16b, v22.16b, v12.16b
+	eor	v23.16b, v23.16b, v17.16b
+
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	sub	x2, x2, #128
+	b	Lopen_tail_64_store
+
+Lopen_tail_128:
+     // We need two more blocks
+	mov	v0.16b, v24.16b
+	mov	v1.16b, v24.16b
+	mov	v5.16b, v28.16b
+	mov	v6.16b, v28.16b
+	mov	v10.16b, v29.16b
+	mov	v11.16b, v29.16b
+	mov	v15.16b, v30.16b
+	mov	v16.16b, v30.16b
+	eor	v23.16b, v23.16b, v23.16b
+	eor	v22.16b, v22.16b, v22.16b
+	ins	v23.s[0], v25.s[0]
+	ins	v22.d[0], x15
+	add	v22.4s, v22.4s, v23.4s
+
+	add	v15.4s, v15.4s, v22.4s
+	add	v16.4s, v16.4s, v23.4s
+
+	mov	x6, #10
+	sub	x6, x6, x4
+
+Lopen_tail_128_rounds:
+	add	v0.4s, v0.4s, v5.4s
+	eor	v15.16b, v15.16b, v0.16b
+	rev32	v15.8h, v15.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v5.16b, v5.16b, v10.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	add	v0.4s, v0.4s, v20.4s
+	eor	v15.16b, v15.16b, v0.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v20.16b, v20.16b, v10.16b
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+	ext	v5.16b, v5.16b, v5.16b, #4
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v15.16b, v15.16b, v15.16b, #12
+	add	v1.4s, v1.4s, v6.4s
+	eor	v16.16b, v16.16b, v1.16b
+	rev32	v16.8h, v16.8h
+
+	add	v11.4s, v11.4s, v16.4s
+	eor	v6.16b, v6.16b, v11.16b
+	ushr	v20.4s, v6.4s, #20
+	sli	v20.4s, v6.4s, #12
+	add	v1.4s, v1.4s, v20.4s
+	eor	v16.16b, v16.16b, v1.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+
+	add	v11.4s, v11.4s, v16.4s
+	eor	v20.16b, v20.16b, v11.16b
+	ushr	v6.4s, v20.4s, #25
+	sli	v6.4s, v20.4s, #7
+	ext	v6.16b, v6.16b, v6.16b, #4
+	ext	v11.16b, v11.16b, v11.16b, #8
+	ext	v16.16b, v16.16b, v16.16b, #12
+	add	v0.4s, v0.4s, v5.4s
+	eor	v15.16b, v15.16b, v0.16b
+	rev32	v15.8h, v15.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v5.16b, v5.16b, v10.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	add	v0.4s, v0.4s, v20.4s
+	eor	v15.16b, v15.16b, v0.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v20.16b, v20.16b, v10.16b
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+	ext	v5.16b, v5.16b, v5.16b, #12
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v15.16b, v15.16b, v15.16b, #4
+	add	v1.4s, v1.4s, v6.4s
+	eor	v16.16b, v16.16b, v1.16b
+	rev32	v16.8h, v16.8h
+
+	add	v11.4s, v11.4s, v16.4s
+	eor	v6.16b, v6.16b, v11.16b
+	ushr	v20.4s, v6.4s, #20
+	sli	v20.4s, v6.4s, #12
+	add	v1.4s, v1.4s, v20.4s
+	eor	v16.16b, v16.16b, v1.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+
+	add	v11.4s, v11.4s, v16.4s
+	eor	v20.16b, v20.16b, v11.16b
+	ushr	v6.4s, v20.4s, #25
+	sli	v6.4s, v20.4s, #7
+	ext	v6.16b, v6.16b, v6.16b, #12
+	ext	v11.16b, v11.16b, v11.16b, #8
+	ext	v16.16b, v16.16b, v16.16b, #4
+	subs	x6, x6, #1
+	b.gt	Lopen_tail_128_rounds
+	cbz	x4, Lopen_tail_128_rounds_done
+	subs	x4, x4, #1
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	b	Lopen_tail_128_rounds
+
+Lopen_tail_128_rounds_done:
+	add	v0.4s, v0.4s, v24.4s
+	add	v1.4s, v1.4s, v24.4s
+	add	v5.4s, v5.4s, v28.4s
+	add	v6.4s, v6.4s, v28.4s
+	add	v10.4s, v10.4s, v29.4s
+	add	v11.4s, v11.4s, v29.4s
+	add	v15.4s, v15.4s, v30.4s
+	add	v16.4s, v16.4s, v30.4s
+	add	v15.4s, v15.4s, v22.4s
+	add	v16.4s, v16.4s, v23.4s
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+
+	eor	v20.16b, v20.16b, v1.16b
+	eor	v21.16b, v21.16b, v6.16b
+	eor	v22.16b, v22.16b, v11.16b
+	eor	v23.16b, v23.16b, v16.16b
+
+	st1	{v20.16b - v23.16b}, [x0], #64
+	sub	x2, x2, #64
+
+	b	Lopen_tail_64_store
+
+Lopen_tail_64:
+    // We just need a single block
+	mov	v0.16b, v24.16b
+	mov	v5.16b, v28.16b
+	mov	v10.16b, v29.16b
+	mov	v15.16b, v30.16b
+	eor	v23.16b, v23.16b, v23.16b
+	ins	v23.s[0], v25.s[0]
+	add	v15.4s, v15.4s, v23.4s
+
+	mov	x6, #10
+	sub	x6, x6, x4
+
+Lopen_tail_64_rounds:
+	add	v0.4s, v0.4s, v5.4s
+	eor	v15.16b, v15.16b, v0.16b
+	rev32	v15.8h, v15.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v5.16b, v5.16b, v10.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	add	v0.4s, v0.4s, v20.4s
+	eor	v15.16b, v15.16b, v0.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v20.16b, v20.16b, v10.16b
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+	ext	v5.16b, v5.16b, v5.16b, #4
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v15.16b, v15.16b, v15.16b, #12
+	add	v0.4s, v0.4s, v5.4s
+	eor	v15.16b, v15.16b, v0.16b
+	rev32	v15.8h, v15.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v5.16b, v5.16b, v10.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	add	v0.4s, v0.4s, v20.4s
+	eor	v15.16b, v15.16b, v0.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	eor	v20.16b, v20.16b, v10.16b
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+	ext	v5.16b, v5.16b, v5.16b, #12
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v15.16b, v15.16b, v15.16b, #4
+	subs	x6, x6, #1
+	b.gt	Lopen_tail_64_rounds
+	cbz	x4, Lopen_tail_64_rounds_done
+	subs	x4, x4, #1
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	b	Lopen_tail_64_rounds
+
+Lopen_tail_64_rounds_done:
+	add	v0.4s, v0.4s, v24.4s
+	add	v5.4s, v5.4s, v28.4s
+	add	v10.4s, v10.4s, v29.4s
+	add	v15.4s, v15.4s, v30.4s
+	add	v15.4s, v15.4s, v23.4s
+
+Lopen_tail_64_store:
+	cmp	x2, #16
+	b.lt	Lopen_tail_16
+
+	ld1	{v20.16b}, [x1], #16
+	eor	v20.16b, v20.16b, v0.16b
+	st1	{v20.16b}, [x0], #16
+	mov	v0.16b, v5.16b
+	mov	v5.16b, v10.16b
+	mov	v10.16b, v15.16b
+	sub	x2, x2, #16
+	b	Lopen_tail_64_store
+
+Lopen_tail_16:
+    // Here we handle the last [0,16) bytes that require a padded block
+	cbz	x2, Lopen_finalize
+
+	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext
+	eor	v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask
+	not	v22.16b, v20.16b
+
+	add	x7, x1, x2
+	mov	x6, x2
+
+Lopen_tail_16_compose:
+	ext	v20.16b, v20.16b, v20.16b, #15
+	ldrb	w11, [x7, #-1]!
+	mov	v20.b[0], w11
+	ext	v21.16b, v22.16b, v21.16b, #15
+	subs	x2, x2, #1
+	b.gt	Lopen_tail_16_compose
+
+	and	v20.16b, v20.16b, v21.16b
+    // Hash in the final padded block
+	mov	x11, v20.d[0]
+	mov	x12, v20.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	eor	v20.16b, v20.16b, v0.16b
+
+Lopen_tail_16_store:
+	umov	w11, v20.b[0]
+	strb	w11, [x0], #1
+	ext	v20.16b, v20.16b, v20.16b, #1
+	subs	x6, x6, #1
+	b.gt	Lopen_tail_16_store
+
+Lopen_finalize:
+	mov	x11, v31.d[0]
+	mov	x12, v31.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+    // Final reduction step
+	sub	x12, xzr, x15
+	orr	x13, xzr, #3
+	subs	x11, x8, #-5
+	sbcs	x12, x9, x12
+	sbcs	x13, x10, x13
+	csel	x8, x11, x8, cs
+	csel	x9, x12, x9, cs
+	csel	x10, x13, x10, cs
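+    // The subtraction above computes acc - p with p = 2^130 - 5 written in
+    // 64-bit limbs as [3 : -1 : -5] (x15 again appears to hold 1, so x12 is
+    // -1); the csel instructions keep the reduced value only when no borrow
+    // occurred, i.e. when acc >= p. The s half of the key, kept in v27, is
+    // then added, and since only the low 128 bits are stored as the tag, the
+    // stray pad-bit add into x10 below is harmless.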
+	mov	x11, v27.d[0]
+	mov	x12, v27.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+
+	stp	x8, x9, [x5]
+
+	ldp	d8, d9, [sp, #16]
+	ldp	d10, d11, [sp, #32]
+	ldp	d12, d13, [sp, #48]
+	ldp	d14, d15, [sp, #64]
+.cfi_restore	b15
+.cfi_restore	b14
+.cfi_restore	b13
+.cfi_restore	b12
+.cfi_restore	b11
+.cfi_restore	b10
+.cfi_restore	b9
+.cfi_restore	b8
+	ldp	x29, x30, [sp], 80
+.cfi_restore	w29
+.cfi_restore	w30
+.cfi_def_cfa_offset	0
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+Lopen_128:
+    // On some architectures preparing 5 blocks for small buffers is wasteful
+	eor	v25.16b, v25.16b, v25.16b
+	mov	x11, #1
+	mov	v25.s[0], w11
+	mov	v0.16b, v24.16b
+	mov	v1.16b, v24.16b
+	mov	v2.16b, v24.16b
+	mov	v5.16b, v28.16b
+	mov	v6.16b, v28.16b
+	mov	v7.16b, v28.16b
+	mov	v10.16b, v29.16b
+	mov	v11.16b, v29.16b
+	mov	v12.16b, v29.16b
+	mov	v17.16b, v30.16b
+	add	v15.4s, v17.4s, v25.4s
+	add	v16.4s, v15.4s, v25.4s
+
+	mov	x6, #10
+
+Lopen_128_rounds:
+	add	v0.4s, v0.4s, v5.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v5.16b, v5.16b, v10.16b
+	eor	v6.16b, v6.16b, v11.16b
+	eor	v7.16b, v7.16b, v12.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	ushr	v5.4s, v6.4s, #20
+	sli	v5.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v5.4s
+	add	v2.4s, v2.4s, v6.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v20.16b, v20.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v6.16b, v6.16b, v12.16b
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v5.4s, #25
+	sli	v6.4s, v5.4s, #7
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+
+	ext	v5.16b, v5.16b, v5.16b, #4
+	ext	v6.16b, v6.16b, v6.16b, #4
+	ext	v7.16b, v7.16b, v7.16b, #4
+
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v11.16b, v11.16b, v11.16b, #8
+	ext	v12.16b, v12.16b, v12.16b, #8
+
+	ext	v15.16b, v15.16b, v15.16b, #12
+	ext	v16.16b, v16.16b, v16.16b, #12
+	ext	v17.16b, v17.16b, v17.16b, #12
+	add	v0.4s, v0.4s, v5.4s
+	add	v1.4s, v1.4s, v6.4s
+	add	v2.4s, v2.4s, v7.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	rev32	v15.8h, v15.8h
+	rev32	v16.8h, v16.8h
+	rev32	v17.8h, v17.8h
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v5.16b, v5.16b, v10.16b
+	eor	v6.16b, v6.16b, v11.16b
+	eor	v7.16b, v7.16b, v12.16b
+	ushr	v20.4s, v5.4s, #20
+	sli	v20.4s, v5.4s, #12
+	ushr	v5.4s, v6.4s, #20
+	sli	v5.4s, v6.4s, #12
+	ushr	v6.4s, v7.4s, #20
+	sli	v6.4s, v7.4s, #12
+
+	add	v0.4s, v0.4s, v20.4s
+	add	v1.4s, v1.4s, v5.4s
+	add	v2.4s, v2.4s, v6.4s
+	eor	v15.16b, v15.16b, v0.16b
+	eor	v16.16b, v16.16b, v1.16b
+	eor	v17.16b, v17.16b, v2.16b
+	tbl	v15.16b, {v15.16b}, v26.16b
+	tbl	v16.16b, {v16.16b}, v26.16b
+	tbl	v17.16b, {v17.16b}, v26.16b
+
+	add	v10.4s, v10.4s, v15.4s
+	add	v11.4s, v11.4s, v16.4s
+	add	v12.4s, v12.4s, v17.4s
+	eor	v20.16b, v20.16b, v10.16b
+	eor	v5.16b, v5.16b, v11.16b
+	eor	v6.16b, v6.16b, v12.16b
+	ushr	v7.4s, v6.4s, #25
+	sli	v7.4s, v6.4s, #7
+	ushr	v6.4s, v5.4s, #25
+	sli	v6.4s, v5.4s, #7
+	ushr	v5.4s, v20.4s, #25
+	sli	v5.4s, v20.4s, #7
+
+	ext	v5.16b, v5.16b, v5.16b, #12
+	ext	v6.16b, v6.16b, v6.16b, #12
+	ext	v7.16b, v7.16b, v7.16b, #12
+
+	ext	v10.16b, v10.16b, v10.16b, #8
+	ext	v11.16b, v11.16b, v11.16b, #8
+	ext	v12.16b, v12.16b, v12.16b, #8
+
+	ext	v15.16b, v15.16b, v15.16b, #4
+	ext	v16.16b, v16.16b, v16.16b, #4
+	ext	v17.16b, v17.16b, v17.16b, #4
+	subs	x6, x6, #1
+	b.hi	Lopen_128_rounds
+
+	add	v0.4s, v0.4s, v24.4s
+	add	v1.4s, v1.4s, v24.4s
+	add	v2.4s, v2.4s, v24.4s
+
+	add	v5.4s, v5.4s, v28.4s
+	add	v6.4s, v6.4s, v28.4s
+	add	v7.4s, v7.4s, v28.4s
+
+	add	v10.4s, v10.4s, v29.4s
+	add	v11.4s, v11.4s, v29.4s
+
+	add	v30.4s, v30.4s, v25.4s
+	add	v15.4s, v15.4s, v30.4s
+	add	v30.4s, v30.4s, v25.4s
+	add	v16.4s, v16.4s, v30.4s
+
+	and	v2.16b, v2.16b, v27.16b
+	mov	x16, v2.d[0] // Move the R key to GPRs
+	mov	x17, v2.d[1]
+	mov	v27.16b, v7.16b // Store the S key
+
+	bl	Lpoly_hash_ad_internal
+
+Lopen_128_store:
+	cmp	x2, #64
+	b.lt	Lopen_128_store_64
+
+	ld1	{v20.16b - v23.16b}, [x1], #64
+
+	mov	x11, v20.d[0]
+	mov	x12, v20.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	mov	x11, v21.d[0]
+	mov	x12, v21.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	mov	x11, v22.d[0]
+	mov	x12, v22.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	mov	x11, v23.d[0]
+	mov	x12, v23.d[1]
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+
+	eor	v20.16b, v20.16b, v0.16b
+	eor	v21.16b, v21.16b, v5.16b
+	eor	v22.16b, v22.16b, v10.16b
+	eor	v23.16b, v23.16b, v15.16b
+
+	st1	{v20.16b - v23.16b}, [x0], #64
+
+	sub	x2, x2, #64
+
+	mov	v0.16b, v1.16b
+	mov	v5.16b, v6.16b
+	mov	v10.16b, v11.16b
+	mov	v15.16b, v16.16b
+
+Lopen_128_store_64:
+
+	lsr	x4, x2, #4
+	mov	x3, x1
+
+Lopen_128_hash_64:
+	cbz	x4, Lopen_tail_64_store
+	ldp	x11, x12, [x3], 16
+	adds	x8, x8, x11
+	adcs	x9, x9, x12
+	adc	x10, x10, x15
+	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+	umulh	x12, x8, x16
+	mul	x13, x9, x16
+	umulh	x14, x9, x16
+	adds	x12, x12, x13
+	mul	x13, x10, x16
+	adc	x13, x13, x14
+	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+	umulh	x8, x8, x17
+	adds	x12, x12, x14
+	mul	x14, x9, x17
+	umulh	x9, x9, x17
+	adcs	x14, x14, x8
+	mul	x10, x10, x17
+	adc	x10, x10, x9
+	adds	x13, x13, x14
+	adc	x14, x10, xzr
+	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
+	and	x8, x13, #-4
+	extr	x13, x14, x13, #2
+	adds	x8, x8, x11
+	lsr	x11, x14, #2
+	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
+	adds	x8, x8, x13
+	adcs	x9, x9, x12
+	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
+	sub	x4, x4, #1
+	b	Lopen_128_hash_64
+.cfi_endproc
+
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/crypto/chacha20_poly1305_x86_64-apple.S b/gen/crypto/chacha20_poly1305_x86_64-apple.S
new file mode 100644
index 0000000..e4a7202
--- /dev/null
+++ b/gen/crypto/chacha20_poly1305_x86_64-apple.S
@@ -0,0 +1,8875 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text	
+
+
+chacha20_poly1305_constants:
+
+.section	__DATA,__const
+.p2align	6
+L$chacha20_consts:
+.byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+.byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+L$rol8:
+.byte	3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+.byte	3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+L$rol16:
+.byte	2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
+.byte	2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
+L$avx2_init:
+.long	0,0,0,0
+L$sse_inc:
+.long	1,0,0,0
+L$avx2_inc:
+.long	2,0,0,0,2,0,0,0
+L$clamp:
+.quad	0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
+.quad	0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
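+# L$clamp holds the standard Poly1305 clamp for r from RFC 8439 in its low 16
+# bytes; the trailing all-ones quads presumably allow the 32-byte (r,s) pair
+# to be masked with one wide AND without touching s.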
+.p2align	4
+L$and_masks:
+.byte	0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
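+# Entry i of L$and_masks (counting from zero) keeps the first i+1 bytes and
+# zeroes the rest; it is evidently used to select the valid bytes of a
+# partial final 16-byte block before it is hashed.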
+.text	
+
+
+.p2align	6
+poly_hash_ad_internal:
+
+
+	xorq	%r10,%r10
+	xorq	%r11,%r11
+	xorq	%r12,%r12
+	cmpq	$13,%r8
+	jne	L$hash_ad_loop
+L$poly_fast_tls_ad:
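+# Fast path for the 13-byte AAD used by TLS: the 13 bytes are hashed as one
+# padded block, with 8 bytes loaded into %r10, the remaining 5 into %r11 (the
+# overlapping load at offset 5 shifted right by 24 bits), the pad bit in
+# %r12, and a single Poly1305 multiply-reduce.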
+
+	movq	(%rcx),%r10
+	movq	5(%rcx),%r11
+	shrq	$24,%r11
+	movq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
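+# As in the AArch64 code, the mul/imul ladder above is one Poly1305 step
+# modulo 2^130 - 5: the accumulator [%r12:%r11:%r10] is multiplied by r (kept
+# at 0(%rbp) and 8(%rbp)), and the bits above 2^130, h, are folded back in as
+# h + 4*h via the and/shrd/shr sequence.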
+
+	ret
+L$hash_ad_loop:
+
+	cmpq	$16,%r8
+	jb	L$hash_ad_tail
+	addq	0+0(%rcx),%r10
+	adcq	8+0(%rcx),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rcx),%rcx
+	subq	$16,%r8
+	jmp	L$hash_ad_loop
+L$hash_ad_tail:
+	cmpq	$0,%r8
+	je	L$hash_ad_done
+
+	xorq	%r13,%r13
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+	addq	%r8,%rcx
+L$hash_ad_tail_loop:
+	shldq	$8,%r13,%r14
+	shlq	$8,%r13
+	movzbq	-1(%rcx),%r15
+	xorq	%r15,%r13
+	decq	%rcx
+	decq	%r8
+	jne	L$hash_ad_tail_loop
+
+	addq	%r13,%r10
+	adcq	%r14,%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+
+L$hash_ad_done:
+	ret
+
+
+
+.globl	_chacha20_poly1305_open
+.private_extern _chacha20_poly1305_open
+
+.p2align	6
+_chacha20_poly1305_open:
+
+_CET_ENDBR
+	pushq	%rbp
+
+	pushq	%rbx
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+
+
+	pushq	%r9
+
+	subq	$288 + 0 + 32,%rsp
+
+
+	leaq	32(%rsp),%rbp
+	andq	$-32,%rbp
+
+	movq	%rdx,%rbx
+	movq	%r8,0+0+32(%rbp)
+	movq	%rbx,8+0+32(%rbp)
+
+	movl	_OPENSSL_ia32cap_P+8(%rip),%eax
+	andl	$288,%eax
+	xorl	$288,%eax
+	jz	chacha20_poly1305_open_avx2
+
+	cmpq	$128,%rbx
+	jbe	L$open_sse_128
+
+	movdqa	L$chacha20_consts(%rip),%xmm0
+	movdqu	0(%r9),%xmm4
+	movdqu	16(%r9),%xmm8
+	movdqu	32(%r9),%xmm12
+
+	movdqa	%xmm12,%xmm7
+
+	movdqa	%xmm4,0+48(%rbp)
+	movdqa	%xmm8,0+64(%rbp)
+	movdqa	%xmm12,0+96(%rbp)
+	movq	$10,%r10
+L$open_sse_init_rounds:
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+
+	decq	%r10
+	jne	L$open_sse_init_rounds
+
+	paddd	L$chacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+
+	pand	L$clamp(%rip),%xmm0
+	movdqa	%xmm0,0+0(%rbp)
+	movdqa	%xmm4,0+16(%rbp)
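+# Per the usual ChaCha20-Poly1305 construction, the first block of keystream
+# supplies the one-time Poly1305 key: %xmm0 now holds the clamped r (stored
+# at 0(%rbp)) and %xmm4 holds s (stored at 16(%rbp)). The no-op movq %r8,%r8
+# below looks like an artifact of the generating macro; the AD length is
+# already in %r8.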
+
+	movq	%r8,%r8
+	call	poly_hash_ad_internal
+L$open_sse_main_loop:
+	cmpq	$256,%rbx
+	jb	L$open_sse_tail
+
+	movdqa	L$chacha20_consts(%rip),%xmm0
+	movdqa	0+48(%rbp),%xmm4
+	movdqa	0+64(%rbp),%xmm8
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm8,%xmm9
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm4,%xmm6
+	movdqa	%xmm8,%xmm10
+	movdqa	%xmm0,%xmm3
+	movdqa	%xmm4,%xmm7
+	movdqa	%xmm8,%xmm11
+	movdqa	0+96(%rbp),%xmm15
+	paddd	L$sse_inc(%rip),%xmm15
+	movdqa	%xmm15,%xmm14
+	paddd	L$sse_inc(%rip),%xmm14
+	movdqa	%xmm14,%xmm13
+	paddd	L$sse_inc(%rip),%xmm13
+	movdqa	%xmm13,%xmm12
+	paddd	L$sse_inc(%rip),%xmm12
+	movdqa	%xmm12,0+96(%rbp)
+	movdqa	%xmm13,0+112(%rbp)
+	movdqa	%xmm14,0+128(%rbp)
+	movdqa	%xmm15,0+144(%rbp)
+
+
+
+	movq	$4,%rcx
+	movq	%rsi,%r8
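+# The round loop below interleaves one 16-byte Poly1305 block (over the
+# ciphertext tracked by %r8) with each ChaCha20 double round across four
+# blocks. %rcx counts down from 4; once negative, the loop is re-entered via
+# the cmpq $-6 check with an extra hash block each time, giving 10 double
+# rounds and 16 hashed blocks (the full 256 bytes) in total.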
+L$open_sse_main_loop_rounds:
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	L$rol16(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	addq	0+0(%r8),%r10
+	adcq	8+0(%r8),%r11
+	adcq	$1,%r12
+
+	leaq	16(%r8),%r8
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm4
+	pxor	%xmm8,%xmm4
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movdqa	L$rol8(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	0+80(%rbp),%xmm8
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+.byte	102,15,58,15,255,4
+.byte	102,69,15,58,15,219,8
+.byte	102,69,15,58,15,255,12
+.byte	102,15,58,15,246,4
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,12
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	L$rol16(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	L$rol8(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	0+80(%rbp),%xmm8
+.byte	102,15,58,15,255,12
+.byte	102,69,15,58,15,219,8
+.byte	102,69,15,58,15,255,4
+.byte	102,15,58,15,246,12
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,4
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+
+	decq	%rcx
+	jge	L$open_sse_main_loop_rounds
+	addq	0+0(%r8),%r10
+	adcq	8+0(%r8),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%r8),%r8
+	cmpq	$-6,%rcx
+	jg	L$open_sse_main_loop_rounds
+	paddd	L$chacha20_consts(%rip),%xmm3
+	paddd	0+48(%rbp),%xmm7
+	paddd	0+64(%rbp),%xmm11
+	paddd	0+144(%rbp),%xmm15
+	paddd	L$chacha20_consts(%rip),%xmm2
+	paddd	0+48(%rbp),%xmm6
+	paddd	0+64(%rbp),%xmm10
+	paddd	0+128(%rbp),%xmm14
+	paddd	L$chacha20_consts(%rip),%xmm1
+	paddd	0+48(%rbp),%xmm5
+	paddd	0+64(%rbp),%xmm9
+	paddd	0+112(%rbp),%xmm13
+	paddd	L$chacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+	movdqa	%xmm12,0+80(%rbp)
+	movdqu	0 + 0(%rsi),%xmm12
+	pxor	%xmm3,%xmm12
+	movdqu	%xmm12,0 + 0(%rdi)
+	movdqu	16 + 0(%rsi),%xmm12
+	pxor	%xmm7,%xmm12
+	movdqu	%xmm12,16 + 0(%rdi)
+	movdqu	32 + 0(%rsi),%xmm12
+	pxor	%xmm11,%xmm12
+	movdqu	%xmm12,32 + 0(%rdi)
+	movdqu	48 + 0(%rsi),%xmm12
+	pxor	%xmm15,%xmm12
+	movdqu	%xmm12,48 + 0(%rdi)
+	movdqu	0 + 64(%rsi),%xmm3
+	movdqu	16 + 64(%rsi),%xmm7
+	movdqu	32 + 64(%rsi),%xmm11
+	movdqu	48 + 64(%rsi),%xmm15
+	pxor	%xmm3,%xmm2
+	pxor	%xmm7,%xmm6
+	pxor	%xmm11,%xmm10
+	pxor	%xmm14,%xmm15
+	movdqu	%xmm2,0 + 64(%rdi)
+	movdqu	%xmm6,16 + 64(%rdi)
+	movdqu	%xmm10,32 + 64(%rdi)
+	movdqu	%xmm15,48 + 64(%rdi)
+	movdqu	0 + 128(%rsi),%xmm3
+	movdqu	16 + 128(%rsi),%xmm7
+	movdqu	32 + 128(%rsi),%xmm11
+	movdqu	48 + 128(%rsi),%xmm15
+	pxor	%xmm3,%xmm1
+	pxor	%xmm7,%xmm5
+	pxor	%xmm11,%xmm9
+	pxor	%xmm13,%xmm15
+	movdqu	%xmm1,0 + 128(%rdi)
+	movdqu	%xmm5,16 + 128(%rdi)
+	movdqu	%xmm9,32 + 128(%rdi)
+	movdqu	%xmm15,48 + 128(%rdi)
+	movdqu	0 + 192(%rsi),%xmm3
+	movdqu	16 + 192(%rsi),%xmm7
+	movdqu	32 + 192(%rsi),%xmm11
+	movdqu	48 + 192(%rsi),%xmm15
+	pxor	%xmm3,%xmm0
+	pxor	%xmm7,%xmm4
+	pxor	%xmm11,%xmm8
+	pxor	0+80(%rbp),%xmm15
+	movdqu	%xmm0,0 + 192(%rdi)
+	movdqu	%xmm4,16 + 192(%rdi)
+	movdqu	%xmm8,32 + 192(%rdi)
+	movdqu	%xmm15,48 + 192(%rdi)
+
+	leaq	256(%rsi),%rsi
+	leaq	256(%rdi),%rdi
+	subq	$256,%rbx
+	jmp	L$open_sse_main_loop
+L$open_sse_tail:
+
+	testq	%rbx,%rbx
+	jz	L$open_sse_finalize
+	cmpq	$192,%rbx
+	ja	L$open_sse_tail_256
+	cmpq	$128,%rbx
+	ja	L$open_sse_tail_192
+	cmpq	$64,%rbx
+	ja	L$open_sse_tail_128
+	movdqa	L$chacha20_consts(%rip),%xmm0
+	movdqa	0+48(%rbp),%xmm4
+	movdqa	0+64(%rbp),%xmm8
+	movdqa	0+96(%rbp),%xmm12
+	paddd	L$sse_inc(%rip),%xmm12
+	movdqa	%xmm12,0+96(%rbp)
+
+	xorq	%r8,%r8
+	movq	%rbx,%rcx
+	cmpq	$16,%rcx
+	jb	L$open_sse_tail_64_rounds
+L$open_sse_tail_64_rounds_and_x1hash:
+	addq	0+0(%rsi,%r8,1),%r10
+	adcq	8+0(%rsi,%r8,1),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	subq	$16,%rcx
+L$open_sse_tail_64_rounds:
+	addq	$16,%r8
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+
+	cmpq	$16,%rcx
+	jae	L$open_sse_tail_64_rounds_and_x1hash
+	cmpq	$160,%r8
+	jne	L$open_sse_tail_64_rounds
+	paddd	L$chacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+
+	jmp	L$open_sse_tail_64_dec_loop
+
+L$open_sse_tail_128:
+	movdqa	L$chacha20_consts(%rip),%xmm0
+	movdqa	0+48(%rbp),%xmm4
+	movdqa	0+64(%rbp),%xmm8
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm8,%xmm9
+	movdqa	0+96(%rbp),%xmm13
+	paddd	L$sse_inc(%rip),%xmm13
+	movdqa	%xmm13,%xmm12
+	paddd	L$sse_inc(%rip),%xmm12
+	movdqa	%xmm12,0+96(%rbp)
+	movdqa	%xmm13,0+112(%rbp)
+
+	movq	%rbx,%rcx
+	andq	$-16,%rcx
+	xorq	%r8,%r8
+L$open_sse_tail_128_rounds_and_x1hash:
+	addq	0+0(%rsi,%r8,1),%r10
+	adcq	8+0(%rsi,%r8,1),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+L$open_sse_tail_128_rounds:
+	addq	$16,%r8
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+
+	cmpq	%rcx,%r8
+	jb	L$open_sse_tail_128_rounds_and_x1hash
+	cmpq	$160,%r8
+	jne	L$open_sse_tail_128_rounds
+	paddd	L$chacha20_consts(%rip),%xmm1
+	paddd	0+48(%rbp),%xmm5
+	paddd	0+64(%rbp),%xmm9
+	paddd	0+112(%rbp),%xmm13
+	paddd	L$chacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+	movdqu	0 + 0(%rsi),%xmm3
+	movdqu	16 + 0(%rsi),%xmm7
+	movdqu	32 + 0(%rsi),%xmm11
+	movdqu	48 + 0(%rsi),%xmm15
+	pxor	%xmm3,%xmm1
+	pxor	%xmm7,%xmm5
+	pxor	%xmm11,%xmm9
+	pxor	%xmm13,%xmm15
+	movdqu	%xmm1,0 + 0(%rdi)
+	movdqu	%xmm5,16 + 0(%rdi)
+	movdqu	%xmm9,32 + 0(%rdi)
+	movdqu	%xmm15,48 + 0(%rdi)
+
+	subq	$64,%rbx
+	leaq	64(%rsi),%rsi
+	leaq	64(%rdi),%rdi
+	jmp	L$open_sse_tail_64_dec_loop
+
+L$open_sse_tail_192:
+	movdqa	L$chacha20_consts(%rip),%xmm0
+	movdqa	0+48(%rbp),%xmm4
+	movdqa	0+64(%rbp),%xmm8
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm8,%xmm9
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm4,%xmm6
+	movdqa	%xmm8,%xmm10
+	movdqa	0+96(%rbp),%xmm14
+	paddd	L$sse_inc(%rip),%xmm14
+	movdqa	%xmm14,%xmm13
+	paddd	L$sse_inc(%rip),%xmm13
+	movdqa	%xmm13,%xmm12
+	paddd	L$sse_inc(%rip),%xmm12
+	movdqa	%xmm12,0+96(%rbp)
+	movdqa	%xmm13,0+112(%rbp)
+	movdqa	%xmm14,0+128(%rbp)
+
+	movq	%rbx,%rcx
+	movq	$160,%r8
+	cmpq	$160,%rcx
+	cmovgq	%r8,%rcx
+	andq	$-16,%rcx
+	xorq	%r8,%r8
+L$open_sse_tail_192_rounds_and_x1hash:
+	addq	0+0(%rsi,%r8,1),%r10
+	adcq	8+0(%rsi,%r8,1),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+L$open_sse_tail_192_rounds:
+	addq	$16,%r8
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	L$rol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm6
+	pxor	%xmm3,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	L$rol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm6
+	pxor	%xmm3,%xmm6
+.byte	102,15,58,15,246,4
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,12
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	L$rol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm6
+	pxor	%xmm3,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	L$rol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm6
+	pxor	%xmm3,%xmm6
+.byte	102,15,58,15,246,12
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,4
+
+	cmpq	%rcx,%r8
+	jb	L$open_sse_tail_192_rounds_and_x1hash
+	cmpq	$160,%r8
+	jne	L$open_sse_tail_192_rounds
+	cmpq	$176,%rbx
+	jb	L$open_sse_tail_192_finish
+	addq	0+160(%rsi),%r10
+	adcq	8+160(%rsi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	cmpq	$192,%rbx
+	jb	L$open_sse_tail_192_finish
+	addq	0+176(%rsi),%r10
+	adcq	8+176(%rsi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+L$open_sse_tail_192_finish:
+	paddd	L$chacha20_consts(%rip),%xmm2
+	paddd	0+48(%rbp),%xmm6
+	paddd	0+64(%rbp),%xmm10
+	paddd	0+128(%rbp),%xmm14
+	paddd	L$chacha20_consts(%rip),%xmm1
+	paddd	0+48(%rbp),%xmm5
+	paddd	0+64(%rbp),%xmm9
+	paddd	0+112(%rbp),%xmm13
+	paddd	L$chacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+	movdqu	0 + 0(%rsi),%xmm3
+	movdqu	16 + 0(%rsi),%xmm7
+	movdqu	32 + 0(%rsi),%xmm11
+	movdqu	48 + 0(%rsi),%xmm15
+	pxor	%xmm3,%xmm2
+	pxor	%xmm7,%xmm6
+	pxor	%xmm11,%xmm10
+	pxor	%xmm14,%xmm15
+	movdqu	%xmm2,0 + 0(%rdi)
+	movdqu	%xmm6,16 + 0(%rdi)
+	movdqu	%xmm10,32 + 0(%rdi)
+	movdqu	%xmm15,48 + 0(%rdi)
+	movdqu	0 + 64(%rsi),%xmm3
+	movdqu	16 + 64(%rsi),%xmm7
+	movdqu	32 + 64(%rsi),%xmm11
+	movdqu	48 + 64(%rsi),%xmm15
+	pxor	%xmm3,%xmm1
+	pxor	%xmm7,%xmm5
+	pxor	%xmm11,%xmm9
+	pxor	%xmm13,%xmm15
+	movdqu	%xmm1,0 + 64(%rdi)
+	movdqu	%xmm5,16 + 64(%rdi)
+	movdqu	%xmm9,32 + 64(%rdi)
+	movdqu	%xmm15,48 + 64(%rdi)
+
+	subq	$128,%rbx
+	leaq	128(%rsi),%rsi
+	leaq	128(%rdi),%rdi
+	jmp	L$open_sse_tail_64_dec_loop
+
+L$open_sse_tail_256:
+	movdqa	L$chacha20_consts(%rip),%xmm0
+	movdqa	0+48(%rbp),%xmm4
+	movdqa	0+64(%rbp),%xmm8
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm8,%xmm9
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm4,%xmm6
+	movdqa	%xmm8,%xmm10
+	movdqa	%xmm0,%xmm3
+	movdqa	%xmm4,%xmm7
+	movdqa	%xmm8,%xmm11
+	movdqa	0+96(%rbp),%xmm15
+	paddd	L$sse_inc(%rip),%xmm15
+	movdqa	%xmm15,%xmm14
+	paddd	L$sse_inc(%rip),%xmm14
+	movdqa	%xmm14,%xmm13
+	paddd	L$sse_inc(%rip),%xmm13
+	movdqa	%xmm13,%xmm12
+	paddd	L$sse_inc(%rip),%xmm12
+	movdqa	%xmm12,0+96(%rbp)
+	movdqa	%xmm13,0+112(%rbp)
+	movdqa	%xmm14,0+128(%rbp)
+	movdqa	%xmm15,0+144(%rbp)
+
+	xorq	%r8,%r8
+L$open_sse_tail_256_rounds_and_x1hash:
+	addq	0+0(%rsi,%r8,1),%r10
+	adcq	8+0(%rsi,%r8,1),%r11
+	adcq	$1,%r12
+	movdqa	%xmm11,0+80(%rbp)
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm11
+	pslld	$12,%xmm11
+	psrld	$20,%xmm4
+	pxor	%xmm11,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm11
+	pslld	$7,%xmm11
+	psrld	$25,%xmm4
+	pxor	%xmm11,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm11
+	pslld	$12,%xmm11
+	psrld	$20,%xmm5
+	pxor	%xmm11,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm11
+	pslld	$7,%xmm11
+	psrld	$25,%xmm5
+	pxor	%xmm11,%xmm5
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	L$rol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm11
+	pslld	$12,%xmm11
+	psrld	$20,%xmm6
+	pxor	%xmm11,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	L$rol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm11
+	pslld	$7,%xmm11
+	psrld	$25,%xmm6
+	pxor	%xmm11,%xmm6
+.byte	102,15,58,15,246,4
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,12
+	movdqa	0+80(%rbp),%xmm11
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movdqa	%xmm9,0+80(%rbp)
+	paddd	%xmm7,%xmm3
+	pxor	%xmm3,%xmm15
+	pshufb	L$rol16(%rip),%xmm15
+	paddd	%xmm15,%xmm11
+	pxor	%xmm11,%xmm7
+	movdqa	%xmm7,%xmm9
+	pslld	$12,%xmm9
+	psrld	$20,%xmm7
+	pxor	%xmm9,%xmm7
+	paddd	%xmm7,%xmm3
+	pxor	%xmm3,%xmm15
+	pshufb	L$rol8(%rip),%xmm15
+	paddd	%xmm15,%xmm11
+	pxor	%xmm11,%xmm7
+	movdqa	%xmm7,%xmm9
+	pslld	$7,%xmm9
+	psrld	$25,%xmm7
+	pxor	%xmm9,%xmm7
+.byte	102,15,58,15,255,4
+.byte	102,69,15,58,15,219,8
+.byte	102,69,15,58,15,255,12
+	movdqa	0+80(%rbp),%xmm9
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	movdqa	%xmm11,0+80(%rbp)
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm11
+	pslld	$12,%xmm11
+	psrld	$20,%xmm4
+	pxor	%xmm11,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm11
+	pslld	$7,%xmm11
+	psrld	$25,%xmm4
+	pxor	%xmm11,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm11
+	pslld	$12,%xmm11
+	psrld	$20,%xmm5
+	pxor	%xmm11,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm11
+	pslld	$7,%xmm11
+	psrld	$25,%xmm5
+	pxor	%xmm11,%xmm5
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	L$rol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm11
+	pslld	$12,%xmm11
+	psrld	$20,%xmm6
+	pxor	%xmm11,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	L$rol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm11
+	pslld	$7,%xmm11
+	psrld	$25,%xmm6
+	pxor	%xmm11,%xmm6
+.byte	102,15,58,15,246,12
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,4
+	movdqa	0+80(%rbp),%xmm11
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	movdqa	%xmm9,0+80(%rbp)
+	paddd	%xmm7,%xmm3
+	pxor	%xmm3,%xmm15
+	pshufb	L$rol16(%rip),%xmm15
+	paddd	%xmm15,%xmm11
+	pxor	%xmm11,%xmm7
+	movdqa	%xmm7,%xmm9
+	pslld	$12,%xmm9
+	psrld	$20,%xmm7
+	pxor	%xmm9,%xmm7
+	paddd	%xmm7,%xmm3
+	pxor	%xmm3,%xmm15
+	pshufb	L$rol8(%rip),%xmm15
+	paddd	%xmm15,%xmm11
+	pxor	%xmm11,%xmm7
+	movdqa	%xmm7,%xmm9
+	pslld	$7,%xmm9
+	psrld	$25,%xmm7
+	pxor	%xmm9,%xmm7
+.byte	102,15,58,15,255,12
+.byte	102,69,15,58,15,219,8
+.byte	102,69,15,58,15,255,4
+	movdqa	0+80(%rbp),%xmm9
+
+	addq	$16,%r8
+	cmpq	$160,%r8
+	jb	L$open_sse_tail_256_rounds_and_x1hash
+
+	movq	%rbx,%rcx
+	andq	$-16,%rcx
+L$open_sse_tail_256_hash:
+	addq	0+0(%rsi,%r8,1),%r10
+	adcq	8+0(%rsi,%r8,1),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	addq	$16,%r8
+	cmpq	%rcx,%r8
+	jb	L$open_sse_tail_256_hash
+	paddd	L$chacha20_consts(%rip),%xmm3
+	paddd	0+48(%rbp),%xmm7
+	paddd	0+64(%rbp),%xmm11
+	paddd	0+144(%rbp),%xmm15
+	paddd	L$chacha20_consts(%rip),%xmm2
+	paddd	0+48(%rbp),%xmm6
+	paddd	0+64(%rbp),%xmm10
+	paddd	0+128(%rbp),%xmm14
+	paddd	L$chacha20_consts(%rip),%xmm1
+	paddd	0+48(%rbp),%xmm5
+	paddd	0+64(%rbp),%xmm9
+	paddd	0+112(%rbp),%xmm13
+	paddd	L$chacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+	movdqa	%xmm12,0+80(%rbp)
+	movdqu	0 + 0(%rsi),%xmm12
+	pxor	%xmm3,%xmm12
+	movdqu	%xmm12,0 + 0(%rdi)
+	movdqu	16 + 0(%rsi),%xmm12
+	pxor	%xmm7,%xmm12
+	movdqu	%xmm12,16 + 0(%rdi)
+	movdqu	32 + 0(%rsi),%xmm12
+	pxor	%xmm11,%xmm12
+	movdqu	%xmm12,32 + 0(%rdi)
+	movdqu	48 + 0(%rsi),%xmm12
+	pxor	%xmm15,%xmm12
+	movdqu	%xmm12,48 + 0(%rdi)
+	movdqu	0 + 64(%rsi),%xmm3
+	movdqu	16 + 64(%rsi),%xmm7
+	movdqu	32 + 64(%rsi),%xmm11
+	movdqu	48 + 64(%rsi),%xmm15
+	pxor	%xmm3,%xmm2
+	pxor	%xmm7,%xmm6
+	pxor	%xmm11,%xmm10
+	pxor	%xmm14,%xmm15
+	movdqu	%xmm2,0 + 64(%rdi)
+	movdqu	%xmm6,16 + 64(%rdi)
+	movdqu	%xmm10,32 + 64(%rdi)
+	movdqu	%xmm15,48 + 64(%rdi)
+	movdqu	0 + 128(%rsi),%xmm3
+	movdqu	16 + 128(%rsi),%xmm7
+	movdqu	32 + 128(%rsi),%xmm11
+	movdqu	48 + 128(%rsi),%xmm15
+	pxor	%xmm3,%xmm1
+	pxor	%xmm7,%xmm5
+	pxor	%xmm11,%xmm9
+	pxor	%xmm13,%xmm15
+	movdqu	%xmm1,0 + 128(%rdi)
+	movdqu	%xmm5,16 + 128(%rdi)
+	movdqu	%xmm9,32 + 128(%rdi)
+	movdqu	%xmm15,48 + 128(%rdi)
+
+	movdqa	0+80(%rbp),%xmm12
+	subq	$192,%rbx
+	leaq	192(%rsi),%rsi
+	leaq	192(%rdi),%rdi
+
+
+L$open_sse_tail_64_dec_loop:
+	cmpq	$16,%rbx
+	jb	L$open_sse_tail_16_init
+	subq	$16,%rbx
+	movdqu	(%rsi),%xmm3
+	pxor	%xmm3,%xmm0
+	movdqu	%xmm0,(%rdi)
+	leaq	16(%rsi),%rsi
+	leaq	16(%rdi),%rdi
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm8,%xmm4
+	movdqa	%xmm12,%xmm8
+	jmp	L$open_sse_tail_64_dec_loop
+L$open_sse_tail_16_init:
+	movdqa	%xmm0,%xmm1
+
+
+L$open_sse_tail_16:
+	testq	%rbx,%rbx
+	jz	L$open_sse_finalize
+
+
+
+	pxor	%xmm3,%xmm3
+	leaq	-1(%rsi,%rbx,1),%rsi
+	movq	%rbx,%r8
+L$open_sse_tail_16_compose:
+	pslldq	$1,%xmm3
+	pinsrb	$0,(%rsi),%xmm3
+	subq	$1,%rsi
+	subq	$1,%r8
+	jnz	L$open_sse_tail_16_compose
+
+.byte	102,73,15,126,221
+	pextrq	$1,%xmm3,%r14
+
+	pxor	%xmm1,%xmm3
+
+
+L$open_sse_tail_16_extract:
+	pextrb	$0,%xmm3,(%rdi)
+	psrldq	$1,%xmm3
+	addq	$1,%rdi
+	subq	$1,%rbx
+	jne	L$open_sse_tail_16_extract
+
+	addq	%r13,%r10
+	adcq	%r14,%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+
+L$open_sse_finalize:
+	addq	0+0+32(%rbp),%r10
+	adcq	8+0+32(%rbp),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+
+	movq	%r10,%r13
+	movq	%r11,%r14
+	movq	%r12,%r15
+	subq	$-5,%r10
+	sbbq	$-1,%r11
+	sbbq	$3,%r12
+	cmovcq	%r13,%r10
+	cmovcq	%r14,%r11
+	cmovcq	%r15,%r12
+
+	addq	0+0+16(%rbp),%r10
+	adcq	8+0+16(%rbp),%r11
+
+
+	addq	$288 + 0 + 32,%rsp
+
+
+	popq	%r9
+
+	movq	%r10,(%r9)
+	movq	%r11,8(%r9)
+	popq	%r15
+
+	popq	%r14
+
+	popq	%r13
+
+	popq	%r12
+
+	popq	%rbx
+
+	popq	%rbp
+
+	ret
+
+L$open_sse_128:
+
+	movdqu	L$chacha20_consts(%rip),%xmm0
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm0,%xmm2
+	movdqu	0(%r9),%xmm4
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm4,%xmm6
+	movdqu	16(%r9),%xmm8
+	movdqa	%xmm8,%xmm9
+	movdqa	%xmm8,%xmm10
+	movdqu	32(%r9),%xmm12
+	movdqa	%xmm12,%xmm13
+	paddd	L$sse_inc(%rip),%xmm13
+	movdqa	%xmm13,%xmm14
+	paddd	L$sse_inc(%rip),%xmm14
+	movdqa	%xmm4,%xmm7
+	movdqa	%xmm8,%xmm11
+	movdqa	%xmm13,%xmm15
+	movq	$10,%r10
+
+L$open_sse_128_rounds:
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	L$rol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm6
+	pxor	%xmm3,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	L$rol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm6
+	pxor	%xmm3,%xmm6
+.byte	102,15,58,15,246,4
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,12
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	L$rol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm6
+	pxor	%xmm3,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	L$rol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm6
+	pxor	%xmm3,%xmm6
+.byte	102,15,58,15,246,12
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,4
+
+	decq	%r10
+	jnz	L$open_sse_128_rounds
+	paddd	L$chacha20_consts(%rip),%xmm0
+	paddd	L$chacha20_consts(%rip),%xmm1
+	paddd	L$chacha20_consts(%rip),%xmm2
+	paddd	%xmm7,%xmm4
+	paddd	%xmm7,%xmm5
+	paddd	%xmm7,%xmm6
+	paddd	%xmm11,%xmm9
+	paddd	%xmm11,%xmm10
+	paddd	%xmm15,%xmm13
+	paddd	L$sse_inc(%rip),%xmm15
+	paddd	%xmm15,%xmm14
+
+	pand	L$clamp(%rip),%xmm0
+	movdqa	%xmm0,0+0(%rbp)
+	movdqa	%xmm4,0+16(%rbp)
+
+	movq	%r8,%r8
+	call	poly_hash_ad_internal
+L$open_sse_128_xor_hash:
+	cmpq	$16,%rbx
+	jb	L$open_sse_tail_16
+	subq	$16,%rbx
+	addq	0+0(%rsi),%r10
+	adcq	8+0(%rsi),%r11
+	adcq	$1,%r12
+
+
+	movdqu	0(%rsi),%xmm3
+	pxor	%xmm3,%xmm1
+	movdqu	%xmm1,0(%rdi)
+	leaq	16(%rsi),%rsi
+	leaq	16(%rdi),%rdi
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+
+	movdqa	%xmm5,%xmm1
+	movdqa	%xmm9,%xmm5
+	movdqa	%xmm13,%xmm9
+	movdqa	%xmm2,%xmm13
+	movdqa	%xmm6,%xmm2
+	movdqa	%xmm10,%xmm6
+	movdqa	%xmm14,%xmm10
+	jmp	L$open_sse_128_xor_hash
+
+
+
+
+
+
+
+
+
+.globl	_chacha20_poly1305_seal
+.private_extern _chacha20_poly1305_seal
+
+.p2align	6
+_chacha20_poly1305_seal:
+
+_CET_ENDBR
+	pushq	%rbp
+
+	pushq	%rbx
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+
+
+	pushq	%r9
+
+	subq	$288 + 0 + 32,%rsp
+
+	leaq	32(%rsp),%rbp
+	andq	$-32,%rbp
+
+	movq	56(%r9),%rbx
+	addq	%rdx,%rbx
+	movq	%r8,0+0+32(%rbp)
+	movq	%rbx,8+0+32(%rbp)
+	movq	%rdx,%rbx
+
+	movl	_OPENSSL_ia32cap_P+8(%rip),%eax
+	andl	$288,%eax
+	xorl	$288,%eax
+	jz	chacha20_poly1305_seal_avx2
+
+	cmpq	$128,%rbx
+	jbe	L$seal_sse_128
+
+	movdqa	L$chacha20_consts(%rip),%xmm0
+	movdqu	0(%r9),%xmm4
+	movdqu	16(%r9),%xmm8
+	movdqu	32(%r9),%xmm12
+
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm0,%xmm3
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm4,%xmm6
+	movdqa	%xmm4,%xmm7
+	movdqa	%xmm8,%xmm9
+	movdqa	%xmm8,%xmm10
+	movdqa	%xmm8,%xmm11
+	movdqa	%xmm12,%xmm15
+	paddd	L$sse_inc(%rip),%xmm12
+	movdqa	%xmm12,%xmm14
+	paddd	L$sse_inc(%rip),%xmm12
+	movdqa	%xmm12,%xmm13
+	paddd	L$sse_inc(%rip),%xmm12
+
+	movdqa	%xmm4,0+48(%rbp)
+	movdqa	%xmm8,0+64(%rbp)
+	movdqa	%xmm12,0+96(%rbp)
+	movdqa	%xmm13,0+112(%rbp)
+	movdqa	%xmm14,0+128(%rbp)
+	movdqa	%xmm15,0+144(%rbp)
+	movq	$10,%r10
+L$seal_sse_init_rounds:
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	L$rol16(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	L$rol8(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	0+80(%rbp),%xmm8
+.byte	102,15,58,15,255,4
+.byte	102,69,15,58,15,219,8
+.byte	102,69,15,58,15,255,12
+.byte	102,15,58,15,246,4
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,12
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	L$rol16(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	L$rol8(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	0+80(%rbp),%xmm8
+.byte	102,15,58,15,255,12
+.byte	102,69,15,58,15,219,8
+.byte	102,69,15,58,15,255,4
+.byte	102,15,58,15,246,12
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,4
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+
+	decq	%r10
+	jnz	L$seal_sse_init_rounds
+	paddd	L$chacha20_consts(%rip),%xmm3
+	paddd	0+48(%rbp),%xmm7
+	paddd	0+64(%rbp),%xmm11
+	paddd	0+144(%rbp),%xmm15
+	paddd	L$chacha20_consts(%rip),%xmm2
+	paddd	0+48(%rbp),%xmm6
+	paddd	0+64(%rbp),%xmm10
+	paddd	0+128(%rbp),%xmm14
+	paddd	L$chacha20_consts(%rip),%xmm1
+	paddd	0+48(%rbp),%xmm5
+	paddd	0+64(%rbp),%xmm9
+	paddd	0+112(%rbp),%xmm13
+	paddd	L$chacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+
+
+	pand	L$clamp(%rip),%xmm3
+	movdqa	%xmm3,0+0(%rbp)
+	movdqa	%xmm7,0+16(%rbp)
+
+	movq	%r8,%r8
+	call	poly_hash_ad_internal
+	movdqu	0 + 0(%rsi),%xmm3
+	movdqu	16 + 0(%rsi),%xmm7
+	movdqu	32 + 0(%rsi),%xmm11
+	movdqu	48 + 0(%rsi),%xmm15
+	pxor	%xmm3,%xmm2
+	pxor	%xmm7,%xmm6
+	pxor	%xmm11,%xmm10
+	pxor	%xmm14,%xmm15
+	movdqu	%xmm2,0 + 0(%rdi)
+	movdqu	%xmm6,16 + 0(%rdi)
+	movdqu	%xmm10,32 + 0(%rdi)
+	movdqu	%xmm15,48 + 0(%rdi)
+	movdqu	0 + 64(%rsi),%xmm3
+	movdqu	16 + 64(%rsi),%xmm7
+	movdqu	32 + 64(%rsi),%xmm11
+	movdqu	48 + 64(%rsi),%xmm15
+	pxor	%xmm3,%xmm1
+	pxor	%xmm7,%xmm5
+	pxor	%xmm11,%xmm9
+	pxor	%xmm13,%xmm15
+	movdqu	%xmm1,0 + 64(%rdi)
+	movdqu	%xmm5,16 + 64(%rdi)
+	movdqu	%xmm9,32 + 64(%rdi)
+	movdqu	%xmm15,48 + 64(%rdi)
+
+	cmpq	$192,%rbx
+	ja	L$seal_sse_main_init
+	movq	$128,%rcx
+	subq	$128,%rbx
+	leaq	128(%rsi),%rsi
+	jmp	L$seal_sse_128_tail_hash
+L$seal_sse_main_init:
+	movdqu	0 + 128(%rsi),%xmm3
+	movdqu	16 + 128(%rsi),%xmm7
+	movdqu	32 + 128(%rsi),%xmm11
+	movdqu	48 + 128(%rsi),%xmm15
+	pxor	%xmm3,%xmm0
+	pxor	%xmm7,%xmm4
+	pxor	%xmm11,%xmm8
+	pxor	%xmm12,%xmm15
+	movdqu	%xmm0,0 + 128(%rdi)
+	movdqu	%xmm4,16 + 128(%rdi)
+	movdqu	%xmm8,32 + 128(%rdi)
+	movdqu	%xmm15,48 + 128(%rdi)
+
+	movq	$192,%rcx
+	subq	$192,%rbx
+	leaq	192(%rsi),%rsi
+	movq	$2,%rcx
+	movq	$8,%r8
+	cmpq	$64,%rbx
+	jbe	L$seal_sse_tail_64
+	cmpq	$128,%rbx
+	jbe	L$seal_sse_tail_128
+	cmpq	$192,%rbx
+	jbe	L$seal_sse_tail_192
+
+L$seal_sse_main_loop:
+	movdqa	L$chacha20_consts(%rip),%xmm0
+	movdqa	0+48(%rbp),%xmm4
+	movdqa	0+64(%rbp),%xmm8
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm8,%xmm9
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm4,%xmm6
+	movdqa	%xmm8,%xmm10
+	movdqa	%xmm0,%xmm3
+	movdqa	%xmm4,%xmm7
+	movdqa	%xmm8,%xmm11
+	movdqa	0+96(%rbp),%xmm15
+	paddd	L$sse_inc(%rip),%xmm15
+	movdqa	%xmm15,%xmm14
+	paddd	L$sse_inc(%rip),%xmm14
+	movdqa	%xmm14,%xmm13
+	paddd	L$sse_inc(%rip),%xmm13
+	movdqa	%xmm13,%xmm12
+	paddd	L$sse_inc(%rip),%xmm12
+	movdqa	%xmm12,0+96(%rbp)
+	movdqa	%xmm13,0+112(%rbp)
+	movdqa	%xmm14,0+128(%rbp)
+	movdqa	%xmm15,0+144(%rbp)
+
+.p2align	5
+L$seal_sse_main_rounds:
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	L$rol16(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm4
+	pxor	%xmm8,%xmm4
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movdqa	L$rol8(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	0+80(%rbp),%xmm8
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+.byte	102,15,58,15,255,4
+.byte	102,69,15,58,15,219,8
+.byte	102,69,15,58,15,255,12
+.byte	102,15,58,15,246,4
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,12
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	L$rol16(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	L$rol8(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	0+80(%rbp),%xmm8
+.byte	102,15,58,15,255,12
+.byte	102,69,15,58,15,219,8
+.byte	102,69,15,58,15,255,4
+.byte	102,15,58,15,246,12
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,4
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+
+	leaq	16(%rdi),%rdi
+	decq	%r8
+	jge	L$seal_sse_main_rounds
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+	decq	%rcx
+	jg	L$seal_sse_main_rounds
+	paddd	L$chacha20_consts(%rip),%xmm3
+	paddd	0+48(%rbp),%xmm7
+	paddd	0+64(%rbp),%xmm11
+	paddd	0+144(%rbp),%xmm15
+	paddd	L$chacha20_consts(%rip),%xmm2
+	paddd	0+48(%rbp),%xmm6
+	paddd	0+64(%rbp),%xmm10
+	paddd	0+128(%rbp),%xmm14
+	paddd	L$chacha20_consts(%rip),%xmm1
+	paddd	0+48(%rbp),%xmm5
+	paddd	0+64(%rbp),%xmm9
+	paddd	0+112(%rbp),%xmm13
+	paddd	L$chacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+
+	movdqa	%xmm14,0+80(%rbp)
+	movdqa	%xmm14,0+80(%rbp)
+	movdqu	0 + 0(%rsi),%xmm14
+	pxor	%xmm3,%xmm14
+	movdqu	%xmm14,0 + 0(%rdi)
+	movdqu	16 + 0(%rsi),%xmm14
+	pxor	%xmm7,%xmm14
+	movdqu	%xmm14,16 + 0(%rdi)
+	movdqu	32 + 0(%rsi),%xmm14
+	pxor	%xmm11,%xmm14
+	movdqu	%xmm14,32 + 0(%rdi)
+	movdqu	48 + 0(%rsi),%xmm14
+	pxor	%xmm15,%xmm14
+	movdqu	%xmm14,48 + 0(%rdi)
+
+	movdqa	0+80(%rbp),%xmm14
+	movdqu	0 + 64(%rsi),%xmm3
+	movdqu	16 + 64(%rsi),%xmm7
+	movdqu	32 + 64(%rsi),%xmm11
+	movdqu	48 + 64(%rsi),%xmm15
+	pxor	%xmm3,%xmm2
+	pxor	%xmm7,%xmm6
+	pxor	%xmm11,%xmm10
+	pxor	%xmm14,%xmm15
+	movdqu	%xmm2,0 + 64(%rdi)
+	movdqu	%xmm6,16 + 64(%rdi)
+	movdqu	%xmm10,32 + 64(%rdi)
+	movdqu	%xmm15,48 + 64(%rdi)
+	movdqu	0 + 128(%rsi),%xmm3
+	movdqu	16 + 128(%rsi),%xmm7
+	movdqu	32 + 128(%rsi),%xmm11
+	movdqu	48 + 128(%rsi),%xmm15
+	pxor	%xmm3,%xmm1
+	pxor	%xmm7,%xmm5
+	pxor	%xmm11,%xmm9
+	pxor	%xmm13,%xmm15
+	movdqu	%xmm1,0 + 128(%rdi)
+	movdqu	%xmm5,16 + 128(%rdi)
+	movdqu	%xmm9,32 + 128(%rdi)
+	movdqu	%xmm15,48 + 128(%rdi)
+
+	cmpq	$256,%rbx
+	ja	L$seal_sse_main_loop_xor
+
+	movq	$192,%rcx
+	subq	$192,%rbx
+	leaq	192(%rsi),%rsi
+	jmp	L$seal_sse_128_tail_hash
+L$seal_sse_main_loop_xor:
+	movdqu	0 + 192(%rsi),%xmm3
+	movdqu	16 + 192(%rsi),%xmm7
+	movdqu	32 + 192(%rsi),%xmm11
+	movdqu	48 + 192(%rsi),%xmm15
+	pxor	%xmm3,%xmm0
+	pxor	%xmm7,%xmm4
+	pxor	%xmm11,%xmm8
+	pxor	%xmm12,%xmm15
+	movdqu	%xmm0,0 + 192(%rdi)
+	movdqu	%xmm4,16 + 192(%rdi)
+	movdqu	%xmm8,32 + 192(%rdi)
+	movdqu	%xmm15,48 + 192(%rdi)
+
+	leaq	256(%rsi),%rsi
+	subq	$256,%rbx
+	movq	$6,%rcx
+	movq	$4,%r8
+	cmpq	$192,%rbx
+	jg	L$seal_sse_main_loop
+	movq	%rbx,%rcx
+	testq	%rbx,%rbx
+	je	L$seal_sse_128_tail_hash
+	movq	$6,%rcx
+	cmpq	$128,%rbx
+	ja	L$seal_sse_tail_192
+	cmpq	$64,%rbx
+	ja	L$seal_sse_tail_128
+
+L$seal_sse_tail_64:
+	movdqa	L$chacha20_consts(%rip),%xmm0
+	movdqa	0+48(%rbp),%xmm4
+	movdqa	0+64(%rbp),%xmm8
+	movdqa	0+96(%rbp),%xmm12
+	paddd	L$sse_inc(%rip),%xmm12
+	movdqa	%xmm12,0+96(%rbp)
+
+L$seal_sse_tail_64_rounds_and_x2hash:
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+L$seal_sse_tail_64_rounds_and_x1hash:
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+	decq	%rcx
+	jg	L$seal_sse_tail_64_rounds_and_x2hash
+	decq	%r8
+	jge	L$seal_sse_tail_64_rounds_and_x1hash
+	paddd	L$chacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+
+	jmp	L$seal_sse_128_tail_xor
+
+L$seal_sse_tail_128:
+	movdqa	L$chacha20_consts(%rip),%xmm0
+	movdqa	0+48(%rbp),%xmm4
+	movdqa	0+64(%rbp),%xmm8
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm8,%xmm9
+	movdqa	0+96(%rbp),%xmm13
+	paddd	L$sse_inc(%rip),%xmm13
+	movdqa	%xmm13,%xmm12
+	paddd	L$sse_inc(%rip),%xmm12
+	movdqa	%xmm12,0+96(%rbp)
+	movdqa	%xmm13,0+112(%rbp)
+
+L$seal_sse_tail_128_rounds_and_x2hash:
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+L$seal_sse_tail_128_rounds_and_x1hash:
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+
+	leaq	16(%rdi),%rdi
+	decq	%rcx
+	jg	L$seal_sse_tail_128_rounds_and_x2hash
+	decq	%r8
+	jge	L$seal_sse_tail_128_rounds_and_x1hash
+	paddd	L$chacha20_consts(%rip),%xmm1
+	paddd	0+48(%rbp),%xmm5
+	paddd	0+64(%rbp),%xmm9
+	paddd	0+112(%rbp),%xmm13
+	paddd	L$chacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+	movdqu	0 + 0(%rsi),%xmm3
+	movdqu	16 + 0(%rsi),%xmm7
+	movdqu	32 + 0(%rsi),%xmm11
+	movdqu	48 + 0(%rsi),%xmm15
+	pxor	%xmm3,%xmm1
+	pxor	%xmm7,%xmm5
+	pxor	%xmm11,%xmm9
+	pxor	%xmm13,%xmm15
+	movdqu	%xmm1,0 + 0(%rdi)
+	movdqu	%xmm5,16 + 0(%rdi)
+	movdqu	%xmm9,32 + 0(%rdi)
+	movdqu	%xmm15,48 + 0(%rdi)
+
+	movq	$64,%rcx
+	subq	$64,%rbx
+	leaq	64(%rsi),%rsi
+	jmp	L$seal_sse_128_tail_hash
+
+L$seal_sse_tail_192:
+	movdqa	L$chacha20_consts(%rip),%xmm0
+	movdqa	0+48(%rbp),%xmm4
+	movdqa	0+64(%rbp),%xmm8
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm8,%xmm9
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm4,%xmm6
+	movdqa	%xmm8,%xmm10
+	movdqa	0+96(%rbp),%xmm14
+	paddd	L$sse_inc(%rip),%xmm14
+	movdqa	%xmm14,%xmm13
+	paddd	L$sse_inc(%rip),%xmm13
+	movdqa	%xmm13,%xmm12
+	paddd	L$sse_inc(%rip),%xmm12
+	movdqa	%xmm12,0+96(%rbp)
+	movdqa	%xmm13,0+112(%rbp)
+	movdqa	%xmm14,0+128(%rbp)
+
+L$seal_sse_tail_192_rounds_and_x2hash:
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+L$seal_sse_tail_192_rounds_and_x1hash:
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	L$rol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm6
+	pxor	%xmm3,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	L$rol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm6
+	pxor	%xmm3,%xmm6
+.byte	102,15,58,15,246,4
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,12
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	L$rol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm6
+	pxor	%xmm3,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	L$rol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm6
+	pxor	%xmm3,%xmm6
+.byte	102,15,58,15,246,12
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,4
+
+	leaq	16(%rdi),%rdi
+	decq	%rcx
+	jg	L$seal_sse_tail_192_rounds_and_x2hash
+	decq	%r8
+	jge	L$seal_sse_tail_192_rounds_and_x1hash
+	paddd	L$chacha20_consts(%rip),%xmm2
+	paddd	0+48(%rbp),%xmm6
+	paddd	0+64(%rbp),%xmm10
+	paddd	0+128(%rbp),%xmm14
+	paddd	L$chacha20_consts(%rip),%xmm1
+	paddd	0+48(%rbp),%xmm5
+	paddd	0+64(%rbp),%xmm9
+	paddd	0+112(%rbp),%xmm13
+	paddd	L$chacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+	movdqu	0 + 0(%rsi),%xmm3
+	movdqu	16 + 0(%rsi),%xmm7
+	movdqu	32 + 0(%rsi),%xmm11
+	movdqu	48 + 0(%rsi),%xmm15
+	pxor	%xmm3,%xmm2
+	pxor	%xmm7,%xmm6
+	pxor	%xmm11,%xmm10
+	pxor	%xmm14,%xmm15
+	movdqu	%xmm2,0 + 0(%rdi)
+	movdqu	%xmm6,16 + 0(%rdi)
+	movdqu	%xmm10,32 + 0(%rdi)
+	movdqu	%xmm15,48 + 0(%rdi)
+	movdqu	0 + 64(%rsi),%xmm3
+	movdqu	16 + 64(%rsi),%xmm7
+	movdqu	32 + 64(%rsi),%xmm11
+	movdqu	48 + 64(%rsi),%xmm15
+	pxor	%xmm3,%xmm1
+	pxor	%xmm7,%xmm5
+	pxor	%xmm11,%xmm9
+	pxor	%xmm13,%xmm15
+	movdqu	%xmm1,0 + 64(%rdi)
+	movdqu	%xmm5,16 + 64(%rdi)
+	movdqu	%xmm9,32 + 64(%rdi)
+	movdqu	%xmm15,48 + 64(%rdi)
+
+	movq	$128,%rcx
+	subq	$128,%rbx
+	leaq	128(%rsi),%rsi
+
+L$seal_sse_128_tail_hash:
+	cmpq	$16,%rcx
+	jb	L$seal_sse_128_tail_xor
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	subq	$16,%rcx
+	leaq	16(%rdi),%rdi
+	jmp	L$seal_sse_128_tail_hash
+
+L$seal_sse_128_tail_xor:
+	cmpq	$16,%rbx
+	jb	L$seal_sse_tail_16
+	subq	$16,%rbx
+
+	movdqu	0(%rsi),%xmm3
+	pxor	%xmm3,%xmm0
+	movdqu	%xmm0,0(%rdi)
+
+	addq	0(%rdi),%r10
+	adcq	8(%rdi),%r11
+	adcq	$1,%r12
+	leaq	16(%rsi),%rsi
+	leaq	16(%rdi),%rdi
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm8,%xmm4
+	movdqa	%xmm12,%xmm8
+	movdqa	%xmm1,%xmm12
+	movdqa	%xmm5,%xmm1
+	movdqa	%xmm9,%xmm5
+	movdqa	%xmm13,%xmm9
+	jmp	L$seal_sse_128_tail_xor
+
+L$seal_sse_tail_16:
+	testq	%rbx,%rbx
+	jz	L$process_blocks_of_extra_in
+
+	movq	%rbx,%r8
+	movq	%rbx,%rcx
+	leaq	-1(%rsi,%rbx,1),%rsi
+	pxor	%xmm15,%xmm15
+L$seal_sse_tail_16_compose:
+	pslldq	$1,%xmm15
+	pinsrb	$0,(%rsi),%xmm15
+	leaq	-1(%rsi),%rsi
+	decq	%rcx
+	jne	L$seal_sse_tail_16_compose
+
+
+	pxor	%xmm0,%xmm15
+
+
+	movq	%rbx,%rcx
+	movdqu	%xmm15,%xmm0
+L$seal_sse_tail_16_extract:
+	pextrb	$0,%xmm0,(%rdi)
+	psrldq	$1,%xmm0
+	addq	$1,%rdi
+	subq	$1,%rcx
+	jnz	L$seal_sse_tail_16_extract
+
+
+
+
+
+
+
+
+	movq	288 + 0 + 32(%rsp),%r9
+	movq	56(%r9),%r14
+	movq	48(%r9),%r13
+	testq	%r14,%r14
+	jz	L$process_partial_block
+
+	movq	$16,%r15
+	subq	%rbx,%r15
+	cmpq	%r15,%r14
+
+	jge	L$load_extra_in
+	movq	%r14,%r15
+
+L$load_extra_in:
+
+
+	leaq	-1(%r13,%r15,1),%rsi
+
+
+	addq	%r15,%r13
+	subq	%r15,%r14
+	movq	%r13,48(%r9)
+	movq	%r14,56(%r9)
+
+
+
+	addq	%r15,%r8
+
+
+	pxor	%xmm11,%xmm11
+L$load_extra_load_loop:
+	pslldq	$1,%xmm11
+	pinsrb	$0,(%rsi),%xmm11
+	leaq	-1(%rsi),%rsi
+	subq	$1,%r15
+	jnz	L$load_extra_load_loop
+
+
+
+
+	movq	%rbx,%r15
+
+L$load_extra_shift_loop:
+	pslldq	$1,%xmm11
+	subq	$1,%r15
+	jnz	L$load_extra_shift_loop
+
+
+
+
+	leaq	L$and_masks(%rip),%r15
+	shlq	$4,%rbx
+	pand	-16(%r15,%rbx,1),%xmm15
+
+
+	por	%xmm11,%xmm15
+
+
+
+.byte	102,77,15,126,253
+	pextrq	$1,%xmm15,%r14
+	addq	%r13,%r10
+	adcq	%r14,%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+
+L$process_blocks_of_extra_in:
+
+	movq	288+32+0 (%rsp),%r9
+	movq	48(%r9),%rsi
+	movq	56(%r9),%r8
+	movq	%r8,%rcx
+	shrq	$4,%r8
+
+L$process_extra_hash_loop:
+	jz	process_extra_in_trailer
+	addq	0+0(%rsi),%r10
+	adcq	8+0(%rsi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rsi),%rsi
+	subq	$1,%r8
+	jmp	L$process_extra_hash_loop
+process_extra_in_trailer:
+	andq	$15,%rcx
+	movq	%rcx,%rbx
+	jz	L$do_length_block
+	leaq	-1(%rsi,%rcx,1),%rsi
+
+L$process_extra_in_trailer_load:
+	pslldq	$1,%xmm15
+	pinsrb	$0,(%rsi),%xmm15
+	leaq	-1(%rsi),%rsi
+	subq	$1,%rcx
+	jnz	L$process_extra_in_trailer_load
+
+L$process_partial_block:
+
+	leaq	L$and_masks(%rip),%r15
+	shlq	$4,%rbx
+	pand	-16(%r15,%rbx,1),%xmm15
+.byte	102,77,15,126,253
+	pextrq	$1,%xmm15,%r14
+	addq	%r13,%r10
+	adcq	%r14,%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+
+L$do_length_block:
+	addq	0+0+32(%rbp),%r10
+	adcq	8+0+32(%rbp),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+
+	movq	%r10,%r13
+	movq	%r11,%r14
+	movq	%r12,%r15
+	subq	$-5,%r10
+	sbbq	$-1,%r11
+	sbbq	$3,%r12
+	cmovcq	%r13,%r10
+	cmovcq	%r14,%r11
+	cmovcq	%r15,%r12
+
+	addq	0+0+16(%rbp),%r10
+	adcq	8+0+16(%rbp),%r11
+
+
+	addq	$288 + 0 + 32,%rsp
+
+
+	popq	%r9
+
+	movq	%r10,(%r9)
+	movq	%r11,8(%r9)
+	popq	%r15
+
+	popq	%r14
+
+	popq	%r13
+
+	popq	%r12
+
+	popq	%rbx
+
+	popq	%rbp
+
+	ret
+
+L$seal_sse_128:
+
+	movdqu	L$chacha20_consts(%rip),%xmm0
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm0,%xmm2
+	movdqu	0(%r9),%xmm4
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm4,%xmm6
+	movdqu	16(%r9),%xmm8
+	movdqa	%xmm8,%xmm9
+	movdqa	%xmm8,%xmm10
+	movdqu	32(%r9),%xmm14
+	movdqa	%xmm14,%xmm12
+	paddd	L$sse_inc(%rip),%xmm12
+	movdqa	%xmm12,%xmm13
+	paddd	L$sse_inc(%rip),%xmm13
+	movdqa	%xmm4,%xmm7
+	movdqa	%xmm8,%xmm11
+	movdqa	%xmm12,%xmm15
+	movq	$10,%r10
+
+L$seal_sse_128_rounds:
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	L$rol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm6
+	pxor	%xmm3,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	L$rol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm6
+	pxor	%xmm3,%xmm6
+.byte	102,15,58,15,246,4
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,12
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	L$rol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	L$rol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	L$rol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm6
+	pxor	%xmm3,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	L$rol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm6
+	pxor	%xmm3,%xmm6
+.byte	102,15,58,15,246,12
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,4
+
+	decq	%r10
+	jnz	L$seal_sse_128_rounds
+	paddd	L$chacha20_consts(%rip),%xmm0
+	paddd	L$chacha20_consts(%rip),%xmm1
+	paddd	L$chacha20_consts(%rip),%xmm2
+	paddd	%xmm7,%xmm4
+	paddd	%xmm7,%xmm5
+	paddd	%xmm7,%xmm6
+	paddd	%xmm11,%xmm8
+	paddd	%xmm11,%xmm9
+	paddd	%xmm15,%xmm12
+	paddd	L$sse_inc(%rip),%xmm15
+	paddd	%xmm15,%xmm13
+
+	pand	L$clamp(%rip),%xmm2
+	movdqa	%xmm2,0+0(%rbp)
+	movdqa	%xmm6,0+16(%rbp)
+
+	movq	%r8,%r8
+	call	poly_hash_ad_internal
+	jmp	L$seal_sse_128_tail_xor
+
+
+
+
+
+.p2align	6
+chacha20_poly1305_open_avx2:
+
+
+
+
+
+
+
+
+
+
+
+
+	vzeroupper
+	vmovdqa	L$chacha20_consts(%rip),%ymm0
+	vbroadcasti128	0(%r9),%ymm4
+	vbroadcasti128	16(%r9),%ymm8
+	vbroadcasti128	32(%r9),%ymm12
+	vpaddd	L$avx2_init(%rip),%ymm12,%ymm12
+	cmpq	$192,%rbx
+	jbe	L$open_avx2_192
+	cmpq	$320,%rbx
+	jbe	L$open_avx2_320
+
+	vmovdqa	%ymm4,0+64(%rbp)
+	vmovdqa	%ymm8,0+96(%rbp)
+	vmovdqa	%ymm12,0+160(%rbp)
+	movq	$10,%r10
+L$open_avx2_init_rounds:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+
+	decq	%r10
+	jne	L$open_avx2_init_rounds
+	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
+
+	vpand	L$clamp(%rip),%ymm3,%ymm3
+	vmovdqa	%ymm3,0+0(%rbp)
+
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm4
+
+	movq	%r8,%r8
+	call	poly_hash_ad_internal
+
+	xorq	%rcx,%rcx
+L$open_avx2_init_hash:
+	addq	0+0(%rsi,%rcx,1),%r10
+	adcq	8+0(%rsi,%rcx,1),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	addq	$16,%rcx
+	cmpq	$64,%rcx
+	jne	L$open_avx2_init_hash
+
+	vpxor	0(%rsi),%ymm0,%ymm0
+	vpxor	32(%rsi),%ymm4,%ymm4
+
+	vmovdqu	%ymm0,0(%rdi)
+	vmovdqu	%ymm4,32(%rdi)
+	leaq	64(%rsi),%rsi
+	leaq	64(%rdi),%rdi
+	subq	$64,%rbx
+L$open_avx2_main_loop:
+
+	cmpq	$512,%rbx
+	jb	L$open_avx2_main_loop_done
+	vmovdqa	L$chacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm10
+	vmovdqa	%ymm0,%ymm3
+	vmovdqa	%ymm4,%ymm7
+	vmovdqa	%ymm8,%ymm11
+	vmovdqa	L$avx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm15
+	vpaddd	%ymm15,%ymm12,%ymm14
+	vpaddd	%ymm14,%ymm12,%ymm13
+	vpaddd	%ymm13,%ymm12,%ymm12
+	vmovdqa	%ymm15,0+256(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm12,0+160(%rbp)
+
+	xorq	%rcx,%rcx
+L$open_avx2_main_loop_rounds:
+	addq	0+0(%rsi,%rcx,1),%r10
+	adcq	8+0(%rsi,%rcx,1),%r11
+	adcq	$1,%r12
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	L$rol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	L$rol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	addq	0+16(%rsi,%rcx,1),%r10
+	adcq	8+16(%rsi,%rcx,1),%r11
+	adcq	$1,%r12
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$4,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$12,%ymm15,%ymm15,%ymm15
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	L$rol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	addq	0+32(%rsi,%rcx,1),%r10
+	adcq	8+32(%rsi,%rcx,1),%r11
+	adcq	$1,%r12
+
+	leaq	48(%rcx),%rcx
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	L$rol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$12,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$4,%ymm15,%ymm15,%ymm15
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+
+	cmpq	$60*8,%rcx
+	jne	L$open_avx2_main_loop_rounds
+	vpaddd	L$chacha20_consts(%rip),%ymm3,%ymm3
+	vpaddd	0+64(%rbp),%ymm7,%ymm7
+	vpaddd	0+96(%rbp),%ymm11,%ymm11
+	vpaddd	0+256(%rbp),%ymm15,%ymm15
+	vpaddd	L$chacha20_consts(%rip),%ymm2,%ymm2
+	vpaddd	0+64(%rbp),%ymm6,%ymm6
+	vpaddd	0+96(%rbp),%ymm10,%ymm10
+	vpaddd	0+224(%rbp),%ymm14,%ymm14
+	vpaddd	L$chacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	0+64(%rbp),%ymm5,%ymm5
+	vpaddd	0+96(%rbp),%ymm9,%ymm9
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+
+	vmovdqa	%ymm0,0+128(%rbp)
+	addq	0+60*8(%rsi),%r10
+	adcq	8+60*8(%rsi),%r11
+	adcq	$1,%r12
+	vperm2i128	$0x02,%ymm3,%ymm7,%ymm0
+	vperm2i128	$0x13,%ymm3,%ymm7,%ymm7
+	vperm2i128	$0x02,%ymm11,%ymm15,%ymm3
+	vperm2i128	$0x13,%ymm11,%ymm15,%ymm11
+	vpxor	0+0(%rsi),%ymm0,%ymm0
+	vpxor	32+0(%rsi),%ymm3,%ymm3
+	vpxor	64+0(%rsi),%ymm7,%ymm7
+	vpxor	96+0(%rsi),%ymm11,%ymm11
+	vmovdqu	%ymm0,0+0(%rdi)
+	vmovdqu	%ymm3,32+0(%rdi)
+	vmovdqu	%ymm7,64+0(%rdi)
+	vmovdqu	%ymm11,96+0(%rdi)
+
+	vmovdqa	0+128(%rbp),%ymm0
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
+	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
+	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
+	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
+	vpxor	0+128(%rsi),%ymm3,%ymm3
+	vpxor	32+128(%rsi),%ymm2,%ymm2
+	vpxor	64+128(%rsi),%ymm6,%ymm6
+	vpxor	96+128(%rsi),%ymm10,%ymm10
+	vmovdqu	%ymm3,0+128(%rdi)
+	vmovdqu	%ymm2,32+128(%rdi)
+	vmovdqu	%ymm6,64+128(%rdi)
+	vmovdqu	%ymm10,96+128(%rdi)
+	addq	0+60*8+16(%rsi),%r10
+	adcq	8+60*8+16(%rsi),%r11
+	adcq	$1,%r12
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
+	vpxor	0+256(%rsi),%ymm3,%ymm3
+	vpxor	32+256(%rsi),%ymm1,%ymm1
+	vpxor	64+256(%rsi),%ymm5,%ymm5
+	vpxor	96+256(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm3,0+256(%rdi)
+	vmovdqu	%ymm1,32+256(%rdi)
+	vmovdqu	%ymm5,64+256(%rdi)
+	vmovdqu	%ymm9,96+256(%rdi)
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm4
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm0
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm8
+	vpxor	0+384(%rsi),%ymm3,%ymm3
+	vpxor	32+384(%rsi),%ymm0,%ymm0
+	vpxor	64+384(%rsi),%ymm4,%ymm4
+	vpxor	96+384(%rsi),%ymm8,%ymm8
+	vmovdqu	%ymm3,0+384(%rdi)
+	vmovdqu	%ymm0,32+384(%rdi)
+	vmovdqu	%ymm4,64+384(%rdi)
+	vmovdqu	%ymm8,96+384(%rdi)
+
+	leaq	512(%rsi),%rsi
+	leaq	512(%rdi),%rdi
+	subq	$512,%rbx
+	jmp	L$open_avx2_main_loop
+L$open_avx2_main_loop_done:
+	testq	%rbx,%rbx
+	vzeroupper
+	je	L$open_sse_finalize
+
+	cmpq	$384,%rbx
+	ja	L$open_avx2_tail_512
+	cmpq	$256,%rbx
+	ja	L$open_avx2_tail_384
+	cmpq	$128,%rbx
+	ja	L$open_avx2_tail_256
+	vmovdqa	L$chacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	L$avx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vmovdqa	%ymm12,0+160(%rbp)
+
+	xorq	%r8,%r8
+	movq	%rbx,%rcx
+	andq	$-16,%rcx
+	testq	%rcx,%rcx
+	je	L$open_avx2_tail_128_rounds
+L$open_avx2_tail_128_rounds_and_x1hash:
+	addq	0+0(%rsi,%r8,1),%r10
+	adcq	8+0(%rsi,%r8,1),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+L$open_avx2_tail_128_rounds:
+	addq	$16,%r8
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+
+	cmpq	%rcx,%r8
+	jb	L$open_avx2_tail_128_rounds_and_x1hash
+	cmpq	$160,%r8
+	jne	L$open_avx2_tail_128_rounds
+	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
+	vmovdqa	%ymm3,%ymm8
+
+	jmp	L$open_avx2_tail_128_xor
+
+L$open_avx2_tail_256:
+	vmovdqa	L$chacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	L$avx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm13
+	vpaddd	%ymm13,%ymm12,%ymm12
+	vmovdqa	%ymm12,0+160(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+
+	movq	%rbx,0+128(%rbp)
+	movq	%rbx,%rcx
+	subq	$128,%rcx
+	shrq	$4,%rcx
+	movq	$10,%r8
+	cmpq	$10,%rcx
+	cmovgq	%r8,%rcx
+	movq	%rsi,%rbx
+	xorq	%r8,%r8
+L$open_avx2_tail_256_rounds_and_x1hash:
+	addq	0+0(%rbx),%r10
+	adcq	8+0(%rbx),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rbx),%rbx
+L$open_avx2_tail_256_rounds:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+
+	incq	%r8
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	L$rol16(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpsrld	$20,%ymm6,%ymm3
+	vpslld	$12,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	L$rol8(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpslld	$7,%ymm6,%ymm3
+	vpsrld	$25,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+
+	cmpq	%rcx,%r8
+	jb	L$open_avx2_tail_256_rounds_and_x1hash
+	cmpq	$10,%r8
+	jne	L$open_avx2_tail_256_rounds
+	movq	%rbx,%r8
+	subq	%rsi,%rbx
+	movq	%rbx,%rcx
+	movq	0+128(%rbp),%rbx
+L$open_avx2_tail_256_hash:
+	addq	$16,%rcx
+	cmpq	%rbx,%rcx
+	jg	L$open_avx2_tail_256_done
+	addq	0+0(%r8),%r10
+	adcq	8+0(%r8),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%r8),%r8
+	jmp	L$open_avx2_tail_256_hash
+L$open_avx2_tail_256_done:
+	vpaddd	L$chacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	0+64(%rbp),%ymm5,%ymm5
+	vpaddd	0+96(%rbp),%ymm9,%ymm9
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
+	vpxor	0+0(%rsi),%ymm3,%ymm3
+	vpxor	32+0(%rsi),%ymm1,%ymm1
+	vpxor	64+0(%rsi),%ymm5,%ymm5
+	vpxor	96+0(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm3,0+0(%rdi)
+	vmovdqu	%ymm1,32+0(%rdi)
+	vmovdqu	%ymm5,64+0(%rdi)
+	vmovdqu	%ymm9,96+0(%rdi)
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
+	vmovdqa	%ymm3,%ymm8
+
+	leaq	128(%rsi),%rsi
+	leaq	128(%rdi),%rdi
+	subq	$128,%rbx
+	jmp	L$open_avx2_tail_128_xor
+
+L$open_avx2_tail_384:
+	vmovdqa	L$chacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm10
+	vmovdqa	L$avx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm14
+	vpaddd	%ymm14,%ymm12,%ymm13
+	vpaddd	%ymm13,%ymm12,%ymm12
+	vmovdqa	%ymm12,0+160(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+
+	movq	%rbx,0+128(%rbp)
+	movq	%rbx,%rcx
+	subq	$256,%rcx
+	shrq	$4,%rcx
+	addq	$6,%rcx
+	movq	$10,%r8
+	cmpq	$10,%rcx
+	cmovgq	%r8,%rcx
+	movq	%rsi,%rbx
+	xorq	%r8,%r8
+L$open_avx2_tail_384_rounds_and_x2hash:
+	addq	0+0(%rbx),%r10
+	adcq	8+0(%rbx),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rbx),%rbx
+L$open_avx2_tail_384_rounds_and_x1hash:
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	L$rol16(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpsrld	$20,%ymm6,%ymm3
+	vpslld	$12,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	L$rol8(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpslld	$7,%ymm6,%ymm3
+	vpsrld	$25,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	addq	0+0(%rbx),%r10
+	adcq	8+0(%rbx),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rbx),%rbx
+	incq	%r8
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	L$rol16(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpsrld	$20,%ymm6,%ymm3
+	vpslld	$12,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	L$rol8(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpslld	$7,%ymm6,%ymm3
+	vpsrld	$25,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+
+	cmpq	%rcx,%r8
+	jb	L$open_avx2_tail_384_rounds_and_x2hash
+	cmpq	$10,%r8
+	jne	L$open_avx2_tail_384_rounds_and_x1hash
+	movq	%rbx,%r8
+	subq	%rsi,%rbx
+	movq	%rbx,%rcx
+	movq	0+128(%rbp),%rbx
+L$open_avx2_384_tail_hash:
+	addq	$16,%rcx
+	cmpq	%rbx,%rcx
+	jg	L$open_avx2_384_tail_done
+	addq	0+0(%r8),%r10
+	adcq	8+0(%r8),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%r8),%r8
+	jmp	L$open_avx2_384_tail_hash
+L$open_avx2_384_tail_done:
+	vpaddd	L$chacha20_consts(%rip),%ymm2,%ymm2
+	vpaddd	0+64(%rbp),%ymm6,%ymm6
+	vpaddd	0+96(%rbp),%ymm10,%ymm10
+	vpaddd	0+224(%rbp),%ymm14,%ymm14
+	vpaddd	L$chacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	0+64(%rbp),%ymm5,%ymm5
+	vpaddd	0+96(%rbp),%ymm9,%ymm9
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
+	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
+	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
+	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
+	vpxor	0+0(%rsi),%ymm3,%ymm3
+	vpxor	32+0(%rsi),%ymm2,%ymm2
+	vpxor	64+0(%rsi),%ymm6,%ymm6
+	vpxor	96+0(%rsi),%ymm10,%ymm10
+	vmovdqu	%ymm3,0+0(%rdi)
+	vmovdqu	%ymm2,32+0(%rdi)
+	vmovdqu	%ymm6,64+0(%rdi)
+	vmovdqu	%ymm10,96+0(%rdi)
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
+	vpxor	0+128(%rsi),%ymm3,%ymm3
+	vpxor	32+128(%rsi),%ymm1,%ymm1
+	vpxor	64+128(%rsi),%ymm5,%ymm5
+	vpxor	96+128(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm3,0+128(%rdi)
+	vmovdqu	%ymm1,32+128(%rdi)
+	vmovdqu	%ymm5,64+128(%rdi)
+	vmovdqu	%ymm9,96+128(%rdi)
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
+	vmovdqa	%ymm3,%ymm8
+
+	leaq	256(%rsi),%rsi
+	leaq	256(%rdi),%rdi
+	subq	$256,%rbx
+	jmp	L$open_avx2_tail_128_xor
+
+L$open_avx2_tail_512:
+	vmovdqa	L$chacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm10
+	vmovdqa	%ymm0,%ymm3
+	vmovdqa	%ymm4,%ymm7
+	vmovdqa	%ymm8,%ymm11
+	vmovdqa	L$avx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm15
+	vpaddd	%ymm15,%ymm12,%ymm14
+	vpaddd	%ymm14,%ymm12,%ymm13
+	vpaddd	%ymm13,%ymm12,%ymm12
+	vmovdqa	%ymm15,0+256(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm12,0+160(%rbp)
+
+	xorq	%rcx,%rcx
+	movq	%rsi,%r8
+L$open_avx2_tail_512_rounds_and_x2hash:
+	addq	0+0(%r8),%r10
+	adcq	8+0(%r8),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%r8),%r8
+L$open_avx2_tail_512_rounds_and_x1hash:
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	L$rol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	L$rol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	addq	0+0(%r8),%r10
+	adcq	8+0(%r8),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$4,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$12,%ymm15,%ymm15,%ymm15
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	L$rol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	addq	0+16(%r8),%r10
+	adcq	8+16(%r8),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	32(%r8),%r8
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	L$rol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$12,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$4,%ymm15,%ymm15,%ymm15
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+
+	incq	%rcx
+	cmpq	$4,%rcx
+	jl	L$open_avx2_tail_512_rounds_and_x2hash
+	cmpq	$10,%rcx
+	jne	L$open_avx2_tail_512_rounds_and_x1hash
+	movq	%rbx,%rcx
+	subq	$384,%rcx
+	andq	$-16,%rcx
+L$open_avx2_tail_512_hash:
+	testq	%rcx,%rcx
+	je	L$open_avx2_tail_512_done
+	addq	0+0(%r8),%r10
+	adcq	8+0(%r8),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%r8),%r8
+	subq	$16,%rcx
+	jmp	L$open_avx2_tail_512_hash
+L$open_avx2_tail_512_done:
+	vpaddd	L$chacha20_consts(%rip),%ymm3,%ymm3
+	vpaddd	0+64(%rbp),%ymm7,%ymm7
+	vpaddd	0+96(%rbp),%ymm11,%ymm11
+	vpaddd	0+256(%rbp),%ymm15,%ymm15
+	vpaddd	L$chacha20_consts(%rip),%ymm2,%ymm2
+	vpaddd	0+64(%rbp),%ymm6,%ymm6
+	vpaddd	0+96(%rbp),%ymm10,%ymm10
+	vpaddd	0+224(%rbp),%ymm14,%ymm14
+	vpaddd	L$chacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	0+64(%rbp),%ymm5,%ymm5
+	vpaddd	0+96(%rbp),%ymm9,%ymm9
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+
+	vmovdqa	%ymm0,0+128(%rbp)
+	vperm2i128	$0x02,%ymm3,%ymm7,%ymm0
+	vperm2i128	$0x13,%ymm3,%ymm7,%ymm7
+	vperm2i128	$0x02,%ymm11,%ymm15,%ymm3
+	vperm2i128	$0x13,%ymm11,%ymm15,%ymm11
+	vpxor	0+0(%rsi),%ymm0,%ymm0
+	vpxor	32+0(%rsi),%ymm3,%ymm3
+	vpxor	64+0(%rsi),%ymm7,%ymm7
+	vpxor	96+0(%rsi),%ymm11,%ymm11
+	vmovdqu	%ymm0,0+0(%rdi)
+	vmovdqu	%ymm3,32+0(%rdi)
+	vmovdqu	%ymm7,64+0(%rdi)
+	vmovdqu	%ymm11,96+0(%rdi)
+
+	vmovdqa	0+128(%rbp),%ymm0
+	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
+	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
+	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
+	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
+	vpxor	0+128(%rsi),%ymm3,%ymm3
+	vpxor	32+128(%rsi),%ymm2,%ymm2
+	vpxor	64+128(%rsi),%ymm6,%ymm6
+	vpxor	96+128(%rsi),%ymm10,%ymm10
+	vmovdqu	%ymm3,0+128(%rdi)
+	vmovdqu	%ymm2,32+128(%rdi)
+	vmovdqu	%ymm6,64+128(%rdi)
+	vmovdqu	%ymm10,96+128(%rdi)
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
+	vpxor	0+256(%rsi),%ymm3,%ymm3
+	vpxor	32+256(%rsi),%ymm1,%ymm1
+	vpxor	64+256(%rsi),%ymm5,%ymm5
+	vpxor	96+256(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm3,0+256(%rdi)
+	vmovdqu	%ymm1,32+256(%rdi)
+	vmovdqu	%ymm5,64+256(%rdi)
+	vmovdqu	%ymm9,96+256(%rdi)
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
+	vmovdqa	%ymm3,%ymm8
+
+	leaq	384(%rsi),%rsi
+	leaq	384(%rdi),%rdi
+	subq	$384,%rbx
+L$open_avx2_tail_128_xor:
+	cmpq	$32,%rbx
+	jb	L$open_avx2_tail_32_xor
+	subq	$32,%rbx
+	vpxor	(%rsi),%ymm0,%ymm0
+	vmovdqu	%ymm0,(%rdi)
+	leaq	32(%rsi),%rsi
+	leaq	32(%rdi),%rdi
+	vmovdqa	%ymm4,%ymm0
+	vmovdqa	%ymm8,%ymm4
+	vmovdqa	%ymm12,%ymm8
+	jmp	L$open_avx2_tail_128_xor
+L$open_avx2_tail_32_xor:
+	cmpq	$16,%rbx
+	vmovdqa	%xmm0,%xmm1
+	jb	L$open_avx2_exit
+	subq	$16,%rbx
+
+	vpxor	(%rsi),%xmm0,%xmm1
+	vmovdqu	%xmm1,(%rdi)
+	leaq	16(%rsi),%rsi
+	leaq	16(%rdi),%rdi
+	vperm2i128	$0x11,%ymm0,%ymm0,%ymm0
+	vmovdqa	%xmm0,%xmm1
+L$open_avx2_exit:
+	vzeroupper
+	jmp	L$open_sse_tail_16
+
+L$open_avx2_192:
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm8,%ymm10
+	vpaddd	L$avx2_inc(%rip),%ymm12,%ymm13
+	vmovdqa	%ymm12,%ymm11
+	vmovdqa	%ymm13,%ymm15
+	movq	$10,%r10
+L$open_avx2_192_rounds:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+
+	decq	%r10
+	jne	L$open_avx2_192_rounds
+	vpaddd	%ymm2,%ymm0,%ymm0
+	vpaddd	%ymm2,%ymm1,%ymm1
+	vpaddd	%ymm6,%ymm4,%ymm4
+	vpaddd	%ymm6,%ymm5,%ymm5
+	vpaddd	%ymm10,%ymm8,%ymm8
+	vpaddd	%ymm10,%ymm9,%ymm9
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm13,%ymm13
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
+
+	vpand	L$clamp(%rip),%ymm3,%ymm3
+	vmovdqa	%ymm3,0+0(%rbp)
+
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm8
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm12
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm5
+L$open_avx2_short:
+	movq	%r8,%r8
+	call	poly_hash_ad_internal
+L$open_avx2_short_hash_and_xor_loop:
+	cmpq	$32,%rbx
+	jb	L$open_avx2_short_tail_32
+	subq	$32,%rbx
+	addq	0+0(%rsi),%r10
+	adcq	8+0(%rsi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	addq	0+16(%rsi),%r10
+	adcq	8+16(%rsi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+
+	vpxor	(%rsi),%ymm0,%ymm0
+	vmovdqu	%ymm0,(%rdi)
+	leaq	32(%rsi),%rsi
+	leaq	32(%rdi),%rdi
+
+	vmovdqa	%ymm4,%ymm0
+	vmovdqa	%ymm8,%ymm4
+	vmovdqa	%ymm12,%ymm8
+	vmovdqa	%ymm1,%ymm12
+	vmovdqa	%ymm5,%ymm1
+	vmovdqa	%ymm9,%ymm5
+	vmovdqa	%ymm13,%ymm9
+	vmovdqa	%ymm2,%ymm13
+	vmovdqa	%ymm6,%ymm2
+	jmp	L$open_avx2_short_hash_and_xor_loop
+L$open_avx2_short_tail_32:
+	cmpq	$16,%rbx
+	vmovdqa	%xmm0,%xmm1
+	jb	L$open_avx2_short_tail_32_exit
+	subq	$16,%rbx
+	addq	0+0(%rsi),%r10
+	adcq	8+0(%rsi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	vpxor	(%rsi),%xmm0,%xmm3
+	vmovdqu	%xmm3,(%rdi)
+	leaq	16(%rsi),%rsi
+	leaq	16(%rdi),%rdi
+	vextracti128	$1,%ymm0,%xmm1
+L$open_avx2_short_tail_32_exit:
+	vzeroupper
+	jmp	L$open_sse_tail_16
+
+L$open_avx2_320:
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm8,%ymm10
+	vpaddd	L$avx2_inc(%rip),%ymm12,%ymm13
+	vpaddd	L$avx2_inc(%rip),%ymm13,%ymm14
+	vmovdqa	%ymm4,%ymm7
+	vmovdqa	%ymm8,%ymm11
+	vmovdqa	%ymm12,0+160(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+	movq	$10,%r10
+L$open_avx2_320_rounds:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	L$rol16(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpsrld	$20,%ymm6,%ymm3
+	vpslld	$12,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	L$rol8(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpslld	$7,%ymm6,%ymm3
+	vpsrld	$25,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	L$rol16(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpsrld	$20,%ymm6,%ymm3
+	vpslld	$12,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	L$rol8(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpslld	$7,%ymm6,%ymm3
+	vpsrld	$25,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+
+	decq	%r10
+	jne	L$open_avx2_320_rounds
+	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	L$chacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	L$chacha20_consts(%rip),%ymm2,%ymm2
+	vpaddd	%ymm7,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm5,%ymm5
+	vpaddd	%ymm7,%ymm6,%ymm6
+	vpaddd	%ymm11,%ymm8,%ymm8
+	vpaddd	%ymm11,%ymm9,%ymm9
+	vpaddd	%ymm11,%ymm10,%ymm10
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	0+224(%rbp),%ymm14,%ymm14
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
+
+	vpand	L$clamp(%rip),%ymm3,%ymm3
+	vmovdqa	%ymm3,0+0(%rbp)
+
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm8
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm12
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm5
+	vperm2i128	$0x02,%ymm2,%ymm6,%ymm9
+	vperm2i128	$0x02,%ymm10,%ymm14,%ymm13
+	vperm2i128	$0x13,%ymm2,%ymm6,%ymm2
+	vperm2i128	$0x13,%ymm10,%ymm14,%ymm6
+	jmp	L$open_avx2_short
+
+
+
+
+
+.p2align	6
+chacha20_poly1305_seal_avx2:
+
+
+
+
+
+
+
+
+
+
+
+
+	vzeroupper
+	vmovdqa	L$chacha20_consts(%rip),%ymm0
+	vbroadcasti128	0(%r9),%ymm4
+	vbroadcasti128	16(%r9),%ymm8
+	vbroadcasti128	32(%r9),%ymm12
+	vpaddd	L$avx2_init(%rip),%ymm12,%ymm12
+	cmpq	$192,%rbx
+	jbe	L$seal_avx2_192
+	cmpq	$320,%rbx
+	jbe	L$seal_avx2_320
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm0,%ymm3
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm4,%ymm7
+	vmovdqa	%ymm4,0+64(%rbp)
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm8,%ymm10
+	vmovdqa	%ymm8,%ymm11
+	vmovdqa	%ymm8,0+96(%rbp)
+	vmovdqa	%ymm12,%ymm15
+	vpaddd	L$avx2_inc(%rip),%ymm15,%ymm14
+	vpaddd	L$avx2_inc(%rip),%ymm14,%ymm13
+	vpaddd	L$avx2_inc(%rip),%ymm13,%ymm12
+	vmovdqa	%ymm12,0+160(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+	vmovdqa	%ymm15,0+256(%rbp)
+	movq	$10,%r10
+L$seal_avx2_init_rounds:
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	L$rol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	L$rol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$4,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$12,%ymm15,%ymm15,%ymm15
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	L$rol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	L$rol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$12,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$4,%ymm15,%ymm15,%ymm15
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+
+	decq	%r10
+	jnz	L$seal_avx2_init_rounds
+	vpaddd	L$chacha20_consts(%rip),%ymm3,%ymm3
+	vpaddd	0+64(%rbp),%ymm7,%ymm7
+	vpaddd	0+96(%rbp),%ymm11,%ymm11
+	vpaddd	0+256(%rbp),%ymm15,%ymm15
+	vpaddd	L$chacha20_consts(%rip),%ymm2,%ymm2
+	vpaddd	0+64(%rbp),%ymm6,%ymm6
+	vpaddd	0+96(%rbp),%ymm10,%ymm10
+	vpaddd	0+224(%rbp),%ymm14,%ymm14
+	vpaddd	L$chacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	0+64(%rbp),%ymm5,%ymm5
+	vpaddd	0+96(%rbp),%ymm9,%ymm9
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+
+	vperm2i128	$0x13,%ymm11,%ymm15,%ymm11
+	vperm2i128	$0x02,%ymm3,%ymm7,%ymm15
+	vperm2i128	$0x13,%ymm3,%ymm7,%ymm3
+	vpand	L$clamp(%rip),%ymm15,%ymm15
+	vmovdqa	%ymm15,0+0(%rbp)
+	movq	%r8,%r8
+	call	poly_hash_ad_internal
+
+	vpxor	0(%rsi),%ymm3,%ymm3
+	vpxor	32(%rsi),%ymm11,%ymm11
+	vmovdqu	%ymm3,0(%rdi)
+	vmovdqu	%ymm11,32(%rdi)
+	vperm2i128	$0x02,%ymm2,%ymm6,%ymm15
+	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
+	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
+	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
+	vpxor	0+64(%rsi),%ymm15,%ymm15
+	vpxor	32+64(%rsi),%ymm2,%ymm2
+	vpxor	64+64(%rsi),%ymm6,%ymm6
+	vpxor	96+64(%rsi),%ymm10,%ymm10
+	vmovdqu	%ymm15,0+64(%rdi)
+	vmovdqu	%ymm2,32+64(%rdi)
+	vmovdqu	%ymm6,64+64(%rdi)
+	vmovdqu	%ymm10,96+64(%rdi)
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm15
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
+	vpxor	0+192(%rsi),%ymm15,%ymm15
+	vpxor	32+192(%rsi),%ymm1,%ymm1
+	vpxor	64+192(%rsi),%ymm5,%ymm5
+	vpxor	96+192(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm15,0+192(%rdi)
+	vmovdqu	%ymm1,32+192(%rdi)
+	vmovdqu	%ymm5,64+192(%rdi)
+	vmovdqu	%ymm9,96+192(%rdi)
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm15
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
+	vmovdqa	%ymm15,%ymm8
+
+	leaq	320(%rsi),%rsi
+	subq	$320,%rbx
+	movq	$320,%rcx
+	cmpq	$128,%rbx
+	jbe	L$seal_avx2_short_hash_remainder
+	vpxor	0(%rsi),%ymm0,%ymm0
+	vpxor	32(%rsi),%ymm4,%ymm4
+	vpxor	64(%rsi),%ymm8,%ymm8
+	vpxor	96(%rsi),%ymm12,%ymm12
+	vmovdqu	%ymm0,320(%rdi)
+	vmovdqu	%ymm4,352(%rdi)
+	vmovdqu	%ymm8,384(%rdi)
+	vmovdqu	%ymm12,416(%rdi)
+	leaq	128(%rsi),%rsi
+	subq	$128,%rbx
+	movq	$8,%rcx
+	movq	$2,%r8
+	cmpq	$128,%rbx
+	jbe	L$seal_avx2_tail_128
+	cmpq	$256,%rbx
+	jbe	L$seal_avx2_tail_256
+	cmpq	$384,%rbx
+	jbe	L$seal_avx2_tail_384
+	cmpq	$512,%rbx
+	jbe	L$seal_avx2_tail_512
+	vmovdqa	L$chacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm10
+	vmovdqa	%ymm0,%ymm3
+	vmovdqa	%ymm4,%ymm7
+	vmovdqa	%ymm8,%ymm11
+	vmovdqa	L$avx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm15
+	vpaddd	%ymm15,%ymm12,%ymm14
+	vpaddd	%ymm14,%ymm12,%ymm13
+	vpaddd	%ymm13,%ymm12,%ymm12
+	vmovdqa	%ymm15,0+256(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm12,0+160(%rbp)
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	L$rol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	L$rol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$4,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$12,%ymm15,%ymm15,%ymm15
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	L$rol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	L$rol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$12,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$4,%ymm15,%ymm15,%ymm15
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	L$rol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	L$rol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+
+	subq	$16,%rdi
+	movq	$9,%rcx
+	jmp	L$seal_avx2_main_loop_rounds_entry
+.p2align	5
+L$seal_avx2_main_loop:
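+// Main sealing loop: each iteration generates eight 64-byte ChaCha20
+// blocks (512 bytes of keystream), with the round computations
+// interleaved against Poly1305 hashing of already-written ciphertext.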
+	vmovdqa	L$chacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm10
+	vmovdqa	%ymm0,%ymm3
+	vmovdqa	%ymm4,%ymm7
+	vmovdqa	%ymm8,%ymm11
+	vmovdqa	L$avx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm15
+	vpaddd	%ymm15,%ymm12,%ymm14
+	vpaddd	%ymm14,%ymm12,%ymm13
+	vpaddd	%ymm13,%ymm12,%ymm12
+	vmovdqa	%ymm15,0+256(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm12,0+160(%rbp)
+
+	movq	$10,%rcx
+.p2align	5
+L$seal_avx2_main_loop_rounds:
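+// One ChaCha20 double round, interleaved with three Poly1305 block
+// multiplications (mulx-based) covering 48 bytes of ciphertext.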
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	L$rol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	L$rol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+L$seal_avx2_main_loop_rounds_entry:
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	addq	0+16(%rdi),%r10
+	adcq	8+16(%rdi),%r11
+	adcq	$1,%r12
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$4,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$12,%ymm15,%ymm15,%ymm15
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	L$rol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	addq	0+32(%rdi),%r10
+	adcq	8+32(%rdi),%r11
+	adcq	$1,%r12
+
+	leaq	48(%rdi),%rdi
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	L$rol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$12,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$4,%ymm15,%ymm15,%ymm15
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+
+	decq	%rcx
+	jne	L$seal_avx2_main_loop_rounds
+	vpaddd	L$chacha20_consts(%rip),%ymm3,%ymm3
+	vpaddd	0+64(%rbp),%ymm7,%ymm7
+	vpaddd	0+96(%rbp),%ymm11,%ymm11
+	vpaddd	0+256(%rbp),%ymm15,%ymm15
+	vpaddd	L$chacha20_consts(%rip),%ymm2,%ymm2
+	vpaddd	0+64(%rbp),%ymm6,%ymm6
+	vpaddd	0+96(%rbp),%ymm10,%ymm10
+	vpaddd	0+224(%rbp),%ymm14,%ymm14
+	vpaddd	L$chacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	0+64(%rbp),%ymm5,%ymm5
+	vpaddd	0+96(%rbp),%ymm9,%ymm9
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+
+	vmovdqa	%ymm0,0+128(%rbp)
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	addq	0+16(%rdi),%r10
+	adcq	8+16(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	32(%rdi),%rdi
+	vperm2i128	$0x02,%ymm3,%ymm7,%ymm0
+	vperm2i128	$0x13,%ymm3,%ymm7,%ymm7
+	vperm2i128	$0x02,%ymm11,%ymm15,%ymm3
+	vperm2i128	$0x13,%ymm11,%ymm15,%ymm11
+	vpxor	0+0(%rsi),%ymm0,%ymm0
+	vpxor	32+0(%rsi),%ymm3,%ymm3
+	vpxor	64+0(%rsi),%ymm7,%ymm7
+	vpxor	96+0(%rsi),%ymm11,%ymm11
+	vmovdqu	%ymm0,0+0(%rdi)
+	vmovdqu	%ymm3,32+0(%rdi)
+	vmovdqu	%ymm7,64+0(%rdi)
+	vmovdqu	%ymm11,96+0(%rdi)
+
+	vmovdqa	0+128(%rbp),%ymm0
+	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
+	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
+	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
+	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
+	vpxor	0+128(%rsi),%ymm3,%ymm3
+	vpxor	32+128(%rsi),%ymm2,%ymm2
+	vpxor	64+128(%rsi),%ymm6,%ymm6
+	vpxor	96+128(%rsi),%ymm10,%ymm10
+	vmovdqu	%ymm3,0+128(%rdi)
+	vmovdqu	%ymm2,32+128(%rdi)
+	vmovdqu	%ymm6,64+128(%rdi)
+	vmovdqu	%ymm10,96+128(%rdi)
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
+	vpxor	0+256(%rsi),%ymm3,%ymm3
+	vpxor	32+256(%rsi),%ymm1,%ymm1
+	vpxor	64+256(%rsi),%ymm5,%ymm5
+	vpxor	96+256(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm3,0+256(%rdi)
+	vmovdqu	%ymm1,32+256(%rdi)
+	vmovdqu	%ymm5,64+256(%rdi)
+	vmovdqu	%ymm9,96+256(%rdi)
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm4
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm0
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm8
+	vpxor	0+384(%rsi),%ymm3,%ymm3
+	vpxor	32+384(%rsi),%ymm0,%ymm0
+	vpxor	64+384(%rsi),%ymm4,%ymm4
+	vpxor	96+384(%rsi),%ymm8,%ymm8
+	vmovdqu	%ymm3,0+384(%rdi)
+	vmovdqu	%ymm0,32+384(%rdi)
+	vmovdqu	%ymm4,64+384(%rdi)
+	vmovdqu	%ymm8,96+384(%rdi)
+
+	leaq	512(%rsi),%rsi
+	subq	$512,%rbx
+	cmpq	$512,%rbx
+	jg	L$seal_avx2_main_loop
+
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	addq	0+16(%rdi),%r10
+	adcq	8+16(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	32(%rdi),%rdi
+	movq	$10,%rcx
+	xorq	%r8,%r8
+
+	cmpq	$384,%rbx
+	ja	L$seal_avx2_tail_512
+	cmpq	$256,%rbx
+	ja	L$seal_avx2_tail_384
+	cmpq	$128,%rbx
+	ja	L$seal_avx2_tail_256
+
+L$seal_avx2_tail_128:
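+// Tail: at most 128 bytes of plaintext remain; run one two-block
+// ChaCha20 state while folding the outstanding ciphertext into the
+// Poly1305 accumulator.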
+	vmovdqa	L$chacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	L$avx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vmovdqa	%ymm12,0+160(%rbp)
+
+L$seal_avx2_tail_128_rounds_and_3xhash:
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+L$seal_avx2_tail_128_rounds_and_2xhash:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	addq	0+16(%rdi),%r10
+	adcq	8+16(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	32(%rdi),%rdi
+	decq	%rcx
+	jg	L$seal_avx2_tail_128_rounds_and_3xhash
+	decq	%r8
+	jge	L$seal_avx2_tail_128_rounds_and_2xhash
+	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
+	vmovdqa	%ymm3,%ymm8
+
+	jmp	L$seal_avx2_short_loop
+
+L$seal_avx2_tail_256:
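+// Tail: at most 256 bytes remain (two two-block ChaCha20 states).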
+	vmovdqa	L$chacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	L$avx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm13
+	vpaddd	%ymm13,%ymm12,%ymm12
+	vmovdqa	%ymm12,0+160(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+
+L$seal_avx2_tail_256_rounds_and_3xhash:
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+L$seal_avx2_tail_256_rounds_and_2xhash:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	addq	0+16(%rdi),%r10
+	adcq	8+16(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	32(%rdi),%rdi
+	decq	%rcx
+	jg	L$seal_avx2_tail_256_rounds_and_3xhash
+	decq	%r8
+	jge	L$seal_avx2_tail_256_rounds_and_2xhash
+	vpaddd	L$chacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	0+64(%rbp),%ymm5,%ymm5
+	vpaddd	0+96(%rbp),%ymm9,%ymm9
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
+	vpxor	0+0(%rsi),%ymm3,%ymm3
+	vpxor	32+0(%rsi),%ymm1,%ymm1
+	vpxor	64+0(%rsi),%ymm5,%ymm5
+	vpxor	96+0(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm3,0+0(%rdi)
+	vmovdqu	%ymm1,32+0(%rdi)
+	vmovdqu	%ymm5,64+0(%rdi)
+	vmovdqu	%ymm9,96+0(%rdi)
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
+	vmovdqa	%ymm3,%ymm8
+
+	movq	$128,%rcx
+	leaq	128(%rsi),%rsi
+	subq	$128,%rbx
+	jmp	L$seal_avx2_short_hash_remainder
+
+L$seal_avx2_tail_384:
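+// Tail: at most 384 bytes remain (three two-block ChaCha20 states).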
+	vmovdqa	L$chacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm10
+	vmovdqa	L$avx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm14
+	vpaddd	%ymm14,%ymm12,%ymm13
+	vpaddd	%ymm13,%ymm12,%ymm12
+	vmovdqa	%ymm12,0+160(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+
+L$seal_avx2_tail_384_rounds_and_3xhash:
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+L$seal_avx2_tail_384_rounds_and_2xhash:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	L$rol16(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpsrld	$20,%ymm6,%ymm3
+	vpslld	$12,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	L$rol8(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpslld	$7,%ymm6,%ymm3
+	vpsrld	$25,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	addq	0+16(%rdi),%r10
+	adcq	8+16(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	L$rol16(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpsrld	$20,%ymm6,%ymm3
+	vpslld	$12,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	L$rol8(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpslld	$7,%ymm6,%ymm3
+	vpsrld	$25,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+
+	leaq	32(%rdi),%rdi
+	decq	%rcx
+	jg	L$seal_avx2_tail_384_rounds_and_3xhash
+	decq	%r8
+	jge	L$seal_avx2_tail_384_rounds_and_2xhash
+	vpaddd	L$chacha20_consts(%rip),%ymm2,%ymm2
+	vpaddd	0+64(%rbp),%ymm6,%ymm6
+	vpaddd	0+96(%rbp),%ymm10,%ymm10
+	vpaddd	0+224(%rbp),%ymm14,%ymm14
+	vpaddd	L$chacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	0+64(%rbp),%ymm5,%ymm5
+	vpaddd	0+96(%rbp),%ymm9,%ymm9
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
+	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
+	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
+	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
+	vpxor	0+0(%rsi),%ymm3,%ymm3
+	vpxor	32+0(%rsi),%ymm2,%ymm2
+	vpxor	64+0(%rsi),%ymm6,%ymm6
+	vpxor	96+0(%rsi),%ymm10,%ymm10
+	vmovdqu	%ymm3,0+0(%rdi)
+	vmovdqu	%ymm2,32+0(%rdi)
+	vmovdqu	%ymm6,64+0(%rdi)
+	vmovdqu	%ymm10,96+0(%rdi)
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
+	vpxor	0+128(%rsi),%ymm3,%ymm3
+	vpxor	32+128(%rsi),%ymm1,%ymm1
+	vpxor	64+128(%rsi),%ymm5,%ymm5
+	vpxor	96+128(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm3,0+128(%rdi)
+	vmovdqu	%ymm1,32+128(%rdi)
+	vmovdqu	%ymm5,64+128(%rdi)
+	vmovdqu	%ymm9,96+128(%rdi)
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
+	vmovdqa	%ymm3,%ymm8
+
+	movq	$256,%rcx
+	leaq	256(%rsi),%rsi
+	subq	$256,%rbx
+	jmp	L$seal_avx2_short_hash_remainder
+
+L$seal_avx2_tail_512:
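+// Tail: at most 512 bytes remain (four two-block ChaCha20 states);
+// this variant uses the mulx-based Poly1305 steps like the main loop.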
+	vmovdqa	L$chacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm10
+	vmovdqa	%ymm0,%ymm3
+	vmovdqa	%ymm4,%ymm7
+	vmovdqa	%ymm8,%ymm11
+	vmovdqa	L$avx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm15
+	vpaddd	%ymm15,%ymm12,%ymm14
+	vpaddd	%ymm14,%ymm12,%ymm13
+	vpaddd	%ymm13,%ymm12,%ymm12
+	vmovdqa	%ymm15,0+256(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm12,0+160(%rbp)
+
+L$seal_avx2_tail_512_rounds_and_3xhash:
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+L$seal_avx2_tail_512_rounds_and_2xhash:
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	L$rol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	L$rol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$4,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$12,%ymm15,%ymm15,%ymm15
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	L$rol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	L$rol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	addq	0+16(%rdi),%r10
+	adcq	8+16(%rdi),%r11
+	adcq	$1,%r12
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$12,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$4,%ymm15,%ymm15,%ymm15
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	32(%rdi),%rdi
+	decq	%rcx
+	jg	L$seal_avx2_tail_512_rounds_and_3xhash
+	decq	%r8
+	jge	L$seal_avx2_tail_512_rounds_and_2xhash
+	vpaddd	L$chacha20_consts(%rip),%ymm3,%ymm3
+	vpaddd	0+64(%rbp),%ymm7,%ymm7
+	vpaddd	0+96(%rbp),%ymm11,%ymm11
+	vpaddd	0+256(%rbp),%ymm15,%ymm15
+	vpaddd	L$chacha20_consts(%rip),%ymm2,%ymm2
+	vpaddd	0+64(%rbp),%ymm6,%ymm6
+	vpaddd	0+96(%rbp),%ymm10,%ymm10
+	vpaddd	0+224(%rbp),%ymm14,%ymm14
+	vpaddd	L$chacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	0+64(%rbp),%ymm5,%ymm5
+	vpaddd	0+96(%rbp),%ymm9,%ymm9
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+
+	vmovdqa	%ymm0,0+128(%rbp)
+	vperm2i128	$0x02,%ymm3,%ymm7,%ymm0
+	vperm2i128	$0x13,%ymm3,%ymm7,%ymm7
+	vperm2i128	$0x02,%ymm11,%ymm15,%ymm3
+	vperm2i128	$0x13,%ymm11,%ymm15,%ymm11
+	vpxor	0+0(%rsi),%ymm0,%ymm0
+	vpxor	32+0(%rsi),%ymm3,%ymm3
+	vpxor	64+0(%rsi),%ymm7,%ymm7
+	vpxor	96+0(%rsi),%ymm11,%ymm11
+	vmovdqu	%ymm0,0+0(%rdi)
+	vmovdqu	%ymm3,32+0(%rdi)
+	vmovdqu	%ymm7,64+0(%rdi)
+	vmovdqu	%ymm11,96+0(%rdi)
+
+	vmovdqa	0+128(%rbp),%ymm0
+	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
+	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
+	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
+	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
+	vpxor	0+128(%rsi),%ymm3,%ymm3
+	vpxor	32+128(%rsi),%ymm2,%ymm2
+	vpxor	64+128(%rsi),%ymm6,%ymm6
+	vpxor	96+128(%rsi),%ymm10,%ymm10
+	vmovdqu	%ymm3,0+128(%rdi)
+	vmovdqu	%ymm2,32+128(%rdi)
+	vmovdqu	%ymm6,64+128(%rdi)
+	vmovdqu	%ymm10,96+128(%rdi)
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
+	vpxor	0+256(%rsi),%ymm3,%ymm3
+	vpxor	32+256(%rsi),%ymm1,%ymm1
+	vpxor	64+256(%rsi),%ymm5,%ymm5
+	vpxor	96+256(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm3,0+256(%rdi)
+	vmovdqu	%ymm1,32+256(%rdi)
+	vmovdqu	%ymm5,64+256(%rdi)
+	vmovdqu	%ymm9,96+256(%rdi)
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
+	vmovdqa	%ymm3,%ymm8
+
+	movq	$384,%rcx
+	leaq	384(%rsi),%rsi
+	subq	$384,%rbx
+	jmp	L$seal_avx2_short_hash_remainder
+
+L$seal_avx2_320:
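+// Short input: expand three two-block ChaCha20 states; block 0's
+// clamped first 32 bytes become the Poly1305 key, leaving keystream
+// for up to 320 bytes of plaintext.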
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm8,%ymm10
+	vpaddd	L$avx2_inc(%rip),%ymm12,%ymm13
+	vpaddd	L$avx2_inc(%rip),%ymm13,%ymm14
+	vmovdqa	%ymm4,%ymm7
+	vmovdqa	%ymm8,%ymm11
+	vmovdqa	%ymm12,0+160(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+	movq	$10,%r10
+L$seal_avx2_320_rounds:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	L$rol16(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpsrld	$20,%ymm6,%ymm3
+	vpslld	$12,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	L$rol8(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpslld	$7,%ymm6,%ymm3
+	vpsrld	$25,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	L$rol16(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpsrld	$20,%ymm6,%ymm3
+	vpslld	$12,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	L$rol8(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpslld	$7,%ymm6,%ymm3
+	vpsrld	$25,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+
+	decq	%r10
+	jne	L$seal_avx2_320_rounds
+	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	L$chacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	L$chacha20_consts(%rip),%ymm2,%ymm2
+	vpaddd	%ymm7,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm5,%ymm5
+	vpaddd	%ymm7,%ymm6,%ymm6
+	vpaddd	%ymm11,%ymm8,%ymm8
+	vpaddd	%ymm11,%ymm9,%ymm9
+	vpaddd	%ymm11,%ymm10,%ymm10
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	0+224(%rbp),%ymm14,%ymm14
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
+
+	vpand	L$clamp(%rip),%ymm3,%ymm3
+	vmovdqa	%ymm3,0+0(%rbp)
+
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm8
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm12
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm5
+	vperm2i128	$0x02,%ymm2,%ymm6,%ymm9
+	vperm2i128	$0x02,%ymm10,%ymm14,%ymm13
+	vperm2i128	$0x13,%ymm2,%ymm6,%ymm2
+	vperm2i128	$0x13,%ymm10,%ymm14,%ymm6
+	jmp	L$seal_avx2_short
+
+L$seal_avx2_192:
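+// Short input: expand two two-block ChaCha20 states, leaving up to
+// 192 bytes of keystream once the Poly1305 key is taken from block 0.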
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm8,%ymm10
+	vpaddd	L$avx2_inc(%rip),%ymm12,%ymm13
+	vmovdqa	%ymm12,%ymm11
+	vmovdqa	%ymm13,%ymm15
+	movq	$10,%r10
+L$seal_avx2_192_rounds:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	L$rol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	L$rol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+
+	decq	%r10
+	jne	L$seal_avx2_192_rounds
+	vpaddd	%ymm2,%ymm0,%ymm0
+	vpaddd	%ymm2,%ymm1,%ymm1
+	vpaddd	%ymm6,%ymm4,%ymm4
+	vpaddd	%ymm6,%ymm5,%ymm5
+	vpaddd	%ymm10,%ymm8,%ymm8
+	vpaddd	%ymm10,%ymm9,%ymm9
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm13,%ymm13
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
+
+	vpand	L$clamp(%rip),%ymm3,%ymm3
+	vmovdqa	%ymm3,0+0(%rbp)
+
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm8
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm12
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm5
+L$seal_avx2_short:
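+// Common short path: hash the additional data, then alternate between
+// hashing buffered ciphertext (L$seal_avx2_short_hash_remainder) and
+// encrypting 32 bytes at a time (L$seal_avx2_short_loop).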
+	movq	%r8,%r8
+	call	poly_hash_ad_internal
+	xorq	%rcx,%rcx
+L$seal_avx2_short_hash_remainder:
+	cmpq	$16,%rcx
+	jb	L$seal_avx2_short_loop
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	subq	$16,%rcx
+	addq	$16,%rdi
+	jmp	L$seal_avx2_short_hash_remainder
+L$seal_avx2_short_loop:
+	cmpq	$32,%rbx
+	jb	L$seal_avx2_short_tail
+	subq	$32,%rbx
+
+	vpxor	(%rsi),%ymm0,%ymm0
+	vmovdqu	%ymm0,(%rdi)
+	leaq	32(%rsi),%rsi
+
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	addq	0+16(%rdi),%r10
+	adcq	8+16(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	32(%rdi),%rdi
+
+	vmovdqa	%ymm4,%ymm0
+	vmovdqa	%ymm8,%ymm4
+	vmovdqa	%ymm12,%ymm8
+	vmovdqa	%ymm1,%ymm12
+	vmovdqa	%ymm5,%ymm1
+	vmovdqa	%ymm9,%ymm5
+	vmovdqa	%ymm13,%ymm9
+	vmovdqa	%ymm2,%ymm13
+	vmovdqa	%ymm6,%ymm2
+	jmp	L$seal_avx2_short_loop
+L$seal_avx2_short_tail:
+	cmpq	$16,%rbx
+	jb	L$seal_avx2_exit
+	subq	$16,%rbx
+	vpxor	(%rsi),%xmm0,%xmm3
+	vmovdqu	%xmm3,(%rdi)
+	leaq	16(%rsi),%rsi
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+	vextracti128	$1,%ymm0,%xmm0
+L$seal_avx2_exit:
+	vzeroupper
+	jmp	L$seal_sse_tail_16
+
+
+#endif
diff --git a/gen/crypto/chacha20_poly1305_x86_64-linux.S b/gen/crypto/chacha20_poly1305_x86_64-linux.S
new file mode 100644
index 0000000..ac38f8f
--- /dev/null
+++ b/gen/crypto/chacha20_poly1305_x86_64-linux.S
@@ -0,0 +1,8918 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text	
+.extern	OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+
+chacha20_poly1305_constants:
+
+.section	.rodata
+.align	64
+.Lchacha20_consts:
+.byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+.byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+.Lrol8:
+.byte	3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+.byte	3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+.Lrol16:
+.byte	2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
+.byte	2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
+.Lavx2_init:
+.long	0,0,0,0
+.Lsse_inc:
+.long	1,0,0,0
+.Lavx2_inc:
+.long	2,0,0,0,2,0,0,0
+.Lclamp:
+.quad	0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
+.quad	0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
+.align	16
+.Land_masks:
+.byte	0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+.text	
+
+.type	poly_hash_ad_internal,@function
+.align	64
+poly_hash_ad_internal:
+.cfi_startproc	
+.cfi_def_cfa	rsp, 8
+	xorq	%r10,%r10
+	xorq	%r11,%r11
+	xorq	%r12,%r12
+	cmpq	$13,%r8
+	jne	.Lhash_ad_loop
+.Lpoly_fast_tls_ad:
+
+	movq	(%rcx),%r10
+	movq	5(%rcx),%r11
+	shrq	$24,%r11
+	movq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	ret
+.Lhash_ad_loop:
+
+	cmpq	$16,%r8
+	jb	.Lhash_ad_tail
+	addq	0+0(%rcx),%r10
+	adcq	8+0(%rcx),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rcx),%rcx
+	subq	$16,%r8
+	jmp	.Lhash_ad_loop
+.Lhash_ad_tail:
+	cmpq	$0,%r8
+	je	.Lhash_ad_done
+
+	xorq	%r13,%r13
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+	addq	%r8,%rcx
+.Lhash_ad_tail_loop:
+	shldq	$8,%r13,%r14
+	shlq	$8,%r13
+	movzbq	-1(%rcx),%r15
+	xorq	%r15,%r13
+	decq	%rcx
+	decq	%r8
+	jne	.Lhash_ad_tail_loop
+
+	addq	%r13,%r10
+	adcq	%r14,%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+
+.Lhash_ad_done:
+	ret
+.cfi_endproc	
+.size	poly_hash_ad_internal, .-poly_hash_ad_internal
+
+.globl	chacha20_poly1305_open
+.hidden chacha20_poly1305_open
+.type	chacha20_poly1305_open,@function
+.align	64
+chacha20_poly1305_open:
+.cfi_startproc	
+_CET_ENDBR
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+
+
+	pushq	%r9
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r9,-64
+	subq	$288 + 0 + 32,%rsp
+.cfi_adjust_cfa_offset	288 + 32
+
+	leaq	32(%rsp),%rbp
+	andq	$-32,%rbp
+
+	movq	%rdx,%rbx
+	movq	%r8,0+0+32(%rbp)
+	movq	%rbx,8+0+32(%rbp)
+
+	movl	OPENSSL_ia32cap_P+8(%rip),%eax
+	andl	$288,%eax
+	xorl	$288,%eax
+	jz	chacha20_poly1305_open_avx2
+
+	cmpq	$128,%rbx
+	jbe	.Lopen_sse_128
+
+	movdqa	.Lchacha20_consts(%rip),%xmm0
+	movdqu	0(%r9),%xmm4
+	movdqu	16(%r9),%xmm8
+	movdqu	32(%r9),%xmm12
+
+	movdqa	%xmm12,%xmm7
+
+	movdqa	%xmm4,0+48(%rbp)
+	movdqa	%xmm8,0+64(%rbp)
+	movdqa	%xmm12,0+96(%rbp)
+	movq	$10,%r10
+.Lopen_sse_init_rounds:
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+
+	decq	%r10
+	jne	.Lopen_sse_init_rounds
+
+	paddd	.Lchacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+
+	pand	.Lclamp(%rip),%xmm0
+	movdqa	%xmm0,0+0(%rbp)
+	movdqa	%xmm4,0+16(%rbp)
+
+	movq	%r8,%r8
+	call	poly_hash_ad_internal
+.Lopen_sse_main_loop:
+	cmpq	$256,%rbx
+	jb	.Lopen_sse_tail
+
+	movdqa	.Lchacha20_consts(%rip),%xmm0
+	movdqa	0+48(%rbp),%xmm4
+	movdqa	0+64(%rbp),%xmm8
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm8,%xmm9
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm4,%xmm6
+	movdqa	%xmm8,%xmm10
+	movdqa	%xmm0,%xmm3
+	movdqa	%xmm4,%xmm7
+	movdqa	%xmm8,%xmm11
+	movdqa	0+96(%rbp),%xmm15
+	paddd	.Lsse_inc(%rip),%xmm15
+	movdqa	%xmm15,%xmm14
+	paddd	.Lsse_inc(%rip),%xmm14
+	movdqa	%xmm14,%xmm13
+	paddd	.Lsse_inc(%rip),%xmm13
+	movdqa	%xmm13,%xmm12
+	paddd	.Lsse_inc(%rip),%xmm12
+	movdqa	%xmm12,0+96(%rbp)
+	movdqa	%xmm13,0+112(%rbp)
+	movdqa	%xmm14,0+128(%rbp)
+	movdqa	%xmm15,0+144(%rbp)
+
+
+
+	movq	$4,%rcx
+	movq	%rsi,%r8
+.Lopen_sse_main_loop_rounds:
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	.Lrol16(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	addq	0+0(%r8),%r10
+	adcq	8+0(%r8),%r11
+	adcq	$1,%r12
+
+	leaq	16(%r8),%r8
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm4
+	pxor	%xmm8,%xmm4
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movdqa	.Lrol8(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	0+80(%rbp),%xmm8
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+.byte	102,15,58,15,255,4
+.byte	102,69,15,58,15,219,8
+.byte	102,69,15,58,15,255,12
+.byte	102,15,58,15,246,4
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,12
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	.Lrol16(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	.Lrol8(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	0+80(%rbp),%xmm8
+.byte	102,15,58,15,255,12
+.byte	102,69,15,58,15,219,8
+.byte	102,69,15,58,15,255,4
+.byte	102,15,58,15,246,12
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,4
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+
+	decq	%rcx
+	jge	.Lopen_sse_main_loop_rounds
+	addq	0+0(%r8),%r10
+	adcq	8+0(%r8),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%r8),%r8
+	cmpq	$-6,%rcx
+	jg	.Lopen_sse_main_loop_rounds
+	paddd	.Lchacha20_consts(%rip),%xmm3
+	paddd	0+48(%rbp),%xmm7
+	paddd	0+64(%rbp),%xmm11
+	paddd	0+144(%rbp),%xmm15
+	paddd	.Lchacha20_consts(%rip),%xmm2
+	paddd	0+48(%rbp),%xmm6
+	paddd	0+64(%rbp),%xmm10
+	paddd	0+128(%rbp),%xmm14
+	paddd	.Lchacha20_consts(%rip),%xmm1
+	paddd	0+48(%rbp),%xmm5
+	paddd	0+64(%rbp),%xmm9
+	paddd	0+112(%rbp),%xmm13
+	paddd	.Lchacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+	movdqa	%xmm12,0+80(%rbp)
+	movdqu	0 + 0(%rsi),%xmm12
+	pxor	%xmm3,%xmm12
+	movdqu	%xmm12,0 + 0(%rdi)
+	movdqu	16 + 0(%rsi),%xmm12
+	pxor	%xmm7,%xmm12
+	movdqu	%xmm12,16 + 0(%rdi)
+	movdqu	32 + 0(%rsi),%xmm12
+	pxor	%xmm11,%xmm12
+	movdqu	%xmm12,32 + 0(%rdi)
+	movdqu	48 + 0(%rsi),%xmm12
+	pxor	%xmm15,%xmm12
+	movdqu	%xmm12,48 + 0(%rdi)
+	movdqu	0 + 64(%rsi),%xmm3
+	movdqu	16 + 64(%rsi),%xmm7
+	movdqu	32 + 64(%rsi),%xmm11
+	movdqu	48 + 64(%rsi),%xmm15
+	pxor	%xmm3,%xmm2
+	pxor	%xmm7,%xmm6
+	pxor	%xmm11,%xmm10
+	pxor	%xmm14,%xmm15
+	movdqu	%xmm2,0 + 64(%rdi)
+	movdqu	%xmm6,16 + 64(%rdi)
+	movdqu	%xmm10,32 + 64(%rdi)
+	movdqu	%xmm15,48 + 64(%rdi)
+	movdqu	0 + 128(%rsi),%xmm3
+	movdqu	16 + 128(%rsi),%xmm7
+	movdqu	32 + 128(%rsi),%xmm11
+	movdqu	48 + 128(%rsi),%xmm15
+	pxor	%xmm3,%xmm1
+	pxor	%xmm7,%xmm5
+	pxor	%xmm11,%xmm9
+	pxor	%xmm13,%xmm15
+	movdqu	%xmm1,0 + 128(%rdi)
+	movdqu	%xmm5,16 + 128(%rdi)
+	movdqu	%xmm9,32 + 128(%rdi)
+	movdqu	%xmm15,48 + 128(%rdi)
+	movdqu	0 + 192(%rsi),%xmm3
+	movdqu	16 + 192(%rsi),%xmm7
+	movdqu	32 + 192(%rsi),%xmm11
+	movdqu	48 + 192(%rsi),%xmm15
+	pxor	%xmm3,%xmm0
+	pxor	%xmm7,%xmm4
+	pxor	%xmm11,%xmm8
+	pxor	0+80(%rbp),%xmm15
+	movdqu	%xmm0,0 + 192(%rdi)
+	movdqu	%xmm4,16 + 192(%rdi)
+	movdqu	%xmm8,32 + 192(%rdi)
+	movdqu	%xmm15,48 + 192(%rdi)
+
+	leaq	256(%rsi),%rsi
+	leaq	256(%rdi),%rdi
+	subq	$256,%rbx
+	jmp	.Lopen_sse_main_loop
+.Lopen_sse_tail:
+
+	testq	%rbx,%rbx
+	jz	.Lopen_sse_finalize
+	cmpq	$192,%rbx
+	ja	.Lopen_sse_tail_256
+	cmpq	$128,%rbx
+	ja	.Lopen_sse_tail_192
+	cmpq	$64,%rbx
+	ja	.Lopen_sse_tail_128
+	movdqa	.Lchacha20_consts(%rip),%xmm0
+	movdqa	0+48(%rbp),%xmm4
+	movdqa	0+64(%rbp),%xmm8
+	movdqa	0+96(%rbp),%xmm12
+	paddd	.Lsse_inc(%rip),%xmm12
+	movdqa	%xmm12,0+96(%rbp)
+
+	xorq	%r8,%r8
+	movq	%rbx,%rcx
+	cmpq	$16,%rcx
+	jb	.Lopen_sse_tail_64_rounds
+.Lopen_sse_tail_64_rounds_and_x1hash:
+	addq	0+0(%rsi,%r8,1),%r10
+	adcq	8+0(%rsi,%r8,1),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	subq	$16,%rcx
+.Lopen_sse_tail_64_rounds:
+	addq	$16,%r8
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+
+	cmpq	$16,%rcx
+	jae	.Lopen_sse_tail_64_rounds_and_x1hash
+	cmpq	$160,%r8
+	jne	.Lopen_sse_tail_64_rounds
+	paddd	.Lchacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+
+	jmp	.Lopen_sse_tail_64_dec_loop
+
+.Lopen_sse_tail_128:
+	movdqa	.Lchacha20_consts(%rip),%xmm0
+	movdqa	0+48(%rbp),%xmm4
+	movdqa	0+64(%rbp),%xmm8
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm8,%xmm9
+	movdqa	0+96(%rbp),%xmm13
+	paddd	.Lsse_inc(%rip),%xmm13
+	movdqa	%xmm13,%xmm12
+	paddd	.Lsse_inc(%rip),%xmm12
+	movdqa	%xmm12,0+96(%rbp)
+	movdqa	%xmm13,0+112(%rbp)
+
+	movq	%rbx,%rcx
+	andq	$-16,%rcx
+	xorq	%r8,%r8
+.Lopen_sse_tail_128_rounds_and_x1hash:
+	addq	0+0(%rsi,%r8,1),%r10
+	adcq	8+0(%rsi,%r8,1),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+.Lopen_sse_tail_128_rounds:
+	addq	$16,%r8
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+
+	cmpq	%rcx,%r8
+	jb	.Lopen_sse_tail_128_rounds_and_x1hash
+	cmpq	$160,%r8
+	jne	.Lopen_sse_tail_128_rounds
+	paddd	.Lchacha20_consts(%rip),%xmm1
+	paddd	0+48(%rbp),%xmm5
+	paddd	0+64(%rbp),%xmm9
+	paddd	0+112(%rbp),%xmm13
+	paddd	.Lchacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+	movdqu	0 + 0(%rsi),%xmm3
+	movdqu	16 + 0(%rsi),%xmm7
+	movdqu	32 + 0(%rsi),%xmm11
+	movdqu	48 + 0(%rsi),%xmm15
+	pxor	%xmm3,%xmm1
+	pxor	%xmm7,%xmm5
+	pxor	%xmm11,%xmm9
+	pxor	%xmm13,%xmm15
+	movdqu	%xmm1,0 + 0(%rdi)
+	movdqu	%xmm5,16 + 0(%rdi)
+	movdqu	%xmm9,32 + 0(%rdi)
+	movdqu	%xmm15,48 + 0(%rdi)
+
+	subq	$64,%rbx
+	leaq	64(%rsi),%rsi
+	leaq	64(%rdi),%rdi
+	jmp	.Lopen_sse_tail_64_dec_loop
+
+.Lopen_sse_tail_192:
+	movdqa	.Lchacha20_consts(%rip),%xmm0
+	movdqa	0+48(%rbp),%xmm4
+	movdqa	0+64(%rbp),%xmm8
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm8,%xmm9
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm4,%xmm6
+	movdqa	%xmm8,%xmm10
+	movdqa	0+96(%rbp),%xmm14
+	paddd	.Lsse_inc(%rip),%xmm14
+	movdqa	%xmm14,%xmm13
+	paddd	.Lsse_inc(%rip),%xmm13
+	movdqa	%xmm13,%xmm12
+	paddd	.Lsse_inc(%rip),%xmm12
+	movdqa	%xmm12,0+96(%rbp)
+	movdqa	%xmm13,0+112(%rbp)
+	movdqa	%xmm14,0+128(%rbp)
+
+	movq	%rbx,%rcx
+	movq	$160,%r8
+	cmpq	$160,%rcx
+	cmovgq	%r8,%rcx
+	andq	$-16,%rcx
+	xorq	%r8,%r8
+.Lopen_sse_tail_192_rounds_and_x1hash:
+	addq	0+0(%rsi,%r8,1),%r10
+	adcq	8+0(%rsi,%r8,1),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+.Lopen_sse_tail_192_rounds:
+	addq	$16,%r8
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm6
+	pxor	%xmm3,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm6
+	pxor	%xmm3,%xmm6
+.byte	102,15,58,15,246,4
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,12
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm6
+	pxor	%xmm3,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm6
+	pxor	%xmm3,%xmm6
+.byte	102,15,58,15,246,12
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,4
+
+	cmpq	%rcx,%r8
+	jb	.Lopen_sse_tail_192_rounds_and_x1hash
+	cmpq	$160,%r8
+	jne	.Lopen_sse_tail_192_rounds
+	cmpq	$176,%rbx
+	jb	.Lopen_sse_tail_192_finish
+	addq	0+160(%rsi),%r10
+	adcq	8+160(%rsi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	cmpq	$192,%rbx
+	jb	.Lopen_sse_tail_192_finish
+	addq	0+176(%rsi),%r10
+	adcq	8+176(%rsi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+.Lopen_sse_tail_192_finish:
+	paddd	.Lchacha20_consts(%rip),%xmm2
+	paddd	0+48(%rbp),%xmm6
+	paddd	0+64(%rbp),%xmm10
+	paddd	0+128(%rbp),%xmm14
+	paddd	.Lchacha20_consts(%rip),%xmm1
+	paddd	0+48(%rbp),%xmm5
+	paddd	0+64(%rbp),%xmm9
+	paddd	0+112(%rbp),%xmm13
+	paddd	.Lchacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+	movdqu	0 + 0(%rsi),%xmm3
+	movdqu	16 + 0(%rsi),%xmm7
+	movdqu	32 + 0(%rsi),%xmm11
+	movdqu	48 + 0(%rsi),%xmm15
+	pxor	%xmm3,%xmm2
+	pxor	%xmm7,%xmm6
+	pxor	%xmm11,%xmm10
+	pxor	%xmm14,%xmm15
+	movdqu	%xmm2,0 + 0(%rdi)
+	movdqu	%xmm6,16 + 0(%rdi)
+	movdqu	%xmm10,32 + 0(%rdi)
+	movdqu	%xmm15,48 + 0(%rdi)
+	movdqu	0 + 64(%rsi),%xmm3
+	movdqu	16 + 64(%rsi),%xmm7
+	movdqu	32 + 64(%rsi),%xmm11
+	movdqu	48 + 64(%rsi),%xmm15
+	pxor	%xmm3,%xmm1
+	pxor	%xmm7,%xmm5
+	pxor	%xmm11,%xmm9
+	pxor	%xmm13,%xmm15
+	movdqu	%xmm1,0 + 64(%rdi)
+	movdqu	%xmm5,16 + 64(%rdi)
+	movdqu	%xmm9,32 + 64(%rdi)
+	movdqu	%xmm15,48 + 64(%rdi)
+
+	subq	$128,%rbx
+	leaq	128(%rsi),%rsi
+	leaq	128(%rdi),%rdi
+	jmp	.Lopen_sse_tail_64_dec_loop
+
+.Lopen_sse_tail_256:
+	movdqa	.Lchacha20_consts(%rip),%xmm0
+	movdqa	0+48(%rbp),%xmm4
+	movdqa	0+64(%rbp),%xmm8
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm8,%xmm9
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm4,%xmm6
+	movdqa	%xmm8,%xmm10
+	movdqa	%xmm0,%xmm3
+	movdqa	%xmm4,%xmm7
+	movdqa	%xmm8,%xmm11
+	movdqa	0+96(%rbp),%xmm15
+	paddd	.Lsse_inc(%rip),%xmm15
+	movdqa	%xmm15,%xmm14
+	paddd	.Lsse_inc(%rip),%xmm14
+	movdqa	%xmm14,%xmm13
+	paddd	.Lsse_inc(%rip),%xmm13
+	movdqa	%xmm13,%xmm12
+	paddd	.Lsse_inc(%rip),%xmm12
+	movdqa	%xmm12,0+96(%rbp)
+	movdqa	%xmm13,0+112(%rbp)
+	movdqa	%xmm14,0+128(%rbp)
+	movdqa	%xmm15,0+144(%rbp)
+
+	xorq	%r8,%r8
+.Lopen_sse_tail_256_rounds_and_x1hash:
+	addq	0+0(%rsi,%r8,1),%r10
+	adcq	8+0(%rsi,%r8,1),%r11
+	adcq	$1,%r12
+	movdqa	%xmm11,0+80(%rbp)
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm11
+	pslld	$12,%xmm11
+	psrld	$20,%xmm4
+	pxor	%xmm11,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm11
+	pslld	$7,%xmm11
+	psrld	$25,%xmm4
+	pxor	%xmm11,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm11
+	pslld	$12,%xmm11
+	psrld	$20,%xmm5
+	pxor	%xmm11,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm11
+	pslld	$7,%xmm11
+	psrld	$25,%xmm5
+	pxor	%xmm11,%xmm5
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm11
+	pslld	$12,%xmm11
+	psrld	$20,%xmm6
+	pxor	%xmm11,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm11
+	pslld	$7,%xmm11
+	psrld	$25,%xmm6
+	pxor	%xmm11,%xmm6
+.byte	102,15,58,15,246,4
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,12
+	movdqa	0+80(%rbp),%xmm11
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movdqa	%xmm9,0+80(%rbp)
+	paddd	%xmm7,%xmm3
+	pxor	%xmm3,%xmm15
+	pshufb	.Lrol16(%rip),%xmm15
+	paddd	%xmm15,%xmm11
+	pxor	%xmm11,%xmm7
+	movdqa	%xmm7,%xmm9
+	pslld	$12,%xmm9
+	psrld	$20,%xmm7
+	pxor	%xmm9,%xmm7
+	paddd	%xmm7,%xmm3
+	pxor	%xmm3,%xmm15
+	pshufb	.Lrol8(%rip),%xmm15
+	paddd	%xmm15,%xmm11
+	pxor	%xmm11,%xmm7
+	movdqa	%xmm7,%xmm9
+	pslld	$7,%xmm9
+	psrld	$25,%xmm7
+	pxor	%xmm9,%xmm7
+.byte	102,15,58,15,255,4
+.byte	102,69,15,58,15,219,8
+.byte	102,69,15,58,15,255,12
+	movdqa	0+80(%rbp),%xmm9
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	movdqa	%xmm11,0+80(%rbp)
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm11
+	pslld	$12,%xmm11
+	psrld	$20,%xmm4
+	pxor	%xmm11,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm11
+	pslld	$7,%xmm11
+	psrld	$25,%xmm4
+	pxor	%xmm11,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm11
+	pslld	$12,%xmm11
+	psrld	$20,%xmm5
+	pxor	%xmm11,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm11
+	pslld	$7,%xmm11
+	psrld	$25,%xmm5
+	pxor	%xmm11,%xmm5
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm11
+	pslld	$12,%xmm11
+	psrld	$20,%xmm6
+	pxor	%xmm11,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm11
+	pslld	$7,%xmm11
+	psrld	$25,%xmm6
+	pxor	%xmm11,%xmm6
+.byte	102,15,58,15,246,12
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,4
+	movdqa	0+80(%rbp),%xmm11
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	movdqa	%xmm9,0+80(%rbp)
+	paddd	%xmm7,%xmm3
+	pxor	%xmm3,%xmm15
+	pshufb	.Lrol16(%rip),%xmm15
+	paddd	%xmm15,%xmm11
+	pxor	%xmm11,%xmm7
+	movdqa	%xmm7,%xmm9
+	pslld	$12,%xmm9
+	psrld	$20,%xmm7
+	pxor	%xmm9,%xmm7
+	paddd	%xmm7,%xmm3
+	pxor	%xmm3,%xmm15
+	pshufb	.Lrol8(%rip),%xmm15
+	paddd	%xmm15,%xmm11
+	pxor	%xmm11,%xmm7
+	movdqa	%xmm7,%xmm9
+	pslld	$7,%xmm9
+	psrld	$25,%xmm7
+	pxor	%xmm9,%xmm7
+.byte	102,15,58,15,255,12
+.byte	102,69,15,58,15,219,8
+.byte	102,69,15,58,15,255,4
+	movdqa	0+80(%rbp),%xmm9
+
+	addq	$16,%r8
+	cmpq	$160,%r8
+	jb	.Lopen_sse_tail_256_rounds_and_x1hash
+
+	movq	%rbx,%rcx
+	andq	$-16,%rcx
+.Lopen_sse_tail_256_hash:
+	addq	0+0(%rsi,%r8,1),%r10
+	adcq	8+0(%rsi,%r8,1),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	addq	$16,%r8
+	cmpq	%rcx,%r8
+	jb	.Lopen_sse_tail_256_hash
+	paddd	.Lchacha20_consts(%rip),%xmm3
+	paddd	0+48(%rbp),%xmm7
+	paddd	0+64(%rbp),%xmm11
+	paddd	0+144(%rbp),%xmm15
+	paddd	.Lchacha20_consts(%rip),%xmm2
+	paddd	0+48(%rbp),%xmm6
+	paddd	0+64(%rbp),%xmm10
+	paddd	0+128(%rbp),%xmm14
+	paddd	.Lchacha20_consts(%rip),%xmm1
+	paddd	0+48(%rbp),%xmm5
+	paddd	0+64(%rbp),%xmm9
+	paddd	0+112(%rbp),%xmm13
+	paddd	.Lchacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+	movdqa	%xmm12,0+80(%rbp)
+	movdqu	0 + 0(%rsi),%xmm12
+	pxor	%xmm3,%xmm12
+	movdqu	%xmm12,0 + 0(%rdi)
+	movdqu	16 + 0(%rsi),%xmm12
+	pxor	%xmm7,%xmm12
+	movdqu	%xmm12,16 + 0(%rdi)
+	movdqu	32 + 0(%rsi),%xmm12
+	pxor	%xmm11,%xmm12
+	movdqu	%xmm12,32 + 0(%rdi)
+	movdqu	48 + 0(%rsi),%xmm12
+	pxor	%xmm15,%xmm12
+	movdqu	%xmm12,48 + 0(%rdi)
+	movdqu	0 + 64(%rsi),%xmm3
+	movdqu	16 + 64(%rsi),%xmm7
+	movdqu	32 + 64(%rsi),%xmm11
+	movdqu	48 + 64(%rsi),%xmm15
+	pxor	%xmm3,%xmm2
+	pxor	%xmm7,%xmm6
+	pxor	%xmm11,%xmm10
+	pxor	%xmm14,%xmm15
+	movdqu	%xmm2,0 + 64(%rdi)
+	movdqu	%xmm6,16 + 64(%rdi)
+	movdqu	%xmm10,32 + 64(%rdi)
+	movdqu	%xmm15,48 + 64(%rdi)
+	movdqu	0 + 128(%rsi),%xmm3
+	movdqu	16 + 128(%rsi),%xmm7
+	movdqu	32 + 128(%rsi),%xmm11
+	movdqu	48 + 128(%rsi),%xmm15
+	pxor	%xmm3,%xmm1
+	pxor	%xmm7,%xmm5
+	pxor	%xmm11,%xmm9
+	pxor	%xmm13,%xmm15
+	movdqu	%xmm1,0 + 128(%rdi)
+	movdqu	%xmm5,16 + 128(%rdi)
+	movdqu	%xmm9,32 + 128(%rdi)
+	movdqu	%xmm15,48 + 128(%rdi)
+
+	movdqa	0+80(%rbp),%xmm12
+	subq	$192,%rbx
+	leaq	192(%rsi),%rsi
+	leaq	192(%rdi),%rdi
+
+
+.Lopen_sse_tail_64_dec_loop:
+	cmpq	$16,%rbx
+	jb	.Lopen_sse_tail_16_init
+	subq	$16,%rbx
+	movdqu	(%rsi),%xmm3
+	pxor	%xmm3,%xmm0
+	movdqu	%xmm0,(%rdi)
+	leaq	16(%rsi),%rsi
+	leaq	16(%rdi),%rdi
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm8,%xmm4
+	movdqa	%xmm12,%xmm8
+	jmp	.Lopen_sse_tail_64_dec_loop
+.Lopen_sse_tail_16_init:
+	movdqa	%xmm0,%xmm1
+
+
+.Lopen_sse_tail_16:
+	testq	%rbx,%rbx
+	jz	.Lopen_sse_finalize
+
+
+
+	pxor	%xmm3,%xmm3
+	leaq	-1(%rsi,%rbx,1),%rsi
+	movq	%rbx,%r8
+.Lopen_sse_tail_16_compose:
+	pslldq	$1,%xmm3
+	pinsrb	$0,(%rsi),%xmm3
+	subq	$1,%rsi
+	subq	$1,%r8
+	jnz	.Lopen_sse_tail_16_compose
+
+.byte	102,73,15,126,221
+	pextrq	$1,%xmm3,%r14
+
+	pxor	%xmm1,%xmm3
+
+
+.Lopen_sse_tail_16_extract:
+	pextrb	$0,%xmm3,(%rdi)
+	psrldq	$1,%xmm3
+	addq	$1,%rdi
+	subq	$1,%rbx
+	jne	.Lopen_sse_tail_16_extract
+
+	addq	%r13,%r10
+	adcq	%r14,%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+
+.Lopen_sse_finalize:
+	addq	0+0+32(%rbp),%r10
+	adcq	8+0+32(%rbp),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+
+	movq	%r10,%r13
+	movq	%r11,%r14
+	movq	%r12,%r15
+	subq	$-5,%r10
+	sbbq	$-1,%r11
+	sbbq	$3,%r12
+	cmovcq	%r13,%r10
+	cmovcq	%r14,%r11
+	cmovcq	%r15,%r12
+
+	addq	0+0+16(%rbp),%r10
+	adcq	8+0+16(%rbp),%r11
+
+.cfi_remember_state	
+	addq	$288 + 0 + 32,%rsp
+.cfi_adjust_cfa_offset	-(288 + 32)
+
+	popq	%r9
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r9
+	movq	%r10,(%r9)
+	movq	%r11,8(%r9)
+	popq	%r15
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r15
+	popq	%r14
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r14
+	popq	%r13
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r13
+	popq	%r12
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r12
+	popq	%rbx
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%rbx
+	popq	%rbp
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%rbp
+	ret
+
+.Lopen_sse_128:
+.cfi_restore_state	
+	movdqu	.Lchacha20_consts(%rip),%xmm0
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm0,%xmm2
+	movdqu	0(%r9),%xmm4
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm4,%xmm6
+	movdqu	16(%r9),%xmm8
+	movdqa	%xmm8,%xmm9
+	movdqa	%xmm8,%xmm10
+	movdqu	32(%r9),%xmm12
+	movdqa	%xmm12,%xmm13
+	paddd	.Lsse_inc(%rip),%xmm13
+	movdqa	%xmm13,%xmm14
+	paddd	.Lsse_inc(%rip),%xmm14
+	movdqa	%xmm4,%xmm7
+	movdqa	%xmm8,%xmm11
+	movdqa	%xmm13,%xmm15
+	movq	$10,%r10
+
+.Lopen_sse_128_rounds:
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm6
+	pxor	%xmm3,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm6
+	pxor	%xmm3,%xmm6
+.byte	102,15,58,15,246,4
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,12
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm6
+	pxor	%xmm3,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm6
+	pxor	%xmm3,%xmm6
+.byte	102,15,58,15,246,12
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,4
+
+	decq	%r10
+	jnz	.Lopen_sse_128_rounds
+	paddd	.Lchacha20_consts(%rip),%xmm0
+	paddd	.Lchacha20_consts(%rip),%xmm1
+	paddd	.Lchacha20_consts(%rip),%xmm2
+	paddd	%xmm7,%xmm4
+	paddd	%xmm7,%xmm5
+	paddd	%xmm7,%xmm6
+	paddd	%xmm11,%xmm9
+	paddd	%xmm11,%xmm10
+	paddd	%xmm15,%xmm13
+	paddd	.Lsse_inc(%rip),%xmm15
+	paddd	%xmm15,%xmm14
+
+	pand	.Lclamp(%rip),%xmm0
+	movdqa	%xmm0,0+0(%rbp)
+	movdqa	%xmm4,0+16(%rbp)
+
+	movq	%r8,%r8
+	call	poly_hash_ad_internal
+.Lopen_sse_128_xor_hash:
+	cmpq	$16,%rbx
+	jb	.Lopen_sse_tail_16
+	subq	$16,%rbx
+	addq	0+0(%rsi),%r10
+	adcq	8+0(%rsi),%r11
+	adcq	$1,%r12
+
+
+	movdqu	0(%rsi),%xmm3
+	pxor	%xmm3,%xmm1
+	movdqu	%xmm1,0(%rdi)
+	leaq	16(%rsi),%rsi
+	leaq	16(%rdi),%rdi
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+
+	movdqa	%xmm5,%xmm1
+	movdqa	%xmm9,%xmm5
+	movdqa	%xmm13,%xmm9
+	movdqa	%xmm2,%xmm13
+	movdqa	%xmm6,%xmm2
+	movdqa	%xmm10,%xmm6
+	movdqa	%xmm14,%xmm10
+	jmp	.Lopen_sse_128_xor_hash
+.size	chacha20_poly1305_open, .-chacha20_poly1305_open
+.cfi_endproc	
+
+
+
+
+
+
+
+.globl	chacha20_poly1305_seal
+.hidden chacha20_poly1305_seal
+.type	chacha20_poly1305_seal,@function
+.align	64
+chacha20_poly1305_seal:
+.cfi_startproc	
+_CET_ENDBR
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+
+
+	pushq	%r9
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r9,-64
+	subq	$288 + 0 + 32,%rsp
+.cfi_adjust_cfa_offset	288 + 32
+	leaq	32(%rsp),%rbp
+	andq	$-32,%rbp
+
+	movq	56(%r9),%rbx
+	addq	%rdx,%rbx
+	movq	%r8,0+0+32(%rbp)
+	movq	%rbx,8+0+32(%rbp)
+	movq	%rdx,%rbx
+
+	movl	OPENSSL_ia32cap_P+8(%rip),%eax
+	andl	$288,%eax
+	xorl	$288,%eax
+	jz	chacha20_poly1305_seal_avx2
+
+	cmpq	$128,%rbx
+	jbe	.Lseal_sse_128
+
+	movdqa	.Lchacha20_consts(%rip),%xmm0
+	movdqu	0(%r9),%xmm4
+	movdqu	16(%r9),%xmm8
+	movdqu	32(%r9),%xmm12
+
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm0,%xmm3
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm4,%xmm6
+	movdqa	%xmm4,%xmm7
+	movdqa	%xmm8,%xmm9
+	movdqa	%xmm8,%xmm10
+	movdqa	%xmm8,%xmm11
+	movdqa	%xmm12,%xmm15
+	paddd	.Lsse_inc(%rip),%xmm12
+	movdqa	%xmm12,%xmm14
+	paddd	.Lsse_inc(%rip),%xmm12
+	movdqa	%xmm12,%xmm13
+	paddd	.Lsse_inc(%rip),%xmm12
+
+	movdqa	%xmm4,0+48(%rbp)
+	movdqa	%xmm8,0+64(%rbp)
+	movdqa	%xmm12,0+96(%rbp)
+	movdqa	%xmm13,0+112(%rbp)
+	movdqa	%xmm14,0+128(%rbp)
+	movdqa	%xmm15,0+144(%rbp)
+	movq	$10,%r10
+.Lseal_sse_init_rounds:
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	.Lrol16(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	.Lrol8(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	0+80(%rbp),%xmm8
+.byte	102,15,58,15,255,4
+.byte	102,69,15,58,15,219,8
+.byte	102,69,15,58,15,255,12
+.byte	102,15,58,15,246,4
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,12
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	.Lrol16(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	.Lrol8(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	0+80(%rbp),%xmm8
+.byte	102,15,58,15,255,12
+.byte	102,69,15,58,15,219,8
+.byte	102,69,15,58,15,255,4
+.byte	102,15,58,15,246,12
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,4
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+
+	decq	%r10
+	jnz	.Lseal_sse_init_rounds
+	paddd	.Lchacha20_consts(%rip),%xmm3
+	paddd	0+48(%rbp),%xmm7
+	paddd	0+64(%rbp),%xmm11
+	paddd	0+144(%rbp),%xmm15
+	paddd	.Lchacha20_consts(%rip),%xmm2
+	paddd	0+48(%rbp),%xmm6
+	paddd	0+64(%rbp),%xmm10
+	paddd	0+128(%rbp),%xmm14
+	paddd	.Lchacha20_consts(%rip),%xmm1
+	paddd	0+48(%rbp),%xmm5
+	paddd	0+64(%rbp),%xmm9
+	paddd	0+112(%rbp),%xmm13
+	paddd	.Lchacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+
+
+	pand	.Lclamp(%rip),%xmm3
+	movdqa	%xmm3,0+0(%rbp)
+	movdqa	%xmm7,0+16(%rbp)
+
+	movq	%r8,%r8
+	call	poly_hash_ad_internal
+	movdqu	0 + 0(%rsi),%xmm3
+	movdqu	16 + 0(%rsi),%xmm7
+	movdqu	32 + 0(%rsi),%xmm11
+	movdqu	48 + 0(%rsi),%xmm15
+	pxor	%xmm3,%xmm2
+	pxor	%xmm7,%xmm6
+	pxor	%xmm11,%xmm10
+	pxor	%xmm14,%xmm15
+	movdqu	%xmm2,0 + 0(%rdi)
+	movdqu	%xmm6,16 + 0(%rdi)
+	movdqu	%xmm10,32 + 0(%rdi)
+	movdqu	%xmm15,48 + 0(%rdi)
+	movdqu	0 + 64(%rsi),%xmm3
+	movdqu	16 + 64(%rsi),%xmm7
+	movdqu	32 + 64(%rsi),%xmm11
+	movdqu	48 + 64(%rsi),%xmm15
+	pxor	%xmm3,%xmm1
+	pxor	%xmm7,%xmm5
+	pxor	%xmm11,%xmm9
+	pxor	%xmm13,%xmm15
+	movdqu	%xmm1,0 + 64(%rdi)
+	movdqu	%xmm5,16 + 64(%rdi)
+	movdqu	%xmm9,32 + 64(%rdi)
+	movdqu	%xmm15,48 + 64(%rdi)
+
+	cmpq	$192,%rbx
+	ja	.Lseal_sse_main_init
+	movq	$128,%rcx
+	subq	$128,%rbx
+	leaq	128(%rsi),%rsi
+	jmp	.Lseal_sse_128_tail_hash
+.Lseal_sse_main_init:
+	movdqu	0 + 128(%rsi),%xmm3
+	movdqu	16 + 128(%rsi),%xmm7
+	movdqu	32 + 128(%rsi),%xmm11
+	movdqu	48 + 128(%rsi),%xmm15
+	pxor	%xmm3,%xmm0
+	pxor	%xmm7,%xmm4
+	pxor	%xmm11,%xmm8
+	pxor	%xmm12,%xmm15
+	movdqu	%xmm0,0 + 128(%rdi)
+	movdqu	%xmm4,16 + 128(%rdi)
+	movdqu	%xmm8,32 + 128(%rdi)
+	movdqu	%xmm15,48 + 128(%rdi)
+
+	movq	$192,%rcx
+	subq	$192,%rbx
+	leaq	192(%rsi),%rsi
+	movq	$2,%rcx
+	movq	$8,%r8
+	cmpq	$64,%rbx
+	jbe	.Lseal_sse_tail_64
+	cmpq	$128,%rbx
+	jbe	.Lseal_sse_tail_128
+	cmpq	$192,%rbx
+	jbe	.Lseal_sse_tail_192
+
+.Lseal_sse_main_loop:
+	movdqa	.Lchacha20_consts(%rip),%xmm0
+	movdqa	0+48(%rbp),%xmm4
+	movdqa	0+64(%rbp),%xmm8
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm8,%xmm9
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm4,%xmm6
+	movdqa	%xmm8,%xmm10
+	movdqa	%xmm0,%xmm3
+	movdqa	%xmm4,%xmm7
+	movdqa	%xmm8,%xmm11
+	movdqa	0+96(%rbp),%xmm15
+	paddd	.Lsse_inc(%rip),%xmm15
+	movdqa	%xmm15,%xmm14
+	paddd	.Lsse_inc(%rip),%xmm14
+	movdqa	%xmm14,%xmm13
+	paddd	.Lsse_inc(%rip),%xmm13
+	movdqa	%xmm13,%xmm12
+	paddd	.Lsse_inc(%rip),%xmm12
+	movdqa	%xmm12,0+96(%rbp)
+	movdqa	%xmm13,0+112(%rbp)
+	movdqa	%xmm14,0+128(%rbp)
+	movdqa	%xmm15,0+144(%rbp)
+
+.align	32
+.Lseal_sse_main_rounds:
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	.Lrol16(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm4
+	pxor	%xmm8,%xmm4
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movdqa	.Lrol8(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	0+80(%rbp),%xmm8
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+.byte	102,15,58,15,255,4
+.byte	102,69,15,58,15,219,8
+.byte	102,69,15,58,15,255,12
+.byte	102,15,58,15,246,4
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,12
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	.Lrol16(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	.Lrol8(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	0+80(%rbp),%xmm8
+.byte	102,15,58,15,255,12
+.byte	102,69,15,58,15,219,8
+.byte	102,69,15,58,15,255,4
+.byte	102,15,58,15,246,12
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,4
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+
+	leaq	16(%rdi),%rdi
+	decq	%r8
+	jge	.Lseal_sse_main_rounds
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+	decq	%rcx
+	jg	.Lseal_sse_main_rounds
+	paddd	.Lchacha20_consts(%rip),%xmm3
+	paddd	0+48(%rbp),%xmm7
+	paddd	0+64(%rbp),%xmm11
+	paddd	0+144(%rbp),%xmm15
+	paddd	.Lchacha20_consts(%rip),%xmm2
+	paddd	0+48(%rbp),%xmm6
+	paddd	0+64(%rbp),%xmm10
+	paddd	0+128(%rbp),%xmm14
+	paddd	.Lchacha20_consts(%rip),%xmm1
+	paddd	0+48(%rbp),%xmm5
+	paddd	0+64(%rbp),%xmm9
+	paddd	0+112(%rbp),%xmm13
+	paddd	.Lchacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+
+	movdqa	%xmm14,0+80(%rbp)
+	movdqu	0 + 0(%rsi),%xmm14
+	pxor	%xmm3,%xmm14
+	movdqu	%xmm14,0 + 0(%rdi)
+	movdqu	16 + 0(%rsi),%xmm14
+	pxor	%xmm7,%xmm14
+	movdqu	%xmm14,16 + 0(%rdi)
+	movdqu	32 + 0(%rsi),%xmm14
+	pxor	%xmm11,%xmm14
+	movdqu	%xmm14,32 + 0(%rdi)
+	movdqu	48 + 0(%rsi),%xmm14
+	pxor	%xmm15,%xmm14
+	movdqu	%xmm14,48 + 0(%rdi)
+
+	movdqa	0+80(%rbp),%xmm14
+	movdqu	0 + 64(%rsi),%xmm3
+	movdqu	16 + 64(%rsi),%xmm7
+	movdqu	32 + 64(%rsi),%xmm11
+	movdqu	48 + 64(%rsi),%xmm15
+	pxor	%xmm3,%xmm2
+	pxor	%xmm7,%xmm6
+	pxor	%xmm11,%xmm10
+	pxor	%xmm14,%xmm15
+	movdqu	%xmm2,0 + 64(%rdi)
+	movdqu	%xmm6,16 + 64(%rdi)
+	movdqu	%xmm10,32 + 64(%rdi)
+	movdqu	%xmm15,48 + 64(%rdi)
+	movdqu	0 + 128(%rsi),%xmm3
+	movdqu	16 + 128(%rsi),%xmm7
+	movdqu	32 + 128(%rsi),%xmm11
+	movdqu	48 + 128(%rsi),%xmm15
+	pxor	%xmm3,%xmm1
+	pxor	%xmm7,%xmm5
+	pxor	%xmm11,%xmm9
+	pxor	%xmm13,%xmm15
+	movdqu	%xmm1,0 + 128(%rdi)
+	movdqu	%xmm5,16 + 128(%rdi)
+	movdqu	%xmm9,32 + 128(%rdi)
+	movdqu	%xmm15,48 + 128(%rdi)
+
+	cmpq	$256,%rbx
+	ja	.Lseal_sse_main_loop_xor
+
+	movq	$192,%rcx
+	subq	$192,%rbx
+	leaq	192(%rsi),%rsi
+	jmp	.Lseal_sse_128_tail_hash
+.Lseal_sse_main_loop_xor:
+	movdqu	0 + 192(%rsi),%xmm3
+	movdqu	16 + 192(%rsi),%xmm7
+	movdqu	32 + 192(%rsi),%xmm11
+	movdqu	48 + 192(%rsi),%xmm15
+	pxor	%xmm3,%xmm0
+	pxor	%xmm7,%xmm4
+	pxor	%xmm11,%xmm8
+	pxor	%xmm12,%xmm15
+	movdqu	%xmm0,0 + 192(%rdi)
+	movdqu	%xmm4,16 + 192(%rdi)
+	movdqu	%xmm8,32 + 192(%rdi)
+	movdqu	%xmm15,48 + 192(%rdi)
+
+	leaq	256(%rsi),%rsi
+	subq	$256,%rbx
+	movq	$6,%rcx
+	movq	$4,%r8
+	cmpq	$192,%rbx
+	jg	.Lseal_sse_main_loop
+	movq	%rbx,%rcx
+	testq	%rbx,%rbx
+	je	.Lseal_sse_128_tail_hash
+	movq	$6,%rcx
+	cmpq	$128,%rbx
+	ja	.Lseal_sse_tail_192
+	cmpq	$64,%rbx
+	ja	.Lseal_sse_tail_128
+
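+// Tail: at most 64 bytes remain, so run a single ChaCha20 state while
+// continuing to hash the ciphertext that is still pending.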
+.Lseal_sse_tail_64:
+	movdqa	.Lchacha20_consts(%rip),%xmm0
+	movdqa	0+48(%rbp),%xmm4
+	movdqa	0+64(%rbp),%xmm8
+	movdqa	0+96(%rbp),%xmm12
+	paddd	.Lsse_inc(%rip),%xmm12
+	movdqa	%xmm12,0+96(%rbp)
+
+.Lseal_sse_tail_64_rounds_and_x2hash:
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+.Lseal_sse_tail_64_rounds_and_x1hash:
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+	decq	%rcx
+	jg	.Lseal_sse_tail_64_rounds_and_x2hash
+	decq	%r8
+	jge	.Lseal_sse_tail_64_rounds_and_x1hash
+	paddd	.Lchacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+
+	jmp	.Lseal_sse_128_tail_xor
+
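+// Tail: 65-128 bytes remain; two parallel ChaCha20 states.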
+.Lseal_sse_tail_128:
+	movdqa	.Lchacha20_consts(%rip),%xmm0
+	movdqa	0+48(%rbp),%xmm4
+	movdqa	0+64(%rbp),%xmm8
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm8,%xmm9
+	movdqa	0+96(%rbp),%xmm13
+	paddd	.Lsse_inc(%rip),%xmm13
+	movdqa	%xmm13,%xmm12
+	paddd	.Lsse_inc(%rip),%xmm12
+	movdqa	%xmm12,0+96(%rbp)
+	movdqa	%xmm13,0+112(%rbp)
+
+.Lseal_sse_tail_128_rounds_and_x2hash:
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+.Lseal_sse_tail_128_rounds_and_x1hash:
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+
+	leaq	16(%rdi),%rdi
+	decq	%rcx
+	jg	.Lseal_sse_tail_128_rounds_and_x2hash
+	decq	%r8
+	jge	.Lseal_sse_tail_128_rounds_and_x1hash
+	paddd	.Lchacha20_consts(%rip),%xmm1
+	paddd	0+48(%rbp),%xmm5
+	paddd	0+64(%rbp),%xmm9
+	paddd	0+112(%rbp),%xmm13
+	paddd	.Lchacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+	movdqu	0 + 0(%rsi),%xmm3
+	movdqu	16 + 0(%rsi),%xmm7
+	movdqu	32 + 0(%rsi),%xmm11
+	movdqu	48 + 0(%rsi),%xmm15
+	pxor	%xmm3,%xmm1
+	pxor	%xmm7,%xmm5
+	pxor	%xmm11,%xmm9
+	pxor	%xmm13,%xmm15
+	movdqu	%xmm1,0 + 0(%rdi)
+	movdqu	%xmm5,16 + 0(%rdi)
+	movdqu	%xmm9,32 + 0(%rdi)
+	movdqu	%xmm15,48 + 0(%rdi)
+
+	movq	$64,%rcx
+	subq	$64,%rbx
+	leaq	64(%rsi),%rsi
+	jmp	.Lseal_sse_128_tail_hash
+
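+// Tail: 129-192 bytes remain; three parallel ChaCha20 states.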
+.Lseal_sse_tail_192:
+	movdqa	.Lchacha20_consts(%rip),%xmm0
+	movdqa	0+48(%rbp),%xmm4
+	movdqa	0+64(%rbp),%xmm8
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm8,%xmm9
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm4,%xmm6
+	movdqa	%xmm8,%xmm10
+	movdqa	0+96(%rbp),%xmm14
+	paddd	.Lsse_inc(%rip),%xmm14
+	movdqa	%xmm14,%xmm13
+	paddd	.Lsse_inc(%rip),%xmm13
+	movdqa	%xmm13,%xmm12
+	paddd	.Lsse_inc(%rip),%xmm12
+	movdqa	%xmm12,0+96(%rbp)
+	movdqa	%xmm13,0+112(%rbp)
+	movdqa	%xmm14,0+128(%rbp)
+
+.Lseal_sse_tail_192_rounds_and_x2hash:
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+.Lseal_sse_tail_192_rounds_and_x1hash:
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm6
+	pxor	%xmm3,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm6
+	pxor	%xmm3,%xmm6
+.byte	102,15,58,15,246,4
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,12
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm6
+	pxor	%xmm3,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm6
+	pxor	%xmm3,%xmm6
+.byte	102,15,58,15,246,12
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,4
+
+	leaq	16(%rdi),%rdi
+	decq	%rcx
+	jg	.Lseal_sse_tail_192_rounds_and_x2hash
+	decq	%r8
+	jge	.Lseal_sse_tail_192_rounds_and_x1hash
+	paddd	.Lchacha20_consts(%rip),%xmm2
+	paddd	0+48(%rbp),%xmm6
+	paddd	0+64(%rbp),%xmm10
+	paddd	0+128(%rbp),%xmm14
+	paddd	.Lchacha20_consts(%rip),%xmm1
+	paddd	0+48(%rbp),%xmm5
+	paddd	0+64(%rbp),%xmm9
+	paddd	0+112(%rbp),%xmm13
+	paddd	.Lchacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+	movdqu	0 + 0(%rsi),%xmm3
+	movdqu	16 + 0(%rsi),%xmm7
+	movdqu	32 + 0(%rsi),%xmm11
+	movdqu	48 + 0(%rsi),%xmm15
+	pxor	%xmm3,%xmm2
+	pxor	%xmm7,%xmm6
+	pxor	%xmm11,%xmm10
+	pxor	%xmm14,%xmm15
+	movdqu	%xmm2,0 + 0(%rdi)
+	movdqu	%xmm6,16 + 0(%rdi)
+	movdqu	%xmm10,32 + 0(%rdi)
+	movdqu	%xmm15,48 + 0(%rdi)
+	movdqu	0 + 64(%rsi),%xmm3
+	movdqu	16 + 64(%rsi),%xmm7
+	movdqu	32 + 64(%rsi),%xmm11
+	movdqu	48 + 64(%rsi),%xmm15
+	pxor	%xmm3,%xmm1
+	pxor	%xmm7,%xmm5
+	pxor	%xmm11,%xmm9
+	pxor	%xmm13,%xmm15
+	movdqu	%xmm1,0 + 64(%rdi)
+	movdqu	%xmm5,16 + 64(%rdi)
+	movdqu	%xmm9,32 + 64(%rdi)
+	movdqu	%xmm15,48 + 64(%rdi)
+
+	movq	$128,%rcx
+	subq	$128,%rbx
+	leaq	128(%rsi),%rsi
+
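+// Absorb the %rcx bytes of ciphertext still owed to Poly1305, 16 bytes per
+// pass, before encrypting what is left.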
+.Lseal_sse_128_tail_hash:
+	cmpq	$16,%rcx
+	jb	.Lseal_sse_128_tail_xor
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	subq	$16,%rcx
+	leaq	16(%rdi),%rdi
+	jmp	.Lseal_sse_128_tail_hash
+
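+// Encrypt and hash the remaining input 16 bytes at a time; the register
+// rotation at the bottom of this block feeds the next keystream words into
+// %xmm0 for the following pass.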
+.Lseal_sse_128_tail_xor:
+	cmpq	$16,%rbx
+	jb	.Lseal_sse_tail_16
+	subq	$16,%rbx
+
+	movdqu	0(%rsi),%xmm3
+	pxor	%xmm3,%xmm0
+	movdqu	%xmm0,0(%rdi)
+
+	addq	0(%rdi),%r10
+	adcq	8(%rdi),%r11
+	adcq	$1,%r12
+	leaq	16(%rsi),%rsi
+	leaq	16(%rdi),%rdi
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm8,%xmm4
+	movdqa	%xmm12,%xmm8
+	movdqa	%xmm1,%xmm12
+	movdqa	%xmm5,%xmm1
+	movdqa	%xmm9,%xmm5
+	movdqa	%xmm13,%xmm9
+	jmp	.Lseal_sse_128_tail_xor
+
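+// Final partial block (under 16 bytes): gather it byte by byte into %xmm15,
+// XOR with the keystream, and emit the result byte by byte.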
+.Lseal_sse_tail_16:
+	testq	%rbx,%rbx
+	jz	.Lprocess_blocks_of_extra_in
+
+	movq	%rbx,%r8
+	movq	%rbx,%rcx
+	leaq	-1(%rsi,%rbx,1),%rsi
+	pxor	%xmm15,%xmm15
+.Lseal_sse_tail_16_compose:
+	pslldq	$1,%xmm15
+	pinsrb	$0,(%rsi),%xmm15
+	leaq	-1(%rsi),%rsi
+	decq	%rcx
+	jne	.Lseal_sse_tail_16_compose
+
+
+	pxor	%xmm0,%xmm15
+
+
+	movq	%rbx,%rcx
+	movdqu	%xmm15,%xmm0
+.Lseal_sse_tail_16_extract:
+	pextrb	$0,%xmm0,(%rdi)
+	psrldq	$1,%xmm0
+	addq	$1,%rdi
+	subq	$1,%rcx
+	jnz	.Lseal_sse_tail_16_extract
+
+
+
+
+
+
+
+
+	movq	288 + 0 + 32(%rsp),%r9
+	movq	56(%r9),%r14
+	movq	48(%r9),%r13
+	testq	%r14,%r14
+	jz	.Lprocess_partial_block
+
+	movq	$16,%r15
+	subq	%rbx,%r15
+	cmpq	%r15,%r14
+
+	jge	.Lload_extra_in
+	movq	%r14,%r15
+
+.Lload_extra_in:
+
+
+	leaq	-1(%r13,%r15,1),%rsi
+
+
+	addq	%r15,%r13
+	subq	%r15,%r14
+	movq	%r13,48(%r9)
+	movq	%r14,56(%r9)
+
+
+
+	addq	%r15,%r8
+
+
+	pxor	%xmm11,%xmm11
+.Lload_extra_load_loop:
+	pslldq	$1,%xmm11
+	pinsrb	$0,(%rsi),%xmm11
+	leaq	-1(%rsi),%rsi
+	subq	$1,%r15
+	jnz	.Lload_extra_load_loop
+
+
+
+
+	movq	%rbx,%r15
+
+.Lload_extra_shift_loop:
+	pslldq	$1,%xmm11
+	subq	$1,%r15
+	jnz	.Lload_extra_shift_loop
+
+
+
+
+	leaq	.Land_masks(%rip),%r15
+	shlq	$4,%rbx
+	pand	-16(%r15,%rbx,1),%xmm15
+
+
+	por	%xmm11,%xmm15
+
+
+
+.byte	102,77,15,126,253
+	pextrq	$1,%xmm15,%r14
+	addq	%r13,%r10
+	adcq	%r14,%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+
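+// This appears to hash the caller-supplied extra input, whose pointer and
+// length are read from 48(%r9) and 56(%r9): full 16-byte blocks here, the
+// trailer below.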
+.Lprocess_blocks_of_extra_in:
+
+	movq	288+32+0(%rsp),%r9
+	movq	48(%r9),%rsi
+	movq	56(%r9),%r8
+	movq	%r8,%rcx
+	shrq	$4,%r8
+
+.Lprocess_extra_hash_loop:
+	jz	.Lprocess_extra_in_trailer
+	addq	0+0(%rsi),%r10
+	adcq	8+0(%rsi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rsi),%rsi
+	subq	$1,%r8
+	jmp	.Lprocess_extra_hash_loop
+.Lprocess_extra_in_trailer:
+	andq	$15,%rcx
+	movq	%rcx,%rbx
+	jz	.Ldo_length_block
+	leaq	-1(%rsi,%rcx,1),%rsi
+
+.Lprocess_extra_in_trailer_load:
+	pslldq	$1,%xmm15
+	pinsrb	$0,(%rsi),%xmm15
+	leaq	-1(%rsi),%rsi
+	subq	$1,%rcx
+	jnz	.Lprocess_extra_in_trailer_load
+
+.Lprocess_partial_block:
+
+	leaq	.Land_masks(%rip),%r15
+	shlq	$4,%rbx
+	pand	-16(%r15,%rbx,1),%xmm15
+.byte	102,77,15,126,253
+	pextrq	$1,%xmm15,%r14
+	addq	%r13,%r10
+	adcq	%r14,%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+
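+// Absorb the block encoding the AD and ciphertext lengths (saved at
+// 32(%rbp)), perform the final reduction modulo 2^130-5, add s, and store
+// the 16-byte tag through the pointer popped into %r9.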
+.Ldo_length_block:
+	addq	0+0+32(%rbp),%r10
+	adcq	8+0+32(%rbp),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+
+	movq	%r10,%r13
+	movq	%r11,%r14
+	movq	%r12,%r15
+	subq	$-5,%r10
+	sbbq	$-1,%r11
+	sbbq	$3,%r12
+	cmovcq	%r13,%r10
+	cmovcq	%r14,%r11
+	cmovcq	%r15,%r12
+
+	addq	0+0+16(%rbp),%r10
+	adcq	8+0+16(%rbp),%r11
+
+.cfi_remember_state	
+	addq	$288 + 0 + 32,%rsp
+.cfi_adjust_cfa_offset	-(288 + 32)
+
+	popq	%r9
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r9
+	movq	%r10,(%r9)
+	movq	%r11,8(%r9)
+	popq	%r15
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r15
+	popq	%r14
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r14
+	popq	%r13
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r13
+	popq	%r12
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r12
+	popq	%rbx
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%rbx
+	popq	%rbp
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%rbp
+	ret
+
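+// Seal path for inputs of at most 128 bytes: three ChaCha20 states, one of
+// which is clamped (.Lclamp) to derive the Poly1305 key.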
+.Lseal_sse_128:
+.cfi_restore_state	
+	movdqu	.Lchacha20_consts(%rip),%xmm0
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm0,%xmm2
+	movdqu	0(%r9),%xmm4
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm4,%xmm6
+	movdqu	16(%r9),%xmm8
+	movdqa	%xmm8,%xmm9
+	movdqa	%xmm8,%xmm10
+	movdqu	32(%r9),%xmm14
+	movdqa	%xmm14,%xmm12
+	paddd	.Lsse_inc(%rip),%xmm12
+	movdqa	%xmm12,%xmm13
+	paddd	.Lsse_inc(%rip),%xmm13
+	movdqa	%xmm4,%xmm7
+	movdqa	%xmm8,%xmm11
+	movdqa	%xmm12,%xmm15
+	movq	$10,%r10
+
+.Lseal_sse_128_rounds:
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm6
+	pxor	%xmm3,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm6
+	pxor	%xmm3,%xmm6
+.byte	102,15,58,15,246,4
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,12
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm6
+	pxor	%xmm3,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm6
+	pxor	%xmm3,%xmm6
+.byte	102,15,58,15,246,12
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,4
+
+	decq	%r10
+	jnz	.Lseal_sse_128_rounds
+	paddd	.Lchacha20_consts(%rip),%xmm0
+	paddd	.Lchacha20_consts(%rip),%xmm1
+	paddd	.Lchacha20_consts(%rip),%xmm2
+	paddd	%xmm7,%xmm4
+	paddd	%xmm7,%xmm5
+	paddd	%xmm7,%xmm6
+	paddd	%xmm11,%xmm8
+	paddd	%xmm11,%xmm9
+	paddd	%xmm15,%xmm12
+	paddd	.Lsse_inc(%rip),%xmm15
+	paddd	%xmm15,%xmm13
+
+	pand	.Lclamp(%rip),%xmm2
+	movdqa	%xmm2,0+0(%rbp)
+	movdqa	%xmm6,0+16(%rbp)
+
+	movq	%r8,%r8
+	call	poly_hash_ad_internal
+	jmp	.Lseal_sse_128_tail_xor
+.size	chacha20_poly1305_seal, .-chacha20_poly1305_seal
+.cfi_endproc	
+
+
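+// AVX2 open path. The bare CFI directives below seem to describe a stack
+// frame already established on the SSE entry path, which is why no matching
+// push or sub instructions appear here.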
+.type	chacha20_poly1305_open_avx2,@function
+.align	64
+chacha20_poly1305_open_avx2:
+.cfi_startproc	
+
+
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r9,-64
+.cfi_adjust_cfa_offset	288 + 32
+
+	vzeroupper
+	vmovdqa	.Lchacha20_consts(%rip),%ymm0
+	vbroadcasti128	0(%r9),%ymm4
+	vbroadcasti128	16(%r9),%ymm8
+	vbroadcasti128	32(%r9),%ymm12
+	vpaddd	.Lavx2_init(%rip),%ymm12,%ymm12
+	cmpq	$192,%rbx
+	jbe	.Lopen_avx2_192
+	cmpq	$320,%rbx
+	jbe	.Lopen_avx2_320
+
+	vmovdqa	%ymm4,0+64(%rbp)
+	vmovdqa	%ymm8,0+96(%rbp)
+	vmovdqa	%ymm12,0+160(%rbp)
+	movq	$10,%r10
+.Lopen_avx2_init_rounds:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+
+	decq	%r10
+	jne	.Lopen_avx2_init_rounds
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
+
+	vpand	.Lclamp(%rip),%ymm3,%ymm3
+	vmovdqa	%ymm3,0+0(%rbp)
+
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm4
+
+	movq	%r8,%r8
+	call	poly_hash_ad_internal
+
+	xorq	%rcx,%rcx
+.Lopen_avx2_init_hash:
+	addq	0+0(%rsi,%rcx,1),%r10
+	adcq	8+0(%rsi,%rcx,1),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	addq	$16,%rcx
+	cmpq	$64,%rcx
+	jne	.Lopen_avx2_init_hash
+
+	vpxor	0(%rsi),%ymm0,%ymm0
+	vpxor	32(%rsi),%ymm4,%ymm4
+
+	vmovdqu	%ymm0,0(%rdi)
+	vmovdqu	%ymm4,32(%rdi)
+	leaq	64(%rsi),%rsi
+	leaq	64(%rdi),%rdi
+	subq	$64,%rbx
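+// Main open loop: process 512 bytes per iteration with four two-block ymm
+// states, hashing the ciphertext (mulx-based Poly1305) as it streams in.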
+.Lopen_avx2_main_loop:
+
+	cmpq	$512,%rbx
+	jb	.Lopen_avx2_main_loop_done
+	vmovdqa	.Lchacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm10
+	vmovdqa	%ymm0,%ymm3
+	vmovdqa	%ymm4,%ymm7
+	vmovdqa	%ymm8,%ymm11
+	vmovdqa	.Lavx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm15
+	vpaddd	%ymm15,%ymm12,%ymm14
+	vpaddd	%ymm14,%ymm12,%ymm13
+	vpaddd	%ymm13,%ymm12,%ymm12
+	vmovdqa	%ymm15,0+256(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm12,0+160(%rbp)
+
+	xorq	%rcx,%rcx
+.Lopen_avx2_main_loop_rounds:
+	addq	0+0(%rsi,%rcx,1),%r10
+	adcq	8+0(%rsi,%rcx,1),%r11
+	adcq	$1,%r12
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	.Lrol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	.Lrol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	addq	0+16(%rsi,%rcx,1),%r10
+	adcq	8+16(%rsi,%rcx,1),%r11
+	adcq	$1,%r12
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$4,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$12,%ymm15,%ymm15,%ymm15
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	.Lrol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	addq	0+32(%rsi,%rcx,1),%r10
+	adcq	8+32(%rsi,%rcx,1),%r11
+	adcq	$1,%r12
+
+	leaq	48(%rcx),%rcx
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	.Lrol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$12,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$4,%ymm15,%ymm15,%ymm15
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+
+	cmpq	$60*8,%rcx
+	jne	.Lopen_avx2_main_loop_rounds
+	vpaddd	.Lchacha20_consts(%rip),%ymm3,%ymm3
+	vpaddd	0+64(%rbp),%ymm7,%ymm7
+	vpaddd	0+96(%rbp),%ymm11,%ymm11
+	vpaddd	0+256(%rbp),%ymm15,%ymm15
+	vpaddd	.Lchacha20_consts(%rip),%ymm2,%ymm2
+	vpaddd	0+64(%rbp),%ymm6,%ymm6
+	vpaddd	0+96(%rbp),%ymm10,%ymm10
+	vpaddd	0+224(%rbp),%ymm14,%ymm14
+	vpaddd	.Lchacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	0+64(%rbp),%ymm5,%ymm5
+	vpaddd	0+96(%rbp),%ymm9,%ymm9
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+
+	vmovdqa	%ymm0,0+128(%rbp)
+	addq	0+60*8(%rsi),%r10
+	adcq	8+60*8(%rsi),%r11
+	adcq	$1,%r12
+	vperm2i128	$0x02,%ymm3,%ymm7,%ymm0
+	vperm2i128	$0x13,%ymm3,%ymm7,%ymm7
+	vperm2i128	$0x02,%ymm11,%ymm15,%ymm3
+	vperm2i128	$0x13,%ymm11,%ymm15,%ymm11
+	vpxor	0+0(%rsi),%ymm0,%ymm0
+	vpxor	32+0(%rsi),%ymm3,%ymm3
+	vpxor	64+0(%rsi),%ymm7,%ymm7
+	vpxor	96+0(%rsi),%ymm11,%ymm11
+	vmovdqu	%ymm0,0+0(%rdi)
+	vmovdqu	%ymm3,32+0(%rdi)
+	vmovdqu	%ymm7,64+0(%rdi)
+	vmovdqu	%ymm11,96+0(%rdi)
+
+	vmovdqa	0+128(%rbp),%ymm0
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
+	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
+	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
+	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
+	vpxor	0+128(%rsi),%ymm3,%ymm3
+	vpxor	32+128(%rsi),%ymm2,%ymm2
+	vpxor	64+128(%rsi),%ymm6,%ymm6
+	vpxor	96+128(%rsi),%ymm10,%ymm10
+	vmovdqu	%ymm3,0+128(%rdi)
+	vmovdqu	%ymm2,32+128(%rdi)
+	vmovdqu	%ymm6,64+128(%rdi)
+	vmovdqu	%ymm10,96+128(%rdi)
+	addq	0+60*8+16(%rsi),%r10
+	adcq	8+60*8+16(%rsi),%r11
+	adcq	$1,%r12
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
+	vpxor	0+256(%rsi),%ymm3,%ymm3
+	vpxor	32+256(%rsi),%ymm1,%ymm1
+	vpxor	64+256(%rsi),%ymm5,%ymm5
+	vpxor	96+256(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm3,0+256(%rdi)
+	vmovdqu	%ymm1,32+256(%rdi)
+	vmovdqu	%ymm5,64+256(%rdi)
+	vmovdqu	%ymm9,96+256(%rdi)
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm4
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm0
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm8
+	vpxor	0+384(%rsi),%ymm3,%ymm3
+	vpxor	32+384(%rsi),%ymm0,%ymm0
+	vpxor	64+384(%rsi),%ymm4,%ymm4
+	vpxor	96+384(%rsi),%ymm8,%ymm8
+	vmovdqu	%ymm3,0+384(%rdi)
+	vmovdqu	%ymm0,32+384(%rdi)
+	vmovdqu	%ymm4,64+384(%rdi)
+	vmovdqu	%ymm8,96+384(%rdi)
+
+	leaq	512(%rsi),%rsi
+	leaq	512(%rdi),%rdi
+	subq	$512,%rbx
+	jmp	.Lopen_avx2_main_loop
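+// Fewer than 512 bytes remain: dispatch to the matching tail path below.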
+.Lopen_avx2_main_loop_done:
+	testq	%rbx,%rbx
+	vzeroupper
+	je	.Lopen_sse_finalize
+
+	cmpq	$384,%rbx
+	ja	.Lopen_avx2_tail_512
+	cmpq	$256,%rbx
+	ja	.Lopen_avx2_tail_384
+	cmpq	$128,%rbx
+	ja	.Lopen_avx2_tail_256
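+// At most 128 bytes remain: a single ymm state (two ChaCha20 blocks).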
+	vmovdqa	.Lchacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	.Lavx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vmovdqa	%ymm12,0+160(%rbp)
+
+	xorq	%r8,%r8
+	movq	%rbx,%rcx
+	andq	$-16,%rcx
+	testq	%rcx,%rcx
+	je	.Lopen_avx2_tail_128_rounds
+.Lopen_avx2_tail_128_rounds_and_x1hash:
+	addq	0+0(%rsi,%r8,1),%r10
+	adcq	8+0(%rsi,%r8,1),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+.Lopen_avx2_tail_128_rounds:
+	addq	$16,%r8
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+
+	cmpq	%rcx,%r8
+	jb	.Lopen_avx2_tail_128_rounds_and_x1hash
+	cmpq	$160,%r8
+	jne	.Lopen_avx2_tail_128_rounds
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
+	vmovdqa	%ymm3,%ymm8
+
+	jmp	.Lopen_avx2_tail_128_xor
+
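+// Tail: 129-256 bytes remain; two ymm states.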
+.Lopen_avx2_tail_256:
+	vmovdqa	.Lchacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	.Lavx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm13
+	vpaddd	%ymm13,%ymm12,%ymm12
+	vmovdqa	%ymm12,0+160(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+
+	movq	%rbx,0+128(%rbp)
+	movq	%rbx,%rcx
+	subq	$128,%rcx
+	shrq	$4,%rcx
+	movq	$10,%r8
+	cmpq	$10,%rcx
+	cmovgq	%r8,%rcx
+	movq	%rsi,%rbx
+	xorq	%r8,%r8
+.Lopen_avx2_tail_256_rounds_and_x1hash:
+	addq	0+0(%rbx),%r10
+	adcq	8+0(%rbx),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rbx),%rbx
+.Lopen_avx2_tail_256_rounds:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+
+	incq	%r8
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol16(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpsrld	$20,%ymm6,%ymm3
+	vpslld	$12,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol8(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpslld	$7,%ymm6,%ymm3
+	vpsrld	$25,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+
+	cmpq	%rcx,%r8
+	jb	.Lopen_avx2_tail_256_rounds_and_x1hash
+	cmpq	$10,%r8
+	jne	.Lopen_avx2_tail_256_rounds
+	movq	%rbx,%r8
+	subq	%rsi,%rbx
+	movq	%rbx,%rcx
+	movq	0+128(%rbp),%rbx
+.Lopen_avx2_tail_256_hash:
+	addq	$16,%rcx
+	cmpq	%rbx,%rcx
+	jg	.Lopen_avx2_tail_256_done
+	addq	0+0(%r8),%r10
+	adcq	8+0(%r8),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%r8),%r8
+	jmp	.Lopen_avx2_tail_256_hash
+.Lopen_avx2_tail_256_done:
+	vpaddd	.Lchacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	0+64(%rbp),%ymm5,%ymm5
+	vpaddd	0+96(%rbp),%ymm9,%ymm9
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
+	vpxor	0+0(%rsi),%ymm3,%ymm3
+	vpxor	32+0(%rsi),%ymm1,%ymm1
+	vpxor	64+0(%rsi),%ymm5,%ymm5
+	vpxor	96+0(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm3,0+0(%rdi)
+	vmovdqu	%ymm1,32+0(%rdi)
+	vmovdqu	%ymm5,64+0(%rdi)
+	vmovdqu	%ymm9,96+0(%rdi)
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
+	vmovdqa	%ymm3,%ymm8
+
+	leaq	128(%rsi),%rsi
+	leaq	128(%rdi),%rdi
+	subq	$128,%rbx
+	jmp	.Lopen_avx2_tail_128_xor
+
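+// Tail: 257-384 bytes remain; three ymm states.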
+.Lopen_avx2_tail_384:
+	vmovdqa	.Lchacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm10
+	vmovdqa	.Lavx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm14
+	vpaddd	%ymm14,%ymm12,%ymm13
+	vpaddd	%ymm13,%ymm12,%ymm12
+	vmovdqa	%ymm12,0+160(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+
+	movq	%rbx,0+128(%rbp)
+	movq	%rbx,%rcx
+	subq	$256,%rcx
+	shrq	$4,%rcx
+	addq	$6,%rcx
+	movq	$10,%r8
+	cmpq	$10,%rcx
+	cmovgq	%r8,%rcx
+	movq	%rsi,%rbx
+	xorq	%r8,%r8
+.Lopen_avx2_tail_384_rounds_and_x2hash:
+	addq	0+0(%rbx),%r10
+	adcq	8+0(%rbx),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rbx),%rbx
+.Lopen_avx2_tail_384_rounds_and_x1hash:
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol16(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpsrld	$20,%ymm6,%ymm3
+	vpslld	$12,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol8(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpslld	$7,%ymm6,%ymm3
+	vpsrld	$25,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	addq	0+0(%rbx),%r10
+	adcq	8+0(%rbx),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rbx),%rbx
+	incq	%r8
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol16(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpsrld	$20,%ymm6,%ymm3
+	vpslld	$12,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol8(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpslld	$7,%ymm6,%ymm3
+	vpsrld	$25,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+
+	cmpq	%rcx,%r8
+	jb	.Lopen_avx2_tail_384_rounds_and_x2hash
+	cmpq	$10,%r8
+	jne	.Lopen_avx2_tail_384_rounds_and_x1hash
+	movq	%rbx,%r8
+	subq	%rsi,%rbx
+	movq	%rbx,%rcx
+	movq	0+128(%rbp),%rbx
+.Lopen_avx2_384_tail_hash:
+	addq	$16,%rcx
+	cmpq	%rbx,%rcx
+	jg	.Lopen_avx2_384_tail_done
+	addq	0+0(%r8),%r10
+	adcq	8+0(%r8),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%r8),%r8
+	jmp	.Lopen_avx2_384_tail_hash
+.Lopen_avx2_384_tail_done:
+	vpaddd	.Lchacha20_consts(%rip),%ymm2,%ymm2
+	vpaddd	0+64(%rbp),%ymm6,%ymm6
+	vpaddd	0+96(%rbp),%ymm10,%ymm10
+	vpaddd	0+224(%rbp),%ymm14,%ymm14
+	vpaddd	.Lchacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	0+64(%rbp),%ymm5,%ymm5
+	vpaddd	0+96(%rbp),%ymm9,%ymm9
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
+	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
+	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
+	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
+	vpxor	0+0(%rsi),%ymm3,%ymm3
+	vpxor	32+0(%rsi),%ymm2,%ymm2
+	vpxor	64+0(%rsi),%ymm6,%ymm6
+	vpxor	96+0(%rsi),%ymm10,%ymm10
+	vmovdqu	%ymm3,0+0(%rdi)
+	vmovdqu	%ymm2,32+0(%rdi)
+	vmovdqu	%ymm6,64+0(%rdi)
+	vmovdqu	%ymm10,96+0(%rdi)
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
+	vpxor	0+128(%rsi),%ymm3,%ymm3
+	vpxor	32+128(%rsi),%ymm1,%ymm1
+	vpxor	64+128(%rsi),%ymm5,%ymm5
+	vpxor	96+128(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm3,0+128(%rdi)
+	vmovdqu	%ymm1,32+128(%rdi)
+	vmovdqu	%ymm5,64+128(%rdi)
+	vmovdqu	%ymm9,96+128(%rdi)
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
+	vmovdqa	%ymm3,%ymm8
+
+	leaq	256(%rsi),%rsi
+	leaq	256(%rdi),%rdi
+	subq	$256,%rbx
+	jmp	.Lopen_avx2_tail_128_xor
+
+.Lopen_avx2_tail_512:
+	vmovdqa	.Lchacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm10
+	vmovdqa	%ymm0,%ymm3
+	vmovdqa	%ymm4,%ymm7
+	vmovdqa	%ymm8,%ymm11
+	vmovdqa	.Lavx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm15
+	vpaddd	%ymm15,%ymm12,%ymm14
+	vpaddd	%ymm14,%ymm12,%ymm13
+	vpaddd	%ymm13,%ymm12,%ymm12
+	vmovdqa	%ymm15,0+256(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm12,0+160(%rbp)
+
+	xorq	%rcx,%rcx
+	movq	%rsi,%r8
+.Lopen_avx2_tail_512_rounds_and_x2hash:
+	addq	0+0(%r8),%r10
+	adcq	8+0(%r8),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%r8),%r8
+.Lopen_avx2_tail_512_rounds_and_x1hash:
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	.Lrol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	.Lrol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	addq	0+0(%r8),%r10
+	adcq	8+0(%r8),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$4,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$12,%ymm15,%ymm15,%ymm15
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	.Lrol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	addq	0+16(%r8),%r10
+	adcq	8+16(%r8),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	32(%r8),%r8
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	.Lrol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$12,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$4,%ymm15,%ymm15,%ymm15
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+
+	incq	%rcx
+	cmpq	$4,%rcx
+	jl	.Lopen_avx2_tail_512_rounds_and_x2hash
+	cmpq	$10,%rcx
+	jne	.Lopen_avx2_tail_512_rounds_and_x1hash
+	movq	%rbx,%rcx
+	subq	$384,%rcx
+	andq	$-16,%rcx
+.Lopen_avx2_tail_512_hash:
+	testq	%rcx,%rcx
+	je	.Lopen_avx2_tail_512_done
+	addq	0+0(%r8),%r10
+	adcq	8+0(%r8),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%r8),%r8
+	subq	$16,%rcx
+	jmp	.Lopen_avx2_tail_512_hash
+.Lopen_avx2_tail_512_done:
+	vpaddd	.Lchacha20_consts(%rip),%ymm3,%ymm3
+	vpaddd	0+64(%rbp),%ymm7,%ymm7
+	vpaddd	0+96(%rbp),%ymm11,%ymm11
+	vpaddd	0+256(%rbp),%ymm15,%ymm15
+	vpaddd	.Lchacha20_consts(%rip),%ymm2,%ymm2
+	vpaddd	0+64(%rbp),%ymm6,%ymm6
+	vpaddd	0+96(%rbp),%ymm10,%ymm10
+	vpaddd	0+224(%rbp),%ymm14,%ymm14
+	vpaddd	.Lchacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	0+64(%rbp),%ymm5,%ymm5
+	vpaddd	0+96(%rbp),%ymm9,%ymm9
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+
+	vmovdqa	%ymm0,0+128(%rbp)
+	vperm2i128	$0x02,%ymm3,%ymm7,%ymm0
+	vperm2i128	$0x13,%ymm3,%ymm7,%ymm7
+	vperm2i128	$0x02,%ymm11,%ymm15,%ymm3
+	vperm2i128	$0x13,%ymm11,%ymm15,%ymm11
+	vpxor	0+0(%rsi),%ymm0,%ymm0
+	vpxor	32+0(%rsi),%ymm3,%ymm3
+	vpxor	64+0(%rsi),%ymm7,%ymm7
+	vpxor	96+0(%rsi),%ymm11,%ymm11
+	vmovdqu	%ymm0,0+0(%rdi)
+	vmovdqu	%ymm3,32+0(%rdi)
+	vmovdqu	%ymm7,64+0(%rdi)
+	vmovdqu	%ymm11,96+0(%rdi)
+
+	vmovdqa	0+128(%rbp),%ymm0
+	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
+	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
+	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
+	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
+	vpxor	0+128(%rsi),%ymm3,%ymm3
+	vpxor	32+128(%rsi),%ymm2,%ymm2
+	vpxor	64+128(%rsi),%ymm6,%ymm6
+	vpxor	96+128(%rsi),%ymm10,%ymm10
+	vmovdqu	%ymm3,0+128(%rdi)
+	vmovdqu	%ymm2,32+128(%rdi)
+	vmovdqu	%ymm6,64+128(%rdi)
+	vmovdqu	%ymm10,96+128(%rdi)
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
+	vpxor	0+256(%rsi),%ymm3,%ymm3
+	vpxor	32+256(%rsi),%ymm1,%ymm1
+	vpxor	64+256(%rsi),%ymm5,%ymm5
+	vpxor	96+256(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm3,0+256(%rdi)
+	vmovdqu	%ymm1,32+256(%rdi)
+	vmovdqu	%ymm5,64+256(%rdi)
+	vmovdqu	%ymm9,96+256(%rdi)
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
+	vmovdqa	%ymm3,%ymm8
+
+	leaq	384(%rsi),%rsi
+	leaq	384(%rdi),%rdi
+	subq	$384,%rbx
+.Lopen_avx2_tail_128_xor:
+	cmpq	$32,%rbx
+	jb	.Lopen_avx2_tail_32_xor
+	subq	$32,%rbx
+	vpxor	(%rsi),%ymm0,%ymm0
+	vmovdqu	%ymm0,(%rdi)
+	leaq	32(%rsi),%rsi
+	leaq	32(%rdi),%rdi
+	vmovdqa	%ymm4,%ymm0
+	vmovdqa	%ymm8,%ymm4
+	vmovdqa	%ymm12,%ymm8
+	jmp	.Lopen_avx2_tail_128_xor
+.Lopen_avx2_tail_32_xor:
+	cmpq	$16,%rbx
+	vmovdqa	%xmm0,%xmm1
+	jb	.Lopen_avx2_exit
+	subq	$16,%rbx
+
+	vpxor	(%rsi),%xmm0,%xmm1
+	vmovdqu	%xmm1,(%rdi)
+	leaq	16(%rsi),%rsi
+	leaq	16(%rdi),%rdi
+	vperm2i128	$0x11,%ymm0,%ymm0,%ymm0
+	vmovdqa	%xmm0,%xmm1
+.Lopen_avx2_exit:
+	vzeroupper
+	jmp	.Lopen_sse_tail_16
+
+.Lopen_avx2_192:
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm8,%ymm10
+	vpaddd	.Lavx2_inc(%rip),%ymm12,%ymm13
+	vmovdqa	%ymm12,%ymm11
+	vmovdqa	%ymm13,%ymm15
+	movq	$10,%r10
+.Lopen_avx2_192_rounds:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+
+	decq	%r10
+	jne	.Lopen_avx2_192_rounds
+	vpaddd	%ymm2,%ymm0,%ymm0
+	vpaddd	%ymm2,%ymm1,%ymm1
+	vpaddd	%ymm6,%ymm4,%ymm4
+	vpaddd	%ymm6,%ymm5,%ymm5
+	vpaddd	%ymm10,%ymm8,%ymm8
+	vpaddd	%ymm10,%ymm9,%ymm9
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm13,%ymm13
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
+
+	vpand	.Lclamp(%rip),%ymm3,%ymm3
+	vmovdqa	%ymm3,0+0(%rbp)
+
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm8
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm12
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm5
+.Lopen_avx2_short:
+	movq	%r8,%r8
+	call	poly_hash_ad_internal
+.Lopen_avx2_short_hash_and_xor_loop:
+	cmpq	$32,%rbx
+	jb	.Lopen_avx2_short_tail_32
+	subq	$32,%rbx
+	addq	0+0(%rsi),%r10
+	adcq	8+0(%rsi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	addq	0+16(%rsi),%r10
+	adcq	8+16(%rsi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+
+	vpxor	(%rsi),%ymm0,%ymm0
+	vmovdqu	%ymm0,(%rdi)
+	leaq	32(%rsi),%rsi
+	leaq	32(%rdi),%rdi
+
+	vmovdqa	%ymm4,%ymm0
+	vmovdqa	%ymm8,%ymm4
+	vmovdqa	%ymm12,%ymm8
+	vmovdqa	%ymm1,%ymm12
+	vmovdqa	%ymm5,%ymm1
+	vmovdqa	%ymm9,%ymm5
+	vmovdqa	%ymm13,%ymm9
+	vmovdqa	%ymm2,%ymm13
+	vmovdqa	%ymm6,%ymm2
+	jmp	.Lopen_avx2_short_hash_and_xor_loop
+.Lopen_avx2_short_tail_32:
+	cmpq	$16,%rbx
+	vmovdqa	%xmm0,%xmm1
+	jb	.Lopen_avx2_short_tail_32_exit
+	subq	$16,%rbx
+	addq	0+0(%rsi),%r10
+	adcq	8+0(%rsi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	vpxor	(%rsi),%xmm0,%xmm3
+	vmovdqu	%xmm3,(%rdi)
+	leaq	16(%rsi),%rsi
+	leaq	16(%rdi),%rdi
+	vextracti128	$1,%ymm0,%xmm1
+.Lopen_avx2_short_tail_32_exit:
+	vzeroupper
+	jmp	.Lopen_sse_tail_16
+
+.Lopen_avx2_320:
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm8,%ymm10
+	vpaddd	.Lavx2_inc(%rip),%ymm12,%ymm13
+	vpaddd	.Lavx2_inc(%rip),%ymm13,%ymm14
+	vmovdqa	%ymm4,%ymm7
+	vmovdqa	%ymm8,%ymm11
+	vmovdqa	%ymm12,0+160(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+	movq	$10,%r10
+.Lopen_avx2_320_rounds:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol16(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpsrld	$20,%ymm6,%ymm3
+	vpslld	$12,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol8(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpslld	$7,%ymm6,%ymm3
+	vpsrld	$25,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol16(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpsrld	$20,%ymm6,%ymm3
+	vpslld	$12,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol8(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpslld	$7,%ymm6,%ymm3
+	vpsrld	$25,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+
+	decq	%r10
+	jne	.Lopen_avx2_320_rounds
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	.Lchacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	.Lchacha20_consts(%rip),%ymm2,%ymm2
+	vpaddd	%ymm7,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm5,%ymm5
+	vpaddd	%ymm7,%ymm6,%ymm6
+	vpaddd	%ymm11,%ymm8,%ymm8
+	vpaddd	%ymm11,%ymm9,%ymm9
+	vpaddd	%ymm11,%ymm10,%ymm10
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	0+224(%rbp),%ymm14,%ymm14
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
+
+	vpand	.Lclamp(%rip),%ymm3,%ymm3
+	vmovdqa	%ymm3,0+0(%rbp)
+
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm8
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm12
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm5
+	vperm2i128	$0x02,%ymm2,%ymm6,%ymm9
+	vperm2i128	$0x02,%ymm10,%ymm14,%ymm13
+	vperm2i128	$0x13,%ymm2,%ymm6,%ymm2
+	vperm2i128	$0x13,%ymm10,%ymm14,%ymm6
+	jmp	.Lopen_avx2_short
+.size	chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2
+.cfi_endproc	
+
+
+.type	chacha20_poly1305_seal_avx2,@function
+.align	64
+chacha20_poly1305_seal_avx2:
+.cfi_startproc	
+
+
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r9,-64
+.cfi_adjust_cfa_offset	288 + 32
+
+	vzeroupper
+	vmovdqa	.Lchacha20_consts(%rip),%ymm0
+	vbroadcasti128	0(%r9),%ymm4
+	vbroadcasti128	16(%r9),%ymm8
+	vbroadcasti128	32(%r9),%ymm12
+	vpaddd	.Lavx2_init(%rip),%ymm12,%ymm12
+	cmpq	$192,%rbx
+	jbe	.Lseal_avx2_192
+	cmpq	$320,%rbx
+	jbe	.Lseal_avx2_320
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm0,%ymm3
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm4,%ymm7
+	vmovdqa	%ymm4,0+64(%rbp)
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm8,%ymm10
+	vmovdqa	%ymm8,%ymm11
+	vmovdqa	%ymm8,0+96(%rbp)
+	vmovdqa	%ymm12,%ymm15
+	vpaddd	.Lavx2_inc(%rip),%ymm15,%ymm14
+	vpaddd	.Lavx2_inc(%rip),%ymm14,%ymm13
+	vpaddd	.Lavx2_inc(%rip),%ymm13,%ymm12
+	vmovdqa	%ymm12,0+160(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+	vmovdqa	%ymm15,0+256(%rbp)
+	movq	$10,%r10
+.Lseal_avx2_init_rounds:
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	.Lrol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	.Lrol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$4,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$12,%ymm15,%ymm15,%ymm15
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	.Lrol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	.Lrol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$12,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$4,%ymm15,%ymm15,%ymm15
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+
+	decq	%r10
+	jnz	.Lseal_avx2_init_rounds
+	vpaddd	.Lchacha20_consts(%rip),%ymm3,%ymm3
+	vpaddd	0+64(%rbp),%ymm7,%ymm7
+	vpaddd	0+96(%rbp),%ymm11,%ymm11
+	vpaddd	0+256(%rbp),%ymm15,%ymm15
+	vpaddd	.Lchacha20_consts(%rip),%ymm2,%ymm2
+	vpaddd	0+64(%rbp),%ymm6,%ymm6
+	vpaddd	0+96(%rbp),%ymm10,%ymm10
+	vpaddd	0+224(%rbp),%ymm14,%ymm14
+	vpaddd	.Lchacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	0+64(%rbp),%ymm5,%ymm5
+	vpaddd	0+96(%rbp),%ymm9,%ymm9
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+
+	vperm2i128	$0x13,%ymm11,%ymm15,%ymm11
+	vperm2i128	$0x02,%ymm3,%ymm7,%ymm15
+	vperm2i128	$0x13,%ymm3,%ymm7,%ymm3
+	vpand	.Lclamp(%rip),%ymm15,%ymm15
+	vmovdqa	%ymm15,0+0(%rbp)
+	movq	%r8,%r8
+	call	poly_hash_ad_internal
+
+	vpxor	0(%rsi),%ymm3,%ymm3
+	vpxor	32(%rsi),%ymm11,%ymm11
+	vmovdqu	%ymm3,0(%rdi)
+	vmovdqu	%ymm11,32(%rdi)
+	vperm2i128	$0x02,%ymm2,%ymm6,%ymm15
+	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
+	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
+	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
+	vpxor	0+64(%rsi),%ymm15,%ymm15
+	vpxor	32+64(%rsi),%ymm2,%ymm2
+	vpxor	64+64(%rsi),%ymm6,%ymm6
+	vpxor	96+64(%rsi),%ymm10,%ymm10
+	vmovdqu	%ymm15,0+64(%rdi)
+	vmovdqu	%ymm2,32+64(%rdi)
+	vmovdqu	%ymm6,64+64(%rdi)
+	vmovdqu	%ymm10,96+64(%rdi)
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm15
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
+	vpxor	0+192(%rsi),%ymm15,%ymm15
+	vpxor	32+192(%rsi),%ymm1,%ymm1
+	vpxor	64+192(%rsi),%ymm5,%ymm5
+	vpxor	96+192(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm15,0+192(%rdi)
+	vmovdqu	%ymm1,32+192(%rdi)
+	vmovdqu	%ymm5,64+192(%rdi)
+	vmovdqu	%ymm9,96+192(%rdi)
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm15
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
+	vmovdqa	%ymm15,%ymm8
+
+	leaq	320(%rsi),%rsi
+	subq	$320,%rbx
+	movq	$320,%rcx
+	cmpq	$128,%rbx
+	jbe	.Lseal_avx2_short_hash_remainder
+	vpxor	0(%rsi),%ymm0,%ymm0
+	vpxor	32(%rsi),%ymm4,%ymm4
+	vpxor	64(%rsi),%ymm8,%ymm8
+	vpxor	96(%rsi),%ymm12,%ymm12
+	vmovdqu	%ymm0,320(%rdi)
+	vmovdqu	%ymm4,352(%rdi)
+	vmovdqu	%ymm8,384(%rdi)
+	vmovdqu	%ymm12,416(%rdi)
+	leaq	128(%rsi),%rsi
+	subq	$128,%rbx
+	movq	$8,%rcx
+	movq	$2,%r8
+	cmpq	$128,%rbx
+	jbe	.Lseal_avx2_tail_128
+	cmpq	$256,%rbx
+	jbe	.Lseal_avx2_tail_256
+	cmpq	$384,%rbx
+	jbe	.Lseal_avx2_tail_384
+	cmpq	$512,%rbx
+	jbe	.Lseal_avx2_tail_512
+	vmovdqa	.Lchacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm10
+	vmovdqa	%ymm0,%ymm3
+	vmovdqa	%ymm4,%ymm7
+	vmovdqa	%ymm8,%ymm11
+	vmovdqa	.Lavx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm15
+	vpaddd	%ymm15,%ymm12,%ymm14
+	vpaddd	%ymm14,%ymm12,%ymm13
+	vpaddd	%ymm13,%ymm12,%ymm12
+	vmovdqa	%ymm15,0+256(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm12,0+160(%rbp)
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	.Lrol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	.Lrol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$4,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$12,%ymm15,%ymm15,%ymm15
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	.Lrol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	.Lrol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$12,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$4,%ymm15,%ymm15,%ymm15
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	.Lrol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	.Lrol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+
+	subq	$16,%rdi
+	movq	$9,%rcx
+	jmp	.Lseal_avx2_main_loop_rounds_entry
+.align	32
+.Lseal_avx2_main_loop:
+	vmovdqa	.Lchacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm10
+	vmovdqa	%ymm0,%ymm3
+	vmovdqa	%ymm4,%ymm7
+	vmovdqa	%ymm8,%ymm11
+	vmovdqa	.Lavx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm15
+	vpaddd	%ymm15,%ymm12,%ymm14
+	vpaddd	%ymm14,%ymm12,%ymm13
+	vpaddd	%ymm13,%ymm12,%ymm12
+	vmovdqa	%ymm15,0+256(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm12,0+160(%rbp)
+
+	movq	$10,%rcx
+.align	32
+.Lseal_avx2_main_loop_rounds:
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	.Lrol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	.Lrol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+.Lseal_avx2_main_loop_rounds_entry:
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	addq	0+16(%rdi),%r10
+	adcq	8+16(%rdi),%r11
+	adcq	$1,%r12
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$4,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$12,%ymm15,%ymm15,%ymm15
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	.Lrol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	addq	0+32(%rdi),%r10
+	adcq	8+32(%rdi),%r11
+	adcq	$1,%r12
+
+	leaq	48(%rdi),%rdi
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	.Lrol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$12,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$4,%ymm15,%ymm15,%ymm15
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+
+	decq	%rcx
+	jne	.Lseal_avx2_main_loop_rounds
+	vpaddd	.Lchacha20_consts(%rip),%ymm3,%ymm3
+	vpaddd	0+64(%rbp),%ymm7,%ymm7
+	vpaddd	0+96(%rbp),%ymm11,%ymm11
+	vpaddd	0+256(%rbp),%ymm15,%ymm15
+	vpaddd	.Lchacha20_consts(%rip),%ymm2,%ymm2
+	vpaddd	0+64(%rbp),%ymm6,%ymm6
+	vpaddd	0+96(%rbp),%ymm10,%ymm10
+	vpaddd	0+224(%rbp),%ymm14,%ymm14
+	vpaddd	.Lchacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	0+64(%rbp),%ymm5,%ymm5
+	vpaddd	0+96(%rbp),%ymm9,%ymm9
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+
+	vmovdqa	%ymm0,0+128(%rbp)
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	addq	0+16(%rdi),%r10
+	adcq	8+16(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	32(%rdi),%rdi
+	vperm2i128	$0x02,%ymm3,%ymm7,%ymm0
+	vperm2i128	$0x13,%ymm3,%ymm7,%ymm7
+	vperm2i128	$0x02,%ymm11,%ymm15,%ymm3
+	vperm2i128	$0x13,%ymm11,%ymm15,%ymm11
+	vpxor	0+0(%rsi),%ymm0,%ymm0
+	vpxor	32+0(%rsi),%ymm3,%ymm3
+	vpxor	64+0(%rsi),%ymm7,%ymm7
+	vpxor	96+0(%rsi),%ymm11,%ymm11
+	vmovdqu	%ymm0,0+0(%rdi)
+	vmovdqu	%ymm3,32+0(%rdi)
+	vmovdqu	%ymm7,64+0(%rdi)
+	vmovdqu	%ymm11,96+0(%rdi)
+
+	vmovdqa	0+128(%rbp),%ymm0
+	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
+	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
+	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
+	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
+	vpxor	0+128(%rsi),%ymm3,%ymm3
+	vpxor	32+128(%rsi),%ymm2,%ymm2
+	vpxor	64+128(%rsi),%ymm6,%ymm6
+	vpxor	96+128(%rsi),%ymm10,%ymm10
+	vmovdqu	%ymm3,0+128(%rdi)
+	vmovdqu	%ymm2,32+128(%rdi)
+	vmovdqu	%ymm6,64+128(%rdi)
+	vmovdqu	%ymm10,96+128(%rdi)
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
+	vpxor	0+256(%rsi),%ymm3,%ymm3
+	vpxor	32+256(%rsi),%ymm1,%ymm1
+	vpxor	64+256(%rsi),%ymm5,%ymm5
+	vpxor	96+256(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm3,0+256(%rdi)
+	vmovdqu	%ymm1,32+256(%rdi)
+	vmovdqu	%ymm5,64+256(%rdi)
+	vmovdqu	%ymm9,96+256(%rdi)
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm4
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm0
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm8
+	vpxor	0+384(%rsi),%ymm3,%ymm3
+	vpxor	32+384(%rsi),%ymm0,%ymm0
+	vpxor	64+384(%rsi),%ymm4,%ymm4
+	vpxor	96+384(%rsi),%ymm8,%ymm8
+	vmovdqu	%ymm3,0+384(%rdi)
+	vmovdqu	%ymm0,32+384(%rdi)
+	vmovdqu	%ymm4,64+384(%rdi)
+	vmovdqu	%ymm8,96+384(%rdi)
+
+	leaq	512(%rsi),%rsi
+	subq	$512,%rbx
+	cmpq	$512,%rbx
+	jg	.Lseal_avx2_main_loop
+
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	addq	0+16(%rdi),%r10
+	adcq	8+16(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	32(%rdi),%rdi
+	movq	$10,%rcx
+	xorq	%r8,%r8
+
+	cmpq	$384,%rbx
+	ja	.Lseal_avx2_tail_512
+	cmpq	$256,%rbx
+	ja	.Lseal_avx2_tail_384
+	cmpq	$128,%rbx
+	ja	.Lseal_avx2_tail_256
+
+.Lseal_avx2_tail_128:
+	vmovdqa	.Lchacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	.Lavx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vmovdqa	%ymm12,0+160(%rbp)
+
+.Lseal_avx2_tail_128_rounds_and_3xhash:
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+.Lseal_avx2_tail_128_rounds_and_2xhash:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	addq	0+16(%rdi),%r10
+	adcq	8+16(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	32(%rdi),%rdi
+	decq	%rcx
+	jg	.Lseal_avx2_tail_128_rounds_and_3xhash
+	decq	%r8
+	jge	.Lseal_avx2_tail_128_rounds_and_2xhash
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
+	vmovdqa	%ymm3,%ymm8
+
+	jmp	.Lseal_avx2_short_loop
+
+.Lseal_avx2_tail_256:
+	vmovdqa	.Lchacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	.Lavx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm13
+	vpaddd	%ymm13,%ymm12,%ymm12
+	vmovdqa	%ymm12,0+160(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+
+.Lseal_avx2_tail_256_rounds_and_3xhash:
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+.Lseal_avx2_tail_256_rounds_and_2xhash:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	addq	0+16(%rdi),%r10
+	adcq	8+16(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	32(%rdi),%rdi
+	decq	%rcx
+	jg	.Lseal_avx2_tail_256_rounds_and_3xhash
+	decq	%r8
+	jge	.Lseal_avx2_tail_256_rounds_and_2xhash
+	vpaddd	.Lchacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	0+64(%rbp),%ymm5,%ymm5
+	vpaddd	0+96(%rbp),%ymm9,%ymm9
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
+	vpxor	0+0(%rsi),%ymm3,%ymm3
+	vpxor	32+0(%rsi),%ymm1,%ymm1
+	vpxor	64+0(%rsi),%ymm5,%ymm5
+	vpxor	96+0(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm3,0+0(%rdi)
+	vmovdqu	%ymm1,32+0(%rdi)
+	vmovdqu	%ymm5,64+0(%rdi)
+	vmovdqu	%ymm9,96+0(%rdi)
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
+	vmovdqa	%ymm3,%ymm8
+
+	movq	$128,%rcx
+	leaq	128(%rsi),%rsi
+	subq	$128,%rbx
+	jmp	.Lseal_avx2_short_hash_remainder
+
+.Lseal_avx2_tail_384:
+	vmovdqa	.Lchacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm10
+	vmovdqa	.Lavx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm14
+	vpaddd	%ymm14,%ymm12,%ymm13
+	vpaddd	%ymm13,%ymm12,%ymm12
+	vmovdqa	%ymm12,0+160(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+
+.Lseal_avx2_tail_384_rounds_and_3xhash:
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+.Lseal_avx2_tail_384_rounds_and_2xhash:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol16(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpsrld	$20,%ymm6,%ymm3
+	vpslld	$12,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol8(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpslld	$7,%ymm6,%ymm3
+	vpsrld	$25,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	addq	0+16(%rdi),%r10
+	adcq	8+16(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol16(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpsrld	$20,%ymm6,%ymm3
+	vpslld	$12,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol8(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpslld	$7,%ymm6,%ymm3
+	vpsrld	$25,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+
+	leaq	32(%rdi),%rdi
+	decq	%rcx
+	jg	.Lseal_avx2_tail_384_rounds_and_3xhash
+	decq	%r8
+	jge	.Lseal_avx2_tail_384_rounds_and_2xhash
+	vpaddd	.Lchacha20_consts(%rip),%ymm2,%ymm2
+	vpaddd	0+64(%rbp),%ymm6,%ymm6
+	vpaddd	0+96(%rbp),%ymm10,%ymm10
+	vpaddd	0+224(%rbp),%ymm14,%ymm14
+	vpaddd	.Lchacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	0+64(%rbp),%ymm5,%ymm5
+	vpaddd	0+96(%rbp),%ymm9,%ymm9
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
+	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
+	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
+	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
+	vpxor	0+0(%rsi),%ymm3,%ymm3
+	vpxor	32+0(%rsi),%ymm2,%ymm2
+	vpxor	64+0(%rsi),%ymm6,%ymm6
+	vpxor	96+0(%rsi),%ymm10,%ymm10
+	vmovdqu	%ymm3,0+0(%rdi)
+	vmovdqu	%ymm2,32+0(%rdi)
+	vmovdqu	%ymm6,64+0(%rdi)
+	vmovdqu	%ymm10,96+0(%rdi)
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
+	vpxor	0+128(%rsi),%ymm3,%ymm3
+	vpxor	32+128(%rsi),%ymm1,%ymm1
+	vpxor	64+128(%rsi),%ymm5,%ymm5
+	vpxor	96+128(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm3,0+128(%rdi)
+	vmovdqu	%ymm1,32+128(%rdi)
+	vmovdqu	%ymm5,64+128(%rdi)
+	vmovdqu	%ymm9,96+128(%rdi)
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
+	vmovdqa	%ymm3,%ymm8
+
+	movq	$256,%rcx
+	leaq	256(%rsi),%rsi
+	subq	$256,%rbx
+	jmp	.Lseal_avx2_short_hash_remainder
+
+.Lseal_avx2_tail_512:
+	vmovdqa	.Lchacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm10
+	vmovdqa	%ymm0,%ymm3
+	vmovdqa	%ymm4,%ymm7
+	vmovdqa	%ymm8,%ymm11
+	vmovdqa	.Lavx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm15
+	vpaddd	%ymm15,%ymm12,%ymm14
+	vpaddd	%ymm14,%ymm12,%ymm13
+	vpaddd	%ymm13,%ymm12,%ymm12
+	vmovdqa	%ymm15,0+256(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm12,0+160(%rbp)
+
+.Lseal_avx2_tail_512_rounds_and_3xhash:
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+.Lseal_avx2_tail_512_rounds_and_2xhash:
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	.Lrol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	.Lrol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$4,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$12,%ymm15,%ymm15,%ymm15
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	.Lrol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	.Lrol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	addq	0+16(%rdi),%r10
+	adcq	8+16(%rdi),%r11
+	adcq	$1,%r12
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$12,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$4,%ymm15,%ymm15,%ymm15
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	32(%rdi),%rdi
+	decq	%rcx
+	jg	.Lseal_avx2_tail_512_rounds_and_3xhash
+	decq	%r8
+	jge	.Lseal_avx2_tail_512_rounds_and_2xhash
+	vpaddd	.Lchacha20_consts(%rip),%ymm3,%ymm3
+	vpaddd	0+64(%rbp),%ymm7,%ymm7
+	vpaddd	0+96(%rbp),%ymm11,%ymm11
+	vpaddd	0+256(%rbp),%ymm15,%ymm15
+	vpaddd	.Lchacha20_consts(%rip),%ymm2,%ymm2
+	vpaddd	0+64(%rbp),%ymm6,%ymm6
+	vpaddd	0+96(%rbp),%ymm10,%ymm10
+	vpaddd	0+224(%rbp),%ymm14,%ymm14
+	vpaddd	.Lchacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	0+64(%rbp),%ymm5,%ymm5
+	vpaddd	0+96(%rbp),%ymm9,%ymm9
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+
+	vmovdqa	%ymm0,0+128(%rbp)
+	vperm2i128	$0x02,%ymm3,%ymm7,%ymm0
+	vperm2i128	$0x13,%ymm3,%ymm7,%ymm7
+	vperm2i128	$0x02,%ymm11,%ymm15,%ymm3
+	vperm2i128	$0x13,%ymm11,%ymm15,%ymm11
+	vpxor	0+0(%rsi),%ymm0,%ymm0
+	vpxor	32+0(%rsi),%ymm3,%ymm3
+	vpxor	64+0(%rsi),%ymm7,%ymm7
+	vpxor	96+0(%rsi),%ymm11,%ymm11
+	vmovdqu	%ymm0,0+0(%rdi)
+	vmovdqu	%ymm3,32+0(%rdi)
+	vmovdqu	%ymm7,64+0(%rdi)
+	vmovdqu	%ymm11,96+0(%rdi)
+
+	vmovdqa	0+128(%rbp),%ymm0
+	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
+	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
+	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
+	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
+	vpxor	0+128(%rsi),%ymm3,%ymm3
+	vpxor	32+128(%rsi),%ymm2,%ymm2
+	vpxor	64+128(%rsi),%ymm6,%ymm6
+	vpxor	96+128(%rsi),%ymm10,%ymm10
+	vmovdqu	%ymm3,0+128(%rdi)
+	vmovdqu	%ymm2,32+128(%rdi)
+	vmovdqu	%ymm6,64+128(%rdi)
+	vmovdqu	%ymm10,96+128(%rdi)
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
+	vpxor	0+256(%rsi),%ymm3,%ymm3
+	vpxor	32+256(%rsi),%ymm1,%ymm1
+	vpxor	64+256(%rsi),%ymm5,%ymm5
+	vpxor	96+256(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm3,0+256(%rdi)
+	vmovdqu	%ymm1,32+256(%rdi)
+	vmovdqu	%ymm5,64+256(%rdi)
+	vmovdqu	%ymm9,96+256(%rdi)
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
+	vmovdqa	%ymm3,%ymm8
+
+	movq	$384,%rcx
+	leaq	384(%rsi),%rsi
+	subq	$384,%rbx
+	jmp	.Lseal_avx2_short_hash_remainder
+
+.Lseal_avx2_320:
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm8,%ymm10
+	vpaddd	.Lavx2_inc(%rip),%ymm12,%ymm13
+	vpaddd	.Lavx2_inc(%rip),%ymm13,%ymm14
+	vmovdqa	%ymm4,%ymm7
+	vmovdqa	%ymm8,%ymm11
+	vmovdqa	%ymm12,0+160(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+	movq	$10,%r10
+.Lseal_avx2_320_rounds:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol16(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpsrld	$20,%ymm6,%ymm3
+	vpslld	$12,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol8(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpslld	$7,%ymm6,%ymm3
+	vpsrld	$25,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol16(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpsrld	$20,%ymm6,%ymm3
+	vpslld	$12,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol8(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpslld	$7,%ymm6,%ymm3
+	vpsrld	$25,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+
+	decq	%r10
+	jne	.Lseal_avx2_320_rounds
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	.Lchacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	.Lchacha20_consts(%rip),%ymm2,%ymm2
+	vpaddd	%ymm7,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm5,%ymm5
+	vpaddd	%ymm7,%ymm6,%ymm6
+	vpaddd	%ymm11,%ymm8,%ymm8
+	vpaddd	%ymm11,%ymm9,%ymm9
+	vpaddd	%ymm11,%ymm10,%ymm10
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	0+224(%rbp),%ymm14,%ymm14
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
+
+	vpand	.Lclamp(%rip),%ymm3,%ymm3
+	vmovdqa	%ymm3,0+0(%rbp)
+
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm8
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm12
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm5
+	vperm2i128	$0x02,%ymm2,%ymm6,%ymm9
+	vperm2i128	$0x02,%ymm10,%ymm14,%ymm13
+	vperm2i128	$0x13,%ymm2,%ymm6,%ymm2
+	vperm2i128	$0x13,%ymm10,%ymm14,%ymm6
+	jmp	.Lseal_avx2_short
+
+.Lseal_avx2_192:
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm8,%ymm10
+	vpaddd	.Lavx2_inc(%rip),%ymm12,%ymm13
+	vmovdqa	%ymm12,%ymm11
+	vmovdqa	%ymm13,%ymm15
+	movq	$10,%r10
+.Lseal_avx2_192_rounds:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+
+	decq	%r10
+	jne	.Lseal_avx2_192_rounds
+	vpaddd	%ymm2,%ymm0,%ymm0
+	vpaddd	%ymm2,%ymm1,%ymm1
+	vpaddd	%ymm6,%ymm4,%ymm4
+	vpaddd	%ymm6,%ymm5,%ymm5
+	vpaddd	%ymm10,%ymm8,%ymm8
+	vpaddd	%ymm10,%ymm9,%ymm9
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm13,%ymm13
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
+
+	vpand	.Lclamp(%rip),%ymm3,%ymm3
+	vmovdqa	%ymm3,0+0(%rbp)
+
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm8
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm12
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm5
+.Lseal_avx2_short:
+	movq	%r8,%r8
+	call	poly_hash_ad_internal
+	xorq	%rcx,%rcx
+.Lseal_avx2_short_hash_remainder:
+	cmpq	$16,%rcx
+	jb	.Lseal_avx2_short_loop
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	subq	$16,%rcx
+	addq	$16,%rdi
+	jmp	.Lseal_avx2_short_hash_remainder
+.Lseal_avx2_short_loop:
+	cmpq	$32,%rbx
+	jb	.Lseal_avx2_short_tail
+	subq	$32,%rbx
+
+	vpxor	(%rsi),%ymm0,%ymm0
+	vmovdqu	%ymm0,(%rdi)
+	leaq	32(%rsi),%rsi
+
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	addq	0+16(%rdi),%r10
+	adcq	8+16(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	32(%rdi),%rdi
+
+	vmovdqa	%ymm4,%ymm0
+	vmovdqa	%ymm8,%ymm4
+	vmovdqa	%ymm12,%ymm8
+	vmovdqa	%ymm1,%ymm12
+	vmovdqa	%ymm5,%ymm1
+	vmovdqa	%ymm9,%ymm5
+	vmovdqa	%ymm13,%ymm9
+	vmovdqa	%ymm2,%ymm13
+	vmovdqa	%ymm6,%ymm2
+	jmp	.Lseal_avx2_short_loop
+.Lseal_avx2_short_tail:
+	cmpq	$16,%rbx
+	jb	.Lseal_avx2_exit
+	subq	$16,%rbx
+	vpxor	(%rsi),%xmm0,%xmm3
+	vmovdqu	%xmm3,(%rdi)
+	leaq	16(%rsi),%rsi
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+	vextracti128	$1,%ymm0,%xmm0
+.Lseal_avx2_exit:
+	vzeroupper
+	jmp	.Lseal_sse_tail_16
+.cfi_endproc	
+.size	chacha20_poly1305_seal_avx2, .-chacha20_poly1305_seal_avx2
+#endif
diff --git a/gen/crypto/chacha20_poly1305_x86_64-win.asm b/gen/crypto/chacha20_poly1305_x86_64-win.asm
new file mode 100644
index 0000000..095689c
--- /dev/null
+++ b/gen/crypto/chacha20_poly1305_x86_64-win.asm
@@ -0,0 +1,8957 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section	.text code align=64
+
+EXTERN	OPENSSL_ia32cap_P
+
+chacha20_poly1305_constants:
+
+section	.rdata rdata align=8
+ALIGN	64
+$L$chacha20_consts:
+	DB	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+	DB	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+$L$rol8:
+	DB	3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
+	DB	3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
+$L$rol16:
+	DB	2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
+	DB	2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
+$L$avx2_init:
+	DD	0,0,0,0
+$L$sse_inc:
+	DD	1,0,0,0
+$L$avx2_inc:
+	DD	2,0,0,0,2,0,0,0
+$L$clamp:
+	DQ	0x0FFFFFFC0FFFFFFF,0x0FFFFFFC0FFFFFFC
+	DQ	0xFFFFFFFFFFFFFFFF,0xFFFFFFFFFFFFFFFF
+ALIGN	16
+$L$and_masks:
+	DB	0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+	DB	0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+	DB	0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+	DB	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+	DB	0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+	DB	0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+	DB	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+	DB	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+	DB	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+	DB	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
+	DB	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
+	DB	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+	DB	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
+	DB	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
+	DB	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
+	DB	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+section	.text
+
+
+
+ALIGN	64
+poly_hash_ad_internal:
+
+
+	xor	r10,r10
+	xor	r11,r11
+	xor	r12,r12
+	cmp	r8,13
+	jne	NEAR $L$hash_ad_loop
+$L$poly_fast_tls_ad:
+
+	mov	r10,QWORD[rcx]
+	mov	r11,QWORD[5+rcx]
+	shr	r11,24
+	mov	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	ret
+$L$hash_ad_loop:
+
+	cmp	r8,16
+	jb	NEAR $L$hash_ad_tail
+	add	r10,QWORD[((0+0))+rcx]
+	adc	r11,QWORD[((8+0))+rcx]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rcx,[16+rcx]
+	sub	r8,16
+	jmp	NEAR $L$hash_ad_loop
+$L$hash_ad_tail:
+	cmp	r8,0
+	je	NEAR $L$hash_ad_done
+
+	xor	r13,r13
+	xor	r14,r14
+	xor	r15,r15
+	add	rcx,r8
+$L$hash_ad_tail_loop:
+	shld	r14,r13,8
+	shl	r13,8
+	movzx	r15,BYTE[((-1))+rcx]
+	xor	r13,r15
+	dec	rcx
+	dec	r8
+	jne	NEAR $L$hash_ad_tail_loop
+
+	add	r10,r13
+	adc	r11,r14
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+
+$L$hash_ad_done:
+	ret
+
+
+
+global	chacha20_poly1305_open
+
+ALIGN	64
+chacha20_poly1305_open:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_chacha20_poly1305_open:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+_CET_ENDBR
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+
+
+	push	r9
+
+	sub	rsp,288 + 160 + 32
+
+
+	lea	rbp,[32+rsp]
+	and	rbp,-32
+
+	movaps	XMMWORD[(0+0)+rbp],xmm6
+	movaps	XMMWORD[(16+0)+rbp],xmm7
+	movaps	XMMWORD[(32+0)+rbp],xmm8
+	movaps	XMMWORD[(48+0)+rbp],xmm9
+	movaps	XMMWORD[(64+0)+rbp],xmm10
+	movaps	XMMWORD[(80+0)+rbp],xmm11
+	movaps	XMMWORD[(96+0)+rbp],xmm12
+	movaps	XMMWORD[(112+0)+rbp],xmm13
+	movaps	XMMWORD[(128+0)+rbp],xmm14
+	movaps	XMMWORD[(144+0)+rbp],xmm15
+
+	mov	rbx,rdx
+	mov	QWORD[((0+160+32))+rbp],r8
+	mov	QWORD[((8+160+32))+rbp],rbx
+
+	mov	eax,DWORD[((OPENSSL_ia32cap_P+8))]
+	and	eax,288
+	xor	eax,288
+	jz	NEAR chacha20_poly1305_open_avx2
+
+	cmp	rbx,128
+	jbe	NEAR $L$open_sse_128
+
+	movdqa	xmm0,XMMWORD[$L$chacha20_consts]
+	movdqu	xmm4,XMMWORD[r9]
+	movdqu	xmm8,XMMWORD[16+r9]
+	movdqu	xmm12,XMMWORD[32+r9]
+
+	movdqa	xmm7,xmm12
+
+	movdqa	XMMWORD[(160+48)+rbp],xmm4
+	movdqa	XMMWORD[(160+64)+rbp],xmm8
+	movdqa	XMMWORD[(160+96)+rbp],xmm12
+	mov	r10,10
+$L$open_sse_init_rounds:
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,4
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,12
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,12
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,4
+
+	dec	r10
+	jne	NEAR $L$open_sse_init_rounds
+
+	paddd	xmm0,XMMWORD[$L$chacha20_consts]
+	paddd	xmm4,XMMWORD[((160+48))+rbp]
+
+	pand	xmm0,XMMWORD[$L$clamp]
+	movdqa	XMMWORD[(160+0)+rbp],xmm0
+	movdqa	XMMWORD[(160+16)+rbp],xmm4
+
+	mov	r8,r8
+	call	poly_hash_ad_internal
+$L$open_sse_main_loop:
+	cmp	rbx,16*16
+	jb	NEAR $L$open_sse_tail
+
+	movdqa	xmm0,XMMWORD[$L$chacha20_consts]
+	movdqa	xmm4,XMMWORD[((160+48))+rbp]
+	movdqa	xmm8,XMMWORD[((160+64))+rbp]
+	movdqa	xmm1,xmm0
+	movdqa	xmm5,xmm4
+	movdqa	xmm9,xmm8
+	movdqa	xmm2,xmm0
+	movdqa	xmm6,xmm4
+	movdqa	xmm10,xmm8
+	movdqa	xmm3,xmm0
+	movdqa	xmm7,xmm4
+	movdqa	xmm11,xmm8
+	movdqa	xmm15,XMMWORD[((160+96))+rbp]
+	paddd	xmm15,XMMWORD[$L$sse_inc]
+	movdqa	xmm14,xmm15
+	paddd	xmm14,XMMWORD[$L$sse_inc]
+	movdqa	xmm13,xmm14
+	paddd	xmm13,XMMWORD[$L$sse_inc]
+	movdqa	xmm12,xmm13
+	paddd	xmm12,XMMWORD[$L$sse_inc]
+	movdqa	XMMWORD[(160+96)+rbp],xmm12
+	movdqa	XMMWORD[(160+112)+rbp],xmm13
+	movdqa	XMMWORD[(160+128)+rbp],xmm14
+	movdqa	XMMWORD[(160+144)+rbp],xmm15
+
+
+
+	mov	rcx,4
+	mov	r8,rsi
+$L$open_sse_main_loop_rounds:
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,XMMWORD[$L$rol16]
+	paddd	xmm3,xmm7
+	paddd	xmm2,xmm6
+	paddd	xmm1,xmm5
+	paddd	xmm0,xmm4
+	pxor	xmm15,xmm3
+	pxor	xmm14,xmm2
+	pxor	xmm13,xmm1
+	pxor	xmm12,xmm0
+DB	102,69,15,56,0,248
+DB	102,69,15,56,0,240
+DB	102,69,15,56,0,232
+DB	102,69,15,56,0,224
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	paddd	xmm11,xmm15
+	paddd	xmm10,xmm14
+	paddd	xmm9,xmm13
+	paddd	xmm8,xmm12
+	pxor	xmm7,xmm11
+	add	r10,QWORD[((0+0))+r8]
+	adc	r11,QWORD[((8+0))+r8]
+	adc	r12,1
+
+	lea	r8,[16+r8]
+	pxor	xmm6,xmm10
+	pxor	xmm5,xmm9
+	pxor	xmm4,xmm8
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,xmm7
+	psrld	xmm8,20
+	pslld	xmm7,32-20
+	pxor	xmm7,xmm8
+	movdqa	xmm8,xmm6
+	psrld	xmm8,20
+	pslld	xmm6,32-20
+	pxor	xmm6,xmm8
+	movdqa	xmm8,xmm5
+	psrld	xmm8,20
+	pslld	xmm5,32-20
+	pxor	xmm5,xmm8
+	movdqa	xmm8,xmm4
+	psrld	xmm8,20
+	pslld	xmm4,32-20
+	pxor	xmm4,xmm8
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	movdqa	xmm8,XMMWORD[$L$rol8]
+	paddd	xmm3,xmm7
+	paddd	xmm2,xmm6
+	paddd	xmm1,xmm5
+	paddd	xmm0,xmm4
+	pxor	xmm15,xmm3
+	pxor	xmm14,xmm2
+	pxor	xmm13,xmm1
+	pxor	xmm12,xmm0
+DB	102,69,15,56,0,248
+DB	102,69,15,56,0,240
+DB	102,69,15,56,0,232
+DB	102,69,15,56,0,224
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	paddd	xmm11,xmm15
+	paddd	xmm10,xmm14
+	paddd	xmm9,xmm13
+	paddd	xmm8,xmm12
+	pxor	xmm7,xmm11
+	pxor	xmm6,xmm10
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	pxor	xmm5,xmm9
+	pxor	xmm4,xmm8
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,xmm7
+	psrld	xmm8,25
+	pslld	xmm7,32-25
+	pxor	xmm7,xmm8
+	movdqa	xmm8,xmm6
+	psrld	xmm8,25
+	pslld	xmm6,32-25
+	pxor	xmm6,xmm8
+	movdqa	xmm8,xmm5
+	psrld	xmm8,25
+	pslld	xmm5,32-25
+	pxor	xmm5,xmm8
+	movdqa	xmm8,xmm4
+	psrld	xmm8,25
+	pslld	xmm4,32-25
+	pxor	xmm4,xmm8
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+DB	102,15,58,15,255,4
+DB	102,69,15,58,15,219,8
+DB	102,69,15,58,15,255,12
+DB	102,15,58,15,246,4
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,12
+DB	102,15,58,15,237,4
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,12
+DB	102,15,58,15,228,4
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,12
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,XMMWORD[$L$rol16]
+	paddd	xmm3,xmm7
+	paddd	xmm2,xmm6
+	paddd	xmm1,xmm5
+	paddd	xmm0,xmm4
+	pxor	xmm15,xmm3
+	pxor	xmm14,xmm2
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	pxor	xmm13,xmm1
+	pxor	xmm12,xmm0
+DB	102,69,15,56,0,248
+DB	102,69,15,56,0,240
+DB	102,69,15,56,0,232
+DB	102,69,15,56,0,224
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	paddd	xmm11,xmm15
+	paddd	xmm10,xmm14
+	paddd	xmm9,xmm13
+	paddd	xmm8,xmm12
+	pxor	xmm7,xmm11
+	pxor	xmm6,xmm10
+	pxor	xmm5,xmm9
+	pxor	xmm4,xmm8
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,xmm7
+	psrld	xmm8,20
+	pslld	xmm7,32-20
+	pxor	xmm7,xmm8
+	movdqa	xmm8,xmm6
+	psrld	xmm8,20
+	pslld	xmm6,32-20
+	pxor	xmm6,xmm8
+	movdqa	xmm8,xmm5
+	psrld	xmm8,20
+	pslld	xmm5,32-20
+	pxor	xmm5,xmm8
+	movdqa	xmm8,xmm4
+	psrld	xmm8,20
+	pslld	xmm4,32-20
+	pxor	xmm4,xmm8
+	movdqa	xmm8,XMMWORD[$L$rol8]
+	paddd	xmm3,xmm7
+	paddd	xmm2,xmm6
+	paddd	xmm1,xmm5
+	paddd	xmm0,xmm4
+	pxor	xmm15,xmm3
+	pxor	xmm14,xmm2
+	pxor	xmm13,xmm1
+	pxor	xmm12,xmm0
+DB	102,69,15,56,0,248
+DB	102,69,15,56,0,240
+DB	102,69,15,56,0,232
+DB	102,69,15,56,0,224
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	paddd	xmm11,xmm15
+	paddd	xmm10,xmm14
+	paddd	xmm9,xmm13
+	paddd	xmm8,xmm12
+	pxor	xmm7,xmm11
+	pxor	xmm6,xmm10
+	pxor	xmm5,xmm9
+	pxor	xmm4,xmm8
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,xmm7
+	psrld	xmm8,25
+	pslld	xmm7,32-25
+	pxor	xmm7,xmm8
+	movdqa	xmm8,xmm6
+	psrld	xmm8,25
+	pslld	xmm6,32-25
+	pxor	xmm6,xmm8
+	movdqa	xmm8,xmm5
+	psrld	xmm8,25
+	pslld	xmm5,32-25
+	pxor	xmm5,xmm8
+	movdqa	xmm8,xmm4
+	psrld	xmm8,25
+	pslld	xmm4,32-25
+	pxor	xmm4,xmm8
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+DB	102,15,58,15,255,12
+DB	102,69,15,58,15,219,8
+DB	102,69,15,58,15,255,4
+DB	102,15,58,15,246,12
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,4
+DB	102,15,58,15,237,12
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,4
+DB	102,15,58,15,228,12
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,4
+
+	dec	rcx
+	jge	NEAR $L$open_sse_main_loop_rounds
+	add	r10,QWORD[((0+0))+r8]
+	adc	r11,QWORD[((8+0))+r8]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	r8,[16+r8]
+	cmp	rcx,-6
+	jg	NEAR $L$open_sse_main_loop_rounds
+	paddd	xmm3,XMMWORD[$L$chacha20_consts]
+	paddd	xmm7,XMMWORD[((160+48))+rbp]
+	paddd	xmm11,XMMWORD[((160+64))+rbp]
+	paddd	xmm15,XMMWORD[((160+144))+rbp]
+	paddd	xmm2,XMMWORD[$L$chacha20_consts]
+	paddd	xmm6,XMMWORD[((160+48))+rbp]
+	paddd	xmm10,XMMWORD[((160+64))+rbp]
+	paddd	xmm14,XMMWORD[((160+128))+rbp]
+	paddd	xmm1,XMMWORD[$L$chacha20_consts]
+	paddd	xmm5,XMMWORD[((160+48))+rbp]
+	paddd	xmm9,XMMWORD[((160+64))+rbp]
+	paddd	xmm13,XMMWORD[((160+112))+rbp]
+	paddd	xmm0,XMMWORD[$L$chacha20_consts]
+	paddd	xmm4,XMMWORD[((160+48))+rbp]
+	paddd	xmm8,XMMWORD[((160+64))+rbp]
+	paddd	xmm12,XMMWORD[((160+96))+rbp]
+	movdqa	XMMWORD[(160+80)+rbp],xmm12
+	movdqu	xmm12,XMMWORD[((0 + 0))+rsi]
+	pxor	xmm12,xmm3
+	movdqu	XMMWORD[(0 + 0)+rdi],xmm12
+	movdqu	xmm12,XMMWORD[((16 + 0))+rsi]
+	pxor	xmm12,xmm7
+	movdqu	XMMWORD[(16 + 0)+rdi],xmm12
+	movdqu	xmm12,XMMWORD[((32 + 0))+rsi]
+	pxor	xmm12,xmm11
+	movdqu	XMMWORD[(32 + 0)+rdi],xmm12
+	movdqu	xmm12,XMMWORD[((48 + 0))+rsi]
+	pxor	xmm12,xmm15
+	movdqu	XMMWORD[(48 + 0)+rdi],xmm12
+	movdqu	xmm3,XMMWORD[((0 + 64))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 64))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 64))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 64))+rsi]
+	pxor	xmm2,xmm3
+	pxor	xmm6,xmm7
+	pxor	xmm10,xmm11
+	pxor	xmm15,xmm14
+	movdqu	XMMWORD[(0 + 64)+rdi],xmm2
+	movdqu	XMMWORD[(16 + 64)+rdi],xmm6
+	movdqu	XMMWORD[(32 + 64)+rdi],xmm10
+	movdqu	XMMWORD[(48 + 64)+rdi],xmm15
+	movdqu	xmm3,XMMWORD[((0 + 128))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 128))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 128))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 128))+rsi]
+	pxor	xmm1,xmm3
+	pxor	xmm5,xmm7
+	pxor	xmm9,xmm11
+	pxor	xmm15,xmm13
+	movdqu	XMMWORD[(0 + 128)+rdi],xmm1
+	movdqu	XMMWORD[(16 + 128)+rdi],xmm5
+	movdqu	XMMWORD[(32 + 128)+rdi],xmm9
+	movdqu	XMMWORD[(48 + 128)+rdi],xmm15
+	movdqu	xmm3,XMMWORD[((0 + 192))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 192))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 192))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 192))+rsi]
+	pxor	xmm0,xmm3
+	pxor	xmm4,xmm7
+	pxor	xmm8,xmm11
+	pxor	xmm15,XMMWORD[((160+80))+rbp]
+	movdqu	XMMWORD[(0 + 192)+rdi],xmm0
+	movdqu	XMMWORD[(16 + 192)+rdi],xmm4
+	movdqu	XMMWORD[(32 + 192)+rdi],xmm8
+	movdqu	XMMWORD[(48 + 192)+rdi],xmm15
+
+	lea	rsi,[256+rsi]
+	lea	rdi,[256+rdi]
+	sub	rbx,16*16
+	jmp	NEAR $L$open_sse_main_loop
+$L$open_sse_tail:
+
+	test	rbx,rbx
+	jz	NEAR $L$open_sse_finalize
+	cmp	rbx,12*16
+	ja	NEAR $L$open_sse_tail_256
+	cmp	rbx,8*16
+	ja	NEAR $L$open_sse_tail_192
+	cmp	rbx,4*16
+	ja	NEAR $L$open_sse_tail_128
+	movdqa	xmm0,XMMWORD[$L$chacha20_consts]
+	movdqa	xmm4,XMMWORD[((160+48))+rbp]
+	movdqa	xmm8,XMMWORD[((160+64))+rbp]
+	movdqa	xmm12,XMMWORD[((160+96))+rbp]
+	paddd	xmm12,XMMWORD[$L$sse_inc]
+	movdqa	XMMWORD[(160+96)+rbp],xmm12
+
+	xor	r8,r8
+	mov	rcx,rbx
+	cmp	rcx,16
+	jb	NEAR $L$open_sse_tail_64_rounds
+$L$open_sse_tail_64_rounds_and_x1hash:
+	add	r10,QWORD[((0+0))+r8*1+rsi]
+	adc	r11,QWORD[((8+0))+r8*1+rsi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	sub	rcx,16
+$L$open_sse_tail_64_rounds:
+	add	r8,16
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,4
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,12
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,12
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,4
+
+	cmp	rcx,16
+	jae	NEAR $L$open_sse_tail_64_rounds_and_x1hash
+	cmp	r8,10*16
+	jne	NEAR $L$open_sse_tail_64_rounds
+	paddd	xmm0,XMMWORD[$L$chacha20_consts]
+	paddd	xmm4,XMMWORD[((160+48))+rbp]
+	paddd	xmm8,XMMWORD[((160+64))+rbp]
+	paddd	xmm12,XMMWORD[((160+96))+rbp]
+
+	jmp	NEAR $L$open_sse_tail_64_dec_loop
+
+$L$open_sse_tail_128:
+	movdqa	xmm0,XMMWORD[$L$chacha20_consts]
+	movdqa	xmm4,XMMWORD[((160+48))+rbp]
+	movdqa	xmm8,XMMWORD[((160+64))+rbp]
+	movdqa	xmm1,xmm0
+	movdqa	xmm5,xmm4
+	movdqa	xmm9,xmm8
+	movdqa	xmm13,XMMWORD[((160+96))+rbp]
+	paddd	xmm13,XMMWORD[$L$sse_inc]
+	movdqa	xmm12,xmm13
+	paddd	xmm12,XMMWORD[$L$sse_inc]
+	movdqa	XMMWORD[(160+96)+rbp],xmm12
+	movdqa	XMMWORD[(160+112)+rbp],xmm13
+
+	mov	rcx,rbx
+	and	rcx,-16
+	xor	r8,r8
+$L$open_sse_tail_128_rounds_and_x1hash:
+	add	r10,QWORD[((0+0))+r8*1+rsi]
+	adc	r11,QWORD[((8+0))+r8*1+rsi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+$L$open_sse_tail_128_rounds:
+	add	r8,16
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,4
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,12
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm3
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm3
+DB	102,15,58,15,237,4
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,12
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,12
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,4
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm3
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm3
+DB	102,15,58,15,237,12
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,4
+
+	cmp	r8,rcx
+	jb	NEAR $L$open_sse_tail_128_rounds_and_x1hash
+	cmp	r8,10*16
+	jne	NEAR $L$open_sse_tail_128_rounds
+	paddd	xmm1,XMMWORD[$L$chacha20_consts]
+	paddd	xmm5,XMMWORD[((160+48))+rbp]
+	paddd	xmm9,XMMWORD[((160+64))+rbp]
+	paddd	xmm13,XMMWORD[((160+112))+rbp]
+	paddd	xmm0,XMMWORD[$L$chacha20_consts]
+	paddd	xmm4,XMMWORD[((160+48))+rbp]
+	paddd	xmm8,XMMWORD[((160+64))+rbp]
+	paddd	xmm12,XMMWORD[((160+96))+rbp]
+	movdqu	xmm3,XMMWORD[((0 + 0))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 0))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 0))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 0))+rsi]
+	pxor	xmm1,xmm3
+	pxor	xmm5,xmm7
+	pxor	xmm9,xmm11
+	pxor	xmm15,xmm13
+	movdqu	XMMWORD[(0 + 0)+rdi],xmm1
+	movdqu	XMMWORD[(16 + 0)+rdi],xmm5
+	movdqu	XMMWORD[(32 + 0)+rdi],xmm9
+	movdqu	XMMWORD[(48 + 0)+rdi],xmm15
+
+	sub	rbx,4*16
+	lea	rsi,[64+rsi]
+	lea	rdi,[64+rdi]
+	jmp	NEAR $L$open_sse_tail_64_dec_loop
+
+$L$open_sse_tail_192:
+	movdqa	xmm0,XMMWORD[$L$chacha20_consts]
+	movdqa	xmm4,XMMWORD[((160+48))+rbp]
+	movdqa	xmm8,XMMWORD[((160+64))+rbp]
+	movdqa	xmm1,xmm0
+	movdqa	xmm5,xmm4
+	movdqa	xmm9,xmm8
+	movdqa	xmm2,xmm0
+	movdqa	xmm6,xmm4
+	movdqa	xmm10,xmm8
+	movdqa	xmm14,XMMWORD[((160+96))+rbp]
+	paddd	xmm14,XMMWORD[$L$sse_inc]
+	movdqa	xmm13,xmm14
+	paddd	xmm13,XMMWORD[$L$sse_inc]
+	movdqa	xmm12,xmm13
+	paddd	xmm12,XMMWORD[$L$sse_inc]
+	movdqa	XMMWORD[(160+96)+rbp],xmm12
+	movdqa	XMMWORD[(160+112)+rbp],xmm13
+	movdqa	XMMWORD[(160+128)+rbp],xmm14
+
+	mov	rcx,rbx
+	mov	r8,10*16
+	cmp	rcx,10*16
+	cmovg	rcx,r8
+	and	rcx,-16
+	xor	r8,r8
+$L$open_sse_tail_192_rounds_and_x1hash:
+	add	r10,QWORD[((0+0))+r8*1+rsi]
+	adc	r11,QWORD[((8+0))+r8*1+rsi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+$L$open_sse_tail_192_rounds:
+	add	r8,16
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,4
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,12
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm3
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm3
+DB	102,15,58,15,237,4
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,12
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol16]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,12
+	psrld	xmm6,20
+	pxor	xmm6,xmm3
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol8]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,7
+	psrld	xmm6,25
+	pxor	xmm6,xmm3
+DB	102,15,58,15,246,4
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,12
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,12
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,4
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm3
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm3
+DB	102,15,58,15,237,12
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,4
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol16]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,12
+	psrld	xmm6,20
+	pxor	xmm6,xmm3
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol8]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,7
+	psrld	xmm6,25
+	pxor	xmm6,xmm3
+DB	102,15,58,15,246,12
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,4
+
+	cmp	r8,rcx
+	jb	NEAR $L$open_sse_tail_192_rounds_and_x1hash
+	cmp	r8,10*16
+	jne	NEAR $L$open_sse_tail_192_rounds
+	cmp	rbx,11*16
+	jb	NEAR $L$open_sse_tail_192_finish
+	add	r10,QWORD[((0+160))+rsi]
+	adc	r11,QWORD[((8+160))+rsi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	cmp	rbx,12*16
+	jb	NEAR $L$open_sse_tail_192_finish
+	add	r10,QWORD[((0+176))+rsi]
+	adc	r11,QWORD[((8+176))+rsi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+$L$open_sse_tail_192_finish:
+	paddd	xmm2,XMMWORD[$L$chacha20_consts]
+	paddd	xmm6,XMMWORD[((160+48))+rbp]
+	paddd	xmm10,XMMWORD[((160+64))+rbp]
+	paddd	xmm14,XMMWORD[((160+128))+rbp]
+	paddd	xmm1,XMMWORD[$L$chacha20_consts]
+	paddd	xmm5,XMMWORD[((160+48))+rbp]
+	paddd	xmm9,XMMWORD[((160+64))+rbp]
+	paddd	xmm13,XMMWORD[((160+112))+rbp]
+	paddd	xmm0,XMMWORD[$L$chacha20_consts]
+	paddd	xmm4,XMMWORD[((160+48))+rbp]
+	paddd	xmm8,XMMWORD[((160+64))+rbp]
+	paddd	xmm12,XMMWORD[((160+96))+rbp]
+	movdqu	xmm3,XMMWORD[((0 + 0))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 0))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 0))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 0))+rsi]
+	pxor	xmm2,xmm3
+	pxor	xmm6,xmm7
+	pxor	xmm10,xmm11
+	pxor	xmm15,xmm14
+	movdqu	XMMWORD[(0 + 0)+rdi],xmm2
+	movdqu	XMMWORD[(16 + 0)+rdi],xmm6
+	movdqu	XMMWORD[(32 + 0)+rdi],xmm10
+	movdqu	XMMWORD[(48 + 0)+rdi],xmm15
+	movdqu	xmm3,XMMWORD[((0 + 64))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 64))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 64))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 64))+rsi]
+	pxor	xmm1,xmm3
+	pxor	xmm5,xmm7
+	pxor	xmm9,xmm11
+	pxor	xmm15,xmm13
+	movdqu	XMMWORD[(0 + 64)+rdi],xmm1
+	movdqu	XMMWORD[(16 + 64)+rdi],xmm5
+	movdqu	XMMWORD[(32 + 64)+rdi],xmm9
+	movdqu	XMMWORD[(48 + 64)+rdi],xmm15
+
+	sub	rbx,8*16
+	lea	rsi,[128+rsi]
+	lea	rdi,[128+rdi]
+	jmp	NEAR $L$open_sse_tail_64_dec_loop
+
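+; Open tail for 193..256 remaining bytes: four block states are live,
+; which uses every xmm register, so xmm11 (and later xmm9) round-trips
+; through the scratch slot at rbp+160+80 during the rounds.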
+$L$open_sse_tail_256:
+	movdqa	xmm0,XMMWORD[$L$chacha20_consts]
+	movdqa	xmm4,XMMWORD[((160+48))+rbp]
+	movdqa	xmm8,XMMWORD[((160+64))+rbp]
+	movdqa	xmm1,xmm0
+	movdqa	xmm5,xmm4
+	movdqa	xmm9,xmm8
+	movdqa	xmm2,xmm0
+	movdqa	xmm6,xmm4
+	movdqa	xmm10,xmm8
+	movdqa	xmm3,xmm0
+	movdqa	xmm7,xmm4
+	movdqa	xmm11,xmm8
+	movdqa	xmm15,XMMWORD[((160+96))+rbp]
+	paddd	xmm15,XMMWORD[$L$sse_inc]
+	movdqa	xmm14,xmm15
+	paddd	xmm14,XMMWORD[$L$sse_inc]
+	movdqa	xmm13,xmm14
+	paddd	xmm13,XMMWORD[$L$sse_inc]
+	movdqa	xmm12,xmm13
+	paddd	xmm12,XMMWORD[$L$sse_inc]
+	movdqa	XMMWORD[(160+96)+rbp],xmm12
+	movdqa	XMMWORD[(160+112)+rbp],xmm13
+	movdqa	XMMWORD[(160+128)+rbp],xmm14
+	movdqa	XMMWORD[(160+144)+rbp],xmm15
+
+	xor	r8,r8
+$L$open_sse_tail_256_rounds_and_x1hash:
+	add	r10,QWORD[((0+0))+r8*1+rsi]
+	adc	r11,QWORD[((8+0))+r8*1+rsi]
+	adc	r12,1
+	movdqa	XMMWORD[(160+80)+rbp],xmm11
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm11,xmm4
+	pslld	xmm11,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm11
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm11,xmm4
+	pslld	xmm11,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm11
+DB	102,15,58,15,228,4
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,12
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm11,xmm5
+	pslld	xmm11,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm11
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm11,xmm5
+	pslld	xmm11,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm11
+DB	102,15,58,15,237,4
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,12
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol16]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm11,xmm6
+	pslld	xmm11,12
+	psrld	xmm6,20
+	pxor	xmm6,xmm11
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol8]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm11,xmm6
+	pslld	xmm11,7
+	psrld	xmm6,25
+	pxor	xmm6,xmm11
+DB	102,15,58,15,246,4
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,12
+	movdqa	xmm11,XMMWORD[((160+80))+rbp]
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	movdqa	XMMWORD[(160+80)+rbp],xmm9
+	paddd	xmm3,xmm7
+	pxor	xmm15,xmm3
+	pshufb	xmm15,XMMWORD[$L$rol16]
+	paddd	xmm11,xmm15
+	pxor	xmm7,xmm11
+	movdqa	xmm9,xmm7
+	pslld	xmm9,12
+	psrld	xmm7,20
+	pxor	xmm7,xmm9
+	paddd	xmm3,xmm7
+	pxor	xmm15,xmm3
+	pshufb	xmm15,XMMWORD[$L$rol8]
+	paddd	xmm11,xmm15
+	pxor	xmm7,xmm11
+	movdqa	xmm9,xmm7
+	pslld	xmm9,7
+	psrld	xmm7,25
+	pxor	xmm7,xmm9
+DB	102,15,58,15,255,4
+DB	102,69,15,58,15,219,8
+DB	102,69,15,58,15,255,12
+	movdqa	xmm9,XMMWORD[((160+80))+rbp]
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	movdqa	XMMWORD[(160+80)+rbp],xmm11
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm11,xmm4
+	pslld	xmm11,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm11
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm11,xmm4
+	pslld	xmm11,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm11
+DB	102,15,58,15,228,12
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,4
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm11,xmm5
+	pslld	xmm11,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm11
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm11,xmm5
+	pslld	xmm11,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm11
+DB	102,15,58,15,237,12
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,4
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol16]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm11,xmm6
+	pslld	xmm11,12
+	psrld	xmm6,20
+	pxor	xmm6,xmm11
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol8]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm11,xmm6
+	pslld	xmm11,7
+	psrld	xmm6,25
+	pxor	xmm6,xmm11
+DB	102,15,58,15,246,12
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,4
+	movdqa	xmm11,XMMWORD[((160+80))+rbp]
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	movdqa	XMMWORD[(160+80)+rbp],xmm9
+	paddd	xmm3,xmm7
+	pxor	xmm15,xmm3
+	pshufb	xmm15,XMMWORD[$L$rol16]
+	paddd	xmm11,xmm15
+	pxor	xmm7,xmm11
+	movdqa	xmm9,xmm7
+	pslld	xmm9,12
+	psrld	xmm7,20
+	pxor	xmm7,xmm9
+	paddd	xmm3,xmm7
+	pxor	xmm15,xmm3
+	pshufb	xmm15,XMMWORD[$L$rol8]
+	paddd	xmm11,xmm15
+	pxor	xmm7,xmm11
+	movdqa	xmm9,xmm7
+	pslld	xmm9,7
+	psrld	xmm7,25
+	pxor	xmm7,xmm9
+DB	102,15,58,15,255,12
+DB	102,69,15,58,15,219,8
+DB	102,69,15,58,15,255,4
+	movdqa	xmm9,XMMWORD[((160+80))+rbp]
+
+	add	r8,16
+	cmp	r8,10*16
+	jb	NEAR $L$open_sse_tail_256_rounds_and_x1hash
+
+	mov	rcx,rbx
+	and	rcx,-16
+$L$open_sse_tail_256_hash:
+	add	r10,QWORD[((0+0))+r8*1+rsi]
+	adc	r11,QWORD[((8+0))+r8*1+rsi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	add	r8,16
+	cmp	r8,rcx
+	jb	NEAR $L$open_sse_tail_256_hash
+	paddd	xmm3,XMMWORD[$L$chacha20_consts]
+	paddd	xmm7,XMMWORD[((160+48))+rbp]
+	paddd	xmm11,XMMWORD[((160+64))+rbp]
+	paddd	xmm15,XMMWORD[((160+144))+rbp]
+	paddd	xmm2,XMMWORD[$L$chacha20_consts]
+	paddd	xmm6,XMMWORD[((160+48))+rbp]
+	paddd	xmm10,XMMWORD[((160+64))+rbp]
+	paddd	xmm14,XMMWORD[((160+128))+rbp]
+	paddd	xmm1,XMMWORD[$L$chacha20_consts]
+	paddd	xmm5,XMMWORD[((160+48))+rbp]
+	paddd	xmm9,XMMWORD[((160+64))+rbp]
+	paddd	xmm13,XMMWORD[((160+112))+rbp]
+	paddd	xmm0,XMMWORD[$L$chacha20_consts]
+	paddd	xmm4,XMMWORD[((160+48))+rbp]
+	paddd	xmm8,XMMWORD[((160+64))+rbp]
+	paddd	xmm12,XMMWORD[((160+96))+rbp]
+	movdqa	XMMWORD[(160+80)+rbp],xmm12
+	movdqu	xmm12,XMMWORD[((0 + 0))+rsi]
+	pxor	xmm12,xmm3
+	movdqu	XMMWORD[(0 + 0)+rdi],xmm12
+	movdqu	xmm12,XMMWORD[((16 + 0))+rsi]
+	pxor	xmm12,xmm7
+	movdqu	XMMWORD[(16 + 0)+rdi],xmm12
+	movdqu	xmm12,XMMWORD[((32 + 0))+rsi]
+	pxor	xmm12,xmm11
+	movdqu	XMMWORD[(32 + 0)+rdi],xmm12
+	movdqu	xmm12,XMMWORD[((48 + 0))+rsi]
+	pxor	xmm12,xmm15
+	movdqu	XMMWORD[(48 + 0)+rdi],xmm12
+	movdqu	xmm3,XMMWORD[((0 + 64))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 64))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 64))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 64))+rsi]
+	pxor	xmm2,xmm3
+	pxor	xmm6,xmm7
+	pxor	xmm10,xmm11
+	pxor	xmm15,xmm14
+	movdqu	XMMWORD[(0 + 64)+rdi],xmm2
+	movdqu	XMMWORD[(16 + 64)+rdi],xmm6
+	movdqu	XMMWORD[(32 + 64)+rdi],xmm10
+	movdqu	XMMWORD[(48 + 64)+rdi],xmm15
+	movdqu	xmm3,XMMWORD[((0 + 128))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 128))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 128))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 128))+rsi]
+	pxor	xmm1,xmm3
+	pxor	xmm5,xmm7
+	pxor	xmm9,xmm11
+	pxor	xmm15,xmm13
+	movdqu	XMMWORD[(0 + 128)+rdi],xmm1
+	movdqu	XMMWORD[(16 + 128)+rdi],xmm5
+	movdqu	XMMWORD[(32 + 128)+rdi],xmm9
+	movdqu	XMMWORD[(48 + 128)+rdi],xmm15
+
+	movdqa	xmm12,XMMWORD[((160+80))+rbp]
+	sub	rbx,12*16
+	lea	rsi,[192+rsi]
+	lea	rdi,[192+rdi]
+
+
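+; Drain the remaining keystream 16 bytes at a time: xor against xmm0,
+; then rotate the leftover keystream down (xmm4 -> xmm0, xmm8 -> xmm4,
+; xmm12 -> xmm8) until fewer than 16 bytes are left.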
+$L$open_sse_tail_64_dec_loop:
+	cmp	rbx,16
+	jb	NEAR $L$open_sse_tail_16_init
+	sub	rbx,16
+	movdqu	xmm3,XMMWORD[rsi]
+	pxor	xmm0,xmm3
+	movdqu	XMMWORD[rdi],xmm0
+	lea	rsi,[16+rsi]
+	lea	rdi,[16+rdi]
+	movdqa	xmm0,xmm4
+	movdqa	xmm4,xmm8
+	movdqa	xmm8,xmm12
+	jmp	NEAR $L$open_sse_tail_64_dec_loop
+$L$open_sse_tail_16_init:
+	movdqa	xmm1,xmm0
+
+
+$L$open_sse_tail_16:
+	test	rbx,rbx
+	jz	NEAR $L$open_sse_finalize
+
+
+
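+; Last partial block (under 16 bytes): gather the trailing ciphertext
+; into xmm3 byte by byte (pslldq + pinsrb), pull out both 64-bit halves
+; for Poly1305 (the DB below is movq r13,xmm3, paired with pextrq), xor
+; with the saved keystream in xmm1, and copy the plaintext out byte by
+; byte before the hash update below absorbs r13:r14.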
+	pxor	xmm3,xmm3
+	lea	rsi,[((-1))+rbx*1+rsi]
+	mov	r8,rbx
+$L$open_sse_tail_16_compose:
+	pslldq	xmm3,1
+	pinsrb	xmm3,BYTE[rsi],0
+	sub	rsi,1
+	sub	r8,1
+	jnz	NEAR $L$open_sse_tail_16_compose
+
+DB	102,73,15,126,221
+	pextrq	r14,xmm3,1
+
+	pxor	xmm3,xmm1
+
+
+$L$open_sse_tail_16_extract:
+	pextrb	XMMWORD[rdi],xmm3,0
+	psrldq	xmm3,1
+	add	rdi,1
+	sub	rbx,1
+	jne	NEAR $L$open_sse_tail_16_extract
+
+	add	r10,r13
+	adc	r11,r14
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+
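+; Finalize: absorb the length block (ad_len | msg_len, saved at
+; rbp+160+32 in the prologue), then reduce. The sub/sbb with -5, -1, 3
+; computes h + 5 - 2^130; cmovc keeps the unreduced h when that borrows,
+; so the result is h mod 2^130-5. Adding the s half of the key
+; (rbp+160+16) yields the tag, stored through the pointer popped into r9.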
+$L$open_sse_finalize:
+	add	r10,QWORD[((0+160+32))+rbp]
+	adc	r11,QWORD[((8+160+32))+rbp]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+
+	mov	r13,r10
+	mov	r14,r11
+	mov	r15,r12
+	sub	r10,-5
+	sbb	r11,-1
+	sbb	r12,3
+	cmovc	r10,r13
+	cmovc	r11,r14
+	cmovc	r12,r15
+
+	add	r10,QWORD[((0+160+16))+rbp]
+	adc	r11,QWORD[((8+160+16))+rbp]
+
+	movaps	xmm6,XMMWORD[((0+0))+rbp]
+	movaps	xmm7,XMMWORD[((16+0))+rbp]
+	movaps	xmm8,XMMWORD[((32+0))+rbp]
+	movaps	xmm9,XMMWORD[((48+0))+rbp]
+	movaps	xmm10,XMMWORD[((64+0))+rbp]
+	movaps	xmm11,XMMWORD[((80+0))+rbp]
+	movaps	xmm12,XMMWORD[((96+0))+rbp]
+	movaps	xmm13,XMMWORD[((112+0))+rbp]
+	movaps	xmm14,XMMWORD[((128+0))+rbp]
+	movaps	xmm15,XMMWORD[((144+0))+rbp]
+
+
+	add	rsp,288 + 160 + 32
+
+
+	pop	r9
+
+	mov	QWORD[r9],r10
+	mov	QWORD[8+r9],r11
+	pop	r15
+
+	pop	r14
+
+	pop	r13
+
+	pop	r12
+
+	pop	rbx
+
+	pop	rbp
+
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
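+; Short-input open path (at most 128 bytes): three ChaCha20 blocks live
+; entirely in registers. Block 0 is masked with $L$clamp to become the
+; Poly1305 (r,s) key; blocks 1 and 2 supply the keystream. The
+; "mov r8,r8" below appears to be a no-op from register aliasing in the
+; perlasm source: the ad length is already where poly_hash_ad_internal
+; expects it.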
+$L$open_sse_128:
+
+	movdqu	xmm0,XMMWORD[$L$chacha20_consts]
+	movdqa	xmm1,xmm0
+	movdqa	xmm2,xmm0
+	movdqu	xmm4,XMMWORD[r9]
+	movdqa	xmm5,xmm4
+	movdqa	xmm6,xmm4
+	movdqu	xmm8,XMMWORD[16+r9]
+	movdqa	xmm9,xmm8
+	movdqa	xmm10,xmm8
+	movdqu	xmm12,XMMWORD[32+r9]
+	movdqa	xmm13,xmm12
+	paddd	xmm13,XMMWORD[$L$sse_inc]
+	movdqa	xmm14,xmm13
+	paddd	xmm14,XMMWORD[$L$sse_inc]
+	movdqa	xmm7,xmm4
+	movdqa	xmm11,xmm8
+	movdqa	xmm15,xmm13
+	mov	r10,10
+
+$L$open_sse_128_rounds:
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,4
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,12
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm3
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm3
+DB	102,15,58,15,237,4
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,12
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol16]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,12
+	psrld	xmm6,20
+	pxor	xmm6,xmm3
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol8]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,7
+	psrld	xmm6,25
+	pxor	xmm6,xmm3
+DB	102,15,58,15,246,4
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,12
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,12
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,4
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm3
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm3
+DB	102,15,58,15,237,12
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,4
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol16]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,12
+	psrld	xmm6,20
+	pxor	xmm6,xmm3
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol8]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,7
+	psrld	xmm6,25
+	pxor	xmm6,xmm3
+DB	102,15,58,15,246,12
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,4
+
+	dec	r10
+	jnz	NEAR $L$open_sse_128_rounds
+	paddd	xmm0,XMMWORD[$L$chacha20_consts]
+	paddd	xmm1,XMMWORD[$L$chacha20_consts]
+	paddd	xmm2,XMMWORD[$L$chacha20_consts]
+	paddd	xmm4,xmm7
+	paddd	xmm5,xmm7
+	paddd	xmm6,xmm7
+	paddd	xmm9,xmm11
+	paddd	xmm10,xmm11
+	paddd	xmm13,xmm15
+	paddd	xmm15,XMMWORD[$L$sse_inc]
+	paddd	xmm14,xmm15
+
+	pand	xmm0,XMMWORD[$L$clamp]
+	movdqa	XMMWORD[(160+0)+rbp],xmm0
+	movdqa	XMMWORD[(160+16)+rbp],xmm4
+
+	mov	r8,r8
+	call	poly_hash_ad_internal
+$L$open_sse_128_xor_hash:
+	cmp	rbx,16
+	jb	NEAR $L$open_sse_tail_16
+	sub	rbx,16
+	add	r10,QWORD[((0+0))+rsi]
+	adc	r11,QWORD[((8+0))+rsi]
+	adc	r12,1
+
+
+	movdqu	xmm3,XMMWORD[rsi]
+	pxor	xmm1,xmm3
+	movdqu	XMMWORD[rdi],xmm1
+	lea	rsi,[16+rsi]
+	lea	rdi,[16+rdi]
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+
+	movdqa	xmm1,xmm5
+	movdqa	xmm5,xmm9
+	movdqa	xmm9,xmm13
+	movdqa	xmm13,xmm2
+	movdqa	xmm2,xmm6
+	movdqa	xmm6,xmm10
+	movdqa	xmm10,xmm14
+	jmp	NEAR $L$open_sse_128_xor_hash
+$L$SEH_end_chacha20_poly1305_open:
+
+
+
+
+
+
+
+
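+; Seal entry point. The Win64 prologue moves the Microsoft-ABI arguments
+; into the System V registers the body uses. The length block is staged
+; up front: r8 (ad_len) and rbx, which appears to be in_len plus the
+; extra_in length read from the params block at 56+r9. The ia32cap test
+; (mask 288, i.e. the AVX2 and BMI2 bits of capability word 2) dispatches
+; to chacha20_poly1305_seal_avx2 when both features are available.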
+global	chacha20_poly1305_seal
+
+ALIGN	64
+chacha20_poly1305_seal:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_chacha20_poly1305_seal:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+_CET_ENDBR
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+
+
+	push	r9
+
+	sub	rsp,288 + 160 + 32
+
+	lea	rbp,[32+rsp]
+	and	rbp,-32
+
+	movaps	XMMWORD[(0+0)+rbp],xmm6
+	movaps	XMMWORD[(16+0)+rbp],xmm7
+	movaps	XMMWORD[(32+0)+rbp],xmm8
+	movaps	XMMWORD[(48+0)+rbp],xmm9
+	movaps	XMMWORD[(64+0)+rbp],xmm10
+	movaps	XMMWORD[(80+0)+rbp],xmm11
+	movaps	XMMWORD[(96+0)+rbp],xmm12
+	movaps	XMMWORD[(112+0)+rbp],xmm13
+	movaps	XMMWORD[(128+0)+rbp],xmm14
+	movaps	XMMWORD[(144+0)+rbp],xmm15
+
+	mov	rbx,QWORD[56+r9]
+	add	rbx,rdx
+	mov	QWORD[((0+160+32))+rbp],r8
+	mov	QWORD[((8+160+32))+rbp],rbx
+	mov	rbx,rdx
+
+	mov	eax,DWORD[((OPENSSL_ia32cap_P+8))]
+	and	eax,288
+	xor	eax,288
+	jz	NEAR chacha20_poly1305_seal_avx2
+
+	cmp	rbx,128
+	jbe	NEAR $L$seal_sse_128
+
+	movdqa	xmm0,XMMWORD[$L$chacha20_consts]
+	movdqu	xmm4,XMMWORD[r9]
+	movdqu	xmm8,XMMWORD[16+r9]
+	movdqu	xmm12,XMMWORD[32+r9]
+
+	movdqa	xmm1,xmm0
+	movdqa	xmm2,xmm0
+	movdqa	xmm3,xmm0
+	movdqa	xmm5,xmm4
+	movdqa	xmm6,xmm4
+	movdqa	xmm7,xmm4
+	movdqa	xmm9,xmm8
+	movdqa	xmm10,xmm8
+	movdqa	xmm11,xmm8
+	movdqa	xmm15,xmm12
+	paddd	xmm12,XMMWORD[$L$sse_inc]
+	movdqa	xmm14,xmm12
+	paddd	xmm12,XMMWORD[$L$sse_inc]
+	movdqa	xmm13,xmm12
+	paddd	xmm12,XMMWORD[$L$sse_inc]
+
+	movdqa	XMMWORD[(160+48)+rbp],xmm4
+	movdqa	XMMWORD[(160+64)+rbp],xmm8
+	movdqa	XMMWORD[(160+96)+rbp],xmm12
+	movdqa	XMMWORD[(160+112)+rbp],xmm13
+	movdqa	XMMWORD[(160+128)+rbp],xmm14
+	movdqa	XMMWORD[(160+144)+rbp],xmm15
+	mov	r10,10
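+; Four-way vertical rounds: each pass of the loop below runs one ChaCha20
+; double round across all four block states; ten passes give 20 rounds.
+; xmm8 is spilled to rbp+160+80 so it can hold the rol16/rol8 masks, and
+; the DB groups here are pshufb encodings, e.g.
+;   DB 102,69,15,56,0,248 -> pshufb xmm15,xmm8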
+$L$seal_sse_init_rounds:
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,XMMWORD[$L$rol16]
+	paddd	xmm3,xmm7
+	paddd	xmm2,xmm6
+	paddd	xmm1,xmm5
+	paddd	xmm0,xmm4
+	pxor	xmm15,xmm3
+	pxor	xmm14,xmm2
+	pxor	xmm13,xmm1
+	pxor	xmm12,xmm0
+DB	102,69,15,56,0,248
+DB	102,69,15,56,0,240
+DB	102,69,15,56,0,232
+DB	102,69,15,56,0,224
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	paddd	xmm11,xmm15
+	paddd	xmm10,xmm14
+	paddd	xmm9,xmm13
+	paddd	xmm8,xmm12
+	pxor	xmm7,xmm11
+	pxor	xmm6,xmm10
+	pxor	xmm5,xmm9
+	pxor	xmm4,xmm8
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,xmm7
+	psrld	xmm8,20
+	pslld	xmm7,32-20
+	pxor	xmm7,xmm8
+	movdqa	xmm8,xmm6
+	psrld	xmm8,20
+	pslld	xmm6,32-20
+	pxor	xmm6,xmm8
+	movdqa	xmm8,xmm5
+	psrld	xmm8,20
+	pslld	xmm5,32-20
+	pxor	xmm5,xmm8
+	movdqa	xmm8,xmm4
+	psrld	xmm8,20
+	pslld	xmm4,32-20
+	pxor	xmm4,xmm8
+	movdqa	xmm8,XMMWORD[$L$rol8]
+	paddd	xmm3,xmm7
+	paddd	xmm2,xmm6
+	paddd	xmm1,xmm5
+	paddd	xmm0,xmm4
+	pxor	xmm15,xmm3
+	pxor	xmm14,xmm2
+	pxor	xmm13,xmm1
+	pxor	xmm12,xmm0
+DB	102,69,15,56,0,248
+DB	102,69,15,56,0,240
+DB	102,69,15,56,0,232
+DB	102,69,15,56,0,224
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	paddd	xmm11,xmm15
+	paddd	xmm10,xmm14
+	paddd	xmm9,xmm13
+	paddd	xmm8,xmm12
+	pxor	xmm7,xmm11
+	pxor	xmm6,xmm10
+	pxor	xmm5,xmm9
+	pxor	xmm4,xmm8
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,xmm7
+	psrld	xmm8,25
+	pslld	xmm7,32-25
+	pxor	xmm7,xmm8
+	movdqa	xmm8,xmm6
+	psrld	xmm8,25
+	pslld	xmm6,32-25
+	pxor	xmm6,xmm8
+	movdqa	xmm8,xmm5
+	psrld	xmm8,25
+	pslld	xmm5,32-25
+	pxor	xmm5,xmm8
+	movdqa	xmm8,xmm4
+	psrld	xmm8,25
+	pslld	xmm4,32-25
+	pxor	xmm4,xmm8
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+DB	102,15,58,15,255,4
+DB	102,69,15,58,15,219,8
+DB	102,69,15,58,15,255,12
+DB	102,15,58,15,246,4
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,12
+DB	102,15,58,15,237,4
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,12
+DB	102,15,58,15,228,4
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,12
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,XMMWORD[$L$rol16]
+	paddd	xmm3,xmm7
+	paddd	xmm2,xmm6
+	paddd	xmm1,xmm5
+	paddd	xmm0,xmm4
+	pxor	xmm15,xmm3
+	pxor	xmm14,xmm2
+	pxor	xmm13,xmm1
+	pxor	xmm12,xmm0
+DB	102,69,15,56,0,248
+DB	102,69,15,56,0,240
+DB	102,69,15,56,0,232
+DB	102,69,15,56,0,224
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	paddd	xmm11,xmm15
+	paddd	xmm10,xmm14
+	paddd	xmm9,xmm13
+	paddd	xmm8,xmm12
+	pxor	xmm7,xmm11
+	pxor	xmm6,xmm10
+	pxor	xmm5,xmm9
+	pxor	xmm4,xmm8
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,xmm7
+	psrld	xmm8,20
+	pslld	xmm7,32-20
+	pxor	xmm7,xmm8
+	movdqa	xmm8,xmm6
+	psrld	xmm8,20
+	pslld	xmm6,32-20
+	pxor	xmm6,xmm8
+	movdqa	xmm8,xmm5
+	psrld	xmm8,20
+	pslld	xmm5,32-20
+	pxor	xmm5,xmm8
+	movdqa	xmm8,xmm4
+	psrld	xmm8,20
+	pslld	xmm4,32-20
+	pxor	xmm4,xmm8
+	movdqa	xmm8,XMMWORD[$L$rol8]
+	paddd	xmm3,xmm7
+	paddd	xmm2,xmm6
+	paddd	xmm1,xmm5
+	paddd	xmm0,xmm4
+	pxor	xmm15,xmm3
+	pxor	xmm14,xmm2
+	pxor	xmm13,xmm1
+	pxor	xmm12,xmm0
+DB	102,69,15,56,0,248
+DB	102,69,15,56,0,240
+DB	102,69,15,56,0,232
+DB	102,69,15,56,0,224
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	paddd	xmm11,xmm15
+	paddd	xmm10,xmm14
+	paddd	xmm9,xmm13
+	paddd	xmm8,xmm12
+	pxor	xmm7,xmm11
+	pxor	xmm6,xmm10
+	pxor	xmm5,xmm9
+	pxor	xmm4,xmm8
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,xmm7
+	psrld	xmm8,25
+	pslld	xmm7,32-25
+	pxor	xmm7,xmm8
+	movdqa	xmm8,xmm6
+	psrld	xmm8,25
+	pslld	xmm6,32-25
+	pxor	xmm6,xmm8
+	movdqa	xmm8,xmm5
+	psrld	xmm8,25
+	pslld	xmm5,32-25
+	pxor	xmm5,xmm8
+	movdqa	xmm8,xmm4
+	psrld	xmm8,25
+	pslld	xmm4,32-25
+	pxor	xmm4,xmm8
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+DB	102,15,58,15,255,12
+DB	102,69,15,58,15,219,8
+DB	102,69,15,58,15,255,4
+DB	102,15,58,15,246,12
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,4
+DB	102,15,58,15,237,12
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,4
+DB	102,15,58,15,228,12
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,4
+
+	dec	r10
+	jnz	NEAR $L$seal_sse_init_rounds
+	paddd	xmm3,XMMWORD[$L$chacha20_consts]
+	paddd	xmm7,XMMWORD[((160+48))+rbp]
+	paddd	xmm11,XMMWORD[((160+64))+rbp]
+	paddd	xmm15,XMMWORD[((160+144))+rbp]
+	paddd	xmm2,XMMWORD[$L$chacha20_consts]
+	paddd	xmm6,XMMWORD[((160+48))+rbp]
+	paddd	xmm10,XMMWORD[((160+64))+rbp]
+	paddd	xmm14,XMMWORD[((160+128))+rbp]
+	paddd	xmm1,XMMWORD[$L$chacha20_consts]
+	paddd	xmm5,XMMWORD[((160+48))+rbp]
+	paddd	xmm9,XMMWORD[((160+64))+rbp]
+	paddd	xmm13,XMMWORD[((160+112))+rbp]
+	paddd	xmm0,XMMWORD[$L$chacha20_consts]
+	paddd	xmm4,XMMWORD[((160+48))+rbp]
+	paddd	xmm8,XMMWORD[((160+64))+rbp]
+	paddd	xmm12,XMMWORD[((160+96))+rbp]
+
+
+	pand	xmm3,XMMWORD[$L$clamp]
+	movdqa	XMMWORD[(160+0)+rbp],xmm3
+	movdqa	XMMWORD[(160+16)+rbp],xmm7
+
+	mov	r8,r8
+	call	poly_hash_ad_internal
+	movdqu	xmm3,XMMWORD[((0 + 0))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 0))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 0))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 0))+rsi]
+	pxor	xmm2,xmm3
+	pxor	xmm6,xmm7
+	pxor	xmm10,xmm11
+	pxor	xmm15,xmm14
+	movdqu	XMMWORD[(0 + 0)+rdi],xmm2
+	movdqu	XMMWORD[(16 + 0)+rdi],xmm6
+	movdqu	XMMWORD[(32 + 0)+rdi],xmm10
+	movdqu	XMMWORD[(48 + 0)+rdi],xmm15
+	movdqu	xmm3,XMMWORD[((0 + 64))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 64))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 64))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 64))+rsi]
+	pxor	xmm1,xmm3
+	pxor	xmm5,xmm7
+	pxor	xmm9,xmm11
+	pxor	xmm15,xmm13
+	movdqu	XMMWORD[(0 + 64)+rdi],xmm1
+	movdqu	XMMWORD[(16 + 64)+rdi],xmm5
+	movdqu	XMMWORD[(32 + 64)+rdi],xmm9
+	movdqu	XMMWORD[(48 + 64)+rdi],xmm15
+
+	cmp	rbx,12*16
+	ja	NEAR $L$seal_sse_main_init
+	mov	rcx,8*16
+	sub	rbx,8*16
+	lea	rsi,[128+rsi]
+	jmp	NEAR $L$seal_sse_128_tail_hash
+$L$seal_sse_main_init:
+	movdqu	xmm3,XMMWORD[((0 + 128))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 128))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 128))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 128))+rsi]
+	pxor	xmm0,xmm3
+	pxor	xmm4,xmm7
+	pxor	xmm8,xmm11
+	pxor	xmm15,xmm12
+	movdqu	XMMWORD[(0 + 128)+rdi],xmm0
+	movdqu	XMMWORD[(16 + 128)+rdi],xmm4
+	movdqu	XMMWORD[(32 + 128)+rdi],xmm8
+	movdqu	XMMWORD[(48 + 128)+rdi],xmm15
+
+	mov	rcx,12*16
+	sub	rbx,12*16
+	lea	rsi,[192+rsi]
+	mov	rcx,2
+	mov	r8,8
+	cmp	rbx,4*16
+	jbe	NEAR $L$seal_sse_tail_64
+	cmp	rbx,8*16
+	jbe	NEAR $L$seal_sse_tail_128
+	cmp	rbx,12*16
+	jbe	NEAR $L$seal_sse_tail_192
+
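+; Steady-state seal loop: each pass generates four 64-byte keystream
+; blocks while hashing ciphertext written on an earlier pass. Note the
+; Poly1305 loads here read from rdi (the output pointer), unlike the open
+; path, which hashes rsi: sealing authenticates the ciphertext it emits.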
+$L$seal_sse_main_loop:
+	movdqa	xmm0,XMMWORD[$L$chacha20_consts]
+	movdqa	xmm4,XMMWORD[((160+48))+rbp]
+	movdqa	xmm8,XMMWORD[((160+64))+rbp]
+	movdqa	xmm1,xmm0
+	movdqa	xmm5,xmm4
+	movdqa	xmm9,xmm8
+	movdqa	xmm2,xmm0
+	movdqa	xmm6,xmm4
+	movdqa	xmm10,xmm8
+	movdqa	xmm3,xmm0
+	movdqa	xmm7,xmm4
+	movdqa	xmm11,xmm8
+	movdqa	xmm15,XMMWORD[((160+96))+rbp]
+	paddd	xmm15,XMMWORD[$L$sse_inc]
+	movdqa	xmm14,xmm15
+	paddd	xmm14,XMMWORD[$L$sse_inc]
+	movdqa	xmm13,xmm14
+	paddd	xmm13,XMMWORD[$L$sse_inc]
+	movdqa	xmm12,xmm13
+	paddd	xmm12,XMMWORD[$L$sse_inc]
+	movdqa	XMMWORD[(160+96)+rbp],xmm12
+	movdqa	XMMWORD[(160+112)+rbp],xmm13
+	movdqa	XMMWORD[(160+128)+rbp],xmm14
+	movdqa	XMMWORD[(160+144)+rbp],xmm15
+
+ALIGN	32
+$L$seal_sse_main_rounds:
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,XMMWORD[$L$rol16]
+	paddd	xmm3,xmm7
+	paddd	xmm2,xmm6
+	paddd	xmm1,xmm5
+	paddd	xmm0,xmm4
+	pxor	xmm15,xmm3
+	pxor	xmm14,xmm2
+	pxor	xmm13,xmm1
+	pxor	xmm12,xmm0
+DB	102,69,15,56,0,248
+DB	102,69,15,56,0,240
+DB	102,69,15,56,0,232
+DB	102,69,15,56,0,224
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	paddd	xmm11,xmm15
+	paddd	xmm10,xmm14
+	paddd	xmm9,xmm13
+	paddd	xmm8,xmm12
+	pxor	xmm7,xmm11
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	pxor	xmm6,xmm10
+	pxor	xmm5,xmm9
+	pxor	xmm4,xmm8
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,xmm7
+	psrld	xmm8,20
+	pslld	xmm7,32-20
+	pxor	xmm7,xmm8
+	movdqa	xmm8,xmm6
+	psrld	xmm8,20
+	pslld	xmm6,32-20
+	pxor	xmm6,xmm8
+	movdqa	xmm8,xmm5
+	psrld	xmm8,20
+	pslld	xmm5,32-20
+	pxor	xmm5,xmm8
+	movdqa	xmm8,xmm4
+	psrld	xmm8,20
+	pslld	xmm4,32-20
+	pxor	xmm4,xmm8
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	movdqa	xmm8,XMMWORD[$L$rol8]
+	paddd	xmm3,xmm7
+	paddd	xmm2,xmm6
+	paddd	xmm1,xmm5
+	paddd	xmm0,xmm4
+	pxor	xmm15,xmm3
+	pxor	xmm14,xmm2
+	pxor	xmm13,xmm1
+	pxor	xmm12,xmm0
+DB	102,69,15,56,0,248
+DB	102,69,15,56,0,240
+DB	102,69,15,56,0,232
+DB	102,69,15,56,0,224
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	paddd	xmm11,xmm15
+	paddd	xmm10,xmm14
+	paddd	xmm9,xmm13
+	paddd	xmm8,xmm12
+	pxor	xmm7,xmm11
+	pxor	xmm6,xmm10
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	pxor	xmm5,xmm9
+	pxor	xmm4,xmm8
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,xmm7
+	psrld	xmm8,25
+	pslld	xmm7,32-25
+	pxor	xmm7,xmm8
+	movdqa	xmm8,xmm6
+	psrld	xmm8,25
+	pslld	xmm6,32-25
+	pxor	xmm6,xmm8
+	movdqa	xmm8,xmm5
+	psrld	xmm8,25
+	pslld	xmm5,32-25
+	pxor	xmm5,xmm8
+	movdqa	xmm8,xmm4
+	psrld	xmm8,25
+	pslld	xmm4,32-25
+	pxor	xmm4,xmm8
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+DB	102,15,58,15,255,4
+DB	102,69,15,58,15,219,8
+DB	102,69,15,58,15,255,12
+DB	102,15,58,15,246,4
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,12
+DB	102,15,58,15,237,4
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,12
+DB	102,15,58,15,228,4
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,12
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,XMMWORD[$L$rol16]
+	paddd	xmm3,xmm7
+	paddd	xmm2,xmm6
+	paddd	xmm1,xmm5
+	paddd	xmm0,xmm4
+	pxor	xmm15,xmm3
+	pxor	xmm14,xmm2
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	pxor	xmm13,xmm1
+	pxor	xmm12,xmm0
+DB	102,69,15,56,0,248
+DB	102,69,15,56,0,240
+DB	102,69,15,56,0,232
+DB	102,69,15,56,0,224
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	paddd	xmm11,xmm15
+	paddd	xmm10,xmm14
+	paddd	xmm9,xmm13
+	paddd	xmm8,xmm12
+	pxor	xmm7,xmm11
+	pxor	xmm6,xmm10
+	pxor	xmm5,xmm9
+	pxor	xmm4,xmm8
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,xmm7
+	psrld	xmm8,20
+	pslld	xmm7,32-20
+	pxor	xmm7,xmm8
+	movdqa	xmm8,xmm6
+	psrld	xmm8,20
+	pslld	xmm6,32-20
+	pxor	xmm6,xmm8
+	movdqa	xmm8,xmm5
+	psrld	xmm8,20
+	pslld	xmm5,32-20
+	pxor	xmm5,xmm8
+	movdqa	xmm8,xmm4
+	psrld	xmm8,20
+	pslld	xmm4,32-20
+	pxor	xmm4,xmm8
+	movdqa	xmm8,XMMWORD[$L$rol8]
+	paddd	xmm3,xmm7
+	paddd	xmm2,xmm6
+	paddd	xmm1,xmm5
+	paddd	xmm0,xmm4
+	pxor	xmm15,xmm3
+	pxor	xmm14,xmm2
+	pxor	xmm13,xmm1
+	pxor	xmm12,xmm0
+DB	102,69,15,56,0,248
+DB	102,69,15,56,0,240
+DB	102,69,15,56,0,232
+DB	102,69,15,56,0,224
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	paddd	xmm11,xmm15
+	paddd	xmm10,xmm14
+	paddd	xmm9,xmm13
+	paddd	xmm8,xmm12
+	pxor	xmm7,xmm11
+	pxor	xmm6,xmm10
+	pxor	xmm5,xmm9
+	pxor	xmm4,xmm8
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,xmm7
+	psrld	xmm8,25
+	pslld	xmm7,32-25
+	pxor	xmm7,xmm8
+	movdqa	xmm8,xmm6
+	psrld	xmm8,25
+	pslld	xmm6,32-25
+	pxor	xmm6,xmm8
+	movdqa	xmm8,xmm5
+	psrld	xmm8,25
+	pslld	xmm5,32-25
+	pxor	xmm5,xmm8
+	movdqa	xmm8,xmm4
+	psrld	xmm8,25
+	pslld	xmm4,32-25
+	pxor	xmm4,xmm8
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+DB	102,15,58,15,255,12
+DB	102,69,15,58,15,219,8
+DB	102,69,15,58,15,255,4
+DB	102,15,58,15,246,12
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,4
+DB	102,15,58,15,237,12
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,4
+DB	102,15,58,15,228,12
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,4
+
+	lea	rdi,[16+rdi]
+	dec	r8
+	jge	NEAR $L$seal_sse_main_rounds
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[16+rdi]
+	dec	rcx
+	jg	NEAR $L$seal_sse_main_rounds
+	paddd	xmm3,XMMWORD[$L$chacha20_consts]
+	paddd	xmm7,XMMWORD[((160+48))+rbp]
+	paddd	xmm11,XMMWORD[((160+64))+rbp]
+	paddd	xmm15,XMMWORD[((160+144))+rbp]
+	paddd	xmm2,XMMWORD[$L$chacha20_consts]
+	paddd	xmm6,XMMWORD[((160+48))+rbp]
+	paddd	xmm10,XMMWORD[((160+64))+rbp]
+	paddd	xmm14,XMMWORD[((160+128))+rbp]
+	paddd	xmm1,XMMWORD[$L$chacha20_consts]
+	paddd	xmm5,XMMWORD[((160+48))+rbp]
+	paddd	xmm9,XMMWORD[((160+64))+rbp]
+	paddd	xmm13,XMMWORD[((160+112))+rbp]
+	paddd	xmm0,XMMWORD[$L$chacha20_consts]
+	paddd	xmm4,XMMWORD[((160+48))+rbp]
+	paddd	xmm8,XMMWORD[((160+64))+rbp]
+	paddd	xmm12,XMMWORD[((160+96))+rbp]
+
+	movdqa	XMMWORD[(160+80)+rbp],xmm14
+	movdqu	xmm14,XMMWORD[((0 + 0))+rsi]
+	pxor	xmm14,xmm3
+	movdqu	XMMWORD[(0 + 0)+rdi],xmm14
+	movdqu	xmm14,XMMWORD[((16 + 0))+rsi]
+	pxor	xmm14,xmm7
+	movdqu	XMMWORD[(16 + 0)+rdi],xmm14
+	movdqu	xmm14,XMMWORD[((32 + 0))+rsi]
+	pxor	xmm14,xmm11
+	movdqu	XMMWORD[(32 + 0)+rdi],xmm14
+	movdqu	xmm14,XMMWORD[((48 + 0))+rsi]
+	pxor	xmm14,xmm15
+	movdqu	XMMWORD[(48 + 0)+rdi],xmm14
+
+	movdqa	xmm14,XMMWORD[((160+80))+rbp]
+	movdqu	xmm3,XMMWORD[((0 + 64))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 64))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 64))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 64))+rsi]
+	pxor	xmm2,xmm3
+	pxor	xmm6,xmm7
+	pxor	xmm10,xmm11
+	pxor	xmm15,xmm14
+	movdqu	XMMWORD[(0 + 64)+rdi],xmm2
+	movdqu	XMMWORD[(16 + 64)+rdi],xmm6
+	movdqu	XMMWORD[(32 + 64)+rdi],xmm10
+	movdqu	XMMWORD[(48 + 64)+rdi],xmm15
+	movdqu	xmm3,XMMWORD[((0 + 128))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 128))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 128))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 128))+rsi]
+	pxor	xmm1,xmm3
+	pxor	xmm5,xmm7
+	pxor	xmm9,xmm11
+	pxor	xmm15,xmm13
+	movdqu	XMMWORD[(0 + 128)+rdi],xmm1
+	movdqu	XMMWORD[(16 + 128)+rdi],xmm5
+	movdqu	XMMWORD[(32 + 128)+rdi],xmm9
+	movdqu	XMMWORD[(48 + 128)+rdi],xmm15
+
+	cmp	rbx,16*16
+	ja	NEAR $L$seal_sse_main_loop_xor
+
+	mov	rcx,12*16
+	sub	rbx,12*16
+	lea	rsi,[192+rsi]
+	jmp	NEAR $L$seal_sse_128_tail_hash
+$L$seal_sse_main_loop_xor:
+	movdqu	xmm3,XMMWORD[((0 + 192))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 192))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 192))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 192))+rsi]
+	pxor	xmm0,xmm3
+	pxor	xmm4,xmm7
+	pxor	xmm8,xmm11
+	pxor	xmm15,xmm12
+	movdqu	XMMWORD[(0 + 192)+rdi],xmm0
+	movdqu	XMMWORD[(16 + 192)+rdi],xmm4
+	movdqu	XMMWORD[(32 + 192)+rdi],xmm8
+	movdqu	XMMWORD[(48 + 192)+rdi],xmm15
+
+	lea	rsi,[256+rsi]
+	sub	rbx,16*16
+	mov	rcx,6
+	mov	r8,4
+	cmp	rbx,12*16
+	jg	NEAR $L$seal_sse_main_loop
+	mov	rcx,rbx
+	test	rbx,rbx
+	je	NEAR $L$seal_sse_128_tail_hash
+	mov	rcx,6
+	cmp	rbx,8*16
+	ja	NEAR $L$seal_sse_tail_192
+	cmp	rbx,4*16
+	ja	NEAR $L$seal_sse_tail_128
+
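+; The seal tails mirror the open ones, except that rcx and r8 (set on
+; entry) schedule extra hash iterations so already-written ciphertext is
+; fully absorbed while the final keystream blocks are computed.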
+$L$seal_sse_tail_64:
+	movdqa	xmm0,XMMWORD[$L$chacha20_consts]
+	movdqa	xmm4,XMMWORD[((160+48))+rbp]
+	movdqa	xmm8,XMMWORD[((160+64))+rbp]
+	movdqa	xmm12,XMMWORD[((160+96))+rbp]
+	paddd	xmm12,XMMWORD[$L$sse_inc]
+	movdqa	XMMWORD[(160+96)+rbp],xmm12
+
+$L$seal_sse_tail_64_rounds_and_x2hash:
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[16+rdi]
+$L$seal_sse_tail_64_rounds_and_x1hash:
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,4
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,12
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,12
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,4
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[16+rdi]
+	dec	rcx
+	jg	NEAR $L$seal_sse_tail_64_rounds_and_x2hash
+	dec	r8
+	jge	NEAR $L$seal_sse_tail_64_rounds_and_x1hash
+	paddd	xmm0,XMMWORD[$L$chacha20_consts]
+	paddd	xmm4,XMMWORD[((160+48))+rbp]
+	paddd	xmm8,XMMWORD[((160+64))+rbp]
+	paddd	xmm12,XMMWORD[((160+96))+rbp]
+
+	jmp	NEAR $L$seal_sse_128_tail_xor
+
+$L$seal_sse_tail_128:
+	movdqa	xmm0,XMMWORD[$L$chacha20_consts]
+	movdqa	xmm4,XMMWORD[((160+48))+rbp]
+	movdqa	xmm8,XMMWORD[((160+64))+rbp]
+	movdqa	xmm1,xmm0
+	movdqa	xmm5,xmm4
+	movdqa	xmm9,xmm8
+	movdqa	xmm13,XMMWORD[((160+96))+rbp]
+	paddd	xmm13,XMMWORD[$L$sse_inc]
+	movdqa	xmm12,xmm13
+	paddd	xmm12,XMMWORD[$L$sse_inc]
+	movdqa	XMMWORD[(160+96)+rbp],xmm12
+	movdqa	XMMWORD[(160+112)+rbp],xmm13
+
+$L$seal_sse_tail_128_rounds_and_x2hash:
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[16+rdi]
+$L$seal_sse_tail_128_rounds_and_x1hash:
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,4
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,12
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm3
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm3
+DB	102,15,58,15,237,4
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,12
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,12
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,4
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm3
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm3
+DB	102,15,58,15,237,12
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,4
+
+	lea	rdi,[16+rdi]
+	dec	rcx
+	jg	NEAR $L$seal_sse_tail_128_rounds_and_x2hash
+	dec	r8
+	jge	NEAR $L$seal_sse_tail_128_rounds_and_x1hash
+	paddd	xmm1,XMMWORD[$L$chacha20_consts]
+	paddd	xmm5,XMMWORD[((160+48))+rbp]
+	paddd	xmm9,XMMWORD[((160+64))+rbp]
+	paddd	xmm13,XMMWORD[((160+112))+rbp]
+	paddd	xmm0,XMMWORD[$L$chacha20_consts]
+	paddd	xmm4,XMMWORD[((160+48))+rbp]
+	paddd	xmm8,XMMWORD[((160+64))+rbp]
+	paddd	xmm12,XMMWORD[((160+96))+rbp]
+	movdqu	xmm3,XMMWORD[((0 + 0))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 0))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 0))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 0))+rsi]
+	pxor	xmm1,xmm3
+	pxor	xmm5,xmm7
+	pxor	xmm9,xmm11
+	pxor	xmm15,xmm13
+	movdqu	XMMWORD[(0 + 0)+rdi],xmm1
+	movdqu	XMMWORD[(16 + 0)+rdi],xmm5
+	movdqu	XMMWORD[(32 + 0)+rdi],xmm9
+	movdqu	XMMWORD[(48 + 0)+rdi],xmm15
+
+	mov	rcx,4*16
+	sub	rbx,4*16
+	lea	rsi,[64+rsi]
+	jmp	NEAR $L$seal_sse_128_tail_hash
+
+$L$seal_sse_tail_192:
+	movdqa	xmm0,XMMWORD[$L$chacha20_consts]
+	movdqa	xmm4,XMMWORD[((160+48))+rbp]
+	movdqa	xmm8,XMMWORD[((160+64))+rbp]
+	movdqa	xmm1,xmm0
+	movdqa	xmm5,xmm4
+	movdqa	xmm9,xmm8
+	movdqa	xmm2,xmm0
+	movdqa	xmm6,xmm4
+	movdqa	xmm10,xmm8
+	movdqa	xmm14,XMMWORD[((160+96))+rbp]
+	paddd	xmm14,XMMWORD[$L$sse_inc]
+	movdqa	xmm13,xmm14
+	paddd	xmm13,XMMWORD[$L$sse_inc]
+	movdqa	xmm12,xmm13
+	paddd	xmm12,XMMWORD[$L$sse_inc]
+	movdqa	XMMWORD[(160+96)+rbp],xmm12
+	movdqa	XMMWORD[(160+112)+rbp],xmm13
+	movdqa	XMMWORD[(160+128)+rbp],xmm14
+
+$L$seal_sse_tail_192_rounds_and_x2hash:
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[16+rdi]
+$L$seal_sse_tail_192_rounds_and_x1hash:
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,4
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,12
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm3
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm3
+DB	102,15,58,15,237,4
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,12
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol16]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,12
+	psrld	xmm6,20
+	pxor	xmm6,xmm3
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol8]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,7
+	psrld	xmm6,25
+	pxor	xmm6,xmm3
+DB	102,15,58,15,246,4
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,12
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,12
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,4
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm3
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm3
+DB	102,15,58,15,237,12
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,4
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol16]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,12
+	psrld	xmm6,20
+	pxor	xmm6,xmm3
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol8]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,7
+	psrld	xmm6,25
+	pxor	xmm6,xmm3
+DB	102,15,58,15,246,12
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,4
+
+	lea	rdi,[16+rdi]
+	dec	rcx
+	jg	NEAR $L$seal_sse_tail_192_rounds_and_x2hash
+	dec	r8
+	jge	NEAR $L$seal_sse_tail_192_rounds_and_x1hash
+	paddd	xmm2,XMMWORD[$L$chacha20_consts]
+	paddd	xmm6,XMMWORD[((160+48))+rbp]
+	paddd	xmm10,XMMWORD[((160+64))+rbp]
+	paddd	xmm14,XMMWORD[((160+128))+rbp]
+	paddd	xmm1,XMMWORD[$L$chacha20_consts]
+	paddd	xmm5,XMMWORD[((160+48))+rbp]
+	paddd	xmm9,XMMWORD[((160+64))+rbp]
+	paddd	xmm13,XMMWORD[((160+112))+rbp]
+	paddd	xmm0,XMMWORD[$L$chacha20_consts]
+	paddd	xmm4,XMMWORD[((160+48))+rbp]
+	paddd	xmm8,XMMWORD[((160+64))+rbp]
+	paddd	xmm12,XMMWORD[((160+96))+rbp]
+	movdqu	xmm3,XMMWORD[((0 + 0))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 0))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 0))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 0))+rsi]
+	pxor	xmm2,xmm3
+	pxor	xmm6,xmm7
+	pxor	xmm10,xmm11
+	pxor	xmm15,xmm14
+	movdqu	XMMWORD[(0 + 0)+rdi],xmm2
+	movdqu	XMMWORD[(16 + 0)+rdi],xmm6
+	movdqu	XMMWORD[(32 + 0)+rdi],xmm10
+	movdqu	XMMWORD[(48 + 0)+rdi],xmm15
+	movdqu	xmm3,XMMWORD[((0 + 64))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 64))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 64))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 64))+rsi]
+	pxor	xmm1,xmm3
+	pxor	xmm5,xmm7
+	pxor	xmm9,xmm11
+	pxor	xmm15,xmm13
+	movdqu	XMMWORD[(0 + 64)+rdi],xmm1
+	movdqu	XMMWORD[(16 + 64)+rdi],xmm5
+	movdqu	XMMWORD[(32 + 64)+rdi],xmm9
+	movdqu	XMMWORD[(48 + 64)+rdi],xmm15
+
+	mov	rcx,8*16
+	sub	rbx,8*16
+	lea	rsi,[128+rsi]
+
+$L$seal_sse_128_tail_hash:
+	cmp	rcx,16
+	jb	NEAR $L$seal_sse_128_tail_xor
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	sub	rcx,16
+	lea	rdi,[16+rdi]
+	jmp	NEAR $L$seal_sse_128_tail_hash
+
+$L$seal_sse_128_tail_xor:
+	cmp	rbx,16
+	jb	NEAR $L$seal_sse_tail_16
+	sub	rbx,16
+
+	movdqu	xmm3,XMMWORD[rsi]
+	pxor	xmm0,xmm3
+	movdqu	XMMWORD[rdi],xmm0
+
+	add	r10,QWORD[rdi]
+	adc	r11,QWORD[8+rdi]
+	adc	r12,1
+	lea	rsi,[16+rsi]
+	lea	rdi,[16+rdi]
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+
+	movdqa	xmm0,xmm4
+	movdqa	xmm4,xmm8
+	movdqa	xmm8,xmm12
+	movdqa	xmm12,xmm1
+	movdqa	xmm1,xmm5
+	movdqa	xmm5,xmm9
+	movdqa	xmm9,xmm13
+	jmp	NEAR $L$seal_sse_128_tail_xor
+
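+; Final partial plaintext block: gather the remaining bytes into xmm15,
+; xor with the keystream in xmm0, and write them out. Hashing of these
+; bytes is deferred so they can first be merged with any extra_in data
+; below, forming a full 16-byte Poly1305 block where possible.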
+$L$seal_sse_tail_16:
+	test	rbx,rbx
+	jz	NEAR $L$process_blocks_of_extra_in
+
+	mov	r8,rbx
+	mov	rcx,rbx
+	lea	rsi,[((-1))+rbx*1+rsi]
+	pxor	xmm15,xmm15
+$L$seal_sse_tail_16_compose:
+	pslldq	xmm15,1
+	pinsrb	xmm15,BYTE[rsi],0
+	lea	rsi,[((-1))+rsi]
+	dec	rcx
+	jne	NEAR $L$seal_sse_tail_16_compose
+
+
+	pxor	xmm15,xmm0
+
+
+	mov	rcx,rbx
+	movdqu	xmm0,xmm15
+$L$seal_sse_tail_16_extract:
+	pextrb	XMMWORD[rdi],xmm0,0
+	psrldq	xmm0,1
+	add	rdi,1
+	sub	rcx,1
+	jnz	NEAR $L$seal_sse_tail_16_extract
+
+
+
+
+
+
+
+
+	mov	r9,QWORD[((288 + 160 + 32))+rsp]
+	mov	r14,QWORD[56+r9]
+	mov	r13,QWORD[48+r9]
+	test	r14,r14
+	jz	NEAR $L$process_partial_block
+
+	mov	r15,16
+	sub	r15,rbx
+	cmp	r14,r15
+
+	jge	NEAR $L$load_extra_in
+	mov	r15,r14
+
+$L$load_extra_in:
+
+
+	lea	rsi,[((-1))+r15*1+r13]
+
+
+	add	r13,r15
+	sub	r14,r15
+	mov	QWORD[48+r9],r13
+	mov	QWORD[56+r9],r14
+
+
+
+	add	r8,r15
+
+
+	pxor	xmm11,xmm11
+$L$load_extra_load_loop:
+	pslldq	xmm11,1
+	pinsrb	xmm11,BYTE[rsi],0
+	lea	rsi,[((-1))+rsi]
+	sub	r15,1
+	jnz	NEAR $L$load_extra_load_loop
+
+
+
+
+	mov	r15,rbx
+
+$L$load_extra_shift_loop:
+	pslldq	xmm11,1
+	sub	r15,1
+	jnz	NEAR $L$load_extra_shift_loop
+
+
+
+
+	lea	r15,[$L$and_masks]
+	shl	rbx,4
+	pand	xmm15,XMMWORD[((-16))+rbx*1+r15]
+
+
+	por	xmm15,xmm11
+
+
+
+DB	102,77,15,126,253
+	pextrq	r14,xmm15,1
+	add	r10,r13
+	adc	r11,r14
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+
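+; Hash the caller-supplied extra_in buffer (pointer at 48+r9, length at
+; 56+r9): whole 16-byte blocks in the loop below, then a trailer that is
+; folded into the pending partial block in xmm15.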
+$L$process_blocks_of_extra_in:
+
+	mov	r9,QWORD[((288+32+160 ))+rsp]
+	mov	rsi,QWORD[48+r9]
+	mov	r8,QWORD[56+r9]
+	mov	rcx,r8
+	shr	r8,4
+
+$L$process_extra_hash_loop:
+	jz	NEAR process_extra_in_trailer
+	add	r10,QWORD[((0+0))+rsi]
+	adc	r11,QWORD[((8+0))+rsi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rsi,[16+rsi]
+	sub	r8,1
+	jmp	NEAR $L$process_extra_hash_loop
+process_extra_in_trailer:
+	and	rcx,15
+	mov	rbx,rcx
+	jz	NEAR $L$do_length_block
+	lea	rsi,[((-1))+rcx*1+rsi]
+
+$L$process_extra_in_trailer_load:
+	pslldq	xmm15,1
+	pinsrb	xmm15,BYTE[rsi],0
+	lea	rsi,[((-1))+rsi]
+	sub	rcx,1
+	jnz	NEAR $L$process_extra_in_trailer_load
+
+$L$process_partial_block:
+
+	lea	r15,[$L$and_masks]
+	shl	rbx,4
+	pand	xmm15,XMMWORD[((-16))+rbx*1+r15]
+DB	102,77,15,126,253
+	pextrq	r14,xmm15,1
+	add	r10,r13
+	adc	r11,r14
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+
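+; Length block and final reduction, matching $L$open_sse_finalize above:
+; absorb ad_len | msg_len from rbp+160+32, reduce h mod 2^130-5, add s,
+; and return the tag through the saved pointer.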
+$L$do_length_block:
+	add	r10,QWORD[((0+160+32))+rbp]
+	adc	r11,QWORD[((8+160+32))+rbp]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+
+	mov	r13,r10
+	mov	r14,r11
+	mov	r15,r12
+	sub	r10,-5
+	sbb	r11,-1
+	sbb	r12,3
+	cmovc	r10,r13
+	cmovc	r11,r14
+	cmovc	r12,r15
+
+	add	r10,QWORD[((0+160+16))+rbp]
+	adc	r11,QWORD[((8+160+16))+rbp]
+
+	movaps	xmm6,XMMWORD[((0+0))+rbp]
+	movaps	xmm7,XMMWORD[((16+0))+rbp]
+	movaps	xmm8,XMMWORD[((32+0))+rbp]
+	movaps	xmm9,XMMWORD[((48+0))+rbp]
+	movaps	xmm10,XMMWORD[((64+0))+rbp]
+	movaps	xmm11,XMMWORD[((80+0))+rbp]
+	movaps	xmm12,XMMWORD[((96+0))+rbp]
+	movaps	xmm13,XMMWORD[((112+0))+rbp]
+	movaps	xmm14,XMMWORD[((128+0))+rbp]
+	movaps	xmm15,XMMWORD[((144+0))+rbp]
+
+
+	add	rsp,288 + 160 + 32
+
+
+	pop	r9
+
+	mov	QWORD[r9],r10
+	mov	QWORD[8+r9],r11
+	pop	r15
+
+	pop	r14
+
+	pop	r13
+
+	pop	r12
+
+	pop	rbx
+
+	pop	rbp
+
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	ret
+
+$L$seal_sse_128:
+
+	movdqu	xmm0,XMMWORD[$L$chacha20_consts]
+	movdqa	xmm1,xmm0
+	movdqa	xmm2,xmm0
+	movdqu	xmm4,XMMWORD[r9]
+	movdqa	xmm5,xmm4
+	movdqa	xmm6,xmm4
+	movdqu	xmm8,XMMWORD[16+r9]
+	movdqa	xmm9,xmm8
+	movdqa	xmm10,xmm8
+	movdqu	xmm14,XMMWORD[32+r9]
+	movdqa	xmm12,xmm14
+	paddd	xmm12,XMMWORD[$L$sse_inc]
+	movdqa	xmm13,xmm12
+	paddd	xmm13,XMMWORD[$L$sse_inc]
+	movdqa	xmm7,xmm4
+	movdqa	xmm11,xmm8
+	movdqa	xmm15,xmm12
+	mov	r10,10
+
+$L$seal_sse_128_rounds:
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,4
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,12
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm3
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm3
+DB	102,15,58,15,237,4
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,12
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol16]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,12
+	psrld	xmm6,20
+	pxor	xmm6,xmm3
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol8]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,7
+	psrld	xmm6,25
+	pxor	xmm6,xmm3
+DB	102,15,58,15,246,4
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,12
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,12
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,4
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm3
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm3
+DB	102,15,58,15,237,12
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,4
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol16]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,12
+	psrld	xmm6,20
+	pxor	xmm6,xmm3
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol8]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,7
+	psrld	xmm6,25
+	pxor	xmm6,xmm3
+DB	102,15,58,15,246,12
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,4
+
+	dec	r10
+	jnz	NEAR $L$seal_sse_128_rounds
+	paddd	xmm0,XMMWORD[$L$chacha20_consts]
+	paddd	xmm1,XMMWORD[$L$chacha20_consts]
+	paddd	xmm2,XMMWORD[$L$chacha20_consts]
+	paddd	xmm4,xmm7
+	paddd	xmm5,xmm7
+	paddd	xmm6,xmm7
+	paddd	xmm8,xmm11
+	paddd	xmm9,xmm11
+	paddd	xmm12,xmm15
+	paddd	xmm15,XMMWORD[$L$sse_inc]
+	paddd	xmm13,xmm15
+
+	pand	xmm2,XMMWORD[$L$clamp]
+	movdqa	XMMWORD[(160+0)+rbp],xmm2
+	movdqa	XMMWORD[(160+16)+rbp],xmm6
+
+	mov	r8,r8
+	call	poly_hash_ad_internal
+	jmp	NEAR $L$seal_sse_128_tail_xor
+$L$SEH_end_chacha20_poly1305_seal:
+
+
+
+
+ALIGN	64
+chacha20_poly1305_open_avx2:
+
+
+
+
+
+
+
+
+
+
+
+
+	vzeroupper
+	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
+	vbroadcasti128	ymm4,XMMWORD[r9]
+	vbroadcasti128	ymm8,XMMWORD[16+r9]
+	vbroadcasti128	ymm12,XMMWORD[32+r9]
+	vpaddd	ymm12,ymm12,YMMWORD[$L$avx2_init]
+	cmp	rbx,6*32
+	jbe	NEAR $L$open_avx2_192
+	cmp	rbx,10*32
+	jbe	NEAR $L$open_avx2_320
+
+	vmovdqa	YMMWORD[(160+64)+rbp],ymm4
+	vmovdqa	YMMWORD[(160+96)+rbp],ymm8
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+	mov	r10,10
+$L$open_avx2_init_rounds:
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,12
+
+	dec	r10
+	jne	NEAR $L$open_avx2_init_rounds
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm8,ymm8,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+
+	vperm2i128	ymm3,ymm4,ymm0,0x02
+
+	vpand	ymm3,ymm3,YMMWORD[$L$clamp]
+	vmovdqa	YMMWORD[(160+0)+rbp],ymm3
+
+	vperm2i128	ymm0,ymm4,ymm0,0x13
+	vperm2i128	ymm4,ymm12,ymm8,0x13
+
+	mov	r8,r8
+	call	poly_hash_ad_internal
+
+	xor	rcx,rcx
+$L$open_avx2_init_hash:
+	add	r10,QWORD[((0+0))+rcx*1+rsi]
+	adc	r11,QWORD[((8+0))+rcx*1+rsi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	add	rcx,16
+	cmp	rcx,2*32
+	jne	NEAR $L$open_avx2_init_hash
+
+	vpxor	ymm0,ymm0,YMMWORD[rsi]
+	vpxor	ymm4,ymm4,YMMWORD[32+rsi]
+
+	vmovdqu	YMMWORD[rdi],ymm0
+	vmovdqu	YMMWORD[32+rdi],ymm4
+	lea	rsi,[64+rsi]
+	lea	rdi,[64+rdi]
+	sub	rbx,2*32
+$L$open_avx2_main_loop:
+
+	cmp	rbx,16*32
+	jb	NEAR $L$open_avx2_main_loop_done
+	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
+	vmovdqa	ymm4,YMMWORD[((160+64))+rbp]
+	vmovdqa	ymm8,YMMWORD[((160+96))+rbp]
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm2,ymm0
+	vmovdqa	ymm6,ymm4
+	vmovdqa	ymm10,ymm8
+	vmovdqa	ymm3,ymm0
+	vmovdqa	ymm7,ymm4
+	vmovdqa	ymm11,ymm8
+	vmovdqa	ymm12,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm15,ymm12,YMMWORD[((160+160))+rbp]
+	vpaddd	ymm14,ymm12,ymm15
+	vpaddd	ymm13,ymm12,ymm14
+	vpaddd	ymm12,ymm12,ymm13
+	vmovdqa	YMMWORD[(160+256)+rbp],ymm15
+	vmovdqa	YMMWORD[(160+224)+rbp],ymm14
+	vmovdqa	YMMWORD[(160+192)+rbp],ymm13
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+
+	xor	rcx,rcx
+$L$open_avx2_main_loop_rounds:
+	add	r10,QWORD[((0+0))+rcx*1+rsi]
+	adc	r11,QWORD[((8+0))+rcx*1+rsi]
+	adc	r12,1
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol16]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,20
+	vpslld	ymm7,ymm7,32-20
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,20
+	vpslld	ymm6,ymm6,32-20
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,20
+	vpslld	ymm5,ymm5,32-20
+	add	r15,rax
+	adc	r9,rdx
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,20
+	vpslld	ymm4,ymm4,32-20
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol8]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	add	r10,QWORD[((0+16))+rcx*1+rsi]
+	adc	r11,QWORD[((8+16))+rcx*1+rsi]
+	adc	r12,1
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,25
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	vpslld	ymm7,ymm7,32-25
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,25
+	vpslld	ymm6,ymm6,32-25
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,25
+	vpslld	ymm5,ymm5,32-25
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,25
+	vpslld	ymm4,ymm4,32-25
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[((160+128))+rbp]
+	vpalignr	ymm7,ymm7,ymm7,4
+	vpalignr	ymm11,ymm11,ymm11,8
+	vpalignr	ymm15,ymm15,ymm15,12
+	vpalignr	ymm6,ymm6,ymm6,4
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm14,ymm14,ymm14,12
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	vpalignr	ymm5,ymm5,ymm5,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm12,ymm12,ymm12,12
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol16]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	add	r15,rax
+	adc	r9,rdx
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,20
+	vpslld	ymm7,ymm7,32-20
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,20
+	vpslld	ymm6,ymm6,32-20
+	vpxor	ymm6,ymm6,ymm8
+	add	r10,QWORD[((0+32))+rcx*1+rsi]
+	adc	r11,QWORD[((8+32))+rcx*1+rsi]
+	adc	r12,1
+
+	lea	rcx,[48+rcx]
+	vpsrld	ymm8,ymm5,20
+	vpslld	ymm5,ymm5,32-20
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,20
+	vpslld	ymm4,ymm4,32-20
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol8]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,25
+	vpslld	ymm7,ymm7,32-25
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,25
+	vpslld	ymm6,ymm6,32-25
+	vpxor	ymm6,ymm6,ymm8
+	add	r15,rax
+	adc	r9,rdx
+	vpsrld	ymm8,ymm5,25
+	vpslld	ymm5,ymm5,32-25
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,25
+	vpslld	ymm4,ymm4,32-25
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[((160+128))+rbp]
+	vpalignr	ymm7,ymm7,ymm7,12
+	vpalignr	ymm11,ymm11,ymm11,8
+	vpalignr	ymm15,ymm15,ymm15,4
+	vpalignr	ymm6,ymm6,ymm6,12
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm14,ymm14,ymm14,4
+	vpalignr	ymm5,ymm5,ymm5,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm4,ymm4,ymm4,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	vpalignr	ymm12,ymm12,ymm12,4
+
+	cmp	rcx,10*6*8
+	jne	NEAR $L$open_avx2_main_loop_rounds
+	vpaddd	ymm3,ymm3,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm7,ymm7,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm11,ymm11,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm15,ymm15,YMMWORD[((160+256))+rbp]
+	vpaddd	ymm2,ymm2,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm6,ymm6,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm10,ymm10,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm14,ymm14,YMMWORD[((160+224))+rbp]
+	vpaddd	ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm5,ymm5,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm9,ymm9,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm13,ymm13,YMMWORD[((160+192))+rbp]
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm8,ymm8,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm0
+	add	r10,QWORD[((0+480))+rsi]
+	adc	r11,QWORD[((8+480))+rsi]
+	adc	r12,1
+	vperm2i128	ymm0,ymm7,ymm3,0x02
+	vperm2i128	ymm7,ymm7,ymm3,0x13
+	vperm2i128	ymm3,ymm15,ymm11,0x02
+	vperm2i128	ymm11,ymm15,ymm11,0x13
+	vpxor	ymm0,ymm0,YMMWORD[((0+0))+rsi]
+	vpxor	ymm3,ymm3,YMMWORD[((32+0))+rsi]
+	vpxor	ymm7,ymm7,YMMWORD[((64+0))+rsi]
+	vpxor	ymm11,ymm11,YMMWORD[((96+0))+rsi]
+	vmovdqu	YMMWORD[(0+0)+rdi],ymm0
+	vmovdqu	YMMWORD[(32+0)+rdi],ymm3
+	vmovdqu	YMMWORD[(64+0)+rdi],ymm7
+	vmovdqu	YMMWORD[(96+0)+rdi],ymm11
+
+	vmovdqa	ymm0,YMMWORD[((160+128))+rbp]
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	vperm2i128	ymm3,ymm6,ymm2,0x02
+	vperm2i128	ymm6,ymm6,ymm2,0x13
+	vperm2i128	ymm2,ymm14,ymm10,0x02
+	vperm2i128	ymm10,ymm14,ymm10,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+128))+rsi]
+	vpxor	ymm2,ymm2,YMMWORD[((32+128))+rsi]
+	vpxor	ymm6,ymm6,YMMWORD[((64+128))+rsi]
+	vpxor	ymm10,ymm10,YMMWORD[((96+128))+rsi]
+	vmovdqu	YMMWORD[(0+128)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+128)+rdi],ymm2
+	vmovdqu	YMMWORD[(64+128)+rdi],ymm6
+	vmovdqu	YMMWORD[(96+128)+rdi],ymm10
+	add	r10,QWORD[((0+480+16))+rsi]
+	adc	r11,QWORD[((8+480+16))+rsi]
+	adc	r12,1
+	vperm2i128	ymm3,ymm5,ymm1,0x02
+	vperm2i128	ymm5,ymm5,ymm1,0x13
+	vperm2i128	ymm1,ymm13,ymm9,0x02
+	vperm2i128	ymm9,ymm13,ymm9,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+256))+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[((32+256))+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[((64+256))+rsi]
+	vpxor	ymm9,ymm9,YMMWORD[((96+256))+rsi]
+	vmovdqu	YMMWORD[(0+256)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+256)+rdi],ymm1
+	vmovdqu	YMMWORD[(64+256)+rdi],ymm5
+	vmovdqu	YMMWORD[(96+256)+rdi],ymm9
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	vperm2i128	ymm3,ymm4,ymm0,0x02
+	vperm2i128	ymm4,ymm4,ymm0,0x13
+	vperm2i128	ymm0,ymm12,ymm8,0x02
+	vperm2i128	ymm8,ymm12,ymm8,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+384))+rsi]
+	vpxor	ymm0,ymm0,YMMWORD[((32+384))+rsi]
+	vpxor	ymm4,ymm4,YMMWORD[((64+384))+rsi]
+	vpxor	ymm8,ymm8,YMMWORD[((96+384))+rsi]
+	vmovdqu	YMMWORD[(0+384)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+384)+rdi],ymm0
+	vmovdqu	YMMWORD[(64+384)+rdi],ymm4
+	vmovdqu	YMMWORD[(96+384)+rdi],ymm8
+
+	lea	rsi,[512+rsi]
+	lea	rdi,[512+rdi]
+	sub	rbx,16*32
+	jmp	NEAR $L$open_avx2_main_loop
+$L$open_avx2_main_loop_done:
+	test	rbx,rbx
+	vzeroupper
+	je	NEAR $L$open_sse_finalize
+
+	cmp	rbx,12*32
+	ja	NEAR $L$open_avx2_tail_512
+	cmp	rbx,8*32
+	ja	NEAR $L$open_avx2_tail_384
+	cmp	rbx,4*32
+	ja	NEAR $L$open_avx2_tail_256
+	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
+	vmovdqa	ymm4,YMMWORD[((160+64))+rbp]
+	vmovdqa	ymm8,YMMWORD[((160+96))+rbp]
+	vmovdqa	ymm12,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+
+	xor	r8,r8
+	mov	rcx,rbx
+	and	rcx,-16
+	test	rcx,rcx
+	je	NEAR $L$open_avx2_tail_128_rounds
+$L$open_avx2_tail_128_rounds_and_x1hash:
+	add	r10,QWORD[((0+0))+r8*1+rsi]
+	adc	r11,QWORD[((8+0))+r8*1+rsi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+$L$open_avx2_tail_128_rounds:
+	add	r8,16
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,12
+
+	cmp	r8,rcx
+	jb	NEAR $L$open_avx2_tail_128_rounds_and_x1hash
+	cmp	r8,160
+	jne	NEAR $L$open_avx2_tail_128_rounds
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm8,ymm8,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+	vperm2i128	ymm3,ymm4,ymm0,0x13
+	vperm2i128	ymm0,ymm4,ymm0,0x02
+	vperm2i128	ymm4,ymm12,ymm8,0x02
+	vperm2i128	ymm12,ymm12,ymm8,0x13
+	vmovdqa	ymm8,ymm3
+
+	jmp	NEAR $L$open_avx2_tail_128_xor
+
+$L$open_avx2_tail_256:
+	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
+	vmovdqa	ymm4,YMMWORD[((160+64))+rbp]
+	vmovdqa	ymm8,YMMWORD[((160+96))+rbp]
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm12,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm13,ymm12,YMMWORD[((160+160))+rbp]
+	vpaddd	ymm12,ymm12,ymm13
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+	vmovdqa	YMMWORD[(160+192)+rbp],ymm13
+
+	mov	QWORD[((160+128))+rbp],rbx
+	mov	rcx,rbx
+	sub	rcx,4*32
+	shr	rcx,4
+	mov	r8,10
+	cmp	rcx,10
+	cmovg	rcx,r8
+	mov	rbx,rsi
+	xor	r8,r8
+$L$open_avx2_tail_256_rounds_and_x1hash:
+	add	r10,QWORD[((0+0))+rbx]
+	adc	r11,QWORD[((8+0))+rbx]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rbx,[16+rbx]
+$L$open_avx2_tail_256_rounds:
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,4
+
+	inc	r8
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,12
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,12
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol16]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpsrld	ymm3,ymm6,20
+	vpslld	ymm6,ymm6,12
+	vpxor	ymm6,ymm6,ymm3
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol8]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpslld	ymm3,ymm6,7
+	vpsrld	ymm6,ymm6,25
+	vpxor	ymm6,ymm6,ymm3
+	vpalignr	ymm14,ymm14,ymm14,4
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm6,ymm6,ymm6,12
+
+	cmp	r8,rcx
+	jb	NEAR $L$open_avx2_tail_256_rounds_and_x1hash
+	cmp	r8,10
+	jne	NEAR $L$open_avx2_tail_256_rounds
+	mov	r8,rbx
+	sub	rbx,rsi
+	mov	rcx,rbx
+	mov	rbx,QWORD[((160+128))+rbp]
+$L$open_avx2_tail_256_hash:
+	add	rcx,16
+	cmp	rcx,rbx
+	jg	NEAR $L$open_avx2_tail_256_done
+	add	r10,QWORD[((0+0))+r8]
+	adc	r11,QWORD[((8+0))+r8]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	r8,[16+r8]
+	jmp	NEAR $L$open_avx2_tail_256_hash
+$L$open_avx2_tail_256_done:
+	vpaddd	ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm5,ymm5,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm9,ymm9,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm13,ymm13,YMMWORD[((160+192))+rbp]
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm8,ymm8,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+	vperm2i128	ymm3,ymm5,ymm1,0x02
+	vperm2i128	ymm5,ymm5,ymm1,0x13
+	vperm2i128	ymm1,ymm13,ymm9,0x02
+	vperm2i128	ymm9,ymm13,ymm9,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+0))+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[((32+0))+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[((64+0))+rsi]
+	vpxor	ymm9,ymm9,YMMWORD[((96+0))+rsi]
+	vmovdqu	YMMWORD[(0+0)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+0)+rdi],ymm1
+	vmovdqu	YMMWORD[(64+0)+rdi],ymm5
+	vmovdqu	YMMWORD[(96+0)+rdi],ymm9
+	vperm2i128	ymm3,ymm4,ymm0,0x13
+	vperm2i128	ymm0,ymm4,ymm0,0x02
+	vperm2i128	ymm4,ymm12,ymm8,0x02
+	vperm2i128	ymm12,ymm12,ymm8,0x13
+	vmovdqa	ymm8,ymm3
+
+	lea	rsi,[128+rsi]
+	lea	rdi,[128+rdi]
+	sub	rbx,4*32
+	jmp	NEAR $L$open_avx2_tail_128_xor
+
+$L$open_avx2_tail_384:
+	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
+	vmovdqa	ymm4,YMMWORD[((160+64))+rbp]
+	vmovdqa	ymm8,YMMWORD[((160+96))+rbp]
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm2,ymm0
+	vmovdqa	ymm6,ymm4
+	vmovdqa	ymm10,ymm8
+	vmovdqa	ymm12,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm14,ymm12,YMMWORD[((160+160))+rbp]
+	vpaddd	ymm13,ymm12,ymm14
+	vpaddd	ymm12,ymm12,ymm13
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+	vmovdqa	YMMWORD[(160+192)+rbp],ymm13
+	vmovdqa	YMMWORD[(160+224)+rbp],ymm14
+
+	mov	QWORD[((160+128))+rbp],rbx
+	mov	rcx,rbx
+	sub	rcx,8*32
+	shr	rcx,4
+	add	rcx,6
+	mov	r8,10
+	cmp	rcx,10
+	cmovg	rcx,r8
+	mov	rbx,rsi
+	xor	r8,r8
+$L$open_avx2_tail_384_rounds_and_x2hash:
+	add	r10,QWORD[((0+0))+rbx]
+	adc	r11,QWORD[((8+0))+rbx]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rbx,[16+rbx]
+$L$open_avx2_tail_384_rounds_and_x1hash:
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol16]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpsrld	ymm3,ymm6,20
+	vpslld	ymm6,ymm6,12
+	vpxor	ymm6,ymm6,ymm3
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol8]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpslld	ymm3,ymm6,7
+	vpsrld	ymm6,ymm6,25
+	vpxor	ymm6,ymm6,ymm3
+	vpalignr	ymm14,ymm14,ymm14,12
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm6,ymm6,ymm6,4
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,4
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,4
+	add	r10,QWORD[((0+0))+rbx]
+	adc	r11,QWORD[((8+0))+rbx]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rbx,[16+rbx]
+	inc	r8
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol16]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpsrld	ymm3,ymm6,20
+	vpslld	ymm6,ymm6,12
+	vpxor	ymm6,ymm6,ymm3
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol8]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpslld	ymm3,ymm6,7
+	vpsrld	ymm6,ymm6,25
+	vpxor	ymm6,ymm6,ymm3
+	vpalignr	ymm14,ymm14,ymm14,4
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm6,ymm6,ymm6,12
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,12
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,12
+
+	cmp	r8,rcx
+	jb	NEAR $L$open_avx2_tail_384_rounds_and_x2hash
+	cmp	r8,10
+	jne	NEAR $L$open_avx2_tail_384_rounds_and_x1hash
+	mov	r8,rbx
+	sub	rbx,rsi
+	mov	rcx,rbx
+	mov	rbx,QWORD[((160+128))+rbp]
+$L$open_avx2_384_tail_hash:
+	add	rcx,16
+	cmp	rcx,rbx
+	jg	NEAR $L$open_avx2_384_tail_done
+	add	r10,QWORD[((0+0))+r8]
+	adc	r11,QWORD[((8+0))+r8]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	r8,[16+r8]
+	jmp	NEAR $L$open_avx2_384_tail_hash
+$L$open_avx2_384_tail_done:
+	vpaddd	ymm2,ymm2,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm6,ymm6,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm10,ymm10,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm14,ymm14,YMMWORD[((160+224))+rbp]
+	vpaddd	ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm5,ymm5,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm9,ymm9,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm13,ymm13,YMMWORD[((160+192))+rbp]
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm8,ymm8,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+	vperm2i128	ymm3,ymm6,ymm2,0x02
+	vperm2i128	ymm6,ymm6,ymm2,0x13
+	vperm2i128	ymm2,ymm14,ymm10,0x02
+	vperm2i128	ymm10,ymm14,ymm10,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+0))+rsi]
+	vpxor	ymm2,ymm2,YMMWORD[((32+0))+rsi]
+	vpxor	ymm6,ymm6,YMMWORD[((64+0))+rsi]
+	vpxor	ymm10,ymm10,YMMWORD[((96+0))+rsi]
+	vmovdqu	YMMWORD[(0+0)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+0)+rdi],ymm2
+	vmovdqu	YMMWORD[(64+0)+rdi],ymm6
+	vmovdqu	YMMWORD[(96+0)+rdi],ymm10
+	vperm2i128	ymm3,ymm5,ymm1,0x02
+	vperm2i128	ymm5,ymm5,ymm1,0x13
+	vperm2i128	ymm1,ymm13,ymm9,0x02
+	vperm2i128	ymm9,ymm13,ymm9,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+128))+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[((32+128))+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[((64+128))+rsi]
+	vpxor	ymm9,ymm9,YMMWORD[((96+128))+rsi]
+	vmovdqu	YMMWORD[(0+128)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+128)+rdi],ymm1
+	vmovdqu	YMMWORD[(64+128)+rdi],ymm5
+	vmovdqu	YMMWORD[(96+128)+rdi],ymm9
+	vperm2i128	ymm3,ymm4,ymm0,0x13
+	vperm2i128	ymm0,ymm4,ymm0,0x02
+	vperm2i128	ymm4,ymm12,ymm8,0x02
+	vperm2i128	ymm12,ymm12,ymm8,0x13
+	vmovdqa	ymm8,ymm3
+
+	lea	rsi,[256+rsi]
+	lea	rdi,[256+rdi]
+	sub	rbx,8*32
+	jmp	NEAR $L$open_avx2_tail_128_xor
+
+$L$open_avx2_tail_512:
+	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
+	vmovdqa	ymm4,YMMWORD[((160+64))+rbp]
+	vmovdqa	ymm8,YMMWORD[((160+96))+rbp]
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm2,ymm0
+	vmovdqa	ymm6,ymm4
+	vmovdqa	ymm10,ymm8
+	vmovdqa	ymm3,ymm0
+	vmovdqa	ymm7,ymm4
+	vmovdqa	ymm11,ymm8
+	vmovdqa	ymm12,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm15,ymm12,YMMWORD[((160+160))+rbp]
+	vpaddd	ymm14,ymm12,ymm15
+	vpaddd	ymm13,ymm12,ymm14
+	vpaddd	ymm12,ymm12,ymm13
+	vmovdqa	YMMWORD[(160+256)+rbp],ymm15
+	vmovdqa	YMMWORD[(160+224)+rbp],ymm14
+	vmovdqa	YMMWORD[(160+192)+rbp],ymm13
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+
+	xor	rcx,rcx
+	mov	r8,rsi
+$L$open_avx2_tail_512_rounds_and_x2hash:
+	add	r10,QWORD[((0+0))+r8]
+	adc	r11,QWORD[((8+0))+r8]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	r8,[16+r8]
+$L$open_avx2_tail_512_rounds_and_x1hash:
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol16]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,20
+	vpslld	ymm7,ymm7,32-20
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,20
+	vpslld	ymm6,ymm6,32-20
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,20
+	vpslld	ymm5,ymm5,32-20
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,20
+	vpslld	ymm4,ymm4,32-20
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol8]
+	vpaddd	ymm3,ymm3,ymm7
+	add	r10,QWORD[((0+0))+r8]
+	adc	r11,QWORD[((8+0))+r8]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,25
+	vpslld	ymm7,ymm7,32-25
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,25
+	vpslld	ymm6,ymm6,32-25
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,25
+	vpslld	ymm5,ymm5,32-25
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,25
+	vpslld	ymm4,ymm4,32-25
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[((160+128))+rbp]
+	vpalignr	ymm7,ymm7,ymm7,4
+	vpalignr	ymm11,ymm11,ymm11,8
+	vpalignr	ymm15,ymm15,ymm15,12
+	vpalignr	ymm6,ymm6,ymm6,4
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm14,ymm14,ymm14,12
+	vpalignr	ymm5,ymm5,ymm5,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm12,ymm12,ymm12,12
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol16]
+	vpaddd	ymm3,ymm3,ymm7
+	add	r10,QWORD[((0+16))+r8]
+	adc	r11,QWORD[((8+16))+r8]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	r8,[32+r8]
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,20
+	vpslld	ymm7,ymm7,32-20
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,20
+	vpslld	ymm6,ymm6,32-20
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,20
+	vpslld	ymm5,ymm5,32-20
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,20
+	vpslld	ymm4,ymm4,32-20
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol8]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,25
+	vpslld	ymm7,ymm7,32-25
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,25
+	vpslld	ymm6,ymm6,32-25
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,25
+	vpslld	ymm5,ymm5,32-25
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,25
+	vpslld	ymm4,ymm4,32-25
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[((160+128))+rbp]
+	vpalignr	ymm7,ymm7,ymm7,12
+	vpalignr	ymm11,ymm11,ymm11,8
+	vpalignr	ymm15,ymm15,ymm15,4
+	vpalignr	ymm6,ymm6,ymm6,12
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm14,ymm14,ymm14,4
+	vpalignr	ymm5,ymm5,ymm5,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm4,ymm4,ymm4,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm12,ymm12,ymm12,4
+
+	inc	rcx
+	cmp	rcx,4
+	jl	NEAR $L$open_avx2_tail_512_rounds_and_x2hash
+	cmp	rcx,10
+	jne	NEAR $L$open_avx2_tail_512_rounds_and_x1hash
+	mov	rcx,rbx
+	sub	rcx,12*32
+	and	rcx,-16
+$L$open_avx2_tail_512_hash:
+	test	rcx,rcx
+	je	NEAR $L$open_avx2_tail_512_done
+	add	r10,QWORD[((0+0))+r8]
+	adc	r11,QWORD[((8+0))+r8]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	r8,[16+r8]
+	sub	rcx,2*8
+	jmp	NEAR $L$open_avx2_tail_512_hash
+$L$open_avx2_tail_512_done:
+	vpaddd	ymm3,ymm3,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm7,ymm7,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm11,ymm11,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm15,ymm15,YMMWORD[((160+256))+rbp]
+	vpaddd	ymm2,ymm2,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm6,ymm6,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm10,ymm10,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm14,ymm14,YMMWORD[((160+224))+rbp]
+	vpaddd	ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm5,ymm5,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm9,ymm9,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm13,ymm13,YMMWORD[((160+192))+rbp]
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm8,ymm8,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm0
+	vperm2i128	ymm0,ymm7,ymm3,0x02
+	vperm2i128	ymm7,ymm7,ymm3,0x13
+	vperm2i128	ymm3,ymm15,ymm11,0x02
+	vperm2i128	ymm11,ymm15,ymm11,0x13
+	vpxor	ymm0,ymm0,YMMWORD[((0+0))+rsi]
+	vpxor	ymm3,ymm3,YMMWORD[((32+0))+rsi]
+	vpxor	ymm7,ymm7,YMMWORD[((64+0))+rsi]
+	vpxor	ymm11,ymm11,YMMWORD[((96+0))+rsi]
+	vmovdqu	YMMWORD[(0+0)+rdi],ymm0
+	vmovdqu	YMMWORD[(32+0)+rdi],ymm3
+	vmovdqu	YMMWORD[(64+0)+rdi],ymm7
+	vmovdqu	YMMWORD[(96+0)+rdi],ymm11
+
+	vmovdqa	ymm0,YMMWORD[((160+128))+rbp]
+	vperm2i128	ymm3,ymm6,ymm2,0x02
+	vperm2i128	ymm6,ymm6,ymm2,0x13
+	vperm2i128	ymm2,ymm14,ymm10,0x02
+	vperm2i128	ymm10,ymm14,ymm10,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+128))+rsi]
+	vpxor	ymm2,ymm2,YMMWORD[((32+128))+rsi]
+	vpxor	ymm6,ymm6,YMMWORD[((64+128))+rsi]
+	vpxor	ymm10,ymm10,YMMWORD[((96+128))+rsi]
+	vmovdqu	YMMWORD[(0+128)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+128)+rdi],ymm2
+	vmovdqu	YMMWORD[(64+128)+rdi],ymm6
+	vmovdqu	YMMWORD[(96+128)+rdi],ymm10
+	vperm2i128	ymm3,ymm5,ymm1,0x02
+	vperm2i128	ymm5,ymm5,ymm1,0x13
+	vperm2i128	ymm1,ymm13,ymm9,0x02
+	vperm2i128	ymm9,ymm13,ymm9,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+256))+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[((32+256))+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[((64+256))+rsi]
+	vpxor	ymm9,ymm9,YMMWORD[((96+256))+rsi]
+	vmovdqu	YMMWORD[(0+256)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+256)+rdi],ymm1
+	vmovdqu	YMMWORD[(64+256)+rdi],ymm5
+	vmovdqu	YMMWORD[(96+256)+rdi],ymm9
+	vperm2i128	ymm3,ymm4,ymm0,0x13
+	vperm2i128	ymm0,ymm4,ymm0,0x02
+	vperm2i128	ymm4,ymm12,ymm8,0x02
+	vperm2i128	ymm12,ymm12,ymm8,0x13
+	vmovdqa	ymm8,ymm3
+
+	lea	rsi,[384+rsi]
+	lea	rdi,[384+rdi]
+	sub	rbx,12*32
+$L$open_avx2_tail_128_xor:
+	cmp	rbx,32
+	jb	NEAR $L$open_avx2_tail_32_xor
+	sub	rbx,32
+	vpxor	ymm0,ymm0,YMMWORD[rsi]
+	vmovdqu	YMMWORD[rdi],ymm0
+	lea	rsi,[32+rsi]
+	lea	rdi,[32+rdi]
+	vmovdqa	ymm0,ymm4
+	vmovdqa	ymm4,ymm8
+	vmovdqa	ymm8,ymm12
+	jmp	NEAR $L$open_avx2_tail_128_xor
+$L$open_avx2_tail_32_xor:
+	cmp	rbx,16
+	vmovdqa	xmm1,xmm0
+	jb	NEAR $L$open_avx2_exit
+	sub	rbx,16
+
+	vpxor	xmm1,xmm0,XMMWORD[rsi]
+	vmovdqu	XMMWORD[rdi],xmm1
+	lea	rsi,[16+rsi]
+	lea	rdi,[16+rdi]
+	vperm2i128	ymm0,ymm0,ymm0,0x11
+	vmovdqa	xmm1,xmm0
+$L$open_avx2_exit:
+	vzeroupper
+	jmp	NEAR $L$open_sse_tail_16
+
+$L$open_avx2_192:
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm2,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm6,ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm10,ymm8
+	vpaddd	ymm13,ymm12,YMMWORD[$L$avx2_inc]
+	vmovdqa	ymm11,ymm12
+	vmovdqa	ymm15,ymm13
+	mov	r10,10
+$L$open_avx2_192_rounds:
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,4
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,12
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,12
+
+	dec	r10
+	jne	NEAR $L$open_avx2_192_rounds
+	vpaddd	ymm0,ymm0,ymm2
+	vpaddd	ymm1,ymm1,ymm2
+	vpaddd	ymm4,ymm4,ymm6
+	vpaddd	ymm5,ymm5,ymm6
+	vpaddd	ymm8,ymm8,ymm10
+	vpaddd	ymm9,ymm9,ymm10
+	vpaddd	ymm12,ymm12,ymm11
+	vpaddd	ymm13,ymm13,ymm15
+	vperm2i128	ymm3,ymm4,ymm0,0x02
+
+	vpand	ymm3,ymm3,YMMWORD[$L$clamp]
+	vmovdqa	YMMWORD[(160+0)+rbp],ymm3
+
+	vperm2i128	ymm0,ymm4,ymm0,0x13
+	vperm2i128	ymm4,ymm12,ymm8,0x13
+	vperm2i128	ymm8,ymm5,ymm1,0x02
+	vperm2i128	ymm12,ymm13,ymm9,0x02
+	vperm2i128	ymm1,ymm5,ymm1,0x13
+	vperm2i128	ymm5,ymm13,ymm9,0x13
+$L$open_avx2_short:
+	mov	r8,r8
+	call	poly_hash_ad_internal
+$L$open_avx2_short_hash_and_xor_loop:
+	cmp	rbx,32
+	jb	NEAR $L$open_avx2_short_tail_32
+	sub	rbx,32
+	add	r10,QWORD[((0+0))+rsi]
+	adc	r11,QWORD[((8+0))+rsi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	add	r10,QWORD[((0+16))+rsi]
+	adc	r11,QWORD[((8+16))+rsi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+
+	vpxor	ymm0,ymm0,YMMWORD[rsi]
+	vmovdqu	YMMWORD[rdi],ymm0
+	lea	rsi,[32+rsi]
+	lea	rdi,[32+rdi]
+
+	vmovdqa	ymm0,ymm4
+	vmovdqa	ymm4,ymm8
+	vmovdqa	ymm8,ymm12
+	vmovdqa	ymm12,ymm1
+	vmovdqa	ymm1,ymm5
+	vmovdqa	ymm5,ymm9
+	vmovdqa	ymm9,ymm13
+	vmovdqa	ymm13,ymm2
+	vmovdqa	ymm2,ymm6
+	jmp	NEAR $L$open_avx2_short_hash_and_xor_loop
+$L$open_avx2_short_tail_32:
+	cmp	rbx,16
+	vmovdqa	xmm1,xmm0
+	jb	NEAR $L$open_avx2_short_tail_32_exit
+	sub	rbx,16
+	add	r10,QWORD[((0+0))+rsi]
+	adc	r11,QWORD[((8+0))+rsi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	vpxor	xmm3,xmm0,XMMWORD[rsi]
+	vmovdqu	XMMWORD[rdi],xmm3
+	lea	rsi,[16+rsi]
+	lea	rdi,[16+rdi]
+	vextracti128	xmm1,ymm0,1
+$L$open_avx2_short_tail_32_exit:
+	vzeroupper
+	jmp	NEAR $L$open_sse_tail_16
+
+$L$open_avx2_320:
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm2,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm6,ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm10,ymm8
+	vpaddd	ymm13,ymm12,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm14,ymm13,YMMWORD[$L$avx2_inc]
+	vmovdqa	ymm7,ymm4
+	vmovdqa	ymm11,ymm8
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+	vmovdqa	YMMWORD[(160+192)+rbp],ymm13
+	vmovdqa	YMMWORD[(160+224)+rbp],ymm14
+	mov	r10,10
+$L$open_avx2_320_rounds:
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,4
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol16]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpsrld	ymm3,ymm6,20
+	vpslld	ymm6,ymm6,12
+	vpxor	ymm6,ymm6,ymm3
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol8]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpslld	ymm3,ymm6,7
+	vpsrld	ymm6,ymm6,25
+	vpxor	ymm6,ymm6,ymm3
+	vpalignr	ymm14,ymm14,ymm14,12
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm6,ymm6,ymm6,4
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,12
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,12
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol16]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpsrld	ymm3,ymm6,20
+	vpslld	ymm6,ymm6,12
+	vpxor	ymm6,ymm6,ymm3
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol8]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpslld	ymm3,ymm6,7
+	vpsrld	ymm6,ymm6,25
+	vpxor	ymm6,ymm6,ymm3
+	vpalignr	ymm14,ymm14,ymm14,4
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm6,ymm6,ymm6,12
+
+	dec	r10
+	jne	NEAR $L$open_avx2_320_rounds
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm2,ymm2,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,ymm7
+	vpaddd	ymm5,ymm5,ymm7
+	vpaddd	ymm6,ymm6,ymm7
+	vpaddd	ymm8,ymm8,ymm11
+	vpaddd	ymm9,ymm9,ymm11
+	vpaddd	ymm10,ymm10,ymm11
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+	vpaddd	ymm13,ymm13,YMMWORD[((160+192))+rbp]
+	vpaddd	ymm14,ymm14,YMMWORD[((160+224))+rbp]
+	vperm2i128	ymm3,ymm4,ymm0,0x02
+
+	vpand	ymm3,ymm3,YMMWORD[$L$clamp]
+	vmovdqa	YMMWORD[(160+0)+rbp],ymm3
+
+	vperm2i128	ymm0,ymm4,ymm0,0x13
+	vperm2i128	ymm4,ymm12,ymm8,0x13
+	vperm2i128	ymm8,ymm5,ymm1,0x02
+	vperm2i128	ymm12,ymm13,ymm9,0x02
+	vperm2i128	ymm1,ymm5,ymm1,0x13
+	vperm2i128	ymm5,ymm13,ymm9,0x13
+	vperm2i128	ymm9,ymm6,ymm2,0x02
+	vperm2i128	ymm13,ymm14,ymm10,0x02
+	vperm2i128	ymm2,ymm6,ymm2,0x13
+	vperm2i128	ymm6,ymm14,ymm10,0x13
+	jmp	NEAR $L$open_avx2_short
+
+
+
+
+
+ALIGN	64
+chacha20_poly1305_seal_avx2:
+
+
+
+
+
+
+
+
+
+
+
+
+	vzeroupper
+	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
+	vbroadcasti128	ymm4,XMMWORD[r9]
+	vbroadcasti128	ymm8,XMMWORD[16+r9]
+	vbroadcasti128	ymm12,XMMWORD[32+r9]
+	vpaddd	ymm12,ymm12,YMMWORD[$L$avx2_init]
+	cmp	rbx,6*32
+	jbe	NEAR $L$seal_avx2_192
+	cmp	rbx,10*32
+	jbe	NEAR $L$seal_avx2_320
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm2,ymm0
+	vmovdqa	ymm3,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm6,ymm4
+	vmovdqa	ymm7,ymm4
+	vmovdqa	YMMWORD[(160+64)+rbp],ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm10,ymm8
+	vmovdqa	ymm11,ymm8
+	vmovdqa	YMMWORD[(160+96)+rbp],ymm8
+	vmovdqa	ymm15,ymm12
+	vpaddd	ymm14,ymm15,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm13,ymm14,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm12,ymm13,YMMWORD[$L$avx2_inc]
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+	vmovdqa	YMMWORD[(160+192)+rbp],ymm13
+	vmovdqa	YMMWORD[(160+224)+rbp],ymm14
+	vmovdqa	YMMWORD[(160+256)+rbp],ymm15
+	mov	r10,10
+$L$seal_avx2_init_rounds:
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol16]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,20
+	vpslld	ymm7,ymm7,32-20
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,20
+	vpslld	ymm6,ymm6,32-20
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,20
+	vpslld	ymm5,ymm5,32-20
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,20
+	vpslld	ymm4,ymm4,32-20
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol8]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,25
+	vpslld	ymm7,ymm7,32-25
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,25
+	vpslld	ymm6,ymm6,32-25
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,25
+	vpslld	ymm5,ymm5,32-25
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,25
+	vpslld	ymm4,ymm4,32-25
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[((160+128))+rbp]
+	vpalignr	ymm7,ymm7,ymm7,4
+	vpalignr	ymm11,ymm11,ymm11,8
+	vpalignr	ymm15,ymm15,ymm15,12
+	vpalignr	ymm6,ymm6,ymm6,4
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm14,ymm14,ymm14,12
+	vpalignr	ymm5,ymm5,ymm5,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm12,ymm12,ymm12,12
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol16]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,20
+	vpslld	ymm7,ymm7,32-20
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,20
+	vpslld	ymm6,ymm6,32-20
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,20
+	vpslld	ymm5,ymm5,32-20
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,20
+	vpslld	ymm4,ymm4,32-20
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol8]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,25
+	vpslld	ymm7,ymm7,32-25
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,25
+	vpslld	ymm6,ymm6,32-25
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,25
+	vpslld	ymm5,ymm5,32-25
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,25
+	vpslld	ymm4,ymm4,32-25
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[((160+128))+rbp]
+	vpalignr	ymm7,ymm7,ymm7,12
+	vpalignr	ymm11,ymm11,ymm11,8
+	vpalignr	ymm15,ymm15,ymm15,4
+	vpalignr	ymm6,ymm6,ymm6,12
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm14,ymm14,ymm14,4
+	vpalignr	ymm5,ymm5,ymm5,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm4,ymm4,ymm4,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm12,ymm12,ymm12,4
+
+	dec	r10
+	jnz	NEAR $L$seal_avx2_init_rounds
+	vpaddd	ymm3,ymm3,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm7,ymm7,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm11,ymm11,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm15,ymm15,YMMWORD[((160+256))+rbp]
+	vpaddd	ymm2,ymm2,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm6,ymm6,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm10,ymm10,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm14,ymm14,YMMWORD[((160+224))+rbp]
+	vpaddd	ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm5,ymm5,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm9,ymm9,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm13,ymm13,YMMWORD[((160+192))+rbp]
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm8,ymm8,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+
+	vperm2i128	ymm11,ymm15,ymm11,0x13
+	vperm2i128	ymm15,ymm7,ymm3,0x02
+	vperm2i128	ymm3,ymm7,ymm3,0x13
+	vpand	ymm15,ymm15,YMMWORD[$L$clamp]
+	vmovdqa	YMMWORD[(160+0)+rbp],ymm15
+	mov	r8,r8
+	call	poly_hash_ad_internal
+
+	vpxor	ymm3,ymm3,YMMWORD[rsi]
+	vpxor	ymm11,ymm11,YMMWORD[32+rsi]
+	vmovdqu	YMMWORD[rdi],ymm3
+	vmovdqu	YMMWORD[32+rdi],ymm11
+	vperm2i128	ymm15,ymm6,ymm2,0x02
+	vperm2i128	ymm6,ymm6,ymm2,0x13
+	vperm2i128	ymm2,ymm14,ymm10,0x02
+	vperm2i128	ymm10,ymm14,ymm10,0x13
+	vpxor	ymm15,ymm15,YMMWORD[((0+64))+rsi]
+	vpxor	ymm2,ymm2,YMMWORD[((32+64))+rsi]
+	vpxor	ymm6,ymm6,YMMWORD[((64+64))+rsi]
+	vpxor	ymm10,ymm10,YMMWORD[((96+64))+rsi]
+	vmovdqu	YMMWORD[(0+64)+rdi],ymm15
+	vmovdqu	YMMWORD[(32+64)+rdi],ymm2
+	vmovdqu	YMMWORD[(64+64)+rdi],ymm6
+	vmovdqu	YMMWORD[(96+64)+rdi],ymm10
+	vperm2i128	ymm15,ymm5,ymm1,0x02
+	vperm2i128	ymm5,ymm5,ymm1,0x13
+	vperm2i128	ymm1,ymm13,ymm9,0x02
+	vperm2i128	ymm9,ymm13,ymm9,0x13
+	vpxor	ymm15,ymm15,YMMWORD[((0+192))+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[((32+192))+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[((64+192))+rsi]
+	vpxor	ymm9,ymm9,YMMWORD[((96+192))+rsi]
+	vmovdqu	YMMWORD[(0+192)+rdi],ymm15
+	vmovdqu	YMMWORD[(32+192)+rdi],ymm1
+	vmovdqu	YMMWORD[(64+192)+rdi],ymm5
+	vmovdqu	YMMWORD[(96+192)+rdi],ymm9
+	vperm2i128	ymm15,ymm4,ymm0,0x13
+	vperm2i128	ymm0,ymm4,ymm0,0x02
+	vperm2i128	ymm4,ymm12,ymm8,0x02
+	vperm2i128	ymm12,ymm12,ymm8,0x13
+	vmovdqa	ymm8,ymm15
+
+	lea	rsi,[320+rsi]
+	sub	rbx,10*32
+	mov	rcx,10*32
+	cmp	rbx,4*32
+	jbe	NEAR $L$seal_avx2_short_hash_remainder
+	vpxor	ymm0,ymm0,YMMWORD[rsi]
+	vpxor	ymm4,ymm4,YMMWORD[32+rsi]
+	vpxor	ymm8,ymm8,YMMWORD[64+rsi]
+	vpxor	ymm12,ymm12,YMMWORD[96+rsi]
+	vmovdqu	YMMWORD[320+rdi],ymm0
+	vmovdqu	YMMWORD[352+rdi],ymm4
+	vmovdqu	YMMWORD[384+rdi],ymm8
+	vmovdqu	YMMWORD[416+rdi],ymm12
+	lea	rsi,[128+rsi]
+	sub	rbx,4*32
+	mov	rcx,8
+	mov	r8,2
+	cmp	rbx,4*32
+	jbe	NEAR $L$seal_avx2_tail_128
+	cmp	rbx,8*32
+	jbe	NEAR $L$seal_avx2_tail_256
+	cmp	rbx,12*32
+	jbe	NEAR $L$seal_avx2_tail_384
+	cmp	rbx,16*32
+	jbe	NEAR $L$seal_avx2_tail_512
+	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
+	vmovdqa	ymm4,YMMWORD[((160+64))+rbp]
+	vmovdqa	ymm8,YMMWORD[((160+96))+rbp]
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm2,ymm0
+	vmovdqa	ymm6,ymm4
+	vmovdqa	ymm10,ymm8
+	vmovdqa	ymm3,ymm0
+	vmovdqa	ymm7,ymm4
+	vmovdqa	ymm11,ymm8
+	vmovdqa	ymm12,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm15,ymm12,YMMWORD[((160+160))+rbp]
+	vpaddd	ymm14,ymm12,ymm15
+	vpaddd	ymm13,ymm12,ymm14
+	vpaddd	ymm12,ymm12,ymm13
+	vmovdqa	YMMWORD[(160+256)+rbp],ymm15
+	vmovdqa	YMMWORD[(160+224)+rbp],ymm14
+	vmovdqa	YMMWORD[(160+192)+rbp],ymm13
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol16]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,20
+	vpslld	ymm7,ymm7,32-20
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,20
+	vpslld	ymm6,ymm6,32-20
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,20
+	vpslld	ymm5,ymm5,32-20
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,20
+	vpslld	ymm4,ymm4,32-20
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol8]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,25
+	vpslld	ymm7,ymm7,32-25
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,25
+	vpslld	ymm6,ymm6,32-25
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,25
+	vpslld	ymm5,ymm5,32-25
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,25
+	vpslld	ymm4,ymm4,32-25
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[((160+128))+rbp]
+	vpalignr	ymm7,ymm7,ymm7,4
+	vpalignr	ymm11,ymm11,ymm11,8
+	vpalignr	ymm15,ymm15,ymm15,12
+	vpalignr	ymm6,ymm6,ymm6,4
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm14,ymm14,ymm14,12
+	vpalignr	ymm5,ymm5,ymm5,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm12,ymm12,ymm12,12
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol16]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,20
+	vpslld	ymm7,ymm7,32-20
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,20
+	vpslld	ymm6,ymm6,32-20
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,20
+	vpslld	ymm5,ymm5,32-20
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,20
+	vpslld	ymm4,ymm4,32-20
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol8]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,25
+	vpslld	ymm7,ymm7,32-25
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,25
+	vpslld	ymm6,ymm6,32-25
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,25
+	vpslld	ymm5,ymm5,32-25
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,25
+	vpslld	ymm4,ymm4,32-25
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[((160+128))+rbp]
+	vpalignr	ymm7,ymm7,ymm7,12
+	vpalignr	ymm11,ymm11,ymm11,8
+	vpalignr	ymm15,ymm15,ymm15,4
+	vpalignr	ymm6,ymm6,ymm6,12
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm14,ymm14,ymm14,4
+	vpalignr	ymm5,ymm5,ymm5,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm4,ymm4,ymm4,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm12,ymm12,ymm12,4
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol16]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,20
+	vpslld	ymm7,ymm7,32-20
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,20
+	vpslld	ymm6,ymm6,32-20
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,20
+	vpslld	ymm5,ymm5,32-20
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,20
+	vpslld	ymm4,ymm4,32-20
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol8]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+
+	sub	rdi,16
+	mov	rcx,9
+	jmp	NEAR $L$seal_avx2_main_loop_rounds_entry
+ALIGN	32
+$L$seal_avx2_main_loop:
+	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
+	vmovdqa	ymm4,YMMWORD[((160+64))+rbp]
+	vmovdqa	ymm8,YMMWORD[((160+96))+rbp]
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm2,ymm0
+	vmovdqa	ymm6,ymm4
+	vmovdqa	ymm10,ymm8
+	vmovdqa	ymm3,ymm0
+	vmovdqa	ymm7,ymm4
+	vmovdqa	ymm11,ymm8
+	vmovdqa	ymm12,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm15,ymm12,YMMWORD[((160+160))+rbp]
+	vpaddd	ymm14,ymm12,ymm15
+	vpaddd	ymm13,ymm12,ymm14
+	vpaddd	ymm12,ymm12,ymm13
+	vmovdqa	YMMWORD[(160+256)+rbp],ymm15
+	vmovdqa	YMMWORD[(160+224)+rbp],ymm14
+	vmovdqa	YMMWORD[(160+192)+rbp],ymm13
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+
+	mov	rcx,10
+ALIGN	32
+$L$seal_avx2_main_loop_rounds:
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol16]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,20
+	vpslld	ymm7,ymm7,32-20
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,20
+	vpslld	ymm6,ymm6,32-20
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,20
+	vpslld	ymm5,ymm5,32-20
+	add	r15,rax
+	adc	r9,rdx
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,20
+	vpslld	ymm4,ymm4,32-20
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol8]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+$L$seal_avx2_main_loop_rounds_entry:
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	add	r10,QWORD[((0+16))+rdi]
+	adc	r11,QWORD[((8+16))+rdi]
+	adc	r12,1
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,25
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	vpslld	ymm7,ymm7,32-25
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,25
+	vpslld	ymm6,ymm6,32-25
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,25
+	vpslld	ymm5,ymm5,32-25
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,25
+	vpslld	ymm4,ymm4,32-25
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[((160+128))+rbp]
+	vpalignr	ymm7,ymm7,ymm7,4
+	vpalignr	ymm11,ymm11,ymm11,8
+	vpalignr	ymm15,ymm15,ymm15,12
+	vpalignr	ymm6,ymm6,ymm6,4
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm14,ymm14,ymm14,12
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	vpalignr	ymm5,ymm5,ymm5,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm12,ymm12,ymm12,12
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol16]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	add	r15,rax
+	adc	r9,rdx
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,20
+	vpslld	ymm7,ymm7,32-20
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,20
+	vpslld	ymm6,ymm6,32-20
+	vpxor	ymm6,ymm6,ymm8
+	add	r10,QWORD[((0+32))+rdi]
+	adc	r11,QWORD[((8+32))+rdi]
+	adc	r12,1
+
+	lea	rdi,[48+rdi]
+	vpsrld	ymm8,ymm5,20
+	vpslld	ymm5,ymm5,32-20
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,20
+	vpslld	ymm4,ymm4,32-20
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol8]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,25
+	vpslld	ymm7,ymm7,32-25
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,25
+	vpslld	ymm6,ymm6,32-25
+	vpxor	ymm6,ymm6,ymm8
+	add	r15,rax
+	adc	r9,rdx
+	vpsrld	ymm8,ymm5,25
+	vpslld	ymm5,ymm5,32-25
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,25
+	vpslld	ymm4,ymm4,32-25
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[((160+128))+rbp]
+	vpalignr	ymm7,ymm7,ymm7,12
+	vpalignr	ymm11,ymm11,ymm11,8
+	vpalignr	ymm15,ymm15,ymm15,4
+	vpalignr	ymm6,ymm6,ymm6,12
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm14,ymm14,ymm14,4
+	vpalignr	ymm5,ymm5,ymm5,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm4,ymm4,ymm4,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	vpalignr	ymm12,ymm12,ymm12,4
+
+	dec	rcx
+	jne	NEAR $L$seal_avx2_main_loop_rounds
+	vpaddd	ymm3,ymm3,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm7,ymm7,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm11,ymm11,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm15,ymm15,YMMWORD[((160+256))+rbp]
+	vpaddd	ymm2,ymm2,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm6,ymm6,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm10,ymm10,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm14,ymm14,YMMWORD[((160+224))+rbp]
+	vpaddd	ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm5,ymm5,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm9,ymm9,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm13,ymm13,YMMWORD[((160+192))+rbp]
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm8,ymm8,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm0
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	add	r10,QWORD[((0+16))+rdi]
+	adc	r11,QWORD[((8+16))+rdi]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[32+rdi]
+	vperm2i128	ymm0,ymm7,ymm3,0x02
+	vperm2i128	ymm7,ymm7,ymm3,0x13
+	vperm2i128	ymm3,ymm15,ymm11,0x02
+	vperm2i128	ymm11,ymm15,ymm11,0x13
+	vpxor	ymm0,ymm0,YMMWORD[((0+0))+rsi]
+	vpxor	ymm3,ymm3,YMMWORD[((32+0))+rsi]
+	vpxor	ymm7,ymm7,YMMWORD[((64+0))+rsi]
+	vpxor	ymm11,ymm11,YMMWORD[((96+0))+rsi]
+	vmovdqu	YMMWORD[(0+0)+rdi],ymm0
+	vmovdqu	YMMWORD[(32+0)+rdi],ymm3
+	vmovdqu	YMMWORD[(64+0)+rdi],ymm7
+	vmovdqu	YMMWORD[(96+0)+rdi],ymm11
+
+	vmovdqa	ymm0,YMMWORD[((160+128))+rbp]
+	vperm2i128	ymm3,ymm6,ymm2,0x02
+	vperm2i128	ymm6,ymm6,ymm2,0x13
+	vperm2i128	ymm2,ymm14,ymm10,0x02
+	vperm2i128	ymm10,ymm14,ymm10,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+128))+rsi]
+	vpxor	ymm2,ymm2,YMMWORD[((32+128))+rsi]
+	vpxor	ymm6,ymm6,YMMWORD[((64+128))+rsi]
+	vpxor	ymm10,ymm10,YMMWORD[((96+128))+rsi]
+	vmovdqu	YMMWORD[(0+128)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+128)+rdi],ymm2
+	vmovdqu	YMMWORD[(64+128)+rdi],ymm6
+	vmovdqu	YMMWORD[(96+128)+rdi],ymm10
+	vperm2i128	ymm3,ymm5,ymm1,0x02
+	vperm2i128	ymm5,ymm5,ymm1,0x13
+	vperm2i128	ymm1,ymm13,ymm9,0x02
+	vperm2i128	ymm9,ymm13,ymm9,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+256))+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[((32+256))+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[((64+256))+rsi]
+	vpxor	ymm9,ymm9,YMMWORD[((96+256))+rsi]
+	vmovdqu	YMMWORD[(0+256)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+256)+rdi],ymm1
+	vmovdqu	YMMWORD[(64+256)+rdi],ymm5
+	vmovdqu	YMMWORD[(96+256)+rdi],ymm9
+	vperm2i128	ymm3,ymm4,ymm0,0x02
+	vperm2i128	ymm4,ymm4,ymm0,0x13
+	vperm2i128	ymm0,ymm12,ymm8,0x02
+	vperm2i128	ymm8,ymm12,ymm8,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+384))+rsi]
+	vpxor	ymm0,ymm0,YMMWORD[((32+384))+rsi]
+	vpxor	ymm4,ymm4,YMMWORD[((64+384))+rsi]
+	vpxor	ymm8,ymm8,YMMWORD[((96+384))+rsi]
+	vmovdqu	YMMWORD[(0+384)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+384)+rdi],ymm0
+	vmovdqu	YMMWORD[(64+384)+rdi],ymm4
+	vmovdqu	YMMWORD[(96+384)+rdi],ymm8
+
+	lea	rsi,[512+rsi]
+	sub	rbx,16*32
+	cmp	rbx,16*32
+	jg	NEAR $L$seal_avx2_main_loop
+
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	add	r10,QWORD[((0+16))+rdi]
+	adc	r11,QWORD[((8+16))+rdi]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[32+rdi]
+	mov	rcx,10
+	xor	r8,r8
+
+	cmp	rbx,12*32
+	ja	NEAR $L$seal_avx2_tail_512
+	cmp	rbx,8*32
+	ja	NEAR $L$seal_avx2_tail_384
+	cmp	rbx,4*32
+	ja	NEAR $L$seal_avx2_tail_256
+
+$L$seal_avx2_tail_128:
+	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
+	vmovdqa	ymm4,YMMWORD[((160+64))+rbp]
+	vmovdqa	ymm8,YMMWORD[((160+96))+rbp]
+	vmovdqa	ymm12,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+
+$L$seal_avx2_tail_128_rounds_and_3xhash:
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[16+rdi]
+$L$seal_avx2_tail_128_rounds_and_2xhash:
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,4
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,12
+	add	r10,QWORD[((0+16))+rdi]
+	adc	r11,QWORD[((8+16))+rdi]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[32+rdi]
+	dec	rcx
+	jg	NEAR $L$seal_avx2_tail_128_rounds_and_3xhash
+	dec	r8
+	jge	NEAR $L$seal_avx2_tail_128_rounds_and_2xhash
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm8,ymm8,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+	vperm2i128	ymm3,ymm4,ymm0,0x13
+	vperm2i128	ymm0,ymm4,ymm0,0x02
+	vperm2i128	ymm4,ymm12,ymm8,0x02
+	vperm2i128	ymm12,ymm12,ymm8,0x13
+	vmovdqa	ymm8,ymm3
+
+	jmp	NEAR $L$seal_avx2_short_loop
+
+$L$seal_avx2_tail_256:
+	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
+	vmovdqa	ymm4,YMMWORD[((160+64))+rbp]
+	vmovdqa	ymm8,YMMWORD[((160+96))+rbp]
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm12,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm13,ymm12,YMMWORD[((160+160))+rbp]
+	vpaddd	ymm12,ymm12,ymm13
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+	vmovdqa	YMMWORD[(160+192)+rbp],ymm13
+
+$L$seal_avx2_tail_256_rounds_and_3xhash:
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[16+rdi]
+$L$seal_avx2_tail_256_rounds_and_2xhash:
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,4
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,12
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,12
+	add	r10,QWORD[((0+16))+rdi]
+	adc	r11,QWORD[((8+16))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[32+rdi]
+	dec	rcx
+	jg	NEAR $L$seal_avx2_tail_256_rounds_and_3xhash
+	dec	r8
+	jge	NEAR $L$seal_avx2_tail_256_rounds_and_2xhash
+	vpaddd	ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm5,ymm5,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm9,ymm9,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm13,ymm13,YMMWORD[((160+192))+rbp]
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm8,ymm8,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+	vperm2i128	ymm3,ymm5,ymm1,0x02
+	vperm2i128	ymm5,ymm5,ymm1,0x13
+	vperm2i128	ymm1,ymm13,ymm9,0x02
+	vperm2i128	ymm9,ymm13,ymm9,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+0))+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[((32+0))+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[((64+0))+rsi]
+	vpxor	ymm9,ymm9,YMMWORD[((96+0))+rsi]
+	vmovdqu	YMMWORD[(0+0)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+0)+rdi],ymm1
+	vmovdqu	YMMWORD[(64+0)+rdi],ymm5
+	vmovdqu	YMMWORD[(96+0)+rdi],ymm9
+	vperm2i128	ymm3,ymm4,ymm0,0x13
+	vperm2i128	ymm0,ymm4,ymm0,0x02
+	vperm2i128	ymm4,ymm12,ymm8,0x02
+	vperm2i128	ymm12,ymm12,ymm8,0x13
+	vmovdqa	ymm8,ymm3
+
+	mov	rcx,4*32
+	lea	rsi,[128+rsi]
+	sub	rbx,4*32
+	jmp	NEAR $L$seal_avx2_short_hash_remainder
+
+$L$seal_avx2_tail_384:
+	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
+	vmovdqa	ymm4,YMMWORD[((160+64))+rbp]
+	vmovdqa	ymm8,YMMWORD[((160+96))+rbp]
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm2,ymm0
+	vmovdqa	ymm6,ymm4
+	vmovdqa	ymm10,ymm8
+	vmovdqa	ymm12,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm14,ymm12,YMMWORD[((160+160))+rbp]
+	vpaddd	ymm13,ymm12,ymm14
+	vpaddd	ymm12,ymm12,ymm13
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+	vmovdqa	YMMWORD[(160+192)+rbp],ymm13
+	vmovdqa	YMMWORD[(160+224)+rbp],ymm14
+
+$L$seal_avx2_tail_384_rounds_and_3xhash:
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[16+rdi]
+$L$seal_avx2_tail_384_rounds_and_2xhash:
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,4
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol16]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpsrld	ymm3,ymm6,20
+	vpslld	ymm6,ymm6,12
+	vpxor	ymm6,ymm6,ymm3
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol8]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpslld	ymm3,ymm6,7
+	vpsrld	ymm6,ymm6,25
+	vpxor	ymm6,ymm6,ymm3
+	vpalignr	ymm14,ymm14,ymm14,12
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm6,ymm6,ymm6,4
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,12
+	add	r10,QWORD[((0+16))+rdi]
+	adc	r11,QWORD[((8+16))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,12
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol16]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpsrld	ymm3,ymm6,20
+	vpslld	ymm6,ymm6,12
+	vpxor	ymm6,ymm6,ymm3
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol8]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpslld	ymm3,ymm6,7
+	vpsrld	ymm6,ymm6,25
+	vpxor	ymm6,ymm6,ymm3
+	vpalignr	ymm14,ymm14,ymm14,4
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm6,ymm6,ymm6,12
+
+	lea	rdi,[32+rdi]
+	dec	rcx
+	jg	NEAR $L$seal_avx2_tail_384_rounds_and_3xhash
+	dec	r8
+	jge	NEAR $L$seal_avx2_tail_384_rounds_and_2xhash
+	vpaddd	ymm2,ymm2,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm6,ymm6,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm10,ymm10,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm14,ymm14,YMMWORD[((160+224))+rbp]
+	vpaddd	ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm5,ymm5,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm9,ymm9,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm13,ymm13,YMMWORD[((160+192))+rbp]
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm8,ymm8,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+	vperm2i128	ymm3,ymm6,ymm2,0x02
+	vperm2i128	ymm6,ymm6,ymm2,0x13
+	vperm2i128	ymm2,ymm14,ymm10,0x02
+	vperm2i128	ymm10,ymm14,ymm10,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+0))+rsi]
+	vpxor	ymm2,ymm2,YMMWORD[((32+0))+rsi]
+	vpxor	ymm6,ymm6,YMMWORD[((64+0))+rsi]
+	vpxor	ymm10,ymm10,YMMWORD[((96+0))+rsi]
+	vmovdqu	YMMWORD[(0+0)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+0)+rdi],ymm2
+	vmovdqu	YMMWORD[(64+0)+rdi],ymm6
+	vmovdqu	YMMWORD[(96+0)+rdi],ymm10
+	vperm2i128	ymm3,ymm5,ymm1,0x02
+	vperm2i128	ymm5,ymm5,ymm1,0x13
+	vperm2i128	ymm1,ymm13,ymm9,0x02
+	vperm2i128	ymm9,ymm13,ymm9,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+128))+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[((32+128))+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[((64+128))+rsi]
+	vpxor	ymm9,ymm9,YMMWORD[((96+128))+rsi]
+	vmovdqu	YMMWORD[(0+128)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+128)+rdi],ymm1
+	vmovdqu	YMMWORD[(64+128)+rdi],ymm5
+	vmovdqu	YMMWORD[(96+128)+rdi],ymm9
+	vperm2i128	ymm3,ymm4,ymm0,0x13
+	vperm2i128	ymm0,ymm4,ymm0,0x02
+	vperm2i128	ymm4,ymm12,ymm8,0x02
+	vperm2i128	ymm12,ymm12,ymm8,0x13
+	vmovdqa	ymm8,ymm3
+
+	mov	rcx,8*32
+	lea	rsi,[256+rsi]
+	sub	rbx,8*32
+	jmp	NEAR $L$seal_avx2_short_hash_remainder
+
+$L$seal_avx2_tail_512:
+	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
+	vmovdqa	ymm4,YMMWORD[((160+64))+rbp]
+	vmovdqa	ymm8,YMMWORD[((160+96))+rbp]
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm2,ymm0
+	vmovdqa	ymm6,ymm4
+	vmovdqa	ymm10,ymm8
+	vmovdqa	ymm3,ymm0
+	vmovdqa	ymm7,ymm4
+	vmovdqa	ymm11,ymm8
+	vmovdqa	ymm12,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm15,ymm12,YMMWORD[((160+160))+rbp]
+	vpaddd	ymm14,ymm12,ymm15
+	vpaddd	ymm13,ymm12,ymm14
+	vpaddd	ymm12,ymm12,ymm13
+	vmovdqa	YMMWORD[(160+256)+rbp],ymm15
+	vmovdqa	YMMWORD[(160+224)+rbp],ymm14
+	vmovdqa	YMMWORD[(160+192)+rbp],ymm13
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+
+$L$seal_avx2_tail_512_rounds_and_3xhash:
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[16+rdi]
+$L$seal_avx2_tail_512_rounds_and_2xhash:
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol16]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,20
+	vpslld	ymm7,ymm7,32-20
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,20
+	vpslld	ymm6,ymm6,32-20
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,20
+	vpslld	ymm5,ymm5,32-20
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,20
+	vpslld	ymm4,ymm4,32-20
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol8]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,25
+	vpslld	ymm7,ymm7,32-25
+	vpxor	ymm7,ymm7,ymm8
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	vpsrld	ymm8,ymm6,25
+	vpslld	ymm6,ymm6,32-25
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,25
+	vpslld	ymm5,ymm5,32-25
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,25
+	vpslld	ymm4,ymm4,32-25
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[((160+128))+rbp]
+	vpalignr	ymm7,ymm7,ymm7,4
+	vpalignr	ymm11,ymm11,ymm11,8
+	vpalignr	ymm15,ymm15,ymm15,12
+	vpalignr	ymm6,ymm6,ymm6,4
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm14,ymm14,ymm14,12
+	vpalignr	ymm5,ymm5,ymm5,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm4,ymm4,ymm4,4
+	add	r15,rax
+	adc	r9,rdx
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm12,ymm12,ymm12,12
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol16]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,20
+	vpslld	ymm7,ymm7,32-20
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,20
+	vpslld	ymm6,ymm6,32-20
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,20
+	vpslld	ymm5,ymm5,32-20
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,20
+	vpslld	ymm4,ymm4,32-20
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol8]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	add	r10,QWORD[((0+16))+rdi]
+	adc	r11,QWORD[((8+16))+rdi]
+	adc	r12,1
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,25
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	vpslld	ymm7,ymm7,32-25
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,25
+	vpslld	ymm6,ymm6,32-25
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,25
+	vpslld	ymm5,ymm5,32-25
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,25
+	vpslld	ymm4,ymm4,32-25
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[((160+128))+rbp]
+	vpalignr	ymm7,ymm7,ymm7,12
+	vpalignr	ymm11,ymm11,ymm11,8
+	vpalignr	ymm15,ymm15,ymm15,4
+	vpalignr	ymm6,ymm6,ymm6,12
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm14,ymm14,ymm14,4
+	vpalignr	ymm5,ymm5,ymm5,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm4,ymm4,ymm4,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm12,ymm12,ymm12,4
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	add	r15,rax
+	adc	r9,rdx
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[32+rdi]
+	dec	rcx
+	jg	NEAR $L$seal_avx2_tail_512_rounds_and_3xhash
+	dec	r8
+	jge	NEAR $L$seal_avx2_tail_512_rounds_and_2xhash
+	vpaddd	ymm3,ymm3,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm7,ymm7,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm11,ymm11,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm15,ymm15,YMMWORD[((160+256))+rbp]
+	vpaddd	ymm2,ymm2,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm6,ymm6,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm10,ymm10,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm14,ymm14,YMMWORD[((160+224))+rbp]
+	vpaddd	ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm5,ymm5,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm9,ymm9,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm13,ymm13,YMMWORD[((160+192))+rbp]
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm8,ymm8,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm0
+	vperm2i128	ymm0,ymm7,ymm3,0x02
+	vperm2i128	ymm7,ymm7,ymm3,0x13
+	vperm2i128	ymm3,ymm15,ymm11,0x02
+	vperm2i128	ymm11,ymm15,ymm11,0x13
+	vpxor	ymm0,ymm0,YMMWORD[((0+0))+rsi]
+	vpxor	ymm3,ymm3,YMMWORD[((32+0))+rsi]
+	vpxor	ymm7,ymm7,YMMWORD[((64+0))+rsi]
+	vpxor	ymm11,ymm11,YMMWORD[((96+0))+rsi]
+	vmovdqu	YMMWORD[(0+0)+rdi],ymm0
+	vmovdqu	YMMWORD[(32+0)+rdi],ymm3
+	vmovdqu	YMMWORD[(64+0)+rdi],ymm7
+	vmovdqu	YMMWORD[(96+0)+rdi],ymm11
+
+	vmovdqa	ymm0,YMMWORD[((160+128))+rbp]
+	vperm2i128	ymm3,ymm6,ymm2,0x02
+	vperm2i128	ymm6,ymm6,ymm2,0x13
+	vperm2i128	ymm2,ymm14,ymm10,0x02
+	vperm2i128	ymm10,ymm14,ymm10,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+128))+rsi]
+	vpxor	ymm2,ymm2,YMMWORD[((32+128))+rsi]
+	vpxor	ymm6,ymm6,YMMWORD[((64+128))+rsi]
+	vpxor	ymm10,ymm10,YMMWORD[((96+128))+rsi]
+	vmovdqu	YMMWORD[(0+128)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+128)+rdi],ymm2
+	vmovdqu	YMMWORD[(64+128)+rdi],ymm6
+	vmovdqu	YMMWORD[(96+128)+rdi],ymm10
+	vperm2i128	ymm3,ymm5,ymm1,0x02
+	vperm2i128	ymm5,ymm5,ymm1,0x13
+	vperm2i128	ymm1,ymm13,ymm9,0x02
+	vperm2i128	ymm9,ymm13,ymm9,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+256))+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[((32+256))+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[((64+256))+rsi]
+	vpxor	ymm9,ymm9,YMMWORD[((96+256))+rsi]
+	vmovdqu	YMMWORD[(0+256)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+256)+rdi],ymm1
+	vmovdqu	YMMWORD[(64+256)+rdi],ymm5
+	vmovdqu	YMMWORD[(96+256)+rdi],ymm9
+	vperm2i128	ymm3,ymm4,ymm0,0x13
+	vperm2i128	ymm0,ymm4,ymm0,0x02
+	vperm2i128	ymm4,ymm12,ymm8,0x02
+	vperm2i128	ymm12,ymm12,ymm8,0x13
+	vmovdqa	ymm8,ymm3
+
+	mov	rcx,12*32
+	lea	rsi,[384+rsi]
+	sub	rbx,12*32
+	jmp	NEAR $L$seal_avx2_short_hash_remainder
+
+$L$seal_avx2_320:
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm2,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm6,ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm10,ymm8
+	vpaddd	ymm13,ymm12,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm14,ymm13,YMMWORD[$L$avx2_inc]
+	vmovdqa	ymm7,ymm4
+	vmovdqa	ymm11,ymm8
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+	vmovdqa	YMMWORD[(160+192)+rbp],ymm13
+	vmovdqa	YMMWORD[(160+224)+rbp],ymm14
+	mov	r10,10
+$L$seal_avx2_320_rounds:
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,4
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol16]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpsrld	ymm3,ymm6,20
+	vpslld	ymm6,ymm6,12
+	vpxor	ymm6,ymm6,ymm3
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol8]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpslld	ymm3,ymm6,7
+	vpsrld	ymm6,ymm6,25
+	vpxor	ymm6,ymm6,ymm3
+	vpalignr	ymm14,ymm14,ymm14,12
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm6,ymm6,ymm6,4
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,12
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,12
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol16]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpsrld	ymm3,ymm6,20
+	vpslld	ymm6,ymm6,12
+	vpxor	ymm6,ymm6,ymm3
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol8]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpslld	ymm3,ymm6,7
+	vpsrld	ymm6,ymm6,25
+	vpxor	ymm6,ymm6,ymm3
+	vpalignr	ymm14,ymm14,ymm14,4
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm6,ymm6,ymm6,12
+
+	dec	r10
+	jne	NEAR $L$seal_avx2_320_rounds
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm2,ymm2,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,ymm7
+	vpaddd	ymm5,ymm5,ymm7
+	vpaddd	ymm6,ymm6,ymm7
+	vpaddd	ymm8,ymm8,ymm11
+	vpaddd	ymm9,ymm9,ymm11
+	vpaddd	ymm10,ymm10,ymm11
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+	vpaddd	ymm13,ymm13,YMMWORD[((160+192))+rbp]
+	vpaddd	ymm14,ymm14,YMMWORD[((160+224))+rbp]
+	vperm2i128	ymm3,ymm4,ymm0,0x02
+
+	vpand	ymm3,ymm3,YMMWORD[$L$clamp]
+	vmovdqa	YMMWORD[(160+0)+rbp],ymm3
+
+	vperm2i128	ymm0,ymm4,ymm0,0x13
+	vperm2i128	ymm4,ymm12,ymm8,0x13
+	vperm2i128	ymm8,ymm5,ymm1,0x02
+	vperm2i128	ymm12,ymm13,ymm9,0x02
+	vperm2i128	ymm1,ymm5,ymm1,0x13
+	vperm2i128	ymm5,ymm13,ymm9,0x13
+	vperm2i128	ymm9,ymm6,ymm2,0x02
+	vperm2i128	ymm13,ymm14,ymm10,0x02
+	vperm2i128	ymm2,ymm6,ymm2,0x13
+	vperm2i128	ymm6,ymm14,ymm10,0x13
+	jmp	NEAR $L$seal_avx2_short
+
+$L$seal_avx2_192:
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm2,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm6,ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm10,ymm8
+	vpaddd	ymm13,ymm12,YMMWORD[$L$avx2_inc]
+	vmovdqa	ymm11,ymm12
+	vmovdqa	ymm15,ymm13
+	mov	r10,10
+$L$seal_avx2_192_rounds:
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,4
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,12
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,12
+
+	dec	r10
+	jne	NEAR $L$seal_avx2_192_rounds
+	vpaddd	ymm0,ymm0,ymm2
+	vpaddd	ymm1,ymm1,ymm2
+	vpaddd	ymm4,ymm4,ymm6
+	vpaddd	ymm5,ymm5,ymm6
+	vpaddd	ymm8,ymm8,ymm10
+	vpaddd	ymm9,ymm9,ymm10
+	vpaddd	ymm12,ymm12,ymm11
+	vpaddd	ymm13,ymm13,ymm15
+	vperm2i128	ymm3,ymm4,ymm0,0x02
+
+	vpand	ymm3,ymm3,YMMWORD[$L$clamp]
+	vmovdqa	YMMWORD[(160+0)+rbp],ymm3
+
+	vperm2i128	ymm0,ymm4,ymm0,0x13
+	vperm2i128	ymm4,ymm12,ymm8,0x13
+	vperm2i128	ymm8,ymm5,ymm1,0x02
+	vperm2i128	ymm12,ymm13,ymm9,0x02
+	vperm2i128	ymm1,ymm5,ymm1,0x13
+	vperm2i128	ymm5,ymm13,ymm9,0x13
+$L$seal_avx2_short:
+	mov	r8,r8
+	call	poly_hash_ad_internal
+	xor	rcx,rcx
+$L$seal_avx2_short_hash_remainder:
+	cmp	rcx,16
+	jb	NEAR $L$seal_avx2_short_loop
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	sub	rcx,16
+	add	rdi,16
+	jmp	NEAR $L$seal_avx2_short_hash_remainder
+$L$seal_avx2_short_loop:
+	cmp	rbx,32
+	jb	NEAR $L$seal_avx2_short_tail
+	sub	rbx,32
+
+	vpxor	ymm0,ymm0,YMMWORD[rsi]
+	vmovdqu	YMMWORD[rdi],ymm0
+	lea	rsi,[32+rsi]
+
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	add	r10,QWORD[((0+16))+rdi]
+	adc	r11,QWORD[((8+16))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[32+rdi]
+
+	vmovdqa	ymm0,ymm4
+	vmovdqa	ymm4,ymm8
+	vmovdqa	ymm8,ymm12
+	vmovdqa	ymm12,ymm1
+	vmovdqa	ymm1,ymm5
+	vmovdqa	ymm5,ymm9
+	vmovdqa	ymm9,ymm13
+	vmovdqa	ymm13,ymm2
+	vmovdqa	ymm2,ymm6
+	jmp	NEAR $L$seal_avx2_short_loop
+$L$seal_avx2_short_tail:
+	cmp	rbx,16
+	jb	NEAR $L$seal_avx2_exit
+	sub	rbx,16
+	vpxor	xmm3,xmm0,XMMWORD[rsi]
+	vmovdqu	XMMWORD[rdi],xmm3
+	lea	rsi,[16+rsi]
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[16+rdi]
+	vextracti128	xmm0,ymm0,1
+$L$seal_avx2_exit:
+	vzeroupper
+	jmp	NEAR $L$seal_sse_tail_16
+
+
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
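
The sealing loop above interleaves two primitives. The vpaddd/vpxor/vpshufb/vpalignr runs are the ChaCha20 quarter round applied across YMM lanes (the `mov r10,10` counter gives ten column-plus-diagonal passes, i.e. 20 rounds), and the repeated mul/imul/shrd block is one scalar Poly1305 step, with the accumulator held in r10/r11/r12 and the clamped key loaded from [rbp + 160]. A rough Go model of what each iteration computes follows; the function names and limb layout are illustrative, not the code the Perl script emits.

package sketch

import "math/bits"

// quarterRound is the scalar form of the ChaCha20 quarter round that the
// vector code above runs eight lanes at a time (a, b, c, d correspond to
// the ymm0/ymm4/ymm8/ymm12 and ymm1/ymm5/ymm9/ymm13 groups; the vpalignr
// triples re-diagonalize the state between passes).
func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
	a += b
	d = bits.RotateLeft32(d^a, 16) // the vpshufb with $L$rol16
	c += d
	b = bits.RotateLeft32(b^c, 12) // the vpsrld 20 / vpslld 12 pair
	a += b
	d = bits.RotateLeft32(d^a, 8) // the vpshufb with $L$rol8
	c += d
	b = bits.RotateLeft32(b^c, 7) // the vpslld 7 / vpsrld 25 pair
	return a, b, c, d
}

// polyStep models one iteration of the Poly1305 hash blocks above:
// h = (h + m + 2^128) * r mod 2^130 - 5, with h in three 64-bit limbs.
func polyStep(h *[3]uint64, m [2]uint64, r [2]uint64) {
	// h += m, plus the implicit high bit of a full 16-byte block
	// (the "adc r12,1" above).
	var c uint64
	h[0], c = bits.Add64(h[0], m[0], 0)
	h[1], c = bits.Add64(h[1], m[1], c)
	h[2] += c + 1

	// Schoolbook multiply of h (3 limbs) by r (2 limbs), mirroring the
	// mul/imul chain. Clamping keeps r's limbs below 2^60, so the
	// uncarried additions below cannot overflow.
	hi00, t0 := bits.Mul64(r[0], h[0])
	hi01, lo01 := bits.Mul64(r[0], h[1])
	t1, cc := bits.Add64(hi00, lo01, 0)
	t2 := r[0]*h[2] + hi01 + cc

	hi10, lo10 := bits.Mul64(r[1], h[0])
	t1, cc = bits.Add64(t1, lo10, 0)
	hi10 += cc
	hi11, lo11 := bits.Mul64(r[1], h[1])
	t2, cc = bits.Add64(t2, lo11, 0)
	t3 := r[1]*h[2] + hi11 + cc
	t2, cc = bits.Add64(t2, hi10, 0)
	t3 += cc

	// Reduce mod 2^130 - 5: keep the low 130 bits and fold everything
	// above back in times 5, computed as 4*c + c exactly like the
	// and -4 / shrd 2 pair in the assembly.
	h[0], h[1], h[2] = t0, t1, t2&3
	lo, carry := bits.Add64(t2 & ^uint64(3), t2>>2|t3<<62, 0)
	hi := t3 + t3>>2 + carry
	h[0], carry = bits.Add64(h[0], lo, 0)
	h[1], carry = bits.Add64(h[1], hi, carry)
	h[2] += carry
}
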
diff --git a/gen/crypto/err_data.c b/gen/crypto/err_data.c
new file mode 100644
index 0000000..898825a
--- /dev/null
+++ b/gen/crypto/err_data.c
@@ -0,0 +1,1512 @@
+/* Copyright (c) 2015, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+ /* This file was generated by go run ./util/pregenerate. */
+
+#include <openssl/base.h>
+#include <openssl/err.h>
+
+#include <assert.h>
+
+static_assert(ERR_LIB_NONE == 1, "library value changed");
+static_assert(ERR_LIB_SYS == 2, "library value changed");
+static_assert(ERR_LIB_BN == 3, "library value changed");
+static_assert(ERR_LIB_RSA == 4, "library value changed");
+static_assert(ERR_LIB_DH == 5, "library value changed");
+static_assert(ERR_LIB_EVP == 6, "library value changed");
+static_assert(ERR_LIB_BUF == 7, "library value changed");
+static_assert(ERR_LIB_OBJ == 8, "library value changed");
+static_assert(ERR_LIB_PEM == 9, "library value changed");
+static_assert(ERR_LIB_DSA == 10, "library value changed");
+static_assert(ERR_LIB_X509 == 11, "library value changed");
+static_assert(ERR_LIB_ASN1 == 12, "library value changed");
+static_assert(ERR_LIB_CONF == 13, "library value changed");
+static_assert(ERR_LIB_CRYPTO == 14, "library value changed");
+static_assert(ERR_LIB_EC == 15, "library value changed");
+static_assert(ERR_LIB_SSL == 16, "library value changed");
+static_assert(ERR_LIB_BIO == 17, "library value changed");
+static_assert(ERR_LIB_PKCS7 == 18, "library value changed");
+static_assert(ERR_LIB_PKCS8 == 19, "library value changed");
+static_assert(ERR_LIB_X509V3 == 20, "library value changed");
+static_assert(ERR_LIB_RAND == 21, "library value changed");
+static_assert(ERR_LIB_ENGINE == 22, "library value changed");
+static_assert(ERR_LIB_OCSP == 23, "library value changed");
+static_assert(ERR_LIB_UI == 24, "library value changed");
+static_assert(ERR_LIB_COMP == 25, "library value changed");
+static_assert(ERR_LIB_ECDSA == 26, "library value changed");
+static_assert(ERR_LIB_ECDH == 27, "library value changed");
+static_assert(ERR_LIB_HMAC == 28, "library value changed");
+static_assert(ERR_LIB_DIGEST == 29, "library value changed");
+static_assert(ERR_LIB_CIPHER == 30, "library value changed");
+static_assert(ERR_LIB_HKDF == 31, "library value changed");
+static_assert(ERR_LIB_TRUST_TOKEN == 32, "library value changed");
+static_assert(ERR_LIB_USER == 33, "library value changed");
+static_assert(ERR_NUM_LIBS == 34, "number of libraries changed");
+
+const uint32_t kOpenSSLReasonValues[] = {
+    0xc320885,
+    0xc32889f,
+    0xc3308ae,
+    0xc3388be,
+    0xc3408cd,
+    0xc3488e6,
+    0xc3508f2,
+    0xc35890f,
+    0xc36092f,
+    0xc36893d,
+    0xc37094d,
+    0xc37895a,
+    0xc38096a,
+    0xc388975,
+    0xc39098b,
+    0xc39899a,
+    0xc3a09ae,
+    0xc3a8892,
+    0xc3b00f7,
+    0xc3b8921,
+    0x10320892,
+    0x10329641,
+    0x1033164d,
+    0x10339666,
+    0x10341679,
+    0x10348d4e,
+    0x10350cdf,
+    0x1035968c,
+    0x103616b6,
+    0x103696c9,
+    0x103716e8,
+    0x10379701,
+    0x10381716,
+    0x10389734,
+    0x10391743,
+    0x1039975f,
+    0x103a177a,
+    0x103a9789,
+    0x103b17a5,
+    0x103b97c0,
+    0x103c17e6,
+    0x103c80f7,
+    0x103d17f7,
+    0x103d980b,
+    0x103e182a,
+    0x103e9839,
+    0x103f1850,
+    0x103f9863,
+    0x10400ca3,
+    0x10409876,
+    0x10411894,
+    0x104198a7,
+    0x104218c1,
+    0x104298d1,
+    0x104318e5,
+    0x104398fb,
+    0x10441913,
+    0x10449928,
+    0x1045193c,
+    0x1045994e,
+    0x10460635,
+    0x1046899a,
+    0x10471963,
+    0x1047997a,
+    0x1048198f,
+    0x1048999d,
+    0x10490f57,
+    0x104997d7,
+    0x104a16a1,
+    0x14320c73,
+    0x14328c94,
+    0x14330ca3,
+    0x14338cb5,
+    0x143400b9,
+    0x143480f7,
+    0x14350c81,
+    0x18320090,
+    0x18328fe9,
+    0x183300b9,
+    0x18338fff,
+    0x18341013,
+    0x183480f7,
+    0x18351032,
+    0x1835904a,
+    0x1836105f,
+    0x18369073,
+    0x183710ab,
+    0x183790c1,
+    0x183810d5,
+    0x183890e5,
+    0x18390ac0,
+    0x183990f5,
+    0x183a111b,
+    0x183a9141,
+    0x183b0ceb,
+    0x183b9190,
+    0x183c11a2,
+    0x183c91ad,
+    0x183d11bd,
+    0x183d91ce,
+    0x183e11df,
+    0x183e91f1,
+    0x183f121a,
+    0x183f9233,
+    0x1840124b,
+    0x1840870d,
+    0x18411164,
+    0x1841912f,
+    0x1842114e,
+    0x18428c81,
+    0x1843110a,
+    0x18439176,
+    0x18441028,
+    0x18449097,
+    0x20321285,
+    0x20329272,
+    0x24321291,
+    0x243289e0,
+    0x243312a3,
+    0x243392b0,
+    0x243412bd,
+    0x243492cf,
+    0x243512de,
+    0x243592fb,
+    0x24361308,
+    0x24369316,
+    0x24371324,
+    0x24379332,
+    0x2438133b,
+    0x24389348,
+    0x2439135b,
+    0x28320cd3,
+    0x28328ceb,
+    0x28330ca3,
+    0x28338cfe,
+    0x28340cdf,
+    0x283480b9,
+    0x283500f7,
+    0x28358c81,
+    0x2836099a,
+    0x2c3232e7,
+    0x2c329372,
+    0x2c3332f5,
+    0x2c33b307,
+    0x2c34331b,
+    0x2c34b32d,
+    0x2c353348,
+    0x2c35b35a,
+    0x2c36338a,
+    0x2c36833a,
+    0x2c373397,
+    0x2c37b3c3,
+    0x2c383401,
+    0x2c38b418,
+    0x2c393436,
+    0x2c39b446,
+    0x2c3a3458,
+    0x2c3ab46c,
+    0x2c3b347d,
+    0x2c3bb49c,
+    0x2c3c1384,
+    0x2c3c939a,
+    0x2c3d34e1,
+    0x2c3d93b3,
+    0x2c3e350b,
+    0x2c3eb519,
+    0x2c3f3531,
+    0x2c3fb549,
+    0x2c403573,
+    0x2c409285,
+    0x2c413584,
+    0x2c41b597,
+    0x2c42124b,
+    0x2c42b5a8,
+    0x2c43076d,
+    0x2c43b48e,
+    0x2c4433d6,
+    0x2c44b556,
+    0x2c45336d,
+    0x2c45b3a9,
+    0x2c463426,
+    0x2c46b4b0,
+    0x2c4734c5,
+    0x2c47b4fe,
+    0x2c4833e8,
+    0x30320000,
+    0x30328015,
+    0x3033001f,
+    0x30338038,
+    0x30340057,
+    0x30348071,
+    0x30350078,
+    0x30358090,
+    0x303600a1,
+    0x303680b9,
+    0x303700c6,
+    0x303780d5,
+    0x303800f7,
+    0x30388104,
+    0x30390117,
+    0x30398132,
+    0x303a0147,
+    0x303a815b,
+    0x303b016f,
+    0x303b8180,
+    0x303c0199,
+    0x303c81b6,
+    0x303d01c4,
+    0x303d81d8,
+    0x303e01e8,
+    0x303e8201,
+    0x303f0211,
+    0x303f8224,
+    0x30400233,
+    0x3040823f,
+    0x30410254,
+    0x30418264,
+    0x3042027b,
+    0x30428288,
+    0x3043029b,
+    0x304382aa,
+    0x304402bf,
+    0x304482e0,
+    0x304502f3,
+    0x30458306,
+    0x3046031f,
+    0x3046833a,
+    0x30470372,
+    0x30478384,
+    0x304803a2,
+    0x304883b3,
+    0x304903c2,
+    0x304983da,
+    0x304a03ec,
+    0x304a8400,
+    0x304b0418,
+    0x304b842b,
+    0x304c0436,
+    0x304c8447,
+    0x304d0453,
+    0x304d8469,
+    0x304e0477,
+    0x304e848d,
+    0x304f049f,
+    0x304f84b1,
+    0x305004d4,
+    0x305084e7,
+    0x305104f8,
+    0x30518508,
+    0x30520520,
+    0x30528535,
+    0x3053054d,
+    0x30538561,
+    0x30540579,
+    0x30548592,
+    0x305505ab,
+    0x305585c8,
+    0x305605d3,
+    0x305685eb,
+    0x305705fb,
+    0x3057860c,
+    0x3058061f,
+    0x30588635,
+    0x3059063e,
+    0x30598653,
+    0x305a0666,
+    0x305a8675,
+    0x305b0695,
+    0x305b86a4,
+    0x305c06c5,
+    0x305c86e1,
+    0x305d06ed,
+    0x305d870d,
+    0x305e0729,
+    0x305e874d,
+    0x305f0763,
+    0x305f876d,
+    0x306004c4,
+    0x3060804a,
+    0x30610357,
+    0x3061873a,
+    0x30620392,
+    0x34320bb0,
+    0x34328bc4,
+    0x34330be1,
+    0x34338bf4,
+    0x34340c03,
+    0x34348c5d,
+    0x34350c41,
+    0x34358c20,
+    0x3c320090,
+    0x3c328da0,
+    0x3c330db9,
+    0x3c338dd4,
+    0x3c340df1,
+    0x3c348e1b,
+    0x3c350e36,
+    0x3c358e5c,
+    0x3c360e75,
+    0x3c368e8d,
+    0x3c370e9e,
+    0x3c378eac,
+    0x3c380eb9,
+    0x3c388ecd,
+    0x3c390ceb,
+    0x3c398ef0,
+    0x3c3a0f04,
+    0x3c3a895a,
+    0x3c3b0f14,
+    0x3c3b8f2f,
+    0x3c3c0f41,
+    0x3c3c8f74,
+    0x3c3d0f7e,
+    0x3c3d8f92,
+    0x3c3e0fa0,
+    0x3c3e8fc5,
+    0x3c3f0d8c,
+    0x3c3f8fae,
+    0x3c4000b9,
+    0x3c4080f7,
+    0x3c410e0c,
+    0x3c418e4b,
+    0x3c420f57,
+    0x3c428ee1,
+    0x40321a2f,
+    0x40329a45,
+    0x40331a73,
+    0x40339a7d,
+    0x40341a94,
+    0x40349ab2,
+    0x40351ac2,
+    0x40359ad4,
+    0x40361ae1,
+    0x40369aed,
+    0x40371b02,
+    0x40379b14,
+    0x40381b1f,
+    0x40389b31,
+    0x40390d4e,
+    0x40399b41,
+    0x403a1b54,
+    0x403a9b75,
+    0x403b1b86,
+    0x403b9b96,
+    0x403c0071,
+    0x403c8090,
+    0x403d1bf7,
+    0x403d9c0d,
+    0x403e1c1c,
+    0x403e9c54,
+    0x403f1c6e,
+    0x403f9c96,
+    0x40401cab,
+    0x40409cbf,
+    0x40411cfa,
+    0x40419d15,
+    0x40421d2e,
+    0x40429d41,
+    0x40431d55,
+    0x40439d83,
+    0x40441d9a,
+    0x404480b9,
+    0x40451daf,
+    0x40459dc1,
+    0x40461de5,
+    0x40469e05,
+    0x40471e13,
+    0x40479e3a,
+    0x40481eab,
+    0x40489f65,
+    0x40491f7c,
+    0x40499f96,
+    0x404a1fad,
+    0x404a9fcb,
+    0x404b1fe3,
+    0x404ba010,
+    0x404c2026,
+    0x404ca038,
+    0x404d2059,
+    0x404da092,
+    0x404e20a6,
+    0x404ea0b3,
+    0x404f2164,
+    0x404fa1da,
+    0x40502249,
+    0x4050a25d,
+    0x40512290,
+    0x405222a0,
+    0x4052a2c4,
+    0x405322dc,
+    0x4053a2ef,
+    0x40542304,
+    0x4054a327,
+    0x40552352,
+    0x4055a38f,
+    0x405623b4,
+    0x4056a3cd,
+    0x405723e5,
+    0x4057a3f8,
+    0x4058240d,
+    0x4058a434,
+    0x40592463,
+    0x4059a490,
+    0x405aa4a4,
+    0x405b24bc,
+    0x405ba4cd,
+    0x405c24e0,
+    0x405ca51f,
+    0x405d252c,
+    0x405da551,
+    0x405e258f,
+    0x405e8afe,
+    0x405f25b0,
+    0x405fa5bd,
+    0x406025cb,
+    0x4060a5ed,
+    0x4061264e,
+    0x4061a686,
+    0x4062269d,
+    0x4062a6ae,
+    0x406326fb,
+    0x4063a710,
+    0x40642727,
+    0x4064a753,
+    0x4065276e,
+    0x4065a785,
+    0x4066279d,
+    0x4066a7c7,
+    0x406727f2,
+    0x4067a837,
+    0x4068287f,
+    0x4068a8a0,
+    0x406928d2,
+    0x4069a900,
+    0x406a2921,
+    0x406aa941,
+    0x406b2ac9,
+    0x406baaec,
+    0x406c2b02,
+    0x406cae0c,
+    0x406d2e3b,
+    0x406dae63,
+    0x406e2e91,
+    0x406eaede,
+    0x406f2f37,
+    0x406faf6f,
+    0x40702f82,
+    0x4070af9f,
+    0x4071084d,
+    0x4071afb1,
+    0x40722fc4,
+    0x4072affa,
+    0x40733012,
+    0x4073959c,
+    0x40743026,
+    0x4074b040,
+    0x40753051,
+    0x4075b065,
+    0x40763073,
+    0x40769348,
+    0x40773098,
+    0x4077b0d8,
+    0x407830f3,
+    0x4078b12c,
+    0x40793143,
+    0x4079b159,
+    0x407a3185,
+    0x407ab198,
+    0x407b31ad,
+    0x407bb1bf,
+    0x407c31f0,
+    0x407cb1f9,
+    0x407d28bb,
+    0x407da202,
+    0x407e3108,
+    0x407ea444,
+    0x407f1e27,
+    0x407f9ffa,
+    0x40802174,
+    0x40809e4f,
+    0x408122b2,
+    0x4081a101,
+    0x40822e7c,
+    0x40829ba2,
+    0x4083241f,
+    0x4083a738,
+    0x40841e63,
+    0x4084a47c,
+    0x408524f1,
+    0x4085a615,
+    0x40862571,
+    0x4086a21c,
+    0x40872ec2,
+    0x4087a663,
+    0x40881be0,
+    0x4088a84a,
+    0x40891c2f,
+    0x40899bbc,
+    0x408a2b3a,
+    0x408a99b4,
+    0x408b31d4,
+    0x408baf4c,
+    0x408c2501,
+    0x408c99ec,
+    0x408d1f4b,
+    0x408d9e95,
+    0x408e207b,
+    0x408ea36f,
+    0x408f285e,
+    0x408fa631,
+    0x40902813,
+    0x4090a543,
+    0x40912b22,
+    0x40919a12,
+    0x40921c7c,
+    0x4092aefd,
+    0x40932fdd,
+    0x4093a22d,
+    0x40941e77,
+    0x4094ab53,
+    0x409526bf,
+    0x4095b165,
+    0x40962ea9,
+    0x4096a18d,
+    0x40972278,
+    0x4097a0ca,
+    0x40981cdc,
+    0x4098a6d3,
+    0x40992f19,
+    0x4099a39c,
+    0x409a2335,
+    0x409a99d0,
+    0x409b1ed1,
+    0x409b9efc,
+    0x409c30ba,
+    0x409c9f24,
+    0x409d2149,
+    0x409da117,
+    0x409e1d6d,
+    0x409ea1c2,
+    0x409f21aa,
+    0x409f9ec4,
+    0x40a021ea,
+    0x40a0a0e4,
+    0x40a12132,
+    0x41f429f4,
+    0x41f92a86,
+    0x41fe2979,
+    0x41feac2f,
+    0x41ff2d5d,
+    0x42032a0d,
+    0x42082a2f,
+    0x4208aa6b,
+    0x4209295d,
+    0x4209aaa5,
+    0x420a29b4,
+    0x420aa994,
+    0x420b29d4,
+    0x420baa4d,
+    0x420c2d79,
+    0x420cab63,
+    0x420d2c16,
+    0x420dac4d,
+    0x42122c80,
+    0x42172d40,
+    0x4217acc2,
+    0x421c2ce4,
+    0x421f2c9f,
+    0x42212df1,
+    0x42262d23,
+    0x422b2dcf,
+    0x422babf1,
+    0x422c2db1,
+    0x422caba4,
+    0x422d2b7d,
+    0x422dad90,
+    0x422e2bd0,
+    0x42302cff,
+    0x4230ac67,
+    0x44320778,
+    0x44328787,
+    0x44330793,
+    0x443387a1,
+    0x443407b4,
+    0x443487c5,
+    0x443507cc,
+    0x443587d6,
+    0x443607e9,
+    0x443687ff,
+    0x44370811,
+    0x4437881e,
+    0x4438082d,
+    0x44388835,
+    0x4439084d,
+    0x4439885b,
+    0x443a086e,
+    0x48321372,
+    0x48329384,
+    0x4833139a,
+    0x483393b3,
+    0x4c3213f0,
+    0x4c329400,
+    0x4c331413,
+    0x4c339433,
+    0x4c3400b9,
+    0x4c3480f7,
+    0x4c35143f,
+    0x4c35944d,
+    0x4c361469,
+    0x4c36948f,
+    0x4c37149e,
+    0x4c3794ac,
+    0x4c3814c1,
+    0x4c3894cd,
+    0x4c3914ed,
+    0x4c399517,
+    0x4c3a1530,
+    0x4c3a9549,
+    0x4c3b0635,
+    0x4c3b9562,
+    0x4c3c1574,
+    0x4c3c9583,
+    0x4c3d159c,
+    0x4c3d8cc6,
+    0x4c3e1609,
+    0x4c3e95ab,
+    0x4c3f162b,
+    0x4c3f9348,
+    0x4c4015c1,
+    0x4c4093dc,
+    0x4c4115f9,
+    0x4c41947c,
+    0x4c4215e5,
+    0x4c4293c4,
+    0x503235ba,
+    0x5032b5c9,
+    0x503335d4,
+    0x5033b5e4,
+    0x503435fd,
+    0x5034b617,
+    0x50353625,
+    0x5035b63b,
+    0x5036364d,
+    0x5036b663,
+    0x5037367c,
+    0x5037b68f,
+    0x503836a7,
+    0x5038b6b8,
+    0x503936cd,
+    0x5039b6e1,
+    0x503a3701,
+    0x503ab717,
+    0x503b372f,
+    0x503bb741,
+    0x503c375d,
+    0x503cb774,
+    0x503d378d,
+    0x503db7a3,
+    0x503e37b0,
+    0x503eb7c6,
+    0x503f37d8,
+    0x503f83b3,
+    0x504037eb,
+    0x5040b7fb,
+    0x50413815,
+    0x5041b824,
+    0x5042383e,
+    0x5042b85b,
+    0x5043386b,
+    0x5043b87b,
+    0x50443898,
+    0x50448469,
+    0x504538ac,
+    0x5045b8ca,
+    0x504638dd,
+    0x5046b8f3,
+    0x50473905,
+    0x5047b91a,
+    0x50483940,
+    0x5048b94e,
+    0x50493961,
+    0x5049b976,
+    0x504a398c,
+    0x504ab99c,
+    0x504b39bc,
+    0x504bb9cf,
+    0x504c39f2,
+    0x504cba20,
+    0x504d3a4d,
+    0x504dba6a,
+    0x504e3a85,
+    0x504ebaa1,
+    0x504f3ab3,
+    0x504fbaca,
+    0x50503ad9,
+    0x50508729,
+    0x50513aec,
+    0x5051b88a,
+    0x50523a32,
+    0x58320fd1,
+    0x68320d4e,
+    0x68328ceb,
+    0x68330cfe,
+    0x68338d5c,
+    0x68340d6c,
+    0x683480f7,
+    0x6835099a,
+    0x6c320d14,
+    0x6c328cb5,
+    0x6c330d1f,
+    0x6c338d38,
+    0x74320a66,
+    0x743280b9,
+    0x74330cc6,
+    0x783209cb,
+    0x783289e0,
+    0x783309ec,
+    0x78338090,
+    0x783409fb,
+    0x78348a10,
+    0x78350a2f,
+    0x78358a51,
+    0x78360a66,
+    0x78368a7c,
+    0x78370a8c,
+    0x78378aad,
+    0x78380ac0,
+    0x78388ad2,
+    0x78390adf,
+    0x78398afe,
+    0x783a0b13,
+    0x783a8b21,
+    0x783b0b2b,
+    0x783b8b3f,
+    0x783c0b56,
+    0x783c8b6b,
+    0x783d0b82,
+    0x783d8b97,
+    0x783e0aed,
+    0x783e8a9f,
+    0x7c321261,
+    0x8032148f,
+    0x80328090,
+    0x803332b6,
+    0x803380b9,
+    0x803432c5,
+    0x8034b22d,
+    0x8035324b,
+    0x8035b2d9,
+    0x8036328d,
+    0x8036b23c,
+    0x8037327f,
+    0x8037b21a,
+    0x803832a0,
+    0x8038b25c,
+    0x80393271,
+};
+
+const size_t kOpenSSLReasonValuesLen = sizeof(kOpenSSLReasonValues) / sizeof(kOpenSSLReasonValues[0]);
+
+const char kOpenSSLReasonStringData[] =
+    "ASN1_LENGTH_MISMATCH\0"
+    "AUX_ERROR\0"
+    "BAD_GET_ASN1_OBJECT_CALL\0"
+    "BAD_OBJECT_HEADER\0"
+    "BAD_TEMPLATE\0"
+    "BMPSTRING_IS_WRONG_LENGTH\0"
+    "BN_LIB\0"
+    "BOOLEAN_IS_WRONG_LENGTH\0"
+    "BUFFER_TOO_SMALL\0"
+    "CONTEXT_NOT_INITIALISED\0"
+    "DECODE_ERROR\0"
+    "DEPTH_EXCEEDED\0"
+    "DIGEST_AND_KEY_TYPE_NOT_SUPPORTED\0"
+    "ENCODE_ERROR\0"
+    "ERROR_GETTING_TIME\0"
+    "EXPECTING_AN_ASN1_SEQUENCE\0"
+    "EXPECTING_AN_INTEGER\0"
+    "EXPECTING_AN_OBJECT\0"
+    "EXPECTING_A_BOOLEAN\0"
+    "EXPECTING_A_TIME\0"
+    "EXPLICIT_LENGTH_MISMATCH\0"
+    "EXPLICIT_TAG_NOT_CONSTRUCTED\0"
+    "FIELD_MISSING\0"
+    "FIRST_NUM_TOO_LARGE\0"
+    "HEADER_TOO_LONG\0"
+    "ILLEGAL_BITSTRING_FORMAT\0"
+    "ILLEGAL_BOOLEAN\0"
+    "ILLEGAL_CHARACTERS\0"
+    "ILLEGAL_FORMAT\0"
+    "ILLEGAL_HEX\0"
+    "ILLEGAL_IMPLICIT_TAG\0"
+    "ILLEGAL_INTEGER\0"
+    "ILLEGAL_NESTED_TAGGING\0"
+    "ILLEGAL_NULL\0"
+    "ILLEGAL_NULL_VALUE\0"
+    "ILLEGAL_OBJECT\0"
+    "ILLEGAL_OPTIONAL_ANY\0"
+    "ILLEGAL_OPTIONS_ON_ITEM_TEMPLATE\0"
+    "ILLEGAL_TAGGED_ANY\0"
+    "ILLEGAL_TIME_VALUE\0"
+    "INTEGER_NOT_ASCII_FORMAT\0"
+    "INTEGER_TOO_LARGE_FOR_LONG\0"
+    "INVALID_BIT_STRING_BITS_LEFT\0"
+    "INVALID_BIT_STRING_PADDING\0"
+    "INVALID_BMPSTRING\0"
+    "INVALID_DIGIT\0"
+    "INVALID_INTEGER\0"
+    "INVALID_MODIFIER\0"
+    "INVALID_NUMBER\0"
+    "INVALID_OBJECT_ENCODING\0"
+    "INVALID_SEPARATOR\0"
+    "INVALID_TIME_FORMAT\0"
+    "INVALID_UNIVERSALSTRING\0"
+    "INVALID_UTF8STRING\0"
+    "LIST_ERROR\0"
+    "MISSING_ASN1_EOS\0"
+    "MISSING_EOC\0"
+    "MISSING_SECOND_NUMBER\0"
+    "MISSING_VALUE\0"
+    "MSTRING_NOT_UNIVERSAL\0"
+    "MSTRING_WRONG_TAG\0"
+    "NESTED_ASN1_ERROR\0"
+    "NESTED_ASN1_STRING\0"
+    "NESTED_TOO_DEEP\0"
+    "NON_HEX_CHARACTERS\0"
+    "NOT_ASCII_FORMAT\0"
+    "NOT_ENOUGH_DATA\0"
+    "NO_MATCHING_CHOICE_TYPE\0"
+    "NULL_IS_WRONG_LENGTH\0"
+    "OBJECT_NOT_ASCII_FORMAT\0"
+    "ODD_NUMBER_OF_CHARS\0"
+    "SECOND_NUMBER_TOO_LARGE\0"
+    "SEQUENCE_LENGTH_MISMATCH\0"
+    "SEQUENCE_NOT_CONSTRUCTED\0"
+    "SEQUENCE_OR_SET_NEEDS_CONFIG\0"
+    "SHORT_LINE\0"
+    "STREAMING_NOT_SUPPORTED\0"
+    "STRING_TOO_LONG\0"
+    "STRING_TOO_SHORT\0"
+    "TAG_VALUE_TOO_HIGH\0"
+    "TIME_NOT_ASCII_FORMAT\0"
+    "TOO_LONG\0"
+    "TYPE_NOT_CONSTRUCTED\0"
+    "TYPE_NOT_PRIMITIVE\0"
+    "UNEXPECTED_EOC\0"
+    "UNIVERSALSTRING_IS_WRONG_LENGTH\0"
+    "UNKNOWN_FORMAT\0"
+    "UNKNOWN_MESSAGE_DIGEST_ALGORITHM\0"
+    "UNKNOWN_SIGNATURE_ALGORITHM\0"
+    "UNKNOWN_TAG\0"
+    "UNSUPPORTED_ANY_DEFINED_BY_TYPE\0"
+    "UNSUPPORTED_PUBLIC_KEY_TYPE\0"
+    "UNSUPPORTED_TYPE\0"
+    "WRONG_INTEGER_TYPE\0"
+    "WRONG_PUBLIC_KEY_TYPE\0"
+    "WRONG_TAG\0"
+    "WRONG_TYPE\0"
+    "BAD_FOPEN_MODE\0"
+    "BROKEN_PIPE\0"
+    "CONNECT_ERROR\0"
+    "ERROR_SETTING_NBIO\0"
+    "INVALID_ARGUMENT\0"
+    "IN_USE\0"
+    "KEEPALIVE\0"
+    "NBIO_CONNECT_ERROR\0"
+    "NO_HOSTNAME_SPECIFIED\0"
+    "NO_PORT_SPECIFIED\0"
+    "NO_SUCH_FILE\0"
+    "NULL_PARAMETER\0"
+    "SYS_LIB\0"
+    "UNABLE_TO_CREATE_SOCKET\0"
+    "UNINITIALIZED\0"
+    "UNSUPPORTED_METHOD\0"
+    "WRITE_TO_READ_ONLY_BIO\0"
+    "ARG2_LT_ARG3\0"
+    "BAD_ENCODING\0"
+    "BAD_RECIPROCAL\0"
+    "BIGNUM_TOO_LONG\0"
+    "BITS_TOO_SMALL\0"
+    "CALLED_WITH_EVEN_MODULUS\0"
+    "DIV_BY_ZERO\0"
+    "EXPAND_ON_STATIC_BIGNUM_DATA\0"
+    "INPUT_NOT_REDUCED\0"
+    "INVALID_INPUT\0"
+    "INVALID_RANGE\0"
+    "NEGATIVE_NUMBER\0"
+    "NOT_A_SQUARE\0"
+    "NOT_INITIALIZED\0"
+    "NO_INVERSE\0"
+    "PRIVATE_KEY_TOO_LARGE\0"
+    "P_IS_NOT_PRIME\0"
+    "TOO_MANY_ITERATIONS\0"
+    "TOO_MANY_TEMPORARY_VARIABLES\0"
+    "AES_KEY_SETUP_FAILED\0"
+    "BAD_DECRYPT\0"
+    "BAD_KEY_LENGTH\0"
+    "CTRL_NOT_IMPLEMENTED\0"
+    "CTRL_OPERATION_NOT_IMPLEMENTED\0"
+    "DATA_NOT_MULTIPLE_OF_BLOCK_LENGTH\0"
+    "INITIALIZATION_ERROR\0"
+    "INPUT_NOT_INITIALIZED\0"
+    "INVALID_AD_SIZE\0"
+    "INVALID_KEY_LENGTH\0"
+    "INVALID_NONCE\0"
+    "INVALID_NONCE_SIZE\0"
+    "INVALID_OPERATION\0"
+    "IV_TOO_LARGE\0"
+    "NO_CIPHER_SET\0"
+    "NO_DIRECTION_SET\0"
+    "OUTPUT_ALIASES_INPUT\0"
+    "TAG_TOO_LARGE\0"
+    "TOO_LARGE\0"
+    "UNSUPPORTED_AD_SIZE\0"
+    "UNSUPPORTED_INPUT_SIZE\0"
+    "UNSUPPORTED_KEY_SIZE\0"
+    "UNSUPPORTED_NONCE_SIZE\0"
+    "UNSUPPORTED_TAG_SIZE\0"
+    "WRONG_FINAL_BLOCK_LENGTH\0"
+    "LIST_CANNOT_BE_NULL\0"
+    "MISSING_CLOSE_SQUARE_BRACKET\0"
+    "MISSING_EQUAL_SIGN\0"
+    "NO_CLOSE_BRACE\0"
+    "UNABLE_TO_CREATE_NEW_SECTION\0"
+    "VARIABLE_EXPANSION_NOT_SUPPORTED\0"
+    "VARIABLE_EXPANSION_TOO_LONG\0"
+    "VARIABLE_HAS_NO_VALUE\0"
+    "BAD_GENERATOR\0"
+    "INVALID_PARAMETERS\0"
+    "INVALID_PUBKEY\0"
+    "MODULUS_TOO_LARGE\0"
+    "NO_PRIVATE_VALUE\0"
+    "UNKNOWN_HASH\0"
+    "BAD_Q_VALUE\0"
+    "BAD_VERSION\0"
+    "MISSING_PARAMETERS\0"
+    "NEED_NEW_SETUP_VALUES\0"
+    "KDF_FAILED\0"
+    "POINT_ARITHMETIC_FAILURE\0"
+    "UNKNOWN_DIGEST_LENGTH\0"
+    "BAD_SIGNATURE\0"
+    "NOT_IMPLEMENTED\0"
+    "RANDOM_NUMBER_GENERATION_FAILED\0"
+    "BIGNUM_OUT_OF_RANGE\0"
+    "COORDINATES_OUT_OF_RANGE\0"
+    "D2I_ECPKPARAMETERS_FAILURE\0"
+    "EC_GROUP_NEW_BY_NAME_FAILURE\0"
+    "GROUP2PKPARAMETERS_FAILURE\0"
+    "GROUP_MISMATCH\0"
+    "I2D_ECPKPARAMETERS_FAILURE\0"
+    "INCOMPATIBLE_OBJECTS\0"
+    "INVALID_COFACTOR\0"
+    "INVALID_COMPRESSED_POINT\0"
+    "INVALID_COMPRESSION_BIT\0"
+    "INVALID_ENCODING\0"
+    "INVALID_FIELD\0"
+    "INVALID_FORM\0"
+    "INVALID_GROUP_ORDER\0"
+    "INVALID_PRIVATE_KEY\0"
+    "INVALID_SCALAR\0"
+    "MISSING_PRIVATE_KEY\0"
+    "NON_NAMED_CURVE\0"
+    "PKPARAMETERS2GROUP_FAILURE\0"
+    "POINT_AT_INFINITY\0"
+    "POINT_IS_NOT_ON_CURVE\0"
+    "PUBLIC_KEY_VALIDATION_FAILED\0"
+    "SLOT_FULL\0"
+    "UNDEFINED_GENERATOR\0"
+    "UNKNOWN_GROUP\0"
+    "UNKNOWN_ORDER\0"
+    "WRONG_CURVE_PARAMETERS\0"
+    "WRONG_ORDER\0"
+    "OPERATION_NOT_SUPPORTED\0"
+    "COMMAND_NOT_SUPPORTED\0"
+    "DIFFERENT_KEY_TYPES\0"
+    "DIFFERENT_PARAMETERS\0"
+    "EMPTY_PSK\0"
+    "EXPECTING_AN_EC_KEY_KEY\0"
+    "EXPECTING_AN_RSA_KEY\0"
+    "EXPECTING_A_DSA_KEY\0"
+    "ILLEGAL_OR_UNSUPPORTED_PADDING_MODE\0"
+    "INVALID_BUFFER_SIZE\0"
+    "INVALID_DIGEST_LENGTH\0"
+    "INVALID_DIGEST_TYPE\0"
+    "INVALID_KEYBITS\0"
+    "INVALID_MGF1_MD\0"
+    "INVALID_PADDING_MODE\0"
+    "INVALID_PEER_KEY\0"
+    "INVALID_PSS_SALTLEN\0"
+    "INVALID_SIGNATURE\0"
+    "KEYS_NOT_SET\0"
+    "MEMORY_LIMIT_EXCEEDED\0"
+    "NOT_A_PRIVATE_KEY\0"
+    "NOT_XOF_OR_INVALID_LENGTH\0"
+    "NO_DEFAULT_DIGEST\0"
+    "NO_KEY_SET\0"
+    "NO_MDC2_SUPPORT\0"
+    "NO_NID_FOR_CURVE\0"
+    "NO_OPERATION_SET\0"
+    "NO_PARAMETERS_SET\0"
+    "OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE\0"
+    "OPERATON_NOT_INITIALIZED\0"
+    "UNKNOWN_PUBLIC_KEY_TYPE\0"
+    "UNSUPPORTED_ALGORITHM\0"
+    "OUTPUT_TOO_LARGE\0"
+    "INVALID_OID_STRING\0"
+    "UNKNOWN_NID\0"
+    "BAD_BASE64_DECODE\0"
+    "BAD_END_LINE\0"
+    "BAD_IV_CHARS\0"
+    "BAD_PASSWORD_READ\0"
+    "CIPHER_IS_NULL\0"
+    "ERROR_CONVERTING_PRIVATE_KEY\0"
+    "NOT_DEK_INFO\0"
+    "NOT_ENCRYPTED\0"
+    "NOT_PROC_TYPE\0"
+    "NO_START_LINE\0"
+    "READ_KEY\0"
+    "SHORT_HEADER\0"
+    "UNSUPPORTED_CIPHER\0"
+    "UNSUPPORTED_ENCRYPTION\0"
+    "BAD_PKCS7_VERSION\0"
+    "NOT_PKCS7_SIGNED_DATA\0"
+    "NO_CERTIFICATES_INCLUDED\0"
+    "NO_CRLS_INCLUDED\0"
+    "AMBIGUOUS_FRIENDLY_NAME\0"
+    "BAD_ITERATION_COUNT\0"
+    "BAD_PKCS12_DATA\0"
+    "BAD_PKCS12_VERSION\0"
+    "CIPHER_HAS_NO_OBJECT_IDENTIFIER\0"
+    "CRYPT_ERROR\0"
+    "ENCRYPT_ERROR\0"
+    "ERROR_SETTING_CIPHER_PARAMS\0"
+    "INCORRECT_PASSWORD\0"
+    "INVALID_CHARACTERS\0"
+    "KEYGEN_FAILURE\0"
+    "KEY_GEN_ERROR\0"
+    "METHOD_NOT_SUPPORTED\0"
+    "MISSING_MAC\0"
+    "MULTIPLE_PRIVATE_KEYS_IN_PKCS12\0"
+    "PKCS12_PUBLIC_KEY_INTEGRITY_NOT_SUPPORTED\0"
+    "PKCS12_TOO_DEEPLY_NESTED\0"
+    "PRIVATE_KEY_DECODE_ERROR\0"
+    "PRIVATE_KEY_ENCODE_ERROR\0"
+    "UNKNOWN_ALGORITHM\0"
+    "UNKNOWN_CIPHER\0"
+    "UNKNOWN_CIPHER_ALGORITHM\0"
+    "UNKNOWN_DIGEST\0"
+    "UNSUPPORTED_KEYLENGTH\0"
+    "UNSUPPORTED_KEY_DERIVATION_FUNCTION\0"
+    "UNSUPPORTED_OPTIONS\0"
+    "UNSUPPORTED_PRF\0"
+    "UNSUPPORTED_PRIVATE_KEY_ALGORITHM\0"
+    "UNSUPPORTED_SALT_TYPE\0"
+    "BAD_E_VALUE\0"
+    "BAD_FIXED_HEADER_DECRYPT\0"
+    "BAD_PAD_BYTE_COUNT\0"
+    "BAD_RSA_PARAMETERS\0"
+    "BLOCK_TYPE_IS_NOT_01\0"
+    "BLOCK_TYPE_IS_NOT_02\0"
+    "BN_NOT_INITIALIZED\0"
+    "CANNOT_RECOVER_MULTI_PRIME_KEY\0"
+    "CRT_PARAMS_ALREADY_GIVEN\0"
+    "CRT_VALUES_INCORRECT\0"
+    "DATA_LEN_NOT_EQUAL_TO_MOD_LEN\0"
+    "DATA_TOO_LARGE\0"
+    "DATA_TOO_LARGE_FOR_KEY_SIZE\0"
+    "DATA_TOO_LARGE_FOR_MODULUS\0"
+    "DATA_TOO_SMALL\0"
+    "DATA_TOO_SMALL_FOR_KEY_SIZE\0"
+    "DIGEST_TOO_BIG_FOR_RSA_KEY\0"
+    "D_E_NOT_CONGRUENT_TO_1\0"
+    "D_OUT_OF_RANGE\0"
+    "EMPTY_PUBLIC_KEY\0"
+    "FIRST_OCTET_INVALID\0"
+    "INCONSISTENT_SET_OF_CRT_VALUES\0"
+    "INTERNAL_ERROR\0"
+    "INVALID_MESSAGE_LENGTH\0"
+    "KEY_SIZE_TOO_SMALL\0"
+    "LAST_OCTET_INVALID\0"
+    "MUST_HAVE_AT_LEAST_TWO_PRIMES\0"
+    "NO_PUBLIC_EXPONENT\0"
+    "NULL_BEFORE_BLOCK_MISSING\0"
+    "N_NOT_EQUAL_P_Q\0"
+    "OAEP_DECODING_ERROR\0"
+    "ONLY_ONE_OF_P_Q_GIVEN\0"
+    "OUTPUT_BUFFER_TOO_SMALL\0"
+    "PADDING_CHECK_FAILED\0"
+    "PKCS_DECODING_ERROR\0"
+    "SLEN_CHECK_FAILED\0"
+    "SLEN_RECOVERY_FAILED\0"
+    "UNKNOWN_ALGORITHM_TYPE\0"
+    "UNKNOWN_PADDING_TYPE\0"
+    "VALUE_MISSING\0"
+    "WRONG_SIGNATURE_LENGTH\0"
+    "ALPN_MISMATCH_ON_EARLY_DATA\0"
+    "ALPS_MISMATCH_ON_EARLY_DATA\0"
+    "APPLICATION_DATA_INSTEAD_OF_HANDSHAKE\0"
+    "APPLICATION_DATA_ON_SHUTDOWN\0"
+    "APP_DATA_IN_HANDSHAKE\0"
+    "ATTEMPT_TO_REUSE_SESSION_IN_DIFFERENT_CONTEXT\0"
+    "BAD_ALERT\0"
+    "BAD_CHANGE_CIPHER_SPEC\0"
+    "BAD_DATA_RETURNED_BY_CALLBACK\0"
+    "BAD_DH_P_LENGTH\0"
+    "BAD_DIGEST_LENGTH\0"
+    "BAD_ECC_CERT\0"
+    "BAD_ECPOINT\0"
+    "BAD_HANDSHAKE_RECORD\0"
+    "BAD_HELLO_REQUEST\0"
+    "BAD_LENGTH\0"
+    "BAD_PACKET_LENGTH\0"
+    "BAD_RSA_ENCRYPT\0"
+    "BAD_SRTP_MKI_VALUE\0"
+    "BAD_SRTP_PROTECTION_PROFILE_LIST\0"
+    "BAD_SSL_FILETYPE\0"
+    "BAD_WRITE_RETRY\0"
+    "BIO_NOT_SET\0"
+    "BLOCK_CIPHER_PAD_IS_WRONG\0"
+    "CANNOT_HAVE_BOTH_PRIVKEY_AND_METHOD\0"
+    "CANNOT_PARSE_LEAF_CERT\0"
+    "CA_DN_LENGTH_MISMATCH\0"
+    "CA_DN_TOO_LONG\0"
+    "CCS_RECEIVED_EARLY\0"
+    "CERTIFICATE_AND_PRIVATE_KEY_MISMATCH\0"
+    "CERTIFICATE_VERIFY_FAILED\0"
+    "CERT_CB_ERROR\0"
+    "CERT_DECOMPRESSION_FAILED\0"
+    "CERT_LENGTH_MISMATCH\0"
+    "CHANNEL_ID_NOT_P256\0"
+    "CHANNEL_ID_SIGNATURE_INVALID\0"
+    "CIPHER_MISMATCH_ON_EARLY_DATA\0"
+    "CIPHER_OR_HASH_UNAVAILABLE\0"
+    "CLIENTHELLO_PARSE_FAILED\0"
+    "CLIENTHELLO_TLSEXT\0"
+    "CONNECTION_REJECTED\0"
+    "CONNECTION_TYPE_NOT_SET\0"
+    "COULD_NOT_PARSE_HINTS\0"
+    "CUSTOM_EXTENSION_ERROR\0"
+    "DATA_LENGTH_TOO_LONG\0"
+    "DECRYPTION_FAILED\0"
+    "DECRYPTION_FAILED_OR_BAD_RECORD_MAC\0"
+    "DH_PUBLIC_VALUE_LENGTH_IS_WRONG\0"
+    "DH_P_TOO_LONG\0"
+    "DIGEST_CHECK_FAILED\0"
+    "DOWNGRADE_DETECTED\0"
+    "DTLS_MESSAGE_TOO_BIG\0"
+    "DUPLICATE_EXTENSION\0"
+    "DUPLICATE_KEY_SHARE\0"
+    "DUPLICATE_SIGNATURE_ALGORITHM\0"
+    "EARLY_DATA_NOT_IN_USE\0"
+    "ECC_CERT_NOT_FOR_SIGNING\0"
+    "ECH_REJECTED\0"
+    "ECH_SERVER_CONFIG_AND_PRIVATE_KEY_MISMATCH\0"
+    "ECH_SERVER_CONFIG_UNSUPPORTED_EXTENSION\0"
+    "ECH_SERVER_WOULD_HAVE_NO_RETRY_CONFIGS\0"
+    "EMPTY_HELLO_RETRY_REQUEST\0"
+    "EMS_STATE_INCONSISTENT\0"
+    "ENCRYPTED_LENGTH_TOO_LONG\0"
+    "ERROR_ADDING_EXTENSION\0"
+    "ERROR_IN_RECEIVED_CIPHER_LIST\0"
+    "ERROR_PARSING_EXTENSION\0"
+    "EXCESSIVE_MESSAGE_SIZE\0"
+    "EXCESS_HANDSHAKE_DATA\0"
+    "EXTRA_DATA_IN_MESSAGE\0"
+    "FRAGMENT_MISMATCH\0"
+    "GOT_NEXT_PROTO_WITHOUT_EXTENSION\0"
+    "HANDSHAKE_FAILURE_ON_CLIENT_HELLO\0"
+    "HANDSHAKE_NOT_COMPLETE\0"
+    "HTTPS_PROXY_REQUEST\0"
+    "HTTP_REQUEST\0"
+    "INAPPROPRIATE_FALLBACK\0"
+    "INCONSISTENT_CLIENT_HELLO\0"
+    "INCONSISTENT_ECH_NEGOTIATION\0"
+    "INVALID_ALPN_PROTOCOL\0"
+    "INVALID_ALPN_PROTOCOL_LIST\0"
+    "INVALID_ALPS_CODEPOINT\0"
+    "INVALID_CLIENT_HELLO_INNER\0"
+    "INVALID_COMMAND\0"
+    "INVALID_COMPRESSION_LIST\0"
+    "INVALID_DELEGATED_CREDENTIAL\0"
+    "INVALID_ECH_CONFIG_LIST\0"
+    "INVALID_ECH_PUBLIC_NAME\0"
+    "INVALID_MESSAGE\0"
+    "INVALID_OUTER_EXTENSION\0"
+    "INVALID_OUTER_RECORD_TYPE\0"
+    "INVALID_SCT_LIST\0"
+    "INVALID_SIGNATURE_ALGORITHM\0"
+    "INVALID_SSL_SESSION\0"
+    "INVALID_TICKET_KEYS_LENGTH\0"
+    "KEY_USAGE_BIT_INCORRECT\0"
+    "LENGTH_MISMATCH\0"
+    "MISSING_EXTENSION\0"
+    "MISSING_KEY_SHARE\0"
+    "MISSING_RSA_CERTIFICATE\0"
+    "MISSING_TMP_DH_KEY\0"
+    "MISSING_TMP_ECDH_KEY\0"
+    "MIXED_SPECIAL_OPERATOR_WITH_GROUPS\0"
+    "MTU_TOO_SMALL\0"
+    "NEGOTIATED_ALPS_WITHOUT_ALPN\0"
+    "NEGOTIATED_BOTH_NPN_AND_ALPN\0"
+    "NEGOTIATED_TB_WITHOUT_EMS_OR_RI\0"
+    "NESTED_GROUP\0"
+    "NO_APPLICATION_PROTOCOL\0"
+    "NO_CERTIFICATES_RETURNED\0"
+    "NO_CERTIFICATE_ASSIGNED\0"
+    "NO_CERTIFICATE_SET\0"
+    "NO_CIPHERS_AVAILABLE\0"
+    "NO_CIPHERS_PASSED\0"
+    "NO_CIPHERS_SPECIFIED\0"
+    "NO_CIPHER_MATCH\0"
+    "NO_COMMON_SIGNATURE_ALGORITHMS\0"
+    "NO_COMPRESSION_SPECIFIED\0"
+    "NO_GROUPS_SPECIFIED\0"
+    "NO_METHOD_SPECIFIED\0"
+    "NO_PRIVATE_KEY_ASSIGNED\0"
+    "NO_RENEGOTIATION\0"
+    "NO_REQUIRED_DIGEST\0"
+    "NO_SHARED_CIPHER\0"
+    "NO_SHARED_GROUP\0"
+    "NO_SUPPORTED_VERSIONS_ENABLED\0"
+    "NULL_SSL_CTX\0"
+    "NULL_SSL_METHOD_PASSED\0"
+    "OCSP_CB_ERROR\0"
+    "OLD_SESSION_CIPHER_NOT_RETURNED\0"
+    "OLD_SESSION_PRF_HASH_MISMATCH\0"
+    "OLD_SESSION_VERSION_NOT_RETURNED\0"
+    "PARSE_TLSEXT\0"
+    "PATH_TOO_LONG\0"
+    "PEER_DID_NOT_RETURN_A_CERTIFICATE\0"
+    "PEER_ERROR_UNSUPPORTED_CERTIFICATE_TYPE\0"
+    "PRE_SHARED_KEY_MUST_BE_LAST\0"
+    "PRIVATE_KEY_OPERATION_FAILED\0"
+    "PROTOCOL_IS_SHUTDOWN\0"
+    "PSK_IDENTITY_BINDER_COUNT_MISMATCH\0"
+    "PSK_IDENTITY_NOT_FOUND\0"
+    "PSK_NO_CLIENT_CB\0"
+    "PSK_NO_SERVER_CB\0"
+    "QUIC_INTERNAL_ERROR\0"
+    "QUIC_TRANSPORT_PARAMETERS_MISCONFIGURED\0"
+    "READ_TIMEOUT_EXPIRED\0"
+    "RECORD_LENGTH_MISMATCH\0"
+    "RECORD_TOO_LARGE\0"
+    "RENEGOTIATION_EMS_MISMATCH\0"
+    "RENEGOTIATION_ENCODING_ERR\0"
+    "RENEGOTIATION_MISMATCH\0"
+    "REQUIRED_CIPHER_MISSING\0"
+    "RESUMED_EMS_SESSION_WITHOUT_EMS_EXTENSION\0"
+    "RESUMED_NON_EMS_SESSION_WITH_EMS_EXTENSION\0"
+    "SCSV_RECEIVED_WHEN_RENEGOTIATING\0"
+    "SECOND_SERVERHELLO_VERSION_MISMATCH\0"
+    "SERVERHELLO_TLSEXT\0"
+    "SERVER_CERT_CHANGED\0"
+    "SERVER_ECHOED_INVALID_SESSION_ID\0"
+    "SESSION_ID_CONTEXT_UNINITIALIZED\0"
+    "SESSION_MAY_NOT_BE_CREATED\0"
+    "SHUTDOWN_WHILE_IN_INIT\0"
+    "SIGNATURE_ALGORITHMS_EXTENSION_SENT_BY_SERVER\0"
+    "SRTP_COULD_NOT_ALLOCATE_PROFILES\0"
+    "SRTP_UNKNOWN_PROTECTION_PROFILE\0"
+    "SSL3_EXT_INVALID_SERVERNAME\0"
+    "SSLV3_ALERT_BAD_CERTIFICATE\0"
+    "SSLV3_ALERT_BAD_RECORD_MAC\0"
+    "SSLV3_ALERT_CERTIFICATE_EXPIRED\0"
+    "SSLV3_ALERT_CERTIFICATE_REVOKED\0"
+    "SSLV3_ALERT_CERTIFICATE_UNKNOWN\0"
+    "SSLV3_ALERT_CLOSE_NOTIFY\0"
+    "SSLV3_ALERT_DECOMPRESSION_FAILURE\0"
+    "SSLV3_ALERT_HANDSHAKE_FAILURE\0"
+    "SSLV3_ALERT_ILLEGAL_PARAMETER\0"
+    "SSLV3_ALERT_NO_CERTIFICATE\0"
+    "SSLV3_ALERT_UNEXPECTED_MESSAGE\0"
+    "SSLV3_ALERT_UNSUPPORTED_CERTIFICATE\0"
+    "SSL_CTX_HAS_NO_DEFAULT_SSL_VERSION\0"
+    "SSL_HANDSHAKE_FAILURE\0"
+    "SSL_SESSION_ID_CONTEXT_TOO_LONG\0"
+    "SSL_SESSION_ID_TOO_LONG\0"
+    "TICKET_ENCRYPTION_FAILED\0"
+    "TLS13_DOWNGRADE\0"
+    "TLSV1_ALERT_ACCESS_DENIED\0"
+    "TLSV1_ALERT_BAD_CERTIFICATE_HASH_VALUE\0"
+    "TLSV1_ALERT_BAD_CERTIFICATE_STATUS_RESPONSE\0"
+    "TLSV1_ALERT_CERTIFICATE_REQUIRED\0"
+    "TLSV1_ALERT_CERTIFICATE_UNOBTAINABLE\0"
+    "TLSV1_ALERT_DECODE_ERROR\0"
+    "TLSV1_ALERT_DECRYPTION_FAILED\0"
+    "TLSV1_ALERT_DECRYPT_ERROR\0"
+    "TLSV1_ALERT_ECH_REQUIRED\0"
+    "TLSV1_ALERT_EXPORT_RESTRICTION\0"
+    "TLSV1_ALERT_INAPPROPRIATE_FALLBACK\0"
+    "TLSV1_ALERT_INSUFFICIENT_SECURITY\0"
+    "TLSV1_ALERT_INTERNAL_ERROR\0"
+    "TLSV1_ALERT_NO_APPLICATION_PROTOCOL\0"
+    "TLSV1_ALERT_NO_RENEGOTIATION\0"
+    "TLSV1_ALERT_PROTOCOL_VERSION\0"
+    "TLSV1_ALERT_RECORD_OVERFLOW\0"
+    "TLSV1_ALERT_UNKNOWN_CA\0"
+    "TLSV1_ALERT_UNKNOWN_PSK_IDENTITY\0"
+    "TLSV1_ALERT_UNRECOGNIZED_NAME\0"
+    "TLSV1_ALERT_UNSUPPORTED_EXTENSION\0"
+    "TLSV1_ALERT_USER_CANCELLED\0"
+    "TLS_PEER_DID_NOT_RESPOND_WITH_CERTIFICATE_LIST\0"
+    "TLS_RSA_ENCRYPTED_VALUE_LENGTH_IS_WRONG\0"
+    "TOO_MANY_EMPTY_FRAGMENTS\0"
+    "TOO_MANY_KEY_UPDATES\0"
+    "TOO_MANY_WARNING_ALERTS\0"
+    "TOO_MUCH_READ_EARLY_DATA\0"
+    "TOO_MUCH_SKIPPED_EARLY_DATA\0"
+    "UNABLE_TO_FIND_ECDH_PARAMETERS\0"
+    "UNCOMPRESSED_CERT_TOO_LARGE\0"
+    "UNEXPECTED_COMPATIBILITY_MODE\0"
+    "UNEXPECTED_EXTENSION\0"
+    "UNEXPECTED_EXTENSION_ON_EARLY_DATA\0"
+    "UNEXPECTED_MESSAGE\0"
+    "UNEXPECTED_OPERATOR_IN_GROUP\0"
+    "UNEXPECTED_RECORD\0"
+    "UNKNOWN_ALERT_TYPE\0"
+    "UNKNOWN_CERTIFICATE_TYPE\0"
+    "UNKNOWN_CERT_COMPRESSION_ALG\0"
+    "UNKNOWN_CIPHER_RETURNED\0"
+    "UNKNOWN_CIPHER_TYPE\0"
+    "UNKNOWN_KEY_EXCHANGE_TYPE\0"
+    "UNKNOWN_PROTOCOL\0"
+    "UNKNOWN_SSL_VERSION\0"
+    "UNKNOWN_STATE\0"
+    "UNSAFE_LEGACY_RENEGOTIATION_DISABLED\0"
+    "UNSUPPORTED_COMPRESSION_ALGORITHM\0"
+    "UNSUPPORTED_ECH_SERVER_CONFIG\0"
+    "UNSUPPORTED_ELLIPTIC_CURVE\0"
+    "UNSUPPORTED_PROTOCOL\0"
+    "UNSUPPORTED_PROTOCOL_FOR_CUSTOM_KEY\0"
+    "WRONG_CERTIFICATE_TYPE\0"
+    "WRONG_CIPHER_RETURNED\0"
+    "WRONG_CURVE\0"
+    "WRONG_ENCRYPTION_LEVEL_RECEIVED\0"
+    "WRONG_MESSAGE_TYPE\0"
+    "WRONG_SIGNATURE_TYPE\0"
+    "WRONG_SSL_VERSION\0"
+    "WRONG_VERSION_NUMBER\0"
+    "WRONG_VERSION_ON_EARLY_DATA\0"
+    "X509_LIB\0"
+    "X509_VERIFICATION_SETUP_PROBLEMS\0"
+    "BAD_VALIDITY_CHECK\0"
+    "DECODE_FAILURE\0"
+    "INVALID_KEY_ID\0"
+    "INVALID_METADATA\0"
+    "INVALID_METADATA_KEY\0"
+    "INVALID_PROOF\0"
+    "INVALID_TOKEN\0"
+    "NO_KEYS_CONFIGURED\0"
+    "NO_SRR_KEY_CONFIGURED\0"
+    "OVER_BATCHSIZE\0"
+    "SRR_SIGNATURE_ERROR\0"
+    "TOO_MANY_KEYS\0"
+    "AKID_MISMATCH\0"
+    "BAD_X509_FILETYPE\0"
+    "BASE64_DECODE_ERROR\0"
+    "CANT_CHECK_DH_KEY\0"
+    "CERT_ALREADY_IN_HASH_TABLE\0"
+    "CRL_ALREADY_DELTA\0"
+    "CRL_VERIFY_FAILURE\0"
+    "DELTA_CRL_WITHOUT_CRL_NUMBER\0"
+    "IDP_MISMATCH\0"
+    "INVALID_DIRECTORY\0"
+    "INVALID_FIELD_FOR_VERSION\0"
+    "INVALID_FIELD_NAME\0"
+    "INVALID_PARAMETER\0"
+    "INVALID_POLICY_EXTENSION\0"
+    "INVALID_PSS_PARAMETERS\0"
+    "INVALID_TRUST\0"
+    "INVALID_VERSION\0"
+    "ISSUER_MISMATCH\0"
+    "KEY_TYPE_MISMATCH\0"
+    "KEY_VALUES_MISMATCH\0"
+    "LOADING_CERT_DIR\0"
+    "LOADING_DEFAULTS\0"
+    "NAME_TOO_LONG\0"
+    "NEWER_CRL_NOT_NEWER\0"
+    "NO_CERTIFICATE_FOUND\0"
+    "NO_CERTIFICATE_OR_CRL_FOUND\0"
+    "NO_CERT_SET_FOR_US_TO_VERIFY\0"
+    "NO_CRL_FOUND\0"
+    "NO_CRL_NUMBER\0"
+    "PUBLIC_KEY_DECODE_ERROR\0"
+    "PUBLIC_KEY_ENCODE_ERROR\0"
+    "SHOULD_RETRY\0"
+    "SIGNATURE_ALGORITHM_MISMATCH\0"
+    "UNKNOWN_KEY_TYPE\0"
+    "UNKNOWN_PURPOSE_ID\0"
+    "UNKNOWN_TRUST_ID\0"
+    "WRONG_LOOKUP_TYPE\0"
+    "BAD_IP_ADDRESS\0"
+    "BAD_OBJECT\0"
+    "BN_DEC2BN_ERROR\0"
+    "BN_TO_ASN1_INTEGER_ERROR\0"
+    "CANNOT_FIND_FREE_FUNCTION\0"
+    "DIRNAME_ERROR\0"
+    "DISTPOINT_ALREADY_SET\0"
+    "DUPLICATE_ZONE_ID\0"
+    "ERROR_CONVERTING_ZONE\0"
+    "ERROR_CREATING_EXTENSION\0"
+    "ERROR_IN_EXTENSION\0"
+    "EXPECTED_A_SECTION_NAME\0"
+    "EXTENSION_EXISTS\0"
+    "EXTENSION_NAME_ERROR\0"
+    "EXTENSION_NOT_FOUND\0"
+    "EXTENSION_SETTING_NOT_SUPPORTED\0"
+    "EXTENSION_VALUE_ERROR\0"
+    "ILLEGAL_EMPTY_EXTENSION\0"
+    "ILLEGAL_HEX_DIGIT\0"
+    "INCORRECT_POLICY_SYNTAX_TAG\0"
+    "INVALID_BOOLEAN_STRING\0"
+    "INVALID_EXTENSION_STRING\0"
+    "INVALID_MULTIPLE_RDNS\0"
+    "INVALID_NAME\0"
+    "INVALID_NULL_ARGUMENT\0"
+    "INVALID_NULL_NAME\0"
+    "INVALID_NULL_VALUE\0"
+    "INVALID_NUMBERS\0"
+    "INVALID_OBJECT_IDENTIFIER\0"
+    "INVALID_OPTION\0"
+    "INVALID_POLICY_IDENTIFIER\0"
+    "INVALID_PROXY_POLICY_SETTING\0"
+    "INVALID_PURPOSE\0"
+    "INVALID_SECTION\0"
+    "INVALID_SYNTAX\0"
+    "INVALID_VALUE\0"
+    "ISSUER_DECODE_ERROR\0"
+    "NEED_ORGANIZATION_AND_NUMBERS\0"
+    "NO_CONFIG_DATABASE\0"
+    "NO_ISSUER_CERTIFICATE\0"
+    "NO_ISSUER_DETAILS\0"
+    "NO_POLICY_IDENTIFIER\0"
+    "NO_PROXY_CERT_POLICY_LANGUAGE_DEFINED\0"
+    "NO_PUBLIC_KEY\0"
+    "NO_SUBJECT_DETAILS\0"
+    "ODD_NUMBER_OF_DIGITS\0"
+    "OPERATION_NOT_DEFINED\0"
+    "OTHERNAME_ERROR\0"
+    "POLICY_LANGUAGE_ALREADY_DEFINED\0"
+    "POLICY_PATH_LENGTH\0"
+    "POLICY_PATH_LENGTH_ALREADY_DEFINED\0"
+    "POLICY_WHEN_PROXY_LANGUAGE_REQUIRES_NO_POLICY\0"
+    "SECTION_NOT_FOUND\0"
+    "TRAILING_DATA_IN_EXTENSION\0"
+    "UNABLE_TO_GET_ISSUER_DETAILS\0"
+    "UNABLE_TO_GET_ISSUER_KEYID\0"
+    "UNKNOWN_BIT_STRING_ARGUMENT\0"
+    "UNKNOWN_EXTENSION\0"
+    "UNKNOWN_EXTENSION_NAME\0"
+    "UNKNOWN_OPTION\0"
+    "UNSUPPORTED_OPTION\0"
+    "USER_TOO_LONG\0"
+    "";
+
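For anyone reading the table above: following the layout documented in crypto/err/err.c, each uint32_t in kOpenSSLReasonValues packs a 6-bit library identifier, an 11-bit reason code, and a 15-bit byte offset into kOpenSSLReasonStringData, and the array is sorted so lookups can binary-search it. A small Go sketch of the unpacking (decodeReason is our name, not an API, and it assumes the string table is available as a Go string):

package sketch

import "strings"

// decodeReason unpacks one kOpenSSLReasonValues entry:
//
//	| lib (6 bits) | reason (11 bits) | offset (15 bits) |
//
// For example, 0x30320000 decodes to lib 12 (ERR_LIB_ASN1), reason 100,
// and the string at offset 0, "ASN1_LENGTH_MISMATCH".
func decodeReason(v uint32, stringData string) (lib, reason uint32, msg string) {
	lib = v >> 26
	reason = (v >> 15) & 0x7ff
	off := int(v & 0x7fff)
	// Strings in the table are NUL-terminated; find the end of this one.
	end := off + strings.IndexByte(stringData[off:], 0)
	return lib, reason, stringData[off:end]
}
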
diff --git a/gen/sources.cmake b/gen/sources.cmake
new file mode 100644
index 0000000..03c5d3a
--- /dev/null
+++ b/gen/sources.cmake
@@ -0,0 +1,210 @@
+# Copyright (c) 2024, Google Inc.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+#
+# Generated by go ./util/pregenerate. Do not edit manually.
+
+set(
+  BCM_SOURCES_ASM
+
+  gen/bcm/aesni-gcm-x86_64-apple.S
+  gen/bcm/aesni-gcm-x86_64-linux.S
+  gen/bcm/aesni-x86-apple.S
+  gen/bcm/aesni-x86-linux.S
+  gen/bcm/aesni-x86_64-apple.S
+  gen/bcm/aesni-x86_64-linux.S
+  gen/bcm/aesv8-armv7-linux.S
+  gen/bcm/aesv8-armv8-apple.S
+  gen/bcm/aesv8-armv8-linux.S
+  gen/bcm/aesv8-armv8-win.S
+  gen/bcm/aesv8-gcm-armv8-apple.S
+  gen/bcm/aesv8-gcm-armv8-linux.S
+  gen/bcm/aesv8-gcm-armv8-win.S
+  gen/bcm/armv4-mont-linux.S
+  gen/bcm/armv8-mont-apple.S
+  gen/bcm/armv8-mont-linux.S
+  gen/bcm/armv8-mont-win.S
+  gen/bcm/bn-586-apple.S
+  gen/bcm/bn-586-linux.S
+  gen/bcm/bn-armv8-apple.S
+  gen/bcm/bn-armv8-linux.S
+  gen/bcm/bn-armv8-win.S
+  gen/bcm/bsaes-armv7-linux.S
+  gen/bcm/co-586-apple.S
+  gen/bcm/co-586-linux.S
+  gen/bcm/ghash-armv4-linux.S
+  gen/bcm/ghash-neon-armv8-apple.S
+  gen/bcm/ghash-neon-armv8-linux.S
+  gen/bcm/ghash-neon-armv8-win.S
+  gen/bcm/ghash-ssse3-x86-apple.S
+  gen/bcm/ghash-ssse3-x86-linux.S
+  gen/bcm/ghash-ssse3-x86_64-apple.S
+  gen/bcm/ghash-ssse3-x86_64-linux.S
+  gen/bcm/ghash-x86-apple.S
+  gen/bcm/ghash-x86-linux.S
+  gen/bcm/ghash-x86_64-apple.S
+  gen/bcm/ghash-x86_64-linux.S
+  gen/bcm/ghashv8-armv7-linux.S
+  gen/bcm/ghashv8-armv8-apple.S
+  gen/bcm/ghashv8-armv8-linux.S
+  gen/bcm/ghashv8-armv8-win.S
+  gen/bcm/md5-586-apple.S
+  gen/bcm/md5-586-linux.S
+  gen/bcm/md5-x86_64-apple.S
+  gen/bcm/md5-x86_64-linux.S
+  gen/bcm/p256-armv8-asm-apple.S
+  gen/bcm/p256-armv8-asm-linux.S
+  gen/bcm/p256-armv8-asm-win.S
+  gen/bcm/p256-x86_64-asm-apple.S
+  gen/bcm/p256-x86_64-asm-linux.S
+  gen/bcm/p256_beeu-armv8-asm-apple.S
+  gen/bcm/p256_beeu-armv8-asm-linux.S
+  gen/bcm/p256_beeu-armv8-asm-win.S
+  gen/bcm/p256_beeu-x86_64-asm-apple.S
+  gen/bcm/p256_beeu-x86_64-asm-linux.S
+  gen/bcm/rdrand-x86_64-apple.S
+  gen/bcm/rdrand-x86_64-linux.S
+  gen/bcm/rsaz-avx2-apple.S
+  gen/bcm/rsaz-avx2-linux.S
+  gen/bcm/sha1-586-apple.S
+  gen/bcm/sha1-586-linux.S
+  gen/bcm/sha1-armv4-large-linux.S
+  gen/bcm/sha1-armv8-apple.S
+  gen/bcm/sha1-armv8-linux.S
+  gen/bcm/sha1-armv8-win.S
+  gen/bcm/sha1-x86_64-apple.S
+  gen/bcm/sha1-x86_64-linux.S
+  gen/bcm/sha256-586-apple.S
+  gen/bcm/sha256-586-linux.S
+  gen/bcm/sha256-armv4-linux.S
+  gen/bcm/sha256-armv8-apple.S
+  gen/bcm/sha256-armv8-linux.S
+  gen/bcm/sha256-armv8-win.S
+  gen/bcm/sha256-x86_64-apple.S
+  gen/bcm/sha256-x86_64-linux.S
+  gen/bcm/sha512-586-apple.S
+  gen/bcm/sha512-586-linux.S
+  gen/bcm/sha512-armv4-linux.S
+  gen/bcm/sha512-armv8-apple.S
+  gen/bcm/sha512-armv8-linux.S
+  gen/bcm/sha512-armv8-win.S
+  gen/bcm/sha512-x86_64-apple.S
+  gen/bcm/sha512-x86_64-linux.S
+  gen/bcm/vpaes-armv7-linux.S
+  gen/bcm/vpaes-armv8-apple.S
+  gen/bcm/vpaes-armv8-linux.S
+  gen/bcm/vpaes-armv8-win.S
+  gen/bcm/vpaes-x86-apple.S
+  gen/bcm/vpaes-x86-linux.S
+  gen/bcm/vpaes-x86_64-apple.S
+  gen/bcm/vpaes-x86_64-linux.S
+  gen/bcm/x86-mont-apple.S
+  gen/bcm/x86-mont-linux.S
+  gen/bcm/x86_64-mont-apple.S
+  gen/bcm/x86_64-mont-linux.S
+  gen/bcm/x86_64-mont5-apple.S
+  gen/bcm/x86_64-mont5-linux.S
+)
+
+set(
+  BCM_SOURCES_NASM
+
+  gen/bcm/aesni-gcm-x86_64-win.asm
+  gen/bcm/aesni-x86-win.asm
+  gen/bcm/aesni-x86_64-win.asm
+  gen/bcm/bn-586-win.asm
+  gen/bcm/co-586-win.asm
+  gen/bcm/ghash-ssse3-x86-win.asm
+  gen/bcm/ghash-ssse3-x86_64-win.asm
+  gen/bcm/ghash-x86-win.asm
+  gen/bcm/ghash-x86_64-win.asm
+  gen/bcm/md5-586-win.asm
+  gen/bcm/md5-x86_64-win.asm
+  gen/bcm/p256-x86_64-asm-win.asm
+  gen/bcm/p256_beeu-x86_64-asm-win.asm
+  gen/bcm/rdrand-x86_64-win.asm
+  gen/bcm/rsaz-avx2-win.asm
+  gen/bcm/sha1-586-win.asm
+  gen/bcm/sha1-x86_64-win.asm
+  gen/bcm/sha256-586-win.asm
+  gen/bcm/sha256-x86_64-win.asm
+  gen/bcm/sha512-586-win.asm
+  gen/bcm/sha512-x86_64-win.asm
+  gen/bcm/vpaes-x86-win.asm
+  gen/bcm/vpaes-x86_64-win.asm
+  gen/bcm/x86-mont-win.asm
+  gen/bcm/x86_64-mont-win.asm
+  gen/bcm/x86_64-mont5-win.asm
+)
+
+set(
+  CRYPTO_SOURCES
+
+  gen/crypto/err_data.c
+)
+
+set(
+  CRYPTO_SOURCES_ASM
+
+  crypto/curve25519/asm/x25519-asm-arm.S
+  crypto/hrss/asm/poly_rq_mul.S
+  crypto/poly1305/poly1305_arm_asm.S
+  gen/crypto/aes128gcmsiv-x86_64-apple.S
+  gen/crypto/aes128gcmsiv-x86_64-linux.S
+  gen/crypto/chacha-armv4-linux.S
+  gen/crypto/chacha-armv8-apple.S
+  gen/crypto/chacha-armv8-linux.S
+  gen/crypto/chacha-armv8-win.S
+  gen/crypto/chacha-x86-apple.S
+  gen/crypto/chacha-x86-linux.S
+  gen/crypto/chacha-x86_64-apple.S
+  gen/crypto/chacha-x86_64-linux.S
+  gen/crypto/chacha20_poly1305_armv8-apple.S
+  gen/crypto/chacha20_poly1305_armv8-linux.S
+  gen/crypto/chacha20_poly1305_armv8-win.S
+  gen/crypto/chacha20_poly1305_x86_64-apple.S
+  gen/crypto/chacha20_poly1305_x86_64-linux.S
+  third_party/fiat/asm/fiat_curve25519_adx_mul.S
+  third_party/fiat/asm/fiat_curve25519_adx_square.S
+  third_party/fiat/asm/fiat_p256_adx_mul.S
+  third_party/fiat/asm/fiat_p256_adx_sqr.S
+)
+
+set(
+  CRYPTO_SOURCES_NASM
+
+  gen/crypto/aes128gcmsiv-x86_64-win.asm
+  gen/crypto/chacha-x86-win.asm
+  gen/crypto/chacha-x86_64-win.asm
+  gen/crypto/chacha20_poly1305_x86_64-win.asm
+)
+
+set(
+  TEST_SUPPORT_SOURCES_ASM
+
+  gen/test_support/trampoline-armv4-linux.S
+  gen/test_support/trampoline-armv8-apple.S
+  gen/test_support/trampoline-armv8-linux.S
+  gen/test_support/trampoline-armv8-win.S
+  gen/test_support/trampoline-x86-apple.S
+  gen/test_support/trampoline-x86-linux.S
+  gen/test_support/trampoline-x86_64-apple.S
+  gen/test_support/trampoline-x86_64-linux.S
+)
+
+set(
+  TEST_SUPPORT_SOURCES_NASM
+
+  gen/test_support/trampoline-x86-win.asm
+  gen/test_support/trampoline-x86_64-win.asm
+)
diff --git a/gen/sources.json b/gen/sources.json
new file mode 100644
index 0000000..785ac73
--- /dev/null
+++ b/gen/sources.json
@@ -0,0 +1,182 @@
+{
+  "bcm": {
+    "asm": [
+      "gen/bcm/aesni-gcm-x86_64-apple.S",
+      "gen/bcm/aesni-gcm-x86_64-linux.S",
+      "gen/bcm/aesni-x86-apple.S",
+      "gen/bcm/aesni-x86-linux.S",
+      "gen/bcm/aesni-x86_64-apple.S",
+      "gen/bcm/aesni-x86_64-linux.S",
+      "gen/bcm/aesv8-armv7-linux.S",
+      "gen/bcm/aesv8-armv8-apple.S",
+      "gen/bcm/aesv8-armv8-linux.S",
+      "gen/bcm/aesv8-armv8-win.S",
+      "gen/bcm/aesv8-gcm-armv8-apple.S",
+      "gen/bcm/aesv8-gcm-armv8-linux.S",
+      "gen/bcm/aesv8-gcm-armv8-win.S",
+      "gen/bcm/armv4-mont-linux.S",
+      "gen/bcm/armv8-mont-apple.S",
+      "gen/bcm/armv8-mont-linux.S",
+      "gen/bcm/armv8-mont-win.S",
+      "gen/bcm/bn-586-apple.S",
+      "gen/bcm/bn-586-linux.S",
+      "gen/bcm/bn-armv8-apple.S",
+      "gen/bcm/bn-armv8-linux.S",
+      "gen/bcm/bn-armv8-win.S",
+      "gen/bcm/bsaes-armv7-linux.S",
+      "gen/bcm/co-586-apple.S",
+      "gen/bcm/co-586-linux.S",
+      "gen/bcm/ghash-armv4-linux.S",
+      "gen/bcm/ghash-neon-armv8-apple.S",
+      "gen/bcm/ghash-neon-armv8-linux.S",
+      "gen/bcm/ghash-neon-armv8-win.S",
+      "gen/bcm/ghash-ssse3-x86-apple.S",
+      "gen/bcm/ghash-ssse3-x86-linux.S",
+      "gen/bcm/ghash-ssse3-x86_64-apple.S",
+      "gen/bcm/ghash-ssse3-x86_64-linux.S",
+      "gen/bcm/ghash-x86-apple.S",
+      "gen/bcm/ghash-x86-linux.S",
+      "gen/bcm/ghash-x86_64-apple.S",
+      "gen/bcm/ghash-x86_64-linux.S",
+      "gen/bcm/ghashv8-armv7-linux.S",
+      "gen/bcm/ghashv8-armv8-apple.S",
+      "gen/bcm/ghashv8-armv8-linux.S",
+      "gen/bcm/ghashv8-armv8-win.S",
+      "gen/bcm/md5-586-apple.S",
+      "gen/bcm/md5-586-linux.S",
+      "gen/bcm/md5-x86_64-apple.S",
+      "gen/bcm/md5-x86_64-linux.S",
+      "gen/bcm/p256-armv8-asm-apple.S",
+      "gen/bcm/p256-armv8-asm-linux.S",
+      "gen/bcm/p256-armv8-asm-win.S",
+      "gen/bcm/p256-x86_64-asm-apple.S",
+      "gen/bcm/p256-x86_64-asm-linux.S",
+      "gen/bcm/p256_beeu-armv8-asm-apple.S",
+      "gen/bcm/p256_beeu-armv8-asm-linux.S",
+      "gen/bcm/p256_beeu-armv8-asm-win.S",
+      "gen/bcm/p256_beeu-x86_64-asm-apple.S",
+      "gen/bcm/p256_beeu-x86_64-asm-linux.S",
+      "gen/bcm/rdrand-x86_64-apple.S",
+      "gen/bcm/rdrand-x86_64-linux.S",
+      "gen/bcm/rsaz-avx2-apple.S",
+      "gen/bcm/rsaz-avx2-linux.S",
+      "gen/bcm/sha1-586-apple.S",
+      "gen/bcm/sha1-586-linux.S",
+      "gen/bcm/sha1-armv4-large-linux.S",
+      "gen/bcm/sha1-armv8-apple.S",
+      "gen/bcm/sha1-armv8-linux.S",
+      "gen/bcm/sha1-armv8-win.S",
+      "gen/bcm/sha1-x86_64-apple.S",
+      "gen/bcm/sha1-x86_64-linux.S",
+      "gen/bcm/sha256-586-apple.S",
+      "gen/bcm/sha256-586-linux.S",
+      "gen/bcm/sha256-armv4-linux.S",
+      "gen/bcm/sha256-armv8-apple.S",
+      "gen/bcm/sha256-armv8-linux.S",
+      "gen/bcm/sha256-armv8-win.S",
+      "gen/bcm/sha256-x86_64-apple.S",
+      "gen/bcm/sha256-x86_64-linux.S",
+      "gen/bcm/sha512-586-apple.S",
+      "gen/bcm/sha512-586-linux.S",
+      "gen/bcm/sha512-armv4-linux.S",
+      "gen/bcm/sha512-armv8-apple.S",
+      "gen/bcm/sha512-armv8-linux.S",
+      "gen/bcm/sha512-armv8-win.S",
+      "gen/bcm/sha512-x86_64-apple.S",
+      "gen/bcm/sha512-x86_64-linux.S",
+      "gen/bcm/vpaes-armv7-linux.S",
+      "gen/bcm/vpaes-armv8-apple.S",
+      "gen/bcm/vpaes-armv8-linux.S",
+      "gen/bcm/vpaes-armv8-win.S",
+      "gen/bcm/vpaes-x86-apple.S",
+      "gen/bcm/vpaes-x86-linux.S",
+      "gen/bcm/vpaes-x86_64-apple.S",
+      "gen/bcm/vpaes-x86_64-linux.S",
+      "gen/bcm/x86-mont-apple.S",
+      "gen/bcm/x86-mont-linux.S",
+      "gen/bcm/x86_64-mont-apple.S",
+      "gen/bcm/x86_64-mont-linux.S",
+      "gen/bcm/x86_64-mont5-apple.S",
+      "gen/bcm/x86_64-mont5-linux.S"
+    ],
+    "nasm": [
+      "gen/bcm/aesni-gcm-x86_64-win.asm",
+      "gen/bcm/aesni-x86-win.asm",
+      "gen/bcm/aesni-x86_64-win.asm",
+      "gen/bcm/bn-586-win.asm",
+      "gen/bcm/co-586-win.asm",
+      "gen/bcm/ghash-ssse3-x86-win.asm",
+      "gen/bcm/ghash-ssse3-x86_64-win.asm",
+      "gen/bcm/ghash-x86-win.asm",
+      "gen/bcm/ghash-x86_64-win.asm",
+      "gen/bcm/md5-586-win.asm",
+      "gen/bcm/md5-x86_64-win.asm",
+      "gen/bcm/p256-x86_64-asm-win.asm",
+      "gen/bcm/p256_beeu-x86_64-asm-win.asm",
+      "gen/bcm/rdrand-x86_64-win.asm",
+      "gen/bcm/rsaz-avx2-win.asm",
+      "gen/bcm/sha1-586-win.asm",
+      "gen/bcm/sha1-x86_64-win.asm",
+      "gen/bcm/sha256-586-win.asm",
+      "gen/bcm/sha256-x86_64-win.asm",
+      "gen/bcm/sha512-586-win.asm",
+      "gen/bcm/sha512-x86_64-win.asm",
+      "gen/bcm/vpaes-x86-win.asm",
+      "gen/bcm/vpaes-x86_64-win.asm",
+      "gen/bcm/x86-mont-win.asm",
+      "gen/bcm/x86_64-mont-win.asm",
+      "gen/bcm/x86_64-mont5-win.asm"
+    ]
+  },
+  "crypto": {
+    "srcs": [
+      "gen/crypto/err_data.c"
+    ],
+    "asm": [
+      "crypto/curve25519/asm/x25519-asm-arm.S",
+      "crypto/hrss/asm/poly_rq_mul.S",
+      "crypto/poly1305/poly1305_arm_asm.S",
+      "gen/crypto/aes128gcmsiv-x86_64-apple.S",
+      "gen/crypto/aes128gcmsiv-x86_64-linux.S",
+      "gen/crypto/chacha-armv4-linux.S",
+      "gen/crypto/chacha-armv8-apple.S",
+      "gen/crypto/chacha-armv8-linux.S",
+      "gen/crypto/chacha-armv8-win.S",
+      "gen/crypto/chacha-x86-apple.S",
+      "gen/crypto/chacha-x86-linux.S",
+      "gen/crypto/chacha-x86_64-apple.S",
+      "gen/crypto/chacha-x86_64-linux.S",
+      "gen/crypto/chacha20_poly1305_armv8-apple.S",
+      "gen/crypto/chacha20_poly1305_armv8-linux.S",
+      "gen/crypto/chacha20_poly1305_armv8-win.S",
+      "gen/crypto/chacha20_poly1305_x86_64-apple.S",
+      "gen/crypto/chacha20_poly1305_x86_64-linux.S",
+      "third_party/fiat/asm/fiat_curve25519_adx_mul.S",
+      "third_party/fiat/asm/fiat_curve25519_adx_square.S",
+      "third_party/fiat/asm/fiat_p256_adx_mul.S",
+      "third_party/fiat/asm/fiat_p256_adx_sqr.S"
+    ],
+    "nasm": [
+      "gen/crypto/aes128gcmsiv-x86_64-win.asm",
+      "gen/crypto/chacha-x86-win.asm",
+      "gen/crypto/chacha-x86_64-win.asm",
+      "gen/crypto/chacha20_poly1305_x86_64-win.asm"
+    ]
+  },
+  "test_support": {
+    "asm": [
+      "gen/test_support/trampoline-armv4-linux.S",
+      "gen/test_support/trampoline-armv8-apple.S",
+      "gen/test_support/trampoline-armv8-linux.S",
+      "gen/test_support/trampoline-armv8-win.S",
+      "gen/test_support/trampoline-x86-apple.S",
+      "gen/test_support/trampoline-x86-linux.S",
+      "gen/test_support/trampoline-x86_64-apple.S",
+      "gen/test_support/trampoline-x86_64-linux.S"
+    ],
+    "nasm": [
+      "gen/test_support/trampoline-x86-win.asm",
+      "gen/test_support/trampoline-x86_64-win.asm"
+    ]
+  }
+}
\ No newline at end of file
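
Since generate_build_files.py now parses gen/sources.json, the shape of this file matters more than the CMake rendering above. A minimal Go sketch of a consumer, with struct tags mirroring the keys in the JSON (this is illustrative, not the actual tooling):

package main

import (
	"encoding/json"
	"fmt"
	"os"
)

// target holds the per-target file lists from gen/sources.json. Not every
// target has every field; absent keys simply leave the slice nil.
type target struct {
	Srcs []string `json:"srcs"`
	Asm  []string `json:"asm"`
	Nasm []string `json:"nasm"`
}

func main() {
	data, err := os.ReadFile("gen/sources.json")
	if err != nil {
		panic(err)
	}
	var targets map[string]target
	if err := json.Unmarshal(data, &targets); err != nil {
		panic(err)
	}
	for name, t := range targets {
		fmt.Printf("%s: %d C, %d asm, %d nasm files\n",
			name, len(t.Srcs), len(t.Asm), len(t.Nasm))
	}
}
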
diff --git a/gen/test_support/trampoline-armv4-linux.S b/gen/test_support/trampoline-armv4-linux.S
new file mode 100644
index 0000000..34a2819
--- /dev/null
+++ b/gen/test_support/trampoline-armv4-linux.S
@@ -0,0 +1,368 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
+.syntax	unified
+
+.arch	armv7-a
+.fpu	vfp
+
+.text
+
+@ abi_test_trampoline loads callee-saved registers from |state|, calls |func|
+@ with |argv|, then saves the callee-saved registers into |state|. It returns
+@ the result of |func|. The |unwind| argument is unused.
+@ uint32_t abi_test_trampoline(void (*func)(...), CallerState *state,
+@                              const uint32_t *argv, size_t argc,
+@                              int unwind);
+.type	abi_test_trampoline, %function
+.globl	abi_test_trampoline
+.hidden	abi_test_trampoline
+.align	4
+abi_test_trampoline:
+	@ Save parameters and all callee-saved registers. For convenience, we
+	@ save r9 on iOS even though it's volatile.
+	vstmdb	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+	stmdb	sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
+
+	@ Reserve stack space for six (10-4) stack parameters, plus an extra 4
+	@ bytes to keep it 8-byte-aligned (see AAPCS, section 5.3).
+	sub	sp, sp, #28
+
+	@ Every register in AAPCS is either non-volatile or a parameter (except
+	@ r9 on iOS), so this code, by the actual call, loses all its scratch
+	@ registers. First fill in stack parameters while there are registers
+	@ to spare.
+	cmp	r3, #4
+	bls	.Lstack_args_done
+	mov	r4, sp				@ r4 is the output pointer.
+	add	r5, r2, r3, lsl #2	@ Set r5 to the end of argv.
+	add	r2, r2, #16		@ Skip four arguments.
+.Lstack_args_loop:
+	ldr	r6, [r2], #4
+	cmp	r2, r5
+	str	r6, [r4], #4
+	bne	.Lstack_args_loop
+
+.Lstack_args_done:
+	@ Load registers from |r1|.
+	vldmia	r1!, {d8,d9,d10,d11,d12,d13,d14,d15}
+#if defined(__APPLE__)
+	@ r9 is not volatile on iOS.
+	ldmia	r1!, {r4,r5,r6,r7,r8,r10-r11}
+#else
+	ldmia	r1!, {r4,r5,r6,r7,r8,r9,r10,r11}
+#endif
+
+	@ Load register parameters. This uses up our remaining registers, so we
+	@ repurpose lr as scratch space.
+	ldr	r3, [sp, #40]	@ Reload argc.
+	ldr	lr, [sp, #36]		@ Load argv into lr.
+	cmp	r3, #3
+	bhi	.Larg_r3
+	beq	.Larg_r2
+	cmp	r3, #1
+	bhi	.Larg_r1
+	beq	.Larg_r0
+	b	.Largs_done
+
+.Larg_r3:
+	ldr	r3, [lr, #12]	@ argv[3]
+.Larg_r2:
+	ldr	r2, [lr, #8]	@ argv[2]
+.Larg_r1:
+	ldr	r1, [lr, #4]	@ argv[1]
+.Larg_r0:
+	ldr	r0, [lr]	@ argv[0]
+.Largs_done:
+
+	@ With every other register in use, load the function pointer into lr
+	@ and call the function.
+	ldr	lr, [sp, #28]
+	blx	lr
+
+	@ r1-r3 are free for use again. The trampoline only supports
+	@ single-return functions. Pass r4-r11 to the caller.
+	ldr	r1, [sp, #32]
+	vstmia	r1!, {d8,d9,d10,d11,d12,d13,d14,d15}
+#if defined(__APPLE__)
+	@ r9 is not volatile on iOS.
+	stmia	r1!, {r4,r5,r6,r7,r8,r10-r11}
+#else
+	stmia	r1!, {r4,r5,r6,r7,r8,r9,r10,r11}
+#endif
+
+	@ Unwind the stack and restore registers.
+	add	sp, sp, #44		@ 44 = 28+16
+	ldmia	sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr}	@ Skip r0-r3 (see +16 above).
+	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+
+	bx	lr
+.size	abi_test_trampoline,.-abi_test_trampoline
+.type	abi_test_clobber_r0, %function
+.globl	abi_test_clobber_r0
+.hidden	abi_test_clobber_r0
+.align	4
+abi_test_clobber_r0:
+	mov	r0, #0
+	bx	lr
+.size	abi_test_clobber_r0,.-abi_test_clobber_r0
+.type	abi_test_clobber_r1, %function
+.globl	abi_test_clobber_r1
+.hidden	abi_test_clobber_r1
+.align	4
+abi_test_clobber_r1:
+	mov	r1, #0
+	bx	lr
+.size	abi_test_clobber_r1,.-abi_test_clobber_r1
+.type	abi_test_clobber_r2, %function
+.globl	abi_test_clobber_r2
+.hidden	abi_test_clobber_r2
+.align	4
+abi_test_clobber_r2:
+	mov	r2, #0
+	bx	lr
+.size	abi_test_clobber_r2,.-abi_test_clobber_r2
+.type	abi_test_clobber_r3, %function
+.globl	abi_test_clobber_r3
+.hidden	abi_test_clobber_r3
+.align	4
+abi_test_clobber_r3:
+	mov	r3, #0
+	bx	lr
+.size	abi_test_clobber_r3,.-abi_test_clobber_r3
+.type	abi_test_clobber_r4, %function
+.globl	abi_test_clobber_r4
+.hidden	abi_test_clobber_r4
+.align	4
+abi_test_clobber_r4:
+	mov	r4, #0
+	bx	lr
+.size	abi_test_clobber_r4,.-abi_test_clobber_r4
+.type	abi_test_clobber_r5, %function
+.globl	abi_test_clobber_r5
+.hidden	abi_test_clobber_r5
+.align	4
+abi_test_clobber_r5:
+	mov	r5, #0
+	bx	lr
+.size	abi_test_clobber_r5,.-abi_test_clobber_r5
+.type	abi_test_clobber_r6, %function
+.globl	abi_test_clobber_r6
+.hidden	abi_test_clobber_r6
+.align	4
+abi_test_clobber_r6:
+	mov	r6, #0
+	bx	lr
+.size	abi_test_clobber_r6,.-abi_test_clobber_r6
+.type	abi_test_clobber_r7, %function
+.globl	abi_test_clobber_r7
+.hidden	abi_test_clobber_r7
+.align	4
+abi_test_clobber_r7:
+	mov	r7, #0
+	bx	lr
+.size	abi_test_clobber_r7,.-abi_test_clobber_r7
+.type	abi_test_clobber_r8, %function
+.globl	abi_test_clobber_r8
+.hidden	abi_test_clobber_r8
+.align	4
+abi_test_clobber_r8:
+	mov	r8, #0
+	bx	lr
+.size	abi_test_clobber_r8,.-abi_test_clobber_r8
+.type	abi_test_clobber_r9, %function
+.globl	abi_test_clobber_r9
+.hidden	abi_test_clobber_r9
+.align	4
+abi_test_clobber_r9:
+	mov	r9, #0
+	bx	lr
+.size	abi_test_clobber_r9,.-abi_test_clobber_r9
+.type	abi_test_clobber_r10, %function
+.globl	abi_test_clobber_r10
+.hidden	abi_test_clobber_r10
+.align	4
+abi_test_clobber_r10:
+	mov	r10, #0
+	bx	lr
+.size	abi_test_clobber_r10,.-abi_test_clobber_r10
+.type	abi_test_clobber_r11, %function
+.globl	abi_test_clobber_r11
+.hidden	abi_test_clobber_r11
+.align	4
+abi_test_clobber_r11:
+	mov	r11, #0
+	bx	lr
+.size	abi_test_clobber_r11,.-abi_test_clobber_r11
+.type	abi_test_clobber_r12, %function
+.globl	abi_test_clobber_r12
+.hidden	abi_test_clobber_r12
+.align	4
+abi_test_clobber_r12:
+	mov	r12, #0
+	bx	lr
+.size	abi_test_clobber_r12,.-abi_test_clobber_r12
+.type	abi_test_clobber_d0, %function
+.globl	abi_test_clobber_d0
+.hidden	abi_test_clobber_d0
+.align	4
+abi_test_clobber_d0:
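+	@ d0 aliases s0 and s1 in the VFP register file, so zero both halves.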
+	mov	r0, #0
+	vmov	s0, r0
+	vmov	s1, r0
+	bx	lr
+.size	abi_test_clobber_d0,.-abi_test_clobber_d0
+.type	abi_test_clobber_d1, %function
+.globl	abi_test_clobber_d1
+.hidden	abi_test_clobber_d1
+.align	4
+abi_test_clobber_d1:
+	mov	r0, #0
+	vmov	s2, r0
+	vmov	s3, r0
+	bx	lr
+.size	abi_test_clobber_d1,.-abi_test_clobber_d1
+.type	abi_test_clobber_d2, %function
+.globl	abi_test_clobber_d2
+.hidden	abi_test_clobber_d2
+.align	4
+abi_test_clobber_d2:
+	mov	r0, #0
+	vmov	s4, r0
+	vmov	s5, r0
+	bx	lr
+.size	abi_test_clobber_d2,.-abi_test_clobber_d2
+.type	abi_test_clobber_d3, %function
+.globl	abi_test_clobber_d3
+.hidden	abi_test_clobber_d3
+.align	4
+abi_test_clobber_d3:
+	mov	r0, #0
+	vmov	s6, r0
+	vmov	s7, r0
+	bx	lr
+.size	abi_test_clobber_d3,.-abi_test_clobber_d3
+.type	abi_test_clobber_d4, %function
+.globl	abi_test_clobber_d4
+.hidden	abi_test_clobber_d4
+.align	4
+abi_test_clobber_d4:
+	mov	r0, #0
+	vmov	s8, r0
+	vmov	s9, r0
+	bx	lr
+.size	abi_test_clobber_d4,.-abi_test_clobber_d4
+.type	abi_test_clobber_d5, %function
+.globl	abi_test_clobber_d5
+.hidden	abi_test_clobber_d5
+.align	4
+abi_test_clobber_d5:
+	mov	r0, #0
+	vmov	s10, r0
+	vmov	s11, r0
+	bx	lr
+.size	abi_test_clobber_d5,.-abi_test_clobber_d5
+.type	abi_test_clobber_d6, %function
+.globl	abi_test_clobber_d6
+.hidden	abi_test_clobber_d6
+.align	4
+abi_test_clobber_d6:
+	mov	r0, #0
+	vmov	s12, r0
+	vmov	s13, r0
+	bx	lr
+.size	abi_test_clobber_d6,.-abi_test_clobber_d6
+.type	abi_test_clobber_d7, %function
+.globl	abi_test_clobber_d7
+.hidden	abi_test_clobber_d7
+.align	4
+abi_test_clobber_d7:
+	mov	r0, #0
+	vmov	s14, r0
+	vmov	s15, r0
+	bx	lr
+.size	abi_test_clobber_d7,.-abi_test_clobber_d7
+.type	abi_test_clobber_d8, %function
+.globl	abi_test_clobber_d8
+.hidden	abi_test_clobber_d8
+.align	4
+abi_test_clobber_d8:
+	mov	r0, #0
+	vmov	s16, r0
+	vmov	s17, r0
+	bx	lr
+.size	abi_test_clobber_d8,.-abi_test_clobber_d8
+.type	abi_test_clobber_d9, %function
+.globl	abi_test_clobber_d9
+.hidden	abi_test_clobber_d9
+.align	4
+abi_test_clobber_d9:
+	mov	r0, #0
+	vmov	s18, r0
+	vmov	s19, r0
+	bx	lr
+.size	abi_test_clobber_d9,.-abi_test_clobber_d9
+.type	abi_test_clobber_d10, %function
+.globl	abi_test_clobber_d10
+.hidden	abi_test_clobber_d10
+.align	4
+abi_test_clobber_d10:
+	mov	r0, #0
+	vmov	s20, r0
+	vmov	s21, r0
+	bx	lr
+.size	abi_test_clobber_d10,.-abi_test_clobber_d10
+.type	abi_test_clobber_d11, %function
+.globl	abi_test_clobber_d11
+.hidden	abi_test_clobber_d11
+.align	4
+abi_test_clobber_d11:
+	mov	r0, #0
+	vmov	s22, r0
+	vmov	s23, r0
+	bx	lr
+.size	abi_test_clobber_d11,.-abi_test_clobber_d11
+.type	abi_test_clobber_d12, %function
+.globl	abi_test_clobber_d12
+.hidden	abi_test_clobber_d12
+.align	4
+abi_test_clobber_d12:
+	mov	r0, #0
+	vmov	s24, r0
+	vmov	s25, r0
+	bx	lr
+.size	abi_test_clobber_d12,.-abi_test_clobber_d12
+.type	abi_test_clobber_d13, %function
+.globl	abi_test_clobber_d13
+.hidden	abi_test_clobber_d13
+.align	4
+abi_test_clobber_d13:
+	mov	r0, #0
+	vmov	s26, r0
+	vmov	s27, r0
+	bx	lr
+.size	abi_test_clobber_d13,.-abi_test_clobber_d13
+.type	abi_test_clobber_d14, %function
+.globl	abi_test_clobber_d14
+.hidden	abi_test_clobber_d14
+.align	4
+abi_test_clobber_d14:
+	mov	r0, #0
+	vmov	s28, r0
+	vmov	s29, r0
+	bx	lr
+.size	abi_test_clobber_d14,.-abi_test_clobber_d14
+.type	abi_test_clobber_d15, %function
+.globl	abi_test_clobber_d15
+.hidden	abi_test_clobber_d15
+.align	4
+abi_test_clobber_d15:
+	mov	r0, #0
+	vmov	s30, r0
+	vmov	s31, r0
+	bx	lr
+.size	abi_test_clobber_d15,.-abi_test_clobber_d15
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__)
diff --git a/gen/test_support/trampoline-armv8-apple.S b/gen/test_support/trampoline-armv8-apple.S
new file mode 100644
index 0000000..99055e0
--- /dev/null
+++ b/gen/test_support/trampoline-armv8-apple.S
@@ -0,0 +1,750 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+#include <openssl/arm_arch.h>
+
+.text
+
+// abi_test_trampoline loads callee-saved registers from |state|, calls |func|
+// with |argv|, then saves the callee-saved registers into |state|. It returns
+// the result of |func|. The |unwind| argument is unused.
+// uint64_t abi_test_trampoline(void (*func)(...), CallerState *state,
+//                              const uint64_t *argv, size_t argc,
+//                              uint64_t unwind);
+
+.globl	_abi_test_trampoline
+.private_extern	_abi_test_trampoline
+.align	4
+_abi_test_trampoline:
+Labi_test_trampoline_begin:
+	AARCH64_SIGN_LINK_REGISTER
+	// Stack layout (low to high addresses)
+	//   x29,x30 (16 bytes)
+	//    d8-d15 (64 bytes)
+	//   x19-x28 (80 bytes)
+	//    x1 (8 bytes)
+	//   padding (8 bytes)
+	stp	x29, x30, [sp, #-176]!
+	mov	x29, sp
+
+	// Save callee-saved registers and |state|.
+	stp	d8, d9, [sp, #16]
+	stp	d10, d11, [sp, #32]
+	stp	d12, d13, [sp, #48]
+	stp	d14, d15, [sp, #64]
+	stp	x19, x20, [sp, #80]
+	stp	x21, x22, [sp, #96]
+	stp	x23, x24, [sp, #112]
+	stp	x25, x26, [sp, #128]
+	stp	x27, x28, [sp, #144]
+	str	x1, [sp, #160]
+
+	// Load registers from |state|, with the exception of x29. x29 is the
+	// frame pointer and also callee-saved, but AAPCS64 allows platforms to
+	// mandate that x29 always point to a frame. iOS64 does so, which means
+	// we cannot fill x29 with entropy without violating ABI rules
+	// ourselves. x29 is tested separately below.
+	ldp	d8, d9, [x1], #16
+	ldp	d10, d11, [x1], #16
+	ldp	d12, d13, [x1], #16
+	ldp	d14, d15, [x1], #16
+	ldp	x19, x20, [x1], #16
+	ldp	x21, x22, [x1], #16
+	ldp	x23, x24, [x1], #16
+	ldp	x25, x26, [x1], #16
+	ldp	x27, x28, [x1], #16
+
+	// Move parameters into temporary registers.
+	mov	x9, x0
+	mov	x10, x2
+	mov	x11, x3
+
+	// Load parameters into registers.
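+	// x11 counts the arguments left to load; stop as soon as it reaches
+	// zero, so at most eight arguments land in x0-x7.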
+	cbz	x11, Largs_done
+	ldr	x0, [x10], #8
+	subs	x11, x11, #1
+	b.eq	Largs_done
+	ldr	x1, [x10], #8
+	subs	x11, x11, #1
+	b.eq	Largs_done
+	ldr	x2, [x10], #8
+	subs	x11, x11, #1
+	b.eq	Largs_done
+	ldr	x3, [x10], #8
+	subs	x11, x11, #1
+	b.eq	Largs_done
+	ldr	x4, [x10], #8
+	subs	x11, x11, #1
+	b.eq	Largs_done
+	ldr	x5, [x10], #8
+	subs	x11, x11, #1
+	b.eq	Largs_done
+	ldr	x6, [x10], #8
+	subs	x11, x11, #1
+	b.eq	Largs_done
+	ldr	x7, [x10], #8
+
+Largs_done:
+	blr	x9
+
+	// Reload |state| and store registers.
+	ldr	x1, [sp, #160]
+	stp	d8, d9, [x1], #16
+	stp	d10, d11, [x1], #16
+	stp	d12, d13, [x1], #16
+	stp	d14, d15, [x1], #16
+	stp	x19, x20, [x1], #16
+	stp	x21, x22, [x1], #16
+	stp	x23, x24, [x1], #16
+	stp	x25, x26, [x1], #16
+	stp	x27, x28, [x1], #16
+
+	// |func| is required to preserve x29, the frame pointer. We cannot load
+	// random values into x29 (see comment above), so compare it against the
+	// expected value and zero the field of |state| if corrupted.
+	mov	x9, sp
+	cmp	x29, x9
+	b.eq	Lx29_ok
+	str	xzr, [x1]
+
+Lx29_ok:
+	// Restore callee-saved registers.
+	ldp	d8, d9, [sp, #16]
+	ldp	d10, d11, [sp, #32]
+	ldp	d12, d13, [sp, #48]
+	ldp	d14, d15, [sp, #64]
+	ldp	x19, x20, [sp, #80]
+	ldp	x21, x22, [sp, #96]
+	ldp	x23, x24, [sp, #112]
+	ldp	x25, x26, [sp, #128]
+	ldp	x27, x28, [sp, #144]
+
+	ldp	x29, x30, [sp], #176
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+.globl	_abi_test_clobber_x0
+.private_extern	_abi_test_clobber_x0
+.align	4
+_abi_test_clobber_x0:
+	AARCH64_VALID_CALL_TARGET
+	mov	x0, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x1
+.private_extern	_abi_test_clobber_x1
+.align	4
+_abi_test_clobber_x1:
+	AARCH64_VALID_CALL_TARGET
+	mov	x1, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x2
+.private_extern	_abi_test_clobber_x2
+.align	4
+_abi_test_clobber_x2:
+	AARCH64_VALID_CALL_TARGET
+	mov	x2, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x3
+.private_extern	_abi_test_clobber_x3
+.align	4
+_abi_test_clobber_x3:
+	AARCH64_VALID_CALL_TARGET
+	mov	x3, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x4
+.private_extern	_abi_test_clobber_x4
+.align	4
+_abi_test_clobber_x4:
+	AARCH64_VALID_CALL_TARGET
+	mov	x4, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x5
+.private_extern	_abi_test_clobber_x5
+.align	4
+_abi_test_clobber_x5:
+	AARCH64_VALID_CALL_TARGET
+	mov	x5, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x6
+.private_extern	_abi_test_clobber_x6
+.align	4
+_abi_test_clobber_x6:
+	AARCH64_VALID_CALL_TARGET
+	mov	x6, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x7
+.private_extern	_abi_test_clobber_x7
+.align	4
+_abi_test_clobber_x7:
+	AARCH64_VALID_CALL_TARGET
+	mov	x7, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x8
+.private_extern	_abi_test_clobber_x8
+.align	4
+_abi_test_clobber_x8:
+	AARCH64_VALID_CALL_TARGET
+	mov	x8, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x9
+.private_extern	_abi_test_clobber_x9
+.align	4
+_abi_test_clobber_x9:
+	AARCH64_VALID_CALL_TARGET
+	mov	x9, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x10
+.private_extern	_abi_test_clobber_x10
+.align	4
+_abi_test_clobber_x10:
+	AARCH64_VALID_CALL_TARGET
+	mov	x10, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x11
+.private_extern	_abi_test_clobber_x11
+.align	4
+_abi_test_clobber_x11:
+	AARCH64_VALID_CALL_TARGET
+	mov	x11, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x12
+.private_extern	_abi_test_clobber_x12
+.align	4
+_abi_test_clobber_x12:
+	AARCH64_VALID_CALL_TARGET
+	mov	x12, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x13
+.private_extern	_abi_test_clobber_x13
+.align	4
+_abi_test_clobber_x13:
+	AARCH64_VALID_CALL_TARGET
+	mov	x13, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x14
+.private_extern	_abi_test_clobber_x14
+.align	4
+_abi_test_clobber_x14:
+	AARCH64_VALID_CALL_TARGET
+	mov	x14, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x15
+.private_extern	_abi_test_clobber_x15
+.align	4
+_abi_test_clobber_x15:
+	AARCH64_VALID_CALL_TARGET
+	mov	x15, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x16
+.private_extern	_abi_test_clobber_x16
+.align	4
+_abi_test_clobber_x16:
+	AARCH64_VALID_CALL_TARGET
+	mov	x16, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x17
+.private_extern	_abi_test_clobber_x17
+.align	4
+_abi_test_clobber_x17:
+	AARCH64_VALID_CALL_TARGET
+	mov	x17, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x19
+.private_extern	_abi_test_clobber_x19
+.align	4
+_abi_test_clobber_x19:
+	AARCH64_VALID_CALL_TARGET
+	mov	x19, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x20
+.private_extern	_abi_test_clobber_x20
+.align	4
+_abi_test_clobber_x20:
+	AARCH64_VALID_CALL_TARGET
+	mov	x20, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x21
+.private_extern	_abi_test_clobber_x21
+.align	4
+_abi_test_clobber_x21:
+	AARCH64_VALID_CALL_TARGET
+	mov	x21, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x22
+.private_extern	_abi_test_clobber_x22
+.align	4
+_abi_test_clobber_x22:
+	AARCH64_VALID_CALL_TARGET
+	mov	x22, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x23
+.private_extern	_abi_test_clobber_x23
+.align	4
+_abi_test_clobber_x23:
+	AARCH64_VALID_CALL_TARGET
+	mov	x23, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x24
+.private_extern	_abi_test_clobber_x24
+.align	4
+_abi_test_clobber_x24:
+	AARCH64_VALID_CALL_TARGET
+	mov	x24, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x25
+.private_extern	_abi_test_clobber_x25
+.align	4
+_abi_test_clobber_x25:
+	AARCH64_VALID_CALL_TARGET
+	mov	x25, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x26
+.private_extern	_abi_test_clobber_x26
+.align	4
+_abi_test_clobber_x26:
+	AARCH64_VALID_CALL_TARGET
+	mov	x26, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x27
+.private_extern	_abi_test_clobber_x27
+.align	4
+_abi_test_clobber_x27:
+	AARCH64_VALID_CALL_TARGET
+	mov	x27, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x28
+.private_extern	_abi_test_clobber_x28
+.align	4
+_abi_test_clobber_x28:
+	AARCH64_VALID_CALL_TARGET
+	mov	x28, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x29
+.private_extern	_abi_test_clobber_x29
+.align	4
+_abi_test_clobber_x29:
+	AARCH64_VALID_CALL_TARGET
+	mov	x29, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d0
+.private_extern	_abi_test_clobber_d0
+.align	4
+_abi_test_clobber_d0:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d0, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d1
+.private_extern	_abi_test_clobber_d1
+.align	4
+_abi_test_clobber_d1:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d1, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d2
+.private_extern	_abi_test_clobber_d2
+.align	4
+_abi_test_clobber_d2:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d2, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d3
+.private_extern	_abi_test_clobber_d3
+.align	4
+_abi_test_clobber_d3:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d3, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d4
+.private_extern	_abi_test_clobber_d4
+.align	4
+_abi_test_clobber_d4:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d4, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d5
+.private_extern	_abi_test_clobber_d5
+.align	4
+_abi_test_clobber_d5:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d5, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d6
+.private_extern	_abi_test_clobber_d6
+.align	4
+_abi_test_clobber_d6:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d6, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d7
+.private_extern	_abi_test_clobber_d7
+.align	4
+_abi_test_clobber_d7:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d7, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d8
+.private_extern	_abi_test_clobber_d8
+.align	4
+_abi_test_clobber_d8:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d8, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d9
+.private_extern	_abi_test_clobber_d9
+.align	4
+_abi_test_clobber_d9:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d9, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d10
+.private_extern	_abi_test_clobber_d10
+.align	4
+_abi_test_clobber_d10:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d10, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d11
+.private_extern	_abi_test_clobber_d11
+.align	4
+_abi_test_clobber_d11:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d11, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d12
+.private_extern	_abi_test_clobber_d12
+.align	4
+_abi_test_clobber_d12:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d12, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d13
+.private_extern	_abi_test_clobber_d13
+.align	4
+_abi_test_clobber_d13:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d13, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d14
+.private_extern	_abi_test_clobber_d14
+.align	4
+_abi_test_clobber_d14:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d14, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d15
+.private_extern	_abi_test_clobber_d15
+.align	4
+_abi_test_clobber_d15:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d15, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d16
+.private_extern	_abi_test_clobber_d16
+.align	4
+_abi_test_clobber_d16:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d16, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d17
+.private_extern	_abi_test_clobber_d17
+.align	4
+_abi_test_clobber_d17:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d17, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d18
+.private_extern	_abi_test_clobber_d18
+.align	4
+_abi_test_clobber_d18:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d18, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d19
+.private_extern	_abi_test_clobber_d19
+.align	4
+_abi_test_clobber_d19:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d19, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d20
+.private_extern	_abi_test_clobber_d20
+.align	4
+_abi_test_clobber_d20:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d20, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d21
+.private_extern	_abi_test_clobber_d21
+.align	4
+_abi_test_clobber_d21:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d21, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d22
+.private_extern	_abi_test_clobber_d22
+.align	4
+_abi_test_clobber_d22:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d22, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d23
+.private_extern	_abi_test_clobber_d23
+.align	4
+_abi_test_clobber_d23:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d23, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d24
+.private_extern	_abi_test_clobber_d24
+.align	4
+_abi_test_clobber_d24:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d24, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d25
+.private_extern	_abi_test_clobber_d25
+.align	4
+_abi_test_clobber_d25:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d25, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d26
+.private_extern	_abi_test_clobber_d26
+.align	4
+_abi_test_clobber_d26:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d26, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d27
+.private_extern	_abi_test_clobber_d27
+.align	4
+_abi_test_clobber_d27:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d27, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d28
+.private_extern	_abi_test_clobber_d28
+.align	4
+_abi_test_clobber_d28:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d28, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d29
+.private_extern	_abi_test_clobber_d29
+.align	4
+_abi_test_clobber_d29:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d29, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d30
+.private_extern	_abi_test_clobber_d30
+.align	4
+_abi_test_clobber_d30:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d30, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d31
+.private_extern	_abi_test_clobber_d31
+.align	4
+_abi_test_clobber_d31:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d31, xzr
+	ret
+
+
+.globl	_abi_test_clobber_v8_upper
+.private_extern	_abi_test_clobber_v8_upper
+.align	4
+_abi_test_clobber_v8_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v8.d[1], xzr
+	ret
+
+
+.globl	_abi_test_clobber_v9_upper
+.private_extern	_abi_test_clobber_v9_upper
+.align	4
+_abi_test_clobber_v9_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v9.d[1], xzr
+	ret
+
+
+.globl	_abi_test_clobber_v10_upper
+.private_extern	_abi_test_clobber_v10_upper
+.align	4
+_abi_test_clobber_v10_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v10.d[1], xzr
+	ret
+
+
+.globl	_abi_test_clobber_v11_upper
+.private_extern	_abi_test_clobber_v11_upper
+.align	4
+_abi_test_clobber_v11_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v11.d[1], xzr
+	ret
+
+
+.globl	_abi_test_clobber_v12_upper
+.private_extern	_abi_test_clobber_v12_upper
+.align	4
+_abi_test_clobber_v12_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v12.d[1], xzr
+	ret
+
+
+.globl	_abi_test_clobber_v13_upper
+.private_extern	_abi_test_clobber_v13_upper
+.align	4
+_abi_test_clobber_v13_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v13.d[1], xzr
+	ret
+
+
+.globl	_abi_test_clobber_v14_upper
+.private_extern	_abi_test_clobber_v14_upper
+.align	4
+_abi_test_clobber_v14_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v14.d[1], xzr
+	ret
+
+
+.globl	_abi_test_clobber_v15_upper
+.private_extern	_abi_test_clobber_v15_upper
+.align	4
+_abi_test_clobber_v15_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v15.d[1], xzr
+	ret
+
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/gen/test_support/trampoline-armv8-linux.S b/gen/test_support/trampoline-armv8-linux.S
new file mode 100644
index 0000000..58b4b93
--- /dev/null
+++ b/gen/test_support/trampoline-armv8-linux.S
@@ -0,0 +1,750 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+.text
+
+// abi_test_trampoline loads callee-saved registers from |state|, calls |func|
+// with |argv|, then saves the callee-saved registers into |state|. It returns
+// the result of |func|. The |unwind| argument is unused.
+// uint64_t abi_test_trampoline(void (*func)(...), CallerState *state,
+//                              const uint64_t *argv, size_t argc,
+//                              uint64_t unwind);
+.type	abi_test_trampoline, %function
+.globl	abi_test_trampoline
+.hidden	abi_test_trampoline
+.align	4
+abi_test_trampoline:
+.Labi_test_trampoline_begin:
+	AARCH64_SIGN_LINK_REGISTER
+	// Stack layout (low to high addresses)
+	//   x29,x30 (16 bytes)
+	//    d8-d15 (64 bytes)
+	//   x19-x28 (80 bytes)
+	//    x1 (8 bytes)
+	//   padding (8 bytes)
+	stp	x29, x30, [sp, #-176]!
+	mov	x29, sp
+
+	// Save callee-saved registers and |state|.
+	stp	d8, d9, [sp, #16]
+	stp	d10, d11, [sp, #32]
+	stp	d12, d13, [sp, #48]
+	stp	d14, d15, [sp, #64]
+	stp	x19, x20, [sp, #80]
+	stp	x21, x22, [sp, #96]
+	stp	x23, x24, [sp, #112]
+	stp	x25, x26, [sp, #128]
+	stp	x27, x28, [sp, #144]
+	str	x1, [sp, #160]
+
+	// Load registers from |state|, with the exception of x29. x29 is the
+	// frame pointer and also callee-saved, but AAPCS64 allows platforms to
+	// mandate that x29 always point to a frame. iOS64 does so, which means
+	// we cannot fill x29 with entropy without violating ABI rules
+	// ourselves. x29 is tested separately below.
+	ldp	d8, d9, [x1], #16
+	ldp	d10, d11, [x1], #16
+	ldp	d12, d13, [x1], #16
+	ldp	d14, d15, [x1], #16
+	ldp	x19, x20, [x1], #16
+	ldp	x21, x22, [x1], #16
+	ldp	x23, x24, [x1], #16
+	ldp	x25, x26, [x1], #16
+	ldp	x27, x28, [x1], #16
+
+	// Move parameters into temporary registers.
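+	// x9 = func, x10 = argv, x11 = argc; x0-x7 must stay free for
+	// |func|'s own arguments.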
+	mov	x9, x0
+	mov	x10, x2
+	mov	x11, x3
+
+	// Load parameters into registers.
+	cbz	x11, .Largs_done
+	ldr	x0, [x10], #8
+	subs	x11, x11, #1
+	b.eq	.Largs_done
+	ldr	x1, [x10], #8
+	subs	x11, x11, #1
+	b.eq	.Largs_done
+	ldr	x2, [x10], #8
+	subs	x11, x11, #1
+	b.eq	.Largs_done
+	ldr	x3, [x10], #8
+	subs	x11, x11, #1
+	b.eq	.Largs_done
+	ldr	x4, [x10], #8
+	subs	x11, x11, #1
+	b.eq	.Largs_done
+	ldr	x5, [x10], #8
+	subs	x11, x11, #1
+	b.eq	.Largs_done
+	ldr	x6, [x10], #8
+	subs	x11, x11, #1
+	b.eq	.Largs_done
+	ldr	x7, [x10], #8
+
+.Largs_done:
+	blr	x9
+
+	// Reload |state| and store registers.
+	ldr	x1, [sp, #160]
+	stp	d8, d9, [x1], #16
+	stp	d10, d11, [x1], #16
+	stp	d12, d13, [x1], #16
+	stp	d14, d15, [x1], #16
+	stp	x19, x20, [x1], #16
+	stp	x21, x22, [x1], #16
+	stp	x23, x24, [x1], #16
+	stp	x25, x26, [x1], #16
+	stp	x27, x28, [x1], #16
+
+	// |func| is required to preserve x29, the frame pointer. We cannot load
+	// random values into x29 (see comment above), so compare it against the
+	// expected value and zero the field of |state| if corrupted.
+	mov	x9, sp
+	cmp	x29, x9
+	b.eq	.Lx29_ok
+	str	xzr, [x1]
+
+.Lx29_ok:
+	// Restore callee-saved registers.
+	ldp	d8, d9, [sp, #16]
+	ldp	d10, d11, [sp, #32]
+	ldp	d12, d13, [sp, #48]
+	ldp	d14, d15, [sp, #64]
+	ldp	x19, x20, [sp, #80]
+	ldp	x21, x22, [sp, #96]
+	ldp	x23, x24, [sp, #112]
+	ldp	x25, x26, [sp, #128]
+	ldp	x27, x28, [sp, #144]
+
+	ldp	x29, x30, [sp], #176
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	abi_test_trampoline,.-abi_test_trampoline
+.type	abi_test_clobber_x0, %function
+.globl	abi_test_clobber_x0
+.hidden	abi_test_clobber_x0
+.align	4
+abi_test_clobber_x0:
+	AARCH64_VALID_CALL_TARGET
+	mov	x0, xzr
+	ret
+.size	abi_test_clobber_x0,.-abi_test_clobber_x0
+.type	abi_test_clobber_x1, %function
+.globl	abi_test_clobber_x1
+.hidden	abi_test_clobber_x1
+.align	4
+abi_test_clobber_x1:
+	AARCH64_VALID_CALL_TARGET
+	mov	x1, xzr
+	ret
+.size	abi_test_clobber_x1,.-abi_test_clobber_x1
+.type	abi_test_clobber_x2, %function
+.globl	abi_test_clobber_x2
+.hidden	abi_test_clobber_x2
+.align	4
+abi_test_clobber_x2:
+	AARCH64_VALID_CALL_TARGET
+	mov	x2, xzr
+	ret
+.size	abi_test_clobber_x2,.-abi_test_clobber_x2
+.type	abi_test_clobber_x3, %function
+.globl	abi_test_clobber_x3
+.hidden	abi_test_clobber_x3
+.align	4
+abi_test_clobber_x3:
+	AARCH64_VALID_CALL_TARGET
+	mov	x3, xzr
+	ret
+.size	abi_test_clobber_x3,.-abi_test_clobber_x3
+.type	abi_test_clobber_x4, %function
+.globl	abi_test_clobber_x4
+.hidden	abi_test_clobber_x4
+.align	4
+abi_test_clobber_x4:
+	AARCH64_VALID_CALL_TARGET
+	mov	x4, xzr
+	ret
+.size	abi_test_clobber_x4,.-abi_test_clobber_x4
+.type	abi_test_clobber_x5, %function
+.globl	abi_test_clobber_x5
+.hidden	abi_test_clobber_x5
+.align	4
+abi_test_clobber_x5:
+	AARCH64_VALID_CALL_TARGET
+	mov	x5, xzr
+	ret
+.size	abi_test_clobber_x5,.-abi_test_clobber_x5
+.type	abi_test_clobber_x6, %function
+.globl	abi_test_clobber_x6
+.hidden	abi_test_clobber_x6
+.align	4
+abi_test_clobber_x6:
+	AARCH64_VALID_CALL_TARGET
+	mov	x6, xzr
+	ret
+.size	abi_test_clobber_x6,.-abi_test_clobber_x6
+.type	abi_test_clobber_x7, %function
+.globl	abi_test_clobber_x7
+.hidden	abi_test_clobber_x7
+.align	4
+abi_test_clobber_x7:
+	AARCH64_VALID_CALL_TARGET
+	mov	x7, xzr
+	ret
+.size	abi_test_clobber_x7,.-abi_test_clobber_x7
+.type	abi_test_clobber_x8, %function
+.globl	abi_test_clobber_x8
+.hidden	abi_test_clobber_x8
+.align	4
+abi_test_clobber_x8:
+	AARCH64_VALID_CALL_TARGET
+	mov	x8, xzr
+	ret
+.size	abi_test_clobber_x8,.-abi_test_clobber_x8
+.type	abi_test_clobber_x9, %function
+.globl	abi_test_clobber_x9
+.hidden	abi_test_clobber_x9
+.align	4
+abi_test_clobber_x9:
+	AARCH64_VALID_CALL_TARGET
+	mov	x9, xzr
+	ret
+.size	abi_test_clobber_x9,.-abi_test_clobber_x9
+.type	abi_test_clobber_x10, %function
+.globl	abi_test_clobber_x10
+.hidden	abi_test_clobber_x10
+.align	4
+abi_test_clobber_x10:
+	AARCH64_VALID_CALL_TARGET
+	mov	x10, xzr
+	ret
+.size	abi_test_clobber_x10,.-abi_test_clobber_x10
+.type	abi_test_clobber_x11, %function
+.globl	abi_test_clobber_x11
+.hidden	abi_test_clobber_x11
+.align	4
+abi_test_clobber_x11:
+	AARCH64_VALID_CALL_TARGET
+	mov	x11, xzr
+	ret
+.size	abi_test_clobber_x11,.-abi_test_clobber_x11
+.type	abi_test_clobber_x12, %function
+.globl	abi_test_clobber_x12
+.hidden	abi_test_clobber_x12
+.align	4
+abi_test_clobber_x12:
+	AARCH64_VALID_CALL_TARGET
+	mov	x12, xzr
+	ret
+.size	abi_test_clobber_x12,.-abi_test_clobber_x12
+.type	abi_test_clobber_x13, %function
+.globl	abi_test_clobber_x13
+.hidden	abi_test_clobber_x13
+.align	4
+abi_test_clobber_x13:
+	AARCH64_VALID_CALL_TARGET
+	mov	x13, xzr
+	ret
+.size	abi_test_clobber_x13,.-abi_test_clobber_x13
+.type	abi_test_clobber_x14, %function
+.globl	abi_test_clobber_x14
+.hidden	abi_test_clobber_x14
+.align	4
+abi_test_clobber_x14:
+	AARCH64_VALID_CALL_TARGET
+	mov	x14, xzr
+	ret
+.size	abi_test_clobber_x14,.-abi_test_clobber_x14
+.type	abi_test_clobber_x15, %function
+.globl	abi_test_clobber_x15
+.hidden	abi_test_clobber_x15
+.align	4
+abi_test_clobber_x15:
+	AARCH64_VALID_CALL_TARGET
+	mov	x15, xzr
+	ret
+.size	abi_test_clobber_x15,.-abi_test_clobber_x15
+.type	abi_test_clobber_x16, %function
+.globl	abi_test_clobber_x16
+.hidden	abi_test_clobber_x16
+.align	4
+abi_test_clobber_x16:
+	AARCH64_VALID_CALL_TARGET
+	mov	x16, xzr
+	ret
+.size	abi_test_clobber_x16,.-abi_test_clobber_x16
+.type	abi_test_clobber_x17, %function
+.globl	abi_test_clobber_x17
+.hidden	abi_test_clobber_x17
+.align	4
+abi_test_clobber_x17:
+	AARCH64_VALID_CALL_TARGET
+	mov	x17, xzr
+	ret
+.size	abi_test_clobber_x17,.-abi_test_clobber_x17
+.type	abi_test_clobber_x19, %function
+.globl	abi_test_clobber_x19
+.hidden	abi_test_clobber_x19
+.align	4
+abi_test_clobber_x19:
+	AARCH64_VALID_CALL_TARGET
+	mov	x19, xzr
+	ret
+.size	abi_test_clobber_x19,.-abi_test_clobber_x19
+.type	abi_test_clobber_x20, %function
+.globl	abi_test_clobber_x20
+.hidden	abi_test_clobber_x20
+.align	4
+abi_test_clobber_x20:
+	AARCH64_VALID_CALL_TARGET
+	mov	x20, xzr
+	ret
+.size	abi_test_clobber_x20,.-abi_test_clobber_x20
+.type	abi_test_clobber_x21, %function
+.globl	abi_test_clobber_x21
+.hidden	abi_test_clobber_x21
+.align	4
+abi_test_clobber_x21:
+	AARCH64_VALID_CALL_TARGET
+	mov	x21, xzr
+	ret
+.size	abi_test_clobber_x21,.-abi_test_clobber_x21
+.type	abi_test_clobber_x22, %function
+.globl	abi_test_clobber_x22
+.hidden	abi_test_clobber_x22
+.align	4
+abi_test_clobber_x22:
+	AARCH64_VALID_CALL_TARGET
+	mov	x22, xzr
+	ret
+.size	abi_test_clobber_x22,.-abi_test_clobber_x22
+.type	abi_test_clobber_x23, %function
+.globl	abi_test_clobber_x23
+.hidden	abi_test_clobber_x23
+.align	4
+abi_test_clobber_x23:
+	AARCH64_VALID_CALL_TARGET
+	mov	x23, xzr
+	ret
+.size	abi_test_clobber_x23,.-abi_test_clobber_x23
+.type	abi_test_clobber_x24, %function
+.globl	abi_test_clobber_x24
+.hidden	abi_test_clobber_x24
+.align	4
+abi_test_clobber_x24:
+	AARCH64_VALID_CALL_TARGET
+	mov	x24, xzr
+	ret
+.size	abi_test_clobber_x24,.-abi_test_clobber_x24
+.type	abi_test_clobber_x25, %function
+.globl	abi_test_clobber_x25
+.hidden	abi_test_clobber_x25
+.align	4
+abi_test_clobber_x25:
+	AARCH64_VALID_CALL_TARGET
+	mov	x25, xzr
+	ret
+.size	abi_test_clobber_x25,.-abi_test_clobber_x25
+.type	abi_test_clobber_x26, %function
+.globl	abi_test_clobber_x26
+.hidden	abi_test_clobber_x26
+.align	4
+abi_test_clobber_x26:
+	AARCH64_VALID_CALL_TARGET
+	mov	x26, xzr
+	ret
+.size	abi_test_clobber_x26,.-abi_test_clobber_x26
+.type	abi_test_clobber_x27, %function
+.globl	abi_test_clobber_x27
+.hidden	abi_test_clobber_x27
+.align	4
+abi_test_clobber_x27:
+	AARCH64_VALID_CALL_TARGET
+	mov	x27, xzr
+	ret
+.size	abi_test_clobber_x27,.-abi_test_clobber_x27
+.type	abi_test_clobber_x28, %function
+.globl	abi_test_clobber_x28
+.hidden	abi_test_clobber_x28
+.align	4
+abi_test_clobber_x28:
+	AARCH64_VALID_CALL_TARGET
+	mov	x28, xzr
+	ret
+.size	abi_test_clobber_x28,.-abi_test_clobber_x28
+.type	abi_test_clobber_x29, %function
+.globl	abi_test_clobber_x29
+.hidden	abi_test_clobber_x29
+.align	4
+abi_test_clobber_x29:
+	AARCH64_VALID_CALL_TARGET
+	mov	x29, xzr
+	ret
+.size	abi_test_clobber_x29,.-abi_test_clobber_x29
+.type	abi_test_clobber_d0, %function
+.globl	abi_test_clobber_d0
+.hidden	abi_test_clobber_d0
+.align	4
+abi_test_clobber_d0:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d0, xzr
+	ret
+.size	abi_test_clobber_d0,.-abi_test_clobber_d0
+.type	abi_test_clobber_d1, %function
+.globl	abi_test_clobber_d1
+.hidden	abi_test_clobber_d1
+.align	4
+abi_test_clobber_d1:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d1, xzr
+	ret
+.size	abi_test_clobber_d1,.-abi_test_clobber_d1
+.type	abi_test_clobber_d2, %function
+.globl	abi_test_clobber_d2
+.hidden	abi_test_clobber_d2
+.align	4
+abi_test_clobber_d2:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d2, xzr
+	ret
+.size	abi_test_clobber_d2,.-abi_test_clobber_d2
+.type	abi_test_clobber_d3, %function
+.globl	abi_test_clobber_d3
+.hidden	abi_test_clobber_d3
+.align	4
+abi_test_clobber_d3:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d3, xzr
+	ret
+.size	abi_test_clobber_d3,.-abi_test_clobber_d3
+.type	abi_test_clobber_d4, %function
+.globl	abi_test_clobber_d4
+.hidden	abi_test_clobber_d4
+.align	4
+abi_test_clobber_d4:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d4, xzr
+	ret
+.size	abi_test_clobber_d4,.-abi_test_clobber_d4
+.type	abi_test_clobber_d5, %function
+.globl	abi_test_clobber_d5
+.hidden	abi_test_clobber_d5
+.align	4
+abi_test_clobber_d5:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d5, xzr
+	ret
+.size	abi_test_clobber_d5,.-abi_test_clobber_d5
+.type	abi_test_clobber_d6, %function
+.globl	abi_test_clobber_d6
+.hidden	abi_test_clobber_d6
+.align	4
+abi_test_clobber_d6:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d6, xzr
+	ret
+.size	abi_test_clobber_d6,.-abi_test_clobber_d6
+.type	abi_test_clobber_d7, %function
+.globl	abi_test_clobber_d7
+.hidden	abi_test_clobber_d7
+.align	4
+abi_test_clobber_d7:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d7, xzr
+	ret
+.size	abi_test_clobber_d7,.-abi_test_clobber_d7
+.type	abi_test_clobber_d8, %function
+.globl	abi_test_clobber_d8
+.hidden	abi_test_clobber_d8
+.align	4
+abi_test_clobber_d8:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d8, xzr
+	ret
+.size	abi_test_clobber_d8,.-abi_test_clobber_d8
+.type	abi_test_clobber_d9, %function
+.globl	abi_test_clobber_d9
+.hidden	abi_test_clobber_d9
+.align	4
+abi_test_clobber_d9:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d9, xzr
+	ret
+.size	abi_test_clobber_d9,.-abi_test_clobber_d9
+.type	abi_test_clobber_d10, %function
+.globl	abi_test_clobber_d10
+.hidden	abi_test_clobber_d10
+.align	4
+abi_test_clobber_d10:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d10, xzr
+	ret
+.size	abi_test_clobber_d10,.-abi_test_clobber_d10
+.type	abi_test_clobber_d11, %function
+.globl	abi_test_clobber_d11
+.hidden	abi_test_clobber_d11
+.align	4
+abi_test_clobber_d11:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d11, xzr
+	ret
+.size	abi_test_clobber_d11,.-abi_test_clobber_d11
+.type	abi_test_clobber_d12, %function
+.globl	abi_test_clobber_d12
+.hidden	abi_test_clobber_d12
+.align	4
+abi_test_clobber_d12:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d12, xzr
+	ret
+.size	abi_test_clobber_d12,.-abi_test_clobber_d12
+.type	abi_test_clobber_d13, %function
+.globl	abi_test_clobber_d13
+.hidden	abi_test_clobber_d13
+.align	4
+abi_test_clobber_d13:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d13, xzr
+	ret
+.size	abi_test_clobber_d13,.-abi_test_clobber_d13
+.type	abi_test_clobber_d14, %function
+.globl	abi_test_clobber_d14
+.hidden	abi_test_clobber_d14
+.align	4
+abi_test_clobber_d14:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d14, xzr
+	ret
+.size	abi_test_clobber_d14,.-abi_test_clobber_d14
+.type	abi_test_clobber_d15, %function
+.globl	abi_test_clobber_d15
+.hidden	abi_test_clobber_d15
+.align	4
+abi_test_clobber_d15:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d15, xzr
+	ret
+.size	abi_test_clobber_d15,.-abi_test_clobber_d15
+.type	abi_test_clobber_d16, %function
+.globl	abi_test_clobber_d16
+.hidden	abi_test_clobber_d16
+.align	4
+abi_test_clobber_d16:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d16, xzr
+	ret
+.size	abi_test_clobber_d16,.-abi_test_clobber_d16
+.type	abi_test_clobber_d17, %function
+.globl	abi_test_clobber_d17
+.hidden	abi_test_clobber_d17
+.align	4
+abi_test_clobber_d17:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d17, xzr
+	ret
+.size	abi_test_clobber_d17,.-abi_test_clobber_d17
+.type	abi_test_clobber_d18, %function
+.globl	abi_test_clobber_d18
+.hidden	abi_test_clobber_d18
+.align	4
+abi_test_clobber_d18:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d18, xzr
+	ret
+.size	abi_test_clobber_d18,.-abi_test_clobber_d18
+.type	abi_test_clobber_d19, %function
+.globl	abi_test_clobber_d19
+.hidden	abi_test_clobber_d19
+.align	4
+abi_test_clobber_d19:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d19, xzr
+	ret
+.size	abi_test_clobber_d19,.-abi_test_clobber_d19
+.type	abi_test_clobber_d20, %function
+.globl	abi_test_clobber_d20
+.hidden	abi_test_clobber_d20
+.align	4
+abi_test_clobber_d20:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d20, xzr
+	ret
+.size	abi_test_clobber_d20,.-abi_test_clobber_d20
+.type	abi_test_clobber_d21, %function
+.globl	abi_test_clobber_d21
+.hidden	abi_test_clobber_d21
+.align	4
+abi_test_clobber_d21:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d21, xzr
+	ret
+.size	abi_test_clobber_d21,.-abi_test_clobber_d21
+.type	abi_test_clobber_d22, %function
+.globl	abi_test_clobber_d22
+.hidden	abi_test_clobber_d22
+.align	4
+abi_test_clobber_d22:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d22, xzr
+	ret
+.size	abi_test_clobber_d22,.-abi_test_clobber_d22
+.type	abi_test_clobber_d23, %function
+.globl	abi_test_clobber_d23
+.hidden	abi_test_clobber_d23
+.align	4
+abi_test_clobber_d23:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d23, xzr
+	ret
+.size	abi_test_clobber_d23,.-abi_test_clobber_d23
+.type	abi_test_clobber_d24, %function
+.globl	abi_test_clobber_d24
+.hidden	abi_test_clobber_d24
+.align	4
+abi_test_clobber_d24:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d24, xzr
+	ret
+.size	abi_test_clobber_d24,.-abi_test_clobber_d24
+.type	abi_test_clobber_d25, %function
+.globl	abi_test_clobber_d25
+.hidden	abi_test_clobber_d25
+.align	4
+abi_test_clobber_d25:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d25, xzr
+	ret
+.size	abi_test_clobber_d25,.-abi_test_clobber_d25
+.type	abi_test_clobber_d26, %function
+.globl	abi_test_clobber_d26
+.hidden	abi_test_clobber_d26
+.align	4
+abi_test_clobber_d26:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d26, xzr
+	ret
+.size	abi_test_clobber_d26,.-abi_test_clobber_d26
+.type	abi_test_clobber_d27, %function
+.globl	abi_test_clobber_d27
+.hidden	abi_test_clobber_d27
+.align	4
+abi_test_clobber_d27:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d27, xzr
+	ret
+.size	abi_test_clobber_d27,.-abi_test_clobber_d27
+.type	abi_test_clobber_d28, %function
+.globl	abi_test_clobber_d28
+.hidden	abi_test_clobber_d28
+.align	4
+abi_test_clobber_d28:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d28, xzr
+	ret
+.size	abi_test_clobber_d28,.-abi_test_clobber_d28
+.type	abi_test_clobber_d29, %function
+.globl	abi_test_clobber_d29
+.hidden	abi_test_clobber_d29
+.align	4
+abi_test_clobber_d29:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d29, xzr
+	ret
+.size	abi_test_clobber_d29,.-abi_test_clobber_d29
+.type	abi_test_clobber_d30, %function
+.globl	abi_test_clobber_d30
+.hidden	abi_test_clobber_d30
+.align	4
+abi_test_clobber_d30:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d30, xzr
+	ret
+.size	abi_test_clobber_d30,.-abi_test_clobber_d30
+.type	abi_test_clobber_d31, %function
+.globl	abi_test_clobber_d31
+.hidden	abi_test_clobber_d31
+.align	4
+abi_test_clobber_d31:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d31, xzr
+	ret
+.size	abi_test_clobber_d31,.-abi_test_clobber_d31
+.type	abi_test_clobber_v8_upper, %function
+.globl	abi_test_clobber_v8_upper
+.hidden	abi_test_clobber_v8_upper
+.align	4
+abi_test_clobber_v8_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v8.d[1], xzr
+	ret
+.size	abi_test_clobber_v8_upper,.-abi_test_clobber_v8_upper
+.type	abi_test_clobber_v9_upper, %function
+.globl	abi_test_clobber_v9_upper
+.hidden	abi_test_clobber_v9_upper
+.align	4
+abi_test_clobber_v9_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v9.d[1], xzr
+	ret
+.size	abi_test_clobber_v9_upper,.-abi_test_clobber_v9_upper
+.type	abi_test_clobber_v10_upper, %function
+.globl	abi_test_clobber_v10_upper
+.hidden	abi_test_clobber_v10_upper
+.align	4
+abi_test_clobber_v10_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v10.d[1], xzr
+	ret
+.size	abi_test_clobber_v10_upper,.-abi_test_clobber_v10_upper
+.type	abi_test_clobber_v11_upper, %function
+.globl	abi_test_clobber_v11_upper
+.hidden	abi_test_clobber_v11_upper
+.align	4
+abi_test_clobber_v11_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v11.d[1], xzr
+	ret
+.size	abi_test_clobber_v11_upper,.-abi_test_clobber_v11_upper
+.type	abi_test_clobber_v12_upper, %function
+.globl	abi_test_clobber_v12_upper
+.hidden	abi_test_clobber_v12_upper
+.align	4
+abi_test_clobber_v12_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v12.d[1], xzr
+	ret
+.size	abi_test_clobber_v12_upper,.-abi_test_clobber_v12_upper
+.type	abi_test_clobber_v13_upper, %function
+.globl	abi_test_clobber_v13_upper
+.hidden	abi_test_clobber_v13_upper
+.align	4
+abi_test_clobber_v13_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v13.d[1], xzr
+	ret
+.size	abi_test_clobber_v13_upper,.-abi_test_clobber_v13_upper
+.type	abi_test_clobber_v14_upper, %function
+.globl	abi_test_clobber_v14_upper
+.hidden	abi_test_clobber_v14_upper
+.align	4
+abi_test_clobber_v14_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v14.d[1], xzr
+	ret
+.size	abi_test_clobber_v14_upper,.-abi_test_clobber_v14_upper
+.type	abi_test_clobber_v15_upper, %function
+.globl	abi_test_clobber_v15_upper
+.hidden	abi_test_clobber_v15_upper
+.align	4
+abi_test_clobber_v15_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v15.d[1], xzr
+	ret
+.size	abi_test_clobber_v15_upper,.-abi_test_clobber_v15_upper
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/test_support/trampoline-armv8-win.S b/gen/test_support/trampoline-armv8-win.S
new file mode 100644
index 0000000..14773e3
--- /dev/null
+++ b/gen/test_support/trampoline-armv8-win.S
@@ -0,0 +1,750 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+#include <openssl/arm_arch.h>
+
+.text
+
+// abi_test_trampoline loads callee-saved registers from |state|, calls |func|
+// with |argv|, then saves the callee-saved registers into |state|. It returns
+// the result of |func|. The |unwind| argument is unused.
+// uint64_t abi_test_trampoline(void (*func)(...), CallerState *state,
+//                              const uint64_t *argv, size_t argc,
+//                              uint64_t unwind);
+
+.globl	abi_test_trampoline
+
+.align	4
+abi_test_trampoline:
+Labi_test_trampoline_begin:
+	AARCH64_SIGN_LINK_REGISTER
+	// Stack layout (low to high addresses)
+	//   x29,x30 (16 bytes)
+	//    d8-d15 (64 bytes)
+	//   x19-x28 (80 bytes)
+	//    x1 (8 bytes)
+	//   padding (8 bytes)
+	stp	x29, x30, [sp, #-176]!
+	mov	x29, sp
+
+	// Save callee-saved registers and |state|.
+	stp	d8, d9, [sp, #16]
+	stp	d10, d11, [sp, #32]
+	stp	d12, d13, [sp, #48]
+	stp	d14, d15, [sp, #64]
+	stp	x19, x20, [sp, #80]
+	stp	x21, x22, [sp, #96]
+	stp	x23, x24, [sp, #112]
+	stp	x25, x26, [sp, #128]
+	stp	x27, x28, [sp, #144]
+	str	x1, [sp, #160]
+
+	// Load registers from |state|, with the exception of x29. x29 is the
+	// frame pointer and also callee-saved, but AAPCS64 allows platforms to
+	// mandate that x29 always point to a frame. iOS64 does so, which means
+	// we cannot fill x29 with entropy without violating ABI rules
+	// ourselves. x29 is tested separately below.
+	ldp	d8, d9, [x1], #16
+	ldp	d10, d11, [x1], #16
+	ldp	d12, d13, [x1], #16
+	ldp	d14, d15, [x1], #16
+	ldp	x19, x20, [x1], #16
+	ldp	x21, x22, [x1], #16
+	ldp	x23, x24, [x1], #16
+	ldp	x25, x26, [x1], #16
+	ldp	x27, x28, [x1], #16
+
+	// Move parameters into temporary registers.
+	mov	x9, x0
+	mov	x10, x2
+	mov	x11, x3
+
+	// Load parameters into registers.
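+	// x11 holds the remaining argument count; at most eight arguments are
+	// loaded into x0-x7.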
+	cbz	x11, Largs_done
+	ldr	x0, [x10], #8
+	subs	x11, x11, #1
+	b.eq	Largs_done
+	ldr	x1, [x10], #8
+	subs	x11, x11, #1
+	b.eq	Largs_done
+	ldr	x2, [x10], #8
+	subs	x11, x11, #1
+	b.eq	Largs_done
+	ldr	x3, [x10], #8
+	subs	x11, x11, #1
+	b.eq	Largs_done
+	ldr	x4, [x10], #8
+	subs	x11, x11, #1
+	b.eq	Largs_done
+	ldr	x5, [x10], #8
+	subs	x11, x11, #1
+	b.eq	Largs_done
+	ldr	x6, [x10], #8
+	subs	x11, x11, #1
+	b.eq	Largs_done
+	ldr	x7, [x10], #8
+
+Largs_done:
+	blr	x9
+
+	// Reload |state| and store registers.
+	ldr	x1, [sp, #160]
+	stp	d8, d9, [x1], #16
+	stp	d10, d11, [x1], #16
+	stp	d12, d13, [x1], #16
+	stp	d14, d15, [x1], #16
+	stp	x19, x20, [x1], #16
+	stp	x21, x22, [x1], #16
+	stp	x23, x24, [x1], #16
+	stp	x25, x26, [x1], #16
+	stp	x27, x28, [x1], #16
+
+	// |func| is required to preserve x29, the frame pointer. We cannot load
+	// random values into x29 (see comment above), so compare it against the
+	// expected value and zero the field of |state| if corrupted.
+	mov	x9, sp
+	cmp	x29, x9
+	b.eq	Lx29_ok
+	str	xzr, [x1]
+
+Lx29_ok:
+	// Restore callee-saved registers.
+	ldp	d8, d9, [sp, #16]
+	ldp	d10, d11, [sp, #32]
+	ldp	d12, d13, [sp, #48]
+	ldp	d14, d15, [sp, #64]
+	ldp	x19, x20, [sp, #80]
+	ldp	x21, x22, [sp, #96]
+	ldp	x23, x24, [sp, #112]
+	ldp	x25, x26, [sp, #128]
+	ldp	x27, x28, [sp, #144]
+
+	ldp	x29, x30, [sp], #176
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+.globl	abi_test_clobber_x0
+
+.align	4
+abi_test_clobber_x0:
+	AARCH64_VALID_CALL_TARGET
+	mov	x0, xzr
+	ret
+
+
+.globl	abi_test_clobber_x1
+
+.align	4
+abi_test_clobber_x1:
+	AARCH64_VALID_CALL_TARGET
+	mov	x1, xzr
+	ret
+
+
+.globl	abi_test_clobber_x2
+
+.align	4
+abi_test_clobber_x2:
+	AARCH64_VALID_CALL_TARGET
+	mov	x2, xzr
+	ret
+
+
+.globl	abi_test_clobber_x3
+
+.align	4
+abi_test_clobber_x3:
+	AARCH64_VALID_CALL_TARGET
+	mov	x3, xzr
+	ret
+
+
+.globl	abi_test_clobber_x4
+
+.align	4
+abi_test_clobber_x4:
+	AARCH64_VALID_CALL_TARGET
+	mov	x4, xzr
+	ret
+
+
+.globl	abi_test_clobber_x5
+
+.align	4
+abi_test_clobber_x5:
+	AARCH64_VALID_CALL_TARGET
+	mov	x5, xzr
+	ret
+
+
+.globl	abi_test_clobber_x6
+
+.align	4
+abi_test_clobber_x6:
+	AARCH64_VALID_CALL_TARGET
+	mov	x6, xzr
+	ret
+
+
+.globl	abi_test_clobber_x7
+
+.align	4
+abi_test_clobber_x7:
+	AARCH64_VALID_CALL_TARGET
+	mov	x7, xzr
+	ret
+
+
+.globl	abi_test_clobber_x8
+
+.align	4
+abi_test_clobber_x8:
+	AARCH64_VALID_CALL_TARGET
+	mov	x8, xzr
+	ret
+
+
+.globl	abi_test_clobber_x9
+
+.align	4
+abi_test_clobber_x9:
+	AARCH64_VALID_CALL_TARGET
+	mov	x9, xzr
+	ret
+
+
+.globl	abi_test_clobber_x10
+
+.align	4
+abi_test_clobber_x10:
+	AARCH64_VALID_CALL_TARGET
+	mov	x10, xzr
+	ret
+
+
+.globl	abi_test_clobber_x11
+
+.align	4
+abi_test_clobber_x11:
+	AARCH64_VALID_CALL_TARGET
+	mov	x11, xzr
+	ret
+
+
+.globl	abi_test_clobber_x12
+
+.align	4
+abi_test_clobber_x12:
+	AARCH64_VALID_CALL_TARGET
+	mov	x12, xzr
+	ret
+
+
+.globl	abi_test_clobber_x13
+
+.align	4
+abi_test_clobber_x13:
+	AARCH64_VALID_CALL_TARGET
+	mov	x13, xzr
+	ret
+
+
+.globl	abi_test_clobber_x14
+
+.align	4
+abi_test_clobber_x14:
+	AARCH64_VALID_CALL_TARGET
+	mov	x14, xzr
+	ret
+
+
+.globl	abi_test_clobber_x15
+
+.align	4
+abi_test_clobber_x15:
+	AARCH64_VALID_CALL_TARGET
+	mov	x15, xzr
+	ret
+
+
+.globl	abi_test_clobber_x16
+
+.align	4
+abi_test_clobber_x16:
+	AARCH64_VALID_CALL_TARGET
+	mov	x16, xzr
+	ret
+
+
+.globl	abi_test_clobber_x17
+
+.align	4
+abi_test_clobber_x17:
+	AARCH64_VALID_CALL_TARGET
+	mov	x17, xzr
+	ret
+
+
+.globl	abi_test_clobber_x19
+
+.align	4
+abi_test_clobber_x19:
+	AARCH64_VALID_CALL_TARGET
+	mov	x19, xzr
+	ret
+
+
+.globl	abi_test_clobber_x20
+
+.align	4
+abi_test_clobber_x20:
+	AARCH64_VALID_CALL_TARGET
+	mov	x20, xzr
+	ret
+
+
+.globl	abi_test_clobber_x21
+
+.align	4
+abi_test_clobber_x21:
+	AARCH64_VALID_CALL_TARGET
+	mov	x21, xzr
+	ret
+
+
+.globl	abi_test_clobber_x22
+
+.align	4
+abi_test_clobber_x22:
+	AARCH64_VALID_CALL_TARGET
+	mov	x22, xzr
+	ret
+
+
+.globl	abi_test_clobber_x23
+
+.align	4
+abi_test_clobber_x23:
+	AARCH64_VALID_CALL_TARGET
+	mov	x23, xzr
+	ret
+
+
+.globl	abi_test_clobber_x24
+
+.align	4
+abi_test_clobber_x24:
+	AARCH64_VALID_CALL_TARGET
+	mov	x24, xzr
+	ret
+
+
+.globl	abi_test_clobber_x25
+
+.align	4
+abi_test_clobber_x25:
+	AARCH64_VALID_CALL_TARGET
+	mov	x25, xzr
+	ret
+
+
+.globl	abi_test_clobber_x26
+
+.align	4
+abi_test_clobber_x26:
+	AARCH64_VALID_CALL_TARGET
+	mov	x26, xzr
+	ret
+
+
+.globl	abi_test_clobber_x27
+
+.align	4
+abi_test_clobber_x27:
+	AARCH64_VALID_CALL_TARGET
+	mov	x27, xzr
+	ret
+
+
+.globl	abi_test_clobber_x28
+
+.align	4
+abi_test_clobber_x28:
+	AARCH64_VALID_CALL_TARGET
+	mov	x28, xzr
+	ret
+
+
+.globl	abi_test_clobber_x29
+
+.align	4
+abi_test_clobber_x29:
+	AARCH64_VALID_CALL_TARGET
+	mov	x29, xzr
+	ret
+
+
+.globl	abi_test_clobber_d0
+
+.align	4
+abi_test_clobber_d0:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d0, xzr
+	ret
+
+
+.globl	abi_test_clobber_d1
+
+.align	4
+abi_test_clobber_d1:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d1, xzr
+	ret
+
+
+.globl	abi_test_clobber_d2
+
+.align	4
+abi_test_clobber_d2:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d2, xzr
+	ret
+
+
+.globl	abi_test_clobber_d3
+
+.align	4
+abi_test_clobber_d3:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d3, xzr
+	ret
+
+
+.globl	abi_test_clobber_d4
+
+.align	4
+abi_test_clobber_d4:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d4, xzr
+	ret
+
+
+.globl	abi_test_clobber_d5
+
+.align	4
+abi_test_clobber_d5:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d5, xzr
+	ret
+
+
+.globl	abi_test_clobber_d6
+
+.align	4
+abi_test_clobber_d6:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d6, xzr
+	ret
+
+
+.globl	abi_test_clobber_d7
+
+.align	4
+abi_test_clobber_d7:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d7, xzr
+	ret
+
+
+.globl	abi_test_clobber_d8
+
+.align	4
+abi_test_clobber_d8:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d8, xzr
+	ret
+
+
+.globl	abi_test_clobber_d9
+
+.align	4
+abi_test_clobber_d9:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d9, xzr
+	ret
+
+
+.globl	abi_test_clobber_d10
+
+.align	4
+abi_test_clobber_d10:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d10, xzr
+	ret
+
+
+.globl	abi_test_clobber_d11
+
+.align	4
+abi_test_clobber_d11:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d11, xzr
+	ret
+
+
+.globl	abi_test_clobber_d12
+
+.align	4
+abi_test_clobber_d12:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d12, xzr
+	ret
+
+
+.globl	abi_test_clobber_d13
+
+.align	4
+abi_test_clobber_d13:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d13, xzr
+	ret
+
+
+.globl	abi_test_clobber_d14
+
+.align	4
+abi_test_clobber_d14:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d14, xzr
+	ret
+
+
+.globl	abi_test_clobber_d15
+
+.align	4
+abi_test_clobber_d15:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d15, xzr
+	ret
+
+
+.globl	abi_test_clobber_d16
+
+.align	4
+abi_test_clobber_d16:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d16, xzr
+	ret
+
+
+.globl	abi_test_clobber_d17
+
+.align	4
+abi_test_clobber_d17:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d17, xzr
+	ret
+
+
+.globl	abi_test_clobber_d18
+
+.align	4
+abi_test_clobber_d18:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d18, xzr
+	ret
+
+
+.globl	abi_test_clobber_d19
+
+.align	4
+abi_test_clobber_d19:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d19, xzr
+	ret
+
+
+.globl	abi_test_clobber_d20
+
+.align	4
+abi_test_clobber_d20:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d20, xzr
+	ret
+
+
+.globl	abi_test_clobber_d21
+
+.align	4
+abi_test_clobber_d21:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d21, xzr
+	ret
+
+
+.globl	abi_test_clobber_d22
+
+.align	4
+abi_test_clobber_d22:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d22, xzr
+	ret
+
+
+.globl	abi_test_clobber_d23
+
+.align	4
+abi_test_clobber_d23:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d23, xzr
+	ret
+
+
+.globl	abi_test_clobber_d24
+
+.align	4
+abi_test_clobber_d24:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d24, xzr
+	ret
+
+
+.globl	abi_test_clobber_d25
+
+.align	4
+abi_test_clobber_d25:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d25, xzr
+	ret
+
+
+.globl	abi_test_clobber_d26
+
+.align	4
+abi_test_clobber_d26:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d26, xzr
+	ret
+
+
+.globl	abi_test_clobber_d27
+
+.align	4
+abi_test_clobber_d27:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d27, xzr
+	ret
+
+
+.globl	abi_test_clobber_d28
+
+.align	4
+abi_test_clobber_d28:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d28, xzr
+	ret
+
+
+.globl	abi_test_clobber_d29
+
+.align	4
+abi_test_clobber_d29:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d29, xzr
+	ret
+
+
+.globl	abi_test_clobber_d30
+
+.align	4
+abi_test_clobber_d30:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d30, xzr
+	ret
+
+
+.globl	abi_test_clobber_d31
+
+.align	4
+abi_test_clobber_d31:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d31, xzr
+	ret
+
+
+.globl	abi_test_clobber_v8_upper
+
+.align	4
+abi_test_clobber_v8_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v8.d[1], xzr
+	ret
+
+
+.globl	abi_test_clobber_v9_upper
+
+.align	4
+abi_test_clobber_v9_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v9.d[1], xzr
+	ret
+
+
+.globl	abi_test_clobber_v10_upper
+
+.align	4
+abi_test_clobber_v10_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v10.d[1], xzr
+	ret
+
+
+.globl	abi_test_clobber_v11_upper
+
+.align	4
+abi_test_clobber_v11_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v11.d[1], xzr
+	ret
+
+
+.globl	abi_test_clobber_v12_upper
+
+.align	4
+abi_test_clobber_v12_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v12.d[1], xzr
+	ret
+
+
+.globl	abi_test_clobber_v13_upper
+
+.align	4
+abi_test_clobber_v13_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v13.d[1], xzr
+	ret
+
+
+.globl	abi_test_clobber_v14_upper
+
+.align	4
+abi_test_clobber_v14_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v14.d[1], xzr
+	ret
+
+
+.globl	abi_test_clobber_v15_upper
+
+.align	4
+abi_test_clobber_v15_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v15.d[1], xzr
+	ret
+
+#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/test_support/trampoline-x86-apple.S b/gen/test_support/trampoline-x86-apple.S
new file mode 100644
index 0000000..4065b9a
--- /dev/null
+++ b/gen/test_support/trampoline-x86-apple.S
@@ -0,0 +1,168 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
+.text
+.globl	_abi_test_trampoline
+.private_extern	_abi_test_trampoline
+.align	4
+_abi_test_trampoline:
+L_abi_test_trampoline_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
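+	// Load the callee-saved registers out of |state|, the second argument
+	// (at 24(%esp) after the four pushes above).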
+	movl	24(%esp),%ecx
+	movl	(%ecx),%esi
+	movl	4(%ecx),%edi
+	movl	8(%ecx),%ebx
+	movl	12(%ecx),%ebp
+	subl	$44,%esp
+	movl	72(%esp),%eax
+	xorl	%ecx,%ecx
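+	// Copy argv[0..argc) into the 44 bytes of stack space reserved above.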
+L000loop:
+	cmpl	76(%esp),%ecx
+	jae	L001loop_done
+	movl	(%eax,%ecx,4),%edx
+	movl	%edx,(%esp,%ecx,4)
+	addl	$1,%ecx
+	jmp	L000loop
+L001loop_done:
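+	// Call |func|, the first argument, then write the callee-saved
+	// registers back into |state|.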
+	call	*64(%esp)
+	addl	$44,%esp
+	movl	24(%esp),%ecx
+	movl	%esi,(%ecx)
+	movl	%edi,4(%ecx)
+	movl	%ebx,8(%ecx)
+	movl	%ebp,12(%ecx)
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.globl	_abi_test_get_and_clear_direction_flag
+.private_extern	_abi_test_get_and_clear_direction_flag
+.align	4
+_abi_test_get_and_clear_direction_flag:
+L_abi_test_get_and_clear_direction_flag_begin:
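+	// Extract the direction flag (bit 10 of EFLAGS) into %eax, then clear
+	// it with cld, as the ABI requires on return.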
+	pushfl
+	popl	%eax
+	andl	$1024,%eax
+	shrl	$10,%eax
+	cld
+	ret
+.globl	_abi_test_set_direction_flag
+.private_extern	_abi_test_set_direction_flag
+.align	4
+_abi_test_set_direction_flag:
+L_abi_test_set_direction_flag_begin:
+	std
+	ret
+.globl	_abi_test_clobber_eax
+.private_extern	_abi_test_clobber_eax
+.align	4
+_abi_test_clobber_eax:
+L_abi_test_clobber_eax_begin:
+	xorl	%eax,%eax
+	ret
+.globl	_abi_test_clobber_ebx
+.private_extern	_abi_test_clobber_ebx
+.align	4
+_abi_test_clobber_ebx:
+L_abi_test_clobber_ebx_begin:
+	xorl	%ebx,%ebx
+	ret
+.globl	_abi_test_clobber_ecx
+.private_extern	_abi_test_clobber_ecx
+.align	4
+_abi_test_clobber_ecx:
+L_abi_test_clobber_ecx_begin:
+	xorl	%ecx,%ecx
+	ret
+.globl	_abi_test_clobber_edx
+.private_extern	_abi_test_clobber_edx
+.align	4
+_abi_test_clobber_edx:
+L_abi_test_clobber_edx_begin:
+	xorl	%edx,%edx
+	ret
+.globl	_abi_test_clobber_edi
+.private_extern	_abi_test_clobber_edi
+.align	4
+_abi_test_clobber_edi:
+L_abi_test_clobber_edi_begin:
+	xorl	%edi,%edi
+	ret
+.globl	_abi_test_clobber_esi
+.private_extern	_abi_test_clobber_esi
+.align	4
+_abi_test_clobber_esi:
+L_abi_test_clobber_esi_begin:
+	xorl	%esi,%esi
+	ret
+.globl	_abi_test_clobber_ebp
+.private_extern	_abi_test_clobber_ebp
+.align	4
+_abi_test_clobber_ebp:
+L_abi_test_clobber_ebp_begin:
+	xorl	%ebp,%ebp
+	ret
+.globl	_abi_test_clobber_xmm0
+.private_extern	_abi_test_clobber_xmm0
+.align	4
+_abi_test_clobber_xmm0:
+L_abi_test_clobber_xmm0_begin:
+	pxor	%xmm0,%xmm0
+	ret
+.globl	_abi_test_clobber_xmm1
+.private_extern	_abi_test_clobber_xmm1
+.align	4
+_abi_test_clobber_xmm1:
+L_abi_test_clobber_xmm1_begin:
+	pxor	%xmm1,%xmm1
+	ret
+.globl	_abi_test_clobber_xmm2
+.private_extern	_abi_test_clobber_xmm2
+.align	4
+_abi_test_clobber_xmm2:
+L_abi_test_clobber_xmm2_begin:
+	pxor	%xmm2,%xmm2
+	ret
+.globl	_abi_test_clobber_xmm3
+.private_extern	_abi_test_clobber_xmm3
+.align	4
+_abi_test_clobber_xmm3:
+L_abi_test_clobber_xmm3_begin:
+	pxor	%xmm3,%xmm3
+	ret
+.globl	_abi_test_clobber_xmm4
+.private_extern	_abi_test_clobber_xmm4
+.align	4
+_abi_test_clobber_xmm4:
+L_abi_test_clobber_xmm4_begin:
+	pxor	%xmm4,%xmm4
+	ret
+.globl	_abi_test_clobber_xmm5
+.private_extern	_abi_test_clobber_xmm5
+.align	4
+_abi_test_clobber_xmm5:
+L_abi_test_clobber_xmm5_begin:
+	pxor	%xmm5,%xmm5
+	ret
+.globl	_abi_test_clobber_xmm6
+.private_extern	_abi_test_clobber_xmm6
+.align	4
+_abi_test_clobber_xmm6:
+L_abi_test_clobber_xmm6_begin:
+	pxor	%xmm6,%xmm6
+	ret
+.globl	_abi_test_clobber_xmm7
+.private_extern	_abi_test_clobber_xmm7
+.align	4
+_abi_test_clobber_xmm7:
+L_abi_test_clobber_xmm7_begin:
+	pxor	%xmm7,%xmm7
+	ret
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/test_support/trampoline-x86-linux.S b/gen/test_support/trampoline-x86-linux.S
new file mode 100644
index 0000000..3452c63
--- /dev/null
+++ b/gen/test_support/trampoline-x86-linux.S
@@ -0,0 +1,204 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
+.text
+.globl	abi_test_trampoline
+.hidden	abi_test_trampoline
+.type	abi_test_trampoline,@function
+.align	16
+abi_test_trampoline:
+.L_abi_test_trampoline_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	24(%esp),%ecx
+	movl	(%ecx),%esi
+	movl	4(%ecx),%edi
+	movl	8(%ecx),%ebx
+	movl	12(%ecx),%ebp
+	subl	$44,%esp
+	movl	72(%esp),%eax
+	xorl	%ecx,%ecx
+.L000loop:
+	cmpl	76(%esp),%ecx
+	jae	.L001loop_done
+	movl	(%eax,%ecx,4),%edx
+	movl	%edx,(%esp,%ecx,4)
+	addl	$1,%ecx
+	jmp	.L000loop
+.L001loop_done:
+	call	*64(%esp)
+	addl	$44,%esp
+	movl	24(%esp),%ecx
+	movl	%esi,(%ecx)
+	movl	%edi,4(%ecx)
+	movl	%ebx,8(%ecx)
+	movl	%ebp,12(%ecx)
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	abi_test_trampoline,.-.L_abi_test_trampoline_begin
+.globl	abi_test_get_and_clear_direction_flag
+.hidden	abi_test_get_and_clear_direction_flag
+.type	abi_test_get_and_clear_direction_flag,@function
+.align	16
+abi_test_get_and_clear_direction_flag:
+.L_abi_test_get_and_clear_direction_flag_begin:
+	pushfl
+	popl	%eax
+	andl	$1024,%eax
+	shrl	$10,%eax
+	cld
+	ret
+.size	abi_test_get_and_clear_direction_flag,.-.L_abi_test_get_and_clear_direction_flag_begin
+.globl	abi_test_set_direction_flag
+.hidden	abi_test_set_direction_flag
+.type	abi_test_set_direction_flag,@function
+.align	16
+abi_test_set_direction_flag:
+.L_abi_test_set_direction_flag_begin:
+	std
+	ret
+.size	abi_test_set_direction_flag,.-.L_abi_test_set_direction_flag_begin
+.globl	abi_test_clobber_eax
+.hidden	abi_test_clobber_eax
+.type	abi_test_clobber_eax,@function
+.align	16
+abi_test_clobber_eax:
+.L_abi_test_clobber_eax_begin:
+	xorl	%eax,%eax
+	ret
+.size	abi_test_clobber_eax,.-.L_abi_test_clobber_eax_begin
+.globl	abi_test_clobber_ebx
+.hidden	abi_test_clobber_ebx
+.type	abi_test_clobber_ebx,@function
+.align	16
+abi_test_clobber_ebx:
+.L_abi_test_clobber_ebx_begin:
+	xorl	%ebx,%ebx
+	ret
+.size	abi_test_clobber_ebx,.-.L_abi_test_clobber_ebx_begin
+.globl	abi_test_clobber_ecx
+.hidden	abi_test_clobber_ecx
+.type	abi_test_clobber_ecx,@function
+.align	16
+abi_test_clobber_ecx:
+.L_abi_test_clobber_ecx_begin:
+	xorl	%ecx,%ecx
+	ret
+.size	abi_test_clobber_ecx,.-.L_abi_test_clobber_ecx_begin
+.globl	abi_test_clobber_edx
+.hidden	abi_test_clobber_edx
+.type	abi_test_clobber_edx,@function
+.align	16
+abi_test_clobber_edx:
+.L_abi_test_clobber_edx_begin:
+	xorl	%edx,%edx
+	ret
+.size	abi_test_clobber_edx,.-.L_abi_test_clobber_edx_begin
+.globl	abi_test_clobber_edi
+.hidden	abi_test_clobber_edi
+.type	abi_test_clobber_edi,@function
+.align	16
+abi_test_clobber_edi:
+.L_abi_test_clobber_edi_begin:
+	xorl	%edi,%edi
+	ret
+.size	abi_test_clobber_edi,.-.L_abi_test_clobber_edi_begin
+.globl	abi_test_clobber_esi
+.hidden	abi_test_clobber_esi
+.type	abi_test_clobber_esi,@function
+.align	16
+abi_test_clobber_esi:
+.L_abi_test_clobber_esi_begin:
+	xorl	%esi,%esi
+	ret
+.size	abi_test_clobber_esi,.-.L_abi_test_clobber_esi_begin
+.globl	abi_test_clobber_ebp
+.hidden	abi_test_clobber_ebp
+.type	abi_test_clobber_ebp,@function
+.align	16
+abi_test_clobber_ebp:
+.L_abi_test_clobber_ebp_begin:
+	xorl	%ebp,%ebp
+	ret
+.size	abi_test_clobber_ebp,.-.L_abi_test_clobber_ebp_begin
+.globl	abi_test_clobber_xmm0
+.hidden	abi_test_clobber_xmm0
+.type	abi_test_clobber_xmm0,@function
+.align	16
+abi_test_clobber_xmm0:
+.L_abi_test_clobber_xmm0_begin:
+	pxor	%xmm0,%xmm0
+	ret
+.size	abi_test_clobber_xmm0,.-.L_abi_test_clobber_xmm0_begin
+.globl	abi_test_clobber_xmm1
+.hidden	abi_test_clobber_xmm1
+.type	abi_test_clobber_xmm1,@function
+.align	16
+abi_test_clobber_xmm1:
+.L_abi_test_clobber_xmm1_begin:
+	pxor	%xmm1,%xmm1
+	ret
+.size	abi_test_clobber_xmm1,.-.L_abi_test_clobber_xmm1_begin
+.globl	abi_test_clobber_xmm2
+.hidden	abi_test_clobber_xmm2
+.type	abi_test_clobber_xmm2,@function
+.align	16
+abi_test_clobber_xmm2:
+.L_abi_test_clobber_xmm2_begin:
+	pxor	%xmm2,%xmm2
+	ret
+.size	abi_test_clobber_xmm2,.-.L_abi_test_clobber_xmm2_begin
+.globl	abi_test_clobber_xmm3
+.hidden	abi_test_clobber_xmm3
+.type	abi_test_clobber_xmm3,@function
+.align	16
+abi_test_clobber_xmm3:
+.L_abi_test_clobber_xmm3_begin:
+	pxor	%xmm3,%xmm3
+	ret
+.size	abi_test_clobber_xmm3,.-.L_abi_test_clobber_xmm3_begin
+.globl	abi_test_clobber_xmm4
+.hidden	abi_test_clobber_xmm4
+.type	abi_test_clobber_xmm4,@function
+.align	16
+abi_test_clobber_xmm4:
+.L_abi_test_clobber_xmm4_begin:
+	pxor	%xmm4,%xmm4
+	ret
+.size	abi_test_clobber_xmm4,.-.L_abi_test_clobber_xmm4_begin
+.globl	abi_test_clobber_xmm5
+.hidden	abi_test_clobber_xmm5
+.type	abi_test_clobber_xmm5,@function
+.align	16
+abi_test_clobber_xmm5:
+.L_abi_test_clobber_xmm5_begin:
+	pxor	%xmm5,%xmm5
+	ret
+.size	abi_test_clobber_xmm5,.-.L_abi_test_clobber_xmm5_begin
+.globl	abi_test_clobber_xmm6
+.hidden	abi_test_clobber_xmm6
+.type	abi_test_clobber_xmm6,@function
+.align	16
+abi_test_clobber_xmm6:
+.L_abi_test_clobber_xmm6_begin:
+	pxor	%xmm6,%xmm6
+	ret
+.size	abi_test_clobber_xmm6,.-.L_abi_test_clobber_xmm6_begin
+.globl	abi_test_clobber_xmm7
+.hidden	abi_test_clobber_xmm7
+.type	abi_test_clobber_xmm7,@function
+.align	16
+abi_test_clobber_xmm7:
+.L_abi_test_clobber_xmm7_begin:
+	pxor	%xmm7,%xmm7
+	ret
+.size	abi_test_clobber_xmm7,.-.L_abi_test_clobber_xmm7_begin
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
diff --git a/gen/test_support/trampoline-x86-win.asm b/gen/test_support/trampoline-x86-win.asm
new file mode 100644
index 0000000..3ef9917
--- /dev/null
+++ b/gen/test_support/trampoline-x86-win.asm
@@ -0,0 +1,161 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__, win32
+%ifidn __OUTPUT_FORMAT__,obj
+section	code	use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+$@feat.00 equ 1
+section	.text	code align=64
+%else
+section	.text	code
+%endif
+global	_abi_test_trampoline
+align	16
+_abi_test_trampoline:
+L$_abi_test_trampoline_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	mov	ecx,DWORD [24+esp]
+	mov	esi,DWORD [ecx]
+	mov	edi,DWORD [4+ecx]
+	mov	ebx,DWORD [8+ecx]
+	mov	ebp,DWORD [12+ecx]
+	sub	esp,44
+	mov	eax,DWORD [72+esp]
+	xor	ecx,ecx
+L$000loop:
+	cmp	ecx,DWORD [76+esp]
+	jae	NEAR L$001loop_done
+	mov	edx,DWORD [ecx*4+eax]
+	mov	DWORD [ecx*4+esp],edx
+	add	ecx,1
+	jmp	NEAR L$000loop
+L$001loop_done:
+	call	DWORD [64+esp]
+	add	esp,44
+	mov	ecx,DWORD [24+esp]
+	mov	DWORD [ecx],esi
+	mov	DWORD [4+ecx],edi
+	mov	DWORD [8+ecx],ebx
+	mov	DWORD [12+ecx],ebp
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+global	_abi_test_get_and_clear_direction_flag
+align	16
+_abi_test_get_and_clear_direction_flag:
+L$_abi_test_get_and_clear_direction_flag_begin:
+	pushfd
+	pop	eax
+	and	eax,1024
+	shr	eax,10
+	cld
+	ret
+global	_abi_test_set_direction_flag
+align	16
+_abi_test_set_direction_flag:
+L$_abi_test_set_direction_flag_begin:
+	std
+	ret
+global	_abi_test_clobber_eax
+align	16
+_abi_test_clobber_eax:
+L$_abi_test_clobber_eax_begin:
+	xor	eax,eax
+	ret
+global	_abi_test_clobber_ebx
+align	16
+_abi_test_clobber_ebx:
+L$_abi_test_clobber_ebx_begin:
+	xor	ebx,ebx
+	ret
+global	_abi_test_clobber_ecx
+align	16
+_abi_test_clobber_ecx:
+L$_abi_test_clobber_ecx_begin:
+	xor	ecx,ecx
+	ret
+global	_abi_test_clobber_edx
+align	16
+_abi_test_clobber_edx:
+L$_abi_test_clobber_edx_begin:
+	xor	edx,edx
+	ret
+global	_abi_test_clobber_edi
+align	16
+_abi_test_clobber_edi:
+L$_abi_test_clobber_edi_begin:
+	xor	edi,edi
+	ret
+global	_abi_test_clobber_esi
+align	16
+_abi_test_clobber_esi:
+L$_abi_test_clobber_esi_begin:
+	xor	esi,esi
+	ret
+global	_abi_test_clobber_ebp
+align	16
+_abi_test_clobber_ebp:
+L$_abi_test_clobber_ebp_begin:
+	xor	ebp,ebp
+	ret
+global	_abi_test_clobber_xmm0
+align	16
+_abi_test_clobber_xmm0:
+L$_abi_test_clobber_xmm0_begin:
+	pxor	xmm0,xmm0
+	ret
+global	_abi_test_clobber_xmm1
+align	16
+_abi_test_clobber_xmm1:
+L$_abi_test_clobber_xmm1_begin:
+	pxor	xmm1,xmm1
+	ret
+global	_abi_test_clobber_xmm2
+align	16
+_abi_test_clobber_xmm2:
+L$_abi_test_clobber_xmm2_begin:
+	pxor	xmm2,xmm2
+	ret
+global	_abi_test_clobber_xmm3
+align	16
+_abi_test_clobber_xmm3:
+L$_abi_test_clobber_xmm3_begin:
+	pxor	xmm3,xmm3
+	ret
+global	_abi_test_clobber_xmm4
+align	16
+_abi_test_clobber_xmm4:
+L$_abi_test_clobber_xmm4_begin:
+	pxor	xmm4,xmm4
+	ret
+global	_abi_test_clobber_xmm5
+align	16
+_abi_test_clobber_xmm5:
+L$_abi_test_clobber_xmm5_begin:
+	pxor	xmm5,xmm5
+	ret
+global	_abi_test_clobber_xmm6
+align	16
+_abi_test_clobber_xmm6:
+L$_abi_test_clobber_xmm6_begin:
+	pxor	xmm6,xmm6
+	ret
+global	_abi_test_clobber_xmm7
+align	16
+_abi_test_clobber_xmm7:
+L$_abi_test_clobber_xmm7_begin:
+	pxor	xmm7,xmm7
+	ret
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/test_support/trampoline-x86_64-apple.S b/gen/test_support/trampoline-x86_64-apple.S
new file mode 100644
index 0000000..7c76d2d
--- /dev/null
+++ b/gen/test_support/trampoline-x86_64-apple.S
@@ -0,0 +1,541 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text	
+
+
+
+
+
+
+
+
+
+.globl	_abi_test_trampoline
+.private_extern _abi_test_trampoline
+.p2align	4
+_abi_test_trampoline:
+
+
+_CET_ENDBR
+
+
+
+
+
+
+
+
+
+	subq	$120,%rsp
+
+
+	movq	%r8,48(%rsp)
+	movq	%rbx,64(%rsp)
+
+
+	movq	%rbp,72(%rsp)
+
+
+	movq	%r12,80(%rsp)
+
+
+	movq	%r13,88(%rsp)
+
+
+	movq	%r14,96(%rsp)
+
+
+	movq	%r15,104(%rsp)
+
+
+	movq	0(%rsi),%rbx
+	movq	8(%rsi),%rbp
+	movq	16(%rsi),%r12
+	movq	24(%rsi),%r13
+	movq	32(%rsi),%r14
+	movq	40(%rsi),%r15
+
+	movq	%rdi,32(%rsp)
+	movq	%rsi,40(%rsp)
+
+
+
+
+	movq	%rdx,%r10
+	movq	%rcx,%r11
+	decq	%r11
+	js	L$args_done
+	movq	(%r10),%rdi
+	addq	$8,%r10
+	decq	%r11
+	js	L$args_done
+	movq	(%r10),%rsi
+	addq	$8,%r10
+	decq	%r11
+	js	L$args_done
+	movq	(%r10),%rdx
+	addq	$8,%r10
+	decq	%r11
+	js	L$args_done
+	movq	(%r10),%rcx
+	addq	$8,%r10
+	decq	%r11
+	js	L$args_done
+	movq	(%r10),%r8
+	addq	$8,%r10
+	decq	%r11
+	js	L$args_done
+	movq	(%r10),%r9
+	addq	$8,%r10
+	leaq	0(%rsp),%rax
+L$args_loop:
+	decq	%r11
+	js	L$args_done
+
+
+
+
+
+
+	movq	%r11,56(%rsp)
+	movq	(%r10),%r11
+	movq	%r11,(%rax)
+	movq	56(%rsp),%r11
+
+	addq	$8,%r10
+	addq	$8,%rax
+	jmp	L$args_loop
+
+L$args_done:
+	movq	32(%rsp),%rax
+	movq	48(%rsp),%r10
+	testq	%r10,%r10
+	jz	L$no_unwind
+
+
+	pushfq
+	orq	$0x100,0(%rsp)
+	popfq
+
+
+
+	nop
+.globl	_abi_test_unwind_start
+.private_extern _abi_test_unwind_start
+_abi_test_unwind_start:
+
+	call	*%rax
+.globl	_abi_test_unwind_return
+.private_extern _abi_test_unwind_return
+_abi_test_unwind_return:
+
+
+
+
+	pushfq
+	andq	$-0x101,0(%rsp)
+	popfq
+.globl	_abi_test_unwind_stop
+.private_extern _abi_test_unwind_stop
+_abi_test_unwind_stop:
+
+	jmp	L$call_done
+
+L$no_unwind:
+	call	*%rax
+
+L$call_done:
+
+	movq	40(%rsp),%rsi
+	movq	%rbx,0(%rsi)
+	movq	%rbp,8(%rsi)
+	movq	%r12,16(%rsi)
+	movq	%r13,24(%rsi)
+	movq	%r14,32(%rsi)
+	movq	%r15,40(%rsi)
+	movq	64(%rsp),%rbx
+
+	movq	72(%rsp),%rbp
+
+	movq	80(%rsp),%r12
+
+	movq	88(%rsp),%r13
+
+	movq	96(%rsp),%r14
+
+	movq	104(%rsp),%r15
+
+	addq	$120,%rsp
+
+
+
+	ret
+
+
+
+
+.globl	_abi_test_clobber_rax
+.private_extern _abi_test_clobber_rax
+.p2align	4
+_abi_test_clobber_rax:
+_CET_ENDBR
+	xorq	%rax,%rax
+	ret
+
+
+.globl	_abi_test_clobber_rbx
+.private_extern _abi_test_clobber_rbx
+.p2align	4
+_abi_test_clobber_rbx:
+_CET_ENDBR
+	xorq	%rbx,%rbx
+	ret
+
+
+.globl	_abi_test_clobber_rcx
+.private_extern _abi_test_clobber_rcx
+.p2align	4
+_abi_test_clobber_rcx:
+_CET_ENDBR
+	xorq	%rcx,%rcx
+	ret
+
+
+.globl	_abi_test_clobber_rdx
+.private_extern _abi_test_clobber_rdx
+.p2align	4
+_abi_test_clobber_rdx:
+_CET_ENDBR
+	xorq	%rdx,%rdx
+	ret
+
+
+.globl	_abi_test_clobber_rdi
+.private_extern _abi_test_clobber_rdi
+.p2align	4
+_abi_test_clobber_rdi:
+_CET_ENDBR
+	xorq	%rdi,%rdi
+	ret
+
+
+.globl	_abi_test_clobber_rsi
+.private_extern _abi_test_clobber_rsi
+.p2align	4
+_abi_test_clobber_rsi:
+_CET_ENDBR
+	xorq	%rsi,%rsi
+	ret
+
+
+.globl	_abi_test_clobber_rbp
+.private_extern _abi_test_clobber_rbp
+.p2align	4
+_abi_test_clobber_rbp:
+_CET_ENDBR
+	xorq	%rbp,%rbp
+	ret
+
+
+.globl	_abi_test_clobber_r8
+.private_extern _abi_test_clobber_r8
+.p2align	4
+_abi_test_clobber_r8:
+_CET_ENDBR
+	xorq	%r8,%r8
+	ret
+
+
+.globl	_abi_test_clobber_r9
+.private_extern _abi_test_clobber_r9
+.p2align	4
+_abi_test_clobber_r9:
+_CET_ENDBR
+	xorq	%r9,%r9
+	ret
+
+
+.globl	_abi_test_clobber_r10
+.private_extern _abi_test_clobber_r10
+.p2align	4
+_abi_test_clobber_r10:
+_CET_ENDBR
+	xorq	%r10,%r10
+	ret
+
+
+.globl	_abi_test_clobber_r11
+.private_extern _abi_test_clobber_r11
+.p2align	4
+_abi_test_clobber_r11:
+_CET_ENDBR
+	xorq	%r11,%r11
+	ret
+
+
+.globl	_abi_test_clobber_r12
+.private_extern _abi_test_clobber_r12
+.p2align	4
+_abi_test_clobber_r12:
+_CET_ENDBR
+	xorq	%r12,%r12
+	ret
+
+
+.globl	_abi_test_clobber_r13
+.private_extern _abi_test_clobber_r13
+.p2align	4
+_abi_test_clobber_r13:
+_CET_ENDBR
+	xorq	%r13,%r13
+	ret
+
+
+.globl	_abi_test_clobber_r14
+.private_extern _abi_test_clobber_r14
+.p2align	4
+_abi_test_clobber_r14:
+_CET_ENDBR
+	xorq	%r14,%r14
+	ret
+
+
+.globl	_abi_test_clobber_r15
+.private_extern _abi_test_clobber_r15
+.p2align	4
+_abi_test_clobber_r15:
+_CET_ENDBR
+	xorq	%r15,%r15
+	ret
+
+
+.globl	_abi_test_clobber_xmm0
+.private_extern _abi_test_clobber_xmm0
+.p2align	4
+_abi_test_clobber_xmm0:
+_CET_ENDBR
+	pxor	%xmm0,%xmm0
+	ret
+
+
+.globl	_abi_test_clobber_xmm1
+.private_extern _abi_test_clobber_xmm1
+.p2align	4
+_abi_test_clobber_xmm1:
+_CET_ENDBR
+	pxor	%xmm1,%xmm1
+	ret
+
+
+.globl	_abi_test_clobber_xmm2
+.private_extern _abi_test_clobber_xmm2
+.p2align	4
+_abi_test_clobber_xmm2:
+_CET_ENDBR
+	pxor	%xmm2,%xmm2
+	ret
+
+
+.globl	_abi_test_clobber_xmm3
+.private_extern _abi_test_clobber_xmm3
+.p2align	4
+_abi_test_clobber_xmm3:
+_CET_ENDBR
+	pxor	%xmm3,%xmm3
+	ret
+
+
+.globl	_abi_test_clobber_xmm4
+.private_extern _abi_test_clobber_xmm4
+.p2align	4
+_abi_test_clobber_xmm4:
+_CET_ENDBR
+	pxor	%xmm4,%xmm4
+	ret
+
+
+.globl	_abi_test_clobber_xmm5
+.private_extern _abi_test_clobber_xmm5
+.p2align	4
+_abi_test_clobber_xmm5:
+_CET_ENDBR
+	pxor	%xmm5,%xmm5
+	ret
+
+
+.globl	_abi_test_clobber_xmm6
+.private_extern _abi_test_clobber_xmm6
+.p2align	4
+_abi_test_clobber_xmm6:
+_CET_ENDBR
+	pxor	%xmm6,%xmm6
+	ret
+
+
+.globl	_abi_test_clobber_xmm7
+.private_extern _abi_test_clobber_xmm7
+.p2align	4
+_abi_test_clobber_xmm7:
+_CET_ENDBR
+	pxor	%xmm7,%xmm7
+	ret
+
+
+.globl	_abi_test_clobber_xmm8
+.private_extern _abi_test_clobber_xmm8
+.p2align	4
+_abi_test_clobber_xmm8:
+_CET_ENDBR
+	pxor	%xmm8,%xmm8
+	ret
+
+
+.globl	_abi_test_clobber_xmm9
+.private_extern _abi_test_clobber_xmm9
+.p2align	4
+_abi_test_clobber_xmm9:
+_CET_ENDBR
+	pxor	%xmm9,%xmm9
+	ret
+
+
+.globl	_abi_test_clobber_xmm10
+.private_extern _abi_test_clobber_xmm10
+.p2align	4
+_abi_test_clobber_xmm10:
+_CET_ENDBR
+	pxor	%xmm10,%xmm10
+	ret
+
+
+.globl	_abi_test_clobber_xmm11
+.private_extern _abi_test_clobber_xmm11
+.p2align	4
+_abi_test_clobber_xmm11:
+_CET_ENDBR
+	pxor	%xmm11,%xmm11
+	ret
+
+
+.globl	_abi_test_clobber_xmm12
+.private_extern _abi_test_clobber_xmm12
+.p2align	4
+_abi_test_clobber_xmm12:
+_CET_ENDBR
+	pxor	%xmm12,%xmm12
+	ret
+
+
+.globl	_abi_test_clobber_xmm13
+.private_extern _abi_test_clobber_xmm13
+.p2align	4
+_abi_test_clobber_xmm13:
+_CET_ENDBR
+	pxor	%xmm13,%xmm13
+	ret
+
+
+.globl	_abi_test_clobber_xmm14
+.private_extern _abi_test_clobber_xmm14
+.p2align	4
+_abi_test_clobber_xmm14:
+_CET_ENDBR
+	pxor	%xmm14,%xmm14
+	ret
+
+
+.globl	_abi_test_clobber_xmm15
+.private_extern _abi_test_clobber_xmm15
+.p2align	4
+_abi_test_clobber_xmm15:
+_CET_ENDBR
+	pxor	%xmm15,%xmm15
+	ret
+
+
+
+
+
+.globl	_abi_test_bad_unwind_wrong_register
+.private_extern _abi_test_bad_unwind_wrong_register
+.p2align	4
+_abi_test_bad_unwind_wrong_register:
+
+
+_CET_ENDBR
+	pushq	%r12
+
+
+
+
+
+	nop
+	popq	%r12
+
+	ret
+
+
+
+
+
+
+
+
+.globl	_abi_test_bad_unwind_temporary
+.private_extern _abi_test_bad_unwind_temporary
+.p2align	4
+_abi_test_bad_unwind_temporary:
+
+
+_CET_ENDBR
+	pushq	%r12
+
+
+
+	movq	%r12,%rax
+	incq	%rax
+	movq	%rax,(%rsp)
+
+
+
+	movq	%r12,(%rsp)
+
+
+	popq	%r12
+
+	ret
+
+
+
+
+
+
+
+
+.globl	_abi_test_get_and_clear_direction_flag
+.private_extern _abi_test_get_and_clear_direction_flag
+_abi_test_get_and_clear_direction_flag:
+_CET_ENDBR
+	pushfq
+	popq	%rax
+	andq	$0x400,%rax
+	shrq	$10,%rax
+	cld
+	ret
+
+
+
+
+
+.globl	_abi_test_set_direction_flag
+.private_extern _abi_test_set_direction_flag
+_abi_test_set_direction_flag:
+_CET_ENDBR
+	std
+	ret
+
+#endif
diff --git a/gen/test_support/trampoline-x86_64-linux.S b/gen/test_support/trampoline-x86_64-linux.S
new file mode 100644
index 0000000..93af8b9
--- /dev/null
+++ b/gen/test_support/trampoline-x86_64-linux.S
@@ -0,0 +1,545 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text	
+
+
+
+
+
+
+
+
+.type	abi_test_trampoline, @function
+.globl	abi_test_trampoline
+.hidden abi_test_trampoline
+.align	16
+abi_test_trampoline:
+.cfi_startproc	
+
+_CET_ENDBR
+
+
+
+
+
+
+
+
+
+	subq	$120,%rsp
+.cfi_adjust_cfa_offset	120
+
+	movq	%r8,48(%rsp)
+	movq	%rbx,64(%rsp)
+.cfi_offset	rbx, -64
+
+	movq	%rbp,72(%rsp)
+.cfi_offset	rbp, -56
+
+	movq	%r12,80(%rsp)
+.cfi_offset	r12, -48
+
+	movq	%r13,88(%rsp)
+.cfi_offset	r13, -40
+
+	movq	%r14,96(%rsp)
+.cfi_offset	r14, -32
+
+	movq	%r15,104(%rsp)
+.cfi_offset	r15, -24
+
+	movq	0(%rsi),%rbx
+	movq	8(%rsi),%rbp
+	movq	16(%rsi),%r12
+	movq	24(%rsi),%r13
+	movq	32(%rsi),%r14
+	movq	40(%rsi),%r15
+
+	movq	%rdi,32(%rsp)
+	movq	%rsi,40(%rsp)
+
+
+
+
+	movq	%rdx,%r10
+	movq	%rcx,%r11
+	decq	%r11
+	js	.Largs_done
+	movq	(%r10),%rdi
+	addq	$8,%r10
+	decq	%r11
+	js	.Largs_done
+	movq	(%r10),%rsi
+	addq	$8,%r10
+	decq	%r11
+	js	.Largs_done
+	movq	(%r10),%rdx
+	addq	$8,%r10
+	decq	%r11
+	js	.Largs_done
+	movq	(%r10),%rcx
+	addq	$8,%r10
+	decq	%r11
+	js	.Largs_done
+	movq	(%r10),%r8
+	addq	$8,%r10
+	decq	%r11
+	js	.Largs_done
+	movq	(%r10),%r9
+	addq	$8,%r10
+	leaq	0(%rsp),%rax
+.Largs_loop:
+	decq	%r11
+	js	.Largs_done
+
+
+
+
+
+
+	movq	%r11,56(%rsp)
+	movq	(%r10),%r11
+	movq	%r11,(%rax)
+	movq	56(%rsp),%r11
+
+	addq	$8,%r10
+	addq	$8,%rax
+	jmp	.Largs_loop
+
+.Largs_done:
+	movq	32(%rsp),%rax
+	movq	48(%rsp),%r10
+	testq	%r10,%r10
+	jz	.Lno_unwind
+
+
+	pushfq
+	orq	$0x100,0(%rsp)
+	popfq
+
+
+
+	nop
+.globl	abi_test_unwind_start
+.hidden abi_test_unwind_start
+abi_test_unwind_start:
+
+	call	*%rax
+.globl	abi_test_unwind_return
+.hidden abi_test_unwind_return
+abi_test_unwind_return:
+
+
+
+
+	pushfq
+	andq	$-0x101,0(%rsp)
+	popfq
+.globl	abi_test_unwind_stop
+.hidden abi_test_unwind_stop
+abi_test_unwind_stop:
+
+	jmp	.Lcall_done
+
+.Lno_unwind:
+	call	*%rax
+
+.Lcall_done:
+
+	movq	40(%rsp),%rsi
+	movq	%rbx,0(%rsi)
+	movq	%rbp,8(%rsi)
+	movq	%r12,16(%rsi)
+	movq	%r13,24(%rsi)
+	movq	%r14,32(%rsi)
+	movq	%r15,40(%rsi)
+	movq	64(%rsp),%rbx
+.cfi_restore	rbx
+	movq	72(%rsp),%rbp
+.cfi_restore	rbp
+	movq	80(%rsp),%r12
+.cfi_restore	r12
+	movq	88(%rsp),%r13
+.cfi_restore	r13
+	movq	96(%rsp),%r14
+.cfi_restore	r14
+	movq	104(%rsp),%r15
+.cfi_restore	r15
+	addq	$120,%rsp
+.cfi_adjust_cfa_offset	-120
+
+
+	ret
+.cfi_endproc	
+
+.size	abi_test_trampoline,.-abi_test_trampoline
+.type	abi_test_clobber_rax, @function
+.globl	abi_test_clobber_rax
+.hidden abi_test_clobber_rax
+.align	16
+abi_test_clobber_rax:
+_CET_ENDBR
+	xorq	%rax,%rax
+	ret
+.size	abi_test_clobber_rax,.-abi_test_clobber_rax
+.type	abi_test_clobber_rbx, @function
+.globl	abi_test_clobber_rbx
+.hidden abi_test_clobber_rbx
+.align	16
+abi_test_clobber_rbx:
+_CET_ENDBR
+	xorq	%rbx,%rbx
+	ret
+.size	abi_test_clobber_rbx,.-abi_test_clobber_rbx
+.type	abi_test_clobber_rcx, @function
+.globl	abi_test_clobber_rcx
+.hidden abi_test_clobber_rcx
+.align	16
+abi_test_clobber_rcx:
+_CET_ENDBR
+	xorq	%rcx,%rcx
+	ret
+.size	abi_test_clobber_rcx,.-abi_test_clobber_rcx
+.type	abi_test_clobber_rdx, @function
+.globl	abi_test_clobber_rdx
+.hidden abi_test_clobber_rdx
+.align	16
+abi_test_clobber_rdx:
+_CET_ENDBR
+	xorq	%rdx,%rdx
+	ret
+.size	abi_test_clobber_rdx,.-abi_test_clobber_rdx
+.type	abi_test_clobber_rdi, @function
+.globl	abi_test_clobber_rdi
+.hidden abi_test_clobber_rdi
+.align	16
+abi_test_clobber_rdi:
+_CET_ENDBR
+	xorq	%rdi,%rdi
+	ret
+.size	abi_test_clobber_rdi,.-abi_test_clobber_rdi
+.type	abi_test_clobber_rsi, @function
+.globl	abi_test_clobber_rsi
+.hidden abi_test_clobber_rsi
+.align	16
+abi_test_clobber_rsi:
+_CET_ENDBR
+	xorq	%rsi,%rsi
+	ret
+.size	abi_test_clobber_rsi,.-abi_test_clobber_rsi
+.type	abi_test_clobber_rbp, @function
+.globl	abi_test_clobber_rbp
+.hidden abi_test_clobber_rbp
+.align	16
+abi_test_clobber_rbp:
+_CET_ENDBR
+	xorq	%rbp,%rbp
+	ret
+.size	abi_test_clobber_rbp,.-abi_test_clobber_rbp
+.type	abi_test_clobber_r8, @function
+.globl	abi_test_clobber_r8
+.hidden abi_test_clobber_r8
+.align	16
+abi_test_clobber_r8:
+_CET_ENDBR
+	xorq	%r8,%r8
+	ret
+.size	abi_test_clobber_r8,.-abi_test_clobber_r8
+.type	abi_test_clobber_r9, @function
+.globl	abi_test_clobber_r9
+.hidden abi_test_clobber_r9
+.align	16
+abi_test_clobber_r9:
+_CET_ENDBR
+	xorq	%r9,%r9
+	ret
+.size	abi_test_clobber_r9,.-abi_test_clobber_r9
+.type	abi_test_clobber_r10, @function
+.globl	abi_test_clobber_r10
+.hidden abi_test_clobber_r10
+.align	16
+abi_test_clobber_r10:
+_CET_ENDBR
+	xorq	%r10,%r10
+	ret
+.size	abi_test_clobber_r10,.-abi_test_clobber_r10
+.type	abi_test_clobber_r11, @function
+.globl	abi_test_clobber_r11
+.hidden abi_test_clobber_r11
+.align	16
+abi_test_clobber_r11:
+_CET_ENDBR
+	xorq	%r11,%r11
+	ret
+.size	abi_test_clobber_r11,.-abi_test_clobber_r11
+.type	abi_test_clobber_r12, @function
+.globl	abi_test_clobber_r12
+.hidden abi_test_clobber_r12
+.align	16
+abi_test_clobber_r12:
+_CET_ENDBR
+	xorq	%r12,%r12
+	ret
+.size	abi_test_clobber_r12,.-abi_test_clobber_r12
+.type	abi_test_clobber_r13, @function
+.globl	abi_test_clobber_r13
+.hidden abi_test_clobber_r13
+.align	16
+abi_test_clobber_r13:
+_CET_ENDBR
+	xorq	%r13,%r13
+	ret
+.size	abi_test_clobber_r13,.-abi_test_clobber_r13
+.type	abi_test_clobber_r14, @function
+.globl	abi_test_clobber_r14
+.hidden abi_test_clobber_r14
+.align	16
+abi_test_clobber_r14:
+_CET_ENDBR
+	xorq	%r14,%r14
+	ret
+.size	abi_test_clobber_r14,.-abi_test_clobber_r14
+.type	abi_test_clobber_r15, @function
+.globl	abi_test_clobber_r15
+.hidden abi_test_clobber_r15
+.align	16
+abi_test_clobber_r15:
+_CET_ENDBR
+	xorq	%r15,%r15
+	ret
+.size	abi_test_clobber_r15,.-abi_test_clobber_r15
+.type	abi_test_clobber_xmm0, @function
+.globl	abi_test_clobber_xmm0
+.hidden abi_test_clobber_xmm0
+.align	16
+abi_test_clobber_xmm0:
+_CET_ENDBR
+	pxor	%xmm0,%xmm0
+	ret
+.size	abi_test_clobber_xmm0,.-abi_test_clobber_xmm0
+.type	abi_test_clobber_xmm1, @function
+.globl	abi_test_clobber_xmm1
+.hidden abi_test_clobber_xmm1
+.align	16
+abi_test_clobber_xmm1:
+_CET_ENDBR
+	pxor	%xmm1,%xmm1
+	ret
+.size	abi_test_clobber_xmm1,.-abi_test_clobber_xmm1
+.type	abi_test_clobber_xmm2, @function
+.globl	abi_test_clobber_xmm2
+.hidden abi_test_clobber_xmm2
+.align	16
+abi_test_clobber_xmm2:
+_CET_ENDBR
+	pxor	%xmm2,%xmm2
+	ret
+.size	abi_test_clobber_xmm2,.-abi_test_clobber_xmm2
+.type	abi_test_clobber_xmm3, @function
+.globl	abi_test_clobber_xmm3
+.hidden abi_test_clobber_xmm3
+.align	16
+abi_test_clobber_xmm3:
+_CET_ENDBR
+	pxor	%xmm3,%xmm3
+	ret
+.size	abi_test_clobber_xmm3,.-abi_test_clobber_xmm3
+.type	abi_test_clobber_xmm4, @function
+.globl	abi_test_clobber_xmm4
+.hidden abi_test_clobber_xmm4
+.align	16
+abi_test_clobber_xmm4:
+_CET_ENDBR
+	pxor	%xmm4,%xmm4
+	ret
+.size	abi_test_clobber_xmm4,.-abi_test_clobber_xmm4
+.type	abi_test_clobber_xmm5, @function
+.globl	abi_test_clobber_xmm5
+.hidden abi_test_clobber_xmm5
+.align	16
+abi_test_clobber_xmm5:
+_CET_ENDBR
+	pxor	%xmm5,%xmm5
+	ret
+.size	abi_test_clobber_xmm5,.-abi_test_clobber_xmm5
+.type	abi_test_clobber_xmm6, @function
+.globl	abi_test_clobber_xmm6
+.hidden abi_test_clobber_xmm6
+.align	16
+abi_test_clobber_xmm6:
+_CET_ENDBR
+	pxor	%xmm6,%xmm6
+	ret
+.size	abi_test_clobber_xmm6,.-abi_test_clobber_xmm6
+.type	abi_test_clobber_xmm7, @function
+.globl	abi_test_clobber_xmm7
+.hidden abi_test_clobber_xmm7
+.align	16
+abi_test_clobber_xmm7:
+_CET_ENDBR
+	pxor	%xmm7,%xmm7
+	ret
+.size	abi_test_clobber_xmm7,.-abi_test_clobber_xmm7
+.type	abi_test_clobber_xmm8, @function
+.globl	abi_test_clobber_xmm8
+.hidden abi_test_clobber_xmm8
+.align	16
+abi_test_clobber_xmm8:
+_CET_ENDBR
+	pxor	%xmm8,%xmm8
+	ret
+.size	abi_test_clobber_xmm8,.-abi_test_clobber_xmm8
+.type	abi_test_clobber_xmm9, @function
+.globl	abi_test_clobber_xmm9
+.hidden abi_test_clobber_xmm9
+.align	16
+abi_test_clobber_xmm9:
+_CET_ENDBR
+	pxor	%xmm9,%xmm9
+	ret
+.size	abi_test_clobber_xmm9,.-abi_test_clobber_xmm9
+.type	abi_test_clobber_xmm10, @function
+.globl	abi_test_clobber_xmm10
+.hidden abi_test_clobber_xmm10
+.align	16
+abi_test_clobber_xmm10:
+_CET_ENDBR
+	pxor	%xmm10,%xmm10
+	ret
+.size	abi_test_clobber_xmm10,.-abi_test_clobber_xmm10
+.type	abi_test_clobber_xmm11, @function
+.globl	abi_test_clobber_xmm11
+.hidden abi_test_clobber_xmm11
+.align	16
+abi_test_clobber_xmm11:
+_CET_ENDBR
+	pxor	%xmm11,%xmm11
+	ret
+.size	abi_test_clobber_xmm11,.-abi_test_clobber_xmm11
+.type	abi_test_clobber_xmm12, @function
+.globl	abi_test_clobber_xmm12
+.hidden abi_test_clobber_xmm12
+.align	16
+abi_test_clobber_xmm12:
+_CET_ENDBR
+	pxor	%xmm12,%xmm12
+	ret
+.size	abi_test_clobber_xmm12,.-abi_test_clobber_xmm12
+.type	abi_test_clobber_xmm13, @function
+.globl	abi_test_clobber_xmm13
+.hidden abi_test_clobber_xmm13
+.align	16
+abi_test_clobber_xmm13:
+_CET_ENDBR
+	pxor	%xmm13,%xmm13
+	ret
+.size	abi_test_clobber_xmm13,.-abi_test_clobber_xmm13
+.type	abi_test_clobber_xmm14, @function
+.globl	abi_test_clobber_xmm14
+.hidden abi_test_clobber_xmm14
+.align	16
+abi_test_clobber_xmm14:
+_CET_ENDBR
+	pxor	%xmm14,%xmm14
+	ret
+.size	abi_test_clobber_xmm14,.-abi_test_clobber_xmm14
+.type	abi_test_clobber_xmm15, @function
+.globl	abi_test_clobber_xmm15
+.hidden abi_test_clobber_xmm15
+.align	16
+abi_test_clobber_xmm15:
+_CET_ENDBR
+	pxor	%xmm15,%xmm15
+	ret
+.size	abi_test_clobber_xmm15,.-abi_test_clobber_xmm15
+
+
+
+.type	abi_test_bad_unwind_wrong_register, @function
+.globl	abi_test_bad_unwind_wrong_register
+.hidden abi_test_bad_unwind_wrong_register
+.align	16
+abi_test_bad_unwind_wrong_register:
+.cfi_startproc	
+
+_CET_ENDBR
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-16
+
+
+
+
+	nop
+	popq	%r12
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r12
+	ret
+
+.cfi_endproc	
+.size	abi_test_bad_unwind_wrong_register,.-abi_test_bad_unwind_wrong_register
+
+
+
+
+.type	abi_test_bad_unwind_temporary, @function
+.globl	abi_test_bad_unwind_temporary
+.hidden abi_test_bad_unwind_temporary
+.align	16
+abi_test_bad_unwind_temporary:
+.cfi_startproc	
+
+_CET_ENDBR
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-16
+
+
+	movq	%r12,%rax
+	incq	%rax
+	movq	%rax,(%rsp)
+
+
+
+	movq	%r12,(%rsp)
+
+
+	popq	%r12
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r12
+	ret
+.cfi_endproc	
+
+.size	abi_test_bad_unwind_temporary,.-abi_test_bad_unwind_temporary
+
+
+
+
+.type	abi_test_get_and_clear_direction_flag, @function
+.globl	abi_test_get_and_clear_direction_flag
+.hidden abi_test_get_and_clear_direction_flag
+abi_test_get_and_clear_direction_flag:
+_CET_ENDBR
+	pushfq
+	popq	%rax
+	andq	$0x400,%rax
+	shrq	$10,%rax
+	cld
+	ret
+.size	abi_test_get_and_clear_direction_flag,.-abi_test_get_and_clear_direction_flag
+
+
+
+.type	abi_test_set_direction_flag, @function
+.globl	abi_test_set_direction_flag
+.hidden abi_test_set_direction_flag
+abi_test_set_direction_flag:
+_CET_ENDBR
+	std
+	ret
+.size	abi_test_set_direction_flag,.-abi_test_set_direction_flag
+#endif
diff --git a/gen/test_support/trampoline-x86_64-win.asm b/gen/test_support/trampoline-x86_64-win.asm
new file mode 100644
index 0000000..ae04cbe
--- /dev/null
+++ b/gen/test_support/trampoline-x86_64-win.asm
@@ -0,0 +1,715 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section	.text code align=64
+
+
+
+
+
+
+
+
+
+
+global	abi_test_trampoline
+ALIGN	16
+abi_test_trampoline:
+
+$L$SEH_begin_abi_test_trampoline_1:
+_CET_ENDBR
+
+
+
+
+
+
+
+
+
+	sub	rsp,344
+
+$L$SEH_prolog_abi_test_trampoline_2:
+	mov	QWORD[112+rsp],rbx
+
+$L$SEH_prolog_abi_test_trampoline_3:
+	mov	QWORD[120+rsp],rbp
+
+$L$SEH_prolog_abi_test_trampoline_4:
+	mov	QWORD[128+rsp],rdi
+
+$L$SEH_prolog_abi_test_trampoline_5:
+	mov	QWORD[136+rsp],rsi
+
+$L$SEH_prolog_abi_test_trampoline_6:
+	mov	QWORD[144+rsp],r12
+
+$L$SEH_prolog_abi_test_trampoline_7:
+	mov	QWORD[152+rsp],r13
+
+$L$SEH_prolog_abi_test_trampoline_8:
+	mov	QWORD[160+rsp],r14
+
+$L$SEH_prolog_abi_test_trampoline_9:
+	mov	QWORD[168+rsp],r15
+
+$L$SEH_prolog_abi_test_trampoline_10:
+	movdqa	XMMWORD[176+rsp],xmm6
+
+$L$SEH_prolog_abi_test_trampoline_11:
+	movdqa	XMMWORD[192+rsp],xmm7
+
+$L$SEH_prolog_abi_test_trampoline_12:
+	movdqa	XMMWORD[208+rsp],xmm8
+
+$L$SEH_prolog_abi_test_trampoline_13:
+	movdqa	XMMWORD[224+rsp],xmm9
+
+$L$SEH_prolog_abi_test_trampoline_14:
+	movdqa	XMMWORD[240+rsp],xmm10
+
+$L$SEH_prolog_abi_test_trampoline_15:
+	movdqa	XMMWORD[256+rsp],xmm11
+
+$L$SEH_prolog_abi_test_trampoline_16:
+	movdqa	XMMWORD[272+rsp],xmm12
+
+$L$SEH_prolog_abi_test_trampoline_17:
+	movdqa	XMMWORD[288+rsp],xmm13
+
+$L$SEH_prolog_abi_test_trampoline_18:
+	movdqa	XMMWORD[304+rsp],xmm14
+
+$L$SEH_prolog_abi_test_trampoline_19:
+	movdqa	XMMWORD[320+rsp],xmm15
+
+$L$SEH_prolog_abi_test_trampoline_20:
+	mov	rbx,QWORD[rdx]
+	mov	rbp,QWORD[8+rdx]
+	mov	rdi,QWORD[16+rdx]
+	mov	rsi,QWORD[24+rdx]
+	mov	r12,QWORD[32+rdx]
+	mov	r13,QWORD[40+rdx]
+	mov	r14,QWORD[48+rdx]
+	mov	r15,QWORD[56+rdx]
+	movdqa	xmm6,XMMWORD[64+rdx]
+	movdqa	xmm7,XMMWORD[80+rdx]
+	movdqa	xmm8,XMMWORD[96+rdx]
+	movdqa	xmm9,XMMWORD[112+rdx]
+	movdqa	xmm10,XMMWORD[128+rdx]
+	movdqa	xmm11,XMMWORD[144+rdx]
+	movdqa	xmm12,XMMWORD[160+rdx]
+	movdqa	xmm13,XMMWORD[176+rdx]
+	movdqa	xmm14,XMMWORD[192+rdx]
+	movdqa	xmm15,XMMWORD[208+rdx]
+
+	mov	QWORD[88+rsp],rcx
+	mov	QWORD[96+rsp],rdx
+
+
+
+
+	mov	r10,r8
+	mov	r11,r9
+	dec	r11
+	js	NEAR $L$args_done
+	mov	rcx,QWORD[r10]
+	add	r10,8
+	dec	r11
+	js	NEAR $L$args_done
+	mov	rdx,QWORD[r10]
+	add	r10,8
+	dec	r11
+	js	NEAR $L$args_done
+	mov	r8,QWORD[r10]
+	add	r10,8
+	dec	r11
+	js	NEAR $L$args_done
+	mov	r9,QWORD[r10]
+	add	r10,8
+	lea	rax,[32+rsp]
+$L$args_loop:
+	dec	r11
+	js	NEAR $L$args_done
+
+
+
+
+
+
+	mov	QWORD[104+rsp],r11
+	mov	r11,QWORD[r10]
+	mov	QWORD[rax],r11
+	mov	r11,QWORD[104+rsp]
+
+	add	r10,8
+	add	rax,8
+	jmp	NEAR $L$args_loop
+
+$L$args_done:
+	mov	rax,QWORD[88+rsp]
+	mov	r10,QWORD[384+rsp]
+	test	r10,r10
+	jz	NEAR $L$no_unwind
+
+
+	pushfq
+	or	QWORD[rsp],0x100
+	popfq
+
+
+
+	nop
+global	abi_test_unwind_start
+abi_test_unwind_start:
+
+	call	rax
+global	abi_test_unwind_return
+abi_test_unwind_return:
+
+
+
+
+	pushfq
+	and	QWORD[rsp],-0x101
+	popfq
+global	abi_test_unwind_stop
+abi_test_unwind_stop:
+
+	jmp	NEAR $L$call_done
+
+$L$no_unwind:
+	call	rax
+
+$L$call_done:
+
+	mov	rdx,QWORD[96+rsp]
+	mov	QWORD[rdx],rbx
+	mov	QWORD[8+rdx],rbp
+	mov	QWORD[16+rdx],rdi
+	mov	QWORD[24+rdx],rsi
+	mov	QWORD[32+rdx],r12
+	mov	QWORD[40+rdx],r13
+	mov	QWORD[48+rdx],r14
+	mov	QWORD[56+rdx],r15
+	movdqa	XMMWORD[64+rdx],xmm6
+	movdqa	XMMWORD[80+rdx],xmm7
+	movdqa	XMMWORD[96+rdx],xmm8
+	movdqa	XMMWORD[112+rdx],xmm9
+	movdqa	XMMWORD[128+rdx],xmm10
+	movdqa	XMMWORD[144+rdx],xmm11
+	movdqa	XMMWORD[160+rdx],xmm12
+	movdqa	XMMWORD[176+rdx],xmm13
+	movdqa	XMMWORD[192+rdx],xmm14
+	movdqa	XMMWORD[208+rdx],xmm15
+	mov	rbx,QWORD[112+rsp]
+
+	mov	rbp,QWORD[120+rsp]
+
+	mov	rdi,QWORD[128+rsp]
+
+	mov	rsi,QWORD[136+rsp]
+
+	mov	r12,QWORD[144+rsp]
+
+	mov	r13,QWORD[152+rsp]
+
+	mov	r14,QWORD[160+rsp]
+
+	mov	r15,QWORD[168+rsp]
+
+	movdqa	xmm6,XMMWORD[176+rsp]
+
+	movdqa	xmm7,XMMWORD[192+rsp]
+
+	movdqa	xmm8,XMMWORD[208+rsp]
+
+	movdqa	xmm9,XMMWORD[224+rsp]
+
+	movdqa	xmm10,XMMWORD[240+rsp]
+
+	movdqa	xmm11,XMMWORD[256+rsp]
+
+	movdqa	xmm12,XMMWORD[272+rsp]
+
+	movdqa	xmm13,XMMWORD[288+rsp]
+
+	movdqa	xmm14,XMMWORD[304+rsp]
+
+	movdqa	xmm15,XMMWORD[320+rsp]
+
+	add	rsp,344
+
+
+
+	ret
+
+$L$SEH_end_abi_test_trampoline_21:
+
+
+global	abi_test_clobber_rax
+ALIGN	16
+abi_test_clobber_rax:
+_CET_ENDBR
+	xor	rax,rax
+	ret
+
+
+global	abi_test_clobber_rbx
+ALIGN	16
+abi_test_clobber_rbx:
+_CET_ENDBR
+	xor	rbx,rbx
+	ret
+
+
+global	abi_test_clobber_rcx
+ALIGN	16
+abi_test_clobber_rcx:
+_CET_ENDBR
+	xor	rcx,rcx
+	ret
+
+
+global	abi_test_clobber_rdx
+ALIGN	16
+abi_test_clobber_rdx:
+_CET_ENDBR
+	xor	rdx,rdx
+	ret
+
+
+global	abi_test_clobber_rdi
+ALIGN	16
+abi_test_clobber_rdi:
+_CET_ENDBR
+	xor	rdi,rdi
+	ret
+
+
+global	abi_test_clobber_rsi
+ALIGN	16
+abi_test_clobber_rsi:
+_CET_ENDBR
+	xor	rsi,rsi
+	ret
+
+
+global	abi_test_clobber_rbp
+ALIGN	16
+abi_test_clobber_rbp:
+_CET_ENDBR
+	xor	rbp,rbp
+	ret
+
+
+global	abi_test_clobber_r8
+ALIGN	16
+abi_test_clobber_r8:
+_CET_ENDBR
+	xor	r8,r8
+	ret
+
+
+global	abi_test_clobber_r9
+ALIGN	16
+abi_test_clobber_r9:
+_CET_ENDBR
+	xor	r9,r9
+	ret
+
+
+global	abi_test_clobber_r10
+ALIGN	16
+abi_test_clobber_r10:
+_CET_ENDBR
+	xor	r10,r10
+	ret
+
+
+global	abi_test_clobber_r11
+ALIGN	16
+abi_test_clobber_r11:
+_CET_ENDBR
+	xor	r11,r11
+	ret
+
+
+global	abi_test_clobber_r12
+ALIGN	16
+abi_test_clobber_r12:
+_CET_ENDBR
+	xor	r12,r12
+	ret
+
+
+global	abi_test_clobber_r13
+ALIGN	16
+abi_test_clobber_r13:
+_CET_ENDBR
+	xor	r13,r13
+	ret
+
+
+global	abi_test_clobber_r14
+ALIGN	16
+abi_test_clobber_r14:
+_CET_ENDBR
+	xor	r14,r14
+	ret
+
+
+global	abi_test_clobber_r15
+ALIGN	16
+abi_test_clobber_r15:
+_CET_ENDBR
+	xor	r15,r15
+	ret
+
+
+global	abi_test_clobber_xmm0
+ALIGN	16
+abi_test_clobber_xmm0:
+_CET_ENDBR
+	pxor	xmm0,xmm0
+	ret
+
+
+global	abi_test_clobber_xmm1
+ALIGN	16
+abi_test_clobber_xmm1:
+_CET_ENDBR
+	pxor	xmm1,xmm1
+	ret
+
+
+global	abi_test_clobber_xmm2
+ALIGN	16
+abi_test_clobber_xmm2:
+_CET_ENDBR
+	pxor	xmm2,xmm2
+	ret
+
+
+global	abi_test_clobber_xmm3
+ALIGN	16
+abi_test_clobber_xmm3:
+_CET_ENDBR
+	pxor	xmm3,xmm3
+	ret
+
+
+global	abi_test_clobber_xmm4
+ALIGN	16
+abi_test_clobber_xmm4:
+_CET_ENDBR
+	pxor	xmm4,xmm4
+	ret
+
+
+global	abi_test_clobber_xmm5
+ALIGN	16
+abi_test_clobber_xmm5:
+_CET_ENDBR
+	pxor	xmm5,xmm5
+	ret
+
+
+global	abi_test_clobber_xmm6
+ALIGN	16
+abi_test_clobber_xmm6:
+_CET_ENDBR
+	pxor	xmm6,xmm6
+	ret
+
+
+global	abi_test_clobber_xmm7
+ALIGN	16
+abi_test_clobber_xmm7:
+_CET_ENDBR
+	pxor	xmm7,xmm7
+	ret
+
+
+global	abi_test_clobber_xmm8
+ALIGN	16
+abi_test_clobber_xmm8:
+_CET_ENDBR
+	pxor	xmm8,xmm8
+	ret
+
+
+global	abi_test_clobber_xmm9
+ALIGN	16
+abi_test_clobber_xmm9:
+_CET_ENDBR
+	pxor	xmm9,xmm9
+	ret
+
+
+global	abi_test_clobber_xmm10
+ALIGN	16
+abi_test_clobber_xmm10:
+_CET_ENDBR
+	pxor	xmm10,xmm10
+	ret
+
+
+global	abi_test_clobber_xmm11
+ALIGN	16
+abi_test_clobber_xmm11:
+_CET_ENDBR
+	pxor	xmm11,xmm11
+	ret
+
+
+global	abi_test_clobber_xmm12
+ALIGN	16
+abi_test_clobber_xmm12:
+_CET_ENDBR
+	pxor	xmm12,xmm12
+	ret
+
+
+global	abi_test_clobber_xmm13
+ALIGN	16
+abi_test_clobber_xmm13:
+_CET_ENDBR
+	pxor	xmm13,xmm13
+	ret
+
+
+global	abi_test_clobber_xmm14
+ALIGN	16
+abi_test_clobber_xmm14:
+_CET_ENDBR
+	pxor	xmm14,xmm14
+	ret
+
+
+global	abi_test_clobber_xmm15
+ALIGN	16
+abi_test_clobber_xmm15:
+_CET_ENDBR
+	pxor	xmm15,xmm15
+	ret
+
+
+
+
+
+global	abi_test_bad_unwind_wrong_register
+ALIGN	16
+abi_test_bad_unwind_wrong_register:
+
+$L$SEH_begin_abi_test_bad_unwind_wrong_register_1:
+_CET_ENDBR
+	push	r12
+
+$L$SEH_prolog_abi_test_bad_unwind_wrong_register_2:
+
+
+
+	nop
+	pop	r12
+
+	ret
+$L$SEH_end_abi_test_bad_unwind_wrong_register_3:
+
+
+
+
+
+
+
+global	abi_test_bad_unwind_temporary
+ALIGN	16
+abi_test_bad_unwind_temporary:
+
+$L$SEH_begin_abi_test_bad_unwind_temporary_1:
+_CET_ENDBR
+	push	r12
+
+$L$SEH_prolog_abi_test_bad_unwind_temporary_2:
+
+	mov	rax,r12
+	inc	rax
+	mov	QWORD[rsp],rax
+
+
+
+	mov	QWORD[rsp],r12
+
+
+	pop	r12
+
+	ret
+
+$L$SEH_end_abi_test_bad_unwind_temporary_3:
+
+
+
+
+
+
+global	abi_test_get_and_clear_direction_flag
+abi_test_get_and_clear_direction_flag:
+_CET_ENDBR
+	pushfq
+	pop	rax
+	and	rax,0x400
+	shr	rax,10
+	cld
+	ret
+
+
+
+
+
+global	abi_test_set_direction_flag
+abi_test_set_direction_flag:
+_CET_ENDBR
+	std
+	ret
+
+
+
+
+
+
+global	abi_test_bad_unwind_epilog
+ALIGN	16
+abi_test_bad_unwind_epilog:
+$L$SEH_begin_abi_test_bad_unwind_epilog_1:
+	push	r12
+$L$SEH_prolog_abi_test_bad_unwind_epilog_2:
+
+	nop
+
+
+	pop	r12
+	nop
+	ret
+$L$SEH_end_abi_test_bad_unwind_epilog_3:
+
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_abi_test_trampoline_1 wrt ..imagebase
+	DD	$L$SEH_end_abi_test_trampoline_21 wrt ..imagebase
+	DD	$L$SEH_info_abi_test_trampoline_0 wrt ..imagebase
+
+	DD	$L$SEH_begin_abi_test_bad_unwind_wrong_register_1 wrt ..imagebase
+	DD	$L$SEH_end_abi_test_bad_unwind_wrong_register_3 wrt ..imagebase
+	DD	$L$SEH_info_abi_test_bad_unwind_wrong_register_0 wrt ..imagebase
+
+	DD	$L$SEH_begin_abi_test_bad_unwind_temporary_1 wrt ..imagebase
+	DD	$L$SEH_end_abi_test_bad_unwind_temporary_3 wrt ..imagebase
+	DD	$L$SEH_info_abi_test_bad_unwind_temporary_0 wrt ..imagebase
+
+	DD	$L$SEH_begin_abi_test_bad_unwind_epilog_1 wrt ..imagebase
+	DD	$L$SEH_end_abi_test_bad_unwind_epilog_3 wrt ..imagebase
+	DD	$L$SEH_info_abi_test_bad_unwind_epilog_0 wrt ..imagebase
+
+
+section	.xdata rdata align=8
+ALIGN	4
+$L$SEH_info_abi_test_trampoline_0:
+	DB	1
+	DB	$L$SEH_prolog_abi_test_trampoline_20-$L$SEH_begin_abi_test_trampoline_1
+	DB	38
+	DB	0
+	DB	$L$SEH_prolog_abi_test_trampoline_20-$L$SEH_begin_abi_test_trampoline_1
+	DB	248
+	DW	20
+	DB	$L$SEH_prolog_abi_test_trampoline_19-$L$SEH_begin_abi_test_trampoline_1
+	DB	232
+	DW	19
+	DB	$L$SEH_prolog_abi_test_trampoline_18-$L$SEH_begin_abi_test_trampoline_1
+	DB	216
+	DW	18
+	DB	$L$SEH_prolog_abi_test_trampoline_17-$L$SEH_begin_abi_test_trampoline_1
+	DB	200
+	DW	17
+	DB	$L$SEH_prolog_abi_test_trampoline_16-$L$SEH_begin_abi_test_trampoline_1
+	DB	184
+	DW	16
+	DB	$L$SEH_prolog_abi_test_trampoline_15-$L$SEH_begin_abi_test_trampoline_1
+	DB	168
+	DW	15
+	DB	$L$SEH_prolog_abi_test_trampoline_14-$L$SEH_begin_abi_test_trampoline_1
+	DB	152
+	DW	14
+	DB	$L$SEH_prolog_abi_test_trampoline_13-$L$SEH_begin_abi_test_trampoline_1
+	DB	136
+	DW	13
+	DB	$L$SEH_prolog_abi_test_trampoline_12-$L$SEH_begin_abi_test_trampoline_1
+	DB	120
+	DW	12
+	DB	$L$SEH_prolog_abi_test_trampoline_11-$L$SEH_begin_abi_test_trampoline_1
+	DB	104
+	DW	11
+	DB	$L$SEH_prolog_abi_test_trampoline_10-$L$SEH_begin_abi_test_trampoline_1
+	DB	244
+	DW	21
+	DB	$L$SEH_prolog_abi_test_trampoline_9-$L$SEH_begin_abi_test_trampoline_1
+	DB	228
+	DW	20
+	DB	$L$SEH_prolog_abi_test_trampoline_8-$L$SEH_begin_abi_test_trampoline_1
+	DB	212
+	DW	19
+	DB	$L$SEH_prolog_abi_test_trampoline_7-$L$SEH_begin_abi_test_trampoline_1
+	DB	196
+	DW	18
+	DB	$L$SEH_prolog_abi_test_trampoline_6-$L$SEH_begin_abi_test_trampoline_1
+	DB	100
+	DW	17
+	DB	$L$SEH_prolog_abi_test_trampoline_5-$L$SEH_begin_abi_test_trampoline_1
+	DB	116
+	DW	16
+	DB	$L$SEH_prolog_abi_test_trampoline_4-$L$SEH_begin_abi_test_trampoline_1
+	DB	84
+	DW	15
+	DB	$L$SEH_prolog_abi_test_trampoline_3-$L$SEH_begin_abi_test_trampoline_1
+	DB	52
+	DW	14
+	DB	$L$SEH_prolog_abi_test_trampoline_2-$L$SEH_begin_abi_test_trampoline_1
+	DB	1
+	DW	43
+
+$L$SEH_info_abi_test_bad_unwind_wrong_register_0:
+	DB	1
+	DB	$L$SEH_prolog_abi_test_bad_unwind_wrong_register_2-$L$SEH_begin_abi_test_bad_unwind_wrong_register_1
+	DB	1
+	DB	0
+	DB	$L$SEH_prolog_abi_test_bad_unwind_wrong_register_2-$L$SEH_begin_abi_test_bad_unwind_wrong_register_1
+	DB	208
+
+$L$SEH_info_abi_test_bad_unwind_temporary_0:
+	DB	1
+	DB	$L$SEH_prolog_abi_test_bad_unwind_temporary_2-$L$SEH_begin_abi_test_bad_unwind_temporary_1
+	DB	1
+	DB	0
+	DB	$L$SEH_prolog_abi_test_bad_unwind_temporary_2-$L$SEH_begin_abi_test_bad_unwind_temporary_1
+	DB	192
+
+$L$SEH_info_abi_test_bad_unwind_epilog_0:
+	DB	1
+	DB	$L$SEH_prolog_abi_test_bad_unwind_epilog_2-$L$SEH_begin_abi_test_bad_unwind_epilog_1
+	DB	1
+	DB	0
+	DB	$L$SEH_prolog_abi_test_bad_unwind_epilog_2-$L$SEH_begin_abi_test_bad_unwind_epilog_1
+	DB	192
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/sources.cmake b/sources.cmake
index 94e1f5f..9f9d373 100644
--- a/sources.cmake
+++ b/sources.cmake
@@ -1,7 +1,7 @@
 # This file contains source lists that are also consumed by
 # generate_build_files.py.
 #
-# TODO(davidben): Move the other source lists into this file.
+# TODO(crbug.com/boringssl/542): Move everything here into util/pregenerate.
 
 set(
   CRYPTO_TEST_SOURCES
diff --git a/util/generate_build_files.py b/util/generate_build_files.py
index 1b34dc2..d564a17 100644
--- a/util/generate_build_files.py
+++ b/util/generate_build_files.py
@@ -23,46 +23,6 @@
 import json
 
 
-# OS_ARCH_COMBOS maps from OS and platform to the OpenSSL assembly "style" for
-# that platform and the extension used by asm files.
-#
-# TODO(https://crbug.com/boringssl/542): This probably should be a map, but some
-# downstream scripts import this to find what folders to add/remove from git.
-OS_ARCH_COMBOS = [
-    ('apple', 'aarch64', 'ios64', [], 'S'),
-    ('apple', 'x86', 'macosx', ['-fPIC'], 'S'),
-    ('apple', 'x86_64', 'macosx', [], 'S'),
-    ('linux', 'arm', 'linux32', [], 'S'),
-    ('linux', 'aarch64', 'linux64', [], 'S'),
-    ('linux', 'x86', 'elf', ['-fPIC'], 'S'),
-    ('linux', 'x86_64', 'elf', [], 'S'),
-    ('win', 'x86', 'win32n', [], 'asm'),
-    ('win', 'x86_64', 'nasm', [], 'asm'),
-    ('win', 'aarch64', 'win64', [], 'S'),
-]
-
-# NON_PERL_FILES enumerates assembly files that are not processed by the
-# perlasm system.
-NON_PERL_FILES = {
-    ('apple', 'x86_64'): [
-        'src/third_party/fiat/asm/fiat_curve25519_adx_mul.S',
-        'src/third_party/fiat/asm/fiat_curve25519_adx_square.S',
-        'src/third_party/fiat/asm/fiat_p256_adx_mul.S',
-        'src/third_party/fiat/asm/fiat_p256_adx_sqr.S',
-    ],
-    ('linux', 'arm'): [
-        'src/crypto/curve25519/asm/x25519-asm-arm.S',
-        'src/crypto/poly1305/poly1305_arm_asm.S',
-    ],
-    ('linux', 'x86_64'): [
-        'src/crypto/hrss/asm/poly_rq_mul.S',
-        'src/third_party/fiat/asm/fiat_curve25519_adx_mul.S',
-        'src/third_party/fiat/asm/fiat_curve25519_adx_square.S',
-        'src/third_party/fiat/asm/fiat_p256_adx_mul.S',
-        'src/third_party/fiat/asm/fiat_p256_adx_sqr.S',
-    ],
-}
-
 PREFIX = None
 EMBED_TEST_DATA = True
 
@@ -569,17 +529,6 @@
     with open('sources.json', 'w+') as f:
       json.dump(files, f, sort_keys=True, indent=2)
 
-def FindCMakeFiles(directory):
-  """Returns list of all CMakeLists.txt files recursively in directory."""
-  cmakefiles = []
-
-  for (path, _, filenames) in os.walk(directory):
-    for filename in filenames:
-      if filename == 'CMakeLists.txt':
-        cmakefiles.append(os.path.join(path, filename))
-
-  return cmakefiles
-
 def OnlyFIPSFragments(path, dent, is_dir):
   return is_dir or (path.startswith(
       os.path.join('src', 'crypto', 'fipsmodule', '')) and
@@ -679,85 +628,6 @@
   return hfiles
 
 
-def ExtractPerlAsmFromCMakeFile(cmakefile):
-  """Parses the contents of the CMakeLists.txt file passed as an argument and
-  returns a list of all the perlasm() directives found in the file."""
-  perlasms = []
-  with open(cmakefile) as f:
-    for line in f:
-      line = line.strip()
-      if not line.startswith('perlasm('):
-        continue
-      if not line.endswith(')'):
-        raise ValueError('Bad perlasm line in %s' % cmakefile)
-      # Remove "perlasm(" from start and ")" from end
-      params = line[8:-1].split()
-      if len(params) < 4:
-        raise ValueError('Bad perlasm line in %s' % cmakefile)
-      perlasms.append({
-          'arch': params[1],
-          'output': os.path.join(os.path.dirname(cmakefile), params[2]),
-          'input': os.path.join(os.path.dirname(cmakefile), params[3]),
-          'extra_args': params[4:],
-      })
-
-  return perlasms
-
-
-def ReadPerlAsmOperations():
-  """Returns a list of all perlasm() directives found in CMake config files in
-  src/."""
-  perlasms = []
-  cmakefiles = FindCMakeFiles('src')
-
-  for cmakefile in cmakefiles:
-    perlasms.extend(ExtractPerlAsmFromCMakeFile(cmakefile))
-
-  return perlasms
-
-
-def PerlAsm(output_filename, input_filename, perlasm_style, extra_args):
-  """Runs the a perlasm script and puts the output into output_filename."""
-  base_dir = os.path.dirname(output_filename)
-  if not os.path.isdir(base_dir):
-    os.makedirs(base_dir)
-  subprocess.check_call(
-      ['perl', input_filename, perlasm_style] + extra_args + [output_filename])
-
-
-def WriteAsmFiles(perlasms):
-  """Generates asm files from perlasm directives for each supported OS x
-  platform combination."""
-  asmfiles = {}
-
-  for perlasm in perlasms:
-    for (osname, arch, perlasm_style, extra_args, asm_ext) in OS_ARCH_COMBOS:
-      if arch != perlasm['arch']:
-        continue
-      # TODO(https://crbug.com/boringssl/542): Now that we incorporate osname in
-      # the output filename, the asm files can just go in a single directory.
-      # For now, we keep them in target-specific directories to avoid breaking
-      # downstream scripts.
-      key = (osname, arch)
-      outDir = '%s-%s' % key
-      output = perlasm['output']
-      if not output.startswith('src'):
-        raise ValueError('output missing src: %s' % output)
-      output = os.path.join(outDir, output[4:])
-      output = '%s-%s.%s' % (output, osname, asm_ext)
-      PerlAsm(output, perlasm['input'], perlasm_style,
-              extra_args + perlasm['extra_args'])
-      asmfiles.setdefault(key, []).append(output)
-
-  for (key, non_perl_asm_files) in NON_PERL_FILES.items():
-    asmfiles.setdefault(key, []).extend(non_perl_asm_files)
-
-  for files in asmfiles.values():
-    files.sort()
-
-  return asmfiles
-
-
 def ExtractVariablesFromCMakeFile(cmakefile):
   """Parses the contents of the CMakeLists.txt file passed as an argument and
   returns a dictionary of exported source lists."""
@@ -792,7 +662,12 @@
 
 
 def main(platforms):
+  # TODO(crbug.com/boringssl/542): Move everything to util/pregenerate and the
+  # new JSON file.
   cmake = ExtractVariablesFromCMakeFile(os.path.join('src', 'sources.cmake'))
+  with open(os.path.join('src', 'gen', 'sources.json')) as f:
+    sources = json.load(f)
+
   crypto_c_files = (FindCFiles(os.path.join('src', 'crypto'), NoTestsNorFIPSFragments) +
                     FindCFiles(os.path.join('src', 'third_party', 'fiat'), NoTestsNorFIPSFragments))
   fips_fragments = FindCFiles(os.path.join('src', 'crypto', 'fipsmodule'), OnlyFIPSFragments)
@@ -805,12 +680,7 @@
       os.path.join('src', 'crypto', 'fipsmodule', 'bcm.c')
   ]
 
-  # Generate err_data.c
-  with open('err_data.c', 'w+') as err_data:
-    subprocess.check_call(['go', 'run', 'err_data_generate.go'],
-                          cwd=os.path.join('src', 'crypto', 'err'),
-                          stdout=err_data)
-  crypto_c_files.append('err_data.c')
+  crypto_c_files += PrefixWithSrc(sources['crypto']['srcs'])
   crypto_c_files.sort()
 
   test_support_h_files = (
@@ -847,28 +717,19 @@
       FindHeaderFiles(os.path.join('src', 'crypto'), NoTests) +
       FindHeaderFiles(os.path.join('src', 'third_party', 'fiat'), NoTests))
 
-  asm_outputs = sorted(WriteAsmFiles(ReadPerlAsmOperations()).items())
-
-  # Generate combined source lists for gas and nasm. Some files appear in
-  # multiple per-platform lists, so we de-duplicate.
-  #
-  # TODO(https://crbug.com/boringssl/542): It would be simpler to build the
-  # combined source lists directly. This is a remnant of the previous assembly
-  # strategy. When we move to pre-generated assembly files, this will be
-  # removed.
-  asm_sources = set()
-  nasm_sources = set()
-  for ((osname, arch), asm_files) in asm_outputs:
-    if (osname, arch) in (('win', 'x86'), ('win', 'x86_64')):
-      nasm_sources.update(asm_files)
-    else:
-      asm_sources.update(asm_files)
+  # TODO(crbug.com/boringssl/542): generate_build_files.py historically reported
+  # all the assembly files as part of libcrypto. Merge them for now, but we
+  # should split them out later.
+  crypto_asm = sorted(sources['bcm']['asm'] + sources['crypto']['asm'] +
+                      sources['test_support']['asm'])
+  crypto_nasm = sorted(sources['bcm']['nasm'] + sources['crypto']['nasm'] +
+                       sources['test_support']['nasm'])
 
   files = {
       'bcm_crypto': bcm_crypto_c_files,
       'crypto': crypto_c_files,
-      'crypto_asm': sorted(list(asm_sources)),
-      'crypto_nasm': sorted(list(nasm_sources)),
+      'crypto_asm': PrefixWithSrc(crypto_asm),
+      'crypto_nasm': PrefixWithSrc(crypto_nasm),
       'crypto_headers': crypto_h_files,
       'crypto_internal_headers': crypto_internal_h_files,
       'crypto_test': crypto_test_files,
diff --git a/util/pregenerate/build.go b/util/pregenerate/build.go
new file mode 100644
index 0000000..5f60960
--- /dev/null
+++ b/util/pregenerate/build.go
@@ -0,0 +1,284 @@
+// Copyright (c) 2024, Google Inc.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+package main
+
+import (
+	"bytes"
+	"cmp"
+	"encoding/json"
+	"fmt"
+	"path"
+	"slices"
+	"strings"
+)
+
+// An OutputTarget is a build target for consumption by the downstream build
+// systems. All pre-generated files are baked into its source lists.
+type OutputTarget struct {
+	// Srcs is the list of C or C++ files (determined by file extension) that are
+	// built into the target.
+	Srcs []string `json:"srcs,omitempty"`
+	// Hdrs is the list of public headers that should be available to external
+	// projects using this target.
+	Hdrs []string `json:"hdrs,omitempty"`
+	// InternalHdrs is the list of internal headers that should be available to
+	// this target, as well as any internal targets using this target.
+	InternalHdrs []string `json:"internal_hdrs,omitempty"`
+	// Asm is the list of assembly files to be passed to a gas-compatible
+	// assembler.
+	Asm []string `json:"asm,omitempty"`
+	// Nasm is the list of assembly files to be passed to a nasm-compatible
+	// assembler.
+	Nasm []string `json:"nasm,omitempty"`
+	// Data is a list of test data files that should be available when the test is
+	// run.
+	Data []string `json:"data,omitempty"`
+}
+
+// An InputTarget is a build target with build inputs that still need to be
+// pregenerated.
+type InputTarget struct {
+	OutputTarget
+	// ErrData contains a list of errordata files to combine into err_data.c.
+	ErrData []string `json:"err_data,omitempty"`
+	// The following fields define perlasm sources for the corresponding
+	// architecture.
+	PerlasmAarch64 []PerlasmSource `json:"perlasm_aarch64,omitempty"`
+	PerlasmArm     []PerlasmSource `json:"perlasm_arm,omitempty"`
+	PerlasmX86     []PerlasmSource `json:"perlasm_x86,omitempty"`
+	PerlasmX86_64  []PerlasmSource `json:"perlasm_x86_64,omitempty"`
+}
+
+type PerlasmSource struct {
+	// Src is the path to the input perlasm file.
+	Src string `json:"src"`
+	// Dst, if not empty, is the base name of the destination file. If empty,
+	// it is determined from Src. It should be overridden if a single
+	// source file generates multiple functions (e.g. SHA-256 vs SHA-512) or
+	// multiple architectures (e.g. the "armx" files).
+	Dst string `json:"dst,omitempty"`
+	// Args is a list of extra parameters to pass to the script.
+	Args []string `json:"args,omitempty"`
+}
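+
+// As a sketch, a PerlasmSource entry in build.json might look like the
+// following, using the JSON tags above (the script path and target name are
+// illustrative, not authoritative):
+//
+//	"test_support": {
+//	    "perlasm_x86_64": [
+//	        {"src": "crypto/test/asm/trampoline-x86_64.pl"}
+//	    ]
+//	}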
+
+// Pregenerate converts an input target to an output target. It returns the
+// result alongside a list of tasks that must be run to build the referenced
+// files.
+func (in *InputTarget) Pregenerate(name string) (out OutputTarget, tasks []Task) {
+	out = in.OutputTarget
+
+	// Make copies of any fields we will write to.
+	out.Srcs = slices.Clone(out.Srcs)
+	out.Asm = slices.Clone(out.Asm)
+	out.Nasm = slices.Clone(out.Nasm)
+
+	addTask := func(list *[]string, t Task) {
+		tasks = append(tasks, t)
+		*list = append(*list, t.Destination())
+	}
+
+	if len(in.ErrData) != 0 {
+		addTask(&out.Srcs, &ErrDataTask{TargetName: name, Inputs: in.ErrData})
+	}
+
+	addPerlasmTask := func(list *[]string, p *PerlasmSource, fileSuffix string, args []string) {
+		dst := p.Dst
+		if len(p.Dst) == 0 {
+			dst = strings.TrimSuffix(path.Base(p.Src), ".pl")
+		}
+		dst = path.Join("gen", name, dst+fileSuffix)
+		args = append(slices.Clone(args), p.Args...)
+		addTask(list, &PerlasmTask{Src: p.Src, Dst: dst, Args: args})
+	}
+
+	for _, p := range in.PerlasmAarch64 {
+		addPerlasmTask(&out.Asm, &p, "-apple.S", []string{"ios64"})
+		addPerlasmTask(&out.Asm, &p, "-linux.S", []string{"linux64"})
+		addPerlasmTask(&out.Asm, &p, "-win.S", []string{"win64"})
+	}
+	for _, p := range in.PerlasmArm {
+		addPerlasmTask(&out.Asm, &p, "-linux.S", []string{"linux32"})
+	}
+	for _, p := range in.PerlasmX86 {
+		addPerlasmTask(&out.Asm, &p, "-apple.S", []string{"macosx", "-fPIC", "-DOPENSSL_IA32_SSE2"})
+		addPerlasmTask(&out.Asm, &p, "-linux.S", []string{"elf", "-fPIC", "-DOPENSSL_IA32_SSE2"})
+		addPerlasmTask(&out.Nasm, &p, "-win.asm", []string{"win32n", "-fPIC", "-DOPENSSL_IA32_SSE2"})
+	}
+	for _, p := range in.PerlasmX86_64 {
+		addPerlasmTask(&out.Asm, &p, "-apple.S", []string{"macosx"})
+		addPerlasmTask(&out.Asm, &p, "-linux.S", []string{"elf"})
+		addPerlasmTask(&out.Nasm, &p, "-win.asm", []string{"nasm"})
+	}
+
+	// Re-sort the modified fields.
+	slices.Sort(out.Srcs)
+	slices.Sort(out.Asm)
+	slices.Sort(out.Nasm)
+
+	return
+}
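+
+// As a worked example of the naming scheme (names hypothetical): a
+// PerlasmX86_64 entry with Src "foo.pl" in a target named "bcm" expands to
+// three tasks whose destinations land in the output target as follows:
+//
+//	out.Asm:  gen/bcm/foo-apple.S, gen/bcm/foo-linux.S
+//	out.Nasm: gen/bcm/foo-win.asm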
+
+func sortedKeys[K cmp.Ordered, V any](m map[K]V) []K {
+	keys := make([]K, 0, len(m))
+	for k := range m {
+		keys = append(keys, k)
+	}
+	slices.Sort(keys)
+	return keys
+}
+
+func writeHeader(b *bytes.Buffer, comment string) {
+	fmt.Fprintf(b, "%s Copyright (c) 2024, Google Inc.\n", comment)
+	fmt.Fprintf(b, "%s\n", comment)
+	fmt.Fprintf(b, "%s Permission to use, copy, modify, and/or distribute this software for any\n", comment)
+	fmt.Fprintf(b, "%s purpose with or without fee is hereby granted, provided that the above\n", comment)
+	fmt.Fprintf(b, "%s copyright notice and this permission notice appear in all copies.\n", comment)
+	fmt.Fprintf(b, "%s\n", comment)
+	fmt.Fprintf(b, "%s THE SOFTWARE IS PROVIDED \"AS IS\" AND THE AUTHOR DISCLAIMS ALL WARRANTIES\n", comment)
+	fmt.Fprintf(b, "%s WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF\n", comment)
+	fmt.Fprintf(b, "%s MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY\n", comment)
+	fmt.Fprintf(b, "%s SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES\n", comment)
+	fmt.Fprintf(b, "%s WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION\n", comment)
+	fmt.Fprintf(b, "%s OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN\n", comment)
+	fmt.Fprintf(b, "%s CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.\n", comment)
+	fmt.Fprintf(b, "%s\n", comment)
+	fmt.Fprintf(b, "%s Generated by go ./util/pregenerate. Do not edit manually.\n", comment)
+}
+
+func buildVariablesTask(targets map[string]OutputTarget, dst, comment string, writeVariable func(b *bytes.Buffer, name string, val []string)) Task {
+	return NewSimpleTask(dst, func() ([]byte, error) {
+		var b bytes.Buffer
+		writeHeader(&b, comment)
+
+		for _, name := range sortedKeys(targets) {
+			target := targets[name]
+			if len(target.Srcs) != 0 {
+				writeVariable(&b, name+"_sources", target.Srcs)
+			}
+			if len(target.Hdrs) != 0 {
+				writeVariable(&b, name+"_headers", target.Hdrs)
+			}
+			if len(target.InternalHdrs) != 0 {
+				writeVariable(&b, name+"_internal_headers", target.InternalHdrs)
+			}
+			if len(target.Asm) != 0 {
+				writeVariable(&b, name+"_sources_asm", target.Asm)
+			}
+			if len(target.Nasm) != 0 {
+				writeVariable(&b, name+"_sources_nasm", target.Nasm)
+			}
+			if len(target.Data) != 0 {
+				writeVariable(&b, name+"_data", target.Data)
+			}
+		}
+
+		return b.Bytes(), nil
+	})
+}
+
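+// writeBazelVariable writes a Bazel list of strings to b. For example, an
+// illustrative "crypto_sources" variable with one file would render as:
+//
+//	crypto_sources = [
+//	  "err_data.c",
+//	]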
+func writeBazelVariable(b *bytes.Buffer, name string, val []string) {
+	fmt.Fprintf(b, "\n%s = [\n", name)
+	for _, v := range val {
+		fmt.Fprintf(b, "  %q,\n", v)
+	}
+	fmt.Fprintf(b, "]\n")
+}
+
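+// writeCMakeVariable writes a CMake set() command to b, upper-casing the
+// variable name per CMake convention. The same illustrative variable would
+// render as:
+//
+//	set(
+//	  CRYPTO_SOURCES
+//
+//	  err_data.c
+//	)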
+func writeCMakeVariable(b *bytes.Buffer, name string, val []string) {
+	fmt.Fprintf(b, "\nset(\n")
+	fmt.Fprintf(b, "  %s\n\n", strings.ToUpper(name))
+	for _, v := range val {
+		fmt.Fprintf(b, "  %s\n", v)
+	}
+	fmt.Fprintf(b, ")\n")
+}
+
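+// writeMakeVariable writes a Make variable assignment to b, continuing every
+// line but the last with a backslash. For example:
+//
+//	crypto_sources := \
+//	  err_data.c \
+//	  a_bool.c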
+func writeMakeVariable(b *bytes.Buffer, name string, val []string) {
+	fmt.Fprintf(b, "\n%s := \\\n", name)
+	for i, v := range val {
+		if i == len(val)-1 {
+			fmt.Fprintf(b, "  %s\n", v)
+		} else {
+			fmt.Fprintf(b, "  %s \\\n", v)
+		}
+	}
+}
+
+func writeGNVariable(b *bytes.Buffer, name string, val []string) {
+	// Bazel and GN have similar syntax.
+	writeBazelVariable(b, name, val)
+}
+
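+// jsonTask returns a Task that writes targets to dst as indented JSON, for
+// tools such as generate_build_files.py to consume.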
+func jsonTask(targets map[string]OutputTarget, dst string) Task {
+	return NewSimpleTask(dst, func() ([]byte, error) {
+		return json.MarshalIndent(targets, "", "  ")
+	})
+}
+
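+// soongTask returns a Task that writes targets to dst as a series of
+// cc_defaults rules for Android's Soong build system.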
+func soongTask(targets map[string]OutputTarget, dst string) Task {
+	return NewSimpleTask(dst, func() ([]byte, error) {
+		var b bytes.Buffer
+		writeHeader(&b, "//")
+
+		writeAttribute := func(indent, name string, val []string) {
+			fmt.Fprintf(&b, "%s%s: [\n", indent, name)
+			for _, v := range val {
+				fmt.Fprintf(&b, "%s    %q,\n", indent, v)
+			}
+			fmt.Fprintf(&b, "%s],\n", indent)
+		}
+
+		for _, name := range sortedKeys(targets) {
+			target := targets[name]
+			fmt.Fprintf(&b, "\ncc_defaults {\n")
+			fmt.Fprintf(&b, "    name: %q,\n", "boringssl_"+name+"_sources")
+			if len(target.Srcs) != 0 {
+				writeAttribute("    ", "srcs", target.Srcs)
+			}
+			if len(target.Data) != 0 {
+				writeAttribute("    ", "data", target.Data)
+			}
+			if len(target.Asm) != 0 {
+				fmt.Fprintf(&b, "    target: {\n")
+				// Only emit asm for Linux. On Windows, BoringSSL requires NASM, which is
+				// not available in AOSP. On Darwin, the assembly works fine, but it
+				// conflicts with Android's FIPS build. See b/294399371.
+				fmt.Fprintf(&b, "        linux: {\n")
+				writeAttribute("            ", "srcs", target.Asm)
+				fmt.Fprintf(&b, "        },\n")
+				fmt.Fprintf(&b, "        darwin: {\n")
+				fmt.Fprintf(&b, "            cflags: [\"-DOPENSSL_NO_ASM\"],\n")
+				fmt.Fprintf(&b, "        },\n")
+				fmt.Fprintf(&b, "        windows: {\n")
+				fmt.Fprintf(&b, "            cflags: [\"-DOPENSSL_NO_ASM\"],\n")
+				fmt.Fprintf(&b, "        },\n")
+				fmt.Fprintf(&b, "    },\n")
+			}
+			fmt.Fprintf(&b, "},\n")
+		}
+
+		return b.Bytes(), nil
+	})
+}
+
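+// MakeBuildFiles returns the tasks that generate consolidated build files,
+// which list the files in each target for other build systems to consume.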
+func MakeBuildFiles(targets map[string]OutputTarget) []Task {
+	// TODO(crbug.com/boringssl/542): Generate the build files for the other
+	// types as well.
+	return []Task{
+		buildVariablesTask(targets, "gen/sources.cmake", "#", writeCMakeVariable),
+		jsonTask(targets, "gen/sources.json"),
+	}
+}
diff --git a/crypto/err/err_data_generate.go b/util/pregenerate/err_data.go
similarity index 85%
rename from crypto/err/err_data_generate.go
rename to util/pregenerate/err_data.go
index d4a7c28..8d89d99 100644
--- a/crypto/err/err_data_generate.go
+++ b/util/pregenerate/err_data.go
@@ -12,25 +12,20 @@
 // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 
-//go:build ignore
-
 package main
 
 import (
 	"bufio"
 	"bytes"
 	"errors"
-	"flag"
 	"fmt"
 	"io"
 	"os"
+	"path"
 	"sort"
 	"strconv"
-	"strings"
 )
 
-var verbose = flag.Bool("verbose", false, "If true, prints a status message at the end.")
-
 // libraryNames must be kept in sync with the enum in err.h. The generated code
 // will contain static assertions to enforce this.
 var libraryNames = []string{
@@ -129,10 +124,6 @@
 
 func (st *stringList) WriteTo(out stringWriter, name string) {
 	list := st.buildList()
-	if *verbose {
-		fmt.Fprintf(os.Stderr, "%s: %d bytes of list and %d bytes of string data.\n", name, 4*len(list), len(st.stringData))
-	}
-
 	values := "kOpenSSL" + name + "Values"
 	out.WriteString("const uint32_t " + values + "[] = {\n")
 	for _, v := range list {
@@ -207,9 +198,16 @@
 	return scanner.Err()
 }
 
-func main() {
-	flag.Parse()
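+// An ErrDataTask generates the err_data.c error tables for the target named
+// TargetName, from the listed .errordata inputs.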
+type ErrDataTask struct {
+	TargetName string
+	Inputs     []string
+}
 
+func (t *ErrDataTask) Destination() string {
+	return path.Join("gen", t.TargetName, "err_data.c")
+}
+
+func (t *ErrDataTask) Run() ([]byte, error) {
 	e := &errorData{
 		reasons:    newStringList(),
 		libraryMap: make(map[string]uint32),
@@ -218,27 +216,13 @@
 		e.libraryMap[name] = uint32(i) + 1
 	}
 
-	cwd, err := os.Open(".")
-	if err != nil {
-		panic(err)
-	}
-	names, err := cwd.Readdirnames(-1)
-	if err != nil {
-		panic(err)
-	}
-
-	sort.Strings(names)
-	for _, name := range names {
-		if !strings.HasSuffix(name, ".errordata") {
-			continue
-		}
-		if err := e.readErrorDataFile(name); err != nil {
-			panic(err)
+	for _, input := range t.Inputs {
+		if err := e.readErrorDataFile(input); err != nil {
+			return nil, err
 		}
 	}
 
-	out := os.Stdout
-
+	var out bytes.Buffer
 	out.WriteString(`/* Copyright (c) 2015, Google Inc.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
@@ -253,7 +237,7 @@
  * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
  * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
 
- /* This file was generated by err_data_generate.go. */
+ /* This file was generated by go run ./util/pregenerate. */
 
 #include <openssl/base.h>
 #include <openssl/err.h>
@@ -263,10 +247,11 @@
 `)
 
 	for i, name := range libraryNames {
-		fmt.Fprintf(out, "static_assert(ERR_LIB_%s == %d, \"library value changed\");\n", name, i+1)
+		fmt.Fprintf(&out, "static_assert(ERR_LIB_%s == %d, \"library value changed\");\n", name, i+1)
 	}
-	fmt.Fprintf(out, "static_assert(ERR_NUM_LIBS == %d, \"number of libraries changed\");\n", len(libraryNames)+1)
+	fmt.Fprintf(&out, "static_assert(ERR_NUM_LIBS == %d, \"number of libraries changed\");\n", len(libraryNames)+1)
 	out.WriteString("\n")
 
-	e.reasons.WriteTo(out, "Reason")
+	e.reasons.WriteTo(&out, "Reason")
+	return out.Bytes(), nil
 }
diff --git a/util/pregenerate/pregenerate.go b/util/pregenerate/pregenerate.go
new file mode 100644
index 0000000..ba062c7
--- /dev/null
+++ b/util/pregenerate/pregenerate.go
@@ -0,0 +1,218 @@
+// Copyright (c) 2024, Google Inc.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+// pregenerate manages the pre-generated files in BoringSSL.
+package main
+
+import (
+	"bytes"
+	"encoding/json"
+	"errors"
+	"flag"
+	"fmt"
+	"os"
+	"path/filepath"
+	"runtime"
+	"slices"
+	"strings"
+	"sync"
+)
+
+var (
+	check      = flag.Bool("check", false, "Check whether any files need to be updated, without actually updating them")
+	numWorkers = flag.Int("num-workers", runtime.NumCPU(), "Runs the given number of workers")
+	dryRun     = flag.Bool("dry-run", false, "Skip actually writing any files")
+	perlPath   = flag.String("perl", "perl", "Path to the perl command")
+	list       = flag.Bool("list", false, "List the files that would be generated, rather than generating them")
+)
+
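+// runTask runs t and writes its output to the destination. In -check mode, it
+// instead reports an error if the destination is missing or out of date, and
+// in -dry-run mode it only prints what would be written.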
+func runTask(t Task) error {
+	expected, err := t.Run()
+	if err != nil {
+		return err
+	}
+
+	dst := t.Destination()
+	dstPath := filepath.FromSlash(dst)
+	if *check {
+		actual, err := os.ReadFile(dstPath)
+		if err != nil {
+			if os.IsNotExist(err) {
+				err = errors.New("missing file")
+			}
+			return err
+		}
+
+		if !bytes.Equal(expected, actual) {
+			return errors.New("file out of date")
+		}
+		return nil
+	}
+
+	if *dryRun {
+		fmt.Printf("Would write %d bytes to %q\n", len(expected), dst)
+		return nil
+	}
+
+	if err := os.MkdirAll(filepath.Dir(dstPath), 0777); err != nil {
+		return err
+	}
+	return os.WriteFile(dstPath, expected, 0666)
+}
+
+type taskError struct {
+	dst string
+	err error
+}
+
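+// worker runs tasks from taskChan until the channel is closed, reporting any
+// failures to errorChan.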
+func worker(taskChan <-chan Task, errorChan chan<- taskError, wg *sync.WaitGroup) {
+	defer wg.Done()
+	for t := range taskChan {
+		if err := runTask(t); err != nil {
+			errorChan <- taskError{t.Destination(), err}
+		}
+	}
+}
+
+func run() error {
+	if _, err := os.Stat("BUILDING.md"); err != nil {
+		return fmt.Errorf("must be run from BoringSSL source root")
+	}
+
+	buildJSON, err := os.ReadFile("build.json")
+	if err != nil {
+		return err
+	}
+
+	// Remove comments. For now, just do a very basic preprocessing step. If
+	// needed, we can switch to something well-defined, like one of the many
+	// extended JSON dialects such as JSON5.
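+	// For example, a line like
+	//
+	//	"srcs": ["err_data.c"],  // keep sorted
+	//
+	// is truncated at the first "//", leaving only the JSON. (This would also
+	// truncate a "//" inside a string, so build.json must avoid them.)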
+	lines := bytes.Split(buildJSON, []byte("\n"))
+	for i := range lines {
+		if idx := bytes.Index(lines[i], []byte("//")); idx >= 0 {
+			lines[i] = lines[i][:idx]
+		}
+	}
+	buildJSON = bytes.Join(lines, []byte("\n"))
+
+	var targetsIn map[string]InputTarget
+	if err := json.Unmarshal(buildJSON, &targetsIn); err != nil {
+		return fmt.Errorf("error decoding build config: %s", err)
+	}
+
+	var tasks []Task
+	targetsOut := make(map[string]OutputTarget)
+	for name, targetIn := range targetsIn {
+		targetOut, targetTasks := targetIn.Pregenerate(name)
+		targetsOut[name] = targetOut
+		tasks = append(tasks, targetTasks...)
+	}
+
+	tasks = append(tasks, MakeBuildFiles(targetsOut)...)
+	tasks = append(tasks, NewSimpleTask("gen/README.md", func() ([]byte, error) {
+		return []byte(readme), nil
+	}))
+
+	// Filter tasks by command-line argument.
+	if args := flag.Args(); len(args) != 0 {
+		var filtered []Task
+		for _, t := range tasks {
+			dst := t.Destination()
+			for _, arg := range args {
+				if strings.Contains(dst, arg) {
+					filtered = append(filtered, t)
+					break
+				}
+			}
+		}
+		tasks = filtered
+	}
+
+	if *list {
+		paths := make([]string, len(tasks))
+		for i, t := range tasks {
+			paths[i] = t.Destination()
+		}
+		slices.Sort(paths)
+		for _, p := range paths {
+			fmt.Println(p)
+		}
+		return nil
+	}
+
+	// Schedule the tasks in parallel. Perlasm benefits from parallelism; the
+	// other tasks likely do not, but it is simpler to parallelize them all.
+	var wg sync.WaitGroup
+	taskChan := make(chan Task, *numWorkers)
+	errorChan := make(chan taskError, *numWorkers)
+	for i := 0; i < *numWorkers; i++ {
+		wg.Add(1)
+		go worker(taskChan, errorChan, &wg)
+	}
+
+	go func() {
+		for _, t := range tasks {
+			taskChan <- t
+		}
+		close(taskChan)
+		wg.Wait()
+		close(errorChan)
+	}()
+
+	var failed bool
+	for err := range errorChan {
+		fmt.Fprintf(os.Stderr, "Error in file %q: %s\n", err.dst, err.err)
+		failed = true
+	}
+	if failed {
+		return errors.New("some files had errors")
+	}
+	return nil
+}
+
+func main() {
+	flag.Parse()
+	if err := run(); err != nil {
+		fmt.Fprintf(os.Stderr, "Error: %s\n", err)
+		os.Exit(1)
+	}
+}
+
+const readme = `# Pre-generated files
+
+This directory contains a number of pre-generated build artifacts. To simplify
+downstream builds, they are checked into the repository, rather than dynamically
+generated as part of the build.
+
+When developing on BoringSSL, if any inputs to these files are modified, you
+must run the following command to update the generated files:
+
+    go run ./util/pregenerate
+
+To check that the files are up-to-date without updating them, run:
+
+    go run ./util/pregenerate -check
+
+This is run on CI to ensure the generated files remain up-to-date.
+
+To speed up local iteration, the tool accepts additional arguments to filter the
+files generated. For example, when editing ` + "`aesni-x86_64.pl`" + `, the
+following command will only update files whose paths contain "aesni-x86_64":
+
+    go run ./util/pregenerate aesni-x86_64
+
+For convenience, all files in this directory, including this README, are managed
+by the tool. This means the whole directory may be deleted and regenerated from
+scratch at any time.
+`
diff --git a/util/pregenerate/task.go b/util/pregenerate/task.go
new file mode 100644
index 0000000..f04fc43
--- /dev/null
+++ b/util/pregenerate/task.go
@@ -0,0 +1,82 @@
+// Copyright (c) 2024, Google Inc.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+package main
+
+import (
+	"bytes"
+	"os"
+	"os/exec"
+	"path"
+	"path/filepath"
+)
+
+type Task interface {
+	// Destination returns the destination path for this task, using forward
+	// slashes and relative to the source directory. That is, use the "path"
+	// package, not "path/filepath".
+	Destination() string
+
+	// Run computes the output for this task. It should be written to the
+	// destination path.
+	Run() ([]byte, error)
+}
+
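+// A SimpleTask is a Task with a fixed destination whose output is computed by
+// a callback.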
+type SimpleTask struct {
+	Dst     string
+	RunFunc func() ([]byte, error)
+}
+
+func (t *SimpleTask) Destination() string  { return t.Dst }
+func (t *SimpleTask) Run() ([]byte, error) { return t.RunFunc() }
+
+func NewSimpleTask(dst string, runFunc func() ([]byte, error)) *SimpleTask {
+	return &SimpleTask{Dst: dst, RunFunc: runFunc}
+}
+
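+// A PerlasmTask expands the perlasm file at Src, with the given arguments, and
+// writes the result to Dst.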
+type PerlasmTask struct {
+	Src, Dst string
+	Args     []string
+}
+
+func (t *PerlasmTask) Destination() string { return t.Dst }
+func (t *PerlasmTask) Run() ([]byte, error) {
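+	// The perlasm scripts take the output file as an argument, so direct the
+	// output to a temporary file and then read it back.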
+	base := path.Base(t.Dst)
+	out, err := os.CreateTemp("", "*."+base)
+	if err != nil {
+		return nil, err
+	}
+	defer os.Remove(out.Name())
+
+	args := make([]string, 0, 2+len(t.Args))
+	args = append(args, filepath.FromSlash(t.Src))
+	args = append(args, t.Args...)
+	args = append(args, out.Name())
+	cmd := exec.Command(*perlPath, args...)
+	cmd.Stderr = os.Stderr
+	cmd.Stdout = os.Stdout
+	if err := cmd.Run(); err != nil {
+		return nil, err
+	}
+
+	data, err := os.ReadFile(out.Name())
+	if err != nil {
+		return nil, err
+	}
+
+	// On Windows, perl emits CRLF line endings. Normalize this so that the tool
+	// can be run on Windows too.
+	data = bytes.ReplaceAll(data, []byte("\r\n"), []byte("\n"))
+	return data, nil
+}