First part of the FIPS module.

Change-Id: Ic3a91ccd2c8cdc364740f256fdb8a7ff66177947
Reviewed-on: https://boringssl-review.googlesource.com/14506
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/CMakeLists.txt b/crypto/fipsmodule/CMakeLists.txt
new file mode 100644
index 0000000..cd4a3d1
--- /dev/null
+++ b/crypto/fipsmodule/CMakeLists.txt
@@ -0,0 +1,179 @@
+include_directories(../../include)
+
+if (${ARCH} STREQUAL "x86_64")
+  set(
+    BCM_ASM_SOURCES
+
+    md5-x86_64.${ASM_EXT}
+    sha1-x86_64.${ASM_EXT}
+    sha256-x86_64.${ASM_EXT}
+    sha512-x86_64.${ASM_EXT}
+  )
+endif()
+
+if (${ARCH} STREQUAL "x86")
+  set(
+    BCM_ASM_SOURCES
+
+    md5-586.${ASM_EXT}
+    sha1-586.${ASM_EXT}
+    sha256-586.${ASM_EXT}
+    sha512-586.${ASM_EXT}
+  )
+endif()
+
+if (${ARCH} STREQUAL "arm")
+  set(
+    BCM_ASM_SOURCES
+
+    sha1-armv4-large.${ASM_EXT}
+    sha256-armv4.${ASM_EXT}
+    sha512-armv4.${ASM_EXT}
+  )
+endif()
+
+if (${ARCH} STREQUAL "aarch64")
+  set(
+    BCM_ASM_SOURCES
+
+    sha1-armv8.${ASM_EXT}
+    sha256-armv8.${ASM_EXT}
+    sha512-armv8.${ASM_EXT}
+  )
+endif()
+
+perlasm(md5-586.${ASM_EXT} md5/asm/md5-586.pl)
+perlasm(md5-x86_64.${ASM_EXT} md5/asm/md5-x86_64.pl)
+perlasm(sha1-586.${ASM_EXT} sha/asm/sha1-586.pl)
+perlasm(sha1-armv4-large.${ASM_EXT} sha/asm/sha1-armv4-large.pl)
+perlasm(sha1-armv8.${ASM_EXT} sha/asm/sha1-armv8.pl)
+perlasm(sha1-x86_64.${ASM_EXT} sha/asm/sha1-x86_64.pl)
+perlasm(sha256-586.${ASM_EXT} sha/asm/sha256-586.pl)
+perlasm(sha256-armv4.${ASM_EXT} sha/asm/sha256-armv4.pl)
+perlasm(sha256-armv8.${ASM_EXT} sha/asm/sha512-armv8.pl)
+perlasm(sha256-x86_64.${ASM_EXT} sha/asm/sha512-x86_64.pl)
+perlasm(sha512-586.${ASM_EXT} sha/asm/sha512-586.pl)
+perlasm(sha512-armv4.${ASM_EXT} sha/asm/sha512-armv4.pl)
+perlasm(sha512-armv8.${ASM_EXT} sha/asm/sha512-armv8.pl)
+perlasm(sha512-x86_64.${ASM_EXT} sha/asm/sha512-x86_64.pl)
+
+if(FIPS)
+  add_library(
+    bcm_c_generated_asm
+
+    STATIC
+
+    bcm.c
+  )
+
+  set_target_properties(bcm_c_generated_asm PROPERTIES COMPILE_OPTIONS "-S")
+  set_target_properties(bcm_c_generated_asm PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+  function(JOIN VALUES GLUE OUTPUT)
+    string(REPLACE ";" "${GLUE}" _TMP_STR "${VALUES}")
+    set(${OUTPUT} "${_TMP_STR}" PARENT_SCOPE)
+  endfunction()
+
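+  # JOIN turns the list "a.S;b.S" into "a.S,${CMAKE_CURRENT_BINARY_DIR}/b.S";
+  # prefixing the result with ${CMAKE_CURRENT_BINARY_DIR}/ in the command
+  # below then yields a comma-separated list of absolute paths for -as.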
+  JOIN("${BCM_ASM_SOURCES}" ",${CMAKE_CURRENT_BINARY_DIR}/" BCM_ASM_SOURCES_COMMA_SEP)
+
+  add_custom_command(
+    OUTPUT bcm-delocated.S
+    COMMAND ${GO_EXECUTABLE} run crypto/fipsmodule/delocate.go crypto/fipsmodule/ar.go -a $<TARGET_FILE:bcm_c_generated_asm> -as ${CMAKE_CURRENT_BINARY_DIR}/${BCM_ASM_SOURCES_COMMA_SEP} -o ${CMAKE_CURRENT_BINARY_DIR}/bcm-delocated.S
+    DEPENDS bcm_c_generated_asm ${BCM_ASM_SOURCES} delocate.go ar.go
+    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+  )
+
+  add_library(
+    bcm_hashunset
+
+    STATIC
+
+    bcm-delocated.S
+  )
+
+  set_target_properties(bcm_hashunset PROPERTIES POSITION_INDEPENDENT_CODE ON)
+  set_target_properties(bcm_hashunset PROPERTIES LINKER_LANGUAGE C)
+
+  add_executable(
+    bcm_hashunset_test
+
+    bcm_hashunset_test.c
+
+    $<TARGET_OBJECTS:crypto_base>
+    $<TARGET_OBJECTS:stack>
+    $<TARGET_OBJECTS:lhash>
+    $<TARGET_OBJECTS:err>
+    $<TARGET_OBJECTS:base64>
+    $<TARGET_OBJECTS:bytestring>
+    $<TARGET_OBJECTS:pool>
+    $<TARGET_OBJECTS:digest_extra>
+    $<TARGET_OBJECTS:cipher>
+    $<TARGET_OBJECTS:modes>
+    $<TARGET_OBJECTS:aes>
+    $<TARGET_OBJECTS:des>
+    $<TARGET_OBJECTS:rc4>
+    $<TARGET_OBJECTS:conf>
+    $<TARGET_OBJECTS:chacha>
+    $<TARGET_OBJECTS:poly1305>
+    $<TARGET_OBJECTS:curve25519>
+    $<TARGET_OBJECTS:buf>
+    $<TARGET_OBJECTS:bn>
+    $<TARGET_OBJECTS:bio>
+    $<TARGET_OBJECTS:rand>
+    $<TARGET_OBJECTS:obj>
+    $<TARGET_OBJECTS:asn1>
+    $<TARGET_OBJECTS:engine>
+    $<TARGET_OBJECTS:dh>
+    $<TARGET_OBJECTS:dsa>
+    $<TARGET_OBJECTS:rsa>
+    $<TARGET_OBJECTS:ec>
+    $<TARGET_OBJECTS:ecdh>
+    $<TARGET_OBJECTS:ecdsa>
+    $<TARGET_OBJECTS:cmac>
+    $<TARGET_OBJECTS:evp>
+    $<TARGET_OBJECTS:hkdf>
+    $<TARGET_OBJECTS:pem>
+    $<TARGET_OBJECTS:x509>
+    $<TARGET_OBJECTS:x509v3>
+    $<TARGET_OBJECTS:pkcs8_lib>
+  )
+
+  target_link_libraries(bcm_hashunset_test bcm_hashunset)
+
+  if(NOT MSVC AND NOT ANDROID)
+    target_link_libraries(bcm_hashunset_test pthread)
+  endif()
+
+  add_custom_command(
+    OUTPUT bcm.o
+    COMMAND ${GO_EXECUTABLE} run crypto/fipsmodule/inject-hash.go crypto/fipsmodule/ar.go -o ${CMAKE_CURRENT_BINARY_DIR}/bcm.o -in $<TARGET_FILE:bcm_hashunset> -bin $<TARGET_FILE:bcm_hashunset_test>
+    DEPENDS bcm_hashunset_test bcm_hashunset inject-hash.go ar.go
+    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+  )
+
+  # The outputs of add_custom_command cannot be referenced outside of the
+  # CMakeLists.txt that defines it. Thus we have to wrap bcm.o in a custom target
+  # so that crypto can depend on it.
+  add_custom_target(bcm_o_target DEPENDS bcm.o)
+
+  add_library(
+    fipsmodule
+
+    OBJECT
+
+    is_fips.c
+  )
+
+  set_target_properties(fipsmodule PROPERTIES LINKER_LANGUAGE C)
+else()
+  add_library(
+    fipsmodule
+
+    OBJECT
+
+    bcm.c
+    is_fips.c
+
+    ${BCM_ASM_SOURCES}
+  )
+endif()
diff --git a/crypto/fipsmodule/FIPS.md b/crypto/fipsmodule/FIPS.md
new file mode 100644
index 0000000..67ba095
--- /dev/null
+++ b/crypto/fipsmodule/FIPS.md
@@ -0,0 +1,74 @@
+## Integrity Test
+
+FIPS-140 mandates that a module calculate an HMAC of its own code in a constructor function and compare the result to a known-good value. Typical code produced by a C compiler includes large numbers of relocations: places in the machine code where the linker needs to resolve and inject the final value of a symbolic expression. These relocations mean that the bytes that make up any specific bit of code generally aren't known until the final link has completed.
+
+Additionally, because of shared libraries and ASLR, some relocations can only be resolved at run-time, and thus the targets of those relocations vary even after the final link.
+
+BoringSSL is linked (often statically) into a large number of binaries. It would be a significant cost if each of these binaries had to be post-processed in order to calculate the known-good HMAC value. We would much prefer if the value could be calculated, once, when BoringSSL itself is compiled.
+
+In order for the value to be calculated before the final link, there can be no relocations in the hashed code and data. This document describes how we build C and assembly code in order to produce an object file containing all the code and data for the FIPS module without that code having any relocations.
+
+First, all the C source files for the module are compiled as a single unit by compiling a single source file that `#include`s them all (this is `bcm.c`). The `-fPIC` flag is used to cause the compiler to use IP-relative addressing in many (but not all) cases. Also the `-S` flag is used to instruct the compiler to produce a textual assembly file rather than a binary object file.
+
+The textual assembly file is then processed by a script to merge in assembly implementations of some primitives and to eliminate the remaining sources of relocations.
+
+##### Redirector functions
+
+The most obvious causes of relocations are out-calls from the module to non-cryptographic functions outside of the module. These include `malloc`, `memcpy` and other libc functions, but also calls to support code in BoringSSL such as functions for managing the error queue.
+
+Offsets to these functions cannot be known until the final link because only the linker sees the object files containing them. Thus calls to these functions are rewritten into an IP-relative jump to a redirector function. Each redirector function contains a single jump instruction to the real function; the redirectors are placed outside of the module and are thus not hashed (see diagram).
+
+![module structure](/crypto/fipsmodule/intcheck1.png)
+
+In this diagram, the integrity check hashes from `module_start` to `module_end`. Since this does not cover the jump to `memcpy`, it's fine that the linker will poke the final offset into that instruction.
+
+##### Read-only data
+
+Normally read-only data is placed in a `.rodata` section that doesn't get mapped into memory with execute permissions. However, the offset of the data sections from the text segment is another thing that isn't determined until the final link. In order to fix data offsets before the link, read-only data is simply placed in the module's `.text` segment. This might make building ROP chains easier for an attacker, but so it goes.
+
+One special case is `rel.ro` data, which is data that contains function pointers. Since these function pointers are absolute, they are written by the dynamic linker at run-time and so we must eliminate them. The pattern that causes them is a static `EVP_MD` or `EVP_CIPHER` object. Thus, inside the module, we change this pattern to instead reserve space in the BSS for the object, and add a `CRYPTO_once_t` to protect its initialisation. Functions outside of the module are used to access the reserved space—they effectively act like special-purpose `malloc` calls that cannot fail. This is implemented by the `DEFINE_METHOD_FUNCTION` macro, shown in use below.
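+
+For example, `digests.c` (later in this change) defines its method tables this
+way. A static `const EVP_MD` initializer would create `rel.ro` data; the macro
+instead expands to a BSS-backed object with a once-guarded initializer:
+
+```c
+/* This would create rel.ro data (absolute function pointers):
+ *
+ *   static const EVP_MD md4_md = {NID_md4, MD4_DIGEST_LENGTH, 0, md4_init,
+ *                                 md4_update, md4_final, 64, sizeof(MD4_CTX)};
+ *
+ * Instead, the module defines: */
+DEFINE_METHOD_FUNCTION(EVP_MD, EVP_md4,
+                       {
+                           NID_md4, MD4_DIGEST_LENGTH, 0 /* flags */, md4_init,
+                           md4_update, md4_final, 64 /* block size */,
+                           sizeof(MD4_CTX),
+                       })
+```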
+
+##### Read-write data
+
+Mutable data is a problem. It cannot be in the text segment because the text segment is mapped read-only. If it's in a different segment then the code cannot reference it with a known, IP-relative offset because the segment layout is only fixed during the final link.
+
+Thankfully, mutable data is very rare in our cryptographic code and I hope that we can get it down to just a few variables. In order to allow this we use a similar design to the redirector functions: the code references a symbol that's in the text segment, but out of the module and thus not hashed. A relocation record is emitted to instruct the linker to poke the final offset to the variable in that location. Thus the only change needed is an extra indirection when loading the value.
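+
+`bcm.c` (later in this change) uses this pattern for the CPU-capability
+vector. A minimal sketch, in which `avx_enabled` is a hypothetical reader
+inside the module:
+
+```c
+/* Outside the module: the linker pokes the final address of
+ * |OPENSSL_ia32cap_P| into this one location, so the module itself needs no
+ * relocation. */
+NONMODULE_RODATA uint32_t *const OPENSSL_ia32cap_addr = OPENSSL_ia32cap_P;
+
+/* Inside the module, a hypothetical reader: one extra load replaces a
+ * direct, relocated reference. */
+static int avx_enabled(void) {
+  return (OPENSSL_ia32cap_addr[1] >> 28) & 1;
+}
+```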
+
+##### Other transforms
+
+The script performs a number of other transformations which are worth noting but do not warrant their own sections:
+
+1.  It duplicates each global symbol with a local symbol that has `_local_target` appended to the name. References to the global symbols are rewritten to use these duplicates instead. Otherwise, although the generated code uses IP-relative references, relocations are emitted for global symbols in case they are overridden by a different object file during the link.
+1.  Everything in the `nonmodule_rodata` and `nonmodule_text` sections is moved to the bottom of the text segment, outside the module.
+1.  Various sections, notably `.rodata`, are moved to the `.text` section, inside the module, so module code may reference them without relocations.
+1.  It inserts the labels that delimit the module's code and data (called `module_start` and `module_end` in the diagram above).
+1.  It rewrites some "dummy" references to point to those labels and to the array holding the known-good hash value. In order to get the C compiler to emit the correct code it's necessary to make it think that it's referencing static functions; this rewrite compensates for that trick (see the sketch below).
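+
+A sketch of that trick, as it appears in `bcm.c` later in this change:
+
+```c
+/* The compiler emits IP-relative references to these static functions; the
+ * script drops the functions and retargets the references at the real
+ * module-delimiting labels. */
+static void BORINGSSL_bcm_text_dummy_start(void) {}
+static void BORINGSSL_bcm_text_dummy_end(void) {}
+
+/* In the self-test, these become the module bounds: */
+const uint8_t *const start = (const uint8_t *)BORINGSSL_bcm_text_dummy_start;
+const uint8_t *const end = (const uint8_t *)BORINGSSL_bcm_text_dummy_end;
+```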
+
+##### Integrity testing
+
+In order to actually implement the integrity test, a constructor function within the module calculates an HMAC from `module_start` to `module_end` using a fixed, all-zero key. It compares the result with the known-good value added (by the script) to the unhashed portion of the text segment. If they don't match, it calls `exit` in an infinite loop.
+
+Initially the known-good value will be incorrect. Another script runs the module after an object file has been produced to get the calculated value, which it then injects back into the object file.
+
+![build process](/crypto/fipsmodule/intcheck2.png)
+
+### Comparison with OpenSSL's method
+
+(This is based on reading OpenSSL's [user guide](https://www.openssl.org/docs/fips/UserGuide-2.0.pdf) and inspecting the code of OpenSSL FIPS 2.0.12.)
+
+OpenSSL's solution to this problem is broadly similar but has a number of differences:
+
+1.  OpenSSL deals with run-time relocations by not hashing parts of the module's data. BoringSSL will eliminate run-time relocations instead and hash everything.
+1.  OpenSSL uses `ld -r` (the partial linking mode) to merge a number of object files into their `fipscanister.o`. For BoringSSL, we propose to merge all the C source files by building a single C file that `#include`s all the others, and to merge the assembly sources by concatenating them to the assembly output from the C compiler.
+1.  OpenSSL depends on the link order and inserts two object files, `fips_start.o` and `fips_end.o`, in order to establish the `module_start` and `module_end` values. BoringSSL proposes to simply add labels at the correct places in the assembly.
+1.  OpenSSL calculates the hash after the final link and either injects it into the binary or recompiles with the value of the hash passed in as a `#define`. BoringSSL calculates it prior to the final link and injects it into the object file.
+1.  OpenSSL references read-write data directly, since it can know the offsets to it. BoringSSL indirects these loads and stores.
+1.  OpenSSL doesn't run the power-on test until `FIPS_module_mode_set` is called, whereas BoringSSL plans to do it in a constructor function. Failure of the test is non-fatal in OpenSSL; BoringSSL plans to crash.
+1.  Since the contents of OpenSSL's module change between compilation and use, OpenSSL generates `fipscanister.o.sha1` to check that the compiled object doesn't change before linking. Since BoringSSL's module is fixed after compilation, the final integrity check is unbroken through the linking process.
+
+Some of the similarities are worth noting:
+
+1.  OpenSSL indirects all out-calls from the module via the PLT, which is equivalent to the redirector functions described above.
+
+
+![OpenSSL build process](/crypto/fipsmodule/intcheck3.png)
diff --git a/crypto/fipsmodule/ar.go b/crypto/fipsmodule/ar.go
new file mode 100644
index 0000000..b52a5a1
--- /dev/null
+++ b/crypto/fipsmodule/ar.go
@@ -0,0 +1,181 @@
+// Copyright (c) 2017, Google Inc.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+// ar.go contains functions for parsing .a archive files.
+
+package main
+
+import (
+	"fmt"
+	"io"
+	"io/ioutil"
+	"os"
+	"strconv"
+	"strings"
+	"time"
+)
+
+const (
+	// the string which begins a proper archive
+	arMagic = "!<arch>\n"
+
+	// the magic numbers for individual file headers
+	fileMagic = "`\n"
+
+	headerSize = 60
+)
+
+// An ARHeader represents a single header in an ar archive.
+type ARHeader struct {
+	Name    string
+	ModTime time.Time
+	UID     int
+	GID     int
+	Mode    os.FileMode
+	Size    int64
+}
+
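+// slicer is a cursor over a byte slice: next returns the first n bytes and
+// advances the slicer past them.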
+type slicer []byte
+
+func (sp *slicer) next(n int) (b []byte) {
+	s := *sp
+	b, *sp = s[0:n], s[n:]
+	return
+}
+
+// An AR provides sequential access to the contents of an ar archive.
+// The Next method advances to the next file in the archive (including
+// the first), after which the AR can be treated as an io.Reader to
+// access the file's data.
+type AR struct {
+	r   io.Reader
+	err error
+	nb  int64 // number of unread bytes for current file entry
+	pad int   // amount of padding after current file entry
+}
+
+// NewAR returns a reader for the members of the provided ar archive.
+func NewAR(r io.Reader) *AR {
+	magiclen := len(arMagic)
+	buf := make([]byte, magiclen)
+	_, err := io.ReadFull(r, buf)
+	if err != nil || arMagic != string(buf) {
+		err = fmt.Errorf("ar: bad magic number %v in ar file header", buf)
+	}
+	return &AR{r: r, err: err}
+}
+
+// Next advances the reader to the next file in the archive.
+func (ar *AR) Next() (*ARHeader, error) {
+	var hdr *ARHeader
+	if ar.err == nil {
+		ar.skipUnread()
+	}
+	if ar.err == nil {
+		hdr = ar.readHeader()
+	}
+	return hdr, ar.err
+}
+
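+// cvt parses a space- and NUL-padded numeric header field in the given base,
+// recording any parse failure in ar.err.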
+func (ar *AR) cvt(b []byte, base int) int64 {
+	// Removing leading spaces
+	for len(b) > 0 && b[0] == ' ' {
+		b = b[1:]
+	}
+	// Removing trailing NULs and spaces.
+	for len(b) > 0 && (b[len(b)-1] == ' ' || b[len(b)-1] == '\x00') {
+		b = b[:len(b)-1]
+	}
+
+	if len(b) == 0 {
+		return 0
+	}
+
+	x, err := strconv.ParseUint(string(b), base, 64)
+	if err != nil {
+		ar.err = err
+	}
+	return int64(x)
+}
+
+// Skip any unused bytes in the existing file entry, as well as any alignment padding.
+func (ar *AR) skipUnread() {
+	nr := ar.nb + int64(ar.pad)
+	ar.nb, ar.pad = 0, 0
+	if sr, ok := ar.r.(io.Seeker); ok {
+		if _, err := sr.Seek(nr, io.SeekCurrent); err == nil {
+			return
+		}
+	}
+	_, ar.err = io.CopyN(ioutil.Discard, ar.r, nr)
+}
+
+func (ar *AR) readHeader() *ARHeader {
+	var n int
+	header := make([]byte, headerSize)
+	n, ar.err = io.ReadFull(ar.r, header)
+	if ar.err == io.ErrUnexpectedEOF {
+		ar.err = fmt.Errorf("ar: short header in ar archive; got %d bytes, want %d", n, headerSize)
+	}
+	if ar.err != nil {
+		// io.EOF will get passed through
+		return nil
+	}
+
+	hdr := new(ARHeader)
+	s := slicer(header)
+
+	hdr.Name = strings.TrimRight(string(s.next(16)), " ")
+	hdr.Name = strings.TrimRight(hdr.Name, "/")
+	hdr.ModTime = time.Unix(ar.cvt(s.next(12), 10), 0)
+	hdr.UID = int(ar.cvt(s.next(6), 10))
+	hdr.GID = int(ar.cvt(s.next(6), 10))
+	hdr.Mode = os.FileMode(ar.cvt(s.next(8), 8))
+	hdr.Size = ar.cvt(s.next(10), 10)
+	magic := string(s.next(2))
+	if magic != fileMagic {
+		ar.err = fmt.Errorf("ar: bad magic number %v in ar member header", magic)
+		return nil
+	}
+
+	ar.nb = int64(hdr.Size)
+	// at most one pad byte just to be even
+	ar.pad = int(ar.nb & 1)
+
+	return hdr
+}
+
+// Read reads from the current entry in the ar archive.
+// It returns 0, io.EOF when it reaches the end of that entry,
+// until Next is called to advance to the next entry.
+func (ar *AR) Read(b []byte) (n int, err error) {
+	if ar.nb == 0 {
+		// file consumed
+		return 0, io.EOF
+	}
+
+	// trim read to the amount available
+	if int64(len(b)) > ar.nb {
+		b = b[0:ar.nb]
+	}
+
+	n, err = ar.r.Read(b)
+	ar.nb -= int64(n)
+	if err == io.EOF && ar.nb > 0 {
+		// archive ended while more file contents expected
+		err = io.ErrUnexpectedEOF
+	}
+	ar.err = err
+	return
+}
diff --git a/crypto/fipsmodule/bcm.c b/crypto/fipsmodule/bcm.c
new file mode 100644
index 0000000..54f2320
--- /dev/null
+++ b/crypto/fipsmodule/bcm.c
@@ -0,0 +1,100 @@
+/* Copyright (c) 2017, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <openssl/base.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <openssl/cpu.h>
+#include <openssl/crypto.h>
+#include <openssl/hmac.h>
+
+#include "../internal.h"
+#include "./delocate.h"
+
+#include "digest/digest.c"
+#include "digest/digests.c"
+#include "hmac/hmac.c"
+#include "md4/md4.c"
+#include "md5/md5.c"
+#include "sha/sha1-altivec.c"
+#include "sha/sha1.c"
+#include "sha/sha256.c"
+#include "sha/sha512.c"
+
+
+#if defined(BORINGSSL_FIPS)
+static void hexdump(const uint8_t *in, size_t len) {
+  for (size_t i = 0; i < len; i++) {
+    printf("%02x", in[i]);
+  }
+}
+
+/* These functions are removed by delocate.go and references to them are
+ * rewritten to point to the start and end of the module. */
+static void BORINGSSL_bcm_text_dummy_start(void) {}
+static void BORINGSSL_bcm_text_dummy_end(void) {}
+
+/* BORINGSSL_bcm_text_hash is outside the module so it may be filled in with the
+ * correct hash without a circular dependency. This must match the value used
+ * in inject-hash.go. */
+NONMODULE_RODATA static const uint8_t BORINGSSL_bcm_text_hash[32] = {
+    0x5f, 0x30, 0xd1, 0x80, 0xe7, 0x9e, 0x8f, 0x8f, 0xdf, 0x8b, 0x93,
+    0xd4, 0x96, 0x36, 0x30, 0xcc, 0x30, 0xea, 0x38, 0x0f, 0x75, 0x56,
+    0x9a, 0x1b, 0x23, 0x2f, 0x7c, 0x79, 0xff, 0x1b, 0x2b, 0xca,
+};
+
+static void BORINGSSL_bcm_power_on_self_test(void) __attribute__((constructor));
+
+static void BORINGSSL_bcm_power_on_self_test(void) {
+  CRYPTO_library_init();
+
+  const uint8_t *const start = (const uint8_t *)BORINGSSL_bcm_text_dummy_start;
+  const uint8_t *const end = (const uint8_t *)BORINGSSL_bcm_text_dummy_end;
+
+  static const uint8_t kHMACKey[32] = {0};
+  uint8_t result[SHA256_DIGEST_LENGTH];
+
+  unsigned result_len;
+  if (!HMAC(EVP_sha256(), kHMACKey, sizeof(kHMACKey), start, end - start,
+            result, &result_len) ||
+      result_len != sizeof(result)) {
+    goto err;
+  }
+
+  const uint8_t *const expected = BORINGSSL_bcm_text_hash;
+  if (OPENSSL_memcmp(expected, result, sizeof(result)) != 0) {
+    printf("FIPS integrity test failed.\nExpected: ");
+    hexdump(expected, sizeof(result));
+    printf("\nCalculated: ");
+    hexdump(result, sizeof(result));
+    printf("\n");
+    goto err;
+  }
+
+  // TODO(fips): KAT tests go here.
+
+  return;
+
+err:
+  for (;;) {
+    exit(1);
+    abort();
+  }
+}
+#endif  /* BORINGSSL_FIPS */
+
+#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
+/* OPENSSL_ia32cap_addr is outside the FIPS module so the FIPS module may locate
+ * the address of |OPENSSL_ia32cap_P| without a relocation. */
+NONMODULE_RODATA uint32_t *const OPENSSL_ia32cap_addr = OPENSSL_ia32cap_P;
+#endif
diff --git a/crypto/fipsmodule/bcm_hashunset_test.c b/crypto/fipsmodule/bcm_hashunset_test.c
new file mode 100644
index 0000000..c62761e
--- /dev/null
+++ b/crypto/fipsmodule/bcm_hashunset_test.c
@@ -0,0 +1,11 @@
+#include <openssl/digest.h>
+
+#if !defined(BORINGSSL_FIPS)
+#error "This file should not be built outside of the FIPS build."
+#endif
+
+int main(void) {
+  /* This program only needs to trigger the FIPS power-on self-test. */
+  EVP_sha256();
+  return 0;
+}
diff --git a/crypto/fipsmodule/delocate.go b/crypto/fipsmodule/delocate.go
new file mode 100644
index 0000000..ab14de5
--- /dev/null
+++ b/crypto/fipsmodule/delocate.go
@@ -0,0 +1,421 @@
+// Copyright (c) 2017, Google Inc.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+// delocate performs several transformations of textual assembly code. See
+// FIPS.md in this directory for an overview.
+package main
+
+import (
+	"bufio"
+	"flag"
+	"fmt"
+	"io"
+	"os"
+	"path/filepath"
+	"strings"
+	"unicode/utf8"
+)
+
+func main() {
+	// The .a file, if given, is expected to be an archive of textual
+	// assembly sources. That's odd, but CMake really wants to create
+	// archive files so it's the only way that we can make it work.
+	arInput := flag.String("a", "", "Path to a .a file containing assembly sources")
+
+	outFile := flag.String("o", "", "Path to output assembly")
+	asmFiles := flag.String("as", "", "Comma separated list of assembly inputs")
+
+	flag.Parse()
+
+	var lines []string
+	var err error
+	if len(*arInput) > 0 {
+		if lines, err = arLines(lines, *arInput); err != nil {
+			panic(err)
+		}
+	}
+
+	asPaths := strings.Split(*asmFiles, ",")
+	for i, path := range asPaths {
+		if lines, err = asLines(lines, path, i); err != nil {
+			panic(err)
+		}
+	}
+
+	symbols := definedSymbols(lines)
+	lines = transform(lines, symbols)
+
+	out, err := os.OpenFile(*outFile, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
+	if err != nil {
+		panic(err)
+	}
+	defer out.Close()
+
+	for _, line := range lines {
+		out.WriteString(line)
+		out.WriteString("\n")
+	}
+}
+
+// isSymbolDef detects whether line contains a (non-local) symbol
+// definition. If so, it returns the symbol and true. Otherwise it returns ""
+// and false.
+func isSymbolDef(line string) (string, bool) {
+	line = strings.TrimSpace(line)
+
+	if len(line) > 0 && line[len(line)-1] == ':' && line[0] != '.' {
+		symbol := line[:len(line)-1]
+		if validSymbolName(symbol) {
+			return symbol, true
+		}
+	}
+
+	return "", false
+}
+
+// definedSymbols finds all (non-local) symbols from lines and returns a map
+// from symbol name to whether or not that symbol is global.
+func definedSymbols(lines []string) map[string]bool {
+	globalSymbols := make(map[string]struct{})
+	symbols := make(map[string]bool)
+
+	for _, line := range lines {
+		if len(line) == 0 {
+			continue
+		}
+
+		if symbol, ok := isSymbolDef(line); ok {
+			_, isGlobal := globalSymbols[symbol]
+			symbols[symbol] = isGlobal
+		}
+
+		parts := strings.Fields(strings.TrimSpace(line))
+		if len(parts) >= 2 && parts[0] == ".globl" {
+			globalSymbols[parts[1]] = struct{}{}
+		}
+	}
+
+	return symbols
+}
+
+// transform performs a number of transformations on the given assembly code.
+// See FIPS.md in the current directory for an overview.
+func transform(lines []string, symbols map[string]bool) (ret []string) {
+	ret = append(ret, ".text", "BORINGSSL_bcm_text_start:")
+
+	// redirectors maps from out-call symbol name to the name of a
+	// redirector function for that symbol.
+	redirectors := make(map[string]string)
+
+	// extractedText contains lines that have been extracted from the
+	// assembly and need to be moved outside of the module. (This is used
+	// for the .init_array section that specifies constructors.)
+	var extractedText []string
+	extractingText := false
+
+	// dropping is true iff the current part of the assembly file is being
+	// dropped from the final output.
+	dropping := false
+
+	for lineNo, line := range lines {
+		if strings.Contains(line, "OPENSSL_ia32cap_P(%rip)") {
+			panic("reference to OPENSSL_ia32cap_P needs to be changed to indirect via OPENSSL_ia32cap_addr")
+		}
+
+		parts := strings.Fields(strings.TrimSpace(line))
+
+		if len(parts) == 0 {
+			ret = append(ret, line)
+			continue
+		}
+
+		switch parts[0] {
+		case ".type":
+			if strings.HasPrefix(parts[1], "BORINGSSL_bcm_text_dummy_") {
+				// This is a dummy function, drop it.
+				dropping = true
+			}
+
+		case ".size":
+			if strings.HasPrefix(parts[1], "BORINGSSL_bcm_text_dummy_") {
+				// End of dummy function that we dropped.
+				dropping = false
+				continue
+			}
+
+		case ".file":
+			// These directives must be included even when in a
+			// dummy function.
+			ret = append(ret, line)
+			continue
+		}
+
+		// Symbol definitions inside dropped functions cannot be
+		// eliminated because the debug information may reference them.
+		if dropping && !strings.HasSuffix(line, ":") {
+			continue
+		}
+
+		switch parts[0] {
+		case "call", "jmp":
+			target := parts[1]
+			// indirect via register or local label
+			if strings.HasPrefix(target, "*") || strings.HasPrefix(target, ".L") {
+				ret = append(ret, line)
+				continue
+			}
+
+			if isGlobal, ok := symbols[target]; ok {
+				newTarget := target
+				if isGlobal {
+					newTarget = localTargetName(target)
+				}
+				ret = append(ret, fmt.Sprintf("\t%s %s", parts[0], newTarget))
+				continue
+			}
+
+			redirectorName := "bcm_redirector_" + target
+
+			if strings.HasSuffix(target, "@PLT") {
+				withoutPLT := target[:len(target)-4]
+				if isGlobal, ok := symbols[withoutPLT]; ok {
+					newTarget := withoutPLT
+					if isGlobal {
+						newTarget = localTargetName(withoutPLT)
+					}
+					ret = append(ret, fmt.Sprintf("\t%s %s", parts[0], newTarget))
+					continue
+				}
+
+				redirectorName = redirectorName[:len(redirectorName)-4]
+			}
+
+			ret = append(ret, fmt.Sprintf("\t%s %s", parts[0], redirectorName))
+			redirectors[redirectorName] = target
+			continue
+
+		case ".section":
+			extractingText = false
+
+			p := strings.Split(parts[1], ",")
+			section := p[0]
+
+			if section == ".rodata" || section == ".text.startup" || strings.HasPrefix(section, ".rodata.") {
+				// Move .rodata to .text so it may be accessed
+				// without a relocation. GCC with
+				// -fmerge-constants will place strings into
+				// separate sections, so we move all sections
+				// named like .rodata. Also move .text.startup
+				// so the self-test function is also in the
+				// module.
+				ret = append(ret, ".text  # "+section)
+				break
+			}
+
+			switch section {
+			case ".data", ".data.rel.ro.local":
+				panic(fmt.Sprintf("bad section %q on line %d", parts[1], lineNo+1))
+
+			case ".init_array":
+				// init_array contains function pointers to
+				// constructor functions. Since these must be
+				// relocated, this section is moved to the end
+				// of the file.
+				extractedText = append(extractedText, line)
+				extractingText = true
+
+			case "nonmodule_rodata", "nonmodule_text":
+				// Symbols in either nonmodule_rodata or
+				// nonmodule_text get moved to the text section
+				// outside the module. These are used to avoid
+				// relocations in referencing data.
+				extractedText = append(extractedText, ".text  # "+section)
+				extractingText = true
+
+			default:
+				ret = append(ret, line)
+			}
+
+		case ".text":
+			extractingText = false
+			fallthrough
+
+		default:
+			if extractingText {
+				extractedText = append(extractedText, line)
+				continue
+			}
+
+			if symbol, ok := isSymbolDef(line); ok {
+				if isGlobal := symbols[symbol]; isGlobal {
+					ret = append(ret, localTargetName(symbol)+":")
+				}
+			}
+
+			if parts[0] == "leaq" {
+				line = strings.Replace(line, "BORINGSSL_bcm_text_dummy_", "BORINGSSL_bcm_text_", -1)
+			}
+			ret = append(ret, line)
+		}
+	}
+
+	ret = append(ret, "BORINGSSL_bcm_text_end:")
+
+	// Emit redirector functions. Each is a single JMP instruction.
+	for redirectorName, target := range redirectors {
+		ret = append(ret, ".type "+redirectorName+", @function")
+		ret = append(ret, redirectorName+":")
+		ret = append(ret, "\tjmp "+target)
+	}
+
+	ret = append(ret, extractedText...)
+
+	return ret
+}
+
+// localTargetName returns the name of the local target label for a global
+// symbol named name.
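+// For example, "foo" maps to ".Lfoo_local_target".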
+func localTargetName(name string) string {
+	return ".L" + name + "_local_target"
+}
+
+// asLines appends the contents of path to lines. Local symbols are renamed
+// using uniqueId to avoid collisions.
+func asLines(lines []string, path string, uniqueId int) ([]string, error) {
+	basename := symbolRuneOrUnderscore(filepath.Base(path))
+
+	asFile, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer asFile.Close()
+
+	var contents []string
+
+	// localSymbols maps from the symbol name used in the input, to a
+	// unique symbol name.
+	localSymbols := make(map[string]string)
+
+	scanner := bufio.NewScanner(asFile)
+	for scanner.Scan() {
+		line := scanner.Text()
+		trimmed := strings.TrimSpace(line)
+		if strings.HasPrefix(trimmed, ".L") && strings.HasSuffix(trimmed, ":") {
+			symbol := trimmed[:len(trimmed)-1]
+			mappedSymbol := fmt.Sprintf(".L%s_%d_%s", basename, uniqueId, symbol[2:])
+			localSymbols[symbol] = mappedSymbol
+			contents = append(contents, mappedSymbol+":")
+			continue
+		}
+
+		contents = append(contents, scanner.Text())
+	}
+	if err := scanner.Err(); err != nil {
+		return nil, err
+	}
+
+	for _, line := range contents {
+		for symbol, mappedSymbol := range localSymbols {
+			for i := strings.Index(line, symbol); i >= 0; {
+				before := ' '
+				if i > 0 {
+					before, _ = utf8.DecodeLastRuneInString(line[:i])
+				}
+
+				after, _ := utf8.DecodeRuneInString(line[i+len(symbol):])
+
+				if !symbolRune(before) && !symbolRune(after) {
+					// Replace at this exact position, not merely the
+					// first occurrence in the (possibly already
+					// rewritten) line.
+					line = line[:i] + mappedSymbol + line[i+len(symbol):]
+					i += len(mappedSymbol)
+				} else {
+					i += len(symbol)
+				}
+
+				// strings.Index returns an offset relative to line[i:],
+				// so convert it back to an absolute index into line.
+				if rel := strings.Index(line[i:], symbol); rel >= 0 {
+					i += rel
+				} else {
+					i = -1
+				}
+			}
+		}
+
+		lines = append(lines, line)
+	}
+
+	return lines, nil
+}
+
+func arLines(lines []string, arPath string) ([]string, error) {
+	arFile, err := os.Open(arPath)
+	if err != nil {
+		return nil, err
+	}
+	defer arFile.Close()
+
+	ar := NewAR(arFile)
+
+	for {
+		header, err := ar.Next()
+		if err == io.EOF {
+			return lines, nil
+		}
+		if err != nil {
+			return nil, err
+		}
+
+		if len(header.Name) == 0 {
+			continue
+		}
+
+		scanner := bufio.NewScanner(ar)
+		for scanner.Scan() {
+			lines = append(lines, scanner.Text())
+		}
+		if err := scanner.Err(); err != nil {
+			return nil, err
+		}
+	}
+}
+
+// validSymbolName returns true if s is a valid (non-local) name for a symbol.
+func validSymbolName(s string) bool {
+	if len(s) == 0 {
+		return false
+	}
+
+	r, n := utf8.DecodeRuneInString(s)
+	// symbols don't start with a digit.
+	if r == utf8.RuneError || !symbolRune(r) || ('0' <= s[0] && s[0] <= '9') {
+		return false
+	}
+
+	return strings.IndexFunc(s[n:], func(r rune) bool {
+		return !symbolRune(r)
+	}) == -1
+}
+
+// symbolRune returns true if r is valid in a symbol name.
+func symbolRune(r rune) bool {
+	return (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9') || r == '$' || r == '_'
+}
+
+// symbolRuneOrUnderscore returns a copy of s where runes valid in a symbol
+// name map to themselves and all other runes map to underscore.
+func symbolRuneOrUnderscore(s string) string {
+	runes := make([]rune, 0, len(s))
+
+	for _, r := range s {
+		if symbolRune(r) {
+			runes = append(runes, r)
+		} else {
+			runes = append(runes, '_')
+		}
+	}
+
+	return string(runes)
+}
diff --git a/crypto/fipsmodule/delocate.h b/crypto/fipsmodule/delocate.h
new file mode 100644
index 0000000..753a5da
--- /dev/null
+++ b/crypto/fipsmodule/delocate.h
@@ -0,0 +1,78 @@
+/* Copyright (c) 2017, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#ifndef OPENSSL_HEADER_FIPSMODULE_DELOCATE_H
+#define OPENSSL_HEADER_FIPSMODULE_DELOCATE_H
+
+#include <openssl/base.h>
+
+#include "../internal.h"
+
+
+#if defined(BORINGSSL_FIPS)
+
+/* NONMODULE_RODATA, in FIPS mode, causes delocate.go to move the specified
+ * const global to the unhashed non-module area of code located after the
+ * module. */
+#define NONMODULE_RODATA __attribute__((section("nonmodule_rodata")))
+
+/* NONMODULE_TEXT, in FIPS mode, causes delocate.go to move the specified
+ * function to the unhashed non-module area of code located after the module. We
+ * mark such functions noinline to prevent module callers from inlining the
+ * relocations into themselves. */
+#define NONMODULE_TEXT __attribute__((section("nonmodule_text"), noinline))
+
+/* DEFINE_METHOD_FUNCTION defines a function named |name| which returns a method
+ * table of type const |type|*, initialized by |initializer|. In FIPS mode, to
+ * avoid rel.ro data, it is split into a CRYPTO_once_t-guarded initializer in
+ * the module and unhashed, non-module accessor functions to space reserved in
+ * the BSS. This does not use a static initializer because the execution order
+ * of static initializers is undefined. See FIPS.md for more details.
+ *
+ * __VA_ARGS__ is used to get around macros not allowing arguments with
+ * commas. */
+#define DEFINE_METHOD_FUNCTION(type, name, ... /* initializer */) \
+  NONMODULE_TEXT static type *name##_bss_get(void) {              \
+    static type ret;                                              \
+    return &ret;                                                  \
+  }                                                               \
+                                                                  \
+  NONMODULE_TEXT static CRYPTO_once_t *name##_once_get(void) {    \
+    static CRYPTO_once_t ret = CRYPTO_ONCE_INIT;                  \
+    return &ret;                                                  \
+  }                                                               \
+                                                                  \
+  static void name##_init(void) {                                 \
+    type ret = __VA_ARGS__;                                       \
+    OPENSSL_memcpy(name##_bss_get(), &ret, sizeof(ret));          \
+  }                                                               \
+                                                                  \
+  const type *name(void) {                                        \
+    CRYPTO_once(name##_once_get(), name##_init);                  \
+    return name##_bss_get();                                      \
+  }
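+
+/* As an illustration, for a hypothetical |EVP_foo| the FIPS definition above
+ * expands (roughly) to:
+ *
+ *   NONMODULE_TEXT static EVP_MD *EVP_foo_bss_get(void) {
+ *     static EVP_MD ret;
+ *     return &ret;
+ *   }
+ *   NONMODULE_TEXT static CRYPTO_once_t *EVP_foo_once_get(void) {
+ *     static CRYPTO_once_t ret = CRYPTO_ONCE_INIT;
+ *     return &ret;
+ *   }
+ *   static void EVP_foo_init(void) {
+ *     EVP_MD ret = { ... };
+ *     OPENSSL_memcpy(EVP_foo_bss_get(), &ret, sizeof(ret));
+ *   }
+ *   const EVP_MD *EVP_foo(void) {
+ *     CRYPTO_once(EVP_foo_once_get(), EVP_foo_init);
+ *     return EVP_foo_bss_get();
+ *   }
+ */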
+
+#else /* !BORINGSSL_FIPS */
+
+#define NONMODULE_RODATA
+#define NONMODULE_TEXT
+#define DEFINE_METHOD_FUNCTION(type, name, ...) \
+  const type *name(void) {                      \
+    static const type ret = __VA_ARGS__;        \
+    return &ret;                                \
+  }
+
+#endif /* BORINGSSL_FIPS */
+
+#endif /* OPENSSL_HEADER_FIPSMODULE_DELOCATE_H */
diff --git a/crypto/fipsmodule/digest/digest.c b/crypto/fipsmodule/digest/digest.c
new file mode 100644
index 0000000..00e6d4b
--- /dev/null
+++ b/crypto/fipsmodule/digest/digest.c
@@ -0,0 +1,251 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.] */
+
+#include <openssl/digest.h>
+
+#include <assert.h>
+#include <string.h>
+
+#include <openssl/err.h>
+#include <openssl/mem.h>
+
+#include "internal.h"
+#include "../../internal.h"
+
+
+int EVP_MD_type(const EVP_MD *md) { return md->type; }
+
+uint32_t EVP_MD_flags(const EVP_MD *md) { return md->flags; }
+
+size_t EVP_MD_size(const EVP_MD *md) { return md->md_size; }
+
+size_t EVP_MD_block_size(const EVP_MD *md) { return md->block_size; }
+
+
+void EVP_MD_CTX_init(EVP_MD_CTX *ctx) {
+  OPENSSL_memset(ctx, 0, sizeof(EVP_MD_CTX));
+}
+
+EVP_MD_CTX *EVP_MD_CTX_create(void) {
+  EVP_MD_CTX *ctx = OPENSSL_malloc(sizeof(EVP_MD_CTX));
+
+  if (ctx) {
+    EVP_MD_CTX_init(ctx);
+  }
+
+  return ctx;
+}
+
+int EVP_MD_CTX_cleanup(EVP_MD_CTX *ctx) {
+  if (ctx->digest && ctx->digest->ctx_size && ctx->md_data) {
+    OPENSSL_cleanse(ctx->md_data, ctx->digest->ctx_size);
+    OPENSSL_free(ctx->md_data);
+  }
+
+  assert(ctx->pctx == NULL || ctx->pctx_ops != NULL);
+  if (ctx->pctx_ops) {
+    ctx->pctx_ops->free(ctx->pctx);
+  }
+
+  EVP_MD_CTX_init(ctx);
+
+  return 1;
+}
+
+void EVP_MD_CTX_destroy(EVP_MD_CTX *ctx) {
+  if (!ctx) {
+    return;
+  }
+
+  EVP_MD_CTX_cleanup(ctx);
+  OPENSSL_free(ctx);
+}
+
+int EVP_MD_CTX_copy_ex(EVP_MD_CTX *out, const EVP_MD_CTX *in) {
+  uint8_t *tmp_buf = NULL;
+
+  if (in == NULL || in->digest == NULL) {
+    OPENSSL_PUT_ERROR(DIGEST, DIGEST_R_INPUT_NOT_INITIALIZED);
+    return 0;
+  }
+
+  if (out->digest == in->digest) {
+    /* |md_data| will be the correct size in this case so it's removed from
+     * |out| at this point so that |EVP_MD_CTX_cleanup| doesn't free it and
+     * then it's reused. */
+    tmp_buf = out->md_data;
+    out->md_data = NULL;
+  }
+
+  EVP_MD_CTX_cleanup(out);
+
+  out->digest = in->digest;
+  if (in->md_data && in->digest->ctx_size) {
+    if (tmp_buf) {
+      out->md_data = tmp_buf;
+    } else {
+      out->md_data = OPENSSL_malloc(in->digest->ctx_size);
+      if (!out->md_data) {
+        OPENSSL_PUT_ERROR(DIGEST, ERR_R_MALLOC_FAILURE);
+        return 0;
+      }
+    }
+    OPENSSL_memcpy(out->md_data, in->md_data, in->digest->ctx_size);
+  }
+
+  assert(in->pctx == NULL || in->pctx_ops != NULL);
+  out->pctx_ops = in->pctx_ops;
+  if (in->pctx && in->pctx_ops) {
+    out->pctx = in->pctx_ops->dup(in->pctx);
+    if (!out->pctx) {
+      EVP_MD_CTX_cleanup(out);
+      return 0;
+    }
+  }
+
+  return 1;
+}
+
+int EVP_MD_CTX_copy(EVP_MD_CTX *out, const EVP_MD_CTX *in) {
+  EVP_MD_CTX_init(out);
+  return EVP_MD_CTX_copy_ex(out, in);
+}
+
+int EVP_DigestInit_ex(EVP_MD_CTX *ctx, const EVP_MD *type, ENGINE *engine) {
+  if (ctx->digest != type) {
+    if (ctx->digest && ctx->digest->ctx_size > 0) {
+      OPENSSL_free(ctx->md_data);
+      ctx->md_data = NULL;
+    }
+    ctx->digest = type;
+    if (type->ctx_size > 0) {
+      ctx->md_data = OPENSSL_malloc(type->ctx_size);
+      if (ctx->md_data == NULL) {
+        OPENSSL_PUT_ERROR(DIGEST, ERR_R_MALLOC_FAILURE);
+        return 0;
+      }
+    }
+  }
+
+  assert(ctx->pctx == NULL || ctx->pctx_ops != NULL);
+
+  ctx->digest->init(ctx);
+  return 1;
+}
+
+int EVP_DigestInit(EVP_MD_CTX *ctx, const EVP_MD *type) {
+  EVP_MD_CTX_init(ctx);
+  return EVP_DigestInit_ex(ctx, type, NULL);
+}
+
+int EVP_DigestUpdate(EVP_MD_CTX *ctx, const void *data, size_t len) {
+  ctx->digest->update(ctx, data, len);
+  return 1;
+}
+
+int EVP_DigestFinal_ex(EVP_MD_CTX *ctx, uint8_t *md_out, unsigned int *size) {
+  assert(ctx->digest->md_size <= EVP_MAX_MD_SIZE);
+  ctx->digest->final(ctx, md_out);
+  if (size != NULL) {
+    *size = ctx->digest->md_size;
+  }
+  OPENSSL_cleanse(ctx->md_data, ctx->digest->ctx_size);
+  return 1;
+}
+
+int EVP_DigestFinal(EVP_MD_CTX *ctx, uint8_t *md, unsigned int *size) {
+  (void)EVP_DigestFinal_ex(ctx, md, size);
+  EVP_MD_CTX_cleanup(ctx);
+  return 1;
+}
+
+int EVP_Digest(const void *data, size_t count, uint8_t *out_md,
+               unsigned int *out_size, const EVP_MD *type, ENGINE *impl) {
+  EVP_MD_CTX ctx;
+  int ret;
+
+  EVP_MD_CTX_init(&ctx);
+  ret = EVP_DigestInit_ex(&ctx, type, impl) &&
+        EVP_DigestUpdate(&ctx, data, count) &&
+        EVP_DigestFinal_ex(&ctx, out_md, out_size);
+  EVP_MD_CTX_cleanup(&ctx);
+
+  return ret;
+}
+
+
+const EVP_MD *EVP_MD_CTX_md(const EVP_MD_CTX *ctx) {
+  if (ctx == NULL) {
+    return NULL;
+  }
+  return ctx->digest;
+}
+
+size_t EVP_MD_CTX_size(const EVP_MD_CTX *ctx) {
+  return EVP_MD_size(EVP_MD_CTX_md(ctx));
+}
+
+size_t EVP_MD_CTX_block_size(const EVP_MD_CTX *ctx) {
+  return EVP_MD_block_size(EVP_MD_CTX_md(ctx));
+}
+
+int EVP_MD_CTX_type(const EVP_MD_CTX *ctx) {
+  return EVP_MD_type(EVP_MD_CTX_md(ctx));
+}
+
+int EVP_add_digest(const EVP_MD *digest) {
+  return 1;
+}
diff --git a/crypto/fipsmodule/digest/digests.c b/crypto/fipsmodule/digest/digests.c
new file mode 100644
index 0000000..9799604
--- /dev/null
+++ b/crypto/fipsmodule/digest/digests.c
@@ -0,0 +1,250 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.] */
+
+#include <openssl/digest.h>
+
+#include <assert.h>
+#include <string.h>
+
+#include <openssl/md4.h>
+#include <openssl/md5.h>
+#include <openssl/nid.h>
+#include <openssl/sha.h>
+
+#include "internal.h"
+#include "../delocate.h"
+#include "../../internal.h"
+
+#if defined(NDEBUG)
+#define CHECK(x) (void) (x)
+#else
+#define CHECK(x) assert(x)
+#endif
+
+
+static void md4_init(EVP_MD_CTX *ctx) {
+  CHECK(MD4_Init(ctx->md_data));
+}
+
+static void md4_update(EVP_MD_CTX *ctx, const void *data, size_t count) {
+  CHECK(MD4_Update(ctx->md_data, data, count));
+}
+
+static void md4_final(EVP_MD_CTX *ctx, uint8_t *out) {
+  CHECK(MD4_Final(out, ctx->md_data));
+}
+
+DEFINE_METHOD_FUNCTION(EVP_MD, EVP_md4,
+                       {
+                           NID_md4, MD4_DIGEST_LENGTH, 0 /* flags */, md4_init,
+                           md4_update, md4_final, 64 /* block size */,
+                           sizeof(MD4_CTX),
+                       })
+
+
+static void md5_init(EVP_MD_CTX *ctx) {
+  CHECK(MD5_Init(ctx->md_data));
+}
+
+static void md5_update(EVP_MD_CTX *ctx, const void *data, size_t count) {
+  CHECK(MD5_Update(ctx->md_data, data, count));
+}
+
+static void md5_final(EVP_MD_CTX *ctx, uint8_t *out) {
+  CHECK(MD5_Final(out, ctx->md_data));
+}
+
+DEFINE_METHOD_FUNCTION(EVP_MD, EVP_md5,
+                       {
+                           NID_md5, MD5_DIGEST_LENGTH, 0 /* flags */, md5_init,
+                           md5_update, md5_final, 64 /* block size */,
+                           sizeof(MD5_CTX),
+                       })
+
+
+static void sha1_init(EVP_MD_CTX *ctx) {
+  CHECK(SHA1_Init(ctx->md_data));
+}
+
+static void sha1_update(EVP_MD_CTX *ctx, const void *data, size_t count) {
+  CHECK(SHA1_Update(ctx->md_data, data, count));
+}
+
+static void sha1_final(EVP_MD_CTX *ctx, uint8_t *md) {
+  CHECK(SHA1_Final(md, ctx->md_data));
+}
+
+DEFINE_METHOD_FUNCTION(EVP_MD, EVP_sha1,
+                       {
+                           NID_sha1, SHA_DIGEST_LENGTH, 0 /* flags */,
+                           sha1_init, sha1_update, sha1_final,
+                           64 /* block size */, sizeof(SHA_CTX),
+                       })
+
+
+static void sha224_init(EVP_MD_CTX *ctx) {
+  CHECK(SHA224_Init(ctx->md_data));
+}
+
+static void sha224_update(EVP_MD_CTX *ctx, const void *data, size_t count) {
+  CHECK(SHA224_Update(ctx->md_data, data, count));
+}
+
+static void sha224_final(EVP_MD_CTX *ctx, uint8_t *md) {
+  CHECK(SHA224_Final(md, ctx->md_data));
+}
+
+DEFINE_METHOD_FUNCTION(EVP_MD, EVP_sha224,
+                       {
+                           NID_sha224, SHA224_DIGEST_LENGTH, 0 /* flags */,
+                           sha224_init, sha224_update, sha224_final,
+                           64 /* block size */, sizeof(SHA256_CTX),
+                       })
+
+
+static void sha256_init(EVP_MD_CTX *ctx) {
+  CHECK(SHA256_Init(ctx->md_data));
+}
+
+static void sha256_update(EVP_MD_CTX *ctx, const void *data, size_t count) {
+  CHECK(SHA256_Update(ctx->md_data, data, count));
+}
+
+static void sha256_final(EVP_MD_CTX *ctx, uint8_t *md) {
+  CHECK(SHA256_Final(md, ctx->md_data));
+}
+
+DEFINE_METHOD_FUNCTION(EVP_MD, EVP_sha256,
+                       {
+                           NID_sha256, SHA256_DIGEST_LENGTH, 0 /* flags */,
+                           sha256_init, sha256_update, sha256_final,
+                           64 /* block size */, sizeof(SHA256_CTX),
+                       })
+
+
+static void sha384_init(EVP_MD_CTX *ctx) {
+  CHECK(SHA384_Init(ctx->md_data));
+}
+
+static void sha384_update(EVP_MD_CTX *ctx, const void *data, size_t count) {
+  CHECK(SHA384_Update(ctx->md_data, data, count));
+}
+
+static void sha384_final(EVP_MD_CTX *ctx, uint8_t *md) {
+  CHECK(SHA384_Final(md, ctx->md_data));
+}
+
+DEFINE_METHOD_FUNCTION(EVP_MD, EVP_sha384,
+                       {
+                           NID_sha384, SHA384_DIGEST_LENGTH, 0 /* flags */,
+                           sha384_init, sha384_update, sha384_final,
+                           128 /* block size */, sizeof(SHA512_CTX),
+                       })
+
+
+static void sha512_init(EVP_MD_CTX *ctx) {
+  CHECK(SHA512_Init(ctx->md_data));
+}
+
+static void sha512_update(EVP_MD_CTX *ctx, const void *data, size_t count) {
+  CHECK(SHA512_Update(ctx->md_data, data, count));
+}
+
+static void sha512_final(EVP_MD_CTX *ctx, uint8_t *md) {
+  CHECK(SHA512_Final(md, ctx->md_data));
+}
+
+DEFINE_METHOD_FUNCTION(EVP_MD, EVP_sha512,
+                       {
+                           NID_sha512, SHA512_DIGEST_LENGTH, 0 /* flags */,
+                           sha512_init, sha512_update, sha512_final,
+                           128 /* block size */, sizeof(SHA512_CTX),
+                       })
+
+
+typedef struct {
+  MD5_CTX md5;
+  SHA_CTX sha1;
+} MD5_SHA1_CTX;
+
+static void md5_sha1_init(EVP_MD_CTX *md_ctx) {
+  MD5_SHA1_CTX *ctx = md_ctx->md_data;
+  CHECK(MD5_Init(&ctx->md5) && SHA1_Init(&ctx->sha1));
+}
+
+static void md5_sha1_update(EVP_MD_CTX *md_ctx, const void *data,
+                            size_t count) {
+  MD5_SHA1_CTX *ctx = md_ctx->md_data;
+  CHECK(MD5_Update(&ctx->md5, data, count) &&
+        SHA1_Update(&ctx->sha1, data, count));
+}
+
+static void md5_sha1_final(EVP_MD_CTX *md_ctx, uint8_t *out) {
+  MD5_SHA1_CTX *ctx = md_ctx->md_data;
+  CHECK(MD5_Final(out, &ctx->md5) &&
+        SHA1_Final(out + MD5_DIGEST_LENGTH, &ctx->sha1));
+}
+
+DEFINE_METHOD_FUNCTION(EVP_MD, EVP_md5_sha1,
+                       {
+                           NID_md5_sha1, MD5_DIGEST_LENGTH + SHA_DIGEST_LENGTH,
+                           0 /* flags */, md5_sha1_init, md5_sha1_update,
+                           md5_sha1_final, 64 /* block size */,
+                           sizeof(MD5_SHA1_CTX),
+                       })
+
+
+#undef CHECK
diff --git a/crypto/fipsmodule/digest/internal.h b/crypto/fipsmodule/digest/internal.h
new file mode 100644
index 0000000..e3d812a
--- /dev/null
+++ b/crypto/fipsmodule/digest/internal.h
@@ -0,0 +1,112 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.] */
+
+#ifndef OPENSSL_HEADER_DIGEST_INTERNAL_H
+#define OPENSSL_HEADER_DIGEST_INTERNAL_H
+
+#include <openssl/base.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+
+struct env_md_st {
+  /* type contains a NID identifying the digest function. (For example,
+   * NID_md5.) */
+  int type;
+
+  /* md_size contains the size, in bytes, of the resulting digest. */
+  unsigned md_size;
+
+  /* flags contains the OR of |EVP_MD_FLAG_*| values. */
+  uint32_t flags;
+
+  /* init initialises the state in |ctx->md_data|. */
+  void (*init)(EVP_MD_CTX *ctx);
+
+  /* update hashes |len| bytes of |data| into the state in |ctx->md_data|. */
+  void (*update)(EVP_MD_CTX *ctx, const void *data, size_t count);
+
+  /* final completes the hash and writes |md_size| bytes of digest to |out|. */
+  void (*final)(EVP_MD_CTX *ctx, uint8_t *out);
+
+  /* block_size contains the hash's native block size. */
+  unsigned block_size;
+
+  /* ctx_size contains the size, in bytes, of the state of the hash function. */
+  unsigned ctx_size;
+};
+
+/* evp_md_pctx_ops contains function pointers to allow the |pctx| member of
+ * |EVP_MD_CTX| to be manipulated without breaking layering by calling EVP
+ * functions. */
+struct evp_md_pctx_ops {
+  /* free is called when an |EVP_MD_CTX| is being freed and the |pctx| also
+   * needs to be freed. */
+  void (*free) (EVP_PKEY_CTX *pctx);
+
+  /* dup is called when an |EVP_MD_CTX| is copied and so the |pctx| also needs
+   * to be copied. */
+  EVP_PKEY_CTX* (*dup) (EVP_PKEY_CTX *pctx);
+};
+
+
+#if defined(__cplusplus)
+}  /* extern C */
+#endif
+
+#endif  /* OPENSSL_HEADER_DIGEST_INTERNAL_H */
diff --git a/crypto/fipsmodule/digest/md32_common.h b/crypto/fipsmodule/digest/md32_common.h
new file mode 100644
index 0000000..7371629
--- /dev/null
+++ b/crypto/fipsmodule/digest/md32_common.h
@@ -0,0 +1,269 @@
+/* ====================================================================
+ * Copyright (c) 1999-2007 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ==================================================================== */
+
+#include <openssl/base.h>
+
+#include <assert.h>
+
+#include "../../internal.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+
+/* This is a generic 32-bit "collector" for message digest algorithms. It
+ * collects the input byte stream into chunks of 32-bit values and invokes the
+ * block function that performs the actual hash calculations. To make use of
+ * this mechanism, the following macros must be defined before including
+ * md32_common.h.
+ *
+ * One of |DATA_ORDER_IS_BIG_ENDIAN| or |DATA_ORDER_IS_LITTLE_ENDIAN| must be
+ * defined to specify the byte order of the input stream.
+ *
+ * |HASH_CBLOCK| must be defined as the integer block size, in bytes.
+ *
+ * |HASH_CTX| must be defined as the name of the context structure, which must
+ * have at least the following members:
+ *
+ *     typedef struct <name>_state_st {
+ *       uint32_t h[<chaining length> / sizeof(uint32_t)];
+ *       uint32_t Nl, Nh;
+ *       uint8_t data[HASH_CBLOCK];
+ *       unsigned num;
+ *       ...
+ *     } <NAME>_CTX;
+ *
+ * <chaining length> is the output length of the hash in bytes, before
+ * any truncation (e.g. 32 for SHA-224 and SHA-256, 64 for SHA-384 and
+ * SHA-512).
+ *
+ * |HASH_UPDATE| must be defined as the name of the "Update" function to
+ * generate.
+ *
+ * |HASH_TRANSFORM| must be defined as the name of the "Transform" function
+ * to generate.
+ *
+ * |HASH_FINAL| must be defined as the name of the "Final" function to
+ * generate.
+ *
+ * |HASH_BLOCK_DATA_ORDER| must be defined as the name of the "Block" function.
+ * That function must be implemented manually. It must be capable of operating
+ * on *unaligned* input data in its original (data) byte order. It must have
+ * this signature:
+ *
+ *     void HASH_BLOCK_DATA_ORDER(uint32_t *state, const uint8_t *data,
+ *                                size_t num);
+ *
+ * It must update the hash state |state| with |num| blocks of data from |data|,
+ * where each block is |HASH_CBLOCK| bytes; i.e. |data| points to an array of
+ * |HASH_CBLOCK * num| bytes. |state| points to the |h| member of a |HASH_CTX|,
+ * and so will have |<chaining length> / sizeof(uint32_t)| elements.
+ *
+ * |HASH_MAKE_STRING(c, s)| must be defined as a block statement that converts
+ * the hash state |c->h| into the output byte order, storing the result in |s|.
+ */
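+
+/* As an illustrative sketch (not consumed by this header), a little-endian
+ * hash might configure this mechanism as follows; the MYHASH_* names are
+ * hypothetical:
+ *
+ *     #define DATA_ORDER_IS_LITTLE_ENDIAN
+ *     #define HASH_CTX MYHASH_CTX
+ *     #define HASH_CBLOCK 64
+ *     #define HASH_UPDATE MYHASH_Update
+ *     #define HASH_TRANSFORM MYHASH_Transform
+ *     #define HASH_FINAL MYHASH_Final
+ *     #define HASH_BLOCK_DATA_ORDER myhash_block_data_order
+ *     #define HASH_MAKE_STRING(c, s) do { ... } while (0)
+ *     #include "md32_common.h"
+ */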
+
+#if !defined(DATA_ORDER_IS_BIG_ENDIAN) && !defined(DATA_ORDER_IS_LITTLE_ENDIAN)
+#error "DATA_ORDER must be defined!"
+#endif
+
+#ifndef HASH_CBLOCK
+#error "HASH_CBLOCK must be defined!"
+#endif
+#ifndef HASH_CTX
+#error "HASH_CTX must be defined!"
+#endif
+
+#ifndef HASH_UPDATE
+#error "HASH_UPDATE must be defined!"
+#endif
+#ifndef HASH_TRANSFORM
+#error "HASH_TRANSFORM must be defined!"
+#endif
+#ifndef HASH_FINAL
+#error "HASH_FINAL must be defined!"
+#endif
+
+#ifndef HASH_BLOCK_DATA_ORDER
+#error "HASH_BLOCK_DATA_ORDER must be defined!"
+#endif
+
+#ifndef HASH_MAKE_STRING
+#error "HASH_MAKE_STRING must be defined!"
+#endif
+
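+/* HOST_c2l reads a 32-bit word from |c| in the configured data order and
+ * advances |c|; HOST_l2c writes |l| out through |c| in the same order and
+ * advances |c|. */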
+#if defined(DATA_ORDER_IS_BIG_ENDIAN)
+
+#define HOST_c2l(c, l)                     \
+  do {                                     \
+    (l) = (((uint32_t)(*((c)++))) << 24);  \
+    (l) |= (((uint32_t)(*((c)++))) << 16); \
+    (l) |= (((uint32_t)(*((c)++))) << 8);  \
+    (l) |= (((uint32_t)(*((c)++))));       \
+  } while (0)
+
+#define HOST_l2c(l, c)                        \
+  do {                                        \
+    *((c)++) = (uint8_t)(((l) >> 24) & 0xff); \
+    *((c)++) = (uint8_t)(((l) >> 16) & 0xff); \
+    *((c)++) = (uint8_t)(((l) >> 8) & 0xff);  \
+    *((c)++) = (uint8_t)(((l)) & 0xff);       \
+  } while (0)
+
+#elif defined(DATA_ORDER_IS_LITTLE_ENDIAN)
+
+#define HOST_c2l(c, l)                     \
+  do {                                     \
+    (l) = (((uint32_t)(*((c)++))));        \
+    (l) |= (((uint32_t)(*((c)++))) << 8);  \
+    (l) |= (((uint32_t)(*((c)++))) << 16); \
+    (l) |= (((uint32_t)(*((c)++))) << 24); \
+  } while (0)
+
+#define HOST_l2c(l, c)                        \
+  do {                                        \
+    *((c)++) = (uint8_t)(((l)) & 0xff);       \
+    *((c)++) = (uint8_t)(((l) >> 8) & 0xff);  \
+    *((c)++) = (uint8_t)(((l) >> 16) & 0xff); \
+    *((c)++) = (uint8_t)(((l) >> 24) & 0xff); \
+  } while (0)
+
+#endif /* DATA_ORDER */
+
+int HASH_UPDATE(HASH_CTX *c, const void *data_, size_t len) {
+  const uint8_t *data = data_;
+
+  if (len == 0) {
+    return 1;
+  }
+
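+  /* |Nl| and |Nh| together hold the 64-bit length of the message in bits;
+   * the shifts by three and 29 convert |len| from bytes to bits. */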
+  uint32_t l = c->Nl + (((uint32_t)len) << 3);
+  if (l < c->Nl) {
+    /* Handle carries. */
+    c->Nh++;
+  }
+  c->Nh += (uint32_t)(len >> 29);
+  c->Nl = l;
+
+  size_t n = c->num;
+  if (n != 0) {
+    if (len >= HASH_CBLOCK || len + n >= HASH_CBLOCK) {
+      OPENSSL_memcpy(c->data + n, data, HASH_CBLOCK - n);
+      HASH_BLOCK_DATA_ORDER(c->h, c->data, 1);
+      n = HASH_CBLOCK - n;
+      data += n;
+      len -= n;
+      c->num = 0;
+      /* Keep |c->data| zeroed when unused. */
+      OPENSSL_memset(c->data, 0, HASH_CBLOCK);
+    } else {
+      OPENSSL_memcpy(c->data + n, data, len);
+      c->num += (unsigned)len;
+      return 1;
+    }
+  }
+
+  n = len / HASH_CBLOCK;
+  if (n > 0) {
+    HASH_BLOCK_DATA_ORDER(c->h, data, n);
+    n *= HASH_CBLOCK;
+    data += n;
+    len -= n;
+  }
+
+  if (len != 0) {
+    c->num = (unsigned)len;
+    OPENSSL_memcpy(c->data, data, len);
+  }
+  return 1;
+}
+
+
+void HASH_TRANSFORM(HASH_CTX *c, const uint8_t *data) {
+  HASH_BLOCK_DATA_ORDER(c->h, data, 1);
+}
+
+
+int HASH_FINAL(uint8_t *md, HASH_CTX *c) {
+  /* |c->data| always has room for at least one byte. A full block would have
+   * been consumed. */
+  size_t n = c->num;
+  assert(n < HASH_CBLOCK);
+  c->data[n] = 0x80;
+  n++;
+
+  /* Fill the block with zeros if there isn't room for a 64-bit length. */
+  if (n > (HASH_CBLOCK - 8)) {
+    OPENSSL_memset(c->data + n, 0, HASH_CBLOCK - n);
+    n = 0;
+    HASH_BLOCK_DATA_ORDER(c->h, c->data, 1);
+  }
+  OPENSSL_memset(c->data + n, 0, HASH_CBLOCK - 8 - n);
+
+  /* Append a 64-bit length to the block and process it. */
+  uint8_t *p = c->data + HASH_CBLOCK - 8;
+#if defined(DATA_ORDER_IS_BIG_ENDIAN)
+  HOST_l2c(c->Nh, p);
+  HOST_l2c(c->Nl, p);
+#elif defined(DATA_ORDER_IS_LITTLE_ENDIAN)
+  HOST_l2c(c->Nl, p);
+  HOST_l2c(c->Nh, p);
+#endif
+  assert(p == c->data + HASH_CBLOCK);
+  HASH_BLOCK_DATA_ORDER(c->h, c->data, 1);
+  c->num = 0;
+  OPENSSL_memset(c->data, 0, HASH_CBLOCK);
+
+  HASH_MAKE_STRING(c, md);
+  return 1;
+}
+
+
+#if defined(__cplusplus)
+} /* extern C */
+#endif
diff --git a/crypto/fipsmodule/hmac/hmac.c b/crypto/fipsmodule/hmac/hmac.c
new file mode 100644
index 0000000..3292350
--- /dev/null
+++ b/crypto/fipsmodule/hmac/hmac.c
@@ -0,0 +1,206 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.] */
+
+#include <openssl/hmac.h>
+
+#include <assert.h>
+#include <string.h>
+
+#include <openssl/digest.h>
+#include <openssl/mem.h>
+
+#include "../../internal.h"
+
+
+uint8_t *HMAC(const EVP_MD *evp_md, const void *key, size_t key_len,
+              const uint8_t *data, size_t data_len, uint8_t *out,
+              unsigned int *out_len) {
+  HMAC_CTX ctx;
+  HMAC_CTX_init(&ctx);
+  if (!HMAC_Init_ex(&ctx, key, key_len, evp_md, NULL) ||
+      !HMAC_Update(&ctx, data, data_len) ||
+      !HMAC_Final(&ctx, out, out_len)) {
+    out = NULL;
+  }
+
+  HMAC_CTX_cleanup(&ctx);
+  return out;
+}
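+
+/* A usage sketch (not part of this file): computing HMAC-SHA-256 over a
+ * buffer with the one-shot function above.
+ *
+ *     uint8_t mac[EVP_MAX_MD_SIZE];
+ *     unsigned mac_len;
+ *     if (HMAC(EVP_sha256(), key, key_len, data, data_len, mac, &mac_len) ==
+ *         NULL) {
+ *       // handle error
+ *     }
+ */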
+
+void HMAC_CTX_init(HMAC_CTX *ctx) {
+  ctx->md = NULL;
+  EVP_MD_CTX_init(&ctx->i_ctx);
+  EVP_MD_CTX_init(&ctx->o_ctx);
+  EVP_MD_CTX_init(&ctx->md_ctx);
+}
+
+void HMAC_CTX_cleanup(HMAC_CTX *ctx) {
+  EVP_MD_CTX_cleanup(&ctx->i_ctx);
+  EVP_MD_CTX_cleanup(&ctx->o_ctx);
+  EVP_MD_CTX_cleanup(&ctx->md_ctx);
+  OPENSSL_cleanse(ctx, sizeof(HMAC_CTX));
+}
+
+int HMAC_Init_ex(HMAC_CTX *ctx, const void *key, size_t key_len,
+                 const EVP_MD *md, ENGINE *impl) {
+  if (md == NULL) {
+    md = ctx->md;
+  }
+
+  /* If either |key| is non-NULL or |md| has changed, initialize with a new key
+   * rather than rewinding the previous one.
+   *
+   * TODO(davidben,eroman): Passing the previous |md| with a NULL |key| is
+   * ambiguous between using the empty key and reusing the previous key. There
+   * exist callers which intend the latter, but the former is an awkward edge
+   * case. Fix the API to avoid this. */
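+  /* The two pads below implement HMAC(K, m) =
+   * H((K' ^ opad) || H((K' ^ ipad) || m)) from RFC 2104, where K' is the key
+   * padded with zeros to the block size, ipad is the byte 0x36 repeated and
+   * opad is the byte 0x5c repeated. */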
+  if (md != ctx->md || key != NULL) {
+    uint8_t pad[EVP_MAX_MD_BLOCK_SIZE];
+    uint8_t key_block[EVP_MAX_MD_BLOCK_SIZE];
+    unsigned key_block_len;
+
+    size_t block_size = EVP_MD_block_size(md);
+    assert(block_size <= sizeof(key_block));
+    if (block_size < key_len) {
+      /* Long keys are hashed. */
+      if (!EVP_DigestInit_ex(&ctx->md_ctx, md, impl) ||
+          !EVP_DigestUpdate(&ctx->md_ctx, key, key_len) ||
+          !EVP_DigestFinal_ex(&ctx->md_ctx, key_block, &key_block_len)) {
+        return 0;
+      }
+    } else {
+      assert(key_len <= sizeof(key_block));
+      OPENSSL_memcpy(key_block, key, key_len);
+      key_block_len = (unsigned)key_len;
+    }
+    /* Keys are then padded with zeros. */
+    if (key_block_len != EVP_MAX_MD_BLOCK_SIZE) {
+      OPENSSL_memset(&key_block[key_block_len], 0,
+                     sizeof(key_block) - key_block_len);
+    }
+
+    for (size_t i = 0; i < EVP_MAX_MD_BLOCK_SIZE; i++) {
+      pad[i] = 0x36 ^ key_block[i];
+    }
+    if (!EVP_DigestInit_ex(&ctx->i_ctx, md, impl) ||
+        !EVP_DigestUpdate(&ctx->i_ctx, pad, EVP_MD_block_size(md))) {
+      return 0;
+    }
+
+    for (size_t i = 0; i < EVP_MAX_MD_BLOCK_SIZE; i++) {
+      pad[i] = 0x5c ^ key_block[i];
+    }
+    if (!EVP_DigestInit_ex(&ctx->o_ctx, md, impl) ||
+        !EVP_DigestUpdate(&ctx->o_ctx, pad, EVP_MD_block_size(md))) {
+      return 0;
+    }
+
+    ctx->md = md;
+  }
+
+  if (!EVP_MD_CTX_copy_ex(&ctx->md_ctx, &ctx->i_ctx)) {
+    return 0;
+  }
+
+  return 1;
+}
+
+int HMAC_Update(HMAC_CTX *ctx, const uint8_t *data, size_t data_len) {
+  return EVP_DigestUpdate(&ctx->md_ctx, data, data_len);
+}
+
+int HMAC_Final(HMAC_CTX *ctx, uint8_t *out, unsigned int *out_len) {
+  unsigned int i;
+  uint8_t buf[EVP_MAX_MD_SIZE];
+
+  /* TODO(davidben): The only thing that can officially fail here is
+   * |EVP_MD_CTX_copy_ex|, but even that should be impossible in this case. */
+  if (!EVP_DigestFinal_ex(&ctx->md_ctx, buf, &i) ||
+      !EVP_MD_CTX_copy_ex(&ctx->md_ctx, &ctx->o_ctx) ||
+      !EVP_DigestUpdate(&ctx->md_ctx, buf, i) ||
+      !EVP_DigestFinal_ex(&ctx->md_ctx, out, out_len)) {
+    *out_len = 0;
+    return 0;
+  }
+
+  return 1;
+}
+
+size_t HMAC_size(const HMAC_CTX *ctx) {
+  return EVP_MD_size(ctx->md);
+}
+
+int HMAC_CTX_copy_ex(HMAC_CTX *dest, const HMAC_CTX *src) {
+  if (!EVP_MD_CTX_copy_ex(&dest->i_ctx, &src->i_ctx) ||
+      !EVP_MD_CTX_copy_ex(&dest->o_ctx, &src->o_ctx) ||
+      !EVP_MD_CTX_copy_ex(&dest->md_ctx, &src->md_ctx)) {
+    return 0;
+  }
+
+  dest->md = src->md;
+  return 1;
+}
+
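+/* HMAC_Init is a legacy entry point: it resets |ctx| only when both |key| and
+ * |md| are given, preserving the historical OpenSSL behaviour. */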
+int HMAC_Init(HMAC_CTX *ctx, const void *key, int key_len, const EVP_MD *md) {
+  if (key && md) {
+    HMAC_CTX_init(ctx);
+  }
+  return HMAC_Init_ex(ctx, key, key_len, md, NULL);
+}
+
+int HMAC_CTX_copy(HMAC_CTX *dest, const HMAC_CTX *src) {
+  HMAC_CTX_init(dest);
+  return HMAC_CTX_copy_ex(dest, src);
+}
diff --git a/crypto/fipsmodule/inject-hash.go b/crypto/fipsmodule/inject-hash.go
new file mode 100644
index 0000000..1ccbdc9
--- /dev/null
+++ b/crypto/fipsmodule/inject-hash.go
@@ -0,0 +1,124 @@
+// Copyright (c) 2017, Google Inc.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+// inject-hash runs a binary compiled against a FIPS module that hasn't had
+// the correct hash injected. That binary will fail the power-on integrity
+// check and write the calculated hash value to stderr. This script parses
+// that output and injects the calculated value into the given object file.
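+//
+// The failure output is assumed to have the form:
+//
+//	FIPS integrity test failed.
+//	Expected:   <hex digest>
+//	Calculated: <hex digest>
+//
+// Only the "Calculated: " prefix on the third line is relied upon below.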
+package main
+
+import (
+	"bytes"
+	"encoding/hex"
+	"errors"
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"os/exec"
+	"strings"
+)
+
+// uninitHashValue is the default hash value that we inject into the module.
+// This value need only be distinctive, so that we can safely
+// search-and-replace it in an object file. It must match the value in bcm.c.
+var uninitHashValue = [32]byte{
+	0x5f, 0x30, 0xd1, 0x80, 0xe7, 0x9e, 0x8f, 0x8f, 0xdf, 0x8b, 0x93, 0xd4, 0x96, 0x36, 0x30, 0xcc, 0x30, 0xea, 0x38, 0x0f, 0x75, 0x56, 0x9a, 0x1b, 0x23, 0x2f, 0x7c, 0x79, 0xff, 0x1b, 0x2b, 0xca,
+}
+
+func do(outPath, arInput, binPath string) error {
+	cmd := exec.Command(binPath)
+	out, err := cmd.CombinedOutput()
+
+	if err == nil {
+		return errors.New("binary did not fail self test")
+	}
+
+	lines := strings.Split(string(out), "\n")
+	if len(lines) < 3 {
+		return fmt.Errorf("too few lines in output: %q", out)
+	}
+
+	calculatedLine := lines[2]
+	if !strings.HasPrefix(calculatedLine, "Calculated: ") {
+		return errors.New("bad prefix of 3rd line: " + calculatedLine)
+	}
+	calculatedLine = calculatedLine[12:]
+	calculated, err := hex.DecodeString(calculatedLine)
+	if err != nil {
+		return err
+	}
+
+	if len(calculated) != len(uninitHashValue) {
+		return fmt.Errorf("unexpected length of calculated hash: got %d, want %d", len(calculated), len(uninitHashValue))
+	}
+
+	arFile, err := os.Open(arInput)
+	if err != nil {
+		return err
+	}
+	defer arFile.Close()
+
+	ar := NewAR(arFile)
+
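+	// Skip any unnamed archive members (such as the symbol table) until the
+	// first named member, which is taken to be the object file to patch.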
+	for {
+		header, err := ar.Next()
+		if err != nil {
+			return errors.New("error reading from archive file: " + err.Error())
+		}
+
+		if len(header.Name) > 0 {
+			break
+		}
+
+		if _, err := ioutil.ReadAll(ar); err != nil {
+			return errors.New("error reading from archive file: " + err.Error())
+		}
+	}
+
+	object, err := ioutil.ReadAll(ar)
+	if err != nil {
+		return err
+	}
+
+	offset := bytes.Index(object, uninitHashValue[:])
+	if offset < 0 {
+		return errors.New("did not find uninitialised hash value in object file")
+	}
+
+	if bytes.Index(object[offset+1:], uninitHashValue[:]) >= 0 {
+		return errors.New("found two occurrences of uninitialised hash value in object file")
+	}
+
+	copy(object[offset:], calculated)
+
+	if err := ioutil.WriteFile(outPath, object, 0644); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func main() {
+	arInput := flag.String("in", "", "Path to a .a file")
+	outPath := flag.String("o", "", "Path to output object")
+	bin := flag.String("bin", "", "Binary compiled with the FIPS module")
+
+	flag.Parse()
+
+	if err := do(*outPath, *arInput, *bin); err != nil {
+		fmt.Fprintf(os.Stderr, "%s\n", err)
+		os.Exit(1)
+	}
+}
diff --git a/crypto/fipsmodule/intcheck1.png b/crypto/fipsmodule/intcheck1.png
new file mode 100644
index 0000000..2fec4ea
--- /dev/null
+++ b/crypto/fipsmodule/intcheck1.png
Binary files differ
diff --git a/crypto/fipsmodule/intcheck2.png b/crypto/fipsmodule/intcheck2.png
new file mode 100644
index 0000000..ad1fe27
--- /dev/null
+++ b/crypto/fipsmodule/intcheck2.png
Binary files differ
diff --git a/crypto/fipsmodule/intcheck3.png b/crypto/fipsmodule/intcheck3.png
new file mode 100644
index 0000000..9a22d33
--- /dev/null
+++ b/crypto/fipsmodule/intcheck3.png
Binary files differ
diff --git a/crypto/fipsmodule/is_fips.c b/crypto/fipsmodule/is_fips.c
new file mode 100644
index 0000000..c23e621
--- /dev/null
+++ b/crypto/fipsmodule/is_fips.c
@@ -0,0 +1,27 @@
+/* Copyright (c) 2017, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <openssl/crypto.h>
+
+
+/* This file exists in order to give the fipsmodule target, in non-FIPS mode,
+ * something to compile. */
+
+int FIPS_mode(void) {
+#if defined(BORINGSSL_FIPS)
+  return 1;
+#else
+  return 0;
+#endif
+}
diff --git a/crypto/fipsmodule/md4/md4.c b/crypto/fipsmodule/md4/md4.c
new file mode 100644
index 0000000..3028c8b
--- /dev/null
+++ b/crypto/fipsmodule/md4/md4.c
@@ -0,0 +1,254 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.] */
+
+#include <openssl/md4.h>
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "../../internal.h"
+
+
+uint8_t *MD4(const uint8_t *data, size_t len, uint8_t *out) {
+  MD4_CTX ctx;
+  MD4_Init(&ctx);
+  MD4_Update(&ctx, data, len);
+  MD4_Final(out, &ctx);
+
+  return out;
+}
+
+/* Implemented from RFC1186 The MD4 Message-Digest Algorithm. */
+
+int MD4_Init(MD4_CTX *md4) {
+  OPENSSL_memset(md4, 0, sizeof(MD4_CTX));
+  md4->h[0] = 0x67452301UL;
+  md4->h[1] = 0xefcdab89UL;
+  md4->h[2] = 0x98badcfeUL;
+  md4->h[3] = 0x10325476UL;
+  return 1;
+}
+
+void md4_block_data_order(uint32_t *state, const uint8_t *data, size_t num);
+
+#define DATA_ORDER_IS_LITTLE_ENDIAN
+
+#define HASH_CTX MD4_CTX
+#define HASH_CBLOCK 64
+#define HASH_UPDATE MD4_Update
+#define HASH_TRANSFORM MD4_Transform
+#define HASH_FINAL MD4_Final
+#define HASH_MAKE_STRING(c, s) \
+  do {                         \
+    uint32_t ll;               \
+    ll = (c)->h[0];            \
+    HOST_l2c(ll, (s));         \
+    ll = (c)->h[1];            \
+    HOST_l2c(ll, (s));         \
+    ll = (c)->h[2];            \
+    HOST_l2c(ll, (s));         \
+    ll = (c)->h[3];            \
+    HOST_l2c(ll, (s));         \
+  } while (0)
+#define HASH_BLOCK_DATA_ORDER md4_block_data_order
+
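+/* Including md32_common.h generates MD4_Update, MD4_Transform and MD4_Final
+ * from the macros defined above. */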
+#include "../digest/md32_common.h"
+
+/* As pointed out by Wei Dai <weidai@eskimo.com>, the naive form of F,
+ * F(b, c, d) = ((b) & (c)) | ((~b) & (d)), can be simplified to the
+ * expression below.  Wei attributes these optimizations to Peter Gutmann's
+ * SHS code, and he attributes it to Rich Schroeppel. */
+#define F(b, c, d) ((((c) ^ (d)) & (b)) ^ (d))
+#define G(b, c, d) (((b) & (c)) | ((b) & (d)) | ((c) & (d)))
+#define H(b, c, d) ((b) ^ (c) ^ (d))
+
+#define ROTATE(a, n) (((a) << (n)) | ((a) >> (32 - (n))))
+
+#define R0(a, b, c, d, k, s, t)            \
+  do {                                     \
+    (a) += ((k) + (t) + F((b), (c), (d))); \
+    (a) = ROTATE(a, s);                    \
+  } while (0)
+
+#define R1(a, b, c, d, k, s, t)            \
+  do {                                     \
+    (a) += ((k) + (t) + G((b), (c), (d))); \
+    (a) = ROTATE(a, s);                    \
+  } while (0)
+
+#define R2(a, b, c, d, k, s, t)            \
+  do {                                     \
+    (a) += ((k) + (t) + H((b), (c), (d))); \
+    (a) = ROTATE(a, s);                    \
+  } while (0)
+
+void md4_block_data_order(uint32_t *state, const uint8_t *data, size_t num) {
+  uint32_t A, B, C, D, l;
+  uint32_t X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15;
+
+  A = state[0];
+  B = state[1];
+  C = state[2];
+  D = state[3];
+
+  for (; num--;) {
+    HOST_c2l(data, l);
+    X0 = l;
+    HOST_c2l(data, l);
+    X1 = l;
+    /* Round 0 */
+    R0(A, B, C, D, X0, 3, 0);
+    HOST_c2l(data, l);
+    X2 = l;
+    R0(D, A, B, C, X1, 7, 0);
+    HOST_c2l(data, l);
+    X3 = l;
+    R0(C, D, A, B, X2, 11, 0);
+    HOST_c2l(data, l);
+    X4 = l;
+    R0(B, C, D, A, X3, 19, 0);
+    HOST_c2l(data, l);
+    X5 = l;
+    R0(A, B, C, D, X4, 3, 0);
+    HOST_c2l(data, l);
+    X6 = l;
+    R0(D, A, B, C, X5, 7, 0);
+    HOST_c2l(data, l);
+    X7 = l;
+    R0(C, D, A, B, X6, 11, 0);
+    HOST_c2l(data, l);
+    X8 = l;
+    R0(B, C, D, A, X7, 19, 0);
+    HOST_c2l(data, l);
+    X9 = l;
+    R0(A, B, C, D, X8, 3, 0);
+    HOST_c2l(data, l);
+    X10 = l;
+    R0(D, A, B, C, X9, 7, 0);
+    HOST_c2l(data, l);
+    X11 = l;
+    R0(C, D, A, B, X10, 11, 0);
+    HOST_c2l(data, l);
+    X12 = l;
+    R0(B, C, D, A, X11, 19, 0);
+    HOST_c2l(data, l);
+    X13 = l;
+    R0(A, B, C, D, X12, 3, 0);
+    HOST_c2l(data, l);
+    X14 = l;
+    R0(D, A, B, C, X13, 7, 0);
+    HOST_c2l(data, l);
+    X15 = l;
+    R0(C, D, A, B, X14, 11, 0);
+    R0(B, C, D, A, X15, 19, 0);
+    /* Round 1 */
+    R1(A, B, C, D, X0, 3, 0x5A827999L);
+    R1(D, A, B, C, X4, 5, 0x5A827999L);
+    R1(C, D, A, B, X8, 9, 0x5A827999L);
+    R1(B, C, D, A, X12, 13, 0x5A827999L);
+    R1(A, B, C, D, X1, 3, 0x5A827999L);
+    R1(D, A, B, C, X5, 5, 0x5A827999L);
+    R1(C, D, A, B, X9, 9, 0x5A827999L);
+    R1(B, C, D, A, X13, 13, 0x5A827999L);
+    R1(A, B, C, D, X2, 3, 0x5A827999L);
+    R1(D, A, B, C, X6, 5, 0x5A827999L);
+    R1(C, D, A, B, X10, 9, 0x5A827999L);
+    R1(B, C, D, A, X14, 13, 0x5A827999L);
+    R1(A, B, C, D, X3, 3, 0x5A827999L);
+    R1(D, A, B, C, X7, 5, 0x5A827999L);
+    R1(C, D, A, B, X11, 9, 0x5A827999L);
+    R1(B, C, D, A, X15, 13, 0x5A827999L);
+    /* Round 2 */
+    R2(A, B, C, D, X0, 3, 0x6ED9EBA1L);
+    R2(D, A, B, C, X8, 9, 0x6ED9EBA1L);
+    R2(C, D, A, B, X4, 11, 0x6ED9EBA1L);
+    R2(B, C, D, A, X12, 15, 0x6ED9EBA1L);
+    R2(A, B, C, D, X2, 3, 0x6ED9EBA1L);
+    R2(D, A, B, C, X10, 9, 0x6ED9EBA1L);
+    R2(C, D, A, B, X6, 11, 0x6ED9EBA1L);
+    R2(B, C, D, A, X14, 15, 0x6ED9EBA1L);
+    R2(A, B, C, D, X1, 3, 0x6ED9EBA1L);
+    R2(D, A, B, C, X9, 9, 0x6ED9EBA1L);
+    R2(C, D, A, B, X5, 11, 0x6ED9EBA1L);
+    R2(B, C, D, A, X13, 15, 0x6ED9EBA1L);
+    R2(A, B, C, D, X3, 3, 0x6ED9EBA1L);
+    R2(D, A, B, C, X11, 9, 0x6ED9EBA1L);
+    R2(C, D, A, B, X7, 11, 0x6ED9EBA1L);
+    R2(B, C, D, A, X15, 15, 0x6ED9EBA1L);
+
+    A = state[0] += A;
+    B = state[1] += B;
+    C = state[2] += C;
+    D = state[3] += D;
+  }
+}
+
+#undef DATA_ORDER_IS_LITTLE_ENDIAN
+#undef HASH_CTX
+#undef HASH_CBLOCK
+#undef HASH_UPDATE
+#undef HASH_TRANSFORM
+#undef HASH_FINAL
+#undef HASH_MAKE_STRING
+#undef HASH_BLOCK_DATA_ORDER
+#undef F
+#undef G
+#undef H
+#undef ROTATE
+#undef R0
+#undef R1
+#undef R2
+#undef HOST_c2l
+#undef HOST_l2c
diff --git a/crypto/fipsmodule/md5/asm/md5-586.pl b/crypto/fipsmodule/md5/asm/md5-586.pl
new file mode 100644
index 0000000..4ba08ec
--- /dev/null
+++ b/crypto/fipsmodule/md5/asm/md5-586.pl
@@ -0,0 +1,312 @@
+#!/usr/local/bin/perl
+
+# The "normal" version has the prototype
+#   md5_block_x86(MD5_CTX *c, ULONG *X);
+# and the non-normal version is
+#   md5_block_x86(MD5_CTX *c, ULONG *X, int blocks);
+
+$normal=0;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../../perlasm");
+require "x86asm.pl";
+
+$output=pop;
+open STDOUT,">$output";
+
+&asm_init($ARGV[0],$0);
+
+$A="eax";
+$B="ebx";
+$C="ecx";
+$D="edx";
+$tmp1="edi";
+$tmp2="ebp";
+$X="esi";
+
+# What we need to load into $tmp for the next round
+%Ltmp1=("R0",&Np($C), "R1",&Np($C), "R2",&Np($C), "R3",&Np($D));
+@xo=(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,	# R0
+ 1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12,	# R1
+ 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2,	# R2
+ 0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9,	# R3
+ );
+
+&md5_block("md5_block_asm_data_order");
+&asm_finish();
+
+close STDOUT;
+
+sub Np
+	{
+	local($p)=@_;
+	local(%n)=($A,$D,$B,$A,$C,$B,$D,$C);
+	return($n{$p});
+	}
+
+sub R0
+	{
+	local($pos,$a,$b,$c,$d,$K,$ki,$s,$t)=@_;
+
+	&mov($tmp1,$C)  if $pos < 0;
+	&mov($tmp2,&DWP($xo[$ki]*4,$K,"",0)) if $pos < 0; # very first one
+
+	# body proper
+
+	&comment("R0 $ki");
+	&xor($tmp1,$d); # F function - part 2
+
+	&and($tmp1,$b); # F function - part 3
+	&lea($a,&DWP($t,$a,$tmp2,1));
+
+	&xor($tmp1,$d); # F function - part 4
+
+	&add($a,$tmp1);
+	&mov($tmp1,&Np($c)) if $pos < 1;	# next tmp1 for R0
+	&mov($tmp1,&Np($c)) if $pos == 1;	# next tmp1 for R1
+
+	&rotl($a,$s);
+
+	&mov($tmp2,&DWP($xo[$ki+1]*4,$K,"",0)) if ($pos != 2);
+
+	&add($a,$b);
+	}
+
+sub R1
+	{
+	local($pos,$a,$b,$c,$d,$K,$ki,$s,$t)=@_;
+
+	&comment("R1 $ki");
+
+	&lea($a,&DWP($t,$a,$tmp2,1));
+
+	&xor($tmp1,$b); # G function - part 2
+	&and($tmp1,$d); # G function - part 3
+
+	&mov($tmp2,&DWP($xo[$ki+1]*4,$K,"",0)) if ($pos != 2);
+	&xor($tmp1,$c);			# G function - part 4
+
+	&add($a,$tmp1);
+	&mov($tmp1,&Np($c)) if $pos < 1;	# G function - part 1
+	&mov($tmp1,&Np($c)) if $pos == 1;	# G function - part 1
+
+	&rotl($a,$s);
+
+	&add($a,$b);
+	}
+
+sub R2
+	{
+	local($n,$pos,$a,$b,$c,$d,$K,$ki,$s,$t)=@_;
+	# This one is different, only 3 logical operations
+
+if (($n & 1) == 0)
+	{
+	&comment("R2 $ki");
+	# make sure to do 'D' first, not 'B', else we clash with
+	# the last add from the previous round.
+
+	&xor($tmp1,$d); # H function - part 2
+
+	&xor($tmp1,$b); # H function - part 3
+	&lea($a,&DWP($t,$a,$tmp2,1));
+
+	&add($a,$tmp1);
+
+	&rotl($a,$s);
+
+	&mov($tmp2,&DWP($xo[$ki+1]*4,$K,"",0));
+	&mov($tmp1,&Np($c));
+	}
+else
+	{
+	&comment("R2 $ki");
+	# make sure to do 'D' first, not 'B', else we clash with
+	# the last add from the previous round.
+
+	&lea($a,&DWP($t,$a,$tmp2,1));
+
+	&add($b,$c);			# MOVED FORWARD
+	&xor($tmp1,$d); # H function - part 2
+
+	&xor($tmp1,$b); # H function - part 3
+	&mov($tmp2,&DWP($xo[$ki+1]*4,$K,"",0)) if ($pos != 2);
+
+	&add($a,$tmp1);
+	&mov($tmp1,&Np($c)) if $pos < 1;	# H function - part 1
+	&mov($tmp1,-1) if $pos == 1;		# I function - part 1
+
+	&rotl($a,$s);
+
+	&add($a,$b);
+	}
+	}
+
+sub R3
+	{
+	local($pos,$a,$b,$c,$d,$K,$ki,$s,$t)=@_;
+
+	&comment("R3 $ki");
+
+	# &not($tmp1)
+	&xor($tmp1,$d) if $pos < 0; 	# I function - part 2
+
+	&or($tmp1,$b);				# I function - part 3
+	&lea($a,&DWP($t,$a,$tmp2,1));
+
+	&xor($tmp1,$c); 			# I function - part 4
+	&mov($tmp2,&DWP($xo[$ki+1]*4,$K,"",0))	if $pos != 2; # load X/k value
+	&mov($tmp2,&wparam(0)) if $pos == 2;
+
+	&add($a,$tmp1);
+	&mov($tmp1,-1) if $pos < 1;	# H function - part 1
+	&add($K,64) if $pos >=1 && !$normal;
+
+	&rotl($a,$s);
+
+	&xor($tmp1,&Np($d)) if $pos <= 0; 	# I function - part = first time
+	&mov($tmp1,&DWP( 0,$tmp2,"",0)) if $pos > 0;
+	&add($a,$b);
+	}
+
+
+sub md5_block
+	{
+	local($name)=@_;
+
+	&function_begin_B($name,"",3);
+
+	# parameter 1 is the MD5_CTX structure.
+	# A	0
+	# B	4
+	# C	8
+	# D 	12
+
+	&push("esi");
+	 &push("edi");
+	&mov($tmp1,	&wparam(0)); # edi
+	 &mov($X,	&wparam(1)); # esi
+	&mov($C,	&wparam(2));
+	 &push("ebp");
+	&shl($C,	6);
+	&push("ebx");
+	 &add($C,	$X); # offset we end at
+	&sub($C,	64);
+	 &mov($A,	&DWP( 0,$tmp1,"",0));
+	&push($C);	# Put on the TOS
+	 &mov($B,	&DWP( 4,$tmp1,"",0));
+	&mov($C,	&DWP( 8,$tmp1,"",0));
+	 &mov($D,	&DWP(12,$tmp1,"",0));
+
+	&set_label("start") unless $normal;
+	&comment("");
+	&comment("R0 section");
+
+	&R0(-2,$A,$B,$C,$D,$X, 0, 7,0xd76aa478);
+	&R0( 0,$D,$A,$B,$C,$X, 1,12,0xe8c7b756);
+	&R0( 0,$C,$D,$A,$B,$X, 2,17,0x242070db);
+	&R0( 0,$B,$C,$D,$A,$X, 3,22,0xc1bdceee);
+	&R0( 0,$A,$B,$C,$D,$X, 4, 7,0xf57c0faf);
+	&R0( 0,$D,$A,$B,$C,$X, 5,12,0x4787c62a);
+	&R0( 0,$C,$D,$A,$B,$X, 6,17,0xa8304613);
+	&R0( 0,$B,$C,$D,$A,$X, 7,22,0xfd469501);
+	&R0( 0,$A,$B,$C,$D,$X, 8, 7,0x698098d8);
+	&R0( 0,$D,$A,$B,$C,$X, 9,12,0x8b44f7af);
+	&R0( 0,$C,$D,$A,$B,$X,10,17,0xffff5bb1);
+	&R0( 0,$B,$C,$D,$A,$X,11,22,0x895cd7be);
+	&R0( 0,$A,$B,$C,$D,$X,12, 7,0x6b901122);
+	&R0( 0,$D,$A,$B,$C,$X,13,12,0xfd987193);
+	&R0( 0,$C,$D,$A,$B,$X,14,17,0xa679438e);
+	&R0( 1,$B,$C,$D,$A,$X,15,22,0x49b40821);
+
+	&comment("");
+	&comment("R1 section");
+	&R1(-1,$A,$B,$C,$D,$X,16, 5,0xf61e2562);
+	&R1( 0,$D,$A,$B,$C,$X,17, 9,0xc040b340);
+	&R1( 0,$C,$D,$A,$B,$X,18,14,0x265e5a51);
+	&R1( 0,$B,$C,$D,$A,$X,19,20,0xe9b6c7aa);
+	&R1( 0,$A,$B,$C,$D,$X,20, 5,0xd62f105d);
+	&R1( 0,$D,$A,$B,$C,$X,21, 9,0x02441453);
+	&R1( 0,$C,$D,$A,$B,$X,22,14,0xd8a1e681);
+	&R1( 0,$B,$C,$D,$A,$X,23,20,0xe7d3fbc8);
+	&R1( 0,$A,$B,$C,$D,$X,24, 5,0x21e1cde6);
+	&R1( 0,$D,$A,$B,$C,$X,25, 9,0xc33707d6);
+	&R1( 0,$C,$D,$A,$B,$X,26,14,0xf4d50d87);
+	&R1( 0,$B,$C,$D,$A,$X,27,20,0x455a14ed);
+	&R1( 0,$A,$B,$C,$D,$X,28, 5,0xa9e3e905);
+	&R1( 0,$D,$A,$B,$C,$X,29, 9,0xfcefa3f8);
+	&R1( 0,$C,$D,$A,$B,$X,30,14,0x676f02d9);
+	&R1( 1,$B,$C,$D,$A,$X,31,20,0x8d2a4c8a);
+
+	&comment("");
+	&comment("R2 section");
+	&R2( 0,-1,$A,$B,$C,$D,$X,32, 4,0xfffa3942);
+	&R2( 1, 0,$D,$A,$B,$C,$X,33,11,0x8771f681);
+	&R2( 2, 0,$C,$D,$A,$B,$X,34,16,0x6d9d6122);
+	&R2( 3, 0,$B,$C,$D,$A,$X,35,23,0xfde5380c);
+	&R2( 4, 0,$A,$B,$C,$D,$X,36, 4,0xa4beea44);
+	&R2( 5, 0,$D,$A,$B,$C,$X,37,11,0x4bdecfa9);
+	&R2( 6, 0,$C,$D,$A,$B,$X,38,16,0xf6bb4b60);
+	&R2( 7, 0,$B,$C,$D,$A,$X,39,23,0xbebfbc70);
+	&R2( 8, 0,$A,$B,$C,$D,$X,40, 4,0x289b7ec6);
+	&R2( 9, 0,$D,$A,$B,$C,$X,41,11,0xeaa127fa);
+	&R2(10, 0,$C,$D,$A,$B,$X,42,16,0xd4ef3085);
+	&R2(11, 0,$B,$C,$D,$A,$X,43,23,0x04881d05);
+	&R2(12, 0,$A,$B,$C,$D,$X,44, 4,0xd9d4d039);
+	&R2(13, 0,$D,$A,$B,$C,$X,45,11,0xe6db99e5);
+	&R2(14, 0,$C,$D,$A,$B,$X,46,16,0x1fa27cf8);
+	&R2(15, 1,$B,$C,$D,$A,$X,47,23,0xc4ac5665);
+
+	&comment("");
+	&comment("R3 section");
+	&R3(-1,$A,$B,$C,$D,$X,48, 6,0xf4292244);
+	&R3( 0,$D,$A,$B,$C,$X,49,10,0x432aff97);
+	&R3( 0,$C,$D,$A,$B,$X,50,15,0xab9423a7);
+	&R3( 0,$B,$C,$D,$A,$X,51,21,0xfc93a039);
+	&R3( 0,$A,$B,$C,$D,$X,52, 6,0x655b59c3);
+	&R3( 0,$D,$A,$B,$C,$X,53,10,0x8f0ccc92);
+	&R3( 0,$C,$D,$A,$B,$X,54,15,0xffeff47d);
+	&R3( 0,$B,$C,$D,$A,$X,55,21,0x85845dd1);
+	&R3( 0,$A,$B,$C,$D,$X,56, 6,0x6fa87e4f);
+	&R3( 0,$D,$A,$B,$C,$X,57,10,0xfe2ce6e0);
+	&R3( 0,$C,$D,$A,$B,$X,58,15,0xa3014314);
+	&R3( 0,$B,$C,$D,$A,$X,59,21,0x4e0811a1);
+	&R3( 0,$A,$B,$C,$D,$X,60, 6,0xf7537e82);
+	&R3( 0,$D,$A,$B,$C,$X,61,10,0xbd3af235);
+	&R3( 0,$C,$D,$A,$B,$X,62,15,0x2ad7d2bb);
+	&R3( 2,$B,$C,$D,$A,$X,63,21,0xeb86d391);
+
+	# &mov($tmp2,&wparam(0));	# done in the last R3
+	# &mov($tmp1,	&DWP( 0,$tmp2,"",0)); # done in the last R3
+
+	&add($A,$tmp1);
+	 &mov($tmp1,	&DWP( 4,$tmp2,"",0));
+
+	&add($B,$tmp1);
+	&mov($tmp1,	&DWP( 8,$tmp2,"",0));
+
+	&add($C,$tmp1);
+	&mov($tmp1,	&DWP(12,$tmp2,"",0));
+
+	&add($D,$tmp1);
+	&mov(&DWP( 0,$tmp2,"",0),$A);
+
+	&mov(&DWP( 4,$tmp2,"",0),$B);
+	&mov($tmp1,&swtmp(0)) unless $normal;
+
+	&mov(&DWP( 8,$tmp2,"",0),$C);
+	 &mov(&DWP(12,$tmp2,"",0),$D);
+
+	&cmp($tmp1,$X) unless $normal;			# check count
+	 &jae(&label("start")) unless $normal;
+
+	&pop("eax"); # pop the temp variable off the stack
+	 &pop("ebx");
+	&pop("ebp");
+	 &pop("edi");
+	&pop("esi");
+	 &ret();
+	&function_end_B($name);
+	}
+
diff --git a/crypto/fipsmodule/md5/asm/md5-x86_64.pl b/crypto/fipsmodule/md5/asm/md5-x86_64.pl
new file mode 100644
index 0000000..568f4f5
--- /dev/null
+++ b/crypto/fipsmodule/md5/asm/md5-x86_64.pl
@@ -0,0 +1,370 @@
+#!/usr/bin/perl -w
+#
+# MD5 optimized for AMD64.
+#
+# Author: Marc Bevand <bevand_m (at) epita.fr>
+# Licence: I hereby disclaim the copyright on this code and place it
+# in the public domain.
+#
+
+use strict;
+
+my $code;
+
+# round1_step() does:
+#   dst = x + ((dst + F(x,y,z) + X[k] + T_i) <<< s)
+#   %r10d = X[k_next]
+#   %r11d = z' (copy of z for the next step)
+# Each round1_step() takes about 5.3 clocks (9 instructions, 1.7 IPC)
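+# Here F(x,y,z) = (x & y) | (~x & z), computed as ((y ^ z) & x) ^ z.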
+sub round1_step
+{
+    my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
+    $code .= " mov	0*4(%rsi),	%r10d		/* (NEXT STEP) X[0] */\n" if ($pos == -1);
+    $code .= " mov	%edx,		%r11d		/* (NEXT STEP) z' = %edx */\n" if ($pos == -1);
+    $code .= <<EOF;
+	xor	$y,		%r11d		/* y ^ ... */
+	lea	$T_i($dst,%r10d),$dst		/* Const + dst + ... */
+	and	$x,		%r11d		/* x & ... */
+	xor	$z,		%r11d		/* z ^ ... */
+	mov	$k_next*4(%rsi),%r10d		/* (NEXT STEP) X[$k_next] */
+	add	%r11d,		$dst		/* dst += ... */
+	rol	\$$s,		$dst		/* dst <<< s */
+	mov	$y,		%r11d		/* (NEXT STEP) z' = $y */
+	add	$x,		$dst		/* dst += x */
+EOF
+}
+
+# round2_step() does:
+#   dst = x + ((dst + G(x,y,z) + X[k] + T_i) <<< s)
+#   %r10d = X[k_next]
+#   %r11d = z' (copy of z for the next step)
+#   %r12d = z' (copy of z for the next step)
+# Each round2_step() takes about 5.4 clocks (11 instructions, 2.0 IPC)
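+# Here G(x,y,z) = (x & z) | (y & ~z), computed directly from both copies of z.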
+sub round2_step
+{
+    my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
+    $code .= " mov	1*4(%rsi),	%r10d		/* (NEXT STEP) X[1] */\n" if ($pos == -1);
+    $code .= " mov	%edx,		%r11d		/* (NEXT STEP) z' = %edx */\n" if ($pos == -1);
+    $code .= " mov	%edx,		%r12d		/* (NEXT STEP) z' = %edx */\n" if ($pos == -1);
+    $code .= <<EOF;
+	not	%r11d				/* not z */
+	lea	$T_i($dst,%r10d),$dst		/* Const + dst + ... */
+	and	$x,		%r12d		/* x & z */
+	and	$y,		%r11d		/* y & (not z) */
+	mov	$k_next*4(%rsi),%r10d		/* (NEXT STEP) X[$k_next] */
+	or	%r11d,		%r12d		/* (y & (not z)) | (x & z) */
+	mov	$y,		%r11d		/* (NEXT STEP) z' = $y */
+	add	%r12d,		$dst		/* dst += ... */
+	mov	$y,		%r12d		/* (NEXT STEP) z' = $y */
+	rol	\$$s,		$dst		/* dst <<< s */
+	add	$x,		$dst		/* dst += x */
+EOF
+}
+
+# round3_step() does:
+#   dst = x + ((dst + H(x,y,z) + X[k] + T_i) <<< s)
+#   %r10d = X[k_next]
+#   %r11d = y' (copy of y for the next step)
+# Each round3_step() takes about 4.2 clocks (8 instructions, 1.9 IPC)
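+# Here H(x,y,z) = x ^ y ^ z.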
+sub round3_step
+{
+    my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
+    $code .= " mov	5*4(%rsi),	%r10d		/* (NEXT STEP) X[5] */\n" if ($pos == -1);
+    $code .= " mov	%ecx,		%r11d		/* (NEXT STEP) y' = %ecx */\n" if ($pos == -1);
+    $code .= <<EOF;
+	lea	$T_i($dst,%r10d),$dst		/* Const + dst + ... */
+	mov	$k_next*4(%rsi),%r10d		/* (NEXT STEP) X[$k_next] */
+	xor	$z,		%r11d		/* z ^ ... */
+	xor	$x,		%r11d		/* x ^ ... */
+	add	%r11d,		$dst		/* dst += ... */
+	rol	\$$s,		$dst		/* dst <<< s */
+	mov	$x,		%r11d		/* (NEXT STEP) y' = $x */
+	add	$x,		$dst		/* dst += x */
+EOF
+}
+
+# round4_step() does:
+#   dst = x + ((dst + I(x,y,z) + X[k] + T_i) <<< s)
+#   %r10d = X[k_next]
+#   %r11d = not z' (copy of not z for the next step)
+# Each round4_step() takes about 5.2 clocks (9 instructions, 1.7 IPC)
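+# Here I(x,y,z) = y ^ (x | ~z).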
+sub round4_step
+{
+    my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
+    $code .= " mov	0*4(%rsi),	%r10d		/* (NEXT STEP) X[0] */\n" if ($pos == -1);
+    $code .= " mov	\$0xffffffff,	%r11d\n" if ($pos == -1);
+    $code .= " xor	%edx,		%r11d		/* (NEXT STEP) not z' = not %edx*/\n"
+    if ($pos == -1);
+    $code .= <<EOF;
+	lea	$T_i($dst,%r10d),$dst		/* Const + dst + ... */
+	or	$x,		%r11d		/* x | ... */
+	xor	$y,		%r11d		/* y ^ ... */
+	add	%r11d,		$dst		/* dst += ... */
+	mov	$k_next*4(%rsi),%r10d		/* (NEXT STEP) X[$k_next] */
+	mov	\$0xffffffff,	%r11d
+	rol	\$$s,		$dst		/* dst <<< s */
+	xor	$y,		%r11d		/* (NEXT STEP) not z' = not $y */
+	add	$x,		$dst		/* dst += x */
+EOF
+}
+
+no warnings qw(uninitialized);
+my $flavour = shift;
+my $output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
+
+$code .= <<EOF;
+.text
+.align 16
+
+.globl md5_block_asm_data_order
+.type md5_block_asm_data_order,\@function,3
+md5_block_asm_data_order:
+	push	%rbp
+	push	%rbx
+	push	%r12
+	push	%r14
+	push	%r15
+.Lprologue:
+
+	# rdi = arg #1 (ctx, MD5_CTX pointer)
+	# rsi = arg #2 (ptr, data pointer)
+	# rdx = arg #3 (nbr, number of 16-word blocks to process)
+	mov	%rdi,		%rbp	# rbp = ctx
+	shl	\$6,		%rdx	# rdx = nbr in bytes
+	lea	(%rsi,%rdx),	%rdi	# rdi = end
+	mov	0*4(%rbp),	%eax	# eax = ctx->A
+	mov	1*4(%rbp),	%ebx	# ebx = ctx->B
+	mov	2*4(%rbp),	%ecx	# ecx = ctx->C
+	mov	3*4(%rbp),	%edx	# edx = ctx->D
+	# end is 'rdi'
+	# ptr is 'rsi'
+	# A is 'eax'
+	# B is 'ebx'
+	# C is 'ecx'
+	# D is 'edx'
+
+	cmp	%rdi,		%rsi		# cmp end with ptr
+	je	.Lend				# jmp if ptr == end
+
+	# BEGIN of loop over 16-word blocks
+.Lloop:	# save old values of A, B, C, D
+	mov	%eax,		%r8d
+	mov	%ebx,		%r9d
+	mov	%ecx,		%r14d
+	mov	%edx,		%r15d
+EOF
+round1_step(-1,'%eax','%ebx','%ecx','%edx', '1','0xd76aa478', '7');
+round1_step( 0,'%edx','%eax','%ebx','%ecx', '2','0xe8c7b756','12');
+round1_step( 0,'%ecx','%edx','%eax','%ebx', '3','0x242070db','17');
+round1_step( 0,'%ebx','%ecx','%edx','%eax', '4','0xc1bdceee','22');
+round1_step( 0,'%eax','%ebx','%ecx','%edx', '5','0xf57c0faf', '7');
+round1_step( 0,'%edx','%eax','%ebx','%ecx', '6','0x4787c62a','12');
+round1_step( 0,'%ecx','%edx','%eax','%ebx', '7','0xa8304613','17');
+round1_step( 0,'%ebx','%ecx','%edx','%eax', '8','0xfd469501','22');
+round1_step( 0,'%eax','%ebx','%ecx','%edx', '9','0x698098d8', '7');
+round1_step( 0,'%edx','%eax','%ebx','%ecx','10','0x8b44f7af','12');
+round1_step( 0,'%ecx','%edx','%eax','%ebx','11','0xffff5bb1','17');
+round1_step( 0,'%ebx','%ecx','%edx','%eax','12','0x895cd7be','22');
+round1_step( 0,'%eax','%ebx','%ecx','%edx','13','0x6b901122', '7');
+round1_step( 0,'%edx','%eax','%ebx','%ecx','14','0xfd987193','12');
+round1_step( 0,'%ecx','%edx','%eax','%ebx','15','0xa679438e','17');
+round1_step( 1,'%ebx','%ecx','%edx','%eax', '0','0x49b40821','22');
+
+round2_step(-1,'%eax','%ebx','%ecx','%edx', '6','0xf61e2562', '5');
+round2_step( 0,'%edx','%eax','%ebx','%ecx','11','0xc040b340', '9');
+round2_step( 0,'%ecx','%edx','%eax','%ebx', '0','0x265e5a51','14');
+round2_step( 0,'%ebx','%ecx','%edx','%eax', '5','0xe9b6c7aa','20');
+round2_step( 0,'%eax','%ebx','%ecx','%edx','10','0xd62f105d', '5');
+round2_step( 0,'%edx','%eax','%ebx','%ecx','15', '0x2441453', '9');
+round2_step( 0,'%ecx','%edx','%eax','%ebx', '4','0xd8a1e681','14');
+round2_step( 0,'%ebx','%ecx','%edx','%eax', '9','0xe7d3fbc8','20');
+round2_step( 0,'%eax','%ebx','%ecx','%edx','14','0x21e1cde6', '5');
+round2_step( 0,'%edx','%eax','%ebx','%ecx', '3','0xc33707d6', '9');
+round2_step( 0,'%ecx','%edx','%eax','%ebx', '8','0xf4d50d87','14');
+round2_step( 0,'%ebx','%ecx','%edx','%eax','13','0x455a14ed','20');
+round2_step( 0,'%eax','%ebx','%ecx','%edx', '2','0xa9e3e905', '5');
+round2_step( 0,'%edx','%eax','%ebx','%ecx', '7','0xfcefa3f8', '9');
+round2_step( 0,'%ecx','%edx','%eax','%ebx','12','0x676f02d9','14');
+round2_step( 1,'%ebx','%ecx','%edx','%eax', '0','0x8d2a4c8a','20');
+
+round3_step(-1,'%eax','%ebx','%ecx','%edx', '8','0xfffa3942', '4');
+round3_step( 0,'%edx','%eax','%ebx','%ecx','11','0x8771f681','11');
+round3_step( 0,'%ecx','%edx','%eax','%ebx','14','0x6d9d6122','16');
+round3_step( 0,'%ebx','%ecx','%edx','%eax', '1','0xfde5380c','23');
+round3_step( 0,'%eax','%ebx','%ecx','%edx', '4','0xa4beea44', '4');
+round3_step( 0,'%edx','%eax','%ebx','%ecx', '7','0x4bdecfa9','11');
+round3_step( 0,'%ecx','%edx','%eax','%ebx','10','0xf6bb4b60','16');
+round3_step( 0,'%ebx','%ecx','%edx','%eax','13','0xbebfbc70','23');
+round3_step( 0,'%eax','%ebx','%ecx','%edx', '0','0x289b7ec6', '4');
+round3_step( 0,'%edx','%eax','%ebx','%ecx', '3','0xeaa127fa','11');
+round3_step( 0,'%ecx','%edx','%eax','%ebx', '6','0xd4ef3085','16');
+round3_step( 0,'%ebx','%ecx','%edx','%eax', '9', '0x4881d05','23');
+round3_step( 0,'%eax','%ebx','%ecx','%edx','12','0xd9d4d039', '4');
+round3_step( 0,'%edx','%eax','%ebx','%ecx','15','0xe6db99e5','11');
+round3_step( 0,'%ecx','%edx','%eax','%ebx', '2','0x1fa27cf8','16');
+round3_step( 1,'%ebx','%ecx','%edx','%eax', '0','0xc4ac5665','23');
+
+round4_step(-1,'%eax','%ebx','%ecx','%edx', '7','0xf4292244', '6');
+round4_step( 0,'%edx','%eax','%ebx','%ecx','14','0x432aff97','10');
+round4_step( 0,'%ecx','%edx','%eax','%ebx', '5','0xab9423a7','15');
+round4_step( 0,'%ebx','%ecx','%edx','%eax','12','0xfc93a039','21');
+round4_step( 0,'%eax','%ebx','%ecx','%edx', '3','0x655b59c3', '6');
+round4_step( 0,'%edx','%eax','%ebx','%ecx','10','0x8f0ccc92','10');
+round4_step( 0,'%ecx','%edx','%eax','%ebx', '1','0xffeff47d','15');
+round4_step( 0,'%ebx','%ecx','%edx','%eax', '8','0x85845dd1','21');
+round4_step( 0,'%eax','%ebx','%ecx','%edx','15','0x6fa87e4f', '6');
+round4_step( 0,'%edx','%eax','%ebx','%ecx', '6','0xfe2ce6e0','10');
+round4_step( 0,'%ecx','%edx','%eax','%ebx','13','0xa3014314','15');
+round4_step( 0,'%ebx','%ecx','%edx','%eax', '4','0x4e0811a1','21');
+round4_step( 0,'%eax','%ebx','%ecx','%edx','11','0xf7537e82', '6');
+round4_step( 0,'%edx','%eax','%ebx','%ecx', '2','0xbd3af235','10');
+round4_step( 0,'%ecx','%edx','%eax','%ebx', '9','0x2ad7d2bb','15');
+round4_step( 1,'%ebx','%ecx','%edx','%eax', '0','0xeb86d391','21');
+$code .= <<EOF;
+	# add old values of A, B, C, D
+	add	%r8d,	%eax
+	add	%r9d,	%ebx
+	add	%r14d,	%ecx
+	add	%r15d,	%edx
+
+	# loop control
+	add	\$64,		%rsi		# ptr += 64
+	cmp	%rdi,		%rsi		# cmp end with ptr
+	jb	.Lloop				# jmp if ptr < end
+	# END of loop over 16-word blocks
+
+.Lend:
+	mov	%eax,		0*4(%rbp)	# ctx->A = A
+	mov	%ebx,		1*4(%rbp)	# ctx->B = B
+	mov	%ecx,		2*4(%rbp)	# ctx->C = C
+	mov	%edx,		3*4(%rbp)	# ctx->D = D
+
+	mov	(%rsp),%r15
+	mov	8(%rsp),%r14
+	mov	16(%rsp),%r12
+	mov	24(%rsp),%rbx
+	mov	32(%rsp),%rbp
+	add	\$40,%rsp
+.Lepilogue:
+	ret
+.size md5_block_asm_data_order,.-md5_block_asm_data_order
+EOF
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+my $rec="%rcx";
+my $frame="%rdx";
+my $context="%r8";
+my $disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	se_handler,\@abi-omnipotent
+.align	16
+se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	lea	.Lprologue(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<.Lprologue
+	jb	.Lin_prologue
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	lea	.Lepilogue(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
+	jae	.Lin_prologue
+
+	lea	40(%rax),%rax
+
+	mov	-8(%rax),%rbp
+	mov	-16(%rax),%rbx
+	mov	-24(%rax),%r12
+	mov	-32(%rax),%r14
+	mov	-40(%rax),%r15
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+.Lin_prologue:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)/8, quadwords for movsq
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	se_handler,.-se_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_md5_block_asm_data_order
+	.rva	.LSEH_end_md5_block_asm_data_order
+	.rva	.LSEH_info_md5_block_asm_data_order
+
+.section	.xdata
+.align	8
+.LSEH_info_md5_block_asm_data_order:
+	.byte	9,0,0,0
+	.rva	se_handler
+___
+}
+
+print $code;
+
+close STDOUT;
diff --git a/crypto/fipsmodule/md5/md5.c b/crypto/fipsmodule/md5/md5.c
new file mode 100644
index 0000000..15a0f53
--- /dev/null
+++ b/crypto/fipsmodule/md5/md5.c
@@ -0,0 +1,299 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.] */
+
+#include <openssl/md5.h>
+
+#include <string.h>
+
+#include <openssl/mem.h>
+
+#include "../../internal.h"
+
+
+uint8_t *MD5(const uint8_t *data, size_t len, uint8_t *out) {
+  MD5_CTX ctx;
+  MD5_Init(&ctx);
+  MD5_Update(&ctx, data, len);
+  MD5_Final(out, &ctx);
+
+  return out;
+}
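+
+/* Example (illustrative only, not part of this file): one-shot hashing.
+ *
+ *   uint8_t digest[MD5_DIGEST_LENGTH];
+ *   MD5((const uint8_t *)"abc", 3, digest);
+ *
+ * digest now holds the 16-byte MD5 of "abc". */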
+
+int MD5_Init(MD5_CTX *md5) {
+  OPENSSL_memset(md5, 0, sizeof(MD5_CTX));
+  md5->h[0] = 0x67452301UL;
+  md5->h[1] = 0xefcdab89UL;
+  md5->h[2] = 0x98badcfeUL;
+  md5->h[3] = 0x10325476UL;
+  return 1;
+}
+
+#if !defined(OPENSSL_NO_ASM) && \
+    (defined(OPENSSL_X86_64) || defined(OPENSSL_X86))
+#define MD5_ASM
+#define md5_block_data_order md5_block_asm_data_order
+#endif
+
+
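+/* md5_block_data_order hashes |num| 64-byte blocks from |data| into
+ * |state|. */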
+void md5_block_data_order(uint32_t *state, const uint8_t *data, size_t num);
+
+#define DATA_ORDER_IS_LITTLE_ENDIAN
+
+#define HASH_CTX MD5_CTX
+#define HASH_CBLOCK 64
+#define HASH_UPDATE MD5_Update
+#define HASH_TRANSFORM MD5_Transform
+#define HASH_FINAL MD5_Final
+#define HASH_MAKE_STRING(c, s) \
+  do {                         \
+    uint32_t ll;               \
+    ll = (c)->h[0];            \
+    HOST_l2c(ll, (s));         \
+    ll = (c)->h[1];            \
+    HOST_l2c(ll, (s));         \
+    ll = (c)->h[2];            \
+    HOST_l2c(ll, (s));         \
+    ll = (c)->h[3];            \
+    HOST_l2c(ll, (s));         \
+  } while (0)
+#define HASH_BLOCK_DATA_ORDER md5_block_data_order
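+
+/* md32_common.h expands the HASH_* macros above into the generic
+ * Merkle-Damgard boilerplate, providing MD5_Update, MD5_Transform and
+ * MD5_Final on top of md5_block_data_order. */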
+
+#include "../digest/md32_common.h"
+
+/* As pointed out by Wei Dai <weidai@eskimo.com>, the canonical MD5
+ * boolean functions can be simplified to the code below.  Wei attributes
+ * these optimizations to Peter Gutmann's SHS code, and he in turn
+ * attributes them to Rich Schroeppel.
+ */
+#define F(b, c, d) ((((c) ^ (d)) & (b)) ^ (d))
+#define G(b, c, d) ((((b) ^ (c)) & (d)) ^ (c))
+#define H(b, c, d) ((b) ^ (c) ^ (d))
+#define I(b, c, d) (((~(d)) | (b)) ^ (c))
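+
+/* For reference, these compute the same values as the canonical RFC 1321
+ * forms:
+ *
+ *   F(b,c,d) = (b & c) | (~b & d)
+ *   G(b,c,d) = (b & d) | (c & ~d)
+ *   H(b,c,d) = b ^ c ^ d
+ *   I(b,c,d) = c ^ (b | ~d)
+ */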
+
+#define ROTATE(a, n) (((a) << (n)) | ((a) >> (32 - (n))))
+
+#define R0(a, b, c, d, k, s, t)            \
+  do {                                     \
+    (a) += ((k) + (t) + F((b), (c), (d))); \
+    (a) = ROTATE(a, s);                    \
+    (a) += (b);                            \
+  } while (0)
+
+#define R1(a, b, c, d, k, s, t)            \
+  do {                                     \
+    (a) += ((k) + (t) + G((b), (c), (d))); \
+    (a) = ROTATE(a, s);                    \
+    (a) += (b);                            \
+  } while (0)
+
+#define R2(a, b, c, d, k, s, t)            \
+  do {                                     \
+    (a) += ((k) + (t) + H((b), (c), (d))); \
+    (a) = ROTATE(a, s);                    \
+    (a) += (b);                            \
+  } while (0)
+
+#define R3(a, b, c, d, k, s, t)            \
+  do {                                     \
+    (a) += ((k) + (t) + I((b), (c), (d))); \
+    (a) = ROTATE(a, s);                    \
+    (a) += (b);                            \
+  } while (0)
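+
+/* Each Rn(a, b, c, d, k, s, t) above computes one MD5 step,
+ *
+ *   a = b + ROTATE(a + f(b,c,d) + k + t, s)
+ *
+ * where k is a message word, t is the round constant, and f is F, G, H
+ * or I for rounds 0 through 3 respectively. */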
+
+#ifndef md5_block_data_order
+#ifdef X
+#undef X
+#endif
+void md5_block_data_order(uint32_t *state, const uint8_t *data, size_t num) {
+  uint32_t A, B, C, D, l;
+  uint32_t XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7, XX8, XX9, XX10, XX11, XX12,
+      XX13, XX14, XX15;
+#define X(i) XX##i
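+
+  /* Sixteen distinct locals, rather than an array, presumably leave the
+   * compiler free to keep the message schedule in registers. */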
+
+  A = state[0];
+  B = state[1];
+  C = state[2];
+  D = state[3];
+
+  for (; num--;) {
+    HOST_c2l(data, l);
+    X(0) = l;
+    HOST_c2l(data, l);
+    X(1) = l;
+    /* Round 0 */
+    R0(A, B, C, D, X(0), 7, 0xd76aa478L);
+    HOST_c2l(data, l);
+    X(2) = l;
+    R0(D, A, B, C, X(1), 12, 0xe8c7b756L);
+    HOST_c2l(data, l);
+    X(3) = l;
+    R0(C, D, A, B, X(2), 17, 0x242070dbL);
+    HOST_c2l(data, l);
+    X(4) = l;
+    R0(B, C, D, A, X(3), 22, 0xc1bdceeeL);
+    HOST_c2l(data, l);
+    X(5) = l;
+    R0(A, B, C, D, X(4), 7, 0xf57c0fafL);
+    HOST_c2l(data, l);
+    X(6) = l;
+    R0(D, A, B, C, X(5), 12, 0x4787c62aL);
+    HOST_c2l(data, l);
+    X(7) = l;
+    R0(C, D, A, B, X(6), 17, 0xa8304613L);
+    HOST_c2l(data, l);
+    X(8) = l;
+    R0(B, C, D, A, X(7), 22, 0xfd469501L);
+    HOST_c2l(data, l);
+    X(9) = l;
+    R0(A, B, C, D, X(8), 7, 0x698098d8L);
+    HOST_c2l(data, l);
+    X(10) = l;
+    R0(D, A, B, C, X(9), 12, 0x8b44f7afL);
+    HOST_c2l(data, l);
+    X(11) = l;
+    R0(C, D, A, B, X(10), 17, 0xffff5bb1L);
+    HOST_c2l(data, l);
+    X(12) = l;
+    R0(B, C, D, A, X(11), 22, 0x895cd7beL);
+    HOST_c2l(data, l);
+    X(13) = l;
+    R0(A, B, C, D, X(12), 7, 0x6b901122L);
+    HOST_c2l(data, l);
+    X(14) = l;
+    R0(D, A, B, C, X(13), 12, 0xfd987193L);
+    HOST_c2l(data, l);
+    X(15) = l;
+    R0(C, D, A, B, X(14), 17, 0xa679438eL);
+    R0(B, C, D, A, X(15), 22, 0x49b40821L);
+    /* Round 1 */
+    R1(A, B, C, D, X(1), 5, 0xf61e2562L);
+    R1(D, A, B, C, X(6), 9, 0xc040b340L);
+    R1(C, D, A, B, X(11), 14, 0x265e5a51L);
+    R1(B, C, D, A, X(0), 20, 0xe9b6c7aaL);
+    R1(A, B, C, D, X(5), 5, 0xd62f105dL);
+    R1(D, A, B, C, X(10), 9, 0x02441453L);
+    R1(C, D, A, B, X(15), 14, 0xd8a1e681L);
+    R1(B, C, D, A, X(4), 20, 0xe7d3fbc8L);
+    R1(A, B, C, D, X(9), 5, 0x21e1cde6L);
+    R1(D, A, B, C, X(14), 9, 0xc33707d6L);
+    R1(C, D, A, B, X(3), 14, 0xf4d50d87L);
+    R1(B, C, D, A, X(8), 20, 0x455a14edL);
+    R1(A, B, C, D, X(13), 5, 0xa9e3e905L);
+    R1(D, A, B, C, X(2), 9, 0xfcefa3f8L);
+    R1(C, D, A, B, X(7), 14, 0x676f02d9L);
+    R1(B, C, D, A, X(12), 20, 0x8d2a4c8aL);
+    /* Round 2 */
+    R2(A, B, C, D, X(5), 4, 0xfffa3942L);
+    R2(D, A, B, C, X(8), 11, 0x8771f681L);
+    R2(C, D, A, B, X(11), 16, 0x6d9d6122L);
+    R2(B, C, D, A, X(14), 23, 0xfde5380cL);
+    R2(A, B, C, D, X(1), 4, 0xa4beea44L);
+    R2(D, A, B, C, X(4), 11, 0x4bdecfa9L);
+    R2(C, D, A, B, X(7), 16, 0xf6bb4b60L);
+    R2(B, C, D, A, X(10), 23, 0xbebfbc70L);
+    R2(A, B, C, D, X(13), 4, 0x289b7ec6L);
+    R2(D, A, B, C, X(0), 11, 0xeaa127faL);
+    R2(C, D, A, B, X(3), 16, 0xd4ef3085L);
+    R2(B, C, D, A, X(6), 23, 0x04881d05L);
+    R2(A, B, C, D, X(9), 4, 0xd9d4d039L);
+    R2(D, A, B, C, X(12), 11, 0xe6db99e5L);
+    R2(C, D, A, B, X(15), 16, 0x1fa27cf8L);
+    R2(B, C, D, A, X(2), 23, 0xc4ac5665L);
+    /* Round 3 */
+    R3(A, B, C, D, X(0), 6, 0xf4292244L);
+    R3(D, A, B, C, X(7), 10, 0x432aff97L);
+    R3(C, D, A, B, X(14), 15, 0xab9423a7L);
+    R3(B, C, D, A, X(5), 21, 0xfc93a039L);
+    R3(A, B, C, D, X(12), 6, 0x655b59c3L);
+    R3(D, A, B, C, X(3), 10, 0x8f0ccc92L);
+    R3(C, D, A, B, X(10), 15, 0xffeff47dL);
+    R3(B, C, D, A, X(1), 21, 0x85845dd1L);
+    R3(A, B, C, D, X(8), 6, 0x6fa87e4fL);
+    R3(D, A, B, C, X(15), 10, 0xfe2ce6e0L);
+    R3(C, D, A, B, X(6), 15, 0xa3014314L);
+    R3(B, C, D, A, X(13), 21, 0x4e0811a1L);
+    R3(A, B, C, D, X(4), 6, 0xf7537e82L);
+    R3(D, A, B, C, X(11), 10, 0xbd3af235L);
+    R3(C, D, A, B, X(2), 15, 0x2ad7d2bbL);
+    R3(B, C, D, A, X(9), 21, 0xeb86d391L);
+
+    A = state[0] += A;
+    B = state[1] += B;
+    C = state[2] += C;
+    D = state[3] += D;
+  }
+}
+#undef X
+#endif
+
+#undef DATA_ORDER_IS_LITTLE_ENDIAN
+#undef HASH_CTX
+#undef HASH_CBLOCK
+#undef HASH_UPDATE
+#undef HASH_TRANSFORM
+#undef HASH_FINAL
+#undef HASH_MAKE_STRING
+#undef HASH_BLOCK_DATA_ORDER
+#undef F
+#undef G
+#undef H
+#undef I
+#undef ROTATE
+#undef R0
+#undef R1
+#undef R2
+#undef R3
+#undef HOST_c2l
+#undef HOST_l2c
diff --git a/crypto/fipsmodule/sha/asm/sha1-586.pl b/crypto/fipsmodule/sha/asm/sha1-586.pl
new file mode 100644
index 0000000..4221e27
--- /dev/null
+++ b/crypto/fipsmodule/sha/asm/sha1-586.pl
@@ -0,0 +1,1480 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# [Re]written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# "[Re]written" was achieved in two major overhauls. In 2004 BODY_*
+# functions were re-implemented to address P4 performance issue [see
+# commentary below], and in 2006 the rest was rewritten in order to
+# gain freedom to liberate licensing terms.
+
+# January, September 2004.
+#
+# It was noted that the Intel IA-32 C compiler generates code which
+# performs ~30% *faster* on a P4 CPU than the original *hand-coded*
+# SHA1 assembler implementation. To address this problem (and
+# prove that humans are still better than machines:-), the
+# original code was overhauled, which resulted in the following
+# performance changes:
+#
+#		compared with original	compared with Intel cc
+#		assembler impl.		generated code
+# Pentium	-16%			+48%
+# PIII/AMD	+8%			+16%
+# P4		+85%(!)			+45%
+#
+# As you can see Pentium came out as the loser:-( Yet I reckoned that
+# the improvement on P4 outweighs the loss, and incorporated this
+# re-tuned code into 0.9.7 and later.
+# ----------------------------------------------------------------
+#					<appro@fy.chalmers.se>
+
+# August 2009.
+#
+# George Spelvin pointed out that F_40_59(b,c,d) can be rewritten as
+# '(c&d) + (b&(c^d))', which makes it possible to accumulate partial
+# results and lighten the "pressure" on scratch registers. This resulted
+# in a >12% performance improvement on contemporary AMD cores (with no
+# degradation on other CPUs:-). Also, the code was revised to maximize
+# the "distance" between instructions producing input to the 'lea'
+# instruction and the 'lea' instruction itself, which is essential for
+# the Intel Atom core and resulted in a ~15% improvement.
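+#
+# (The two terms (c&d) and (b&(c^d)) are bitwise disjoint, so the sum
+# never carries and '+' computes the same value as '|'.)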
+
+# October 2010.
+#
+# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
+# is to offload the message schedule, denoted by Wt in the NIST
+# specification, or Xupdate in the OpenSSL source, to the SIMD unit.
+# The idea is not novel, and in the SSE2 context it was first explored
+# by Dean Gaudet in 2004, see http://arctic.org/~dean/crypto/sha1.html.
+# Since then several things have changed that made it interesting again:
+#
+# a) XMM units became faster and wider;
+# b) the instruction set became more versatile;
+# c) an important observation was made by Max Locktyukhin, which made
+#    it possible to reduce the number of instructions required to
+#    perform the operation in question; for further details see
+#    http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/.
+
+# April 2011.
+#
+# Add AVX code path, probably most controversial... The thing is that
+# the switch to AVX alone improves performance by as little as 4% in
+# comparison to the SSSE3 code path. But the result below doesn't look
+# like a mere 4% improvement... The trouble is that Sandy Bridge decodes
+# 'ro[rl]' as a pair of µ-ops, and it's the additional µ-ops, two per
+# round, that make it run slower than Core2 and Westmere. But 'sh[rl]d'
+# is decoded as a single µ-op by Sandy Bridge, and it's replacing
+# 'ro[rl]' with the equivalent 'sh[rl]d' that is responsible for the
+# impressive 5.1 cycles per processed byte. But 'sh[rl]d' is not
+# something that used to be fast, nor does it appear to be fast in the
+# upcoming Bulldozer [according to its optimization manual]. Which is
+# why the AVX code path is guarded by *both* the AVX bit and a synthetic
+# bit denoting Intel CPUs. One can argue that it's unfair to AMD, but
+# without 'sh[rl]d' it makes no sense to keep the AVX code path. If
+# somebody feels strongly about it, it's probably more appropriate to
+# discuss the possibility of using the XOP vector rotate on AMD...
+
+# March 2014.
+#
+# Add support for Intel SHA Extensions.
+
+######################################################################
+# Current performance is summarized in following table. Numbers are
+# CPU clock cycles spent to process single byte (less is better).
+#
+#		x86		SSSE3		AVX
+# Pentium	15.7		-
+# PIII		11.5		-
+# P4		10.6		-
+# AMD K8	7.1		-
+# Core2		7.3		6.0/+22%	-
+# Westmere	7.3		5.5/+33%	-
+# Sandy Bridge	8.8		6.2/+40%	5.1(**)/+73%
+# Ivy Bridge	7.2		4.8/+51%	4.7(**)/+53%
+# Haswell	6.5		4.3/+51%	4.1(**)/+58%
+# Skylake	6.4		4.1/+55%	4.1(**)/+55%
+# Bulldozer	11.6		6.0/+92%
+# VIA Nano	10.6		7.5/+41%
+# Atom		12.5		9.3(*)/+35%
+# Silvermont	14.5		9.9(*)/+46%
+# Goldmont	8.8		6.7/+30%	1.7(***)/+415%
+#
+# (*)	Loop is 1056 instructions long and the expected result is ~8.25.
+#	The discrepancy is because of front-end limitations, so-called
+#	MS-ROM penalties, and on Silvermont even the rotate instruction's
+#	limited parallelism.
+#
+# (**)	As per above comment, the result is for AVX *plus* sh[rl]d.
+#
+# (***)	SHAEXT result
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../../perlasm");
+require "x86asm.pl";
+
+$output=pop;
+open STDOUT,">$output";
+
+&asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386");
+
+$xmm=$ymm=0;
+for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+# In upstream, this is controlled by shelling out to the compiler to check
+# versions, but BoringSSL is intended to be used with pre-generated perlasm
+# output, so this isn't useful anyway.
+$ymm = 1;
+
+$ymm = 0 unless ($xmm);
+
+$shaext=$xmm;	### set to zero if compiling for 1.0.1
+
+# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
+# been tested.
+$shaext = 0;
+
+&external_label("OPENSSL_ia32cap_P") if ($xmm);
+
+
+$A="eax";
+$B="ebx";
+$C="ecx";
+$D="edx";
+$E="edi";
+$T="esi";
+$tmp1="ebp";
+
+@V=($A,$B,$C,$D,$E,$T);
+
+$alt=0;	# 1 denotes alternative IALU implementation, which performs
+	# 8% *worse* on P4, same on Westmere and Atom, 2% better on
+	# Sandy Bridge...
+
+sub BODY_00_15
+	{
+	local($n,$a,$b,$c,$d,$e,$f)=@_;
+
+	&comment("00_15 $n");
+
+	&mov($f,$c);			# f to hold F_00_19(b,c,d)
+	 if ($n==0)  { &mov($tmp1,$a); }
+	 else        { &mov($a,$tmp1); }
+	&rotl($tmp1,5);			# tmp1=ROTATE(a,5)
+	 &xor($f,$d);
+	&add($tmp1,$e);			# tmp1+=e;
+	 &mov($e,&swtmp($n%16));	# e becomes volatile and is loaded
+	 				# with xi, also note that e becomes
+					# f in next round...
+	&and($f,$b);
+	&rotr($b,2);			# b=ROTATE(b,30)
+	 &xor($f,$d);			# f holds F_00_19(b,c,d)
+	&lea($tmp1,&DWP(0x5a827999,$tmp1,$e));	# tmp1+=K_00_19+xi
+
+	if ($n==15) { &mov($e,&swtmp(($n+1)%16));# pre-fetch f for next round
+		      &add($f,$tmp1); }	# f+=tmp1
+	else        { &add($tmp1,$f); }	# f becomes a in next round
+	&mov($tmp1,$a)			if ($alt && $n==15);
+	}
+
+sub BODY_16_19
+	{
+	local($n,$a,$b,$c,$d,$e,$f)=@_;
+
+	&comment("16_19 $n");
+
+if ($alt) {
+	&xor($c,$d);
+	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
+	&and($tmp1,$c);			# tmp1 to hold F_00_19(b,c,d), b&=c^d
+	 &xor($f,&swtmp(($n+8)%16));
+	&xor($tmp1,$d);			# tmp1=F_00_19(b,c,d)
+	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
+	&rotl($f,1);			# f=ROTATE(f,1)
+	 &add($e,$tmp1);		# e+=F_00_19(b,c,d)
+	&xor($c,$d);			# restore $c
+	 &mov($tmp1,$a);		# b in next round
+	&rotr($b,$n==16?2:7);		# b=ROTATE(b,30)
+	 &mov(&swtmp($n%16),$f);	# xi=f
+	&rotl($a,5);			# ROTATE(a,5)
+	 &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
+	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
+	 &add($f,$a);			# f+=ROTATE(a,5)
+} else {
+	&mov($tmp1,$c);			# tmp1 to hold F_00_19(b,c,d)
+	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
+	&xor($tmp1,$d);
+	 &xor($f,&swtmp(($n+8)%16));
+	&and($tmp1,$b);
+	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
+	&rotl($f,1);			# f=ROTATE(f,1)
+	 &xor($tmp1,$d);		# tmp1=F_00_19(b,c,d)
+	&add($e,$tmp1);			# e+=F_00_19(b,c,d)
+	 &mov($tmp1,$a);
+	&rotr($b,2);			# b=ROTATE(b,30)
+	 &mov(&swtmp($n%16),$f);	# xi=f
+	&rotl($tmp1,5);			# ROTATE(a,5)
+	 &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
+	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
+	 &add($f,$tmp1);		# f+=ROTATE(a,5)
+}
+	}
+
+sub BODY_20_39
+	{
+	local($n,$a,$b,$c,$d,$e,$f)=@_;
+	local $K=($n<40)?0x6ed9eba1:0xca62c1d6;
+
+	&comment("20_39 $n");
+
+if ($alt) {
+	&xor($tmp1,$c);			# tmp1 to hold F_20_39(b,c,d), b^=c
+	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
+	&xor($tmp1,$d);			# tmp1 holds F_20_39(b,c,d)
+	 &xor($f,&swtmp(($n+8)%16));
+	&add($e,$tmp1);			# e+=F_20_39(b,c,d)
+	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
+	&rotl($f,1);			# f=ROTATE(f,1)
+	 &mov($tmp1,$a);		# b in next round
+	&rotr($b,7);			# b=ROTATE(b,30)
+	 &mov(&swtmp($n%16),$f)		if($n<77);# xi=f
+	&rotl($a,5);			# ROTATE(a,5)
+	 &xor($b,$c)			if($n==39);# warm up for BODY_40_59
+	&and($tmp1,$b)			if($n==39);
+	 &lea($f,&DWP($K,$f,$e));	# f+=e+K_XX_YY
+	&mov($e,&swtmp(($n+1)%16))	if($n<79);# pre-fetch f for next round
+	 &add($f,$a);			# f+=ROTATE(a,5)
+	&rotr($a,5)			if ($n==79);
+} else {
+	&mov($tmp1,$b);			# tmp1 to hold F_20_39(b,c,d)
+	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
+	&xor($tmp1,$c);
+	 &xor($f,&swtmp(($n+8)%16));
+	&xor($tmp1,$d);			# tmp1 holds F_20_39(b,c,d)
+	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
+	&rotl($f,1);			# f=ROTATE(f,1)
+	 &add($e,$tmp1);		# e+=F_20_39(b,c,d)
+	&rotr($b,2);			# b=ROTATE(b,30)
+	 &mov($tmp1,$a);
+	&rotl($tmp1,5);			# ROTATE(a,5)
+	 &mov(&swtmp($n%16),$f) if($n<77);# xi=f
+	&lea($f,&DWP($K,$f,$e));	# f+=e+K_XX_YY
+	 &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round
+	&add($f,$tmp1);			# f+=ROTATE(a,5)
+}
+	}
+
+sub BODY_40_59
+	{
+	local($n,$a,$b,$c,$d,$e,$f)=@_;
+
+	&comment("40_59 $n");
+
+if ($alt) {
+	&add($e,$tmp1);			# e+=b&(c^d)
+	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
+	&mov($tmp1,$d);
+	 &xor($f,&swtmp(($n+8)%16));
+	&xor($c,$d);			# restore $c
+	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
+	&rotl($f,1);			# f=ROTATE(f,1)
+	 &and($tmp1,$c);
+	&rotr($b,7);			# b=ROTATE(b,30)
+	 &add($e,$tmp1);		# e+=c&d
+	&mov($tmp1,$a);			# b in next round
+	 &mov(&swtmp($n%16),$f);	# xi=f
+	&rotl($a,5);			# ROTATE(a,5)
+	 &xor($b,$c)			if ($n<59);
+	&and($tmp1,$b)			if ($n<59);# tmp1 to hold F_40_59(b,c,d)
+	 &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e+(b&(c^d))
+	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
+	 &add($f,$a);			# f+=ROTATE(a,5)
+} else {
+	&mov($tmp1,$c);			# tmp1 to hold F_40_59(b,c,d)
+	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
+	&xor($tmp1,$d);
+	 &xor($f,&swtmp(($n+8)%16));
+	&and($tmp1,$b);
+	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
+	&rotl($f,1);			# f=ROTATE(f,1)
+	 &add($tmp1,$e);		# b&(c^d)+=e
+	&rotr($b,2);			# b=ROTATE(b,30)
+	 &mov($e,$a);			# e becomes volatile
+	&rotl($e,5);			# ROTATE(a,5)
+	 &mov(&swtmp($n%16),$f);	# xi=f
+	&lea($f,&DWP(0x8f1bbcdc,$f,$tmp1));# f+=K_40_59+e+(b&(c^d))
+	 &mov($tmp1,$c);
+	&add($f,$e);			# f+=ROTATE(a,5)
+	 &and($tmp1,$d);
+	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
+	 &add($f,$tmp1);		# f+=c&d
+}
+	}
+
+&function_begin("sha1_block_data_order");
+if ($xmm) {
+  &static_label("shaext_shortcut")	if ($shaext);
+  &static_label("ssse3_shortcut");
+  &static_label("avx_shortcut")		if ($ymm);
+  &static_label("K_XX_XX");
+
+	&call	(&label("pic_point"));	# make it PIC!
+  &set_label("pic_point");
+	&blindpop($tmp1);
+	&picmeup($T,"OPENSSL_ia32cap_P",$tmp1,&label("pic_point"));
+	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
+
+	&mov	($A,&DWP(0,$T));
+	&mov	($D,&DWP(4,$T));
+	&test	($D,1<<9);		# check SSSE3 bit
+	&jz	(&label("x86"));
+	&mov	($C,&DWP(8,$T));
+	&test	($A,1<<24);		# check FXSR bit
+	&jz	(&label("x86"));
+	if ($shaext) {
+		&test	($C,1<<29);		# check SHA bit
+		&jnz	(&label("shaext_shortcut"));
+	}
+	if ($ymm) {
+		&and	($D,1<<28);		# mask AVX bit
+		&and	($A,1<<30);		# mask "Intel CPU" bit
+		&or	($A,$D);
+		&cmp	($A,1<<28|1<<30);
+		&je	(&label("avx_shortcut"));
+	}
+	&jmp	(&label("ssse3_shortcut"));
+  &set_label("x86",16);
+}
+	&mov($tmp1,&wparam(0));	# SHA_CTX *c
+	&mov($T,&wparam(1));	# const void *input
+	&mov($A,&wparam(2));	# size_t num
+	&stack_push(16+3);	# allocate X[16]
+	&shl($A,6);
+	&add($A,$T);
+	&mov(&wparam(2),$A);	# pointer beyond the end of input
+	&mov($E,&DWP(16,$tmp1));# pre-load E
+	&jmp(&label("loop"));
+
+&set_label("loop",16);
+
+	# copy input chunk to X, but reversing byte order!
+	for ($i=0; $i<16; $i+=4)
+		{
+		&mov($A,&DWP(4*($i+0),$T));
+		&mov($B,&DWP(4*($i+1),$T));
+		&mov($C,&DWP(4*($i+2),$T));
+		&mov($D,&DWP(4*($i+3),$T));
+		&bswap($A);
+		&bswap($B);
+		&bswap($C);
+		&bswap($D);
+		&mov(&swtmp($i+0),$A);
+		&mov(&swtmp($i+1),$B);
+		&mov(&swtmp($i+2),$C);
+		&mov(&swtmp($i+3),$D);
+		}
+	&mov(&wparam(1),$T);	# redundant in 1st spin
+
+	&mov($A,&DWP(0,$tmp1));	# load SHA_CTX
+	&mov($B,&DWP(4,$tmp1));
+	&mov($C,&DWP(8,$tmp1));
+	&mov($D,&DWP(12,$tmp1));
+	# E is pre-loaded
+
+	for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
+	for(;$i<20;$i++)	{ &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
+	for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+	for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+	for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+
+	(($V[5] eq $D) and ($V[0] eq $E)) or die;	# double-check
+
+	&mov($tmp1,&wparam(0));	# re-load SHA_CTX*
+	&mov($D,&wparam(1));	# D is last "T" and is discarded
+
+	&add($E,&DWP(0,$tmp1));	# E is last "A"...
+	&add($T,&DWP(4,$tmp1));
+	&add($A,&DWP(8,$tmp1));
+	&add($B,&DWP(12,$tmp1));
+	&add($C,&DWP(16,$tmp1));
+
+	&mov(&DWP(0,$tmp1),$E);	# update SHA_CTX
+	 &add($D,64);		# advance input pointer
+	&mov(&DWP(4,$tmp1),$T);
+	 &cmp($D,&wparam(2));	# have we reached the end yet?
+	&mov(&DWP(8,$tmp1),$A);
+	 &mov($E,$C);		# C is last "E" which needs to be "pre-loaded"
+	&mov(&DWP(12,$tmp1),$B);
+	 &mov($T,$D);		# input pointer
+	&mov(&DWP(16,$tmp1),$C);
+	&jb(&label("loop"));
+
+	&stack_pop(16+3);
+&function_end("sha1_block_data_order");
+
+if ($xmm) {
+if ($shaext) {
+######################################################################
+# Intel SHA Extensions implementation of SHA1 update function.
+#
+my ($ctx,$inp,$num)=("edi","esi","ecx");
+my ($ABCD,$E,$E_,$BSWAP)=map("xmm$_",(0..3));
+my @MSG=map("xmm$_",(4..7));
+
+sub sha1rnds4 {
+ my ($dst,$src,$imm)=@_;
+    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+    {	&data_byte(0x0f,0x3a,0xcc,0xc0|($1<<3)|$2,$imm);	}
+}
+sub sha1op38 {
+ my ($opcodelet,$dst,$src)=@_;
+    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+    {	&data_byte(0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);	}
+}
+sub sha1nexte	{ sha1op38(0xc8,@_); }
+sub sha1msg1	{ sha1op38(0xc9,@_); }
+sub sha1msg2	{ sha1op38(0xca,@_); }
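+
+# The helpers above emit the SHA instructions as raw opcode bytes via
+# data_byte, so the generated file assembles even with assemblers that
+# predate the sha1rnds4/sha1nexte/sha1msg[12] mnemonics.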
+
+&function_begin("_sha1_block_data_order_shaext");
+	&call	(&label("pic_point"));	# make it PIC!
+	&set_label("pic_point");
+	&blindpop($tmp1);
+	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
+&set_label("shaext_shortcut");
+	&mov	($ctx,&wparam(0));
+	&mov	("ebx","esp");
+	&mov	($inp,&wparam(1));
+	&mov	($num,&wparam(2));
+	&sub	("esp",32);
+
+	&movdqu	($ABCD,&QWP(0,$ctx));
+	&movd	($E,&DWP(16,$ctx));
+	&and	("esp",-32);
+	&movdqa	($BSWAP,&QWP(0x50,$tmp1));	# byte-n-word swap
+
+	&movdqu	(@MSG[0],&QWP(0,$inp));
+	&pshufd	($ABCD,$ABCD,0b00011011);	# flip word order
+	&movdqu	(@MSG[1],&QWP(0x10,$inp));
+	&pshufd	($E,$E,0b00011011);		# flip word order
+	&movdqu	(@MSG[2],&QWP(0x20,$inp));
+	&pshufb	(@MSG[0],$BSWAP);
+	&movdqu	(@MSG[3],&QWP(0x30,$inp));
+	&pshufb	(@MSG[1],$BSWAP);
+	&pshufb	(@MSG[2],$BSWAP);
+	&pshufb	(@MSG[3],$BSWAP);
+	&jmp	(&label("loop_shaext"));
+
+&set_label("loop_shaext",16);
+	&dec		($num);
+	&lea		("eax",&DWP(0x40,$inp));
+	&movdqa		(&QWP(0,"esp"),$E);	# offload $E
+	&paddd		($E,@MSG[0]);
+	&cmovne		($inp,"eax");
+	&movdqa		(&QWP(16,"esp"),$ABCD);	# offload $ABCD
+
+for($i=0;$i<20-4;$i+=2) {
+	&sha1msg1	(@MSG[0],@MSG[1]);
+	&movdqa		($E_,$ABCD);
+	&sha1rnds4	($ABCD,$E,int($i/5));	# 0-3...
+	&sha1nexte	($E_,@MSG[1]);
+	&pxor		(@MSG[0],@MSG[2]);
+	&sha1msg1	(@MSG[1],@MSG[2]);
+	&sha1msg2	(@MSG[0],@MSG[3]);
+
+	&movdqa		($E,$ABCD);
+	&sha1rnds4	($ABCD,$E_,int(($i+1)/5));
+	&sha1nexte	($E,@MSG[2]);
+	&pxor		(@MSG[1],@MSG[3]);
+	&sha1msg2	(@MSG[1],@MSG[0]);
+
+	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
+}
+	&movdqu		(@MSG[0],&QWP(0,$inp));
+	&movdqa		($E_,$ABCD);
+	&sha1rnds4	($ABCD,$E,3);		# 64-67
+	&sha1nexte	($E_,@MSG[1]);
+	&movdqu		(@MSG[1],&QWP(0x10,$inp));
+	&pshufb		(@MSG[0],$BSWAP);
+
+	&movdqa		($E,$ABCD);
+	&sha1rnds4	($ABCD,$E_,3);		# 68-71
+	&sha1nexte	($E,@MSG[2]);
+	&movdqu		(@MSG[2],&QWP(0x20,$inp));
+	&pshufb		(@MSG[1],$BSWAP);
+
+	&movdqa		($E_,$ABCD);
+	&sha1rnds4	($ABCD,$E,3);		# 72-75
+	&sha1nexte	($E_,@MSG[3]);
+	&movdqu		(@MSG[3],&QWP(0x30,$inp));
+	&pshufb		(@MSG[2],$BSWAP);
+
+	&movdqa		($E,$ABCD);
+	&sha1rnds4	($ABCD,$E_,3);		# 76-79
+	&movdqa		($E_,&QWP(0,"esp"));
+	&pshufb		(@MSG[3],$BSWAP);
+	&sha1nexte	($E,$E_);
+	&paddd		($ABCD,&QWP(16,"esp"));
+
+	&jnz		(&label("loop_shaext"));
+
+	&pshufd	($ABCD,$ABCD,0b00011011);
+	&pshufd	($E,$E,0b00011011);
+	&movdqu	(&QWP(0,$ctx),$ABCD);
+	&movd	(&DWP(16,$ctx),$E);
+	&mov	("esp","ebx");
+&function_end("_sha1_block_data_order_shaext");
+}
+######################################################################
+# The SSSE3 implementation.
+#
+# %xmm[0-7] are used as a ring @X[] buffer containing quadruples of the
+# last 32 elements of the message schedule or Xupdate outputs. The first
+# 4 quadruples are simply byte-swapped input; the next 4 are calculated
+# according to the method originally suggested by Dean Gaudet (modulo
+# being implemented in SSSE3). Once 8 quadruples, or 32 elements, are
+# collected, it switches to the routine proposed by Max Locktyukhin.
+#
+# Calculations inevitably require temporary registers, and there are
+# no %xmm registers left to spare. For this reason part of the ring
+# buffer, X[2..4] to be specific, is offloaded to a 3-quadruple ring
+# buffer on the stack. Keep in mind that X[2] is an alias for X[-6],
+# X[3] for X[-5], and X[4] for X[-4]...
+#
+# Another notable optimization is aggressive stack frame compression,
+# aiming to minimize the number of 9-byte instructions...
+#
+# Yet another notable optimization is the "jumping" $B variable: no
+# register is permanently allocated for the $B value, which made it
+# possible to eliminate one instruction from body_20_39...
+#
+my $Xi=4;			# 4xSIMD Xupdate round, start pre-seeded
+my @X=map("xmm$_",(4..7,0..3));	# pre-seeded for $Xi=4
+my @V=($A,$B,$C,$D,$E);
+my $j=0;			# hash round
+my $rx=0;
+my @T=($T,$tmp1);
+my $inp;
+
+my $_rol=sub { &rol(@_) };
+my $_ror=sub { &ror(@_) };
+
+&function_begin("_sha1_block_data_order_ssse3");
+	&call	(&label("pic_point"));	# make it PIC!
+	&set_label("pic_point");
+	&blindpop($tmp1);
+	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
+&set_label("ssse3_shortcut");
+
+	&movdqa	(@X[3],&QWP(0,$tmp1));		# K_00_19
+	&movdqa	(@X[4],&QWP(16,$tmp1));		# K_20_39
+	&movdqa	(@X[5],&QWP(32,$tmp1));		# K_40_59
+	&movdqa	(@X[6],&QWP(48,$tmp1));		# K_60_79
+	&movdqa	(@X[2],&QWP(64,$tmp1));		# pbswap mask
+
+	&mov	($E,&wparam(0));		# load argument block
+	&mov	($inp=@T[1],&wparam(1));
+	&mov	($D,&wparam(2));
+	&mov	(@T[0],"esp");
+
+	# stack frame layout
+	#
+	# +0	X[0]+K	X[1]+K	X[2]+K	X[3]+K	# XMM->IALU xfer area
+	#	X[4]+K	X[5]+K	X[6]+K	X[7]+K
+	#	X[8]+K	X[9]+K	X[10]+K	X[11]+K
+	#	X[12]+K	X[13]+K	X[14]+K	X[15]+K
+	#
+	# +64	X[0]	X[1]	X[2]	X[3]	# XMM->XMM backtrace area
+	#	X[4]	X[5]	X[6]	X[7]
+	#	X[8]	X[9]	X[10]	X[11]	# even borrowed for K_00_19
+	#
+	# +112	K_20_39	K_20_39	K_20_39	K_20_39	# constants
+	#	K_40_59	K_40_59	K_40_59	K_40_59
+	#	K_60_79	K_60_79	K_60_79	K_60_79
+	#	K_00_19	K_00_19	K_00_19	K_00_19
+	#	pbswap mask
+	#
+	# +192	ctx				# argument block
+	# +196	inp
+	# +200	end
+	# +204	esp
+	&sub	("esp",208);
+	&and	("esp",-64);
+
+	&movdqa	(&QWP(112+0,"esp"),@X[4]);	# copy constants
+	&movdqa	(&QWP(112+16,"esp"),@X[5]);
+	&movdqa	(&QWP(112+32,"esp"),@X[6]);
+	&shl	($D,6);				# len*64
+	&movdqa	(&QWP(112+48,"esp"),@X[3]);
+	&add	($D,$inp);			# end of input
+	&movdqa	(&QWP(112+64,"esp"),@X[2]);
+	&add	($inp,64);
+	&mov	(&DWP(192+0,"esp"),$E);		# save argument block
+	&mov	(&DWP(192+4,"esp"),$inp);
+	&mov	(&DWP(192+8,"esp"),$D);
+	&mov	(&DWP(192+12,"esp"),@T[0]);	# save original %esp
+
+	&mov	($A,&DWP(0,$E));		# load context
+	&mov	($B,&DWP(4,$E));
+	&mov	($C,&DWP(8,$E));
+	&mov	($D,&DWP(12,$E));
+	&mov	($E,&DWP(16,$E));
+	&mov	(@T[0],$B);			# magic seed
+
+	&movdqu	(@X[-4&7],&QWP(-64,$inp));	# load input to %xmm[0-3]
+	&movdqu	(@X[-3&7],&QWP(-48,$inp));
+	&movdqu	(@X[-2&7],&QWP(-32,$inp));
+	&movdqu	(@X[-1&7],&QWP(-16,$inp));
+	&pshufb	(@X[-4&7],@X[2]);		# byte swap
+	&pshufb	(@X[-3&7],@X[2]);
+	&pshufb	(@X[-2&7],@X[2]);
+	&movdqa	(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot
+	&pshufb	(@X[-1&7],@X[2]);
+	&paddd	(@X[-4&7],@X[3]);		# add K_00_19
+	&paddd	(@X[-3&7],@X[3]);
+	&paddd	(@X[-2&7],@X[3]);
+	&movdqa	(&QWP(0,"esp"),@X[-4&7]);	# X[]+K xfer to IALU
+	&psubd	(@X[-4&7],@X[3]);		# restore X[]
+	&movdqa	(&QWP(0+16,"esp"),@X[-3&7]);
+	&psubd	(@X[-3&7],@X[3]);
+	&movdqa	(&QWP(0+32,"esp"),@X[-2&7]);
+	&mov	(@T[1],$C);
+	&psubd	(@X[-2&7],@X[3]);
+	&xor	(@T[1],$D);
+	&pshufd	(@X[0],@X[-4&7],0xee);		# was &movdqa	(@X[0],@X[-3&7]);
+	&and	(@T[0],@T[1]);
+	&jmp	(&label("loop"));
+
+######################################################################
+# The SSE instruction sequence is first broken into groups of
+# independent instructions, independent with respect to their inputs
+# and the shifter (not all architectures have more than one). Then IALU
+# instructions are "knitted in" between the SSE groups. Distance is
+# maintained for an SSE latency of 2 in the hope that it better fits
+# the upcoming AMD Bulldozer [which allegedly also implements SSSE3]...
+#
+# Temporary register usage. X[2] is volatile at the entry and at the
+# end is restored from the backtrace ring buffer. X[3] is expected to
+# contain the current K_XX_XX constant and is used to calculate X[-1]+K
+# from the previous round; it becomes volatile the moment the value is
+# saved to the stack for transfer to the IALU. X[4] becomes volatile
+# whenever X[-4] is accumulated and offloaded to the backtrace ring
+# buffer; at the end it is loaded with the next K_XX_XX [which becomes
+# X[3] in the next round]...
+#
+sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
+	&movdqa	(@X[2],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &paddd	(@X[3],@X[-1&7]);
+	  &movdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	&psrldq	(@X[2],4);		# "X[-3]", 3 dwords
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+
+	&pxor	(@X[2],@X[-2&7]);	# "X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@X[2]);		# "X[0]"^="X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	  &movdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&movdqa	(@X[4],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	&movdqa (@X[2],@X[0]);
+	 eval(shift(@insns));
+
+	&pslldq	(@X[4],12);		# "X[0]"<<96, extract one dword
+	&paddd	(@X[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&psrld	(@X[2],31);
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	&movdqa	(@X[3],@X[4]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&psrld	(@X[4],30);
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	&por	(@X[0],@X[2]);		# "X[0]"<<<=1
+	 eval(shift(@insns));
+	  &movdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5);	# restore X[] from backtrace buffer
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pslld	(@X[3],2);
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	&pxor   (@X[0],@X[4]);
+	  &movdqa	(@X[4],&QWP(112-16+16*(($Xi)/5),"esp"));	# K_XX_XX
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@X[3]);		# "X[0]"^=("X[0]"<<96)<<<2
+	  &pshufd	(@X[1],@X[-3&7],0xee)	if ($Xi<7);	# was &movdqa	(@X[1],@X[-2&7])
+	  &pshufd	(@X[3],@X[-1&7],0xee)	if ($Xi==7);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	 foreach (@insns) { eval; }	# remaining instructions [if any]
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+}
+
+sub Xupdate_ssse3_32_79()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));		# body_20_39
+	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
+	&punpcklqdq(@X[2],@X[-1&7]);	# compose "X[-6]", was &palignr(@X[2],@X[-2&7],8)
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
+	  &movdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);	# save X[] to backtrace buffer
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
+	 if ($Xi%5) {
+	  &movdqa	(@X[4],@X[3]);	# "perpetuate" K_XX_XX...
+	 } else {			# ... or load next one
+	  &movdqa	(@X[4],&QWP(112-16+16*($Xi/5),"esp"));
+	 }
+	 eval(shift(@insns));		# ror
+	  &paddd	(@X[3],@X[-1&7]);
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@X[2]);		# "X[0]"^="X[-6]"
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&movdqa	(@X[2],@X[0]);
+	  &movdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
+
+	&pslld	(@X[0],2);
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	&psrld	(@X[2],30);
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
+	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
+
+	&por	(@X[0],@X[2]);		# "X[0]"<<<=2
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	  &movdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19);	# restore X[] from backtrace buffer
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	  &pshufd	(@X[3],@X[-1],0xee)	if ($Xi<19);	# was &movdqa	(@X[3],@X[0])
+	 eval(shift(@insns));
+
+	 foreach (@insns) { eval; }	# remaining instructions
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+}
+
+sub Xuplast_ssse3_80()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &paddd	(@X[3],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &movdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer IALU
+
+	 foreach (@insns) { eval; }		# remaining instructions
+
+	&mov	($inp=@T[1],&DWP(192+4,"esp"));
+	&cmp	($inp,&DWP(192+8,"esp"));
+	&je	(&label("done"));
+
+	&movdqa	(@X[3],&QWP(112+48,"esp"));	# K_00_19
+	&movdqa	(@X[2],&QWP(112+64,"esp"));	# pbswap mask
+	&movdqu	(@X[-4&7],&QWP(0,$inp));	# load input
+	&movdqu	(@X[-3&7],&QWP(16,$inp));
+	&movdqu	(@X[-2&7],&QWP(32,$inp));
+	&movdqu	(@X[-1&7],&QWP(48,$inp));
+	&add	($inp,64);
+	&pshufb	(@X[-4&7],@X[2]);		# byte swap
+	&mov	(&DWP(192+4,"esp"),$inp);
+	&movdqa	(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot
+
+  $Xi=0;
+}
+
+sub Xloop_ssse3()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&pshufb	(@X[($Xi-3)&7],@X[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&paddd	(@X[($Xi-4)&7],@X[3]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&movdqa	(&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&psubd	(@X[($Xi-4)&7],@X[3]);
+
+	foreach (@insns) { eval; }
+  $Xi++;
+}
+
+sub Xtail_ssse3()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	foreach (@insns) { eval; }
+}
+
+sub body_00_19 () {	# ((c^d)&b)^d
+	# on start @T[0]=(c^d)&b
+	return &body_20_39()	if ($rx==19);	$rx++;
+	(
+	'($a,$b,$c,$d,$e)=@V;'.
+	'&$_ror	($b,$j?7:2);',	# $b>>>2
+	'&xor	(@T[0],$d);',
+	'&mov	(@T[1],$a);',	# $b in next round
+
+	'&add	($e,&DWP(4*($j&15),"esp"));',	# X[]+K xfer
+	'&xor	($b,$c);',	# $c^$d for next round
+
+	'&$_rol	($a,5);',
+	'&add	($e,@T[0]);',
+	'&and	(@T[1],$b);',	# ($b&($c^$d)) for next round
+
+	'&xor	($b,$c);',	# restore $b
+	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+}
+
+sub body_20_39 () {	# b^d^c
+	# on entry @T[0]=b^d
+	return &body_40_59()	if ($rx==39);	$rx++;
+	(
+	'($a,$b,$c,$d,$e)=@V;'.
+	'&add	($e,&DWP(4*($j&15),"esp"));',	# X[]+K xfer
+	'&xor	(@T[0],$d)	if($j==19);'.
+	'&xor	(@T[0],$c)	if($j> 19);',	# ($b^$d^$c)
+	'&mov	(@T[1],$a);',	# $b in next round
+
+	'&$_rol	($a,5);',
+	'&add	($e,@T[0]);',
+	'&xor	(@T[1],$c)	if ($j< 79);',	# $b^$d for next round
+
+	'&$_ror	($b,7);',	# $b>>>2
+	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+}
+
+sub body_40_59 () {	# ((b^c)&(c^d))^c
+	# on entry @T[0]=(b^c), (c^=d)
+	$rx++;
+	(
+	'($a,$b,$c,$d,$e)=@V;'.
+	'&add	($e,&DWP(4*($j&15),"esp"));',	# X[]+K xfer
+	'&and	(@T[0],$c)	if ($j>=40);',	# (b^c)&(c^d)
+	'&xor	($c,$d)		if ($j>=40);',	# restore $c
+
+	'&$_ror	($b,7);',	# $b>>>2
+	'&mov	(@T[1],$a);',	# $b for next round
+	'&xor	(@T[0],$c);',
+
+	'&$_rol	($a,5);',
+	'&add	($e,@T[0]);',
+	'&xor	(@T[1],$c)	if ($j==59);'.
+	'&xor	(@T[1],$b)	if ($j< 59);',	# b^c for next round
+
+	'&xor	($b,$c)		if ($j< 59);',	# c^d for next round
+	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+}
+######
+sub bodyx_00_19 () {	# ((c^d)&b)^d
+	# on start @T[0]=(b&c)^(~b&d), $e+=X[]+K
+	return &bodyx_20_39()	if ($rx==19);	$rx++;
+	(
+	'($a,$b,$c,$d,$e)=@V;'.
+
+	'&rorx	($b,$b,2)			if ($j==0);'.	# $b>>>2
+	'&rorx	($b,@T[1],7)			if ($j!=0);',	# $b>>>2
+	'&lea	($e,&DWP(0,$e,@T[0]));',
+	'&rorx	(@T[0],$a,5);',
+
+	'&andn	(@T[1],$a,$c);',
+	'&and	($a,$b)',
+	'&add	($d,&DWP(4*(($j+1)&15),"esp"));',	# X[]+K xfer
+
+	'&xor	(@T[1],$a)',
+	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+}
+
+sub bodyx_20_39 () {	# b^d^c
+	# on start $b=b^c^d
+	return &bodyx_40_59()	if ($rx==39);	$rx++;
+	(
+	'($a,$b,$c,$d,$e)=@V;'.
+
+	'&add	($e,($j==19?@T[0]:$b))',
+	'&rorx	($b,@T[1],7);',	# $b>>>2
+	'&rorx	(@T[0],$a,5);',
+
+	'&xor	($a,$b)				if ($j<79);',
+	'&add	($d,&DWP(4*(($j+1)&15),"esp"))	if ($j<79);',	# X[]+K xfer
+	'&xor	($a,$c)				if ($j<79);',
+	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+}
+
+sub bodyx_40_59 () {	# ((b^c)&(c^d))^c
+	# on start $b=((b^c)&(c^d))^c
+	return &bodyx_20_39()	if ($rx==59);	$rx++;
+	(
+	'($a,$b,$c,$d,$e)=@V;'.
+
+	'&rorx	(@T[0],$a,5)',
+	'&lea	($e,&DWP(0,$e,$b))',
+	'&rorx	($b,@T[1],7)',	# $b>>>2
+	'&add	($d,&DWP(4*(($j+1)&15),"esp"))',	# X[]+K xfer
+
+	'&mov	(@T[1],$c)',
+	'&xor	($a,$b)',	# b^c for next round
+	'&xor	(@T[1],$b)',	# c^d for next round
+
+	'&and	($a,@T[1])',
+	'&add	($e,@T[0])',
+	'&xor	($a,$b)'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+}
+
+&set_label("loop",16);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_32_79(\&body_00_19);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"
+
+				$saved_j=$j; @saved_V=@V;
+
+	&Xloop_ssse3(\&body_20_39);
+	&Xloop_ssse3(\&body_20_39);
+	&Xloop_ssse3(\&body_20_39);
+
+	&mov	(@T[1],&DWP(192,"esp"));	# update context
+	&add	($A,&DWP(0,@T[1]));
+	&add	(@T[0],&DWP(4,@T[1]));		# $b
+	&add	($C,&DWP(8,@T[1]));
+	&mov	(&DWP(0,@T[1]),$A);
+	&add	($D,&DWP(12,@T[1]));
+	&mov	(&DWP(4,@T[1]),@T[0]);
+	&add	($E,&DWP(16,@T[1]));
+	&mov	(&DWP(8,@T[1]),$C);
+	&mov	($B,$C);
+	&mov	(&DWP(12,@T[1]),$D);
+	&xor	($B,$D);
+	&mov	(&DWP(16,@T[1]),$E);
+	&mov	(@T[1],@T[0]);
+	&pshufd	(@X[0],@X[-4&7],0xee);		# was &movdqa	(@X[0],@X[-3&7]);
+	&and	(@T[0],$B);
+	&mov	($B,$T[1]);
+
+	&jmp	(&label("loop"));
+
+&set_label("done",16);		$j=$saved_j; @V=@saved_V;
+
+	&Xtail_ssse3(\&body_20_39);
+	&Xtail_ssse3(\&body_20_39);
+	&Xtail_ssse3(\&body_20_39);
+
+	&mov	(@T[1],&DWP(192,"esp"));	# update context
+	&add	($A,&DWP(0,@T[1]));
+	&mov	("esp",&DWP(192+12,"esp"));	# restore %esp
+	&add	(@T[0],&DWP(4,@T[1]));		# $b
+	&add	($C,&DWP(8,@T[1]));
+	&mov	(&DWP(0,@T[1]),$A);
+	&add	($D,&DWP(12,@T[1]));
+	&mov	(&DWP(4,@T[1]),@T[0]);
+	&add	($E,&DWP(16,@T[1]));
+	&mov	(&DWP(8,@T[1]),$C);
+	&mov	(&DWP(12,@T[1]),$D);
+	&mov	(&DWP(16,@T[1]),$E);
+
+&function_end("_sha1_block_data_order_ssse3");
+
+$rx=0;	# reset
+
+if ($ymm) {
+my $Xi=4;			# 4xSIMD Xupdate round, start pre-seeded
+my @X=map("xmm$_",(4..7,0..3));	# pre-seeded for $Xi=4
+my @V=($A,$B,$C,$D,$E);
+my $j=0;			# hash round
+my @T=($T,$tmp1);
+my $inp;
+
+my $_rol=sub { &shld(@_[0],@_) };
+my $_ror=sub { &shrd(@_[0],@_) };
+
+&function_begin("_sha1_block_data_order_avx");
+	&call	(&label("pic_point"));	# make it PIC!
+	&set_label("pic_point");
+	&blindpop($tmp1);
+	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
+&set_label("avx_shortcut");
+	&vzeroall();
+
+	&vmovdqa(@X[3],&QWP(0,$tmp1));		# K_00_19
+	&vmovdqa(@X[4],&QWP(16,$tmp1));		# K_20_39
+	&vmovdqa(@X[5],&QWP(32,$tmp1));		# K_40_59
+	&vmovdqa(@X[6],&QWP(48,$tmp1));		# K_60_79
+	&vmovdqa(@X[2],&QWP(64,$tmp1));		# pbswap mask
+
+	&mov	($E,&wparam(0));		# load argument block
+	&mov	($inp=@T[1],&wparam(1));
+	&mov	($D,&wparam(2));
+	&mov	(@T[0],"esp");
+
+	# stack frame layout
+	#
+	# +0	X[0]+K	X[1]+K	X[2]+K	X[3]+K	# XMM->IALU xfer area
+	#	X[4]+K	X[5]+K	X[6]+K	X[7]+K
+	#	X[8]+K	X[9]+K	X[10]+K	X[11]+K
+	#	X[12]+K	X[13]+K	X[14]+K	X[15]+K
+	#
+	# +64	X[0]	X[1]	X[2]	X[3]	# XMM->XMM backtrace area
+	#	X[4]	X[5]	X[6]	X[7]
+	#	X[8]	X[9]	X[10]	X[11]	# even borrowed for K_00_19
+	#
+	# +112	K_20_39	K_20_39	K_20_39	K_20_39	# constants
+	#	K_40_59	K_40_59	K_40_59	K_40_59
+	#	K_60_79	K_60_79	K_60_79	K_60_79
+	#	K_00_19	K_00_19	K_00_19	K_00_19
+	#	pbswap mask
+	#
+	# +192	ctx				# argument block
+	# +196	inp
+	# +200	end
+	# +204	esp
+	&sub	("esp",208);
+	&and	("esp",-64);
+
+	&vmovdqa(&QWP(112+0,"esp"),@X[4]);	# copy constants
+	&vmovdqa(&QWP(112+16,"esp"),@X[5]);
+	&vmovdqa(&QWP(112+32,"esp"),@X[6]);
+	&shl	($D,6);				# len*64
+	&vmovdqa(&QWP(112+48,"esp"),@X[3]);
+	&add	($D,$inp);			# end of input
+	&vmovdqa(&QWP(112+64,"esp"),@X[2]);
+	&add	($inp,64);
+	&mov	(&DWP(192+0,"esp"),$E);		# save argument block
+	&mov	(&DWP(192+4,"esp"),$inp);
+	&mov	(&DWP(192+8,"esp"),$D);
+	&mov	(&DWP(192+12,"esp"),@T[0]);	# save original %esp
+
+	&mov	($A,&DWP(0,$E));		# load context
+	&mov	($B,&DWP(4,$E));
+	&mov	($C,&DWP(8,$E));
+	&mov	($D,&DWP(12,$E));
+	&mov	($E,&DWP(16,$E));
+	&mov	(@T[0],$B);			# magic seed
+
+	&vmovdqu(@X[-4&7],&QWP(-64,$inp));	# load input to %xmm[0-3]
+	&vmovdqu(@X[-3&7],&QWP(-48,$inp));
+	&vmovdqu(@X[-2&7],&QWP(-32,$inp));
+	&vmovdqu(@X[-1&7],&QWP(-16,$inp));
+	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
+	&vpshufb(@X[-3&7],@X[-3&7],@X[2]);
+	&vpshufb(@X[-2&7],@X[-2&7],@X[2]);
+	&vmovdqa(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot
+	&vpshufb(@X[-1&7],@X[-1&7],@X[2]);
+	&vpaddd	(@X[0],@X[-4&7],@X[3]);		# add K_00_19
+	&vpaddd	(@X[1],@X[-3&7],@X[3]);
+	&vpaddd	(@X[2],@X[-2&7],@X[3]);
+	&vmovdqa(&QWP(0,"esp"),@X[0]);		# X[]+K xfer to IALU
+	&mov	(@T[1],$C);
+	&vmovdqa(&QWP(0+16,"esp"),@X[1]);
+	&xor	(@T[1],$D);
+	&vmovdqa(&QWP(0+32,"esp"),@X[2]);
+	&and	(@T[0],@T[1]);
+	&jmp	(&label("loop"));
+
+sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &vpaddd	(@X[3],@X[3],@X[-1&7]);
+	  &vmovdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpsrldq(@X[2],@X[-1&7],4);		# "X[-3]", 3 dwords
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@X[2],@X[2],@X[-2&7]);		# "X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vmovdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@X[2]);		# "X[0]"^="X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpsrld	(@X[2],@X[0],31);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpslldq(@X[4],@X[0],12);		# "X[0]"<<96, extract one dword
+	&vpaddd	(@X[0],@X[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpsrld	(@X[3],@X[4],30);
+	&vpor	(@X[0],@X[0],@X[2]);		# "X[0]"<<<=1
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpslld	(@X[4],@X[4],2);
+	  &vmovdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5);	# restore X[] from backtrace buffer
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpxor	(@X[0],@X[0],@X[3]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@X[4]);		# "X[0]"^=("X[0]"<<96)<<<2
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vmovdqa	(@X[4],&QWP(112-16+16*(($Xi)/5),"esp"));	# K_XX_XX
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	 foreach (@insns) { eval; }	# remaining instructions [if any]
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+}
+
+sub Xupdate_avx_32_79()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
+  my ($a,$b,$c,$d,$e);
+
+	&vpalignr(@X[2],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
+	&vpxor	(@X[0],@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&vpxor	(@X[0],@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
+	  &vmovdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);	# save X[] to backtrace buffer
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 if ($Xi%5) {
+	  &vmovdqa	(@X[4],@X[3]);	# "perpetuate" K_XX_XX...
+	 } else {			# ... or load next one
+	  &vmovdqa	(@X[4],&QWP(112-16+16*($Xi/5),"esp"));
+	 }
+	  &vpaddd	(@X[3],@X[3],@X[-1&7]);
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@X[2]);		# "X[0]"^="X[-6]"
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&vpsrld	(@X[2],@X[0],30);
+	  &vmovdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&vpslld	(@X[0],@X[0],2);
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&vpor	(@X[0],@X[0],@X[2]);	# "X[0]"<<<=2
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	  &vmovdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19);	# restore X[] from backtrace buffer
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	 foreach (@insns) { eval; }	# remaining instructions
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+}
+
+sub Xuplast_avx_80()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	  &vpaddd	(@X[3],@X[3],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &vmovdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer IALU
+
+	 foreach (@insns) { eval; }		# remaining instructions
+
+	&mov	($inp=@T[1],&DWP(192+4,"esp"));
+	&cmp	($inp,&DWP(192+8,"esp"));
+	&je	(&label("done"));
+
+	&vmovdqa(@X[3],&QWP(112+48,"esp"));	# K_00_19
+	&vmovdqa(@X[2],&QWP(112+64,"esp"));	# pbswap mask
+	&vmovdqu(@X[-4&7],&QWP(0,$inp));	# load input
+	&vmovdqu(@X[-3&7],&QWP(16,$inp));
+	&vmovdqu(@X[-2&7],&QWP(32,$inp));
+	&vmovdqu(@X[-1&7],&QWP(48,$inp));
+	&add	($inp,64);
+	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);		# byte swap
+	&mov	(&DWP(192+4,"esp"),$inp);
+	&vmovdqa(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot
+
+  $Xi=0;
+}
+
+sub Xloop_avx()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpshufb	(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@X[3]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vmovdqa	(&QWP(0+16*$Xi,"esp"),@X[$Xi&7]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	foreach (@insns) { eval; }
+  $Xi++;
+}
+
+sub Xtail_avx()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	foreach (@insns) { eval; }
+}
+
+&set_label("loop",16);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_32_79(\&body_00_19);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"
+
+				$saved_j=$j; @saved_V=@V;
+
+	&Xloop_avx(\&body_20_39);
+	&Xloop_avx(\&body_20_39);
+	&Xloop_avx(\&body_20_39);
+
+	&mov	(@T[1],&DWP(192,"esp"));	# update context
+	&add	($A,&DWP(0,@T[1]));
+	&add	(@T[0],&DWP(4,@T[1]));		# $b
+	&add	($C,&DWP(8,@T[1]));
+	&mov	(&DWP(0,@T[1]),$A);
+	&add	($D,&DWP(12,@T[1]));
+	&mov	(&DWP(4,@T[1]),@T[0]);
+	&add	($E,&DWP(16,@T[1]));
+	&mov	($B,$C);
+	&mov	(&DWP(8,@T[1]),$C);
+	&xor	($B,$D);
+	&mov	(&DWP(12,@T[1]),$D);
+	&mov	(&DWP(16,@T[1]),$E);
+	&mov	(@T[1],@T[0]);
+	&and	(@T[0],$B);
+	&mov	($B,@T[1]);
+
+	&jmp	(&label("loop"));
+
+&set_label("done",16);		$j=$saved_j; @V=@saved_V;
+
+	&Xtail_avx(\&body_20_39);
+	&Xtail_avx(\&body_20_39);
+	&Xtail_avx(\&body_20_39);
+
+	&vzeroall();
+
+	&mov	(@T[1],&DWP(192,"esp"));	# update context
+	&add	($A,&DWP(0,@T[1]));
+	&mov	("esp",&DWP(192+12,"esp"));	# restore %esp
+	&add	(@T[0],&DWP(4,@T[1]));		# $b
+	&add	($C,&DWP(8,@T[1]));
+	&mov	(&DWP(0,@T[1]),$A);
+	&add	($D,&DWP(12,@T[1]));
+	&mov	(&DWP(4,@T[1]),@T[0]);
+	&add	($E,&DWP(16,@T[1]));
+	&mov	(&DWP(8,@T[1]),$C);
+	&mov	(&DWP(12,@T[1]),$D);
+	&mov	(&DWP(16,@T[1]),$E);
+&function_end("_sha1_block_data_order_avx");
+}
+&set_label("K_XX_XX",64);
+&data_word(0x5a827999,0x5a827999,0x5a827999,0x5a827999);	# K_00_19
+&data_word(0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1);	# K_20_39
+&data_word(0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc);	# K_40_59
+&data_word(0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6);	# K_60_79
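+# (Each K constant above is floor(2^30*sqrt(n)) for n = 2, 3, 5 and 10.)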
+&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f);	# pbswap mask
+&data_byte(0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0);
+}
+&asciz("SHA1 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
+
+close STDOUT;
diff --git a/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl b/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl
new file mode 100644
index 0000000..944248f
--- /dev/null
+++ b/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl
@@ -0,0 +1,701 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# sha1_block procedure for ARMv4.
+#
+# January 2007.
+
+# Size/performance trade-off
+# ====================================================================
+# impl		size in bytes	comp cycles[*]	measured performance
+# ====================================================================
+# thumb		304		3212		4420
+# armv4-small	392/+29%	1958/+64%	2250/+96%
+# armv4-compact	740/+89%	1552/+26%	1840/+22%
+# armv4-large	1420/+92%	1307/+19%	1370/+34%[***]
+# full unroll	~5100/+260%	~1260/+4%	~1300/+5%
+# ====================================================================
+# thumb		= same as 'small' but in Thumb instructions[**] and
+#		  with recurring code in two private functions;
+# small		= detached Xload/update, loops are folded;
+# compact	= detached Xload/update, 5x unroll;
+# large		= interleaved Xload/update, 5x unroll;
+# full unroll	= interleaved Xload/update, full unroll, estimated[!];
+#
+# [*]	Manually counted instructions in "grand" loop body. Measured
+#	performance is affected by prologue and epilogue overhead,
+#	i-cache availability, branch penalties, etc.
+# [**]	While each Thumb instruction is half the size, Thumb instructions
+#	are not as diverse as ARM ones: e.g., there are only two arithmetic
+#	instructions with 3 arguments, no [fixed] rotate, and addressing
+#	modes are limited. As a result it takes more instructions to do
+#	the same job in Thumb, so the code is never half the size and is
+#	always slower.
+# [***]	which is also ~35% better than compiler generated code. Dual-
+#	issue Cortex A8 core was measured to process input block in
+#	~990 cycles.
+
+# August 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 13% improvement on
+# Cortex A8 core and in absolute terms ~870 cycles per input block
+# [or 13.6 cycles per byte].
+
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 10%
+# improvement on Cortex A8 core and 12.2 cycles per byte.
+
+# September 2013.
+#
+# Add NEON implementation (see sha1-586.pl for background info). On
+# Cortex A8 it was measured to process one byte in 6.7 cycles or >80%
+# faster than integer-only code. Because [fully unrolled] NEON code
+# is ~2.5x larger and there are some redundant instructions executed
+# when processing the last block, the improvement is not as big for the
+# smallest blocks, only ~30%. Snapdragon S4 is a tad faster at 6.4 cycles
+# per byte, which is also >80% faster than integer-only code. Cortex-A15
+# is even faster, spending 5.6 cycles per byte and outperforming integer-
+# only code by a factor of 2.
+
+# May 2014.
+#
+# Add ARMv8 code path performing at 2.35 cpb on Apple A7.
+
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
+
+$ctx="r0";
+$inp="r1";
+$len="r2";
+$a="r3";
+$b="r4";
+$c="r5";
+$d="r6";
+$e="r7";
+$K="r8";
+$t0="r9";
+$t1="r10";
+$t2="r11";
+$t3="r12";
+$Xi="r14";
+@V=($a,$b,$c,$d,$e);
+
+sub Xupdate {
+my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_;
+$code.=<<___;
+	ldr	$t0,[$Xi,#15*4]
+	ldr	$t1,[$Xi,#13*4]
+	ldr	$t2,[$Xi,#7*4]
+	add	$e,$K,$e,ror#2			@ E+=K_xx_xx
+	ldr	$t3,[$Xi,#2*4]
+	eor	$t0,$t0,$t1
+	eor	$t2,$t2,$t3			@ 1 cycle stall
+	eor	$t1,$c,$d			@ F_xx_xx
+	mov	$t0,$t0,ror#31
+	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
+	eor	$t0,$t0,$t2,ror#31
+	str	$t0,[$Xi,#-4]!
+	$opt1					@ F_xx_xx
+	$opt2					@ F_xx_xx
+	add	$e,$e,$t0			@ E+=X[i]
+___
+}
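+
+# For reference, the recurrence Xupdate implements is the standard SHA-1
+# message schedule and round update from FIPS 180-4; a minimal,
+# illustrative Perl sketch (hypothetical, not used by the generator)
+# follows. Note that the assembly rotates right by 31 because a rotate
+# left by 1 equals a rotate right by 31 on 32-bit words.
+#
+#	sub sha1_xupdate_ref {		# hypothetical reference helper
+#		my ($W,$i)=@_;		# $W: ref to the 80-word schedule
+#		my $x=$W->[$i-3]^$W->[$i-8]^$W->[$i-14]^$W->[$i-16];
+#		$W->[$i]=(($x<<1)|($x>>31))&0xffffffff;	# ROL(x,1)
+#	}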
+
+sub BODY_00_15 {
+my ($a,$b,$c,$d,$e)=@_;
+$code.=<<___;
+#if __ARM_ARCH__<7
+	ldrb	$t1,[$inp,#2]
+	ldrb	$t0,[$inp,#3]
+	ldrb	$t2,[$inp,#1]
+	add	$e,$K,$e,ror#2			@ E+=K_00_19
+	ldrb	$t3,[$inp],#4
+	orr	$t0,$t0,$t1,lsl#8
+	eor	$t1,$c,$d			@ F_xx_xx
+	orr	$t0,$t0,$t2,lsl#16
+	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
+	orr	$t0,$t0,$t3,lsl#24
+#else
+	ldr	$t0,[$inp],#4			@ handles unaligned
+	add	$e,$K,$e,ror#2			@ E+=K_00_19
+	eor	$t1,$c,$d			@ F_xx_xx
+	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	$t0,$t0				@ byte swap
+#endif
+#endif
+	and	$t1,$b,$t1,ror#2
+	add	$e,$e,$t0			@ E+=X[i]
+	eor	$t1,$t1,$d,ror#2		@ F_00_19(B,C,D)
+	str	$t0,[$Xi,#-4]!
+	add	$e,$e,$t1			@ E+=F_00_19(B,C,D)
+___
+}
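+
+# Two notes on BODY_00_15: the __ARM_ARCH__<7 path assembles each input
+# word from four byte loads because word loads from unaligned addresses
+# are not reliable before ARMv7, while ARMv7+ uses a plain ldr plus rev.
+# Also, F_00_19 is computed as ((c^d)&b)^d, which is algebraically equal
+# to the textbook Ch(b,c,d)=(b&c)|(~b&d) but needs one less temporary.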
+
+sub BODY_16_19 {
+my ($a,$b,$c,$d,$e)=@_;
+	&Xupdate(@_,"and $t1,$b,$t1,ror#2");
+$code.=<<___;
+	eor	$t1,$t1,$d,ror#2		@ F_00_19(B,C,D)
+	add	$e,$e,$t1			@ E+=F_00_19(B,C,D)
+___
+}
+
+sub BODY_20_39 {
+my ($a,$b,$c,$d,$e)=@_;
+	&Xupdate(@_,"eor $t1,$b,$t1,ror#2");
+$code.=<<___;
+	add	$e,$e,$t1			@ E+=F_20_39(B,C,D)
+___
+}
+
+sub BODY_40_59 {
+my ($a,$b,$c,$d,$e)=@_;
+	&Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d");
+$code.=<<___;
+	add	$e,$e,$t1			@ E+=F_40_59(B,C,D)
+	add	$e,$e,$t2,ror#2
+___
+}
+
+$code=<<___;
+#include <openssl/arm_arch.h>
+
+.text
+.code	32
+
+.global	sha1_block_data_order
+.type	sha1_block_data_order,%function
+
+.align	5
+sha1_block_data_order:
+#if __ARM_MAX_ARCH__>=7
+	sub	r3,pc,#8		@ sha1_block_data_order
+	ldr	r12,.LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+#ifdef	__APPLE__
+	ldr	r12,[r12]
+#endif
+	tst	r12,#ARMV8_SHA1
+	bne	.LARMv8
+	tst	r12,#ARMV7_NEON
+	bne	.LNEON
+#endif
+	stmdb	sp!,{r4-r12,lr}
+	add	$len,$inp,$len,lsl#6	@ $len to point at the end of $inp
+	ldmia	$ctx,{$a,$b,$c,$d,$e}
+.Lloop:
+	ldr	$K,.LK_00_19
+	mov	$Xi,sp
+	sub	sp,sp,#15*4
+	mov	$c,$c,ror#30
+	mov	$d,$d,ror#30
+	mov	$e,$e,ror#30		@ [6]
+.L_00_15:
+___
+for($i=0;$i<5;$i++) {
+	&BODY_00_15(@V);	unshift(@V,pop(@V));
+}
+$code.=<<___;
+	teq	$Xi,sp
+	bne	.L_00_15		@ [((11+4)*5+2)*3]
+	sub	sp,sp,#25*4
+___
+	&BODY_00_15(@V);	unshift(@V,pop(@V));
+	&BODY_16_19(@V);	unshift(@V,pop(@V));
+	&BODY_16_19(@V);	unshift(@V,pop(@V));
+	&BODY_16_19(@V);	unshift(@V,pop(@V));
+	&BODY_16_19(@V);	unshift(@V,pop(@V));
+$code.=<<___;
+
+	ldr	$K,.LK_20_39		@ [+15+16*4]
+	cmn	sp,#0			@ [+3], clear carry to denote 20_39
+.L_20_39_or_60_79:
+___
+for($i=0;$i<5;$i++) {
+	&BODY_20_39(@V);	unshift(@V,pop(@V));
+}
+$code.=<<___;
+	teq	$Xi,sp			@ preserve carry
+	bne	.L_20_39_or_60_79	@ [+((12+3)*5+2)*4]
+	bcs	.L_done			@ [+((12+3)*5+2)*4], spare 300 bytes
+
+	ldr	$K,.LK_40_59
+	sub	sp,sp,#20*4		@ [+2]
+.L_40_59:
+___
+for($i=0;$i<5;$i++) {
+	&BODY_40_59(@V);	unshift(@V,pop(@V));
+}
+$code.=<<___;
+	teq	$Xi,sp
+	bne	.L_40_59		@ [+((12+5)*5+2)*4]
+
+	ldr	$K,.LK_60_79
+	sub	sp,sp,#20*4
+	cmp	sp,#0			@ set carry to denote 60_79
+	b	.L_20_39_or_60_79	@ [+4], spare 300 bytes
+.L_done:
+	add	sp,sp,#80*4		@ "deallocate" stack frame
+	ldmia	$ctx,{$K,$t0,$t1,$t2,$t3}
+	add	$a,$K,$a
+	add	$b,$t0,$b
+	add	$c,$t1,$c,ror#2
+	add	$d,$t2,$d,ror#2
+	add	$e,$t3,$e,ror#2
+	stmia	$ctx,{$a,$b,$c,$d,$e}
+	teq	$inp,$len
+	bne	.Lloop			@ [+18], total 1307
+
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r12,pc}
+#else
+	ldmia	sp!,{r4-r12,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	sha1_block_data_order,.-sha1_block_data_order
+
+.align	5
+.LK_00_19:	.word	0x5a827999
+.LK_20_39:	.word	0x6ed9eba1
+.LK_40_59:	.word	0x8f1bbcdc
+.LK_60_79:	.word	0xca62c1d6
+#if __ARM_MAX_ARCH__>=7
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-sha1_block_data_order
+#endif
+.asciz	"SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align	5
+___
+#####################################################################
+# NEON stuff
+#
+{{{
+my @V=($a,$b,$c,$d,$e);
+my ($K_XX_XX,$Ki,$t0,$t1,$Xfer,$saved_sp)=map("r$_",(8..12,14));
+my $Xi=4;
+my @X=map("q$_",(8..11,0..3));
+my @Tx=("q12","q13");
+my ($K,$zero)=("q14","q15");
+my $j=0;
+
+sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+  my $arg = pop;
+    $arg = "#$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
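+
+# AUTOLOAD is Perl's catch-all for calls to undefined subs: any
+# &vext_8()-style call below lands here, where the sub name is turned
+# into a mnemonic (the underscore becoming a dot, e.g. vext.8), a
+# trailing numeric operand gets a "#" prefix, and the resulting
+# instruction text is appended to $code.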
+
+sub body_00_19 () {
+	(
+	'($a,$b,$c,$d,$e)=@V;'.		# '$code.="@ $j\n";'.
+	'&bic	($t0,$d,$b)',
+	'&add	($e,$e,$Ki)',		# e+=X[i]+K
+	'&and	($t1,$c,$b)',
+	'&ldr	($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
+	'&add	($e,$e,$a,"ror#27")',	# e+=ROR(A,27)
+	'&eor	($t1,$t1,$t0)',		# F_00_19
+	'&mov	($b,$b,"ror#2")',	# b=ROR(b,2)
+	'&add	($e,$e,$t1);'.		# e+=F_00_19
+	'$j++;	unshift(@V,pop(@V));'
+	)
+}
+sub body_20_39 () {
+	(
+	'($a,$b,$c,$d,$e)=@V;'.		# '$code.="@ $j\n";'.
+	'&eor	($t0,$b,$d)',
+	'&add	($e,$e,$Ki)',		# e+=X[i]+K
+	'&ldr	($Ki,sprintf "[sp,#%d]",4*(($j+1)&15)) if ($j<79)',
+	'&eor	($t1,$t0,$c)',		# F_20_39
+	'&add	($e,$e,$a,"ror#27")',	# e+=ROR(A,27)
+	'&mov	($b,$b,"ror#2")',	# b=ROR(b,2)
+	'&add	($e,$e,$t1);'.		# e+=F_20_39
+	'$j++;	unshift(@V,pop(@V));'
+	)
+}
+sub body_40_59 () {
+	(
+	'($a,$b,$c,$d,$e)=@V;'.		# '$code.="@ $j\n";'.
+	'&add	($e,$e,$Ki)',		# e+=X[i]+K
+	'&and	($t0,$c,$d)',
+	'&ldr	($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
+	'&add	($e,$e,$a,"ror#27")',	# e+=ROR(A,27)
+	'&eor	($t1,$c,$d)',
+	'&add	($e,$e,$t0)',
+	'&and	($t1,$t1,$b)',
+	'&mov	($b,$b,"ror#2")',	# b=ROR(b,2)
+	'&add	($e,$e,$t1);'.		# e+=F_40_59
+	'$j++;	unshift(@V,pop(@V));'
+	)
+}
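+
+# The body_* subs above return lists of *stringified* perlasm calls
+# rather than emitting code directly. The Xupdate_* subs below eval()
+# one string at a time between NEON instructions, interleaving four
+# scalar rounds with one iteration of the vector message schedule.
+# Roughly, the pattern is:
+#
+#	my @insns = (&$body,&$body,&$body,&$body);	# 4 rounds' worth
+#	&vext_8(...);			# one NEON instruction
+#	eval(shift(@insns));		# one scalar instruction
+#	...
+#	foreach (@insns) { eval; }	# flush whatever remains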
+
+sub Xupdate_16_31 ()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e);
+
+	&vext_8		(@X[0],@X[-4&7],@X[-3&7],8);	# compose "X[-14]" in "X[0]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vadd_i32	(@Tx[1],@X[-1&7],$K);
+	 eval(shift(@insns));
+	  &vld1_32	("{$K\[]}","[$K_XX_XX,:32]!")	if ($Xi%5==0);
+	 eval(shift(@insns));
+	&vext_8		(@Tx[0],@X[-1&7],$zero,4);	# "X[-3]", 3 words
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&veor		(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&veor		(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&veor		(@Tx[0],@Tx[0],@X[0]);		# "X[0]"^="X[-3]"^"X[-8]
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vst1_32	("{@Tx[1]}","[$Xfer,:128]!");	# X[]+K xfer
+	  &sub		($Xfer,$Xfer,64)		if ($Xi%4==0);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vext_8		(@Tx[1],$zero,@Tx[0],4);	# "X[0]"<<96, extract one dword
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	(@X[0],@Tx[0],@Tx[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vsri_32	(@X[0],@Tx[0],31);		# "X[0]"<<<=1
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vshr_u32	(@Tx[0],@Tx[1],30);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vshl_u32	(@Tx[1],@Tx[1],2);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&veor		(@X[0],@X[0],@Tx[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&veor		(@X[0],@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
+
+	foreach (@insns) { eval; }	# remaining instructions [if any]
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+}
+
+sub Xupdate_32_79 ()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e);
+
+	&vext_8		(@Tx[0],@X[-2&7],@X[-1&7],8);	# compose "X[-6]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&veor		(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&veor		(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vadd_i32	(@Tx[1],@X[-1&7],$K);
+	 eval(shift(@insns));
+	  &vld1_32	("{$K\[]}","[$K_XX_XX,:32]!")	if ($Xi%5==0);
+	 eval(shift(@insns));
+	&veor		(@Tx[0],@Tx[0],@X[0]);		# "X[-6]"^="X[0]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vshr_u32	(@X[0],@Tx[0],30);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vst1_32	("{@Tx[1]}","[$Xfer,:128]!");	# X[]+K xfer
+	  &sub		($Xfer,$Xfer,64)		if ($Xi%4==0);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vsli_32	(@X[0],@Tx[0],2);		# "X[0]"="X[-6]"<<<2
+
+	foreach (@insns) { eval; }	# remaining instructions [if any]
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+}
+
+sub Xuplast_80 ()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e);
+
+	&vadd_i32	(@Tx[1],@X[-1&7],$K);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vst1_32	("{@Tx[1]}","[$Xfer,:128]!");
+	&sub		($Xfer,$Xfer,64);
+
+	&teq		($inp,$len);
+	&sub		($K_XX_XX,$K_XX_XX,16);	# rewind $K_XX_XX
+	&subeq		($inp,$inp,64);		# reload last block to avoid SEGV
+	&vld1_8		("{@X[-4&7]-@X[-3&7]}","[$inp]!");
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vld1_8		("{@X[-2&7]-@X[-1&7]}","[$inp]!");
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vld1_32	("{$K\[]}","[$K_XX_XX,:32]!");	# load K_00_19
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vrev32_8	(@X[-4&7],@X[-4&7]);
+
+	foreach (@insns) { eval; }		# remaining instructions
+
+   $Xi=0;
+}
+
+sub Xloop()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e);
+
+	&vrev32_8	(@X[($Xi-3)&7],@X[($Xi-3)&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	(@X[$Xi&7],@X[($Xi-4)&7],$K);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vst1_32	("{@X[$Xi&7]}","[$Xfer,:128]!");# X[]+K xfer to IALU
+
+	foreach (@insns) { eval; }
+
+  $Xi++;
+}
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.type	sha1_block_data_order_neon,%function
+.align	4
+sha1_block_data_order_neon:
+.LNEON:
+	stmdb	sp!,{r4-r12,lr}
+	add	$len,$inp,$len,lsl#6	@ $len to point at the end of $inp
+	@ dmb				@ errata #451034 on early Cortex A8
+	@ vstmdb	sp!,{d8-d15}	@ ABI specification says so
+	mov	$saved_sp,sp
+	sub	sp,sp,#64		@ alloca
+	adr	$K_XX_XX,.LK_00_19
+	bic	sp,sp,#15		@ align for 128-bit stores
+
+	ldmia	$ctx,{$a,$b,$c,$d,$e}	@ load context
+	mov	$Xfer,sp
+
+	vld1.8		{@X[-4&7]-@X[-3&7]},[$inp]!	@ handles unaligned
+	veor		$zero,$zero,$zero
+	vld1.8		{@X[-2&7]-@X[-1&7]},[$inp]!
+	vld1.32		{${K}\[]},[$K_XX_XX,:32]!	@ load K_00_19
+	vrev32.8	@X[-4&7],@X[-4&7]		@ yes, even on
+	vrev32.8	@X[-3&7],@X[-3&7]		@ big-endian...
+	vrev32.8	@X[-2&7],@X[-2&7]
+	vadd.i32	@X[0],@X[-4&7],$K
+	vrev32.8	@X[-1&7],@X[-1&7]
+	vadd.i32	@X[1],@X[-3&7],$K
+	vst1.32		{@X[0]},[$Xfer,:128]!
+	vadd.i32	@X[2],@X[-2&7],$K
+	vst1.32		{@X[1]},[$Xfer,:128]!
+	vst1.32		{@X[2]},[$Xfer,:128]!
+	ldr		$Ki,[sp]			@ big RAW stall
+
+.Loop_neon:
+___
+	&Xupdate_16_31(\&body_00_19);
+	&Xupdate_16_31(\&body_00_19);
+	&Xupdate_16_31(\&body_00_19);
+	&Xupdate_16_31(\&body_00_19);
+	&Xupdate_32_79(\&body_00_19);
+	&Xupdate_32_79(\&body_20_39);
+	&Xupdate_32_79(\&body_20_39);
+	&Xupdate_32_79(\&body_20_39);
+	&Xupdate_32_79(\&body_20_39);
+	&Xupdate_32_79(\&body_20_39);
+	&Xupdate_32_79(\&body_40_59);
+	&Xupdate_32_79(\&body_40_59);
+	&Xupdate_32_79(\&body_40_59);
+	&Xupdate_32_79(\&body_40_59);
+	&Xupdate_32_79(\&body_40_59);
+	&Xupdate_32_79(\&body_20_39);
+	&Xuplast_80(\&body_20_39);
+	&Xloop(\&body_20_39);
+	&Xloop(\&body_20_39);
+	&Xloop(\&body_20_39);
+$code.=<<___;
+	ldmia	$ctx,{$Ki,$t0,$t1,$Xfer}	@ accumulate context
+	add	$a,$a,$Ki
+	ldr	$Ki,[$ctx,#16]
+	add	$b,$b,$t0
+	add	$c,$c,$t1
+	add	$d,$d,$Xfer
+	moveq	sp,$saved_sp
+	add	$e,$e,$Ki
+	ldrne	$Ki,[sp]
+	stmia	$ctx,{$a,$b,$c,$d,$e}
+	addne	$Xfer,sp,#3*16
+	bne	.Loop_neon
+
+	@ vldmia	sp!,{d8-d15}
+	ldmia	sp!,{r4-r12,pc}
+.size	sha1_block_data_order_neon,.-sha1_block_data_order_neon
+#endif
+___
+}}}
+#####################################################################
+# ARMv8 stuff
+#
+{{{
+my ($ABCD,$E,$E0,$E1)=map("q$_",(0..3));
+my @MSG=map("q$_",(4..7));
+my @Kxx=map("q$_",(8..11));
+my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.type	sha1_block_data_order_armv8,%function
+.align	5
+sha1_block_data_order_armv8:
+.LARMv8:
+	vstmdb	sp!,{d8-d15}		@ ABI specification says so
+
+	veor	$E,$E,$E
+	adr	r3,.LK_00_19
+	vld1.32	{$ABCD},[$ctx]!
+	vld1.32	{$E\[0]},[$ctx]
+	sub	$ctx,$ctx,#16
+	vld1.32	{@Kxx[0]\[]},[r3,:32]!
+	vld1.32	{@Kxx[1]\[]},[r3,:32]!
+	vld1.32	{@Kxx[2]\[]},[r3,:32]!
+	vld1.32	{@Kxx[3]\[]},[r3,:32]
+
+.Loop_v8:
+	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
+	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
+	vrev32.8	@MSG[0],@MSG[0]
+	vrev32.8	@MSG[1],@MSG[1]
+
+	vadd.i32	$W0,@Kxx[0],@MSG[0]
+	vrev32.8	@MSG[2],@MSG[2]
+	vmov		$ABCD_SAVE,$ABCD	@ offload
+	subs		$len,$len,#1
+
+	vadd.i32	$W1,@Kxx[0],@MSG[1]
+	vrev32.8	@MSG[3],@MSG[3]
+	sha1h		$E1,$ABCD		@ 0
+	sha1c		$ABCD,$E,$W0
+	vadd.i32	$W0,@Kxx[$j],@MSG[2]
+	sha1su0		@MSG[0],@MSG[1],@MSG[2]
+___
+for ($j=0,$i=1;$i<20-3;$i++) {
+my $f=("c","p","m","p")[$i/5];
+$code.=<<___;
+	sha1h		$E0,$ABCD		@ $i
+	sha1$f		$ABCD,$E1,$W1
+	vadd.i32	$W1,@Kxx[$j],@MSG[3]
+	sha1su1		@MSG[0],@MSG[3]
+___
+$code.=<<___ if ($i<20-4);
+	sha1su0		@MSG[1],@MSG[2],@MSG[3]
+___
+	($E0,$E1)=($E1,$E0);	($W0,$W1)=($W1,$W0);
+	push(@MSG,shift(@MSG));	$j++ if ((($i+3)%5)==0);
+}
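+
+# The ("c","p","m","p")[$i/5] table above selects the round-group
+# instruction: sha1c (Ch) for rounds 0-19, sha1p (parity) for 20-39 and
+# 60-79, and sha1m (Maj) for 40-59, while $j steps through the four
+# round constants in @Kxx.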
+$code.=<<___;
+	sha1h		$E0,$ABCD		@ $i
+	sha1p		$ABCD,$E1,$W1
+	vadd.i32	$W1,@Kxx[$j],@MSG[3]
+
+	sha1h		$E1,$ABCD		@ 18
+	sha1p		$ABCD,$E0,$W0
+
+	sha1h		$E0,$ABCD		@ 19
+	sha1p		$ABCD,$E1,$W1
+
+	vadd.i32	$E,$E,$E0
+	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
+	bne		.Loop_v8
+
+	vst1.32		{$ABCD},[$ctx]!
+	vst1.32		{$E\[0]},[$ctx]
+
+	vldmia	sp!,{d8-d15}
+	ret					@ bx lr
+.size	sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
+#endif
+___
+}}}
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.comm	OPENSSL_armcap_P,4,4
+.hidden	OPENSSL_armcap_P
+#endif
+___
+
+{   my  %opcode = (
+	"sha1c"		=> 0xf2000c40,	"sha1p"		=> 0xf2100c40,
+	"sha1m"		=> 0xf2200c40,	"sha1su0"	=> 0xf2300c40,
+	"sha1h"		=> 0xf3b902c0,	"sha1su1"	=> 0xf3ba0380	);
+
+    sub unsha1 {
+	my ($mnemonic,$arg)=@_;
+
+	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+					 |(($2&7)<<17)|(($2&8)<<4)
+					 |(($3&7)<<1) |(($3&8)<<2);
+	    # ARMv7 instructions are always encoded little-endian, so the
+	    # bytes are emitted directly. The correct solution is to use
+	    # the .inst directive, but older assemblers don't implement
+	    # it:-(
+	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+			$word&0xff,($word>>8)&0xff,
+			($word>>16)&0xff,($word>>24)&0xff,
+			$mnemonic,$arg;
+	}
+    }
+}
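+
+# unsha1() hand-assembles the ARMv8 SHA-1 instructions so the file still
+# builds with assemblers that predate them: the register numbers are
+# folded into the base opcode from %opcode and the 32-bit word is
+# emitted as little-endian .byte values, keeping the mnemonic and
+# operands in a comment.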
+
+foreach (split($/,$code)) {
+	s/{q([0-9]+)\[\]}/sprintf "{d%d[],d%d[]}",2*$1,2*$1+1/eo	or
+	s/{q([0-9]+)\[0\]}/sprintf "{d%d[0]}",2*$1/eo;
+
+	s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo;
+
+	s/\bret\b/bx	lr/o		or
+	s/\bbx\s+lr\b/.word\t0xe12fff1e/o;	# make it possible to compile with -march=armv4
+
+	print $_,$/;
+}
+
+close STDOUT; # enforce flush
diff --git a/crypto/fipsmodule/sha/asm/sha1-armv8.pl b/crypto/fipsmodule/sha/asm/sha1-armv8.pl
new file mode 100644
index 0000000..dbbb54b
--- /dev/null
+++ b/crypto/fipsmodule/sha/asm/sha1-armv8.pl
@@ -0,0 +1,347 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA1 for ARMv8.
+#
+# Performance in cycles per processed byte and improvement coefficient
+# over code generated with "default" compiler:
+#
+#		hardware-assisted	software(*)
+# Apple A7	2.31			4.13 (+14%)
+# Cortex-A53	2.24			8.03 (+97%)
+# Cortex-A57	2.35			7.88 (+74%)
+# Denver	2.13			3.97 (+0%)(**)
+# X-Gene				8.80 (+200%)
+#
+# (*)	Software results are presented mostly for reference purposes.
+# (**)	Keep in mind that Denver relies on binary translation, which
+#	optimizes compiler output at run-time.
+
+$flavour = shift;
+$output  = shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+($ctx,$inp,$num)=("x0","x1","x2");
+@Xw=map("w$_",(3..17,19));
+@Xx=map("x$_",(3..17,19));
+@V=($A,$B,$C,$D,$E)=map("w$_",(20..24));
+($t0,$t1,$t2,$K)=map("w$_",(25..28));
+
+
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=($i+2)&15;
+
+$code.=<<___ if ($i<15 && !($i&1));
+	lsr	@Xx[$i+1],@Xx[$i],#32
+___
+$code.=<<___ if ($i<14 && !($i&1));
+	ldr	@Xx[$i+2],[$inp,#`($i+2)*4-64`]
+___
+$code.=<<___ if ($i<14 && ($i&1));
+#ifdef	__ARMEB__
+	ror	@Xx[$i+1],@Xx[$i+1],#32
+#else
+	rev32	@Xx[$i+1],@Xx[$i+1]
+#endif
+___
+$code.=<<___ if ($i<14);
+	bic	$t0,$d,$b
+	and	$t1,$c,$b
+	ror	$t2,$a,#27
+	add	$d,$d,$K		// future e+=K
+	orr	$t0,$t0,$t1
+	add	$e,$e,$t2		// e+=rot(a,5)
+	ror	$b,$b,#2
+	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
+	add	$e,$e,$t0		// e+=F(b,c,d)
+___
+$code.=<<___ if ($i==19);
+	movz	$K,#0xeba1
+	movk	$K,#0x6ed9,lsl#16
+___
+$code.=<<___ if ($i>=14);
+	 eor	@Xw[$j],@Xw[$j],@Xw[($j+2)&15]
+	bic	$t0,$d,$b
+	and	$t1,$c,$b
+	ror	$t2,$a,#27
+	 eor	@Xw[$j],@Xw[$j],@Xw[($j+8)&15]
+	add	$d,$d,$K		// future e+=K
+	orr	$t0,$t0,$t1
+	add	$e,$e,$t2		// e+=rot(a,5)
+	 eor	@Xw[$j],@Xw[$j],@Xw[($j+13)&15]
+	ror	$b,$b,#2
+	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
+	add	$e,$e,$t0		// e+=F(b,c,d)
+	 ror	@Xw[$j],@Xw[$j],#31
+___
+}
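+
+# Note the "future e+=K" and "future e+=X[i]" additions above: the K and
+# X[] contributions for round i+1 are accumulated into $d during round i
+# (this round's $d is next round's $e after the register rotation),
+# which shortens the critical dependency chain on $e.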
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=($i+2)&15;
+
+$code.=<<___ if ($i==59);
+	movz	$K,#0xc1d6
+	movk	$K,#0xca62,lsl#16
+___
+$code.=<<___;
+	orr	$t0,$b,$c
+	and	$t1,$b,$c
+	 eor	@Xw[$j],@Xw[$j],@Xw[($j+2)&15]
+	ror	$t2,$a,#27
+	and	$t0,$t0,$d
+	add	$d,$d,$K		// future e+=K
+	 eor	@Xw[$j],@Xw[$j],@Xw[($j+8)&15]
+	add	$e,$e,$t2		// e+=rot(a,5)
+	orr	$t0,$t0,$t1
+	ror	$b,$b,#2
+	 eor	@Xw[$j],@Xw[$j],@Xw[($j+13)&15]
+	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
+	add	$e,$e,$t0		// e+=F(b,c,d)
+	 ror	@Xw[$j],@Xw[$j],#31
+___
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=($i+2)&15;
+
+$code.=<<___ if ($i==39);
+	movz	$K,#0xbcdc
+	movk	$K,#0x8f1b,lsl#16
+___
+$code.=<<___ if ($i<78);
+	 eor	@Xw[$j],@Xw[$j],@Xw[($j+2)&15]
+	eor	$t0,$d,$b
+	ror	$t2,$a,#27
+	add	$d,$d,$K		// future e+=K
+	 eor	@Xw[$j],@Xw[$j],@Xw[($j+8)&15]
+	eor	$t0,$t0,$c
+	add	$e,$e,$t2		// e+=rot(a,5)
+	ror	$b,$b,#2
+	 eor	@Xw[$j],@Xw[$j],@Xw[($j+13)&15]
+	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
+	add	$e,$e,$t0		// e+=F(b,c,d)
+	 ror	@Xw[$j],@Xw[$j],#31
+___
+$code.=<<___ if ($i==78);
+	ldp	@Xw[1],@Xw[2],[$ctx]
+	eor	$t0,$d,$b
+	ror	$t2,$a,#27
+	add	$d,$d,$K		// future e+=K
+	eor	$t0,$t0,$c
+	add	$e,$e,$t2		// e+=rot(a,5)
+	ror	$b,$b,#2
+	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
+	add	$e,$e,$t0		// e+=F(b,c,d)
+___
+$code.=<<___ if ($i==79);
+	ldp	@Xw[3],@Xw[4],[$ctx,#8]
+	eor	$t0,$d,$b
+	ror	$t2,$a,#27
+	eor	$t0,$t0,$c
+	add	$e,$e,$t2		// e+=rot(a,5)
+	ror	$b,$b,#2
+	ldr	@Xw[5],[$ctx,#16]
+	add	$e,$e,$t0		// e+=F(b,c,d)
+___
+}
+
+$code.=<<___;
+#include <openssl/arm_arch.h>
+
+.text
+
+.extern	OPENSSL_armcap_P
+.globl	sha1_block_data_order
+.type	sha1_block_data_order,%function
+.align	6
+sha1_block_data_order:
+	ldr	x16,.LOPENSSL_armcap_P
+	adr	x17,.LOPENSSL_armcap_P
+	add	x16,x16,x17
+	ldr	w16,[x16]
+	tst	w16,#ARMV8_SHA1
+	b.ne	.Lv8_entry
+
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+	ldp	$A,$B,[$ctx]
+	ldp	$C,$D,[$ctx,#8]
+	ldr	$E,[$ctx,#16]
+
+.Loop:
+	ldr	@Xx[0],[$inp],#64
+	movz	$K,#0x7999
+	sub	$num,$num,#1
+	movk	$K,#0x5a82,lsl#16
+#ifdef	__ARMEB__
+	ror	@Xx[0],@Xx[0],#32
+#else
+	rev32	@Xx[0],@Xx[0]
+#endif
+	add	$E,$E,$K		// warm it up
+	add	$E,$E,@Xw[0]
+___
+for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
+for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	add	$B,$B,@Xw[2]
+	add	$C,$C,@Xw[3]
+	add	$A,$A,@Xw[1]
+	add	$D,$D,@Xw[4]
+	add	$E,$E,@Xw[5]
+	stp	$A,$B,[$ctx]
+	stp	$C,$D,[$ctx,#8]
+	str	$E,[$ctx,#16]
+	cbnz	$num,.Loop
+
+	ldp	x19,x20,[sp,#16]
+	ldp	x21,x22,[sp,#32]
+	ldp	x23,x24,[sp,#48]
+	ldp	x25,x26,[sp,#64]
+	ldp	x27,x28,[sp,#80]
+	ldr	x29,[sp],#96
+	ret
+.size	sha1_block_data_order,.-sha1_block_data_order
+___
+{{{
+my ($ABCD,$E,$E0,$E1)=map("v$_.16b",(0..3));
+my @MSG=map("v$_.16b",(4..7));
+my @Kxx=map("v$_.4s",(16..19));
+my ($W0,$W1)=("v20.4s","v21.4s");
+my $ABCD_SAVE="v22.16b";
+
+$code.=<<___;
+.type	sha1_block_armv8,%function
+.align	6
+sha1_block_armv8:
+.Lv8_entry:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	adr	x4,.Lconst
+	eor	$E,$E,$E
+	ld1.32	{$ABCD},[$ctx],#16
+	ld1.32	{$E}[0],[$ctx]
+	sub	$ctx,$ctx,#16
+	ld1.32	{@Kxx[0]-@Kxx[3]},[x4]
+
+.Loop_hw:
+	ld1	{@MSG[0]-@MSG[3]},[$inp],#64
+	sub	$num,$num,#1
+	rev32	@MSG[0],@MSG[0]
+	rev32	@MSG[1],@MSG[1]
+
+	add.i32	$W0,@Kxx[0],@MSG[0]
+	rev32	@MSG[2],@MSG[2]
+	orr	$ABCD_SAVE,$ABCD,$ABCD	// offload
+
+	add.i32	$W1,@Kxx[0],@MSG[1]
+	rev32	@MSG[3],@MSG[3]
+	sha1h	$E1,$ABCD
+	sha1c	$ABCD,$E,$W0		// 0
+	add.i32	$W0,@Kxx[$j],@MSG[2]
+	sha1su0	@MSG[0],@MSG[1],@MSG[2]
+___
+for ($j=0,$i=1;$i<20-3;$i++) {
+my $f=("c","p","m","p")[$i/5];
+$code.=<<___;
+	sha1h	$E0,$ABCD		// $i
+	sha1$f	$ABCD,$E1,$W1
+	add.i32	$W1,@Kxx[$j],@MSG[3]
+	sha1su1	@MSG[0],@MSG[3]
+___
+$code.=<<___ if ($i<20-4);
+	sha1su0	@MSG[1],@MSG[2],@MSG[3]
+___
+	($E0,$E1)=($E1,$E0);		($W0,$W1)=($W1,$W0);
+	push(@MSG,shift(@MSG));		$j++ if ((($i+3)%5)==0);
+}
+$code.=<<___;
+	sha1h	$E0,$ABCD		// $i
+	sha1p	$ABCD,$E1,$W1
+	add.i32	$W1,@Kxx[$j],@MSG[3]
+
+	sha1h	$E1,$ABCD		// 18
+	sha1p	$ABCD,$E0,$W0
+
+	sha1h	$E0,$ABCD		// 19
+	sha1p	$ABCD,$E1,$W1
+
+	add.i32	$E,$E,$E0
+	add.i32	$ABCD,$ABCD,$ABCD_SAVE
+
+	cbnz	$num,.Loop_hw
+
+	st1.32	{$ABCD},[$ctx],#16
+	st1.32	{$E}[0],[$ctx]
+
+	ldr	x29,[sp],#16
+	ret
+.size	sha1_block_armv8,.-sha1_block_armv8
+.align	6
+.Lconst:
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	//K_00_19
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	//K_20_39
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	//K_40_59
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	//K_60_79
+.LOPENSSL_armcap_P:
+.quad	OPENSSL_armcap_P-.
+.asciz	"SHA1 block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align	2
+.comm	OPENSSL_armcap_P,4,4
+___
+}}}
+
+{   my	%opcode = (
+	"sha1c"		=> 0x5e000000,	"sha1p"		=> 0x5e001000,
+	"sha1m"		=> 0x5e002000,	"sha1su0"	=> 0x5e003000,
+	"sha1h"		=> 0x5e280800,	"sha1su1"	=> 0x5e281800	);
+
+    sub unsha1 {
+	my ($mnemonic,$arg)=@_;
+
+	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
+	&&
+	sprintf ".inst\t0x%08x\t//%s %s",
+			$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
+			$mnemonic,$arg;
+    }
+}
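+
+# Unlike the 32-bit flavour, this unsha1() can rely on the assembler's
+# .inst directive and encodes the register operands straight into the
+# 32-bit opcode word.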
+
+foreach(split("\n",$code)) {
+
+	s/\`([^\`]*)\`/eval($1)/geo;
+
+	s/\b(sha1\w+)\s+([qv].*)/unsha1($1,$2)/geo;
+
+	s/\.\w?32\b//o		and s/\.16b/\.4s/go;
+	m/(ld|st)1[^\[]+\[0\]/o	and s/\.4s/\.s/go;
+
+	print $_,"\n";
+}
+
+close STDOUT;
diff --git a/crypto/fipsmodule/sha/asm/sha1-x86_64.pl b/crypto/fipsmodule/sha/asm/sha1-x86_64.pl
new file mode 100755
index 0000000..25a1cad
--- /dev/null
+++ b/crypto/fipsmodule/sha/asm/sha1-x86_64.pl
@@ -0,0 +1,2057 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# sha1_block procedure for x86_64.
+#
+# It was brought to my attention that on EM64T, compiler-generated code
+# was far behind the 32-bit assembler implementation. This is unlike on
+# Opteron where compiler-generated code was only 15% behind 32-bit
+# assembler, which originally made it hard to motivate the effort.
+# There was suggestion to mechanically translate 32-bit code, but I
+# dismissed it, reasoning that x86_64 offers enough register bank
+# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
+# implementation:-) However! While 64-bit code does perform better
+# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
+# x86_64 does offer larger *addressable* bank, but out-of-order core
+# reaches for even more registers through dynamic aliasing, and EM64T
+# core must have managed to run-time optimize even 32-bit code just as
+# well as the 64-bit one. Performance improvement is summarized in the
+# following table:
+#
+#		gcc 3.4		32-bit asm	cycles/byte
+# Opteron	+45%		+20%		6.8
+# Xeon P4	+65%		+0%		9.9
+# Core2		+60%		+10%		7.0
+
+# August 2009.
+#
+# The code was revised to minimize code size and to maximize
+# "distance" between instructions producing input to 'lea'
+# instruction and the 'lea' instruction itself, which is essential
+# for Intel Atom core.
+
+# October 2010.
+#
+# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
+# is to offload message schedule denoted by Wt in NIST specification,
+# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
+# for background and implementation details. The only difference from
+# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
+# to free temporary registers.
+
+# April 2011.
+#
+# Add AVX code path. See sha1-586.pl for further information.
+
+# May 2013.
+#
+# Add AVX2+BMI code path. Initial attempt (utilizing BMI instructions
+# and loading pair of consecutive blocks to 256-bit %ymm registers)
+# did not provide impressive performance improvement till a crucial
+# hint regarding the number of Xupdate iterations to pre-compute in
+# advance was provided by Ilya Albrekht of Intel Corp.
+
+# March 2014.
+#
+# Add support for Intel SHA Extensions.
+
+######################################################################
+# Current performance is summarized in following table. Numbers are
+# CPU clock cycles spent to process single byte (less is better).
+#
+#		x86_64		SSSE3		AVX[2]
+# P4		9.05		-
+# Opteron	6.26		-
+# Core2		6.55		6.05/+8%	-
+# Westmere	6.73		5.30/+27%	-
+# Sandy Bridge	7.70		6.10/+26%	4.99/+54%
+# Ivy Bridge	6.06		4.67/+30%	4.60/+32%
+# Haswell	5.45		4.15/+31%	3.57/+53%
+# Skylake	5.18		4.06/+28%	3.54/+46%
+# Bulldozer	9.11		5.95/+53%
+# VIA Nano	9.32		7.15/+30%
+# Atom		10.3		9.17/+12%
+# Silvermont	13.1(*)		9.37/+40%
+# Goldmont	8.13		6.42/+27%	1.70/+380%(**)
+#
+# (*)	obviously suboptimal result, nothing was done about it,
+#	because SSSE3 code is compiled unconditionally;
+# (**)	SHAEXT result
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+# In upstream, this is controlled by shelling out to the compiler to check
+# versions, but BoringSSL is intended to be used with pre-generated perlasm
+# output, so this isn't useful anyway.
+#
+# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
+# necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
+# did not tie them together until after $shaext was added.
+$avx = 1;
+
+# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
+# been tested.
+$shaext=0;	### set to zero if compiling for 1.0.1
+$avx=1		if (!$shaext && $avx);
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
+
+$ctx="%rdi";	# 1st arg
+$inp="%rsi";	# 2nd arg
+$num="%rdx";	# 3rd arg
+
+# reassign arguments in order to produce more compact code
+$ctx="%r8";
+$inp="%r9";
+$num="%r10";
+
+$t0="%eax";
+$t1="%ebx";
+$t2="%ecx";
+@xi=("%edx","%ebp","%r14d");
+$A="%esi";
+$B="%edi";
+$C="%r11d";
+$D="%r12d";
+$E="%r13d";
+
+@V=($A,$B,$C,$D,$E);
+
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i==0);
+	mov	`4*$i`($inp),$xi[0]
+	bswap	$xi[0]
+___
+$code.=<<___ if ($i<15);
+	mov	`4*$j`($inp),$xi[1]
+	mov	$d,$t0
+	mov	$xi[0],`4*$i`(%rsp)
+	mov	$a,$t2
+	bswap	$xi[1]
+	xor	$c,$t0
+	rol	\$5,$t2
+	and	$b,$t0
+	lea	0x5a827999($xi[0],$e),$e
+	add	$t2,$e
+	xor	$d,$t0
+	rol	\$30,$b
+	add	$t0,$e
+___
+$code.=<<___ if ($i>=15);
+	xor	`4*($j%16)`(%rsp),$xi[1]
+	mov	$d,$t0
+	mov	$xi[0],`4*($i%16)`(%rsp)
+	mov	$a,$t2
+	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
+	xor	$c,$t0
+	rol	\$5,$t2
+	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
+	and	$b,$t0
+	lea	0x5a827999($xi[0],$e),$e
+	rol	\$30,$b
+	xor	$d,$t0
+	add	$t2,$e
+	rol	\$1,$xi[1]
+	add	$t0,$e
+___
+push(@xi,shift(@xi));
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
+$code.=<<___ if ($i<79);
+	xor	`4*($j%16)`(%rsp),$xi[1]
+	mov	$b,$t0
+	`"mov	$xi[0],".4*($i%16)."(%rsp)"	if ($i<72)`
+	mov	$a,$t2
+	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
+	xor	$d,$t0
+	rol	\$5,$t2
+	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
+	lea	$K($xi[0],$e),$e
+	xor	$c,$t0
+	add	$t2,$e
+	rol	\$30,$b
+	add	$t0,$e
+	rol	\$1,$xi[1]
+___
+$code.=<<___ if ($i==79);
+	mov	$b,$t0
+	mov	$a,$t2
+	xor	$d,$t0
+	lea	$K($xi[0],$e),$e
+	rol	\$5,$t2
+	xor	$c,$t0
+	add	$t2,$e
+	rol	\$30,$b
+	add	$t0,$e
+___
+push(@xi,shift(@xi));
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___;
+	xor	`4*($j%16)`(%rsp),$xi[1]
+	mov	$d,$t0
+	mov	$xi[0],`4*($i%16)`(%rsp)
+	mov	$d,$t1
+	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
+	and	$c,$t0
+	mov	$a,$t2
+	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
+	lea	0x8f1bbcdc($xi[0],$e),$e
+	xor	$c,$t1
+	rol	\$5,$t2
+	add	$t0,$e
+	rol	\$1,$xi[1]
+	and	$b,$t1
+	add	$t2,$e
+	rol	\$30,$b
+	add	$t1,$e
+___
+push(@xi,shift(@xi));
+}
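+
+# One x86 idiom used in all three bodies above: the three-term addition
+# e += X[i] + K is folded into a single "lea K(Xi,e),e", since lea can
+# add a displacement, base and index in one instruction.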
+
+$code.=<<___;
+.text
+.extern	OPENSSL_ia32cap_addr
+
+.globl	sha1_block_data_order
+.type	sha1_block_data_order,\@function,3
+.align	16
+sha1_block_data_order:
+	lea	OPENSSL_ia32cap_addr(%rip),%r10
+	mov	(%r10),%r10
+	mov	0(%r10),%r9d
+	mov	4(%r10),%r8d
+	mov	8(%r10),%r10d
+	test	\$`1<<9`,%r8d		# check SSSE3 bit
+	jz	.Lialu
+___
+$code.=<<___ if ($shaext);
+	test	\$`1<<29`,%r10d		# check SHA bit
+	jnz	_shaext_shortcut
+___
+$code.=<<___ if ($avx>1);
+	and	\$`1<<3|1<<5|1<<8`,%r10d	# check AVX2+BMI1+BMI2
+	cmp	\$`1<<3|1<<5|1<<8`,%r10d
+	je	_avx2_shortcut
+___
+$code.=<<___ if ($avx);
+	and	\$`1<<28`,%r8d		# mask AVX bit
+	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
+	or	%r9d,%r8d
+	cmp	\$`1<<28|1<<30`,%r8d
+	je	_avx_shortcut
+___
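+
+# Dispatch note: the words read via OPENSSL_ia32cap_addr are a lightly
+# massaged copy of CPUID output -- %r9d gets leaf-1 EDX (with bit 30
+# repurposed as an "Intel CPU" flag), %r8d gets leaf-1 ECX (bit 9 SSSE3,
+# bit 28 AVX) and %r10d gets leaf-7 EBX (bit 3 BMI1, bit 5 AVX2, bit 8
+# BMI2, bit 29 SHA), so each extension gets its own entry point below.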
+$code.=<<___;
+	jmp	_ssse3_shortcut
+
+.align	16
+.Lialu:
+	mov	%rsp,%rax
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	mov	%rdi,$ctx	# reassigned argument
+	sub	\$`8+16*4`,%rsp
+	mov	%rsi,$inp	# reassigned argument
+	and	\$-64,%rsp
+	mov	%rdx,$num	# reassigned argument
+	mov	%rax,`16*4`(%rsp)
+.Lprologue:
+
+	mov	0($ctx),$A
+	mov	4($ctx),$B
+	mov	8($ctx),$C
+	mov	12($ctx),$D
+	mov	16($ctx),$E
+	jmp	.Lloop
+
+.align	16
+.Lloop:
+___
+for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
+for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	add	0($ctx),$A
+	add	4($ctx),$B
+	add	8($ctx),$C
+	add	12($ctx),$D
+	add	16($ctx),$E
+	mov	$A,0($ctx)
+	mov	$B,4($ctx)
+	mov	$C,8($ctx)
+	mov	$D,12($ctx)
+	mov	$E,16($ctx)
+
+	sub	\$1,$num
+	lea	`16*4`($inp),$inp
+	jnz	.Lloop
+
+	mov	`16*4`(%rsp),%rsi
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
+.Lepilogue:
+	ret
+.size	sha1_block_data_order,.-sha1_block_data_order
+___
+if ($shaext) {{{
+######################################################################
+# Intel SHA Extensions implementation of SHA1 update function.
+#
+my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
+my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
+my @MSG=map("%xmm$_",(4..7));
+
+$code.=<<___;
+.type	sha1_block_data_order_shaext,\@function,3
+.align	32
+sha1_block_data_order_shaext:
+_shaext_shortcut:
+___
+$code.=<<___ if ($win64);
+	lea	`-8-4*16`(%rsp),%rsp
+	movaps	%xmm6,-8-4*16(%rax)
+	movaps	%xmm7,-8-3*16(%rax)
+	movaps	%xmm8,-8-2*16(%rax)
+	movaps	%xmm9,-8-1*16(%rax)
+.Lprologue_shaext:
+___
+$code.=<<___;
+	movdqu	($ctx),$ABCD
+	movd	16($ctx),$E
+	movdqa	K_XX_XX+0xa0(%rip),$BSWAP	# byte-n-word swap
+
+	movdqu	($inp),@MSG[0]
+	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
+	movdqu	0x10($inp),@MSG[1]
+	pshufd	\$0b00011011,$E,$E		# flip word order
+	movdqu	0x20($inp),@MSG[2]
+	pshufb	$BSWAP,@MSG[0]
+	movdqu	0x30($inp),@MSG[3]
+	pshufb	$BSWAP,@MSG[1]
+	pshufb	$BSWAP,@MSG[2]
+	movdqa	$E,$E_SAVE			# offload $E
+	pshufb	$BSWAP,@MSG[3]
+	jmp	.Loop_shaext
+
+.align	16
+.Loop_shaext:
+	dec		$num
+	lea		0x40($inp),%r8		# next input block
+	paddd		@MSG[0],$E
+	cmovne		%r8,$inp
+	movdqa		$ABCD,$ABCD_SAVE	# offload $ABCD
+___
+for($i=0;$i<20-4;$i+=2) {
+$code.=<<___;
+	sha1msg1	@MSG[1],@MSG[0]
+	movdqa		$ABCD,$E_
+	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 0-3...
+	sha1nexte	@MSG[1],$E_
+	pxor		@MSG[2],@MSG[0]
+	sha1msg1	@MSG[2],@MSG[1]
+	sha1msg2	@MSG[3],@MSG[0]
+
+	movdqa		$ABCD,$E
+	sha1rnds4	\$`int(($i+1)/5)`,$E_,$ABCD
+	sha1nexte	@MSG[2],$E
+	pxor		@MSG[3],@MSG[1]
+	sha1msg2	@MSG[0],@MSG[1]
+___
+	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+	movdqu		($inp),@MSG[0]
+	movdqa		$ABCD,$E_
+	sha1rnds4	\$3,$E,$ABCD		# 64-67
+	sha1nexte	@MSG[1],$E_
+	movdqu		0x10($inp),@MSG[1]
+	pshufb		$BSWAP,@MSG[0]
+
+	movdqa		$ABCD,$E
+	sha1rnds4	\$3,$E_,$ABCD		# 68-71
+	sha1nexte	@MSG[2],$E
+	movdqu		0x20($inp),@MSG[2]
+	pshufb		$BSWAP,@MSG[1]
+
+	movdqa		$ABCD,$E_
+	sha1rnds4	\$3,$E,$ABCD		# 72-75
+	sha1nexte	@MSG[3],$E_
+	movdqu		0x30($inp),@MSG[3]
+	pshufb		$BSWAP,@MSG[2]
+
+	movdqa		$ABCD,$E
+	sha1rnds4	\$3,$E_,$ABCD		# 76-79
+	sha1nexte	$E_SAVE,$E
+	pshufb		$BSWAP,@MSG[3]
+
+	paddd		$ABCD_SAVE,$ABCD
+	movdqa		$E,$E_SAVE		# offload $E
+
+	jnz		.Loop_shaext
+
+	pshufd	\$0b00011011,$ABCD,$ABCD
+	pshufd	\$0b00011011,$E,$E
+	movdqu	$ABCD,($ctx)
+	movd	$E,16($ctx)
+___
+$code.=<<___ if ($win64);
+	movaps	-8-4*16(%rax),%xmm6
+	movaps	-8-3*16(%rax),%xmm7
+	movaps	-8-2*16(%rax),%xmm8
+	movaps	-8-1*16(%rax),%xmm9
+	mov	%rax,%rsp
+.Lepilogue_shaext:
+___
+$code.=<<___;
+	ret
+.size	sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
+___
+}}}
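+
+# SHA Extensions recap: sha1rnds4 performs four rounds at a time, with
+# the immediate selecting which of the four F/K combinations to use;
+# sha1nexte derives the next E from the current state and adds it to the
+# next group of message words; and sha1msg1/sha1msg2 compute the two
+# halves of the W[] schedule update.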
+{{{
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my $Kx="%xmm11";
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
+my @T=("%esi","%edi");
+my $j=0;
+my $rx=0;
+my $K_XX_XX="%r14";
+my $fp="%r11";
+
+my $_rol=sub { &rol(@_) };
+my $_ror=sub { &ror(@_) };
+
+{ my $sn;
+sub align32() {
+  ++$sn;
+$code.=<<___;
+	jmp	.Lalign32_$sn	# see "Decoded ICache" in manual
+.align	32
+.Lalign32_$sn:
+___
+}
+}
+
+$code.=<<___;
+.type	sha1_block_data_order_ssse3,\@function,3
+.align	16
+sha1_block_data_order_ssse3:
+_ssse3_shortcut:
+	mov	%rsp,$fp	# frame pointer
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13		# redundant, done to share Win64 SE handler
+	push	%r14
+	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+	movaps	%xmm6,-40-6*16($fp)
+	movaps	%xmm7,-40-5*16($fp)
+	movaps	%xmm8,-40-4*16($fp)
+	movaps	%xmm9,-40-3*16($fp)
+	movaps	%xmm10,-40-2*16($fp)
+	movaps	%xmm11,-40-1*16($fp)
+.Lprologue_ssse3:
+___
+$code.=<<___;
+	and	\$-64,%rsp
+	mov	%rdi,$ctx	# reassigned argument
+	mov	%rsi,$inp	# reassigned argument
+	mov	%rdx,$num	# reassigned argument
+
+	shl	\$6,$num
+	add	$inp,$num
+	lea	K_XX_XX+64(%rip),$K_XX_XX
+
+	mov	0($ctx),$A		# load context
+	mov	4($ctx),$B
+	mov	8($ctx),$C
+	mov	12($ctx),$D
+	mov	$B,@T[0]		# magic seed
+	mov	16($ctx),$E
+	mov	$C,@T[1]
+	xor	$D,@T[1]
+	and	@T[1],@T[0]
+
+	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
+	movdqa	-64($K_XX_XX),@Tx[1]	# K_00_19
+	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
+	movdqu	16($inp),@X[-3&7]
+	movdqu	32($inp),@X[-2&7]
+	movdqu	48($inp),@X[-1&7]
+	pshufb	@X[2],@X[-4&7]		# byte swap
+	pshufb	@X[2],@X[-3&7]
+	pshufb	@X[2],@X[-2&7]
+	add	\$64,$inp
+	paddd	@Tx[1],@X[-4&7]		# add K_00_19
+	pshufb	@X[2],@X[-1&7]
+	paddd	@Tx[1],@X[-3&7]
+	paddd	@Tx[1],@X[-2&7]
+	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
+	psubd	@Tx[1],@X[-4&7]		# restore X[]
+	movdqa	@X[-3&7],16(%rsp)
+	psubd	@Tx[1],@X[-3&7]
+	movdqa	@X[-2&7],32(%rsp)
+	psubd	@Tx[1],@X[-2&7]
+	jmp	.Loop_ssse3
+___
+
+sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+  my $arg = pop;
+    $arg = "\$$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
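+
+# This AUTOLOAD variant emits AT&T-syntax text: a trailing numeric
+# operand gets a "$" immediate prefix and the argument list is reversed,
+# so the calls below can be written in destination-first order.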
+
+sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));		# ror
+	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
+	 eval(shift(@insns));
+	&movdqa	(@Tx[0],@X[-1&7]);
+	  &paddd	(@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&movdqa	(@Tx[2],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	&movdqa	(@Tx[0],@X[0]);
+	 eval(shift(@insns));
+
+	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
+	&paddd	(@X[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&psrld	(@Tx[0],31);
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	&movdqa	(@Tx[1],@Tx[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&psrld	(@Tx[2],30);
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pslld	(@Tx[1],2);
+	&pxor	(@X[0],@Tx[2]);
+	 eval(shift(@insns));
+	  &movdqa	(@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)");	# K_XX_XX
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
+	&pshufd (@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79
+
+	 foreach (@insns) { eval; }	# remaining instructions [if any]
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+		push(@Tx,shift(@Tx));
+}
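+
+# Xupdate_ssse3_16_31 produces four W[] values per call. The wrinkle is
+# that W[i] depends on W[i-3], which for the last lane lives in the very
+# vector being computed; the code computes that lane with a zero in
+# place of the missing term and patches it afterwards by XORing in a
+# rotated copy of lane 0 (the '("X[0]">>96)<<<2' step above).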
+
+sub Xupdate_ssse3_32_79()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns))		if ($Xi==8);
+	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
+	 eval(shift(@insns))		if ($Xi==8);
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
+	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
+	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	if ($Xi%5) {
+	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+	} else {			# ... or load next one
+	  &movdqa	(@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
+	}
+	 eval(shift(@insns));		# ror
+	  &paddd	(@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
+
+	&movdqa	(@Tx[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# body_20_39
+
+	&pslld	(@X[0],2);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&psrld	(@Tx[0],30);
+	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+
+	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
+	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
+	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+
+	 foreach (@insns) { eval; }	# remaining instructions
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+		push(@Tx,shift(@Tx));
+}
+
+sub Xuplast_ssse3_80()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &paddd	(@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
+
+	 foreach (@insns) { eval; }		# remaining instructions
+
+	&cmp	($inp,$num);
+	&je	(".Ldone_ssse3");
+
+	unshift(@Tx,pop(@Tx));
+
+	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
+	&movdqa	(@Tx[1],"-64($K_XX_XX)");	# K_00_19
+	&movdqu	(@X[-4&7],"0($inp)");		# load input
+	&movdqu	(@X[-3&7],"16($inp)");
+	&movdqu	(@X[-2&7],"32($inp)");
+	&movdqu	(@X[-1&7],"48($inp)");
+	&pshufb	(@X[-4&7],@X[2]);		# byte swap
+	&add	($inp,64);
+
+  $Xi=0;
+}
+
+sub Xloop_ssse3()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&pshufb	(@X[($Xi-3)&7],@X[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&paddd	(@X[($Xi-4)&7],@Tx[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&psubd	(@X[($Xi-4)&7],@Tx[1]);
+
+	foreach (@insns) { eval; }
+  $Xi++;
+}
+
+sub Xtail_ssse3()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	foreach (@insns) { eval; }
+}
+
+sub body_00_19 () {	# ((c^d)&b)^d
+	# on start @T[0]=(c^d)&b
+	return &body_20_39() if ($rx==19); $rx++;
+	(
+	'($a,$b,$c,$d,$e)=@V;'.
+	'&$_ror	($b,$j?7:2)',	# $b>>>2
+	'&xor	(@T[0],$d)',
+	'&mov	(@T[1],$a)',	# $b for next round
+
+	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
+	'&xor	($b,$c)',	# $c^$d for next round
+
+	'&$_rol	($a,5)',
+	'&add	($e,@T[0])',
+	'&and	(@T[1],$b)',	# ($b&($c^$d)) for next round
+
+	'&xor	($b,$c)',	# restore $b
+	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+}
+
+sub body_20_39 () {	# b^d^c
+	# on entry @T[0]=b^d
+	return &body_40_59() if ($rx==39); $rx++;
+	(
+	'($a,$b,$c,$d,$e)=@V;'.
+	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
+	'&xor	(@T[0],$d)	if($j==19);'.
+	'&xor	(@T[0],$c)	if($j> 19)',	# ($b^$d^$c)
+	'&mov	(@T[1],$a)',	# $b for next round
+
+	'&$_rol	($a,5)',
+	'&add	($e,@T[0])',
+	'&xor	(@T[1],$c)	if ($j< 79)',	# $b^$d for next round
+
+	'&$_ror	($b,7)',	# $b>>>2
+	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+}
+
+sub body_40_59 () {	# ((b^c)&(c^d))^c
+	# on entry @T[0]=(b^c), (c^=d)
+	$rx++;
+	(
+	'($a,$b,$c,$d,$e)=@V;'.
+	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
+	'&and	(@T[0],$c)	if ($j>=40)',	# (b^c)&(c^d)
+	'&xor	($c,$d)		if ($j>=40)',	# restore $c
+
+	'&$_ror	($b,7)',	# $b>>>2
+	'&mov	(@T[1],$a)',	# $b for next round
+	'&xor	(@T[0],$c)',
+
+	'&$_rol	($a,5)',
+	'&add	($e,@T[0])',
+	'&xor	(@T[1],$c)	if ($j==59);'.
+	'&xor	(@T[1],$b)	if ($j< 59)',	# b^c for next round
+
+	'&xor	($b,$c)		if ($j< 59)',	# c^d for next round
+	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+}
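+
+# The three bodies above use register-saving forms of the SHA-1 round
+# functions: F_00_19=((c^d)&b)^d, F_20_39=b^d^c and
+# F_40_59=((b^c)&(c^d))^c, each algebraically equal to the FIPS 180-4
+# Ch, Parity and Maj. @T[0] always enters a round holding a partial term
+# pre-computed by the previous round (the "magic seed" set up before the
+# loop), which is what lets adjacent rounds overlap.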
+$code.=<<___;
+.align	16
+.Loop_ssse3:
+___
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_32_79(\&body_00_19);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"
+
+				$saved_j=$j; @saved_V=@V;
+
+	&Xloop_ssse3(\&body_20_39);
+	&Xloop_ssse3(\&body_20_39);
+	&Xloop_ssse3(\&body_20_39);
+
+$code.=<<___;
+	add	0($ctx),$A			# update context
+	add	4($ctx),@T[0]
+	add	8($ctx),$C
+	add	12($ctx),$D
+	mov	$A,0($ctx)
+	add	16($ctx),$E
+	mov	@T[0],4($ctx)
+	mov	@T[0],$B			# magic seed
+	mov	$C,8($ctx)
+	mov	$C,@T[1]
+	mov	$D,12($ctx)
+	xor	$D,@T[1]
+	mov	$E,16($ctx)
+	and	@T[1],@T[0]
+	jmp	.Loop_ssse3
+
+.align	16
+.Ldone_ssse3:
+___
+				$j=$saved_j; @V=@saved_V;
+
+	&Xtail_ssse3(\&body_20_39);
+	&Xtail_ssse3(\&body_20_39);
+	&Xtail_ssse3(\&body_20_39);
+
+$code.=<<___;
+	add	0($ctx),$A			# update context
+	add	4($ctx),@T[0]
+	add	8($ctx),$C
+	mov	$A,0($ctx)
+	add	12($ctx),$D
+	mov	@T[0],4($ctx)
+	add	16($ctx),$E
+	mov	$C,8($ctx)
+	mov	$D,12($ctx)
+	mov	$E,16($ctx)
+___
+$code.=<<___ if ($win64);
+	movaps	-40-6*16($fp),%xmm6
+	movaps	-40-5*16($fp),%xmm7
+	movaps	-40-4*16($fp),%xmm8
+	movaps	-40-3*16($fp),%xmm9
+	movaps	-40-2*16($fp),%xmm10
+	movaps	-40-1*16($fp),%xmm11
+___
+$code.=<<___;
+	mov	-40($fp),%r14
+	mov	-32($fp),%r13
+	mov	-24($fp),%r12
+	mov	-16($fp),%rbp
+	mov	-8($fp),%rbx
+	lea	($fp),%rsp
+.Lepilogue_ssse3:
+	ret
+.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
+___
+
+if ($avx) {
+$Xi=4;				# reset variables
+@X=map("%xmm$_",(4..7,0..3));
+@Tx=map("%xmm$_",(8..10));
+$j=0;
+$rx=0;
+
+my $done_avx_label=".Ldone_avx";
+
+my $_rol=sub { &shld(@_[0],@_) };
+my $_ror=sub { &shrd(@_[0],@_) };
+
+$code.=<<___;
+.type	sha1_block_data_order_avx,\@function,3
+.align	16
+sha1_block_data_order_avx:
+_avx_shortcut:
+	mov	%rsp,$fp
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13		# redundant, done to share Win64 SE handler
+	push	%r14
+	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
+	vzeroupper
+___
+$code.=<<___ if ($win64);
+	vmovaps	%xmm6,-40-6*16($fp)
+	vmovaps	%xmm7,-40-5*16($fp)
+	vmovaps	%xmm8,-40-4*16($fp)
+	vmovaps	%xmm9,-40-3*16($fp)
+	vmovaps	%xmm10,-40-2*16($fp)
+	vmovaps	%xmm11,-40-1*16($fp)
+.Lprologue_avx:
+___
+$code.=<<___;
+	and	\$-64,%rsp
+	mov	%rdi,$ctx	# reassigned argument
+	mov	%rsi,$inp	# reassigned argument
+	mov	%rdx,$num	# reassigned argument
+
+	shl	\$6,$num
+	add	$inp,$num
+	lea	K_XX_XX+64(%rip),$K_XX_XX
+
+	mov	0($ctx),$A		# load context
+	mov	4($ctx),$B
+	mov	8($ctx),$C
+	mov	12($ctx),$D
+	mov	$B,@T[0]		# magic seed
+	mov	16($ctx),$E
+	mov	$C,@T[1]
+	xor	$D,@T[1]
+	and	@T[1],@T[0]
+
+	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
+	vmovdqa	-64($K_XX_XX),$Kx	# K_00_19
+	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
+	vmovdqu	16($inp),@X[-3&7]
+	vmovdqu	32($inp),@X[-2&7]
+	vmovdqu	48($inp),@X[-1&7]
+	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
+	add	\$64,$inp
+	vpshufb	@X[2],@X[-3&7],@X[-3&7]
+	vpshufb	@X[2],@X[-2&7],@X[-2&7]
+	vpshufb	@X[2],@X[-1&7],@X[-1&7]
+	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
+	vpaddd	$Kx,@X[-3&7],@X[1]
+	vpaddd	$Kx,@X[-2&7],@X[2]
+	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
+	vmovdqa	@X[1],16(%rsp)
+	vmovdqa	@X[2],32(%rsp)
+	jmp	.Loop_avx
+___
+
+sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpsrld	(@Tx[0],@X[0],31);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
+	&vpaddd	(@X[0],@X[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpsrld	(@Tx[1],@Tx[2],30);
+	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpslld	(@Tx[2],@Tx[2],2);
+	&vpxor	(@X[0],@X[0],@Tx[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vmovdqa	($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+
+	 foreach (@insns) { eval; }	# remaining instructions [if any]
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+}
+
+sub Xupdate_avx_32_79()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
+  my ($a,$b,$c,$d,$e);
+
+	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
+	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
+	 eval(shift(@insns));
+	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
+	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
+	  &vmovdqa	($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&vpsrld	(@Tx[0],@X[0],30);
+	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&vpslld	(@X[0],@X[0],2);
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+
+	 foreach (@insns) { eval; }	# remaining instructions
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+}
+
+sub Xuplast_avx_80()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
+
+	 foreach (@insns) { eval; }		# remaining instructions
+
+	&cmp	($inp,$num);
+	&je	($done_avx_label);
+
+	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
+	&vmovdqa($Kx,"-64($K_XX_XX)");		# K_00_19
+	&vmovdqu(@X[-4&7],"0($inp)");		# load input
+	&vmovdqu(@X[-3&7],"16($inp)");
+	&vmovdqu(@X[-2&7],"32($inp)");
+	&vmovdqu(@X[-1&7],"48($inp)");
+	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
+	&add	($inp,64);
+
+  $Xi=0;
+}
+
+sub Xloop_avx()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],$Kx);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	foreach (@insns) { eval; }
+  $Xi++;
+}
+
+sub Xtail_avx()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	foreach (@insns) { eval; }
+}
+
+$code.=<<___;
+.align	16
+.Loop_avx:
+___
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_32_79(\&body_00_19);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"
+
+				$saved_j=$j; @saved_V=@V;
+
+	&Xloop_avx(\&body_20_39);
+	&Xloop_avx(\&body_20_39);
+	&Xloop_avx(\&body_20_39);
+
+$code.=<<___;
+	add	0($ctx),$A			# update context
+	add	4($ctx),@T[0]
+	add	8($ctx),$C
+	add	12($ctx),$D
+	mov	$A,0($ctx)
+	add	16($ctx),$E
+	mov	@T[0],4($ctx)
+	mov	@T[0],$B			# magic seed
+	mov	$C,8($ctx)
+	mov	$C,@T[1]
+	mov	$D,12($ctx)
+	xor	$D,@T[1]
+	mov	$E,16($ctx)
+	and	@T[1],@T[0]
+	jmp	.Loop_avx
+
+.align	16
+$done_avx_label:
+___
+				$j=$saved_j; @V=@saved_V;
+
+	&Xtail_avx(\&body_20_39);
+	&Xtail_avx(\&body_20_39);
+	&Xtail_avx(\&body_20_39);
+
+$code.=<<___;
+	vzeroupper
+
+	add	0($ctx),$A			# update context
+	add	4($ctx),@T[0]
+	add	8($ctx),$C
+	mov	$A,0($ctx)
+	add	12($ctx),$D
+	mov	@T[0],4($ctx)
+	add	16($ctx),$E
+	mov	$C,8($ctx)
+	mov	$D,12($ctx)
+	mov	$E,16($ctx)
+___
+$code.=<<___ if ($win64);
+	movaps	-40-6*16($fp),%xmm6
+	movaps	-40-5*16($fp),%xmm7
+	movaps	-40-4*16($fp),%xmm8
+	movaps	-40-3*16($fp),%xmm9
+	movaps	-40-2*16($fp),%xmm10
+	movaps	-40-1*16($fp),%xmm11
+___
+$code.=<<___;
+	mov	-40($fp),%r14
+	mov	-32($fp),%r13
+	mov	-24($fp),%r12
+	mov	-16($fp),%rbp
+	mov	-8($fp),%rbx
+	lea	($fp),%rsp
+.Lepilogue_avx:
+	ret
+.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
+___
+
+if ($avx>1) {
+use integer;
+$Xi=4;					# reset variables
+@X=map("%ymm$_",(4..7,0..3));
+@Tx=map("%ymm$_",(8..10));
+$Kx="%ymm11";
+$j=0;
+
+my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
+my ($a5,$t0)=("%r12d","%edi");
+
+my ($A,$F,$B,$C,$D,$E)=@ROTX;
+my $rx=0;
+my $frame="%r13";
+
+$code.=<<___;
+.type	sha1_block_data_order_avx2,\@function,3
+.align	16
+sha1_block_data_order_avx2:
+_avx2_shortcut:
+	mov	%rsp,$fp
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	vzeroupper
+___
+$code.=<<___ if ($win64);
+	lea	-6*16(%rsp),%rsp
+	vmovaps	%xmm6,-40-6*16($fp)
+	vmovaps	%xmm7,-40-5*16($fp)
+	vmovaps	%xmm8,-40-4*16($fp)
+	vmovaps	%xmm9,-40-3*16($fp)
+	vmovaps	%xmm10,-40-2*16($fp)
+	vmovaps	%xmm11,-40-1*16($fp)
+.Lprologue_avx2:
+___
+$code.=<<___;
+	mov	%rdi,$ctx		# reassigned argument
+	mov	%rsi,$inp		# reassigned argument
+	mov	%rdx,$num		# reassigned argument
+
+	lea	-640(%rsp),%rsp
+	shl	\$6,$num
+	 lea	64($inp),$frame
+	and	\$-128,%rsp
+	add	$inp,$num
+	lea	K_XX_XX+64(%rip),$K_XX_XX
+
+	mov	0($ctx),$A		# load context
+	 cmp	$num,$frame
+	 cmovae	$inp,$frame		# next or same block
+	mov	4($ctx),$F
+	mov	8($ctx),$C
+	mov	12($ctx),$D
+	mov	16($ctx),$E
+	vmovdqu	64($K_XX_XX),@X[2]	# pbswap mask
+
+	vmovdqu		($inp),%xmm0
+	vmovdqu		16($inp),%xmm1
+	vmovdqu		32($inp),%xmm2
+	vmovdqu		48($inp),%xmm3
+	lea		64($inp),$inp
+	vinserti128	\$1,($frame),@X[-4&7],@X[-4&7]
+	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
+	vpshufb		@X[2],@X[-4&7],@X[-4&7]
+	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
+	vpshufb		@X[2],@X[-3&7],@X[-3&7]
+	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
+	vpshufb		@X[2],@X[-2&7],@X[-2&7]
+	vmovdqu		-64($K_XX_XX),$Kx	# K_00_19
+	vpshufb		@X[2],@X[-1&7],@X[-1&7]
+
+	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
+	vpaddd	$Kx,@X[-3&7],@X[1]
+	vmovdqu	@X[0],0(%rsp)		# X[]+K xfer to IALU
+	vpaddd	$Kx,@X[-2&7],@X[2]
+	vmovdqu	@X[1],32(%rsp)
+	vpaddd	$Kx,@X[-1&7],@X[3]
+	vmovdqu	@X[2],64(%rsp)
+	vmovdqu	@X[3],96(%rsp)
+___
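+
+# Added gloss (not from the upstream script): the AVX2 path runs two
+# 64-byte blocks side by side, with the current block in the low 128-bit
+# lane of each %ymm register and the following block, pulled in via
+# vinserti128 from $frame, in the high lane.  $frame was set with
+# "cmovae $inp,$frame" above so that the final block simply pairs with
+# itself rather than reading past the end of the input.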
+for (;$Xi<8;$Xi++) {	# Xupdate_avx2_16_31
+    use integer;
+
+	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
+	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
+	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
+	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
+	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
+	&vpsrld	(@Tx[0],@X[0],31);
+	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
+	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
+	&vpaddd	(@X[0],@X[0],@X[0]);
+	&vpsrld	(@Tx[1],@Tx[2],30);
+	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
+	&vpslld	(@Tx[2],@Tx[2],2);
+	&vpxor	(@X[0],@X[0],@Tx[1]);
+	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
+	&vpaddd	(@Tx[1],@X[0],$Kx);
+	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
+
+	push(@X,shift(@X));	# "rotate" X[]
+}
+$code.=<<___;
+	lea	128(%rsp),$frame
+	jmp	.Loop_avx2
+.align	32
+.Loop_avx2:
+	rorx	\$2,$F,$B
+	andn	$D,$F,$t0
+	and	$C,$F
+	xor	$t0,$F
+___
+sub bodyx_00_19 () {	# 8 instructions, 3 cycles critical path
+	# at start $f=(b&c)^(~b&d), $b>>>=2
+	return &bodyx_20_39() if ($rx==19); $rx++;
+	(
+	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
+
+	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
+	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
+	'&andn	($t0,$a,$c)',			# ~b&d for next round
+
+	'&add	($e,$f)',			# e+=(b&c)^(~b&d)
+	'&rorx	($a5,$a,27)',			# a<<<5
+	'&rorx	($f,$a,2)',			# b>>>2 for next round
+	'&and	($a,$b)',			# b&c for next round
+
+	'&add	($e,$a5)',			# e+=a<<<5
+	'&xor	($a,$t0);'.			# f=(b&c)^(~b&d) for next round
+
+	'unshift(@ROTX,pop(@ROTX)); $j++;'
+	)
+}
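+
+# Illustrative sanity sketch (not from the upstream script): the round
+# above computes f = Ch(b,c,d) = (b&c)^(~b&d) from one andn, one and and
+# one xor, which is what keeps the critical path at 3 cycles.  The
+# bit-level check below is illustrative only and passes silently.
+for my $b (0,1) { for my $c (0,1) { for my $d (0,1) {
+	my $ch   = ($b & $c) ^ ((1 ^ $b) & $d);	# textbook Ch(b,c,d)
+	my $andn = (1 ^ $b) & $d;		# what andn produces: ~b&d
+	die "Ch decomposition mismatch" unless $ch == (($b & $c) ^ $andn);
+}}}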
+
+sub bodyx_20_39 () {	# 7 instructions, 2 cycles critical path
+	# on entry $f=b^c^d, $b>>>=2
+	return &bodyx_40_59() if ($rx==39); $rx++;
+	(
+	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
+
+	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
+	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
+
+	'&lea	($e,"($e,$f)")',		# e+=b^c^d
+	'&rorx	($a5,$a,27)',			# a<<<5
+	'&rorx	($f,$a,2)	if ($j<79)',	# b>>>2 in next round
+	'&xor	($a,$b)		if ($j<79)',	# b^c for next round
+
+	'&add	($e,$a5)',			# e+=a<<<5
+	'&xor	($a,$c)		if ($j<79);'.	# f=b^c^d for next round
+
+	'unshift(@ROTX,pop(@ROTX)); $j++;'
+	)
+}
+
+sub bodyx_40_59 () {	# 10 instructions, 3 cycles critical path
+	# on entry $f=((b^c)&(c^d)), $b>>>=2
+	$rx++;
+	(
+	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
+
+	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
+	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
+	'&xor	($f,$c)		if ($j>39)',	# (b^c)&(c^d)^c
+	'&mov	($t0,$b)	if ($j<59)',	# count on zero latency
+	'&xor	($t0,$c)	if ($j<59)',	# c^d for next round
+
+	'&lea	($e,"($e,$f)")',		# e+=(b^c)&(c^d)^c
+	'&rorx	($a5,$a,27)',			# a<<<5
+	'&rorx	($f,$a,2)',			# b>>>2 in next round
+	'&xor	($a,$b)',			# b^c for next round
+
+	'&add	($e,$a5)',			# e+=a<<<5
+	'&and	($a,$t0)	if ($j< 59);'.	# f=(b^c)&(c^d) for next round
+	'&xor	($a,$c)		if ($j==59);'.	# f=b^c^d for next round
+
+	'unshift(@ROTX,pop(@ROTX)); $j++;'
+	)
+}
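+
+# Illustrative sanity sketch (not from the upstream script): rounds
+# 40..59 need Maj(b,c,d), and the sequence above leans on the identity
+# Maj(b,c,d) = ((b^c)&(c^d))^c so that b^c and c^d can be carried across
+# rounds.  Bit-level check; passes silently.
+for my $b (0,1) { for my $c (0,1) { for my $d (0,1) {
+	my $maj = ($b & $c) ^ ($b & $d) ^ ($c & $d);
+	die "Maj identity mismatch" unless $maj == ((($b^$c) & ($c^$d)) ^ $c);
+}}}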
+
+sub Xupdate_avx2_16_31()		# recall that $Xi starts with 4
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 instructions
+  my ($a,$b,$c,$d,$e);
+
+	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
+	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpsrld	(@Tx[0],@X[0],31);
+	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
+	&vpaddd	(@X[0],@X[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpsrld	(@Tx[1],@Tx[2],30);
+	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpslld	(@Tx[2],@Tx[2],2);
+	&vpxor	(@X[0],@X[0],@Tx[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpaddd	(@Tx[1],@X[0],$Kx);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
+
+	 foreach (@insns) { eval; }	# remaining instructions [if any]
+
+	$Xi++;
+	push(@X,shift(@X));	# "rotate" X[]
+}
+
+sub Xupdate_avx2_32_79()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 to 50 instructions
+  my ($a,$b,$c,$d,$e);
+
+	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
+	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
+	&vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpsrld	(@Tx[0],@X[0],30);
+	&vpslld	(@X[0],@X[0],2);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	#&vpslld	(@X[0],@X[0],2);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpaddd	(@Tx[1],@X[0],$Kx);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
+
+	 foreach (@insns) { eval; }	# remaining instructions
+
+	$Xi++;
+	push(@X,shift(@X));	# "rotate" X[]
+}
+
+sub Xloop_avx2()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 foreach (@insns) { eval; }
+}
+
+	&align32();
+	&Xupdate_avx2_32_79(\&bodyx_00_19);
+	&Xupdate_avx2_32_79(\&bodyx_00_19);
+	&Xupdate_avx2_32_79(\&bodyx_00_19);
+	&Xupdate_avx2_32_79(\&bodyx_00_19);
+
+	&Xupdate_avx2_32_79(\&bodyx_20_39);
+	&Xupdate_avx2_32_79(\&bodyx_20_39);
+	&Xupdate_avx2_32_79(\&bodyx_20_39);
+	&Xupdate_avx2_32_79(\&bodyx_20_39);
+
+	&align32();
+	&Xupdate_avx2_32_79(\&bodyx_40_59);
+	&Xupdate_avx2_32_79(\&bodyx_40_59);
+	&Xupdate_avx2_32_79(\&bodyx_40_59);
+	&Xupdate_avx2_32_79(\&bodyx_40_59);
+
+	&Xloop_avx2(\&bodyx_20_39);
+	&Xloop_avx2(\&bodyx_20_39);
+	&Xloop_avx2(\&bodyx_20_39);
+	&Xloop_avx2(\&bodyx_20_39);
+
+$code.=<<___;
+	lea	128($inp),$frame
+	lea	128($inp),%rdi			# borrow $t0
+	cmp	$num,$frame
+	cmovae	$inp,$frame			# next or previous block
+
+	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
+	add	0($ctx),@ROTX[0]		# update context
+	add	4($ctx),@ROTX[1]
+	add	8($ctx),@ROTX[3]
+	mov	@ROTX[0],0($ctx)
+	add	12($ctx),@ROTX[4]
+	mov	@ROTX[1],4($ctx)
+	 mov	@ROTX[0],$A			# A=d
+	add	16($ctx),@ROTX[5]
+	 mov	@ROTX[3],$a5
+	mov	@ROTX[3],8($ctx)
+	 mov	@ROTX[4],$D			# D=b
+	 #xchg	@ROTX[5],$F			# F=c, C=f
+	mov	@ROTX[4],12($ctx)
+	 mov	@ROTX[1],$F			# F=e
+	mov	@ROTX[5],16($ctx)
+	#mov	$F,16($ctx)
+	 mov	@ROTX[5],$E			# E=c
+	 mov	$a5,$C				# C=f
+	 #xchg	$F,$E				# E=c, F=e
+
+	cmp	$num,$inp
+	je	.Ldone_avx2
+___
+
+$Xi=4;				# reset variables
+@X=map("%ymm$_",(4..7,0..3));
+
+$code.=<<___;
+	vmovdqu	64($K_XX_XX),@X[2]		# pbswap mask
+	cmp	$num,%rdi			# borrowed $t0
+	ja	.Last_avx2
+
+	vmovdqu		-64(%rdi),%xmm0		# low part of @X[-4&7]
+	vmovdqu		-48(%rdi),%xmm1
+	vmovdqu		-32(%rdi),%xmm2
+	vmovdqu		-16(%rdi),%xmm3
+	vinserti128	\$1,0($frame),@X[-4&7],@X[-4&7]
+	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
+	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
+	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
+	jmp	.Last_avx2
+
+.align	32
+.Last_avx2:
+	lea	128+16(%rsp),$frame
+	rorx	\$2,$F,$B
+	andn	$D,$F,$t0
+	and	$C,$F
+	xor	$t0,$F
+	sub	\$-128,$inp
+___
+	$rx=$j=0;	@ROTX=($A,$F,$B,$C,$D,$E);
+
+	&Xloop_avx2	(\&bodyx_00_19);
+	&Xloop_avx2	(\&bodyx_00_19);
+	&Xloop_avx2	(\&bodyx_00_19);
+	&Xloop_avx2	(\&bodyx_00_19);
+
+	&Xloop_avx2	(\&bodyx_20_39);
+	  &vmovdqu	($Kx,"-64($K_XX_XX)");		# K_00_19
+	  &vpshufb	(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
+	&Xloop_avx2	(\&bodyx_20_39);
+	  &vpshufb	(@X[-3&7],@X[-3&7],@X[2]);
+	  &vpaddd	(@Tx[0],@X[-4&7],$Kx);		# add K_00_19
+	&Xloop_avx2	(\&bodyx_20_39);
+	  &vmovdqu	("0(%rsp)",@Tx[0]);
+	  &vpshufb	(@X[-2&7],@X[-2&7],@X[2]);
+	  &vpaddd	(@Tx[1],@X[-3&7],$Kx);
+	&Xloop_avx2	(\&bodyx_20_39);
+	  &vmovdqu	("32(%rsp)",@Tx[1]);
+	  &vpshufb	(@X[-1&7],@X[-1&7],@X[2]);
+	  &vpaddd	(@X[2],@X[-2&7],$Kx);
+
+	&Xloop_avx2	(\&bodyx_40_59);
+	&align32	();
+	  &vmovdqu	("64(%rsp)",@X[2]);
+	  &vpaddd	(@X[3],@X[-1&7],$Kx);
+	&Xloop_avx2	(\&bodyx_40_59);
+	  &vmovdqu	("96(%rsp)",@X[3]);
+	&Xloop_avx2	(\&bodyx_40_59);
+	&Xupdate_avx2_16_31(\&bodyx_40_59);
+
+	&Xupdate_avx2_16_31(\&bodyx_20_39);
+	&Xupdate_avx2_16_31(\&bodyx_20_39);
+	&Xupdate_avx2_16_31(\&bodyx_20_39);
+	&Xloop_avx2	(\&bodyx_20_39);
+
+$code.=<<___;
+	lea	128(%rsp),$frame
+
+	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
+	add	0($ctx),@ROTX[0]		# update context
+	add	4($ctx),@ROTX[1]
+	add	8($ctx),@ROTX[3]
+	mov	@ROTX[0],0($ctx)
+	add	12($ctx),@ROTX[4]
+	mov	@ROTX[1],4($ctx)
+	 mov	@ROTX[0],$A			# A=d
+	add	16($ctx),@ROTX[5]
+	 mov	@ROTX[3],$a5
+	mov	@ROTX[3],8($ctx)
+	 mov	@ROTX[4],$D			# D=b
+	 #xchg	@ROTX[5],$F			# F=c, C=f
+	mov	@ROTX[4],12($ctx)
+	 mov	@ROTX[1],$F			# F=e
+	mov	@ROTX[5],16($ctx)
+	#mov	$F,16($ctx)
+	 mov	@ROTX[5],$E			# E=c
+	 mov	$a5,$C				# C=f
+	 #xchg	$F,$E				# E=c, F=e
+
+	cmp	$num,$inp
+	jbe	.Loop_avx2
+
+.Ldone_avx2:
+	vzeroupper
+___
+$code.=<<___ if ($win64);
+	movaps	-40-6*16($fp),%xmm6
+	movaps	-40-5*16($fp),%xmm7
+	movaps	-40-4*16($fp),%xmm8
+	movaps	-40-3*16($fp),%xmm9
+	movaps	-40-2*16($fp),%xmm10
+	movaps	-40-1*16($fp),%xmm11
+___
+$code.=<<___;
+	mov	-40($fp),%r14
+	mov	-32($fp),%r13
+	mov	-24($fp),%r12
+	mov	-16($fp),%rbp
+	mov	-8($fp),%rbx
+	lea	($fp),%rsp
+.Lepilogue_avx2:
+	ret
+.size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
+___
+}
+}
+$code.=<<___;
+.align	64
+K_XX_XX:
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
+.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
+___
+}}}
+$code.=<<___;
+.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align	64
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	se_handler,\@abi-omnipotent
+.align	16
+se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	lea	.Lprologue(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<.Lprologue
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	lea	.Lepilogue(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
+	jae	.Lcommon_seh_tail
+
+	mov	`16*4`(%rax),%rax	# pull saved stack pointer
+
+	mov	-8(%rax),%rbx
+	mov	-16(%rax),%rbp
+	mov	-24(%rax),%r12
+	mov	-32(%rax),%r13
+	mov	-40(%rax),%r14
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+
+	jmp	.Lcommon_seh_tail
+.size	se_handler,.-se_handler
+___
+
+$code.=<<___ if ($shaext);
+.type	shaext_handler,\@abi-omnipotent
+.align	16
+shaext_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	lea	.Lprologue_shaext(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<.Lprologue_shaext
+	jb	.Lcommon_seh_tail
+
+	lea	.Lepilogue_shaext(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip>=.Lepilogue_shaext
+	jae	.Lcommon_seh_tail
+
+	lea	-8-4*16(%rax),%rsi
+	lea	512($context),%rdi	# &context.Xmm6
+	mov	\$8,%ecx
+	.long	0xa548f3fc		# cld; rep movsq
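+	# (added note: the .long above assembles, little-endian, to the
+	# bytes fc f3 48 a5, i.e. "cld; rep movsq", copying %ecx=8 quadwords
+	# of saved XMM state into the CONTEXT record at &context.Xmm6)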
+
+	jmp	.Lcommon_seh_tail
+.size	shaext_handler,.-shaext_handler
+___
+
+$code.=<<___;
+.type	ssse3_handler,\@abi-omnipotent
+.align	16
+ssse3_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lcommon_seh_tail
+
+	mov	208($context),%rax	# pull context->R11
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
+
+	lea	-40-6*16(%rax),%rsi
+	lea	512($context),%rdi	# &context.Xmm6
+	mov	\$12,%ecx
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	-8(%rax),%rbx
+	mov	-16(%rax),%rbp
+	mov	-24(%rax),%r12
+	mov	-32(%rax),%r13
+	mov	-40(%rax),%r14
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+
+.Lcommon_seh_tail:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	ssse3_handler,.-ssse3_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_sha1_block_data_order
+	.rva	.LSEH_end_sha1_block_data_order
+	.rva	.LSEH_info_sha1_block_data_order
+___
+$code.=<<___ if ($shaext);
+	.rva	.LSEH_begin_sha1_block_data_order_shaext
+	.rva	.LSEH_end_sha1_block_data_order_shaext
+	.rva	.LSEH_info_sha1_block_data_order_shaext
+___
+$code.=<<___;
+	.rva	.LSEH_begin_sha1_block_data_order_ssse3
+	.rva	.LSEH_end_sha1_block_data_order_ssse3
+	.rva	.LSEH_info_sha1_block_data_order_ssse3
+___
+$code.=<<___ if ($avx);
+	.rva	.LSEH_begin_sha1_block_data_order_avx
+	.rva	.LSEH_end_sha1_block_data_order_avx
+	.rva	.LSEH_info_sha1_block_data_order_avx
+___
+$code.=<<___ if ($avx>1);
+	.rva	.LSEH_begin_sha1_block_data_order_avx2
+	.rva	.LSEH_end_sha1_block_data_order_avx2
+	.rva	.LSEH_info_sha1_block_data_order_avx2
+___
+$code.=<<___;
+.section	.xdata
+.align	8
+.LSEH_info_sha1_block_data_order:
+	.byte	9,0,0,0
+	.rva	se_handler
+___
+$code.=<<___ if ($shaext);
+.LSEH_info_sha1_block_data_order_shaext:
+	.byte	9,0,0,0
+	.rva	shaext_handler
+___
+$code.=<<___;
+.LSEH_info_sha1_block_data_order_ssse3:
+	.byte	9,0,0,0
+	.rva	ssse3_handler
+	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
+___
+$code.=<<___ if ($avx);
+.LSEH_info_sha1_block_data_order_avx:
+	.byte	9,0,0,0
+	.rva	ssse3_handler
+	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
+___
+$code.=<<___ if ($avx>1);
+.LSEH_info_sha1_block_data_order_avx2:
+	.byte	9,0,0,0
+	.rva	ssse3_handler
+	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
+___
+}
+
+####################################################################
+
+sub sha1rnds4 {
+    if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
+      my @opcode=(0x0f,0x3a,0xcc);
+	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
+	my $c=$1;
+	push @opcode,$c=~/^0/?oct($c):$c;
+	return ".byte\t".join(',',@opcode);
+    } else {
+	return "sha1rnds4\t".@_[0];
+    }
+}
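+
+# Added usage note (not from the upstream script): the sub above
+# hand-assembles the SHA-ISA encoding 0F 3A CC /r ib for assemblers that
+# predate these instructions.  For example, sha1rnds4('$0,%xmm4,%xmm0')
+# returns ".byte\t15,58,204,196,0" (decimal, since Perl stringifies the
+# numbers): 0x0f,0x3a,0xcc, then ModR/M = 0xc0|src|(dst<<3) =
+# 0xc0|4|(0<<3) = 0xc4, then the immediate.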
+
+sub sha1op38 {
+    my $instr = shift;
+    my %opcodelet = (
+		"sha1nexte" => 0xc8,
+  		"sha1msg1"  => 0xc9,
+		"sha1msg2"  => 0xca	);
+
+    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x0f,0x38);
+      my $rex=0;
+	$rex|=0x04			if ($2>=8);
+	$rex|=0x01			if ($1>=8);
+	unshift @opcode,0x40|$rex	if ($rex);
+	push @opcode,$opcodelet{$instr};
+	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
+	return ".byte\t".join(',',@opcode);
+    } else {
+	return $instr."\t".@_[0];
+    }
+}
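+
+# Added usage note (not from the upstream script): same idea, plus a REX
+# prefix once %xmm8..%xmm15 are involved.  For example,
+# sha1op38("sha1nexte","%xmm8,%xmm1") returns ".byte\t65,15,56,200,200":
+# REX.B (0x41) extends the r/m field for the high source register, then
+# 0x0f,0x38,0xc8 and ModR/M = 0xc0|(8&7)|(1<<3) = 0xc8.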
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/geo;
+
+	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo	or
+	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;
+
+	print $_,"\n";
+}
+close STDOUT;
diff --git a/crypto/fipsmodule/sha/asm/sha256-586.pl b/crypto/fipsmodule/sha/asm/sha256-586.pl
new file mode 100644
index 0000000..834d00f
--- /dev/null
+++ b/crypto/fipsmodule/sha/asm/sha256-586.pl
@@ -0,0 +1,1283 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA256 block transform for x86. September 2007.
+#
+# Performance improvement over compiler generated code varies from
+# 10% to 40% [see below]. Not very impressive on some µ-archs, but
+# it's 5 times smaller and optimizes the number of writes.
+#
+# May 2012.
+#
+# Optimization including two of Pavel Semjanov's ideas, alternative
+# Maj and full unroll, resulted in ~20-25% improvement on most CPUs,
+# ~7% on Pentium, ~40% on Atom. As the fully unrolled loop body is
+# almost 15x larger, 8KB vs. 560B, it's engaged only for longer inputs,
+# but not on P4, where it kills performance, nor on Sandy Bridge, where
+# the folded loop is approximately as fast...
+#
+# June 2012.
+#
+# Add AMD XOP-specific code path, >30% improvement on Bulldozer over
+# May version, >60% over original. Add AVX+shrd code path, >25%
+# improvement on Sandy Bridge over May version, 60% over original.
+#
+# May 2013.
+#
+# Replace AMD XOP code path with SSSE3 to cover more processors.
+# (Biggest improvement coefficient is on upcoming Atom Silvermont,
+# not shown.) Add AVX+BMI code path.
+#
+# March 2014.
+#
+# Add support for Intel SHA Extensions.
+#
+# Performance in clock cycles per processed byte (less is better):
+#
+#		gcc	icc	x86 asm(*)	SIMD	x86_64 asm(**)
+# Pentium	46	57	40/38		-	-
+# PIII		36	33	27/24		-	-
+# P4		41	38	28		-	17.3
+# AMD K8	27	25	19/15.5		-	14.9
+# Core2		26	23	18/15.6		14.3	13.8
+# Westmere	27	-	19/15.7		13.4	12.3
+# Sandy Bridge	25	-	15.9		12.4	11.6
+# Ivy Bridge	24	-	15.0		11.4	10.3
+# Haswell	22	-	13.9		9.46	7.80
+# Skylake	20	-	14.9		9.50	7.70
+# Bulldozer	36	-	27/22		17.0	13.6
+# VIA Nano	36	-	25/22		16.8	16.5
+# Atom		50	-	30/25		21.9	18.9
+# Silvermont	40	-	34/31		22.9	20.6
+# Goldmont	29	-	20		16.3(***)
+#
+# (*)	numbers after slash are for unrolled loop, where applicable;
+# (**)	x86_64 assembly performance is presented for reference
+#	purposes, results are best-available;
+# (***)	SHAEXT result is 4.1, strangely enough better than 64-bit one;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../../perlasm");
+require "x86asm.pl";
+
+$output=pop;
+open STDOUT,">$output";
+
+&asm_init($ARGV[0],"sha256-586.pl",$ARGV[$#ARGV] eq "386");
+
+$xmm=$avx=0;
+for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+# In upstream, this is controlled by shelling out to the compiler to check
+# versions, but BoringSSL is intended to be used with pre-generated perlasm
+# output, so this isn't useful anyway.
+#
+# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2.
+$avx = 1;
+
+$avx = 0 unless ($xmm);
+
+$shaext=$xmm;	### set to zero if compiling for 1.0.1
+
+# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
+# been tested.
+$shaext = 0;
+
+$unroll_after = 64*4;	# If pre-evicted from the L1P cache, the first
+			# spin of the fully unrolled loop was measured
+			# to run about 3-4x slower. If the slowdown
+			# coefficient is N and the unrolled loop is m
+			# times faster, you break even at (N-1)/(m-1)
+			# blocks. That then needs to be adjusted for the
+			# probability of the code being evicted, code
+			# size/cache size = 1/4. Typical m is 1.15...
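+			#
+			# (worked instance, added for illustration; N~3.5
+			# is taken from the "3-4x slower" figure and
+			# m~1.15 from above: (N-1)/(m-1) = 2.5/0.15 ~ 17
+			# blocks to amortize one cold spin, and weighting
+			# by the ~1/4 eviction probability gives ~4
+			# blocks, i.e. 4*64 = 256 bytes of input, which
+			# is the threshold chosen here.)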
+
+$A="eax";
+$E="edx";
+$T="ebx";
+$Aoff=&DWP(4,"esp");
+$Boff=&DWP(8,"esp");
+$Coff=&DWP(12,"esp");
+$Doff=&DWP(16,"esp");
+$Eoff=&DWP(20,"esp");
+$Foff=&DWP(24,"esp");
+$Goff=&DWP(28,"esp");
+$Hoff=&DWP(32,"esp");
+$Xoff=&DWP(36,"esp");
+$K256="ebp";
+
+sub BODY_16_63() {
+	&mov	($T,"ecx");			# "ecx" is preloaded
+	 &mov	("esi",&DWP(4*(9+15+16-14),"esp"));
+	&ror	("ecx",18-7);
+	 &mov	("edi","esi");
+	&ror	("esi",19-17);
+	 &xor	("ecx",$T);
+	 &shr	($T,3);
+	&ror	("ecx",7);
+	 &xor	("esi","edi");
+	 &xor	($T,"ecx");			# T = sigma0(X[-15])
+	&ror	("esi",17);
+	 &add	($T,&DWP(4*(9+15+16),"esp"));	# T += X[-16]
+	&shr	("edi",10);
+	 &add	($T,&DWP(4*(9+15+16-9),"esp"));	# T += X[-7]
+	#&xor	("edi","esi")			# sigma1(X[-2])
+	# &add	($T,"edi");			# T += sigma1(X[-2])
+	# &mov	(&DWP(4*(9+15),"esp"),$T);	# save X[0]
+
+	&BODY_00_15(1);
+}
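+
+# Illustrative sanity sketch (not from the upstream script): the sigma0
+# computation above yields ror(x,7)^ror(x,18)^(x>>3) with only two
+# rotates, using ror(a,k)^ror(b,k) = ror(a^b,k); the sigma1 half plays
+# the same trick with rotate counts 17 and 19.  Check below assumes
+# 64-bit perl integers and passes silently; _ror32 is a helper added
+# here, not part of the generator.
+sub _ror32 { my ($x,$n) = @_; (($x >> $n) | ($x << (32-$n))) & 0xffffffff }
+for my $x (0x12345678, 0xdeadbeef) {
+	my $direct = _ror32($x,7) ^ _ror32($x,18) ^ ($x >> 3);
+	my $fused  = _ror32(_ror32($x,18-7) ^ $x, 7) ^ ($x >> 3);
+	die "sigma0 trick mismatch" unless $direct == $fused;
+}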
+sub BODY_00_15() {
+    my $in_16_63=shift;
+
+	&mov	("ecx",$E);
+	 &xor	("edi","esi")			if ($in_16_63);	# sigma1(X[-2])
+	 &mov	("esi",$Foff);
+	&ror	("ecx",25-11);
+	 &add	($T,"edi")			if ($in_16_63);	# T += sigma1(X[-2])
+	 &mov	("edi",$Goff);
+	&xor	("ecx",$E);
+	 &xor	("esi","edi");
+	 &mov	($T,&DWP(4*(9+15),"esp"))	if (!$in_16_63);
+	 &mov	(&DWP(4*(9+15),"esp"),$T)	if ($in_16_63);	# save X[0]
+	&ror	("ecx",11-6);
+	 &and	("esi",$E);
+	 &mov	($Eoff,$E);		# modulo-scheduled
+	&xor	($E,"ecx");
+	 &add	($T,$Hoff);		# T += h
+	 &xor	("esi","edi");		# Ch(e,f,g)
+	&ror	($E,6);			# Sigma1(e)
+	 &mov	("ecx",$A);
+	 &add	($T,"esi");		# T += Ch(e,f,g)
+
+	&ror	("ecx",22-13);
+	 &add	($T,$E);		# T += Sigma1(e)
+	 &mov	("edi",$Boff);
+	&xor	("ecx",$A);
+	 &mov	($Aoff,$A);		# modulo-scheduled
+	 &lea	("esp",&DWP(-4,"esp"));
+	&ror	("ecx",13-2);
+	 &mov	("esi",&DWP(0,$K256));
+	&xor	("ecx",$A);
+	 &mov	($E,$Eoff);		# e in next iteration, d in this one
+	 &xor	($A,"edi");		# a ^= b
+	&ror	("ecx",2);		# Sigma0(a)
+
+	 &add	($T,"esi");		# T+= K[i]
+	 &mov	(&DWP(0,"esp"),$A);	# (b^c) in next round
+	&add	($E,$T);		# d += T
+	 &and	($A,&DWP(4,"esp"));	# a &= (b^c)
+	&add	($T,"ecx");		# T += Sigma0(a)
+	 &xor	($A,"edi");		# h = Maj(a,b,c) = Ch(a^b,c,b)
+	 &mov	("ecx",&DWP(4*(9+15+16-1),"esp"))	if ($in_16_63);	# preload T
+	&add	($K256,4);
+	 &add	($A,$T);		# h += T
+}
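+
+# Illustrative sanity sketch (not from the upstream script): the "magic"
+# (b^c) word parked on the stack works because of the identity
+# Maj(a,b,c) = Ch(a^b,c,b), so Maj costs only one and plus one xor per
+# round once a^b is carried over.  Bit-level check; passes silently.
+for my $a (0,1) { for my $b (0,1) { for my $c (0,1) {
+	my $maj = ($a & $b) ^ ($a & $c) ^ ($b & $c);
+	my $ch  = (($a ^ $b) & $c) ^ ((1 ^ ($a ^ $b)) & $b);	# Ch(a^b,c,b)
+	die "Maj-via-Ch mismatch" unless $maj == $ch;
+}}}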
+
+&external_label("OPENSSL_ia32cap_P")		if (!$i386);
+
+&function_begin("sha256_block_data_order");
+	&mov	("esi",wparam(0));	# ctx
+	&mov	("edi",wparam(1));	# inp
+	&mov	("eax",wparam(2));	# num
+	&mov	("ebx","esp");		# saved sp
+
+	&call	(&label("pic_point"));	# make it PIC!
+&set_label("pic_point");
+	&blindpop($K256);
+	&lea	($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256));
+
+	&sub	("esp",16);
+	&and	("esp",-64);
+
+	&shl	("eax",6);
+	&add	("eax","edi");
+	&mov	(&DWP(0,"esp"),"esi");	# ctx
+	&mov	(&DWP(4,"esp"),"edi");	# inp
+	&mov	(&DWP(8,"esp"),"eax");	# inp+num*128
+	&mov	(&DWP(12,"esp"),"ebx");	# saved sp
+						if (!$i386 && $xmm) {
+	&picmeup("edx","OPENSSL_ia32cap_P",$K256,&label("K256"));
+	&mov	("ecx",&DWP(0,"edx"));
+	&mov	("ebx",&DWP(4,"edx"));
+	&test	("ecx",1<<20);		# check for P4
+	&jnz	(&label("loop"));
+	&mov	("edx",&DWP(8,"edx"))	if ($xmm);
+	&test	("ecx",1<<24);		# check for FXSR
+	&jz	($unroll_after?&label("no_xmm"):&label("loop"));
+	&and	("ecx",1<<30);		# mask "Intel CPU" bit
+	&and	("ebx",1<<28|1<<9);	# mask AVX and SSSE3 bits
+	&test	("edx",1<<29)		if ($shaext);	# check for SHA
+	&jnz	(&label("shaext"))	if ($shaext);
+	&or	("ecx","ebx");
+	&and	("ecx",1<<28|1<<30);
+	&cmp	("ecx",1<<28|1<<30);
+					if ($xmm) {
+	&je	(&label("AVX"))		if ($avx);
+	&test	("ebx",1<<9);		# check for SSSE3
+	&jnz	(&label("SSSE3"));
+					} else {
+	&je	(&label("loop_shrd"));
+					}
+						if ($unroll_after) {
+&set_label("no_xmm");
+	&sub	("eax","edi");
+	&cmp	("eax",$unroll_after);
+	&jae	(&label("unrolled"));
+						} }
+	&jmp	(&label("loop"));
+
+sub COMPACT_LOOP() {
+my $suffix=shift;
+
+&set_label("loop$suffix",$suffix?32:16);
+    # copy input block to stack reversing byte and dword order
+    for($i=0;$i<4;$i++) {
+	&mov	("eax",&DWP($i*16+0,"edi"));
+	&mov	("ebx",&DWP($i*16+4,"edi"));
+	&mov	("ecx",&DWP($i*16+8,"edi"));
+	&bswap	("eax");
+	&mov	("edx",&DWP($i*16+12,"edi"));
+	&bswap	("ebx");
+	&push	("eax");
+	&bswap	("ecx");
+	&push	("ebx");
+	&bswap	("edx");
+	&push	("ecx");
+	&push	("edx");
+    }
+	&add	("edi",64);
+	&lea	("esp",&DWP(-4*9,"esp"));# place for A,B,C,D,E,F,G,H
+	&mov	(&DWP(4*(9+16)+4,"esp"),"edi");
+
+	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
+	&mov	($A,&DWP(0,"esi"));
+	&mov	("ebx",&DWP(4,"esi"));
+	&mov	("ecx",&DWP(8,"esi"));
+	&mov	("edi",&DWP(12,"esi"));
+	# &mov	($Aoff,$A);
+	&mov	($Boff,"ebx");
+	&xor	("ebx","ecx");
+	&mov	($Coff,"ecx");
+	&mov	($Doff,"edi");
+	&mov	(&DWP(0,"esp"),"ebx");	# magic
+	&mov	($E,&DWP(16,"esi"));
+	&mov	("ebx",&DWP(20,"esi"));
+	&mov	("ecx",&DWP(24,"esi"));
+	&mov	("edi",&DWP(28,"esi"));
+	# &mov	($Eoff,$E);
+	&mov	($Foff,"ebx");
+	&mov	($Goff,"ecx");
+	&mov	($Hoff,"edi");
+
+&set_label("00_15$suffix",16);
+
+	&BODY_00_15();
+
+	&cmp	("esi",0xc19bf174);
+	&jne	(&label("00_15$suffix"));
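+	# (added note: BODY_00_15 leaves the K[i] word it just consumed in
+	# "esi", so comparing against 0xc19bf174, the 16th round constant,
+	# ends this loop after exactly 16 rounds with no separate counter;
+	# the 16_63 loop below uses the 64th constant 0xc67178f2 the same
+	# way)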
+
+	&mov	("ecx",&DWP(4*(9+15+16-1),"esp"));	# preloaded in BODY_00_15(1)
+	&jmp	(&label("16_63$suffix"));
+
+&set_label("16_63$suffix",16);
+
+	&BODY_16_63();
+
+	&cmp	("esi",0xc67178f2);
+	&jne	(&label("16_63$suffix"));
+
+	&mov	("esi",&DWP(4*(9+16+64)+0,"esp"));#ctx
+	# &mov	($A,$Aoff);
+	&mov	("ebx",$Boff);
+	# &mov	("edi",$Coff);
+	&mov	("ecx",$Doff);
+	&add	($A,&DWP(0,"esi"));
+	&add	("ebx",&DWP(4,"esi"));
+	&add	("edi",&DWP(8,"esi"));
+	&add	("ecx",&DWP(12,"esi"));
+	&mov	(&DWP(0,"esi"),$A);
+	&mov	(&DWP(4,"esi"),"ebx");
+	&mov	(&DWP(8,"esi"),"edi");
+	&mov	(&DWP(12,"esi"),"ecx");
+	# &mov	($E,$Eoff);
+	&mov	("eax",$Foff);
+	&mov	("ebx",$Goff);
+	&mov	("ecx",$Hoff);
+	&mov	("edi",&DWP(4*(9+16+64)+4,"esp"));#inp
+	&add	($E,&DWP(16,"esi"));
+	&add	("eax",&DWP(20,"esi"));
+	&add	("ebx",&DWP(24,"esi"));
+	&add	("ecx",&DWP(28,"esi"));
+	&mov	(&DWP(16,"esi"),$E);
+	&mov	(&DWP(20,"esi"),"eax");
+	&mov	(&DWP(24,"esi"),"ebx");
+	&mov	(&DWP(28,"esi"),"ecx");
+
+	&lea	("esp",&DWP(4*(9+16+64),"esp"));# destroy frame
+	&sub	($K256,4*64);			# rewind K
+
+	&cmp	("edi",&DWP(8,"esp"));		# are we done yet?
+	&jb	(&label("loop$suffix"));
+}
+	&COMPACT_LOOP();
+	&mov	("esp",&DWP(12,"esp"));		# restore sp
+&function_end_A();
+						if (!$i386 && !$xmm) {
+	# ~20% improvement on Sandy Bridge
+	local *ror = sub { &shrd(@_[0],@_) };
+	&COMPACT_LOOP("_shrd");
+	&mov	("esp",&DWP(12,"esp"));		# restore sp
+&function_end_A();
+						}
+
+&set_label("K256",64);	# Yes! I keep it in the code segment!
+@K256=(	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
+	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
+	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
+	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
+	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
+	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
+	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
+	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
+	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
+	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
+	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
+	0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
+	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
+	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
+	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
+	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2	);
+&data_word(@K256);
+&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f);	# byte swap mask
+&asciz("SHA256 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
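+
+# Illustrative sketch (not from the upstream script): the K256 round
+# constants are the first 32 bits of the fractional parts of the cube
+# roots of the first 64 primes.  Spot-check of the first two entries;
+# double precision is comfortably accurate enough for these values.
+for ([2,0x428a2f98],[3,0x71374491]) {
+	my ($p,$k) = @$_;
+	my $frac = $p**(1/3); $frac -= int($frac);
+	die "K256 spot-check failed" unless int($frac * 2**32) == $k;
+}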
+
+($a,$b,$c,$d,$e,$f,$g,$h)=(0..7);	# offsets
+sub off { &DWP(4*(((shift)-$i)&7),"esp"); }
+
+if (!$i386 && $unroll_after) {
+my @AH=($A,$K256);
+
+&set_label("unrolled",16);
+	&lea	("esp",&DWP(-96,"esp"));
+	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
+	&mov	($AH[0],&DWP(0,"esi"));
+	&mov	($AH[1],&DWP(4,"esi"));
+	&mov	("ecx",&DWP(8,"esi"));
+	&mov	("ebx",&DWP(12,"esi"));
+	#&mov	(&DWP(0,"esp"),$AH[0]);
+	&mov	(&DWP(4,"esp"),$AH[1]);
+	&xor	($AH[1],"ecx");		# magic
+	&mov	(&DWP(8,"esp"),"ecx");
+	&mov	(&DWP(12,"esp"),"ebx");
+	&mov	($E,&DWP(16,"esi"));
+	&mov	("ebx",&DWP(20,"esi"));
+	&mov	("ecx",&DWP(24,"esi"));
+	&mov	("esi",&DWP(28,"esi"));
+	#&mov	(&DWP(16,"esp"),$E);
+	&mov	(&DWP(20,"esp"),"ebx");
+	&mov	(&DWP(24,"esp"),"ecx");
+	&mov	(&DWP(28,"esp"),"esi");
+	&jmp	(&label("grand_loop"));
+
+&set_label("grand_loop",16);
+    # copy input block to stack reversing byte order
+    for($i=0;$i<5;$i++) {
+	&mov	("ebx",&DWP(12*$i+0,"edi"));
+	&mov	("ecx",&DWP(12*$i+4,"edi"));
+	&bswap	("ebx");
+	&mov	("esi",&DWP(12*$i+8,"edi"));
+	&bswap	("ecx");
+	&mov	(&DWP(32+12*$i+0,"esp"),"ebx");
+	&bswap	("esi");
+	&mov	(&DWP(32+12*$i+4,"esp"),"ecx");
+	&mov	(&DWP(32+12*$i+8,"esp"),"esi");
+    }
+	&mov	("ebx",&DWP($i*12,"edi"));
+	&add	("edi",64);
+	&bswap	("ebx");
+	&mov	(&DWP(96+4,"esp"),"edi");
+	&mov	(&DWP(32+12*$i,"esp"),"ebx");
+
+    my ($t1,$t2) = ("ecx","esi");
+
+    for ($i=0;$i<64;$i++) {
+
+      if ($i>=16) {
+	&mov	($T,$t1);			# $t1 is preloaded
+	# &mov	($t2,&DWP(32+4*(($i+14)&15),"esp"));
+	&ror	($t1,18-7);
+	 &mov	("edi",$t2);
+	&ror	($t2,19-17);
+	 &xor	($t1,$T);
+	 &shr	($T,3);
+	&ror	($t1,7);
+	 &xor	($t2,"edi");
+	 &xor	($T,$t1);			# T = sigma0(X[-15])
+	&ror	($t2,17);
+	 &add	($T,&DWP(32+4*($i&15),"esp"));	# T += X[-16]
+	&shr	("edi",10);
+	 &add	($T,&DWP(32+4*(($i+9)&15),"esp"));	# T += X[-7]
+	#&xor	("edi",$t2)			# sigma1(X[-2])
+	# &add	($T,"edi");			# T += sigma1(X[-2])
+	# &mov	(&DWP(4*(9+15),"esp"),$T);	# save X[0]
+      }
+	&mov	($t1,$E);
+	 &xor	("edi",$t2)			if ($i>=16);	# sigma1(X[-2])
+	 &mov	($t2,&off($f));
+	&ror	($E,25-11);
+	 &add	($T,"edi")			if ($i>=16);	# T += sigma1(X[-2])
+	 &mov	("edi",&off($g));
+	&xor	($E,$t1);
+	 &mov	($T,&DWP(32+4*($i&15),"esp"))	if ($i<16);	# X[i]
+	 &mov	(&DWP(32+4*($i&15),"esp"),$T)	if ($i>=16 && $i<62);	# save X[0]
+	 &xor	($t2,"edi");
+	&ror	($E,11-6);
+	 &and	($t2,$t1);
+	 &mov	(&off($e),$t1);		# save $E, modulo-scheduled
+	&xor	($E,$t1);
+	 &add	($T,&off($h));		# T += h
+	 &xor	("edi",$t2);		# Ch(e,f,g)
+	&ror	($E,6);			# Sigma1(e)
+	 &mov	($t1,$AH[0]);
+	 &add	($T,"edi");		# T += Ch(e,f,g)
+
+	&ror	($t1,22-13);
+	 &mov	($t2,$AH[0]);
+	 &mov	("edi",&off($b));
+	&xor	($t1,$AH[0]);
+	 &mov	(&off($a),$AH[0]);	# save $A, modulo-scheduled
+	 &xor	($AH[0],"edi");		# a ^= b, (b^c) in next round
+	&ror	($t1,13-2);
+	 &and	($AH[1],$AH[0]);	# (b^c) &= (a^b)
+	 &lea	($E,&DWP(@K256[$i],$T,$E));	# T += Sigma1(e)+K[i]
+	&xor	($t1,$t2);
+	 &xor	($AH[1],"edi");		# h = Maj(a,b,c) = Ch(a^b,c,b)
+	 &mov	($t2,&DWP(32+4*(($i+2)&15),"esp"))	if ($i>=15 && $i<63);
+	&ror	($t1,2);		# Sigma0(a)
+
+	 &add	($AH[1],$E);		# h += T
+	 &add	($E,&off($d));		# d += T
+	&add	($AH[1],$t1);		# h += Sigma0(a)
+	 &mov	($t1,&DWP(32+4*(($i+15)&15),"esp"))	if ($i>=15 && $i<63);
+
+	@AH = reverse(@AH);		# rotate(a,h)
+	($t1,$t2) = ($t2,$t1);		# rotate(t1,t2)
+    }
+	&mov	("esi",&DWP(96,"esp"));	#ctx
+					#&mov	($AH[0],&DWP(0,"esp"));
+	&xor	($AH[1],"edi");		#&mov	($AH[1],&DWP(4,"esp"));
+					#&mov	("edi", &DWP(8,"esp"));
+	&mov	("ecx",&DWP(12,"esp"));
+	&add	($AH[0],&DWP(0,"esi"));
+	&add	($AH[1],&DWP(4,"esi"));
+	&add	("edi",&DWP(8,"esi"));
+	&add	("ecx",&DWP(12,"esi"));
+	&mov	(&DWP(0,"esi"),$AH[0]);
+	&mov	(&DWP(4,"esi"),$AH[1]);
+	&mov	(&DWP(8,"esi"),"edi");
+	&mov	(&DWP(12,"esi"),"ecx");
+	 #&mov	(&DWP(0,"esp"),$AH[0]);
+	 &mov	(&DWP(4,"esp"),$AH[1]);
+	 &xor	($AH[1],"edi");		# magic
+	 &mov	(&DWP(8,"esp"),"edi");
+	 &mov	(&DWP(12,"esp"),"ecx");
+	#&mov	($E,&DWP(16,"esp"));
+	&mov	("edi",&DWP(20,"esp"));
+	&mov	("ebx",&DWP(24,"esp"));
+	&mov	("ecx",&DWP(28,"esp"));
+	&add	($E,&DWP(16,"esi"));
+	&add	("edi",&DWP(20,"esi"));
+	&add	("ebx",&DWP(24,"esi"));
+	&add	("ecx",&DWP(28,"esi"));
+	&mov	(&DWP(16,"esi"),$E);
+	&mov	(&DWP(20,"esi"),"edi");
+	&mov	(&DWP(24,"esi"),"ebx");
+	&mov	(&DWP(28,"esi"),"ecx");
+	 #&mov	(&DWP(16,"esp"),$E);
+	 &mov	(&DWP(20,"esp"),"edi");
+	&mov	("edi",&DWP(96+4,"esp"));	# inp
+	 &mov	(&DWP(24,"esp"),"ebx");
+	 &mov	(&DWP(28,"esp"),"ecx");
+
+	&cmp	("edi",&DWP(96+8,"esp"));	# are we done yet?
+	&jb	(&label("grand_loop"));
+
+	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
+&function_end_A();
+}
+						if (!$i386 && $xmm) {{{
+if ($shaext) {
+######################################################################
+# Intel SHA Extensions implementation of SHA256 update function.
+#
+my ($ctx,$inp,$end)=("esi","edi","eax");
+my ($Wi,$ABEF,$CDGH,$TMP)=map("xmm$_",(0..2,7));
+my @MSG=map("xmm$_",(3..6));
+
+sub sha256op38 {
+ my ($opcodelet,$dst,$src)=@_;
+    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+    {	&data_byte(0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);	}
+}
+sub sha256rnds2	{ sha256op38(0xcb,@_); }
+sub sha256msg1	{ sha256op38(0xcc,@_); }
+sub sha256msg2	{ sha256op38(0xcd,@_); }
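+
+# Added usage note (not from the upstream script): these hand-assemble
+# the SHA-ISA encodings (0F 38 CB/CC/CD /r) for assemblers without SHA
+# support; e.g. sha256rnds2("xmm2","xmm1") lowers to
+# &data_byte(0x0f,0x38,0xcb,0xd1) with ModR/M = 0xc0|(dst<<3)|src =
+# 0xc0|(2<<3)|1.  The implicit third operand of sha256rnds2 is %xmm0,
+# which is $Wi above by construction.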
+
+&set_label("shaext",32);
+	&sub		("esp",32);
+
+	&movdqu		($ABEF,&QWP(0,$ctx));		# DCBA
+	&lea		($K256,&DWP(0x80,$K256));
+	&movdqu		($CDGH,&QWP(16,$ctx));		# HGFE
+	&movdqa		($TMP,&QWP(0x100-0x80,$K256));	# byte swap mask
+
+	&pshufd		($Wi,$ABEF,0x1b);		# ABCD
+	&pshufd		($ABEF,$ABEF,0xb1);		# CDAB
+	&pshufd		($CDGH,$CDGH,0x1b);		# EFGH
+	&palignr	($ABEF,$CDGH,8);		# ABEF
+	&punpcklqdq	($CDGH,$Wi);			# CDGH
+	&jmp		(&label("loop_shaext"));
+
+&set_label("loop_shaext",16);
+	&movdqu		(@MSG[0],&QWP(0,$inp));
+	&movdqu		(@MSG[1],&QWP(0x10,$inp));
+	&movdqu		(@MSG[2],&QWP(0x20,$inp));
+	&pshufb		(@MSG[0],$TMP);
+	&movdqu		(@MSG[3],&QWP(0x30,$inp));
+	&movdqa		(&QWP(16,"esp"),$CDGH);		# offload
+
+	&movdqa		($Wi,&QWP(0*16-0x80,$K256));
+	&paddd		($Wi,@MSG[0]);
+	&pshufb		(@MSG[1],$TMP);
+	&sha256rnds2	($CDGH,$ABEF);			# 0-3
+	&pshufd		($Wi,$Wi,0x0e);
+	&nop		();
+	&movdqa		(&QWP(0,"esp"),$ABEF);		# offload
+	&sha256rnds2	($ABEF,$CDGH);
+
+	&movdqa		($Wi,&QWP(1*16-0x80,$K256));
+	&paddd		($Wi,@MSG[1]);
+	&pshufb		(@MSG[2],$TMP);
+	&sha256rnds2	($CDGH,$ABEF);			# 4-7
+	&pshufd		($Wi,$Wi,0x0e);
+	&lea		($inp,&DWP(0x40,$inp));
+	&sha256msg1	(@MSG[0],@MSG[1]);
+	&sha256rnds2	($ABEF,$CDGH);
+
+	&movdqa		($Wi,&QWP(2*16-0x80,$K256));
+	&paddd		($Wi,@MSG[2]);
+	&pshufb		(@MSG[3],$TMP);
+	&sha256rnds2	($CDGH,$ABEF);			# 8-11
+	&pshufd		($Wi,$Wi,0x0e);
+	&movdqa		($TMP,@MSG[3]);
+	&palignr	($TMP,@MSG[2],4);
+	&nop		();
+	&paddd		(@MSG[0],$TMP);
+	&sha256msg1	(@MSG[1],@MSG[2]);
+	&sha256rnds2	($ABEF,$CDGH);
+
+	&movdqa		($Wi,&QWP(3*16-0x80,$K256));
+	&paddd		($Wi,@MSG[3]);
+	&sha256msg2	(@MSG[0],@MSG[3]);
+	&sha256rnds2	($CDGH,$ABEF);			# 12-15
+	&pshufd		($Wi,$Wi,0x0e);
+	&movdqa		($TMP,@MSG[0]);
+	&palignr	($TMP,@MSG[3],4);
+	&nop		();
+	&paddd		(@MSG[1],$TMP);
+	&sha256msg1	(@MSG[2],@MSG[3]);
+	&sha256rnds2	($ABEF,$CDGH);
+
+for($i=4;$i<16-3;$i++) {
+	&movdqa		($Wi,&QWP($i*16-0x80,$K256));
+	&paddd		($Wi,@MSG[0]);
+	&sha256msg2	(@MSG[1],@MSG[0]);
+	&sha256rnds2	($CDGH,$ABEF);			# 16-19...
+	&pshufd		($Wi,$Wi,0x0e);
+	&movdqa		($TMP,@MSG[1]);
+	&palignr	($TMP,@MSG[0],4);
+	&nop		();
+	&paddd		(@MSG[2],$TMP);
+	&sha256msg1	(@MSG[3],@MSG[0]);
+	&sha256rnds2	($ABEF,$CDGH);
+
+	push(@MSG,shift(@MSG));
+}
+	&movdqa		($Wi,&QWP(13*16-0x80,$K256));
+	&paddd		($Wi,@MSG[0]);
+	&sha256msg2	(@MSG[1],@MSG[0]);
+	&sha256rnds2	($CDGH,$ABEF);			# 52-55
+	&pshufd		($Wi,$Wi,0x0e);
+	&movdqa		($TMP,@MSG[1]);
+	&palignr	($TMP,@MSG[0],4);
+	&sha256rnds2	($ABEF,$CDGH);
+	&paddd		(@MSG[2],$TMP);
+
+	&movdqa		($Wi,&QWP(14*16-0x80,$K256));
+	&paddd		($Wi,@MSG[1]);
+	&sha256rnds2	($CDGH,$ABEF);			# 56-59
+	&pshufd		($Wi,$Wi,0x0e);
+	&sha256msg2	(@MSG[2],@MSG[1]);
+	&movdqa		($TMP,&QWP(0x100-0x80,$K256));	# byte swap mask
+	&sha256rnds2	($ABEF,$CDGH);
+
+	&movdqa		($Wi,&QWP(15*16-0x80,$K256));
+	&paddd		($Wi,@MSG[2]);
+	&nop		();
+	&sha256rnds2	($CDGH,$ABEF);			# 60-63
+	&pshufd		($Wi,$Wi,0x0e);
+	&cmp		($end,$inp);
+	&nop		();
+	&sha256rnds2	($ABEF,$CDGH);
+
+	&paddd		($CDGH,&QWP(16,"esp"));
+	&paddd		($ABEF,&QWP(0,"esp"));
+	&jnz		(&label("loop_shaext"));
+
+	&pshufd		($CDGH,$CDGH,0xb1);		# DCHG
+	&pshufd		($TMP,$ABEF,0x1b);		# FEBA
+	&pshufd		($ABEF,$ABEF,0xb1);		# BAFE
+	&punpckhqdq	($ABEF,$CDGH);			# DCBA
+	&palignr	($CDGH,$TMP,8);			# HGFE
+
+	&mov		("esp",&DWP(32+12,"esp"));
+	&movdqu		(&QWP(0,$ctx),$ABEF);
+	&movdqu		(&QWP(16,$ctx),$CDGH);
+&function_end_A();
+}
+
+my @X = map("xmm$_",(0..3));
+my ($t0,$t1,$t2,$t3) = map("xmm$_",(4..7));
+my @AH = ($A,$T);
+
+&set_label("SSSE3",32);
+	&lea	("esp",&DWP(-96,"esp"));
+	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
+	&mov	($AH[0],&DWP(0,"esi"));
+	&mov	($AH[1],&DWP(4,"esi"));
+	&mov	("ecx",&DWP(8,"esi"));
+	&mov	("edi",&DWP(12,"esi"));
+	#&mov	(&DWP(0,"esp"),$AH[0]);
+	&mov	(&DWP(4,"esp"),$AH[1]);
+	&xor	($AH[1],"ecx");			# magic
+	&mov	(&DWP(8,"esp"),"ecx");
+	&mov	(&DWP(12,"esp"),"edi");
+	&mov	($E,&DWP(16,"esi"));
+	&mov	("edi",&DWP(20,"esi"));
+	&mov	("ecx",&DWP(24,"esi"));
+	&mov	("esi",&DWP(28,"esi"));
+	#&mov	(&DWP(16,"esp"),$E);
+	&mov	(&DWP(20,"esp"),"edi");
+	&mov	("edi",&DWP(96+4,"esp"));	# inp
+	&mov	(&DWP(24,"esp"),"ecx");
+	&mov	(&DWP(28,"esp"),"esi");
+	&movdqa	($t3,&QWP(256,$K256));
+	&jmp	(&label("grand_ssse3"));
+
+&set_label("grand_ssse3",16);
+	# load input, reverse byte order, add K256[0..15], save to stack
+	&movdqu	(@X[0],&QWP(0,"edi"));
+	&movdqu	(@X[1],&QWP(16,"edi"));
+	&movdqu	(@X[2],&QWP(32,"edi"));
+	&movdqu	(@X[3],&QWP(48,"edi"));
+	&add	("edi",64);
+	&pshufb	(@X[0],$t3);
+	&mov	(&DWP(96+4,"esp"),"edi");
+	&pshufb	(@X[1],$t3);
+	&movdqa	($t0,&QWP(0,$K256));
+	&pshufb	(@X[2],$t3);
+	&movdqa	($t1,&QWP(16,$K256));
+	&paddd	($t0,@X[0]);
+	&pshufb	(@X[3],$t3);
+	&movdqa	($t2,&QWP(32,$K256));
+	&paddd	($t1,@X[1]);
+	&movdqa	($t3,&QWP(48,$K256));
+	&movdqa	(&QWP(32+0,"esp"),$t0);
+	&paddd	($t2,@X[2]);
+	&movdqa	(&QWP(32+16,"esp"),$t1);
+	&paddd	($t3,@X[3]);
+	&movdqa	(&QWP(32+32,"esp"),$t2);
+	&movdqa	(&QWP(32+48,"esp"),$t3);
+	&jmp	(&label("ssse3_00_47"));
+
+&set_label("ssse3_00_47",16);
+	&add		($K256,64);
+
+sub SSSE3_00_47 () {
+my $j = shift;
+my $body = shift;
+my @X = @_;
+my @insns = (&$body,&$body,&$body,&$body);	# 120 instructions
+
+	  eval(shift(@insns));
+	&movdqa		($t0,@X[1]);
+	  eval(shift(@insns));			# @
+	  eval(shift(@insns));
+	&movdqa		($t3,@X[3]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&palignr	($t0,@X[0],4);		# X[1..4]
+	  eval(shift(@insns));
+	  eval(shift(@insns));			# @
+	  eval(shift(@insns));
+	 &palignr	($t3,@X[2],4);		# X[9..12]
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&movdqa		($t1,$t0);
+	  eval(shift(@insns));			# @
+	  eval(shift(@insns));
+	&movdqa		($t2,$t0);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&psrld		($t0,3);
+	  eval(shift(@insns));
+	  eval(shift(@insns));			# @
+	 &paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&psrld		($t2,7);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));			# @
+	  eval(shift(@insns));
+	 &pshufd	($t3,@X[3],0b11111010);	# X[14..15]
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&pslld		($t1,32-18);
+	  eval(shift(@insns));
+	  eval(shift(@insns));			# @
+	&pxor		($t0,$t2);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&psrld		($t2,18-7);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));			# @
+	&pxor		($t0,$t1);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&pslld		($t1,18-7);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));			# @
+	&pxor		($t0,$t2);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &movdqa	($t2,$t3);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));			# @
+	&pxor		($t0,$t1);		# sigma0(X[1..4])
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &psrld		($t3,10);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));			# @
+	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &psrlq		($t2,17);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));			# @
+	 &pxor		($t3,$t2);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &psrlq		($t2,19-17);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));			# @
+	 &pxor		($t3,$t2);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &pshufd	($t3,$t3,0b10000000);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));			# @
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));			# @
+	  eval(shift(@insns));
+	 &psrldq	($t3,8);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
+	  eval(shift(@insns));			# @
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));			# @
+	  eval(shift(@insns));
+	 &pshufd	($t3,@X[0],0b01010000);	# X[16..17]
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &movdqa	($t2,$t3);
+	  eval(shift(@insns));			# @
+	 &psrld		($t3,10);
+	  eval(shift(@insns));
+	 &psrlq		($t2,17);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));			# @
+	 &pxor		($t3,$t2);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &psrlq		($t2,19-17);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));			# @
+	 &pxor		($t3,$t2);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &pshufd	($t3,$t3,0b00001000);
+	  eval(shift(@insns));
+	  eval(shift(@insns));			# @
+	&movdqa		($t2,&QWP(16*$j,$K256));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &pslldq	($t3,8);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));			# @
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));			# @
+	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&paddd		($t2,@X[0]);
+	  eval(shift(@insns));			# @
+
+	foreach (@insns) { eval; }		# remaining instructions
+
+	&movdqa		(&QWP(32+16*$j,"esp"),$t2);
+}
+
+sub body_00_15 () {
+	(
+	'&mov	("ecx",$E);',
+	'&ror	($E,25-11);',
+	 '&mov	("esi",&off($f));',
+	'&xor	($E,"ecx");',
+	 '&mov	("edi",&off($g));',
+	 '&xor	("esi","edi");',
+	'&ror	($E,11-6);',
+	 '&and	("esi","ecx");',
+	 '&mov	(&off($e),"ecx");',	# save $E, modulo-scheduled
+	'&xor	($E,"ecx");',
+	 '&xor	("edi","esi");',	# Ch(e,f,g)
+	'&ror	($E,6);',		# T = Sigma1(e)
+	 '&mov	("ecx",$AH[0]);',
+	 '&add	($E,"edi");',		# T += Ch(e,f,g)
+	 '&mov	("edi",&off($b));',
+	'&mov	("esi",$AH[0]);',
+
+	'&ror	("ecx",22-13);',
+	 '&mov	(&off($a),$AH[0]);',	# save $A, modulo-scheduled
+	'&xor	("ecx",$AH[0]);',
+	 '&xor	($AH[0],"edi");',	# a ^= b, (b^c) in next round
+	 '&add	($E,&off($h));',	# T += h
+	'&ror	("ecx",13-2);',
+	 '&and	($AH[1],$AH[0]);',	# (b^c) &= (a^b)
+	'&xor	("ecx","esi");',
+	 '&add	($E,&DWP(32+4*($i&15),"esp"));',	# T += K[i]+X[i]
+	 '&xor	($AH[1],"edi");',	# h = Maj(a,b,c) = Ch(a^b,c,b)
+	'&ror	("ecx",2);',		# Sigma0(a)
+
+	 '&add	($AH[1],$E);',		# h += T
+	 '&add	($E,&off($d));',	# d += T
+	'&add	($AH[1],"ecx");'.	# h += Sigma0(a)
+
+	'@AH = reverse(@AH); $i++;'	# rotate(a,h)
+	);
+}
+
+    for ($i=0,$j=0; $j<4; $j++) {
+	&SSSE3_00_47($j,\&body_00_15,@X);
+	push(@X,shift(@X));		# rotate(@X)
+    }
+	&cmp	(&DWP(16*$j,$K256),0x00010203);
+	&jne	(&label("ssse3_00_47"));
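+	# (added note: the dword tested here lies just past the K256
+	# entries consumed by this pass; once it reads the 0x00010203
+	# byte-swap mask stored right after the table, all 64 round
+	# constants have been scheduled, so the compare doubles as the
+	# loop-exit test)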
+
+    for ($i=0; $i<16; ) {
+	foreach(body_00_15()) { eval; }
+    }
+
+	&mov	("esi",&DWP(96,"esp"));	#ctx
+					#&mov	($AH[0],&DWP(0,"esp"));
+	&xor	($AH[1],"edi");		#&mov	($AH[1],&DWP(4,"esp"));
+					#&mov	("edi", &DWP(8,"esp"));
+	&mov	("ecx",&DWP(12,"esp"));
+	&add	($AH[0],&DWP(0,"esi"));
+	&add	($AH[1],&DWP(4,"esi"));
+	&add	("edi",&DWP(8,"esi"));
+	&add	("ecx",&DWP(12,"esi"));
+	&mov	(&DWP(0,"esi"),$AH[0]);
+	&mov	(&DWP(4,"esi"),$AH[1]);
+	&mov	(&DWP(8,"esi"),"edi");
+	&mov	(&DWP(12,"esi"),"ecx");
+	 #&mov	(&DWP(0,"esp"),$AH[0]);
+	 &mov	(&DWP(4,"esp"),$AH[1]);
+	 &xor	($AH[1],"edi");			# magic
+	 &mov	(&DWP(8,"esp"),"edi");
+	 &mov	(&DWP(12,"esp"),"ecx");
+	#&mov	($E,&DWP(16,"esp"));
+	&mov	("edi",&DWP(20,"esp"));
+	&mov	("ecx",&DWP(24,"esp"));
+	&add	($E,&DWP(16,"esi"));
+	&add	("edi",&DWP(20,"esi"));
+	&add	("ecx",&DWP(24,"esi"));
+	&mov	(&DWP(16,"esi"),$E);
+	&mov	(&DWP(20,"esi"),"edi");
+	 &mov	(&DWP(20,"esp"),"edi");
+	&mov	("edi",&DWP(28,"esp"));
+	&mov	(&DWP(24,"esi"),"ecx");
+	 #&mov	(&DWP(16,"esp"),$E);
+	&add	("edi",&DWP(28,"esi"));
+	 &mov	(&DWP(24,"esp"),"ecx");
+	&mov	(&DWP(28,"esi"),"edi");
+	 &mov	(&DWP(28,"esp"),"edi");
+	&mov	("edi",&DWP(96+4,"esp"));	# inp
+
+	&movdqa	($t3,&QWP(64,$K256));
+	&sub	($K256,3*64);			# rewind K
+	&cmp	("edi",&DWP(96+8,"esp"));	# are we done yet?
+	&jb	(&label("grand_ssse3"));
+
+	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
+&function_end_A();
+						if ($avx) {
+&set_label("AVX",32);
+						if ($avx>1) {
+	&and	("edx",1<<8|1<<3);		# check for BMI2+BMI1
+	&cmp	("edx",1<<8|1<<3);
+	&je	(&label("AVX_BMI"));
+						}
+	&lea	("esp",&DWP(-96,"esp"));
+	&vzeroall	();
+	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
+	&mov	($AH[0],&DWP(0,"esi"));
+	&mov	($AH[1],&DWP(4,"esi"));
+	&mov	("ecx",&DWP(8,"esi"));
+	&mov	("edi",&DWP(12,"esi"));
+	#&mov	(&DWP(0,"esp"),$AH[0]);
+	&mov	(&DWP(4,"esp"),$AH[1]);
+	&xor	($AH[1],"ecx");			# magic
+	&mov	(&DWP(8,"esp"),"ecx");
+	&mov	(&DWP(12,"esp"),"edi");
+	&mov	($E,&DWP(16,"esi"));
+	&mov	("edi",&DWP(20,"esi"));
+	&mov	("ecx",&DWP(24,"esi"));
+	&mov	("esi",&DWP(28,"esi"));
+	#&mov	(&DWP(16,"esp"),$E);
+	&mov	(&DWP(20,"esp"),"edi");
+	&mov	("edi",&DWP(96+4,"esp"));	# inp
+	&mov	(&DWP(24,"esp"),"ecx");
+	&mov	(&DWP(28,"esp"),"esi");
+	&vmovdqa	($t3,&QWP(256,$K256));
+	&jmp	(&label("grand_avx"));
+
+&set_label("grand_avx",32);
+	# load input, reverse byte order, add K256[0..15], save to stack
+	&vmovdqu	(@X[0],&QWP(0,"edi"));
+	&vmovdqu	(@X[1],&QWP(16,"edi"));
+	&vmovdqu	(@X[2],&QWP(32,"edi"));
+	&vmovdqu	(@X[3],&QWP(48,"edi"));
+	&add		("edi",64);
+	&vpshufb	(@X[0],@X[0],$t3);
+	&mov		(&DWP(96+4,"esp"),"edi");
+	&vpshufb	(@X[1],@X[1],$t3);
+	&vpshufb	(@X[2],@X[2],$t3);
+	&vpaddd		($t0,@X[0],&QWP(0,$K256));
+	&vpshufb	(@X[3],@X[3],$t3);
+	&vpaddd		($t1,@X[1],&QWP(16,$K256));
+	&vpaddd		($t2,@X[2],&QWP(32,$K256));
+	&vpaddd		($t3,@X[3],&QWP(48,$K256));
+	&vmovdqa	(&QWP(32+0,"esp"),$t0);
+	&vmovdqa	(&QWP(32+16,"esp"),$t1);
+	&vmovdqa	(&QWP(32+32,"esp"),$t2);
+	&vmovdqa	(&QWP(32+48,"esp"),$t3);
+	&jmp		(&label("avx_00_47"));
+
+&set_label("avx_00_47",16);
+	&add		($K256,64);
+
+sub Xupdate_AVX () {
+	(
+	'&vpalignr	($t0,@X[1],@X[0],4);',	# X[1..4]
+	 '&vpalignr	($t3,@X[3],@X[2],4);',	# X[9..12]
+	'&vpsrld	($t2,$t0,7);',
+	 '&vpaddd	(@X[0],@X[0],$t3);',	# X[0..3] += X[9..12]
+	'&vpsrld	($t3,$t0,3);',
+	'&vpslld	($t1,$t0,14);',
+	'&vpxor		($t0,$t3,$t2);',
+	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
+	'&vpsrld	($t2,$t2,18-7);',
+	'&vpxor		($t0,$t0,$t1);',
+	'&vpslld	($t1,$t1,25-14);',
+	'&vpxor		($t0,$t0,$t2);',
+	 '&vpsrld	($t2,$t3,10);',
+	'&vpxor		($t0,$t0,$t1);',	# sigma0(X[1..4])
+	 '&vpsrlq	($t1,$t3,17);',
+	'&vpaddd	(@X[0],@X[0],$t0);',	# X[0..3] += sigma0(X[1..4])
+	 '&vpxor	($t2,$t2,$t1);',
+	 '&vpsrlq	($t3,$t3,19);',
+	 '&vpxor	($t2,$t2,$t3);',	# sigma1(X[14..15])
+	 '&vpshufd	($t3,$t2,0b10000100);',
+	'&vpsrldq	($t3,$t3,8);',
+	'&vpaddd	(@X[0],@X[0],$t3);',	# X[0..1] += sigma1(X[14..15])
+	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
+	 '&vpsrld	($t2,$t3,10);',
+	 '&vpsrlq	($t1,$t3,17);',
+	 '&vpxor	($t2,$t2,$t1);',
+	 '&vpsrlq	($t3,$t3,19);',
+	 '&vpxor	($t2,$t2,$t3);',	# sigma1(X[16..17])
+	 '&vpshufd	($t3,$t2,0b11101000);',
+	'&vpslldq	($t3,$t3,8);',
+	'&vpaddd	(@X[0],@X[0],$t3);'	# X[2..3] += sigma1(X[16..17])
+	);
+}
+
+local *ror = sub { &shrd(@_[0],@_) };
+sub AVX_00_47 () {
+my $j = shift;
+my $body = shift;
+my @X = @_;
+my @insns = (&$body,&$body,&$body,&$body);	# 120 instructions
+my $insn;
+
+	foreach (Xupdate_AVX()) {		# 31 instructions
+	    eval;
+	    eval(shift(@insns));
+	    eval(shift(@insns));
+	    eval($insn = shift(@insns));
+	    eval(shift(@insns)) if ($insn =~ /rorx/ && @insns[0] =~ /rorx/);
+	}
+	&vpaddd		($t2,@X[0],&QWP(16*$j,$K256));
+	foreach (@insns) { eval; }		# remaining instructions
+	&vmovdqa	(&QWP(32+16*$j,"esp"),$t2);
+}
+
+    for ($i=0,$j=0; $j<4; $j++) {
+	&AVX_00_47($j,\&body_00_15,@X);
+	push(@X,shift(@X));		# rotate(@X)
+    }
+	&cmp	(&DWP(16*$j,$K256),0x00010203);
+	&jne	(&label("avx_00_47"));
+
+    for ($i=0; $i<16; ) {
+	foreach(body_00_15()) { eval; }
+    }
+
+	&mov	("esi",&DWP(96,"esp"));	#ctx
+					#&mov	($AH[0],&DWP(0,"esp"));
+	&xor	($AH[1],"edi");		#&mov	($AH[1],&DWP(4,"esp"));
+					#&mov	("edi", &DWP(8,"esp"));
+	&mov	("ecx",&DWP(12,"esp"));
+	&add	($AH[0],&DWP(0,"esi"));
+	&add	($AH[1],&DWP(4,"esi"));
+	&add	("edi",&DWP(8,"esi"));
+	&add	("ecx",&DWP(12,"esi"));
+	&mov	(&DWP(0,"esi"),$AH[0]);
+	&mov	(&DWP(4,"esi"),$AH[1]);
+	&mov	(&DWP(8,"esi"),"edi");
+	&mov	(&DWP(12,"esi"),"ecx");
+	 #&mov	(&DWP(0,"esp"),$AH[0]);
+	 &mov	(&DWP(4,"esp"),$AH[1]);
+	 &xor	($AH[1],"edi");			# magic
+	 &mov	(&DWP(8,"esp"),"edi");
+	 &mov	(&DWP(12,"esp"),"ecx");
+	#&mov	($E,&DWP(16,"esp"));
+	&mov	("edi",&DWP(20,"esp"));
+	&mov	("ecx",&DWP(24,"esp"));
+	&add	($E,&DWP(16,"esi"));
+	&add	("edi",&DWP(20,"esi"));
+	&add	("ecx",&DWP(24,"esi"));
+	&mov	(&DWP(16,"esi"),$E);
+	&mov	(&DWP(20,"esi"),"edi");
+	 &mov	(&DWP(20,"esp"),"edi");
+	&mov	("edi",&DWP(28,"esp"));
+	&mov	(&DWP(24,"esi"),"ecx");
+	 #&mov	(&DWP(16,"esp"),$E);
+	&add	("edi",&DWP(28,"esi"));
+	 &mov	(&DWP(24,"esp"),"ecx");
+	&mov	(&DWP(28,"esi"),"edi");
+	 &mov	(&DWP(28,"esp"),"edi");
+	&mov	("edi",&DWP(96+4,"esp"));	# inp
+
+	&vmovdqa	($t3,&QWP(64,$K256));
+	&sub	($K256,3*64);			# rewind K
+	&cmp	("edi",&DWP(96+8,"esp"));	# are we done yet?
+	&jb	(&label("grand_avx"));
+
+	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
+	&vzeroall	();
+&function_end_A();
+						if ($avx>1) {
+sub bodyx_00_15 () {			# +10%
+	(
+	'&rorx	("ecx",$E,6)',
+	'&rorx	("esi",$E,11)',
+	 '&mov	(&off($e),$E)',		# save $E, modulo-scheduled
+	'&rorx	("edi",$E,25)',
+	'&xor	("ecx","esi")',
+	 '&andn	("esi",$E,&off($g))',
+	'&xor	("ecx","edi")',		# Sigma1(e)
+	 '&and	($E,&off($f))',
+	 '&mov	(&off($a),$AH[0]);',	# save $A, modulo-scheduled
+	 '&or	($E,"esi")',		# T = Ch(e,f,g)
+
+	'&rorx	("edi",$AH[0],2)',
+	'&rorx	("esi",$AH[0],13)',
+	 '&lea	($E,&DWP(0,$E,"ecx"))',	# T += Sigma1(e)
+	'&rorx	("ecx",$AH[0],22)',
+	'&xor	("esi","edi")',
+	 '&mov	("edi",&off($b))',
+	'&xor	("ecx","esi")',		# Sigma0(a)
+
+	 '&xor	($AH[0],"edi")',	# a ^= b, (b^c) in next round
+	 '&add	($E,&off($h))',		# T += h
+	 '&and	($AH[1],$AH[0])',	# (b^c) &= (a^b)
+	 '&add	($E,&DWP(32+4*($i&15),"esp"))',	# T += K[i]+X[i]
+	 '&xor	($AH[1],"edi")',	# h = Maj(a,b,c) = Ch(a^b,c,b)
+
+	 '&add	("ecx",$E)',		# h += T
+	 '&add	($E,&off($d))',		# d += T
+	'&lea	($AH[1],&DWP(0,$AH[1],"ecx"));'.	# h += Sigma0(a)
+
+	'@AH = reverse(@AH); $i++;'	# rotate(a,h)
+	);
+}
+
+&set_label("AVX_BMI",32);
+	&lea	("esp",&DWP(-96,"esp"));
+	&vzeroall	();
+	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
+	&mov	($AH[0],&DWP(0,"esi"));
+	&mov	($AH[1],&DWP(4,"esi"));
+	&mov	("ecx",&DWP(8,"esi"));
+	&mov	("edi",&DWP(12,"esi"));
+	#&mov	(&DWP(0,"esp"),$AH[0]);
+	&mov	(&DWP(4,"esp"),$AH[1]);
+	&xor	($AH[1],"ecx");			# magic
+	&mov	(&DWP(8,"esp"),"ecx");
+	&mov	(&DWP(12,"esp"),"edi");
+	&mov	($E,&DWP(16,"esi"));
+	&mov	("edi",&DWP(20,"esi"));
+	&mov	("ecx",&DWP(24,"esi"));
+	&mov	("esi",&DWP(28,"esi"));
+	#&mov	(&DWP(16,"esp"),$E);
+	&mov	(&DWP(20,"esp"),"edi");
+	&mov	("edi",&DWP(96+4,"esp"));	# inp
+	&mov	(&DWP(24,"esp"),"ecx");
+	&mov	(&DWP(28,"esp"),"esi");
+	&vmovdqa	($t3,&QWP(256,$K256));
+	&jmp	(&label("grand_avx_bmi"));
+
+&set_label("grand_avx_bmi",32);
+	# load input, reverse byte order, add K256[0..15], save to stack
+	&vmovdqu	(@X[0],&QWP(0,"edi"));
+	&vmovdqu	(@X[1],&QWP(16,"edi"));
+	&vmovdqu	(@X[2],&QWP(32,"edi"));
+	&vmovdqu	(@X[3],&QWP(48,"edi"));
+	&add		("edi",64);
+	&vpshufb	(@X[0],@X[0],$t3);
+	&mov		(&DWP(96+4,"esp"),"edi");
+	&vpshufb	(@X[1],@X[1],$t3);
+	&vpshufb	(@X[2],@X[2],$t3);
+	&vpaddd		($t0,@X[0],&QWP(0,$K256));
+	&vpshufb	(@X[3],@X[3],$t3);
+	&vpaddd		($t1,@X[1],&QWP(16,$K256));
+	&vpaddd		($t2,@X[2],&QWP(32,$K256));
+	&vpaddd		($t3,@X[3],&QWP(48,$K256));
+	&vmovdqa	(&QWP(32+0,"esp"),$t0);
+	&vmovdqa	(&QWP(32+16,"esp"),$t1);
+	&vmovdqa	(&QWP(32+32,"esp"),$t2);
+	&vmovdqa	(&QWP(32+48,"esp"),$t3);
+	&jmp		(&label("avx_bmi_00_47"));
+
+&set_label("avx_bmi_00_47",16);
+	&add		($K256,64);
+
+    for ($i=0,$j=0; $j<4; $j++) {
+	&AVX_00_47($j,\&bodyx_00_15,@X);
+	push(@X,shift(@X));		# rotate(@X)
+    }
+	&cmp	(&DWP(16*$j,$K256),0x00010203);
+	&jne	(&label("avx_bmi_00_47"));
+
+    for ($i=0; $i<16; ) {
+	foreach(bodyx_00_15()) { eval; }
+    }
+
+	&mov	("esi",&DWP(96,"esp"));	#ctx
+					#&mov	($AH[0],&DWP(0,"esp"));
+	&xor	($AH[1],"edi");		#&mov	($AH[1],&DWP(4,"esp"));
+					#&mov	("edi", &DWP(8,"esp"));
+	&mov	("ecx",&DWP(12,"esp"));
+	&add	($AH[0],&DWP(0,"esi"));
+	&add	($AH[1],&DWP(4,"esi"));
+	&add	("edi",&DWP(8,"esi"));
+	&add	("ecx",&DWP(12,"esi"));
+	&mov	(&DWP(0,"esi"),$AH[0]);
+	&mov	(&DWP(4,"esi"),$AH[1]);
+	&mov	(&DWP(8,"esi"),"edi");
+	&mov	(&DWP(12,"esi"),"ecx");
+	 #&mov	(&DWP(0,"esp"),$AH[0]);
+	 &mov	(&DWP(4,"esp"),$AH[1]);
+	 &xor	($AH[1],"edi");			# magic
+	 &mov	(&DWP(8,"esp"),"edi");
+	 &mov	(&DWP(12,"esp"),"ecx");
+	#&mov	($E,&DWP(16,"esp"));
+	&mov	("edi",&DWP(20,"esp"));
+	&mov	("ecx",&DWP(24,"esp"));
+	&add	($E,&DWP(16,"esi"));
+	&add	("edi",&DWP(20,"esi"));
+	&add	("ecx",&DWP(24,"esi"));
+	&mov	(&DWP(16,"esi"),$E);
+	&mov	(&DWP(20,"esi"),"edi");
+	 &mov	(&DWP(20,"esp"),"edi");
+	&mov	("edi",&DWP(28,"esp"));
+	&mov	(&DWP(24,"esi"),"ecx");
+	 #&mov	(&DWP(16,"esp"),$E);
+	&add	("edi",&DWP(28,"esi"));
+	 &mov	(&DWP(24,"esp"),"ecx");
+	&mov	(&DWP(28,"esi"),"edi");
+	 &mov	(&DWP(28,"esp"),"edi");
+	&mov	("edi",&DWP(96+4,"esp"));	# inp
+
+	&vmovdqa	($t3,&QWP(64,$K256));
+	&sub	($K256,3*64);			# rewind K
+	&cmp	("edi",&DWP(96+8,"esp"));	# are we done yet?
+	&jb	(&label("grand_avx_bmi"));
+
+	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
+	&vzeroall	();
+&function_end_A();
+						}
+						}
+						}}}
+&function_end_B("sha256_block_data_order");
+
+&asm_finish();
+
+close STDOUT;
diff --git a/crypto/fipsmodule/sha/asm/sha256-armv4.pl b/crypto/fipsmodule/sha/asm/sha256-armv4.pl
new file mode 100644
index 0000000..3eea080
--- /dev/null
+++ b/crypto/fipsmodule/sha/asm/sha256-armv4.pl
@@ -0,0 +1,733 @@
+#! /usr/bin/env perl
+# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Permission to use under GPL terms is granted.
+# ====================================================================
+
+# SHA256 block procedure for ARMv4. May 2007.
+
+# Performance is ~2x better than gcc 3.4 generated code and in
+# "absolute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+# byte [on single-issue Xscale PXA250 core].
+
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 22% improvement on
+# Cortex A8 core and ~20 cycles per processed byte.
+
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+# September 2013.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process one
+# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+# code (meaning that the latter performs sub-optimally; nothing was
+# done about it).
+
+# May 2014.
+#
+# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
+
+$ctx="r0";	$t0="r0";
+$inp="r1";	$t4="r1";
+$len="r2";	$t1="r2";
+$T1="r3";	$t3="r3";
+$A="r4";
+$B="r5";
+$C="r6";
+$D="r7";
+$E="r8";
+$F="r9";
+$G="r10";
+$H="r11";
+@V=($A,$B,$C,$D,$E,$F,$G,$H);
+$t2="r12";
+$Ktbl="r14";
+
+@Sigma0=( 2,13,22);
+@Sigma1=( 6,11,25);
+@sigma0=( 7,18, 3);
+@sigma1=(17,19,10);
+
+sub BODY_00_15 {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___ if ($i<16);
+#if __ARM_ARCH__>=7
+	@ ldr	$t1,[$inp],#4			@ $i
+# if $i==15
+	str	$inp,[sp,#17*4]			@ make room for $t4
+# endif
+	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
+	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	$t1,$t1
+# endif
+#else
+	@ ldrb	$t1,[$inp,#3]			@ $i
+	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
+	ldrb	$t2,[$inp,#2]
+	ldrb	$t0,[$inp,#1]
+	orr	$t1,$t1,$t2,lsl#8
+	ldrb	$t2,[$inp],#4
+	orr	$t1,$t1,$t0,lsl#16
+# if $i==15
+	str	$inp,[sp,#17*4]			@ make room for $t4
+# endif
+	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+	orr	$t1,$t1,$t2,lsl#24
+	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
+#endif
+___
+$code.=<<___;
+	ldr	$t2,[$Ktbl],#4			@ *K256++
+	add	$h,$h,$t1			@ h+=X[i]
+	str	$t1,[sp,#`$i%16`*4]
+	eor	$t1,$f,$g
+	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
+	and	$t1,$t1,$e
+	add	$h,$h,$t2			@ h+=K256[i]
+	eor	$t1,$t1,$g			@ Ch(e,f,g)
+	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
+	add	$h,$h,$t1			@ h+=Ch(e,f,g)
+#if $i==31
+	and	$t2,$t2,#0xff
+	cmp	$t2,#0xf2			@ done?
+#endif
+#if $i<15
+# if __ARM_ARCH__>=7
+	ldr	$t1,[$inp],#4			@ prefetch
+# else
+	ldrb	$t1,[$inp,#3]
+# endif
+	eor	$t2,$a,$b			@ a^b, b^c in next round
+#else
+	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
+	eor	$t2,$a,$b			@ a^b, b^c in next round
+	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
+#endif
+	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
+	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
+	add	$d,$d,$h			@ d+=h
+	eor	$t3,$t3,$b			@ Maj(a,b,c)
+	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
+	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
+___
+	($t2,$t3)=($t3,$t2);
+}
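+# Note that the final "h+=Maj(a,b,c)" above is commented out: the
+# value is folded in at the start of the *next* round instead, as
+# "h+=Maj(a,b,c) from the past", with the ($t2,$t3) swap carrying it
+# across iterations.  This looks like the usual modulo-scheduling
+# trick to shorten the per-round dependency chain.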
+
+sub BODY_16_XX {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___;
+	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
+	@ ldr	$t4,[sp,#`($i+14)%16`*4]
+	mov	$t0,$t1,ror#$sigma0[0]
+	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
+	mov	$t2,$t4,ror#$sigma1[0]
+	eor	$t0,$t0,$t1,ror#$sigma0[1]
+	eor	$t2,$t2,$t4,ror#$sigma1[1]
+	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
+	ldr	$t1,[sp,#`($i+0)%16`*4]
+	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
+	ldr	$t4,[sp,#`($i+9)%16`*4]
+
+	add	$t2,$t2,$t0
+	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
+	add	$t1,$t1,$t2
+	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
+	add	$t1,$t1,$t4			@ X[i]
+___
+	&BODY_00_15(@_);
+}
+
+$code=<<___;
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+.text
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
+.code   32
+#endif
+
+.type	K256,%object
+.align	5
+K256:
+.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size	K256,.-K256
+.word	0				@ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-.Lsha256_block_data_order
+#endif
+.align	5
+
+.global	sha256_block_data_order
+.type	sha256_block_data_order,%function
+sha256_block_data_order:
+.Lsha256_block_data_order:
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
+	sub	r3,pc,#8		@ sha256_block_data_order
+#else
+	adr	r3,.Lsha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+	ldr	r12,.LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+#ifdef	__APPLE__
+	ldr	r12,[r12]
+#endif
+	tst	r12,#ARMV8_SHA256
+	bne	.LARMv8
+	tst	r12,#ARMV7_NEON
+	bne	.LNEON
+#endif
+	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
+	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
+	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
+	sub	$Ktbl,r3,#256+32	@ K256
+	sub	sp,sp,#16*4		@ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+	ldr	$t1,[$inp],#4
+# else
+	ldrb	$t1,[$inp,#3]
+# endif
+	eor	$t3,$B,$C		@ magic
+	eor	$t2,$t2,$t2
+___
+for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=".Lrounds_16_xx:\n";
+for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+#if __ARM_ARCH__>=7
+	ite	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	ldreq	$t3,[sp,#16*4]		@ pull ctx
+	bne	.Lrounds_16_xx
+
+	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
+	ldr	$t0,[$t3,#0]
+	ldr	$t1,[$t3,#4]
+	ldr	$t2,[$t3,#8]
+	add	$A,$A,$t0
+	ldr	$t0,[$t3,#12]
+	add	$B,$B,$t1
+	ldr	$t1,[$t3,#16]
+	add	$C,$C,$t2
+	ldr	$t2,[$t3,#20]
+	add	$D,$D,$t0
+	ldr	$t0,[$t3,#24]
+	add	$E,$E,$t1
+	ldr	$t1,[$t3,#28]
+	add	$F,$F,$t2
+	ldr	$inp,[sp,#17*4]		@ pull inp
+	ldr	$t2,[sp,#18*4]		@ pull inp+len
+	add	$G,$G,$t0
+	add	$H,$H,$t1
+	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
+	cmp	$inp,$t2
+	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
+	bne	.Loop
+
+	add	sp,sp,#`16+3`*4	@ destroy frame
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r11,pc}
+#else
+	ldmia	sp!,{r4-r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	sha256_block_data_order,.-sha256_block_data_order
+___
+######################################################################
+# NEON stuff
+#
+{{{
+my @X=map("q$_",(0..3));
+my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
+my $Xfer=$t4;
+my $j=0;
+
+sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
+sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
+
+sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+  my $arg = pop;
+    $arg = "#$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
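+# For illustration: a call such as &vshr_u32($T2,$T0,7) has no
+# matching sub, so it lands in AUTOLOAD above, which strips the
+# package prefix, turns the underscore into a dot and prefixes a
+# bare numeric final argument with '#'; with $T2="q10" and $T0="q8"
+# it appends "vshr.u32	q10,q8,#7" to $code.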
+
+sub Xupdate()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vshr_u32	($T2,$T0,$sigma0[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vshr_u32	($T1,$T0,$sigma0[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vsli_32	($T2,$T0,32-$sigma0[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vshr_u32	($T3,$T0,$sigma0[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&veor		($T1,$T1,$T2);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vsli_32	($T3,$T0,32-$sigma0[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &veor		($T5,$T5,$T4);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &veor		($T5,$T5,$T4);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vld1_32	("{$T0}","[$Ktbl,:128]!");
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	($T0,$T0,@X[0]);
+	 while($#insns>=2) { eval(shift(@insns)); }
+	&vst1_32	("{$T0}","[$Xfer,:128]!");
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	push(@X,shift(@X));		# "rotate" X[]
+}
+
+sub Xpreload()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vld1_32	("{$T0}","[$Ktbl,:128]!");
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vrev32_8	(@X[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	($T0,$T0,@X[0]);
+	 foreach (@insns) { eval; }	# remaining instructions
+	&vst1_32	("{$T0}","[$Xfer,:128]!");
+
+	push(@X,shift(@X));		# "rotate" X[]
+}
+
+sub body_00_15 () {
+	(
+	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
+	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
+	'&eor	($t1,$f,$g)',
+	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
+	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
+	'&and	($t1,$t1,$e)',
+	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
+	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
+	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
+	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
+	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
+	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
+	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
+	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
+	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
+	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
+	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
+	'&add	($d,$d,$h)',			# d+=h
+	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
+	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
+	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
+	)
+}
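+# body_00_15 returns the round as a list of quoted Perl strings
+# rather than emitting code directly, so that Xupdate and Xpreload
+# can eval one scalar instruction between NEON ops and interleave
+# the two streams.  The final quoted statement rotates @V, so a..h
+# change roles rather than registers, and swaps ($t2,$t3) as in the
+# integer path.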
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.global	sha256_block_data_order_neon
+.type	sha256_block_data_order_neon,%function
+.align	5
+.skip	16
+sha256_block_data_order_neon:
+.LNEON:
+	stmdb	sp!,{r4-r12,lr}
+
+	sub	$H,sp,#16*4+16
+	adr	$Ktbl,K256
+	bic	$H,$H,#15		@ align for 128-bit stores
+	mov	$t2,sp
+	mov	sp,$H			@ alloca
+	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
+
+	vld1.8		{@X[0]},[$inp]!
+	vld1.8		{@X[1]},[$inp]!
+	vld1.8		{@X[2]},[$inp]!
+	vld1.8		{@X[3]},[$inp]!
+	vld1.32		{$T0},[$Ktbl,:128]!
+	vld1.32		{$T1},[$Ktbl,:128]!
+	vld1.32		{$T2},[$Ktbl,:128]!
+	vld1.32		{$T3},[$Ktbl,:128]!
+	vrev32.8	@X[0],@X[0]		@ yes, even on
+	str		$ctx,[sp,#64]
+	vrev32.8	@X[1],@X[1]		@ big-endian
+	str		$inp,[sp,#68]
+	mov		$Xfer,sp
+	vrev32.8	@X[2],@X[2]
+	str		$len,[sp,#72]
+	vrev32.8	@X[3],@X[3]
+	str		$t2,[sp,#76]		@ save original sp
+	vadd.i32	$T0,$T0,@X[0]
+	vadd.i32	$T1,$T1,@X[1]
+	vst1.32		{$T0},[$Xfer,:128]!
+	vadd.i32	$T2,$T2,@X[2]
+	vst1.32		{$T1},[$Xfer,:128]!
+	vadd.i32	$T3,$T3,@X[3]
+	vst1.32		{$T2},[$Xfer,:128]!
+	vst1.32		{$T3},[$Xfer,:128]!
+
+	ldmia		$ctx,{$A-$H}
+	sub		$Xfer,$Xfer,#64
+	ldr		$t1,[sp,#0]
+	eor		$t2,$t2,$t2
+	eor		$t3,$B,$C
+	b		.L_00_48
+
+.align	4
+.L_00_48:
+___
+	&Xupdate(\&body_00_15);
+	&Xupdate(\&body_00_15);
+	&Xupdate(\&body_00_15);
+	&Xupdate(\&body_00_15);
+$code.=<<___;
+	teq	$t1,#0				@ check for K256 terminator
+	ldr	$t1,[sp,#0]
+	sub	$Xfer,$Xfer,#64
+	bne	.L_00_48
+
+	ldr		$inp,[sp,#68]
+	ldr		$t0,[sp,#72]
+	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
+	teq		$inp,$t0
+	it		eq
+	subeq		$inp,$inp,#64		@ avoid SEGV
+	vld1.8		{@X[0]},[$inp]!		@ load next input block
+	vld1.8		{@X[1]},[$inp]!
+	vld1.8		{@X[2]},[$inp]!
+	vld1.8		{@X[3]},[$inp]!
+	it		ne
+	strne		$inp,[sp,#68]
+	mov		$Xfer,sp
+___
+	&Xpreload(\&body_00_15);
+	&Xpreload(\&body_00_15);
+	&Xpreload(\&body_00_15);
+	&Xpreload(\&body_00_15);
+$code.=<<___;
+	ldr	$t0,[$t1,#0]
+	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
+	ldr	$t2,[$t1,#4]
+	ldr	$t3,[$t1,#8]
+	ldr	$t4,[$t1,#12]
+	add	$A,$A,$t0			@ accumulate
+	ldr	$t0,[$t1,#16]
+	add	$B,$B,$t2
+	ldr	$t2,[$t1,#20]
+	add	$C,$C,$t3
+	ldr	$t3,[$t1,#24]
+	add	$D,$D,$t4
+	ldr	$t4,[$t1,#28]
+	add	$E,$E,$t0
+	str	$A,[$t1],#4
+	add	$F,$F,$t2
+	str	$B,[$t1],#4
+	add	$G,$G,$t3
+	str	$C,[$t1],#4
+	add	$H,$H,$t4
+	str	$D,[$t1],#4
+	stmia	$t1,{$E-$H}
+
+	ittte	ne
+	movne	$Xfer,sp
+	ldrne	$t1,[sp,#0]
+	eorne	$t2,$t2,$t2
+	ldreq	sp,[sp,#76]			@ restore original sp
+	itt	ne
+	eorne	$t3,$B,$C
+	bne	.L_00_48
+
+	ldmia	sp!,{r4-r12,pc}
+.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+___
+}}}
+######################################################################
+# ARMv8 stuff
+#
+{{{
+my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
+my @MSG=map("q$_",(8..11));
+my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
+my $Ktbl="r3";
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# if defined(__thumb2__)
+#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
+# else
+#  define INST(a,b,c,d)	.byte	a,b,c,d
+# endif
+
+.type	sha256_block_data_order_armv8,%function
+.align	5
+sha256_block_data_order_armv8:
+.LARMv8:
+	vld1.32	{$ABCD,$EFGH},[$ctx]
+	sub	$Ktbl,$Ktbl,#256+32
+	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
+	b	.Loop_v8
+
+.align	4
+.Loop_v8:
+	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
+	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
+	vld1.32		{$W0},[$Ktbl]!
+	vrev32.8	@MSG[0],@MSG[0]
+	vrev32.8	@MSG[1],@MSG[1]
+	vrev32.8	@MSG[2],@MSG[2]
+	vrev32.8	@MSG[3],@MSG[3]
+	vmov		$ABCD_SAVE,$ABCD	@ offload
+	vmov		$EFGH_SAVE,$EFGH
+	teq		$inp,$len
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+	vld1.32		{$W1},[$Ktbl]!
+	vadd.i32	$W0,$W0,@MSG[0]
+	sha256su0	@MSG[0],@MSG[1]
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+	sha256su1	@MSG[0],@MSG[2],@MSG[3]
+___
+	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
+}
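+# Each iteration above retires four rounds: sha256su0/sha256su1
+# extend the message schedule in @MSG[0] while sha256h/sha256h2 fold
+# W+K ($W0) into the two halves of the state; $W0/$W1 ping-pong so
+# that the next K-block load overlaps, and @MSG rotates much like @X
+# in the software paths.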
+$code.=<<___;
+	vld1.32		{$W1},[$Ktbl]!
+	vadd.i32	$W0,$W0,@MSG[0]
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+
+	vld1.32		{$W0},[$Ktbl]!
+	vadd.i32	$W1,$W1,@MSG[1]
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W1
+	sha256h2	$EFGH,$abcd,$W1
+
+	vld1.32		{$W1},[$Ktbl]
+	vadd.i32	$W0,$W0,@MSG[2]
+	sub		$Ktbl,$Ktbl,#256-16	@ rewind
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+
+	vadd.i32	$W1,$W1,@MSG[3]
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W1
+	sha256h2	$EFGH,$abcd,$W1
+
+	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
+	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
+	it		ne
+	bne		.Loop_v8
+
+	vst1.32		{$ABCD,$EFGH},[$ctx]
+
+	ret		@ bx lr
+.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+___
+}}}
+$code.=<<___;
+.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align	2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm   OPENSSL_armcap_P,4,4
+.hidden OPENSSL_armcap_P
+#endif
+___
+
+open SELF,$0;
+while(<SELF>) {
+	next if (/^#!/);
+	last if (!s/^#/@/ and !/^$/);
+	print;
+}
+close SELF;
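+# The loop above makes the script read its own source and copy the
+# leading comment banner, minus the shebang, into the generated
+# file, rewriting '#' comments as the assembler's '@' and stopping
+# at the first line that is neither comment nor empty.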
+
+{   my  %opcode = (
+	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
+	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);
+
+    sub unsha256 {
+	my ($mnemonic,$arg)=@_;
+
+	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+					 |(($2&7)<<17)|(($2&8)<<4)
+					 |(($3&7)<<1) |(($3&8)<<2);
+	    # ARMv7 instructions are always encoded little-endian, hence
+	    # the byte order below. The correct solution is to use the
+	    # .inst directive, but older assemblers don't implement it:-(
+	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
+			$word&0xff,($word>>8)&0xff,
+			($word>>16)&0xff,($word>>24)&0xff,
+			$mnemonic,$arg;
+	}
+    }
+}
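+# The encoder splits each captured q-register number into its low
+# three bits and high bit, ORs them into the fixed opcode word at
+# the Vd/Vn/Vm field positions, and emits the word byte-by-byte
+# through the INST macro, which picks the ARM or Thumb-2 byte order
+# at preprocessing time.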
+
+foreach (split($/,$code)) {
+
+	s/\`([^\`]*)\`/eval $1/geo;
+
+	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
+
+	s/\bret\b/bx	lr/go		or
+	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4
+
+	print $_,"\n";
+}
+
+close STDOUT; # enforce flush
diff --git a/crypto/fipsmodule/sha/asm/sha512-586.pl b/crypto/fipsmodule/sha/asm/sha512-586.pl
new file mode 100644
index 0000000..a757ee7
--- /dev/null
+++ b/crypto/fipsmodule/sha/asm/sha512-586.pl
@@ -0,0 +1,918 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA512 block transform for x86. September 2007.
+#
+# May 2013.
+#
+# Add SSSE3 code path, 20-25% improvement [over original SSE2 code].
+#
+# Performance in clock cycles per processed byte (less is better):
+#
+#		gcc	icc	x86 asm	SIMD(*)	x86_64(**)
+# Pentium	100	97	61	-	-
+# PIII		75	77	56	-	-
+# P4		116	95	82	34.6	30.8
+# AMD K8	54	55	36	20.7	9.57
+# Core2		66	57	40	15.9	9.97
+# Westmere	70	-	38	12.2	9.58
+# Sandy Bridge	58	-	35	11.9	11.2
+# Ivy Bridge	50	-	33	11.5	8.17
+# Haswell	46	-	29	11.3	7.66
+# Skylake	40	-	26	13.3	7.25
+# Bulldozer	121	-	50	14.0	13.5
+# VIA Nano	91	-	52	33	14.7
+# Atom		126	-	68	48(***)	14.7
+# Silvermont	97	-	58	42(***)	17.5
+# Goldmont	80	-	48	19.5	12.0
+#
+# (*)	whichever best applies.
+# (**)	x86_64 assembler performance is presented for reference
+#	purposes, the results are for integer-only code.
+# (***)	paddq is incredibly slow on Atom.
+#
+# The IALU code-path is optimized for older Pentiums. On vanilla Pentium
+# the performance improvement over compiler-generated code reaches ~60%,
+# and ~35% on PIII. On newer µ-archs the improvement varies from 15%
+# to 50%, but it matters less there, as those are expected to execute the
+# SSE2 code-path, which is commonly ~2-3x faster [than compiler-generated
+# code]. The SSE2 code-path is as fast as the original sha512-sse2.pl,
+# even though it does not use 128-bit operations, which means that an
+# SSE2-aware kernel is no longer required to execute the code. Another
+# difference is that the new code reduces the number of writes, at the
+# cost of a data cache "footprint" increased by 1/2KB.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../../perlasm");
+require "x86asm.pl";
+
+$output=pop;
+open STDOUT,">$output";
+
+&asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386");
+
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+&external_label("OPENSSL_ia32cap_P") if ($sse2);
+
+$Tlo=&DWP(0,"esp");	$Thi=&DWP(4,"esp");
+$Alo=&DWP(8,"esp");	$Ahi=&DWP(8+4,"esp");
+$Blo=&DWP(16,"esp");	$Bhi=&DWP(16+4,"esp");
+$Clo=&DWP(24,"esp");	$Chi=&DWP(24+4,"esp");
+$Dlo=&DWP(32,"esp");	$Dhi=&DWP(32+4,"esp");
+$Elo=&DWP(40,"esp");	$Ehi=&DWP(40+4,"esp");
+$Flo=&DWP(48,"esp");	$Fhi=&DWP(48+4,"esp");
+$Glo=&DWP(56,"esp");	$Ghi=&DWP(56+4,"esp");
+$Hlo=&DWP(64,"esp");	$Hhi=&DWP(64+4,"esp");
+$K512="ebp";
+
+$Asse2=&QWP(0,"esp");
+$Bsse2=&QWP(8,"esp");
+$Csse2=&QWP(16,"esp");
+$Dsse2=&QWP(24,"esp");
+$Esse2=&QWP(32,"esp");
+$Fsse2=&QWP(40,"esp");
+$Gsse2=&QWP(48,"esp");
+$Hsse2=&QWP(56,"esp");
+
+$A="mm0";	# B-D and
+$E="mm4";	# F-H are commonly loaded to respectively mm1-mm3 and
+		# mm5-mm7, but it's done on an on-demand basis...
+$BxC="mm2";	# ... except for B^C
+
+sub BODY_00_15_sse2 {
+    my $phase=shift;
+
+	#&movq	("mm5",$Fsse2);			# load f
+	#&movq	("mm6",$Gsse2);			# load g
+
+	&movq	("mm1",$E);			# %mm1 is sliding right
+	 &pxor	("mm5","mm6");			# f^=g
+	&psrlq	("mm1",14);
+	 &movq	($Esse2,$E);			# modulo-scheduled save e
+	 &pand	("mm5",$E);			# f&=e
+	&psllq	($E,23);			# $E is sliding left
+	 &movq	($A,"mm3")			if ($phase<2);
+	 &movq	(&QWP(8*9,"esp"),"mm7");	# save X[i]
+	&movq	("mm3","mm1");			# %mm3 is T1
+	 &psrlq	("mm1",4);
+	 &pxor	("mm5","mm6");			# Ch(e,f,g)
+	&pxor	("mm3",$E);
+	 &psllq	($E,23);
+	&pxor	("mm3","mm1");
+	 &movq	($Asse2,$A);			# modulo-scheduled save a
+	 &paddq	("mm7","mm5");			# X[i]+=Ch(e,f,g)
+	&pxor	("mm3",$E);
+	 &psrlq	("mm1",23);
+	 &paddq	("mm7",$Hsse2);			# X[i]+=h
+	&pxor	("mm3","mm1");
+	 &psllq	($E,4);
+	 &paddq	("mm7",QWP(0,$K512));		# X[i]+=K512[i]
+	&pxor	("mm3",$E);			# T1=Sigma1_512(e)
+
+	 &movq	($E,$Dsse2);			# e = load d, e in next round
+	&paddq	("mm3","mm7");			# T1+=X[i]
+	 &movq	("mm5",$A);			# %mm5 is sliding right
+	 &psrlq	("mm5",28);
+	&paddq	($E,"mm3");			# d += T1
+	 &movq	("mm6",$A);			# %mm6 is sliding left
+	 &movq	("mm7","mm5");
+	 &psllq	("mm6",25);
+	&movq	("mm1",$Bsse2);			# load b
+	 &psrlq	("mm5",6);
+	 &pxor	("mm7","mm6");
+	&sub	("esp",8);
+	 &psllq	("mm6",5);
+	 &pxor	("mm7","mm5");
+	&pxor	($A,"mm1");			# a^b, b^c in next round
+	 &psrlq	("mm5",5);
+	 &pxor	("mm7","mm6");
+	&pand	($BxC,$A);			# (b^c)&(a^b)
+	 &psllq	("mm6",6);
+	 &pxor	("mm7","mm5");
+	&pxor	($BxC,"mm1");			# [h=]Maj(a,b,c)
+	 &pxor	("mm6","mm7");			# Sigma0_512(a)
+	 &movq	("mm7",&QWP(8*(9+16-1),"esp"))	if ($phase!=0);	# pre-fetch
+	 &movq	("mm5",$Fsse2)			if ($phase==0);	# load f
+
+    if ($phase>1) {
+	&paddq	($BxC,"mm6");			# h+=Sigma0(a)
+	 &add	($K512,8);
+	#&paddq	($BxC,"mm3");			# h+=T1
+
+	($A,$BxC) = ($BxC,$A);			# rotate registers
+    } else {
+	&paddq	("mm3",$BxC);			# T1+=Maj(a,b,c)
+	 &movq	($BxC,$A);
+	 &add	($K512,8);
+	&paddq	("mm3","mm6");			# T1+=Sigma0(a)
+	 &movq	("mm6",$Gsse2)			if ($phase==0);	# load g
+	#&movq	($A,"mm3");			# h=T1
+    }
+}
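+# The $phase argument, as inferred from the call sites below: 0/none
+# for rounds 0..14, 1 for round 15 and 2 for the 16..79 loop.  In
+# phases 0 and 1 the new h is accumulated into "mm3" as T1 for the
+# next round to pick up; in phase 2 Sigma0 is folded into $BxC and
+# the ($A,$BxC) pair is rotated instead.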
+
+sub BODY_00_15_x86 {
+	#define Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
+	#	LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
+	#	HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
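+	# Where the lo/hi pairs come from, as a sketch: split the 64-bit
+	# x into 32-bit halves (hi,lo); then for r<32
+	#	ROTR64(x,r).lo = lo>>r ^ hi<<(32-r)
+	#	ROTR64(x,r).hi = hi>>r ^ lo<<(32-r)
+	# which gives the 14- and 18-bit terms.  ROTR(x,41) is
+	# ROTR(x,32+9): rotating by 32 swaps the halves, so it
+	# contributes hi>>9^lo<<23 to LO and lo>>9^hi<<23 to HI.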
+	&mov	("ecx",$Elo);
+	&mov	("edx",$Ehi);
+	&mov	("esi","ecx");
+
+	&shr	("ecx",9);	# lo>>9
+	&mov	("edi","edx");
+	&shr	("edx",9);	# hi>>9
+	&mov	("ebx","ecx");
+	&shl	("esi",14);	# lo<<14
+	&mov	("eax","edx");
+	&shl	("edi",14);	# hi<<14
+	&xor	("ebx","esi");
+
+	&shr	("ecx",14-9);	# lo>>14
+	&xor	("eax","edi");
+	&shr	("edx",14-9);	# hi>>14
+	&xor	("eax","ecx");
+	&shl	("esi",18-14);	# lo<<18
+	&xor	("ebx","edx");
+	&shl	("edi",18-14);	# hi<<18
+	&xor	("ebx","esi");
+
+	&shr	("ecx",18-14);	# lo>>18
+	&xor	("eax","edi");
+	&shr	("edx",18-14);	# hi>>18
+	&xor	("eax","ecx");
+	&shl	("esi",23-18);	# lo<<23
+	&xor	("ebx","edx");
+	&shl	("edi",23-18);	# hi<<23
+	&xor	("eax","esi");
+	&xor	("ebx","edi");			# T1 = Sigma1(e)
+
+	&mov	("ecx",$Flo);
+	&mov	("edx",$Fhi);
+	&mov	("esi",$Glo);
+	&mov	("edi",$Ghi);
+	 &add	("eax",$Hlo);
+	 &adc	("ebx",$Hhi);			# T1 += h
+	&xor	("ecx","esi");
+	&xor	("edx","edi");
+	&and	("ecx",$Elo);
+	&and	("edx",$Ehi);
+	 &add	("eax",&DWP(8*(9+15)+0,"esp"));
+	 &adc	("ebx",&DWP(8*(9+15)+4,"esp"));	# T1 += X[0]
+	&xor	("ecx","esi");
+	&xor	("edx","edi");			# Ch(e,f,g) = (f^g)&e)^g
+
+	&mov	("esi",&DWP(0,$K512));
+	&mov	("edi",&DWP(4,$K512));		# K[i]
+	&add	("eax","ecx");
+	&adc	("ebx","edx");			# T1 += Ch(e,f,g)
+	&mov	("ecx",$Dlo);
+	&mov	("edx",$Dhi);
+	&add	("eax","esi");
+	&adc	("ebx","edi");			# T1 += K[i]
+	&mov	($Tlo,"eax");
+	&mov	($Thi,"ebx");			# put T1 away
+	&add	("eax","ecx");
+	&adc	("ebx","edx");			# d += T1
+
+	#define Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+	#	LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
+	#	HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
+	&mov	("ecx",$Alo);
+	&mov	("edx",$Ahi);
+	&mov	($Dlo,"eax");
+	&mov	($Dhi,"ebx");
+	&mov	("esi","ecx");
+
+	&shr	("ecx",2);	# lo>>2
+	&mov	("edi","edx");
+	&shr	("edx",2);	# hi>>2
+	&mov	("ebx","ecx");
+	&shl	("esi",4);	# lo<<4
+	&mov	("eax","edx");
+	&shl	("edi",4);	# hi<<4
+	&xor	("ebx","esi");
+
+	&shr	("ecx",7-2);	# lo>>7
+	&xor	("eax","edi");
+	&shr	("edx",7-2);	# hi>>7
+	&xor	("ebx","ecx");
+	&shl	("esi",25-4);	# lo<<25
+	&xor	("eax","edx");
+	&shl	("edi",25-4);	# hi<<25
+	&xor	("eax","esi");
+
+	&shr	("ecx",28-7);	# lo>>28
+	&xor	("ebx","edi");
+	&shr	("edx",28-7);	# hi>>28
+	&xor	("eax","ecx");
+	&shl	("esi",30-25);	# lo<<30
+	&xor	("ebx","edx");
+	&shl	("edi",30-25);	# hi<<30
+	&xor	("eax","esi");
+	&xor	("ebx","edi");			# Sigma0(a)
+
+	&mov	("ecx",$Alo);
+	&mov	("edx",$Ahi);
+	&mov	("esi",$Blo);
+	&mov	("edi",$Bhi);
+	&add	("eax",$Tlo);
+	&adc	("ebx",$Thi);			# T1 = Sigma0(a)+T1
+	&or	("ecx","esi");
+	&or	("edx","edi");
+	&and	("ecx",$Clo);
+	&and	("edx",$Chi);
+	&and	("esi",$Alo);
+	&and	("edi",$Ahi);
+	&or	("ecx","esi");
+	&or	("edx","edi");			# Maj(a,b,c) = ((a|b)&c)|(a&b)
+
+	&add	("eax","ecx");
+	&adc	("ebx","edx");			# T1 += Maj(a,b,c)
+	&mov	($Tlo,"eax");
+	&mov	($Thi,"ebx");
+
+	&mov	(&LB("edx"),&BP(0,$K512));	# pre-fetch LSB of *K
+	&sub	("esp",8);
+	&lea	($K512,&DWP(8,$K512));		# K++
+}
+
+
+&function_begin("sha512_block_data_order");
+	&mov	("esi",wparam(0));	# ctx
+	&mov	("edi",wparam(1));	# inp
+	&mov	("eax",wparam(2));	# num
+	&mov	("ebx","esp");		# saved sp
+
+	&call	(&label("pic_point"));	# make it PIC!
+&set_label("pic_point");
+	&blindpop($K512);
+	&lea	($K512,&DWP(&label("K512")."-".&label("pic_point"),$K512));
+
+	&sub	("esp",16);
+	&and	("esp",-64);
+
+	&shl	("eax",7);
+	&add	("eax","edi");
+	&mov	(&DWP(0,"esp"),"esi");	# ctx
+	&mov	(&DWP(4,"esp"),"edi");	# inp
+	&mov	(&DWP(8,"esp"),"eax");	# inp+num*128
+	&mov	(&DWP(12,"esp"),"ebx");	# saved sp
+
+if ($sse2) {
+	&picmeup("edx","OPENSSL_ia32cap_P",$K512,&label("K512"));
+	&mov	("ecx",&DWP(0,"edx"));
+	&test	("ecx",1<<26);
+	&jz	(&label("loop_x86"));
+
+	&mov	("edx",&DWP(4,"edx"));
+
+	# load ctx->h[0-7]
+	&movq	($A,&QWP(0,"esi"));
+	 &and	("ecx",1<<24);		# XMM registers availability
+	&movq	("mm1",&QWP(8,"esi"));
+	 &and	("edx",1<<9);		# SSSE3 bit
+	&movq	($BxC,&QWP(16,"esi"));
+	 &or	("ecx","edx");
+	&movq	("mm3",&QWP(24,"esi"));
+	&movq	($E,&QWP(32,"esi"));
+	&movq	("mm5",&QWP(40,"esi"));
+	&movq	("mm6",&QWP(48,"esi"));
+	&movq	("mm7",&QWP(56,"esi"));
+	&cmp	("ecx",1<<24|1<<9);
+	&je	(&label("SSSE3"));
+	&sub	("esp",8*10);
+	&jmp	(&label("loop_sse2"));
+
+&set_label("loop_sse2",16);
+	#&movq	($Asse2,$A);
+	&movq	($Bsse2,"mm1");
+	&movq	($Csse2,$BxC);
+	&movq	($Dsse2,"mm3");
+	#&movq	($Esse2,$E);
+	&movq	($Fsse2,"mm5");
+	&movq	($Gsse2,"mm6");
+	&pxor	($BxC,"mm1");			# magic
+	&movq	($Hsse2,"mm7");
+	&movq	("mm3",$A);			# magic
+
+	&mov	("eax",&DWP(0,"edi"));
+	&mov	("ebx",&DWP(4,"edi"));
+	&add	("edi",8);
+	&mov	("edx",15);			# counter
+	&bswap	("eax");
+	&bswap	("ebx");
+	&jmp	(&label("00_14_sse2"));
+
+&set_label("00_14_sse2",16);
+	&movd	("mm1","eax");
+	&mov	("eax",&DWP(0,"edi"));
+	&movd	("mm7","ebx");
+	&mov	("ebx",&DWP(4,"edi"));
+	&add	("edi",8);
+	&bswap	("eax");
+	&bswap	("ebx");
+	&punpckldq("mm7","mm1");
+
+	&BODY_00_15_sse2();
+
+	&dec	("edx");
+	&jnz	(&label("00_14_sse2"));
+
+	&movd	("mm1","eax");
+	&movd	("mm7","ebx");
+	&punpckldq("mm7","mm1");
+
+	&BODY_00_15_sse2(1);
+
+	&pxor	($A,$A);			# A is in %mm3
+	&mov	("edx",32);			# counter
+	&jmp	(&label("16_79_sse2"));
+
+&set_label("16_79_sse2",16);
+    for ($j=0;$j<2;$j++) {			# 2x unroll
+	#&movq	("mm7",&QWP(8*(9+16-1),"esp"));	# prefetched in BODY_00_15
+	&movq	("mm5",&QWP(8*(9+16-14),"esp"));
+	&movq	("mm1","mm7");
+	&psrlq	("mm7",1);
+	 &movq	("mm6","mm5");
+	 &psrlq	("mm5",6);
+	&psllq	("mm1",56);
+	 &paddq	($A,"mm3");			# from BODY_00_15
+	 &movq	("mm3","mm7");
+	&psrlq	("mm7",7-1);
+	 &pxor	("mm3","mm1");
+	 &psllq	("mm1",63-56);
+	&pxor	("mm3","mm7");
+	 &psrlq	("mm7",8-7);
+	&pxor	("mm3","mm1");
+	 &movq	("mm1","mm5");
+	 &psrlq	("mm5",19-6);
+	&pxor	("mm7","mm3");			# sigma0
+
+	 &psllq	("mm6",3);
+	 &pxor	("mm1","mm5");
+	&paddq	("mm7",&QWP(8*(9+16),"esp"));
+	 &pxor	("mm1","mm6");
+	 &psrlq	("mm5",61-19);
+	&paddq	("mm7",&QWP(8*(9+16-9),"esp"));
+	 &pxor	("mm1","mm5");
+	 &psllq	("mm6",45-3);
+	&movq	("mm5",$Fsse2);			# load f
+	 &pxor	("mm1","mm6");			# sigma1
+	&movq	("mm6",$Gsse2);			# load g
+
+	&paddq	("mm7","mm1");			# X[i]
+	#&movq	(&QWP(8*9,"esp"),"mm7");	# moved to BODY_00_15
+
+	&BODY_00_15_sse2(2);
+    }
+	&dec	("edx");
+	&jnz	(&label("16_79_sse2"));
+
+	#&movq	($A,$Asse2);
+	&paddq	($A,"mm3");			# from BODY_00_15
+	&movq	("mm1",$Bsse2);
+	#&movq	($BxC,$Csse2);
+	&movq	("mm3",$Dsse2);
+	#&movq	($E,$Esse2);
+	&movq	("mm5",$Fsse2);
+	&movq	("mm6",$Gsse2);
+	&movq	("mm7",$Hsse2);
+
+	&pxor	($BxC,"mm1");			# de-magic
+	&paddq	($A,&QWP(0,"esi"));
+	&paddq	("mm1",&QWP(8,"esi"));
+	&paddq	($BxC,&QWP(16,"esi"));
+	&paddq	("mm3",&QWP(24,"esi"));
+	&paddq	($E,&QWP(32,"esi"));
+	&paddq	("mm5",&QWP(40,"esi"));
+	&paddq	("mm6",&QWP(48,"esi"));
+	&paddq	("mm7",&QWP(56,"esi"));
+
+	&mov	("eax",8*80);
+	&movq	(&QWP(0,"esi"),$A);
+	&movq	(&QWP(8,"esi"),"mm1");
+	&movq	(&QWP(16,"esi"),$BxC);
+	&movq	(&QWP(24,"esi"),"mm3");
+	&movq	(&QWP(32,"esi"),$E);
+	&movq	(&QWP(40,"esi"),"mm5");
+	&movq	(&QWP(48,"esi"),"mm6");
+	&movq	(&QWP(56,"esi"),"mm7");
+
+	&lea	("esp",&DWP(0,"esp","eax"));	# destroy frame
+	&sub	($K512,"eax");			# rewind K
+
+	&cmp	("edi",&DWP(8*10+8,"esp"));	# are we done yet?
+	&jb	(&label("loop_sse2"));
+
+	&mov	("esp",&DWP(8*10+12,"esp"));	# restore sp
+	&emms	();
+&function_end_A();
+
+&set_label("SSSE3",32);
+{ my ($cnt,$frame)=("ecx","edx");
+  my @X=map("xmm$_",(0..7));
+  my $j;
+  my $i=0;
+
+	&lea	($frame,&DWP(-64,"esp"));
+	&sub	("esp",256);
+
+	# fixed stack frame layout
+	#
+	# +0	A B C D E F G H		# backing store
+	# +64	X[0]+K[i] .. X[15]+K[i]	# XMM->MM xfer area
+	# +192				# XMM off-load ring buffer
+	# +256				# saved parameters
+
+	&movdqa		(@X[1],&QWP(80*8,$K512));		# byte swap mask
+	&movdqu		(@X[0],&QWP(0,"edi"));
+	&pshufb		(@X[0],@X[1]);
+    for ($j=0;$j<8;$j++) {
+	&movdqa		(&QWP(16*(($j-1)%4),$frame),@X[3])	if ($j>4); # off-load
+	&movdqa		(@X[3],&QWP(16*($j%8),$K512));
+	&movdqa		(@X[2],@X[1])				if ($j<7); # perpetuate byte swap mask
+	&movdqu		(@X[1],&QWP(16*($j+1),"edi"))		if ($j<7); # next input
+	&movdqa		(@X[1],&QWP(16*(($j+1)%4),$frame))	if ($j==7);# restore @X[0]
+	&paddq		(@X[3],@X[0]);
+	&pshufb		(@X[1],@X[2])				if ($j<7);
+	&movdqa		(&QWP(16*($j%8)-128,$frame),@X[3]);	# xfer X[i]+K[i]
+
+	push(@X,shift(@X));					# rotate(@X)
+    }
+	#&jmp		(&label("loop_ssse3"));
+	&nop		();
+
+&set_label("loop_ssse3",32);
+	&movdqa		(@X[2],&QWP(16*(($j+1)%4),$frame));	# pre-restore @X[1]
+	&movdqa		(&QWP(16*(($j-1)%4),$frame),@X[3]);	# off-load @X[3]
+	&lea		($K512,&DWP(16*8,$K512));
+
+	#&movq	($Asse2,$A);			# off-load A-H
+	&movq	($Bsse2,"mm1");
+	 &mov	("ebx","edi");
+	&movq	($Csse2,$BxC);
+	 &lea	("edi",&DWP(128,"edi"));	# advance input
+	&movq	($Dsse2,"mm3");
+	 &cmp	("edi","eax");
+	#&movq	($Esse2,$E);
+	&movq	($Fsse2,"mm5");
+	 &cmovb	("ebx","edi");
+	&movq	($Gsse2,"mm6");
+	 &mov	("ecx",4);			# loop counter
+	&pxor	($BxC,"mm1");			# magic
+	&movq	($Hsse2,"mm7");
+	&pxor	("mm3","mm3");			# magic
+
+	&jmp		(&label("00_47_ssse3"));
+
+sub BODY_00_15_ssse3 {		# "phase-less" copy of BODY_00_15_sse2
+	(
+	'&movq	("mm1",$E)',				# %mm1 is sliding right
+	'&movq	("mm7",&QWP(((-8*$i)%128)-128,$frame))',# X[i]+K[i]
+	 '&pxor	("mm5","mm6")',				# f^=g
+	'&psrlq	("mm1",14)',
+	 '&movq	(&QWP(8*($i+4)%64,"esp"),$E)',		# modulo-scheduled save e
+	 '&pand	("mm5",$E)',				# f&=e
+	'&psllq	($E,23)',				# $E is sliding left
+	'&paddq	($A,"mm3")',				# [h+=Maj(a,b,c)]
+	'&movq	("mm3","mm1")',				# %mm3 is T1
+	 '&psrlq("mm1",4)',
+	 '&pxor	("mm5","mm6")',				# Ch(e,f,g)
+	'&pxor	("mm3",$E)',
+	 '&psllq($E,23)',
+	'&pxor	("mm3","mm1")',
+	 '&movq	(&QWP(8*$i%64,"esp"),$A)',		# modulo-scheduled save a
+	 '&paddq("mm7","mm5")',				# X[i]+=Ch(e,f,g)
+	'&pxor	("mm3",$E)',
+	 '&psrlq("mm1",23)',
+	 '&paddq("mm7",&QWP(8*($i+7)%64,"esp"))',	# X[i]+=h
+	'&pxor	("mm3","mm1")',
+	 '&psllq($E,4)',
+	'&pxor	("mm3",$E)',				# T1=Sigma1_512(e)
+
+	 '&movq	($E,&QWP(8*($i+3)%64,"esp"))',		# e = load d, e in next round
+	'&paddq	("mm3","mm7")',				# T1+=X[i]
+	 '&movq	("mm5",$A)',				# %mm5 is sliding right
+	 '&psrlq("mm5",28)',
+	'&paddq	($E,"mm3")',				# d += T1
+	 '&movq	("mm6",$A)',				# %mm6 is sliding left
+	 '&movq	("mm7","mm5")',
+	 '&psllq("mm6",25)',
+	'&movq	("mm1",&QWP(8*($i+1)%64,"esp"))',	# load b
+	 '&psrlq("mm5",6)',
+	 '&pxor	("mm7","mm6")',
+	 '&psllq("mm6",5)',
+	 '&pxor	("mm7","mm5")',
+	'&pxor	($A,"mm1")',				# a^b, b^c in next round
+	 '&psrlq("mm5",5)',
+	 '&pxor	("mm7","mm6")',
+	'&pand	($BxC,$A)',				# (b^c)&(a^b)
+	 '&psllq("mm6",6)',
+	 '&pxor	("mm7","mm5")',
+	'&pxor	($BxC,"mm1")',				# [h=]Maj(a,b,c)
+	 '&pxor	("mm6","mm7")',				# Sigma0_512(a)
+	 '&movq	("mm5",&QWP(8*($i+5-1)%64,"esp"))',	# pre-load f
+	'&paddq	($BxC,"mm6")',				# h+=Sigma0(a)
+	 '&movq	("mm6",&QWP(8*($i+6-1)%64,"esp"))',	# pre-load g
+
+	'($A,$BxC) = ($BxC,$A); $i--;'
+	);
+}
+
+&set_label("00_47_ssse3",32);
+
+    for(;$j<16;$j++) {
+	my ($t0,$t2,$t1)=@X[2..4];
+	my @insns = (&BODY_00_15_ssse3(),&BODY_00_15_ssse3());
+
+	&movdqa		($t2,@X[5]);
+	&movdqa		(@X[1],$t0);			# restore @X[1]
+	&palignr	($t0,@X[0],8);			# X[1..2]
+	&movdqa		(&QWP(16*($j%4),$frame),@X[4]);	# off-load @X[4]
+	 &palignr	($t2,@X[4],8);			# X[9..10]
+
+	&movdqa		($t1,$t0);
+	&psrlq		($t0,7);
+	 &paddq		(@X[0],$t2);			# X[0..1] += X[9..10]
+	&movdqa		($t2,$t1);
+	&psrlq		($t1,1);
+	&psllq		($t2,64-8);
+	&pxor		($t0,$t1);
+	&psrlq		($t1,8-1);
+	&pxor		($t0,$t2);
+	&psllq		($t2,8-1);
+	&pxor		($t0,$t1);
+	 &movdqa	($t1,@X[7]);
+	&pxor		($t0,$t2);			# sigma0(X[1..2])
+	 &movdqa	($t2,@X[7]);
+	 &psrlq		($t1,6);
+	&paddq		(@X[0],$t0);			# X[0..1] += sigma0(X[1..2])
+
+	&movdqa		($t0,@X[7]);
+	&psrlq		($t2,19);
+	&psllq		($t0,64-61);
+	&pxor		($t1,$t2);
+	&psrlq		($t2,61-19);
+	&pxor		($t1,$t0);
+	&psllq		($t0,61-19);
+	&pxor		($t1,$t2);
+	&movdqa		($t2,&QWP(16*(($j+2)%4),$frame));# pre-restore @X[1]
+	&pxor		($t1,$t0);			# sigma1(X[14..15])
+	&movdqa		($t0,&QWP(16*($j%8),$K512));
+	 eval(shift(@insns));
+	&paddq		(@X[0],$t1);			# X[0..1] += sigma1(X[14..15])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&paddq		($t0,@X[0]);
+	 foreach(@insns) { eval; }
+	&movdqa		(&QWP(16*($j%8)-128,$frame),$t0);# xfer X[i]+K[i]
+
+	push(@X,shift(@X));				# rotate(@X)
+    }
+	&lea		($K512,&DWP(16*8,$K512));
+	&dec		("ecx");
+	&jnz		(&label("00_47_ssse3"));
+
+	&movdqa		(@X[1],&QWP(0,$K512));		# byte swap mask
+	&lea		($K512,&DWP(-80*8,$K512));	# rewind
+	&movdqu		(@X[0],&QWP(0,"ebx"));
+	&pshufb		(@X[0],@X[1]);
+
+    for ($j=0;$j<8;$j++) {	# load next or same block
+	my @insns = (&BODY_00_15_ssse3(),&BODY_00_15_ssse3());
+
+	&movdqa		(&QWP(16*(($j-1)%4),$frame),@X[3])	if ($j>4); # off-load
+	&movdqa		(@X[3],&QWP(16*($j%8),$K512));
+	&movdqa		(@X[2],@X[1])				if ($j<7); # perpetuate byte swap mask
+	&movdqu		(@X[1],&QWP(16*($j+1),"ebx"))		if ($j<7); # next input
+	&movdqa		(@X[1],&QWP(16*(($j+1)%4),$frame))	if ($j==7);# restore @X[0]
+	&paddq		(@X[3],@X[0]);
+	&pshufb		(@X[1],@X[2])				if ($j<7);
+	 foreach(@insns) { eval; }
+	&movdqa		(&QWP(16*($j%8)-128,$frame),@X[3]);# xfer X[i]+K[i]
+
+	push(@X,shift(@X));				# rotate(@X)
+    }
+
+	#&movq	($A,$Asse2);			# load A-H
+	&movq	("mm1",$Bsse2);
+	&paddq	($A,"mm3");			# from BODY_00_15
+	#&movq	($BxC,$Csse2);
+	&movq	("mm3",$Dsse2);
+	#&movq	($E,$Esse2);
+	#&movq	("mm5",$Fsse2);
+	#&movq	("mm6",$Gsse2);
+	&movq	("mm7",$Hsse2);
+
+	&pxor	($BxC,"mm1");			# de-magic
+	&paddq	($A,&QWP(0,"esi"));
+	&paddq	("mm1",&QWP(8,"esi"));
+	&paddq	($BxC,&QWP(16,"esi"));
+	&paddq	("mm3",&QWP(24,"esi"));
+	&paddq	($E,&QWP(32,"esi"));
+	&paddq	("mm5",&QWP(40,"esi"));
+	&paddq	("mm6",&QWP(48,"esi"));
+	&paddq	("mm7",&QWP(56,"esi"));
+
+	&movq	(&QWP(0,"esi"),$A);
+	&movq	(&QWP(8,"esi"),"mm1");
+	&movq	(&QWP(16,"esi"),$BxC);
+	&movq	(&QWP(24,"esi"),"mm3");
+	&movq	(&QWP(32,"esi"),$E);
+	&movq	(&QWP(40,"esi"),"mm5");
+	&movq	(&QWP(48,"esi"),"mm6");
+	&movq	(&QWP(56,"esi"),"mm7");
+
+	&cmp	("edi","eax");			# are we done yet?
+	&jb	(&label("loop_ssse3"));
+
+	&mov	("esp",&DWP(64+12,$frame));	# restore sp
+	&emms	();
+}
+&function_end_A();
+}
+&set_label("loop_x86",16);
+    # copy input block to stack reversing byte and qword order
+    for ($i=0;$i<8;$i++) {
+	&mov	("eax",&DWP($i*16+0,"edi"));
+	&mov	("ebx",&DWP($i*16+4,"edi"));
+	&mov	("ecx",&DWP($i*16+8,"edi"));
+	&mov	("edx",&DWP($i*16+12,"edi"));
+	&bswap	("eax");
+	&bswap	("ebx");
+	&bswap	("ecx");
+	&bswap	("edx");
+	&push	("eax");
+	&push	("ebx");
+	&push	("ecx");
+	&push	("edx");
+    }
+	&add	("edi",128);
+	&sub	("esp",9*8);		# place for T,A,B,C,D,E,F,G,H
+	&mov	(&DWP(8*(9+16)+4,"esp"),"edi");
+
+	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
+	&lea	("edi",&DWP(8,"esp"));
+	&mov	("ecx",16);
+	&data_word(0xA5F3F689);		# rep movsd
+
+&set_label("00_15_x86",16);
+	&BODY_00_15_x86();
+
+	&cmp	(&LB("edx"),0x94);
+	&jne	(&label("00_15_x86"));
+
+&set_label("16_79_x86",16);
+	#define sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
+	#	LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
+	#	HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
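+	# Note the asymmetry in the last term: (x)>>7 is a plain shift,
+	# not a rotate, so its low word lo>>7^hi<<25 borrows bits from
+	# hi while its high word is just hi>>7, hence no <<25 term on
+	# the HI line.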
+	&mov	("ecx",&DWP(8*(9+15+16-1)+0,"esp"));
+	&mov	("edx",&DWP(8*(9+15+16-1)+4,"esp"));
+	&mov	("esi","ecx");
+
+	&shr	("ecx",1);	# lo>>1
+	&mov	("edi","edx");
+	&shr	("edx",1);	# hi>>1
+	&mov	("eax","ecx");
+	&shl	("esi",24);	# lo<<24
+	&mov	("ebx","edx");
+	&shl	("edi",24);	# hi<<24
+	&xor	("ebx","esi");
+
+	&shr	("ecx",7-1);	# lo>>7
+	&xor	("eax","edi");
+	&shr	("edx",7-1);	# hi>>7
+	&xor	("eax","ecx");
+	&shl	("esi",31-24);	# lo<<31
+	&xor	("ebx","edx");
+	&shl	("edi",25-24);	# hi<<25
+	&xor	("ebx","esi");
+
+	&shr	("ecx",8-7);	# lo>>8
+	&xor	("eax","edi");
+	&shr	("edx",8-7);	# hi>>8
+	&xor	("eax","ecx");
+	&shl	("edi",31-25);	# hi<<31
+	&xor	("ebx","edx");
+	&xor	("eax","edi");			# T1 = sigma0(X[-15])
+
+	&mov	(&DWP(0,"esp"),"eax");
+	&mov	(&DWP(4,"esp"),"ebx");		# put T1 away
+
+	#define sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
+	#	LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
+	#	HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
+	&mov	("ecx",&DWP(8*(9+15+16-14)+0,"esp"));
+	&mov	("edx",&DWP(8*(9+15+16-14)+4,"esp"));
+	&mov	("esi","ecx");
+
+	&shr	("ecx",6);	# lo>>6
+	&mov	("edi","edx");
+	&shr	("edx",6);	# hi>>6
+	&mov	("eax","ecx");
+	&shl	("esi",3);	# lo<<3
+	&mov	("ebx","edx");
+	&shl	("edi",3);	# hi<<3
+	&xor	("eax","esi");
+
+	&shr	("ecx",19-6);	# lo>>19
+	&xor	("ebx","edi");
+	&shr	("edx",19-6);	# hi>>19
+	&xor	("eax","ecx");
+	&shl	("esi",13-3);	# lo<<13
+	&xor	("ebx","edx");
+	&shl	("edi",13-3);	# hi<<13
+	&xor	("ebx","esi");
+
+	&shr	("ecx",29-19);	# lo>>29
+	&xor	("eax","edi");
+	&shr	("edx",29-19);	# hi>>29
+	&xor	("ebx","ecx");
+	&shl	("edi",26-13);	# hi<<26
+	&xor	("eax","edx");
+	&xor	("eax","edi");			# sigma1(X[-2])
+
+	&mov	("ecx",&DWP(8*(9+15+16)+0,"esp"));
+	&mov	("edx",&DWP(8*(9+15+16)+4,"esp"));
+	&add	("eax",&DWP(0,"esp"));
+	&adc	("ebx",&DWP(4,"esp"));		# T1 = sigma1(X[-2])+T1
+	&mov	("esi",&DWP(8*(9+15+16-9)+0,"esp"));
+	&mov	("edi",&DWP(8*(9+15+16-9)+4,"esp"));
+	&add	("eax","ecx");
+	&adc	("ebx","edx");			# T1 += X[-16]
+	&add	("eax","esi");
+	&adc	("ebx","edi");			# T1 += X[-7]
+	&mov	(&DWP(8*(9+15)+0,"esp"),"eax");
+	&mov	(&DWP(8*(9+15)+4,"esp"),"ebx");	# save X[0]
+
+	&BODY_00_15_x86();
+
+	&cmp	(&LB("edx"),0x17);
+	&jne	(&label("16_79_x86"));
+
+	&mov	("esi",&DWP(8*(9+16+80)+0,"esp"));# ctx
+	&mov	("edi",&DWP(8*(9+16+80)+4,"esp"));# inp
+    for($i=0;$i<4;$i++) {
+	&mov	("eax",&DWP($i*16+0,"esi"));
+	&mov	("ebx",&DWP($i*16+4,"esi"));
+	&mov	("ecx",&DWP($i*16+8,"esi"));
+	&mov	("edx",&DWP($i*16+12,"esi"));
+	&add	("eax",&DWP(8+($i*16)+0,"esp"));
+	&adc	("ebx",&DWP(8+($i*16)+4,"esp"));
+	&mov	(&DWP($i*16+0,"esi"),"eax");
+	&mov	(&DWP($i*16+4,"esi"),"ebx");
+	&add	("ecx",&DWP(8+($i*16)+8,"esp"));
+	&adc	("edx",&DWP(8+($i*16)+12,"esp"));
+	&mov	(&DWP($i*16+8,"esi"),"ecx");
+	&mov	(&DWP($i*16+12,"esi"),"edx");
+    }
+	&add	("esp",8*(9+16+80));		# destroy frame
+	&sub	($K512,8*80);			# rewind K
+
+	&cmp	("edi",&DWP(8,"esp"));		# are we done yet?
+	&jb	(&label("loop_x86"));
+
+	&mov	("esp",&DWP(12,"esp"));		# restore sp
+&function_end_A();
+
+&set_label("K512",64);	# Yes! I keep it in the code segment!
+	&data_word(0xd728ae22,0x428a2f98);	# u64
+	&data_word(0x23ef65cd,0x71374491);	# u64
+	&data_word(0xec4d3b2f,0xb5c0fbcf);	# u64
+	&data_word(0x8189dbbc,0xe9b5dba5);	# u64
+	&data_word(0xf348b538,0x3956c25b);	# u64
+	&data_word(0xb605d019,0x59f111f1);	# u64
+	&data_word(0xaf194f9b,0x923f82a4);	# u64
+	&data_word(0xda6d8118,0xab1c5ed5);	# u64
+	&data_word(0xa3030242,0xd807aa98);	# u64
+	&data_word(0x45706fbe,0x12835b01);	# u64
+	&data_word(0x4ee4b28c,0x243185be);	# u64
+	&data_word(0xd5ffb4e2,0x550c7dc3);	# u64
+	&data_word(0xf27b896f,0x72be5d74);	# u64
+	&data_word(0x3b1696b1,0x80deb1fe);	# u64
+	&data_word(0x25c71235,0x9bdc06a7);	# u64
+	&data_word(0xcf692694,0xc19bf174);	# u64
+	&data_word(0x9ef14ad2,0xe49b69c1);	# u64
+	&data_word(0x384f25e3,0xefbe4786);	# u64
+	&data_word(0x8b8cd5b5,0x0fc19dc6);	# u64
+	&data_word(0x77ac9c65,0x240ca1cc);	# u64
+	&data_word(0x592b0275,0x2de92c6f);	# u64
+	&data_word(0x6ea6e483,0x4a7484aa);	# u64
+	&data_word(0xbd41fbd4,0x5cb0a9dc);	# u64
+	&data_word(0x831153b5,0x76f988da);	# u64
+	&data_word(0xee66dfab,0x983e5152);	# u64
+	&data_word(0x2db43210,0xa831c66d);	# u64
+	&data_word(0x98fb213f,0xb00327c8);	# u64
+	&data_word(0xbeef0ee4,0xbf597fc7);	# u64
+	&data_word(0x3da88fc2,0xc6e00bf3);	# u64
+	&data_word(0x930aa725,0xd5a79147);	# u64
+	&data_word(0xe003826f,0x06ca6351);	# u64
+	&data_word(0x0a0e6e70,0x14292967);	# u64
+	&data_word(0x46d22ffc,0x27b70a85);	# u64
+	&data_word(0x5c26c926,0x2e1b2138);	# u64
+	&data_word(0x5ac42aed,0x4d2c6dfc);	# u64
+	&data_word(0x9d95b3df,0x53380d13);	# u64
+	&data_word(0x8baf63de,0x650a7354);	# u64
+	&data_word(0x3c77b2a8,0x766a0abb);	# u64
+	&data_word(0x47edaee6,0x81c2c92e);	# u64
+	&data_word(0x1482353b,0x92722c85);	# u64
+	&data_word(0x4cf10364,0xa2bfe8a1);	# u64
+	&data_word(0xbc423001,0xa81a664b);	# u64
+	&data_word(0xd0f89791,0xc24b8b70);	# u64
+	&data_word(0x0654be30,0xc76c51a3);	# u64
+	&data_word(0xd6ef5218,0xd192e819);	# u64
+	&data_word(0x5565a910,0xd6990624);	# u64
+	&data_word(0x5771202a,0xf40e3585);	# u64
+	&data_word(0x32bbd1b8,0x106aa070);	# u64
+	&data_word(0xb8d2d0c8,0x19a4c116);	# u64
+	&data_word(0x5141ab53,0x1e376c08);	# u64
+	&data_word(0xdf8eeb99,0x2748774c);	# u64
+	&data_word(0xe19b48a8,0x34b0bcb5);	# u64
+	&data_word(0xc5c95a63,0x391c0cb3);	# u64
+	&data_word(0xe3418acb,0x4ed8aa4a);	# u64
+	&data_word(0x7763e373,0x5b9cca4f);	# u64
+	&data_word(0xd6b2b8a3,0x682e6ff3);	# u64
+	&data_word(0x5defb2fc,0x748f82ee);	# u64
+	&data_word(0x43172f60,0x78a5636f);	# u64
+	&data_word(0xa1f0ab72,0x84c87814);	# u64
+	&data_word(0x1a6439ec,0x8cc70208);	# u64
+	&data_word(0x23631e28,0x90befffa);	# u64
+	&data_word(0xde82bde9,0xa4506ceb);	# u64
+	&data_word(0xb2c67915,0xbef9a3f7);	# u64
+	&data_word(0xe372532b,0xc67178f2);	# u64
+	&data_word(0xea26619c,0xca273ece);	# u64
+	&data_word(0x21c0c207,0xd186b8c7);	# u64
+	&data_word(0xcde0eb1e,0xeada7dd6);	# u64
+	&data_word(0xee6ed178,0xf57d4f7f);	# u64
+	&data_word(0x72176fba,0x06f067aa);	# u64
+	&data_word(0xa2c898a6,0x0a637dc5);	# u64
+	&data_word(0xbef90dae,0x113f9804);	# u64
+	&data_word(0x131c471b,0x1b710b35);	# u64
+	&data_word(0x23047d84,0x28db77f5);	# u64
+	&data_word(0x40c72493,0x32caab7b);	# u64
+	&data_word(0x15c9bebc,0x3c9ebe0a);	# u64
+	&data_word(0x9c100d4c,0x431d67c4);	# u64
+	&data_word(0xcb3e42b6,0x4cc5d4be);	# u64
+	&data_word(0xfc657e2a,0x597f299c);	# u64
+	&data_word(0x3ad6faec,0x5fcb6fab);	# u64
+	&data_word(0x4a475817,0x6c44198c);	# u64
+
+	&data_word(0x04050607,0x00010203);	# byte swap
+	&data_word(0x0c0d0e0f,0x08090a0b);	# mask
+&function_end_B("sha512_block_data_order");
+&asciz("SHA512 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
+
+close STDOUT;
diff --git a/crypto/fipsmodule/sha/asm/sha512-armv4.pl b/crypto/fipsmodule/sha/asm/sha512-armv4.pl
new file mode 100644
index 0000000..fd92f7a
--- /dev/null
+++ b/crypto/fipsmodule/sha/asm/sha512-armv4.pl
@@ -0,0 +1,666 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Permission to use under GPL terms is granted.
+# ====================================================================
+
+# SHA512 block procedure for ARMv4. September 2007.
+
+# This code is ~4.5 (four and a half) times faster than code generated
+# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
+# Xscale PXA250 core].
+#
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 6% improvement on
+# Cortex A8 core and ~40 cycles per processed byte.
+
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 7%
+# improvement on Cortex A8 core and ~38 cycles per byte.
+
+# March 2011.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process
+# one byte in 23.3 cycles or ~60% faster than integer-only code.
+
+# August 2012.
+#
+# Improve NEON performance by 12% on Snapdragon S4. In absolute
+# terms it's 22.6 cycles per byte, which is a disappointing result.
+# Technical writers asserted that the 3-way S4 pipeline can sustain
+# multiple NEON instructions per cycle, but dual NEON issue could
+# not be observed; see http://www.openssl.org/~appro/Snapdragon-S4.html
+# for further details. As a side note, Cortex-A15 processes one byte
+# in 16 cycles.
+
+# Byte order [in]dependence. =========================================
+#
+# Originally the caller was expected to maintain a specific *dword*
+# order in h[0-7], namely with the most significant dword at the
+# *lower* address, which was reflected in the two parameters below
+# as 0 and 4. Now the caller is expected to maintain native byte
+# order for whole 64-bit values.
+$hi="HI";
+$lo="LO";
+# ====================================================================
+
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
+
+$ctx="r0";	# parameter block
+$inp="r1";
+$len="r2";
+
+$Tlo="r3";
+$Thi="r4";
+$Alo="r5";
+$Ahi="r6";
+$Elo="r7";
+$Ehi="r8";
+$t0="r9";
+$t1="r10";
+$t2="r11";
+$t3="r12";
+############	r13 is stack pointer
+$Ktbl="r14";
+############	r15 is program counter
+
+$Aoff=8*0;
+$Boff=8*1;
+$Coff=8*2;
+$Doff=8*3;
+$Eoff=8*4;
+$Foff=8*5;
+$Goff=8*6;
+$Hoff=8*7;
+$Xoff=8*8;
+
+sub BODY_00_15() {
+my $magic = shift;
+$code.=<<___;
+	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
+	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
+	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
+	mov	$t0,$Elo,lsr#14
+	str	$Tlo,[sp,#$Xoff+0]
+	mov	$t1,$Ehi,lsr#14
+	str	$Thi,[sp,#$Xoff+4]
+	eor	$t0,$t0,$Ehi,lsl#18
+	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
+	eor	$t1,$t1,$Elo,lsl#18
+	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
+	eor	$t0,$t0,$Elo,lsr#18
+	eor	$t1,$t1,$Ehi,lsr#18
+	eor	$t0,$t0,$Ehi,lsl#14
+	eor	$t1,$t1,$Elo,lsl#14
+	eor	$t0,$t0,$Ehi,lsr#9
+	eor	$t1,$t1,$Elo,lsr#9
+	eor	$t0,$t0,$Elo,lsl#23
+	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
+	adds	$Tlo,$Tlo,$t0
+	ldr	$t0,[sp,#$Foff+0]	@ f.lo
+	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
+	ldr	$t1,[sp,#$Foff+4]	@ f.hi
+	adds	$Tlo,$Tlo,$t2
+	ldr	$t2,[sp,#$Goff+0]	@ g.lo
+	adc	$Thi,$Thi,$t3		@ T += h
+	ldr	$t3,[sp,#$Goff+4]	@ g.hi
+
+	eor	$t0,$t0,$t2
+	str	$Elo,[sp,#$Eoff+0]
+	eor	$t1,$t1,$t3
+	str	$Ehi,[sp,#$Eoff+4]
+	and	$t0,$t0,$Elo
+	str	$Alo,[sp,#$Aoff+0]
+	and	$t1,$t1,$Ehi
+	str	$Ahi,[sp,#$Aoff+4]
+	eor	$t0,$t0,$t2
+	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
+	eor	$t1,$t1,$t3		@ Ch(e,f,g)
+	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi
+
+	adds	$Tlo,$Tlo,$t0
+	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
+	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
+	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
+	adds	$Tlo,$Tlo,$t2
+	and	$t0,$t2,#0xff
+	adc	$Thi,$Thi,$t3		@ T += K[i]
+	adds	$Elo,$Elo,$Tlo
+	ldr	$t2,[sp,#$Boff+0]	@ b.lo
+	adc	$Ehi,$Ehi,$Thi		@ d += T
+	teq	$t0,#$magic
+
+	ldr	$t3,[sp,#$Coff+0]	@ c.lo
+#if __ARM_ARCH__>=7
+	it	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	orreq	$Ktbl,$Ktbl,#1
+	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
+	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
+	mov	$t0,$Alo,lsr#28
+	mov	$t1,$Ahi,lsr#28
+	eor	$t0,$t0,$Ahi,lsl#4
+	eor	$t1,$t1,$Alo,lsl#4
+	eor	$t0,$t0,$Ahi,lsr#2
+	eor	$t1,$t1,$Alo,lsr#2
+	eor	$t0,$t0,$Alo,lsl#30
+	eor	$t1,$t1,$Ahi,lsl#30
+	eor	$t0,$t0,$Ahi,lsr#7
+	eor	$t1,$t1,$Alo,lsr#7
+	eor	$t0,$t0,$Alo,lsl#25
+	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
+	adds	$Tlo,$Tlo,$t0
+	and	$t0,$Alo,$t2
+	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)
+
+	ldr	$t1,[sp,#$Boff+4]	@ b.hi
+	orr	$Alo,$Alo,$t2
+	ldr	$t2,[sp,#$Coff+4]	@ c.hi
+	and	$Alo,$Alo,$t3
+	and	$t3,$Ahi,$t1
+	orr	$Ahi,$Ahi,$t1
+	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
+	and	$Ahi,$Ahi,$t2
+	adds	$Alo,$Alo,$Tlo
+	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
+	sub	sp,sp,#8
+	adc	$Ahi,$Ahi,$Thi		@ h += T
+	tst	$Ktbl,#1
+	add	$Ktbl,$Ktbl,#8
+___
+}
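+# A reference sketch (hypothetical helper, never invoked when generating
+# code) of how the 64-bit Sigma1 rotates decompose onto 32-bit halves,
+# matching the LO/HI comment lines inside BODY_00_15; e.g. ROTR(x,41)
+# contributes hi>>9^lo<<23 to the low word because 41 = 32 + 9 swaps the
+# halves before rotating by 9.
+sub _Sigma1_halves_ref {
+	my ($h32,$l32) = @_;	# high and low 32-bit halves of e
+	my $m = 0xffffffff;
+	my $rlo = (($l32>>14 ^ $h32<<18) ^ ($l32>>18 ^ $h32<<14)
+		 ^ ($h32>>9  ^ $l32<<23)) & $m;
+	my $rhi = (($h32>>14 ^ $l32<<18) ^ ($h32>>18 ^ $l32<<14)
+		 ^ ($l32>>9  ^ $h32<<23)) & $m;
+	return ($rhi,$rlo);
+}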
+$code=<<___;
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
+# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+#endif
+
+#ifdef __ARMEL__
+# define LO 0
+# define HI 4
+# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
+#else
+# define HI 0
+# define LO 4
+# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
+#endif
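+@ e.g. (illustrative) K[0] = 0x428a2f98d728ae22 is emitted by WORD64 as
+@ ".word 0xd728ae22,0x428a2f98" on little-endian targets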
+
+.text
+#if __ARM_ARCH__<7 || defined(__APPLE__)
+.code	32
+#else
+.syntax unified
+# ifdef __thumb2__
+#  define adrl adr
+.thumb
+# else
+.code   32
+# endif
+#endif
+
+.type	K512,%object
+.align	5
+K512:
+WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
+WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
+WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
+WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
+WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
+WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
+WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
+WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
+WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
+WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
+WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
+WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
+WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
+WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
+WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
+WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
+WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
+WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
+WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
+WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
+WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
+WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
+WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
+WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
+WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
+WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
+WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
+WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
+WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
+WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
+WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
+WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
+WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
+WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
+WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
+WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
+WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
+WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
+WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
+WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
+.size	K512,.-K512
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-.Lsha512_block_data_order
+.skip	32-4
+#else
+.skip	32
+#endif
+
+.global	sha512_block_data_order
+.type	sha512_block_data_order,%function
+sha512_block_data_order:
+.Lsha512_block_data_order:
+#if __ARM_ARCH__<7
+	sub	r3,pc,#8		@ sha512_block_data_order
+#else
+	adr	r3,sha512_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+	ldr	r12,.LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+#ifdef	__APPLE__
+	ldr	r12,[r12]
+#endif
+	tst	r12,#ARMV7_NEON
+	bne	.LNEON
+#endif
+	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
+	stmdb	sp!,{r4-r12,lr}
+	sub	$Ktbl,r3,#672		@ K512
+	sub	sp,sp,#9*8
+
+	ldr	$Elo,[$ctx,#$Eoff+$lo]
+	ldr	$Ehi,[$ctx,#$Eoff+$hi]
+	ldr	$t0, [$ctx,#$Goff+$lo]
+	ldr	$t1, [$ctx,#$Goff+$hi]
+	ldr	$t2, [$ctx,#$Hoff+$lo]
+	ldr	$t3, [$ctx,#$Hoff+$hi]
+.Loop:
+	str	$t0, [sp,#$Goff+0]
+	str	$t1, [sp,#$Goff+4]
+	str	$t2, [sp,#$Hoff+0]
+	str	$t3, [sp,#$Hoff+4]
+	ldr	$Alo,[$ctx,#$Aoff+$lo]
+	ldr	$Ahi,[$ctx,#$Aoff+$hi]
+	ldr	$Tlo,[$ctx,#$Boff+$lo]
+	ldr	$Thi,[$ctx,#$Boff+$hi]
+	ldr	$t0, [$ctx,#$Coff+$lo]
+	ldr	$t1, [$ctx,#$Coff+$hi]
+	ldr	$t2, [$ctx,#$Doff+$lo]
+	ldr	$t3, [$ctx,#$Doff+$hi]
+	str	$Tlo,[sp,#$Boff+0]
+	str	$Thi,[sp,#$Boff+4]
+	str	$t0, [sp,#$Coff+0]
+	str	$t1, [sp,#$Coff+4]
+	str	$t2, [sp,#$Doff+0]
+	str	$t3, [sp,#$Doff+4]
+	ldr	$Tlo,[$ctx,#$Foff+$lo]
+	ldr	$Thi,[$ctx,#$Foff+$hi]
+	str	$Tlo,[sp,#$Foff+0]
+	str	$Thi,[sp,#$Foff+4]
+
+.L00_15:
+#if __ARM_ARCH__<7
+	ldrb	$Tlo,[$inp,#7]
+	ldrb	$t0, [$inp,#6]
+	ldrb	$t1, [$inp,#5]
+	ldrb	$t2, [$inp,#4]
+	ldrb	$Thi,[$inp,#3]
+	ldrb	$t3, [$inp,#2]
+	orr	$Tlo,$Tlo,$t0,lsl#8
+	ldrb	$t0, [$inp,#1]
+	orr	$Tlo,$Tlo,$t1,lsl#16
+	ldrb	$t1, [$inp],#8
+	orr	$Tlo,$Tlo,$t2,lsl#24
+	orr	$Thi,$Thi,$t3,lsl#8
+	orr	$Thi,$Thi,$t0,lsl#16
+	orr	$Thi,$Thi,$t1,lsl#24
+#else
+	ldr	$Tlo,[$inp,#4]
+	ldr	$Thi,[$inp],#8
+#ifdef __ARMEL__
+	rev	$Tlo,$Tlo
+	rev	$Thi,$Thi
+#endif
+#endif
+___
+	&BODY_00_15(0x94);
+$code.=<<___;
+	tst	$Ktbl,#1
+	beq	.L00_15
+	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
+	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
+	bic	$Ktbl,$Ktbl,#1
+.L16_79:
+	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
+	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
+	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
+	mov	$Tlo,$t0,lsr#1
+	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
+	mov	$Thi,$t1,lsr#1
+	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
+	eor	$Tlo,$Tlo,$t1,lsl#31
+	eor	$Thi,$Thi,$t0,lsl#31
+	eor	$Tlo,$Tlo,$t0,lsr#8
+	eor	$Thi,$Thi,$t1,lsr#8
+	eor	$Tlo,$Tlo,$t1,lsl#24
+	eor	$Thi,$Thi,$t0,lsl#24
+	eor	$Tlo,$Tlo,$t0,lsr#7
+	eor	$Thi,$Thi,$t1,lsr#7
+	eor	$Tlo,$Tlo,$t1,lsl#25
+
+	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
+	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
+	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
+	mov	$t0,$t2,lsr#19
+	mov	$t1,$t3,lsr#19
+	eor	$t0,$t0,$t3,lsl#13
+	eor	$t1,$t1,$t2,lsl#13
+	eor	$t0,$t0,$t3,lsr#29
+	eor	$t1,$t1,$t2,lsr#29
+	eor	$t0,$t0,$t2,lsl#3
+	eor	$t1,$t1,$t3,lsl#3
+	eor	$t0,$t0,$t2,lsr#6
+	eor	$t1,$t1,$t3,lsr#6
+	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
+	eor	$t0,$t0,$t3,lsl#26
+
+	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
+	adds	$Tlo,$Tlo,$t0
+	ldr	$t0,[sp,#`$Xoff+8*16`+0]
+	adc	$Thi,$Thi,$t1
+
+	ldr	$t1,[sp,#`$Xoff+8*16`+4]
+	adds	$Tlo,$Tlo,$t2
+	adc	$Thi,$Thi,$t3
+	adds	$Tlo,$Tlo,$t0
+	adc	$Thi,$Thi,$t1
+___
+	&BODY_00_15(0x17);
+$code.=<<___;
+#if __ARM_ARCH__>=7
+	ittt	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
+	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
+	beq	.L16_79
+	bic	$Ktbl,$Ktbl,#1
+
+	ldr	$Tlo,[sp,#$Boff+0]
+	ldr	$Thi,[sp,#$Boff+4]
+	ldr	$t0, [$ctx,#$Aoff+$lo]
+	ldr	$t1, [$ctx,#$Aoff+$hi]
+	ldr	$t2, [$ctx,#$Boff+$lo]
+	ldr	$t3, [$ctx,#$Boff+$hi]
+	adds	$t0,$Alo,$t0
+	str	$t0, [$ctx,#$Aoff+$lo]
+	adc	$t1,$Ahi,$t1
+	str	$t1, [$ctx,#$Aoff+$hi]
+	adds	$t2,$Tlo,$t2
+	str	$t2, [$ctx,#$Boff+$lo]
+	adc	$t3,$Thi,$t3
+	str	$t3, [$ctx,#$Boff+$hi]
+
+	ldr	$Alo,[sp,#$Coff+0]
+	ldr	$Ahi,[sp,#$Coff+4]
+	ldr	$Tlo,[sp,#$Doff+0]
+	ldr	$Thi,[sp,#$Doff+4]
+	ldr	$t0, [$ctx,#$Coff+$lo]
+	ldr	$t1, [$ctx,#$Coff+$hi]
+	ldr	$t2, [$ctx,#$Doff+$lo]
+	ldr	$t3, [$ctx,#$Doff+$hi]
+	adds	$t0,$Alo,$t0
+	str	$t0, [$ctx,#$Coff+$lo]
+	adc	$t1,$Ahi,$t1
+	str	$t1, [$ctx,#$Coff+$hi]
+	adds	$t2,$Tlo,$t2
+	str	$t2, [$ctx,#$Doff+$lo]
+	adc	$t3,$Thi,$t3
+	str	$t3, [$ctx,#$Doff+$hi]
+
+	ldr	$Tlo,[sp,#$Foff+0]
+	ldr	$Thi,[sp,#$Foff+4]
+	ldr	$t0, [$ctx,#$Eoff+$lo]
+	ldr	$t1, [$ctx,#$Eoff+$hi]
+	ldr	$t2, [$ctx,#$Foff+$lo]
+	ldr	$t3, [$ctx,#$Foff+$hi]
+	adds	$Elo,$Elo,$t0
+	str	$Elo,[$ctx,#$Eoff+$lo]
+	adc	$Ehi,$Ehi,$t1
+	str	$Ehi,[$ctx,#$Eoff+$hi]
+	adds	$t2,$Tlo,$t2
+	str	$t2, [$ctx,#$Foff+$lo]
+	adc	$t3,$Thi,$t3
+	str	$t3, [$ctx,#$Foff+$hi]
+
+	ldr	$Alo,[sp,#$Goff+0]
+	ldr	$Ahi,[sp,#$Goff+4]
+	ldr	$Tlo,[sp,#$Hoff+0]
+	ldr	$Thi,[sp,#$Hoff+4]
+	ldr	$t0, [$ctx,#$Goff+$lo]
+	ldr	$t1, [$ctx,#$Goff+$hi]
+	ldr	$t2, [$ctx,#$Hoff+$lo]
+	ldr	$t3, [$ctx,#$Hoff+$hi]
+	adds	$t0,$Alo,$t0
+	str	$t0, [$ctx,#$Goff+$lo]
+	adc	$t1,$Ahi,$t1
+	str	$t1, [$ctx,#$Goff+$hi]
+	adds	$t2,$Tlo,$t2
+	str	$t2, [$ctx,#$Hoff+$lo]
+	adc	$t3,$Thi,$t3
+	str	$t3, [$ctx,#$Hoff+$hi]
+
+	add	sp,sp,#640
+	sub	$Ktbl,$Ktbl,#640
+
+	teq	$inp,$len
+	bne	.Loop
+
+	add	sp,sp,#8*9		@ destroy frame
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r12,pc}
+#else
+	ldmia	sp!,{r4-r12,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	sha512_block_data_order,.-sha512_block_data_order
+___
+
+{
+my @Sigma0=(28,34,39);
+my @Sigma1=(14,18,41);
+my @sigma0=(1, 8, 7);
+my @sigma1=(19,61,6);
+
+my $Ktbl="r3";
+my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch
+
+my @X=map("d$_",(0..15));
+my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
+
+sub NEON_00_15() {
+my $i=shift;
+my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps
+
+$code.=<<___ if ($i<16 || $i&1);
+	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
+#if $i<16
+	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
+#endif
+	vshr.u64	$t1,$e,#@Sigma1[1]
+#if $i>0
+	 vadd.i64	$a,$Maj			@ h+=Maj from the past
+#endif
+	vshr.u64	$t2,$e,#@Sigma1[2]
+___
+$code.=<<___;
+	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
+	vsli.64		$t0,$e,#`64-@Sigma1[0]`
+	vsli.64		$t1,$e,#`64-@Sigma1[1]`
+	vmov		$Ch,$e
+	vsli.64		$t2,$e,#`64-@Sigma1[2]`
+#if $i<16 && defined(__ARMEL__)
+	vrev64.8	@X[$i],@X[$i]
+#endif
+	veor		$t1,$t0
+	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
+	vshr.u64	$t0,$a,#@Sigma0[0]
+	veor		$t2,$t1			@ Sigma1(e)
+	vadd.i64	$T1,$Ch,$h
+	vshr.u64	$t1,$a,#@Sigma0[1]
+	vsli.64		$t0,$a,#`64-@Sigma0[0]`
+	vadd.i64	$T1,$t2
+	vshr.u64	$t2,$a,#@Sigma0[2]
+	vadd.i64	$K,@X[$i%16]
+	vsli.64		$t1,$a,#`64-@Sigma0[1]`
+	veor		$Maj,$a,$b
+	vsli.64		$t2,$a,#`64-@Sigma0[2]`
+	veor		$h,$t0,$t1
+	vadd.i64	$T1,$K
+	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
+	veor		$h,$t2			@ Sigma0(a)
+	vadd.i64	$d,$T1
+	vadd.i64	$Maj,$T1
+	@ vadd.i64	$h,$Maj
+___
+}
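+# A sketch (hypothetical helper, never invoked) of why each vshr.u64 +
+# vsli.64 pair above forms a 64-bit rotate: vshr writes x>>n and vsli
+# then inserts x<<(64-n) into the same register, i.e. ROTR(x,n).
+sub _rotr64_ref {
+	my ($x,$n) = @_;	# assumes a perl with 64-bit integers
+	return (($x >> $n) | ($x << (64-$n))) & 0xffffffffffffffff;
+}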
+
+sub NEON_16_79() {
+my $i=shift;
+
+if ($i&1)	{ &NEON_00_15($i,@_); return; }
+
+# 2x-vectorized, therefore runs every 2nd round
+my @X=map("q$_",(0..7));			# view @X as 128-bit vector
+my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
+my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
+my $e=@_[4];					# $e from NEON_00_15
+$i /= 2;
+$code.=<<___;
+	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
+	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
+	 vadd.i64	@_[0],d30			@ h+=Maj from the past
+	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
+	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
+	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
+	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
+	veor		$s1,$t0
+	vshr.u64	$t0,$s0,#@sigma0[0]
+	veor		$s1,$t1				@ sigma1(X[i+14])
+	vshr.u64	$t1,$s0,#@sigma0[1]
+	vadd.i64	@X[$i%8],$s1
+	vshr.u64	$s1,$s0,#@sigma0[2]
+	vsli.64		$t0,$s0,#`64-@sigma0[0]`
+	vsli.64		$t1,$s0,#`64-@sigma0[1]`
+	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
+	veor		$s1,$t0
+	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
+	vadd.i64	@X[$i%8],$s0
+	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
+	veor		$s1,$t1				@ sigma0(X[i+1])
+	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
+	vadd.i64	@X[$i%8],$s1
+___
+	&NEON_00_15(2*$i,@_);
+}
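+# Comment-only sketch of the recurrence NEON_16_79 computes, two 64-bit
+# lanes per call: X[i] += sigma0(X[i+1]) + X[i+9] + sigma1(X[i+14]),
+# i.e. the standard W[i] = W[i-16] + sigma0(W[i-15]) + W[i-7] +
+# sigma1(W[i-2]) with indices reduced into the 16-entry window.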
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.global	sha512_block_data_order_neon
+.type	sha512_block_data_order_neon,%function
+.align	4
+sha512_block_data_order_neon:
+.LNEON:
+	dmb				@ errata #451034 on early Cortex A8
+	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
+	adr	$Ktbl,K512
+	VFP_ABI_PUSH
+	vldmia	$ctx,{$A-$H}		@ load context
+.Loop_neon:
+___
+for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	mov		$cnt,#4
+.L16_79_neon:
+	subs		$cnt,#1
+___
+for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	bne		.L16_79_neon
+
+	 vadd.i64	$A,d30		@ h+=Maj from the past
+	vldmia		$ctx,{d24-d31}	@ load context to temp
+	vadd.i64	q8,q12		@ vectorized accumulate
+	vadd.i64	q9,q13
+	vadd.i64	q10,q14
+	vadd.i64	q11,q15
+	vstmia		$ctx,{$A-$H}	@ save context
+	teq		$inp,$len
+	sub		$Ktbl,#640	@ rewind K512
+	bne		.Loop_neon
+
+	VFP_ABI_POP
+	ret				@ bx lr
+.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
+#endif
+___
+}
+$code.=<<___;
+.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align	2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm	OPENSSL_armcap_P,4,4
+.hidden	OPENSSL_armcap_P
+#endif
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
+$code =~ s/\bret\b/bx	lr/gm;
+
+open SELF,$0;
+while(<SELF>) {
+	next if (/^#!/);
+	last if (!s/^#/@/ and !/^$/);
+	print;
+}
+close SELF;
+
+print $code;
+close STDOUT; # enforce flush
diff --git a/crypto/fipsmodule/sha/asm/sha512-armv8.pl b/crypto/fipsmodule/sha/asm/sha512-armv8.pl
new file mode 100644
index 0000000..b4e558a
--- /dev/null
+++ b/crypto/fipsmodule/sha/asm/sha512-armv8.pl
@@ -0,0 +1,430 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA256/512 for ARMv8.
+#
+# Performance in cycles per processed byte and improvement coefficient
+# over code generated with "default" compiler:
+#
+#		SHA256-hw	SHA256(*)	SHA512
+# Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
+# Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
+# Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
+# Denver	2.01		10.5 (+26%)	6.70 (+8%)
+# X-Gene			20.0 (+100%)	12.8 (+300%(***))
+#
+# (*)	Software SHA256 results are of lesser relevance, presented
+#	mostly for informational purposes.
+# (**)	The result is a trade-off: it's possible to improve it by
+#	10% (or by 1 cycle per round), but at the cost of 20% loss
+#	on Cortex-A53 (or by 4 cycles per round).
+# (***)	Super-impressive coefficients over gcc-generated code are an
+#	indication of some compiler "pathology"; most notably, code
+#	generated with -mgeneral-regs-only is significantly faster,
+#	and then the gap is only 40-90%.
+
+$flavour=shift;
+$output=shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+if ($output =~ /512/) {
+	$BITS=512;
+	$SZ=8;
+	@Sigma0=(28,34,39);
+	@Sigma1=(14,18,41);
+	@sigma0=(1,  8, 7);
+	@sigma1=(19,61, 6);
+	$rounds=80;
+	$reg_t="x";
+} else {
+	$BITS=256;
+	$SZ=4;
+	@Sigma0=( 2,13,22);
+	@Sigma1=( 6,11,25);
+	@sigma0=( 7,18, 3);
+	@sigma1=(17,19,10);
+	$rounds=64;
+	$reg_t="w";
+}
+
+$func="sha${BITS}_block_data_order";
+
+($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
+
+@X=map("$reg_t$_",(3..15,0..2));
+@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
+($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
+
+sub BODY_00_xx {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my $j=($i+1)&15;
+my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
+   $T0=@X[$i+3] if ($i<11);
+
+$code.=<<___	if ($i<16);
+#ifndef	__ARMEB__
+	rev	@X[$i],@X[$i]			// $i
+#endif
+___
+$code.=<<___	if ($i<13 && ($i&1));
+	ldp	@X[$i+1],@X[$i+2],[$inp],#2*$SZ
+___
+$code.=<<___	if ($i==13);
+	ldp	@X[14],@X[15],[$inp]
+___
+$code.=<<___	if ($i>=14);
+	ldr	@X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
+___
+$code.=<<___	if ($i>0 && $i<16);
+	add	$a,$a,$t1			// h+=Sigma0(a)
+___
+$code.=<<___	if ($i>=11);
+	str	@X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
+___
+# While ARMv8 specifies merged rotate-and-logical operations such as
+# 'eor x,y,z,ror#n', they were found to negatively affect performance
+# on Apple A7. The reason seems to be that a merged instruction
+# requires even 'y' to be available earlier. This means that such a
+# merged instruction is not necessarily the best choice on the
+# critical path... On the other hand, Cortex-A5x handles merged
+# instructions much better than a disjoint rotate and logical pair...
+# See the (**) footnote above.
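+# For example (illustrative): "eor x0,x1,x2,ror#41" is the merged form
+# of "ror x3,x2,#41; eor x0,x1,x3", and the merged form cannot issue
+# until both x1 and x2 are ready, which is the hazard described above.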
+$code.=<<___	if ($i<15);
+	ror	$t0,$e,#$Sigma1[0]
+	add	$h,$h,$t2			// h+=K[i]
+	eor	$T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
+	and	$t1,$f,$e
+	bic	$t2,$g,$e
+	add	$h,$h,@X[$i&15]			// h+=X[i]
+	orr	$t1,$t1,$t2			// Ch(e,f,g)
+	eor	$t2,$a,$b			// a^b, b^c in next round
+	eor	$t0,$t0,$T0,ror#$Sigma1[1]	// Sigma1(e)
+	ror	$T0,$a,#$Sigma0[0]
+	add	$h,$h,$t1			// h+=Ch(e,f,g)
+	eor	$t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
+	add	$h,$h,$t0			// h+=Sigma1(e)
+	and	$t3,$t3,$t2			// (b^c)&=(a^b)
+	add	$d,$d,$h			// d+=h
+	eor	$t3,$t3,$b			// Maj(a,b,c)
+	eor	$t1,$T0,$t1,ror#$Sigma0[1]	// Sigma0(a)
+	add	$h,$h,$t3			// h+=Maj(a,b,c)
+	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round
+	//add	$h,$h,$t1			// h+=Sigma0(a)
+___
+$code.=<<___	if ($i>=15);
+	ror	$t0,$e,#$Sigma1[0]
+	add	$h,$h,$t2			// h+=K[i]
+	ror	$T1,@X[($j+1)&15],#$sigma0[0]
+	and	$t1,$f,$e
+	ror	$T2,@X[($j+14)&15],#$sigma1[0]
+	bic	$t2,$g,$e
+	ror	$T0,$a,#$Sigma0[0]
+	add	$h,$h,@X[$i&15]			// h+=X[i]
+	eor	$t0,$t0,$e,ror#$Sigma1[1]
+	eor	$T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
+	orr	$t1,$t1,$t2			// Ch(e,f,g)
+	eor	$t2,$a,$b			// a^b, b^c in next round
+	eor	$t0,$t0,$e,ror#$Sigma1[2]	// Sigma1(e)
+	eor	$T0,$T0,$a,ror#$Sigma0[1]
+	add	$h,$h,$t1			// h+=Ch(e,f,g)
+	and	$t3,$t3,$t2			// (b^c)&=(a^b)
+	eor	$T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
+	eor	$T1,$T1,@X[($j+1)&15],lsr#$sigma0[2]	// sigma0(X[i+1])
+	add	$h,$h,$t0			// h+=Sigma1(e)
+	eor	$t3,$t3,$b			// Maj(a,b,c)
+	eor	$t1,$T0,$a,ror#$Sigma0[2]	// Sigma0(a)
+	eor	$T2,$T2,@X[($j+14)&15],lsr#$sigma1[2]	// sigma1(X[i+14])
+	add	@X[$j],@X[$j],@X[($j+9)&15]
+	add	$d,$d,$h			// d+=h
+	add	$h,$h,$t3			// h+=Maj(a,b,c)
+	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round
+	add	@X[$j],@X[$j],$T1
+	add	$h,$h,$t1			// h+=Sigma0(a)
+	add	@X[$j],@X[$j],$T2
+___
+	($t2,$t3)=($t3,$t2);
+}
+
+$code.=<<___;
+#include <openssl/arm_arch.h>
+
+.text
+
+.extern	OPENSSL_armcap_P
+.globl	$func
+.type	$func,%function
+.align	6
+$func:
+___
+$code.=<<___	if ($SZ==4);
+	ldr	x16,.LOPENSSL_armcap_P
+	adr	x17,.LOPENSSL_armcap_P
+	add	x16,x16,x17
+	ldr	w16,[x16]
+	tst	w16,#ARMV8_SHA256
+	b.ne	.Lv8_entry
+___
+$code.=<<___;
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#4*$SZ
+
+	ldp	$A,$B,[$ctx]				// load context
+	ldp	$C,$D,[$ctx,#2*$SZ]
+	ldp	$E,$F,[$ctx,#4*$SZ]
+	add	$num,$inp,$num,lsl#`log(16*$SZ)/log(2)`	// end of input
+	ldp	$G,$H,[$ctx,#6*$SZ]
+	adr	$Ktbl,.LK$BITS
+	stp	$ctx,$num,[x29,#96]
+
+.Loop:
+	ldp	@X[0],@X[1],[$inp],#2*$SZ
+	ldr	$t2,[$Ktbl],#$SZ			// *K++
+	eor	$t3,$B,$C				// magic seed
+	str	$inp,[x29,#112]
+___
+for ($i=0;$i<16;$i++)	{ &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
+$code.=".Loop_16_xx:\n";
+for (;$i<32;$i++)	{ &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	cbnz	$t2,.Loop_16_xx
+
+	ldp	$ctx,$num,[x29,#96]
+	ldr	$inp,[x29,#112]
+	sub	$Ktbl,$Ktbl,#`$SZ*($rounds+1)`		// rewind
+
+	ldp	@X[0],@X[1],[$ctx]
+	ldp	@X[2],@X[3],[$ctx,#2*$SZ]
+	add	$inp,$inp,#14*$SZ			// advance input pointer
+	ldp	@X[4],@X[5],[$ctx,#4*$SZ]
+	add	$A,$A,@X[0]
+	ldp	@X[6],@X[7],[$ctx,#6*$SZ]
+	add	$B,$B,@X[1]
+	add	$C,$C,@X[2]
+	add	$D,$D,@X[3]
+	stp	$A,$B,[$ctx]
+	add	$E,$E,@X[4]
+	add	$F,$F,@X[5]
+	stp	$C,$D,[$ctx,#2*$SZ]
+	add	$G,$G,@X[6]
+	add	$H,$H,@X[7]
+	cmp	$inp,$num
+	stp	$E,$F,[$ctx,#4*$SZ]
+	stp	$G,$H,[$ctx,#6*$SZ]
+	b.ne	.Loop
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#4*$SZ
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#128
+	ret
+.size	$func,.-$func
+
+.align	6
+.type	.LK$BITS,%object
+.LK$BITS:
+___
+$code.=<<___ if ($SZ==8);
+	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	.quad	0x3956c25bf348b538,0x59f111f1b605d019
+	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	.quad	0xd807aa98a3030242,0x12835b0145706fbe
+	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+	.quad	0x06ca6351e003826f,0x142929670a0e6e70
+	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+	.quad	0x81c2c92e47edaee6,0x92722c851482353b
+	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+	.quad	0xd192e819d6ef5218,0xd69906245565a910
+	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+	.quad	0x90befffa23631e28,0xa4506cebde82bde9
+	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+	.quad	0xca273eceea26619c,0xd186b8c721c0c207
+	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+	.quad	0x113f9804bef90dae,0x1b710b35131c471b
+	.quad	0x28db77f523047d84,0x32caab7b40c72493
+	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+	.quad	0	// terminator
+___
+$code.=<<___ if ($SZ==4);
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+	.long	0	//terminator
+___
+$code.=<<___;
+.size	.LK$BITS,.-.LK$BITS
+.align	3
+.LOPENSSL_armcap_P:
+	.quad	OPENSSL_armcap_P-.
+.asciz	"SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align	2
+___
+
+if ($SZ==4) {
+my $Ktbl="x3";
+
+my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
+my @MSG=map("v$_.16b",(4..7));
+my ($W0,$W1)=("v16.4s","v17.4s");
+my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
+
+$code.=<<___;
+.type	sha256_block_armv8,%function
+.align	6
+sha256_block_armv8:
+.Lv8_entry:
+	stp		x29,x30,[sp,#-16]!
+	add		x29,sp,#0
+
+	ld1.32		{$ABCD,$EFGH},[$ctx]
+	adr		$Ktbl,.LK256
+
+.Loop_hw:
+	ld1		{@MSG[0]-@MSG[3]},[$inp],#64
+	sub		$num,$num,#1
+	ld1.32		{$W0},[$Ktbl],#16
+	rev32		@MSG[0],@MSG[0]
+	rev32		@MSG[1],@MSG[1]
+	rev32		@MSG[2],@MSG[2]
+	rev32		@MSG[3],@MSG[3]
+	orr		$ABCD_SAVE,$ABCD,$ABCD		// offload
+	orr		$EFGH_SAVE,$EFGH,$EFGH
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+	ld1.32		{$W1},[$Ktbl],#16
+	add.i32		$W0,$W0,@MSG[0]
+	sha256su0	@MSG[0],@MSG[1]
+	orr		$abcd,$ABCD,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+	sha256su1	@MSG[0],@MSG[2],@MSG[3]
+___
+	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+	ld1.32		{$W1},[$Ktbl],#16
+	add.i32		$W0,$W0,@MSG[0]
+	orr		$abcd,$ABCD,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+
+	ld1.32		{$W0},[$Ktbl],#16
+	add.i32		$W1,$W1,@MSG[1]
+	orr		$abcd,$ABCD,$ABCD
+	sha256h		$ABCD,$EFGH,$W1
+	sha256h2	$EFGH,$abcd,$W1
+
+	ld1.32		{$W1},[$Ktbl]
+	add.i32		$W0,$W0,@MSG[2]
+	sub		$Ktbl,$Ktbl,#$rounds*$SZ-16	// rewind
+	orr		$abcd,$ABCD,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+
+	add.i32		$W1,$W1,@MSG[3]
+	orr		$abcd,$ABCD,$ABCD
+	sha256h		$ABCD,$EFGH,$W1
+	sha256h2	$EFGH,$abcd,$W1
+
+	add.i32		$ABCD,$ABCD,$ABCD_SAVE
+	add.i32		$EFGH,$EFGH,$EFGH_SAVE
+
+	cbnz		$num,.Loop_hw
+
+	st1.32		{$ABCD,$EFGH},[$ctx]
+
+	ldr		x29,[sp],#16
+	ret
+.size	sha256_block_armv8,.-sha256_block_armv8
+___
+}
+
+$code.=<<___;
+.comm	OPENSSL_armcap_P,4,4
+___
+
+{   my  %opcode = (
+	"sha256h"	=> 0x5e004000,	"sha256h2"	=> 0x5e005000,
+	"sha256su0"	=> 0x5e282800,	"sha256su1"	=> 0x5e006000	);
+
+    sub unsha256 {
+	my ($mnemonic,$arg)=@_;
+
+	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
+	&&
+	sprintf ".inst\t0x%08x\t//%s %s",
+			$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
+			$mnemonic,$arg;
+    }
+}
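+# E.g. (illustrative): "sha256h v0,v1,v16" is rewritten by unsha256() as
+# ".inst 0x5e104020 //sha256h v0,v1,v16", so assemblers without SHA2
+# support can still build the generated file.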
+
+foreach(split("\n",$code)) {
+
+	s/\`([^\`]*)\`/eval($1)/geo;
+
+	s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo;
+
+	s/\.\w?32\b//o		and s/\.16b/\.4s/go;
+	m/(ld|st)1[^\[]+\[0\]/o	and s/\.4s/\.s/go;
+
+	print $_,"\n";
+}
+
+close STDOUT;
diff --git a/crypto/fipsmodule/sha/asm/sha512-x86_64.pl b/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
new file mode 100755
index 0000000..c638595
--- /dev/null
+++ b/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
@@ -0,0 +1,2393 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. Rights for redistribution and usage in source and binary
+# forms are granted according to the OpenSSL license.
+# ====================================================================
+#
+# sha256/512_block procedure for x86_64.
+#
+# 40% improvement over compiler-generated code on Opteron. On EM64T
+# sha256 was observed to run >80% faster and sha512 >40% faster. No
+# magical tricks, just a straight implementation... I really wonder
+# why gcc [even armed with inline assembler] fails to generate code
+# this fast. The only thing which is cool about this module is that
+# the very same instruction sequence is used for both SHA-256 and
+# SHA-512. In the former case the instructions operate on 32-bit
+# operands, in the latter on 64-bit ones. All I had to do was get one
+# flavor right; the other one passed the test right away:-)
+#
+# sha256_block runs in ~1005 cycles on Opteron, which gives you an
+# asymptotic performance of 64*1000/1005=63.7MBps times the CPU clock
+# frequency in GHz. sha512_block runs in ~1275 cycles, which results
+# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
+# Well, if you compare it to the IA-64 implementation, which maintains
+# X[16] in the register bank[!], tends toward 4 instructions per CPU
+# clock cycle and runs in 1003 cycles, then 1275 is a very good result
+# for the 3-way-issue Opteron pipeline with X[16] maintained in memory.
+# So *if* there is a way to improve it, *then* the only way would be to
+# try to offload the X[16] updates to the SSE unit, but that would
+# require a "deeper" loop unroll, which in turn would naturally cause a
+# size blow-up, not to mention increased complexity! And once again,
+# that is only *if* it's actually possible to noticeably improve the
+# overall ILP, instruction level parallelism, on the given CPU.
+#
+# Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
+# performance ratio of 1.5 between the 64- and 32-bit flavors [see
+# above], [currently available] EM64T CPUs are apparently far from it.
+# On the contrary, the 64-bit version, sha512_block, is ~30% *slower*
+# than the 32-bit sha256_block:-( This is presumably because 64-bit
+# shifts/rotates are apparently not atomic instructions, but are
+# implemented in microcode.
+#
+# May 2012.
+#
+# An optimization including one of Pavel Semjanov's ideas, an
+# alternative Maj, resulted in a >=5% improvement on most CPUs,
+# +20% for SHA256 and unfortunately -2% for SHA512 on P4 [which
+# nobody should care about that much].
+#
+# June 2012.
+#
+# Add SIMD code paths, see below for improvement coefficients. The
+# SSSE3 code path was not attempted for SHA512 because the estimated
+# improvement, noticeably less than 9%, is not high enough to justify
+# the effort, at least not on pre-AVX processors. [Obviously with an
+# exclusion for VIA Nano, but it has a SHA512 instruction that is
+# faster and should be used instead.] For reference, the corresponding
+# estimated upper limit for improvement for SSSE3 SHA256 is 28%. The
+# fact that higher coefficients are observed on VIA Nano and Bulldozer
+# has more to do with the specifics of their architecture [which is a
+# topic for separate discussion].
+#
+# November 2012.
+#
+# Add AVX2 code path. Two consecutive input blocks are loaded into
+# 256-bit %ymm registers, with data from the first block in the least
+# significant 128-bit halves and data from the second in the most
+# significant. The data is then processed with the same SIMD
+# instruction sequence as for AVX, but with %ymm as operands. The side
+# effect is an increased stack frame, 448 additional bytes in SHA256
+# and 1152 in SHA512, plus a 1.2KB code size increase.
+#
+# March 2014.
+#
+# Add support for Intel SHA Extensions.
+
+######################################################################
+# Current performance in cycles per processed byte (less is better):
+#
+#		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
+#
+# AMD K8	14.9	-	    -		    9.57    -
+# P4		17.3	-	    -		    30.8    -
+# Core 2	15.6	13.8(+13%)  -		    9.97    -
+# Westmere	14.8	12.3(+19%)  -		    9.58    -
+# Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
+# Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
+# Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
+# Skylake	11.4	9.03(+26%)  7.70(+48%)      7.25    5.20(+40%)
+# Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
+# VIA Nano	23.0	16.5(+39%)  -		    14.7    -
+# Atom		23.0	18.9(+22%)  -		    14.7    -
+# Silvermont	27.4	20.6(+33%)  -               17.5    -
+# Goldmont	18.9	14.3(+32%)  4.16(+350%)     12.0    -
+#
+# (*)	whichever best applies, including SHAEXT;
+# (**)	the switch from ror to shrd accounts for a fair share of the
+#	improvement;
+# (***)	execution time is fully determined by the remaining
+#	integer-only part, body_00_15; reducing the amount of SIMD
+#	instructions below a certain limit makes no difference/sense;
+#	to conserve space the SHA256 XOP code path is therefore omitted;
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+# In upstream, this is controlled by shelling out to the compiler to check
+# versions, but BoringSSL is intended to be used with pre-generated perlasm
+# output, so this isn't useful anyway.
+#
+# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
+# necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
+# did not tie them together until after $shaext was added.
+$avx = 1;
+
+# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
+# been tested.
+$shaext=0;	### set to zero if compiling for 1.0.1
+$avx=1		if (!$shaext && $avx);
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
+
+if ($output =~ /512/) {
+	$func="sha512_block_data_order";
+	$TABLE="K512";
+	$SZ=8;
+	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
+					"%r8", "%r9", "%r10","%r11");
+	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
+	@Sigma0=(28,34,39);
+	@Sigma1=(14,18,41);
+	@sigma0=(1,  8, 7);
+	@sigma1=(19,61, 6);
+	$rounds=80;
+} else {
+	$func="sha256_block_data_order";
+	$TABLE="K256";
+	$SZ=4;
+	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
+					"%r8d","%r9d","%r10d","%r11d");
+	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
+	@Sigma0=( 2,13,22);
+	@Sigma1=( 6,11,25);
+	@sigma0=( 7,18, 3);
+	@sigma1=(17,19,10);
+	$rounds=64;
+}
+
+$ctx="%rdi";	# 1st arg, zapped by $a3
+$inp="%rsi";	# 2nd arg
+$Tbl="%rbp";
+
+$_ctx="16*$SZ+0*8(%rsp)";
+$_inp="16*$SZ+1*8(%rsp)";
+$_end="16*$SZ+2*8(%rsp)";
+$_rsp="16*$SZ+3*8(%rsp)";
+$framesz="16*$SZ+4*8";
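+
+# Comment-only sketch of the stack frame laid out by the variables above
+# (offsets from the aligned %rsp):
+#   0 .. 16*$SZ-1    X[0..15] message-schedule ring buffer
+#   16*$SZ+0*8       saved ctx pointer (1st arg)
+#   16*$SZ+1*8       saved inp pointer (2nd arg)
+#   16*$SZ+2*8       end-of-input pointer
+#   16*$SZ+3*8       caller's %rsp copy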
+
+
+sub ROUND_00_15()
+{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+  my $STRIDE=$SZ;
+     $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
+
+$code.=<<___;
+	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
+	mov	$f,$a2
+
+	xor	$e,$a0
+	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
+	xor	$g,$a2			# f^g
+
+	mov	$T1,`$SZ*($i&0xf)`(%rsp)
+	xor	$a,$a1
+	and	$e,$a2			# (f^g)&e
+
+	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
+	add	$h,$T1			# T1+=h
+	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
+
+	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
+	xor	$e,$a0
+	add	$a2,$T1			# T1+=Ch(e,f,g)
+
+	mov	$a,$a2
+	add	($Tbl),$T1		# T1+=K[round]
+	xor	$a,$a1
+
+	xor	$b,$a2			# a^b, b^c in next round
+	ror	\$$Sigma1[0],$a0	# Sigma1(e)
+	mov	$b,$h
+
+	and	$a2,$a3
+	ror	\$$Sigma0[0],$a1	# Sigma0(a)
+	add	$a0,$T1			# T1+=Sigma1(e)
+
+	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
+	add	$T1,$d			# d+=T1
+	add	$T1,$h			# h+=T1
+
+	lea	$STRIDE($Tbl),$Tbl	# round++
+___
+$code.=<<___ if ($i<15);
+	add	$a1,$h			# h+=Sigma0(a)
+___
+	($a2,$a3) = ($a3,$a2);
+}
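+# Identities used above (comment-only): Ch(e,f,g) = (e&f)^(~e&g) equals
+# ((f^g)&e)^g, saving an instruction, and Maj(a,b,c) equals Ch(a^b,c,b),
+# which lets consecutive rounds share the a^b term ("b^c in next round").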
+
+sub ROUND_16_XX()
+{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___;
+	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
+	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2
+
+	mov	$a0,$T1
+	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
+	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
+	mov	$a2,$a1
+	ror	\$`$sigma1[1]-$sigma1[0]`,$a2
+
+	xor	$T1,$a0
+	shr	\$$sigma0[2],$T1
+	ror	\$$sigma0[0],$a0
+	xor	$a1,$a2
+	shr	\$$sigma1[2],$a1
+
+	ror	\$$sigma1[0],$a2
+	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
+	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
+	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1
+
+	add	`$SZ*($i&0xf)`(%rsp),$T1
+	mov	$e,$a0
+	add	$a2,$T1
+	mov	$a,$a1
+___
+	&ROUND_00_15(@_);
+}
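+# A sketch of the schedule update ROUND_16_XX performs (hypothetical
+# helper, never invoked; $s0/$s1 stand for sigma0/sigma1 code refs and
+# truncation to $SZ*8 bits is omitted for brevity):
+sub _sched_update_ref {
+	my ($X,$i,$s0,$s1) = @_;	# $X: ref to the 16-entry ring buffer
+	$X->[$i&15] += $s0->($X->[($i+1)&15]) + $X->[($i+9)&15]
+		     + $s1->($X->[($i+14)&15]);
+}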
+
+$code=<<___;
+.text
+
+.extern	OPENSSL_ia32cap_addr
+.globl	$func
+.type	$func,\@function,3
+.align	16
+$func:
+___
+$code.=<<___ if ($SZ==4 || $avx);
+	lea	OPENSSL_ia32cap_addr(%rip),%r11
+	mov	(%r11),%r11
+	mov	0(%r11),%r9d
+	mov	4(%r11),%r10d
+	mov	8(%r11),%r11d
+___
+$code.=<<___ if ($SZ==4 && $shaext);
+	test	\$`1<<29`,%r11d		# check for SHA
+	jnz	_shaext_shortcut
+___
+$code.=<<___ if ($avx && $SZ==8);
+	test	\$`1<<11`,%r10d		# check for XOP
+	jnz	.Lxop_shortcut
+___
+$code.=<<___ if ($avx>1);
+	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
+	cmp	\$`1<<8|1<<5|1<<3`,%r11d
+	je	.Lavx2_shortcut
+___
+$code.=<<___ if ($avx);
+	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
+	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
+	or	%r9d,%r10d
+	cmp	\$`1<<28|1<<9|1<<30`,%r10d
+	je	.Lavx_shortcut
+___
+$code.=<<___ if ($SZ==4);
+	test	\$`1<<9`,%r10d
+	jnz	.Lssse3_shortcut
+___
+$code.=<<___;
+	mov	%rsp,%rax		# copy %rsp
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	shl	\$4,%rdx		# num*16
+	sub	\$$framesz,%rsp
+	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
+	and	\$-64,%rsp		# align stack frame
+	mov	$ctx,$_ctx		# save ctx, 1st arg
+	mov	$inp,$_inp		# save inp, 2nd arg
+	mov	%rdx,$_end		# save end pointer, "3rd" arg
+	mov	%rax,$_rsp		# save copy of %rsp
+.Lprologue:
+
+	mov	$SZ*0($ctx),$A
+	mov	$SZ*1($ctx),$B
+	mov	$SZ*2($ctx),$C
+	mov	$SZ*3($ctx),$D
+	mov	$SZ*4($ctx),$E
+	mov	$SZ*5($ctx),$F
+	mov	$SZ*6($ctx),$G
+	mov	$SZ*7($ctx),$H
+
+	jmp	.Lloop
+
+.align	16
+.Lloop:
+	mov	$B,$a3
+	lea	$TABLE(%rip),$Tbl
+	xor	$C,$a3			# magic
+___
+	for($i=0;$i<16;$i++) {
+		$code.="	mov	$SZ*$i($inp),$T1\n";
+		$code.="	mov	@ROT[4],$a0\n";
+		$code.="	mov	@ROT[0],$a1\n";
+		$code.="	bswap	$T1\n";
+		&ROUND_00_15($i,@ROT);
+		unshift(@ROT,pop(@ROT));
+	}
+$code.=<<___;
+	jmp	.Lrounds_16_xx
+.align	16
+.Lrounds_16_xx:
+___
+	for(;$i<32;$i++) {
+		&ROUND_16_XX($i,@ROT);
+		unshift(@ROT,pop(@ROT));
+	}
+
+$code.=<<___;
+	cmpb	\$0,`$SZ-1`($Tbl)
+	jnz	.Lrounds_16_xx
+
+	mov	$_ctx,$ctx
+	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
+	lea	16*$SZ($inp),$inp
+
+	add	$SZ*0($ctx),$A
+	add	$SZ*1($ctx),$B
+	add	$SZ*2($ctx),$C
+	add	$SZ*3($ctx),$D
+	add	$SZ*4($ctx),$E
+	add	$SZ*5($ctx),$F
+	add	$SZ*6($ctx),$G
+	add	$SZ*7($ctx),$H
+
+	cmp	$_end,$inp
+
+	mov	$A,$SZ*0($ctx)
+	mov	$B,$SZ*1($ctx)
+	mov	$C,$SZ*2($ctx)
+	mov	$D,$SZ*3($ctx)
+	mov	$E,$SZ*4($ctx)
+	mov	$F,$SZ*5($ctx)
+	mov	$G,$SZ*6($ctx)
+	mov	$H,$SZ*7($ctx)
+	jb	.Lloop
+
+	mov	$_rsp,%rsi
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
+.Lepilogue:
+	ret
+.size	$func,.-$func
+___
+
+if ($SZ==4) {
+$code.=<<___;
+.align	64
+.type	$TABLE,\@object
+$TABLE:
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+} else {
+$code.=<<___;
+.align	64
+.type	$TABLE,\@object
+$TABLE:
+	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	.quad	0x3956c25bf348b538,0x59f111f1b605d019
+	.quad	0x3956c25bf348b538,0x59f111f1b605d019
+	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	.quad	0xd807aa98a3030242,0x12835b0145706fbe
+	.quad	0xd807aa98a3030242,0x12835b0145706fbe
+	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+	.quad	0x06ca6351e003826f,0x142929670a0e6e70
+	.quad	0x06ca6351e003826f,0x142929670a0e6e70
+	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+	.quad	0x81c2c92e47edaee6,0x92722c851482353b
+	.quad	0x81c2c92e47edaee6,0x92722c851482353b
+	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+	.quad	0xd192e819d6ef5218,0xd69906245565a910
+	.quad	0xd192e819d6ef5218,0xd69906245565a910
+	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+	.quad	0x90befffa23631e28,0xa4506cebde82bde9
+	.quad	0x90befffa23631e28,0xa4506cebde82bde9
+	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+	.quad	0xca273eceea26619c,0xd186b8c721c0c207
+	.quad	0xca273eceea26619c,0xd186b8c721c0c207
+	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+	.quad	0x113f9804bef90dae,0x1b710b35131c471b
+	.quad	0x113f9804bef90dae,0x1b710b35131c471b
+	.quad	0x28db77f523047d84,0x32caab7b40c72493
+	.quad	0x28db77f523047d84,0x32caab7b40c72493
+	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
+	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
+	.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+}
+
+######################################################################
+# SIMD code paths
+#
+if ($SZ==4 && $shaext) {{{
+######################################################################
+# Intel SHA Extensions implementation of SHA256 update function.
+#
+my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
+
+my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
+my @MSG=map("%xmm$_",(3..6));
+
+$code.=<<___;
+.type	sha256_block_data_order_shaext,\@function,3
+.align	64
+sha256_block_data_order_shaext:
+_shaext_shortcut:
+___
+$code.=<<___ if ($win64);
+	lea	`-8-5*16`(%rsp),%rsp
+	movaps	%xmm6,-8-5*16(%rax)
+	movaps	%xmm7,-8-4*16(%rax)
+	movaps	%xmm8,-8-3*16(%rax)
+	movaps	%xmm9,-8-2*16(%rax)
+	movaps	%xmm10,-8-1*16(%rax)
+.Lprologue_shaext:
+___
+$code.=<<___;
+	lea		K256+0x80(%rip),$Tbl
+	movdqu		($ctx),$ABEF		# DCBA
+	movdqu		16($ctx),$CDGH		# HGFE
+	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask
+
+	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
+	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
+	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
+	movdqa		$TMP,$BSWAP		# offload
+	palignr		\$8,$CDGH,$ABEF		# ABEF
+	punpcklqdq	$Wi,$CDGH		# CDGH
+	jmp		.Loop_shaext
+
+.align	16
+.Loop_shaext:
+	movdqu		($inp),@MSG[0]
+	movdqu		0x10($inp),@MSG[1]
+	movdqu		0x20($inp),@MSG[2]
+	pshufb		$TMP,@MSG[0]
+	movdqu		0x30($inp),@MSG[3]
+
+	movdqa		0*32-0x80($Tbl),$Wi
+	paddd		@MSG[0],$Wi
+	pshufb		$TMP,@MSG[1]
+	movdqa		$CDGH,$CDGH_SAVE	# offload
+	sha256rnds2	$ABEF,$CDGH		# 0-3
+	pshufd		\$0x0e,$Wi,$Wi
+	nop
+	movdqa		$ABEF,$ABEF_SAVE	# offload
+	sha256rnds2	$CDGH,$ABEF
+
+	movdqa		1*32-0x80($Tbl),$Wi
+	paddd		@MSG[1],$Wi
+	pshufb		$TMP,@MSG[2]
+	sha256rnds2	$ABEF,$CDGH		# 4-7
+	pshufd		\$0x0e,$Wi,$Wi
+	lea		0x40($inp),$inp
+	sha256msg1	@MSG[1],@MSG[0]
+	sha256rnds2	$CDGH,$ABEF
+
+	movdqa		2*32-0x80($Tbl),$Wi
+	paddd		@MSG[2],$Wi
+	pshufb		$TMP,@MSG[3]
+	sha256rnds2	$ABEF,$CDGH		# 8-11
+	pshufd		\$0x0e,$Wi,$Wi
+	movdqa		@MSG[3],$TMP
+	palignr		\$4,@MSG[2],$TMP
+	nop
+	paddd		$TMP,@MSG[0]
+	sha256msg1	@MSG[2],@MSG[1]
+	sha256rnds2	$CDGH,$ABEF
+
+	movdqa		3*32-0x80($Tbl),$Wi
+	paddd		@MSG[3],$Wi
+	sha256msg2	@MSG[3],@MSG[0]
+	sha256rnds2	$ABEF,$CDGH		# 12-15
+	pshufd		\$0x0e,$Wi,$Wi
+	movdqa		@MSG[0],$TMP
+	palignr		\$4,@MSG[3],$TMP
+	nop
+	paddd		$TMP,@MSG[1]
+	sha256msg1	@MSG[3],@MSG[2]
+	sha256rnds2	$CDGH,$ABEF
+___
+for($i=4;$i<16-3;$i++) {
+$code.=<<___;
+	movdqa		$i*32-0x80($Tbl),$Wi
+	paddd		@MSG[0],$Wi
+	sha256msg2	@MSG[0],@MSG[1]
+	sha256rnds2	$ABEF,$CDGH		# 16-19...
+	pshufd		\$0x0e,$Wi,$Wi
+	movdqa		@MSG[1],$TMP
+	palignr		\$4,@MSG[0],$TMP
+	nop
+	paddd		$TMP,@MSG[2]
+	sha256msg1	@MSG[0],@MSG[3]
+	sha256rnds2	$CDGH,$ABEF
+___
+	push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+	movdqa		13*32-0x80($Tbl),$Wi
+	paddd		@MSG[0],$Wi
+	sha256msg2	@MSG[0],@MSG[1]
+	sha256rnds2	$ABEF,$CDGH		# 52-55
+	pshufd		\$0x0e,$Wi,$Wi
+	movdqa		@MSG[1],$TMP
+	palignr		\$4,@MSG[0],$TMP
+	sha256rnds2	$CDGH,$ABEF
+	paddd		$TMP,@MSG[2]
+
+	movdqa		14*32-0x80($Tbl),$Wi
+	paddd		@MSG[1],$Wi
+	sha256rnds2	$ABEF,$CDGH		# 56-59
+	pshufd		\$0x0e,$Wi,$Wi
+	sha256msg2	@MSG[1],@MSG[2]
+	movdqa		$BSWAP,$TMP
+	sha256rnds2	$CDGH,$ABEF
+
+	movdqa		15*32-0x80($Tbl),$Wi
+	paddd		@MSG[2],$Wi
+	nop
+	sha256rnds2	$ABEF,$CDGH		# 60-63
+	pshufd		\$0x0e,$Wi,$Wi
+	dec		$num
+	nop
+	sha256rnds2	$CDGH,$ABEF
+
+	paddd		$CDGH_SAVE,$CDGH
+	paddd		$ABEF_SAVE,$ABEF
+	jnz		.Loop_shaext
+
+	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
+	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
+	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
+	punpckhqdq	$CDGH,$ABEF		# DCBA
+	palignr		\$8,$TMP,$CDGH		# HGFE
+
+	movdqu	$ABEF,($ctx)
+	movdqu	$CDGH,16($ctx)
+___
+$code.=<<___ if ($win64);
+	movaps	-8-5*16(%rax),%xmm6
+	movaps	-8-4*16(%rax),%xmm7
+	movaps	-8-3*16(%rax),%xmm8
+	movaps	-8-2*16(%rax),%xmm9
+	movaps	-8-1*16(%rax),%xmm10
+	mov	%rax,%rsp
+.Lepilogue_shaext:
+___
+$code.=<<___;
+	ret
+.size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
+___
+}}}
+{{{
+
+my $a4=$T1;
+my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+  my $arg = pop;
+    $arg = "\$$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
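+# E.g. (illustrative): an undefined call like &ror($a0,14) lands in
+# AUTOLOAD above and appends "\tror\t\$14,$a0\n" to $code, which is how
+# the fragment lists returned by body_00_15 below emit instructions.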
+
+sub body_00_15 () {
+	(
+	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
+
+	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
+	'&mov	($a,$a1)',
+	'&mov	($a4,$f)',
+
+	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
+	'&xor	($a0,$e)',
+	'&xor	($a4,$g)',			# f^g
+
+	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
+	'&xor	($a1,$a)',
+	'&and	($a4,$e)',			# (f^g)&e
+
+	'&xor	($a0,$e)',
+	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
+	'&mov	($a2,$a)',
+
+	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
+	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
+	'&xor	($a2,$b)',			# a^b, b^c in next round
+
+	'&add	($h,$a4)',			# h+=Ch(e,f,g)
+	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
+	'&and	($a3,$a2)',			# (b^c)&(a^b)
+
+	'&xor	($a1,$a)',
+	'&add	($h,$a0)',			# h+=Sigma1(e)
+	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
+
+	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
+	'&add	($d,$h)',			# d+=h
+	'&add	($h,$a3)',			# h+=Maj(a,b,c)
+
+	'&mov	($a0,$d)',
+	'&add	($a1,$h);'.			# h+=Sigma0(a)
+	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
+	);
+}
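+# body_00_15() returns one round as a list of eval-able fragments rather
+# than appending to $code directly; SSSE3_256_00_47 below interleaves
+# these scalar fragments between the vector Xupdate instructions.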
+
+######################################################################
+# SSSE3 code path
+#
+if ($SZ==4) {	# SHA256 only
+my @X = map("%xmm$_",(0..3));
+my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
+
+$code.=<<___;
+.type	${func}_ssse3,\@function,3
+.align	64
+${func}_ssse3:
+.Lssse3_shortcut:
+	mov	%rsp,%rax		# copy %rsp
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	shl	\$4,%rdx		# num*16
+	sub	\$`$framesz+$win64*16*4`,%rsp
+	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
+	and	\$-64,%rsp		# align stack frame
+	mov	$ctx,$_ctx		# save ctx, 1st arg
+	mov	$inp,$_inp		# save inp, 2nd arg
+	mov	%rdx,$_end		# save end pointer, "3rd" arg
+	mov	%rax,$_rsp		# save copy of %rsp
+___
+$code.=<<___ if ($win64);
+	movaps	%xmm6,16*$SZ+32(%rsp)
+	movaps	%xmm7,16*$SZ+48(%rsp)
+	movaps	%xmm8,16*$SZ+64(%rsp)
+	movaps	%xmm9,16*$SZ+80(%rsp)
+___
+$code.=<<___;
+.Lprologue_ssse3:
+
+	mov	$SZ*0($ctx),$A
+	mov	$SZ*1($ctx),$B
+	mov	$SZ*2($ctx),$C
+	mov	$SZ*3($ctx),$D
+	mov	$SZ*4($ctx),$E
+	mov	$SZ*5($ctx),$F
+	mov	$SZ*6($ctx),$G
+	mov	$SZ*7($ctx),$H
+___
+
+$code.=<<___;
+	#movdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
+	#movdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
+	jmp	.Lloop_ssse3
+.align	16
+.Lloop_ssse3:
+	movdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
+	movdqu	0x00($inp),@X[0]
+	movdqu	0x10($inp),@X[1]
+	movdqu	0x20($inp),@X[2]
+	pshufb	$t3,@X[0]
+	movdqu	0x30($inp),@X[3]
+	lea	$TABLE(%rip),$Tbl
+	pshufb	$t3,@X[1]
+	movdqa	0x00($Tbl),$t0
+	movdqa	0x20($Tbl),$t1
+	pshufb	$t3,@X[2]
+	paddd	@X[0],$t0
+	movdqa	0x40($Tbl),$t2
+	pshufb	$t3,@X[3]
+	movdqa	0x60($Tbl),$t3
+	paddd	@X[1],$t1
+	paddd	@X[2],$t2
+	paddd	@X[3],$t3
+	movdqa	$t0,0x00(%rsp)
+	mov	$A,$a1
+	movdqa	$t1,0x10(%rsp)
+	mov	$B,$a3
+	movdqa	$t2,0x20(%rsp)
+	xor	$C,$a3			# magic
+	movdqa	$t3,0x30(%rsp)
+	mov	$E,$a0
+	jmp	.Lssse3_00_47
+
+.align	16
+.Lssse3_00_47:
+	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
+___
+sub Xupdate_256_SSSE3 () {
+	(
+	'&movdqa	($t0,@X[1]);',
+	'&movdqa	($t3,@X[3])',
+	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
+	 '&palignr	($t3,@X[2],$SZ);',	# X[9..12]
+	'&movdqa	($t1,$t0)',
+	'&movdqa	($t2,$t0);',
+	'&psrld		($t0,$sigma0[2])',
+	 '&paddd	(@X[0],$t3);',		# X[0..3] += X[9..12]
+	'&psrld		($t2,$sigma0[0])',
+	 '&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
+	'&pslld		($t1,8*$SZ-$sigma0[1]);'.
+	'&pxor		($t0,$t2)',
+	'&psrld		($t2,$sigma0[1]-$sigma0[0]);'.
+	'&pxor		($t0,$t1)',
+	'&pslld		($t1,$sigma0[1]-$sigma0[0]);'.
+	'&pxor		($t0,$t2);',
+	 '&movdqa	($t2,$t3)',
+	'&pxor		($t0,$t1);',		# sigma0(X[1..4])
+	 '&psrld	($t3,$sigma1[2])',
+	'&paddd		(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
+	 '&psrlq	($t2,$sigma1[0])',
+	 '&pxor		($t3,$t2);',
+	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
+	 '&pxor		($t3,$t2)',
+	 '&pshufb	($t3,$t4)',		# sigma1(X[14..15])
+	'&paddd		(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
+	 '&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
+	 '&movdqa	($t2,$t3);',
+	 '&psrld	($t3,$sigma1[2])',
+	 '&psrlq	($t2,$sigma1[0])',
+	 '&pxor		($t3,$t2);',
+	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
+	 '&pxor		($t3,$t2);',
+	'&movdqa	($t2,16*2*$j."($Tbl)")',
+	 '&pshufb	($t3,$t5)',
+	'&paddd		(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
+	);
+}
+
+sub SSSE3_256_00_47 () {
+my $j = shift;
+my $body = shift;
+my @X = @_;
+my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
+
+    if (0) {
+	foreach (Xupdate_256_SSSE3()) {		# 36 instructions
+	    eval;
+	    eval(shift(@insns));
+	    eval(shift(@insns));
+	    eval(shift(@insns));
+	}
+    } else {			# squeeze extra 4% on Westmere and 19% on Atom
+	  eval(shift(@insns));	#@
+	&movdqa		($t0,@X[1]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&movdqa		($t3,@X[3]);
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	&palignr	($t0,@X[0],$SZ);	# X[1..4]
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &palignr	($t3,@X[2],$SZ);	# X[9..12]
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));	#@
+	&movdqa		($t1,$t0);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&movdqa		($t2,$t0);
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	&psrld		($t0,$sigma0[2]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	&psrld		($t2,$sigma0[0]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &pshufd	($t3,@X[3],0b11111010);	# X[14..15]
+	  eval(shift(@insns));
+	  eval(shift(@insns));	#@
+	&pslld		($t1,8*$SZ-$sigma0[1]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&pxor		($t0,$t2);
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));	#@
+	&psrld		($t2,$sigma0[1]-$sigma0[0]);
+	  eval(shift(@insns));
+	&pxor		($t0,$t1);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&pslld		($t1,$sigma0[1]-$sigma0[0]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&pxor		($t0,$t2);
+	  eval(shift(@insns));
+	  eval(shift(@insns));	#@
+	 &movdqa	($t2,$t3);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&pxor		($t0,$t1);		# sigma0(X[1..4])
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &psrld		($t3,$sigma1[2]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	 &psrlq		($t2,$sigma1[0]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &pxor		($t3,$t2);
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));	#@
+	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &pxor		($t3,$t2);
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 #&pshufb	($t3,$t4);		# sigma1(X[14..15])
+	 &pshufd	($t3,$t3,0b10000000);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &psrldq	($t3,8);
+	  eval(shift(@insns));
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));	#@
+	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &pshufd	($t3,@X[0],0b01010000);	# X[16..17]
+	  eval(shift(@insns));
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	 &movdqa	($t2,$t3);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &psrld		($t3,$sigma1[2]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));	#@
+	 &psrlq		($t2,$sigma1[0]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &pxor		($t3,$t2);
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &pxor		($t3,$t2);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));	#@
+	 #&pshufb	($t3,$t5);
+	 &pshufd	($t3,$t3,0b00001000);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&movdqa		($t2,16*2*$j."($Tbl)");
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	 &pslldq	($t3,8);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+    }
+	&paddd		($t2,@X[0]);
+	  foreach (@insns) { eval; }		# remaining instructions
+	&movdqa		(16*$j."(%rsp)",$t2);
+}
+
+    for ($i=0,$j=0; $j<4; $j++) {
+	&SSSE3_256_00_47($j,\&body_00_15,@X);
+	push(@X,shift(@X));			# rotate(@X)
+    }
+	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
+	&jne	(".Lssse3_00_47");
+
+    for ($i=0; $i<16; ) {
+	foreach(body_00_15()) { eval; }
+    }
+$code.=<<___;
+	mov	$_ctx,$ctx
+	mov	$a1,$A
+
+	add	$SZ*0($ctx),$A
+	lea	16*$SZ($inp),$inp
+	add	$SZ*1($ctx),$B
+	add	$SZ*2($ctx),$C
+	add	$SZ*3($ctx),$D
+	add	$SZ*4($ctx),$E
+	add	$SZ*5($ctx),$F
+	add	$SZ*6($ctx),$G
+	add	$SZ*7($ctx),$H
+
+	cmp	$_end,$inp
+
+	mov	$A,$SZ*0($ctx)
+	mov	$B,$SZ*1($ctx)
+	mov	$C,$SZ*2($ctx)
+	mov	$D,$SZ*3($ctx)
+	mov	$E,$SZ*4($ctx)
+	mov	$F,$SZ*5($ctx)
+	mov	$G,$SZ*6($ctx)
+	mov	$H,$SZ*7($ctx)
+	jb	.Lloop_ssse3
+
+	mov	$_rsp,%rsi
+___
+$code.=<<___ if ($win64);
+	movaps	16*$SZ+32(%rsp),%xmm6
+	movaps	16*$SZ+48(%rsp),%xmm7
+	movaps	16*$SZ+64(%rsp),%xmm8
+	movaps	16*$SZ+80(%rsp),%xmm9
+___
+$code.=<<___;
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
+.Lepilogue_ssse3:
+	ret
+.size	${func}_ssse3,.-${func}_ssse3
+___
+}
+
+if ($avx) {{
+######################################################################
+# XOP code path
+#
+if ($SZ==8) {	# SHA512 only
+$code.=<<___;
+.type	${func}_xop,\@function,3
+.align	64
+${func}_xop:
+.Lxop_shortcut:
+	mov	%rsp,%rax		# copy %rsp
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	shl	\$4,%rdx		# num*16
+	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
+	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
+	and	\$-64,%rsp		# align stack frame
+	mov	$ctx,$_ctx		# save ctx, 1st arg
+	mov	$inp,$_inp		# save inp, 2nd arg
+	mov	%rdx,$_end		# save end pointer, "3rd" arg
+	mov	%rax,$_rsp		# save copy of %rsp
+___
+$code.=<<___ if ($win64);
+	movaps	%xmm6,16*$SZ+32(%rsp)
+	movaps	%xmm7,16*$SZ+48(%rsp)
+	movaps	%xmm8,16*$SZ+64(%rsp)
+	movaps	%xmm9,16*$SZ+80(%rsp)
+___
+$code.=<<___ if ($win64 && $SZ>4);
+	movaps	%xmm10,16*$SZ+96(%rsp)
+	movaps	%xmm11,16*$SZ+112(%rsp)
+___
+$code.=<<___;
+.Lprologue_xop:
+
+	vzeroupper
+	mov	$SZ*0($ctx),$A
+	mov	$SZ*1($ctx),$B
+	mov	$SZ*2($ctx),$C
+	mov	$SZ*3($ctx),$D
+	mov	$SZ*4($ctx),$E
+	mov	$SZ*5($ctx),$F
+	mov	$SZ*6($ctx),$G
+	mov	$SZ*7($ctx),$H
+	jmp	.Lloop_xop
+___
+					if ($SZ==4) {	# SHA256
+    my @X = map("%xmm$_",(0..3));
+    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
+
+$code.=<<___;
+.align	16
+.Lloop_xop:
+	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
+	vmovdqu	0x00($inp),@X[0]
+	vmovdqu	0x10($inp),@X[1]
+	vmovdqu	0x20($inp),@X[2]
+	vmovdqu	0x30($inp),@X[3]
+	vpshufb	$t3,@X[0],@X[0]
+	lea	$TABLE(%rip),$Tbl
+	vpshufb	$t3,@X[1],@X[1]
+	vpshufb	$t3,@X[2],@X[2]
+	vpaddd	0x00($Tbl),@X[0],$t0
+	vpshufb	$t3,@X[3],@X[3]
+	vpaddd	0x20($Tbl),@X[1],$t1
+	vpaddd	0x40($Tbl),@X[2],$t2
+	vpaddd	0x60($Tbl),@X[3],$t3
+	vmovdqa	$t0,0x00(%rsp)
+	mov	$A,$a1
+	vmovdqa	$t1,0x10(%rsp)
+	mov	$B,$a3
+	vmovdqa	$t2,0x20(%rsp)
+	xor	$C,$a3			# magic
+	vmovdqa	$t3,0x30(%rsp)
+	mov	$E,$a0
+	jmp	.Lxop_00_47
+
+.align	16
+.Lxop_00_47:
+	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
+___
+sub XOP_256_00_47 () {
+my $j = shift;
+my $body = shift;
+my @X = @_;
+my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
+
+	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&vpsrld		($t0,$t0,$sigma0[2]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &vpaddd	(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&vpxor		($t0,$t0,$t1);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &vpsrld	($t2,@X[3],$sigma1[2]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &vpxor		($t3,$t3,$t2);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&vpsrldq	($t3,$t3,8);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &vprotd	($t3,@X[0],8*$SZ-$sigma1[1]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &vpsrld	($t2,@X[0],$sigma1[2]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &vpxor		($t3,$t3,$t2);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&vpslldq	($t3,$t3,8);		# 22 instructions
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
+	  foreach (@insns) { eval; }		# remaining instructions
+	&vmovdqa	(16*$j."(%rsp)",$t2);
+}
+
+    for ($i=0,$j=0; $j<4; $j++) {
+	&XOP_256_00_47($j,\&body_00_15,@X);
+	push(@X,shift(@X));			# rotate(@X)
+    }
+	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
+	&jne	(".Lxop_00_47");
+
+    for ($i=0; $i<16; ) {
+	foreach(body_00_15()) { eval; }
+    }
+
+					} else {	# SHA512
+    my @X = map("%xmm$_",(0..7));
+    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
+
+$code.=<<___;
+.align	16
+.Lloop_xop:
+	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
+	vmovdqu	0x00($inp),@X[0]
+	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
+	vmovdqu	0x10($inp),@X[1]
+	vmovdqu	0x20($inp),@X[2]
+	vpshufb	$t3,@X[0],@X[0]
+	vmovdqu	0x30($inp),@X[3]
+	vpshufb	$t3,@X[1],@X[1]
+	vmovdqu	0x40($inp),@X[4]
+	vpshufb	$t3,@X[2],@X[2]
+	vmovdqu	0x50($inp),@X[5]
+	vpshufb	$t3,@X[3],@X[3]
+	vmovdqu	0x60($inp),@X[6]
+	vpshufb	$t3,@X[4],@X[4]
+	vmovdqu	0x70($inp),@X[7]
+	vpshufb	$t3,@X[5],@X[5]
+	vpaddq	-0x80($Tbl),@X[0],$t0
+	vpshufb	$t3,@X[6],@X[6]
+	vpaddq	-0x60($Tbl),@X[1],$t1
+	vpshufb	$t3,@X[7],@X[7]
+	vpaddq	-0x40($Tbl),@X[2],$t2
+	vpaddq	-0x20($Tbl),@X[3],$t3
+	vmovdqa	$t0,0x00(%rsp)
+	vpaddq	0x00($Tbl),@X[4],$t0
+	vmovdqa	$t1,0x10(%rsp)
+	vpaddq	0x20($Tbl),@X[5],$t1
+	vmovdqa	$t2,0x20(%rsp)
+	vpaddq	0x40($Tbl),@X[6],$t2
+	vmovdqa	$t3,0x30(%rsp)
+	vpaddq	0x60($Tbl),@X[7],$t3
+	vmovdqa	$t0,0x40(%rsp)
+	mov	$A,$a1
+	vmovdqa	$t1,0x50(%rsp)
+	mov	$B,$a3
+	vmovdqa	$t2,0x60(%rsp)
+	xor	$C,$a3			# magic
+	vmovdqa	$t3,0x70(%rsp)
+	mov	$E,$a0
+	jmp	.Lxop_00_47
+
+.align	16
+.Lxop_00_47:
+	add	\$`16*2*$SZ`,$Tbl
+___
+sub XOP_512_00_47 () {
+my $j = shift;
+my $body = shift;
+my @X = @_;
+my @insns = (&$body,&$body);			# 52 instructions
+
+	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..2]
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &vpalignr	($t3,@X[5],@X[4],$SZ);	# X[9..10]
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&vprotq		($t1,$t0,8*$SZ-$sigma0[1]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&vpsrlq		($t0,$t0,$sigma0[2]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &vpaddq	(@X[0],@X[0],$t3);	# X[0..1] += X[9..10]
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&vprotq		($t2,$t1,$sigma0[1]-$sigma0[0]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&vpxor		($t0,$t0,$t1);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &vprotq	($t3,@X[7],8*$SZ-$sigma1[1]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..2])
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &vpsrlq	($t2,@X[7],$sigma1[2]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&vpaddq		(@X[0],@X[0],$t0);	# X[0..1] += sigma0(X[1..2])
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &vprotq	($t1,$t3,$sigma1[1]-$sigma1[0]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &vpxor		($t3,$t3,$t2);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&vpaddq		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
+	  foreach (@insns) { eval; }		# remaining instructions
+	&vmovdqa	(16*$j."(%rsp)",$t2);
+}
+
+    for ($i=0,$j=0; $j<8; $j++) {
+	&XOP_512_00_47($j,\&body_00_15,@X);
+	push(@X,shift(@X));			# rotate(@X)
+    }
+	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
+	&jne	(".Lxop_00_47");
+
+    for ($i=0; $i<16; ) {
+	foreach(body_00_15()) { eval; }
+    }
+}
+$code.=<<___;
+	mov	$_ctx,$ctx
+	mov	$a1,$A
+
+	add	$SZ*0($ctx),$A
+	lea	16*$SZ($inp),$inp
+	add	$SZ*1($ctx),$B
+	add	$SZ*2($ctx),$C
+	add	$SZ*3($ctx),$D
+	add	$SZ*4($ctx),$E
+	add	$SZ*5($ctx),$F
+	add	$SZ*6($ctx),$G
+	add	$SZ*7($ctx),$H
+
+	cmp	$_end,$inp
+
+	mov	$A,$SZ*0($ctx)
+	mov	$B,$SZ*1($ctx)
+	mov	$C,$SZ*2($ctx)
+	mov	$D,$SZ*3($ctx)
+	mov	$E,$SZ*4($ctx)
+	mov	$F,$SZ*5($ctx)
+	mov	$G,$SZ*6($ctx)
+	mov	$H,$SZ*7($ctx)
+	jb	.Lloop_xop
+
+	mov	$_rsp,%rsi
+	vzeroupper
+___
+$code.=<<___ if ($win64);
+	movaps	16*$SZ+32(%rsp),%xmm6
+	movaps	16*$SZ+48(%rsp),%xmm7
+	movaps	16*$SZ+64(%rsp),%xmm8
+	movaps	16*$SZ+80(%rsp),%xmm9
+___
+$code.=<<___ if ($win64 && $SZ>4);
+	movaps	16*$SZ+96(%rsp),%xmm10
+	movaps	16*$SZ+112(%rsp),%xmm11
+___
+$code.=<<___;
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
+.Lepilogue_xop:
+	ret
+.size	${func}_xop,.-${func}_xop
+___
+}
+######################################################################
+# AVX+shrd code path
+#
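+# Note: shrd with identical source and destination registers is a rotate
+# right ("shrd $n,%reg,%reg" == "ror $n,%reg"), so aliasing ror() to shrd()
+# below changes only the encoding, not the semantics; the shrd form is
+# presumably a scheduling win on the CPUs this path targets.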
+local *ror = sub { &shrd(@_[0],@_) };
+
+$code.=<<___;
+.type	${func}_avx,\@function,3
+.align	64
+${func}_avx:
+.Lavx_shortcut:
+	mov	%rsp,%rax		# copy %rsp
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	shl	\$4,%rdx		# num*16
+	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
+	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
+	and	\$-64,%rsp		# align stack frame
+	mov	$ctx,$_ctx		# save ctx, 1st arg
+	mov	$inp,$_inp		# save inp, 2nd arg
+	mov	%rdx,$_end		# save end pointer, "3rd" arg
+	mov	%rax,$_rsp		# save copy of %rsp
+___
+$code.=<<___ if ($win64);
+	movaps	%xmm6,16*$SZ+32(%rsp)
+	movaps	%xmm7,16*$SZ+48(%rsp)
+	movaps	%xmm8,16*$SZ+64(%rsp)
+	movaps	%xmm9,16*$SZ+80(%rsp)
+___
+$code.=<<___ if ($win64 && $SZ>4);
+	movaps	%xmm10,16*$SZ+96(%rsp)
+	movaps	%xmm11,16*$SZ+112(%rsp)
+___
+$code.=<<___;
+.Lprologue_avx:
+
+	vzeroupper
+	mov	$SZ*0($ctx),$A
+	mov	$SZ*1($ctx),$B
+	mov	$SZ*2($ctx),$C
+	mov	$SZ*3($ctx),$D
+	mov	$SZ*4($ctx),$E
+	mov	$SZ*5($ctx),$F
+	mov	$SZ*6($ctx),$G
+	mov	$SZ*7($ctx),$H
+___
+					if ($SZ==4) {	# SHA256
+    my @X = map("%xmm$_",(0..3));
+    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
+
+$code.=<<___;
+	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
+	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
+	jmp	.Lloop_avx
+.align	16
+.Lloop_avx:
+	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
+	vmovdqu	0x00($inp),@X[0]
+	vmovdqu	0x10($inp),@X[1]
+	vmovdqu	0x20($inp),@X[2]
+	vmovdqu	0x30($inp),@X[3]
+	vpshufb	$t3,@X[0],@X[0]
+	lea	$TABLE(%rip),$Tbl
+	vpshufb	$t3,@X[1],@X[1]
+	vpshufb	$t3,@X[2],@X[2]
+	vpaddd	0x00($Tbl),@X[0],$t0
+	vpshufb	$t3,@X[3],@X[3]
+	vpaddd	0x20($Tbl),@X[1],$t1
+	vpaddd	0x40($Tbl),@X[2],$t2
+	vpaddd	0x60($Tbl),@X[3],$t3
+	vmovdqa	$t0,0x00(%rsp)
+	mov	$A,$a1
+	vmovdqa	$t1,0x10(%rsp)
+	mov	$B,$a3
+	vmovdqa	$t2,0x20(%rsp)
+	xor	$C,$a3			# magic
+	vmovdqa	$t3,0x30(%rsp)
+	mov	$E,$a0
+	jmp	.Lavx_00_47
+
+.align	16
+.Lavx_00_47:
+	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
+___
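+# For reference, the scalar SHA-256 message-schedule recurrence being
+# vectorised below is (FIPS 180-4):
+#
+#	W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
+#	sigma0(x) = ror(x,7) ^ ror(x,18) ^ (x >> 3)
+#	sigma1(x) = ror(x,17) ^ ror(x,19) ^ (x >> 10)
+#
+# Four W values are produced per call. Since W[t+2..t+3] depend on
+# W[t..t+1], sigma1 is applied in two halves: first to X[14..15], then to
+# the freshly computed X[16..17].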
+sub Xupdate_256_AVX () {
+	(
+	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
+	 '&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
+	'&vpsrld	($t2,$t0,$sigma0[0]);',
+	 '&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
+	'&vpsrld	($t3,$t0,$sigma0[2])',
+	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
+	'&vpxor		($t0,$t3,$t2)',
+	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
+	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
+	'&vpxor		($t0,$t0,$t1)',
+	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
+	'&vpxor		($t0,$t0,$t2)',
+	 '&vpsrld	($t2,$t3,$sigma1[2]);',
+	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
+	 '&vpsrlq	($t3,$t3,$sigma1[0]);',
+	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
+	 '&vpxor	($t2,$t2,$t3);',
+	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
+	 '&vpxor	($t2,$t2,$t3)',
+	 '&vpshufb	($t2,$t2,$t4)',		# sigma1(X[14..15])
+	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
+	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
+	 '&vpsrld	($t2,$t3,$sigma1[2])',
+	 '&vpsrlq	($t3,$t3,$sigma1[0])',
+	 '&vpxor	($t2,$t2,$t3);',
+	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
+	 '&vpxor	($t2,$t2,$t3)',
+	 '&vpshufb	($t2,$t2,$t5)',
+	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
+	);
+}
+
+sub AVX_256_00_47 () {
+my $j = shift;
+my $body = shift;
+my @X = @_;
+my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
+
+	foreach (Xupdate_256_AVX()) {		# 29 instructions
+	    eval;
+	    eval(shift(@insns));
+	    eval(shift(@insns));
+	    eval(shift(@insns));
+	}
+	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
+	  foreach (@insns) { eval; }		# remaining instructions
+	&vmovdqa	(16*$j."(%rsp)",$t2);
+}
+
+    for ($i=0,$j=0; $j<4; $j++) {
+	&AVX_256_00_47($j,\&body_00_15,@X);
+	push(@X,shift(@X));			# rotate(@X)
+    }
+	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
+	&jne	(".Lavx_00_47");
+
+    for ($i=0; $i<16; ) {
+	foreach(body_00_15()) { eval; }
+    }
+
+					} else {	# SHA512
+    my @X = map("%xmm$_",(0..7));
+    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
+
+$code.=<<___;
+	jmp	.Lloop_avx
+.align	16
+.Lloop_avx:
+	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
+	vmovdqu	0x00($inp),@X[0]
+	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
+	vmovdqu	0x10($inp),@X[1]
+	vmovdqu	0x20($inp),@X[2]
+	vpshufb	$t3,@X[0],@X[0]
+	vmovdqu	0x30($inp),@X[3]
+	vpshufb	$t3,@X[1],@X[1]
+	vmovdqu	0x40($inp),@X[4]
+	vpshufb	$t3,@X[2],@X[2]
+	vmovdqu	0x50($inp),@X[5]
+	vpshufb	$t3,@X[3],@X[3]
+	vmovdqu	0x60($inp),@X[6]
+	vpshufb	$t3,@X[4],@X[4]
+	vmovdqu	0x70($inp),@X[7]
+	vpshufb	$t3,@X[5],@X[5]
+	vpaddq	-0x80($Tbl),@X[0],$t0
+	vpshufb	$t3,@X[6],@X[6]
+	vpaddq	-0x60($Tbl),@X[1],$t1
+	vpshufb	$t3,@X[7],@X[7]
+	vpaddq	-0x40($Tbl),@X[2],$t2
+	vpaddq	-0x20($Tbl),@X[3],$t3
+	vmovdqa	$t0,0x00(%rsp)
+	vpaddq	0x00($Tbl),@X[4],$t0
+	vmovdqa	$t1,0x10(%rsp)
+	vpaddq	0x20($Tbl),@X[5],$t1
+	vmovdqa	$t2,0x20(%rsp)
+	vpaddq	0x40($Tbl),@X[6],$t2
+	vmovdqa	$t3,0x30(%rsp)
+	vpaddq	0x60($Tbl),@X[7],$t3
+	vmovdqa	$t0,0x40(%rsp)
+	mov	$A,$a1
+	vmovdqa	$t1,0x50(%rsp)
+	mov	$B,$a3
+	vmovdqa	$t2,0x60(%rsp)
+	xor	$C,$a3			# magic
+	vmovdqa	$t3,0x70(%rsp)
+	mov	$E,$a0
+	jmp	.Lavx_00_47
+
+.align	16
+.Lavx_00_47:
+	add	\$`16*2*$SZ`,$Tbl
+___
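+# For reference, the scalar SHA-512 message-schedule recurrence being
+# vectorised below is (FIPS 180-4), with 64-bit words and two W values
+# produced per call:
+#
+#	W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
+#	sigma0(x) = ror(x,1) ^ ror(x,8) ^ (x >> 7)
+#	sigma1(x) = ror(x,19) ^ ror(x,61) ^ (x >> 6)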
+sub Xupdate_512_AVX () {
+	(
+	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..2]
+	 '&vpalignr	($t3,@X[5],@X[4],$SZ)',	# X[9..10]
+	'&vpsrlq	($t2,$t0,$sigma0[0])',
+	 '&vpaddq	(@X[0],@X[0],$t3);',	# X[0..1] += X[9..10]
+	'&vpsrlq	($t3,$t0,$sigma0[2])',
+	'&vpsllq	($t1,$t0,8*$SZ-$sigma0[1]);',
+	 '&vpxor	($t0,$t3,$t2)',
+	'&vpsrlq	($t2,$t2,$sigma0[1]-$sigma0[0]);',
+	 '&vpxor	($t0,$t0,$t1)',
+	'&vpsllq	($t1,$t1,$sigma0[1]-$sigma0[0]);',
+	 '&vpxor	($t0,$t0,$t2)',
+	 '&vpsrlq	($t3,@X[7],$sigma1[2]);',
+	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..2])
+	 '&vpsllq	($t2,@X[7],8*$SZ-$sigma1[1]);',
+	'&vpaddq	(@X[0],@X[0],$t0)',	# X[0..1] += sigma0(X[1..2])
+	 '&vpsrlq	($t1,@X[7],$sigma1[0]);',
+	 '&vpxor	($t3,$t3,$t2)',
+	 '&vpsllq	($t2,$t2,$sigma1[1]-$sigma1[0]);',
+	 '&vpxor	($t3,$t3,$t1)',
+	 '&vpsrlq	($t1,$t1,$sigma1[1]-$sigma1[0]);',
+	 '&vpxor	($t3,$t3,$t2)',
+	 '&vpxor	($t3,$t3,$t1)',		# sigma1(X[14..15])
+	'&vpaddq	(@X[0],@X[0],$t3)',	# X[0..1] += sigma1(X[14..15])
+	);
+}
+
+sub AVX_512_00_47 () {
+my $j = shift;
+my $body = shift;
+my @X = @_;
+my @insns = (&$body,&$body);			# 52 instructions
+
+	foreach (Xupdate_512_AVX()) {		# 23 instructions
+	    eval;
+	    eval(shift(@insns));
+	    eval(shift(@insns));
+	}
+	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
+	  foreach (@insns) { eval; }		# remaining instructions
+	&vmovdqa	(16*$j."(%rsp)",$t2);
+}
+
+    for ($i=0,$j=0; $j<8; $j++) {
+	&AVX_512_00_47($j,\&body_00_15,@X);
+	push(@X,shift(@X));			# rotate(@X)
+    }
+	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
+	&jne	(".Lavx_00_47");
+
+    for ($i=0; $i<16; ) {
+	foreach(body_00_15()) { eval; }
+    }
+}
+$code.=<<___;
+	mov	$_ctx,$ctx
+	mov	$a1,$A
+
+	add	$SZ*0($ctx),$A
+	lea	16*$SZ($inp),$inp
+	add	$SZ*1($ctx),$B
+	add	$SZ*2($ctx),$C
+	add	$SZ*3($ctx),$D
+	add	$SZ*4($ctx),$E
+	add	$SZ*5($ctx),$F
+	add	$SZ*6($ctx),$G
+	add	$SZ*7($ctx),$H
+
+	cmp	$_end,$inp
+
+	mov	$A,$SZ*0($ctx)
+	mov	$B,$SZ*1($ctx)
+	mov	$C,$SZ*2($ctx)
+	mov	$D,$SZ*3($ctx)
+	mov	$E,$SZ*4($ctx)
+	mov	$F,$SZ*5($ctx)
+	mov	$G,$SZ*6($ctx)
+	mov	$H,$SZ*7($ctx)
+	jb	.Lloop_avx
+
+	mov	$_rsp,%rsi
+	vzeroupper
+___
+$code.=<<___ if ($win64);
+	movaps	16*$SZ+32(%rsp),%xmm6
+	movaps	16*$SZ+48(%rsp),%xmm7
+	movaps	16*$SZ+64(%rsp),%xmm8
+	movaps	16*$SZ+80(%rsp),%xmm9
+___
+$code.=<<___ if ($win64 && $SZ>4);
+	movaps	16*$SZ+96(%rsp),%xmm10
+	movaps	16*$SZ+112(%rsp),%xmm11
+___
+$code.=<<___;
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
+.Lepilogue_avx:
+	ret
+.size	${func}_avx,.-${func}_avx
+___
+
+if ($avx>1) {{
+######################################################################
+# AVX2+BMI code path
+#
+my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
+my $PUSH8=8*2*$SZ;
+use integer;
+
+sub bodyx_00_15 () {
+	# at start, $a1 should be zero, $a3 should hold $b^$c, and $a4 a copy of $f
+	(
+	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
+
+	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
+	'&and	($a4,$e)',		# f&e
+	'&rorx	($a0,$e,$Sigma1[2])',
+	'&rorx	($a2,$e,$Sigma1[1])',
+
+	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
+	'&lea	($h,"($h,$a4)")',
+	'&andn	($a4,$e,$g)',		# ~e&g
+	'&xor	($a0,$a2)',
+
+	'&rorx	($a1,$e,$Sigma1[0])',
+	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
+	'&xor	($a0,$a1)',		# Sigma1(e)
+	'&mov	($a2,$a)',
+
+	'&rorx	($a4,$a,$Sigma0[2])',
+	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
+	'&xor	($a2,$b)',		# a^b, b^c in next round
+	'&rorx	($a1,$a,$Sigma0[1])',
+
+	'&rorx	($a0,$a,$Sigma0[0])',
+	'&lea	($d,"($d,$h)")',	# d+=h
+	'&and	($a3,$a2)',		# (b^c)&(a^b)
+	'&xor	($a1,$a4)',
+
+	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
+	'&xor	($a1,$a0)',		# Sigma0(a)
+	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
+	'&mov	($a4,$e)',		# copy of f in future
+
+	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
+	);
+	# and at the finish one still has to do $a+=$a1
+}
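+# In scalar terms, each invocation of bodyx_00_15 above performs one SHA-2
+# round:
+#
+#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
+#	d += T1
+#	h  = T1 + Sigma0(a) + Maj(a,b,c)
+#
+# except that Sigma0(a) is left in $a1 and only folded in at the start of
+# the following round (and once more after the last round), which is what
+# the "from the past" comment and the closing note above refer to.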
+
+$code.=<<___;
+.type	${func}_avx2,\@function,3
+.align	64
+${func}_avx2:
+.Lavx2_shortcut:
+	mov	%rsp,%rax		# copy %rsp
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
+	shl	\$4,%rdx		# num*16
+	and	\$-256*$SZ,%rsp		# align stack frame
+	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
+	add	\$`2*$SZ*($rounds-8)`,%rsp
+	mov	$ctx,$_ctx		# save ctx, 1st arg
+	mov	$inp,$_inp		# save inp, 2nd arg
+	mov	%rdx,$_end		# save end pointer, "3rd" arg
+	mov	%rax,$_rsp		# save copy of %rsp
+___
+$code.=<<___ if ($win64);
+	movaps	%xmm6,16*$SZ+32(%rsp)
+	movaps	%xmm7,16*$SZ+48(%rsp)
+	movaps	%xmm8,16*$SZ+64(%rsp)
+	movaps	%xmm9,16*$SZ+80(%rsp)
+___
+$code.=<<___ if ($win64 && $SZ>4);
+	movaps	%xmm10,16*$SZ+96(%rsp)
+	movaps	%xmm11,16*$SZ+112(%rsp)
+___
+$code.=<<___;
+.Lprologue_avx2:
+
+	vzeroupper
+	sub	\$-16*$SZ,$inp		# inp++, size optimization
+	mov	$SZ*0($ctx),$A
+	mov	$inp,%r12		# borrow $T1
+	mov	$SZ*1($ctx),$B
+	cmp	%rdx,$inp		# $_end
+	mov	$SZ*2($ctx),$C
+	cmove	%rsp,%r12		# next block or random data
+	mov	$SZ*3($ctx),$D
+	mov	$SZ*4($ctx),$E
+	mov	$SZ*5($ctx),$F
+	mov	$SZ*6($ctx),$G
+	mov	$SZ*7($ctx),$H
+___
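+# The AVX2 paths below hash two blocks per iteration: the low 128-bit lanes
+# of each register hold words of the current block, and the high lanes
+# (filled from %r12 with vinserti128 below) hold the following block. On the
+# final block, %r12 is redirected to the stack by the cmove above, so the
+# high lanes process data whose result is simply discarded.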
+					if ($SZ==4) {	# SHA256
+    my @X = map("%ymm$_",(0..3));
+    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
+
+$code.=<<___;
+	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
+	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
+	jmp	.Loop_avx2
+.align	16
+.Loop_avx2:
+	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
+	vmovdqu	-16*$SZ+0($inp),%xmm0
+	vmovdqu	-16*$SZ+16($inp),%xmm1
+	vmovdqu	-16*$SZ+32($inp),%xmm2
+	vmovdqu	-16*$SZ+48($inp),%xmm3
+	#mov		$inp,$_inp	# offload $inp
+	vinserti128	\$1,(%r12),@X[0],@X[0]
+	vinserti128	\$1,16(%r12),@X[1],@X[1]
+	vpshufb		$t3,@X[0],@X[0]
+	vinserti128	\$1,32(%r12),@X[2],@X[2]
+	vpshufb		$t3,@X[1],@X[1]
+	vinserti128	\$1,48(%r12),@X[3],@X[3]
+
+	lea	$TABLE(%rip),$Tbl
+	vpshufb	$t3,@X[2],@X[2]
+	vpaddd	0x00($Tbl),@X[0],$t0
+	vpshufb	$t3,@X[3],@X[3]
+	vpaddd	0x20($Tbl),@X[1],$t1
+	vpaddd	0x40($Tbl),@X[2],$t2
+	vpaddd	0x60($Tbl),@X[3],$t3
+	vmovdqa	$t0,0x00(%rsp)
+	xor	$a1,$a1
+	vmovdqa	$t1,0x20(%rsp)
+	lea	-$PUSH8(%rsp),%rsp
+	mov	$B,$a3
+	vmovdqa	$t2,0x00(%rsp)
+	xor	$C,$a3			# magic
+	vmovdqa	$t3,0x20(%rsp)
+	mov	$F,$a4
+	sub	\$-16*2*$SZ,$Tbl	# size optimization
+	jmp	.Lavx2_00_47
+
+.align	16
+.Lavx2_00_47:
+___
+
+sub AVX2_256_00_47 () {
+my $j = shift;
+my $body = shift;
+my @X = @_;
+my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
+my $base = "+2*$PUSH8(%rsp)";
+
+	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
+	foreach (Xupdate_256_AVX()) {		# 29 instructions
+	    eval;
+	    eval(shift(@insns));
+	    eval(shift(@insns));
+	    eval(shift(@insns));
+	}
+	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
+	  foreach (@insns) { eval; }		# remaining instructions
+	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
+}
+
+    for ($i=0,$j=0; $j<4; $j++) {
+	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
+	push(@X,shift(@X));			# rotate(@X)
+    }
+	&lea	($Tbl,16*2*$SZ."($Tbl)");
+	&cmpb	(($SZ-1)."($Tbl)",0);
+	&jne	(".Lavx2_00_47");
+
+    for ($i=0; $i<16; ) {
+	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
+	foreach(bodyx_00_15()) { eval; }
+    }
+					} else {	# SHA512
+    my @X = map("%ymm$_",(0..7));
+    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
+
+$code.=<<___;
+	jmp	.Loop_avx2
+.align	16
+.Loop_avx2:
+	vmovdqu	-16*$SZ($inp),%xmm0
+	vmovdqu	-16*$SZ+16($inp),%xmm1
+	vmovdqu	-16*$SZ+32($inp),%xmm2
+	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
+	vmovdqu	-16*$SZ+48($inp),%xmm3
+	vmovdqu	-16*$SZ+64($inp),%xmm4
+	vmovdqu	-16*$SZ+80($inp),%xmm5
+	vmovdqu	-16*$SZ+96($inp),%xmm6
+	vmovdqu	-16*$SZ+112($inp),%xmm7
+	#mov	$inp,$_inp	# offload $inp
+	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
+	vinserti128	\$1,(%r12),@X[0],@X[0]
+	vinserti128	\$1,16(%r12),@X[1],@X[1]
+	 vpshufb	$t2,@X[0],@X[0]
+	vinserti128	\$1,32(%r12),@X[2],@X[2]
+	 vpshufb	$t2,@X[1],@X[1]
+	vinserti128	\$1,48(%r12),@X[3],@X[3]
+	 vpshufb	$t2,@X[2],@X[2]
+	vinserti128	\$1,64(%r12),@X[4],@X[4]
+	 vpshufb	$t2,@X[3],@X[3]
+	vinserti128	\$1,80(%r12),@X[5],@X[5]
+	 vpshufb	$t2,@X[4],@X[4]
+	vinserti128	\$1,96(%r12),@X[6],@X[6]
+	 vpshufb	$t2,@X[5],@X[5]
+	vinserti128	\$1,112(%r12),@X[7],@X[7]
+
+	vpaddq	-0x80($Tbl),@X[0],$t0
+	vpshufb	$t2,@X[6],@X[6]
+	vpaddq	-0x60($Tbl),@X[1],$t1
+	vpshufb	$t2,@X[7],@X[7]
+	vpaddq	-0x40($Tbl),@X[2],$t2
+	vpaddq	-0x20($Tbl),@X[3],$t3
+	vmovdqa	$t0,0x00(%rsp)
+	vpaddq	0x00($Tbl),@X[4],$t0
+	vmovdqa	$t1,0x20(%rsp)
+	vpaddq	0x20($Tbl),@X[5],$t1
+	vmovdqa	$t2,0x40(%rsp)
+	vpaddq	0x40($Tbl),@X[6],$t2
+	vmovdqa	$t3,0x60(%rsp)
+	lea	-$PUSH8(%rsp),%rsp
+	vpaddq	0x60($Tbl),@X[7],$t3
+	vmovdqa	$t0,0x00(%rsp)
+	xor	$a1,$a1
+	vmovdqa	$t1,0x20(%rsp)
+	mov	$B,$a3
+	vmovdqa	$t2,0x40(%rsp)
+	xor	$C,$a3			# magic
+	vmovdqa	$t3,0x60(%rsp)
+	mov	$F,$a4
+	add	\$16*2*$SZ,$Tbl
+	jmp	.Lavx2_00_47
+
+.align	16
+.Lavx2_00_47:
+___
+
+sub AVX2_512_00_47 () {
+my $j = shift;
+my $body = shift;
+my @X = @_;
+my @insns = (&$body,&$body);			# 48 instructions
+my $base = "+2*$PUSH8(%rsp)";
+
+	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%4)==0);
+	foreach (Xupdate_512_AVX()) {		# 23 instructions
+	    eval;
+	    if ($_ !~ /\;$/) {
+		eval(shift(@insns));
+		eval(shift(@insns));
+		eval(shift(@insns));
+	    }
+	}
+	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
+	  foreach (@insns) { eval; }		# remaining instructions
+	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
+}
+
+    for ($i=0,$j=0; $j<8; $j++) {
+	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
+	push(@X,shift(@X));			# rotate(@X)
+    }
+	&lea	($Tbl,16*2*$SZ."($Tbl)");
+	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
+	&jne	(".Lavx2_00_47");
+
+    for ($i=0; $i<16; ) {
+	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
+	foreach(bodyx_00_15()) { eval; }
+    }
+}
+$code.=<<___;
+	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
+	add	$a1,$A
+	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
+	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
+
+	add	$SZ*0($ctx),$A
+	add	$SZ*1($ctx),$B
+	add	$SZ*2($ctx),$C
+	add	$SZ*3($ctx),$D
+	add	$SZ*4($ctx),$E
+	add	$SZ*5($ctx),$F
+	add	$SZ*6($ctx),$G
+	add	$SZ*7($ctx),$H
+
+	mov	$A,$SZ*0($ctx)
+	mov	$B,$SZ*1($ctx)
+	mov	$C,$SZ*2($ctx)
+	mov	$D,$SZ*3($ctx)
+	mov	$E,$SZ*4($ctx)
+	mov	$F,$SZ*5($ctx)
+	mov	$G,$SZ*6($ctx)
+	mov	$H,$SZ*7($ctx)
+
+	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
+	je	.Ldone_avx2
+
+	xor	$a1,$a1
+	mov	$B,$a3
+	xor	$C,$a3			# magic
+	mov	$F,$a4
+	jmp	.Lower_avx2
+.align	16
+.Lower_avx2:
+___
+    for ($i=0; $i<8; ) {
+	my $base="+16($Tbl)";
+	foreach(bodyx_00_15()) { eval; }
+    }
+$code.=<<___;
+	lea	-$PUSH8($Tbl),$Tbl
+	cmp	%rsp,$Tbl
+	jae	.Lower_avx2
+
+	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
+	add	$a1,$A
+	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
+	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
+
+	add	$SZ*0($ctx),$A
+	add	$SZ*1($ctx),$B
+	add	$SZ*2($ctx),$C
+	add	$SZ*3($ctx),$D
+	add	$SZ*4($ctx),$E
+	add	$SZ*5($ctx),$F
+	lea	`2*16*$SZ`($inp),$inp	# inp+=2
+	add	$SZ*6($ctx),$G
+	mov	$inp,%r12
+	add	$SZ*7($ctx),$H
+	cmp	$_end,$inp
+
+	mov	$A,$SZ*0($ctx)
+	cmove	%rsp,%r12		# next block or stale data
+	mov	$B,$SZ*1($ctx)
+	mov	$C,$SZ*2($ctx)
+	mov	$D,$SZ*3($ctx)
+	mov	$E,$SZ*4($ctx)
+	mov	$F,$SZ*5($ctx)
+	mov	$G,$SZ*6($ctx)
+	mov	$H,$SZ*7($ctx)
+
+	jbe	.Loop_avx2
+	lea	(%rsp),$Tbl
+
+.Ldone_avx2:
+	lea	($Tbl),%rsp
+	mov	$_rsp,%rsi
+	vzeroupper
+___
+$code.=<<___ if ($win64);
+	movaps	16*$SZ+32(%rsp),%xmm6
+	movaps	16*$SZ+48(%rsp),%xmm7
+	movaps	16*$SZ+64(%rsp),%xmm8
+	movaps	16*$SZ+80(%rsp),%xmm9
+___
+$code.=<<___ if ($win64 && $SZ>4);
+	movaps	16*$SZ+96(%rsp),%xmm10
+	movaps	16*$SZ+112(%rsp),%xmm11
+___
+$code.=<<___;
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
+.Lepilogue_avx2:
+	ret
+.size	${func}_avx2,.-${func}_avx2
+___
+}}
+}}}}}
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	se_handler,\@abi-omnipotent
+.align	16
+se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lin_prologue
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lin_prologue
+___
+$code.=<<___ if ($avx>1);
+	lea	.Lavx2_shortcut(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
+	jb	.Lnot_in_avx2
+
+	and	\$-256*$SZ,%rax
+	add	\$`2*$SZ*($rounds-8)`,%rax
+.Lnot_in_avx2:
+___
+$code.=<<___;
+	mov	%rax,%rsi		# put aside Rsp
+	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
+
+	mov	-8(%rax),%rbx
+	mov	-16(%rax),%rbp
+	mov	-24(%rax),%r12
+	mov	-32(%rax),%r13
+	mov	-40(%rax),%r14
+	mov	-48(%rax),%r15
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+	lea	.Lepilogue(%rip),%r10
+	cmp	%r10,%rbx
+	jb	.Lin_prologue		# non-AVX code
+
+	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
+	lea	512($context),%rdi	# &context.Xmm6
+	mov	\$`$SZ==4?8:12`,%ecx
+	.long	0xa548f3fc		# cld; rep movsq
+
+.Lin_prologue:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	se_handler,.-se_handler
+___
+
+$code.=<<___ if ($SZ==4 && $shaext);
+.type	shaext_handler,\@abi-omnipotent
+.align	16
+shaext_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	lea	.Lprologue_shaext(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<.Lprologue_shaext
+	jb	.Lin_prologue
+
+	lea	.Lepilogue_shaext(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip>=.Lepilogue_shaext
+	jae	.Lin_prologue
+
+	lea	-8-5*16(%rax),%rsi
+	lea	512($context),%rdi	# &context.Xmm6
+	mov	\$10,%ecx
+	.long	0xa548f3fc		# cld; rep movsq
+
+	jmp	.Lin_prologue
+.size	shaext_handler,.-shaext_handler
+___
+
+$code.=<<___;
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_$func
+	.rva	.LSEH_end_$func
+	.rva	.LSEH_info_$func
+___
+$code.=<<___ if ($SZ==4 && $shaext);
+	.rva	.LSEH_begin_${func}_shaext
+	.rva	.LSEH_end_${func}_shaext
+	.rva	.LSEH_info_${func}_shaext
+___
+$code.=<<___ if ($SZ==4);
+	.rva	.LSEH_begin_${func}_ssse3
+	.rva	.LSEH_end_${func}_ssse3
+	.rva	.LSEH_info_${func}_ssse3
+___
+$code.=<<___ if ($avx && $SZ==8);
+	.rva	.LSEH_begin_${func}_xop
+	.rva	.LSEH_end_${func}_xop
+	.rva	.LSEH_info_${func}_xop
+___
+$code.=<<___ if ($avx);
+	.rva	.LSEH_begin_${func}_avx
+	.rva	.LSEH_end_${func}_avx
+	.rva	.LSEH_info_${func}_avx
+___
+$code.=<<___ if ($avx>1);
+	.rva	.LSEH_begin_${func}_avx2
+	.rva	.LSEH_end_${func}_avx2
+	.rva	.LSEH_info_${func}_avx2
+___
+$code.=<<___;
+.section	.xdata
+.align	8
+.LSEH_info_$func:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lprologue,.Lepilogue			# HandlerData[]
+___
+$code.=<<___ if ($SZ==4 && $shaext);
+.LSEH_info_${func}_shaext:
+	.byte	9,0,0,0
+	.rva	shaext_handler
+___
+$code.=<<___ if ($SZ==4);
+.LSEH_info_${func}_ssse3:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
+___
+$code.=<<___ if ($avx && $SZ==8);
+.LSEH_info_${func}_xop:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
+___
+$code.=<<___ if ($avx);
+.LSEH_info_${func}_avx:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
+___
+$code.=<<___ if ($avx>1);
+.LSEH_info_${func}_avx2:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
+___
+}
+
+sub sha256op38 {
+    my $instr = shift;
+    my %opcodelet = (
+		"sha256rnds2" => 0xcb,
+		"sha256msg1"  => 0xcc,
+		"sha256msg2"  => 0xcd	);
+
+    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
+	my @opcode=(0x0f,0x38);
+	push @opcode,$opcodelet{$instr};
+	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
+	return ".byte\t".join(',',@opcode);
+    } else {
+	return $instr."\t".@_[0];
+    }
+}
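+# Illustrative example (operand choice arbitrary): "sha256rnds2 %xmm1,%xmm2"
+# is rewritten by the sub above as ".byte 0x0f,0x38,0xcb,0xd1", i.e. the
+# three opcode bytes followed by ModR/M = 0xc0|1|(2<<3) = 0xd1, so that
+# assemblers without SHA extension support can still build this file.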
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/geo;
+
+	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
+
+	print $_,"\n";
+}
+close STDOUT;
diff --git a/crypto/fipsmodule/sha/sha1-altivec.c b/crypto/fipsmodule/sha/sha1-altivec.c
new file mode 100644
index 0000000..14e2bae
--- /dev/null
+++ b/crypto/fipsmodule/sha/sha1-altivec.c
@@ -0,0 +1,361 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.] */
+
+/* AltiVec-optimised SHA-1 in C. This is tested on ppc64le only.
+ *
+ * References:
+ * https://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1
+ * http://arctic.org/~dean/crypto/sha1.html
+ *
+ * This code uses the generic SHA-1 implementation from OpenSSL as a basis,
+ * with AltiVec optimisations added on top. */
+
+#include <openssl/sha.h>
+
+#if defined(OPENSSL_PPC64LE)
+
+#include <altivec.h>
+
+void sha1_block_data_order(uint32_t *state, const uint8_t *data, size_t num);
+
+static uint32_t rotate(uint32_t a, int n) { return (a << n) | (a >> (32 - n)); }
+
+typedef vector unsigned int vec_uint32_t;
+typedef vector unsigned char vec_uint8_t;
+
+/* Vector constants */
+static const vec_uint8_t k_swap_endianness = {3,  2,  1, 0, 7,  6,  5,  4,
+                                              11, 10, 9, 8, 15, 14, 13, 12};
+
+/* Shift amounts for byte and bit shifts and rotations */
+static const vec_uint8_t k_4_bytes = {32, 32, 32, 32, 32, 32, 32, 32,
+                                      32, 32, 32, 32, 32, 32, 32, 32};
+static const vec_uint8_t k_12_bytes = {96, 96, 96, 96, 96, 96, 96, 96,
+                                       96, 96, 96, 96, 96, 96, 96, 96};
+
+#define K_00_19 0x5a827999UL
+#define K_20_39 0x6ed9eba1UL
+#define K_40_59 0x8f1bbcdcUL
+#define K_60_79 0xca62c1d6UL
+
+/* Vector versions of the above. */
+static const vec_uint32_t K_00_19_x_4 = {K_00_19, K_00_19, K_00_19, K_00_19};
+static const vec_uint32_t K_20_39_x_4 = {K_20_39, K_20_39, K_20_39, K_20_39};
+static const vec_uint32_t K_40_59_x_4 = {K_40_59, K_40_59, K_40_59, K_40_59};
+static const vec_uint32_t K_60_79_x_4 = {K_60_79, K_60_79, K_60_79, K_60_79};
+
+/* Vector message scheduling: compute the message schedule for rounds i..i+3,
+ * where i is divisible by 4. We return the schedule w[i..i+3] as a vector. In
+ * addition, we also precompute the sum of w[i..i+3] and the additive constant
+ * K, which offloads some work from the integer execution units that compute
+ * f().
+ *
+ * The byte shifting code below may not be correct for big-endian systems. */
+static vec_uint32_t sched_00_15(vec_uint32_t *pre_added, const void *data,
+                                vec_uint32_t k) {
+  const vector unsigned char unaligned_data =
+    vec_vsx_ld(0, (const unsigned char*) data);
+  const vec_uint32_t v = (vec_uint32_t) unaligned_data;
+  const vec_uint32_t w = vec_perm(v, v, k_swap_endianness);
+  vec_st(w + k, 0, pre_added);
+  return w;
+}
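+/* An illustrative scalar equivalent of sched_00_15 (for reference only, not
+ * used by this file):
+ *
+ *   for (int j = 0; j < 4; j++) {
+ *     w[j] = ((uint32_t)p[4 * j] << 24) | ((uint32_t)p[4 * j + 1] << 16) |
+ *            ((uint32_t)p[4 * j + 2] << 8) | (uint32_t)p[4 * j + 3];
+ *     pre_added[j] = w[j] + K;
+ *   }
+ *
+ * where p points at |data| and K is the round constant. */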
+
+/* Compute w[i..i+3] using these steps for i in [16, 20, 24, 28]
+ *
+ * w'[i  ]  = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) <<< 1
+ * w'[i+1]  = (w[i-2] ^ w[i-7] ^ w[i-13] ^ w[i-15]) <<< 1
+ * w'[i+2]  = (w[i-1] ^ w[i-6] ^ w[i-12] ^ w[i-14]) <<< 1
+ * w'[i+3]  = (     0 ^ w[i-5] ^ w[i-11] ^ w[i-13]) <<< 1
+ *
+ * w[  i] = w'[  i]
+ * w[i+1] = w'[i+1]
+ * w[i+2] = w'[i+2]
+ * w[i+3] = w'[i+3] ^ (w'[i] <<< 1) */
+static vec_uint32_t sched_16_31(vec_uint32_t *pre_added, vec_uint32_t minus_4,
+                                vec_uint32_t minus_8, vec_uint32_t minus_12,
+                                vec_uint32_t minus_16, vec_uint32_t k) {
+  const vec_uint32_t minus_3 = vec_sro(minus_4, k_4_bytes);
+  const vec_uint32_t minus_14 = vec_sld((minus_12), (minus_16), 8);
+  const vec_uint32_t k_1_bit = vec_splat_u32(1);
+  const vec_uint32_t w_prime =
+      vec_rl(minus_3 ^ minus_8 ^ minus_14 ^ minus_16, k_1_bit);
+  const vec_uint32_t w =
+      w_prime ^ vec_rl(vec_slo(w_prime, k_12_bytes), k_1_bit);
+  vec_st(w + k, 0, pre_added);
+  return w;
+}
+
+/* Compute w[i..i+3] using this relation for i in [32, 36, 40 ... 76]:
+ * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) <<< 2 */
+static vec_uint32_t sched_32_79(vec_uint32_t *pre_added, vec_uint32_t minus_4,
+                                vec_uint32_t minus_8, vec_uint32_t minus_16,
+                                vec_uint32_t minus_28, vec_uint32_t minus_32,
+                                vec_uint32_t k) {
+  const vec_uint32_t minus_6 = vec_sld(minus_4, minus_8, 8);
+  const vec_uint32_t k_2_bits = vec_splat_u32(2);
+  const vec_uint32_t w =
+      vec_rl(minus_6 ^ minus_16 ^ minus_28 ^ minus_32, k_2_bits);
+  vec_st(w + k, 0, pre_added);
+  return w;
+}
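+/* The relation above comes from unrolling the basic SHA-1 recurrence
+ * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) <<< 1 one step further (see
+ * the Intel article referenced at the top of this file): the smallest
+ * back-reference becomes w[i-6], so no lane of the four-wide vector depends
+ * on another lane of the same vector. */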
+
+/* As pointed out by Wei Dai <weidai@eskimo.com>, F() below can be simplified
+ * to the code in F_00_19, where F() is
+ *
+ *   #define F(x,y,z) (((x) & (y)) | ((~(x)) & (z)))
+ *
+ * Wei attributes these optimisations to Peter Gutmann's SHS code, and he
+ * attributes it to Rich Schroeppel. I've just become aware of another tweak
+ * to be made, again from Wei Dai, in F_40_59: (x&a)|(y&a) -> (x|y)&a */
+#define F_00_19(b, c, d) ((((c) ^ (d)) & (b)) ^ (d))
+#define F_20_39(b, c, d) ((b) ^ (c) ^ (d))
+#define F_40_59(b, c, d) (((b) & (c)) | (((b) | (c)) & (d)))
+#define F_60_79(b, c, d) F_20_39(b, c, d)
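+/* To see the simplification: F(b,c,d) = (b & c) | (~b & d) selects c where b
+ * has a one bit and d where it has a zero bit, which is exactly
+ * d ^ (b & (c ^ d)), i.e. the ((c^d) & b) ^ d used in F_00_19, saving one
+ * boolean operation. */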
+
+/* We pre-added the K constants during message scheduling. */
+#define BODY_00_19(i, a, b, c, d, e, f)                         \
+  do {                                                          \
+    (f) = w[i] + (e) + rotate((a), 5) + F_00_19((b), (c), (d)); \
+    (b) = rotate((b), 30);                                      \
+  } while (0)
+
+#define BODY_20_39(i, a, b, c, d, e, f)                         \
+  do {                                                          \
+    (f) = w[i] + (e) + rotate((a), 5) + F_20_39((b), (c), (d)); \
+    (b) = rotate((b), 30);                                      \
+  } while (0)
+
+#define BODY_40_59(i, a, b, c, d, e, f)                         \
+  do {                                                          \
+    (f) = w[i] + (e) + rotate((a), 5) + F_40_59((b), (c), (d)); \
+    (b) = rotate((b), 30);                                      \
+  } while (0)
+
+#define BODY_60_79(i, a, b, c, d, e, f)                         \
+  do {                                                          \
+    (f) = w[i] + (e) + rotate((a), 5) + F_60_79((b), (c), (d)); \
+    (b) = rotate((b), 30);                                      \
+  } while (0)
+
+void sha1_block_data_order(uint32_t *state, const uint8_t *data, size_t num) {
+  uint32_t A, B, C, D, E, T;
+
+  A = state[0];
+  B = state[1];
+  C = state[2];
+  D = state[3];
+  E = state[4];
+
+  for (;;) {
+    vec_uint32_t vw[20];
+    const uint32_t *w = (const uint32_t *)&vw;
+
+    vec_uint32_t k = K_00_19_x_4;
+    const vec_uint32_t w0 = sched_00_15(vw + 0, data + 0, k);
+    BODY_00_19(0, A, B, C, D, E, T);
+    BODY_00_19(1, T, A, B, C, D, E);
+    BODY_00_19(2, E, T, A, B, C, D);
+    BODY_00_19(3, D, E, T, A, B, C);
+
+    const vec_uint32_t w4 = sched_00_15(vw + 1, data + 16, k);
+    BODY_00_19(4, C, D, E, T, A, B);
+    BODY_00_19(5, B, C, D, E, T, A);
+    BODY_00_19(6, A, B, C, D, E, T);
+    BODY_00_19(7, T, A, B, C, D, E);
+
+    const vec_uint32_t w8 = sched_00_15(vw + 2, data + 32, k);
+    BODY_00_19(8, E, T, A, B, C, D);
+    BODY_00_19(9, D, E, T, A, B, C);
+    BODY_00_19(10, C, D, E, T, A, B);
+    BODY_00_19(11, B, C, D, E, T, A);
+
+    const vec_uint32_t w12 = sched_00_15(vw + 3, data + 48, k);
+    BODY_00_19(12, A, B, C, D, E, T);
+    BODY_00_19(13, T, A, B, C, D, E);
+    BODY_00_19(14, E, T, A, B, C, D);
+    BODY_00_19(15, D, E, T, A, B, C);
+
+    const vec_uint32_t w16 = sched_16_31(vw + 4, w12, w8, w4, w0, k);
+    BODY_00_19(16, C, D, E, T, A, B);
+    BODY_00_19(17, B, C, D, E, T, A);
+    BODY_00_19(18, A, B, C, D, E, T);
+    BODY_00_19(19, T, A, B, C, D, E);
+
+    k = K_20_39_x_4;
+    const vec_uint32_t w20 = sched_16_31(vw + 5, w16, w12, w8, w4, k);
+    BODY_20_39(20, E, T, A, B, C, D);
+    BODY_20_39(21, D, E, T, A, B, C);
+    BODY_20_39(22, C, D, E, T, A, B);
+    BODY_20_39(23, B, C, D, E, T, A);
+
+    const vec_uint32_t w24 = sched_16_31(vw + 6, w20, w16, w12, w8, k);
+    BODY_20_39(24, A, B, C, D, E, T);
+    BODY_20_39(25, T, A, B, C, D, E);
+    BODY_20_39(26, E, T, A, B, C, D);
+    BODY_20_39(27, D, E, T, A, B, C);
+
+    const vec_uint32_t w28 = sched_16_31(vw + 7, w24, w20, w16, w12, k);
+    BODY_20_39(28, C, D, E, T, A, B);
+    BODY_20_39(29, B, C, D, E, T, A);
+    BODY_20_39(30, A, B, C, D, E, T);
+    BODY_20_39(31, T, A, B, C, D, E);
+
+    const vec_uint32_t w32 = sched_32_79(vw + 8, w28, w24, w16, w4, w0, k);
+    BODY_20_39(32, E, T, A, B, C, D);
+    BODY_20_39(33, D, E, T, A, B, C);
+    BODY_20_39(34, C, D, E, T, A, B);
+    BODY_20_39(35, B, C, D, E, T, A);
+
+    const vec_uint32_t w36 = sched_32_79(vw + 9, w32, w28, w20, w8, w4, k);
+    BODY_20_39(36, A, B, C, D, E, T);
+    BODY_20_39(37, T, A, B, C, D, E);
+    BODY_20_39(38, E, T, A, B, C, D);
+    BODY_20_39(39, D, E, T, A, B, C);
+
+    k = K_40_59_x_4;
+    const vec_uint32_t w40 = sched_32_79(vw + 10, w36, w32, w24, w12, w8, k);
+    BODY_40_59(40, C, D, E, T, A, B);
+    BODY_40_59(41, B, C, D, E, T, A);
+    BODY_40_59(42, A, B, C, D, E, T);
+    BODY_40_59(43, T, A, B, C, D, E);
+
+    const vec_uint32_t w44 = sched_32_79(vw + 11, w40, w36, w28, w16, w12, k);
+    BODY_40_59(44, E, T, A, B, C, D);
+    BODY_40_59(45, D, E, T, A, B, C);
+    BODY_40_59(46, C, D, E, T, A, B);
+    BODY_40_59(47, B, C, D, E, T, A);
+
+    const vec_uint32_t w48 = sched_32_79(vw + 12, w44, w40, w32, w20, w16, k);
+    BODY_40_59(48, A, B, C, D, E, T);
+    BODY_40_59(49, T, A, B, C, D, E);
+    BODY_40_59(50, E, T, A, B, C, D);
+    BODY_40_59(51, D, E, T, A, B, C);
+
+    const vec_uint32_t w52 = sched_32_79(vw + 13, w48, w44, w36, w24, w20, k);
+    BODY_40_59(52, C, D, E, T, A, B);
+    BODY_40_59(53, B, C, D, E, T, A);
+    BODY_40_59(54, A, B, C, D, E, T);
+    BODY_40_59(55, T, A, B, C, D, E);
+
+    const vec_uint32_t w56 = sched_32_79(vw + 14, w52, w48, w40, w28, w24, k);
+    BODY_40_59(56, E, T, A, B, C, D);
+    BODY_40_59(57, D, E, T, A, B, C);
+    BODY_40_59(58, C, D, E, T, A, B);
+    BODY_40_59(59, B, C, D, E, T, A);
+
+    k = K_60_79_x_4;
+    const vec_uint32_t w60 = sched_32_79(vw + 15, w56, w52, w44, w32, w28, k);
+    BODY_60_79(60, A, B, C, D, E, T);
+    BODY_60_79(61, T, A, B, C, D, E);
+    BODY_60_79(62, E, T, A, B, C, D);
+    BODY_60_79(63, D, E, T, A, B, C);
+
+    const vec_uint32_t w64 = sched_32_79(vw + 16, w60, w56, w48, w36, w32, k);
+    BODY_60_79(64, C, D, E, T, A, B);
+    BODY_60_79(65, B, C, D, E, T, A);
+    BODY_60_79(66, A, B, C, D, E, T);
+    BODY_60_79(67, T, A, B, C, D, E);
+
+    const vec_uint32_t w68 = sched_32_79(vw + 17, w64, w60, w52, w40, w36, k);
+    BODY_60_79(68, E, T, A, B, C, D);
+    BODY_60_79(69, D, E, T, A, B, C);
+    BODY_60_79(70, C, D, E, T, A, B);
+    BODY_60_79(71, B, C, D, E, T, A);
+
+    const vec_uint32_t w72 = sched_32_79(vw + 18, w68, w64, w56, w44, w40, k);
+    BODY_60_79(72, A, B, C, D, E, T);
+    BODY_60_79(73, T, A, B, C, D, E);
+    BODY_60_79(74, E, T, A, B, C, D);
+    BODY_60_79(75, D, E, T, A, B, C);
+
+    /* We don't use the last value */
+    (void)sched_32_79(vw + 19, w72, w68, w60, w48, w44, k);
+    BODY_60_79(76, C, D, E, T, A, B);
+    BODY_60_79(77, B, C, D, E, T, A);
+    BODY_60_79(78, A, B, C, D, E, T);
+    BODY_60_79(79, T, A, B, C, D, E);
+
+    const uint32_t mask = 0xffffffffUL;
+    state[0] = (state[0] + E) & mask;
+    state[1] = (state[1] + T) & mask;
+    state[2] = (state[2] + A) & mask;
+    state[3] = (state[3] + B) & mask;
+    state[4] = (state[4] + C) & mask;
+
+    data += 64;
+    if (--num == 0) {
+      break;
+    }
+
+    A = state[0];
+    B = state[1];
+    C = state[2];
+    D = state[3];
+    E = state[4];
+  }
+}
+
+#endif  /* OPENSSL_PPC64LE */
+
+#undef K_00_19
+#undef K_20_39
+#undef K_40_59
+#undef K_60_79
+#undef F_00_19
+#undef F_20_39
+#undef F_40_59
+#undef F_60_79
+#undef BODY_00_19
+#undef BODY_20_39
+#undef BODY_40_59
+#undef BODY_60_79
diff --git a/crypto/fipsmodule/sha/sha1.c b/crypto/fipsmodule/sha/sha1.c
new file mode 100644
index 0000000..7b44563
--- /dev/null
+++ b/crypto/fipsmodule/sha/sha1.c
@@ -0,0 +1,375 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.] */
+
+#include <openssl/sha.h>
+
+#include <string.h>
+
+#include <openssl/mem.h>
+
+#include "../../internal.h"
+
+
+#if !defined(OPENSSL_NO_ASM) &&                         \
+    (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \
+     defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64) || \
+     defined(OPENSSL_PPC64LE))
+#define SHA1_ASM
+#endif
+
+int SHA1_Init(SHA_CTX *sha) {
+  OPENSSL_memset(sha, 0, sizeof(SHA_CTX));
+  sha->h[0] = 0x67452301UL;
+  sha->h[1] = 0xefcdab89UL;
+  sha->h[2] = 0x98badcfeUL;
+  sha->h[3] = 0x10325476UL;
+  sha->h[4] = 0xc3d2e1f0UL;
+  return 1;
+}
+
+uint8_t *SHA1(const uint8_t *data, size_t len, uint8_t *out) {
+  SHA_CTX ctx;
+  SHA1_Init(&ctx);
+  SHA1_Update(&ctx, data, len);
+  SHA1_Final(out, &ctx);
+  OPENSSL_cleanse(&ctx, sizeof(ctx));
+  return out;
+}
+
+#define DATA_ORDER_IS_BIG_ENDIAN
+
+#define HASH_CTX                SHA_CTX
+#define HASH_CBLOCK             64
+#define HASH_MAKE_STRING(c, s) \
+  do {                         \
+    uint32_t ll;               \
+    ll = (c)->h[0];            \
+    HOST_l2c(ll, (s));         \
+    ll = (c)->h[1];            \
+    HOST_l2c(ll, (s));         \
+    ll = (c)->h[2];            \
+    HOST_l2c(ll, (s));         \
+    ll = (c)->h[3];            \
+    HOST_l2c(ll, (s));         \
+    ll = (c)->h[4];            \
+    HOST_l2c(ll, (s));         \
+  } while (0)
+
+#define HASH_UPDATE SHA1_Update
+#define HASH_TRANSFORM SHA1_Transform
+#define HASH_FINAL SHA1_Final
+#define HASH_BLOCK_DATA_ORDER sha1_block_data_order
+#define ROTATE(a, n) (((a) << (n)) | ((a) >> (32 - (n))))
+#define Xupdate(a, ix, ia, ib, ic, id) \
+  do {                                 \
+    (a) = ((ia) ^ (ib) ^ (ic) ^ (id)); \
+    (ix) = (a) = ROTATE((a), 1);       \
+  } while (0)
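+/* Xupdate computes the SHA-1 message schedule in place over a rolling
+ * 16-word window: W[t] = (W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]) <<< 1, with
+ * the result written back into the slot that held W[t-16] (the xi
+ * argument). */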
+
+#ifndef SHA1_ASM
+static
+#endif
+void sha1_block_data_order(uint32_t *state, const uint8_t *data, size_t num);
+
+#include "../digest/md32_common.h"
+
+#define K_00_19 0x5a827999UL
+#define K_20_39 0x6ed9eba1UL
+#define K_40_59 0x8f1bbcdcUL
+#define K_60_79 0xca62c1d6UL
+
+/* As pointed out by Wei Dai <weidai@eskimo.com>, F() below can be simplified
+ * to the code in F_00_19, where F() is
+ *
+ *   #define F(x,y,z) (((x) & (y)) | ((~(x)) & (z)))
+ *
+ * Wei attributes these optimisations to Peter Gutmann's SHS code, and he
+ * attributes it to Rich Schroeppel. I've just become aware of another tweak
+ * to be made, again from Wei Dai, in F_40_59: (x&a)|(y&a) -> (x|y)&a */
+#define F_00_19(b, c, d) ((((c) ^ (d)) & (b)) ^ (d))
+#define F_20_39(b, c, d) ((b) ^ (c) ^ (d))
+#define F_40_59(b, c, d) (((b) & (c)) | (((b) | (c)) & (d)))
+#define F_60_79(b, c, d) F_20_39(b, c, d)
+
+#define BODY_00_15(i, a, b, c, d, e, f, xi)                               \
+  do {                                                                    \
+    (f) = (xi) + (e) + K_00_19 + ROTATE((a), 5) + F_00_19((b), (c), (d)); \
+    (b) = ROTATE((b), 30);                                                \
+  } while (0)
+
+#define BODY_16_19(i, a, b, c, d, e, f, xi, xa, xb, xc, xd)         \
+  do {                                                              \
+    Xupdate(f, xi, xa, xb, xc, xd);                                 \
+    (f) += (e) + K_00_19 + ROTATE((a), 5) + F_00_19((b), (c), (d)); \
+    (b) = ROTATE((b), 30);                                          \
+  } while (0)
+
+#define BODY_20_31(i, a, b, c, d, e, f, xi, xa, xb, xc, xd)         \
+  do {                                                              \
+    Xupdate(f, xi, xa, xb, xc, xd);                                 \
+    (f) += (e) + K_20_39 + ROTATE((a), 5) + F_20_39((b), (c), (d)); \
+    (b) = ROTATE((b), 30);                                          \
+  } while (0)
+
+#define BODY_32_39(i, a, b, c, d, e, f, xa, xb, xc, xd)             \
+  do {                                                              \
+    Xupdate(f, xa, xa, xb, xc, xd);                                 \
+    (f) += (e) + K_20_39 + ROTATE((a), 5) + F_20_39((b), (c), (d)); \
+    (b) = ROTATE((b), 30);                                          \
+  } while (0)
+
+#define BODY_40_59(i, a, b, c, d, e, f, xa, xb, xc, xd)             \
+  do {                                                              \
+    Xupdate(f, xa, xa, xb, xc, xd);                                 \
+    (f) += (e) + K_40_59 + ROTATE((a), 5) + F_40_59((b), (c), (d)); \
+    (b) = ROTATE((b), 30);                                          \
+  } while (0)
+
+#define BODY_60_79(i, a, b, c, d, e, f, xa, xb, xc, xd)                   \
+  do {                                                                    \
+    Xupdate(f, xa, xa, xb, xc, xd);                                       \
+    (f) = (xa) + (e) + K_60_79 + ROTATE((a), 5) + F_60_79((b), (c), (d)); \
+    (b) = ROTATE((b), 30);                                                \
+  } while (0)
+
+#ifdef X
+#undef X
+#endif
+
+/* Originally X was an array. As it's automatic, it's natural to expect a
+ * RISC compiler to accommodate at least part of it in the register bank,
+ * isn't it? Unfortunately not all compilers "find" this expectation
+ * reasonable:-( In order to make such compilers generate better code I
+ * replace X[] with a bunch of X0, X1, etc. See the function body below...
+ *					<appro@fy.chalmers.se> */
+#define X(i)	XX##i
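+/* Token pasting: X(7), for example, expands to the variable XX7. */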
+
+#if !defined(SHA1_ASM)
+static void sha1_block_data_order(uint32_t *state, const uint8_t *data,
+                                  size_t num) {
+  register uint32_t A, B, C, D, E, T, l;
+  uint32_t XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7, XX8, XX9, XX10,
+      XX11, XX12, XX13, XX14, XX15;
+
+  A = state[0];
+  B = state[1];
+  C = state[2];
+  D = state[3];
+  E = state[4];
+
+  for (;;) {
+    HOST_c2l(data, l);
+    X(0) = l;
+    HOST_c2l(data, l);
+    X(1) = l;
+    BODY_00_15(0, A, B, C, D, E, T, X(0));
+    HOST_c2l(data, l);
+    X(2) = l;
+    BODY_00_15(1, T, A, B, C, D, E, X(1));
+    HOST_c2l(data, l);
+    X(3) = l;
+    BODY_00_15(2, E, T, A, B, C, D, X(2));
+    HOST_c2l(data, l);
+    X(4) = l;
+    BODY_00_15(3, D, E, T, A, B, C, X(3));
+    HOST_c2l(data, l);
+    X(5) = l;
+    BODY_00_15(4, C, D, E, T, A, B, X(4));
+    HOST_c2l(data, l);
+    X(6) = l;
+    BODY_00_15(5, B, C, D, E, T, A, X(5));
+    HOST_c2l(data, l);
+    X(7) = l;
+    BODY_00_15(6, A, B, C, D, E, T, X(6));
+    HOST_c2l(data, l);
+    X(8) = l;
+    BODY_00_15(7, T, A, B, C, D, E, X(7));
+    HOST_c2l(data, l);
+    X(9) = l;
+    BODY_00_15(8, E, T, A, B, C, D, X(8));
+    HOST_c2l(data, l);
+    X(10) = l;
+    BODY_00_15(9, D, E, T, A, B, C, X(9));
+    HOST_c2l(data, l);
+    X(11) = l;
+    BODY_00_15(10, C, D, E, T, A, B, X(10));
+    HOST_c2l(data, l);
+    X(12) = l;
+    BODY_00_15(11, B, C, D, E, T, A, X(11));
+    HOST_c2l(data, l);
+    X(13) = l;
+    BODY_00_15(12, A, B, C, D, E, T, X(12));
+    HOST_c2l(data, l);
+    X(14) = l;
+    BODY_00_15(13, T, A, B, C, D, E, X(13));
+    HOST_c2l(data, l);
+    X(15) = l;
+    BODY_00_15(14, E, T, A, B, C, D, X(14));
+    BODY_00_15(15, D, E, T, A, B, C, X(15));
+
+    BODY_16_19(16, C, D, E, T, A, B, X(0), X(0), X(2), X(8), X(13));
+    BODY_16_19(17, B, C, D, E, T, A, X(1), X(1), X(3), X(9), X(14));
+    BODY_16_19(18, A, B, C, D, E, T, X(2), X(2), X(4), X(10), X(15));
+    BODY_16_19(19, T, A, B, C, D, E, X(3), X(3), X(5), X(11), X(0));
+
+    BODY_20_31(20, E, T, A, B, C, D, X(4), X(4), X(6), X(12), X(1));
+    BODY_20_31(21, D, E, T, A, B, C, X(5), X(5), X(7), X(13), X(2));
+    BODY_20_31(22, C, D, E, T, A, B, X(6), X(6), X(8), X(14), X(3));
+    BODY_20_31(23, B, C, D, E, T, A, X(7), X(7), X(9), X(15), X(4));
+    BODY_20_31(24, A, B, C, D, E, T, X(8), X(8), X(10), X(0), X(5));
+    BODY_20_31(25, T, A, B, C, D, E, X(9), X(9), X(11), X(1), X(6));
+    BODY_20_31(26, E, T, A, B, C, D, X(10), X(10), X(12), X(2), X(7));
+    BODY_20_31(27, D, E, T, A, B, C, X(11), X(11), X(13), X(3), X(8));
+    BODY_20_31(28, C, D, E, T, A, B, X(12), X(12), X(14), X(4), X(9));
+    BODY_20_31(29, B, C, D, E, T, A, X(13), X(13), X(15), X(5), X(10));
+    BODY_20_31(30, A, B, C, D, E, T, X(14), X(14), X(0), X(6), X(11));
+    BODY_20_31(31, T, A, B, C, D, E, X(15), X(15), X(1), X(7), X(12));
+
+    BODY_32_39(32, E, T, A, B, C, D, X(0), X(2), X(8), X(13));
+    BODY_32_39(33, D, E, T, A, B, C, X(1), X(3), X(9), X(14));
+    BODY_32_39(34, C, D, E, T, A, B, X(2), X(4), X(10), X(15));
+    BODY_32_39(35, B, C, D, E, T, A, X(3), X(5), X(11), X(0));
+    BODY_32_39(36, A, B, C, D, E, T, X(4), X(6), X(12), X(1));
+    BODY_32_39(37, T, A, B, C, D, E, X(5), X(7), X(13), X(2));
+    BODY_32_39(38, E, T, A, B, C, D, X(6), X(8), X(14), X(3));
+    BODY_32_39(39, D, E, T, A, B, C, X(7), X(9), X(15), X(4));
+
+    BODY_40_59(40, C, D, E, T, A, B, X(8), X(10), X(0), X(5));
+    BODY_40_59(41, B, C, D, E, T, A, X(9), X(11), X(1), X(6));
+    BODY_40_59(42, A, B, C, D, E, T, X(10), X(12), X(2), X(7));
+    BODY_40_59(43, T, A, B, C, D, E, X(11), X(13), X(3), X(8));
+    BODY_40_59(44, E, T, A, B, C, D, X(12), X(14), X(4), X(9));
+    BODY_40_59(45, D, E, T, A, B, C, X(13), X(15), X(5), X(10));
+    BODY_40_59(46, C, D, E, T, A, B, X(14), X(0), X(6), X(11));
+    BODY_40_59(47, B, C, D, E, T, A, X(15), X(1), X(7), X(12));
+    BODY_40_59(48, A, B, C, D, E, T, X(0), X(2), X(8), X(13));
+    BODY_40_59(49, T, A, B, C, D, E, X(1), X(3), X(9), X(14));
+    BODY_40_59(50, E, T, A, B, C, D, X(2), X(4), X(10), X(15));
+    BODY_40_59(51, D, E, T, A, B, C, X(3), X(5), X(11), X(0));
+    BODY_40_59(52, C, D, E, T, A, B, X(4), X(6), X(12), X(1));
+    BODY_40_59(53, B, C, D, E, T, A, X(5), X(7), X(13), X(2));
+    BODY_40_59(54, A, B, C, D, E, T, X(6), X(8), X(14), X(3));
+    BODY_40_59(55, T, A, B, C, D, E, X(7), X(9), X(15), X(4));
+    BODY_40_59(56, E, T, A, B, C, D, X(8), X(10), X(0), X(5));
+    BODY_40_59(57, D, E, T, A, B, C, X(9), X(11), X(1), X(6));
+    BODY_40_59(58, C, D, E, T, A, B, X(10), X(12), X(2), X(7));
+    BODY_40_59(59, B, C, D, E, T, A, X(11), X(13), X(3), X(8));
+
+    BODY_60_79(60, A, B, C, D, E, T, X(12), X(14), X(4), X(9));
+    BODY_60_79(61, T, A, B, C, D, E, X(13), X(15), X(5), X(10));
+    BODY_60_79(62, E, T, A, B, C, D, X(14), X(0), X(6), X(11));
+    BODY_60_79(63, D, E, T, A, B, C, X(15), X(1), X(7), X(12));
+    BODY_60_79(64, C, D, E, T, A, B, X(0), X(2), X(8), X(13));
+    BODY_60_79(65, B, C, D, E, T, A, X(1), X(3), X(9), X(14));
+    BODY_60_79(66, A, B, C, D, E, T, X(2), X(4), X(10), X(15));
+    BODY_60_79(67, T, A, B, C, D, E, X(3), X(5), X(11), X(0));
+    BODY_60_79(68, E, T, A, B, C, D, X(4), X(6), X(12), X(1));
+    BODY_60_79(69, D, E, T, A, B, C, X(5), X(7), X(13), X(2));
+    BODY_60_79(70, C, D, E, T, A, B, X(6), X(8), X(14), X(3));
+    BODY_60_79(71, B, C, D, E, T, A, X(7), X(9), X(15), X(4));
+    BODY_60_79(72, A, B, C, D, E, T, X(8), X(10), X(0), X(5));
+    BODY_60_79(73, T, A, B, C, D, E, X(9), X(11), X(1), X(6));
+    BODY_60_79(74, E, T, A, B, C, D, X(10), X(12), X(2), X(7));
+    BODY_60_79(75, D, E, T, A, B, C, X(11), X(13), X(3), X(8));
+    BODY_60_79(76, C, D, E, T, A, B, X(12), X(14), X(4), X(9));
+    BODY_60_79(77, B, C, D, E, T, A, X(13), X(15), X(5), X(10));
+    BODY_60_79(78, A, B, C, D, E, T, X(14), X(0), X(6), X(11));
+    BODY_60_79(79, T, A, B, C, D, E, X(15), X(1), X(7), X(12));
+
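+    /* After 80 rounds the cycling of variable roles leaves the values of
+     * a..e in E, T, A, B and C respectively, hence the permuted update
+     * below. */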
+    state[0] = (state[0] + E) & 0xffffffffL;
+    state[1] = (state[1] + T) & 0xffffffffL;
+    state[2] = (state[2] + A) & 0xffffffffL;
+    state[3] = (state[3] + B) & 0xffffffffL;
+    state[4] = (state[4] + C) & 0xffffffffL;
+
+    if (--num == 0) {
+      break;
+    }
+
+    A = state[0];
+    B = state[1];
+    C = state[2];
+    D = state[3];
+    E = state[4];
+  }
+}
+#endif
+
+#undef DATA_ORDER_IS_BIG_ENDIAN
+#undef HASH_CTX
+#undef HASH_CBLOCK
+#undef HASH_MAKE_STRING
+#undef HASH_UPDATE
+#undef HASH_TRANSFORM
+#undef HASH_FINAL
+#undef HASH_BLOCK_DATA_ORDER
+#undef ROTATE
+#undef Xupdate
+#undef K_00_19
+#undef K_20_39
+#undef K_40_59
+#undef K_60_79
+#undef F_00_19
+#undef F_20_39
+#undef F_40_59
+#undef F_60_79
+#undef BODY_00_15
+#undef BODY_16_19
+#undef BODY_20_31
+#undef BODY_32_39
+#undef BODY_40_59
+#undef BODY_60_79
+#undef X
+#undef HOST_c2l
+#undef HOST_l2c
diff --git a/crypto/fipsmodule/sha/sha256.c b/crypto/fipsmodule/sha/sha256.c
new file mode 100644
index 0000000..cd6becb
--- /dev/null
+++ b/crypto/fipsmodule/sha/sha256.c
@@ -0,0 +1,337 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.] */
+
+#include <openssl/sha.h>
+
+#include <string.h>
+
+#include <openssl/mem.h>
+
+#include "../../internal.h"
+
+
+#if !defined(OPENSSL_NO_ASM) &&                         \
+    (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \
+     defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64))
+#define SHA256_ASM
+#endif
+
+int SHA224_Init(SHA256_CTX *sha) {
+  OPENSSL_memset(sha, 0, sizeof(SHA256_CTX));
+  sha->h[0] = 0xc1059ed8UL;
+  sha->h[1] = 0x367cd507UL;
+  sha->h[2] = 0x3070dd17UL;
+  sha->h[3] = 0xf70e5939UL;
+  sha->h[4] = 0xffc00b31UL;
+  sha->h[5] = 0x68581511UL;
+  sha->h[6] = 0x64f98fa7UL;
+  sha->h[7] = 0xbefa4fa4UL;
+  sha->md_len = SHA224_DIGEST_LENGTH;
+  return 1;
+}
+
+int SHA256_Init(SHA256_CTX *sha) {
+  OPENSSL_memset(sha, 0, sizeof(SHA256_CTX));
+  sha->h[0] = 0x6a09e667UL;
+  sha->h[1] = 0xbb67ae85UL;
+  sha->h[2] = 0x3c6ef372UL;
+  sha->h[3] = 0xa54ff53aUL;
+  sha->h[4] = 0x510e527fUL;
+  sha->h[5] = 0x9b05688cUL;
+  sha->h[6] = 0x1f83d9abUL;
+  sha->h[7] = 0x5be0cd19UL;
+  sha->md_len = SHA256_DIGEST_LENGTH;
+  return 1;
+}
+
+uint8_t *SHA224(const uint8_t *data, size_t len, uint8_t *out) {
+  SHA256_CTX ctx;
+  SHA224_Init(&ctx);
+  SHA224_Update(&ctx, data, len);
+  SHA224_Final(out, &ctx);
+  OPENSSL_cleanse(&ctx, sizeof(ctx));
+  return out;
+}
+
+uint8_t *SHA256(const uint8_t *data, size_t len, uint8_t *out) {
+  SHA256_CTX ctx;
+  SHA256_Init(&ctx);
+  SHA256_Update(&ctx, data, len);
+  SHA256_Final(out, &ctx);
+  OPENSSL_cleanse(&ctx, sizeof(ctx));
+  return out;
+}
+
+int SHA224_Update(SHA256_CTX *ctx, const void *data, size_t len) {
+  return SHA256_Update(ctx, data, len);
+}
+
+int SHA224_Final(uint8_t *md, SHA256_CTX *ctx) {
+  return SHA256_Final(md, ctx);
+}
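+
+/* Example use of the one-shot API (a minimal sketch; |buf| and |buf_len| are
+ * hypothetical):
+ *
+ *   uint8_t digest[SHA256_DIGEST_LENGTH];
+ *   SHA256(buf, buf_len, digest);
+ *
+ * The incremental SHA256_Init / SHA256_Update / SHA256_Final functions above
+ * compute the same digest over streamed input. */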
+
+#define DATA_ORDER_IS_BIG_ENDIAN
+
+#define HASH_CTX SHA256_CTX
+#define HASH_CBLOCK 64
+
+/* Note that FIPS 180-2 discusses "Truncation of the Hash Function Output";
+ * the default: case below covers it. It is not clear, however, whether
+ * truncating to a number of bytes not divisible by four is permitted.
+ * Probably not, but if it is, the default: case would need to be extended.
+ * The idea behind separate cases for the pre-defined lengths is to let the
+ * compiler decide whether it's appropriate to unroll the small loops.
+ *
+ * TODO(davidben): The small |md_len| case is one of the few places a low-level
+ * hash 'final' function can fail. This should never happen. */
+#define HASH_MAKE_STRING(c, s)                              \
+  do {                                                      \
+    uint32_t ll;                                            \
+    unsigned int nn;                                        \
+    switch ((c)->md_len) {                                  \
+      case SHA224_DIGEST_LENGTH:                            \
+        for (nn = 0; nn < SHA224_DIGEST_LENGTH / 4; nn++) { \
+          ll = (c)->h[nn];                                  \
+          HOST_l2c(ll, (s));                                \
+        }                                                   \
+        break;                                              \
+      case SHA256_DIGEST_LENGTH:                            \
+        for (nn = 0; nn < SHA256_DIGEST_LENGTH / 4; nn++) { \
+          ll = (c)->h[nn];                                  \
+          HOST_l2c(ll, (s));                                \
+        }                                                   \
+        break;                                              \
+      default:                                              \
+        if ((c)->md_len > SHA256_DIGEST_LENGTH) {           \
+          return 0;                                         \
+        }                                                   \
+        for (nn = 0; nn < (c)->md_len / 4; nn++) {          \
+          ll = (c)->h[nn];                                  \
+          HOST_l2c(ll, (s));                                \
+        }                                                   \
+        break;                                              \
+    }                                                       \
+  } while (0)
+
+
+#define HASH_UPDATE SHA256_Update
+#define HASH_TRANSFORM SHA256_Transform
+#define HASH_FINAL SHA256_Final
+#define HASH_BLOCK_DATA_ORDER sha256_block_data_order
+#ifndef SHA256_ASM
+static
+#endif
+void sha256_block_data_order(uint32_t *state, const uint8_t *in, size_t num);
+
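+/* Including md32_common.h instantiates SHA256_Update, SHA256_Transform and
+ * SHA256_Final from the HASH_* macros defined above. */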
+#include "../digest/md32_common.h"
+
+#ifndef SHA256_ASM
+static const uint32_t K256[64] = {
+    0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, 0x3956c25bUL,
+    0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, 0xd807aa98UL, 0x12835b01UL,
+    0x243185beUL, 0x550c7dc3UL, 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL,
+    0xc19bf174UL, 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL,
+    0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, 0x983e5152UL,
+    0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, 0xc6e00bf3UL, 0xd5a79147UL,
+    0x06ca6351UL, 0x14292967UL, 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL,
+    0x53380d13UL, 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL,
+    0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, 0xd192e819UL,
+    0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, 0x19a4c116UL, 0x1e376c08UL,
+    0x2748774cUL, 0x34b0bcb5UL, 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL,
+    0x682e6ff3UL, 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL,
+    0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL};
+
+#define ROTATE(a, n) (((a) << (n)) | ((a) >> (32 - (n))))
+
+/* The FIPS specification refers to right rotations, while our ROTATE macro
+ * is a left rotation. This is why the rotation coefficients below differ from
+ * those in the FIPS document by 32-N... */
+#define Sigma0(x) (ROTATE((x), 30) ^ ROTATE((x), 19) ^ ROTATE((x), 10))
+#define Sigma1(x) (ROTATE((x), 26) ^ ROTATE((x), 21) ^ ROTATE((x), 7))
+#define sigma0(x) (ROTATE((x), 25) ^ ROTATE((x), 14) ^ ((x) >> 3))
+#define sigma1(x) (ROTATE((x), 15) ^ ROTATE((x), 13) ^ ((x) >> 10))
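+
+/* For example, FIPS 180-4 defines Sigma0(x) as ROTR^2(x) ^ ROTR^13(x) ^
+ * ROTR^22(x); rewriting each right rotation by N as a left rotation by 32-N
+ * gives the 30, 19 and 10 coefficients above. */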
+
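+/* Ch(x, y, z) selects bits from |y| where |x| is set and from |z| where it is
+ * clear; Maj(x, y, z) is the bitwise majority of its three inputs. */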
+#define Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z)))
+#define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
+
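+/* The caller loads the round's message-schedule word into |T1| before
+ * invoking ROUND_00_15, so the macro only adds the remaining h + Sigma1(e) +
+ * Ch(e, f, g) + K256[i] terms. Only |d| and |h| (plus the temporary |T1|)
+ * change in a round; the unrolled callers rotate the argument names instead
+ * of the values. */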
+#define ROUND_00_15(i, a, b, c, d, e, f, g, h)   \
+  do {                                           \
+    T1 += h + Sigma1(e) + Ch(e, f, g) + K256[i]; \
+    h = Sigma0(a) + Maj(a, b, c);                \
+    d += T1;                                     \
+    h += T1;                                     \
+  } while (0)
+
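+/* ROUND_16_63 extends the message schedule in place: X[] is a 16-word
+ * circular buffer and W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) +
+ * W[i-16], with every index reduced mod 16. */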
+#define ROUND_16_63(i, a, b, c, d, e, f, g, h, X)      \
+  do {                                                 \
+    s0 = X[(i + 1) & 0x0f];                            \
+    s0 = sigma0(s0);                                   \
+    s1 = X[(i + 14) & 0x0f];                           \
+    s1 = sigma1(s1);                                   \
+    T1 = X[(i) & 0x0f] += s0 + s1 + X[(i + 9) & 0x0f]; \
+    ROUND_00_15(i, a, b, c, d, e, f, g, h);            \
+  } while (0)
+
+static void sha256_block_data_order(uint32_t *state, const uint8_t *data,
+                                    size_t num) {
+  uint32_t a, b, c, d, e, f, g, h, s0, s1, T1;
+  uint32_t X[16];
+  int i;
+
+  while (num--) {
+    a = state[0];
+    b = state[1];
+    c = state[2];
+    d = state[3];
+    e = state[4];
+    f = state[5];
+    g = state[6];
+    h = state[7];
+
+    uint32_t l;
+
+    HOST_c2l(data, l);
+    T1 = X[0] = l;
+    ROUND_00_15(0, a, b, c, d, e, f, g, h);
+    HOST_c2l(data, l);
+    T1 = X[1] = l;
+    ROUND_00_15(1, h, a, b, c, d, e, f, g);
+    HOST_c2l(data, l);
+    T1 = X[2] = l;
+    ROUND_00_15(2, g, h, a, b, c, d, e, f);
+    HOST_c2l(data, l);
+    T1 = X[3] = l;
+    ROUND_00_15(3, f, g, h, a, b, c, d, e);
+    HOST_c2l(data, l);
+    T1 = X[4] = l;
+    ROUND_00_15(4, e, f, g, h, a, b, c, d);
+    HOST_c2l(data, l);
+    T1 = X[5] = l;
+    ROUND_00_15(5, d, e, f, g, h, a, b, c);
+    HOST_c2l(data, l);
+    T1 = X[6] = l;
+    ROUND_00_15(6, c, d, e, f, g, h, a, b);
+    HOST_c2l(data, l);
+    T1 = X[7] = l;
+    ROUND_00_15(7, b, c, d, e, f, g, h, a);
+    HOST_c2l(data, l);
+    T1 = X[8] = l;
+    ROUND_00_15(8, a, b, c, d, e, f, g, h);
+    HOST_c2l(data, l);
+    T1 = X[9] = l;
+    ROUND_00_15(9, h, a, b, c, d, e, f, g);
+    HOST_c2l(data, l);
+    T1 = X[10] = l;
+    ROUND_00_15(10, g, h, a, b, c, d, e, f);
+    HOST_c2l(data, l);
+    T1 = X[11] = l;
+    ROUND_00_15(11, f, g, h, a, b, c, d, e);
+    HOST_c2l(data, l);
+    T1 = X[12] = l;
+    ROUND_00_15(12, e, f, g, h, a, b, c, d);
+    HOST_c2l(data, l);
+    T1 = X[13] = l;
+    ROUND_00_15(13, d, e, f, g, h, a, b, c);
+    HOST_c2l(data, l);
+    T1 = X[14] = l;
+    ROUND_00_15(14, c, d, e, f, g, h, a, b);
+    HOST_c2l(data, l);
+    T1 = X[15] = l;
+    ROUND_00_15(15, b, c, d, e, f, g, h, a);
+
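+    /* Processing eight rounds per iteration returns the a..h rotation to its
+     * starting alignment. */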
+    for (i = 16; i < 64; i += 8) {
+      ROUND_16_63(i + 0, a, b, c, d, e, f, g, h, X);
+      ROUND_16_63(i + 1, h, a, b, c, d, e, f, g, X);
+      ROUND_16_63(i + 2, g, h, a, b, c, d, e, f, X);
+      ROUND_16_63(i + 3, f, g, h, a, b, c, d, e, X);
+      ROUND_16_63(i + 4, e, f, g, h, a, b, c, d, X);
+      ROUND_16_63(i + 5, d, e, f, g, h, a, b, c, X);
+      ROUND_16_63(i + 6, c, d, e, f, g, h, a, b, X);
+      ROUND_16_63(i + 7, b, c, d, e, f, g, h, a, X);
+    }
+
+    state[0] += a;
+    state[1] += b;
+    state[2] += c;
+    state[3] += d;
+    state[4] += e;
+    state[5] += f;
+    state[6] += g;
+    state[7] += h;
+  }
+}
+
+#endif /* !SHA256_ASM */
+
+#undef DATA_ORDER_IS_BIG_ENDIAN
+#undef HASH_CTX
+#undef HASH_CBLOCK
+#undef HASH_MAKE_STRING
+#undef HASH_UPDATE
+#undef HASH_TRANSFORM
+#undef HASH_FINAL
+#undef HASH_BLOCK_DATA_ORDER
+#undef ROTATE
+#undef Sigma0
+#undef Sigma1
+#undef sigma0
+#undef sigma1
+#undef Ch
+#undef Maj
+#undef ROUND_00_15
+#undef ROUND_16_63
+#undef HOST_c2l
+#undef HOST_l2c
diff --git a/crypto/fipsmodule/sha/sha512.c b/crypto/fipsmodule/sha/sha512.c
new file mode 100644
index 0000000..6e1f79b
--- /dev/null
+++ b/crypto/fipsmodule/sha/sha512.c
@@ -0,0 +1,610 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.] */
+
+#include <openssl/sha.h>
+
+#include <string.h>
+
+#include <openssl/mem.h>
+
+#include "../../internal.h"
+
+
+/* IMPLEMENTATION NOTES.
+ *
+ * The 32-bit hash algorithms share common byte-order-neutral collector and
+ * padding implementations that operate on unaligned data,
+ * ../digest/md32_common.h. This SHA-512 implementation does not. The reasons
+ * [in reverse order] are:
+ *
+ * - It's the only 64-bit hash algorithm at the time of this writing, so there
+ *   is no need for a common collector/padding implementation [yet];
+ * - By supporting only a transform function that operates on *aligned* data,
+ *   the collector/padding function is simpler and easier to optimize. */
+
+#if !defined(OPENSSL_NO_ASM) &&                         \
+    (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \
+     defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64))
+#define SHA512_ASM
+#endif
+
+#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \
+    defined(__ARM_FEATURE_UNALIGNED)
+#define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
+#endif
+
+int SHA384_Init(SHA512_CTX *sha) {
+  sha->h[0] = UINT64_C(0xcbbb9d5dc1059ed8);
+  sha->h[1] = UINT64_C(0x629a292a367cd507);
+  sha->h[2] = UINT64_C(0x9159015a3070dd17);
+  sha->h[3] = UINT64_C(0x152fecd8f70e5939);
+  sha->h[4] = UINT64_C(0x67332667ffc00b31);
+  sha->h[5] = UINT64_C(0x8eb44a8768581511);
+  sha->h[6] = UINT64_C(0xdb0c2e0d64f98fa7);
+  sha->h[7] = UINT64_C(0x47b5481dbefa4fa4);
+
+  sha->Nl = 0;
+  sha->Nh = 0;
+  sha->num = 0;
+  sha->md_len = SHA384_DIGEST_LENGTH;
+  return 1;
+}
+
+
+int SHA512_Init(SHA512_CTX *sha) {
+  sha->h[0] = UINT64_C(0x6a09e667f3bcc908);
+  sha->h[1] = UINT64_C(0xbb67ae8584caa73b);
+  sha->h[2] = UINT64_C(0x3c6ef372fe94f82b);
+  sha->h[3] = UINT64_C(0xa54ff53a5f1d36f1);
+  sha->h[4] = UINT64_C(0x510e527fade682d1);
+  sha->h[5] = UINT64_C(0x9b05688c2b3e6c1f);
+  sha->h[6] = UINT64_C(0x1f83d9abfb41bd6b);
+  sha->h[7] = UINT64_C(0x5be0cd19137e2179);
+
+  sha->Nl = 0;
+  sha->Nh = 0;
+  sha->num = 0;
+  sha->md_len = SHA512_DIGEST_LENGTH;
+  return 1;
+}
+
+uint8_t *SHA384(const uint8_t *data, size_t len, uint8_t *out) {
+  SHA512_CTX ctx;
+  SHA384_Init(&ctx);
+  SHA384_Update(&ctx, data, len);
+  SHA384_Final(out, &ctx);
+  OPENSSL_cleanse(&ctx, sizeof(ctx));
+  return out;
+}
+
+uint8_t *SHA512(const uint8_t *data, size_t len, uint8_t *out) {
+  SHA512_CTX ctx;
+  SHA512_Init(&ctx);
+  SHA512_Update(&ctx, data, len);
+  SHA512_Final(out, &ctx);
+  OPENSSL_cleanse(&ctx, sizeof(ctx));
+  return out;
+}
+
+#if !defined(SHA512_ASM)
+static
+#endif
+void sha512_block_data_order(uint64_t *state, const uint64_t *W, size_t num);
+
+
+int SHA384_Final(uint8_t *md, SHA512_CTX *sha) {
+  return SHA512_Final(md, sha);
+}
+
+int SHA384_Update(SHA512_CTX *sha, const void *data, size_t len) {
+  return SHA512_Update(sha, data, len);
+}
+
+void SHA512_Transform(SHA512_CTX *c, const uint8_t *block) {
+#ifndef SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
+  if ((size_t)block % sizeof(c->u.d[0]) != 0) {
+    OPENSSL_memcpy(c->u.p, block, sizeof(c->u.p));
+    block = c->u.p;
+  }
+#endif
+  sha512_block_data_order(c->h, (uint64_t *)block, 1);
+}
+
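+/* SHA512_Update maintains the 128-bit bit count in Nh:Nl (carrying into Nh on
+ * overflow), first tops up any partial block buffered in |c->u| from a
+ * previous call, then hashes as many full blocks as possible directly from
+ * |data|, and finally buffers any remainder. */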
+int SHA512_Update(SHA512_CTX *c, const void *in_data, size_t len) {
+  uint64_t l;
+  uint8_t *p = c->u.p;
+  const uint8_t *data = (const uint8_t *)in_data;
+
+  if (len == 0) {
+    return 1;
+  }
+
+  l = (c->Nl + (((uint64_t)len) << 3)) & UINT64_C(0xffffffffffffffff);
+  if (l < c->Nl) {
+    c->Nh++;
+  }
+  if (sizeof(len) >= 8) {
+    c->Nh += (((uint64_t)len) >> 61);
+  }
+  c->Nl = l;
+
+  if (c->num != 0) {
+    size_t n = sizeof(c->u) - c->num;
+
+    if (len < n) {
+      OPENSSL_memcpy(p + c->num, data, len);
+      c->num += (unsigned int)len;
+      return 1;
+    } else {
+      OPENSSL_memcpy(p + c->num, data, n);
+      c->num = 0;
+      len -= n;
+      data += n;
+      sha512_block_data_order(c->h, (uint64_t *)p, 1);
+    }
+  }
+
+  if (len >= sizeof(c->u)) {
+#ifndef SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
+    if ((size_t)data % sizeof(c->u.d[0]) != 0) {
+      while (len >= sizeof(c->u)) {
+        OPENSSL_memcpy(p, data, sizeof(c->u));
+        sha512_block_data_order(c->h, (uint64_t *)p, 1);
+        len -= sizeof(c->u);
+        data += sizeof(c->u);
+      }
+    } else
+#endif
+    {
+      sha512_block_data_order(c->h, (uint64_t *)data, len / sizeof(c->u));
+      data += len;
+      len %= sizeof(c->u);
+      data -= len;
+    }
+  }
+
+  if (len != 0) {
+    OPENSSL_memcpy(p, data, len);
+    c->num = (int)len;
+  }
+
+  return 1;
+}
+
+int SHA512_Final(uint8_t *md, SHA512_CTX *sha) {
+  uint8_t *p = (uint8_t *)sha->u.p;
+  size_t n = sha->num;
+
+  p[n] = 0x80; /* There is always room for one more byte. */
+  n++;
+  if (n > (sizeof(sha->u) - 16)) {
+    OPENSSL_memset(p + n, 0, sizeof(sha->u) - n);
+    n = 0;
+    sha512_block_data_order(sha->h, (uint64_t *)p, 1);
+  }
+
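+  /* Store the 128-bit big-endian bit count, Nh:Nl, in the last 16 bytes of
+   * the block. */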
+  OPENSSL_memset(p + n, 0, sizeof(sha->u) - 16 - n);
+  p[sizeof(sha->u) - 1] = (uint8_t)(sha->Nl);
+  p[sizeof(sha->u) - 2] = (uint8_t)(sha->Nl >> 8);
+  p[sizeof(sha->u) - 3] = (uint8_t)(sha->Nl >> 16);
+  p[sizeof(sha->u) - 4] = (uint8_t)(sha->Nl >> 24);
+  p[sizeof(sha->u) - 5] = (uint8_t)(sha->Nl >> 32);
+  p[sizeof(sha->u) - 6] = (uint8_t)(sha->Nl >> 40);
+  p[sizeof(sha->u) - 7] = (uint8_t)(sha->Nl >> 48);
+  p[sizeof(sha->u) - 8] = (uint8_t)(sha->Nl >> 56);
+  p[sizeof(sha->u) - 9] = (uint8_t)(sha->Nh);
+  p[sizeof(sha->u) - 10] = (uint8_t)(sha->Nh >> 8);
+  p[sizeof(sha->u) - 11] = (uint8_t)(sha->Nh >> 16);
+  p[sizeof(sha->u) - 12] = (uint8_t)(sha->Nh >> 24);
+  p[sizeof(sha->u) - 13] = (uint8_t)(sha->Nh >> 32);
+  p[sizeof(sha->u) - 14] = (uint8_t)(sha->Nh >> 40);
+  p[sizeof(sha->u) - 15] = (uint8_t)(sha->Nh >> 48);
+  p[sizeof(sha->u) - 16] = (uint8_t)(sha->Nh >> 56);
+
+  sha512_block_data_order(sha->h, (uint64_t *)p, 1);
+
+  if (md == NULL) {
+    /* TODO(davidben): This NULL check is absent in the other low-level hash
+     * 'final' functions and is one of the few places such a function can
+     * fail. */
+    return 0;
+  }
+
+  switch (sha->md_len) {
+    /* Let compiler decide if it's appropriate to unroll... */
+    case SHA384_DIGEST_LENGTH:
+      for (n = 0; n < SHA384_DIGEST_LENGTH / 8; n++) {
+        uint64_t t = sha->h[n];
+
+        *(md++) = (uint8_t)(t >> 56);
+        *(md++) = (uint8_t)(t >> 48);
+        *(md++) = (uint8_t)(t >> 40);
+        *(md++) = (uint8_t)(t >> 32);
+        *(md++) = (uint8_t)(t >> 24);
+        *(md++) = (uint8_t)(t >> 16);
+        *(md++) = (uint8_t)(t >> 8);
+        *(md++) = (uint8_t)(t);
+      }
+      break;
+    case SHA512_DIGEST_LENGTH:
+      for (n = 0; n < SHA512_DIGEST_LENGTH / 8; n++) {
+        uint64_t t = sha->h[n];
+
+        *(md++) = (uint8_t)(t >> 56);
+        *(md++) = (uint8_t)(t >> 48);
+        *(md++) = (uint8_t)(t >> 40);
+        *(md++) = (uint8_t)(t >> 32);
+        *(md++) = (uint8_t)(t >> 24);
+        *(md++) = (uint8_t)(t >> 16);
+        *(md++) = (uint8_t)(t >> 8);
+        *(md++) = (uint8_t)(t);
+      }
+      break;
+    /* ... as well as make sure md_len is not abused. */
+    default:
+      /* TODO(davidben): This bad |md_len| case is one of the few places a
+       * low-level hash 'final' function can fail. This should never happen. */
+      return 0;
+  }
+
+  return 1;
+}
+
+#ifndef SHA512_ASM
+static const uint64_t K512[80] = {
+    UINT64_C(0x428a2f98d728ae22), UINT64_C(0x7137449123ef65cd),
+    UINT64_C(0xb5c0fbcfec4d3b2f), UINT64_C(0xe9b5dba58189dbbc),
+    UINT64_C(0x3956c25bf348b538), UINT64_C(0x59f111f1b605d019),
+    UINT64_C(0x923f82a4af194f9b), UINT64_C(0xab1c5ed5da6d8118),
+    UINT64_C(0xd807aa98a3030242), UINT64_C(0x12835b0145706fbe),
+    UINT64_C(0x243185be4ee4b28c), UINT64_C(0x550c7dc3d5ffb4e2),
+    UINT64_C(0x72be5d74f27b896f), UINT64_C(0x80deb1fe3b1696b1),
+    UINT64_C(0x9bdc06a725c71235), UINT64_C(0xc19bf174cf692694),
+    UINT64_C(0xe49b69c19ef14ad2), UINT64_C(0xefbe4786384f25e3),
+    UINT64_C(0x0fc19dc68b8cd5b5), UINT64_C(0x240ca1cc77ac9c65),
+    UINT64_C(0x2de92c6f592b0275), UINT64_C(0x4a7484aa6ea6e483),
+    UINT64_C(0x5cb0a9dcbd41fbd4), UINT64_C(0x76f988da831153b5),
+    UINT64_C(0x983e5152ee66dfab), UINT64_C(0xa831c66d2db43210),
+    UINT64_C(0xb00327c898fb213f), UINT64_C(0xbf597fc7beef0ee4),
+    UINT64_C(0xc6e00bf33da88fc2), UINT64_C(0xd5a79147930aa725),
+    UINT64_C(0x06ca6351e003826f), UINT64_C(0x142929670a0e6e70),
+    UINT64_C(0x27b70a8546d22ffc), UINT64_C(0x2e1b21385c26c926),
+    UINT64_C(0x4d2c6dfc5ac42aed), UINT64_C(0x53380d139d95b3df),
+    UINT64_C(0x650a73548baf63de), UINT64_C(0x766a0abb3c77b2a8),
+    UINT64_C(0x81c2c92e47edaee6), UINT64_C(0x92722c851482353b),
+    UINT64_C(0xa2bfe8a14cf10364), UINT64_C(0xa81a664bbc423001),
+    UINT64_C(0xc24b8b70d0f89791), UINT64_C(0xc76c51a30654be30),
+    UINT64_C(0xd192e819d6ef5218), UINT64_C(0xd69906245565a910),
+    UINT64_C(0xf40e35855771202a), UINT64_C(0x106aa07032bbd1b8),
+    UINT64_C(0x19a4c116b8d2d0c8), UINT64_C(0x1e376c085141ab53),
+    UINT64_C(0x2748774cdf8eeb99), UINT64_C(0x34b0bcb5e19b48a8),
+    UINT64_C(0x391c0cb3c5c95a63), UINT64_C(0x4ed8aa4ae3418acb),
+    UINT64_C(0x5b9cca4f7763e373), UINT64_C(0x682e6ff3d6b2b8a3),
+    UINT64_C(0x748f82ee5defb2fc), UINT64_C(0x78a5636f43172f60),
+    UINT64_C(0x84c87814a1f0ab72), UINT64_C(0x8cc702081a6439ec),
+    UINT64_C(0x90befffa23631e28), UINT64_C(0xa4506cebde82bde9),
+    UINT64_C(0xbef9a3f7b2c67915), UINT64_C(0xc67178f2e372532b),
+    UINT64_C(0xca273eceea26619c), UINT64_C(0xd186b8c721c0c207),
+    UINT64_C(0xeada7dd6cde0eb1e), UINT64_C(0xf57d4f7fee6ed178),
+    UINT64_C(0x06f067aa72176fba), UINT64_C(0x0a637dc5a2c898a6),
+    UINT64_C(0x113f9804bef90dae), UINT64_C(0x1b710b35131c471b),
+    UINT64_C(0x28db77f523047d84), UINT64_C(0x32caab7b40c72493),
+    UINT64_C(0x3c9ebe0a15c9bebc), UINT64_C(0x431d67c49c100d4c),
+    UINT64_C(0x4cc5d4becb3e42b6), UINT64_C(0x597f299cfc657e2a),
+    UINT64_C(0x5fcb6fab3ad6faec), UINT64_C(0x6c44198c4a475817),
+};
+
+#if defined(__GNUC__) && __GNUC__ >= 2 && !defined(OPENSSL_NO_ASM)
+#if defined(__x86_64) || defined(__x86_64__)
+#define ROTR(a, n)                                              \
+  ({                                                            \
+    uint64_t ret;                                               \
+    __asm__("rorq %1, %0" : "=r"(ret) : "J"(n), "0"(a) : "cc"); \
+    ret;                                                        \
+  })
+#define PULL64(x)                                \
+  ({                                             \
+    uint64_t ret = *((const uint64_t *)(&(x)));  \
+    __asm__("bswapq %0" : "=r"(ret) : "0"(ret)); \
+    ret;                                         \
+  })
+#elif (defined(__i386) || defined(__i386__))
+#define PULL64(x)                                                             \
+  ({                                                                          \
+    const unsigned int *p = (const unsigned int *)(&(x));                     \
+    unsigned int hi = p[0], lo = p[1];                                        \
+    __asm__("bswapl %0; bswapl %1;" : "=r"(lo), "=r"(hi) : "0"(lo), "1"(hi)); \
+    ((uint64_t)hi) << 32 | lo;                                                \
+  })
+#elif (defined(_ARCH_PPC) && defined(__64BIT__)) || defined(_ARCH_PPC64)
+#define ROTR(a, n)                                             \
+  ({                                                           \
+    uint64_t ret;                                              \
+    __asm__("rotrdi %0, %1, %2" : "=r"(ret) : "r"(a), "K"(n)); \
+    ret;                                                       \
+  })
+#elif defined(__aarch64__)
+#define ROTR(a, n)                                          \
+  ({                                                        \
+    uint64_t ret;                                           \
+    __asm__("ror %0, %1, %2" : "=r"(ret) : "r"(a), "I"(n)); \
+    ret;                                                    \
+  })
+#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \
+    __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define PULL64(x)                                                         \
+  ({                                                                      \
+    uint64_t ret;                                                         \
+    __asm__("rev %0, %1" : "=r"(ret) : "r"(*((const uint64_t *)(&(x))))); \
+    ret;                                                                  \
+  })
+#endif
+#endif
+#elif defined(_MSC_VER)
+#if defined(_WIN64) /* applies to both IA-64 and AMD64 */
+#pragma intrinsic(_rotr64)
+#define ROTR(a, n) _rotr64((a), n)
+#endif
+#if defined(_M_IX86) && !defined(OPENSSL_NO_ASM)
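+/* The byte-swapped halves are left in EDX:EAX, which is how MSVC returns a
+ * 64-bit integer on x86, so no explicit return statement is needed. */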
+static uint64_t __fastcall __pull64be(const void *x) {
+  _asm mov edx, [ecx + 0]
+  _asm mov eax, [ecx + 4]
+  _asm bswap edx
+  _asm bswap eax
+}
+#define PULL64(x) __pull64be(&(x))
+#if _MSC_VER <= 1200
+#pragma inline_depth(0)
+#endif
+#endif
+#endif
+
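+/* Portable fallbacks for targets with no specialized version above: PULL64
+ * assembles the big-endian 64-bit word one byte at a time, and ROTR is a
+ * plain shift-and-or rotation that most compilers recognize and compile to a
+ * rotate instruction. */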
+#ifndef PULL64
+#define B(x, j) \
+  (((uint64_t)(*(((const uint8_t *)(&x)) + j))) << ((7 - j) * 8))
+#define PULL64(x)                                                        \
+  (B(x, 0) | B(x, 1) | B(x, 2) | B(x, 3) | B(x, 4) | B(x, 5) | B(x, 6) | \
+   B(x, 7))
+#endif
+
+#ifndef ROTR
+#define ROTR(x, s) (((x) >> (s)) | ((x) << (64 - (s))))
+#endif
+
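+/* Unlike the 32-bit code in sha256.c, ROTR here is a right rotation, so these
+ * coefficients match FIPS 180-4 directly. */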
+#define Sigma0(x) (ROTR((x), 28) ^ ROTR((x), 34) ^ ROTR((x), 39))
+#define Sigma1(x) (ROTR((x), 14) ^ ROTR((x), 18) ^ ROTR((x), 41))
+#define sigma0(x) (ROTR((x), 1) ^ ROTR((x), 8) ^ ((x) >> 7))
+#define sigma1(x) (ROTR((x), 19) ^ ROTR((x), 61) ^ ((x) >> 6))
+
+#define Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z)))
+#define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
+
+
+#if defined(__i386) || defined(__i386__) || defined(_M_IX86)
+/* This code should give better results on a 32-bit CPU with fewer than ~24
+ * registers, both size- and performance-wise... */
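+/* Rather than rotating eight 64-bit values (costly when each needs two 32-bit
+ * registers), this variant keeps the working state and the last 16 schedule
+ * words in a sliding frame inside X[]: |F| is decremented every round, so
+ * last round's F[0] is naturally this round's F[1], and F[8 + j] holds the
+ * message word from j rounds ago. Only |A|, |E| and |T| live in locals. */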
+static void sha512_block_data_order(uint64_t *state, const uint64_t *W,
+                                    size_t num) {
+  uint64_t A, E, T;
+  uint64_t X[9 + 80], *F;
+  int i;
+
+  while (num--) {
+    F = X + 80;
+    A = state[0];
+    F[1] = state[1];
+    F[2] = state[2];
+    F[3] = state[3];
+    E = state[4];
+    F[5] = state[5];
+    F[6] = state[6];
+    F[7] = state[7];
+
+    for (i = 0; i < 16; i++, F--) {
+      T = PULL64(W[i]);
+      F[0] = A;
+      F[4] = E;
+      F[8] = T;
+      T += F[7] + Sigma1(E) + Ch(E, F[5], F[6]) + K512[i];
+      E = F[3] + T;
+      A = T + Sigma0(A) + Maj(A, F[1], F[2]);
+    }
+
+    for (; i < 80; i++, F--) {
+      T = sigma0(F[8 + 16 - 1]);
+      T += sigma1(F[8 + 16 - 14]);
+      T += F[8 + 16] + F[8 + 16 - 9];
+
+      F[0] = A;
+      F[4] = E;
+      F[8] = T;
+      T += F[7] + Sigma1(E) + Ch(E, F[5], F[6]) + K512[i];
+      E = F[3] + T;
+      A = T + Sigma0(A) + Maj(A, F[1], F[2]);
+    }
+
+    state[0] += A;
+    state[1] += F[1];
+    state[2] += F[2];
+    state[3] += F[3];
+    state[4] += E;
+    state[5] += F[5];
+    state[6] += F[6];
+    state[7] += F[7];
+
+    W += 16;
+  }
+}
+
+#else
+
+#define ROUND_00_15(i, a, b, c, d, e, f, g, h)   \
+  do {                                           \
+    T1 += h + Sigma1(e) + Ch(e, f, g) + K512[i]; \
+    h = Sigma0(a) + Maj(a, b, c);                \
+    d += T1;                                     \
+    h += T1;                                     \
+  } while (0)
+
+#define ROUND_16_80(i, j, a, b, c, d, e, f, g, h, X)   \
+  do {                                                 \
+    s0 = X[(j + 1) & 0x0f];                            \
+    s0 = sigma0(s0);                                   \
+    s1 = X[(j + 14) & 0x0f];                           \
+    s1 = sigma1(s1);                                   \
+    T1 = X[(j) & 0x0f] += s0 + s1 + X[(j + 9) & 0x0f]; \
+    ROUND_00_15(i + j, a, b, c, d, e, f, g, h);        \
+  } while (0)
+
+static void sha512_block_data_order(uint64_t *state, const uint64_t *W,
+                                    size_t num) {
+  uint64_t a, b, c, d, e, f, g, h, s0, s1, T1;
+  uint64_t X[16];
+  int i;
+
+  while (num--) {
+    a = state[0];
+    b = state[1];
+    c = state[2];
+    d = state[3];
+    e = state[4];
+    f = state[5];
+    g = state[6];
+    h = state[7];
+
+    T1 = X[0] = PULL64(W[0]);
+    ROUND_00_15(0, a, b, c, d, e, f, g, h);
+    T1 = X[1] = PULL64(W[1]);
+    ROUND_00_15(1, h, a, b, c, d, e, f, g);
+    T1 = X[2] = PULL64(W[2]);
+    ROUND_00_15(2, g, h, a, b, c, d, e, f);
+    T1 = X[3] = PULL64(W[3]);
+    ROUND_00_15(3, f, g, h, a, b, c, d, e);
+    T1 = X[4] = PULL64(W[4]);
+    ROUND_00_15(4, e, f, g, h, a, b, c, d);
+    T1 = X[5] = PULL64(W[5]);
+    ROUND_00_15(5, d, e, f, g, h, a, b, c);
+    T1 = X[6] = PULL64(W[6]);
+    ROUND_00_15(6, c, d, e, f, g, h, a, b);
+    T1 = X[7] = PULL64(W[7]);
+    ROUND_00_15(7, b, c, d, e, f, g, h, a);
+    T1 = X[8] = PULL64(W[8]);
+    ROUND_00_15(8, a, b, c, d, e, f, g, h);
+    T1 = X[9] = PULL64(W[9]);
+    ROUND_00_15(9, h, a, b, c, d, e, f, g);
+    T1 = X[10] = PULL64(W[10]);
+    ROUND_00_15(10, g, h, a, b, c, d, e, f);
+    T1 = X[11] = PULL64(W[11]);
+    ROUND_00_15(11, f, g, h, a, b, c, d, e);
+    T1 = X[12] = PULL64(W[12]);
+    ROUND_00_15(12, e, f, g, h, a, b, c, d);
+    T1 = X[13] = PULL64(W[13]);
+    ROUND_00_15(13, d, e, f, g, h, a, b, c);
+    T1 = X[14] = PULL64(W[14]);
+    ROUND_00_15(14, c, d, e, f, g, h, a, b);
+    T1 = X[15] = PULL64(W[15]);
+    ROUND_00_15(15, b, c, d, e, f, g, h, a);
+
+    for (i = 16; i < 80; i += 16) {
+      ROUND_16_80(i, 0, a, b, c, d, e, f, g, h, X);
+      ROUND_16_80(i, 1, h, a, b, c, d, e, f, g, X);
+      ROUND_16_80(i, 2, g, h, a, b, c, d, e, f, X);
+      ROUND_16_80(i, 3, f, g, h, a, b, c, d, e, X);
+      ROUND_16_80(i, 4, e, f, g, h, a, b, c, d, X);
+      ROUND_16_80(i, 5, d, e, f, g, h, a, b, c, X);
+      ROUND_16_80(i, 6, c, d, e, f, g, h, a, b, X);
+      ROUND_16_80(i, 7, b, c, d, e, f, g, h, a, X);
+      ROUND_16_80(i, 8, a, b, c, d, e, f, g, h, X);
+      ROUND_16_80(i, 9, h, a, b, c, d, e, f, g, X);
+      ROUND_16_80(i, 10, g, h, a, b, c, d, e, f, X);
+      ROUND_16_80(i, 11, f, g, h, a, b, c, d, e, X);
+      ROUND_16_80(i, 12, e, f, g, h, a, b, c, d, X);
+      ROUND_16_80(i, 13, d, e, f, g, h, a, b, c, X);
+      ROUND_16_80(i, 14, c, d, e, f, g, h, a, b, X);
+      ROUND_16_80(i, 15, b, c, d, e, f, g, h, a, X);
+    }
+
+    state[0] += a;
+    state[1] += b;
+    state[2] += c;
+    state[3] += d;
+    state[4] += e;
+    state[5] += f;
+    state[6] += g;
+    state[7] += h;
+
+    W += 16;
+  }
+}
+
+#endif
+
+#endif /* !SHA512_ASM */
+
+#undef ROTR
+#undef PULL64
+#undef B
+#undef Sigma0
+#undef Sigma1
+#undef sigma0
+#undef sigma1
+#undef Ch
+#undef Maj
+#undef ROUND_00_15
+#undef ROUND_16_80
+#undef HOST_c2l
+#undef HOST_l2c