Reland bitsliced aes_nohw implementation.

This relands
https://boringssl-review.googlesource.com/c/boringssl/+/39206. See that
CL description for details on the change.

The CL was originally reverted due to a number of ARM-only test
failures. First, there was a test-only issue, resolved in
https://boringssl-review.googlesource.com/c/boringssl/+/39306.

Second, the implementation did not work in unoptimized Android Thumb2
builds. This was caused by a clang bug introduced in
https://reviews.llvm.org/rL340261 and fixed in
https://reviews.llvm.org/rL351310. aes_nohw_(un)compact_block have
been rewritten in an attempt to dodge the bug. Performance of optimized
builds with clang and gcc do not appear to be affected by the rewrite.
See the delta from patch set 1.

(I had hoped to improve precommit CQ coverage before landing this, but
both failures turned out to be ARM-only. Either way, there are now
32-bit and 64-bit SSE2-less configurations so the 32-bit and 64-bit
implementations have CQ coverage.)

Change-Id: If5f9f5ea570686a15258ecd7cf49bdbc12dc34c5
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/39444
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/cipher_extra/e_aesgcmsiv.c b/crypto/cipher_extra/e_aesgcmsiv.c
index 64febae..d717572 100644
--- a/crypto/cipher_extra/e_aesgcmsiv.c
+++ b/crypto/cipher_extra/e_aesgcmsiv.c
@@ -723,6 +723,14 @@
   }
 
   OPENSSL_memcpy(out_keys->auth_key, key_material, 16);
+  // Note the |ctr128_f| function uses a big-endian couner, while AES-GCM-SIV
+  // uses a little-endian counter. We ignore the return value and only use
+  // |block128_f|. This has a significant performance cost for the fallback
+  // bitsliced AES implementations (bsaes and aes_nohw).
+  //
+  // We currently do not consider AES-GCM-SIV to be performance-sensitive on
+  // client hardware. If this changes, we can write little-endian |ctr128_f|
+  // functions.
   aes_ctr_set_key(&out_keys->enc_key.ks, NULL, &out_keys->enc_block,
                   key_material + 16, gcm_siv_ctx->is_256 ? 32 : 16);
 }
diff --git a/crypto/fipsmodule/CMakeLists.txt b/crypto/fipsmodule/CMakeLists.txt
index 3081a41..a675fbd 100644
--- a/crypto/fipsmodule/CMakeLists.txt
+++ b/crypto/fipsmodule/CMakeLists.txt
@@ -6,7 +6,6 @@
 
     aesni-gcm-x86_64.${ASM_EXT}
     aesni-x86_64.${ASM_EXT}
-    aes-x86_64.${ASM_EXT}
     ghash-ssse3-x86_64.${ASM_EXT}
     ghash-x86_64.${ASM_EXT}
     md5-x86_64.${ASM_EXT}
@@ -27,7 +26,6 @@
   set(
     BCM_ASM_SOURCES
 
-    aes-586.${ASM_EXT}
     aesni-x86.${ASM_EXT}
     bn-586.${ASM_EXT}
     co-586.${ASM_EXT}
@@ -46,7 +44,6 @@
   set(
     BCM_ASM_SOURCES
 
-    aes-armv4.${ASM_EXT}
     aesv8-armx.${ASM_EXT}
     armv4-mont.${ASM_EXT}
     bsaes-armv7.${ASM_EXT}
@@ -83,14 +80,11 @@
   )
 endif()
 
-perlasm(aes-586.${ASM_EXT} aes/asm/aes-586.pl)
-perlasm(aes-armv4.${ASM_EXT} aes/asm/aes-armv4.pl)
 perlasm(aesni-gcm-x86_64.${ASM_EXT} modes/asm/aesni-gcm-x86_64.pl)
 perlasm(aesni-x86_64.${ASM_EXT} aes/asm/aesni-x86_64.pl)
 perlasm(aesni-x86.${ASM_EXT} aes/asm/aesni-x86.pl)
 perlasm(aesp8-ppc.${ASM_EXT} aes/asm/aesp8-ppc.pl)
 perlasm(aesv8-armx.${ASM_EXT} aes/asm/aesv8-armx.pl)
-perlasm(aes-x86_64.${ASM_EXT} aes/asm/aes-x86_64.pl)
 perlasm(armv4-mont.${ASM_EXT} bn/asm/armv4-mont.pl)
 perlasm(armv8-mont.${ASM_EXT} bn/asm/armv8-mont.pl)
 perlasm(bn-586.${ASM_EXT} bn/asm/bn-586.pl)
diff --git a/crypto/fipsmodule/aes/aes.c b/crypto/fipsmodule/aes/aes.c
index 48d60ee..f60281d 100644
--- a/crypto/fipsmodule/aes/aes.c
+++ b/crypto/fipsmodule/aes/aes.c
@@ -56,758 +56,6 @@
 #include "../modes/internal.h"
 
 
-#if defined(OPENSSL_NO_ASM) || \
-    (!defined(OPENSSL_X86) && !defined(OPENSSL_X86_64) && !defined(OPENSSL_ARM))
-
-// Te0[x] = S [x].[02, 01, 01, 03];
-// Te1[x] = S [x].[03, 02, 01, 01];
-// Te2[x] = S [x].[01, 03, 02, 01];
-// Te3[x] = S [x].[01, 01, 03, 02];
-//
-// Td0[x] = Si[x].[0e, 09, 0d, 0b];
-// Td1[x] = Si[x].[0b, 0e, 09, 0d];
-// Td2[x] = Si[x].[0d, 0b, 0e, 09];
-// Td3[x] = Si[x].[09, 0d, 0b, 0e];
-// Td4[x] = Si[x].[01];
-
-static const uint32_t Te0[256] = {
-    0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU, 0xfff2f20dU,
-    0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U, 0x60303050U, 0x02010103U,
-    0xce6767a9U, 0x562b2b7dU, 0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U,
-    0xec76769aU, 0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U,
-    0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU, 0x41adadecU,
-    0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU, 0x239c9cbfU, 0x53a4a4f7U,
-    0xe4727296U, 0x9bc0c05bU, 0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU,
-    0x4c26266aU, 0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU,
-    0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U, 0xe2717193U,
-    0xabd8d873U, 0x62313153U, 0x2a15153fU, 0x0804040cU, 0x95c7c752U,
-    0x46232365U, 0x9dc3c35eU, 0x30181828U, 0x379696a1U, 0x0a05050fU,
-    0x2f9a9ab5U, 0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU,
-    0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU, 0x1209091bU,
-    0x1d83839eU, 0x582c2c74U, 0x341a1a2eU, 0x361b1b2dU, 0xdc6e6eb2U,
-    0xb45a5aeeU, 0x5ba0a0fbU, 0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U,
-    0x7db3b3ceU, 0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U,
-    0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU, 0x40202060U,
-    0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU, 0xd46a6abeU, 0x8dcbcb46U,
-    0x67bebed9U, 0x7239394bU, 0x944a4adeU, 0x984c4cd4U, 0xb05858e8U,
-    0x85cfcf4aU, 0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U,
-    0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U, 0x8a4545cfU,
-    0xe9f9f910U, 0x04020206U, 0xfe7f7f81U, 0xa05050f0U, 0x783c3c44U,
-    0x259f9fbaU, 0x4ba8a8e3U, 0xa25151f3U, 0x5da3a3feU, 0x804040c0U,
-    0x058f8f8aU, 0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U,
-    0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U, 0x20101030U,
-    0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU, 0x81cdcd4cU, 0x180c0c14U,
-    0x26131335U, 0xc3ecec2fU, 0xbe5f5fe1U, 0x359797a2U, 0x884444ccU,
-    0x2e171739U, 0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U,
-    0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U, 0xc06060a0U,
-    0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU, 0x44222266U, 0x542a2a7eU,
-    0x3b9090abU, 0x0b888883U, 0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U,
-    0x2814143cU, 0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U,
-    0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU, 0x924949dbU,
-    0x0c06060aU, 0x4824246cU, 0xb85c5ce4U, 0x9fc2c25dU, 0xbdd3d36eU,
-    0x43acacefU, 0xc46262a6U, 0x399191a8U, 0x319595a4U, 0xd3e4e437U,
-    0xf279798bU, 0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U,
-    0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U, 0xd86c6cb4U,
-    0xac5656faU, 0xf3f4f407U, 0xcfeaea25U, 0xca6565afU, 0xf47a7a8eU,
-    0x47aeaee9U, 0x10080818U, 0x6fbabad5U, 0xf0787888U, 0x4a25256fU,
-    0x5c2e2e72U, 0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U,
-    0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U, 0x964b4bddU,
-    0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U, 0xe0707090U, 0x7c3e3e42U,
-    0x71b5b5c4U, 0xcc6666aaU, 0x904848d8U, 0x06030305U, 0xf7f6f601U,
-    0x1c0e0e12U, 0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U,
-    0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U, 0xd9e1e138U,
-    0xebf8f813U, 0x2b9898b3U, 0x22111133U, 0xd26969bbU, 0xa9d9d970U,
-    0x078e8e89U, 0x339494a7U, 0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U,
-    0xc9e9e920U, 0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU,
-    0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U, 0x65bfbfdaU,
-    0xd7e6e631U, 0x844242c6U, 0xd06868b8U, 0x824141c3U, 0x299999b0U,
-    0x5a2d2d77U, 0x1e0f0f11U, 0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U,
-    0x2c16163aU, };
-
-static const uint32_t Te1[256] = {
-    0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU, 0x0dfff2f2U,
-    0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U, 0x50603030U, 0x03020101U,
-    0xa9ce6767U, 0x7d562b2bU, 0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU,
-    0x9aec7676U, 0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU,
-    0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U, 0xec41adadU,
-    0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU, 0xbf239c9cU, 0xf753a4a4U,
-    0x96e47272U, 0x5b9bc0c0U, 0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U,
-    0x6a4c2626U, 0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU,
-    0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U, 0x93e27171U,
-    0x73abd8d8U, 0x53623131U, 0x3f2a1515U, 0x0c080404U, 0x5295c7c7U,
-    0x65462323U, 0x5e9dc3c3U, 0x28301818U, 0xa1379696U, 0x0f0a0505U,
-    0xb52f9a9aU, 0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U,
-    0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U, 0x1b120909U,
-    0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU, 0x2d361b1bU, 0xb2dc6e6eU,
-    0xeeb45a5aU, 0xfb5ba0a0U, 0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U,
-    0xce7db3b3U, 0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U,
-    0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU, 0x60402020U,
-    0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU, 0xbed46a6aU, 0x468dcbcbU,
-    0xd967bebeU, 0x4b723939U, 0xde944a4aU, 0xd4984c4cU, 0xe8b05858U,
-    0x4a85cfcfU, 0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU,
-    0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U, 0xcf8a4545U,
-    0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU, 0xf0a05050U, 0x44783c3cU,
-    0xba259f9fU, 0xe34ba8a8U, 0xf3a25151U, 0xfe5da3a3U, 0xc0804040U,
-    0x8a058f8fU, 0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U,
-    0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U, 0x30201010U,
-    0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U, 0x4c81cdcdU, 0x14180c0cU,
-    0x35261313U, 0x2fc3ececU, 0xe1be5f5fU, 0xa2359797U, 0xcc884444U,
-    0x392e1717U, 0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU,
-    0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U, 0xa0c06060U,
-    0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU, 0x66442222U, 0x7e542a2aU,
-    0xab3b9090U, 0x830b8888U, 0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U,
-    0x3c281414U, 0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU,
-    0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU, 0xdb924949U,
-    0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU, 0x5d9fc2c2U, 0x6ebdd3d3U,
-    0xef43acacU, 0xa6c46262U, 0xa8399191U, 0xa4319595U, 0x37d3e4e4U,
-    0x8bf27979U, 0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU,
-    0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U, 0xb4d86c6cU,
-    0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU, 0xafca6565U, 0x8ef47a7aU,
-    0xe947aeaeU, 0x18100808U, 0xd56fbabaU, 0x88f07878U, 0x6f4a2525U,
-    0x725c2e2eU, 0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U,
-    0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU, 0xdd964b4bU,
-    0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU, 0x90e07070U, 0x427c3e3eU,
-    0xc471b5b5U, 0xaacc6666U, 0xd8904848U, 0x05060303U, 0x01f7f6f6U,
-    0x121c0e0eU, 0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U,
-    0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU, 0x38d9e1e1U,
-    0x13ebf8f8U, 0xb32b9898U, 0x33221111U, 0xbbd26969U, 0x70a9d9d9U,
-    0x89078e8eU, 0xa7339494U, 0xb62d9b9bU, 0x223c1e1eU, 0x92158787U,
-    0x20c9e9e9U, 0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU,
-    0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU, 0xda65bfbfU,
-    0x31d7e6e6U, 0xc6844242U, 0xb8d06868U, 0xc3824141U, 0xb0299999U,
-    0x775a2d2dU, 0x111e0f0fU, 0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU,
-    0x3a2c1616U, };
-
-static const uint32_t Te2[256] = {
-    0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU, 0xf20dfff2U,
-    0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U, 0x30506030U, 0x01030201U,
-    0x67a9ce67U, 0x2b7d562bU, 0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU,
-    0x769aec76U, 0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU,
-    0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U, 0xadec41adU,
-    0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU, 0x9cbf239cU, 0xa4f753a4U,
-    0x7296e472U, 0xc05b9bc0U, 0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U,
-    0x266a4c26U, 0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU,
-    0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U, 0x7193e271U,
-    0xd873abd8U, 0x31536231U, 0x153f2a15U, 0x040c0804U, 0xc75295c7U,
-    0x23654623U, 0xc35e9dc3U, 0x18283018U, 0x96a13796U, 0x050f0a05U,
-    0x9ab52f9aU, 0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U,
-    0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U, 0x091b1209U,
-    0x839e1d83U, 0x2c74582cU, 0x1a2e341aU, 0x1b2d361bU, 0x6eb2dc6eU,
-    0x5aeeb45aU, 0xa0fb5ba0U, 0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U,
-    0xb3ce7db3U, 0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U,
-    0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU, 0x20604020U,
-    0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU, 0x6abed46aU, 0xcb468dcbU,
-    0xbed967beU, 0x394b7239U, 0x4ade944aU, 0x4cd4984cU, 0x58e8b058U,
-    0xcf4a85cfU, 0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU,
-    0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U, 0x45cf8a45U,
-    0xf910e9f9U, 0x02060402U, 0x7f81fe7fU, 0x50f0a050U, 0x3c44783cU,
-    0x9fba259fU, 0xa8e34ba8U, 0x51f3a251U, 0xa3fe5da3U, 0x40c08040U,
-    0x8f8a058fU, 0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U,
-    0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U, 0x10302010U,
-    0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U, 0xcd4c81cdU, 0x0c14180cU,
-    0x13352613U, 0xec2fc3ecU, 0x5fe1be5fU, 0x97a23597U, 0x44cc8844U,
-    0x17392e17U, 0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU,
-    0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U, 0x60a0c060U,
-    0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU, 0x22664422U, 0x2a7e542aU,
-    0x90ab3b90U, 0x88830b88U, 0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U,
-    0x143c2814U, 0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU,
-    0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU, 0x49db9249U,
-    0x060a0c06U, 0x246c4824U, 0x5ce4b85cU, 0xc25d9fc2U, 0xd36ebdd3U,
-    0xacef43acU, 0x62a6c462U, 0x91a83991U, 0x95a43195U, 0xe437d3e4U,
-    0x798bf279U, 0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU,
-    0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U, 0x6cb4d86cU,
-    0x56faac56U, 0xf407f3f4U, 0xea25cfeaU, 0x65afca65U, 0x7a8ef47aU,
-    0xaee947aeU, 0x08181008U, 0xbad56fbaU, 0x7888f078U, 0x256f4a25U,
-    0x2e725c2eU, 0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U,
-    0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU, 0x4bdd964bU,
-    0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU, 0x7090e070U, 0x3e427c3eU,
-    0xb5c471b5U, 0x66aacc66U, 0x48d89048U, 0x03050603U, 0xf601f7f6U,
-    0x0e121c0eU, 0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U,
-    0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU, 0xe138d9e1U,
-    0xf813ebf8U, 0x98b32b98U, 0x11332211U, 0x69bbd269U, 0xd970a9d9U,
-    0x8e89078eU, 0x94a73394U, 0x9bb62d9bU, 0x1e223c1eU, 0x87921587U,
-    0xe920c9e9U, 0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU,
-    0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU, 0xbfda65bfU,
-    0xe631d7e6U, 0x42c68442U, 0x68b8d068U, 0x41c38241U, 0x99b02999U,
-    0x2d775a2dU, 0x0f111e0fU, 0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU,
-    0x163a2c16U, };
-
-static const uint32_t Te3[256] = {
-    0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U, 0xf2f20dffU,
-    0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U, 0x30305060U, 0x01010302U,
-    0x6767a9ceU, 0x2b2b7d56U, 0xfefe19e7U, 0xd7d762b5U, 0xababe64dU,
-    0x76769aecU, 0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU,
-    0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU, 0xadadec41U,
-    0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U, 0x9c9cbf23U, 0xa4a4f753U,
-    0x727296e4U, 0xc0c05b9bU, 0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU,
-    0x26266a4cU, 0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U,
-    0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U, 0x717193e2U,
-    0xd8d873abU, 0x31315362U, 0x15153f2aU, 0x04040c08U, 0xc7c75295U,
-    0x23236546U, 0xc3c35e9dU, 0x18182830U, 0x9696a137U, 0x05050f0aU,
-    0x9a9ab52fU, 0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU,
-    0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU, 0x09091b12U,
-    0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U, 0x1b1b2d36U, 0x6e6eb2dcU,
-    0x5a5aeeb4U, 0xa0a0fb5bU, 0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U,
-    0xb3b3ce7dU, 0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U,
-    0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U, 0x20206040U,
-    0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U, 0x6a6abed4U, 0xcbcb468dU,
-    0xbebed967U, 0x39394b72U, 0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U,
-    0xcfcf4a85U, 0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU,
-    0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U, 0x4545cf8aU,
-    0xf9f910e9U, 0x02020604U, 0x7f7f81feU, 0x5050f0a0U, 0x3c3c4478U,
-    0x9f9fba25U, 0xa8a8e34bU, 0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U,
-    0x8f8f8a05U, 0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U,
-    0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U, 0x10103020U,
-    0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU, 0xcdcd4c81U, 0x0c0c1418U,
-    0x13133526U, 0xecec2fc3U, 0x5f5fe1beU, 0x9797a235U, 0x4444cc88U,
-    0x1717392eU, 0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU,
-    0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U, 0x6060a0c0U,
-    0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U, 0x22226644U, 0x2a2a7e54U,
-    0x9090ab3bU, 0x8888830bU, 0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU,
-    0x14143c28U, 0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU,
-    0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U, 0x4949db92U,
-    0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U, 0xc2c25d9fU, 0xd3d36ebdU,
-    0xacacef43U, 0x6262a6c4U, 0x9191a839U, 0x9595a431U, 0xe4e437d3U,
-    0x79798bf2U, 0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU,
-    0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U, 0x6c6cb4d8U,
-    0x5656faacU, 0xf4f407f3U, 0xeaea25cfU, 0x6565afcaU, 0x7a7a8ef4U,
-    0xaeaee947U, 0x08081810U, 0xbabad56fU, 0x787888f0U, 0x25256f4aU,
-    0x2e2e725cU, 0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U,
-    0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU, 0x4b4bdd96U,
-    0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU, 0x707090e0U, 0x3e3e427cU,
-    0xb5b5c471U, 0x6666aaccU, 0x4848d890U, 0x03030506U, 0xf6f601f7U,
-    0x0e0e121cU, 0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U,
-    0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U, 0xe1e138d9U,
-    0xf8f813ebU, 0x9898b32bU, 0x11113322U, 0x6969bbd2U, 0xd9d970a9U,
-    0x8e8e8907U, 0x9494a733U, 0x9b9bb62dU, 0x1e1e223cU, 0x87879215U,
-    0xe9e920c9U, 0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U,
-    0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU, 0xbfbfda65U,
-    0xe6e631d7U, 0x4242c684U, 0x6868b8d0U, 0x4141c382U, 0x9999b029U,
-    0x2d2d775aU, 0x0f0f111eU, 0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU,
-    0x16163a2cU, };
-
-static const uint32_t Td0[256] = {
-    0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U, 0x3bab6bcbU,
-    0x1f9d45f1U, 0xacfa58abU, 0x4be30393U, 0x2030fa55U, 0xad766df6U,
-    0x88cc7691U, 0xf5024c25U, 0x4fe5d7fcU, 0xc52acbd7U, 0x26354480U,
-    0xb562a38fU, 0xdeb15a49U, 0x25ba1b67U, 0x45ea0e98U, 0x5dfec0e1U,
-    0xc32f7502U, 0x814cf012U, 0x8d4697a3U, 0x6bd3f9c6U, 0x038f5fe7U,
-    0x15929c95U, 0xbf6d7aebU, 0x955259daU, 0xd4be832dU, 0x587421d3U,
-    0x49e06929U, 0x8ec9c844U, 0x75c2896aU, 0xf48e7978U, 0x99583e6bU,
-    0x27b971ddU, 0xbee14fb6U, 0xf088ad17U, 0xc920ac66U, 0x7dce3ab4U,
-    0x63df4a18U, 0xe51a3182U, 0x97513360U, 0x62537f45U, 0xb16477e0U,
-    0xbb6bae84U, 0xfe81a01cU, 0xf9082b94U, 0x70486858U, 0x8f45fd19U,
-    0x94de6c87U, 0x527bf8b7U, 0xab73d323U, 0x724b02e2U, 0xe31f8f57U,
-    0x6655ab2aU, 0xb2eb2807U, 0x2fb5c203U, 0x86c57b9aU, 0xd33708a5U,
-    0x302887f2U, 0x23bfa5b2U, 0x02036abaU, 0xed16825cU, 0x8acf1c2bU,
-    0xa779b492U, 0xf307f2f0U, 0x4e69e2a1U, 0x65daf4cdU, 0x0605bed5U,
-    0xd134621fU, 0xc4a6fe8aU, 0x342e539dU, 0xa2f355a0U, 0x058ae132U,
-    0xa4f6eb75U, 0x0b83ec39U, 0x4060efaaU, 0x5e719f06U, 0xbd6e1051U,
-    0x3e218af9U, 0x96dd063dU, 0xdd3e05aeU, 0x4de6bd46U, 0x91548db5U,
-    0x71c45d05U, 0x0406d46fU, 0x605015ffU, 0x1998fb24U, 0xd6bde997U,
-    0x894043ccU, 0x67d99e77U, 0xb0e842bdU, 0x07898b88U, 0xe7195b38U,
-    0x79c8eedbU, 0xa17c0a47U, 0x7c420fe9U, 0xf8841ec9U, 0x00000000U,
-    0x09808683U, 0x322bed48U, 0x1e1170acU, 0x6c5a724eU, 0xfd0efffbU,
-    0x0f853856U, 0x3daed51eU, 0x362d3927U, 0x0a0fd964U, 0x685ca621U,
-    0x9b5b54d1U, 0x24362e3aU, 0x0c0a67b1U, 0x9357e70fU, 0xb4ee96d2U,
-    0x1b9b919eU, 0x80c0c54fU, 0x61dc20a2U, 0x5a774b69U, 0x1c121a16U,
-    0xe293ba0aU, 0xc0a02ae5U, 0x3c22e043U, 0x121b171dU, 0x0e090d0bU,
-    0xf28bc7adU, 0x2db6a8b9U, 0x141ea9c8U, 0x57f11985U, 0xaf75074cU,
-    0xee99ddbbU, 0xa37f60fdU, 0xf701269fU, 0x5c72f5bcU, 0x44663bc5U,
-    0x5bfb7e34U, 0x8b432976U, 0xcb23c6dcU, 0xb6edfc68U, 0xb8e4f163U,
-    0xd731dccaU, 0x42638510U, 0x13972240U, 0x84c61120U, 0x854a247dU,
-    0xd2bb3df8U, 0xaef93211U, 0xc729a16dU, 0x1d9e2f4bU, 0xdcb230f3U,
-    0x0d8652ecU, 0x77c1e3d0U, 0x2bb3166cU, 0xa970b999U, 0x119448faU,
-    0x47e96422U, 0xa8fc8cc4U, 0xa0f03f1aU, 0x567d2cd8U, 0x223390efU,
-    0x87494ec7U, 0xd938d1c1U, 0x8ccaa2feU, 0x98d40b36U, 0xa6f581cfU,
-    0xa57ade28U, 0xdab78e26U, 0x3fadbfa4U, 0x2c3a9de4U, 0x5078920dU,
-    0x6a5fcc9bU, 0x547e4662U, 0xf68d13c2U, 0x90d8b8e8U, 0x2e39f75eU,
-    0x82c3aff5U, 0x9f5d80beU, 0x69d0937cU, 0x6fd52da9U, 0xcf2512b3U,
-    0xc8ac993bU, 0x10187da7U, 0xe89c636eU, 0xdb3bbb7bU, 0xcd267809U,
-    0x6e5918f4U, 0xec9ab701U, 0x834f9aa8U, 0xe6956e65U, 0xaaffe67eU,
-    0x21bccf08U, 0xef15e8e6U, 0xbae79bd9U, 0x4a6f36ceU, 0xea9f09d4U,
-    0x29b07cd6U, 0x31a4b2afU, 0x2a3f2331U, 0xc6a59430U, 0x35a266c0U,
-    0x744ebc37U, 0xfc82caa6U, 0xe090d0b0U, 0x33a7d815U, 0xf104984aU,
-    0x41ecdaf7U, 0x7fcd500eU, 0x1791f62fU, 0x764dd68dU, 0x43efb04dU,
-    0xccaa4d54U, 0xe49604dfU, 0x9ed1b5e3U, 0x4c6a881bU, 0xc12c1fb8U,
-    0x4665517fU, 0x9d5eea04U, 0x018c355dU, 0xfa877473U, 0xfb0b412eU,
-    0xb3671d5aU, 0x92dbd252U, 0xe9105633U, 0x6dd64713U, 0x9ad7618cU,
-    0x37a10c7aU, 0x59f8148eU, 0xeb133c89U, 0xcea927eeU, 0xb761c935U,
-    0xe11ce5edU, 0x7a47b13cU, 0x9cd2df59U, 0x55f2733fU, 0x1814ce79U,
-    0x73c737bfU, 0x53f7cdeaU, 0x5ffdaa5bU, 0xdf3d6f14U, 0x7844db86U,
-    0xcaaff381U, 0xb968c43eU, 0x3824342cU, 0xc2a3405fU, 0x161dc372U,
-    0xbce2250cU, 0x283c498bU, 0xff0d9541U, 0x39a80171U, 0x080cb3deU,
-    0xd8b4e49cU, 0x6456c190U, 0x7bcb8461U, 0xd532b670U, 0x486c5c74U,
-    0xd0b85742U, };
-
-static const uint32_t Td1[256] = {
-    0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU, 0xcb3bab6bU,
-    0xf11f9d45U, 0xabacfa58U, 0x934be303U, 0x552030faU, 0xf6ad766dU,
-    0x9188cc76U, 0x25f5024cU, 0xfc4fe5d7U, 0xd7c52acbU, 0x80263544U,
-    0x8fb562a3U, 0x49deb15aU, 0x6725ba1bU, 0x9845ea0eU, 0xe15dfec0U,
-    0x02c32f75U, 0x12814cf0U, 0xa38d4697U, 0xc66bd3f9U, 0xe7038f5fU,
-    0x9515929cU, 0xebbf6d7aU, 0xda955259U, 0x2dd4be83U, 0xd3587421U,
-    0x2949e069U, 0x448ec9c8U, 0x6a75c289U, 0x78f48e79U, 0x6b99583eU,
-    0xdd27b971U, 0xb6bee14fU, 0x17f088adU, 0x66c920acU, 0xb47dce3aU,
-    0x1863df4aU, 0x82e51a31U, 0x60975133U, 0x4562537fU, 0xe0b16477U,
-    0x84bb6baeU, 0x1cfe81a0U, 0x94f9082bU, 0x58704868U, 0x198f45fdU,
-    0x8794de6cU, 0xb7527bf8U, 0x23ab73d3U, 0xe2724b02U, 0x57e31f8fU,
-    0x2a6655abU, 0x07b2eb28U, 0x032fb5c2U, 0x9a86c57bU, 0xa5d33708U,
-    0xf2302887U, 0xb223bfa5U, 0xba02036aU, 0x5ced1682U, 0x2b8acf1cU,
-    0x92a779b4U, 0xf0f307f2U, 0xa14e69e2U, 0xcd65daf4U, 0xd50605beU,
-    0x1fd13462U, 0x8ac4a6feU, 0x9d342e53U, 0xa0a2f355U, 0x32058ae1U,
-    0x75a4f6ebU, 0x390b83ecU, 0xaa4060efU, 0x065e719fU, 0x51bd6e10U,
-    0xf93e218aU, 0x3d96dd06U, 0xaedd3e05U, 0x464de6bdU, 0xb591548dU,
-    0x0571c45dU, 0x6f0406d4U, 0xff605015U, 0x241998fbU, 0x97d6bde9U,
-    0xcc894043U, 0x7767d99eU, 0xbdb0e842U, 0x8807898bU, 0x38e7195bU,
-    0xdb79c8eeU, 0x47a17c0aU, 0xe97c420fU, 0xc9f8841eU, 0x00000000U,
-    0x83098086U, 0x48322bedU, 0xac1e1170U, 0x4e6c5a72U, 0xfbfd0effU,
-    0x560f8538U, 0x1e3daed5U, 0x27362d39U, 0x640a0fd9U, 0x21685ca6U,
-    0xd19b5b54U, 0x3a24362eU, 0xb10c0a67U, 0x0f9357e7U, 0xd2b4ee96U,
-    0x9e1b9b91U, 0x4f80c0c5U, 0xa261dc20U, 0x695a774bU, 0x161c121aU,
-    0x0ae293baU, 0xe5c0a02aU, 0x433c22e0U, 0x1d121b17U, 0x0b0e090dU,
-    0xadf28bc7U, 0xb92db6a8U, 0xc8141ea9U, 0x8557f119U, 0x4caf7507U,
-    0xbbee99ddU, 0xfda37f60U, 0x9ff70126U, 0xbc5c72f5U, 0xc544663bU,
-    0x345bfb7eU, 0x768b4329U, 0xdccb23c6U, 0x68b6edfcU, 0x63b8e4f1U,
-    0xcad731dcU, 0x10426385U, 0x40139722U, 0x2084c611U, 0x7d854a24U,
-    0xf8d2bb3dU, 0x11aef932U, 0x6dc729a1U, 0x4b1d9e2fU, 0xf3dcb230U,
-    0xec0d8652U, 0xd077c1e3U, 0x6c2bb316U, 0x99a970b9U, 0xfa119448U,
-    0x2247e964U, 0xc4a8fc8cU, 0x1aa0f03fU, 0xd8567d2cU, 0xef223390U,
-    0xc787494eU, 0xc1d938d1U, 0xfe8ccaa2U, 0x3698d40bU, 0xcfa6f581U,
-    0x28a57adeU, 0x26dab78eU, 0xa43fadbfU, 0xe42c3a9dU, 0x0d507892U,
-    0x9b6a5fccU, 0x62547e46U, 0xc2f68d13U, 0xe890d8b8U, 0x5e2e39f7U,
-    0xf582c3afU, 0xbe9f5d80U, 0x7c69d093U, 0xa96fd52dU, 0xb3cf2512U,
-    0x3bc8ac99U, 0xa710187dU, 0x6ee89c63U, 0x7bdb3bbbU, 0x09cd2678U,
-    0xf46e5918U, 0x01ec9ab7U, 0xa8834f9aU, 0x65e6956eU, 0x7eaaffe6U,
-    0x0821bccfU, 0xe6ef15e8U, 0xd9bae79bU, 0xce4a6f36U, 0xd4ea9f09U,
-    0xd629b07cU, 0xaf31a4b2U, 0x312a3f23U, 0x30c6a594U, 0xc035a266U,
-    0x37744ebcU, 0xa6fc82caU, 0xb0e090d0U, 0x1533a7d8U, 0x4af10498U,
-    0xf741ecdaU, 0x0e7fcd50U, 0x2f1791f6U, 0x8d764dd6U, 0x4d43efb0U,
-    0x54ccaa4dU, 0xdfe49604U, 0xe39ed1b5U, 0x1b4c6a88U, 0xb8c12c1fU,
-    0x7f466551U, 0x049d5eeaU, 0x5d018c35U, 0x73fa8774U, 0x2efb0b41U,
-    0x5ab3671dU, 0x5292dbd2U, 0x33e91056U, 0x136dd647U, 0x8c9ad761U,
-    0x7a37a10cU, 0x8e59f814U, 0x89eb133cU, 0xeecea927U, 0x35b761c9U,
-    0xede11ce5U, 0x3c7a47b1U, 0x599cd2dfU, 0x3f55f273U, 0x791814ceU,
-    0xbf73c737U, 0xea53f7cdU, 0x5b5ffdaaU, 0x14df3d6fU, 0x867844dbU,
-    0x81caaff3U, 0x3eb968c4U, 0x2c382434U, 0x5fc2a340U, 0x72161dc3U,
-    0x0cbce225U, 0x8b283c49U, 0x41ff0d95U, 0x7139a801U, 0xde080cb3U,
-    0x9cd8b4e4U, 0x906456c1U, 0x617bcb84U, 0x70d532b6U, 0x74486c5cU,
-    0x42d0b857U, };
-
-static const uint32_t Td2[256] = {
-    0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U, 0x6bcb3babU,
-    0x45f11f9dU, 0x58abacfaU, 0x03934be3U, 0xfa552030U, 0x6df6ad76U,
-    0x769188ccU, 0x4c25f502U, 0xd7fc4fe5U, 0xcbd7c52aU, 0x44802635U,
-    0xa38fb562U, 0x5a49deb1U, 0x1b6725baU, 0x0e9845eaU, 0xc0e15dfeU,
-    0x7502c32fU, 0xf012814cU, 0x97a38d46U, 0xf9c66bd3U, 0x5fe7038fU,
-    0x9c951592U, 0x7aebbf6dU, 0x59da9552U, 0x832dd4beU, 0x21d35874U,
-    0x692949e0U, 0xc8448ec9U, 0x896a75c2U, 0x7978f48eU, 0x3e6b9958U,
-    0x71dd27b9U, 0x4fb6bee1U, 0xad17f088U, 0xac66c920U, 0x3ab47dceU,
-    0x4a1863dfU, 0x3182e51aU, 0x33609751U, 0x7f456253U, 0x77e0b164U,
-    0xae84bb6bU, 0xa01cfe81U, 0x2b94f908U, 0x68587048U, 0xfd198f45U,
-    0x6c8794deU, 0xf8b7527bU, 0xd323ab73U, 0x02e2724bU, 0x8f57e31fU,
-    0xab2a6655U, 0x2807b2ebU, 0xc2032fb5U, 0x7b9a86c5U, 0x08a5d337U,
-    0x87f23028U, 0xa5b223bfU, 0x6aba0203U, 0x825ced16U, 0x1c2b8acfU,
-    0xb492a779U, 0xf2f0f307U, 0xe2a14e69U, 0xf4cd65daU, 0xbed50605U,
-    0x621fd134U, 0xfe8ac4a6U, 0x539d342eU, 0x55a0a2f3U, 0xe132058aU,
-    0xeb75a4f6U, 0xec390b83U, 0xefaa4060U, 0x9f065e71U, 0x1051bd6eU,
-    0x8af93e21U, 0x063d96ddU, 0x05aedd3eU, 0xbd464de6U, 0x8db59154U,
-    0x5d0571c4U, 0xd46f0406U, 0x15ff6050U, 0xfb241998U, 0xe997d6bdU,
-    0x43cc8940U, 0x9e7767d9U, 0x42bdb0e8U, 0x8b880789U, 0x5b38e719U,
-    0xeedb79c8U, 0x0a47a17cU, 0x0fe97c42U, 0x1ec9f884U, 0x00000000U,
-    0x86830980U, 0xed48322bU, 0x70ac1e11U, 0x724e6c5aU, 0xfffbfd0eU,
-    0x38560f85U, 0xd51e3daeU, 0x3927362dU, 0xd9640a0fU, 0xa621685cU,
-    0x54d19b5bU, 0x2e3a2436U, 0x67b10c0aU, 0xe70f9357U, 0x96d2b4eeU,
-    0x919e1b9bU, 0xc54f80c0U, 0x20a261dcU, 0x4b695a77U, 0x1a161c12U,
-    0xba0ae293U, 0x2ae5c0a0U, 0xe0433c22U, 0x171d121bU, 0x0d0b0e09U,
-    0xc7adf28bU, 0xa8b92db6U, 0xa9c8141eU, 0x198557f1U, 0x074caf75U,
-    0xddbbee99U, 0x60fda37fU, 0x269ff701U, 0xf5bc5c72U, 0x3bc54466U,
-    0x7e345bfbU, 0x29768b43U, 0xc6dccb23U, 0xfc68b6edU, 0xf163b8e4U,
-    0xdccad731U, 0x85104263U, 0x22401397U, 0x112084c6U, 0x247d854aU,
-    0x3df8d2bbU, 0x3211aef9U, 0xa16dc729U, 0x2f4b1d9eU, 0x30f3dcb2U,
-    0x52ec0d86U, 0xe3d077c1U, 0x166c2bb3U, 0xb999a970U, 0x48fa1194U,
-    0x642247e9U, 0x8cc4a8fcU, 0x3f1aa0f0U, 0x2cd8567dU, 0x90ef2233U,
-    0x4ec78749U, 0xd1c1d938U, 0xa2fe8ccaU, 0x0b3698d4U, 0x81cfa6f5U,
-    0xde28a57aU, 0x8e26dab7U, 0xbfa43fadU, 0x9de42c3aU, 0x920d5078U,
-    0xcc9b6a5fU, 0x4662547eU, 0x13c2f68dU, 0xb8e890d8U, 0xf75e2e39U,
-    0xaff582c3U, 0x80be9f5dU, 0x937c69d0U, 0x2da96fd5U, 0x12b3cf25U,
-    0x993bc8acU, 0x7da71018U, 0x636ee89cU, 0xbb7bdb3bU, 0x7809cd26U,
-    0x18f46e59U, 0xb701ec9aU, 0x9aa8834fU, 0x6e65e695U, 0xe67eaaffU,
-    0xcf0821bcU, 0xe8e6ef15U, 0x9bd9bae7U, 0x36ce4a6fU, 0x09d4ea9fU,
-    0x7cd629b0U, 0xb2af31a4U, 0x23312a3fU, 0x9430c6a5U, 0x66c035a2U,
-    0xbc37744eU, 0xcaa6fc82U, 0xd0b0e090U, 0xd81533a7U, 0x984af104U,
-    0xdaf741ecU, 0x500e7fcdU, 0xf62f1791U, 0xd68d764dU, 0xb04d43efU,
-    0x4d54ccaaU, 0x04dfe496U, 0xb5e39ed1U, 0x881b4c6aU, 0x1fb8c12cU,
-    0x517f4665U, 0xea049d5eU, 0x355d018cU, 0x7473fa87U, 0x412efb0bU,
-    0x1d5ab367U, 0xd25292dbU, 0x5633e910U, 0x47136dd6U, 0x618c9ad7U,
-    0x0c7a37a1U, 0x148e59f8U, 0x3c89eb13U, 0x27eecea9U, 0xc935b761U,
-    0xe5ede11cU, 0xb13c7a47U, 0xdf599cd2U, 0x733f55f2U, 0xce791814U,
-    0x37bf73c7U, 0xcdea53f7U, 0xaa5b5ffdU, 0x6f14df3dU, 0xdb867844U,
-    0xf381caafU, 0xc43eb968U, 0x342c3824U, 0x405fc2a3U, 0xc372161dU,
-    0x250cbce2U, 0x498b283cU, 0x9541ff0dU, 0x017139a8U, 0xb3de080cU,
-    0xe49cd8b4U, 0xc1906456U, 0x84617bcbU, 0xb670d532U, 0x5c74486cU,
-    0x5742d0b8U, };
-
-static const uint32_t Td3[256] = {
-    0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU, 0xab6bcb3bU,
-    0x9d45f11fU, 0xfa58abacU, 0xe303934bU, 0x30fa5520U, 0x766df6adU,
-    0xcc769188U, 0x024c25f5U, 0xe5d7fc4fU, 0x2acbd7c5U, 0x35448026U,
-    0x62a38fb5U, 0xb15a49deU, 0xba1b6725U, 0xea0e9845U, 0xfec0e15dU,
-    0x2f7502c3U, 0x4cf01281U, 0x4697a38dU, 0xd3f9c66bU, 0x8f5fe703U,
-    0x929c9515U, 0x6d7aebbfU, 0x5259da95U, 0xbe832dd4U, 0x7421d358U,
-    0xe0692949U, 0xc9c8448eU, 0xc2896a75U, 0x8e7978f4U, 0x583e6b99U,
-    0xb971dd27U, 0xe14fb6beU, 0x88ad17f0U, 0x20ac66c9U, 0xce3ab47dU,
-    0xdf4a1863U, 0x1a3182e5U, 0x51336097U, 0x537f4562U, 0x6477e0b1U,
-    0x6bae84bbU, 0x81a01cfeU, 0x082b94f9U, 0x48685870U, 0x45fd198fU,
-    0xde6c8794U, 0x7bf8b752U, 0x73d323abU, 0x4b02e272U, 0x1f8f57e3U,
-    0x55ab2a66U, 0xeb2807b2U, 0xb5c2032fU, 0xc57b9a86U, 0x3708a5d3U,
-    0x2887f230U, 0xbfa5b223U, 0x036aba02U, 0x16825cedU, 0xcf1c2b8aU,
-    0x79b492a7U, 0x07f2f0f3U, 0x69e2a14eU, 0xdaf4cd65U, 0x05bed506U,
-    0x34621fd1U, 0xa6fe8ac4U, 0x2e539d34U, 0xf355a0a2U, 0x8ae13205U,
-    0xf6eb75a4U, 0x83ec390bU, 0x60efaa40U, 0x719f065eU, 0x6e1051bdU,
-    0x218af93eU, 0xdd063d96U, 0x3e05aeddU, 0xe6bd464dU, 0x548db591U,
-    0xc45d0571U, 0x06d46f04U, 0x5015ff60U, 0x98fb2419U, 0xbde997d6U,
-    0x4043cc89U, 0xd99e7767U, 0xe842bdb0U, 0x898b8807U, 0x195b38e7U,
-    0xc8eedb79U, 0x7c0a47a1U, 0x420fe97cU, 0x841ec9f8U, 0x00000000U,
-    0x80868309U, 0x2bed4832U, 0x1170ac1eU, 0x5a724e6cU, 0x0efffbfdU,
-    0x8538560fU, 0xaed51e3dU, 0x2d392736U, 0x0fd9640aU, 0x5ca62168U,
-    0x5b54d19bU, 0x362e3a24U, 0x0a67b10cU, 0x57e70f93U, 0xee96d2b4U,
-    0x9b919e1bU, 0xc0c54f80U, 0xdc20a261U, 0x774b695aU, 0x121a161cU,
-    0x93ba0ae2U, 0xa02ae5c0U, 0x22e0433cU, 0x1b171d12U, 0x090d0b0eU,
-    0x8bc7adf2U, 0xb6a8b92dU, 0x1ea9c814U, 0xf1198557U, 0x75074cafU,
-    0x99ddbbeeU, 0x7f60fda3U, 0x01269ff7U, 0x72f5bc5cU, 0x663bc544U,
-    0xfb7e345bU, 0x4329768bU, 0x23c6dccbU, 0xedfc68b6U, 0xe4f163b8U,
-    0x31dccad7U, 0x63851042U, 0x97224013U, 0xc6112084U, 0x4a247d85U,
-    0xbb3df8d2U, 0xf93211aeU, 0x29a16dc7U, 0x9e2f4b1dU, 0xb230f3dcU,
-    0x8652ec0dU, 0xc1e3d077U, 0xb3166c2bU, 0x70b999a9U, 0x9448fa11U,
-    0xe9642247U, 0xfc8cc4a8U, 0xf03f1aa0U, 0x7d2cd856U, 0x3390ef22U,
-    0x494ec787U, 0x38d1c1d9U, 0xcaa2fe8cU, 0xd40b3698U, 0xf581cfa6U,
-    0x7ade28a5U, 0xb78e26daU, 0xadbfa43fU, 0x3a9de42cU, 0x78920d50U,
-    0x5fcc9b6aU, 0x7e466254U, 0x8d13c2f6U, 0xd8b8e890U, 0x39f75e2eU,
-    0xc3aff582U, 0x5d80be9fU, 0xd0937c69U, 0xd52da96fU, 0x2512b3cfU,
-    0xac993bc8U, 0x187da710U, 0x9c636ee8U, 0x3bbb7bdbU, 0x267809cdU,
-    0x5918f46eU, 0x9ab701ecU, 0x4f9aa883U, 0x956e65e6U, 0xffe67eaaU,
-    0xbccf0821U, 0x15e8e6efU, 0xe79bd9baU, 0x6f36ce4aU, 0x9f09d4eaU,
-    0xb07cd629U, 0xa4b2af31U, 0x3f23312aU, 0xa59430c6U, 0xa266c035U,
-    0x4ebc3774U, 0x82caa6fcU, 0x90d0b0e0U, 0xa7d81533U, 0x04984af1U,
-    0xecdaf741U, 0xcd500e7fU, 0x91f62f17U, 0x4dd68d76U, 0xefb04d43U,
-    0xaa4d54ccU, 0x9604dfe4U, 0xd1b5e39eU, 0x6a881b4cU, 0x2c1fb8c1U,
-    0x65517f46U, 0x5eea049dU, 0x8c355d01U, 0x877473faU, 0x0b412efbU,
-    0x671d5ab3U, 0xdbd25292U, 0x105633e9U, 0xd647136dU, 0xd7618c9aU,
-    0xa10c7a37U, 0xf8148e59U, 0x133c89ebU, 0xa927eeceU, 0x61c935b7U,
-    0x1ce5ede1U, 0x47b13c7aU, 0xd2df599cU, 0xf2733f55U, 0x14ce7918U,
-    0xc737bf73U, 0xf7cdea53U, 0xfdaa5b5fU, 0x3d6f14dfU, 0x44db8678U,
-    0xaff381caU, 0x68c43eb9U, 0x24342c38U, 0xa3405fc2U, 0x1dc37216U,
-    0xe2250cbcU, 0x3c498b28U, 0x0d9541ffU, 0xa8017139U, 0x0cb3de08U,
-    0xb4e49cd8U, 0x56c19064U, 0xcb84617bU, 0x32b670d5U, 0x6c5c7448U,
-    0xb85742d0U, };
-
-static const uint8_t Td4[256] = {
-    0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U, 0xbfU, 0x40U, 0xa3U,
-    0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU, 0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU,
-    0xffU, 0x87U, 0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 0xcbU, 0x54U,
-    0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU, 0xeeU, 0x4cU, 0x95U, 0x0bU,
-    0x42U, 0xfaU, 0xc3U, 0x4eU, 0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U,
-    0xb2U, 0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U, 0x72U, 0xf8U,
-    0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U, 0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU,
-    0x65U, 0xb6U, 0x92U, 0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU,
-    0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U, 0x90U, 0xd8U, 0xabU,
-    0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU, 0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U,
-    0x45U, 0x06U, 0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U, 0xc1U,
-    0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU, 0x3aU, 0x91U, 0x11U, 0x41U,
-    0x4fU, 0x67U, 0xdcU, 0xeaU, 0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U,
-    0x73U, 0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U, 0xe2U, 0xf9U,
-    0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU, 0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU,
-    0x29U, 0xc5U, 0x89U, 0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU,
-    0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U, 0x9aU, 0xdbU, 0xc0U,
-    0xfeU, 0x78U, 0xcdU, 0x5aU, 0xf4U, 0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U,
-    0xc7U, 0x31U, 0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU, 0x60U,
-    0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU, 0x2dU, 0xe5U, 0x7aU, 0x9fU,
-    0x93U, 0xc9U, 0x9cU, 0xefU, 0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U,
-    0xb0U, 0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U, 0x17U, 0x2bU,
-    0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U, 0xe1U, 0x69U, 0x14U, 0x63U, 0x55U,
-    0x21U, 0x0cU, 0x7dU, };
-
-static const uint32_t rcon[] = {
-    0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000,
-    0x20000000, 0x40000000, 0x80000000, 0x1B000000, 0x36000000,
-    // for 128-bit blocks, Rijndael never uses more than 10 rcon values
-};
-
-int aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits,
-                             AES_KEY *aeskey) {
-  uint32_t *rk;
-  int i = 0;
-  uint32_t temp;
-
-  if (!key || !aeskey) {
-    return -1;
-  }
-
-  switch (bits) {
-    case 128:
-      aeskey->rounds = 10;
-      break;
-    case 192:
-      aeskey->rounds = 12;
-      break;
-    case 256:
-      aeskey->rounds = 14;
-      break;
-    default:
-      return -2;
-  }
-
-  rk = aeskey->rd_key;
-
-  rk[0] = GETU32(key);
-  rk[1] = GETU32(key + 4);
-  rk[2] = GETU32(key + 8);
-  rk[3] = GETU32(key + 12);
-  if (bits == 128) {
-    while (1) {
-      temp = rk[3];
-      rk[4] = rk[0] ^ (Te2[(temp >> 16) & 0xff] & 0xff000000) ^
-              (Te3[(temp >> 8) & 0xff] & 0x00ff0000) ^
-              (Te0[(temp) & 0xff] & 0x0000ff00) ^
-              (Te1[(temp >> 24)] & 0x000000ff) ^ rcon[i];
-      rk[5] = rk[1] ^ rk[4];
-      rk[6] = rk[2] ^ rk[5];
-      rk[7] = rk[3] ^ rk[6];
-      if (++i == 10) {
-        return 0;
-      }
-      rk += 4;
-    }
-  }
-  rk[4] = GETU32(key + 16);
-  rk[5] = GETU32(key + 20);
-  if (bits == 192) {
-    while (1) {
-      temp = rk[5];
-      rk[6] = rk[0] ^ (Te2[(temp >> 16) & 0xff] & 0xff000000) ^
-              (Te3[(temp >> 8) & 0xff] & 0x00ff0000) ^
-              (Te0[(temp) & 0xff] & 0x0000ff00) ^
-              (Te1[(temp >> 24)] & 0x000000ff) ^ rcon[i];
-      rk[7] = rk[1] ^ rk[6];
-      rk[8] = rk[2] ^ rk[7];
-      rk[9] = rk[3] ^ rk[8];
-      if (++i == 8) {
-        return 0;
-      }
-      rk[10] = rk[4] ^ rk[9];
-      rk[11] = rk[5] ^ rk[10];
-      rk += 6;
-    }
-  }
-  rk[6] = GETU32(key + 24);
-  rk[7] = GETU32(key + 28);
-  if (bits == 256) {
-    while (1) {
-      temp = rk[7];
-      rk[8] = rk[0] ^ (Te2[(temp >> 16) & 0xff] & 0xff000000) ^
-              (Te3[(temp >> 8) & 0xff] & 0x00ff0000) ^
-              (Te0[(temp) & 0xff] & 0x0000ff00) ^
-              (Te1[(temp >> 24)] & 0x000000ff) ^ rcon[i];
-      rk[9] = rk[1] ^ rk[8];
-      rk[10] = rk[2] ^ rk[9];
-      rk[11] = rk[3] ^ rk[10];
-      if (++i == 7) {
-        return 0;
-      }
-      temp = rk[11];
-      rk[12] = rk[4] ^ (Te2[(temp >> 24)] & 0xff000000) ^
-               (Te3[(temp >> 16) & 0xff] & 0x00ff0000) ^
-               (Te0[(temp >> 8) & 0xff] & 0x0000ff00) ^
-               (Te1[(temp) & 0xff] & 0x000000ff);
-      rk[13] = rk[5] ^ rk[12];
-      rk[14] = rk[6] ^ rk[13];
-      rk[15] = rk[7] ^ rk[14];
-
-      rk += 8;
-    }
-  }
-  return 0;
-}
-
-int aes_nohw_set_decrypt_key(const uint8_t *key, unsigned bits,
-                             AES_KEY *aeskey) {
-  uint32_t *rk;
-  int i, j, status;
-  uint32_t temp;
-
-  // first, start with an encryption schedule
-  status = AES_set_encrypt_key(key, bits, aeskey);
-  if (status < 0) {
-    return status;
-  }
-
-  rk = aeskey->rd_key;
-
-  // invert the order of the round keys:
-  for (i = 0, j = 4 * aeskey->rounds; i < j; i += 4, j -= 4) {
-    temp = rk[i];
-    rk[i] = rk[j];
-    rk[j] = temp;
-    temp = rk[i + 1];
-    rk[i + 1] = rk[j + 1];
-    rk[j + 1] = temp;
-    temp = rk[i + 2];
-    rk[i + 2] = rk[j + 2];
-    rk[j + 2] = temp;
-    temp = rk[i + 3];
-    rk[i + 3] = rk[j + 3];
-    rk[j + 3] = temp;
-  }
-  // apply the inverse MixColumn transform to all round keys but the first and
-  // the last:
-  for (i = 1; i < (int)aeskey->rounds; i++) {
-    rk += 4;
-    rk[0] =
-        Td0[Te1[(rk[0] >> 24)] & 0xff] ^ Td1[Te1[(rk[0] >> 16) & 0xff] & 0xff] ^
-        Td2[Te1[(rk[0] >> 8) & 0xff] & 0xff] ^ Td3[Te1[(rk[0]) & 0xff] & 0xff];
-    rk[1] =
-        Td0[Te1[(rk[1] >> 24)] & 0xff] ^ Td1[Te1[(rk[1] >> 16) & 0xff] & 0xff] ^
-        Td2[Te1[(rk[1] >> 8) & 0xff] & 0xff] ^ Td3[Te1[(rk[1]) & 0xff] & 0xff];
-    rk[2] =
-        Td0[Te1[(rk[2] >> 24)] & 0xff] ^ Td1[Te1[(rk[2] >> 16) & 0xff] & 0xff] ^
-        Td2[Te1[(rk[2] >> 8) & 0xff] & 0xff] ^ Td3[Te1[(rk[2]) & 0xff] & 0xff];
-    rk[3] =
-        Td0[Te1[(rk[3] >> 24)] & 0xff] ^ Td1[Te1[(rk[3] >> 16) & 0xff] & 0xff] ^
-        Td2[Te1[(rk[3] >> 8) & 0xff] & 0xff] ^ Td3[Te1[(rk[3]) & 0xff] & 0xff];
-  }
-  return 0;
-}
-
-void aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) {
-  const uint32_t *rk;
-  uint32_t s0, s1, s2, s3, t0, t1, t2, t3;
-  int r;
-
-  assert(in && out && key);
-  rk = key->rd_key;
-
-  // map byte array block to cipher state
-  // and add initial round key:
-  s0 = GETU32(in) ^ rk[0];
-  s1 = GETU32(in + 4) ^ rk[1];
-  s2 = GETU32(in + 8) ^ rk[2];
-  s3 = GETU32(in + 12) ^ rk[3];
-
-  // Nr - 1 full rounds:
-  r = key->rounds >> 1;
-  for (;;) {
-    t0 = Te0[(s0 >> 24)] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^
-         Te3[(s3) & 0xff] ^ rk[4];
-    t1 = Te0[(s1 >> 24)] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^
-         Te3[(s0) & 0xff] ^ rk[5];
-    t2 = Te0[(s2 >> 24)] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^
-         Te3[(s1) & 0xff] ^ rk[6];
-    t3 = Te0[(s3 >> 24)] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^
-         Te3[(s2) & 0xff] ^ rk[7];
-
-    rk += 8;
-    if (--r == 0) {
-      break;
-    }
-
-    s0 = Te0[(t0 >> 24)] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^
-         Te3[(t3) & 0xff] ^ rk[0];
-    s1 = Te0[(t1 >> 24)] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^
-         Te3[(t0) & 0xff] ^ rk[1];
-    s2 = Te0[(t2 >> 24)] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^
-         Te3[(t1) & 0xff] ^ rk[2];
-    s3 = Te0[(t3 >> 24)] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^
-         Te3[(t2) & 0xff] ^ rk[3];
-  }
-
-  //  apply last round and map cipher state to byte array block:
-  s0 = (Te2[(t0 >> 24)] & 0xff000000) ^ (Te3[(t1 >> 16) & 0xff] & 0x00ff0000) ^
-       (Te0[(t2 >> 8) & 0xff] & 0x0000ff00) ^ (Te1[(t3) & 0xff] & 0x000000ff) ^
-       rk[0];
-  PUTU32(out, s0);
-  s1 = (Te2[(t1 >> 24)] & 0xff000000) ^ (Te3[(t2 >> 16) & 0xff] & 0x00ff0000) ^
-       (Te0[(t3 >> 8) & 0xff] & 0x0000ff00) ^ (Te1[(t0) & 0xff] & 0x000000ff) ^
-       rk[1];
-  PUTU32(out + 4, s1);
-  s2 = (Te2[(t2 >> 24)] & 0xff000000) ^ (Te3[(t3 >> 16) & 0xff] & 0x00ff0000) ^
-       (Te0[(t0 >> 8) & 0xff] & 0x0000ff00) ^ (Te1[(t1) & 0xff] & 0x000000ff) ^
-       rk[2];
-  PUTU32(out + 8, s2);
-  s3 = (Te2[(t3 >> 24)] & 0xff000000) ^ (Te3[(t0 >> 16) & 0xff] & 0x00ff0000) ^
-       (Te0[(t1 >> 8) & 0xff] & 0x0000ff00) ^ (Te1[(t2) & 0xff] & 0x000000ff) ^
-       rk[3];
-  PUTU32(out + 12, s3);
-}
-
-void aes_nohw_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) {
-  const uint32_t *rk;
-  uint32_t s0, s1, s2, s3, t0, t1, t2, t3;
-  int r;
-
-  assert(in && out && key);
-  rk = key->rd_key;
-
-  // map byte array block to cipher state
-  // and add initial round key:
-  s0 = GETU32(in) ^ rk[0];
-  s1 = GETU32(in + 4) ^ rk[1];
-  s2 = GETU32(in + 8) ^ rk[2];
-  s3 = GETU32(in + 12) ^ rk[3];
-
-  // Nr - 1 full rounds:
-  r = key->rounds >> 1;
-  for (;;) {
-    t0 = Td0[(s0 >> 24)] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^
-         Td3[(s1) & 0xff] ^ rk[4];
-    t1 = Td0[(s1 >> 24)] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^
-         Td3[(s2) & 0xff] ^ rk[5];
-    t2 = Td0[(s2 >> 24)] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^
-         Td3[(s3) & 0xff] ^ rk[6];
-    t3 = Td0[(s3 >> 24)] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^
-         Td3[(s0) & 0xff] ^ rk[7];
-
-    rk += 8;
-    if (--r == 0) {
-      break;
-    }
-
-    s0 = Td0[(t0 >> 24)] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^
-         Td3[(t1) & 0xff] ^ rk[0];
-    s1 = Td0[(t1 >> 24)] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^
-         Td3[(t2) & 0xff] ^ rk[1];
-    s2 = Td0[(t2 >> 24)] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^
-         Td3[(t3) & 0xff] ^ rk[2];
-    s3 = Td0[(t3 >> 24)] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^
-         Td3[(t0) & 0xff] ^ rk[3];
-  }
-
-  // apply last round and
-  // map cipher state to byte array block:
-  s0 = ((uint32_t)Td4[(t0 >> 24)] << 24) ^
-       ((uint32_t)Td4[(t3 >> 16) & 0xff] << 16) ^
-       ((uint32_t)Td4[(t2 >> 8) & 0xff] << 8) ^
-       ((uint32_t)Td4[(t1) & 0xff]) ^ rk[0];
-  PUTU32(out, s0);
-  s1 = ((uint32_t)Td4[(t1 >> 24)] << 24) ^
-       ((uint32_t)Td4[(t0 >> 16) & 0xff] << 16) ^
-       ((uint32_t)Td4[(t3 >> 8) & 0xff] << 8) ^
-       ((uint32_t)Td4[(t2) & 0xff]) ^ rk[1];
-  PUTU32(out + 4, s1);
-  s2 = ((uint32_t)Td4[(t2 >> 24)] << 24) ^
-       ((uint32_t)Td4[(t1 >> 16) & 0xff] << 16) ^
-       ((uint32_t)Td4[(t0 >> 8) & 0xff] << 8) ^
-       ((uint32_t)Td4[(t3) & 0xff]) ^ rk[2];
-  PUTU32(out + 8, s2);
-  s3 = ((uint32_t)Td4[(t3 >> 24)] << 24) ^
-       ((uint32_t)Td4[(t2 >> 16) & 0xff] << 16) ^
-       ((uint32_t)Td4[(t1 >> 8) & 0xff] << 8) ^
-       ((uint32_t)Td4[(t0) & 0xff]) ^ rk[3];
-  PUTU32(out + 12, s3);
-}
-
-#endif  // NO_ASM || (!X86 && !X86_64 && !ARM)
-
 // Be aware that different sets of AES functions use incompatible key
 // representations, varying in format of the key schedule, the |AES_KEY.rounds|
 // value, or both. Therefore they cannot mix. Also, on AArch64, the plain-C
diff --git a/crypto/fipsmodule/aes/aes_nohw.c b/crypto/fipsmodule/aes/aes_nohw.c
new file mode 100644
index 0000000..ffbee89
--- /dev/null
+++ b/crypto/fipsmodule/aes/aes_nohw.c
@@ -0,0 +1,1282 @@
+/* Copyright (c) 2019, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <openssl/aes.h>
+
+#include <assert.h>
+#include <string.h>
+
+#include "../../internal.h"
+
+#if defined(OPENSSL_SSE2)
+#include <emmintrin.h>
+#endif
+
+
+// This file contains a constant-time implementation of AES, bitsliced with
+// 32-bit, 64-bit, or 128-bit words, operating on two-, four-, and eight-block
+// batches, respectively. The 128-bit implementation requires SSE2 intrinsics.
+//
+// This implementation is based on the algorithms described in the following
+// references:
+// - https://bearssl.org/constanttime.html#aes
+// - https://eprint.iacr.org/2009/129.pdf
+// - https://eprint.iacr.org/2009/191.pdf
+
+
+// Word operations.
+//
+// An aes_word_t is the word used for this AES implementation. Throughout this
+// file, bits and bytes are ordered little-endian, though "left" and "right"
+// shifts match the operations themselves, which makes them reversed in a
+// little-endian, left-to-right reading.
+//
+// Eight |aes_word_t|s contain |AES_NOHW_BATCH_SIZE| blocks. The bits in an
+// |aes_word_t| are divided into 16 consecutive groups of |AES_NOHW_BATCH_SIZE|
+// bits each, each corresponding to a byte in an AES block in column-major
+// order (AES's byte order). We refer to these as "logical bytes". Note, in the
+// 32-bit and 64-bit implementations, they are smaller than a byte. (The
+// contents of a logical byte will be described later.)
+//
+// MSVC does not support C bit operators on |__m128i|, so the wrapper functions
+// |aes_nohw_and|, etc., should be used instead. Note |aes_nohw_shift_left| and
+// |aes_nohw_shift_right| measure the shift in logical bytes. That is, the shift
+// value ranges from 0 to 15 independent of |aes_word_t| and
+// |AES_NOHW_BATCH_SIZE|.
+//
+// This ordering is different from https://eprint.iacr.org/2009/129.pdf, which
+// uses row-major order. Matching the AES order was easier to reason about, and
+// we do not have PSHUFB available to arbitrarily permute bytes.
+
+#if defined(OPENSSL_SSE2)
+typedef __m128i aes_word_t;
+// AES_NOHW_WORD_SIZE is sizeof(aes_word_t). alignas(sizeof(T)) does not work in
+// MSVC, so we define a constant.
+#define AES_NOHW_WORD_SIZE 16
+#define AES_NOHW_BATCH_SIZE 8
+#define AES_NOHW_ROW0_MASK \
+  _mm_set_epi32(0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff)
+#define AES_NOHW_ROW1_MASK \
+  _mm_set_epi32(0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00)
+#define AES_NOHW_ROW2_MASK \
+  _mm_set_epi32(0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000)
+#define AES_NOHW_ROW3_MASK \
+  _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000)
+#define AES_NOHW_COL01_MASK \
+  _mm_set_epi32(0x00000000, 0x00000000, 0xffffffff, 0xffffffff)
+#define AES_NOHW_COL2_MASK \
+  _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0x00000000)
+#define AES_NOHW_COL3_MASK \
+  _mm_set_epi32(0xffffffff, 0x00000000, 0x00000000, 0x00000000)
+
+static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) {
+  return _mm_and_si128(a, b);
+}
+
+static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) {
+  return _mm_or_si128(a, b);
+}
+
+static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) {
+  return _mm_xor_si128(a, b);
+}
+
+static inline aes_word_t aes_nohw_not(aes_word_t a) {
+  return _mm_xor_si128(
+      a, _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff));
+}
+
+// These are macros because parameters to |_mm_slli_si128| and |_mm_srli_si128|
+// must be constants.
+#define aes_nohw_shift_left(/* aes_word_t */ a, /* const */ i) \
+  _mm_slli_si128((a), (i))
+#define aes_nohw_shift_right(/* aes_word_t */ a, /* const */ i) \
+  _mm_srli_si128((a), (i))
+#else  // !OPENSSL_SSE2
+#if defined(OPENSSL_64_BIT)
+typedef uint64_t aes_word_t;
+#define AES_NOHW_WORD_SIZE 8
+#define AES_NOHW_BATCH_SIZE 4
+#define AES_NOHW_ROW0_MASK UINT64_C(0x000f000f000f000f)
+#define AES_NOHW_ROW1_MASK UINT64_C(0x00f000f000f000f0)
+#define AES_NOHW_ROW2_MASK UINT64_C(0x0f000f000f000f00)
+#define AES_NOHW_ROW3_MASK UINT64_C(0xf000f000f000f000)
+#define AES_NOHW_COL01_MASK UINT64_C(0x00000000ffffffff)
+#define AES_NOHW_COL2_MASK UINT64_C(0x0000ffff00000000)
+#define AES_NOHW_COL3_MASK UINT64_C(0xffff000000000000)
+#else  // !OPENSSL_64_BIT
+typedef uint32_t aes_word_t;
+#define AES_NOHW_WORD_SIZE 4
+#define AES_NOHW_BATCH_SIZE 2
+#define AES_NOHW_ROW0_MASK 0x03030303
+#define AES_NOHW_ROW1_MASK 0x0c0c0c0c
+#define AES_NOHW_ROW2_MASK 0x30303030
+#define AES_NOHW_ROW3_MASK 0xc0c0c0c0
+#define AES_NOHW_COL01_MASK 0x0000ffff
+#define AES_NOHW_COL2_MASK 0x00ff0000
+#define AES_NOHW_COL3_MASK 0xff000000
+#endif  // OPENSSL_64_BIT
+
+static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) {
+  return a & b;
+}
+
+static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) {
+  return a | b;
+}
+
+static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) {
+  return a ^ b;
+}
+
+static inline aes_word_t aes_nohw_not(aes_word_t a) { return ~a; }
+
+static inline aes_word_t aes_nohw_shift_left(aes_word_t a, aes_word_t i) {
+  return a << (i * AES_NOHW_BATCH_SIZE);
+}
+
+static inline aes_word_t aes_nohw_shift_right(aes_word_t a, aes_word_t i) {
+  return a >> (i * AES_NOHW_BATCH_SIZE);
+}
+#endif  // OPENSSL_SSE2
+
+OPENSSL_STATIC_ASSERT(AES_NOHW_BATCH_SIZE * 128 == 8 * 8 * sizeof(aes_word_t),
+                      "batch size does not match word size");
+OPENSSL_STATIC_ASSERT(AES_NOHW_WORD_SIZE == sizeof(aes_word_t),
+                      "AES_NOHW_WORD_SIZE is incorrect");
+
+
+// Block representations.
+//
+// This implementation uses three representations for AES blocks. First, the
+// public API represents blocks as uint8_t[16] in the usual way. Second, most
+// AES steps are evaluated in bitsliced form, stored in an |AES_NOHW_BATCH|.
+// This stores |AES_NOHW_BATCH_SIZE| blocks in bitsliced order. For 64-bit words
+// containing bitsliced blocks a, b, c, d, this would be as follows (vertical
+// bars divide logical bytes):
+//
+//   batch.w[0] = a0 b0 c0 d0 |  a8  b8  c8  d8 | a16 b16 c16 d16 ...
+//   batch.w[1] = a1 b1 c1 d1 |  a9  b9  c9  d9 | a17 b17 c17 d17 ...
+//   batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ...
+//   batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ...
+//   ...
+//
+// Finally, an individual block may be stored as an intermediate form in an
+// aes_word_t[AES_NOHW_BLOCK_WORDS]. In this form, we permute the bits in each
+// block, so that block[0]'s ith logical byte contains least-significant
+// |AES_NOHW_BATCH_SIZE| bits of byte i, block[1] contains the next group of
+// |AES_NOHW_BATCH_SIZE| bits, and so on. We refer to this transformation as
+// "compacting" the block. Note this is no-op with 128-bit words because then
+// |AES_NOHW_BLOCK_WORDS| is one and |AES_NOHW_BATCH_SIZE| is eight. For 64-bit
+// words, one block would be stored in two words:
+//
+//   block[0] = a0 a1 a2 a3 |  a8  a9 a10 a11 | a16 a17 a18 a19 ...
+//   block[1] = a4 a5 a6 a7 | a12 a13 a14 a15 | a20 a21 a22 a23 ...
+//
+// Observe that the distances between corresponding bits in bitsliced and
+// compact bit orders match. If we line up corresponding words of each block,
+// the bitsliced and compact representations may be converted by tranposing bits
+// in corresponding logical bytes. Continuing the 64-bit example:
+//
+//   block_a[0] = a0 a1 a2 a3 |  a8  a9 a10 a11 | a16 a17 a18 a19 ...
+//   block_b[0] = b0 b1 b2 b3 |  b8  b9 b10 b11 | b16 b17 b18 b19 ...
+//   block_c[0] = c0 c1 c2 c3 |  c8  c9 c10 c11 | c16 c17 c18 c19 ...
+//   block_d[0] = d0 d1 d2 d3 |  d8  d9 d10 d11 | d16 d17 d18 d19 ...
+//
+//   batch.w[0] = a0 b0 c0 d0 |  a8  b8  c8  d8 | a16 b16 c16 d16 ...
+//   batch.w[1] = a1 b1 c1 d1 |  a9  b9  c9  d9 | a17 b17 c17 d17 ...
+//   batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ...
+//   batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ...
+//
+// Note also that bitwise operations and (logical) byte permutations on an
+// |aes_word_t| work equally for the bitsliced and compact words.
+//
+// We use the compact form in the |AES_KEY| representation to save work
+// inflating round keys into |AES_NOHW_BATCH|. The compact form also exists
+// temporarily while moving blocks in or out of an |AES_NOHW_BATCH|, immediately
+// before or after |aes_nohw_transpose|.
+
+#define AES_NOHW_BLOCK_WORDS (16 / sizeof(aes_word_t))
+
+// An AES_NOHW_BATCH stores |AES_NOHW_BATCH_SIZE| blocks. Unless otherwise
+// specified, it is in bitsliced form.
+typedef struct {
+  aes_word_t w[8];
+} AES_NOHW_BATCH;
+
+// An AES_NOHW_SCHEDULE is an expanded bitsliced AES key schedule. It is
+// suitable for encryption or decryption. It is as large as |AES_NOHW_BATCH|
+// |AES_KEY|s so it should not be used as a long-term key representation.
+typedef struct {
+  // keys is an array of batches, one for each round key. Each batch stores
+  // |AES_NOHW_BATCH_SIZE| copies of the round key in bitsliced form.
+  AES_NOHW_BATCH keys[AES_MAXNR + 1];
+} AES_NOHW_SCHEDULE;
+
+// aes_nohw_batch_set sets the |i|th block of |batch| to |in|. |batch| is in
+// compact form.
+static inline void aes_nohw_batch_set(AES_NOHW_BATCH *batch,
+                                      const aes_word_t in[AES_NOHW_BLOCK_WORDS],
+                                      size_t i) {
+  // Note the words are interleaved. The order comes from |aes_nohw_transpose|.
+  // If |i| is zero and this is the 64-bit implementation, in[0] contains bits
+  // 0-3 and in[1] contains bits 4-7. We place in[0] at w[0] and in[1] at
+  // w[4] so that bits 0 and 4 are in the correct position. (In general, bits
+  // along diagonals of |AES_NOHW_BATCH_SIZE| by |AES_NOHW_BATCH_SIZE| squares
+  // will be correctly placed.)
+  assert(i < AES_NOHW_BATCH_SIZE);
+#if defined(OPENSSL_SSE2)
+  batch->w[i] = in[0];
+#elif defined(OPENSSL_64_BIT)
+  batch->w[i] = in[0];
+  batch->w[i + 4] = in[1];
+#else
+  batch->w[i] = in[0];
+  batch->w[i + 2] = in[1];
+  batch->w[i + 4] = in[2];
+  batch->w[i + 6] = in[3];
+#endif
+}
+
+// aes_nohw_batch_get writes the |i|th block of |batch| to |out|. |batch| is in
+// compact form.
+static inline void aes_nohw_batch_get(const AES_NOHW_BATCH *batch,
+                                      aes_word_t out[AES_NOHW_BLOCK_WORDS],
+                                      size_t i) {
+  assert(i < AES_NOHW_BATCH_SIZE);
+#if defined(OPENSSL_SSE2)
+  out[0] = batch->w[i];
+#elif defined(OPENSSL_64_BIT)
+  out[0] = batch->w[i];
+  out[1] = batch->w[i + 4];
+#else
+  out[0] = batch->w[i];
+  out[1] = batch->w[i + 2];
+  out[2] = batch->w[i + 4];
+  out[3] = batch->w[i + 6];
+#endif
+}
+
+#if !defined(OPENSSL_SSE2)
+// aes_nohw_delta_swap returns |a| with bits |a & mask| and
+// |a & (mask << shift)| swapped. |mask| and |mask << shift| may not overlap.
+static inline aes_word_t aes_nohw_delta_swap(aes_word_t a, aes_word_t mask,
+                                             aes_word_t shift) {
+  // See
+  // https://reflectionsonsecurity.wordpress.com/2014/05/11/efficient-bit-permutation-using-delta-swaps/
+  aes_word_t b = (a ^ (a >> shift)) & mask;
+  return a ^ b ^ (b << shift);
+}
+
+// In the 32-bit and 64-bit implementations, a block spans multiple words.
+// |aes_nohw_compact_block| must permute bits across different words. First we
+// implement |aes_nohw_compact_word| which performs a smaller version of the
+// transformation which stays within a single word.
+//
+// These transformations are generalizations of the output of
+// http://programming.sirrida.de/calcperm.php on smaller inputs.
+#if defined(OPENSSL_64_BIT)
+static inline uint64_t aes_nohw_compact_word(uint64_t a) {
+  // Numbering the 64/2 = 16 4-bit chunks, least to most significant, we swap
+  // quartets of those chunks:
+  //   0 1 2 3 | 4 5 6 7 | 8  9 10 11 | 12 13 14 15 =>
+  //   0 2 1 3 | 4 6 5 7 | 8 10  9 11 | 12 14 13 15
+  a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
+  // Swap quartets of 8-bit chunks (still numbering by 4-bit chunks):
+  //   0 2 1 3 | 4 6 5 7 | 8 10  9 11 | 12 14 13 15 =>
+  //   0 2 4 6 | 1 3 5 7 | 8 10 12 14 |  9 11 13 15
+  a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
+  // Swap quartets of 16-bit chunks (still numbering by 4-bit chunks):
+  //   0 2 4 6 | 1  3  5  7 | 8 10 12 14 | 9 11 13 15 =>
+  //   0 2 4 6 | 8 10 12 14 | 1  3  5  7 | 9 11 13 15
+  a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
+  return a;
+}
+
+static inline uint64_t aes_nohw_uncompact_word(uint64_t a) {
+  // Reverse the steps of |aes_nohw_uncompact_word|.
+  a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
+  a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
+  a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
+  return a;
+}
+#else   // !OPENSSL_64_BIT
+static inline uint32_t aes_nohw_compact_word(uint32_t a) {
+  // Numbering the 32/2 = 16 pairs of bits, least to most significant, we swap:
+  //   0 1 2 3 | 4 5 6 7 | 8  9 10 11 | 12 13 14 15 =>
+  //   0 4 2 6 | 1 5 3 7 | 8 12 10 14 |  9 13 11 15
+  // Note:  0x00cc = 0b0000_0000_1100_1100
+  //   0x00cc << 6 = 0b0011_0011_0000_0000
+  a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
+  // Now we swap groups of four bits (still numbering by pairs):
+  //   0 4 2  6 | 1 5 3  7 | 8 12 10 14 | 9 13 11 15 =>
+  //   0 4 8 12 | 1 5 9 13 | 2  6 10 14 | 3  7 11 15
+  // Note: 0x0000_f0f0 << 12 = 0x0f0f_0000
+  a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
+  return a;
+}
+
+static inline uint32_t aes_nohw_uncompact_word(uint32_t a) {
+  // Reverse the steps of |aes_nohw_uncompact_word|.
+  a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
+  a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
+  return a;
+}
+
+static inline uint32_t aes_nohw_word_from_bytes(uint8_t a0, uint8_t a1,
+                                                uint8_t a2, uint8_t a3) {
+  return (uint32_t)a0 | ((uint32_t)a1 << 8) | ((uint32_t)a2 << 16) |
+         ((uint32_t)a3 << 24);
+}
+#endif  // OPENSSL_64_BIT
+#endif  // !OPENSSL_SSE2
+
+static inline void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS],
+                                          const uint8_t in[16]) {
+  memcpy(out, in, 16);
+#if defined(OPENSSL_SSE2)
+  // No conversions needed.
+#elif defined(OPENSSL_64_BIT)
+  uint64_t a0 = aes_nohw_compact_word(out[0]);
+  uint64_t a1 = aes_nohw_compact_word(out[1]);
+  out[0] = (a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32);
+  out[1] = (a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32);
+#else
+  uint32_t a0 = aes_nohw_compact_word(out[0]);
+  uint32_t a1 = aes_nohw_compact_word(out[1]);
+  uint32_t a2 = aes_nohw_compact_word(out[2]);
+  uint32_t a3 = aes_nohw_compact_word(out[3]);
+  // Note clang, when building for ARM Thumb2, will sometimes miscompile
+  // expressions such as (a0 & 0x0000ff00) << 8, particularly when building
+  // without optimizations. This bug was introduced in
+  // https://reviews.llvm.org/rL340261 and fixed in
+  // https://reviews.llvm.org/rL351310. The following is written to avoid this.
+  out[0] = aes_nohw_word_from_bytes(a0, a1, a2, a3);
+  out[1] = aes_nohw_word_from_bytes(a0 >> 8, a1 >> 8, a2 >> 8, a3 >> 8);
+  out[2] = aes_nohw_word_from_bytes(a0 >> 16, a1 >> 16, a2 >> 16, a3 >> 16);
+  out[3] = aes_nohw_word_from_bytes(a0 >> 24, a1 >> 24, a2 >> 24, a3 >> 24);
+#endif
+}
+
+static inline void aes_nohw_uncompact_block(
+    uint8_t out[16], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) {
+#if defined(OPENSSL_SSE2)
+  memcpy(out, in, 16);  // No conversions needed.
+#elif defined(OPENSSL_64_BIT)
+  uint64_t a0 = in[0];
+  uint64_t a1 = in[1];
+  uint64_t b0 =
+      aes_nohw_uncompact_word((a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32));
+  uint64_t b1 =
+      aes_nohw_uncompact_word((a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32));
+  memcpy(out, &b0, 8);
+  memcpy(out + 8, &b1, 8);
+#else
+  uint32_t a0 = in[0];
+  uint32_t a1 = in[1];
+  uint32_t a2 = in[2];
+  uint32_t a3 = in[3];
+  // Note clang, when building for ARM Thumb2, will sometimes miscompile
+  // expressions such as (a0 & 0x0000ff00) << 8, particularly when building
+  // without optimizations. This bug was introduced in
+  // https://reviews.llvm.org/rL340261 and fixed in
+  // https://reviews.llvm.org/rL351310. The following is written to avoid this.
+  uint32_t b0 = aes_nohw_word_from_bytes(a0, a1, a2, a3);
+  uint32_t b1 = aes_nohw_word_from_bytes(a0 >> 8, a1 >> 8, a2 >> 8, a3 >> 8);
+  uint32_t b2 =
+      aes_nohw_word_from_bytes(a0 >> 16, a1 >> 16, a2 >> 16, a3 >> 16);
+  uint32_t b3 =
+      aes_nohw_word_from_bytes(a0 >> 24, a1 >> 24, a2 >> 24, a3 >> 24);
+  b0 = aes_nohw_uncompact_word(b0);
+  b1 = aes_nohw_uncompact_word(b1);
+  b2 = aes_nohw_uncompact_word(b2);
+  b3 = aes_nohw_uncompact_word(b3);
+  memcpy(out, &b0, 4);
+  memcpy(out + 4, &b1, 4);
+  memcpy(out + 8, &b2, 4);
+  memcpy(out + 12, &b3, 4);
+#endif
+}
+
+// aes_nohw_swap_bits is a variation on a delta swap. It swaps the bits in
+// |*a & (mask << shift)| with the bits in |*b & mask|. |mask| and
+// |mask << shift| must not overlap. |mask| is specified as a |uint32_t|, but it
+// is repeated to the full width of |aes_word_t|.
+#if defined(OPENSSL_SSE2)
+// This must be a macro because |_mm_srli_epi32| and |_mm_slli_epi32| require
+// constant shift values.
+#define aes_nohw_swap_bits(/*__m128i* */ a, /*__m128i* */ b,              \
+                           /* uint32_t */ mask, /* const */ shift)        \
+  do {                                                                    \
+    __m128i swap =                                                        \
+        _mm_and_si128(_mm_xor_si128(_mm_srli_epi32(*(a), (shift)), *(b)), \
+                      _mm_set_epi32((mask), (mask), (mask), (mask)));     \
+    *(a) = _mm_xor_si128(*(a), _mm_slli_epi32(swap, (shift)));            \
+    *(b) = _mm_xor_si128(*(b), swap);                                     \
+                                                                          \
+  } while (0)
+#else
+static inline void aes_nohw_swap_bits(aes_word_t *a, aes_word_t *b,
+                                      uint32_t mask, aes_word_t shift) {
+#if defined(OPENSSL_64_BIT)
+  aes_word_t mask_w = (((uint64_t)mask) << 32) | mask;
+#else
+  aes_word_t mask_w = mask;
+#endif
+  // This is a variation on a delta swap.
+  aes_word_t swap = ((*a >> shift) ^ *b) & mask_w;
+  *a ^= swap << shift;
+  *b ^= swap;
+}
+#endif  // OPENSSL_SSE2
+
+// aes_nohw_transpose converts |batch| to and from bitsliced form. It divides
+// the 8 × word_size bits into AES_NOHW_BATCH_SIZE × AES_NOHW_BATCH_SIZE squares
+// and transposes each square.
+static void aes_nohw_transpose(AES_NOHW_BATCH *batch) {
+  // Swap bits with index 0 and 1 mod 2 (0x55 = 0b01010101).
+  aes_nohw_swap_bits(&batch->w[0], &batch->w[1], 0x55555555, 1);
+  aes_nohw_swap_bits(&batch->w[2], &batch->w[3], 0x55555555, 1);
+  aes_nohw_swap_bits(&batch->w[4], &batch->w[5], 0x55555555, 1);
+  aes_nohw_swap_bits(&batch->w[6], &batch->w[7], 0x55555555, 1);
+
+#if AES_NOHW_BATCH_SIZE >= 4
+  // Swap bits with index 0-1 and 2-3 mod 4 (0x33 = 0b00110011).
+  aes_nohw_swap_bits(&batch->w[0], &batch->w[2], 0x33333333, 2);
+  aes_nohw_swap_bits(&batch->w[1], &batch->w[3], 0x33333333, 2);
+  aes_nohw_swap_bits(&batch->w[4], &batch->w[6], 0x33333333, 2);
+  aes_nohw_swap_bits(&batch->w[5], &batch->w[7], 0x33333333, 2);
+#endif
+
+#if AES_NOHW_BATCH_SIZE >= 8
+  // Swap bits with index 0-3 and 4-7 mod 8 (0x0f = 0b00001111).
+  aes_nohw_swap_bits(&batch->w[0], &batch->w[4], 0x0f0f0f0f, 4);
+  aes_nohw_swap_bits(&batch->w[1], &batch->w[5], 0x0f0f0f0f, 4);
+  aes_nohw_swap_bits(&batch->w[2], &batch->w[6], 0x0f0f0f0f, 4);
+  aes_nohw_swap_bits(&batch->w[3], &batch->w[7], 0x0f0f0f0f, 4);
+#endif
+}
+
+// aes_nohw_to_batch initializes |out| with the |num_blocks| blocks from |in|.
+// |num_blocks| must be at most |AES_NOHW_BATCH|.
+static void aes_nohw_to_batch(AES_NOHW_BATCH *out, const uint8_t *in,
+                              size_t num_blocks) {
+  // Don't leave unused blocks unitialized.
+  memset(out, 0, sizeof(AES_NOHW_BATCH));
+  assert(num_blocks <= AES_NOHW_BATCH_SIZE);
+  for (size_t i = 0; i < num_blocks; i++) {
+    aes_word_t block[AES_NOHW_BLOCK_WORDS];
+    aes_nohw_compact_block(block, in + 16 * i);
+    aes_nohw_batch_set(out, block, i);
+  }
+
+  aes_nohw_transpose(out);
+}
+
+// aes_nohw_to_batch writes the first |num_blocks| blocks in |batch| to |out|.
+// |num_blocks| must be at most |AES_NOHW_BATCH|.
+static void aes_nohw_from_batch(uint8_t *out, size_t num_blocks,
+                                const AES_NOHW_BATCH *batch) {
+  AES_NOHW_BATCH copy = *batch;
+  aes_nohw_transpose(&copy);
+
+  assert(num_blocks <= AES_NOHW_BATCH_SIZE);
+  for (size_t i = 0; i < num_blocks; i++) {
+    aes_word_t block[AES_NOHW_BLOCK_WORDS];
+    aes_nohw_batch_get(&copy, block, i);
+    aes_nohw_uncompact_block(out + 16 * i, block);
+  }
+}
+
+
+// AES round steps.
+
+static void aes_nohw_add_round_key(AES_NOHW_BATCH *batch,
+                                   const AES_NOHW_BATCH *key) {
+  for (size_t i = 0; i < 8; i++) {
+    batch->w[i] = aes_nohw_xor(batch->w[i], key->w[i]);
+  }
+}
+
+static void aes_nohw_sub_bytes(AES_NOHW_BATCH *batch) {
+  // See https://eprint.iacr.org/2009/191.pdf, Appendix C.
+  aes_word_t x0 = batch->w[7];
+  aes_word_t x1 = batch->w[6];
+  aes_word_t x2 = batch->w[5];
+  aes_word_t x3 = batch->w[4];
+  aes_word_t x4 = batch->w[3];
+  aes_word_t x5 = batch->w[2];
+  aes_word_t x6 = batch->w[1];
+  aes_word_t x7 = batch->w[0];
+
+  // Figure 2, the top linear transformation.
+  aes_word_t y14 = aes_nohw_xor(x3, x5);
+  aes_word_t y13 = aes_nohw_xor(x0, x6);
+  aes_word_t y9 = aes_nohw_xor(x0, x3);
+  aes_word_t y8 = aes_nohw_xor(x0, x5);
+  aes_word_t t0 = aes_nohw_xor(x1, x2);
+  aes_word_t y1 = aes_nohw_xor(t0, x7);
+  aes_word_t y4 = aes_nohw_xor(y1, x3);
+  aes_word_t y12 = aes_nohw_xor(y13, y14);
+  aes_word_t y2 = aes_nohw_xor(y1, x0);
+  aes_word_t y5 = aes_nohw_xor(y1, x6);
+  aes_word_t y3 = aes_nohw_xor(y5, y8);
+  aes_word_t t1 = aes_nohw_xor(x4, y12);
+  aes_word_t y15 = aes_nohw_xor(t1, x5);
+  aes_word_t y20 = aes_nohw_xor(t1, x1);
+  aes_word_t y6 = aes_nohw_xor(y15, x7);
+  aes_word_t y10 = aes_nohw_xor(y15, t0);
+  aes_word_t y11 = aes_nohw_xor(y20, y9);
+  aes_word_t y7 = aes_nohw_xor(x7, y11);
+  aes_word_t y17 = aes_nohw_xor(y10, y11);
+  aes_word_t y19 = aes_nohw_xor(y10, y8);
+  aes_word_t y16 = aes_nohw_xor(t0, y11);
+  aes_word_t y21 = aes_nohw_xor(y13, y16);
+  aes_word_t y18 = aes_nohw_xor(x0, y16);
+
+  // Figure 3, the middle non-linear section.
+  aes_word_t t2 = aes_nohw_and(y12, y15);
+  aes_word_t t3 = aes_nohw_and(y3, y6);
+  aes_word_t t4 = aes_nohw_xor(t3, t2);
+  aes_word_t t5 = aes_nohw_and(y4, x7);
+  aes_word_t t6 = aes_nohw_xor(t5, t2);
+  aes_word_t t7 = aes_nohw_and(y13, y16);
+  aes_word_t t8 = aes_nohw_and(y5, y1);
+  aes_word_t t9 = aes_nohw_xor(t8, t7);
+  aes_word_t t10 = aes_nohw_and(y2, y7);
+  aes_word_t t11 = aes_nohw_xor(t10, t7);
+  aes_word_t t12 = aes_nohw_and(y9, y11);
+  aes_word_t t13 = aes_nohw_and(y14, y17);
+  aes_word_t t14 = aes_nohw_xor(t13, t12);
+  aes_word_t t15 = aes_nohw_and(y8, y10);
+  aes_word_t t16 = aes_nohw_xor(t15, t12);
+  aes_word_t t17 = aes_nohw_xor(t4, t14);
+  aes_word_t t18 = aes_nohw_xor(t6, t16);
+  aes_word_t t19 = aes_nohw_xor(t9, t14);
+  aes_word_t t20 = aes_nohw_xor(t11, t16);
+  aes_word_t t21 = aes_nohw_xor(t17, y20);
+  aes_word_t t22 = aes_nohw_xor(t18, y19);
+  aes_word_t t23 = aes_nohw_xor(t19, y21);
+  aes_word_t t24 = aes_nohw_xor(t20, y18);
+  aes_word_t t25 = aes_nohw_xor(t21, t22);
+  aes_word_t t26 = aes_nohw_and(t21, t23);
+  aes_word_t t27 = aes_nohw_xor(t24, t26);
+  aes_word_t t28 = aes_nohw_and(t25, t27);
+  aes_word_t t29 = aes_nohw_xor(t28, t22);
+  aes_word_t t30 = aes_nohw_xor(t23, t24);
+  aes_word_t t31 = aes_nohw_xor(t22, t26);
+  aes_word_t t32 = aes_nohw_and(t31, t30);
+  aes_word_t t33 = aes_nohw_xor(t32, t24);
+  aes_word_t t34 = aes_nohw_xor(t23, t33);
+  aes_word_t t35 = aes_nohw_xor(t27, t33);
+  aes_word_t t36 = aes_nohw_and(t24, t35);
+  aes_word_t t37 = aes_nohw_xor(t36, t34);
+  aes_word_t t38 = aes_nohw_xor(t27, t36);
+  aes_word_t t39 = aes_nohw_and(t29, t38);
+  aes_word_t t40 = aes_nohw_xor(t25, t39);
+  aes_word_t t41 = aes_nohw_xor(t40, t37);
+  aes_word_t t42 = aes_nohw_xor(t29, t33);
+  aes_word_t t43 = aes_nohw_xor(t29, t40);
+  aes_word_t t44 = aes_nohw_xor(t33, t37);
+  aes_word_t t45 = aes_nohw_xor(t42, t41);
+  aes_word_t z0 = aes_nohw_and(t44, y15);
+  aes_word_t z1 = aes_nohw_and(t37, y6);
+  aes_word_t z2 = aes_nohw_and(t33, x7);
+  aes_word_t z3 = aes_nohw_and(t43, y16);
+  aes_word_t z4 = aes_nohw_and(t40, y1);
+  aes_word_t z5 = aes_nohw_and(t29, y7);
+  aes_word_t z6 = aes_nohw_and(t42, y11);
+  aes_word_t z7 = aes_nohw_and(t45, y17);
+  aes_word_t z8 = aes_nohw_and(t41, y10);
+  aes_word_t z9 = aes_nohw_and(t44, y12);
+  aes_word_t z10 = aes_nohw_and(t37, y3);
+  aes_word_t z11 = aes_nohw_and(t33, y4);
+  aes_word_t z12 = aes_nohw_and(t43, y13);
+  aes_word_t z13 = aes_nohw_and(t40, y5);
+  aes_word_t z14 = aes_nohw_and(t29, y2);
+  aes_word_t z15 = aes_nohw_and(t42, y9);
+  aes_word_t z16 = aes_nohw_and(t45, y14);
+  aes_word_t z17 = aes_nohw_and(t41, y8);
+
+  // Figure 4, bottom linear transformation.
+  aes_word_t t46 = aes_nohw_xor(z15, z16);
+  aes_word_t t47 = aes_nohw_xor(z10, z11);
+  aes_word_t t48 = aes_nohw_xor(z5, z13);
+  aes_word_t t49 = aes_nohw_xor(z9, z10);
+  aes_word_t t50 = aes_nohw_xor(z2, z12);
+  aes_word_t t51 = aes_nohw_xor(z2, z5);
+  aes_word_t t52 = aes_nohw_xor(z7, z8);
+  aes_word_t t53 = aes_nohw_xor(z0, z3);
+  aes_word_t t54 = aes_nohw_xor(z6, z7);
+  aes_word_t t55 = aes_nohw_xor(z16, z17);
+  aes_word_t t56 = aes_nohw_xor(z12, t48);
+  aes_word_t t57 = aes_nohw_xor(t50, t53);
+  aes_word_t t58 = aes_nohw_xor(z4, t46);
+  aes_word_t t59 = aes_nohw_xor(z3, t54);
+  aes_word_t t60 = aes_nohw_xor(t46, t57);
+  aes_word_t t61 = aes_nohw_xor(z14, t57);
+  aes_word_t t62 = aes_nohw_xor(t52, t58);
+  aes_word_t t63 = aes_nohw_xor(t49, t58);
+  aes_word_t t64 = aes_nohw_xor(z4, t59);
+  aes_word_t t65 = aes_nohw_xor(t61, t62);
+  aes_word_t t66 = aes_nohw_xor(z1, t63);
+  aes_word_t s0 = aes_nohw_xor(t59, t63);
+  aes_word_t s6 = aes_nohw_xor(t56, aes_nohw_not(t62));
+  aes_word_t s7 = aes_nohw_xor(t48, aes_nohw_not(t60));
+  aes_word_t t67 = aes_nohw_xor(t64, t65);
+  aes_word_t s3 = aes_nohw_xor(t53, t66);
+  aes_word_t s4 = aes_nohw_xor(t51, t66);
+  aes_word_t s5 = aes_nohw_xor(t47, t65);
+  aes_word_t s1 = aes_nohw_xor(t64, aes_nohw_not(s3));
+  aes_word_t s2 = aes_nohw_xor(t55, aes_nohw_not(t67));
+
+  batch->w[0] = s7;
+  batch->w[1] = s6;
+  batch->w[2] = s5;
+  batch->w[3] = s4;
+  batch->w[4] = s3;
+  batch->w[5] = s2;
+  batch->w[6] = s1;
+  batch->w[7] = s0;
+}
+
+// aes_nohw_sub_bytes_inv_affine inverts the affine transform portion of the AES
+// S-box, defined in FIPS PUB 197, section 5.1.1, step 2.
+static void aes_nohw_sub_bytes_inv_affine(AES_NOHW_BATCH *batch) {
+  aes_word_t a0 = batch->w[0];
+  aes_word_t a1 = batch->w[1];
+  aes_word_t a2 = batch->w[2];
+  aes_word_t a3 = batch->w[3];
+  aes_word_t a4 = batch->w[4];
+  aes_word_t a5 = batch->w[5];
+  aes_word_t a6 = batch->w[6];
+  aes_word_t a7 = batch->w[7];
+
+  // Apply the circulant [0 0 1 0 0 1 0 1]. This is the inverse of the circulant
+  // [1 0 0 0 1 1 1 1].
+  aes_word_t b0 = aes_nohw_xor(a2, aes_nohw_xor(a5, a7));
+  aes_word_t b1 = aes_nohw_xor(a3, aes_nohw_xor(a6, a0));
+  aes_word_t b2 = aes_nohw_xor(a4, aes_nohw_xor(a7, a1));
+  aes_word_t b3 = aes_nohw_xor(a5, aes_nohw_xor(a0, a2));
+  aes_word_t b4 = aes_nohw_xor(a6, aes_nohw_xor(a1, a3));
+  aes_word_t b5 = aes_nohw_xor(a7, aes_nohw_xor(a2, a4));
+  aes_word_t b6 = aes_nohw_xor(a0, aes_nohw_xor(a3, a5));
+  aes_word_t b7 = aes_nohw_xor(a1, aes_nohw_xor(a4, a6));
+
+  // XOR 0x05. Equivalently, we could XOR 0x63 before applying the circulant,
+  // but 0x05 has lower Hamming weight. (0x05 is the circulant applied to 0x63.)
+  batch->w[0] = aes_nohw_not(b0);
+  batch->w[1] = b1;
+  batch->w[2] = aes_nohw_not(b2);
+  batch->w[3] = b3;
+  batch->w[4] = b4;
+  batch->w[5] = b5;
+  batch->w[6] = b6;
+  batch->w[7] = b7;
+}
+
+static void aes_nohw_inv_sub_bytes(AES_NOHW_BATCH *batch) {
+  // We implement the inverse S-box using the forwards implementation with the
+  // technique described in https://www.bearssl.org/constanttime.html#aes.
+  //
+  // The forwards S-box inverts its input and applies an affine transformation:
+  // S(x) = A(Inv(x)). Thus Inv(x) = InvA(S(x)). The inverse S-box is then:
+  //
+  //   InvS(x) = Inv(InvA(x)).
+  //           = InvA(S(InvA(x)))
+  aes_nohw_sub_bytes_inv_affine(batch);
+  aes_nohw_sub_bytes(batch);
+  aes_nohw_sub_bytes_inv_affine(batch);
+}
+
+// aes_nohw_rotate_cols_right returns |v| with the columns in each row rotated
+// to the right by |n|. This is a macro because |aes_nohw_shift_*| require
+// constant shift counts in the SSE2 implementation.
+#define aes_nohw_rotate_cols_right(/* aes_word_t */ v, /* const */ n) \
+  (aes_nohw_or(aes_nohw_shift_right((v), (n)*4),                      \
+               aes_nohw_shift_left((v), 16 - (n)*4)))
+
+static void aes_nohw_shift_rows(AES_NOHW_BATCH *batch) {
+  for (size_t i = 0; i < 8; i++) {
+    aes_word_t row0 = aes_nohw_and(batch->w[i], AES_NOHW_ROW0_MASK);
+    aes_word_t row1 = aes_nohw_and(batch->w[i], AES_NOHW_ROW1_MASK);
+    aes_word_t row2 = aes_nohw_and(batch->w[i], AES_NOHW_ROW2_MASK);
+    aes_word_t row3 = aes_nohw_and(batch->w[i], AES_NOHW_ROW3_MASK);
+    row1 = aes_nohw_rotate_cols_right(row1, 1);
+    row2 = aes_nohw_rotate_cols_right(row2, 2);
+    row3 = aes_nohw_rotate_cols_right(row3, 3);
+    batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3));
+  }
+}
+
+static void aes_nohw_inv_shift_rows(AES_NOHW_BATCH *batch) {
+  for (size_t i = 0; i < 8; i++) {
+    aes_word_t row0 = aes_nohw_and(batch->w[i], AES_NOHW_ROW0_MASK);
+    aes_word_t row1 = aes_nohw_and(batch->w[i], AES_NOHW_ROW1_MASK);
+    aes_word_t row2 = aes_nohw_and(batch->w[i], AES_NOHW_ROW2_MASK);
+    aes_word_t row3 = aes_nohw_and(batch->w[i], AES_NOHW_ROW3_MASK);
+    row1 = aes_nohw_rotate_cols_right(row1, 3);
+    row2 = aes_nohw_rotate_cols_right(row2, 2);
+    row3 = aes_nohw_rotate_cols_right(row3, 1);
+    batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3));
+  }
+}
+
+// aes_nohw_rotate_rows_down returns |v| with the rows in each column rotated
+// down by one.
+static inline aes_word_t aes_nohw_rotate_rows_down(aes_word_t v) {
+#if defined(OPENSSL_SSE2)
+  return _mm_or_si128(_mm_srli_epi32(v, 8), _mm_slli_epi32(v, 24));
+#elif defined(OPENSSL_64_BIT)
+  return ((v >> 4) & UINT64_C(0x0fff0fff0fff0fff)) |
+         ((v << 12) & UINT64_C(0xf000f000f000f000));
+#else
+  return ((v >> 2) & 0x3f3f3f3f) | ((v << 6) & 0xc0c0c0c0);
+#endif
+}
+
+// aes_nohw_rotate_rows_twice returns |v| with the rows in each column rotated
+// by two.
+static inline aes_word_t aes_nohw_rotate_rows_twice(aes_word_t v) {
+#if defined(OPENSSL_SSE2)
+  return _mm_or_si128(_mm_srli_epi32(v, 16), _mm_slli_epi32(v, 16));
+#elif defined(OPENSSL_64_BIT)
+  return ((v >> 8) & UINT64_C(0x00ff00ff00ff00ff)) |
+         ((v << 8) & UINT64_C(0xff00ff00ff00ff00));
+#else
+  return ((v >> 4) & 0x0f0f0f0f) | ((v << 4) & 0xf0f0f0f0);
+#endif
+}
+
+static void aes_nohw_mix_columns(AES_NOHW_BATCH *batch) {
+  // See https://eprint.iacr.org/2009/129.pdf, section 4.4 and appendix A.
+  aes_word_t a0 = batch->w[0];
+  aes_word_t a1 = batch->w[1];
+  aes_word_t a2 = batch->w[2];
+  aes_word_t a3 = batch->w[3];
+  aes_word_t a4 = batch->w[4];
+  aes_word_t a5 = batch->w[5];
+  aes_word_t a6 = batch->w[6];
+  aes_word_t a7 = batch->w[7];
+
+  aes_word_t r0 = aes_nohw_rotate_rows_down(a0);
+  aes_word_t a0_r0 = aes_nohw_xor(a0, r0);
+  aes_word_t r1 = aes_nohw_rotate_rows_down(a1);
+  aes_word_t a1_r1 = aes_nohw_xor(a1, r1);
+  aes_word_t r2 = aes_nohw_rotate_rows_down(a2);
+  aes_word_t a2_r2 = aes_nohw_xor(a2, r2);
+  aes_word_t r3 = aes_nohw_rotate_rows_down(a3);
+  aes_word_t a3_r3 = aes_nohw_xor(a3, r3);
+  aes_word_t r4 = aes_nohw_rotate_rows_down(a4);
+  aes_word_t a4_r4 = aes_nohw_xor(a4, r4);
+  aes_word_t r5 = aes_nohw_rotate_rows_down(a5);
+  aes_word_t a5_r5 = aes_nohw_xor(a5, r5);
+  aes_word_t r6 = aes_nohw_rotate_rows_down(a6);
+  aes_word_t a6_r6 = aes_nohw_xor(a6, r6);
+  aes_word_t r7 = aes_nohw_rotate_rows_down(a7);
+  aes_word_t a7_r7 = aes_nohw_xor(a7, r7);
+
+  batch->w[0] =
+      aes_nohw_xor(aes_nohw_xor(a7_r7, r0), aes_nohw_rotate_rows_twice(a0_r0));
+  batch->w[1] =
+      aes_nohw_xor(aes_nohw_xor(a0_r0, a7_r7),
+                   aes_nohw_xor(r1, aes_nohw_rotate_rows_twice(a1_r1)));
+  batch->w[2] =
+      aes_nohw_xor(aes_nohw_xor(a1_r1, r2), aes_nohw_rotate_rows_twice(a2_r2));
+  batch->w[3] =
+      aes_nohw_xor(aes_nohw_xor(a2_r2, a7_r7),
+                   aes_nohw_xor(r3, aes_nohw_rotate_rows_twice(a3_r3)));
+  batch->w[4] =
+      aes_nohw_xor(aes_nohw_xor(a3_r3, a7_r7),
+                   aes_nohw_xor(r4, aes_nohw_rotate_rows_twice(a4_r4)));
+  batch->w[5] =
+      aes_nohw_xor(aes_nohw_xor(a4_r4, r5), aes_nohw_rotate_rows_twice(a5_r5));
+  batch->w[6] =
+      aes_nohw_xor(aes_nohw_xor(a5_r5, r6), aes_nohw_rotate_rows_twice(a6_r6));
+  batch->w[7] =
+      aes_nohw_xor(aes_nohw_xor(a6_r6, r7), aes_nohw_rotate_rows_twice(a7_r7));
+}
+
+static void aes_nohw_inv_mix_columns(AES_NOHW_BATCH *batch) {
+  aes_word_t a0 = batch->w[0];
+  aes_word_t a1 = batch->w[1];
+  aes_word_t a2 = batch->w[2];
+  aes_word_t a3 = batch->w[3];
+  aes_word_t a4 = batch->w[4];
+  aes_word_t a5 = batch->w[5];
+  aes_word_t a6 = batch->w[6];
+  aes_word_t a7 = batch->w[7];
+
+  // bsaes-x86_64.pl describes the following decomposition of the inverse
+  // MixColumns matrix, credited to Jussi Kivilinna. This gives a much simpler
+  // multiplication.
+  //
+  // | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
+  // | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
+  // | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
+  // | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
+  //
+  // First, apply the [5 0 4 0] matrix. Multiplying by 4 in F_(2^8) is described
+  // by the following bit equations:
+  //
+  //   b0 = a6
+  //   b1 = a6 ^ a7
+  //   b2 = a0 ^ a7
+  //   b3 = a1 ^ a6
+  //   b4 = a2 ^ a6 ^ a7
+  //   b5 = a3 ^ a7
+  //   b6 = a4
+  //   b7 = a5
+  //
+  // Each coefficient is given by:
+  //
+  //   b_ij = 05·a_ij ⊕ 04·a_i(j+2) = 04·(a_ij ⊕ a_i(j+2)) ⊕ a_ij
+  //
+  // We combine the two equations below. Note a_i(j+2) is a row rotation.
+  aes_word_t a0_r0 = aes_nohw_xor(a0, aes_nohw_rotate_rows_twice(a0));
+  aes_word_t a1_r1 = aes_nohw_xor(a1, aes_nohw_rotate_rows_twice(a1));
+  aes_word_t a2_r2 = aes_nohw_xor(a2, aes_nohw_rotate_rows_twice(a2));
+  aes_word_t a3_r3 = aes_nohw_xor(a3, aes_nohw_rotate_rows_twice(a3));
+  aes_word_t a4_r4 = aes_nohw_xor(a4, aes_nohw_rotate_rows_twice(a4));
+  aes_word_t a5_r5 = aes_nohw_xor(a5, aes_nohw_rotate_rows_twice(a5));
+  aes_word_t a6_r6 = aes_nohw_xor(a6, aes_nohw_rotate_rows_twice(a6));
+  aes_word_t a7_r7 = aes_nohw_xor(a7, aes_nohw_rotate_rows_twice(a7));
+
+  batch->w[0] = aes_nohw_xor(a0, a6_r6);
+  batch->w[1] = aes_nohw_xor(a1, aes_nohw_xor(a6_r6, a7_r7));
+  batch->w[2] = aes_nohw_xor(a2, aes_nohw_xor(a0_r0, a7_r7));
+  batch->w[3] = aes_nohw_xor(a3, aes_nohw_xor(a1_r1, a6_r6));
+  batch->w[4] =
+      aes_nohw_xor(aes_nohw_xor(a4, a2_r2), aes_nohw_xor(a6_r6, a7_r7));
+  batch->w[5] = aes_nohw_xor(a5, aes_nohw_xor(a3_r3, a7_r7));
+  batch->w[6] = aes_nohw_xor(a6, a4_r4);
+  batch->w[7] = aes_nohw_xor(a7, a5_r5);
+
+  // Apply the [02 03 01 01] matrix, which is just MixColumns.
+  aes_nohw_mix_columns(batch);
+}
+
+static void aes_nohw_encrypt_batch(const AES_NOHW_SCHEDULE *key,
+                                   size_t num_rounds, AES_NOHW_BATCH *batch) {
+  aes_nohw_add_round_key(batch, &key->keys[0]);
+  for (size_t i = 1; i < num_rounds; i++) {
+    aes_nohw_sub_bytes(batch);
+    aes_nohw_shift_rows(batch);
+    aes_nohw_mix_columns(batch);
+    aes_nohw_add_round_key(batch, &key->keys[i]);
+  }
+  aes_nohw_sub_bytes(batch);
+  aes_nohw_shift_rows(batch);
+  aes_nohw_add_round_key(batch, &key->keys[num_rounds]);
+}
+
+static void aes_nohw_decrypt_batch(const AES_NOHW_SCHEDULE *key,
+                                   size_t num_rounds, AES_NOHW_BATCH *batch) {
+  aes_nohw_add_round_key(batch, &key->keys[num_rounds]);
+  aes_nohw_inv_shift_rows(batch);
+  aes_nohw_inv_sub_bytes(batch);
+  for (size_t i = num_rounds - 1; i > 0; i--) {
+    aes_nohw_add_round_key(batch, &key->keys[i]);
+    aes_nohw_inv_mix_columns(batch);
+    aes_nohw_inv_shift_rows(batch);
+    aes_nohw_inv_sub_bytes(batch);
+  }
+  aes_nohw_add_round_key(batch, &key->keys[0]);
+}
+
+
+// Key schedule.
+
+static void aes_nohw_expand_round_keys(AES_NOHW_SCHEDULE *out,
+                                       const AES_KEY *key) {
+  for (size_t i = 0; i <= key->rounds; i++) {
+    // Copy the round key into each block in the batch.
+    for (size_t j = 0; j < AES_NOHW_BATCH_SIZE; j++) {
+      aes_word_t tmp[AES_NOHW_BLOCK_WORDS];
+      memcpy(tmp, key->rd_key + 4 * i, 16);
+      aes_nohw_batch_set(&out->keys[i], tmp, j);
+    }
+    aes_nohw_transpose(&out->keys[i]);
+  }
+}
+
+static const uint8_t aes_nohw_rcon[10] = {0x01, 0x02, 0x04, 0x08, 0x10,
+                                          0x20, 0x40, 0x80, 0x1b, 0x36};
+
+// aes_nohw_rcon_slice returns the |i|th group of |AES_NOHW_BATCH_SIZE| bits in
+// |rcon|, stored in a |aes_word_t|.
+static inline aes_word_t aes_nohw_rcon_slice(uint8_t rcon, size_t i) {
+  rcon = (rcon >> (i * AES_NOHW_BATCH_SIZE)) & ((1 << AES_NOHW_BATCH_SIZE) - 1);
+#if defined(OPENSSL_SSE2)
+  return _mm_set_epi32(0, 0, 0, rcon);
+#else
+  return ((aes_word_t)rcon);
+#endif
+}
+
+static void aes_nohw_sub_block(aes_word_t out[AES_NOHW_BLOCK_WORDS],
+                               const aes_word_t in[AES_NOHW_BLOCK_WORDS]) {
+  AES_NOHW_BATCH batch;
+  memset(&batch, 0, sizeof(batch));
+  aes_nohw_batch_set(&batch, in, 0);
+  aes_nohw_transpose(&batch);
+  aes_nohw_sub_bytes(&batch);
+  aes_nohw_transpose(&batch);
+  aes_nohw_batch_get(&batch, out, 0);
+}
+
+static void aes_nohw_setup_key_128(AES_KEY *key, const uint8_t in[16]) {
+  key->rounds = 10;
+
+  aes_word_t block[AES_NOHW_BLOCK_WORDS];
+  aes_nohw_compact_block(block, in);
+  memcpy(key->rd_key, block, 16);
+
+  for (size_t i = 1; i <= 10; i++) {
+    aes_word_t sub[AES_NOHW_BLOCK_WORDS];
+    aes_nohw_sub_block(sub, block);
+    uint8_t rcon = aes_nohw_rcon[i - 1];
+    for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
+      // Incorporate |rcon| and the transformed word into the first word.
+      block[j] = aes_nohw_xor(block[j], aes_nohw_rcon_slice(rcon, j));
+      block[j] = aes_nohw_xor(
+          block[j],
+          aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12));
+      // Propagate to the remaining words. Note this is reordered from the usual
+      // formulation to avoid needing masks.
+      aes_word_t v = block[j];
+      block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 4));
+      block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 8));
+      block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 12));
+    }
+    memcpy(key->rd_key + 4 * i, block, 16);
+  }
+}
+
+static void aes_nohw_setup_key_192(AES_KEY *key, const uint8_t in[24]) {
+  key->rounds = 12;
+
+  aes_word_t storage1[AES_NOHW_BLOCK_WORDS], storage2[AES_NOHW_BLOCK_WORDS];
+  aes_word_t *block1 = storage1, *block2 = storage2;
+
+  // AES-192's key schedule is complex because each key schedule iteration
+  // produces six words, but we compute on blocks and each block is four words.
+  // We maintain a sliding window of two blocks, filled to 1.5 blocks at a time.
+  // We loop below every three blocks or two key schedule iterations.
+  //
+  // On entry to the loop, |block1| and the first half of |block2| contain the
+  // previous key schedule iteration. |block1| has been written to |key|, but
+  // |block2| has not as it is incomplete.
+  aes_nohw_compact_block(block1, in);
+  memcpy(key->rd_key, block1, 16);
+
+  uint8_t half_block[16] = {0};
+  memcpy(half_block, in + 16, 8);
+  aes_nohw_compact_block(block2, half_block);
+
+  for (size_t i = 0; i < 4; i++) {
+    aes_word_t sub[AES_NOHW_BLOCK_WORDS];
+    aes_nohw_sub_block(sub, block2);
+    uint8_t rcon = aes_nohw_rcon[2 * i];
+    for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
+      // Compute the first two words of the next key schedule iteration, which
+      // go in the second half of |block2|. The first two words of the previous
+      // iteration are in the first half of |block1|. Apply |rcon| here too
+      // because the shifts match.
+      block2[j] = aes_nohw_or(
+          block2[j],
+          aes_nohw_shift_left(
+              aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j)), 8));
+      // Incorporate the transformed word and propagate. Note the last word of
+      // the previous iteration corresponds to the second word of |copy|. This
+      // is incorporated into the first word of the next iteration, or the third
+      // word of |block2|.
+      block2[j] = aes_nohw_xor(
+          block2[j], aes_nohw_and(aes_nohw_shift_left(
+                                      aes_nohw_rotate_rows_down(sub[j]), 4),
+                                  AES_NOHW_COL2_MASK));
+      block2[j] = aes_nohw_xor(
+          block2[j],
+          aes_nohw_and(aes_nohw_shift_left(block2[j], 4), AES_NOHW_COL3_MASK));
+
+      // Compute the remaining four words, which fill |block1|. Begin by moving
+      // the corresponding words of the previous iteration: the second half of
+      // |block1| and the first half of |block2|.
+      block1[j] = aes_nohw_shift_right(block1[j], 8);
+      block1[j] = aes_nohw_or(block1[j], aes_nohw_shift_left(block2[j], 8));
+      // Incorporate the second word, computed previously in |block2|, and
+      // propagate.
+      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_right(block2[j], 12));
+      aes_word_t v = block1[j];
+      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4));
+      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8));
+      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12));
+    }
+
+    // This completes two round keys. Note half of |block2| was computed in the
+    // previous loop iteration but was not yet output.
+    memcpy(key->rd_key + 4 * (3 * i + 1), block2, 16);
+    memcpy(key->rd_key + 4 * (3 * i + 2), block1, 16);
+
+    aes_nohw_sub_block(sub, block1);
+    rcon = aes_nohw_rcon[2 * i + 1];
+    for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
+      // Compute the first four words of the next key schedule iteration in
+      // |block2|. Begin by moving the corresponding words of the previous
+      // iteration: the second half of |block2| and the first half of |block1|.
+      block2[j] = aes_nohw_shift_right(block2[j], 8);
+      block2[j] = aes_nohw_or(block2[j], aes_nohw_shift_left(block1[j], 8));
+      // Incorporate rcon and the transformed word. Note the last word of the
+      // previous iteration corresponds to the last word of |copy|.
+      block2[j] = aes_nohw_xor(block2[j], aes_nohw_rcon_slice(rcon, j));
+      block2[j] = aes_nohw_xor(
+          block2[j],
+          aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12));
+      // Propagate to the remaining words.
+      aes_word_t v = block2[j];
+      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4));
+      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8));
+      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12));
+
+      // Compute the last two words, which go in the first half of |block1|. The
+      // last two words of the previous iteration are in the second half of
+      // |block1|.
+      block1[j] = aes_nohw_shift_right(block1[j], 8);
+      // Propagate blocks and mask off the excess.
+      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_right(block2[j], 12));
+      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(block1[j], 4));
+      block1[j] = aes_nohw_and(block1[j], AES_NOHW_COL01_MASK);
+    }
+
+    // |block2| has a complete round key. |block1| will be completed in the next
+    // iteration.
+    memcpy(key->rd_key + 4 * (3 * i + 3), block2, 16);
+
+    // Swap blocks to restore the invariant.
+    aes_word_t *tmp = block1;
+    block1 = block2;
+    block2 = tmp;
+  }
+}
+
+static void aes_nohw_setup_key_256(AES_KEY *key, const uint8_t in[32]) {
+  key->rounds = 14;
+
+  // Each key schedule iteration produces two round keys.
+  aes_word_t block1[AES_NOHW_BLOCK_WORDS], block2[AES_NOHW_BLOCK_WORDS];
+  aes_nohw_compact_block(block1, in);
+  memcpy(key->rd_key, block1, 16);
+
+  aes_nohw_compact_block(block2, in + 16);
+  memcpy(key->rd_key + 4, block2, 16);
+
+  for (size_t i = 2; i <= 14; i += 2) {
+    aes_word_t sub[AES_NOHW_BLOCK_WORDS];
+    aes_nohw_sub_block(sub, block2);
+    uint8_t rcon = aes_nohw_rcon[i / 2 - 1];
+    for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
+      // Incorporate |rcon| and the transformed word into the first word.
+      block1[j] = aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j));
+      block1[j] = aes_nohw_xor(
+          block1[j],
+          aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12));
+      // Propagate to the remaining words.
+      aes_word_t v = block1[j];
+      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4));
+      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8));
+      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12));
+    }
+    memcpy(key->rd_key + 4 * i, block1, 16);
+
+    if (i == 14) {
+      break;
+    }
+
+    aes_nohw_sub_block(sub, block1);
+    for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
+      // Incorporate the transformed word into the first word.
+      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_right(sub[j], 12));
+      // Propagate to the remaining words.
+      aes_word_t v = block2[j];
+      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4));
+      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8));
+      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12));
+    }
+    memcpy(key->rd_key + 4 * (i + 1), block2, 16);
+  }
+}
+
+
+// External API.
+
+int aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits,
+                             AES_KEY *aeskey) {
+  switch (bits) {
+    case 128:
+      aes_nohw_setup_key_128(aeskey, key);
+      return 0;
+    case 192:
+      aes_nohw_setup_key_192(aeskey, key);
+      return 0;
+    case 256:
+      aes_nohw_setup_key_256(aeskey, key);
+      return 0;
+  }
+  return 1;
+}
+
+int aes_nohw_set_decrypt_key(const uint8_t *key, unsigned bits,
+                             AES_KEY *aeskey) {
+  return aes_nohw_set_encrypt_key(key, bits, aeskey);
+}
+
+void aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) {
+  AES_NOHW_SCHEDULE sched;
+  aes_nohw_expand_round_keys(&sched, key);
+  AES_NOHW_BATCH batch;
+  aes_nohw_to_batch(&batch, in, /*num_blocks=*/1);
+  aes_nohw_encrypt_batch(&sched, key->rounds, &batch);
+  aes_nohw_from_batch(out, /*num_blocks=*/1, &batch);
+}
+
+void aes_nohw_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) {
+  AES_NOHW_SCHEDULE sched;
+  aes_nohw_expand_round_keys(&sched, key);
+  AES_NOHW_BATCH batch;
+  aes_nohw_to_batch(&batch, in, /*num_blocks=*/1);
+  aes_nohw_decrypt_batch(&sched, key->rounds, &batch);
+  aes_nohw_from_batch(out, /*num_blocks=*/1, &batch);
+}
+
+static inline void aes_nohw_xor_block(uint8_t out[16], const uint8_t a[16],
+                                      const uint8_t b[16]) {
+  for (size_t i = 0; i < 16; i += sizeof(aes_word_t)) {
+    aes_word_t x, y;
+    memcpy(&x, a + i, sizeof(aes_word_t));
+    memcpy(&y, b + i, sizeof(aes_word_t));
+    x = aes_nohw_xor(x, y);
+    memcpy(out + i, &x, sizeof(aes_word_t));
+  }
+}
+
+void aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
+                                   size_t blocks, const AES_KEY *key,
+                                   const uint8_t ivec[16]) {
+  if (blocks == 0) {
+    return;
+  }
+
+  AES_NOHW_SCHEDULE sched;
+  aes_nohw_expand_round_keys(&sched, key);
+
+  // Make |AES_NOHW_BATCH_SIZE| copies of |ivec|.
+  alignas(AES_NOHW_WORD_SIZE) union {
+    uint32_t u32[AES_NOHW_BATCH_SIZE * 4];
+    uint8_t u8[AES_NOHW_BATCH_SIZE * 16];
+  } ivs, enc_ivs;
+  for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) {
+    memcpy(ivs.u8 + 16 * i, ivec, 16);
+  }
+
+  uint32_t ctr = CRYPTO_bswap4(ivs.u32[3]);
+  for (;;) {
+    // Update counters.
+    for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) {
+      ivs.u32[4 * i + 3] = CRYPTO_bswap4(ctr + i);
+    }
+
+    size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks;
+    AES_NOHW_BATCH batch;
+    aes_nohw_to_batch(&batch, ivs.u8, todo);
+    aes_nohw_encrypt_batch(&sched, key->rounds, &batch);
+    aes_nohw_from_batch(enc_ivs.u8, todo, &batch);
+
+    for (size_t i = 0; i < todo; i++) {
+      aes_nohw_xor_block(out + 16 * i, in + 16 * i, enc_ivs.u8 + 16 * i);
+    }
+
+    blocks -= todo;
+    if (blocks == 0) {
+      break;
+    }
+
+    in += 16 * AES_NOHW_BATCH_SIZE;
+    out += 16 * AES_NOHW_BATCH_SIZE;
+    ctr += AES_NOHW_BATCH_SIZE;
+  }
+}
+
+void aes_nohw_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len,
+                          const AES_KEY *key, uint8_t *ivec, const int enc) {
+  assert(len % 16 == 0);
+  size_t blocks = len / 16;
+  if (blocks == 0) {
+    return;
+  }
+
+  AES_NOHW_SCHEDULE sched;
+  aes_nohw_expand_round_keys(&sched, key);
+  alignas(AES_NOHW_WORD_SIZE) uint8_t iv[16];
+  memcpy(iv, ivec, 16);
+
+  if (enc) {
+    // CBC encryption is not parallelizable.
+    while (blocks > 0) {
+      aes_nohw_xor_block(iv, iv, in);
+
+      AES_NOHW_BATCH batch;
+      aes_nohw_to_batch(&batch, iv, /*num_blocks=*/1);
+      aes_nohw_encrypt_batch(&sched, key->rounds, &batch);
+      aes_nohw_from_batch(out, /*num_blocks=*/1, &batch);
+
+      memcpy(iv, out, 16);
+
+      in += 16;
+      out += 16;
+      blocks--;
+    }
+    memcpy(ivec, iv, 16);
+    return;
+  }
+
+  for (;;) {
+    size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks;
+    // Make a copy of the input so we can decrypt in-place.
+    alignas(AES_NOHW_WORD_SIZE) uint8_t copy[AES_NOHW_BATCH_SIZE * 16];
+    memcpy(copy, in, todo * 16);
+
+    AES_NOHW_BATCH batch;
+    aes_nohw_to_batch(&batch, in, todo);
+    aes_nohw_decrypt_batch(&sched, key->rounds, &batch);
+    aes_nohw_from_batch(out, todo, &batch);
+
+    aes_nohw_xor_block(out, out, iv);
+    for (size_t i = 1; i < todo; i++) {
+      aes_nohw_xor_block(out + 16 * i, out + 16 * i, copy + 16 * (i - 1));
+    }
+
+    // Save the last block as the IV.
+    memcpy(iv, copy + 16 * (todo - 1), 16);
+
+    blocks -= todo;
+    if (blocks == 0) {
+      break;
+    }
+
+    in += 16 * AES_NOHW_BATCH_SIZE;
+    out += 16 * AES_NOHW_BATCH_SIZE;
+  }
+
+  memcpy(ivec, iv, 16);
+}
diff --git a/crypto/fipsmodule/aes/aes_test.cc b/crypto/fipsmodule/aes/aes_test.cc
index 2e2911c..406e949 100644
--- a/crypto/fipsmodule/aes/aes_test.cc
+++ b/crypto/fipsmodule/aes/aes_test.cc
@@ -288,26 +288,6 @@
       block_counts = {0, 1, 8};
     }
 
-    CHECK_ABI(aes_nohw_set_encrypt_key, kKey, bits, &key);
-    CHECK_ABI(aes_nohw_encrypt, block, block, &key);
-#if defined(AES_NOHW_CBC)
-    for (size_t blocks : block_counts) {
-      SCOPED_TRACE(blocks);
-      CHECK_ABI(aes_nohw_cbc_encrypt, buf, buf, AES_BLOCK_SIZE * blocks, &key,
-                block, AES_ENCRYPT);
-    }
-#endif
-
-    CHECK_ABI(aes_nohw_set_decrypt_key, kKey, bits, &key);
-    CHECK_ABI(aes_nohw_decrypt, block, block, &key);
-#if defined(AES_NOHW_CBC)
-    for (size_t blocks : block_counts) {
-      SCOPED_TRACE(blocks);
-      CHECK_ABI(aes_nohw_cbc_encrypt, buf, buf, AES_BLOCK_SIZE * blocks, &key,
-                block, AES_DECRYPT);
-    }
-#endif
-
     if (bsaes_capable()) {
       vpaes_set_encrypt_key(kKey, bits, &key);
       CHECK_ABI(vpaes_encrypt_key_to_bsaes, &key, &key);
diff --git a/crypto/fipsmodule/aes/asm/aes-586.pl b/crypto/fipsmodule/aes/asm/aes-586.pl
deleted file mode 100755
index 9b373de..0000000
--- a/crypto/fipsmodule/aes/asm/aes-586.pl
+++ /dev/null
@@ -1,3000 +0,0 @@
-#! /usr/bin/env perl
-# Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
-#
-# Licensed under the OpenSSL license (the "License").  You may not use
-# this file except in compliance with the License.  You can obtain a copy
-# in the file LICENSE in the source distribution or at
-# https://www.openssl.org/source/license.html
-
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# Version 4.3.
-#
-# You might fail to appreciate this module performance from the first
-# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
-# to be *the* best Intel C compiler without -KPIC, performance appears
-# to be virtually identical... But try to re-configure with shared
-# library support... Aha! Intel compiler "suddenly" lags behind by 30%
-# [on P4, more on others]:-) And if compared to position-independent
-# code generated by GNU C, this code performs *more* than *twice* as
-# fast! Yes, all this buzz about PIC means that unlike other hand-
-# coded implementations, this one was explicitly designed to be safe
-# to use even in shared library context... This also means that this
-# code isn't necessarily absolutely fastest "ever," because in order
-# to achieve position independence an extra register has to be
-# off-loaded to stack, which affects the benchmark result.
-#
-# Special note about instruction choice. Do you recall RC4_INT code
-# performing poorly on P4? It might be the time to figure out why.
-# RC4_INT code implies effective address calculations in base+offset*4
-# form. Trouble is that it seems that offset scaling turned to be
-# critical path... At least eliminating scaling resulted in 2.8x RC4
-# performance improvement [as you might recall]. As AES code is hungry
-# for scaling too, I [try to] avoid the latter by favoring off-by-2
-# shifts and masking the result with 0xFF<<2 instead of "boring" 0xFF.
-#
-# As was shown by Dean Gaudet, the above note turned out to be
-# void. Performance improvement with off-by-2 shifts was observed on
-# intermediate implementation, which was spilling yet another register
-# to stack... Final offset*4 code below runs just a tad faster on P4,
-# but exhibits up to 10% improvement on other cores.
-#
-# Second version is "monolithic" replacement for aes_core.c, which in
-# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
-# This made it possible to implement little-endian variant of the
-# algorithm without modifying the base C code. Motivating factor for
-# the undertaken effort was that it appeared that in tight IA-32
-# register window little-endian flavor could achieve slightly higher
-# Instruction Level Parallelism, and it indeed resulted in up to 15%
-# better performance on most recent µ-archs...
-#
-# Third version adds AES_cbc_encrypt implementation, which resulted in
-# up to 40% performance improvement of CBC benchmark results. 40% was
-# observed on P4 core, where "overall" improvement coefficient, i.e. if
-# compared to PIC generated by GCC and in CBC mode, was observed to be
-# as large as 4x:-) CBC performance is virtually identical to ECB now
-# and on some platforms even better, e.g. 17.6 "small" cycles/byte on
-# Opteron, because certain function prologues and epilogues are
-# effectively taken out of the loop...
-#
-# Version 3.2 implements compressed tables and prefetch of these tables
-# in CBC[!] mode. Former means that 3/4 of table references are now
-# misaligned, which unfortunately has negative impact on elder IA-32
-# implementations, Pentium suffered 30% penalty, PIII - 10%.
-#
-# Version 3.3 avoids L1 cache aliasing between stack frame and
-# S-boxes, and 3.4 - L1 cache aliasing even between key schedule. The
-# latter is achieved by copying the key schedule to controlled place in
-# stack. This unfortunately has rather strong impact on small block CBC
-# performance, ~2x deterioration on 16-byte block if compared to 3.3.
-#
-# Version 3.5 checks if there is L1 cache aliasing between user-supplied
-# key schedule and S-boxes and abstains from copying the former if
-# there is no. This allows end-user to consciously retain small block
-# performance by aligning key schedule in specific manner.
-#
-# Version 3.6 compresses Td4 to 256 bytes and prefetches it in ECB.
-#
-# Current ECB performance numbers for 128-bit key in CPU cycles per
-# processed byte [measure commonly used by AES benchmarkers] are:
-#
-#		small footprint		fully unrolled
-# P4		24			22
-# AMD K8	20			19
-# PIII		25			23
-# Pentium	81			78
-#
-# Version 3.7 reimplements outer rounds as "compact." Meaning that
-# first and last rounds reference compact 256 bytes S-box. This means
-# that first round consumes a lot more CPU cycles and that encrypt
-# and decrypt performance becomes asymmetric. Encrypt performance
-# drops by 10-12%, while decrypt - by 20-25%:-( 256 bytes S-box is
-# aggressively pre-fetched.
-#
-# Version 4.0 effectively rolls back to 3.6 and instead implements
-# additional set of functions, _[x86|sse]_AES_[en|de]crypt_compact,
-# which use exclusively 256 byte S-box. These functions are to be
-# called in modes not concealing plain text, such as ECB, or when
-# we're asked to process smaller amount of data [or unconditionally
-# on hyper-threading CPU]. Currently it's called unconditionally from
-# AES_[en|de]crypt, which affects all modes, but CBC. CBC routine
-# still needs to be modified to switch between slower and faster
-# mode when appropriate... But in either case benchmark landscape
-# changes dramatically and below numbers are CPU cycles per processed
-# byte for 128-bit key.
-#
-#		ECB encrypt	ECB decrypt	CBC large chunk
-# P4		52[54]		83[95]		23
-# AMD K8	46[41]		66[70]		18
-# PIII		41[50]		60[77]		24
-# Core 2	31[36]		45[64]		18.5
-# Atom		76[100]		96[138]		60
-# Pentium	115		150		77
-#
-# Version 4.1 switches to compact S-box even in key schedule setup.
-#
-# Version 4.2 prefetches compact S-box in every SSE round or in other
-# words every cache-line is *guaranteed* to be accessed within ~50
-# cycles window. Why just SSE? Because it's needed on hyper-threading
-# CPU! Which is also why it's prefetched with 64 byte stride. Best
-# part is that it has no negative effect on performance:-)
-#
-# Version 4.3 implements switch between compact and non-compact block
-# functions in AES_cbc_encrypt depending on how much data was asked
-# to be processed in one stroke.
-#
-######################################################################
-# Timing attacks are classified in two classes: synchronous when
-# attacker consciously initiates cryptographic operation and collects
-# timing data of various character afterwards, and asynchronous when
-# malicious code is executed on same CPU simultaneously with AES,
-# instruments itself and performs statistical analysis of this data.
-#
-# As far as synchronous attacks go the root to the AES timing
-# vulnerability is twofold. Firstly, of 256 S-box elements at most 160
-# are referred to in single 128-bit block operation. Well, in C
-# implementation with 4 distinct tables it's actually as little as 40
-# references per 256 elements table, but anyway... Secondly, even
-# though S-box elements are clustered into smaller amount of cache-
-# lines, smaller than 160 and even 40, it turned out that for certain
-# plain-text pattern[s] or simply put chosen plain-text and given key
-# few cache-lines remain unaccessed during block operation. Now, if
-# attacker can figure out this access pattern, he can deduct the key
-# [or at least part of it]. The natural way to mitigate this kind of
-# attacks is to minimize the amount of cache-lines in S-box and/or
-# prefetch them to ensure that every one is accessed for more uniform
-# timing. But note that *if* plain-text was concealed in such way that
-# input to block function is distributed *uniformly*, then attack
-# wouldn't apply. Now note that some encryption modes, most notably
-# CBC, do mask the plain-text in this exact way [secure cipher output
-# is distributed uniformly]. Yes, one still might find input that
-# would reveal the information about given key, but if amount of
-# candidate inputs to be tried is larger than amount of possible key
-# combinations then attack becomes infeasible. This is why revised
-# AES_cbc_encrypt "dares" to switch to larger S-box when larger chunk
-# of data is to be processed in one stroke. The current size limit of
-# 512 bytes is chosen to provide same [diminishingly low] probability
-# for cache-line to remain untouched in large chunk operation with
-# large S-box as for single block operation with compact S-box and
-# surely needs more careful consideration...
-#
-# As for asynchronous attacks. There are two flavours: attacker code
-# being interleaved with AES on hyper-threading CPU at *instruction*
-# level, and two processes time sharing single core. As for latter.
-# Two vectors. 1. Given that attacker process has higher priority,
-# yield execution to process performing AES just before timer fires
-# off the scheduler, immediately regain control of CPU and analyze the
-# cache state. For this attack to be efficient attacker would have to
-# effectively slow down the operation by several *orders* of magnitude,
-# by ratio of time slice to duration of handful of AES rounds, which
-# unlikely to remain unnoticed. Not to mention that this also means
-# that he would spend correspondingly more time to collect enough
-# statistical data to mount the attack. It's probably appropriate to
-# say that if adversary reckons that this attack is beneficial and
-# risks to be noticed, you probably have larger problems having him
-# mere opportunity. In other words suggested code design expects you
-# to preclude/mitigate this attack by overall system security design.
-# 2. Attacker manages to make his code interrupt driven. In order for
-# this kind of attack to be feasible, interrupt rate has to be high
-# enough, again comparable to duration of handful of AES rounds. But
-# is there interrupt source of such rate? Hardly, not even 1Gbps NIC
-# generates interrupts at such raging rate...
-#
-# And now back to the former, hyper-threading CPU or more specifically
-# Intel P4. Recall that asynchronous attack implies that malicious
-# code instruments itself. And naturally instrumentation granularity
-# has be noticeably lower than duration of codepath accessing S-box.
-# Given that all cache-lines are accessed during that time that is.
-# Current implementation accesses *all* cache-lines within ~50 cycles
-# window, which is actually *less* than RDTSC latency on Intel P4!
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-push(@INC,"${dir}","${dir}../../../perlasm");
-require "x86asm.pl";
-
-$output = pop;
-open OUT,">$output";
-*STDOUT=*OUT;
-
-&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");
-&static_label("AES_Te");
-&static_label("AES_Td");
-
-$s0="eax";
-$s1="ebx";
-$s2="ecx";
-$s3="edx";
-$key="edi";
-$acc="esi";
-$tbl="ebp";
-
-# stack frame layout in _[x86|sse]_AES_* routines, frame is allocated
-# by caller
-$__ra=&DWP(0,"esp");	# return address
-$__s0=&DWP(4,"esp");	# s0 backing store
-$__s1=&DWP(8,"esp");	# s1 backing store
-$__s2=&DWP(12,"esp");	# s2 backing store
-$__s3=&DWP(16,"esp");	# s3 backing store
-$__key=&DWP(20,"esp");	# pointer to key schedule
-$__end=&DWP(24,"esp");	# pointer to end of key schedule
-$__tbl=&DWP(28,"esp");	# %ebp backing store
-
-# stack frame layout in AES_[en|crypt] routines, which differs from
-# above by 4 and overlaps by %ebp backing store
-$_tbl=&DWP(24,"esp");
-$_esp=&DWP(28,"esp");
-
-sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
-
-$speed_limit=512;	# chunks smaller than $speed_limit are
-			# processed with compact routine in CBC mode
-$small_footprint=1;	# $small_footprint=1 code is ~5% slower [on
-			# recent µ-archs], but ~5 times smaller!
-			# I favor compact code to minimize cache
-			# contention and in hope to "collect" 5% back
-			# in real-life applications...
-
-$vertical_spin=0;	# shift "vertically" defaults to 0, because of
-			# its proof-of-concept status...
-# Note that there is no decvert(), as well as last encryption round is
-# performed with "horizontal" shifts. This is because this "vertical"
-# implementation [one which groups shifts on a given $s[i] to form a
-# "column," unlike "horizontal" one, which groups shifts on different
-# $s[i] to form a "row"] is work in progress. It was observed to run
-# few percents faster on Intel cores, but not AMD. On AMD K8 core it's
-# whole 12% slower:-( So we face a trade-off... Shall it be resolved
-# some day? Till then the code is considered experimental and by
-# default remains dormant...
-
-sub encvert()
-{ my ($te,@s) = @_;
-  my ($v0,$v1) = ($acc,$key);
-
-	&mov	($v0,$s[3]);				# copy s3
-	&mov	(&DWP(4,"esp"),$s[2]);			# save s2
-	&mov	($v1,$s[0]);				# copy s0
-	&mov	(&DWP(8,"esp"),$s[1]);			# save s1
-
-	&movz	($s[2],&HB($s[0]));
-	&and	($s[0],0xFF);
-	&mov	($s[0],&DWP(0,$te,$s[0],8));		# s0>>0
-	&shr	($v1,16);
-	&mov	($s[3],&DWP(3,$te,$s[2],8));		# s0>>8
-	&movz	($s[1],&HB($v1));
-	&and	($v1,0xFF);
-	&mov	($s[2],&DWP(2,$te,$v1,8));		# s0>>16
-	 &mov	($v1,$v0);
-	&mov	($s[1],&DWP(1,$te,$s[1],8));		# s0>>24
-
-	&and	($v0,0xFF);
-	&xor	($s[3],&DWP(0,$te,$v0,8));		# s3>>0
-	&movz	($v0,&HB($v1));
-	&shr	($v1,16);
-	&xor	($s[2],&DWP(3,$te,$v0,8));		# s3>>8
-	&movz	($v0,&HB($v1));
-	&and	($v1,0xFF);
-	&xor	($s[1],&DWP(2,$te,$v1,8));		# s3>>16
-	 &mov	($v1,&DWP(4,"esp"));			# restore s2
-	&xor	($s[0],&DWP(1,$te,$v0,8));		# s3>>24
-
-	&mov	($v0,$v1);
-	&and	($v1,0xFF);
-	&xor	($s[2],&DWP(0,$te,$v1,8));		# s2>>0
-	&movz	($v1,&HB($v0));
-	&shr	($v0,16);
-	&xor	($s[1],&DWP(3,$te,$v1,8));		# s2>>8
-	&movz	($v1,&HB($v0));
-	&and	($v0,0xFF);
-	&xor	($s[0],&DWP(2,$te,$v0,8));		# s2>>16
-	 &mov	($v0,&DWP(8,"esp"));			# restore s1
-	&xor	($s[3],&DWP(1,$te,$v1,8));		# s2>>24
-
-	&mov	($v1,$v0);
-	&and	($v0,0xFF);
-	&xor	($s[1],&DWP(0,$te,$v0,8));		# s1>>0
-	&movz	($v0,&HB($v1));
-	&shr	($v1,16);
-	&xor	($s[0],&DWP(3,$te,$v0,8));		# s1>>8
-	&movz	($v0,&HB($v1));
-	&and	($v1,0xFF);
-	&xor	($s[3],&DWP(2,$te,$v1,8));		# s1>>16
-	 &mov	($key,$__key);				# reincarnate v1 as key
-	&xor	($s[2],&DWP(1,$te,$v0,8));		# s1>>24
-}
-
-# Another experimental routine, which features "horizontal spin," but
-# eliminates one reference to stack. Strangely enough runs slower...
-sub enchoriz()
-{ my ($v0,$v1) = ($key,$acc);
-
-	&movz	($v0,&LB($s0));			#  3, 2, 1, 0*
-	&rotr	($s2,8);			#  8,11,10, 9
-	&mov	($v1,&DWP(0,$te,$v0,8));	#  0
-	&movz	($v0,&HB($s1));			#  7, 6, 5*, 4
-	&rotr	($s3,16);			# 13,12,15,14
-	&xor	($v1,&DWP(3,$te,$v0,8));	#  5
-	&movz	($v0,&HB($s2));			#  8,11,10*, 9
-	&rotr	($s0,16);			#  1, 0, 3, 2
-	&xor	($v1,&DWP(2,$te,$v0,8));	# 10
-	&movz	($v0,&HB($s3));			# 13,12,15*,14
-	&xor	($v1,&DWP(1,$te,$v0,8));	# 15, t[0] collected
-	&mov	($__s0,$v1);			# t[0] saved
-
-	&movz	($v0,&LB($s1));			#  7, 6, 5, 4*
-	&shr	($s1,16);			#  -, -, 7, 6
-	&mov	($v1,&DWP(0,$te,$v0,8));	#  4
-	&movz	($v0,&LB($s3));			# 13,12,15,14*
-	&xor	($v1,&DWP(2,$te,$v0,8));	# 14
-	&movz	($v0,&HB($s0));			#  1, 0, 3*, 2
-	&and	($s3,0xffff0000);		# 13,12, -, -
-	&xor	($v1,&DWP(1,$te,$v0,8));	#  3
-	&movz	($v0,&LB($s2));			#  8,11,10, 9*
-	&or	($s3,$s1);			# 13,12, 7, 6
-	&xor	($v1,&DWP(3,$te,$v0,8));	#  9, t[1] collected
-	&mov	($s1,$v1);			#  s[1]=t[1]
-
-	&movz	($v0,&LB($s0));			#  1, 0, 3, 2*
-	&shr	($s2,16);			#  -, -, 8,11
-	&mov	($v1,&DWP(2,$te,$v0,8));	#  2
-	&movz	($v0,&HB($s3));			# 13,12, 7*, 6
-	&xor	($v1,&DWP(1,$te,$v0,8));	#  7
-	&movz	($v0,&HB($s2));			#  -, -, 8*,11
-	&xor	($v1,&DWP(0,$te,$v0,8));	#  8
-	&mov	($v0,$s3);
-	&shr	($v0,24);			# 13
-	&xor	($v1,&DWP(3,$te,$v0,8));	# 13, t[2] collected
-
-	&movz	($v0,&LB($s2));			#  -, -, 8,11*
-	&shr	($s0,24);			#  1*
-	&mov	($s2,&DWP(1,$te,$v0,8));	# 11
-	&xor	($s2,&DWP(3,$te,$s0,8));	#  1
-	&mov	($s0,$__s0);			# s[0]=t[0]
-	&movz	($v0,&LB($s3));			# 13,12, 7, 6*
-	&shr	($s3,16);			#   ,  ,13,12
-	&xor	($s2,&DWP(2,$te,$v0,8));	#  6
-	&mov	($key,$__key);			# reincarnate v0 as key
-	&and	($s3,0xff);			#   ,  ,13,12*
-	&mov	($s3,&DWP(0,$te,$s3,8));	# 12
-	&xor	($s3,$s2);			# s[2]=t[3] collected
-	&mov	($s2,$v1);			# s[2]=t[2]
-}
-
-# More experimental code... SSE one... Even though this one eliminates
-# *all* references to stack, it's not faster...
-sub sse_encbody()
-{
-	&movz	($acc,&LB("eax"));		#  0
-	&mov	("ecx",&DWP(0,$tbl,$acc,8));	#  0
-	&pshufw	("mm2","mm0",0x0d);		#  7, 6, 3, 2
-	&movz	("edx",&HB("eax"));		#  1
-	&mov	("edx",&DWP(3,$tbl,"edx",8));	#  1
-	&shr	("eax",16);			#  5, 4
-
-	&movz	($acc,&LB("ebx"));		# 10
-	&xor	("ecx",&DWP(2,$tbl,$acc,8));	# 10
-	&pshufw	("mm6","mm4",0x08);		# 13,12, 9, 8
-	&movz	($acc,&HB("ebx"));		# 11
-	&xor	("edx",&DWP(1,$tbl,$acc,8));	# 11
-	&shr	("ebx",16);			# 15,14
-
-	&movz	($acc,&HB("eax"));		#  5
-	&xor	("ecx",&DWP(3,$tbl,$acc,8));	#  5
-	&movq	("mm3",QWP(16,$key));
-	&movz	($acc,&HB("ebx"));		# 15
-	&xor	("ecx",&DWP(1,$tbl,$acc,8));	# 15
-	&movd	("mm0","ecx");			# t[0] collected
-
-	&movz	($acc,&LB("eax"));		#  4
-	&mov	("ecx",&DWP(0,$tbl,$acc,8));	#  4
-	&movd	("eax","mm2");			#  7, 6, 3, 2
-	&movz	($acc,&LB("ebx"));		# 14
-	&xor	("ecx",&DWP(2,$tbl,$acc,8));	# 14
-	&movd	("ebx","mm6");			# 13,12, 9, 8
-
-	&movz	($acc,&HB("eax"));		#  3
-	&xor	("ecx",&DWP(1,$tbl,$acc,8));	#  3
-	&movz	($acc,&HB("ebx"));		#  9
-	&xor	("ecx",&DWP(3,$tbl,$acc,8));	#  9
-	&movd	("mm1","ecx");			# t[1] collected
-
-	&movz	($acc,&LB("eax"));		#  2
-	&mov	("ecx",&DWP(2,$tbl,$acc,8));	#  2
-	&shr	("eax",16);			#  7, 6
-	&punpckldq	("mm0","mm1");		# t[0,1] collected
-	&movz	($acc,&LB("ebx"));		#  8
-	&xor	("ecx",&DWP(0,$tbl,$acc,8));	#  8
-	&shr	("ebx",16);			# 13,12
-
-	&movz	($acc,&HB("eax"));		#  7
-	&xor	("ecx",&DWP(1,$tbl,$acc,8));	#  7
-	&pxor	("mm0","mm3");
-	&movz	("eax",&LB("eax"));		#  6
-	&xor	("edx",&DWP(2,$tbl,"eax",8));	#  6
-	&pshufw	("mm1","mm0",0x08);		#  5, 4, 1, 0
-	&movz	($acc,&HB("ebx"));		# 13
-	&xor	("ecx",&DWP(3,$tbl,$acc,8));	# 13
-	&xor	("ecx",&DWP(24,$key));		# t[2]
-	&movd	("mm4","ecx");			# t[2] collected
-	&movz	("ebx",&LB("ebx"));		# 12
-	&xor	("edx",&DWP(0,$tbl,"ebx",8));	# 12
-	&shr	("ecx",16);
-	&movd	("eax","mm1");			#  5, 4, 1, 0
-	&mov	("ebx",&DWP(28,$key));		# t[3]
-	&xor	("ebx","edx");
-	&movd	("mm5","ebx");			# t[3] collected
-	&and	("ebx",0xffff0000);
-	&or	("ebx","ecx");
-
-	&punpckldq	("mm4","mm5");		# t[2,3] collected
-}
-
-######################################################################
-# "Compact" block function
-######################################################################
-
-sub enccompact()
-{ my $Fn = \&mov;
-  while ($#_>5) { pop(@_); $Fn=sub{}; }
-  my ($i,$te,@s)=@_;
-  my $tmp = $key;
-  my $out = $i==3?$s[0]:$acc;
-
-	# $Fn is used in first compact round and its purpose is to
-	# void restoration of some values from stack, so that after
-	# 4xenccompact with extra argument $key value is left there...
-	if ($i==3)  {	&$Fn	($key,$__key);			}##%edx
-	else        {	&mov	($out,$s[0]);			}
-			&and	($out,0xFF);
-	if ($i==1)  {	&shr	($s[0],16);			}#%ebx[1]
-	if ($i==2)  {	&shr	($s[0],24);			}#%ecx[2]
-			&movz	($out,&BP(-128,$te,$out,1));
-
-	if ($i==3)  {	$tmp=$s[1];				}##%eax
-			&movz	($tmp,&HB($s[1]));
-			&movz	($tmp,&BP(-128,$te,$tmp,1));
-			&shl	($tmp,8);
-			&xor	($out,$tmp);
-
-	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],$__s0);		}##%ebx
-	else        {	&mov	($tmp,$s[2]);
-			&shr	($tmp,16);			}
-	if ($i==2)  {	&and	($s[1],0xFF);			}#%edx[2]
-			&and	($tmp,0xFF);
-			&movz	($tmp,&BP(-128,$te,$tmp,1));
-			&shl	($tmp,16);
-			&xor	($out,$tmp);
-
-	if ($i==3)  {	$tmp=$s[3]; &mov ($s[2],$__s1);		}##%ecx
-	elsif($i==2){	&movz	($tmp,&HB($s[3]));		}#%ebx[2]
-	else        {	&mov	($tmp,$s[3]);
-			&shr	($tmp,24);			}
-			&movz	($tmp,&BP(-128,$te,$tmp,1));
-			&shl	($tmp,24);
-			&xor	($out,$tmp);
-	if ($i<2)   {	&mov	(&DWP(4+4*$i,"esp"),$out);	}
-	if ($i==3)  {	&mov	($s[3],$acc);			}
-	&comment();
-}
-
-sub enctransform()
-{ my @s = ($s0,$s1,$s2,$s3);
-  my $i = shift;
-  my $tmp = $tbl;
-  my $r2  = $key ;
-
-	&and	($tmp,$s[$i]);
-	&lea	($r2,&DWP(0,$s[$i],$s[$i]));
-	&mov	($acc,$tmp);
-	&shr	($tmp,7);
-	&and	($r2,0xfefefefe);
-	&sub	($acc,$tmp);
-	&mov	($tmp,$s[$i]);
-	&and	($acc,0x1b1b1b1b);
-	&rotr	($tmp,16);
-	&xor	($acc,$r2);	# r2
-	&mov	($r2,$s[$i]);
-
-	&xor	($s[$i],$acc);	# r0 ^ r2
-	&rotr	($r2,16+8);
-	&xor	($acc,$tmp);
-	&rotl	($s[$i],24);
-	&xor	($acc,$r2);
-	&mov	($tmp,0x80808080)	if ($i!=1);
-	&xor	($s[$i],$acc);	# ROTATE(r2^r0,24) ^ r2
-}
-
-&function_begin_B("_x86_AES_encrypt_compact");
-	# note that caller is expected to allocate stack frame for me!
-	&mov	($__key,$key);			# save key
-
-	&xor	($s0,&DWP(0,$key));		# xor with key
-	&xor	($s1,&DWP(4,$key));
-	&xor	($s2,&DWP(8,$key));
-	&xor	($s3,&DWP(12,$key));
-
-	&mov	($acc,&DWP(240,$key));		# load key->rounds
-	&lea	($acc,&DWP(-2,$acc,$acc));
-	&lea	($acc,&DWP(0,$key,$acc,8));
-	&mov	($__end,$acc);			# end of key schedule
-
-	# prefetch Te4
-	&mov	($key,&DWP(0-128,$tbl));
-	&mov	($acc,&DWP(32-128,$tbl));
-	&mov	($key,&DWP(64-128,$tbl));
-	&mov	($acc,&DWP(96-128,$tbl));
-	&mov	($key,&DWP(128-128,$tbl));
-	&mov	($acc,&DWP(160-128,$tbl));
-	&mov	($key,&DWP(192-128,$tbl));
-	&mov	($acc,&DWP(224-128,$tbl));
-
-	&set_label("loop",16);
-
-		&enccompact(0,$tbl,$s0,$s1,$s2,$s3,1);
-		&enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);
-		&enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);
-		&enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);
-		&mov	($tbl,0x80808080);
-		&enctransform(2);
-		&enctransform(3);
-		&enctransform(0);
-		&enctransform(1);
-		&mov 	($key,$__key);
-		&mov	($tbl,$__tbl);
-		&add	($key,16);		# advance rd_key
-		&xor	($s0,&DWP(0,$key));
-		&xor	($s1,&DWP(4,$key));
-		&xor	($s2,&DWP(8,$key));
-		&xor	($s3,&DWP(12,$key));
-
-	&cmp	($key,$__end);
-	&mov	($__key,$key);
-	&jb	(&label("loop"));
-
-	&enccompact(0,$tbl,$s0,$s1,$s2,$s3);
-	&enccompact(1,$tbl,$s1,$s2,$s3,$s0);
-	&enccompact(2,$tbl,$s2,$s3,$s0,$s1);
-	&enccompact(3,$tbl,$s3,$s0,$s1,$s2);
-
-	&xor	($s0,&DWP(16,$key));
-	&xor	($s1,&DWP(20,$key));
-	&xor	($s2,&DWP(24,$key));
-	&xor	($s3,&DWP(28,$key));
-
-	&ret	();
-&function_end_B("_x86_AES_encrypt_compact");
-
-######################################################################
-# "Compact" SSE block function.
-######################################################################
-#
-# Performance is not actually extraordinary in comparison to pure
-# x86 code. In particular encrypt performance is virtually the same.
-# Decrypt performance on the other hand is 15-20% better on newer
-# µ-archs [but we're thankful for *any* improvement here], and ~50%
-# better on PIII:-) And additionally on the pros side this code
-# eliminates redundant references to stack and thus relieves/
-# minimizes the pressure on the memory bus.
-#
-# MMX register layout                           lsb
-# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
-# |          mm4          |          mm0          |
-# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
-# |     s3    |     s2    |     s1    |     s0    |
-# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
-# |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
-# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
-#
-# Indexes translate as s[N/4]>>(8*(N%4)), e.g. 5 means s1>>8.
-# In this terms encryption and decryption "compact" permutation
-# matrices can be depicted as following:
-#
-# encryption              lsb	# decryption              lsb
-# +----++----+----+----+----+	# +----++----+----+----+----+
-# | t0 || 15 | 10 |  5 |  0 |	# | t0 ||  7 | 10 | 13 |  0 |
-# +----++----+----+----+----+	# +----++----+----+----+----+
-# | t1 ||  3 | 14 |  9 |  4 |	# | t1 || 11 | 14 |  1 |  4 |
-# +----++----+----+----+----+	# +----++----+----+----+----+
-# | t2 ||  7 |  2 | 13 |  8 |	# | t2 || 15 |  2 |  5 |  8 |
-# +----++----+----+----+----+	# +----++----+----+----+----+
-# | t3 || 11 |  6 |  1 | 12 |	# | t3 ||  3 |  6 |  9 | 12 |
-# +----++----+----+----+----+	# +----++----+----+----+----+
-#
-######################################################################
-# Why not xmm registers? Short answer. It was actually tested and
-# was not any faster, but *contrary*, most notably on Intel CPUs.
-# Longer answer. Main advantage of using mm registers is that movd
-# latency is lower, especially on Intel P4. While arithmetic
-# instructions are twice as many, they can be scheduled every cycle
-# and not every second one when they are operating on xmm register,
-# so that "arithmetic throughput" remains virtually the same. And
-# finally the code can be executed even on elder SSE-only CPUs:-)
-
-sub sse_enccompact()
-{
-	&pshufw	("mm1","mm0",0x08);		#  5, 4, 1, 0
-	&pshufw	("mm5","mm4",0x0d);		# 15,14,11,10
-	&movd	("eax","mm1");			#  5, 4, 1, 0
-	&movd	("ebx","mm5");			# 15,14,11,10
-	&mov	($__key,$key);
-
-	&movz	($acc,&LB("eax"));		#  0
-	&movz	("edx",&HB("eax"));		#  1
-	&pshufw	("mm2","mm0",0x0d);		#  7, 6, 3, 2
-	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  0
-	&movz	($key,&LB("ebx"));		# 10
-	&movz	("edx",&BP(-128,$tbl,"edx",1));	#  1
-	&shr	("eax",16);			#  5, 4
-	&shl	("edx",8);			#  1
-
-	&movz	($acc,&BP(-128,$tbl,$key,1));	# 10
-	&movz	($key,&HB("ebx"));		# 11
-	&shl	($acc,16);			# 10
-	&pshufw	("mm6","mm4",0x08);		# 13,12, 9, 8
-	&or	("ecx",$acc);			# 10
-	&movz	($acc,&BP(-128,$tbl,$key,1));	# 11
-	&movz	($key,&HB("eax"));		#  5
-	&shl	($acc,24);			# 11
-	&shr	("ebx",16);			# 15,14
-	&or	("edx",$acc);			# 11
-
-	&movz	($acc,&BP(-128,$tbl,$key,1));	#  5
-	&movz	($key,&HB("ebx"));		# 15
-	&shl	($acc,8);			#  5
-	&or	("ecx",$acc);			#  5
-	&movz	($acc,&BP(-128,$tbl,$key,1));	# 15
-	&movz	($key,&LB("eax"));		#  4
-	&shl	($acc,24);			# 15
-	&or	("ecx",$acc);			# 15
-
-	&movz	($acc,&BP(-128,$tbl,$key,1));	#  4
-	&movz	($key,&LB("ebx"));		# 14
-	&movd	("eax","mm2");			#  7, 6, 3, 2
-	&movd	("mm0","ecx");			# t[0] collected
-	&movz	("ecx",&BP(-128,$tbl,$key,1));	# 14
-	&movz	($key,&HB("eax"));		#  3
-	&shl	("ecx",16);			# 14
-	&movd	("ebx","mm6");			# 13,12, 9, 8
-	&or	("ecx",$acc);			# 14
-
-	&movz	($acc,&BP(-128,$tbl,$key,1));	#  3
-	&movz	($key,&HB("ebx"));		#  9
-	&shl	($acc,24);			#  3
-	&or	("ecx",$acc);			#  3
-	&movz	($acc,&BP(-128,$tbl,$key,1));	#  9
-	&movz	($key,&LB("ebx"));		#  8
-	&shl	($acc,8);			#  9
-	&shr	("ebx",16);			# 13,12
-	&or	("ecx",$acc);			#  9
-
-	&movz	($acc,&BP(-128,$tbl,$key,1));	#  8
-	&movz	($key,&LB("eax"));		#  2
-	&shr	("eax",16);			#  7, 6
-	&movd	("mm1","ecx");			# t[1] collected
-	&movz	("ecx",&BP(-128,$tbl,$key,1));	#  2
-	&movz	($key,&HB("eax"));		#  7
-	&shl	("ecx",16);			#  2
-	&and	("eax",0xff);			#  6
-	&or	("ecx",$acc);			#  2
-
-	&punpckldq	("mm0","mm1");		# t[0,1] collected
-
-	&movz	($acc,&BP(-128,$tbl,$key,1));	#  7
-	&movz	($key,&HB("ebx"));		# 13
-	&shl	($acc,24);			#  7
-	&and	("ebx",0xff);			# 12
-	&movz	("eax",&BP(-128,$tbl,"eax",1));	#  6
-	&or	("ecx",$acc);			#  7
-	&shl	("eax",16);			#  6
-	&movz	($acc,&BP(-128,$tbl,$key,1));	# 13
-	&or	("edx","eax");			#  6
-	&shl	($acc,8);			# 13
-	&movz	("ebx",&BP(-128,$tbl,"ebx",1));	# 12
-	&or	("ecx",$acc);			# 13
-	&or	("edx","ebx");			# 12
-	&mov	($key,$__key);
-	&movd	("mm4","ecx");			# t[2] collected
-	&movd	("mm5","edx");			# t[3] collected
-
-	&punpckldq	("mm4","mm5");		# t[2,3] collected
-}
-
-					if (!$x86only) {
-&function_begin_B("_sse_AES_encrypt_compact");
-	&pxor	("mm0",&QWP(0,$key));	#  7, 6, 5, 4, 3, 2, 1, 0
-	&pxor	("mm4",&QWP(8,$key));	# 15,14,13,12,11,10, 9, 8
-
-	# note that caller is expected to allocate stack frame for me!
-	&mov	($acc,&DWP(240,$key));		# load key->rounds
-	&lea	($acc,&DWP(-2,$acc,$acc));
-	&lea	($acc,&DWP(0,$key,$acc,8));
-	&mov	($__end,$acc);			# end of key schedule
-
-	&mov	($s0,0x1b1b1b1b);		# magic constant
-	&mov	(&DWP(8,"esp"),$s0);
-	&mov	(&DWP(12,"esp"),$s0);
-
-	# prefetch Te4
-	&mov	($s0,&DWP(0-128,$tbl));
-	&mov	($s1,&DWP(32-128,$tbl));
-	&mov	($s2,&DWP(64-128,$tbl));
-	&mov	($s3,&DWP(96-128,$tbl));
-	&mov	($s0,&DWP(128-128,$tbl));
-	&mov	($s1,&DWP(160-128,$tbl));
-	&mov	($s2,&DWP(192-128,$tbl));
-	&mov	($s3,&DWP(224-128,$tbl));
-
-	&set_label("loop",16);
-		&sse_enccompact();
-		&add	($key,16);
-		&cmp	($key,$__end);
-		&ja	(&label("out"));
-
-		&movq	("mm2",&QWP(8,"esp"));
-		&pxor	("mm3","mm3");		&pxor	("mm7","mm7");
-		&movq	("mm1","mm0");		&movq	("mm5","mm4");	# r0
-		&pcmpgtb("mm3","mm0");		&pcmpgtb("mm7","mm4");
-		&pand	("mm3","mm2");		&pand	("mm7","mm2");
-		&pshufw	("mm2","mm0",0xb1);	&pshufw	("mm6","mm4",0xb1);# ROTATE(r0,16)
-		&paddb	("mm0","mm0");		&paddb	("mm4","mm4");
-		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# = r2
-		&pshufw	("mm3","mm2",0xb1);	&pshufw	("mm7","mm6",0xb1);# r0
-		&pxor	("mm1","mm0");		&pxor	("mm5","mm4");	# r0^r2
-		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# ^= ROTATE(r0,16)
-
-		&movq	("mm2","mm3");		&movq	("mm6","mm7");
-		&pslld	("mm3",8);		&pslld	("mm7",8);
-		&psrld	("mm2",24);		&psrld	("mm6",24);
-		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= r0<<8
-		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# ^= r0>>24
-
-		&movq	("mm3","mm1");		&movq	("mm7","mm5");
-		&movq	("mm2",&QWP(0,$key));	&movq	("mm6",&QWP(8,$key));
-		&psrld	("mm1",8);		&psrld	("mm5",8);
-		&mov	($s0,&DWP(0-128,$tbl));
-		&pslld	("mm3",24);		&pslld	("mm7",24);
-		&mov	($s1,&DWP(64-128,$tbl));
-		&pxor	("mm0","mm1");		&pxor	("mm4","mm5");	# ^= (r2^r0)<<8
-		&mov	($s2,&DWP(128-128,$tbl));
-		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= (r2^r0)>>24
-		&mov	($s3,&DWP(192-128,$tbl));
-
-		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");
-	&jmp	(&label("loop"));
-
-	&set_label("out",16);
-	&pxor	("mm0",&QWP(0,$key));
-	&pxor	("mm4",&QWP(8,$key));
-
-	&ret	();
-&function_end_B("_sse_AES_encrypt_compact");
-					}
-
-######################################################################
-# Vanilla block function.
-######################################################################
-
-sub encstep()
-{ my ($i,$te,@s) = @_;
-  my $tmp = $key;
-  my $out = $i==3?$s[0]:$acc;
-
-	# lines marked with #%e?x[i] denote "reordered" instructions...
-	if ($i==3)  {	&mov	($key,$__key);			}##%edx
-	else        {	&mov	($out,$s[0]);
-			&and	($out,0xFF);			}
-	if ($i==1)  {	&shr	($s[0],16);			}#%ebx[1]
-	if ($i==2)  {	&shr	($s[0],24);			}#%ecx[2]
-			&mov	($out,&DWP(0,$te,$out,8));
-
-	if ($i==3)  {	$tmp=$s[1];				}##%eax
-			&movz	($tmp,&HB($s[1]));
-			&xor	($out,&DWP(3,$te,$tmp,8));
-
-	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],$__s0);		}##%ebx
-	else        {	&mov	($tmp,$s[2]);
-			&shr	($tmp,16);			}
-	if ($i==2)  {	&and	($s[1],0xFF);			}#%edx[2]
-			&and	($tmp,0xFF);
-			&xor	($out,&DWP(2,$te,$tmp,8));
-
-	if ($i==3)  {	$tmp=$s[3]; &mov ($s[2],$__s1);		}##%ecx
-	elsif($i==2){	&movz	($tmp,&HB($s[3]));		}#%ebx[2]
-	else        {	&mov	($tmp,$s[3]);
-			&shr	($tmp,24)			}
-			&xor	($out,&DWP(1,$te,$tmp,8));
-	if ($i<2)   {	&mov	(&DWP(4+4*$i,"esp"),$out);	}
-	if ($i==3)  {	&mov	($s[3],$acc);			}
-			&comment();
-}
-
-sub enclast()
-{ my ($i,$te,@s)=@_;
-  my $tmp = $key;
-  my $out = $i==3?$s[0]:$acc;
-
-	if ($i==3)  {	&mov	($key,$__key);			}##%edx
-	else        {	&mov	($out,$s[0]);			}
-			&and	($out,0xFF);
-	if ($i==1)  {	&shr	($s[0],16);			}#%ebx[1]
-	if ($i==2)  {	&shr	($s[0],24);			}#%ecx[2]
-			&mov	($out,&DWP(2,$te,$out,8));
-			&and	($out,0x000000ff);
-
-	if ($i==3)  {	$tmp=$s[1];				}##%eax
-			&movz	($tmp,&HB($s[1]));
-			&mov	($tmp,&DWP(0,$te,$tmp,8));
-			&and	($tmp,0x0000ff00);
-			&xor	($out,$tmp);
-
-	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],$__s0);		}##%ebx
-	else        {	&mov	($tmp,$s[2]);
-			&shr	($tmp,16);			}
-	if ($i==2)  {	&and	($s[1],0xFF);			}#%edx[2]
-			&and	($tmp,0xFF);
-			&mov	($tmp,&DWP(0,$te,$tmp,8));
-			&and	($tmp,0x00ff0000);
-			&xor	($out,$tmp);
-
-	if ($i==3)  {	$tmp=$s[3]; &mov ($s[2],$__s1);		}##%ecx
-	elsif($i==2){	&movz	($tmp,&HB($s[3]));		}#%ebx[2]
-	else        {	&mov	($tmp,$s[3]);
-			&shr	($tmp,24);			}
-			&mov	($tmp,&DWP(2,$te,$tmp,8));
-			&and	($tmp,0xff000000);
-			&xor	($out,$tmp);
-	if ($i<2)   {	&mov	(&DWP(4+4*$i,"esp"),$out);	}
-	if ($i==3)  {	&mov	($s[3],$acc);			}
-}
-
-&function_begin_B("_x86_AES_encrypt");
-	if ($vertical_spin) {
-		# I need high parts of volatile registers to be accessible...
-		&exch	($s1="edi",$key="ebx");
-		&mov	($s2="esi",$acc="ecx");
-	}
-
-	# note that caller is expected to allocate stack frame for me!
-	&mov	($__key,$key);			# save key
-
-	&xor	($s0,&DWP(0,$key));		# xor with key
-	&xor	($s1,&DWP(4,$key));
-	&xor	($s2,&DWP(8,$key));
-	&xor	($s3,&DWP(12,$key));
-
-	&mov	($acc,&DWP(240,$key));		# load key->rounds
-
-	if ($small_footprint) {
-	    &lea	($acc,&DWP(-2,$acc,$acc));
-	    &lea	($acc,&DWP(0,$key,$acc,8));
-	    &mov	($__end,$acc);		# end of key schedule
-
-	    &set_label("loop",16);
-		if ($vertical_spin) {
-		    &encvert($tbl,$s0,$s1,$s2,$s3);
-		} else {
-		    &encstep(0,$tbl,$s0,$s1,$s2,$s3);
-		    &encstep(1,$tbl,$s1,$s2,$s3,$s0);
-		    &encstep(2,$tbl,$s2,$s3,$s0,$s1);
-		    &encstep(3,$tbl,$s3,$s0,$s1,$s2);
-		}
-		&add	($key,16);		# advance rd_key
-		&xor	($s0,&DWP(0,$key));
-		&xor	($s1,&DWP(4,$key));
-		&xor	($s2,&DWP(8,$key));
-		&xor	($s3,&DWP(12,$key));
-	    &cmp	($key,$__end);
-	    &mov	($__key,$key);
-	    &jb		(&label("loop"));
-	}
-	else {
-	    &cmp	($acc,10);
-	    &jle	(&label("10rounds"));
-	    &cmp	($acc,12);
-	    &jle	(&label("12rounds"));
-
-	&set_label("14rounds",4);
-	    for ($i=1;$i<3;$i++) {
-		if ($vertical_spin) {
-		    &encvert($tbl,$s0,$s1,$s2,$s3);
-		} else {
-		    &encstep(0,$tbl,$s0,$s1,$s2,$s3);
-		    &encstep(1,$tbl,$s1,$s2,$s3,$s0);
-		    &encstep(2,$tbl,$s2,$s3,$s0,$s1);
-		    &encstep(3,$tbl,$s3,$s0,$s1,$s2);
-		}
-		&xor	($s0,&DWP(16*$i+0,$key));
-		&xor	($s1,&DWP(16*$i+4,$key));
-		&xor	($s2,&DWP(16*$i+8,$key));
-		&xor	($s3,&DWP(16*$i+12,$key));
-	    }
-	    &add	($key,32);
-	    &mov	($__key,$key);		# advance rd_key
-	&set_label("12rounds",4);
-	    for ($i=1;$i<3;$i++) {
-		if ($vertical_spin) {
-		    &encvert($tbl,$s0,$s1,$s2,$s3);
-		} else {
-		    &encstep(0,$tbl,$s0,$s1,$s2,$s3);
-		    &encstep(1,$tbl,$s1,$s2,$s3,$s0);
-		    &encstep(2,$tbl,$s2,$s3,$s0,$s1);
-		    &encstep(3,$tbl,$s3,$s0,$s1,$s2);
-		}
-		&xor	($s0,&DWP(16*$i+0,$key));
-		&xor	($s1,&DWP(16*$i+4,$key));
-		&xor	($s2,&DWP(16*$i+8,$key));
-		&xor	($s3,&DWP(16*$i+12,$key));
-	    }
-	    &add	($key,32);
-	    &mov	($__key,$key);		# advance rd_key
-	&set_label("10rounds",4);
-	    for ($i=1;$i<10;$i++) {
-		if ($vertical_spin) {
-		    &encvert($tbl,$s0,$s1,$s2,$s3);
-		} else {
-		    &encstep(0,$tbl,$s0,$s1,$s2,$s3);
-		    &encstep(1,$tbl,$s1,$s2,$s3,$s0);
-		    &encstep(2,$tbl,$s2,$s3,$s0,$s1);
-		    &encstep(3,$tbl,$s3,$s0,$s1,$s2);
-		}
-		&xor	($s0,&DWP(16*$i+0,$key));
-		&xor	($s1,&DWP(16*$i+4,$key));
-		&xor	($s2,&DWP(16*$i+8,$key));
-		&xor	($s3,&DWP(16*$i+12,$key));
-	    }
-	}
-
-	if ($vertical_spin) {
-	    # "reincarnate" some registers for "horizontal" spin...
-	    &mov	($s1="ebx",$key="edi");
-	    &mov	($s2="ecx",$acc="esi");
-	}
-	&enclast(0,$tbl,$s0,$s1,$s2,$s3);
-	&enclast(1,$tbl,$s1,$s2,$s3,$s0);
-	&enclast(2,$tbl,$s2,$s3,$s0,$s1);
-	&enclast(3,$tbl,$s3,$s0,$s1,$s2);
-
-	&add	($key,$small_footprint?16:160);
-	&xor	($s0,&DWP(0,$key));
-	&xor	($s1,&DWP(4,$key));
-	&xor	($s2,&DWP(8,$key));
-	&xor	($s3,&DWP(12,$key));
-
-	&ret	();
-
-&set_label("AES_Te",64);	# Yes! I keep it in the code segment!
-	&_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
-	&_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
-	&_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
-	&_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
-	&_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
-	&_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
-	&_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
-	&_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
-	&_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
-	&_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
-	&_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
-	&_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
-	&_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
-	&_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
-	&_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
-	&_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
-	&_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
-	&_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
-	&_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
-	&_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
-	&_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
-	&_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
-	&_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
-	&_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
-	&_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
-	&_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
-	&_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
-	&_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
-	&_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
-	&_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
-	&_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
-	&_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
-	&_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
-	&_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
-	&_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
-	&_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
-	&_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
-	&_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
-	&_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
-	&_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
-	&_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
-	&_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
-	&_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
-	&_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
-	&_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
-	&_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
-	&_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
-	&_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
-	&_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
-	&_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
-	&_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
-	&_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
-	&_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
-	&_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
-	&_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
-	&_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
-	&_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
-	&_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
-	&_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
-	&_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
-	&_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
-	&_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
-	&_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
-	&_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
-
-#Te4	# four copies of Te4 to choose from to avoid L1 aliasing
-	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
-	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
-	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
-	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
-	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
-	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
-	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
-	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
-	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
-	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
-	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
-	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
-	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
-	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
-	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
-	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
-	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
-	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
-	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
-	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
-	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
-	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
-	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
-	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
-	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
-	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
-	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
-	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
-	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
-	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
-	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
-	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
-
-	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
-	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
-	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
-	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
-	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
-	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
-	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
-	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
-	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
-	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
-	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
-	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
-	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
-	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
-	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
-	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
-	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
-	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
-	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
-	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
-	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
-	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
-	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
-	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
-	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
-	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
-	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
-	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
-	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
-	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
-	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
-	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
-
-	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
-	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
-	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
-	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
-	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
-	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
-	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
-	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
-	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
-	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
-	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
-	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
-	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
-	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
-	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
-	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
-	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
-	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
-	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
-	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
-	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
-	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
-	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
-	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
-	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
-	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
-	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
-	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
-	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
-	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
-	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
-	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
-
-	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
-	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
-	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
-	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
-	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
-	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
-	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
-	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
-	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
-	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
-	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
-	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
-	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
-	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
-	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
-	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
-	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
-	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
-	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
-	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
-	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
-	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
-	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
-	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
-	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
-	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
-	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
-	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
-	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
-	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
-	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
-	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
-#rcon:
-	&data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008);
-	&data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080);
-	&data_word(0x0000001b, 0x00000036, 0x00000000, 0x00000000);
-	&data_word(0x00000000, 0x00000000, 0x00000000, 0x00000000);
-&function_end_B("_x86_AES_encrypt");
-
-# void aes_nohw_encrypt (const void *inp,void *out,const AES_KEY *key);
-&function_begin("aes_nohw_encrypt");
-	&mov	($acc,&wparam(0));		# load inp
-	&mov	($key,&wparam(2));		# load key
-
-	&mov	($s0,"esp");
-	&sub	("esp",36);
-	&and	("esp",-64);			# align to cache-line
-
-	# place stack frame just "above" the key schedule
-	&lea	($s1,&DWP(-64-63,$key));
-	&sub	($s1,"esp");
-	&neg	($s1);
-	&and	($s1,0x3C0);	# modulo 1024, but aligned to cache-line
-	&sub	("esp",$s1);
-	&add	("esp",4);	# 4 is reserved for caller's return address
-	&mov	($_esp,$s0);			# save stack pointer
-
-	&call   (&label("pic_point"));          # make it PIC!
-	&set_label("pic_point");
-	&blindpop($tbl);
-	&picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if (!$x86only);
-	&lea    ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
-
-	# pick Te4 copy which can't "overlap" with stack frame or key schedule
-	&lea	($s1,&DWP(768-4,"esp"));
-	&sub	($s1,$tbl);
-	&and	($s1,0x300);
-	&lea	($tbl,&DWP(2048+128,$tbl,$s1));
-
-					if (!$x86only) {
-	&bt	(&DWP(0,$s0),25);	# check for SSE bit
-	&jnc	(&label("x86"));
-
-	&movq	("mm0",&QWP(0,$acc));
-	&movq	("mm4",&QWP(8,$acc));
-	&call	("_sse_AES_encrypt_compact");
-	&mov	("esp",$_esp);			# restore stack pointer
-	&mov	($acc,&wparam(1));		# load out
-	&movq	(&QWP(0,$acc),"mm0");		# write output data
-	&movq	(&QWP(8,$acc),"mm4");
-	&emms	();
-	&function_end_A();
-					}
-	&set_label("x86",16);
-	&mov	($_tbl,$tbl);
-	&mov	($s0,&DWP(0,$acc));		# load input data
-	&mov	($s1,&DWP(4,$acc));
-	&mov	($s2,&DWP(8,$acc));
-	&mov	($s3,&DWP(12,$acc));
-	&call	("_x86_AES_encrypt_compact");
-	&mov	("esp",$_esp);			# restore stack pointer
-	&mov	($acc,&wparam(1));		# load out
-	&mov	(&DWP(0,$acc),$s0);		# write output data
-	&mov	(&DWP(4,$acc),$s1);
-	&mov	(&DWP(8,$acc),$s2);
-	&mov	(&DWP(12,$acc),$s3);
-&function_end("aes_nohw_encrypt");
-
-#--------------------------------------------------------------------#
-
-######################################################################
-# "Compact" block function
-######################################################################
-
-sub deccompact()
-{ my $Fn = \&mov;
-  while ($#_>5) { pop(@_); $Fn=sub{}; }
-  my ($i,$td,@s)=@_;
-  my $tmp = $key;
-  my $out = $i==3?$s[0]:$acc;
-
-	# $Fn is used in first compact round and its purpose is to
-	# void restoration of some values from stack, so that after
-	# 4xdeccompact with extra argument $key, $s0 and $s1 values
-	# are left there...
-	if($i==3)   {	&$Fn	($key,$__key);			}
-	else        {	&mov	($out,$s[0]);			}
-			&and	($out,0xFF);
-			&movz	($out,&BP(-128,$td,$out,1));
-
-	if ($i==3)  {	$tmp=$s[1];				}
-			&movz	($tmp,&HB($s[1]));
-			&movz	($tmp,&BP(-128,$td,$tmp,1));
-			&shl	($tmp,8);
-			&xor	($out,$tmp);
-
-	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],$acc);		}
-	else        {	mov	($tmp,$s[2]);			}
-			&shr	($tmp,16);
-			&and	($tmp,0xFF);
-			&movz	($tmp,&BP(-128,$td,$tmp,1));
-			&shl	($tmp,16);
-			&xor	($out,$tmp);
-
-	if ($i==3)  {	$tmp=$s[3]; &$Fn ($s[2],$__s1);		}
-	else        {	&mov	($tmp,$s[3]);			}
-			&shr	($tmp,24);
-			&movz	($tmp,&BP(-128,$td,$tmp,1));
-			&shl	($tmp,24);
-			&xor	($out,$tmp);
-	if ($i<2)   {	&mov	(&DWP(4+4*$i,"esp"),$out);	}
-	if ($i==3)  {	&$Fn	($s[3],$__s0);			}
-}
-
-# must be called with 2,3,0,1 as argument sequence!!!
-sub dectransform()
-{ my @s = ($s0,$s1,$s2,$s3);
-  my $i = shift;
-  my $tmp = $key;
-  my $tp2 = @s[($i+2)%4]; $tp2 = @s[2] if ($i==1);
-  my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1);
-  my $tp8 = $tbl;
-
-	&mov	($tmp,0x80808080);
-	&and	($tmp,$s[$i]);
-	&mov	($acc,$tmp);
-	&shr	($tmp,7);
-	&lea	($tp2,&DWP(0,$s[$i],$s[$i]));
-	&sub	($acc,$tmp);
-	&and	($tp2,0xfefefefe);
-	&and	($acc,0x1b1b1b1b);
-	&xor	($tp2,$acc);
-	&mov	($tmp,0x80808080);
-
-	&and	($tmp,$tp2);
-	&mov	($acc,$tmp);
-	&shr	($tmp,7);
-	&lea	($tp4,&DWP(0,$tp2,$tp2));
-	&sub	($acc,$tmp);
-	&and	($tp4,0xfefefefe);
-	&and	($acc,0x1b1b1b1b);
-	 &xor	($tp2,$s[$i]);	# tp2^tp1
-	&xor	($tp4,$acc);
-	&mov	($tmp,0x80808080);
-
-	&and	($tmp,$tp4);
-	&mov	($acc,$tmp);
-	&shr	($tmp,7);
-	&lea	($tp8,&DWP(0,$tp4,$tp4));
-	&sub	($acc,$tmp);
-	&and	($tp8,0xfefefefe);
-	&and	($acc,0x1b1b1b1b);
-	 &xor	($tp4,$s[$i]);	# tp4^tp1
-	 &rotl	($s[$i],8);	# = ROTATE(tp1,8)
-	&xor	($tp8,$acc);
-
-	&xor	($s[$i],$tp2);
-	&xor	($tp2,$tp8);
-	&xor	($s[$i],$tp4);
-	&xor	($tp4,$tp8);
-	&rotl	($tp2,24);
-	&xor	($s[$i],$tp8);	# ^= tp8^(tp4^tp1)^(tp2^tp1)
-	&rotl	($tp4,16);
-	&xor	($s[$i],$tp2);	# ^= ROTATE(tp8^tp2^tp1,24)
-	&rotl	($tp8,8);
-	&xor	($s[$i],$tp4);	# ^= ROTATE(tp8^tp4^tp1,16)
-	 &mov	($s[0],$__s0)			if($i==2); #prefetch $s0
-	 &mov	($s[1],$__s1)			if($i==3); #prefetch $s1
-	 &mov	($s[2],$__s2)			if($i==1);
-	&xor	($s[$i],$tp8);	# ^= ROTATE(tp8,8)
-
-	&mov	($s[3],$__s3)			if($i==1);
-	&mov	(&DWP(4+4*$i,"esp"),$s[$i])	if($i>=2);
-}
-
-&function_begin_B("_x86_AES_decrypt_compact");
-	# note that caller is expected to allocate stack frame for me!
-	&mov	($__key,$key);			# save key
-
-	&xor	($s0,&DWP(0,$key));		# xor with key
-	&xor	($s1,&DWP(4,$key));
-	&xor	($s2,&DWP(8,$key));
-	&xor	($s3,&DWP(12,$key));
-
-	&mov	($acc,&DWP(240,$key));		# load key->rounds
-
-	&lea	($acc,&DWP(-2,$acc,$acc));
-	&lea	($acc,&DWP(0,$key,$acc,8));
-	&mov	($__end,$acc);			# end of key schedule
-
-	# prefetch Td4
-	&mov	($key,&DWP(0-128,$tbl));
-	&mov	($acc,&DWP(32-128,$tbl));
-	&mov	($key,&DWP(64-128,$tbl));
-	&mov	($acc,&DWP(96-128,$tbl));
-	&mov	($key,&DWP(128-128,$tbl));
-	&mov	($acc,&DWP(160-128,$tbl));
-	&mov	($key,&DWP(192-128,$tbl));
-	&mov	($acc,&DWP(224-128,$tbl));
-
-	&set_label("loop",16);
-
-		&deccompact(0,$tbl,$s0,$s3,$s2,$s1,1);
-		&deccompact(1,$tbl,$s1,$s0,$s3,$s2,1);
-		&deccompact(2,$tbl,$s2,$s1,$s0,$s3,1);
-		&deccompact(3,$tbl,$s3,$s2,$s1,$s0,1);
-		&dectransform(2);
-		&dectransform(3);
-		&dectransform(0);
-		&dectransform(1);
-		&mov 	($key,$__key);
-		&mov	($tbl,$__tbl);
-		&add	($key,16);		# advance rd_key
-		&xor	($s0,&DWP(0,$key));
-		&xor	($s1,&DWP(4,$key));
-		&xor	($s2,&DWP(8,$key));
-		&xor	($s3,&DWP(12,$key));
-
-	&cmp	($key,$__end);
-	&mov	($__key,$key);
-	&jb	(&label("loop"));
-
-	&deccompact(0,$tbl,$s0,$s3,$s2,$s1);
-	&deccompact(1,$tbl,$s1,$s0,$s3,$s2);
-	&deccompact(2,$tbl,$s2,$s1,$s0,$s3);
-	&deccompact(3,$tbl,$s3,$s2,$s1,$s0);
-
-	&xor	($s0,&DWP(16,$key));
-	&xor	($s1,&DWP(20,$key));
-	&xor	($s2,&DWP(24,$key));
-	&xor	($s3,&DWP(28,$key));
-
-	&ret	();
-&function_end_B("_x86_AES_decrypt_compact");
-
-######################################################################
-# "Compact" SSE block function.
-######################################################################
-
-sub sse_deccompact()
-{
-	&pshufw	("mm1","mm0",0x0c);		#  7, 6, 1, 0
-	&pshufw	("mm5","mm4",0x09);		# 13,12,11,10
-	&movd	("eax","mm1");			#  7, 6, 1, 0
-	&movd	("ebx","mm5");			# 13,12,11,10
-	&mov	($__key,$key);
-
-	&movz	($acc,&LB("eax"));		#  0
-	&movz	("edx",&HB("eax"));		#  1
-	&pshufw	("mm2","mm0",0x06);		#  3, 2, 5, 4
-	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  0
-	&movz	($key,&LB("ebx"));		# 10
-	&movz	("edx",&BP(-128,$tbl,"edx",1));	#  1
-	&shr	("eax",16);			#  7, 6
-	&shl	("edx",8);			#  1
-
-	&movz	($acc,&BP(-128,$tbl,$key,1));	# 10
-	&movz	($key,&HB("ebx"));		# 11
-	&shl	($acc,16);			# 10
-	&pshufw	("mm6","mm4",0x03);		# 9, 8,15,14
-	&or	("ecx",$acc);			# 10
-	&movz	($acc,&BP(-128,$tbl,$key,1));	# 11
-	&movz	($key,&HB("eax"));		#  7
-	&shl	($acc,24);			# 11
-	&shr	("ebx",16);			# 13,12
-	&or	("edx",$acc);			# 11
-
-	&movz	($acc,&BP(-128,$tbl,$key,1));	#  7
-	&movz	($key,&HB("ebx"));		# 13
-	&shl	($acc,24);			#  7
-	&or	("ecx",$acc);			#  7
-	&movz	($acc,&BP(-128,$tbl,$key,1));	# 13
-	&movz	($key,&LB("eax"));		#  6
-	&shl	($acc,8);			# 13
-	&movd	("eax","mm2");			#  3, 2, 5, 4
-	&or	("ecx",$acc);			# 13
-
-	&movz	($acc,&BP(-128,$tbl,$key,1));	#  6
-	&movz	($key,&LB("ebx"));		# 12
-	&shl	($acc,16);			#  6
-	&movd	("ebx","mm6");			#  9, 8,15,14
-	&movd	("mm0","ecx");			# t[0] collected
-	&movz	("ecx",&BP(-128,$tbl,$key,1));	# 12
-	&movz	($key,&LB("eax"));		#  4
-	&or	("ecx",$acc);			# 12
-
-	&movz	($acc,&BP(-128,$tbl,$key,1));	#  4
-	&movz	($key,&LB("ebx"));		# 14
-	&or	("edx",$acc);			#  4
-	&movz	($acc,&BP(-128,$tbl,$key,1));	# 14
-	&movz	($key,&HB("eax"));		#  5
-	&shl	($acc,16);			# 14
-	&shr	("eax",16);			#  3, 2
-	&or	("edx",$acc);			# 14
-
-	&movz	($acc,&BP(-128,$tbl,$key,1));	#  5
-	&movz	($key,&HB("ebx"));		# 15
-	&shr	("ebx",16);			#  9, 8
-	&shl	($acc,8);			#  5
-	&movd	("mm1","edx");			# t[1] collected
-	&movz	("edx",&BP(-128,$tbl,$key,1));	# 15
-	&movz	($key,&HB("ebx"));		#  9
-	&shl	("edx",24);			# 15
-	&and	("ebx",0xff);			#  8
-	&or	("edx",$acc);			# 15
-
-	&punpckldq	("mm0","mm1");		# t[0,1] collected
-
-	&movz	($acc,&BP(-128,$tbl,$key,1));	#  9
-	&movz	($key,&LB("eax"));		#  2
-	&shl	($acc,8);			#  9
-	&movz	("eax",&HB("eax"));		#  3
-	&movz	("ebx",&BP(-128,$tbl,"ebx",1));	#  8
-	&or	("ecx",$acc);			#  9
-	&movz	($acc,&BP(-128,$tbl,$key,1));	#  2
-	&or	("edx","ebx");			#  8
-	&shl	($acc,16);			#  2
-	&movz	("eax",&BP(-128,$tbl,"eax",1));	#  3
-	&or	("edx",$acc);			#  2
-	&shl	("eax",24);			#  3
-	&or	("ecx","eax");			#  3
-	&mov	($key,$__key);
-	&movd	("mm4","edx");			# t[2] collected
-	&movd	("mm5","ecx");			# t[3] collected
-
-	&punpckldq	("mm4","mm5");		# t[2,3] collected
-}
-
-					if (!$x86only) {
-&function_begin_B("_sse_AES_decrypt_compact");
-	&pxor	("mm0",&QWP(0,$key));	#  7, 6, 5, 4, 3, 2, 1, 0
-	&pxor	("mm4",&QWP(8,$key));	# 15,14,13,12,11,10, 9, 8
-
-	# note that caller is expected to allocate stack frame for me!
-	&mov	($acc,&DWP(240,$key));		# load key->rounds
-	&lea	($acc,&DWP(-2,$acc,$acc));
-	&lea	($acc,&DWP(0,$key,$acc,8));
-	&mov	($__end,$acc);			# end of key schedule
-
-	&mov	($s0,0x1b1b1b1b);		# magic constant
-	&mov	(&DWP(8,"esp"),$s0);
-	&mov	(&DWP(12,"esp"),$s0);
-
-	# prefetch Td4
-	&mov	($s0,&DWP(0-128,$tbl));
-	&mov	($s1,&DWP(32-128,$tbl));
-	&mov	($s2,&DWP(64-128,$tbl));
-	&mov	($s3,&DWP(96-128,$tbl));
-	&mov	($s0,&DWP(128-128,$tbl));
-	&mov	($s1,&DWP(160-128,$tbl));
-	&mov	($s2,&DWP(192-128,$tbl));
-	&mov	($s3,&DWP(224-128,$tbl));
-
-	&set_label("loop",16);
-		&sse_deccompact();
-		&add	($key,16);
-		&cmp	($key,$__end);
-		&ja	(&label("out"));
-
-		# ROTATE(x^y,N) == ROTATE(x,N)^ROTATE(y,N)
-		&movq	("mm3","mm0");		&movq	("mm7","mm4");
-		&movq	("mm2","mm0",1);	&movq	("mm6","mm4",1);
-		&movq	("mm1","mm0");		&movq	("mm5","mm4");
-		&pshufw	("mm0","mm0",0xb1);	&pshufw	("mm4","mm4",0xb1);# = ROTATE(tp0,16)
-		&pslld	("mm2",8);		&pslld	("mm6",8);
-		&psrld	("mm3",8);		&psrld	("mm7",8);
-		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# ^= tp0<<8
-		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= tp0>>8
-		&pslld	("mm2",16);		&pslld	("mm6",16);
-		&psrld	("mm3",16);		&psrld	("mm7",16);
-		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# ^= tp0<<24
-		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= tp0>>24
-
-		&movq	("mm3",&QWP(8,"esp"));
-		&pxor	("mm2","mm2");		&pxor	("mm6","mm6");
-		&pcmpgtb("mm2","mm1");		&pcmpgtb("mm6","mm5");
-		&pand	("mm2","mm3");		&pand	("mm6","mm3");
-		&paddb	("mm1","mm1");		&paddb	("mm5","mm5");
-		&pxor	("mm1","mm2");		&pxor	("mm5","mm6");	# tp2
-		&movq	("mm3","mm1");		&movq	("mm7","mm5");
-		&movq	("mm2","mm1");		&movq	("mm6","mm5");
-		&pxor	("mm0","mm1");		&pxor	("mm4","mm5");	# ^= tp2
-		&pslld	("mm3",24);		&pslld	("mm7",24);
-		&psrld	("mm2",8);		&psrld	("mm6",8);
-		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= tp2<<24
-		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# ^= tp2>>8
-
-		&movq	("mm2",&QWP(8,"esp"));
-		&pxor	("mm3","mm3");		&pxor	("mm7","mm7");
-		&pcmpgtb("mm3","mm1");		&pcmpgtb("mm7","mm5");
-		&pand	("mm3","mm2");		&pand	("mm7","mm2");
-		&paddb	("mm1","mm1");		&paddb	("mm5","mm5");
-		&pxor	("mm1","mm3");		&pxor	("mm5","mm7");	# tp4
-		&pshufw	("mm3","mm1",0xb1);	&pshufw	("mm7","mm5",0xb1);
-		&pxor	("mm0","mm1");		&pxor	("mm4","mm5");	# ^= tp4
-		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= ROTATE(tp4,16)
-
-		&pxor	("mm3","mm3");		&pxor	("mm7","mm7");
-		&pcmpgtb("mm3","mm1");		&pcmpgtb("mm7","mm5");
-		&pand	("mm3","mm2");		&pand	("mm7","mm2");
-		&paddb	("mm1","mm1");		&paddb	("mm5","mm5");
-		&pxor	("mm1","mm3");		&pxor	("mm5","mm7");	# tp8
-		&pxor	("mm0","mm1");		&pxor	("mm4","mm5");	# ^= tp8
-		&movq	("mm3","mm1");		&movq	("mm7","mm5");
-		&pshufw	("mm2","mm1",0xb1);	&pshufw	("mm6","mm5",0xb1);
-		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# ^= ROTATE(tp8,16)
-		&pslld	("mm1",8);		&pslld	("mm5",8);
-		&psrld	("mm3",8);		&psrld	("mm7",8);
-		&movq	("mm2",&QWP(0,$key));	&movq	("mm6",&QWP(8,$key));
-		&pxor	("mm0","mm1");		&pxor	("mm4","mm5");	# ^= tp8<<8
-		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= tp8>>8
-		&mov	($s0,&DWP(0-128,$tbl));
-		&pslld	("mm1",16);		&pslld	("mm5",16);
-		&mov	($s1,&DWP(64-128,$tbl));
-		&psrld	("mm3",16);		&psrld	("mm7",16);
-		&mov	($s2,&DWP(128-128,$tbl));
-		&pxor	("mm0","mm1");		&pxor	("mm4","mm5");	# ^= tp8<<24
-		&mov	($s3,&DWP(192-128,$tbl));
-		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= tp8>>24
-
-		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");
-	&jmp	(&label("loop"));
-
-	&set_label("out",16);
-	&pxor	("mm0",&QWP(0,$key));
-	&pxor	("mm4",&QWP(8,$key));
-
-	&ret	();
-&function_end_B("_sse_AES_decrypt_compact");
-					}
-
-######################################################################
-# Vanilla block function.
-######################################################################
-
-sub decstep()
-{ my ($i,$td,@s) = @_;
-  my $tmp = $key;
-  my $out = $i==3?$s[0]:$acc;
-
-	# no instructions are reordered, as performance appears
-	# optimal... or rather that all attempts to reorder didn't
-	# result in better performance [which by the way is not a
-	# bit lower than encryption].
-	if($i==3)   {	&mov	($key,$__key);			}
-	else        {	&mov	($out,$s[0]);			}
-			&and	($out,0xFF);
-			&mov	($out,&DWP(0,$td,$out,8));
-
-	if ($i==3)  {	$tmp=$s[1];				}
-			&movz	($tmp,&HB($s[1]));
-			&xor	($out,&DWP(3,$td,$tmp,8));
-
-	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],$acc);		}
-	else        {	&mov	($tmp,$s[2]);			}
-			&shr	($tmp,16);
-			&and	($tmp,0xFF);
-			&xor	($out,&DWP(2,$td,$tmp,8));
-
-	if ($i==3)  {	$tmp=$s[3]; &mov ($s[2],$__s1);		}
-	else        {	&mov	($tmp,$s[3]);			}
-			&shr	($tmp,24);
-			&xor	($out,&DWP(1,$td,$tmp,8));
-	if ($i<2)   {	&mov	(&DWP(4+4*$i,"esp"),$out);	}
-	if ($i==3)  {	&mov	($s[3],$__s0);			}
-			&comment();
-}
-
-sub declast()
-{ my ($i,$td,@s)=@_;
-  my $tmp = $key;
-  my $out = $i==3?$s[0]:$acc;
-
-	if($i==0)   {	&lea	($td,&DWP(2048+128,$td));
-			&mov	($tmp,&DWP(0-128,$td));
-			&mov	($acc,&DWP(32-128,$td));
-			&mov	($tmp,&DWP(64-128,$td));
-			&mov	($acc,&DWP(96-128,$td));
-			&mov	($tmp,&DWP(128-128,$td));
-			&mov	($acc,&DWP(160-128,$td));
-			&mov	($tmp,&DWP(192-128,$td));
-			&mov	($acc,&DWP(224-128,$td));
-			&lea	($td,&DWP(-128,$td));		}
-	if($i==3)   {	&mov	($key,$__key);			}
-	else        {	&mov	($out,$s[0]);			}
-			&and	($out,0xFF);
-			&movz	($out,&BP(0,$td,$out,1));
-
-	if ($i==3)  {	$tmp=$s[1];				}
-			&movz	($tmp,&HB($s[1]));
-			&movz	($tmp,&BP(0,$td,$tmp,1));
-			&shl	($tmp,8);
-			&xor	($out,$tmp);
-
-	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],$acc);		}
-	else        {	mov	($tmp,$s[2]);			}
-			&shr	($tmp,16);
-			&and	($tmp,0xFF);
-			&movz	($tmp,&BP(0,$td,$tmp,1));
-			&shl	($tmp,16);
-			&xor	($out,$tmp);
-
-	if ($i==3)  {	$tmp=$s[3]; &mov ($s[2],$__s1);		}
-	else        {	&mov	($tmp,$s[3]);			}
-			&shr	($tmp,24);
-			&movz	($tmp,&BP(0,$td,$tmp,1));
-			&shl	($tmp,24);
-			&xor	($out,$tmp);
-	if ($i<2)   {	&mov	(&DWP(4+4*$i,"esp"),$out);	}
-	if ($i==3)  {	&mov	($s[3],$__s0);
-			&lea	($td,&DWP(-2048,$td));		}
-}
-
-&function_begin_B("_x86_AES_decrypt");
-	# note that caller is expected to allocate stack frame for me!
-	&mov	($__key,$key);			# save key
-
-	&xor	($s0,&DWP(0,$key));		# xor with key
-	&xor	($s1,&DWP(4,$key));
-	&xor	($s2,&DWP(8,$key));
-	&xor	($s3,&DWP(12,$key));
-
-	&mov	($acc,&DWP(240,$key));		# load key->rounds
-
-	if ($small_footprint) {
-	    &lea	($acc,&DWP(-2,$acc,$acc));
-	    &lea	($acc,&DWP(0,$key,$acc,8));
-	    &mov	($__end,$acc);		# end of key schedule
-	    &set_label("loop",16);
-		&decstep(0,$tbl,$s0,$s3,$s2,$s1);
-		&decstep(1,$tbl,$s1,$s0,$s3,$s2);
-		&decstep(2,$tbl,$s2,$s1,$s0,$s3);
-		&decstep(3,$tbl,$s3,$s2,$s1,$s0);
-		&add	($key,16);		# advance rd_key
-		&xor	($s0,&DWP(0,$key));
-		&xor	($s1,&DWP(4,$key));
-		&xor	($s2,&DWP(8,$key));
-		&xor	($s3,&DWP(12,$key));
-	    &cmp	($key,$__end);
-	    &mov	($__key,$key);
-	    &jb		(&label("loop"));
-	}
-	else {
-	    &cmp	($acc,10);
-	    &jle	(&label("10rounds"));
-	    &cmp	($acc,12);
-	    &jle	(&label("12rounds"));
-
-	&set_label("14rounds",4);
-	    for ($i=1;$i<3;$i++) {
-		&decstep(0,$tbl,$s0,$s3,$s2,$s1);
-		&decstep(1,$tbl,$s1,$s0,$s3,$s2);
-		&decstep(2,$tbl,$s2,$s1,$s0,$s3);
-		&decstep(3,$tbl,$s3,$s2,$s1,$s0);
-		&xor	($s0,&DWP(16*$i+0,$key));
-		&xor	($s1,&DWP(16*$i+4,$key));
-		&xor	($s2,&DWP(16*$i+8,$key));
-		&xor	($s3,&DWP(16*$i+12,$key));
-	    }
-	    &add	($key,32);
-	    &mov	($__key,$key);		# advance rd_key
-	&set_label("12rounds",4);
-	    for ($i=1;$i<3;$i++) {
-		&decstep(0,$tbl,$s0,$s3,$s2,$s1);
-		&decstep(1,$tbl,$s1,$s0,$s3,$s2);
-		&decstep(2,$tbl,$s2,$s1,$s0,$s3);
-		&decstep(3,$tbl,$s3,$s2,$s1,$s0);
-		&xor	($s0,&DWP(16*$i+0,$key));
-		&xor	($s1,&DWP(16*$i+4,$key));
-		&xor	($s2,&DWP(16*$i+8,$key));
-		&xor	($s3,&DWP(16*$i+12,$key));
-	    }
-	    &add	($key,32);
-	    &mov	($__key,$key);		# advance rd_key
-	&set_label("10rounds",4);
-	    for ($i=1;$i<10;$i++) {
-		&decstep(0,$tbl,$s0,$s3,$s2,$s1);
-		&decstep(1,$tbl,$s1,$s0,$s3,$s2);
-		&decstep(2,$tbl,$s2,$s1,$s0,$s3);
-		&decstep(3,$tbl,$s3,$s2,$s1,$s0);
-		&xor	($s0,&DWP(16*$i+0,$key));
-		&xor	($s1,&DWP(16*$i+4,$key));
-		&xor	($s2,&DWP(16*$i+8,$key));
-		&xor	($s3,&DWP(16*$i+12,$key));
-	    }
-	}
-
-	&declast(0,$tbl,$s0,$s3,$s2,$s1);
-	&declast(1,$tbl,$s1,$s0,$s3,$s2);
-	&declast(2,$tbl,$s2,$s1,$s0,$s3);
-	&declast(3,$tbl,$s3,$s2,$s1,$s0);
-
-	&add	($key,$small_footprint?16:160);
-	&xor	($s0,&DWP(0,$key));
-	&xor	($s1,&DWP(4,$key));
-	&xor	($s2,&DWP(8,$key));
-	&xor	($s3,&DWP(12,$key));
-
-	&ret	();
-
-&set_label("AES_Td",64);	# Yes! I keep it in the code segment!
-	&_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
-	&_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
-	&_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
-	&_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
-	&_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
-	&_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
-	&_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
-	&_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
-	&_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
-	&_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
-	&_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
-	&_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
-	&_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
-	&_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
-	&_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
-	&_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
-	&_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
-	&_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
-	&_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
-	&_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
-	&_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
-	&_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
-	&_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
-	&_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
-	&_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
-	&_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
-	&_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
-	&_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
-	&_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
-	&_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
-	&_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
-	&_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
-	&_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
-	&_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
-	&_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
-	&_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
-	&_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
-	&_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
-	&_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
-	&_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
-	&_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
-	&_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
-	&_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
-	&_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
-	&_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
-	&_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
-	&_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
-	&_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
-	&_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
-	&_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
-	&_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
-	&_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
-	&_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
-	&_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
-	&_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
-	&_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
-	&_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
-	&_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
-	&_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
-	&_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
-	&_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
-	&_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
-	&_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
-	&_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
-
-#Td4:	# four copies of Td4 to choose from to avoid L1 aliasing
-	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
-	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
-	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
-	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
-	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
-	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
-	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
-	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
-	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
-	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
-	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
-	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
-	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
-	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
-	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
-	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
-	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
-	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
-	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
-	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
-	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
-	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
-	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
-	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
-	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
-	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
-	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
-	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
-	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
-	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
-	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
-	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
-
-	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
-	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
-	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
-	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
-	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
-	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
-	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
-	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
-	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
-	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
-	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
-	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
-	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
-	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
-	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
-	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
-	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
-	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
-	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
-	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
-	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
-	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
-	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
-	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
-	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
-	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
-	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
-	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
-	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
-	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
-	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
-	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
-
-	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
-	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
-	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
-	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
-	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
-	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
-	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
-	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
-	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
-	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
-	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
-	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
-	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
-	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
-	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
-	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
-	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
-	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
-	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
-	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
-	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
-	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
-	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
-	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
-	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
-	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
-	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
-	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
-	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
-	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
-	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
-	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
-
-	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
-	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
-	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
-	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
-	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
-	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
-	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
-	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
-	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
-	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
-	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
-	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
-	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
-	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
-	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
-	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
-	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
-	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
-	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
-	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
-	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
-	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
-	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
-	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
-	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
-	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
-	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
-	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
-	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
-	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
-	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
-	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
-&function_end_B("_x86_AES_decrypt");
-
-# void aes_nohw_decrypt (const void *inp,void *out,const AES_KEY *key);
-&function_begin("aes_nohw_decrypt");
-	&mov	($acc,&wparam(0));		# load inp
-	&mov	($key,&wparam(2));		# load key
-
-	&mov	($s0,"esp");
-	&sub	("esp",36);
-	&and	("esp",-64);			# align to cache-line
-
-	# place stack frame just "above" the key schedule
-	&lea	($s1,&DWP(-64-63,$key));
-	&sub	($s1,"esp");
-	&neg	($s1);
-	&and	($s1,0x3C0);	# modulo 1024, but aligned to cache-line
-	&sub	("esp",$s1);
-	&add	("esp",4);	# 4 is reserved for caller's return address
-	&mov	($_esp,$s0);	# save stack pointer
-
-	&call   (&label("pic_point"));          # make it PIC!
-	&set_label("pic_point");
-	&blindpop($tbl);
-	&picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only);
-	&lea    ($tbl,&DWP(&label("AES_Td")."-".&label("pic_point"),$tbl));
-
-	# pick Td4 copy which can't "overlap" with stack frame or key schedule
-	&lea	($s1,&DWP(768-4,"esp"));
-	&sub	($s1,$tbl);
-	&and	($s1,0x300);
-	&lea	($tbl,&DWP(2048+128,$tbl,$s1));
-
-					if (!$x86only) {
-	&bt	(&DWP(0,$s0),25);	# check for SSE bit
-	&jnc	(&label("x86"));
-
-	&movq	("mm0",&QWP(0,$acc));
-	&movq	("mm4",&QWP(8,$acc));
-	&call	("_sse_AES_decrypt_compact");
-	&mov	("esp",$_esp);			# restore stack pointer
-	&mov	($acc,&wparam(1));		# load out
-	&movq	(&QWP(0,$acc),"mm0");		# write output data
-	&movq	(&QWP(8,$acc),"mm4");
-	&emms	();
-	&function_end_A();
-					}
-	&set_label("x86",16);
-	&mov	($_tbl,$tbl);
-	&mov	($s0,&DWP(0,$acc));		# load input data
-	&mov	($s1,&DWP(4,$acc));
-	&mov	($s2,&DWP(8,$acc));
-	&mov	($s3,&DWP(12,$acc));
-	&call	("_x86_AES_decrypt_compact");
-	&mov	("esp",$_esp);			# restore stack pointer
-	&mov	($acc,&wparam(1));		# load out
-	&mov	(&DWP(0,$acc),$s0);		# write output data
-	&mov	(&DWP(4,$acc),$s1);
-	&mov	(&DWP(8,$acc),$s2);
-	&mov	(&DWP(12,$acc),$s3);
-&function_end("aes_nohw_decrypt");
-
-# void aes_nohw_cbc_encrypt (const void char *inp, unsigned char *out,
-#			    size_t length, const AES_KEY *key,
-#			    unsigned char *ivp,const int enc);
-{
-# stack frame layout
-#             -4(%esp)		# return address	 0(%esp)
-#              0(%esp)		# s0 backing store	 4(%esp)
-#              4(%esp)		# s1 backing store	 8(%esp)
-#              8(%esp)		# s2 backing store	12(%esp)
-#             12(%esp)		# s3 backing store	16(%esp)
-#             16(%esp)		# key backup		20(%esp)
-#             20(%esp)		# end of key schedule	24(%esp)
-#             24(%esp)		# %ebp backup		28(%esp)
-#             28(%esp)		# %esp backup
-my $_inp=&DWP(32,"esp");	# copy of wparam(0)
-my $_out=&DWP(36,"esp");	# copy of wparam(1)
-my $_len=&DWP(40,"esp");	# copy of wparam(2)
-my $_key=&DWP(44,"esp");	# copy of wparam(3)
-my $_ivp=&DWP(48,"esp");	# copy of wparam(4)
-my $_tmp=&DWP(52,"esp");	# volatile variable
-#
-my $ivec=&DWP(60,"esp");	# ivec[16]
-my $aes_key=&DWP(76,"esp");	# copy of aes_key
-my $mark=&DWP(76+240,"esp");	# copy of aes_key->rounds
-
-&function_begin("aes_nohw_cbc_encrypt");
-	&mov	($s2 eq "ecx"? $s2 : "",&wparam(2));	# load len
-	&cmp	($s2,0);
-	&je	(&label("drop_out"));
-
-	&call   (&label("pic_point"));		# make it PIC!
-	&set_label("pic_point");
-	&blindpop($tbl);
-	&picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only);
-
-	&cmp	(&wparam(5),0);
-	&lea    ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
-	&jne	(&label("picked_te"));
-	&lea	($tbl,&DWP(&label("AES_Td")."-".&label("AES_Te"),$tbl));
-	&set_label("picked_te");
-
-	# one can argue if this is required
-	&pushf	();
-	&cld	();
-
-	&cmp	($s2,$speed_limit);
-	&jb	(&label("slow_way"));
-	&test	($s2,15);
-	&jnz	(&label("slow_way"));
-					if (!$x86only) {
-	&bt	(&DWP(0,$s0),28);	# check for hyper-threading bit
-	&jc	(&label("slow_way"));
-					}
-	# pre-allocate aligned stack frame...
-	&lea	($acc,&DWP(-80-244,"esp"));
-	&and	($acc,-64);
-
-	# ... and make sure it doesn't alias with $tbl modulo 4096
-	&mov	($s0,$tbl);
-	&lea	($s1,&DWP(2048+256,$tbl));
-	&mov	($s3,$acc);
-	&and	($s0,0xfff);		# s = %ebp&0xfff
-	&and	($s1,0xfff);		# e = (%ebp+2048+256)&0xfff
-	&and	($s3,0xfff);		# p = %esp&0xfff
-
-	&cmp	($s3,$s1);		# if (p>=e) %esp =- (p-e);
-	&jb	(&label("tbl_break_out"));
-	&sub	($s3,$s1);
-	&sub	($acc,$s3);
-	&jmp	(&label("tbl_ok"));
-	&set_label("tbl_break_out",4);	# else %esp -= (p-s)&0xfff + framesz;
-	&sub	($s3,$s0);
-	&and	($s3,0xfff);
-	&add	($s3,384);
-	&sub	($acc,$s3);
-	&set_label("tbl_ok",4);
-
-	&lea	($s3,&wparam(0));	# obtain pointer to parameter block
-	&exch	("esp",$acc);		# allocate stack frame
-	&add	("esp",4);		# reserve for return address!
-	&mov	($_tbl,$tbl);		# save %ebp
-	&mov	($_esp,$acc);		# save %esp
-
-	&mov	($s0,&DWP(0,$s3));	# load inp
-	&mov	($s1,&DWP(4,$s3));	# load out
-	#&mov	($s2,&DWP(8,$s3));	# load len
-	&mov	($key,&DWP(12,$s3));	# load key
-	&mov	($acc,&DWP(16,$s3));	# load ivp
-	&mov	($s3,&DWP(20,$s3));	# load enc flag
-
-	&mov	($_inp,$s0);		# save copy of inp
-	&mov	($_out,$s1);		# save copy of out
-	&mov	($_len,$s2);		# save copy of len
-	&mov	($_key,$key);		# save copy of key
-	&mov	($_ivp,$acc);		# save copy of ivp
-
-	&mov	($mark,0);		# copy of aes_key->rounds = 0;
-	# do we copy key schedule to stack?
-	&mov	($s1 eq "ebx" ? $s1 : "",$key);
-	&mov	($s2 eq "ecx" ? $s2 : "",244/4);
-	&sub	($s1,$tbl);
-	&mov	("esi",$key);
-	&and	($s1,0xfff);
-	&lea	("edi",$aes_key);
-	&cmp	($s1,2048+256);
-	&jb	(&label("do_copy"));
-	&cmp	($s1,4096-244);
-	&jb	(&label("skip_copy"));
-	&set_label("do_copy",4);
-		&mov	($_key,"edi");
-		&data_word(0xA5F3F689);	# rep movsd
-	&set_label("skip_copy");
-
-	&mov	($key,16);
-	&set_label("prefetch_tbl",4);
-		&mov	($s0,&DWP(0,$tbl));
-		&mov	($s1,&DWP(32,$tbl));
-		&mov	($s2,&DWP(64,$tbl));
-		&mov	($acc,&DWP(96,$tbl));
-		&lea	($tbl,&DWP(128,$tbl));
-		&sub	($key,1);
-	&jnz	(&label("prefetch_tbl"));
-	&sub	($tbl,2048);
-
-	&mov	($acc,$_inp);
-	&mov	($key,$_ivp);
-
-	&cmp	($s3,0);
-	&je	(&label("fast_decrypt"));
-
-#----------------------------- ENCRYPT -----------------------------#
-	&mov	($s0,&DWP(0,$key));		# load iv
-	&mov	($s1,&DWP(4,$key));
-
-	&set_label("fast_enc_loop",16);
-		&mov	($s2,&DWP(8,$key));
-		&mov	($s3,&DWP(12,$key));
-
-		&xor	($s0,&DWP(0,$acc));	# xor input data
-		&xor	($s1,&DWP(4,$acc));
-		&xor	($s2,&DWP(8,$acc));
-		&xor	($s3,&DWP(12,$acc));
-
-		&mov	($key,$_key);		# load key
-		&call	("_x86_AES_encrypt");
-
-		&mov	($acc,$_inp);		# load inp
-		&mov	($key,$_out);		# load out
-
-		&mov	(&DWP(0,$key),$s0);	# save output data
-		&mov	(&DWP(4,$key),$s1);
-		&mov	(&DWP(8,$key),$s2);
-		&mov	(&DWP(12,$key),$s3);
-
-		&lea	($acc,&DWP(16,$acc));	# advance inp
-		&mov	($s2,$_len);		# load len
-		&mov	($_inp,$acc);		# save inp
-		&lea	($s3,&DWP(16,$key));	# advance out
-		&mov	($_out,$s3);		# save out
-		&sub	($s2,16);		# decrease len
-		&mov	($_len,$s2);		# save len
-	&jnz	(&label("fast_enc_loop"));
-	&mov	($acc,$_ivp);		# load ivp
-	&mov	($s2,&DWP(8,$key));	# restore last 2 dwords
-	&mov	($s3,&DWP(12,$key));
-	&mov	(&DWP(0,$acc),$s0);	# save ivec
-	&mov	(&DWP(4,$acc),$s1);
-	&mov	(&DWP(8,$acc),$s2);
-	&mov	(&DWP(12,$acc),$s3);
-
-	&cmp	($mark,0);		# was the key schedule copied?
-	&mov	("edi",$_key);
-	&je	(&label("skip_ezero"));
-	# zero copy of key schedule
-	&mov	("ecx",240/4);
-	&xor	("eax","eax");
-	&align	(4);
-	&data_word(0xABF3F689);		# rep stosd
-	&set_label("skip_ezero");
-	&mov	("esp",$_esp);
-	&popf	();
-    &set_label("drop_out");
-	&function_end_A();
-	&pushf	();			# kludge, never executed
-
-#----------------------------- DECRYPT -----------------------------#
-&set_label("fast_decrypt",16);
-
-	&cmp	($acc,$_out);
-	&je	(&label("fast_dec_in_place"));	# in-place processing...
-
-	&mov	($_tmp,$key);
-
-	&align	(4);
-	&set_label("fast_dec_loop",16);
-		&mov	($s0,&DWP(0,$acc));	# read input
-		&mov	($s1,&DWP(4,$acc));
-		&mov	($s2,&DWP(8,$acc));
-		&mov	($s3,&DWP(12,$acc));
-
-		&mov	($key,$_key);		# load key
-		&call	("_x86_AES_decrypt");
-
-		&mov	($key,$_tmp);		# load ivp
-		&mov	($acc,$_len);		# load len
-		&xor	($s0,&DWP(0,$key));	# xor iv
-		&xor	($s1,&DWP(4,$key));
-		&xor	($s2,&DWP(8,$key));
-		&xor	($s3,&DWP(12,$key));
-
-		&mov	($key,$_out);		# load out
-		&mov	($acc,$_inp);		# load inp
-
-		&mov	(&DWP(0,$key),$s0);	# write output
-		&mov	(&DWP(4,$key),$s1);
-		&mov	(&DWP(8,$key),$s2);
-		&mov	(&DWP(12,$key),$s3);
-
-		&mov	($s2,$_len);		# load len
-		&mov	($_tmp,$acc);		# save ivp
-		&lea	($acc,&DWP(16,$acc));	# advance inp
-		&mov	($_inp,$acc);		# save inp
-		&lea	($key,&DWP(16,$key));	# advance out
-		&mov	($_out,$key);		# save out
-		&sub	($s2,16);		# decrease len
-		&mov	($_len,$s2);		# save len
-	&jnz	(&label("fast_dec_loop"));
-	&mov	($key,$_tmp);		# load temp ivp
-	&mov	($acc,$_ivp);		# load user ivp
-	&mov	($s0,&DWP(0,$key));	# load iv
-	&mov	($s1,&DWP(4,$key));
-	&mov	($s2,&DWP(8,$key));
-	&mov	($s3,&DWP(12,$key));
-	&mov	(&DWP(0,$acc),$s0);	# copy back to user
-	&mov	(&DWP(4,$acc),$s1);
-	&mov	(&DWP(8,$acc),$s2);
-	&mov	(&DWP(12,$acc),$s3);
-	&jmp	(&label("fast_dec_out"));
-
-    &set_label("fast_dec_in_place",16);
-	&set_label("fast_dec_in_place_loop");
-		&mov	($s0,&DWP(0,$acc));	# read input
-		&mov	($s1,&DWP(4,$acc));
-		&mov	($s2,&DWP(8,$acc));
-		&mov	($s3,&DWP(12,$acc));
-
-		&lea	($key,$ivec);
-		&mov	(&DWP(0,$key),$s0);	# copy to temp
-		&mov	(&DWP(4,$key),$s1);
-		&mov	(&DWP(8,$key),$s2);
-		&mov	(&DWP(12,$key),$s3);
-
-		&mov	($key,$_key);		# load key
-		&call	("_x86_AES_decrypt");
-
-		&mov	($key,$_ivp);		# load ivp
-		&mov	($acc,$_out);		# load out
-		&xor	($s0,&DWP(0,$key));	# xor iv
-		&xor	($s1,&DWP(4,$key));
-		&xor	($s2,&DWP(8,$key));
-		&xor	($s3,&DWP(12,$key));
-
-		&mov	(&DWP(0,$acc),$s0);	# write output
-		&mov	(&DWP(4,$acc),$s1);
-		&mov	(&DWP(8,$acc),$s2);
-		&mov	(&DWP(12,$acc),$s3);
-
-		&lea	($acc,&DWP(16,$acc));	# advance out
-		&mov	($_out,$acc);		# save out
-
-		&lea	($acc,$ivec);
-		&mov	($s0,&DWP(0,$acc));	# read temp
-		&mov	($s1,&DWP(4,$acc));
-		&mov	($s2,&DWP(8,$acc));
-		&mov	($s3,&DWP(12,$acc));
-
-		&mov	(&DWP(0,$key),$s0);	# copy iv
-		&mov	(&DWP(4,$key),$s1);
-		&mov	(&DWP(8,$key),$s2);
-		&mov	(&DWP(12,$key),$s3);
-
-		&mov	($acc,$_inp);		# load inp
-		&mov	($s2,$_len);		# load len
-		&lea	($acc,&DWP(16,$acc));	# advance inp
-		&mov	($_inp,$acc);		# save inp
-		&sub	($s2,16);		# decrease len
-		&mov	($_len,$s2);		# save len
-	&jnz	(&label("fast_dec_in_place_loop"));
-
-    &set_label("fast_dec_out",4);
-	&cmp	($mark,0);		# was the key schedule copied?
-	&mov	("edi",$_key);
-	&je	(&label("skip_dzero"));
-	# zero copy of key schedule
-	&mov	("ecx",240/4);
-	&xor	("eax","eax");
-	&align	(4);
-	&data_word(0xABF3F689);		# rep stosd
-	&set_label("skip_dzero");
-	&mov	("esp",$_esp);
-	&popf	();
-	&function_end_A();
-	&pushf	();			# kludge, never executed
-
-#--------------------------- SLOW ROUTINE ---------------------------#
-&set_label("slow_way",16);
-
-	&mov	($s0,&DWP(0,$s0)) if (!$x86only);# load OPENSSL_ia32cap
-	&mov	($key,&wparam(3));	# load key
-
-	# pre-allocate aligned stack frame...
-	&lea	($acc,&DWP(-80,"esp"));
-	&and	($acc,-64);
-
-	# ... and make sure it doesn't alias with $key modulo 1024
-	&lea	($s1,&DWP(-80-63,$key));
-	&sub	($s1,$acc);
-	&neg	($s1);
-	&and	($s1,0x3C0);	# modulo 1024, but aligned to cache-line
-	&sub	($acc,$s1);
-
-	# pick S-box copy which can't overlap with stack frame or $key
-	&lea	($s1,&DWP(768,$acc));
-	&sub	($s1,$tbl);
-	&and	($s1,0x300);
-	&lea	($tbl,&DWP(2048+128,$tbl,$s1));
-
-	&lea	($s3,&wparam(0));	# pointer to parameter block
-
-	&exch	("esp",$acc);
-	&add	("esp",4);		# reserve for return address!
-	&mov	($_tbl,$tbl);		# save %ebp
-	&mov	($_esp,$acc);		# save %esp
-	&mov	($_tmp,$s0);		# save OPENSSL_ia32cap
-
-	&mov	($s0,&DWP(0,$s3));	# load inp
-	&mov	($s1,&DWP(4,$s3));	# load out
-	#&mov	($s2,&DWP(8,$s3));	# load len
-	#&mov	($key,&DWP(12,$s3));	# load key
-	&mov	($acc,&DWP(16,$s3));	# load ivp
-	&mov	($s3,&DWP(20,$s3));	# load enc flag
-
-	&mov	($_inp,$s0);		# save copy of inp
-	&mov	($_out,$s1);		# save copy of out
-	&mov	($_len,$s2);		# save copy of len
-	&mov	($_key,$key);		# save copy of key
-	&mov	($_ivp,$acc);		# save copy of ivp
-
-	&mov	($key,$acc);
-	&mov	($acc,$s0);
-
-	&cmp	($s3,0);
-	&je	(&label("slow_decrypt"));
-
-#--------------------------- SLOW ENCRYPT ---------------------------#
-	&cmp	($s2,16);
-	&mov	($s3,$s1);
-	&jb	(&label("slow_enc_tail"));
-
-					if (!$x86only) {
-	&bt	($_tmp,25);		# check for SSE bit
-	&jnc	(&label("slow_enc_x86"));
-
-	&movq	("mm0",&QWP(0,$key));	# load iv
-	&movq	("mm4",&QWP(8,$key));
-
-	&set_label("slow_enc_loop_sse",16);
-		&pxor	("mm0",&QWP(0,$acc));	# xor input data
-		&pxor	("mm4",&QWP(8,$acc));
-
-		&mov	($key,$_key);
-		&call	("_sse_AES_encrypt_compact");
-
-		&mov	($acc,$_inp);		# load inp
-		&mov	($key,$_out);		# load out
-		&mov	($s2,$_len);		# load len
-
-		&movq	(&QWP(0,$key),"mm0");	# save output data
-		&movq	(&QWP(8,$key),"mm4");
-
-		&lea	($acc,&DWP(16,$acc));	# advance inp
-		&mov	($_inp,$acc);		# save inp
-		&lea	($s3,&DWP(16,$key));	# advance out
-		&mov	($_out,$s3);		# save out
-		&sub	($s2,16);		# decrease len
-		&cmp	($s2,16);
-		&mov	($_len,$s2);		# save len
-	&jae	(&label("slow_enc_loop_sse"));
-	&test	($s2,15);
-	&jnz	(&label("slow_enc_tail"));
-	&mov	($acc,$_ivp);		# load ivp
-	&movq	(&QWP(0,$acc),"mm0");	# save ivec
-	&movq	(&QWP(8,$acc),"mm4");
-	&emms	();
-	&mov	("esp",$_esp);
-	&popf	();
-	&function_end_A();
-	&pushf	();			# kludge, never executed
-					}
-    &set_label("slow_enc_x86",16);
-	&mov	($s0,&DWP(0,$key));	# load iv
-	&mov	($s1,&DWP(4,$key));
-
-	&set_label("slow_enc_loop_x86",4);
-		&mov	($s2,&DWP(8,$key));
-		&mov	($s3,&DWP(12,$key));
-
-		&xor	($s0,&DWP(0,$acc));	# xor input data
-		&xor	($s1,&DWP(4,$acc));
-		&xor	($s2,&DWP(8,$acc));
-		&xor	($s3,&DWP(12,$acc));
-
-		&mov	($key,$_key);		# load key
-		&call	("_x86_AES_encrypt_compact");
-
-		&mov	($acc,$_inp);		# load inp
-		&mov	($key,$_out);		# load out
-
-		&mov	(&DWP(0,$key),$s0);	# save output data
-		&mov	(&DWP(4,$key),$s1);
-		&mov	(&DWP(8,$key),$s2);
-		&mov	(&DWP(12,$key),$s3);
-
-		&mov	($s2,$_len);		# load len
-		&lea	($acc,&DWP(16,$acc));	# advance inp
-		&mov	($_inp,$acc);		# save inp
-		&lea	($s3,&DWP(16,$key));	# advance out
-		&mov	($_out,$s3);		# save out
-		&sub	($s2,16);		# decrease len
-		&cmp	($s2,16);
-		&mov	($_len,$s2);		# save len
-	&jae	(&label("slow_enc_loop_x86"));
-	&test	($s2,15);
-	&jnz	(&label("slow_enc_tail"));
-	&mov	($acc,$_ivp);		# load ivp
-	&mov	($s2,&DWP(8,$key));	# restore last dwords
-	&mov	($s3,&DWP(12,$key));
-	&mov	(&DWP(0,$acc),$s0);	# save ivec
-	&mov	(&DWP(4,$acc),$s1);
-	&mov	(&DWP(8,$acc),$s2);
-	&mov	(&DWP(12,$acc),$s3);
-
-	&mov	("esp",$_esp);
-	&popf	();
-	&function_end_A();
-	&pushf	();			# kludge, never executed
-
-    &set_label("slow_enc_tail",16);
-	&emms	()	if (!$x86only);
-	&mov	($key eq "edi"? $key:"",$s3);	# load out to edi
-	&mov	($s1,16);
-	&sub	($s1,$s2);
-	&cmp	($key,$acc eq "esi"? $acc:"");	# compare with inp
-	&je	(&label("enc_in_place"));
-	&align	(4);
-	&data_word(0xA4F3F689);	# rep movsb	# copy input
-	&jmp	(&label("enc_skip_in_place"));
-    &set_label("enc_in_place");
-	&lea	($key,&DWP(0,$key,$s2));
-    &set_label("enc_skip_in_place");
-	&mov	($s2,$s1);
-	&xor	($s0,$s0);
-	&align	(4);
-	&data_word(0xAAF3F689);	# rep stosb	# zero tail
-
-	&mov	($key,$_ivp);			# restore ivp
-	&mov	($acc,$s3);			# output as input
-	&mov	($s0,&DWP(0,$key));
-	&mov	($s1,&DWP(4,$key));
-	&mov	($_len,16);			# len=16
-	&jmp	(&label("slow_enc_loop_x86"));	# one more spin...
-
-#--------------------------- SLOW DECRYPT ---------------------------#
-&set_label("slow_decrypt",16);
-					if (!$x86only) {
-	&bt	($_tmp,25);		# check for SSE bit
-	&jnc	(&label("slow_dec_loop_x86"));
-
-	&set_label("slow_dec_loop_sse",4);
-		&movq	("mm0",&QWP(0,$acc));	# read input
-		&movq	("mm4",&QWP(8,$acc));
-
-		&mov	($key,$_key);
-		&call	("_sse_AES_decrypt_compact");
-
-		&mov	($acc,$_inp);		# load inp
-		&lea	($s0,$ivec);
-		&mov	($s1,$_out);		# load out
-		&mov	($s2,$_len);		# load len
-		&mov	($key,$_ivp);		# load ivp
-
-		&movq	("mm1",&QWP(0,$acc));	# re-read input
-		&movq	("mm5",&QWP(8,$acc));
-
-		&pxor	("mm0",&QWP(0,$key));	# xor iv
-		&pxor	("mm4",&QWP(8,$key));
-
-		&movq	(&QWP(0,$key),"mm1");	# copy input to iv
-		&movq	(&QWP(8,$key),"mm5");
-
-		&sub	($s2,16);		# decrease len
-		&jc	(&label("slow_dec_partial_sse"));
-
-		&movq	(&QWP(0,$s1),"mm0");	# write output
-		&movq	(&QWP(8,$s1),"mm4");
-
-		&lea	($s1,&DWP(16,$s1));	# advance out
-		&mov	($_out,$s1);		# save out
-		&lea	($acc,&DWP(16,$acc));	# advance inp
-		&mov	($_inp,$acc);		# save inp
-		&mov	($_len,$s2);		# save len
-	&jnz	(&label("slow_dec_loop_sse"));
-	&emms	();
-	&mov	("esp",$_esp);
-	&popf	();
-	&function_end_A();
-	&pushf	();			# kludge, never executed
-
-    &set_label("slow_dec_partial_sse",16);
-	&movq	(&QWP(0,$s0),"mm0");	# save output to temp
-	&movq	(&QWP(8,$s0),"mm4");
-	&emms	();
-
-	&add	($s2 eq "ecx" ? "ecx":"",16);
-	&mov	("edi",$s1);		# out
-	&mov	("esi",$s0);		# temp
-	&align	(4);
-	&data_word(0xA4F3F689);		# rep movsb # copy partial output
-
-	&mov	("esp",$_esp);
-	&popf	();
-	&function_end_A();
-	&pushf	();			# kludge, never executed
-					}
-	&set_label("slow_dec_loop_x86",16);
-		&mov	($s0,&DWP(0,$acc));	# read input
-		&mov	($s1,&DWP(4,$acc));
-		&mov	($s2,&DWP(8,$acc));
-		&mov	($s3,&DWP(12,$acc));
-
-		&lea	($key,$ivec);
-		&mov	(&DWP(0,$key),$s0);	# copy to temp
-		&mov	(&DWP(4,$key),$s1);
-		&mov	(&DWP(8,$key),$s2);
-		&mov	(&DWP(12,$key),$s3);
-
-		&mov	($key,$_key);		# load key
-		&call	("_x86_AES_decrypt_compact");
-
-		&mov	($key,$_ivp);		# load ivp
-		&mov	($acc,$_len);		# load len
-		&xor	($s0,&DWP(0,$key));	# xor iv
-		&xor	($s1,&DWP(4,$key));
-		&xor	($s2,&DWP(8,$key));
-		&xor	($s3,&DWP(12,$key));
-
-		&sub	($acc,16);
-		&jc	(&label("slow_dec_partial_x86"));
-
-		&mov	($_len,$acc);		# save len
-		&mov	($acc,$_out);		# load out
-
-		&mov	(&DWP(0,$acc),$s0);	# write output
-		&mov	(&DWP(4,$acc),$s1);
-		&mov	(&DWP(8,$acc),$s2);
-		&mov	(&DWP(12,$acc),$s3);
-
-		&lea	($acc,&DWP(16,$acc));	# advance out
-		&mov	($_out,$acc);		# save out
-
-		&lea	($acc,$ivec);
-		&mov	($s0,&DWP(0,$acc));	# read temp
-		&mov	($s1,&DWP(4,$acc));
-		&mov	($s2,&DWP(8,$acc));
-		&mov	($s3,&DWP(12,$acc));
-
-		&mov	(&DWP(0,$key),$s0);	# copy it to iv
-		&mov	(&DWP(4,$key),$s1);
-		&mov	(&DWP(8,$key),$s2);
-		&mov	(&DWP(12,$key),$s3);
-
-		&mov	($acc,$_inp);		# load inp
-		&lea	($acc,&DWP(16,$acc));	# advance inp
-		&mov	($_inp,$acc);		# save inp
-	&jnz	(&label("slow_dec_loop_x86"));
-	&mov	("esp",$_esp);
-	&popf	();
-	&function_end_A();
-	&pushf	();			# kludge, never executed
-
-    &set_label("slow_dec_partial_x86",16);
-	&lea	($acc,$ivec);
-	&mov	(&DWP(0,$acc),$s0);	# save output to temp
-	&mov	(&DWP(4,$acc),$s1);
-	&mov	(&DWP(8,$acc),$s2);
-	&mov	(&DWP(12,$acc),$s3);
-
-	&mov	($acc,$_inp);
-	&mov	($s0,&DWP(0,$acc));	# re-read input
-	&mov	($s1,&DWP(4,$acc));
-	&mov	($s2,&DWP(8,$acc));
-	&mov	($s3,&DWP(12,$acc));
-
-	&mov	(&DWP(0,$key),$s0);	# copy it to iv
-	&mov	(&DWP(4,$key),$s1);
-	&mov	(&DWP(8,$key),$s2);
-	&mov	(&DWP(12,$key),$s3);
-
-	&mov	("ecx",$_len);
-	&mov	("edi",$_out);
-	&lea	("esi",$ivec);
-	&align	(4);
-	&data_word(0xA4F3F689);		# rep movsb # copy partial output
-
-	&mov	("esp",$_esp);
-	&popf	();
-&function_end("aes_nohw_cbc_encrypt");
-}
-
-#------------------------------------------------------------------#
-
-sub enckey()
-{
-	&movz	("esi",&LB("edx"));		# rk[i]>>0
-	&movz	("ebx",&BP(-128,$tbl,"esi",1));
-	&movz	("esi",&HB("edx"));		# rk[i]>>8
-	&shl	("ebx",24);
-	&xor	("eax","ebx");
-
-	&movz	("ebx",&BP(-128,$tbl,"esi",1));
-	&shr	("edx",16);
-	&movz	("esi",&LB("edx"));		# rk[i]>>16
-	&xor	("eax","ebx");
-
-	&movz	("ebx",&BP(-128,$tbl,"esi",1));
-	&movz	("esi",&HB("edx"));		# rk[i]>>24
-	&shl	("ebx",8);
-	&xor	("eax","ebx");
-
-	&movz	("ebx",&BP(-128,$tbl,"esi",1));
-	&shl	("ebx",16);
-	&xor	("eax","ebx");
-
-	&xor	("eax",&DWP(1024-128,$tbl,"ecx",4));	# rcon
-}
-
-&function_begin("_x86_AES_set_encrypt_key");
-	&mov	("esi",&wparam(1));		# user supplied key
-	&mov	("edi",&wparam(3));		# private key schedule
-
-	&test	("esi",-1);
-	&jz	(&label("badpointer"));
-	&test	("edi",-1);
-	&jz	(&label("badpointer"));
-
-	&call	(&label("pic_point"));
-	&set_label("pic_point");
-	&blindpop($tbl);
-	&lea	($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
-	&lea	($tbl,&DWP(2048+128,$tbl));
-
-	# prefetch Te4
-	&mov	("eax",&DWP(0-128,$tbl));
-	&mov	("ebx",&DWP(32-128,$tbl));
-	&mov	("ecx",&DWP(64-128,$tbl));
-	&mov	("edx",&DWP(96-128,$tbl));
-	&mov	("eax",&DWP(128-128,$tbl));
-	&mov	("ebx",&DWP(160-128,$tbl));
-	&mov	("ecx",&DWP(192-128,$tbl));
-	&mov	("edx",&DWP(224-128,$tbl));
-
-	&mov	("ecx",&wparam(2));		# number of bits in key
-	&cmp	("ecx",128);
-	&je	(&label("10rounds"));
-	&cmp	("ecx",192);
-	&je	(&label("12rounds"));
-	&cmp	("ecx",256);
-	&je	(&label("14rounds"));
-	&mov	("eax",-2);			# invalid number of bits
-	&jmp	(&label("exit"));
-
-    &set_label("10rounds");
-	&mov	("eax",&DWP(0,"esi"));		# copy first 4 dwords
-	&mov	("ebx",&DWP(4,"esi"));
-	&mov	("ecx",&DWP(8,"esi"));
-	&mov	("edx",&DWP(12,"esi"));
-	&mov	(&DWP(0,"edi"),"eax");
-	&mov	(&DWP(4,"edi"),"ebx");
-	&mov	(&DWP(8,"edi"),"ecx");
-	&mov	(&DWP(12,"edi"),"edx");
-
-	&xor	("ecx","ecx");
-	&jmp	(&label("10shortcut"));
-
-	&align	(4);
-	&set_label("10loop");
-		&mov	("eax",&DWP(0,"edi"));		# rk[0]
-		&mov	("edx",&DWP(12,"edi"));		# rk[3]
-	&set_label("10shortcut");
-		&enckey	();
-
-		&mov	(&DWP(16,"edi"),"eax");		# rk[4]
-		&xor	("eax",&DWP(4,"edi"));
-		&mov	(&DWP(20,"edi"),"eax");		# rk[5]
-		&xor	("eax",&DWP(8,"edi"));
-		&mov	(&DWP(24,"edi"),"eax");		# rk[6]
-		&xor	("eax",&DWP(12,"edi"));
-		&mov	(&DWP(28,"edi"),"eax");		# rk[7]
-		&inc	("ecx");
-		&add	("edi",16);
-		&cmp	("ecx",10);
-	&jl	(&label("10loop"));
-
-	&mov	(&DWP(80,"edi"),10);		# setup number of rounds
-	&xor	("eax","eax");
-	&jmp	(&label("exit"));
-
-    &set_label("12rounds");
-	&mov	("eax",&DWP(0,"esi"));		# copy first 6 dwords
-	&mov	("ebx",&DWP(4,"esi"));
-	&mov	("ecx",&DWP(8,"esi"));
-	&mov	("edx",&DWP(12,"esi"));
-	&mov	(&DWP(0,"edi"),"eax");
-	&mov	(&DWP(4,"edi"),"ebx");
-	&mov	(&DWP(8,"edi"),"ecx");
-	&mov	(&DWP(12,"edi"),"edx");
-	&mov	("ecx",&DWP(16,"esi"));
-	&mov	("edx",&DWP(20,"esi"));
-	&mov	(&DWP(16,"edi"),"ecx");
-	&mov	(&DWP(20,"edi"),"edx");
-
-	&xor	("ecx","ecx");
-	&jmp	(&label("12shortcut"));
-
-	&align	(4);
-	&set_label("12loop");
-		&mov	("eax",&DWP(0,"edi"));		# rk[0]
-		&mov	("edx",&DWP(20,"edi"));		# rk[5]
-	&set_label("12shortcut");
-		&enckey	();
-
-		&mov	(&DWP(24,"edi"),"eax");		# rk[6]
-		&xor	("eax",&DWP(4,"edi"));
-		&mov	(&DWP(28,"edi"),"eax");		# rk[7]
-		&xor	("eax",&DWP(8,"edi"));
-		&mov	(&DWP(32,"edi"),"eax");		# rk[8]
-		&xor	("eax",&DWP(12,"edi"));
-		&mov	(&DWP(36,"edi"),"eax");		# rk[9]
-
-		&cmp	("ecx",7);
-		&je	(&label("12break"));
-		&inc	("ecx");
-
-		&xor	("eax",&DWP(16,"edi"));
-		&mov	(&DWP(40,"edi"),"eax");		# rk[10]
-		&xor	("eax",&DWP(20,"edi"));
-		&mov	(&DWP(44,"edi"),"eax");		# rk[11]
-
-		&add	("edi",24);
-	&jmp	(&label("12loop"));
-
-	&set_label("12break");
-	&mov	(&DWP(72,"edi"),12);		# setup number of rounds
-	&xor	("eax","eax");
-	&jmp	(&label("exit"));
-
-    &set_label("14rounds");
-	&mov	("eax",&DWP(0,"esi"));		# copy first 8 dwords
-	&mov	("ebx",&DWP(4,"esi"));
-	&mov	("ecx",&DWP(8,"esi"));
-	&mov	("edx",&DWP(12,"esi"));
-	&mov	(&DWP(0,"edi"),"eax");
-	&mov	(&DWP(4,"edi"),"ebx");
-	&mov	(&DWP(8,"edi"),"ecx");
-	&mov	(&DWP(12,"edi"),"edx");
-	&mov	("eax",&DWP(16,"esi"));
-	&mov	("ebx",&DWP(20,"esi"));
-	&mov	("ecx",&DWP(24,"esi"));
-	&mov	("edx",&DWP(28,"esi"));
-	&mov	(&DWP(16,"edi"),"eax");
-	&mov	(&DWP(20,"edi"),"ebx");
-	&mov	(&DWP(24,"edi"),"ecx");
-	&mov	(&DWP(28,"edi"),"edx");
-
-	&xor	("ecx","ecx");
-	&jmp	(&label("14shortcut"));
-
-	&align	(4);
-	&set_label("14loop");
-		&mov	("edx",&DWP(28,"edi"));		# rk[7]
-	&set_label("14shortcut");
-		&mov	("eax",&DWP(0,"edi"));		# rk[0]
-
-		&enckey	();
-
-		&mov	(&DWP(32,"edi"),"eax");		# rk[8]
-		&xor	("eax",&DWP(4,"edi"));
-		&mov	(&DWP(36,"edi"),"eax");		# rk[9]
-		&xor	("eax",&DWP(8,"edi"));
-		&mov	(&DWP(40,"edi"),"eax");		# rk[10]
-		&xor	("eax",&DWP(12,"edi"));
-		&mov	(&DWP(44,"edi"),"eax");		# rk[11]
-
-		&cmp	("ecx",6);
-		&je	(&label("14break"));
-		&inc	("ecx");
-
-		&mov	("edx","eax");
-		&mov	("eax",&DWP(16,"edi"));		# rk[4]
-		&movz	("esi",&LB("edx"));		# rk[11]>>0
-		&movz	("ebx",&BP(-128,$tbl,"esi",1));
-		&movz	("esi",&HB("edx"));		# rk[11]>>8
-		&xor	("eax","ebx");
-
-		&movz	("ebx",&BP(-128,$tbl,"esi",1));
-		&shr	("edx",16);
-		&shl	("ebx",8);
-		&movz	("esi",&LB("edx"));		# rk[11]>>16
-		&xor	("eax","ebx");
-
-		&movz	("ebx",&BP(-128,$tbl,"esi",1));
-		&movz	("esi",&HB("edx"));		# rk[11]>>24
-		&shl	("ebx",16);
-		&xor	("eax","ebx");
-
-		&movz	("ebx",&BP(-128,$tbl,"esi",1));
-		&shl	("ebx",24);
-		&xor	("eax","ebx");
-
-		&mov	(&DWP(48,"edi"),"eax");		# rk[12]
-		&xor	("eax",&DWP(20,"edi"));
-		&mov	(&DWP(52,"edi"),"eax");		# rk[13]
-		&xor	("eax",&DWP(24,"edi"));
-		&mov	(&DWP(56,"edi"),"eax");		# rk[14]
-		&xor	("eax",&DWP(28,"edi"));
-		&mov	(&DWP(60,"edi"),"eax");		# rk[15]
-
-		&add	("edi",32);
-	&jmp	(&label("14loop"));
-
-	&set_label("14break");
-	&mov	(&DWP(48,"edi"),14);		# setup number of rounds
-	&xor	("eax","eax");
-	&jmp	(&label("exit"));
-
-    &set_label("badpointer");
-	&mov	("eax",-1);
-    &set_label("exit");
-&function_end("_x86_AES_set_encrypt_key");
-
-# int aes_nohw_set_encrypt_key(const unsigned char *userKey, const int bits,
-#                              AES_KEY *key)
-&function_begin_B("aes_nohw_set_encrypt_key");
-	&call	("_x86_AES_set_encrypt_key");
-	&ret	();
-&function_end_B("aes_nohw_set_encrypt_key");
-
-sub deckey()
-{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
-  my $tmp = $tbl;
-
-	&mov	($tmp,0x80808080);
-	&and	($tmp,$tp1);
-	&lea	($tp2,&DWP(0,$tp1,$tp1));
-	&mov	($acc,$tmp);
-	&shr	($tmp,7);
-	&sub	($acc,$tmp);
-	&and	($tp2,0xfefefefe);
-	&and	($acc,0x1b1b1b1b);
-	&xor	($tp2,$acc);
-	&mov	($tmp,0x80808080);
-
-	&and	($tmp,$tp2);
-	&lea	($tp4,&DWP(0,$tp2,$tp2));
-	&mov	($acc,$tmp);
-	&shr	($tmp,7);
-	&sub	($acc,$tmp);
-	&and	($tp4,0xfefefefe);
-	&and	($acc,0x1b1b1b1b);
-	 &xor	($tp2,$tp1);	# tp2^tp1
-	&xor	($tp4,$acc);
-	&mov	($tmp,0x80808080);
-
-	&and	($tmp,$tp4);
-	&lea	($tp8,&DWP(0,$tp4,$tp4));
-	&mov	($acc,$tmp);
-	&shr	($tmp,7);
-	 &xor	($tp4,$tp1);	# tp4^tp1
-	&sub	($acc,$tmp);
-	&and	($tp8,0xfefefefe);
-	&and	($acc,0x1b1b1b1b);
-	 &rotl	($tp1,8);	# = ROTATE(tp1,8)
-	&xor	($tp8,$acc);
-
-	&mov	($tmp,&DWP(4*($i+1),$key));	# modulo-scheduled load
-
-	&xor	($tp1,$tp2);
-	&xor	($tp2,$tp8);
-	&xor	($tp1,$tp4);
-	&rotl	($tp2,24);
-	&xor	($tp4,$tp8);
-	&xor	($tp1,$tp8);	# ^= tp8^(tp4^tp1)^(tp2^tp1)
-	&rotl	($tp4,16);
-	&xor	($tp1,$tp2);	# ^= ROTATE(tp8^tp2^tp1,24)
-	&rotl	($tp8,8);
-	&xor	($tp1,$tp4);	# ^= ROTATE(tp8^tp4^tp1,16)
-	&mov	($tp2,$tmp);
-	&xor	($tp1,$tp8);	# ^= ROTATE(tp8,8)
-
-	&mov	(&DWP(4*$i,$key),$tp1);
-}
-
-# int aes_nohw_set_decrypt_key(const unsigned char *userKey, const int bits,
-#                              AES_KEY *key)
-&function_begin_B("aes_nohw_set_decrypt_key");
-	&call	("_x86_AES_set_encrypt_key");
-	&cmp	("eax",0);
-	&je	(&label("proceed"));
-	&ret	();
-
-    &set_label("proceed");
-	&push	("ebp");
-	&push	("ebx");
-	&push	("esi");
-	&push	("edi");
-
-	&mov	("esi",&wparam(2));
-	&mov	("ecx",&DWP(240,"esi"));	# pull number of rounds
-	&lea	("ecx",&DWP(0,"","ecx",4));
-	&lea	("edi",&DWP(0,"esi","ecx",4));	# pointer to last chunk
-
-	&set_label("invert",4);			# invert order of chunks
-		&mov	("eax",&DWP(0,"esi"));
-		&mov	("ebx",&DWP(4,"esi"));
-		&mov	("ecx",&DWP(0,"edi"));
-		&mov	("edx",&DWP(4,"edi"));
-		&mov	(&DWP(0,"edi"),"eax");
-		&mov	(&DWP(4,"edi"),"ebx");
-		&mov	(&DWP(0,"esi"),"ecx");
-		&mov	(&DWP(4,"esi"),"edx");
-		&mov	("eax",&DWP(8,"esi"));
-		&mov	("ebx",&DWP(12,"esi"));
-		&mov	("ecx",&DWP(8,"edi"));
-		&mov	("edx",&DWP(12,"edi"));
-		&mov	(&DWP(8,"edi"),"eax");
-		&mov	(&DWP(12,"edi"),"ebx");
-		&mov	(&DWP(8,"esi"),"ecx");
-		&mov	(&DWP(12,"esi"),"edx");
-		&add	("esi",16);
-		&sub	("edi",16);
-		&cmp	("esi","edi");
-	&jne	(&label("invert"));
-
-	&mov	($key,&wparam(2));
-	&mov	($acc,&DWP(240,$key));		# pull number of rounds
-	&lea	($acc,&DWP(-2,$acc,$acc));
-	&lea	($acc,&DWP(0,$key,$acc,8));
-	&mov	(&wparam(2),$acc);
-
-	&mov	($s0,&DWP(16,$key));		# modulo-scheduled load
-	&set_label("permute",4);		# permute the key schedule
-		&add	($key,16);
-		&deckey	(0,$key,$s0,$s1,$s2,$s3);
-		&deckey	(1,$key,$s1,$s2,$s3,$s0);
-		&deckey	(2,$key,$s2,$s3,$s0,$s1);
-		&deckey	(3,$key,$s3,$s0,$s1,$s2);
-		&cmp	($key,&wparam(2));
-	&jb	(&label("permute"));
-
-	&xor	("eax","eax");			# return success
-&function_end("aes_nohw_set_decrypt_key");
-&asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");
-
-&asm_finish();
-
-close STDOUT or die "error closing STDOUT";
diff --git a/crypto/fipsmodule/aes/asm/aes-armv4.pl b/crypto/fipsmodule/aes/asm/aes-armv4.pl
deleted file mode 100644
index fbb1995..0000000
--- a/crypto/fipsmodule/aes/asm/aes-armv4.pl
+++ /dev/null
@@ -1,1252 +0,0 @@
-#! /usr/bin/env perl
-# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
-#
-# Licensed under the OpenSSL license (the "License").  You may not use
-# this file except in compliance with the License.  You can obtain a copy
-# in the file LICENSE in the source distribution or at
-# https://www.openssl.org/source/license.html
-
-
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# AES for ARMv4
-
-# January 2007.
-#
-# Code uses single 1K S-box and is >2 times faster than code generated
-# by gcc-3.4.1. This is thanks to unique feature of ARMv4 ISA, which
-# allows to merge logical or arithmetic operation with shift or rotate
-# in one instruction and emit combined result every cycle. The module
-# is endian-neutral. The performance is ~42 cycles/byte for 128-bit
-# key [on single-issue Xscale PXA250 core].
-
-# May 2007.
-#
-# AES_set_[en|de]crypt_key is added.
-
-# July 2010.
-#
-# Rescheduling for dual-issue pipeline resulted in 12% improvement on
-# Cortex A8 core and ~25 cycles per byte processed with 128-bit key.
-
-# February 2011.
-#
-# Profiler-assisted and platform-specific optimization resulted in 16%
-# improvement on Cortex A8 core and ~21.5 cycles per byte.
-
-$flavour = shift;
-if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
-else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
-
-if ($flavour && $flavour ne "void") {
-    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
-    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
-    die "can't locate arm-xlate.pl";
-
-    open OUT,"| \"$^X\" $xlate $flavour $output";
-    *STDOUT=*OUT;
-} else {
-    open OUT,">$output";
-    *STDOUT=*OUT;
-}
-
-$s0="r0";
-$s1="r1";
-$s2="r2";
-$s3="r3";
-$t1="r4";
-$t2="r5";
-$t3="r6";
-$i1="r7";
-$i2="r8";
-$i3="r9";
-
-$tbl="r10";
-$key="r11";
-$rounds="r12";
-
-$code=<<___;
-#ifndef __KERNEL__
-# include <openssl/arm_arch.h>
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-#endif
-
-@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
-@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 AES
-@ instructions are in aesv8-armx.pl.)
-.arch  armv7-a
-
-.text
-#if defined(__thumb2__) && !defined(__APPLE__)
-.syntax	unified
-.thumb
-#else
-.code	32
-#undef __thumb2__
-#endif
-
-.type	AES_Te,%object
-.align	5
-AES_Te:
-.word	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d
-.word	0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554
-.word	0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d
-.word	0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a
-.word	0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87
-.word	0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b
-.word	0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea
-.word	0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b
-.word	0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a
-.word	0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f
-.word	0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108
-.word	0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f
-.word	0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e
-.word	0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5
-.word	0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d
-.word	0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f
-.word	0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e
-.word	0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb
-.word	0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce
-.word	0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497
-.word	0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c
-.word	0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed
-.word	0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b
-.word	0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a
-.word	0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16
-.word	0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594
-.word	0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81
-.word	0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3
-.word	0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a
-.word	0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504
-.word	0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163
-.word	0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d
-.word	0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f
-.word	0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739
-.word	0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47
-.word	0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395
-.word	0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f
-.word	0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883
-.word	0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c
-.word	0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76
-.word	0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e
-.word	0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4
-.word	0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6
-.word	0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b
-.word	0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7
-.word	0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0
-.word	0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25
-.word	0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818
-.word	0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72
-.word	0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651
-.word	0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21
-.word	0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85
-.word	0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa
-.word	0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12
-.word	0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0
-.word	0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9
-.word	0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133
-.word	0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7
-.word	0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920
-.word	0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a
-.word	0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17
-.word	0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8
-.word	0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11
-.word	0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a
-@ Te4[256]
-.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
-.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
-.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
-.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
-.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
-.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
-.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
-.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
-.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
-.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
-.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
-.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
-.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
-.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
-.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
-.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
-.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
-.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
-.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
-.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
-.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
-.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
-.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
-.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
-.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
-.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
-.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
-.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
-.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
-.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
-.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
-.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
-@ rcon[]
-.word	0x01000000, 0x02000000, 0x04000000, 0x08000000
-.word	0x10000000, 0x20000000, 0x40000000, 0x80000000
-.word	0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
-.size	AES_Te,.-AES_Te
-
-@ void aes_nohw_encrypt(const unsigned char *in, unsigned char *out,
-@ 		                  const AES_KEY *key) {
-.global aes_nohw_encrypt
-.type   aes_nohw_encrypt,%function
-.align	5
-aes_nohw_encrypt:
-#ifndef	__thumb2__
-	sub	r3,pc,#8		@ aes_nohw_encrypt
-#else
-	adr	r3,.
-#endif
-	stmdb   sp!,{r1,r4-r12,lr}
-#if defined(__thumb2__) || defined(__APPLE__)
-	adr	$tbl,AES_Te
-#else
-	sub	$tbl,r3,#aes_nohw_encrypt-AES_Te	@ Te
-#endif
-	mov	$rounds,r0		@ inp
-	mov	$key,r2
-#if __ARM_ARCH__<7
-	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
-	ldrb	$t1,[$rounds,#2]	@ manner...
-	ldrb	$t2,[$rounds,#1]
-	ldrb	$t3,[$rounds,#0]
-	orr	$s0,$s0,$t1,lsl#8
-	ldrb	$s1,[$rounds,#7]
-	orr	$s0,$s0,$t2,lsl#16
-	ldrb	$t1,[$rounds,#6]
-	orr	$s0,$s0,$t3,lsl#24
-	ldrb	$t2,[$rounds,#5]
-	ldrb	$t3,[$rounds,#4]
-	orr	$s1,$s1,$t1,lsl#8
-	ldrb	$s2,[$rounds,#11]
-	orr	$s1,$s1,$t2,lsl#16
-	ldrb	$t1,[$rounds,#10]
-	orr	$s1,$s1,$t3,lsl#24
-	ldrb	$t2,[$rounds,#9]
-	ldrb	$t3,[$rounds,#8]
-	orr	$s2,$s2,$t1,lsl#8
-	ldrb	$s3,[$rounds,#15]
-	orr	$s2,$s2,$t2,lsl#16
-	ldrb	$t1,[$rounds,#14]
-	orr	$s2,$s2,$t3,lsl#24
-	ldrb	$t2,[$rounds,#13]
-	ldrb	$t3,[$rounds,#12]
-	orr	$s3,$s3,$t1,lsl#8
-	orr	$s3,$s3,$t2,lsl#16
-	orr	$s3,$s3,$t3,lsl#24
-#else
-	ldr	$s0,[$rounds,#0]
-	ldr	$s1,[$rounds,#4]
-	ldr	$s2,[$rounds,#8]
-	ldr	$s3,[$rounds,#12]
-#ifdef __ARMEL__
-	rev	$s0,$s0
-	rev	$s1,$s1
-	rev	$s2,$s2
-	rev	$s3,$s3
-#endif
-#endif
-	bl	_armv4_AES_encrypt
-
-	ldr	$rounds,[sp],#4		@ pop out
-#if __ARM_ARCH__>=7
-#ifdef __ARMEL__
-	rev	$s0,$s0
-	rev	$s1,$s1
-	rev	$s2,$s2
-	rev	$s3,$s3
-#endif
-	str	$s0,[$rounds,#0]
-	str	$s1,[$rounds,#4]
-	str	$s2,[$rounds,#8]
-	str	$s3,[$rounds,#12]
-#else
-	mov	$t1,$s0,lsr#24		@ write output in endian-neutral
-	mov	$t2,$s0,lsr#16		@ manner...
-	mov	$t3,$s0,lsr#8
-	strb	$t1,[$rounds,#0]
-	strb	$t2,[$rounds,#1]
-	mov	$t1,$s1,lsr#24
-	strb	$t3,[$rounds,#2]
-	mov	$t2,$s1,lsr#16
-	strb	$s0,[$rounds,#3]
-	mov	$t3,$s1,lsr#8
-	strb	$t1,[$rounds,#4]
-	strb	$t2,[$rounds,#5]
-	mov	$t1,$s2,lsr#24
-	strb	$t3,[$rounds,#6]
-	mov	$t2,$s2,lsr#16
-	strb	$s1,[$rounds,#7]
-	mov	$t3,$s2,lsr#8
-	strb	$t1,[$rounds,#8]
-	strb	$t2,[$rounds,#9]
-	mov	$t1,$s3,lsr#24
-	strb	$t3,[$rounds,#10]
-	mov	$t2,$s3,lsr#16
-	strb	$s2,[$rounds,#11]
-	mov	$t3,$s3,lsr#8
-	strb	$t1,[$rounds,#12]
-	strb	$t2,[$rounds,#13]
-	strb	$t3,[$rounds,#14]
-	strb	$s3,[$rounds,#15]
-#endif
-#if __ARM_ARCH__>=5
-	ldmia	sp!,{r4-r12,pc}
-#else
-	ldmia   sp!,{r4-r12,lr}
-	tst	lr,#1
-	moveq	pc,lr			@ be binary compatible with V4, yet
-	bx	lr			@ interoperable with Thumb ISA:-)
-#endif
-.size	aes_nohw_encrypt,.-aes_nohw_encrypt
-
-.type   _armv4_AES_encrypt,%function
-.align	2
-_armv4_AES_encrypt:
-	str	lr,[sp,#-4]!		@ push lr
-	ldmia	$key!,{$t1-$i1}
-	eor	$s0,$s0,$t1
-	ldr	$rounds,[$key,#240-16]
-	eor	$s1,$s1,$t2
-	eor	$s2,$s2,$t3
-	eor	$s3,$s3,$i1
-	sub	$rounds,$rounds,#1
-	mov	lr,#255
-
-	and	$i1,lr,$s0
-	and	$i2,lr,$s0,lsr#8
-	and	$i3,lr,$s0,lsr#16
-	mov	$s0,$s0,lsr#24
-.Lenc_loop:
-	ldr	$t1,[$tbl,$i1,lsl#2]	@ Te3[s0>>0]
-	and	$i1,lr,$s1,lsr#16	@ i0
-	ldr	$t2,[$tbl,$i2,lsl#2]	@ Te2[s0>>8]
-	and	$i2,lr,$s1
-	ldr	$t3,[$tbl,$i3,lsl#2]	@ Te1[s0>>16]
-	and	$i3,lr,$s1,lsr#8
-	ldr	$s0,[$tbl,$s0,lsl#2]	@ Te0[s0>>24]
-	mov	$s1,$s1,lsr#24
-
-	ldr	$i1,[$tbl,$i1,lsl#2]	@ Te1[s1>>16]
-	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te3[s1>>0]
-	ldr	$i3,[$tbl,$i3,lsl#2]	@ Te2[s1>>8]
-	eor	$s0,$s0,$i1,ror#8
-	ldr	$s1,[$tbl,$s1,lsl#2]	@ Te0[s1>>24]
-	and	$i1,lr,$s2,lsr#8	@ i0
-	eor	$t2,$t2,$i2,ror#8
-	and	$i2,lr,$s2,lsr#16	@ i1
-	eor	$t3,$t3,$i3,ror#8
-	and	$i3,lr,$s2
-	ldr	$i1,[$tbl,$i1,lsl#2]	@ Te2[s2>>8]
-	eor	$s1,$s1,$t1,ror#24
-	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te1[s2>>16]
-	mov	$s2,$s2,lsr#24
-
-	ldr	$i3,[$tbl,$i3,lsl#2]	@ Te3[s2>>0]
-	eor	$s0,$s0,$i1,ror#16
-	ldr	$s2,[$tbl,$s2,lsl#2]	@ Te0[s2>>24]
-	and	$i1,lr,$s3		@ i0
-	eor	$s1,$s1,$i2,ror#8
-	and	$i2,lr,$s3,lsr#8	@ i1
-	eor	$t3,$t3,$i3,ror#16
-	and	$i3,lr,$s3,lsr#16	@ i2
-	ldr	$i1,[$tbl,$i1,lsl#2]	@ Te3[s3>>0]
-	eor	$s2,$s2,$t2,ror#16
-	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te2[s3>>8]
-	mov	$s3,$s3,lsr#24
-
-	ldr	$i3,[$tbl,$i3,lsl#2]	@ Te1[s3>>16]
-	eor	$s0,$s0,$i1,ror#24
-	ldr	$i1,[$key],#16
-	eor	$s1,$s1,$i2,ror#16
-	ldr	$s3,[$tbl,$s3,lsl#2]	@ Te0[s3>>24]
-	eor	$s2,$s2,$i3,ror#8
-	ldr	$t1,[$key,#-12]
-	eor	$s3,$s3,$t3,ror#8
-
-	ldr	$t2,[$key,#-8]
-	eor	$s0,$s0,$i1
-	ldr	$t3,[$key,#-4]
-	and	$i1,lr,$s0
-	eor	$s1,$s1,$t1
-	and	$i2,lr,$s0,lsr#8
-	eor	$s2,$s2,$t2
-	and	$i3,lr,$s0,lsr#16
-	eor	$s3,$s3,$t3
-	mov	$s0,$s0,lsr#24
-
-	subs	$rounds,$rounds,#1
-	bne	.Lenc_loop
-
-	add	$tbl,$tbl,#2
-
-	ldrb	$t1,[$tbl,$i1,lsl#2]	@ Te4[s0>>0]
-	and	$i1,lr,$s1,lsr#16	@ i0
-	ldrb	$t2,[$tbl,$i2,lsl#2]	@ Te4[s0>>8]
-	and	$i2,lr,$s1
-	ldrb	$t3,[$tbl,$i3,lsl#2]	@ Te4[s0>>16]
-	and	$i3,lr,$s1,lsr#8
-	ldrb	$s0,[$tbl,$s0,lsl#2]	@ Te4[s0>>24]
-	mov	$s1,$s1,lsr#24
-
-	ldrb	$i1,[$tbl,$i1,lsl#2]	@ Te4[s1>>16]
-	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s1>>0]
-	ldrb	$i3,[$tbl,$i3,lsl#2]	@ Te4[s1>>8]
-	eor	$s0,$i1,$s0,lsl#8
-	ldrb	$s1,[$tbl,$s1,lsl#2]	@ Te4[s1>>24]
-	and	$i1,lr,$s2,lsr#8	@ i0
-	eor	$t2,$i2,$t2,lsl#8
-	and	$i2,lr,$s2,lsr#16	@ i1
-	eor	$t3,$i3,$t3,lsl#8
-	and	$i3,lr,$s2
-	ldrb	$i1,[$tbl,$i1,lsl#2]	@ Te4[s2>>8]
-	eor	$s1,$t1,$s1,lsl#24
-	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s2>>16]
-	mov	$s2,$s2,lsr#24
-
-	ldrb	$i3,[$tbl,$i3,lsl#2]	@ Te4[s2>>0]
-	eor	$s0,$i1,$s0,lsl#8
-	ldrb	$s2,[$tbl,$s2,lsl#2]	@ Te4[s2>>24]
-	and	$i1,lr,$s3		@ i0
-	eor	$s1,$s1,$i2,lsl#16
-	and	$i2,lr,$s3,lsr#8	@ i1
-	eor	$t3,$i3,$t3,lsl#8
-	and	$i3,lr,$s3,lsr#16	@ i2
-	ldrb	$i1,[$tbl,$i1,lsl#2]	@ Te4[s3>>0]
-	eor	$s2,$t2,$s2,lsl#24
-	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s3>>8]
-	mov	$s3,$s3,lsr#24
-
-	ldrb	$i3,[$tbl,$i3,lsl#2]	@ Te4[s3>>16]
-	eor	$s0,$i1,$s0,lsl#8
-	ldr	$i1,[$key,#0]
-	ldrb	$s3,[$tbl,$s3,lsl#2]	@ Te4[s3>>24]
-	eor	$s1,$s1,$i2,lsl#8
-	ldr	$t1,[$key,#4]
-	eor	$s2,$s2,$i3,lsl#16
-	ldr	$t2,[$key,#8]
-	eor	$s3,$t3,$s3,lsl#24
-	ldr	$t3,[$key,#12]
-
-	eor	$s0,$s0,$i1
-	eor	$s1,$s1,$t1
-	eor	$s2,$s2,$t2
-	eor	$s3,$s3,$t3
-
-	sub	$tbl,$tbl,#2
-	ldr	pc,[sp],#4		@ pop and return
-.size	_armv4_AES_encrypt,.-_armv4_AES_encrypt
-
-.global aes_nohw_set_encrypt_key
-.type   aes_nohw_set_encrypt_key,%function
-.align	5
-aes_nohw_set_encrypt_key:
-_armv4_AES_set_encrypt_key:
-#ifndef	__thumb2__
-	sub	r3,pc,#8		@ aes_nohw_set_encrypt_key
-#else
-	adr	r3,.
-#endif
-	teq	r0,#0
-#ifdef	__thumb2__
-	itt	eq			@ Thumb2 thing, sanity check in ARM
-#endif
-	moveq	r0,#-1
-	beq	.Labrt
-	teq	r2,#0
-#ifdef	__thumb2__
-	itt	eq			@ Thumb2 thing, sanity check in ARM
-#endif
-	moveq	r0,#-1
-	beq	.Labrt
-
-	teq	r1,#128
-	beq	.Lok
-	teq	r1,#192
-	beq	.Lok
-	teq	r1,#256
-#ifdef	__thumb2__
-	itt	ne			@ Thumb2 thing, sanity check in ARM
-#endif
-	movne	r0,#-1
-	bne	.Labrt
-
-.Lok:	stmdb   sp!,{r4-r12,lr}
-	mov	$rounds,r0		@ inp
-	mov	lr,r1			@ bits
-	mov	$key,r2			@ key
-
-#if defined(__thumb2__) || defined(__APPLE__)
-	adr	$tbl,AES_Te+1024				@ Te4
-#else
-	sub	$tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024	@ Te4
-#endif
-
-#if __ARM_ARCH__<7
-	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
-	ldrb	$t1,[$rounds,#2]	@ manner...
-	ldrb	$t2,[$rounds,#1]
-	ldrb	$t3,[$rounds,#0]
-	orr	$s0,$s0,$t1,lsl#8
-	ldrb	$s1,[$rounds,#7]
-	orr	$s0,$s0,$t2,lsl#16
-	ldrb	$t1,[$rounds,#6]
-	orr	$s0,$s0,$t3,lsl#24
-	ldrb	$t2,[$rounds,#5]
-	ldrb	$t3,[$rounds,#4]
-	orr	$s1,$s1,$t1,lsl#8
-	ldrb	$s2,[$rounds,#11]
-	orr	$s1,$s1,$t2,lsl#16
-	ldrb	$t1,[$rounds,#10]
-	orr	$s1,$s1,$t3,lsl#24
-	ldrb	$t2,[$rounds,#9]
-	ldrb	$t3,[$rounds,#8]
-	orr	$s2,$s2,$t1,lsl#8
-	ldrb	$s3,[$rounds,#15]
-	orr	$s2,$s2,$t2,lsl#16
-	ldrb	$t1,[$rounds,#14]
-	orr	$s2,$s2,$t3,lsl#24
-	ldrb	$t2,[$rounds,#13]
-	ldrb	$t3,[$rounds,#12]
-	orr	$s3,$s3,$t1,lsl#8
-	str	$s0,[$key],#16
-	orr	$s3,$s3,$t2,lsl#16
-	str	$s1,[$key,#-12]
-	orr	$s3,$s3,$t3,lsl#24
-	str	$s2,[$key,#-8]
-	str	$s3,[$key,#-4]
-#else
-	ldr	$s0,[$rounds,#0]
-	ldr	$s1,[$rounds,#4]
-	ldr	$s2,[$rounds,#8]
-	ldr	$s3,[$rounds,#12]
-#ifdef __ARMEL__
-	rev	$s0,$s0
-	rev	$s1,$s1
-	rev	$s2,$s2
-	rev	$s3,$s3
-#endif
-	str	$s0,[$key],#16
-	str	$s1,[$key,#-12]
-	str	$s2,[$key,#-8]
-	str	$s3,[$key,#-4]
-#endif
-
-	teq	lr,#128
-	bne	.Lnot128
-	mov	$rounds,#10
-	str	$rounds,[$key,#240-16]
-	add	$t3,$tbl,#256			@ rcon
-	mov	lr,#255
-
-.L128_loop:
-	and	$t2,lr,$s3,lsr#24
-	and	$i1,lr,$s3,lsr#16
-	ldrb	$t2,[$tbl,$t2]
-	and	$i2,lr,$s3,lsr#8
-	ldrb	$i1,[$tbl,$i1]
-	and	$i3,lr,$s3
-	ldrb	$i2,[$tbl,$i2]
-	orr	$t2,$t2,$i1,lsl#24
-	ldrb	$i3,[$tbl,$i3]
-	orr	$t2,$t2,$i2,lsl#16
-	ldr	$t1,[$t3],#4			@ rcon[i++]
-	orr	$t2,$t2,$i3,lsl#8
-	eor	$t2,$t2,$t1
-	eor	$s0,$s0,$t2			@ rk[4]=rk[0]^...
-	eor	$s1,$s1,$s0			@ rk[5]=rk[1]^rk[4]
-	str	$s0,[$key],#16
-	eor	$s2,$s2,$s1			@ rk[6]=rk[2]^rk[5]
-	str	$s1,[$key,#-12]
-	eor	$s3,$s3,$s2			@ rk[7]=rk[3]^rk[6]
-	str	$s2,[$key,#-8]
-	subs	$rounds,$rounds,#1
-	str	$s3,[$key,#-4]
-	bne	.L128_loop
-	sub	r2,$key,#176
-	b	.Ldone
-
-.Lnot128:
-#if __ARM_ARCH__<7
-	ldrb	$i2,[$rounds,#19]
-	ldrb	$t1,[$rounds,#18]
-	ldrb	$t2,[$rounds,#17]
-	ldrb	$t3,[$rounds,#16]
-	orr	$i2,$i2,$t1,lsl#8
-	ldrb	$i3,[$rounds,#23]
-	orr	$i2,$i2,$t2,lsl#16
-	ldrb	$t1,[$rounds,#22]
-	orr	$i2,$i2,$t3,lsl#24
-	ldrb	$t2,[$rounds,#21]
-	ldrb	$t3,[$rounds,#20]
-	orr	$i3,$i3,$t1,lsl#8
-	orr	$i3,$i3,$t2,lsl#16
-	str	$i2,[$key],#8
-	orr	$i3,$i3,$t3,lsl#24
-	str	$i3,[$key,#-4]
-#else
-	ldr	$i2,[$rounds,#16]
-	ldr	$i3,[$rounds,#20]
-#ifdef __ARMEL__
-	rev	$i2,$i2
-	rev	$i3,$i3
-#endif
-	str	$i2,[$key],#8
-	str	$i3,[$key,#-4]
-#endif
-
-	teq	lr,#192
-	bne	.Lnot192
-	mov	$rounds,#12
-	str	$rounds,[$key,#240-24]
-	add	$t3,$tbl,#256			@ rcon
-	mov	lr,#255
-	mov	$rounds,#8
-
-.L192_loop:
-	and	$t2,lr,$i3,lsr#24
-	and	$i1,lr,$i3,lsr#16
-	ldrb	$t2,[$tbl,$t2]
-	and	$i2,lr,$i3,lsr#8
-	ldrb	$i1,[$tbl,$i1]
-	and	$i3,lr,$i3
-	ldrb	$i2,[$tbl,$i2]
-	orr	$t2,$t2,$i1,lsl#24
-	ldrb	$i3,[$tbl,$i3]
-	orr	$t2,$t2,$i2,lsl#16
-	ldr	$t1,[$t3],#4			@ rcon[i++]
-	orr	$t2,$t2,$i3,lsl#8
-	eor	$i3,$t2,$t1
-	eor	$s0,$s0,$i3			@ rk[6]=rk[0]^...
-	eor	$s1,$s1,$s0			@ rk[7]=rk[1]^rk[6]
-	str	$s0,[$key],#24
-	eor	$s2,$s2,$s1			@ rk[8]=rk[2]^rk[7]
-	str	$s1,[$key,#-20]
-	eor	$s3,$s3,$s2			@ rk[9]=rk[3]^rk[8]
-	str	$s2,[$key,#-16]
-	subs	$rounds,$rounds,#1
-	str	$s3,[$key,#-12]
-#ifdef	__thumb2__
-	itt	eq				@ Thumb2 thing, sanity check in ARM
-#endif
-	subeq	r2,$key,#216
-	beq	.Ldone
-
-	ldr	$i1,[$key,#-32]
-	ldr	$i2,[$key,#-28]
-	eor	$i1,$i1,$s3			@ rk[10]=rk[4]^rk[9]
-	eor	$i3,$i2,$i1			@ rk[11]=rk[5]^rk[10]
-	str	$i1,[$key,#-8]
-	str	$i3,[$key,#-4]
-	b	.L192_loop
-
-.Lnot192:
-#if __ARM_ARCH__<7
-	ldrb	$i2,[$rounds,#27]
-	ldrb	$t1,[$rounds,#26]
-	ldrb	$t2,[$rounds,#25]
-	ldrb	$t3,[$rounds,#24]
-	orr	$i2,$i2,$t1,lsl#8
-	ldrb	$i3,[$rounds,#31]
-	orr	$i2,$i2,$t2,lsl#16
-	ldrb	$t1,[$rounds,#30]
-	orr	$i2,$i2,$t3,lsl#24
-	ldrb	$t2,[$rounds,#29]
-	ldrb	$t3,[$rounds,#28]
-	orr	$i3,$i3,$t1,lsl#8
-	orr	$i3,$i3,$t2,lsl#16
-	str	$i2,[$key],#8
-	orr	$i3,$i3,$t3,lsl#24
-	str	$i3,[$key,#-4]
-#else
-	ldr	$i2,[$rounds,#24]
-	ldr	$i3,[$rounds,#28]
-#ifdef __ARMEL__
-	rev	$i2,$i2
-	rev	$i3,$i3
-#endif
-	str	$i2,[$key],#8
-	str	$i3,[$key,#-4]
-#endif
-
-	mov	$rounds,#14
-	str	$rounds,[$key,#240-32]
-	add	$t3,$tbl,#256			@ rcon
-	mov	lr,#255
-	mov	$rounds,#7
-
-.L256_loop:
-	and	$t2,lr,$i3,lsr#24
-	and	$i1,lr,$i3,lsr#16
-	ldrb	$t2,[$tbl,$t2]
-	and	$i2,lr,$i3,lsr#8
-	ldrb	$i1,[$tbl,$i1]
-	and	$i3,lr,$i3
-	ldrb	$i2,[$tbl,$i2]
-	orr	$t2,$t2,$i1,lsl#24
-	ldrb	$i3,[$tbl,$i3]
-	orr	$t2,$t2,$i2,lsl#16
-	ldr	$t1,[$t3],#4			@ rcon[i++]
-	orr	$t2,$t2,$i3,lsl#8
-	eor	$i3,$t2,$t1
-	eor	$s0,$s0,$i3			@ rk[8]=rk[0]^...
-	eor	$s1,$s1,$s0			@ rk[9]=rk[1]^rk[8]
-	str	$s0,[$key],#32
-	eor	$s2,$s2,$s1			@ rk[10]=rk[2]^rk[9]
-	str	$s1,[$key,#-28]
-	eor	$s3,$s3,$s2			@ rk[11]=rk[3]^rk[10]
-	str	$s2,[$key,#-24]
-	subs	$rounds,$rounds,#1
-	str	$s3,[$key,#-20]
-#ifdef	__thumb2__
-	itt	eq				@ Thumb2 thing, sanity check in ARM
-#endif
-	subeq	r2,$key,#256
-	beq	.Ldone
-
-	and	$t2,lr,$s3
-	and	$i1,lr,$s3,lsr#8
-	ldrb	$t2,[$tbl,$t2]
-	and	$i2,lr,$s3,lsr#16
-	ldrb	$i1,[$tbl,$i1]
-	and	$i3,lr,$s3,lsr#24
-	ldrb	$i2,[$tbl,$i2]
-	orr	$t2,$t2,$i1,lsl#8
-	ldrb	$i3,[$tbl,$i3]
-	orr	$t2,$t2,$i2,lsl#16
-	ldr	$t1,[$key,#-48]
-	orr	$t2,$t2,$i3,lsl#24
-
-	ldr	$i1,[$key,#-44]
-	ldr	$i2,[$key,#-40]
-	eor	$t1,$t1,$t2			@ rk[12]=rk[4]^...
-	ldr	$i3,[$key,#-36]
-	eor	$i1,$i1,$t1			@ rk[13]=rk[5]^rk[12]
-	str	$t1,[$key,#-16]
-	eor	$i2,$i2,$i1			@ rk[14]=rk[6]^rk[13]
-	str	$i1,[$key,#-12]
-	eor	$i3,$i3,$i2			@ rk[15]=rk[7]^rk[14]
-	str	$i2,[$key,#-8]
-	str	$i3,[$key,#-4]
-	b	.L256_loop
-
-.align	2
-.Ldone:	mov	r0,#0
-	ldmia   sp!,{r4-r12,lr}
-.Labrt:
-#if __ARM_ARCH__>=5
-	ret				@ bx lr
-#else
-	tst	lr,#1
-	moveq	pc,lr			@ be binary compatible with V4, yet
-	bx	lr			@ interoperable with Thumb ISA:-)
-#endif
-.size	aes_nohw_set_encrypt_key,.-aes_nohw_set_encrypt_key
-
-.global aes_nohw_set_decrypt_key
-.type   aes_nohw_set_decrypt_key,%function
-.align	5
-aes_nohw_set_decrypt_key:
-	str	lr,[sp,#-4]!            @ push lr
-	bl	_armv4_AES_set_encrypt_key
-	teq	r0,#0
-	ldr	lr,[sp],#4              @ pop lr
-	bne	.Labrt
-
-	mov	r0,r2			@ aes_nohw_set_encrypt_key preserves r2,
-	mov	r1,r2			@ which is AES_KEY *key
-	b	_armv4_AES_set_enc2dec_key
-.size	aes_nohw_set_decrypt_key,.-aes_nohw_set_decrypt_key
-
-@ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out)
-.global	AES_set_enc2dec_key
-.type	AES_set_enc2dec_key,%function
-.align	5
-AES_set_enc2dec_key:
-_armv4_AES_set_enc2dec_key:
-	stmdb   sp!,{r4-r12,lr}
-
-	ldr	$rounds,[r0,#240]
-	mov	$i1,r0			@ input
-	add	$i2,r0,$rounds,lsl#4
-	mov	$key,r1			@ output
-	add	$tbl,r1,$rounds,lsl#4
-	str	$rounds,[r1,#240]
-
-.Linv:	ldr	$s0,[$i1],#16
-	ldr	$s1,[$i1,#-12]
-	ldr	$s2,[$i1,#-8]
-	ldr	$s3,[$i1,#-4]
-	ldr	$t1,[$i2],#-16
-	ldr	$t2,[$i2,#16+4]
-	ldr	$t3,[$i2,#16+8]
-	ldr	$i3,[$i2,#16+12]
-	str	$s0,[$tbl],#-16
-	str	$s1,[$tbl,#16+4]
-	str	$s2,[$tbl,#16+8]
-	str	$s3,[$tbl,#16+12]
-	str	$t1,[$key],#16
-	str	$t2,[$key,#-12]
-	str	$t3,[$key,#-8]
-	str	$i3,[$key,#-4]
-	teq	$i1,$i2
-	bne	.Linv
-
-	ldr	$s0,[$i1]
-	ldr	$s1,[$i1,#4]
-	ldr	$s2,[$i1,#8]
-	ldr	$s3,[$i1,#12]
-	str	$s0,[$key]
-	str	$s1,[$key,#4]
-	str	$s2,[$key,#8]
-	str	$s3,[$key,#12]
-	sub	$key,$key,$rounds,lsl#3
-___
-$mask80=$i1;
-$mask1b=$i2;
-$mask7f=$i3;
-$code.=<<___;
-	ldr	$s0,[$key,#16]!		@ prefetch tp1
-	mov	$mask80,#0x80
-	mov	$mask1b,#0x1b
-	orr	$mask80,$mask80,#0x8000
-	orr	$mask1b,$mask1b,#0x1b00
-	orr	$mask80,$mask80,$mask80,lsl#16
-	orr	$mask1b,$mask1b,$mask1b,lsl#16
-	sub	$rounds,$rounds,#1
-	mvn	$mask7f,$mask80
-	mov	$rounds,$rounds,lsl#2	@ (rounds-1)*4
-
-.Lmix:	and	$t1,$s0,$mask80
-	and	$s1,$s0,$mask7f
-	sub	$t1,$t1,$t1,lsr#7
-	and	$t1,$t1,$mask1b
-	eor	$s1,$t1,$s1,lsl#1	@ tp2
-
-	and	$t1,$s1,$mask80
-	and	$s2,$s1,$mask7f
-	sub	$t1,$t1,$t1,lsr#7
-	and	$t1,$t1,$mask1b
-	eor	$s2,$t1,$s2,lsl#1	@ tp4
-
-	and	$t1,$s2,$mask80
-	and	$s3,$s2,$mask7f
-	sub	$t1,$t1,$t1,lsr#7
-	and	$t1,$t1,$mask1b
-	eor	$s3,$t1,$s3,lsl#1	@ tp8
-
-	eor	$t1,$s1,$s2
-	eor	$t2,$s0,$s3		@ tp9
-	eor	$t1,$t1,$s3		@ tpe
-	eor	$t1,$t1,$s1,ror#24
-	eor	$t1,$t1,$t2,ror#24	@ ^= ROTATE(tpb=tp9^tp2,8)
-	eor	$t1,$t1,$s2,ror#16
-	eor	$t1,$t1,$t2,ror#16	@ ^= ROTATE(tpd=tp9^tp4,16)
-	eor	$t1,$t1,$t2,ror#8	@ ^= ROTATE(tp9,24)
-
-	ldr	$s0,[$key,#4]		@ prefetch tp1
-	str	$t1,[$key],#4
-	subs	$rounds,$rounds,#1
-	bne	.Lmix
-
-	mov	r0,#0
-#if __ARM_ARCH__>=5
-	ldmia	sp!,{r4-r12,pc}
-#else
-	ldmia   sp!,{r4-r12,lr}
-	tst	lr,#1
-	moveq	pc,lr			@ be binary compatible with V4, yet
-	bx	lr			@ interoperable with Thumb ISA:-)
-#endif
-.size	AES_set_enc2dec_key,.-AES_set_enc2dec_key
-
-.type	AES_Td,%object
-.align	5
-AES_Td:
-.word	0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96
-.word	0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393
-.word	0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25
-.word	0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f
-.word	0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1
-.word	0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6
-.word	0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da
-.word	0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844
-.word	0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd
-.word	0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4
-.word	0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45
-.word	0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94
-.word	0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7
-.word	0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a
-.word	0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5
-.word	0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c
-.word	0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1
-.word	0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a
-.word	0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75
-.word	0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051
-.word	0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46
-.word	0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff
-.word	0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77
-.word	0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb
-.word	0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000
-.word	0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e
-.word	0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927
-.word	0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a
-.word	0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e
-.word	0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16
-.word	0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d
-.word	0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8
-.word	0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd
-.word	0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34
-.word	0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163
-.word	0xd731dcca, 0x42638510, 0x13972240, 0x84c61120
-.word	0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d
-.word	0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0
-.word	0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422
-.word	0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef
-.word	0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36
-.word	0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4
-.word	0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662
-.word	0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5
-.word	0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3
-.word	0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b
-.word	0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8
-.word	0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6
-.word	0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6
-.word	0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0
-.word	0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815
-.word	0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f
-.word	0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df
-.word	0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f
-.word	0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e
-.word	0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713
-.word	0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89
-.word	0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c
-.word	0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf
-.word	0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86
-.word	0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f
-.word	0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541
-.word	0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190
-.word	0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742
-@ Td4[256]
-.byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
-.byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
-.byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
-.byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
-.byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
-.byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
-.byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
-.byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
-.byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
-.byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
-.byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
-.byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
-.byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
-.byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
-.byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
-.byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
-.byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
-.byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
-.byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
-.byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
-.byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
-.byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
-.byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
-.byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
-.byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
-.byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
-.byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
-.byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
-.byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
-.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
-.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
-.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
-.size	AES_Td,.-AES_Td
-
-@ void aes_nohw_decrypt(const unsigned char *in, unsigned char *out,
-@ 		                  const AES_KEY *key) {
-.global aes_nohw_decrypt
-.type   aes_nohw_decrypt,%function
-.align	5
-aes_nohw_decrypt:
-#ifndef	__thumb2__
-	sub	r3,pc,#8		@ aes_nohw_decrypt
-#else
-	adr	r3,.
-#endif
-	stmdb   sp!,{r1,r4-r12,lr}
-#if defined(__thumb2__) || defined(__APPLE__)
-	adr	$tbl,AES_Td
-#else
-	sub	$tbl,r3,#aes_nohw_decrypt-AES_Td	@ Td
-#endif
-	mov	$rounds,r0		@ inp
-	mov	$key,r2
-#if __ARM_ARCH__<7
-	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
-	ldrb	$t1,[$rounds,#2]	@ manner...
-	ldrb	$t2,[$rounds,#1]
-	ldrb	$t3,[$rounds,#0]
-	orr	$s0,$s0,$t1,lsl#8
-	ldrb	$s1,[$rounds,#7]
-	orr	$s0,$s0,$t2,lsl#16
-	ldrb	$t1,[$rounds,#6]
-	orr	$s0,$s0,$t3,lsl#24
-	ldrb	$t2,[$rounds,#5]
-	ldrb	$t3,[$rounds,#4]
-	orr	$s1,$s1,$t1,lsl#8
-	ldrb	$s2,[$rounds,#11]
-	orr	$s1,$s1,$t2,lsl#16
-	ldrb	$t1,[$rounds,#10]
-	orr	$s1,$s1,$t3,lsl#24
-	ldrb	$t2,[$rounds,#9]
-	ldrb	$t3,[$rounds,#8]
-	orr	$s2,$s2,$t1,lsl#8
-	ldrb	$s3,[$rounds,#15]
-	orr	$s2,$s2,$t2,lsl#16
-	ldrb	$t1,[$rounds,#14]
-	orr	$s2,$s2,$t3,lsl#24
-	ldrb	$t2,[$rounds,#13]
-	ldrb	$t3,[$rounds,#12]
-	orr	$s3,$s3,$t1,lsl#8
-	orr	$s3,$s3,$t2,lsl#16
-	orr	$s3,$s3,$t3,lsl#24
-#else
-	ldr	$s0,[$rounds,#0]
-	ldr	$s1,[$rounds,#4]
-	ldr	$s2,[$rounds,#8]
-	ldr	$s3,[$rounds,#12]
-#ifdef __ARMEL__
-	rev	$s0,$s0
-	rev	$s1,$s1
-	rev	$s2,$s2
-	rev	$s3,$s3
-#endif
-#endif
-	bl	_armv4_AES_decrypt
-
-	ldr	$rounds,[sp],#4		@ pop out
-#if __ARM_ARCH__>=7
-#ifdef __ARMEL__
-	rev	$s0,$s0
-	rev	$s1,$s1
-	rev	$s2,$s2
-	rev	$s3,$s3
-#endif
-	str	$s0,[$rounds,#0]
-	str	$s1,[$rounds,#4]
-	str	$s2,[$rounds,#8]
-	str	$s3,[$rounds,#12]
-#else
-	mov	$t1,$s0,lsr#24		@ write output in endian-neutral
-	mov	$t2,$s0,lsr#16		@ manner...
-	mov	$t3,$s0,lsr#8
-	strb	$t1,[$rounds,#0]
-	strb	$t2,[$rounds,#1]
-	mov	$t1,$s1,lsr#24
-	strb	$t3,[$rounds,#2]
-	mov	$t2,$s1,lsr#16
-	strb	$s0,[$rounds,#3]
-	mov	$t3,$s1,lsr#8
-	strb	$t1,[$rounds,#4]
-	strb	$t2,[$rounds,#5]
-	mov	$t1,$s2,lsr#24
-	strb	$t3,[$rounds,#6]
-	mov	$t2,$s2,lsr#16
-	strb	$s1,[$rounds,#7]
-	mov	$t3,$s2,lsr#8
-	strb	$t1,[$rounds,#8]
-	strb	$t2,[$rounds,#9]
-	mov	$t1,$s3,lsr#24
-	strb	$t3,[$rounds,#10]
-	mov	$t2,$s3,lsr#16
-	strb	$s2,[$rounds,#11]
-	mov	$t3,$s3,lsr#8
-	strb	$t1,[$rounds,#12]
-	strb	$t2,[$rounds,#13]
-	strb	$t3,[$rounds,#14]
-	strb	$s3,[$rounds,#15]
-#endif
-#if __ARM_ARCH__>=5
-	ldmia	sp!,{r4-r12,pc}
-#else
-	ldmia   sp!,{r4-r12,lr}
-	tst	lr,#1
-	moveq	pc,lr			@ be binary compatible with V4, yet
-	bx	lr			@ interoperable with Thumb ISA:-)
-#endif
-.size	aes_nohw_decrypt,.-aes_nohw_decrypt
-
-.type   _armv4_AES_decrypt,%function
-.align	2
-_armv4_AES_decrypt:
-	str	lr,[sp,#-4]!		@ push lr
-	ldmia	$key!,{$t1-$i1}
-	eor	$s0,$s0,$t1
-	ldr	$rounds,[$key,#240-16]
-	eor	$s1,$s1,$t2
-	eor	$s2,$s2,$t3
-	eor	$s3,$s3,$i1
-	sub	$rounds,$rounds,#1
-	mov	lr,#255
-
-	and	$i1,lr,$s0,lsr#16
-	and	$i2,lr,$s0,lsr#8
-	and	$i3,lr,$s0
-	mov	$s0,$s0,lsr#24
-.Ldec_loop:
-	ldr	$t1,[$tbl,$i1,lsl#2]	@ Td1[s0>>16]
-	and	$i1,lr,$s1		@ i0
-	ldr	$t2,[$tbl,$i2,lsl#2]	@ Td2[s0>>8]
-	and	$i2,lr,$s1,lsr#16
-	ldr	$t3,[$tbl,$i3,lsl#2]	@ Td3[s0>>0]
-	and	$i3,lr,$s1,lsr#8
-	ldr	$s0,[$tbl,$s0,lsl#2]	@ Td0[s0>>24]
-	mov	$s1,$s1,lsr#24
-
-	ldr	$i1,[$tbl,$i1,lsl#2]	@ Td3[s1>>0]
-	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td1[s1>>16]
-	ldr	$i3,[$tbl,$i3,lsl#2]	@ Td2[s1>>8]
-	eor	$s0,$s0,$i1,ror#24
-	ldr	$s1,[$tbl,$s1,lsl#2]	@ Td0[s1>>24]
-	and	$i1,lr,$s2,lsr#8	@ i0
-	eor	$t2,$i2,$t2,ror#8
-	and	$i2,lr,$s2		@ i1
-	eor	$t3,$i3,$t3,ror#8
-	and	$i3,lr,$s2,lsr#16
-	ldr	$i1,[$tbl,$i1,lsl#2]	@ Td2[s2>>8]
-	eor	$s1,$s1,$t1,ror#8
-	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td3[s2>>0]
-	mov	$s2,$s2,lsr#24
-
-	ldr	$i3,[$tbl,$i3,lsl#2]	@ Td1[s2>>16]
-	eor	$s0,$s0,$i1,ror#16
-	ldr	$s2,[$tbl,$s2,lsl#2]	@ Td0[s2>>24]
-	and	$i1,lr,$s3,lsr#16	@ i0
-	eor	$s1,$s1,$i2,ror#24
-	and	$i2,lr,$s3,lsr#8	@ i1
-	eor	$t3,$i3,$t3,ror#8
-	and	$i3,lr,$s3		@ i2
-	ldr	$i1,[$tbl,$i1,lsl#2]	@ Td1[s3>>16]
-	eor	$s2,$s2,$t2,ror#8
-	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td2[s3>>8]
-	mov	$s3,$s3,lsr#24
-
-	ldr	$i3,[$tbl,$i3,lsl#2]	@ Td3[s3>>0]
-	eor	$s0,$s0,$i1,ror#8
-	ldr	$i1,[$key],#16
-	eor	$s1,$s1,$i2,ror#16
-	ldr	$s3,[$tbl,$s3,lsl#2]	@ Td0[s3>>24]
-	eor	$s2,$s2,$i3,ror#24
-
-	ldr	$t1,[$key,#-12]
-	eor	$s0,$s0,$i1
-	ldr	$t2,[$key,#-8]
-	eor	$s3,$s3,$t3,ror#8
-	ldr	$t3,[$key,#-4]
-	and	$i1,lr,$s0,lsr#16
-	eor	$s1,$s1,$t1
-	and	$i2,lr,$s0,lsr#8
-	eor	$s2,$s2,$t2
-	and	$i3,lr,$s0
-	eor	$s3,$s3,$t3
-	mov	$s0,$s0,lsr#24
-
-	subs	$rounds,$rounds,#1
-	bne	.Ldec_loop
-
-	add	$tbl,$tbl,#1024
-
-	ldr	$t2,[$tbl,#0]		@ prefetch Td4
-	ldr	$t3,[$tbl,#32]
-	ldr	$t1,[$tbl,#64]
-	ldr	$t2,[$tbl,#96]
-	ldr	$t3,[$tbl,#128]
-	ldr	$t1,[$tbl,#160]
-	ldr	$t2,[$tbl,#192]
-	ldr	$t3,[$tbl,#224]
-
-	ldrb	$s0,[$tbl,$s0]		@ Td4[s0>>24]
-	ldrb	$t1,[$tbl,$i1]		@ Td4[s0>>16]
-	and	$i1,lr,$s1		@ i0
-	ldrb	$t2,[$tbl,$i2]		@ Td4[s0>>8]
-	and	$i2,lr,$s1,lsr#16
-	ldrb	$t3,[$tbl,$i3]		@ Td4[s0>>0]
-	and	$i3,lr,$s1,lsr#8
-
-	add	$s1,$tbl,$s1,lsr#24
-	ldrb	$i1,[$tbl,$i1]		@ Td4[s1>>0]
-	ldrb	$s1,[$s1]		@ Td4[s1>>24]
-	ldrb	$i2,[$tbl,$i2]		@ Td4[s1>>16]
-	eor	$s0,$i1,$s0,lsl#24
-	ldrb	$i3,[$tbl,$i3]		@ Td4[s1>>8]
-	eor	$s1,$t1,$s1,lsl#8
-	and	$i1,lr,$s2,lsr#8	@ i0
-	eor	$t2,$t2,$i2,lsl#8
-	and	$i2,lr,$s2		@ i1
-	ldrb	$i1,[$tbl,$i1]		@ Td4[s2>>8]
-	eor	$t3,$t3,$i3,lsl#8
-	ldrb	$i2,[$tbl,$i2]		@ Td4[s2>>0]
-	and	$i3,lr,$s2,lsr#16
-
-	add	$s2,$tbl,$s2,lsr#24
-	ldrb	$s2,[$s2]		@ Td4[s2>>24]
-	eor	$s0,$s0,$i1,lsl#8
-	ldrb	$i3,[$tbl,$i3]		@ Td4[s2>>16]
-	eor	$s1,$i2,$s1,lsl#16
-	and	$i1,lr,$s3,lsr#16	@ i0
-	eor	$s2,$t2,$s2,lsl#16
-	and	$i2,lr,$s3,lsr#8	@ i1
-	ldrb	$i1,[$tbl,$i1]		@ Td4[s3>>16]
-	eor	$t3,$t3,$i3,lsl#16
-	ldrb	$i2,[$tbl,$i2]		@ Td4[s3>>8]
-	and	$i3,lr,$s3		@ i2
-
-	add	$s3,$tbl,$s3,lsr#24
-	ldrb	$i3,[$tbl,$i3]		@ Td4[s3>>0]
-	ldrb	$s3,[$s3]		@ Td4[s3>>24]
-	eor	$s0,$s0,$i1,lsl#16
-	ldr	$i1,[$key,#0]
-	eor	$s1,$s1,$i2,lsl#8
-	ldr	$t1,[$key,#4]
-	eor	$s2,$i3,$s2,lsl#8
-	ldr	$t2,[$key,#8]
-	eor	$s3,$t3,$s3,lsl#24
-	ldr	$t3,[$key,#12]
-
-	eor	$s0,$s0,$i1
-	eor	$s1,$s1,$t1
-	eor	$s2,$s2,$t2
-	eor	$s3,$s3,$t3
-
-	sub	$tbl,$tbl,#1024
-	ldr	pc,[sp],#4		@ pop and return
-.size	_armv4_AES_decrypt,.-_armv4_AES_decrypt
-.asciz	"AES for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
-.align	2
-___
-
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
-$code =~ s/\bret\b/bx\tlr/gm;
-
-open SELF,$0;
-while(<SELF>) {
-	next if (/^#!/);
-	last if (!s/^#/@/ and !/^$/);
-	print;
-}
-close SELF;
-
-print $code;
-close STDOUT or die "error closing STDOUT";	# enforce flush
diff --git a/crypto/fipsmodule/aes/asm/aes-x86_64.pl b/crypto/fipsmodule/aes/asm/aes-x86_64.pl
deleted file mode 100755
index 5b95785..0000000
--- a/crypto/fipsmodule/aes/asm/aes-x86_64.pl
+++ /dev/null
@@ -1,2909 +0,0 @@
-#! /usr/bin/env perl
-# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
-#
-# Licensed under the OpenSSL license (the "License").  You may not use
-# this file except in compliance with the License.  You can obtain a copy
-# in the file LICENSE in the source distribution or at
-# https://www.openssl.org/source/license.html
-
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# Version 2.1.
-#
-# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
-# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
-# [you'll notice a lot of resemblance], such as compressed S-boxes
-# in little-endian byte order, prefetch of these tables in CBC mode,
-# as well as avoiding L1 cache aliasing between stack frame and key
-# schedule and already mentioned tables, compressed Td4...
-#
-# Performance in number of cycles per processed byte for 128-bit key:
-#
-#		ECB encrypt	ECB decrypt	CBC large chunk
-# AMD64		33		43		13.0
-# EM64T		38		56		18.6(*)
-# Core 2	30		42		14.5(*)
-# Atom		65		86		32.1(*)
-#
-# (*) with hyper-threading off
-
-$flavour = shift;
-$output  = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-
-$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
-( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
-die "can't locate x86_64-xlate.pl";
-
-open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
-*STDOUT=*OUT;
-
-$verticalspin=1;	# unlike 32-bit version $verticalspin performs
-			# ~15% better on both AMD and Intel cores
-$speed_limit=512;	# see aes-586.pl for details
-
-$code=".text\n";
-
-$s0="%eax";
-$s1="%ebx";
-$s2="%ecx";
-$s3="%edx";
-$acc0="%esi";	$mask80="%rsi";
-$acc1="%edi";	$maskfe="%rdi";
-$acc2="%ebp";	$mask1b="%rbp";
-$inp="%r8";
-$out="%r9";
-$t0="%r10d";
-$t1="%r11d";
-$t2="%r12d";
-$rnds="%r13d";
-$sbox="%r14";
-$key="%r15";
-
-sub hi() { my $r=shift;	$r =~ s/%[er]([a-d])x/%\1h/;	$r; }
-sub lo() { my $r=shift;	$r =~ s/%[er]([a-d])x/%\1l/;
-			$r =~ s/%[er]([sd]i)/%\1l/;
-			$r =~ s/%(r[0-9]+)[d]?/%\1b/;	$r; }
-sub LO() { my $r=shift; $r =~ s/%r([a-z]+)/%e\1/;
-			$r =~ s/%r([0-9]+)/%r\1d/;	$r; }
-sub _data_word()
-{ my $i;
-    while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
-}
-sub data_word()
-{ my $i;
-  my $last=pop(@_);
-    $code.=".long\t";
-    while(defined($i=shift)) { $code.=sprintf"0x%08x,",$i; }
-    $code.=sprintf"0x%08x\n",$last;
-}
-
-sub data_byte()
-{ my $i;
-  my $last=pop(@_);
-    $code.=".byte\t";
-    while(defined($i=shift)) { $code.=sprintf"0x%02x,",$i&0xff; }
-    $code.=sprintf"0x%02x\n",$last&0xff;
-}
-
-sub encvert()
-{ my $t3="%r8d";	# zaps $inp!
-
-$code.=<<___;
-	# favor 3-way issue Opteron pipeline...
-	movzb	`&lo("$s0")`,$acc0
-	movzb	`&lo("$s1")`,$acc1
-	movzb	`&lo("$s2")`,$acc2
-	mov	0($sbox,$acc0,8),$t0
-	mov	0($sbox,$acc1,8),$t1
-	mov	0($sbox,$acc2,8),$t2
-
-	movzb	`&hi("$s1")`,$acc0
-	movzb	`&hi("$s2")`,$acc1
-	movzb	`&lo("$s3")`,$acc2
-	xor	3($sbox,$acc0,8),$t0
-	xor	3($sbox,$acc1,8),$t1
-	mov	0($sbox,$acc2,8),$t3
-
-	movzb	`&hi("$s3")`,$acc0
-	shr	\$16,$s2
-	movzb	`&hi("$s0")`,$acc2
-	xor	3($sbox,$acc0,8),$t2
-	shr	\$16,$s3
-	xor	3($sbox,$acc2,8),$t3
-
-	shr	\$16,$s1
-	lea	16($key),$key
-	shr	\$16,$s0
-
-	movzb	`&lo("$s2")`,$acc0
-	movzb	`&lo("$s3")`,$acc1
-	movzb	`&lo("$s0")`,$acc2
-	xor	2($sbox,$acc0,8),$t0
-	xor	2($sbox,$acc1,8),$t1
-	xor	2($sbox,$acc2,8),$t2
-
-	movzb	`&hi("$s3")`,$acc0
-	movzb	`&hi("$s0")`,$acc1
-	movzb	`&lo("$s1")`,$acc2
-	xor	1($sbox,$acc0,8),$t0
-	xor	1($sbox,$acc1,8),$t1
-	xor	2($sbox,$acc2,8),$t3
-
-	mov	12($key),$s3
-	movzb	`&hi("$s1")`,$acc1
-	movzb	`&hi("$s2")`,$acc2
-	mov	0($key),$s0
-	xor	1($sbox,$acc1,8),$t2
-	xor	1($sbox,$acc2,8),$t3
-
-	mov	4($key),$s1
-	mov	8($key),$s2
-	xor	$t0,$s0
-	xor	$t1,$s1
-	xor	$t2,$s2
-	xor	$t3,$s3
-___
-}
-
-sub enclastvert()
-{ my $t3="%r8d";	# zaps $inp!
-
-$code.=<<___;
-	movzb	`&lo("$s0")`,$acc0
-	movzb	`&lo("$s1")`,$acc1
-	movzb	`&lo("$s2")`,$acc2
-	movzb	2($sbox,$acc0,8),$t0
-	movzb	2($sbox,$acc1,8),$t1
-	movzb	2($sbox,$acc2,8),$t2
-
-	movzb	`&lo("$s3")`,$acc0
-	movzb	`&hi("$s1")`,$acc1
-	movzb	`&hi("$s2")`,$acc2
-	movzb	2($sbox,$acc0,8),$t3
-	mov	0($sbox,$acc1,8),$acc1	#$t0
-	mov	0($sbox,$acc2,8),$acc2	#$t1
-
-	and	\$0x0000ff00,$acc1
-	and	\$0x0000ff00,$acc2
-
-	xor	$acc1,$t0
-	xor	$acc2,$t1
-	shr	\$16,$s2
-
-	movzb	`&hi("$s3")`,$acc0
-	movzb	`&hi("$s0")`,$acc1
-	shr	\$16,$s3
-	mov	0($sbox,$acc0,8),$acc0	#$t2
-	mov	0($sbox,$acc1,8),$acc1	#$t3
-
-	and	\$0x0000ff00,$acc0
-	and	\$0x0000ff00,$acc1
-	shr	\$16,$s1
-	xor	$acc0,$t2
-	xor	$acc1,$t3
-	shr	\$16,$s0
-
-	movzb	`&lo("$s2")`,$acc0
-	movzb	`&lo("$s3")`,$acc1
-	movzb	`&lo("$s0")`,$acc2
-	mov	0($sbox,$acc0,8),$acc0	#$t0
-	mov	0($sbox,$acc1,8),$acc1	#$t1
-	mov	0($sbox,$acc2,8),$acc2	#$t2
-
-	and	\$0x00ff0000,$acc0
-	and	\$0x00ff0000,$acc1
-	and	\$0x00ff0000,$acc2
-
-	xor	$acc0,$t0
-	xor	$acc1,$t1
-	xor	$acc2,$t2
-
-	movzb	`&lo("$s1")`,$acc0
-	movzb	`&hi("$s3")`,$acc1
-	movzb	`&hi("$s0")`,$acc2
-	mov	0($sbox,$acc0,8),$acc0	#$t3
-	mov	2($sbox,$acc1,8),$acc1	#$t0
-	mov	2($sbox,$acc2,8),$acc2	#$t1
-
-	and	\$0x00ff0000,$acc0
-	and	\$0xff000000,$acc1
-	and	\$0xff000000,$acc2
-
-	xor	$acc0,$t3
-	xor	$acc1,$t0
-	xor	$acc2,$t1
-
-	movzb	`&hi("$s1")`,$acc0
-	movzb	`&hi("$s2")`,$acc1
-	mov	16+12($key),$s3
-	mov	2($sbox,$acc0,8),$acc0	#$t2
-	mov	2($sbox,$acc1,8),$acc1	#$t3
-	mov	16+0($key),$s0
-
-	and	\$0xff000000,$acc0
-	and	\$0xff000000,$acc1
-
-	xor	$acc0,$t2
-	xor	$acc1,$t3
-
-	mov	16+4($key),$s1
-	mov	16+8($key),$s2
-	xor	$t0,$s0
-	xor	$t1,$s1
-	xor	$t2,$s2
-	xor	$t3,$s3
-___
-}
-
-sub encstep()
-{ my ($i,@s) = @_;
-  my $tmp0=$acc0;
-  my $tmp1=$acc1;
-  my $tmp2=$acc2;
-  my $out=($t0,$t1,$t2,$s[0])[$i];
-
-	if ($i==3) {
-		$tmp0=$s[1];
-		$tmp1=$s[2];
-		$tmp2=$s[3];
-	}
-	$code.="	movzb	".&lo($s[0]).",$out\n";
-	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
-	$code.="	lea	16($key),$key\n"	if ($i==0);
-
-	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
-	$code.="	mov	0($sbox,$out,8),$out\n";
-
-	$code.="	shr	\$16,$tmp1\n";
-	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
-	$code.="	xor	3($sbox,$tmp0,8),$out\n";
-
-	$code.="	movzb	".&lo($tmp1).",$tmp1\n";
-	$code.="	shr	\$24,$tmp2\n";
-	$code.="	xor	4*$i($key),$out\n";
-
-	$code.="	xor	2($sbox,$tmp1,8),$out\n";
-	$code.="	xor	1($sbox,$tmp2,8),$out\n";
-
-	$code.="	mov	$t0,$s[1]\n"		if ($i==3);
-	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
-	$code.="	mov	$t2,$s[3]\n"		if ($i==3);
-	$code.="\n";
-}
-
-sub enclast()
-{ my ($i,@s)=@_;
-  my $tmp0=$acc0;
-  my $tmp1=$acc1;
-  my $tmp2=$acc2;
-  my $out=($t0,$t1,$t2,$s[0])[$i];
-
-	if ($i==3) {
-		$tmp0=$s[1];
-		$tmp1=$s[2];
-		$tmp2=$s[3];
-	}
-	$code.="	movzb	".&lo($s[0]).",$out\n";
-	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
-
-	$code.="	mov	2($sbox,$out,8),$out\n";
-	$code.="	shr	\$16,$tmp1\n";
-	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
-
-	$code.="	and	\$0x000000ff,$out\n";
-	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
-	$code.="	movzb	".&lo($tmp1).",$tmp1\n";
-	$code.="	shr	\$24,$tmp2\n";
-
-	$code.="	mov	0($sbox,$tmp0,8),$tmp0\n";
-	$code.="	mov	0($sbox,$tmp1,8),$tmp1\n";
-	$code.="	mov	2($sbox,$tmp2,8),$tmp2\n";
-
-	$code.="	and	\$0x0000ff00,$tmp0\n";
-	$code.="	and	\$0x00ff0000,$tmp1\n";
-	$code.="	and	\$0xff000000,$tmp2\n";
-
-	$code.="	xor	$tmp0,$out\n";
-	$code.="	mov	$t0,$s[1]\n"		if ($i==3);
-	$code.="	xor	$tmp1,$out\n";
-	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
-	$code.="	xor	$tmp2,$out\n";
-	$code.="	mov	$t2,$s[3]\n"		if ($i==3);
-	$code.="\n";
-}
-
-$code.=<<___;
-.type	_x86_64_AES_encrypt,\@abi-omnipotent
-.align	16
-_x86_64_AES_encrypt:
-	xor	0($key),$s0			# xor with key
-	xor	4($key),$s1
-	xor	8($key),$s2
-	xor	12($key),$s3
-
-	mov	240($key),$rnds			# load key->rounds
-	sub	\$1,$rnds
-	jmp	.Lenc_loop
-.align	16
-.Lenc_loop:
-___
-	if ($verticalspin) { &encvert(); }
-	else {	&encstep(0,$s0,$s1,$s2,$s3);
-		&encstep(1,$s1,$s2,$s3,$s0);
-		&encstep(2,$s2,$s3,$s0,$s1);
-		&encstep(3,$s3,$s0,$s1,$s2);
-	}
-$code.=<<___;
-	sub	\$1,$rnds
-	jnz	.Lenc_loop
-___
-	if ($verticalspin) { &enclastvert(); }
-	else {	&enclast(0,$s0,$s1,$s2,$s3);
-		&enclast(1,$s1,$s2,$s3,$s0);
-		&enclast(2,$s2,$s3,$s0,$s1);
-		&enclast(3,$s3,$s0,$s1,$s2);
-		$code.=<<___;
-		xor	16+0($key),$s0		# xor with key
-		xor	16+4($key),$s1
-		xor	16+8($key),$s2
-		xor	16+12($key),$s3
-___
-	}
-$code.=<<___;
-	.byte	0xf3,0xc3			# rep ret
-.size	_x86_64_AES_encrypt,.-_x86_64_AES_encrypt
-___
-
-# it's possible to implement this by shifting tN by 8, filling least
-# significant byte with byte load and finally bswap-ing at the end,
-# but such partial register load kills Core 2...
-sub enccompactvert()
-{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
-
-$code.=<<___;
-	movzb	`&lo("$s0")`,$t0
-	movzb	`&lo("$s1")`,$t1
-	movzb	`&lo("$s2")`,$t2
-	movzb	`&lo("$s3")`,$t3
-	movzb	`&hi("$s1")`,$acc0
-	movzb	`&hi("$s2")`,$acc1
-	shr	\$16,$s2
-	movzb	`&hi("$s3")`,$acc2
-	movzb	($sbox,$t0,1),$t0
-	movzb	($sbox,$t1,1),$t1
-	movzb	($sbox,$t2,1),$t2
-	movzb	($sbox,$t3,1),$t3
-
-	movzb	($sbox,$acc0,1),$t4	#$t0
-	movzb	`&hi("$s0")`,$acc0
-	movzb	($sbox,$acc1,1),$t5	#$t1
-	movzb	`&lo("$s2")`,$acc1
-	movzb	($sbox,$acc2,1),$acc2	#$t2
-	movzb	($sbox,$acc0,1),$acc0	#$t3
-
-	shl	\$8,$t4
-	shr	\$16,$s3
-	shl	\$8,$t5
-	xor	$t4,$t0
-	shr	\$16,$s0
-	movzb	`&lo("$s3")`,$t4
-	shr	\$16,$s1
-	xor	$t5,$t1
-	shl	\$8,$acc2
-	movzb	`&lo("$s0")`,$t5
-	movzb	($sbox,$acc1,1),$acc1	#$t0
-	xor	$acc2,$t2
-
-	shl	\$8,$acc0
-	movzb	`&lo("$s1")`,$acc2
-	shl	\$16,$acc1
-	xor	$acc0,$t3
-	movzb	($sbox,$t4,1),$t4	#$t1
-	movzb	`&hi("$s3")`,$acc0
-	movzb	($sbox,$t5,1),$t5	#$t2
-	xor	$acc1,$t0
-
-	shr	\$8,$s2
-	movzb	`&hi("$s0")`,$acc1
-	shl	\$16,$t4
-	shr	\$8,$s1
-	shl	\$16,$t5
-	xor	$t4,$t1
-	movzb	($sbox,$acc2,1),$acc2	#$t3
-	movzb	($sbox,$acc0,1),$acc0	#$t0
-	movzb	($sbox,$acc1,1),$acc1	#$t1
-	movzb	($sbox,$s2,1),$s3	#$t3
-	movzb	($sbox,$s1,1),$s2	#$t2
-
-	shl	\$16,$acc2
-	xor	$t5,$t2
-	shl	\$24,$acc0
-	xor	$acc2,$t3
-	shl	\$24,$acc1
-	xor	$acc0,$t0
-	shl	\$24,$s3
-	xor	$acc1,$t1
-	shl	\$24,$s2
-	mov	$t0,$s0
-	mov	$t1,$s1
-	xor	$t2,$s2
-	xor	$t3,$s3
-___
-}
-
-sub enctransform_ref()
-{ my $sn = shift;
-  my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d");
-
-$code.=<<___;
-	mov	$sn,$acc
-	and	\$0x80808080,$acc
-	mov	$acc,$tmp
-	shr	\$7,$tmp
-	lea	($sn,$sn),$r2
-	sub	$tmp,$acc
-	and	\$0xfefefefe,$r2
-	and	\$0x1b1b1b1b,$acc
-	mov	$sn,$tmp
-	xor	$acc,$r2
-
-	xor	$r2,$sn
-	rol	\$24,$sn
-	xor	$r2,$sn
-	ror	\$16,$tmp
-	xor	$tmp,$sn
-	ror	\$8,$tmp
-	xor	$tmp,$sn
-___
-}
-
-# unlike decrypt case it does not pay off to parallelize enctransform
-sub enctransform()
-{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");
-
-$code.=<<___;
-	mov	\$0x80808080,$t0
-	mov	\$0x80808080,$t1
-	and	$s0,$t0
-	and	$s1,$t1
-	mov	$t0,$acc0
-	mov	$t1,$acc1
-	shr	\$7,$t0
-	lea	($s0,$s0),$r20
-	shr	\$7,$t1
-	lea	($s1,$s1),$r21
-	sub	$t0,$acc0
-	sub	$t1,$acc1
-	and	\$0xfefefefe,$r20
-	and	\$0xfefefefe,$r21
-	and	\$0x1b1b1b1b,$acc0
-	and	\$0x1b1b1b1b,$acc1
-	mov	$s0,$t0
-	mov	$s1,$t1
-	xor	$acc0,$r20
-	xor	$acc1,$r21
-
-	xor	$r20,$s0
-	xor	$r21,$s1
-	 mov	\$0x80808080,$t2
-	rol	\$24,$s0
-	 mov	\$0x80808080,$t3
-	rol	\$24,$s1
-	 and	$s2,$t2
-	 and	$s3,$t3
-	xor	$r20,$s0
-	xor	$r21,$s1
-	 mov	$t2,$acc0
-	ror	\$16,$t0
-	 mov	$t3,$acc1
-	ror	\$16,$t1
-	 lea	($s2,$s2),$r20
-	 shr	\$7,$t2
-	xor	$t0,$s0
-	 shr	\$7,$t3
-	xor	$t1,$s1
-	ror	\$8,$t0
-	 lea	($s3,$s3),$r21
-	ror	\$8,$t1
-	 sub	$t2,$acc0
-	 sub	$t3,$acc1
-	xor	$t0,$s0
-	xor	$t1,$s1
-
-	and	\$0xfefefefe,$r20
-	and	\$0xfefefefe,$r21
-	and	\$0x1b1b1b1b,$acc0
-	and	\$0x1b1b1b1b,$acc1
-	mov	$s2,$t2
-	mov	$s3,$t3
-	xor	$acc0,$r20
-	xor	$acc1,$r21
-
-	ror	\$16,$t2
-	xor	$r20,$s2
-	ror	\$16,$t3
-	xor	$r21,$s3
-	rol	\$24,$s2
-	mov	0($sbox),$acc0			# prefetch Te4
-	rol	\$24,$s3
-	xor	$r20,$s2
-	mov	64($sbox),$acc1
-	xor	$r21,$s3
-	mov	128($sbox),$r20
-	xor	$t2,$s2
-	ror	\$8,$t2
-	xor	$t3,$s3
-	ror	\$8,$t3
-	xor	$t2,$s2
-	mov	192($sbox),$r21
-	xor	$t3,$s3
-___
-}
-
-$code.=<<___;
-.type	_x86_64_AES_encrypt_compact,\@abi-omnipotent
-.align	16
-_x86_64_AES_encrypt_compact:
-.cfi_startproc
-	lea	128($sbox),$inp			# size optimization
-	mov	0-128($inp),$acc1		# prefetch Te4
-	mov	32-128($inp),$acc2
-	mov	64-128($inp),$t0
-	mov	96-128($inp),$t1
-	mov	128-128($inp),$acc1
-	mov	160-128($inp),$acc2
-	mov	192-128($inp),$t0
-	mov	224-128($inp),$t1
-	jmp	.Lenc_loop_compact
-.align	16
-.Lenc_loop_compact:
-		xor	0($key),$s0		# xor with key
-		xor	4($key),$s1
-		xor	8($key),$s2
-		xor	12($key),$s3
-		lea	16($key),$key
-___
-		&enccompactvert();
-$code.=<<___;
-		cmp	16(%rsp),$key
-		je	.Lenc_compact_done
-___
-		&enctransform();
-$code.=<<___;
-	jmp	.Lenc_loop_compact
-.align	16
-.Lenc_compact_done:
-	xor	0($key),$s0
-	xor	4($key),$s1
-	xor	8($key),$s2
-	xor	12($key),$s3
-	.byte	0xf3,0xc3			# rep ret
-.cfi_endproc
-.size	_x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
-___
-
-# void aes_nohw_encrypt (const void *inp,void *out,const AES_KEY *key);
-$code.=<<___;
-.align	16
-.globl	aes_nohw_encrypt
-.type	aes_nohw_encrypt,\@function,3
-.hidden	aes_nohw_encrypt
-aes_nohw_encrypt:
-.cfi_startproc
-	mov	%rsp,%rax
-.cfi_def_cfa_register	%rax
-	push	%rbx
-.cfi_push	%rbx
-	push	%rbp
-.cfi_push	%rbp
-	push	%r12
-.cfi_push	%r12
-	push	%r13
-.cfi_push	%r13
-	push	%r14
-.cfi_push	%r14
-	push	%r15
-.cfi_push	%r15
-
-	# allocate frame "above" key schedule
-	lea	-63(%rdx),%rcx	# %rdx is key argument
-	and	\$-64,%rsp
-	sub	%rsp,%rcx
-	neg	%rcx
-	and	\$0x3c0,%rcx
-	sub	%rcx,%rsp
-	sub	\$32,%rsp
-
-	mov	%rsi,16(%rsp)	# save out
-	mov	%rax,24(%rsp)	# save original stack pointer
-.cfi_cfa_expression	%rsp+24,deref,+8
-.Lenc_prologue:
-
-	mov	%rdx,$key
-	mov	240($key),$rnds	# load rounds
-
-	mov	0(%rdi),$s0	# load input vector
-	mov	4(%rdi),$s1
-	mov	8(%rdi),$s2
-	mov	12(%rdi),$s3
-
-	shl	\$4,$rnds
-	lea	($key,$rnds),%rbp
-	mov	$key,(%rsp)	# key schedule
-	mov	%rbp,8(%rsp)	# end of key schedule
-
-	# pick Te4 copy which can't "overlap" with stack frame or key schedule
-	lea	.LAES_Te+2048(%rip),$sbox
-	lea	768(%rsp),%rbp
-	sub	$sbox,%rbp
-	and	\$0x300,%rbp
-	lea	($sbox,%rbp),$sbox
-
-	call	_x86_64_AES_encrypt_compact
-
-	mov	16(%rsp),$out	# restore out
-	mov	24(%rsp),%rsi	# restore saved stack pointer
-.cfi_def_cfa	%rsi,8
-	mov	$s0,0($out)	# write output vector
-	mov	$s1,4($out)
-	mov	$s2,8($out)
-	mov	$s3,12($out)
-
-	mov	-48(%rsi),%r15
-.cfi_restore	%r15
-	mov	-40(%rsi),%r14
-.cfi_restore	%r14
-	mov	-32(%rsi),%r13
-.cfi_restore	%r13
-	mov	-24(%rsi),%r12
-.cfi_restore	%r12
-	mov	-16(%rsi),%rbp
-.cfi_restore	%rbp
-	mov	-8(%rsi),%rbx
-.cfi_restore	%rbx
-	lea	(%rsi),%rsp
-.cfi_def_cfa_register	%rsp
-.Lenc_epilogue:
-	ret
-.cfi_endproc
-.size	aes_nohw_encrypt,.-aes_nohw_encrypt
-___
-
-#------------------------------------------------------------------#
-
-sub decvert()
-{ my $t3="%r8d";	# zaps $inp!
-
-$code.=<<___;
-	# favor 3-way issue Opteron pipeline...
-	movzb	`&lo("$s0")`,$acc0
-	movzb	`&lo("$s1")`,$acc1
-	movzb	`&lo("$s2")`,$acc2
-	mov	0($sbox,$acc0,8),$t0
-	mov	0($sbox,$acc1,8),$t1
-	mov	0($sbox,$acc2,8),$t2
-
-	movzb	`&hi("$s3")`,$acc0
-	movzb	`&hi("$s0")`,$acc1
-	movzb	`&lo("$s3")`,$acc2
-	xor	3($sbox,$acc0,8),$t0
-	xor	3($sbox,$acc1,8),$t1
-	mov	0($sbox,$acc2,8),$t3
-
-	movzb	`&hi("$s1")`,$acc0
-	shr	\$16,$s0
-	movzb	`&hi("$s2")`,$acc2
-	xor	3($sbox,$acc0,8),$t2
-	shr	\$16,$s3
-	xor	3($sbox,$acc2,8),$t3
-
-	shr	\$16,$s1
-	lea	16($key),$key
-	shr	\$16,$s2
-
-	movzb	`&lo("$s2")`,$acc0
-	movzb	`&lo("$s3")`,$acc1
-	movzb	`&lo("$s0")`,$acc2
-	xor	2($sbox,$acc0,8),$t0
-	xor	2($sbox,$acc1,8),$t1
-	xor	2($sbox,$acc2,8),$t2
-
-	movzb	`&hi("$s1")`,$acc0
-	movzb	`&hi("$s2")`,$acc1
-	movzb	`&lo("$s1")`,$acc2
-	xor	1($sbox,$acc0,8),$t0
-	xor	1($sbox,$acc1,8),$t1
-	xor	2($sbox,$acc2,8),$t3
-
-	movzb	`&hi("$s3")`,$acc0
-	mov	12($key),$s3
-	movzb	`&hi("$s0")`,$acc2
-	xor	1($sbox,$acc0,8),$t2
-	mov	0($key),$s0
-	xor	1($sbox,$acc2,8),$t3
-
-	xor	$t0,$s0
-	mov	4($key),$s1
-	mov	8($key),$s2
-	xor	$t2,$s2
-	xor	$t1,$s1
-	xor	$t3,$s3
-___
-}
-
-sub declastvert()
-{ my $t3="%r8d";	# zaps $inp!
-
-$code.=<<___;
-	lea	2048($sbox),$sbox	# size optimization
-	movzb	`&lo("$s0")`,$acc0
-	movzb	`&lo("$s1")`,$acc1
-	movzb	`&lo("$s2")`,$acc2
-	movzb	($sbox,$acc0,1),$t0
-	movzb	($sbox,$acc1,1),$t1
-	movzb	($sbox,$acc2,1),$t2
-
-	movzb	`&lo("$s3")`,$acc0
-	movzb	`&hi("$s3")`,$acc1
-	movzb	`&hi("$s0")`,$acc2
-	movzb	($sbox,$acc0,1),$t3
-	movzb	($sbox,$acc1,1),$acc1	#$t0
-	movzb	($sbox,$acc2,1),$acc2	#$t1
-
-	shl	\$8,$acc1
-	shl	\$8,$acc2
-
-	xor	$acc1,$t0
-	xor	$acc2,$t1
-	shr	\$16,$s3
-
-	movzb	`&hi("$s1")`,$acc0
-	movzb	`&hi("$s2")`,$acc1
-	shr	\$16,$s0
-	movzb	($sbox,$acc0,1),$acc0	#$t2
-	movzb	($sbox,$acc1,1),$acc1	#$t3
-
-	shl	\$8,$acc0
-	shl	\$8,$acc1
-	shr	\$16,$s1
-	xor	$acc0,$t2
-	xor	$acc1,$t3
-	shr	\$16,$s2
-
-	movzb	`&lo("$s2")`,$acc0
-	movzb	`&lo("$s3")`,$acc1
-	movzb	`&lo("$s0")`,$acc2
-	movzb	($sbox,$acc0,1),$acc0	#$t0
-	movzb	($sbox,$acc1,1),$acc1	#$t1
-	movzb	($sbox,$acc2,1),$acc2	#$t2
-
-	shl	\$16,$acc0
-	shl	\$16,$acc1
-	shl	\$16,$acc2
-
-	xor	$acc0,$t0
-	xor	$acc1,$t1
-	xor	$acc2,$t2
-
-	movzb	`&lo("$s1")`,$acc0
-	movzb	`&hi("$s1")`,$acc1
-	movzb	`&hi("$s2")`,$acc2
-	movzb	($sbox,$acc0,1),$acc0	#$t3
-	movzb	($sbox,$acc1,1),$acc1	#$t0
-	movzb	($sbox,$acc2,1),$acc2	#$t1
-
-	shl	\$16,$acc0
-	shl	\$24,$acc1
-	shl	\$24,$acc2
-
-	xor	$acc0,$t3
-	xor	$acc1,$t0
-	xor	$acc2,$t1
-
-	movzb	`&hi("$s3")`,$acc0
-	movzb	`&hi("$s0")`,$acc1
-	mov	16+12($key),$s3
-	movzb	($sbox,$acc0,1),$acc0	#$t2
-	movzb	($sbox,$acc1,1),$acc1	#$t3
-	mov	16+0($key),$s0
-
-	shl	\$24,$acc0
-	shl	\$24,$acc1
-
-	xor	$acc0,$t2
-	xor	$acc1,$t3
-
-	mov	16+4($key),$s1
-	mov	16+8($key),$s2
-	lea	-2048($sbox),$sbox
-	xor	$t0,$s0
-	xor	$t1,$s1
-	xor	$t2,$s2
-	xor	$t3,$s3
-___
-}
-
-sub decstep()
-{ my ($i,@s) = @_;
-  my $tmp0=$acc0;
-  my $tmp1=$acc1;
-  my $tmp2=$acc2;
-  my $out=($t0,$t1,$t2,$s[0])[$i];
-
-	$code.="	mov	$s[0],$out\n"		if ($i!=3);
-			$tmp1=$s[2]			if ($i==3);
-	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
-	$code.="	and	\$0xFF,$out\n";
-
-	$code.="	mov	0($sbox,$out,8),$out\n";
-	$code.="	shr	\$16,$tmp1\n";
-			$tmp2=$s[3]			if ($i==3);
-	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
-
-			$tmp0=$s[1]			if ($i==3);
-	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
-	$code.="	and	\$0xFF,$tmp1\n";
-	$code.="	shr	\$24,$tmp2\n";
-
-	$code.="	xor	3($sbox,$tmp0,8),$out\n";
-	$code.="	xor	2($sbox,$tmp1,8),$out\n";
-	$code.="	xor	1($sbox,$tmp2,8),$out\n";
-
-	$code.="	mov	$t2,$s[1]\n"		if ($i==3);
-	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
-	$code.="	mov	$t0,$s[3]\n"		if ($i==3);
-	$code.="\n";
-}
-
-sub declast()
-{ my ($i,@s)=@_;
-  my $tmp0=$acc0;
-  my $tmp1=$acc1;
-  my $tmp2=$acc2;
-  my $out=($t0,$t1,$t2,$s[0])[$i];
-
-	$code.="	mov	$s[0],$out\n"		if ($i!=3);
-			$tmp1=$s[2]			if ($i==3);
-	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
-	$code.="	and	\$0xFF,$out\n";
-
-	$code.="	movzb	2048($sbox,$out,1),$out\n";
-	$code.="	shr	\$16,$tmp1\n";
-			$tmp2=$s[3]			if ($i==3);
-	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
-
-			$tmp0=$s[1]			if ($i==3);
-	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
-	$code.="	and	\$0xFF,$tmp1\n";
-	$code.="	shr	\$24,$tmp2\n";
-
-	$code.="	movzb	2048($sbox,$tmp0,1),$tmp0\n";
-	$code.="	movzb	2048($sbox,$tmp1,1),$tmp1\n";
-	$code.="	movzb	2048($sbox,$tmp2,1),$tmp2\n";
-
-	$code.="	shl	\$8,$tmp0\n";
-	$code.="	shl	\$16,$tmp1\n";
-	$code.="	shl	\$24,$tmp2\n";
-
-	$code.="	xor	$tmp0,$out\n";
-	$code.="	mov	$t2,$s[1]\n"		if ($i==3);
-	$code.="	xor	$tmp1,$out\n";
-	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
-	$code.="	xor	$tmp2,$out\n";
-	$code.="	mov	$t0,$s[3]\n"		if ($i==3);
-	$code.="\n";
-}
-
-$code.=<<___;
-.type	_x86_64_AES_decrypt,\@abi-omnipotent
-.align	16
-_x86_64_AES_decrypt:
-	xor	0($key),$s0			# xor with key
-	xor	4($key),$s1
-	xor	8($key),$s2
-	xor	12($key),$s3
-
-	mov	240($key),$rnds			# load key->rounds
-	sub	\$1,$rnds
-	jmp	.Ldec_loop
-.align	16
-.Ldec_loop:
-___
-	if ($verticalspin) { &decvert(); }
-	else {	&decstep(0,$s0,$s3,$s2,$s1);
-		&decstep(1,$s1,$s0,$s3,$s2);
-		&decstep(2,$s2,$s1,$s0,$s3);
-		&decstep(3,$s3,$s2,$s1,$s0);
-		$code.=<<___;
-		lea	16($key),$key
-		xor	0($key),$s0			# xor with key
-		xor	4($key),$s1
-		xor	8($key),$s2
-		xor	12($key),$s3
-___
-	}
-$code.=<<___;
-	sub	\$1,$rnds
-	jnz	.Ldec_loop
-___
-	if ($verticalspin) { &declastvert(); }
-	else {	&declast(0,$s0,$s3,$s2,$s1);
-		&declast(1,$s1,$s0,$s3,$s2);
-		&declast(2,$s2,$s1,$s0,$s3);
-		&declast(3,$s3,$s2,$s1,$s0);
-		$code.=<<___;
-		xor	16+0($key),$s0			# xor with key
-		xor	16+4($key),$s1
-		xor	16+8($key),$s2
-		xor	16+12($key),$s3
-___
-	}
-$code.=<<___;
-	.byte	0xf3,0xc3			# rep ret
-.size	_x86_64_AES_decrypt,.-_x86_64_AES_decrypt
-___
-
-sub deccompactvert()
-{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
-
-$code.=<<___;
-	movzb	`&lo("$s0")`,$t0
-	movzb	`&lo("$s1")`,$t1
-	movzb	`&lo("$s2")`,$t2
-	movzb	`&lo("$s3")`,$t3
-	movzb	`&hi("$s3")`,$acc0
-	movzb	`&hi("$s0")`,$acc1
-	shr	\$16,$s3
-	movzb	`&hi("$s1")`,$acc2
-	movzb	($sbox,$t0,1),$t0
-	movzb	($sbox,$t1,1),$t1
-	movzb	($sbox,$t2,1),$t2
-	movzb	($sbox,$t3,1),$t3
-
-	movzb	($sbox,$acc0,1),$t4	#$t0
-	movzb	`&hi("$s2")`,$acc0
-	movzb	($sbox,$acc1,1),$t5	#$t1
-	movzb	($sbox,$acc2,1),$acc2	#$t2
-	movzb	($sbox,$acc0,1),$acc0	#$t3
-
-	shr	\$16,$s2
-	shl	\$8,$t5
-	shl	\$8,$t4
-	movzb	`&lo("$s2")`,$acc1
-	shr	\$16,$s0
-	xor	$t4,$t0
-	shr	\$16,$s1
-	movzb	`&lo("$s3")`,$t4
-
-	shl	\$8,$acc2
-	xor	$t5,$t1
-	shl	\$8,$acc0
-	movzb	`&lo("$s0")`,$t5
-	movzb	($sbox,$acc1,1),$acc1	#$t0
-	xor	$acc2,$t2
-	movzb	`&lo("$s1")`,$acc2
-
-	shl	\$16,$acc1
-	xor	$acc0,$t3
-	movzb	($sbox,$t4,1),$t4	#$t1
-	movzb	`&hi("$s1")`,$acc0
-	movzb	($sbox,$acc2,1),$acc2	#$t3
-	xor	$acc1,$t0
-	movzb	($sbox,$t5,1),$t5	#$t2
-	movzb	`&hi("$s2")`,$acc1
-
-	shl	\$16,$acc2
-	shl	\$16,$t4
-	shl	\$16,$t5
-	xor	$acc2,$t3
-	movzb	`&hi("$s3")`,$acc2
-	xor	$t4,$t1
-	shr	\$8,$s0
-	xor	$t5,$t2
-
-	movzb	($sbox,$acc0,1),$acc0	#$t0
-	movzb	($sbox,$acc1,1),$s1	#$t1
-	movzb	($sbox,$acc2,1),$s2	#$t2
-	movzb	($sbox,$s0,1),$s3	#$t3
-
-	mov	$t0,$s0
-	shl	\$24,$acc0
-	shl	\$24,$s1
-	shl	\$24,$s2
-	xor	$acc0,$s0
-	shl	\$24,$s3
-	xor	$t1,$s1
-	xor	$t2,$s2
-	xor	$t3,$s3
-___
-}
-
-# parallelized version! input is pair of 64-bit values: %rax=s1.s0
-# and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1,
-# %ecx=s2 and %edx=s3.
-sub dectransform()
-{ my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx");
-  my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx");
-  my $prefetch = shift;
-
-$code.=<<___;
-	mov	$mask80,$tp40
-	mov	$mask80,$tp48
-	and	$tp10,$tp40
-	and	$tp18,$tp48
-	mov	$tp40,$acc0
-	mov	$tp48,$acc8
-	shr	\$7,$tp40
-	lea	($tp10,$tp10),$tp20
-	shr	\$7,$tp48
-	lea	($tp18,$tp18),$tp28
-	sub	$tp40,$acc0
-	sub	$tp48,$acc8
-	and	$maskfe,$tp20
-	and	$maskfe,$tp28
-	and	$mask1b,$acc0
-	and	$mask1b,$acc8
-	xor	$acc0,$tp20
-	xor	$acc8,$tp28
-	mov	$mask80,$tp80
-	mov	$mask80,$tp88
-
-	and	$tp20,$tp80
-	and	$tp28,$tp88
-	mov	$tp80,$acc0
-	mov	$tp88,$acc8
-	shr	\$7,$tp80
-	lea	($tp20,$tp20),$tp40
-	shr	\$7,$tp88
-	lea	($tp28,$tp28),$tp48
-	sub	$tp80,$acc0
-	sub	$tp88,$acc8
-	and	$maskfe,$tp40
-	and	$maskfe,$tp48
-	and	$mask1b,$acc0
-	and	$mask1b,$acc8
-	xor	$acc0,$tp40
-	xor	$acc8,$tp48
-	mov	$mask80,$tp80
-	mov	$mask80,$tp88
-
-	and	$tp40,$tp80
-	and	$tp48,$tp88
-	mov	$tp80,$acc0
-	mov	$tp88,$acc8
-	shr	\$7,$tp80
-	 xor	$tp10,$tp20		# tp2^=tp1
-	shr	\$7,$tp88
-	 xor	$tp18,$tp28		# tp2^=tp1
-	sub	$tp80,$acc0
-	sub	$tp88,$acc8
-	lea	($tp40,$tp40),$tp80
-	lea	($tp48,$tp48),$tp88
-	 xor	$tp10,$tp40		# tp4^=tp1
-	 xor	$tp18,$tp48		# tp4^=tp1
-	and	$maskfe,$tp80
-	and	$maskfe,$tp88
-	and	$mask1b,$acc0
-	and	$mask1b,$acc8
-	xor	$acc0,$tp80
-	xor	$acc8,$tp88
-
-	xor	$tp80,$tp10		# tp1^=tp8
-	xor	$tp88,$tp18		# tp1^=tp8
-	xor	$tp80,$tp20		# tp2^tp1^=tp8
-	xor	$tp88,$tp28		# tp2^tp1^=tp8
-	mov	$tp10,$acc0
-	mov	$tp18,$acc8
-	xor	$tp80,$tp40		# tp4^tp1^=tp8
-	shr	\$32,$acc0
-	xor	$tp88,$tp48		# tp4^tp1^=tp8
-	shr	\$32,$acc8
-	xor	$tp20,$tp80		# tp8^=tp8^tp2^tp1=tp2^tp1
-	rol	\$8,`&LO("$tp10")`	# ROTATE(tp1^tp8,8)
-	xor	$tp28,$tp88		# tp8^=tp8^tp2^tp1=tp2^tp1
-	rol	\$8,`&LO("$tp18")`	# ROTATE(tp1^tp8,8)
-	xor	$tp40,$tp80		# tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
-	rol	\$8,`&LO("$acc0")`	# ROTATE(tp1^tp8,8)
-	xor	$tp48,$tp88		# tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
-
-	rol	\$8,`&LO("$acc8")`	# ROTATE(tp1^tp8,8)
-	xor	`&LO("$tp80")`,`&LO("$tp10")`
-	shr	\$32,$tp80
-	xor	`&LO("$tp88")`,`&LO("$tp18")`
-	shr	\$32,$tp88
-	xor	`&LO("$tp80")`,`&LO("$acc0")`
-	xor	`&LO("$tp88")`,`&LO("$acc8")`
-
-	mov	$tp20,$tp80
-	rol	\$24,`&LO("$tp20")`	# ROTATE(tp2^tp1^tp8,24)
-	mov	$tp28,$tp88
-	rol	\$24,`&LO("$tp28")`	# ROTATE(tp2^tp1^tp8,24)
-	shr	\$32,$tp80
-	xor	`&LO("$tp20")`,`&LO("$tp10")`
-	shr	\$32,$tp88
-	xor	`&LO("$tp28")`,`&LO("$tp18")`
-	rol	\$24,`&LO("$tp80")`	# ROTATE(tp2^tp1^tp8,24)
-	mov	$tp40,$tp20
-	rol	\$24,`&LO("$tp88")`	# ROTATE(tp2^tp1^tp8,24)
-	mov	$tp48,$tp28
-	shr	\$32,$tp20
-	xor	`&LO("$tp80")`,`&LO("$acc0")`
-	shr	\$32,$tp28
-	xor	`&LO("$tp88")`,`&LO("$acc8")`
-
-	`"mov	0($sbox),$mask80"	if ($prefetch)`
-	rol	\$16,`&LO("$tp40")`	# ROTATE(tp4^tp1^tp8,16)
-	`"mov	64($sbox),$maskfe"	if ($prefetch)`
-	rol	\$16,`&LO("$tp48")`	# ROTATE(tp4^tp1^tp8,16)
-	`"mov	128($sbox),$mask1b"	if ($prefetch)`
-	rol	\$16,`&LO("$tp20")`	# ROTATE(tp4^tp1^tp8,16)
-	`"mov	192($sbox),$tp80"	if ($prefetch)`
-	xor	`&LO("$tp40")`,`&LO("$tp10")`
-	rol	\$16,`&LO("$tp28")`	# ROTATE(tp4^tp1^tp8,16)
-	xor	`&LO("$tp48")`,`&LO("$tp18")`
-	`"mov	256($sbox),$tp88"	if ($prefetch)`
-	xor	`&LO("$tp20")`,`&LO("$acc0")`
-	xor	`&LO("$tp28")`,`&LO("$acc8")`
-___
-}
-
-$code.=<<___;
-.type	_x86_64_AES_decrypt_compact,\@abi-omnipotent
-.align	16
-_x86_64_AES_decrypt_compact:
-.cfi_startproc
-	lea	128($sbox),$inp			# size optimization
-	mov	0-128($inp),$acc1		# prefetch Td4
-	mov	32-128($inp),$acc2
-	mov	64-128($inp),$t0
-	mov	96-128($inp),$t1
-	mov	128-128($inp),$acc1
-	mov	160-128($inp),$acc2
-	mov	192-128($inp),$t0
-	mov	224-128($inp),$t1
-	jmp	.Ldec_loop_compact
-
-.align	16
-.Ldec_loop_compact:
-		xor	0($key),$s0		# xor with key
-		xor	4($key),$s1
-		xor	8($key),$s2
-		xor	12($key),$s3
-		lea	16($key),$key
-___
-		&deccompactvert();
-$code.=<<___;
-		cmp	16(%rsp),$key
-		je	.Ldec_compact_done
-
-		mov	256+0($sbox),$mask80
-		shl	\$32,%rbx
-		shl	\$32,%rdx
-		mov	256+8($sbox),$maskfe
-		or	%rbx,%rax
-		or	%rdx,%rcx
-		mov	256+16($sbox),$mask1b
-___
-		&dectransform(1);
-$code.=<<___;
-	jmp	.Ldec_loop_compact
-.align	16
-.Ldec_compact_done:
-	xor	0($key),$s0
-	xor	4($key),$s1
-	xor	8($key),$s2
-	xor	12($key),$s3
-	.byte	0xf3,0xc3			# rep ret
-.cfi_endproc
-.size	_x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
-___
-
-# void aes_nohw_decrypt (const void *inp,void *out,const AES_KEY *key);
-$code.=<<___;
-.align	16
-.globl	aes_nohw_decrypt
-.type	aes_nohw_decrypt,\@function,3
-.hidden	aes_nohw_decrypt
-aes_nohw_decrypt:
-.cfi_startproc
-	mov	%rsp,%rax
-.cfi_def_cfa_register	%rax
-	push	%rbx
-.cfi_push	%rbx
-	push	%rbp
-.cfi_push	%rbp
-	push	%r12
-.cfi_push	%r12
-	push	%r13
-.cfi_push	%r13
-	push	%r14
-.cfi_push	%r14
-	push	%r15
-.cfi_push	%r15
-
-	# allocate frame "above" key schedule
-	lea	-63(%rdx),%rcx	# %rdx is key argument
-	and	\$-64,%rsp
-	sub	%rsp,%rcx
-	neg	%rcx
-	and	\$0x3c0,%rcx
-	sub	%rcx,%rsp
-	sub	\$32,%rsp
-
-	mov	%rsi,16(%rsp)	# save out
-	mov	%rax,24(%rsp)	# save original stack pointer
-.cfi_cfa_expression	%rsp+24,deref,+8
-.Ldec_prologue:
-
-	mov	%rdx,$key
-	mov	240($key),$rnds	# load rounds
-
-	mov	0(%rdi),$s0	# load input vector
-	mov	4(%rdi),$s1
-	mov	8(%rdi),$s2
-	mov	12(%rdi),$s3
-
-	shl	\$4,$rnds
-	lea	($key,$rnds),%rbp
-	mov	$key,(%rsp)	# key schedule
-	mov	%rbp,8(%rsp)	# end of key schedule
-
-	# pick Td4 copy which can't "overlap" with stack frame or key schedule
-	lea	.LAES_Td+2048(%rip),$sbox
-	lea	768(%rsp),%rbp
-	sub	$sbox,%rbp
-	and	\$0x300,%rbp
-	lea	($sbox,%rbp),$sbox
-	shr	\$3,%rbp	# recall "magic" constants!
-	add	%rbp,$sbox
-
-	call	_x86_64_AES_decrypt_compact
-
-	mov	16(%rsp),$out	# restore out
-	mov	24(%rsp),%rsi	# restore saved stack pointer
-.cfi_def_cfa	%rsi,8
-	mov	$s0,0($out)	# write output vector
-	mov	$s1,4($out)
-	mov	$s2,8($out)
-	mov	$s3,12($out)
-
-	mov	-48(%rsi),%r15
-.cfi_restore	%r15
-	mov	-40(%rsi),%r14
-.cfi_restore	%r14
-	mov	-32(%rsi),%r13
-.cfi_restore	%r13
-	mov	-24(%rsi),%r12
-.cfi_restore	%r12
-	mov	-16(%rsi),%rbp
-.cfi_restore	%rbp
-	mov	-8(%rsi),%rbx
-.cfi_restore	%rbx
-	lea	(%rsi),%rsp
-.cfi_def_cfa_register	%rsp
-.Ldec_epilogue:
-	ret
-.cfi_endproc
-.size	aes_nohw_decrypt,.-aes_nohw_decrypt
-___
-#------------------------------------------------------------------#
-
-sub enckey()
-{
-$code.=<<___;
-	movz	%dl,%esi		# rk[i]>>0
-	movzb	-128(%rbp,%rsi),%ebx
-	movz	%dh,%esi		# rk[i]>>8
-	shl	\$24,%ebx
-	xor	%ebx,%eax
-
-	movzb	-128(%rbp,%rsi),%ebx
-	shr	\$16,%edx
-	movz	%dl,%esi		# rk[i]>>16
-	xor	%ebx,%eax
-
-	movzb	-128(%rbp,%rsi),%ebx
-	movz	%dh,%esi		# rk[i]>>24
-	shl	\$8,%ebx
-	xor	%ebx,%eax
-
-	movzb	-128(%rbp,%rsi),%ebx
-	shl	\$16,%ebx
-	xor	%ebx,%eax
-
-	xor	1024-128(%rbp,%rcx,4),%eax		# rcon
-___
-}
-
-# int aes_nohw_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key)
-$code.=<<___;
-.align	16
-.globl aes_nohw_set_encrypt_key
-.type  aes_nohw_set_encrypt_key,\@function,3
-aes_nohw_set_encrypt_key:
-.cfi_startproc
-	push	%rbx
-.cfi_push	%rbx
-	push	%rbp
-.cfi_push	%rbp
-	push	%r12			# redundant, but allows to share
-.cfi_push	%r12
-	push	%r13			# exception handler...
-.cfi_push	%r13
-	push	%r14
-.cfi_push	%r14
-	push	%r15
-.cfi_push	%r15
-	sub	\$8,%rsp
-.cfi_adjust_cfa_offset	8
-.Lenc_key_prologue:
-
-	call	_x86_64_AES_set_encrypt_key
-
-	mov	40(%rsp),%rbp
-.cfi_restore	%rbp
-	mov	48(%rsp),%rbx
-.cfi_restore	%rbx
-	add	\$56,%rsp
-.cfi_adjust_cfa_offset	-56
-.Lenc_key_epilogue:
-	ret
-.cfi_endproc
-.size aes_nohw_set_encrypt_key,.-aes_nohw_set_encrypt_key
-
-.type	_x86_64_AES_set_encrypt_key,\@abi-omnipotent
-.align	16
-_x86_64_AES_set_encrypt_key:
-.cfi_startproc
-	mov	%esi,%ecx			# %ecx=bits
-	mov	%rdi,%rsi			# %rsi=userKey
-	mov	%rdx,%rdi			# %rdi=key
-
-	test	\$-1,%rsi
-	jz	.Lbadpointer
-	test	\$-1,%rdi
-	jz	.Lbadpointer
-
-	lea	.LAES_Te(%rip),%rbp
-	lea	2048+128(%rbp),%rbp
-
-	# prefetch Te4
-	mov	0-128(%rbp),%eax
-	mov	32-128(%rbp),%ebx
-	mov	64-128(%rbp),%r8d
-	mov	96-128(%rbp),%edx
-	mov	128-128(%rbp),%eax
-	mov	160-128(%rbp),%ebx
-	mov	192-128(%rbp),%r8d
-	mov	224-128(%rbp),%edx
-
-	cmp	\$128,%ecx
-	je	.L10rounds
-	cmp	\$192,%ecx
-	je	.L12rounds
-	cmp	\$256,%ecx
-	je	.L14rounds
-	mov	\$-2,%rax			# invalid number of bits
-	jmp	.Lexit
-
-.L10rounds:
-	mov	0(%rsi),%rax			# copy first 4 dwords
-	mov	8(%rsi),%rdx
-	mov	%rax,0(%rdi)
-	mov	%rdx,8(%rdi)
-
-	shr	\$32,%rdx
-	xor	%ecx,%ecx
-	jmp	.L10shortcut
-.align	4
-.L10loop:
-		mov	0(%rdi),%eax			# rk[0]
-		mov	12(%rdi),%edx			# rk[3]
-.L10shortcut:
-___
-		&enckey	();
-$code.=<<___;
-		mov	%eax,16(%rdi)			# rk[4]
-		xor	4(%rdi),%eax
-		mov	%eax,20(%rdi)			# rk[5]
-		xor	8(%rdi),%eax
-		mov	%eax,24(%rdi)			# rk[6]
-		xor	12(%rdi),%eax
-		mov	%eax,28(%rdi)			# rk[7]
-		add	\$1,%ecx
-		lea	16(%rdi),%rdi
-		cmp	\$10,%ecx
-	jl	.L10loop
-
-	movl	\$10,80(%rdi)			# setup number of rounds
-	xor	%rax,%rax
-	jmp	.Lexit
-
-.L12rounds:
-	mov	0(%rsi),%rax			# copy first 6 dwords
-	mov	8(%rsi),%rbx
-	mov	16(%rsi),%rdx
-	mov	%rax,0(%rdi)
-	mov	%rbx,8(%rdi)
-	mov	%rdx,16(%rdi)
-
-	shr	\$32,%rdx
-	xor	%ecx,%ecx
-	jmp	.L12shortcut
-.align	4
-.L12loop:
-		mov	0(%rdi),%eax			# rk[0]
-		mov	20(%rdi),%edx			# rk[5]
-.L12shortcut:
-___
-		&enckey	();
-$code.=<<___;
-		mov	%eax,24(%rdi)			# rk[6]
-		xor	4(%rdi),%eax
-		mov	%eax,28(%rdi)			# rk[7]
-		xor	8(%rdi),%eax
-		mov	%eax,32(%rdi)			# rk[8]
-		xor	12(%rdi),%eax
-		mov	%eax,36(%rdi)			# rk[9]
-
-		cmp	\$7,%ecx
-		je	.L12break
-		add	\$1,%ecx
-
-		xor	16(%rdi),%eax
-		mov	%eax,40(%rdi)			# rk[10]
-		xor	20(%rdi),%eax
-		mov	%eax,44(%rdi)			# rk[11]
-
-		lea	24(%rdi),%rdi
-	jmp	.L12loop
-.L12break:
-	movl	\$12,72(%rdi)		# setup number of rounds
-	xor	%rax,%rax
-	jmp	.Lexit
-
-.L14rounds:
-	mov	0(%rsi),%rax			# copy first 8 dwords
-	mov	8(%rsi),%rbx
-	mov	16(%rsi),%rcx
-	mov	24(%rsi),%rdx
-	mov	%rax,0(%rdi)
-	mov	%rbx,8(%rdi)
-	mov	%rcx,16(%rdi)
-	mov	%rdx,24(%rdi)
-
-	shr	\$32,%rdx
-	xor	%ecx,%ecx
-	jmp	.L14shortcut
-.align	4
-.L14loop:
-		mov	0(%rdi),%eax			# rk[0]
-		mov	28(%rdi),%edx			# rk[4]
-.L14shortcut:
-___
-		&enckey	();
-$code.=<<___;
-		mov	%eax,32(%rdi)			# rk[8]
-		xor	4(%rdi),%eax
-		mov	%eax,36(%rdi)			# rk[9]
-		xor	8(%rdi),%eax
-		mov	%eax,40(%rdi)			# rk[10]
-		xor	12(%rdi),%eax
-		mov	%eax,44(%rdi)			# rk[11]
-
-		cmp	\$6,%ecx
-		je	.L14break
-		add	\$1,%ecx
-
-		mov	%eax,%edx
-		mov	16(%rdi),%eax			# rk[4]
-		movz	%dl,%esi			# rk[11]>>0
-		movzb	-128(%rbp,%rsi),%ebx
-		movz	%dh,%esi			# rk[11]>>8
-		xor	%ebx,%eax
-
-		movzb	-128(%rbp,%rsi),%ebx
-		shr	\$16,%edx
-		shl	\$8,%ebx
-		movz	%dl,%esi			# rk[11]>>16
-		xor	%ebx,%eax
-
-		movzb	-128(%rbp,%rsi),%ebx
-		movz	%dh,%esi			# rk[11]>>24
-		shl	\$16,%ebx
-		xor	%ebx,%eax
-
-		movzb	-128(%rbp,%rsi),%ebx
-		shl	\$24,%ebx
-		xor	%ebx,%eax
-
-		mov	%eax,48(%rdi)			# rk[12]
-		xor	20(%rdi),%eax
-		mov	%eax,52(%rdi)			# rk[13]
-		xor	24(%rdi),%eax
-		mov	%eax,56(%rdi)			# rk[14]
-		xor	28(%rdi),%eax
-		mov	%eax,60(%rdi)			# rk[15]
-
-		lea	32(%rdi),%rdi
-	jmp	.L14loop
-.L14break:
-	movl	\$14,48(%rdi)		# setup number of rounds
-	xor	%rax,%rax
-	jmp	.Lexit
-
-.Lbadpointer:
-	mov	\$-1,%rax
-.Lexit:
-	.byte	0xf3,0xc3			# rep ret
-.cfi_endproc
-.size	_x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
-___
-
-sub deckey_ref()
-{ my ($i,$ptr,$te,$td) = @_;
-  my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d");
-$code.=<<___;
-	mov	$i($ptr),$tp1
-	mov	$tp1,$acc
-	and	\$0x80808080,$acc
-	mov	$acc,$tp4
-	shr	\$7,$tp4
-	lea	0($tp1,$tp1),$tp2
-	sub	$tp4,$acc
-	and	\$0xfefefefe,$tp2
-	and	\$0x1b1b1b1b,$acc
-	xor	$tp2,$acc
-	mov	$acc,$tp2
-
-	and	\$0x80808080,$acc
-	mov	$acc,$tp8
-	shr	\$7,$tp8
-	lea	0($tp2,$tp2),$tp4
-	sub	$tp8,$acc
-	and	\$0xfefefefe,$tp4
-	and	\$0x1b1b1b1b,$acc
-	 xor	$tp1,$tp2		# tp2^tp1
-	xor	$tp4,$acc
-	mov	$acc,$tp4
-
-	and	\$0x80808080,$acc
-	mov	$acc,$tp8
-	shr	\$7,$tp8
-	sub	$tp8,$acc
-	lea	0($tp4,$tp4),$tp8
-	 xor	$tp1,$tp4		# tp4^tp1
-	and	\$0xfefefefe,$tp8
-	and	\$0x1b1b1b1b,$acc
-	xor	$acc,$tp8
-
-	xor	$tp8,$tp1		# tp1^tp8
-	rol	\$8,$tp1		# ROTATE(tp1^tp8,8)
-	xor	$tp8,$tp2		# tp2^tp1^tp8
-	xor	$tp8,$tp4		# tp4^tp1^tp8
-	xor	$tp2,$tp8
-	xor	$tp4,$tp8		# tp8^(tp8^tp4^tp1)^(tp8^tp2^tp1)=tp8^tp4^tp2
-
-	xor	$tp8,$tp1
-	rol	\$24,$tp2		# ROTATE(tp2^tp1^tp8,24)
-	xor	$tp2,$tp1
-	rol	\$16,$tp4		# ROTATE(tp4^tp1^tp8,16)
-	xor	$tp4,$tp1
-
-	mov	$tp1,$i($ptr)
-___
-}
-
-# int aes_nohw_set_decrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key)
-$code.=<<___;
-.align	16
-.globl aes_nohw_set_decrypt_key
-.type  aes_nohw_set_decrypt_key,\@function,3
-aes_nohw_set_decrypt_key:
-.cfi_startproc
-	push	%rbx
-.cfi_push	%rbx
-	push	%rbp
-.cfi_push	%rbp
-	push	%r12
-.cfi_push	%r12
-	push	%r13
-.cfi_push	%r13
-	push	%r14
-.cfi_push	%r14
-	push	%r15
-.cfi_push	%r15
-	push	%rdx			# save key schedule
-.cfi_adjust_cfa_offset	8
-.Ldec_key_prologue:
-
-	call	_x86_64_AES_set_encrypt_key
-	mov	(%rsp),%r8		# restore key schedule
-	cmp	\$0,%eax
-	jne	.Labort
-
-	mov	240(%r8),%r14d		# pull number of rounds
-	xor	%rdi,%rdi
-	lea	(%rdi,%r14d,4),%rcx
-	mov	%r8,%rsi
-	lea	(%r8,%rcx,4),%rdi	# pointer to last chunk
-.align	4
-.Linvert:
-		mov	0(%rsi),%rax
-		mov	8(%rsi),%rbx
-		mov	0(%rdi),%rcx
-		mov	8(%rdi),%rdx
-		mov	%rax,0(%rdi)
-		mov	%rbx,8(%rdi)
-		mov	%rcx,0(%rsi)
-		mov	%rdx,8(%rsi)
-		lea	16(%rsi),%rsi
-		lea	-16(%rdi),%rdi
-		cmp	%rsi,%rdi
-	jne	.Linvert
-
-	lea	.LAES_Te+2048+1024(%rip),%rax	# rcon
-
-	mov	40(%rax),$mask80
-	mov	48(%rax),$maskfe
-	mov	56(%rax),$mask1b
-
-	mov	%r8,$key
-	sub	\$1,%r14d
-.align	4
-.Lpermute:
-		lea	16($key),$key
-		mov	0($key),%rax
-		mov	8($key),%rcx
-___
-		&dectransform ();
-$code.=<<___;
-		mov	%eax,0($key)
-		mov	%ebx,4($key)
-		mov	%ecx,8($key)
-		mov	%edx,12($key)
-		sub	\$1,%r14d
-	jnz	.Lpermute
-
-	xor	%rax,%rax
-.Labort:
-	mov	8(%rsp),%r15
-.cfi_restore	%r15
-	mov	16(%rsp),%r14
-.cfi_restore	%r14
-	mov	24(%rsp),%r13
-.cfi_restore	%r13
-	mov	32(%rsp),%r12
-.cfi_restore	%r12
-	mov	40(%rsp),%rbp
-.cfi_restore	%rbp
-	mov	48(%rsp),%rbx
-.cfi_restore	%rbx
-	add	\$56,%rsp
-.cfi_adjust_cfa_offset	-56
-.Ldec_key_epilogue:
-	ret
-.cfi_endproc
-.size	aes_nohw_set_decrypt_key,.-aes_nohw_set_decrypt_key
-___
-
-# void aes_nohw_cbc_encrypt (const void char *inp, unsigned char *out,
-#			                       size_t length, const AES_KEY *key,
-#			                       unsigned char *ivp,const int enc);
-{
-# stack frame layout
-# -8(%rsp)		return address
-my $keyp="0(%rsp)";		# one to pass as $key
-my $keyend="8(%rsp)";		# &(keyp->rd_key[4*keyp->rounds])
-my $_rsp="16(%rsp)";		# saved %rsp
-my $_inp="24(%rsp)";		# copy of 1st parameter, inp
-my $_out="32(%rsp)";		# copy of 2nd parameter, out
-my $_len="40(%rsp)";		# copy of 3rd parameter, length
-my $_key="48(%rsp)";		# copy of 4th parameter, key
-my $_ivp="56(%rsp)";		# copy of 5th parameter, ivp
-my $ivec="64(%rsp)";		# ivec[16]
-my $aes_key="80(%rsp)";		# copy of aes_key
-my $mark="80+240(%rsp)";	# copy of aes_key->rounds
-
-$code.=<<___;
-.align	16
-.globl	aes_nohw_cbc_encrypt
-.type	aes_nohw_cbc_encrypt,\@function,6
-.extern	OPENSSL_ia32cap_P
-.hidden	aes_nohw_cbc_encrypt
-aes_nohw_cbc_encrypt:
-.cfi_startproc
-	cmp	\$0,%rdx	# check length
-	je	.Lcbc_epilogue
-	pushfq
-# This could be .cfi_push 49, but libunwind fails on registers it does not
-# recognize. See https://bugzilla.redhat.com/show_bug.cgi?id=217087.
-.cfi_adjust_cfa_offset	8
-	push	%rbx
-.cfi_push	%rbx
-	push	%rbp
-.cfi_push	%rbp
-	push	%r12
-.cfi_push	%r12
-	push	%r13
-.cfi_push	%r13
-	push	%r14
-.cfi_push	%r14
-	push	%r15
-.cfi_push	%r15
-.Lcbc_prologue:
-
-	cld
-	mov	%r9d,%r9d	# clear upper half of enc
-
-	lea	.LAES_Te(%rip),$sbox
-	lea	.LAES_Td(%rip),%r10
-	cmp	\$0,%r9
-	cmoveq	%r10,$sbox
-
-.cfi_remember_state
-	leaq	OPENSSL_ia32cap_P(%rip),%r10
-	mov	(%r10), %r10d
-	cmp	\$$speed_limit,%rdx
-	jb	.Lcbc_slow_prologue
-	test	\$15,%rdx
-	jnz	.Lcbc_slow_prologue
-	bt	\$28,%r10d
-	jc	.Lcbc_slow_prologue
-
-	# allocate aligned stack frame...
-	lea	-88-248(%rsp),$key
-	and	\$-64,$key
-
-	# ... and make sure it doesn't alias with AES_T[ed] modulo 4096
-	mov	$sbox,%r10
-	lea	2304($sbox),%r11
-	mov	$key,%r12
-	and	\$0xFFF,%r10	# s = $sbox&0xfff
-	and	\$0xFFF,%r11	# e = ($sbox+2048)&0xfff
-	and	\$0xFFF,%r12	# p = %rsp&0xfff
-
-	cmp	%r11,%r12	# if (p=>e) %rsp =- (p-e);
-	jb	.Lcbc_te_break_out
-	sub	%r11,%r12
-	sub	%r12,$key
-	jmp	.Lcbc_te_ok
-.Lcbc_te_break_out:		# else %rsp -= (p-s)&0xfff + framesz
-	sub	%r10,%r12
-	and	\$0xFFF,%r12
-	add	\$320,%r12
-	sub	%r12,$key
-.align	4
-.Lcbc_te_ok:
-
-	xchg	%rsp,$key
-.cfi_def_cfa_register	$key
-	#add	\$8,%rsp	# reserve for return address!
-	mov	$key,$_rsp	# save %rsp
-.cfi_cfa_expression	$_rsp,deref,+64
-.Lcbc_fast_body:
-	mov	%rdi,$_inp	# save copy of inp
-	mov	%rsi,$_out	# save copy of out
-	mov	%rdx,$_len	# save copy of len
-	mov	%rcx,$_key	# save copy of key
-	mov	%r8,$_ivp	# save copy of ivp
-	movl	\$0,$mark	# copy of aes_key->rounds = 0;
-	mov	%r8,%rbp	# rearrange input arguments
-	mov	%r9,%rbx
-	mov	%rsi,$out
-	mov	%rdi,$inp
-	mov	%rcx,$key
-
-	mov	240($key),%eax		# key->rounds
-	# do we copy key schedule to stack?
-	mov	$key,%r10
-	sub	$sbox,%r10
-	and	\$0xfff,%r10
-	cmp	\$2304,%r10
-	jb	.Lcbc_do_ecopy
-	cmp	\$4096-248,%r10
-	jb	.Lcbc_skip_ecopy
-.align	4
-.Lcbc_do_ecopy:
-		mov	$key,%rsi
-		lea	$aes_key,%rdi
-		lea	$aes_key,$key
-		mov	\$240/8,%ecx
-		.long	0x90A548F3	# rep movsq
-		mov	%eax,(%rdi)	# copy aes_key->rounds
-.Lcbc_skip_ecopy:
-	mov	$key,$keyp	# save key pointer
-
-	mov	\$18,%ecx
-.align	4
-.Lcbc_prefetch_te:
-		mov	0($sbox),%r10
-		mov	32($sbox),%r11
-		mov	64($sbox),%r12
-		mov	96($sbox),%r13
-		lea	128($sbox),$sbox
-		sub	\$1,%ecx
-	jnz	.Lcbc_prefetch_te
-	lea	-2304($sbox),$sbox
-
-	cmp	\$0,%rbx
-	je	.LFAST_DECRYPT
-
-#----------------------------- ENCRYPT -----------------------------#
-	mov	0(%rbp),$s0		# load iv
-	mov	4(%rbp),$s1
-	mov	8(%rbp),$s2
-	mov	12(%rbp),$s3
-
-.align	4
-.Lcbc_fast_enc_loop:
-		xor	0($inp),$s0
-		xor	4($inp),$s1
-		xor	8($inp),$s2
-		xor	12($inp),$s3
-		mov	$keyp,$key	# restore key
-		mov	$inp,$_inp	# if ($verticalspin) save inp
-
-		call	_x86_64_AES_encrypt
-
-		mov	$_inp,$inp	# if ($verticalspin) restore inp
-		mov	$_len,%r10
-		mov	$s0,0($out)
-		mov	$s1,4($out)
-		mov	$s2,8($out)
-		mov	$s3,12($out)
-
-		lea	16($inp),$inp
-		lea	16($out),$out
-		sub	\$16,%r10
-		test	\$-16,%r10
-		mov	%r10,$_len
-	jnz	.Lcbc_fast_enc_loop
-	mov	$_ivp,%rbp	# restore ivp
-	mov	$s0,0(%rbp)	# save ivec
-	mov	$s1,4(%rbp)
-	mov	$s2,8(%rbp)
-	mov	$s3,12(%rbp)
-
-	jmp	.Lcbc_fast_cleanup
-
-#----------------------------- DECRYPT -----------------------------#
-.align	16
-.LFAST_DECRYPT:
-	cmp	$inp,$out
-	je	.Lcbc_fast_dec_in_place
-
-	mov	%rbp,$ivec
-.align	4
-.Lcbc_fast_dec_loop:
-		mov	0($inp),$s0	# read input
-		mov	4($inp),$s1
-		mov	8($inp),$s2
-		mov	12($inp),$s3
-		mov	$keyp,$key	# restore key
-		mov	$inp,$_inp	# if ($verticalspin) save inp
-
-		call	_x86_64_AES_decrypt
-
-		mov	$ivec,%rbp	# load ivp
-		mov	$_inp,$inp	# if ($verticalspin) restore inp
-		mov	$_len,%r10	# load len
-		xor	0(%rbp),$s0	# xor iv
-		xor	4(%rbp),$s1
-		xor	8(%rbp),$s2
-		xor	12(%rbp),$s3
-		mov	$inp,%rbp	# current input, next iv
-
-		sub	\$16,%r10
-		mov	%r10,$_len	# update len
-		mov	%rbp,$ivec	# update ivp
-
-		mov	$s0,0($out)	# write output
-		mov	$s1,4($out)
-		mov	$s2,8($out)
-		mov	$s3,12($out)
-
-		lea	16($inp),$inp
-		lea	16($out),$out
-	jnz	.Lcbc_fast_dec_loop
-	mov	$_ivp,%r12		# load user ivp
-	mov	0(%rbp),%r10		# load iv
-	mov	8(%rbp),%r11
-	mov	%r10,0(%r12)		# copy back to user
-	mov	%r11,8(%r12)
-	jmp	.Lcbc_fast_cleanup
-
-.align	16
-.Lcbc_fast_dec_in_place:
-	mov	0(%rbp),%r10		# copy iv to stack
-	mov	8(%rbp),%r11
-	mov	%r10,0+$ivec
-	mov	%r11,8+$ivec
-.align	4
-.Lcbc_fast_dec_in_place_loop:
-		mov	0($inp),$s0	# load input
-		mov	4($inp),$s1
-		mov	8($inp),$s2
-		mov	12($inp),$s3
-		mov	$keyp,$key	# restore key
-		mov	$inp,$_inp	# if ($verticalspin) save inp
-
-		call	_x86_64_AES_decrypt
-
-		mov	$_inp,$inp	# if ($verticalspin) restore inp
-		mov	$_len,%r10
-		xor	0+$ivec,$s0
-		xor	4+$ivec,$s1
-		xor	8+$ivec,$s2
-		xor	12+$ivec,$s3
-
-		mov	0($inp),%r11	# load input
-		mov	8($inp),%r12
-		sub	\$16,%r10
-		jz	.Lcbc_fast_dec_in_place_done
-
-		mov	%r11,0+$ivec	# copy input to iv
-		mov	%r12,8+$ivec
-
-		mov	$s0,0($out)	# save output [zaps input]
-		mov	$s1,4($out)
-		mov	$s2,8($out)
-		mov	$s3,12($out)
-
-		lea	16($inp),$inp
-		lea	16($out),$out
-		mov	%r10,$_len
-	jmp	.Lcbc_fast_dec_in_place_loop
-.Lcbc_fast_dec_in_place_done:
-	mov	$_ivp,%rdi
-	mov	%r11,0(%rdi)	# copy iv back to user
-	mov	%r12,8(%rdi)
-
-	mov	$s0,0($out)	# save output [zaps input]
-	mov	$s1,4($out)
-	mov	$s2,8($out)
-	mov	$s3,12($out)
-
-.align	4
-.Lcbc_fast_cleanup:
-	cmpl	\$0,$mark	# was the key schedule copied?
-	lea	$aes_key,%rdi
-	je	.Lcbc_exit
-		mov	\$240/8,%ecx
-		xor	%rax,%rax
-		.long	0x90AB48F3	# rep stosq
-
-	jmp	.Lcbc_exit
-
-#--------------------------- SLOW ROUTINE ---------------------------#
-.align	16
-.Lcbc_slow_prologue:
-.cfi_restore_state
-	# allocate aligned stack frame...
-	lea	-88(%rsp),%rbp
-	and	\$-64,%rbp
-	# ... just "above" key schedule
-	lea	-88-63(%rcx),%r10
-	sub	%rbp,%r10
-	neg	%r10
-	and	\$0x3c0,%r10
-	sub	%r10,%rbp
-
-	xchg	%rsp,%rbp
-.cfi_def_cfa_register	%rbp
-	#add	\$8,%rsp	# reserve for return address!
-	mov	%rbp,$_rsp	# save %rsp
-.cfi_cfa_expression	$_rsp,deref,+64
-.Lcbc_slow_body:
-	#mov	%rdi,$_inp	# save copy of inp
-	#mov	%rsi,$_out	# save copy of out
-	#mov	%rdx,$_len	# save copy of len
-	#mov	%rcx,$_key	# save copy of key
-	mov	%r8,$_ivp	# save copy of ivp
-	mov	%r8,%rbp	# rearrange input arguments
-	mov	%r9,%rbx
-	mov	%rsi,$out
-	mov	%rdi,$inp
-	mov	%rcx,$key
-	mov	%rdx,%r10
-
-	mov	240($key),%eax
-	mov	$key,$keyp	# save key pointer
-	shl	\$4,%eax
-	lea	($key,%rax),%rax
-	mov	%rax,$keyend
-
-	# pick Te4 copy which can't "overlap" with stack frame or key schedule
-	lea	2048($sbox),$sbox
-	lea	768-8(%rsp),%rax
-	sub	$sbox,%rax
-	and	\$0x300,%rax
-	lea	($sbox,%rax),$sbox
-
-	cmp	\$0,%rbx
-	je	.LSLOW_DECRYPT
-
-#--------------------------- SLOW ENCRYPT ---------------------------#
-	test	\$-16,%r10		# check upon length
-	mov	0(%rbp),$s0		# load iv
-	mov	4(%rbp),$s1
-	mov	8(%rbp),$s2
-	mov	12(%rbp),$s3
-	jz	.Lcbc_slow_enc_tail	# short input...
-
-.align	4
-.Lcbc_slow_enc_loop:
-		xor	0($inp),$s0
-		xor	4($inp),$s1
-		xor	8($inp),$s2
-		xor	12($inp),$s3
-		mov	$keyp,$key	# restore key
-		mov	$inp,$_inp	# save inp
-		mov	$out,$_out	# save out
-		mov	%r10,$_len	# save len
-
-		call	_x86_64_AES_encrypt_compact
-
-		mov	$_inp,$inp	# restore inp
-		mov	$_out,$out	# restore out
-		mov	$_len,%r10	# restore len
-		mov	$s0,0($out)
-		mov	$s1,4($out)
-		mov	$s2,8($out)
-		mov	$s3,12($out)
-
-		lea	16($inp),$inp
-		lea	16($out),$out
-		sub	\$16,%r10
-		test	\$-16,%r10
-	jnz	.Lcbc_slow_enc_loop
-	test	\$15,%r10
-	jnz	.Lcbc_slow_enc_tail
-	mov	$_ivp,%rbp	# restore ivp
-	mov	$s0,0(%rbp)	# save ivec
-	mov	$s1,4(%rbp)
-	mov	$s2,8(%rbp)
-	mov	$s3,12(%rbp)
-
-	jmp	.Lcbc_exit
-
-.align	4
-.Lcbc_slow_enc_tail:
-	mov	%rax,%r11
-	mov	%rcx,%r12
-	mov	%r10,%rcx
-	mov	$inp,%rsi
-	mov	$out,%rdi
-	.long	0x9066A4F3		# rep movsb
-	mov	\$16,%rcx		# zero tail
-	sub	%r10,%rcx
-	xor	%rax,%rax
-	.long	0x9066AAF3		# rep stosb
-	mov	$out,$inp		# this is not a mistake!
-	mov	\$16,%r10		# len=16
-	mov	%r11,%rax
-	mov	%r12,%rcx
-	jmp	.Lcbc_slow_enc_loop	# one more spin...
-#--------------------------- SLOW DECRYPT ---------------------------#
-.align	16
-.LSLOW_DECRYPT:
-	shr	\$3,%rax
-	add	%rax,$sbox		# recall "magic" constants!
-
-	mov	0(%rbp),%r11		# copy iv to stack
-	mov	8(%rbp),%r12
-	mov	%r11,0+$ivec
-	mov	%r12,8+$ivec
-
-.align	4
-.Lcbc_slow_dec_loop:
-		mov	0($inp),$s0	# load input
-		mov	4($inp),$s1
-		mov	8($inp),$s2
-		mov	12($inp),$s3
-		mov	$keyp,$key	# restore key
-		mov	$inp,$_inp	# save inp
-		mov	$out,$_out	# save out
-		mov	%r10,$_len	# save len
-
-		call	_x86_64_AES_decrypt_compact
-
-		mov	$_inp,$inp	# restore inp
-		mov	$_out,$out	# restore out
-		mov	$_len,%r10
-		xor	0+$ivec,$s0
-		xor	4+$ivec,$s1
-		xor	8+$ivec,$s2
-		xor	12+$ivec,$s3
-
-		mov	0($inp),%r11	# load input
-		mov	8($inp),%r12
-		sub	\$16,%r10
-		jc	.Lcbc_slow_dec_partial
-		jz	.Lcbc_slow_dec_done
-
-		mov	%r11,0+$ivec	# copy input to iv
-		mov	%r12,8+$ivec
-
-		mov	$s0,0($out)	# save output [can zap input]
-		mov	$s1,4($out)
-		mov	$s2,8($out)
-		mov	$s3,12($out)
-
-		lea	16($inp),$inp
-		lea	16($out),$out
-	jmp	.Lcbc_slow_dec_loop
-.Lcbc_slow_dec_done:
-	mov	$_ivp,%rdi
-	mov	%r11,0(%rdi)		# copy iv back to user
-	mov	%r12,8(%rdi)
-
-	mov	$s0,0($out)		# save output [can zap input]
-	mov	$s1,4($out)
-	mov	$s2,8($out)
-	mov	$s3,12($out)
-
-	jmp	.Lcbc_exit
-
-.align	4
-.Lcbc_slow_dec_partial:
-	mov	$_ivp,%rdi
-	mov	%r11,0(%rdi)		# copy iv back to user
-	mov	%r12,8(%rdi)
-
-	mov	$s0,0+$ivec		# save output to stack
-	mov	$s1,4+$ivec
-	mov	$s2,8+$ivec
-	mov	$s3,12+$ivec
-
-	mov	$out,%rdi
-	lea	$ivec,%rsi
-	lea	16(%r10),%rcx
-	.long	0x9066A4F3	# rep movsb
-	jmp	.Lcbc_exit
-
-.align	16
-.Lcbc_exit:
-	mov	$_rsp,%rsi
-.cfi_def_cfa	%rsi,64
-	mov	(%rsi),%r15
-.cfi_restore	%r15
-	mov	8(%rsi),%r14
-.cfi_restore	%r14
-	mov	16(%rsi),%r13
-.cfi_restore	%r13
-	mov	24(%rsi),%r12
-.cfi_restore	%r12
-	mov	32(%rsi),%rbp
-.cfi_restore	%rbp
-	mov	40(%rsi),%rbx
-.cfi_restore	%rbx
-	lea	48(%rsi),%rsp
-.cfi_def_cfa	%rsp,16
-.Lcbc_popfq:
-	popfq
-# This could be .cfi_pop 49, but libunwind fails on registers it does not
-# recognize. See https://bugzilla.redhat.com/show_bug.cgi?id=217087.
-.cfi_adjust_cfa_offset	-8
-.Lcbc_epilogue:
-	ret
-.cfi_endproc
-.size	aes_nohw_cbc_encrypt,.-aes_nohw_cbc_encrypt
-___
-}
-
-$code.=<<___;
-.align	64
-.LAES_Te:
-___
-	&_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
-	&_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
-	&_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
-	&_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
-	&_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
-	&_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
-	&_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
-	&_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
-	&_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
-	&_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
-	&_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
-	&_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
-	&_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
-	&_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
-	&_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
-	&_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
-	&_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
-	&_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
-	&_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
-	&_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
-	&_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
-	&_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
-	&_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
-	&_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
-	&_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
-	&_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
-	&_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
-	&_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
-	&_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
-	&_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
-	&_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
-	&_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
-	&_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
-	&_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
-	&_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
-	&_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
-	&_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
-	&_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
-	&_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
-	&_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
-	&_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
-	&_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
-	&_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
-	&_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
-	&_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
-	&_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
-	&_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
-	&_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
-	&_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
-	&_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
-	&_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
-	&_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
-	&_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
-	&_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
-	&_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
-	&_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
-	&_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
-	&_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
-	&_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
-	&_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
-	&_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
-	&_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
-	&_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
-	&_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
-
-#Te4	# four copies of Te4 to choose from to avoid L1 aliasing
-	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
-	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
-	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
-	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
-	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
-	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
-	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
-	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
-	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
-	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
-	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
-	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
-	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
-	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
-	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
-	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
-	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
-	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
-	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
-	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
-	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
-	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
-	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
-	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
-	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
-	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
-	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
-	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
-	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
-	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
-	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
-	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
-
-	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
-	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
-	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
-	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
-	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
-	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
-	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
-	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
-	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
-	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
-	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
-	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
-	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
-	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
-	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
-	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
-	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
-	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
-	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
-	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
-	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
-	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
-	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
-	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
-	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
-	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
-	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
-	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
-	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
-	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
-	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
-	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
-
-	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
-	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
-	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
-	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
-	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
-	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
-	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
-	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
-	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
-	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
-	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
-	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
-	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
-	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
-	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
-	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
-	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
-	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
-	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
-	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
-	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
-	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
-	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
-	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
-	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
-	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
-	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
-	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
-	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
-	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
-	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
-	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
-
-	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
-	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
-	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
-	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
-	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
-	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
-	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
-	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
-	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
-	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
-	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
-	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
-	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
-	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
-	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
-	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
-	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
-	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
-	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
-	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
-	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
-	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
-	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
-	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
-	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
-	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
-	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
-	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
-	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
-	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
-	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
-	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
-#rcon:
-$code.=<<___;
-	.long	0x00000001, 0x00000002, 0x00000004, 0x00000008
-	.long	0x00000010, 0x00000020, 0x00000040, 0x00000080
-	.long	0x0000001b, 0x00000036, 0x80808080, 0x80808080
-	.long	0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b
-___
-$code.=<<___;
-.align	64
-.LAES_Td:
-___
-	&_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
-	&_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
-	&_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
-	&_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
-	&_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
-	&_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
-	&_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
-	&_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
-	&_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
-	&_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
-	&_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
-	&_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
-	&_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
-	&_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
-	&_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
-	&_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
-	&_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
-	&_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
-	&_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
-	&_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
-	&_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
-	&_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
-	&_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
-	&_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
-	&_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
-	&_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
-	&_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
-	&_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
-	&_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
-	&_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
-	&_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
-	&_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
-	&_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
-	&_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
-	&_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
-	&_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
-	&_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
-	&_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
-	&_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
-	&_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
-	&_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
-	&_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
-	&_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
-	&_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
-	&_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
-	&_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
-	&_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
-	&_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
-	&_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
-	&_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
-	&_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
-	&_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
-	&_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
-	&_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
-	&_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
-	&_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
-	&_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
-	&_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
-	&_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
-	&_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
-	&_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
-	&_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
-	&_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
-	&_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
-
-#Td4:	# four copies of Td4 to choose from to avoid L1 aliasing
-	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
-	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
-	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
-	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
-	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
-	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
-	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
-	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
-	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
-	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
-	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
-	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
-	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
-	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
-	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
-	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
-	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
-	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
-	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
-	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
-	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
-	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
-	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
-	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
-	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
-	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
-	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
-	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
-	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
-	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
-	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
-	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
-$code.=<<___;
-	.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
-	.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
-___
-	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
-	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
-	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
-	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
-	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
-	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
-	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
-	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
-	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
-	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
-	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
-	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
-	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
-	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
-	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
-	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
-	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
-	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
-	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
-	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
-	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
-	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
-	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
-	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
-	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
-	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
-	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
-	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
-	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
-	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
-	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
-	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
-$code.=<<___;
-	.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
-	.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
-___
-	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
-	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
-	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
-	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
-	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
-	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
-	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
-	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
-	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
-	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
-	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
-	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
-	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
-	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
-	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
-	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
-	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
-	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
-	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
-	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
-	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
-	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
-	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
-	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
-	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
-	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
-	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
-	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
-	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
-	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
-	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
-	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
-$code.=<<___;
-	.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
-	.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
-___
-	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
-	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
-	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
-	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
-	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
-	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
-	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
-	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
-	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
-	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
-	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
-	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
-	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
-	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
-	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
-	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
-	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
-	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
-	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
-	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
-	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
-	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
-	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
-	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
-	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
-	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
-	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
-	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
-	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
-	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
-	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
-	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
-$code.=<<___;
-	.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
-	.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
-.asciz  "AES for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
-.align	64
-___
-
-# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
-#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
-if ($win64) {
-$rec="%rcx";
-$frame="%rdx";
-$context="%r8";
-$disp="%r9";
-
-$code.=<<___;
-.extern	__imp_RtlVirtualUnwind
-.type	block_se_handler,\@abi-omnipotent
-.align	16
-block_se_handler:
-	push	%rsi
-	push	%rdi
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
-	push	%r14
-	push	%r15
-	pushfq
-	sub	\$64,%rsp
-
-	mov	120($context),%rax	# pull context->Rax
-	mov	248($context),%rbx	# pull context->Rip
-
-	mov	8($disp),%rsi		# disp->ImageBase
-	mov	56($disp),%r11		# disp->HandlerData
-
-	mov	0(%r11),%r10d		# HandlerData[0]
-	lea	(%rsi,%r10),%r10	# prologue label
-	cmp	%r10,%rbx		# context->Rip<prologue label
-	jb	.Lin_block_prologue
-
-	mov	152($context),%rax	# pull context->Rsp
-
-	mov	4(%r11),%r10d		# HandlerData[1]
-	lea	(%rsi,%r10),%r10	# epilogue label
-	cmp	%r10,%rbx		# context->Rip>=epilogue label
-	jae	.Lin_block_prologue
-
-	mov	24(%rax),%rax		# pull saved real stack pointer
-
-	mov	-8(%rax),%rbx
-	mov	-16(%rax),%rbp
-	mov	-24(%rax),%r12
-	mov	-32(%rax),%r13
-	mov	-40(%rax),%r14
-	mov	-48(%rax),%r15
-	mov	%rbx,144($context)	# restore context->Rbx
-	mov	%rbp,160($context)	# restore context->Rbp
-	mov	%r12,216($context)	# restore context->R12
-	mov	%r13,224($context)	# restore context->R13
-	mov	%r14,232($context)	# restore context->R14
-	mov	%r15,240($context)	# restore context->R15
-
-.Lin_block_prologue:
-	mov	8(%rax),%rdi
-	mov	16(%rax),%rsi
-	mov	%rax,152($context)	# restore context->Rsp
-	mov	%rsi,168($context)	# restore context->Rsi
-	mov	%rdi,176($context)	# restore context->Rdi
-
-	jmp	.Lcommon_seh_exit
-.size	block_se_handler,.-block_se_handler
-
-.type	key_se_handler,\@abi-omnipotent
-.align	16
-key_se_handler:
-	push	%rsi
-	push	%rdi
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
-	push	%r14
-	push	%r15
-	pushfq
-	sub	\$64,%rsp
-
-	mov	120($context),%rax	# pull context->Rax
-	mov	248($context),%rbx	# pull context->Rip
-
-	mov	8($disp),%rsi		# disp->ImageBase
-	mov	56($disp),%r11		# disp->HandlerData
-
-	mov	0(%r11),%r10d		# HandlerData[0]
-	lea	(%rsi,%r10),%r10	# prologue label
-	cmp	%r10,%rbx		# context->Rip<prologue label
-	jb	.Lin_key_prologue
-
-	mov	152($context),%rax	# pull context->Rsp
-
-	mov	4(%r11),%r10d		# HandlerData[1]
-	lea	(%rsi,%r10),%r10	# epilogue label
-	cmp	%r10,%rbx		# context->Rip>=epilogue label
-	jae	.Lin_key_prologue
-
-	lea	56(%rax),%rax
-
-	mov	-8(%rax),%rbx
-	mov	-16(%rax),%rbp
-	mov	-24(%rax),%r12
-	mov	-32(%rax),%r13
-	mov	-40(%rax),%r14
-	mov	-48(%rax),%r15
-	mov	%rbx,144($context)	# restore context->Rbx
-	mov	%rbp,160($context)	# restore context->Rbp
-	mov	%r12,216($context)	# restore context->R12
-	mov	%r13,224($context)	# restore context->R13
-	mov	%r14,232($context)	# restore context->R14
-	mov	%r15,240($context)	# restore context->R15
-
-.Lin_key_prologue:
-	mov	8(%rax),%rdi
-	mov	16(%rax),%rsi
-	mov	%rax,152($context)	# restore context->Rsp
-	mov	%rsi,168($context)	# restore context->Rsi
-	mov	%rdi,176($context)	# restore context->Rdi
-
-	jmp	.Lcommon_seh_exit
-.size	key_se_handler,.-key_se_handler
-
-.type	cbc_se_handler,\@abi-omnipotent
-.align	16
-cbc_se_handler:
-	push	%rsi
-	push	%rdi
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
-	push	%r14
-	push	%r15
-	pushfq
-	sub	\$64,%rsp
-
-	mov	120($context),%rax	# pull context->Rax
-	mov	248($context),%rbx	# pull context->Rip
-
-	lea	.Lcbc_prologue(%rip),%r10
-	cmp	%r10,%rbx		# context->Rip<.Lcbc_prologue
-	jb	.Lin_cbc_prologue
-
-	lea	.Lcbc_fast_body(%rip),%r10
-	cmp	%r10,%rbx		# context->Rip<.Lcbc_fast_body
-	jb	.Lin_cbc_frame_setup
-
-	lea	.Lcbc_slow_prologue(%rip),%r10
-	cmp	%r10,%rbx		# context->Rip<.Lcbc_slow_prologue
-	jb	.Lin_cbc_body
-
-	lea	.Lcbc_slow_body(%rip),%r10
-	cmp	%r10,%rbx		# context->Rip<.Lcbc_slow_body
-	jb	.Lin_cbc_frame_setup
-
-.Lin_cbc_body:
-	mov	152($context),%rax	# pull context->Rsp
-
-	lea	.Lcbc_epilogue(%rip),%r10
-	cmp	%r10,%rbx		# context->Rip>=.Lcbc_epilogue
-	jae	.Lin_cbc_prologue
-
-	lea	8(%rax),%rax
-
-	lea	.Lcbc_popfq(%rip),%r10
-	cmp	%r10,%rbx		# context->Rip>=.Lcbc_popfq
-	jae	.Lin_cbc_prologue
-
-	mov	`16-8`(%rax),%rax	# biased $_rsp
-	lea	56(%rax),%rax
-
-.Lin_cbc_frame_setup:
-	mov	-16(%rax),%rbx
-	mov	-24(%rax),%rbp
-	mov	-32(%rax),%r12
-	mov	-40(%rax),%r13
-	mov	-48(%rax),%r14
-	mov	-56(%rax),%r15
-	mov	%rbx,144($context)	# restore context->Rbx
-	mov	%rbp,160($context)	# restore context->Rbp
-	mov	%r12,216($context)	# restore context->R12
-	mov	%r13,224($context)	# restore context->R13
-	mov	%r14,232($context)	# restore context->R14
-	mov	%r15,240($context)	# restore context->R15
-
-.Lin_cbc_prologue:
-	mov	8(%rax),%rdi
-	mov	16(%rax),%rsi
-	mov	%rax,152($context)	# restore context->Rsp
-	mov	%rsi,168($context)	# restore context->Rsi
-	mov	%rdi,176($context)	# restore context->Rdi
-
-.Lcommon_seh_exit:
-
-	mov	40($disp),%rdi		# disp->ContextRecord
-	mov	$context,%rsi		# context
-	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
-	.long	0xa548f3fc		# cld; rep movsq
-
-	mov	$disp,%rsi
-	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
-	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
-	mov	0(%rsi),%r8		# arg3, disp->ControlPc
-	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
-	mov	40(%rsi),%r10		# disp->ContextRecord
-	lea	56(%rsi),%r11		# &disp->HandlerData
-	lea	24(%rsi),%r12		# &disp->EstablisherFrame
-	mov	%r10,32(%rsp)		# arg5
-	mov	%r11,40(%rsp)		# arg6
-	mov	%r12,48(%rsp)		# arg7
-	mov	%rcx,56(%rsp)		# arg8, (NULL)
-	call	*__imp_RtlVirtualUnwind(%rip)
-
-	mov	\$1,%eax		# ExceptionContinueSearch
-	add	\$64,%rsp
-	popfq
-	pop	%r15
-	pop	%r14
-	pop	%r13
-	pop	%r12
-	pop	%rbp
-	pop	%rbx
-	pop	%rdi
-	pop	%rsi
-	ret
-.size	cbc_se_handler,.-cbc_se_handler
-
-.section	.pdata
-.align	4
-	.rva	.LSEH_begin_aes_nohw_encrypt
-	.rva	.LSEH_end_aes_nohw_encrypt
-	.rva	.LSEH_info_aes_nohw_encrypt
-
-	.rva	.LSEH_begin_aes_nohw_decrypt
-	.rva	.LSEH_end_aes_nohw_decrypt
-	.rva	.LSEH_info_aes_nohw_decrypt
-
-	.rva	.LSEH_begin_aes_nohw_set_encrypt_key
-	.rva	.LSEH_end_aes_nohw_set_encrypt_key
-	.rva	.LSEH_info_aes_nohw_set_encrypt_key
-
-	.rva	.LSEH_begin_aes_nohw_set_decrypt_key
-	.rva	.LSEH_end_aes_nohw_set_decrypt_key
-	.rva	.LSEH_info_aes_nohw_set_decrypt_key
-
-	.rva	.LSEH_begin_aes_nohw_cbc_encrypt
-	.rva	.LSEH_end_aes_nohw_cbc_encrypt
-	.rva	.LSEH_info_aes_nohw_cbc_encrypt
-
-.section	.xdata
-.align	8
-.LSEH_info_aes_nohw_encrypt:
-	.byte	9,0,0,0
-	.rva	block_se_handler
-	.rva	.Lenc_prologue,.Lenc_epilogue	# HandlerData[]
-.LSEH_info_aes_nohw_decrypt:
-	.byte	9,0,0,0
-	.rva	block_se_handler
-	.rva	.Ldec_prologue,.Ldec_epilogue	# HandlerData[]
-.LSEH_info_aes_nohw_set_encrypt_key:
-	.byte	9,0,0,0
-	.rva	key_se_handler
-	.rva	.Lenc_key_prologue,.Lenc_key_epilogue	# HandlerData[]
-.LSEH_info_aes_nohw_set_decrypt_key:
-	.byte	9,0,0,0
-	.rva	key_se_handler
-	.rva	.Ldec_key_prologue,.Ldec_key_epilogue	# HandlerData[]
-.LSEH_info_aes_nohw_cbc_encrypt:
-	.byte	9,0,0,0
-	.rva	cbc_se_handler
-___
-}
-
-$code =~ s/\`([^\`]*)\`/eval($1)/gem;
-
-print $code;
-
-close STDOUT or die "error closing STDOUT";
diff --git a/crypto/fipsmodule/aes/internal.h b/crypto/fipsmodule/aes/internal.h
index 99d509a..5b80695 100644
--- a/crypto/fipsmodule/aes/internal.h
+++ b/crypto/fipsmodule/aes/internal.h
@@ -218,19 +218,17 @@
 #endif  // !VPAES
 
 
-void aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
-void aes_nohw_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
 int aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits,
                              AES_KEY *aeskey);
 int aes_nohw_set_decrypt_key(const uint8_t *key, unsigned bits,
                              AES_KEY *aeskey);
-
-#if !defined(OPENSSL_NO_ASM) && \
-    (defined(OPENSSL_X86_64) || defined(OPENSSL_X86))
-#define AES_NOHW_CBC
+void aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
+void aes_nohw_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
+void aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
+                                   size_t blocks, const AES_KEY *key,
+                                   const uint8_t ivec[16]);
 void aes_nohw_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                           const AES_KEY *key, uint8_t *ivec, const int enc);
-#endif
 
 
 #if defined(__cplusplus)
diff --git a/crypto/fipsmodule/aes/mode_wrappers.c b/crypto/fipsmodule/aes/mode_wrappers.c
index ae8a91b..206fcfd 100644
--- a/crypto/fipsmodule/aes/mode_wrappers.c
+++ b/crypto/fipsmodule/aes/mode_wrappers.c
@@ -79,12 +79,10 @@
     return;
   }
 
-#if defined(AES_NOHW_CBC)
   if (!vpaes_capable()) {
     aes_nohw_cbc_encrypt(in, out, len, key, ivec, enc);
     return;
   }
-#endif
   if (enc) {
     CRYPTO_cbc128_encrypt(in, out, len, key, ivec, AES_encrypt);
   } else {
diff --git a/crypto/fipsmodule/bcm.c b/crypto/fipsmodule/bcm.c
index 7485f6c..567a0cd 100644
--- a/crypto/fipsmodule/bcm.c
+++ b/crypto/fipsmodule/bcm.c
@@ -31,6 +31,7 @@
 #include "../internal.h"
 
 #include "aes/aes.c"
+#include "aes/aes_nohw.c"
 #include "aes/key_wrap.c"
 #include "aes/mode_wrappers.c"
 #include "bn/add.c"
diff --git a/crypto/fipsmodule/cipher/e_aes.c b/crypto/fipsmodule/cipher/e_aes.c
index c6dd973..8f4907f 100644
--- a/crypto/fipsmodule/cipher/e_aes.c
+++ b/crypto/fipsmodule/cipher/e_aes.c
@@ -173,11 +173,9 @@
       ret = aes_nohw_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
       dat->block = aes_nohw_decrypt;
       dat->stream.cbc = NULL;
-#if defined(AES_NOHW_CBC)
       if (mode == EVP_CIPH_CBC_MODE) {
         dat->stream.cbc = aes_nohw_cbc_encrypt;
       }
-#endif
     }
   } else if (hwaes_capable()) {
     ret = aes_hw_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
@@ -209,11 +207,9 @@
     ret = aes_nohw_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
     dat->block = aes_nohw_encrypt;
     dat->stream.cbc = NULL;
-#if defined(AES_NOHW_CBC)
     if (mode == EVP_CIPH_CBC_MODE) {
       dat->stream.cbc = aes_nohw_cbc_encrypt;
     }
-#endif
   }
 
   if (ret < 0) {
@@ -318,7 +314,7 @@
   if (out_block) {
     *out_block = aes_nohw_encrypt;
   }
-  return NULL;
+  return aes_nohw_ctr32_encrypt_blocks;
 }
 
 #if defined(OPENSSL_32_BIT)