Add a constant-time pshufb-based GHASH implementation.

We currently require clmul instructions for constant-time GHASH
on x86_64; without them, we fall back to a variable-time 4-bit table
implementation. However, a significant proportion of clients lack these
instructions.

Inspired by vpaes, we can use pshufb and a slightly different order of
incorporating the bits to build a constant-time GHASH. This requires
SSSE3, which is very common. Benchmarking on the old machines we had on
hand, the change appears to be performance-neutral on Sandy Bridge and
a small slowdown on Penryn.
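
For intuition, here is a minimal, self-contained C sketch of the pshufb
trick the assembly builds on (illustrative only; the function name and
the intrinsics-based formulation are not part of this change): a single
pshufb performs sixteen 4-bit table lookups at once, reading only a
16-byte table held in a register, so the lookups are constant-time
regardless of the secret nibble values. The new code applies the same
primitive row-by-row to a transposed Htable.

  #include <stdint.h>
  #include <tmmintrin.h>  // SSSE3: _mm_shuffle_epi8 (pshufb)

  // Split each input byte into nibbles and look both halves up in a
  // 16-entry byte table. Every lane reads from the same 16-byte register,
  // so the memory access pattern is independent of the (secret) input.
  void lookup_nibbles(const uint8_t table[16], const uint8_t in[16],
                      uint8_t lo_out[16], uint8_t hi_out[16]) {
    const __m128i tab = _mm_loadu_si128((const __m128i *)table);
    const __m128i low4 = _mm_set1_epi8(0x0f);
    const __m128i x = _mm_loadu_si128((const __m128i *)in);
    // pshufb zeroes any lane whose index byte has its high bit set, so
    // mask the indices down to 4 bits.
    const __m128i lo = _mm_and_si128(x, low4);
    const __m128i hi = _mm_and_si128(_mm_srli_epi16(x, 4), low4);
    _mm_storeu_si128((__m128i *)lo_out, _mm_shuffle_epi8(tab, lo));
    _mm_storeu_si128((__m128i *)hi_out, _mm_shuffle_epi8(tab, hi));
  }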

Sandy Bridge (Intel Pentium CPU 987 @ 1.50GHz):
(Note: these numbers predate 16-byte-aligning the table. That change was
an improvement on Penryn, so Sandy Bridge may now do slightly better.)
Before:
Did 4244750 AES-128-GCM (16 bytes) seal operations in 4015000us (1057222.9 ops/sec): 16.9 MB/s
Did 442000 AES-128-GCM (1350 bytes) seal operations in 4016000us (110059.8 ops/sec): 148.6 MB/s
Did 84000 AES-128-GCM (8192 bytes) seal operations in 4015000us (20921.5 ops/sec): 171.4 MB/s
Did 3349250 AES-256-GCM (16 bytes) seal operations in 4016000us (833976.6 ops/sec): 13.3 MB/s
Did 343500 AES-256-GCM (1350 bytes) seal operations in 4016000us (85532.9 ops/sec): 115.5 MB/s
Did 65250 AES-256-GCM (8192 bytes) seal operations in 4015000us (16251.6 ops/sec): 133.1 MB/s
After:
Did 4229250 AES-128-GCM (16 bytes) seal operations in 4016000us (1053100.1 ops/sec): 16.8 MB/s [-0.4%]
Did 442250 AES-128-GCM (1350 bytes) seal operations in 4016000us (110122.0 ops/sec): 148.7 MB/s [+0.1%]
Did 83500 AES-128-GCM (8192 bytes) seal operations in 4015000us (20797.0 ops/sec): 170.4 MB/s [-0.6%]
Did 3286500 AES-256-GCM (16 bytes) seal operations in 4016000us (818351.6 ops/sec): 13.1 MB/s [-1.9%]
Did 342750 AES-256-GCM (1350 bytes) seal operations in 4015000us (85367.4 ops/sec): 115.2 MB/s [-0.2%]
Did 65250 AES-256-GCM (8192 bytes) seal operations in 4016000us (16247.5 ops/sec): 133.1 MB/s [-0.0%]

Penryn (Intel Core 2 Duo CPU P8600 @ 2.40GHz):
Before:
Did 1179000 AES-128-GCM (16 bytes) seal operations in 1000139us (1178836.1 ops/sec): 18.9 MB/s
Did 97000 AES-128-GCM (1350 bytes) seal operations in 1006347us (96388.2 ops/sec): 130.1 MB/s
Did 18000 AES-128-GCM (8192 bytes) seal operations in 1028943us (17493.7 ops/sec): 143.3 MB/s
Did 977000 AES-256-GCM (16 bytes) seal operations in 1000197us (976807.6 ops/sec): 15.6 MB/s
Did 82000 AES-256-GCM (1350 bytes) seal operations in 1012434us (80992.9 ops/sec): 109.3 MB/s
Did 15000 AES-256-GCM (8192 bytes) seal operations in 1006528us (14902.7 ops/sec): 122.1 MB/s
After:
Did 1306000 AES-128-GCM (16 bytes) seal operations in 1000153us (1305800.2 ops/sec): 20.9 MB/s [+10.8%]
Did 94000 AES-128-GCM (1350 bytes) seal operations in 1009852us (93082.9 ops/sec): 125.7 MB/s [-3.4%]
Did 17000 AES-128-GCM (8192 bytes) seal operations in 1012096us (16796.8 ops/sec): 137.6 MB/s [-4.0%]
Did 1070000 AES-256-GCM (16 bytes) seal operations in 1000929us (1069006.9 ops/sec): 17.1 MB/s [+9.4%]
Did 79000 AES-256-GCM (1350 bytes) seal operations in 1002209us (78825.9 ops/sec): 106.4 MB/s [-2.7%]
Did 15000 AES-256-GCM (8192 bytes) seal operations in 1061489us (14131.1 ops/sec): 115.8 MB/s [-5.2%]

Change-Id: I1c3760a77af7bee4aee3745d1c648d9e34594afb
Reviewed-on: https://boringssl-review.googlesource.com/c/34267
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/CMakeLists.txt b/crypto/fipsmodule/CMakeLists.txt
index 463febb..b459263 100644
--- a/crypto/fipsmodule/CMakeLists.txt
+++ b/crypto/fipsmodule/CMakeLists.txt
@@ -8,6 +8,7 @@
     aesni-x86_64.${ASM_EXT}
     aes-x86_64.${ASM_EXT}
     bsaes-x86_64.${ASM_EXT}
+    ghash-ssse3-x86_64.${ASM_EXT}
     ghash-x86_64.${ASM_EXT}
     md5-x86_64.${ASM_EXT}
     p256-x86_64-asm.${ASM_EXT}
@@ -96,6 +97,7 @@
 perlasm(ghash-armv4.${ASM_EXT} modes/asm/ghash-armv4.pl)
 perlasm(ghashp8-ppc.${ASM_EXT} modes/asm/ghashp8-ppc.pl)
 perlasm(ghashv8-armx.${ASM_EXT} modes/asm/ghashv8-armx.pl)
+perlasm(ghash-ssse3-x86_64.${ASM_EXT} modes/asm/ghash-ssse3-x86_64.pl)
 perlasm(ghash-x86_64.${ASM_EXT} modes/asm/ghash-x86_64.pl)
 perlasm(ghash-x86.${ASM_EXT} modes/asm/ghash-x86.pl)
 perlasm(md5-586.${ASM_EXT} md5/asm/md5-586.pl)
diff --git a/crypto/fipsmodule/cipher/e_aes.c b/crypto/fipsmodule/cipher/e_aes.c
index f7c145b..81c74cb 100644
--- a/crypto/fipsmodule/cipher/e_aes.c
+++ b/crypto/fipsmodule/cipher/e_aes.c
@@ -46,6 +46,7 @@
  * OF THE POSSIBILITY OF SUCH DAMAGE.
  * ==================================================================== */
 
+#include <assert.h>
 #include <string.h>
 
 #include <openssl/aead.h>
@@ -84,13 +85,13 @@
 } EVP_AES_KEY;
 
 typedef struct {
+  GCM128_CONTEXT gcm;
   union {
     double align;
     AES_KEY ks;
   } ks;         // AES key schedule to use
   int key_set;  // Set if key initialised
   int iv_set;   // Set if an iv is set
-  GCM128_CONTEXT gcm;
   uint8_t *iv;  // Temporary IV store
   int ivlen;         // IV length
   int taglen;
@@ -257,9 +258,37 @@
   return NULL;
 }
 
+#if defined(OPENSSL_32_BIT)
+#define EVP_AES_GCM_CTX_PADDING (4+8)
+#else
+#define EVP_AES_GCM_CTX_PADDING 8
+#endif
+
+static EVP_AES_GCM_CTX *aes_gcm_from_cipher_ctx(EVP_CIPHER_CTX *ctx) {
+#if defined(__GNUC__) || defined(__clang__)
+  OPENSSL_STATIC_ASSERT(
+      alignof(EVP_AES_GCM_CTX) <= 16,
+      "EVP_AES_GCM_CTX needs more alignment than this function provides");
+#endif
+
+  // |malloc| guarantees up to 4-byte alignment on 32-bit and 8-byte alignment
+  // on 64-bit systems, so we need to adjust to reach 16-byte alignment.
+  assert(ctx->cipher->ctx_size ==
+         sizeof(EVP_AES_GCM_CTX) + EVP_AES_GCM_CTX_PADDING);
+
+  char *ptr = ctx->cipher_data;
+#if defined(OPENSSL_32_BIT)
+  assert((uintptr_t)ptr % 4 == 0);
+  ptr += (uintptr_t)ptr & 4;
+#endif
+  assert((uintptr_t)ptr % 8 == 0);
+  ptr += (uintptr_t)ptr & 8;
+  return (EVP_AES_GCM_CTX *)ptr;
+}
+
 static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key,
                             const uint8_t *iv, int enc) {
-  EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+  EVP_AES_GCM_CTX *gctx = aes_gcm_from_cipher_ctx(ctx);
   if (!iv && !key) {
     return 1;
   }
@@ -290,7 +319,7 @@
 }
 
 static void aes_gcm_cleanup(EVP_CIPHER_CTX *c) {
-  EVP_AES_GCM_CTX *gctx = c->cipher_data;
+  EVP_AES_GCM_CTX *gctx = aes_gcm_from_cipher_ctx(c);
   OPENSSL_cleanse(&gctx->gcm, sizeof(gctx->gcm));
   if (gctx->iv != c->iv) {
     OPENSSL_free(gctx->iv);
@@ -314,7 +343,7 @@
 }
 
 static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr) {
-  EVP_AES_GCM_CTX *gctx = c->cipher_data;
+  EVP_AES_GCM_CTX *gctx = aes_gcm_from_cipher_ctx(c);
   switch (type) {
     case EVP_CTRL_INIT:
       gctx->key_set = 0;
@@ -406,7 +435,7 @@
 
     case EVP_CTRL_COPY: {
       EVP_CIPHER_CTX *out = ptr;
-      EVP_AES_GCM_CTX *gctx_out = out->cipher_data;
+      EVP_AES_GCM_CTX *gctx_out = aes_gcm_from_cipher_ctx(out);
       if (gctx->iv == c->iv) {
         gctx_out->iv = out->iv;
       } else {
@@ -426,7 +455,7 @@
 
 static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in,
                           size_t len) {
-  EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+  EVP_AES_GCM_CTX *gctx = aes_gcm_from_cipher_ctx(ctx);
 
   // If not set up, return error
   if (!gctx->key_set) {
@@ -540,7 +569,7 @@
   out->block_size = 1;
   out->key_len = 16;
   out->iv_len = 12;
-  out->ctx_size = sizeof(EVP_AES_GCM_CTX);
+  out->ctx_size = sizeof(EVP_AES_GCM_CTX) + EVP_AES_GCM_CTX_PADDING;
   out->flags = EVP_CIPH_GCM_MODE | EVP_CIPH_CUSTOM_IV |
                EVP_CIPH_FLAG_CUSTOM_CIPHER | EVP_CIPH_ALWAYS_CALL_INIT |
                EVP_CIPH_CTRL_INIT | EVP_CIPH_FLAG_AEAD_CIPHER;
@@ -608,7 +637,7 @@
   out->block_size = 1;
   out->key_len = 24;
   out->iv_len = 12;
-  out->ctx_size = sizeof(EVP_AES_GCM_CTX);
+  out->ctx_size = sizeof(EVP_AES_GCM_CTX) + EVP_AES_GCM_CTX_PADDING;
   out->flags = EVP_CIPH_GCM_MODE | EVP_CIPH_CUSTOM_IV |
                EVP_CIPH_FLAG_CUSTOM_CIPHER | EVP_CIPH_ALWAYS_CALL_INIT |
                EVP_CIPH_CTRL_INIT | EVP_CIPH_FLAG_AEAD_CIPHER;
@@ -676,7 +705,7 @@
   out->block_size = 1;
   out->key_len = 32;
   out->iv_len = 12;
-  out->ctx_size = sizeof(EVP_AES_GCM_CTX);
+  out->ctx_size = sizeof(EVP_AES_GCM_CTX) + EVP_AES_GCM_CTX_PADDING;
   out->flags = EVP_CIPH_GCM_MODE | EVP_CIPH_CUSTOM_IV |
                EVP_CIPH_FLAG_CUSTOM_CIPHER | EVP_CIPH_ALWAYS_CALL_INIT |
                EVP_CIPH_CTRL_INIT | EVP_CIPH_FLAG_AEAD_CIPHER;
diff --git a/crypto/fipsmodule/modes/asm/ghash-ssse3-x86_64.pl b/crypto/fipsmodule/modes/asm/ghash-ssse3-x86_64.pl
new file mode 100644
index 0000000..830381b
--- /dev/null
+++ b/crypto/fipsmodule/modes/asm/ghash-ssse3-x86_64.pl
@@ -0,0 +1,413 @@
+#!/usr/bin/env perl
+# Copyright (c) 2019, Google Inc.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+# ghash-ssse3-x86_64.pl is a constant-time variant of the traditional 4-bit
+# table-based GHASH implementation. It requires SSSE3 instructions.
+#
+# For background, the table-based strategy is a 4-bit windowed multiplication.
+# It precomputes all 4-bit multiples of H (this is 16 128-bit rows), then loops
+# over 4-bit windows of the input and indexes them up into the table. Visually,
+# it multiplies as in the schoolbook multiplication diagram below, but with
+# more terms. (Each term is 4 bits, so there are 32 terms in each row.) First
+# it incorporates the terms labeled '1' by indexing the most significant term
+# of X into the table. Then it shifts and repeats for '2' and so on.
+#
+#        hhhhhh
+#  *     xxxxxx
+#  ============
+#        666666
+#       555555
+#      444444
+#     333333
+#    222222
+#   111111
+#
+# This implementation changes the order. We treat the table as a 16×16 matrix
+# and transpose it. The first row is then the first byte of each multiple of H,
+# and so on. We then reorder terms as below. Observe that the terms labeled '1'
+# and '2' are all lookups into the first row, etc. This maps well to the SSSE3
+# pshufb instruction, using alternating terms of X in parallel as indices. This
+# alternation is needed because pshufb maps 4 bits to 8 bits. Then we shift and
+# repeat for each row.
+#
+#        hhhhhh
+#  *     xxxxxx
+#  ============
+#        224466
+#       113355
+#      224466
+#     113355
+#    224466
+#   113355
+#
+# Next we account for GCM's confusing bit order. The "first" bit is the least
+# significant coefficient, but GCM treats the most significant bit within a byte
+# as first. Bytes are little-endian, and bits are big-endian. We reverse the
+# bytes in XMM registers for a consistent bit and byte ordering, but this means
+# the least significant bit is the most significant coefficient and vice versa.
+#
+# For consistency, "low", "high", "left-shift", and "right-shift" refer to the
+# bit ordering within the XMM register, rather than the reversed coefficient
+# ordering. Low bits are less significant bits and more significant
+# coefficients. Right-shifts move from MSB to the LSB and correspond to
+# increasing the power of each coefficient.
+#
+# Note this bit reversal enters into the table's column indices. H*1 is stored
+# in column 0b1000 and H*x^3 is stored in column 0b0001. It also means earlier
+# table rows contain more significant coefficients, so we iterate forwards.
+
+use strict;
+
+my $flavour = shift;
+my $output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+my $win64 = 0;
+$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/;
+my $dir = $1;
+my $xlate;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT = *OUT;
+
+my ($Xi, $Htable, $in, $len) = $win64 ? ("%rcx", "%rdx", "%r8", "%r9") :
+                                        ("%rdi", "%rsi", "%rdx", "%rcx");
+
+
+my $code = <<____;
+.text
+
+# gcm_gmult_ssse3 multiplies |Xi| by |Htable| and writes the result to |Xi|.
+# |Xi| is represented in GHASH's serialized byte representation. |Htable| is
+# formatted as described above.
+# void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]);
+.type	gcm_gmult_ssse3, \@abi-omnipotent
+.globl	gcm_gmult_ssse3
+.align	16
+gcm_gmult_ssse3:
+.cfi_startproc
+.Lgmult_seh_begin:
+____
+$code .= <<____ if ($win64);
+	subq	\$40, %rsp
+.Lgmult_seh_allocstack:
+	movdqa	%xmm6, (%rsp)
+.Lgmult_seh_save_xmm6:
+	movdqa	%xmm10, 16(%rsp)
+.Lgmult_seh_save_xmm10:
+.Lgmult_seh_prolog_end:
+____
+$code .= <<____;
+	movdqu	($Xi), %xmm0
+	movdqa	.Lreverse_bytes(%rip), %xmm10
+	movdqa	.Llow4_mask(%rip), %xmm2
+
+	# Reverse input bytes to deserialize.
+	pshufb	%xmm10, %xmm0
+
+	# Split each byte into low (%xmm0) and high (%xmm1) halves.
+	movdqa	%xmm2, %xmm1
+	pandn	%xmm0, %xmm1
+	psrld	\$4, %xmm1
+	pand	%xmm2, %xmm0
+
+	# Maintain the result in %xmm2 (the value) and %xmm3 (carry bits). Note
+	# that, due to bit reversal, %xmm3 contains bits that fall off when
+	# right-shifting, not left-shifting.
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+____
+
+my $call_counter = 0;
+# process_rows returns assembly code to process $rows rows of the table. On
+# input, $Htable stores the pointer to the next row. %xmm0 and %xmm1 store the
+# low and high halves of the input. The result so far is passed in %xmm2. %xmm3
+# must be zero. On output, $Htable is advanced to the next row and %xmm2 is
+# updated. %xmm3 remains zero. It clobbers %rax, %xmm4, %xmm5, and %xmm6.
+sub process_rows {
+    my ($rows) = @_;
+    $call_counter++;
+
+    # Shifting whole XMM registers by bits is complex. psrldq shifts by bytes,
+    # and psrlq shifts the two 64-bit halves separately. Each row produces 8
+    # bits of carry, and the reduction needs an additional 7-bit shift. This
+    # must fit in 64 bits so reduction can use psrlq. This allows up to 7 rows
+    # at a time.
+    die "Carry register would overflow 64 bits." if ($rows*8 + 7 > 64);
+
+    return <<____;
+	movq	\$$rows, %rax
+.Loop_row_$call_counter:
+	movdqa	($Htable), %xmm4
+	leaq	16($Htable), $Htable
+
+	# Right-shift %xmm2 and %xmm3 by 8 bits.
+	movdqa	%xmm2, %xmm6
+	palignr	\$1, %xmm3, %xmm6
+	movdqa	%xmm6, %xmm3
+	psrldq	\$1, %xmm2
+
+	# Load the next table row and index the low and high bits of the input.
+	# Note the low (respectively, high) half corresponds to more
+	# (respectively, less) significant coefficients.
+	movdqa	%xmm4, %xmm5
+	pshufb	%xmm0, %xmm4
+	pshufb	%xmm1, %xmm5
+
+	# Add the high half (%xmm5) without shifting.
+	pxor	%xmm5, %xmm2
+
+	# Add the low half (%xmm4). This must be right-shifted by 4 bits. First,
+	# add into the carry register (%xmm3).
+	movdqa	%xmm4, %xmm5
+	psllq	\$60, %xmm5
+	movdqa	%xmm5, %xmm6
+	pslldq	\$8, %xmm6
+	pxor	%xmm6, %xmm3
+
+	# Next, add into %xmm2.
+	psrldq	\$8, %xmm5
+	pxor	%xmm5, %xmm2
+	psrlq	\$4, %xmm4
+	pxor	%xmm4, %xmm2
+
+	subq	\$1, %rax
+	jnz	.Loop_row_$call_counter
+
+	# Reduce the carry register. The reduction polynomial is 1 + x + x^2 +
+	# x^7, so we shift and XOR four times.
+	pxor	%xmm3, %xmm2	# x^0 = 0
+	psrlq	\$1, %xmm3
+	pxor	%xmm3, %xmm2	# x^1 = x
+	psrlq	\$1, %xmm3
+	pxor	%xmm3, %xmm2	# x^(1+1) = x^2
+	psrlq	\$5, %xmm3
+	pxor	%xmm3, %xmm2	# x^(1+1+5) = x^7
+	pxor	%xmm3, %xmm3
+____
+}
+
+# We must reduce at least once every 7 rows, so divide into three chunks.
+$code .= process_rows(5);
+$code .= process_rows(5);
+$code .= process_rows(6);
+
+$code .= <<____;
+	# Store the result. Reverse bytes to serialize.
+	pshufb	%xmm10, %xmm2
+	movdqu	%xmm2, ($Xi)
+
+	# Zero any registers which contain secrets.
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+	pxor	%xmm4, %xmm4
+	pxor	%xmm5, %xmm5
+	pxor	%xmm6, %xmm6
+____
+$code .= <<____ if ($win64);
+	movdqa	(%rsp), %xmm6
+	movdqa	16(%rsp), %xmm10
+	addq	\$40, %rsp
+____
+$code .= <<____;
+	ret
+.Lgmult_seh_end:
+.cfi_endproc
+.size	gcm_gmult_ssse3,.-gcm_gmult_ssse3
+____
+
+$code .= <<____;
+# gcm_ghash_ssse3 incorporates |len| bytes from |in| to |Xi|, using |Htable| as
+# the key. It writes the result back to |Xi|. |Xi| is represented in GHASH's
+# serialized byte representation. |Htable| is formatted as described above.
+# void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
+#                      size_t len);
+.type	gcm_ghash_ssse3, \@abi-omnipotent
+.globl	gcm_ghash_ssse3
+.align	16
+gcm_ghash_ssse3:
+.Lghash_seh_begin:
+.cfi_startproc
+____
+$code .= <<____ if ($win64);
+	subq	\$56, %rsp
+.Lghash_seh_allocstack:
+	movdqa	%xmm6, (%rsp)
+.Lghash_seh_save_xmm6:
+	movdqa	%xmm10, 16(%rsp)
+.Lghash_seh_save_xmm10:
+	movdqa	%xmm11, 32(%rsp)
+.Lghash_seh_save_xmm11:
+.Lghash_seh_prolog_end:
+____
+$code .= <<____;
+	movdqu	($Xi), %xmm0
+	movdqa	.Lreverse_bytes(%rip), %xmm10
+	movdqa	.Llow4_mask(%rip), %xmm11
+
+	# This function only processes whole blocks.
+	andq	\$-16, $len
+
+	# Reverse input bytes to deserialize. We maintain the running
+	# total in %xmm0.
+	pshufb	%xmm10, %xmm0
+
+	# Iterate over each block. On entry to each iteration, %xmm3 is zero.
+	pxor	%xmm3, %xmm3
+.Loop_ghash:
+	# Incorporate the next block of input.
+	movdqu	($in), %xmm1
+	pshufb	%xmm10, %xmm1	# Reverse bytes.
+	pxor	%xmm1, %xmm0
+
+	# Split each byte into low (%xmm0) and high (%xmm1) halves.
+	movdqa	%xmm11, %xmm1
+	pandn	%xmm0, %xmm1
+	psrld	\$4, %xmm1
+	pand	%xmm11, %xmm0
+
+	# Maintain the result in %xmm2 (the value) and %xmm3 (carry bits). Note
+	# that, due to bit reversal, %xmm3 contains bits that fall off when
+	# right-shifting, not left-shifting.
+	pxor	%xmm2, %xmm2
+	# %xmm3 is already zero at this point.
+____
+
+# We must reduce at least once every 7 rows, so divide into three chunks.
+$code .= process_rows(5);
+$code .= process_rows(5);
+$code .= process_rows(6);
+
+$code .= <<____;
+	movdqa	%xmm2, %xmm0
+
+	# Rewind $Htable for the next iteration.
+	leaq	-256($Htable), $Htable
+
+	# Advance input and continue.
+	leaq	16($in), $in
+	subq	\$16, $len
+	jnz	.Loop_ghash
+
+	# Reverse bytes and store the result.
+	pshufb	%xmm10, %xmm0
+	movdqu	%xmm0, ($Xi)
+
+	# Zero any registers which contain secrets.
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+	pxor	%xmm4, %xmm4
+	pxor	%xmm5, %xmm5
+	pxor	%xmm6, %xmm6
+____
+$code .= <<____ if ($win64);
+	movdqa	(%rsp), %xmm6
+	movdqa	16(%rsp), %xmm10
+	movdqa	32(%rsp), %xmm11
+	addq	\$56, %rsp
+____
+$code .= <<____;
+	ret
+.Lghash_seh_end:
+.cfi_endproc
+.size	gcm_ghash_ssse3,.-gcm_ghash_ssse3
+
+.align	16
+# .Lreverse_bytes is a permutation which, if applied with pshufb, reverses the
+# bytes in an XMM register.
+.Lreverse_bytes:
+.byte	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+# .Llow4_mask is an XMM mask which selects the low four bits of each byte.
+.Llow4_mask:
+.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
+____
+
+if ($win64) {
+  # Add unwind metadata for SEH.
+  #
+  # TODO(davidben): This is all manual right now. Once we've added SEH tests,
+  # add support for emitting these in x86_64-xlate.pl, probably based on MASM
+  # and Yasm's unwind directives, and unify with CFI. Then upstream it to
+  # replace the error-prone and non-standard custom handlers.
+
+  # See https://docs.microsoft.com/en-us/cpp/build/struct-unwind-code?view=vs-2017
+  my $UWOP_ALLOC_SMALL = 2;
+  my $UWOP_SAVE_XMM128 = 8;
+
+  $code .= <<____;
+.section	.pdata
+.align	4
+	.rva	.Lgmult_seh_begin
+	.rva	.Lgmult_seh_end
+	.rva	.Lgmult_seh_info
+
+	.rva	.Lghash_seh_begin
+	.rva	.Lghash_seh_end
+	.rva	.Lghash_seh_info
+
+.section	.xdata
+.align	8
+.Lgmult_seh_info:
+	.byte	1	# version 1, no flags
+	.byte	.Lgmult_seh_prolog_end-.Lgmult_seh_begin
+	.byte	5	# num_slots = 1 + 2 + 2
+	.byte	0	# no frame register
+
+	.byte	.Lgmult_seh_allocstack-.Lgmult_seh_begin
+	.byte	@{[$UWOP_ALLOC_SMALL | (((40 - 8) / 8) << 4)]}
+
+	.byte	.Lgmult_seh_save_xmm6-.Lgmult_seh_begin
+	.byte	@{[$UWOP_SAVE_XMM128 | (6 << 4)]}
+	.value	0
+
+	.byte	.Lgmult_seh_save_xmm10-.Lgmult_seh_begin
+	.byte	@{[$UWOP_SAVE_XMM128 | (10 << 4)]}
+	.value	1
+
+.align	8
+.Lghash_seh_info:
+	.byte	1	# version 1, no flags
+	.byte	.Lghash_seh_prolog_end-.Lghash_seh_begin
+	.byte	7	# num_slots = 1 + 2 + 2 + 2
+	.byte	0	# no frame register
+
+	.byte	.Lghash_seh_allocstack-.Lghash_seh_begin
+	.byte	@{[$UWOP_ALLOC_SMALL | (((56 - 8) / 8) << 4)]}
+
+	.byte	.Lghash_seh_save_xmm6-.Lghash_seh_begin
+	.byte	@{[$UWOP_SAVE_XMM128 | (6 << 4)]}
+	.value	0
+
+	.byte	.Lghash_seh_save_xmm10-.Lghash_seh_begin
+	.byte	@{[$UWOP_SAVE_XMM128 | (10 << 4)]}
+	.value	1
+
+	.byte	.Lghash_seh_save_xmm11-.Lghash_seh_begin
+	.byte	@{[$UWOP_SAVE_XMM128 | (11 << 4)]}
+	.value	2
+____
+}
+
+print $code;
+close STDOUT;
diff --git a/crypto/fipsmodule/modes/gcm.c b/crypto/fipsmodule/modes/gcm.c
index 2a450cd..681f7a9 100644
--- a/crypto/fipsmodule/modes/gcm.c
+++ b/crypto/fipsmodule/modes/gcm.c
@@ -243,6 +243,33 @@
 #define GHASH_CHUNK (3 * 1024)
 #endif  // GHASH_ASM
 
+#if defined(GHASH_ASM_X86_64)
+void gcm_init_ssse3(u128 Htable[16], const uint64_t Xi[2]) {
+  // Run the existing 4-bit version.
+  gcm_init_4bit(Htable, Xi);
+
+  // First, swap hi and lo. The "4bit" version places hi first. It treats the
+  // two fields separately, so the order does not matter, but ghash-ssse3 reads
+  // the entire state into one 128-bit register.
+  for (int i = 0; i < 16; i++) {
+    uint64_t tmp = Htable[i].hi;
+    Htable[i].hi = Htable[i].lo;
+    Htable[i].lo = tmp;
+  }
+
+  // Treat |Htable| as a 16x16 byte table and transpose it. Thus, Htable[i]
+  // contains the i'th byte of j*H for all j.
+  uint8_t *Hbytes = (uint8_t *)Htable;
+  for (int i = 0; i < 16; i++) {
+    for (int j = 0; j < i; j++) {
+      uint8_t tmp = Hbytes[16*i + j];
+      Hbytes[16*i + j] = Hbytes[16*j + i];
+      Hbytes[16*j + i] = tmp;
+    }
+  }
+}
+#endif  // GHASH_ASM_X86_64
+
 #ifdef GCM_FUNCREF_4BIT
 #undef GCM_MUL
 #define GCM_MUL(ctx, Xi) (*gcm_gmult_p)((ctx)->Xi.u, (ctx)->gcm_key.Htable)
@@ -285,6 +312,12 @@
     *out_hash = gcm_ghash_clmul;
     return;
   }
+  if (gcm_ssse3_capable()) {
+    gcm_init_ssse3(out_table, H.u);
+    *out_mult = gcm_gmult_ssse3;
+    *out_hash = gcm_ghash_ssse3;
+    return;
+  }
 #elif defined(GHASH_ASM_X86)
   if (crypto_gcm_clmul_enabled()) {
     gcm_init_clmul(out_table, H.u);
diff --git a/crypto/fipsmodule/modes/gcm_test.cc b/crypto/fipsmodule/modes/gcm_test.cc
index fb17bbb..54827ca 100644
--- a/crypto/fipsmodule/modes/gcm_test.cc
+++ b/crypto/fipsmodule/modes/gcm_test.cc
@@ -61,6 +61,12 @@
 #include "../../test/file_test.h"
 #include "../../test/test_util.h"
 
+#if defined(OPENSSL_WINDOWS)
+OPENSSL_MSVC_PRAGMA(warning(push, 3))
+#include <windows.h>
+OPENSSL_MSVC_PRAGMA(warning(pop))
+#endif
+
 
 TEST(GCMTest, TestVectors) {
   FileTestGTest("crypto/fipsmodule/modes/gcm_tests.txt", [](FileTest *t) {
@@ -133,13 +139,21 @@
       UINT64_C(0xf328c2b971b2fe78),
   };
 
-  u128 Htable[16];
+  alignas(16) u128 Htable[16];
   CHECK_ABI(gcm_init_4bit, Htable, kH);
   CHECK_ABI(gcm_gmult_4bit, X, Htable);
   for (size_t blocks : kBlockCounts) {
     CHECK_ABI(gcm_ghash_4bit, X, Htable, buf, 16 * blocks);
   }
 
+  if (gcm_ssse3_capable()) {
+    CHECK_ABI(gcm_init_ssse3, Htable, kH);
+    CHECK_ABI(gcm_gmult_ssse3, X, Htable);
+    for (size_t blocks : kBlockCounts) {
+      CHECK_ABI(gcm_ghash_ssse3, X, Htable, buf, 16 * blocks);
+    }
+  }
+
   if (crypto_gcm_clmul_enabled()) {
     CHECK_ABI(gcm_init_clmul, Htable, kH);
     CHECK_ABI(gcm_gmult_clmul, X, Htable);
@@ -156,4 +170,38 @@
     }
   }
 }
+
+#if defined(OPENSSL_WINDOWS)
+// Sanity-check the SEH unwind codes in ghash-ssse3-x86_64.pl.
+// TODO(davidben): Implement unwind testing for SEH and remove this.
+static void GCMSSSE3ExceptionTest() {
+  if (!gcm_ssse3_capable()) {
+    return;
+  }
+
+  bool handled = false;
+  __try {
+    gcm_gmult_ssse3(nullptr, nullptr);
+  } __except (GetExceptionCode() == EXCEPTION_ACCESS_VIOLATION
+                  ? EXCEPTION_EXECUTE_HANDLER
+                  : EXCEPTION_CONTINUE_SEARCH) {
+    handled = true;
+  }
+  EXPECT_TRUE(handled);
+
+  handled = false;
+  __try {
+    gcm_ghash_ssse3(nullptr, nullptr, nullptr, 16);
+  } __except (GetExceptionCode() == EXCEPTION_ACCESS_VIOLATION
+                  ? EXCEPTION_EXECUTE_HANDLER
+                  : EXCEPTION_CONTINUE_SEARCH) {
+    handled = true;
+  }
+  EXPECT_TRUE(handled);
+}
+
+TEST(GCMTest, SEH) {
+  CHECK_ABI_NO_UNWIND(GCMSSSE3ExceptionTest);
+}
+#endif  // OPENSSL_WINDOWS
 #endif  // GHASH_ASM_X86_64 && SUPPORTS_ABI_TEST
diff --git a/crypto/fipsmodule/modes/internal.h b/crypto/fipsmodule/modes/internal.h
index 3163c50..79a0951 100644
--- a/crypto/fipsmodule/modes/internal.h
+++ b/crypto/fipsmodule/modes/internal.h
@@ -159,7 +159,10 @@
 
 typedef struct gcm128_key_st {
   // Note the MOVBE-based, x86-64, GHASH assembly requires |H| and |Htable| to
-  // be the first two elements of this struct.
+  // be the first two elements of this struct. Additionally, some assembly
+  // routines require a 16-byte-aligned |Htable| when hashing data, but not
+  // initialization. |GCM128_KEY| is not itself aligned to simplify embedding in
+  // |EVP_AEAD_CTX|, but |Htable|'s offset must be a multiple of 16.
   u128 H;
   u128 Htable[16];
   gmult_func gmult;
@@ -184,8 +187,10 @@
   } Yi, EKi, EK0, len, Xi;
 
   // Note that the order of |Xi| and |gcm_key| is fixed by the MOVBE-based,
-  // x86-64, GHASH assembly.
-  GCM128_KEY gcm_key;
+  // x86-64, GHASH assembly. Additionally, some assembly routines require
+  // |gcm_key| to be 16-byte aligned. |GCM128_KEY| is not itself aligned to
+  // simplify embedding in |EVP_AEAD_CTX|.
+  alignas(16) GCM128_KEY gcm_key;
 
   unsigned mres, ares;
 } GCM128_CONTEXT;
@@ -295,6 +300,18 @@
 void gcm_gmult_avx(uint64_t Xi[2], const u128 Htable[16]);
 void gcm_ghash_avx(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
                    size_t len);
+
+OPENSSL_INLINE char gcm_ssse3_capable(void) {
+  return (OPENSSL_ia32cap_get()[1] & (1 << (41 - 32))) != 0;
+}
+
+// |gcm_gmult_ssse3| and |gcm_ghash_ssse3| require |Htable| to be
+// 16-byte-aligned, but |gcm_init_ssse3| does not.
+void gcm_init_ssse3(u128 Htable[16], const uint64_t Xi[2]);
+void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]);
+void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
+                     size_t len);
+
 #define AESNI_GCM
 size_t aesni_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                          const AES_KEY *key, uint8_t ivec[16], uint64_t *Xi);
@@ -472,10 +489,11 @@
 
 struct polyval_ctx {
   // Note that the order of |S|, |H| and |Htable| is fixed by the MOVBE-based,
-  // x86-64, GHASH assembly.
+  // x86-64, GHASH assembly. Additionally, some assembly routines require
+  // |Htable| to be 16-byte aligned.
   polyval_block S;
   u128 H;
-  u128 Htable[16];
+  alignas(16) u128 Htable[16];
   gmult_func gmult;
   ghash_func ghash;
 };