Always define GHASH.
There is a C implementation of gcm_ghash_4bit to pair with
gcm_gmult_4bit. It's even slightly faster per the numbers below,
roughly 1-7% depending on input size (x86_64 OPENSSL_NO_ASM build),
but, more importantly, it trims down the combinatorial explosion of
GCM implementations and frees up complexity budget for potentially
making better use of bsaes in the future. A reference sketch of the
per-block GHASH loop follows the numbers.
Old:
Did 2557000 AES-128-GCM (16 bytes) seal operations in 1000057us (2556854.3 ops/sec): 40.9 MB/s
Did 94000 AES-128-GCM (1350 bytes) seal operations in 1009613us (93105.0 ops/sec): 125.7 MB/s
Did 17000 AES-128-GCM (8192 bytes) seal operations in 1024768us (16589.1 ops/sec): 135.9 MB/s
Did 2511000 AES-256-GCM (16 bytes) seal operations in 1000196us (2510507.9 ops/sec): 40.2 MB/s
Did 84000 AES-256-GCM (1350 bytes) seal operations in 1000412us (83965.4 ops/sec): 113.4 MB/s
Did 15000 AES-256-GCM (8192 bytes) seal operations in 1046963us (14327.2 ops/sec): 117.4 MB/s
New:
Did 2739000 AES-128-GCM (16 bytes) seal operations in 1000322us (2738118.3 ops/sec): 43.8 MB/s
Did 100000 AES-128-GCM (1350 bytes) seal operations in 1008190us (99187.7 ops/sec): 133.9 MB/s
Did 17000 AES-128-GCM (8192 bytes) seal operations in 1006360us (16892.6 ops/sec): 138.4 MB/s
Did 2546000 AES-256-GCM (16 bytes) seal operations in 1000150us (2545618.2 ops/sec): 40.7 MB/s
Did 86000 AES-256-GCM (1350 bytes) seal operations in 1000970us (85916.7 ops/sec): 116.0 MB/s
Did 14850 AES-256-GCM (8192 bytes) seal operations in 1023459us (14509.6 ops/sec): 118.9 MB/s
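To make concrete what the #else fallbacks removed below were
open-coding: one GHASH step per 16-byte block, i.e. XOR the block
into Xi, then multiply by H in GF(2^128). A multi-block GHASH call is
exactly that loop, which is why the per-block paths can go away.
Below is a bit-at-a-time reference sketch following NIST SP 800-38D,
Algorithm 1. It is not the 4-bit table code this change keeps; be128,
gf128_mul, and ghash_blocks are illustrative names, and this is far
slower than gcm_ghash_4bit.

    #include <stddef.h>
    #include <stdint.h>

    // A 16-byte GCM block: hi holds bytes 0-7 and lo bytes 8-15,
    // both big-endian, so the spec's "bit 0" is the MSB of hi.
    typedef struct { uint64_t hi, lo; } be128;

    static be128 load_be128(const uint8_t b[16]) {
      be128 r = {0, 0};
      for (int i = 0; i < 8; i++) {
        r.hi = (r.hi << 8) | b[i];
        r.lo = (r.lo << 8) | b[i + 8];
      }
      return r;
    }

    // GF(2^128) multiply with the GCM reduction polynomial
    // x^128 + x^7 + x^2 + x + 1, i.e. R = 0xE1 || 0^120.
    static be128 gf128_mul(be128 x, be128 y) {
      be128 z = {0, 0}, v = x;
      for (int i = 0; i < 128; i++) {
        // Bit i of y, leftmost bit first as in the spec.
        uint64_t bit = (i < 64) ? (y.hi >> (63 - i)) & 1
                                : (y.lo >> (127 - i)) & 1;
        if (bit) {
          z.hi ^= v.hi;
          z.lo ^= v.lo;
        }
        // "Rightshift" v, folding in R when a set bit falls off.
        uint64_t carry = v.lo & 1;
        v.lo = (v.lo >> 1) | (v.hi << 63);
        v.hi >>= 1;
        if (carry) {
          v.hi ^= UINT64_C(0xE100000000000000);
        }
      }
      return z;
    }

    // GHASH over whole blocks: Xi = (Xi ^ block) * H per block, the
    // same loop gcm_ghash_4bit implements with 4-bit lookup tables.
    static void ghash_blocks(be128 *xi, be128 h,
                             const uint8_t *in, size_t len) {
      for (; len >= 16; in += 16, len -= 16) {
        be128 b = load_be128(in);
        xi->hi ^= b.hi;
        xi->lo ^= b.lo;
        *xi = gf128_mul(*xi, h);
      }
    }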
While I'm here, tighten up some of the functions (declare variables
at first use, make the counter an explicit uint32_t) and align the
ctr32 and non-ctr32 paths.
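The counter handling the aligned paths share: the last four bytes of
Yi hold the 32-bit block counter big-endian, and the bulk loops
advance a host-order uint32_t copy (CRYPTO_bswap4 byte-swaps it,
which yields host order on little-endian targets), storing it back
into Yi after each chunk. A minimal, endian-independent sketch of
that load/advance/store cycle; ctr32_load, ctr32_store, and
ctr32_advance are hypothetical names, not BoringSSL API.

    #include <stdint.h>

    // Read the big-endian 32-bit counter from the tail of the
    // 16-byte counter block Yi.
    static uint32_t ctr32_load(const uint8_t yi[16]) {
      return ((uint32_t)yi[12] << 24) | ((uint32_t)yi[13] << 16) |
             ((uint32_t)yi[14] << 8) | (uint32_t)yi[15];
    }

    // Write it back, leaving the other 96 bits of Yi untouched.
    static void ctr32_store(uint8_t yi[16], uint32_t ctr) {
      yi[12] = (uint8_t)(ctr >> 24);
      yi[13] = (uint8_t)(ctr >> 16);
      yi[14] = (uint8_t)(ctr >> 8);
      yi[15] = (uint8_t)ctr;
    }

    // Advance past n_blocks blocks, as the loops do with
    // ctr += GHASH_CHUNK / 16 or ctr += (unsigned)j. Only the low
    // 32 bits wrap, matching the ctr32 stream contract.
    static void ctr32_advance(uint8_t yi[16], uint32_t n_blocks) {
      ctr32_store(yi, ctr32_load(yi) + n_blocks);
    }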
Bug: 256
Change-Id: Id4df699cefc8630dd5a350d44f927900340f5e60
Reviewed-on: https://boringssl-review.googlesource.com/c/34869
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/modes/gcm.c b/crypto/fipsmodule/modes/gcm.c
index 681f7a9..97fde3e 100644
--- a/crypto/fipsmodule/modes/gcm.c
+++ b/crypto/fipsmodule/modes/gcm.c
@@ -234,14 +234,12 @@
#endif // !GHASH_ASM || AARCH64 || PPC64LE
#define GCM_MUL(ctx, Xi) gcm_gmult_4bit((ctx)->Xi.u, (ctx)->gcm_key.Htable)
-#if defined(GHASH_ASM)
#define GHASH(ctx, in, len) \
gcm_ghash_4bit((ctx)->Xi.u, (ctx)->gcm_key.Htable, in, len)
// GHASH_CHUNK is "stride parameter" missioned to mitigate cache
// trashing effect. In other words idea is to hash data while it's
// still in L1 cache after encryption pass...
#define GHASH_CHUNK (3 * 1024)
-#endif // GHASH_ASM
#if defined(GHASH_ASM_X86_64)
void gcm_init_ssse3(u128 Htable[16], const uint64_t Xi[2]) {
@@ -273,11 +271,9 @@
#ifdef GCM_FUNCREF_4BIT
#undef GCM_MUL
#define GCM_MUL(ctx, Xi) (*gcm_gmult_p)((ctx)->Xi.u, (ctx)->gcm_key.Htable)
-#ifdef GHASH
#undef GHASH
#define GHASH(ctx, in, len) \
(*gcm_ghash_p)((ctx)->Xi.u, (ctx)->gcm_key.Htable, in, len)
-#endif
#endif // GCM_FUNCREF_4BIT
void CRYPTO_ghash_init(gmult_func *out_mult, ghash_func *out_hash,
@@ -376,7 +372,6 @@
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const AES_KEY *key,
const uint8_t *iv, size_t len) {
- unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) =
ctx->gcm_key.gmult;
@@ -391,6 +386,7 @@
ctx->ares = 0;
ctx->mres = 0;
+ uint32_t ctr;
if (len == 12) {
OPENSSL_memcpy(ctx->Yi.c, iv, 12);
ctx->Yi.c[15] = 1;
@@ -425,8 +421,6 @@
}
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const uint8_t *aad, size_t len) {
- unsigned int n;
- uint64_t alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) =
ctx->gcm_key.gmult;
@@ -440,13 +434,13 @@
return 0;
}
- alen += len;
+ uint64_t alen = ctx->len.u[0] + len;
if (alen > (UINT64_C(1) << 61) || (sizeof(len) == 8 && alen < len)) {
return 0;
}
ctx->len.u[0] = alen;
- n = ctx->ares;
+ unsigned n = ctx->ares;
if (n) {
while (n && len) {
ctx->Xi.c[n] ^= *(aad++);
@@ -462,23 +456,12 @@
}
// Process a whole number of blocks.
-#ifdef GHASH
size_t len_blocks = len & kSizeTWithoutLower4Bits;
if (len_blocks != 0) {
GHASH(ctx, aad, len_blocks);
aad += len_blocks;
len -= len_blocks;
}
-#else
- while (len >= 16) {
- for (size_t i = 0; i < 16; ++i) {
- ctx->Xi.c[i] ^= aad[i];
- }
- GCM_MUL(ctx, Xi);
- aad += 16;
- len -= 16;
- }
-#endif
// Process the remainder.
if (len != 0) {
@@ -494,19 +477,15 @@
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx, const AES_KEY *key,
const uint8_t *in, uint8_t *out, size_t len) {
- unsigned int n, ctr;
- uint64_t mlen = ctx->len.u[1];
block128_f block = ctx->gcm_key.block;
#ifdef GCM_FUNCREF_4BIT
void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) =
ctx->gcm_key.gmult;
-#ifdef GHASH
void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
size_t len) = ctx->gcm_key.ghash;
#endif
-#endif
- mlen += len;
+ uint64_t mlen = ctx->len.u[1] + len;
if (mlen > ((UINT64_C(1) << 36) - 32) ||
(sizeof(len) == 8 && mlen < len)) {
return 0;
@@ -519,9 +498,7 @@
ctx->ares = 0;
}
- ctr = CRYPTO_bswap4(ctx->Yi.d[3]);
-
- n = ctx->mres;
+ unsigned n = ctx->mres;
if (n) {
while (n && len) {
ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
@@ -535,6 +512,8 @@
return 1;
}
}
+
+ uint32_t ctr = CRYPTO_bswap4(ctx->Yi.d[3]);
if (STRICT_ALIGNMENT &&
((uintptr_t)in | (uintptr_t)out) % sizeof(size_t) != 0) {
for (size_t i = 0; i < len; ++i) {
@@ -553,7 +532,6 @@
ctx->mres = n;
return 1;
}
-#if defined(GHASH) && defined(GHASH_CHUNK)
while (len >= GHASH_CHUNK) {
size_t j = GHASH_CHUNK;
@@ -588,22 +566,6 @@
}
GHASH(ctx, out - len_blocks, len_blocks);
}
-#else
- while (len >= 16) {
- (*block)(ctx->Yi.c, ctx->EKi.c, key);
- ++ctr;
- ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
- for (size_t i = 0; i < 16; i += sizeof(size_t)) {
- size_t tmp = load_word_le(in + i) ^ ctx->EKi.t[i / sizeof(size_t)];
- store_word_le(out + i, tmp);
- ctx->Xi.t[i / sizeof(size_t)] ^= tmp;
- }
- GCM_MUL(ctx, Xi);
- out += 16;
- in += 16;
- len -= 16;
- }
-#endif
if (len) {
(*block)(ctx->Yi.c, ctx->EKi.c, key);
++ctr;
@@ -621,19 +583,15 @@
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx, const AES_KEY *key,
const unsigned char *in, unsigned char *out,
size_t len) {
- unsigned int n, ctr;
- uint64_t mlen = ctx->len.u[1];
block128_f block = ctx->gcm_key.block;
#ifdef GCM_FUNCREF_4BIT
void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) =
ctx->gcm_key.gmult;
-#ifdef GHASH
void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
size_t len) = ctx->gcm_key.ghash;
#endif
-#endif
- mlen += len;
+ uint64_t mlen = ctx->len.u[1] + len;
if (mlen > ((UINT64_C(1) << 36) - 32) ||
(sizeof(len) == 8 && mlen < len)) {
return 0;
@@ -646,9 +604,7 @@
ctx->ares = 0;
}
- ctr = CRYPTO_bswap4(ctx->Yi.d[3]);
-
- n = ctx->mres;
+ unsigned n = ctx->mres;
if (n) {
while (n && len) {
uint8_t c = *(in++);
@@ -664,6 +620,8 @@
return 1;
}
}
+
+ uint32_t ctr = CRYPTO_bswap4(ctx->Yi.d[3]);
if (STRICT_ALIGNMENT &&
((uintptr_t)in | (uintptr_t)out) % sizeof(size_t) != 0) {
for (size_t i = 0; i < len; ++i) {
@@ -685,7 +643,6 @@
ctx->mres = n;
return 1;
}
-#if defined(GHASH) && defined(GHASH_CHUNK)
while (len >= GHASH_CHUNK) {
size_t j = GHASH_CHUNK;
@@ -720,22 +677,6 @@
len -= 16;
}
}
-#else
- while (len >= 16) {
- (*block)(ctx->Yi.c, ctx->EKi.c, key);
- ++ctr;
- ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
- for (size_t i = 0; i < 16; i += sizeof(size_t)) {
- size_t c = load_word_le(in + i);
- store_word_le(out + i, c ^ ctx->EKi.t[i / sizeof(size_t)]);
- ctx->Xi.t[i / sizeof(size_t)] ^= c;
- }
- GCM_MUL(ctx, Xi);
- out += 16;
- in += 16;
- len -= 16;
- }
-#endif
if (len) {
(*block)(ctx->Yi.c, ctx->EKi.c, key);
++ctr;
@@ -755,18 +696,14 @@
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx, const AES_KEY *key,
const uint8_t *in, uint8_t *out, size_t len,
ctr128_f stream) {
- unsigned int n, ctr;
- uint64_t mlen = ctx->len.u[1];
#ifdef GCM_FUNCREF_4BIT
void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) =
ctx->gcm_key.gmult;
-#ifdef GHASH
void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
size_t len) = ctx->gcm_key.ghash;
#endif
-#endif
- mlen += len;
+ uint64_t mlen = ctx->len.u[1] + len;
if (mlen > ((UINT64_C(1) << 36) - 32) ||
(sizeof(len) == 8 && mlen < len)) {
return 0;
@@ -779,7 +716,7 @@
ctx->ares = 0;
}
- n = ctx->mres;
+ unsigned n = ctx->mres;
if (n) {
while (n && len) {
ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
@@ -805,9 +742,7 @@
}
#endif
- ctr = CRYPTO_bswap4(ctx->Yi.d[3]);
-
-#if defined(GHASH)
+ uint32_t ctr = CRYPTO_bswap4(ctx->Yi.d[3]);
while (len >= GHASH_CHUNK) {
(*stream)(in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
ctr += GHASH_CHUNK / 16;
@@ -817,28 +752,17 @@
in += GHASH_CHUNK;
len -= GHASH_CHUNK;
}
-#endif
- size_t i = len & kSizeTWithoutLower4Bits;
- if (i != 0) {
- size_t j = i / 16;
+ size_t len_blocks = len & kSizeTWithoutLower4Bits;
+ if (len_blocks != 0) {
+ size_t j = len_blocks / 16;
(*stream)(in, out, j, key, ctx->Yi.c);
ctr += (unsigned int)j;
ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
- in += i;
- len -= i;
-#if defined(GHASH)
- GHASH(ctx, out, i);
- out += i;
-#else
- while (j--) {
- for (i = 0; i < 16; ++i) {
- ctx->Xi.c[i] ^= out[i];
- }
- GCM_MUL(ctx, Xi);
- out += 16;
- }
-#endif
+ in += len_blocks;
+ len -= len_blocks;
+ GHASH(ctx, out, len_blocks);
+ out += len_blocks;
}
if (len) {
(*ctx->gcm_key.block)(ctx->Yi.c, ctx->EKi.c, key);
@@ -857,18 +781,14 @@
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx, const AES_KEY *key,
const uint8_t *in, uint8_t *out, size_t len,
ctr128_f stream) {
- unsigned int n, ctr;
- uint64_t mlen = ctx->len.u[1];
#ifdef GCM_FUNCREF_4BIT
void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) =
ctx->gcm_key.gmult;
-#ifdef GHASH
void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
size_t len) = ctx->gcm_key.ghash;
#endif
-#endif
- mlen += len;
+ uint64_t mlen = ctx->len.u[1] + len;
if (mlen > ((UINT64_C(1) << 36) - 32) ||
(sizeof(len) == 8 && mlen < len)) {
return 0;
@@ -881,7 +801,7 @@
ctx->ares = 0;
}
- n = ctx->mres;
+ unsigned n = ctx->mres;
if (n) {
while (n && len) {
uint8_t c = *(in++);
@@ -909,9 +829,7 @@
}
#endif
- ctr = CRYPTO_bswap4(ctx->Yi.d[3]);
-
-#if defined(GHASH)
+ uint32_t ctr = CRYPTO_bswap4(ctx->Yi.d[3]);
while (len >= GHASH_CHUNK) {
GHASH(ctx, in, GHASH_CHUNK);
(*stream)(in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
@@ -921,31 +839,17 @@
in += GHASH_CHUNK;
len -= GHASH_CHUNK;
}
-#endif
- size_t i = len & kSizeTWithoutLower4Bits;
- if (i != 0) {
- size_t j = i / 16;
+ size_t len_blocks = len & kSizeTWithoutLower4Bits;
+ if (len_blocks != 0) {
+ size_t j = len_blocks / 16;
-#if defined(GHASH)
- GHASH(ctx, in, i);
-#else
- while (j--) {
- size_t k;
- for (k = 0; k < 16; ++k) {
- ctx->Xi.c[k] ^= in[k];
- }
- GCM_MUL(ctx, Xi);
- in += 16;
- }
- j = i / 16;
- in -= i;
-#endif
+ GHASH(ctx, in, len_blocks);
(*stream)(in, out, j, key, ctx->Yi.c);
ctr += (unsigned int)j;
ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
- out += i;
- in += i;
- len -= i;
+ out += len_blocks;
+ in += len_blocks;
+ len -= len_blocks;
}
if (len) {
(*ctx->gcm_key.block)(ctx->Yi.c, ctx->EKi.c, key);
@@ -964,8 +868,6 @@
}
int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const uint8_t *tag, size_t len) {
- uint64_t alen = ctx->len.u[0] << 3;
- uint64_t clen = ctx->len.u[1] << 3;
#ifdef GCM_FUNCREF_4BIT
void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) =
ctx->gcm_key.gmult;
@@ -975,11 +877,8 @@
GCM_MUL(ctx, Xi);
}
- alen = CRYPTO_bswap8(alen);
- clen = CRYPTO_bswap8(clen);
-
- ctx->Xi.u[0] ^= alen;
- ctx->Xi.u[1] ^= clen;
+ ctx->Xi.u[0] ^= CRYPTO_bswap8(ctx->len.u[0] << 3);
+ ctx->Xi.u[1] ^= CRYPTO_bswap8(ctx->len.u[1] << 3);
GCM_MUL(ctx, Xi);
ctx->Xi.u[0] ^= ctx->EK0.u[0];