Fix comments now BN_mod_exp_mont_consttime is not cache-line-sensitive

BN_mod_exp_mont_consttime originally assumed accesses within a cache
line were indistinguishable and indexed into a cache line with secret
values. As a result, it required all of its tables, etc., to be
cache-line-aligned. Nowadays, the standard constant time memory model is
to assume the whole address leaks and not make these assumptions.

In particular, CacheBleed (CVE-2016-0702) showed this assumption was
false and which cache bank you accessed as leaked. OpenSSL's fix for the
assembly (mont5 and rsaz) appears to match the standard constant-time
model. However, its fix to the C code narrowed the assumption to cache
banks, so the alignment was still necessary.

After https://boringssl-review.googlesource.com/c/boringssl/+/33268, we
dropped this and use the standard model. All together, it should mean we
no longer make assumptions about cache lines. Update all the comments
and variable names accordingly.

Change-Id: I7bcb828eb2751a0167c3a3c8242b1b3971efc708
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/55227
Reviewed-by: Bob Beck <bbe@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/bn/exponentiation.c b/crypto/fipsmodule/bn/exponentiation.c
index a76d7cb..285c4dd 100644
--- a/crypto/fipsmodule/bn/exponentiation.c
+++ b/crypto/fipsmodule/bn/exponentiation.c
@@ -863,28 +863,13 @@
 // Window sizes optimized for fixed window size modular exponentiation
 // algorithm (BN_mod_exp_mont_consttime).
 //
-// To achieve the security goals of BN_mode_exp_mont_consttime, the maximum
-// size of the window must not exceed
-// log_2(MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH).
-//
-// Window size thresholds are defined for cache line sizes of 32 and 64, cache
-// line sizes where log_2(32)=5 and log_2(64)=6 respectively. A window size of
-// 7 should only be used on processors that have a 128 byte or greater cache
-// line size.
-#if MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 64
-
+// TODO(davidben): These window sizes were originally set for 64-byte cache
+// lines with a cache-line-dependent constant-time mitigation. They can probably
+// be revised now that our implementation is no longer cache-time-dependent.
 #define BN_window_bits_for_ctime_exponent_size(b) \
   ((b) > 937 ? 6 : (b) > 306 ? 5 : (b) > 89 ? 4 : (b) > 22 ? 3 : 1)
 #define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE (6)
 
-#elif MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 32
-
-#define BN_window_bits_for_ctime_exponent_size(b) \
-  ((b) > 306 ? 5 : (b) > 89 ? 4 : (b) > 22 ? 3 : 1)
-#define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE (5)
-
-#endif
-
 // This variant of |BN_mod_exp_mont| uses fixed windows and fixed memory access
 // patterns to protect secret exponents (cf. the hyper-threading timing attacks
 // pointed out by Colin Percival,
@@ -945,8 +930,7 @@
   // paths. If we were to use separate static buffers for each then there is
   // some chance that both large buffers would be allocated on the stack,
   // causing the stack space requirement to be truly huge (~10KB).
-  alignas(MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH) BN_ULONG
-    storage[MOD_EXP_CTIME_STORAGE_LEN];
+  alignas(MOD_EXP_CTIME_ALIGN) BN_ULONG storage[MOD_EXP_CTIME_STORAGE_LEN];
 #endif
 #if defined(RSAZ_ENABLED)
   // If the size of the operands allow it, perform the optimized RSAZ
@@ -991,12 +975,11 @@
   assert(powerbuf != NULL || top * BN_BITS2 > 1024);
 #endif
   if (powerbuf == NULL) {
-    powerbufFree =
-        OPENSSL_malloc(powerbufLen + MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH);
+    powerbufFree = OPENSSL_malloc(powerbufLen + MOD_EXP_CTIME_ALIGN);
     if (powerbufFree == NULL) {
       goto err;
     }
-    powerbuf = align_pointer(powerbufFree, MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH);
+    powerbuf = align_pointer(powerbufFree, MOD_EXP_CTIME_ALIGN);
   }
   OPENSSL_memset(powerbuf, 0, powerbufLen);
 
diff --git a/crypto/fipsmodule/bn/internal.h b/crypto/fipsmodule/bn/internal.h
index 9329ce7..f7adfe9 100644
--- a/crypto/fipsmodule/bn/internal.h
+++ b/crypto/fipsmodule/bn/internal.h
@@ -189,14 +189,20 @@
 #define BN_CAN_USE_INLINE_ASM
 #endif
 
-// |BN_mod_exp_mont_consttime| is based on the assumption that the L1 data
-// cache line width of the target processor is at least the following value.
-#define MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH 64
+// MOD_EXP_CTIME_ALIGN is the alignment needed for |BN_mod_exp_mont_consttime|'s
+// tables.
+//
+// TODO(davidben): Historically, this alignment came from cache line
+// assumptions, which we've since removed. Is 64-byte alignment still necessary
+// or ideal? The true alignment requirement seems to now be 32 bytes, coming
+// from RSAZ's use of VMOVDQA to a YMM register. Non-x86_64 has even fewer
+// requirements.
+#define MOD_EXP_CTIME_ALIGN 64
 
-// The number of |BN_ULONG|s needed for the |BN_mod_exp_mont_consttime| stack-
-// allocated storage buffer. The buffer is just the right size for the RSAZ
-// and is about ~1KB larger than what's necessary (4480 bytes) for 1024-bit
-// inputs.
+// MOD_EXP_CTIME_STORAGE_LEN is the number of |BN_ULONG|s needed for the
+// |BN_mod_exp_mont_consttime| stack-allocated storage buffer. The buffer is
+// just the right size for the RSAZ and is about ~1KB larger than what's
+// necessary (4480 bytes) for 1024-bit inputs.
 #define MOD_EXP_CTIME_STORAGE_LEN \
   (((320u * 3u) + (32u * 9u * 16u)) / sizeof(BN_ULONG))
 
diff --git a/crypto/fipsmodule/bn/rsaz_exp.c b/crypto/fipsmodule/bn/rsaz_exp.c
index 7b455b5..da25030 100644
--- a/crypto/fipsmodule/bn/rsaz_exp.c
+++ b/crypto/fipsmodule/bn/rsaz_exp.c
@@ -40,8 +40,8 @@
                             const BN_ULONG m_norm[16], const BN_ULONG RR[16],
                             BN_ULONG k0,
                             BN_ULONG storage[MOD_EXP_CTIME_STORAGE_LEN]) {
-  static_assert(MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH % 64 == 0,
-                "MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH is too small");
+  static_assert(MOD_EXP_CTIME_ALIGN % 64 == 0,
+                "MOD_EXP_CTIME_ALIGN is too small");
   assert((uintptr_t)storage % 64 == 0);
 
   BN_ULONG *a_inv, *m, *result, *table_s = storage + 40 * 3, *R2 = table_s;
diff --git a/crypto/fipsmodule/bn/rsaz_exp.h b/crypto/fipsmodule/bn/rsaz_exp.h
index 67f1cab..22ca3ec 100644
--- a/crypto/fipsmodule/bn/rsaz_exp.h
+++ b/crypto/fipsmodule/bn/rsaz_exp.h
@@ -32,8 +32,7 @@
 // modulo |m_norm|. |base_norm| must be fully-reduced and |exponent| must have
 // the high bit set (it is 1024 bits wide). |RR| and |k0| must be |RR| and |n0|,
 // respectively, extracted from |m_norm|'s |BN_MONT_CTX|. |storage_words| is a
-// temporary buffer that must be aligned to |MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH|
-// bytes.
+// temporary buffer that must be aligned to |MOD_EXP_CTIME_ALIGN| bytes.
 void RSAZ_1024_mod_exp_avx2(BN_ULONG result[16], const BN_ULONG base_norm[16],
                             const BN_ULONG exponent[16],
                             const BN_ULONG m_norm[16], const BN_ULONG RR[16],