Shave 8 bytes off BN_MONT_CTX in 64-bit

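BN_MONT_CTX always allocated two words for n0, but 64-bit platforms only
use n0[0]. Size the array with BN_MONT_CTX_N0_LIMBS instead, which moves
that macro from bn/internal.h to the public bn.h.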

Change-Id: Ia2d53f88a9098d326dfbd79f9e59eb390afefad1
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/80207
Reviewed-by: Adam Langley <agl@google.com>
Auto-Submit: David Benjamin <davidben@google.com>
Commit-Queue: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/bn/bn_test.cc b/crypto/fipsmodule/bn/bn_test.cc
index 10fd3b4..7c1ad40 100644
--- a/crypto/fipsmodule/bn/bn_test.cc
+++ b/crypto/fipsmodule/bn/bn_test.cc
@@ -603,8 +603,9 @@
     bssl::UniquePtr<BN_MONT_CTX> mont2(BN_MONT_CTX_new_consttime(m.get(), ctx));
     ASSERT_TRUE(mont2);
     EXPECT_BIGNUMS_EQUAL("RR (mod M) (constant-time)", &mont->RR, &mont2->RR);
-    EXPECT_EQ(mont->n0[0], mont2->n0[0]);
-    EXPECT_EQ(mont->n0[1], mont2->n0[1]);
+    for (size_t i = 0; i < std::size(mont->n0); i++) {
+      EXPECT_EQ(mont->n0[i], mont2->n0[i]);
+    }
 
     bssl::UniquePtr<BIGNUM> a_tmp(BN_new()), b_tmp(BN_new());
     ASSERT_TRUE(a_tmp);
diff --git a/crypto/fipsmodule/bn/internal.h b/crypto/fipsmodule/bn/internal.h
index 1ec2562..b3e2331 100644
--- a/crypto/fipsmodule/bn/internal.h
+++ b/crypto/fipsmodule/bn/internal.h
@@ -47,7 +47,6 @@
 #define BN_MASK2l (0xffffffffUL)
 #define BN_MASK2h (0xffffffff00000000UL)
 #define BN_MASK2h1 (0xffffffff80000000UL)
-#define BN_MONT_CTX_N0_LIMBS 1
 #define BN_DEC_CONV (10000000000000000000UL)
 #define BN_DEC_NUM 19
 #define TOBN(hi, lo) ((BN_ULONG)(hi) << 32 | (lo))
@@ -64,12 +63,6 @@
 #define BN_MASK2l (0xffffUL)
 #define BN_MASK2h1 (0xffff8000UL)
 #define BN_MASK2h (0xffff0000UL)
-// On some 32-bit platforms, Montgomery multiplication is done using 64-bit
-// arithmetic with SIMD instructions. On such platforms, |BN_MONT_CTX::n0|
-// needs to be two words long. Only certain 32-bit platforms actually make use
-// of n0[1] and shorter R value would suffice for the others. However,
-// currently only the assembly files know which is which.
-#define BN_MONT_CTX_N0_LIMBS 2
 #define BN_DEC_CONV (1000000000UL)
 #define BN_DEC_NUM 9
 #define TOBN(hi, lo) (lo), (hi)
@@ -296,7 +289,8 @@
 // See also discussion in |ToWord| in abi_test.h for notes on smaller-than-word
 // inputs.
 void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
-                 const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+                 const BN_ULONG *np, const BN_ULONG n0[BN_MONT_CTX_N0_LIMBS],
+                 size_t num);
 
 #if defined(OPENSSL_X86_64)
 inline int bn_mulx_adx_capable(void) {
@@ -304,30 +298,36 @@
   return CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable();
 }
 void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
-                      const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+                      const BN_ULONG *np,
+                      const BN_ULONG n0[BN_MONT_CTX_N0_LIMBS], size_t num);
 inline int bn_mul4x_mont_capable(size_t num) {
   return num >= 8 && (num & 3) == 0;
 }
 void bn_mul4x_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
-                   const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+                   const BN_ULONG *np, const BN_ULONG n0[BN_MONT_CTX_N0_LIMBS],
+                   size_t num);
 inline int bn_mulx4x_mont_capable(size_t num) {
   return bn_mul4x_mont_capable(num) && bn_mulx_adx_capable();
 }
 void bn_mulx4x_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
-                    const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+                    const BN_ULONG *np, const BN_ULONG n0[BN_MONT_CTX_N0_LIMBS],
+                    size_t num);
 inline int bn_sqr8x_mont_capable(size_t num) {
   return num >= 8 && (num & 7) == 0;
 }
 void bn_sqr8x_mont(BN_ULONG *rp, const BN_ULONG *ap, BN_ULONG mulx_adx_capable,
-                   const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+                   const BN_ULONG *np, const BN_ULONG n0[BN_MONT_CTX_N0_LIMBS],
+                   size_t num);
 #elif defined(OPENSSL_ARM)
 inline int bn_mul8x_mont_neon_capable(size_t num) {
   return (num & 7) == 0 && CRYPTO_is_NEON_capable();
 }
 void bn_mul8x_mont_neon(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
-                        const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+                        const BN_ULONG *np,
+                        const BN_ULONG n0[BN_MONT_CTX_N0_LIMBS], size_t num);
 void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
-                      const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+                      const BN_ULONG *np,
+                      const BN_ULONG n0[BN_MONT_CTX_N0_LIMBS], size_t num);
 #endif
 
 #endif  // OPENSSL_BN_ASM_MONT
@@ -340,7 +340,8 @@
 inline int bn_mul4x_mont_gather5_capable(int num) { return (num & 7) == 0; }
 void bn_mul4x_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,
                            const BN_ULONG *table, const BN_ULONG *np,
-                           const BN_ULONG *n0, int num, int power);
+                           const BN_ULONG n0[BN_MONT_CTX_N0_LIMBS], int num,
+                           int power);
 
 inline int bn_mulx4x_mont_gather5_capable(int num) {
   return bn_mul4x_mont_gather5_capable(num) && CRYPTO_is_ADX_capable() &&
@@ -348,11 +349,13 @@
 }
 void bn_mulx4x_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,
                             const BN_ULONG *table, const BN_ULONG *np,
-                            const BN_ULONG *n0, int num, int power);
+                            const BN_ULONG n0[BN_MONT_CTX_N0_LIMBS], int num,
+                            int power);
 
 void bn_mul_mont_gather5_nohw(BN_ULONG *rp, const BN_ULONG *ap,
                               const BN_ULONG *table, const BN_ULONG *np,
-                              const BN_ULONG *n0, int num, int power);
+                              const BN_ULONG n0[BN_MONT_CTX_N0_LIMBS], int num,
+                              int power);
 
 // bn_scatter5 stores |inp| to index |power| of |table|. |inp| and each entry of
 // |table| are |num| words long. |power| must be less than 32 and is treated as
@@ -368,7 +371,8 @@
 
 // The following functions implement |bn_power5|. See |bn_power5| for details.
 void bn_power5_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table,
-                    const BN_ULONG *np, const BN_ULONG *n0, int num, int power);
+                    const BN_ULONG *np, const BN_ULONG n0[BN_MONT_CTX_N0_LIMBS],
+                    int num, int power);
 
 inline int bn_power5_capable(int num) { return (num & 7) == 0; }
 
@@ -377,7 +381,8 @@
          CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable();
 }
 void bn_powerx5(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table,
-                const BN_ULONG *np, const BN_ULONG *n0, int num, int power);
+                const BN_ULONG *np, const BN_ULONG n0[BN_MONT_CTX_N0_LIMBS],
+                int num, int power);
 
 #endif  // !OPENSSL_NO_ASM && OPENSSL_X86_64
 
diff --git a/crypto/fipsmodule/bn/montgomery.cc.inc b/crypto/fipsmodule/bn/montgomery.cc.inc
index aac5af8..fe5c3a2 100644
--- a/crypto/fipsmodule/bn/montgomery.cc.inc
+++ b/crypto/fipsmodule/bn/montgomery.cc.inc
@@ -64,8 +64,9 @@
   if (!BN_copy(&to->RR, &from->RR) || !BN_copy(&to->N, &from->N)) {
     return NULL;
   }
-  to->n0[0] = from->n0[0];
-  to->n0[1] = from->n0[1];
+  for (size_t i = 0; i < BN_MONT_CTX_N0_LIMBS; i++) {
+    to->n0[i] = from->n0[i];
+  }
   return to;
 }
 
@@ -111,8 +112,6 @@
   mont->n0[0] = (BN_ULONG)n0;
 #if BN_MONT_CTX_N0_LIMBS == 2
   mont->n0[1] = (BN_ULONG)(n0 >> BN_BITS2);
-#else
-  mont->n0[1] = 0;
 #endif
   return 1;
 }
diff --git a/include/openssl/bn.h b/include/openssl/bn.h
index 8209975..62b63dd 100644
--- a/include/openssl/bn.h
+++ b/include/openssl/bn.h
@@ -917,6 +917,19 @@
   int flags;
 };
 
+// On some 32-bit platforms, Montgomery multiplication is done using 64-bit
+// arithmetic with SIMD instructions. On such platforms, |BN_MONT_CTX::n0|
+// needs to be two words long. Only certain 32-bit platforms actually make use
+// of n0[1] and a shorter R value would suffice for the others. However,
+// currently only the assembly files know which is which.
+#if defined(OPENSSL_64_BIT)
+#define BN_MONT_CTX_N0_LIMBS 1
+#elif defined(OPENSSL_32_BIT)
+#define BN_MONT_CTX_N0_LIMBS 2
+#else
+#error "unknown bit size"
+#endif
+
 struct bn_mont_ctx_st {
   // RR is R^2, reduced modulo |N|. It is used to convert to Montgomery form. It
   // is guaranteed to have the same width as |N|.
@@ -924,7 +937,7 @@
   // N is the modulus. It is always stored in minimal form, so |N.width|
   // determines R.
   BIGNUM N;
-  BN_ULONG n0[2];  // least significant words of (R*Ri-1)/N
+  BN_ULONG n0[BN_MONT_CTX_N0_LIMBS];  // least significant words of (R*Ri-1)/N
 };
 
 OPENSSL_EXPORT unsigned BN_num_bits_word(BN_ULONG l);