Add bn_mul_small and bn_sqr_small.

As part of excising BIGNUM from EC scalars, we will need a "words"
version of BN_mod_mul_montgomery. That, in turn, requires BN_sqr and
BN_mul for cases where we don't have bn_mul_mont.

BN_sqr and BN_mul have a lot of logic in there, with the most complex
cases being not even remotely constant time. Fortunately, those only
apply to RSA-sized numbers, not EC-sized numbers. (With the exception, I
believe, of 32-bit P-521 which just barely exceeds the cutoff.) Imposing
a limit also makes it easier to stack-allocate temporaries (BN_CTX
serves a similar purpose in BIGNUM).

Extract bn_mul_small and bn_sqr_small and test them as part of
bn_tests.txt. Later changes will build on these.

If we end up reusing these functions for RSA in the future (though that
would require tending to the egregiously non-constant-time code in the
no-asm build), we probably want to extract a version where there is an
explicit tmp parameter as in bn_sqr_normal rather than the stack bits.

Change-Id: If414981eefe12d6664ab2f5e991a359534aa7532
Reviewed-on: https://boringssl-review.googlesource.com/23068
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/bn/bn_test.cc b/crypto/fipsmodule/bn/bn_test.cc
index 5725eaa..7376989 100644
--- a/crypto/fipsmodule/bn/bn_test.cc
+++ b/crypto/fipsmodule/bn/bn_test.cc
@@ -357,9 +357,11 @@
   ASSERT_TRUE(BN_mul(ret.get(), a.get(), a.get(), ctx));
   EXPECT_BIGNUMS_EQUAL("A * A", square.get(), ret.get());
 
-  ASSERT_TRUE(BN_div(ret.get(), remainder.get(), square.get(), a.get(), ctx));
-  EXPECT_BIGNUMS_EQUAL("Square / A", a.get(), ret.get());
-  EXPECT_BIGNUMS_EQUAL("Square % A", zero.get(), remainder.get());
+  if (!BN_is_zero(a.get())) {
+    ASSERT_TRUE(BN_div(ret.get(), remainder.get(), square.get(), a.get(), ctx));
+    EXPECT_BIGNUMS_EQUAL("Square / A", a.get(), ret.get());
+    EXPECT_BIGNUMS_EQUAL("Square % A", zero.get(), remainder.get());
+  }
 
   BN_set_negative(a.get(), 0);
   ASSERT_TRUE(BN_sqrt(ret.get(), square.get(), ctx));
@@ -382,6 +384,31 @@
         << "BN_sqrt succeeded on a non-square";
     ERR_clear_error();
   }
+
+#if !defined(BORINGSSL_SHARED_LIBRARY)
+  if (static_cast<size_t>(a->top) <= BN_SMALL_MAX_WORDS) {
+    for (size_t num_a = a->top; num_a <= BN_SMALL_MAX_WORDS; num_a++) {
+      SCOPED_TRACE(num_a);
+      size_t num_r = 2 * num_a;
+      // Use newly-allocated buffers so ASan will catch out-of-bounds writes.
+      std::unique_ptr<BN_ULONG[]> a_words(new BN_ULONG[num_a]),
+          r_words(new BN_ULONG[num_r]);
+      OPENSSL_memset(a_words.get(), 0, num_a * sizeof(BN_ULONG));
+      OPENSSL_memcpy(a_words.get(), a->d, a->top * sizeof(BN_ULONG));
+
+      ASSERT_TRUE(bn_mul_small(r_words.get(), num_r, a_words.get(), num_a,
+                               a_words.get(), num_a));
+      ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), num_r));
+      EXPECT_BIGNUMS_EQUAL("A * A (words)", square.get(), ret.get());
+
+      OPENSSL_memset(r_words.get(), 'A', num_r * sizeof(BN_ULONG));
+      ASSERT_TRUE(bn_sqr_small(r_words.get(), num_r, a_words.get(), num_a));
+
+      ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), num_r));
+      EXPECT_BIGNUMS_EQUAL("A^2 (words)", square.get(), ret.get());
+    }
+  }
+#endif
 }
 
 static void TestProduct(FileTest *t, BN_CTX *ctx) {
@@ -402,13 +429,46 @@
   ASSERT_TRUE(BN_mul(ret.get(), a.get(), b.get(), ctx));
   EXPECT_BIGNUMS_EQUAL("A * B", product.get(), ret.get());
 
-  ASSERT_TRUE(BN_div(ret.get(), remainder.get(), product.get(), a.get(), ctx));
-  EXPECT_BIGNUMS_EQUAL("Product / A", b.get(), ret.get());
-  EXPECT_BIGNUMS_EQUAL("Product % A", zero.get(), remainder.get());
+  if (!BN_is_zero(a.get())) {
+    ASSERT_TRUE(
+        BN_div(ret.get(), remainder.get(), product.get(), a.get(), ctx));
+    EXPECT_BIGNUMS_EQUAL("Product / A", b.get(), ret.get());
+    EXPECT_BIGNUMS_EQUAL("Product % A", zero.get(), remainder.get());
+  }
 
-  ASSERT_TRUE(BN_div(ret.get(), remainder.get(), product.get(), b.get(), ctx));
-  EXPECT_BIGNUMS_EQUAL("Product / B", a.get(), ret.get());
-  EXPECT_BIGNUMS_EQUAL("Product % B", zero.get(), remainder.get());
+  if (!BN_is_zero(b.get())) {
+    ASSERT_TRUE(
+        BN_div(ret.get(), remainder.get(), product.get(), b.get(), ctx));
+    EXPECT_BIGNUMS_EQUAL("Product / B", a.get(), ret.get());
+    EXPECT_BIGNUMS_EQUAL("Product % B", zero.get(), remainder.get());
+  }
+
+#if !defined(BORINGSSL_SHARED_LIBRARY)
+  if (!BN_is_negative(product.get()) &&
+      static_cast<size_t>(a->top) <= BN_SMALL_MAX_WORDS &&
+      static_cast<size_t>(b->top) <= BN_SMALL_MAX_WORDS) {
+    for (size_t num_a = a->top; num_a <= BN_SMALL_MAX_WORDS; num_a++) {
+      SCOPED_TRACE(num_a);
+      for (size_t num_b = b->top; num_b <= BN_SMALL_MAX_WORDS; num_b++) {
+        SCOPED_TRACE(num_b);
+        size_t num_r = num_a + num_b;
+        // Use newly-allocated buffers so ASan will catch out-of-bounds writes.
+        std::unique_ptr<BN_ULONG[]> a_words(new BN_ULONG[num_a]),
+            b_words(new BN_ULONG[num_b]), r_words(new BN_ULONG[num_r]);
+        OPENSSL_memset(a_words.get(), 0, num_a * sizeof(BN_ULONG));
+        OPENSSL_memcpy(a_words.get(), a->d, a->top * sizeof(BN_ULONG));
+
+        OPENSSL_memset(b_words.get(), 0, num_b * sizeof(BN_ULONG));
+        OPENSSL_memcpy(b_words.get(), b->d, b->top * sizeof(BN_ULONG));
+
+        ASSERT_TRUE(bn_mul_small(r_words.get(), num_r, a_words.get(), num_a,
+                                 b_words.get(), num_b));
+        ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), num_r));
+        EXPECT_BIGNUMS_EQUAL("A * B (words)", product.get(), ret.get());
+      }
+    }
+  }
+#endif
 }
 
 static void TestQuotient(FileTest *t, BN_CTX *ctx) {
diff --git a/crypto/fipsmodule/bn/bn_tests.txt b/crypto/fipsmodule/bn/bn_tests.txt
index f809e7e..eb447b5 100644
--- a/crypto/fipsmodule/bn/bn_tests.txt
+++ b/crypto/fipsmodule/bn/bn_tests.txt
@@ -5349,6 +5349,12 @@
 Square = eea8028b26e0df090504d54da714a6f5f2695202e53cff479c78aedd47a8dc676243ec586740fde53b3eca9ca02b91031ce766242184109503fbe25b1b6d318e3cd5970fabd16dfa22984dd2e9f1e0f14c189170fc69c031d66663703e6235a942d51a4545bd7b0769d01d302ce2b00b83f01568a1e378f61fd0ca6201b0490330580cd9de85719e174a71915d7efbf65cd73d8f4e66f27e0dd3144d58ec09ed0f7ed7d1238ee596922807100fb7a11127944ddcdec6a9ca3bbf6df7301e354f3f049bfb7c275b43c3d8cda5907a932fba507c9145ea3166081c1b48fcc710ee32cd931f936c796b14f8a78a592e67753a7c9e428a01719c8ba82652f3a89fae110
 A = -3dcb44be1e54c5a5d7db48055ca9afa1ebe2ae648aa6e16ac497502a7deee09ffa124720fad0ab163ce8b3ea6a90f110ea52b67dbc424d0cf1e8c9726dfd9e45bebcefaa5cd5706edeed27896525f31c6bbea3d67ee97badefabf3e2532470b66e3ae3100f66ddf50cf02fc3a8e3f44c304251d3b6a7ca3a6e4bd5d16a41bd97a4
 
+Square = 0
+A = 0
+
+Square = 1
+A = 1
+
 
 # Product tests.
 #
@@ -5954,6 +5960,22 @@
 A = a57da276998c548101f514e9f
 B = -542fb814f45924aa09a16f2a6
 
+Product = 0
+A = 0
+B = 542fb814f45924aa09a16f2a6
+
+Product = 0
+A = 542fb814f45924aa09a16f2a6
+B = 0
+
+Product = 542fb814f45924aa09a16f2a6
+A = 1
+B = 542fb814f45924aa09a16f2a6
+
+Product = 542fb814f45924aa09a16f2a6
+A = 542fb814f45924aa09a16f2a6
+B = 1
+
 
 # Quotient tests.
 #
diff --git a/crypto/fipsmodule/bn/internal.h b/crypto/fipsmodule/bn/internal.h
index fa4b54e..634435f 100644
--- a/crypto/fipsmodule/bn/internal.h
+++ b/crypto/fipsmodule/bn/internal.h
@@ -308,6 +308,34 @@
 int bn_jacobi(const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
 
 
+// Low-level operations for small numbers.
+//
+// The following functions implement algorithms suitable for use with scalars
+// and field elements in elliptic curves. They rely on the number being small
+// both to stack-allocate various temporaries and because they do not implement
+// optimizations useful for the larger values used in RSA.
+
+// BN_SMALL_MAX_WORDS is the largest size input these functions handle. This
+// limit allows temporaries to be more easily stack-allocated. This limit is set
+// to accommodate P-521.
+#if defined(OPENSSL_32_BIT)
+#define BN_SMALL_MAX_WORDS 17
+#else
+#define BN_SMALL_MAX_WORDS 9
+#endif
+
+// bn_mul_small sets |r| to |a|*|b|. |num_r| must be |num_a| + |num_b|. |r| may
+// not alias with |a| or |b|. This function returns one on success and zero if
+// lengths are inconsistent.
+int bn_mul_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a,
+                 const BN_ULONG *b, size_t num_b);
+
+// bn_sqr_small sets |r| to |a|^2. |num_a| must be at most |BN_SMALL_MAX_WORDS|.
+// |num_r| must be |num_a|*2. |r| and |a| may not alias. This function returns
+// one on success and zero on programmer error.
+int bn_sqr_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a);
+
+
 #if defined(__cplusplus)
 }  // extern C
 #endif
diff --git a/crypto/fipsmodule/bn/mul.c b/crypto/fipsmodule/bn/mul.c
index 9f67226..3234e22 100644
--- a/crypto/fipsmodule/bn/mul.c
+++ b/crypto/fipsmodule/bn/mul.c
@@ -59,6 +59,8 @@
 #include <assert.h>
 #include <string.h>
 
+#include <openssl/err.h>
+
 #include "internal.h"
 #include "../../internal.h"
 
@@ -654,6 +656,22 @@
   return ret;
 }
 
+int bn_mul_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a,
+                 const BN_ULONG *b, size_t num_b) {
+  if (num_r != num_a + num_b) {
+    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+    return 0;
+  }
+  // TODO(davidben): Should this call |bn_mul_comba4| too? |BN_mul| does not
+  // hit that code.
+  if (num_a == 8 && num_b == 8) {
+    bn_mul_comba8(r, a, b);
+  } else {
+    bn_mul_normal(r, a, num_a, b, num_b);
+  }
+  return 1;
+}
+
 // tmp must have 2*n words
 static void bn_sqr_normal(BN_ULONG *r, const BN_ULONG *a, size_t n,
                           BN_ULONG *tmp) {
@@ -864,3 +882,20 @@
   BN_CTX_end(ctx);
   return ret;
 }
+
+int bn_sqr_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a) {
+  if (num_r != 2 * num_a || num_a > BN_SMALL_MAX_WORDS) {
+    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+    return 0;
+  }
+  if (num_a == 4) {
+    bn_sqr_comba4(r, a);
+  } else if (num_a == 8) {
+    bn_sqr_comba8(r, a);
+  } else {
+    BN_ULONG tmp[2 * BN_SMALL_MAX_WORDS];
+    bn_sqr_normal(r, a, num_a, tmp);
+    OPENSSL_cleanse(tmp, 2 * num_a * sizeof(BN_ULONG));
+  }
+  return 1;
+}