Simplify mont5 table computation.

Fold the hand-unrolled loops into the general loop; the unrolling appears to
have negligible perf impact:

Before:
Did 18480 RSA 2048 signing operations in 10005085us (1847.1 ops/sec)
Did 2720 RSA 4096 signing operations in 10056337us (270.5 ops/sec)

After:
Did 18480 RSA 2048 signing operations in 10012218us (1845.7 ops/sec) [-0.1%]
Did 2700 RSA 4096 signing operations in 10003972us (269.9 ops/sec) [-0.2%]

Change-Id: I29073c373a03a9798f6e04016626e6ab910e893a
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/52826
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/bn/exponentiation.c b/crypto/fipsmodule/bn/exponentiation.c
index 38013ed..e2e0d12 100644
--- a/crypto/fipsmodule/bn/exponentiation.c
+++ b/crypto/fipsmodule/bn/exponentiation.c
@@ -1074,7 +1074,7 @@
       bn_scatter5(tmp.d, top, powerbuf, i);
     }
     // Compute odd powers |i| based on |i - 1|, then all powers |i * 2^j|.
-    for (i = 3; i < 8; i += 2) {
+    for (i = 3; i < 32; i += 2) {
       bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
       bn_scatter5(tmp.d, top, powerbuf, i);
       for (int j = 2 * i; j < 32; j *= 2) {
@@ -1082,17 +1082,6 @@
         bn_scatter5(tmp.d, top, powerbuf, j);
       }
     }
-    // These two loops are the above with the inner loop unrolled.
-    for (; i < 16; i += 2) {
-      bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
-      bn_scatter5(tmp.d, top, powerbuf, i);
-      bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
-      bn_scatter5(tmp.d, top, powerbuf, 2 * i);
-    }
-    for (; i < 32; i += 2) {
-      bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
-      bn_scatter5(tmp.d, top, powerbuf, i);
-    }
 
     bits--;
     for (wvalue = 0, i = bits % 5; i >= 0; i--, bits--) {