Align rsaz and mont5 table construction.

Both implementations need to compute the first 32 powers of a. There's a
commented out naive version in rsaz_exp.c that claims to be smaller, but
1% slower. (It doesn't use squares when it otherwise could.)

Instead, we can write out the square-based strategy as a loop. (I wasn't
able to measure a difference between any of the three versions, but this
one's compact enough and does let us square more and gather5 less.)

Change-Id: I7015f2a78584cd97f29b54d0007479bdcc3a01ba
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/52828
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/bn/rsaz_exp.c b/crypto/fipsmodule/bn/rsaz_exp.c
index 074f05d..f4e50a6 100644
--- a/crypto/fipsmodule/bn/rsaz_exp.c
+++ b/crypto/fipsmodule/bn/rsaz_exp.c
@@ -66,23 +66,14 @@
   // R2 = 2^3052 * 2^80 / 2^1044 = 2^2088 = (2^1044)^2
 
   // table[0] = 1
-  rsaz_1024_mul_avx2(result, R2, one, m, k0);
   // table[1] = a_inv^1
+  rsaz_1024_mul_avx2(result, R2, one, m, k0);
   rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0);
-
   rsaz_1024_scatter5_avx2(table_s, result, 0);
   rsaz_1024_scatter5_avx2(table_s, a_inv, 1);
-
   // table[2] = a_inv^2
   rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1);
   rsaz_1024_scatter5_avx2(table_s, result, 2);
-#if 0
-  // This is almost 2x smaller and less than 1% slower.
-  for (int index = 3; index < 32; index++) {
-    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-    rsaz_1024_scatter5_avx2(table_s, result, index);
-  }
-#else
   // table[4] = a_inv^4
   rsaz_1024_sqr_avx2(result, result, m, k0, 1);
   rsaz_1024_scatter5_avx2(table_s, result, 4);
@@ -92,109 +83,25 @@
   // table[16] = a_inv^16
   rsaz_1024_sqr_avx2(result, result, m, k0, 1);
   rsaz_1024_scatter5_avx2(table_s, result, 16);
-  // table[17] = a_inv^17
-  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-  rsaz_1024_scatter5_avx2(table_s, result, 17);
+  for (int i = 3; i < 32; i += 2) {
+    // table[i] = table[i-1] * a_inv = a_inv^i
+    rsaz_1024_gather5_avx2(result, table_s, i - 1);
+    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+    rsaz_1024_scatter5_avx2(table_s, result, i);
+    for (int j = 2 * i; j < 32; j *= 2) {
+      // table[j] = table[j/2]^2 = a_inv^j
+      rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+      rsaz_1024_scatter5_avx2(table_s, result, j);
+    }
+  }
 
-  // table[3]
-  rsaz_1024_gather5_avx2(result, table_s, 2);
-  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-  rsaz_1024_scatter5_avx2(table_s, result, 3);
-  // table[6]
-  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-  rsaz_1024_scatter5_avx2(table_s, result, 6);
-  // table[12]
-  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-  rsaz_1024_scatter5_avx2(table_s, result, 12);
-  // table[24]
-  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-  rsaz_1024_scatter5_avx2(table_s, result, 24);
-  // table[25]
-  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-  rsaz_1024_scatter5_avx2(table_s, result, 25);
-
-  // table[5]
-  rsaz_1024_gather5_avx2(result, table_s, 4);
-  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-  rsaz_1024_scatter5_avx2(table_s, result, 5);
-  // table[10]
-  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-  rsaz_1024_scatter5_avx2(table_s, result, 10);
-  // table[20]
-  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-  rsaz_1024_scatter5_avx2(table_s, result, 20);
-  // table[21]
-  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-  rsaz_1024_scatter5_avx2(table_s, result, 21);
-
-  // table[7]
-  rsaz_1024_gather5_avx2(result, table_s, 6);
-  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-  rsaz_1024_scatter5_avx2(table_s, result, 7);
-  // table[14]
-  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-  rsaz_1024_scatter5_avx2(table_s, result, 14);
-  // table[28]
-  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-  rsaz_1024_scatter5_avx2(table_s, result, 28);
-  // table[29]
-  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-  rsaz_1024_scatter5_avx2(table_s, result, 29);
-
-  // table[9]
-  rsaz_1024_gather5_avx2(result, table_s, 8);
-  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-  rsaz_1024_scatter5_avx2(table_s, result, 9);
-  // table[18]
-  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-  rsaz_1024_scatter5_avx2(table_s, result, 18);
-  // table[19]
-  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-  rsaz_1024_scatter5_avx2(table_s, result, 19);
-
-  // table[11]
-  rsaz_1024_gather5_avx2(result, table_s, 10);
-  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-  rsaz_1024_scatter5_avx2(table_s, result, 11);
-  // table[22]
-  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-  rsaz_1024_scatter5_avx2(table_s, result, 22);
-  // table[23]
-  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-  rsaz_1024_scatter5_avx2(table_s, result, 23);
-
-  // table[13]
-  rsaz_1024_gather5_avx2(result, table_s, 12);
-  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-  rsaz_1024_scatter5_avx2(table_s, result, 13);
-  // table[26]
-  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-  rsaz_1024_scatter5_avx2(table_s, result, 26);
-  // table[27]
-  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-  rsaz_1024_scatter5_avx2(table_s, result, 27);
-
-  // table[15]
-  rsaz_1024_gather5_avx2(result, table_s, 14);
-  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-  rsaz_1024_scatter5_avx2(table_s, result, 15);
-  // table[30]
-  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-  rsaz_1024_scatter5_avx2(table_s, result, 30);
-  // table[31]
-  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-  rsaz_1024_scatter5_avx2(table_s, result, 31);
-#endif
-
+  // Load the first window.
   const uint8_t *p_str = (const uint8_t *)exponent;
-
-  // load first window
   int wvalue = p_str[127] >> 3;
   rsaz_1024_gather5_avx2(result, table_s, wvalue);
 
   int index = 1014;
   while (index > -1) {  // Loop for the remaining 127 windows.
-
     rsaz_1024_sqr_avx2(result, result, m, k0, 5);
 
     uint16_t wvalue_16;