Align rsaz and mont5 table construction.
Both implementations need to compute the first 32 powers of a. There's a
commented out naive version in rsaz_exp.c that claims to be smaller, but
1% slower. (It doesn't use squares when it otherwise could.)
Instead, we can write out the square-based strategy as a loop. (I wasn't
able to measure a difference between any of the three versions, but this
one's compact enough and does let us square more and gather5 less.)
Change-Id: I7015f2a78584cd97f29b54d0007479bdcc3a01ba
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/52828
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/bn/rsaz_exp.c b/crypto/fipsmodule/bn/rsaz_exp.c
index 074f05d..f4e50a6 100644
--- a/crypto/fipsmodule/bn/rsaz_exp.c
+++ b/crypto/fipsmodule/bn/rsaz_exp.c
@@ -66,23 +66,14 @@
// R2 = 2^3052 * 2^80 / 2^1044 = 2^2088 = (2^1044)^2
// table[0] = 1
- rsaz_1024_mul_avx2(result, R2, one, m, k0);
// table[1] = a_inv^1
+ rsaz_1024_mul_avx2(result, R2, one, m, k0);
rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0);
-
rsaz_1024_scatter5_avx2(table_s, result, 0);
rsaz_1024_scatter5_avx2(table_s, a_inv, 1);
-
// table[2] = a_inv^2
rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1);
rsaz_1024_scatter5_avx2(table_s, result, 2);
-#if 0
- // This is almost 2x smaller and less than 1% slower.
- for (int index = 3; index < 32; index++) {
- rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
- rsaz_1024_scatter5_avx2(table_s, result, index);
- }
-#else
// table[4] = a_inv^4
rsaz_1024_sqr_avx2(result, result, m, k0, 1);
rsaz_1024_scatter5_avx2(table_s, result, 4);
@@ -92,109 +83,25 @@
// table[16] = a_inv^16
rsaz_1024_sqr_avx2(result, result, m, k0, 1);
rsaz_1024_scatter5_avx2(table_s, result, 16);
- // table[17] = a_inv^17
- rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
- rsaz_1024_scatter5_avx2(table_s, result, 17);
+ for (int i = 3; i < 32; i += 2) {
+ // table[i] = table[i-1] * a_inv = a_inv^i
+ rsaz_1024_gather5_avx2(result, table_s, i - 1);
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ rsaz_1024_scatter5_avx2(table_s, result, i);
+ for (int j = 2 * i; j < 32; j *= 2) {
+ // table[j] = table[j/2]^2 = a_inv^j
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s, result, j);
+ }
+ }
- // table[3]
- rsaz_1024_gather5_avx2(result, table_s, 2);
- rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
- rsaz_1024_scatter5_avx2(table_s, result, 3);
- // table[6]
- rsaz_1024_sqr_avx2(result, result, m, k0, 1);
- rsaz_1024_scatter5_avx2(table_s, result, 6);
- // table[12]
- rsaz_1024_sqr_avx2(result, result, m, k0, 1);
- rsaz_1024_scatter5_avx2(table_s, result, 12);
- // table[24]
- rsaz_1024_sqr_avx2(result, result, m, k0, 1);
- rsaz_1024_scatter5_avx2(table_s, result, 24);
- // table[25]
- rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
- rsaz_1024_scatter5_avx2(table_s, result, 25);
-
- // table[5]
- rsaz_1024_gather5_avx2(result, table_s, 4);
- rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
- rsaz_1024_scatter5_avx2(table_s, result, 5);
- // table[10]
- rsaz_1024_sqr_avx2(result, result, m, k0, 1);
- rsaz_1024_scatter5_avx2(table_s, result, 10);
- // table[20]
- rsaz_1024_sqr_avx2(result, result, m, k0, 1);
- rsaz_1024_scatter5_avx2(table_s, result, 20);
- // table[21]
- rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
- rsaz_1024_scatter5_avx2(table_s, result, 21);
-
- // table[7]
- rsaz_1024_gather5_avx2(result, table_s, 6);
- rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
- rsaz_1024_scatter5_avx2(table_s, result, 7);
- // table[14]
- rsaz_1024_sqr_avx2(result, result, m, k0, 1);
- rsaz_1024_scatter5_avx2(table_s, result, 14);
- // table[28]
- rsaz_1024_sqr_avx2(result, result, m, k0, 1);
- rsaz_1024_scatter5_avx2(table_s, result, 28);
- // table[29]
- rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
- rsaz_1024_scatter5_avx2(table_s, result, 29);
-
- // table[9]
- rsaz_1024_gather5_avx2(result, table_s, 8);
- rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
- rsaz_1024_scatter5_avx2(table_s, result, 9);
- // table[18]
- rsaz_1024_sqr_avx2(result, result, m, k0, 1);
- rsaz_1024_scatter5_avx2(table_s, result, 18);
- // table[19]
- rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
- rsaz_1024_scatter5_avx2(table_s, result, 19);
-
- // table[11]
- rsaz_1024_gather5_avx2(result, table_s, 10);
- rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
- rsaz_1024_scatter5_avx2(table_s, result, 11);
- // table[22]
- rsaz_1024_sqr_avx2(result, result, m, k0, 1);
- rsaz_1024_scatter5_avx2(table_s, result, 22);
- // table[23]
- rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
- rsaz_1024_scatter5_avx2(table_s, result, 23);
-
- // table[13]
- rsaz_1024_gather5_avx2(result, table_s, 12);
- rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
- rsaz_1024_scatter5_avx2(table_s, result, 13);
- // table[26]
- rsaz_1024_sqr_avx2(result, result, m, k0, 1);
- rsaz_1024_scatter5_avx2(table_s, result, 26);
- // table[27]
- rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
- rsaz_1024_scatter5_avx2(table_s, result, 27);
-
- // table[15]
- rsaz_1024_gather5_avx2(result, table_s, 14);
- rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
- rsaz_1024_scatter5_avx2(table_s, result, 15);
- // table[30]
- rsaz_1024_sqr_avx2(result, result, m, k0, 1);
- rsaz_1024_scatter5_avx2(table_s, result, 30);
- // table[31]
- rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
- rsaz_1024_scatter5_avx2(table_s, result, 31);
-#endif
-
+ // Load the first window.
const uint8_t *p_str = (const uint8_t *)exponent;
-
- // load first window
int wvalue = p_str[127] >> 3;
rsaz_1024_gather5_avx2(result, table_s, wvalue);
int index = 1014;
while (index > -1) { // Loop for the remaining 127 windows.
-
rsaz_1024_sqr_avx2(result, result, m, k0, 5);
uint16_t wvalue_16;