Align rsaz and mont5 table construction. Both implementations need to compute the first 32 powers of a. There's a commented out naive version in rsaz_exp.c that claims to be smaller, but 1% slower. (It doesn't use squares when it otherwise could.) Instead, we can write out the square-based strategy as a loop. (I wasn't able to measure a difference between any of the three versions, but this one's compact enough and does let us square more and gather5 less.) Change-Id: I7015f2a78584cd97f29b54d0007479bdcc3a01ba Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/52828 Reviewed-by: Adam Langley <agl@google.com> Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/bn/rsaz_exp.c b/crypto/fipsmodule/bn/rsaz_exp.c index 074f05d..f4e50a6 100644 --- a/crypto/fipsmodule/bn/rsaz_exp.c +++ b/crypto/fipsmodule/bn/rsaz_exp.c
@@ -66,23 +66,14 @@ // R2 = 2^3052 * 2^80 / 2^1044 = 2^2088 = (2^1044)^2 // table[0] = 1 - rsaz_1024_mul_avx2(result, R2, one, m, k0); // table[1] = a_inv^1 + rsaz_1024_mul_avx2(result, R2, one, m, k0); rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0); - rsaz_1024_scatter5_avx2(table_s, result, 0); rsaz_1024_scatter5_avx2(table_s, a_inv, 1); - // table[2] = a_inv^2 rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1); rsaz_1024_scatter5_avx2(table_s, result, 2); -#if 0 - // This is almost 2x smaller and less than 1% slower. - for (int index = 3; index < 32; index++) { - rsaz_1024_mul_avx2(result, result, a_inv, m, k0); - rsaz_1024_scatter5_avx2(table_s, result, index); - } -#else // table[4] = a_inv^4 rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s, result, 4); @@ -92,109 +83,25 @@ // table[16] = a_inv^16 rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s, result, 16); - // table[17] = a_inv^17 - rsaz_1024_mul_avx2(result, result, a_inv, m, k0); - rsaz_1024_scatter5_avx2(table_s, result, 17); + for (int i = 3; i < 32; i += 2) { + // table[i] = table[i-1] * a_inv = a_inv^i + rsaz_1024_gather5_avx2(result, table_s, i - 1); + rsaz_1024_mul_avx2(result, result, a_inv, m, k0); + rsaz_1024_scatter5_avx2(table_s, result, i); + for (int j = 2 * i; j < 32; j *= 2) { + // table[j] = table[j/2]^2 = a_inv^j + rsaz_1024_sqr_avx2(result, result, m, k0, 1); + rsaz_1024_scatter5_avx2(table_s, result, j); + } + } - // table[3] - rsaz_1024_gather5_avx2(result, table_s, 2); - rsaz_1024_mul_avx2(result, result, a_inv, m, k0); - rsaz_1024_scatter5_avx2(table_s, result, 3); - // table[6] - rsaz_1024_sqr_avx2(result, result, m, k0, 1); - rsaz_1024_scatter5_avx2(table_s, result, 6); - // table[12] - rsaz_1024_sqr_avx2(result, result, m, k0, 1); - rsaz_1024_scatter5_avx2(table_s, result, 12); - // table[24] - rsaz_1024_sqr_avx2(result, result, m, k0, 1); - rsaz_1024_scatter5_avx2(table_s, result, 24); - // table[25] - rsaz_1024_mul_avx2(result, result, a_inv, m, k0); - rsaz_1024_scatter5_avx2(table_s, result, 25); - - // table[5] - rsaz_1024_gather5_avx2(result, table_s, 4); - rsaz_1024_mul_avx2(result, result, a_inv, m, k0); - rsaz_1024_scatter5_avx2(table_s, result, 5); - // table[10] - rsaz_1024_sqr_avx2(result, result, m, k0, 1); - rsaz_1024_scatter5_avx2(table_s, result, 10); - // table[20] - rsaz_1024_sqr_avx2(result, result, m, k0, 1); - rsaz_1024_scatter5_avx2(table_s, result, 20); - // table[21] - rsaz_1024_mul_avx2(result, result, a_inv, m, k0); - rsaz_1024_scatter5_avx2(table_s, result, 21); - - // table[7] - rsaz_1024_gather5_avx2(result, table_s, 6); - rsaz_1024_mul_avx2(result, result, a_inv, m, k0); - rsaz_1024_scatter5_avx2(table_s, result, 7); - // table[14] - rsaz_1024_sqr_avx2(result, result, m, k0, 1); - rsaz_1024_scatter5_avx2(table_s, result, 14); - // table[28] - rsaz_1024_sqr_avx2(result, result, m, k0, 1); - rsaz_1024_scatter5_avx2(table_s, result, 28); - // table[29] - rsaz_1024_mul_avx2(result, result, a_inv, m, k0); - rsaz_1024_scatter5_avx2(table_s, result, 29); - - // table[9] - rsaz_1024_gather5_avx2(result, table_s, 8); - rsaz_1024_mul_avx2(result, result, a_inv, m, k0); - rsaz_1024_scatter5_avx2(table_s, result, 9); - // table[18] - rsaz_1024_sqr_avx2(result, result, m, k0, 1); - rsaz_1024_scatter5_avx2(table_s, result, 18); - // table[19] - rsaz_1024_mul_avx2(result, result, a_inv, m, k0); - rsaz_1024_scatter5_avx2(table_s, result, 19); - - // table[11] - rsaz_1024_gather5_avx2(result, table_s, 10); - rsaz_1024_mul_avx2(result, result, a_inv, m, k0); - rsaz_1024_scatter5_avx2(table_s, result, 11); - // table[22] - rsaz_1024_sqr_avx2(result, result, m, k0, 1); - rsaz_1024_scatter5_avx2(table_s, result, 22); - // table[23] - rsaz_1024_mul_avx2(result, result, a_inv, m, k0); - rsaz_1024_scatter5_avx2(table_s, result, 23); - - // table[13] - rsaz_1024_gather5_avx2(result, table_s, 12); - rsaz_1024_mul_avx2(result, result, a_inv, m, k0); - rsaz_1024_scatter5_avx2(table_s, result, 13); - // table[26] - rsaz_1024_sqr_avx2(result, result, m, k0, 1); - rsaz_1024_scatter5_avx2(table_s, result, 26); - // table[27] - rsaz_1024_mul_avx2(result, result, a_inv, m, k0); - rsaz_1024_scatter5_avx2(table_s, result, 27); - - // table[15] - rsaz_1024_gather5_avx2(result, table_s, 14); - rsaz_1024_mul_avx2(result, result, a_inv, m, k0); - rsaz_1024_scatter5_avx2(table_s, result, 15); - // table[30] - rsaz_1024_sqr_avx2(result, result, m, k0, 1); - rsaz_1024_scatter5_avx2(table_s, result, 30); - // table[31] - rsaz_1024_mul_avx2(result, result, a_inv, m, k0); - rsaz_1024_scatter5_avx2(table_s, result, 31); -#endif - + // Load the first window. const uint8_t *p_str = (const uint8_t *)exponent; - - // load first window int wvalue = p_str[127] >> 3; rsaz_1024_gather5_avx2(result, table_s, wvalue); int index = 1014; while (index > -1) { // Loop for the remaining 127 windows. - rsaz_1024_sqr_avx2(result, result, m, k0, 5); uint16_t wvalue_16;