Use Barrett reduction in CBC processing rather than tricks.

Division isn't constant-time on Intel chips, so the code was adding a
large multiple of md_size to try to force the operation to always take
the maximum amount of time.

These days I'm less convinced that compilers won't get smart enough to
optimise that away, so use Barrett reduction instead.

Change-Id: Ib8c514192682a2fcb4b1fb7e7c6dd1301d9888d0
Reviewed-on: https://boringssl-review.googlesource.com/6906
Reviewed-by: David Benjamin <davidben@chromium.org>
Reviewed-by: Adam Langley <alangley@gmail.com>
diff --git a/crypto/cipher/tls_cbc.c b/crypto/cipher/tls_cbc.c
index c541db3..e6aaabc 100644
--- a/crypto/cipher/tls_cbc.c
+++ b/crypto/cipher/tls_cbc.c
@@ -146,7 +146,6 @@
    * the MAC's position can only vary by 255 bytes. */
   unsigned scan_start = 0;
   unsigned i, j;
-  unsigned div_spoiler;
   unsigned rotate_offset;
 
   assert(orig_len >= in_len);
@@ -161,16 +160,46 @@
   if (orig_len > md_size + 255 + 1) {
     scan_start = orig_len - (md_size + 255 + 1);
   }
-  /* div_spoiler contains a multiple of md_size that is used to cause the
-   * modulo operation to be constant time. Without this, the time varies
-   * based on the amount of padding when running on Intel chips at least.
+
+  /* Ideally the next statement would be:
    *
-   * The aim of right-shifting md_size is so that the compiler doesn't
-   * figure out that it can remove div_spoiler as that would require it
-   * to prove that md_size is always even, which I hope is beyond it. */
-  div_spoiler = md_size >> 1;
-  div_spoiler <<= (sizeof(div_spoiler) - 1) * 8;
-  rotate_offset = (div_spoiler + mac_start - scan_start) % md_size;
+   *   rotate_offset = (mac_start - scan_start) % md_size;
+   *
+   * However, division is not a constant-time operation (at least on Intel
+   * chips). Thus we enumerate the possible values of md_size and handle each
+   * separately. The value of |md_size| is public information (it's determined
+   * by the cipher suite in the ServerHello) so our timing can vary based on
+   * its value. */
+
+  rotate_offset = mac_start - scan_start;
+  /* rotate_offset can be, at most, 255 (bytes of padding) + 1 (padding length)
+   * + md_size = 256 + 48 (since SHA-384 is the largest hash) = 304. */
+  assert(rotate_offset <= 304);
+
+  if (md_size == 16) {
+    rotate_offset &= 15;
+  } else if (md_size == 20) {
+    /* 1/20 is approximated as 25/512 and then Barrett reduction is used.
+     * Analytically, this is correct for 0 <= rotate_offset <= 853. */
+    unsigned q = (rotate_offset * 25) >> 9;
+    rotate_offset -= q * 20;
+    rotate_offset -=
+        constant_time_select(constant_time_ge(rotate_offset, 20), 20, 0);
+  } else if (md_size == 32) {
+    rotate_offset &= 31;
+  } else if (md_size == 48) {
+    /* 1/48 is approximated as 10/512 and then Barrett reduction is used.
+     * Analytically, this is correct for 0 <= rotate_offset <= 768. */
+    unsigned q = (rotate_offset * 10) >> 9;
+    rotate_offset -= q * 48;
+    rotate_offset -=
+        constant_time_select(constant_time_ge(rotate_offset, 48), 48, 0);
+  } else {
+    /* This should be impossible therefore this path doesn't run in constant
+     * time. */
+    assert(0);
+    rotate_offset = rotate_offset % md_size;
+  }
 
   memset(rotated_mac, 0, md_size);
   for (i = scan_start, j = 0; i < orig_len; i++) {