Use Barrett reduction in CBC processing rather than tricks.

Division isn't constant-time on Intel chips, so the code was adding a
large multiple of md_size to try to force the operation to always take
the maximum amount of time.

These days I'm less convinced that compilers won't get smart enough to
optimise that away, so use Barrett reduction instead.

Change-Id: Ib8c514192682a2fcb4b1fb7e7c6dd1301d9888d0
Reviewed-on: https://boringssl-review.googlesource.com/6906
Reviewed-by: David Benjamin <davidben@chromium.org>
Reviewed-by: Adam Langley <alangley@gmail.com>
diff --git a/crypto/cipher/tls_cbc.c b/crypto/cipher/tls_cbc.c
index c541db3..e6aaabc 100644
--- a/crypto/cipher/tls_cbc.c
+++ b/crypto/cipher/tls_cbc.c
@@ -146,7 +146,6 @@
    * the MAC's position can only vary by 255 bytes. */
   unsigned scan_start = 0;
   unsigned i, j;
-  unsigned div_spoiler;
   unsigned rotate_offset;
 
   assert(orig_len >= in_len);
@@ -161,16 +160,46 @@
   if (orig_len > md_size + 255 + 1) {
     scan_start = orig_len - (md_size + 255 + 1);
   }
-  /* div_spoiler contains a multiple of md_size that is used to cause the
-   * modulo operation to be constant time. Without this, the time varies
-   * based on the amount of padding when running on Intel chips at least.
+
+  /* Ideally the next statement would be:
    *
-   * The aim of right-shifting md_size is so that the compiler doesn't
-   * figure out that it can remove div_spoiler as that would require it
-   * to prove that md_size is always even, which I hope is beyond it. */
-  div_spoiler = md_size >> 1;
-  div_spoiler <<= (sizeof(div_spoiler) - 1) * 8;
-  rotate_offset = (div_spoiler + mac_start - scan_start) % md_size;
+   *   rotate_offset = (mac_start - scan_start) % md_size;
+   *
+   * However, division is not a constant-time operation (at least on Intel
+   * chips). Thus we enumerate the possible values of md_size and handle each
+   * separately. The value of |md_size| is public information (it's determined
+   * by the cipher suite in the ServerHello) so our timing can vary based on
+   * its value. */
+
+  rotate_offset = mac_start - scan_start;
+  /* rotate_offset can be, at most, 255 (bytes of padding) + 1 (padding length)
+   * + md_size = 256 + 48 (since SHA-384 is the largest hash) = 304. */
+  assert(rotate_offset <= 304);
+
+  if (md_size == 16) {
+    rotate_offset &= 15;
+  } else if (md_size == 20) {
+    /* 1/20 is approximated as 25/512 and then Barrett reduction is used.
+     * Analytically, this is correct for 0 <= rotate_offset <= 853. */
+    unsigned q = (rotate_offset * 25) >> 9;
+    rotate_offset -= q * 20;
+    rotate_offset -=
+        constant_time_select(constant_time_ge(rotate_offset, 20), 20, 0);
+  } else if (md_size == 32) {
+    rotate_offset &= 31;
+  } else if (md_size == 48) {
+    /* 1/48 is approximated as 10/512 and then Barrett reduction is used.
+     * Analytically, this is correct for 0 <= rotate_offset <= 768. */
+    unsigned q = (rotate_offset * 10) >> 9;
+    rotate_offset -= q * 48;
+    rotate_offset -=
+        constant_time_select(constant_time_ge(rotate_offset, 48), 48, 0);
+  } else {
+    /* This should be impossible therefore this path doesn't run in constant
+     * time. */
+    assert(0);
+    rotate_offset = rotate_offset % md_size;
+  }
 
   memset(rotated_mac, 0, md_size);
   for (i = scan_start, j = 0; i < orig_len; i++) {