Add a value barrier to constant-time selects.

Clang recognizes the (mask & a) | (~mask & b) pattern as a select. While
it often optimizes this into a cmov, it sometimes inserts branches
instead, particularly when it detects a string of cmovs with the same
condition.

In the long term, we need language-level support for expressing our
constraints. In the short term, introduce value barriers to prevent the
compiler from reasoning about our bit tricks. Thanks to Chandler Carruth
for suggesting this pattern. It should be reasonably robust, short of
value-based PGO or the compiler learning to reason about empty inline
assembly blocks.

Apply barriers to our various constant-time selects. We should invest
more in the valgrind-based tooling to figure out if there are other
instances.

Change-Id: Icc24ce36a61f7fec021a762c27197b9c5bd28c5d
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/36484
Reviewed-by: Chandler Carruth <chandlerc@google.com>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/constant_time_test.cc b/crypto/constant_time_test.cc
index 59a7bb1..ae80003 100644
--- a/crypto/constant_time_test.cc
+++ b/crypto/constant_time_test.cc
@@ -153,3 +153,19 @@
     }
   }
 }
+
+TEST(ConstantTimeTest, ValueBarrier) {
+  for (int i = 0; i < 10; i++) {
+    crypto_word_t word;
+    RAND_bytes(reinterpret_cast<uint8_t *>(&word), sizeof(word));
+    EXPECT_EQ(word, value_barrier_w(word));
+
+    uint32_t u32;
+    RAND_bytes(reinterpret_cast<uint8_t *>(&u32), sizeof(u32));
+    EXPECT_EQ(u32, value_barrier_u32(u32));
+
+    uint64_t u64;
+    RAND_bytes(reinterpret_cast<uint8_t *>(&u64), sizeof(u64));
+    EXPECT_EQ(u64, value_barrier_u64(u64));
+  }
+}
diff --git a/crypto/internal.h b/crypto/internal.h
index b80f065..76a317a 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -240,6 +240,36 @@
 #define CONSTTIME_TRUE_8 ((uint8_t)0xff)
 #define CONSTTIME_FALSE_8 ((uint8_t)0)
 
+// value_barrier_w returns |a|, but prevents GCC and Clang from reasoning about
+// the returned value. This is used to mitigate compilers undoing constant-time
+// code, until we can express our requirements directly in the language.
+//
+// Note the compiler is aware that |value_barrier_w| has no side effects and
+// always has the same output for a given input. This allows it to eliminate
+// dead code, move computations across loops, and vectorize.
+static inline crypto_word_t value_barrier_w(crypto_word_t a) {
+#if !defined(OPENSSL_NO_ASM) && (defined(__GNUC__) || defined(__clang__))
+  __asm__("" : "+r"(a) : /* no inputs */);
+#endif
+  return a;
+}
+
+// value_barrier_u32 behaves like |value_barrier_w| but takes a |uint32_t|.
+static inline uint32_t value_barrier_u32(uint32_t a) {
+#if !defined(OPENSSL_NO_ASM) && (defined(__GNUC__) || defined(__clang__))
+  __asm__("" : "+r"(a) : /* no inputs */);
+#endif
+  return a;
+}
+
+// value_barrier_u64 behaves like |value_barrier_w| but takes a |uint64_t|.
+static inline uint64_t value_barrier_u64(uint64_t a) {
+#if !defined(OPENSSL_NO_ASM) && (defined(__GNUC__) || defined(__clang__))
+  __asm__("" : "+r"(a) : /* no inputs */);
+#endif
+  return a;
+}
+
 // constant_time_msb_w returns the given value with the MSB copied to all the
 // other bits.
 static inline crypto_word_t constant_time_msb_w(crypto_word_t a) {
@@ -352,7 +382,13 @@
 static inline crypto_word_t constant_time_select_w(crypto_word_t mask,
                                                    crypto_word_t a,
                                                    crypto_word_t b) {
-  return (mask & a) | (~mask & b);
+  // Clang recognizes this pattern as a select. While it usually transforms it
+  // to a cmov, it sometimes further transforms it into a branch, which we do
+  // not want.
+  //
+  // Adding barriers to both |mask| and |~mask| breaks the relationship between
+  // the two, which makes the compiler stick with bitmasks.
+  return (value_barrier_w(mask) & a) | (value_barrier_w(~mask) & b);
 }
 
 // constant_time_select_8 acts like |constant_time_select| but operates on
diff --git a/third_party/fiat/curve25519_32.h b/third_party/fiat/curve25519_32.h
index 820a5c9..5377242 100644
--- a/third_party/fiat/curve25519_32.h
+++ b/third_party/fiat/curve25519_32.h
@@ -90,7 +90,13 @@
 static void fiat_25519_cmovznz_u32(uint32_t* out1, fiat_25519_uint1 arg1, uint32_t arg2, uint32_t arg3) {
   fiat_25519_uint1 x1 = (!(!arg1));
   uint32_t x2 = ((fiat_25519_int1)(0x0 - x1) & UINT32_C(0xffffffff));
-  uint32_t x3 = ((x2 & arg3) | ((~x2) & arg2));
+  // Note this line has been patched from the synthesized code to add value
+  // barriers.
+  //
+  // Clang recognizes this pattern as a select. While it usually transforms it
+  // to a cmov, it sometimes further transforms it into a branch, which we do
+  // not want.
+  uint32_t x3 = ((value_barrier_u32(x2) & arg3) | (value_barrier_u32(~x2) & arg2));
   *out1 = x3;
 }
 
diff --git a/third_party/fiat/curve25519_64.h b/third_party/fiat/curve25519_64.h
index 23bf361..7c31ff9 100644
--- a/third_party/fiat/curve25519_64.h
+++ b/third_party/fiat/curve25519_64.h
@@ -58,7 +58,13 @@
 static void fiat_25519_cmovznz_u64(uint64_t* out1, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) {
   fiat_25519_uint1 x1 = (!(!arg1));
   uint64_t x2 = ((fiat_25519_int1)(0x0 - x1) & UINT64_C(0xffffffffffffffff));
-  uint64_t x3 = ((x2 & arg3) | ((~x2) & arg2));
+  // Note this line has been patched from the synthesized code to add value
+  // barriers.
+  //
+  // Clang recognizes this pattern as a select. While it usually transforms it
+  // to a cmov, it sometimes further transforms it into a branch, which we do
+  // not want.
+  uint64_t x3 = ((value_barrier_u64(x2) & arg3) | (value_barrier_u64(~x2) & arg2));
   *out1 = x3;
 }
 
diff --git a/third_party/fiat/p256_32.h b/third_party/fiat/p256_32.h
index faaa0b0..638eb5d 100644
--- a/third_party/fiat/p256_32.h
+++ b/third_party/fiat/p256_32.h
@@ -77,7 +77,13 @@
 static void fiat_p256_cmovznz_u32(uint32_t* out1, fiat_p256_uint1 arg1, uint32_t arg2, uint32_t arg3) {
   fiat_p256_uint1 x1 = (!(!arg1));
   uint32_t x2 = ((fiat_p256_int1)(0x0 - x1) & UINT32_C(0xffffffff));
-  uint32_t x3 = ((x2 & arg3) | ((~x2) & arg2));
+  // Note this line has been patched from the synthesized code to add value
+  // barriers.
+  //
+  // Clang recognizes this pattern as a select. While it usually transforms it
+  // to a cmov, it sometimes further transforms it into a branch, which we do
+  // not want.
+  uint32_t x3 = ((value_barrier_u32(x2) & arg3) | (value_barrier_u32(~x2) & arg2));
   *out1 = x3;
 }
 
diff --git a/third_party/fiat/p256_64.h b/third_party/fiat/p256_64.h
index 8e449c6..7d97e0a 100644
--- a/third_party/fiat/p256_64.h
+++ b/third_party/fiat/p256_64.h
@@ -79,7 +79,13 @@
 static void fiat_p256_cmovznz_u64(uint64_t* out1, fiat_p256_uint1 arg1, uint64_t arg2, uint64_t arg3) {
   fiat_p256_uint1 x1 = (!(!arg1));
   uint64_t x2 = ((fiat_p256_int1)(0x0 - x1) & UINT64_C(0xffffffffffffffff));
-  uint64_t x3 = ((x2 & arg3) | ((~x2) & arg2));
+  // Note this line has been patched from the synthesized code to add value
+  // barriers.
+  //
+  // Clang recognizes this pattern as a select. While it usually transforms it
+  // to a cmov, it sometimes further transforms it into a branch, which we do
+  // not want.
+  uint64_t x3 = ((value_barrier_u64(x2) & arg3) | (value_barrier_u64(~x2) & arg2));
   *out1 = x3;
 }