Enable __asm__ and uint128_t code in clang-cl.

The code actually works fine under clang-cl. I just forgot one of the
typedefs last time. This gives a roughly 2x improvement on P-256 in
clang-cl + OPENSSL_SMALL, the configuration used by Chrome.

Before:
Did 1302 ECDH P-256 operations in 1015000us (1282.8 ops/sec)
Did 4250 ECDSA P-256 signing operations in 1047000us (4059.2 ops/sec)
Did 1750 ECDSA P-256 verify operations in 1094000us (1599.6 ops/sec)

After:
Did 3250 ECDH P-256 operations in 1078000us (3014.8 ops/sec)
Did 8250 ECDSA P-256 signing operations in 1016000us (8120.1 ops/sec)
Did 3250 ECDSA P-256 verify operations in 1063000us (3057.4 ops/sec)

(These were taken on a VM, so the measurements are extremely noisy, but
this sort of improvement is visible regardless.)

Alas, we do need a little extra fiddling because uint128_t division and
modulus do not work in clang-cl (crbug.com/787617).

Bug: chromium:787617
Update-Note: This removes the MSan uint128_t workaround, which does not
    appear to be necessary anymore.
Change-Id: I8361314608521e5bdaf0e7eeae7a02c33f55c69f
Reviewed-on: https://boringssl-review.googlesource.com/23984
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: Adam Langley <agl@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
diff --git a/crypto/fipsmodule/bn/asm/x86_64-gcc.c b/crypto/fipsmodule/bn/asm/x86_64-gcc.c
index 49351c1..30fff21 100644
--- a/crypto/fipsmodule/bn/asm/x86_64-gcc.c
+++ b/crypto/fipsmodule/bn/asm/x86_64-gcc.c
@@ -52,8 +52,9 @@
 
 #include <openssl/bn.h>
 
-// TODO(davidben): Get this file working on Windows x64.
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__GNUC__)
+// TODO(davidben): Get this file working on MSVC x64.
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
+    (defined(__GNUC__) || defined(__clang__))
 
 #include "../internal.h"
 
@@ -537,4 +538,4 @@
 #undef mul_add_c2
 #undef sqr_add_c2
 
-#endif  // !NO_ASM && X86_64 && __GNUC__
+#endif  // !NO_ASM && X86_64 && (__GNUC__ || __clang__)
diff --git a/crypto/fipsmodule/bn/div.c b/crypto/fipsmodule/bn/div.c
index c92eab3..7f261f1 100644
--- a/crypto/fipsmodule/bn/div.c
+++ b/crypto/fipsmodule/bn/div.c
@@ -155,18 +155,18 @@
   //
   // These issues aren't specific to x86 and x86_64, so it might be worthwhile
   // to add more assembly language implementations.
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__GNUC__)
-  __asm__ volatile (
-    "divl %4"
-    : "=a"(*quotient_out), "=d"(*rem_out)
-    : "a"(n1), "d"(n0), "rm"(d0)
-    : "cc" );
-#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__GNUC__)
-  __asm__ volatile (
-    "divq %4"
-    : "=a"(*quotient_out), "=d"(*rem_out)
-    : "a"(n1), "d"(n0), "rm"(d0)
-    : "cc" );
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && \
+    (defined(__GNUC__) || defined(__clang__))
+  __asm__ volatile("divl %4"
+                   : "=a"(*quotient_out), "=d"(*rem_out)
+                   : "a"(n1), "d"(n0), "rm"(d0)
+                   : "cc");
+#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
+    (defined(__GNUC__) || defined(__clang__))
+  __asm__ volatile("divq %4"
+                   : "=a"(*quotient_out), "=d"(*rem_out)
+                   : "a"(n1), "d"(n0), "rm"(d0)
+                   : "cc");
 #else
 #if defined(BN_ULLONG)
   BN_ULLONG n = (((BN_ULLONG)n0) << BN_BITS2) | n1;
@@ -617,7 +617,7 @@
 }
 
 BN_ULONG BN_mod_word(const BIGNUM *a, BN_ULONG w) {
-#ifndef BN_ULLONG
+#ifndef BN_CAN_DIVIDE_ULLONG
   BN_ULONG ret = 0;
 #else
   BN_ULLONG ret = 0;
@@ -628,9 +628,9 @@
     return (BN_ULONG) -1;
   }
 
-#ifndef BN_ULLONG
-  // If |w| is too long and we don't have |BN_ULLONG| then we need to fall back
-  // to using |BN_div_word|.
+#ifndef BN_CAN_DIVIDE_ULLONG
+  // If |w| is too long and we don't have |BN_ULLONG| division then we need to
+  // fall back to using |BN_div_word|.
   if (w > ((BN_ULONG)1 << BN_BITS4)) {
     BIGNUM *tmp = BN_dup(a);
     if (tmp == NULL) {
@@ -643,7 +643,7 @@
 #endif
 
   for (i = a->top - 1; i >= 0; i--) {
-#ifndef BN_ULLONG
+#ifndef BN_CAN_DIVIDE_ULLONG
     ret = ((ret << BN_BITS4) | ((a->d[i] >> BN_BITS4) & BN_MASK2l)) % w;
     ret = ((ret << BN_BITS4) | (a->d[i] & BN_MASK2l)) % w;
 #else
diff --git a/crypto/fipsmodule/bn/generic.c b/crypto/fipsmodule/bn/generic.c
index a39a033..ee80a3c 100644
--- a/crypto/fipsmodule/bn/generic.c
+++ b/crypto/fipsmodule/bn/generic.c
@@ -64,7 +64,8 @@
 // This file has two other implementations: x86 assembly language in
 // asm/bn-586.pl and x86_64 inline assembly in asm/x86_64-gcc.c.
 #if defined(OPENSSL_NO_ASM) || \
-    !(defined(OPENSSL_X86) || (defined(OPENSSL_X86_64) && defined(__GNUC__)))
+    !(defined(OPENSSL_X86) ||  \
+      (defined(OPENSSL_X86_64) && (defined(__GNUC__) || defined(__clang__))))
 
 #ifdef BN_ULLONG
 #define mul_add(r, a, w, c)               \
diff --git a/crypto/fipsmodule/bn/internal.h b/crypto/fipsmodule/bn/internal.h
index 75efbfa..706e544 100644
--- a/crypto/fipsmodule/bn/internal.h
+++ b/crypto/fipsmodule/bn/internal.h
@@ -140,9 +140,12 @@
 
 #if defined(OPENSSL_64_BIT)
 
-#if !defined(_MSC_VER)
+#if defined(BORINGSSL_HAS_UINT128)
 // MSVC doesn't support two-word integers on 64-bit.
 #define BN_ULLONG uint128_t
+#if defined(BORINGSSL_CAN_DIVIDE_UINT128)
+#define BN_CAN_DIVIDE_ULLONG
+#endif
 #endif
 
 #define BN_BITS2 64
@@ -160,6 +163,7 @@
 #elif defined(OPENSSL_32_BIT)
 
 #define BN_ULLONG uint64_t
+#define BN_CAN_DIVIDE_ULLONG
 #define BN_BITS2 32
 #define BN_BYTES 4
 #define BN_BITS4 16
diff --git a/crypto/fipsmodule/ec/ec.c b/crypto/fipsmodule/ec/ec.c
index ed54554..47a90ce 100644
--- a/crypto/fipsmodule/ec/ec.c
+++ b/crypto/fipsmodule/ec/ec.c
@@ -246,18 +246,11 @@
   out->curves[2].param_len = 32;
   out->curves[2].params = kP256Params;
   out->curves[2].method =
-// MSan appears to have a bug that causes code to be miscompiled in opt mode.
-// While that is being looked at, don't run the uint128_t code under MSan.
 #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
-    !defined(OPENSSL_SMALL) && !defined(MEMORY_SANITIZER)
+    !defined(OPENSSL_SMALL)
       EC_GFp_nistz256_method();
 #else
-#if defined(OPENSSL_32_BIT) || \
-    (defined(OPENSSL_64_BIT) && !defined(MEMORY_SANITIZER))
       EC_GFp_nistp256_method();
-#else
-      EC_GFp_mont_method();
-#endif
 #endif
 
   // 1.3.132.0.33
@@ -269,8 +262,7 @@
   out->curves[3].param_len = 28;
   out->curves[3].params = kP224Params;
   out->curves[3].method =
-#if defined(OPENSSL_64_BIT) && !defined(OPENSSL_WINDOWS) && \
-    !defined(MEMORY_SANITIZER) && !defined(OPENSSL_SMALL)
+#if defined(BORINGSSL_HAS_UINT128) && !defined(OPENSSL_SMALL)
       EC_GFp_nistp224_method();
 #else
       EC_GFp_mont_method();
diff --git a/crypto/fipsmodule/ec/p224-64.c b/crypto/fipsmodule/ec/p224-64.c
index d0285d6..0097209 100644
--- a/crypto/fipsmodule/ec/p224-64.c
+++ b/crypto/fipsmodule/ec/p224-64.c
@@ -19,9 +19,6 @@
 
 #include <openssl/base.h>
 
-#if defined(OPENSSL_64_BIT) && !defined(OPENSSL_WINDOWS) && \
-    !defined(OPENSSL_SMALL)
-
 #include <openssl/bn.h>
 #include <openssl/ec.h>
 #include <openssl/err.h>
@@ -34,6 +31,8 @@
 #include "../../internal.h"
 
 
+#if defined(BORINGSSL_HAS_UINT128) && !defined(OPENSSL_SMALL)
+
 // Field elements are represented as a_0 + 2^56*a_1 + 2^112*a_2 + 2^168*a_3
 // using 64-bit coefficients called 'limbs', and sometimes (for multiplication
 // results) as b_0 + 2^56*b_1 + 2^112*b_2 + 2^168*b_3 + 2^224*b_4 + 2^280*b_5 +
@@ -1129,4 +1128,4 @@
   out->field_decode = NULL;
 };
 
-#endif  // 64_BIT && !WINDOWS && !SMALL
+#endif  // BORINGSSL_HAS_UINT128 && !SMALL
diff --git a/crypto/internal.h b/crypto/internal.h
index 76d39b7..5706414 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -151,9 +151,16 @@
 #endif
 
 
-#if !defined(_MSC_VER) && defined(OPENSSL_64_BIT)
+#if (!defined(_MSC_VER) || defined(__clang__)) && defined(OPENSSL_64_BIT)
+#define BORINGSSL_HAS_UINT128
 typedef __int128_t int128_t;
 typedef __uint128_t uint128_t;
+
+// clang-cl supports __uint128_t but modulus and division don't work.
+// https://crbug.com/787617.
+#if !defined(_MSC_VER) || !defined(__clang__)
+#define BORINGSSL_CAN_DIVIDE_UINT128
+#endif
 #endif
 
 #define OPENSSL_ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
diff --git a/third_party/fiat/p256.c b/third_party/fiat/p256.c
index 19a8284..25ef383 100644
--- a/third_party/fiat/p256.c
+++ b/third_party/fiat/p256.c
@@ -29,11 +29,6 @@
 
 #include <openssl/base.h>
 
-// MSVC does not implement uint128_t, and crashes with intrinsics
-#if defined(OPENSSL_64_BIT) && !defined(OPENSSL_WINDOWS)
-#define BORINGSSL_NISTP256_64BIT 1
-#endif
-
 #include <openssl/bn.h>
 #include <openssl/ec.h>
 #include <openssl/err.h>
@@ -42,10 +37,15 @@
 #include <string.h>
 
 #include "../../crypto/fipsmodule/delocate.h"
-#include "../../crypto/internal.h"
 #include "../../crypto/fipsmodule/ec/internal.h"
+#include "../../crypto/internal.h"
 
 
+// MSVC does not implement uint128_t, and crashes with intrinsics
+#if defined(BORINGSSL_HAS_UINT128)
+#define BORINGSSL_NISTP256_64BIT 1
+#endif
+
 // "intrinsics"
 
 #if defined(BORINGSSL_NISTP256_64BIT)