Switch crypto/bn back to _umul128 on Windows clang.

Upstream clang (impressively quickly) fixed the missing intrinsic, so switch
Windows clang back to building the same code as MSVC. Also include the intrin.h
header rather than forward-declaring the intrinsics; with clang they only work
if the header is explicitly included. Chromium forcibly includes it to work
around these kinds of issues, but we shouldn't rely on that.

BUG=crbug.com/438382

Change-Id: I0ff6d48e1a3aa455cff99f8dc4c407e88b84d446
Reviewed-on: https://boringssl-review.googlesource.com/2461
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/bn/internal.h b/crypto/bn/internal.h
index 573ddd7..e94f5ad 100644
--- a/crypto/bn/internal.h
+++ b/crypto/bn/internal.h
@@ -127,6 +127,11 @@
 
 #include <inttypes.h>
 
+#if defined(OPENSSL_X86_64) && defined(_MSC_VER) && _MSC_VER >= 1400
+#include <intrin.h>
+#pragma intrinsic(__umulh, _umul128)
+#endif
+
 #if defined(__cplusplus)
 extern "C" {
 #endif
@@ -239,8 +244,7 @@
   }
 
 #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64)
-/* Windows clang lacks _umul128, but inline asm works. */
-# if (defined(__GNUC__) && __GNUC__>=2) || defined(__clang__)
+# if defined(__GNUC__) && __GNUC__ >= 2
 #  define BN_UMULT_HIGH(a,b)	({	\
 	register BN_ULONG ret,discard;	\
 	__asm__ ("mulq	%3"		\
@@ -253,13 +257,9 @@
 		: "=a"(low),"=d"(high)	\
 		: "a"(a),"g"(b)		\
 		: "cc");
-# elif defined(_MSC_VER) && _MSC_VER>=1400
-   unsigned __int64 __umulh	(unsigned __int64 a,unsigned __int64 b);
-   unsigned __int64 _umul128	(unsigned __int64 a,unsigned __int64 b,
-				 unsigned __int64 *h);
-#  pragma intrinsic(__umulh,_umul128)
-#  define BN_UMULT_HIGH(a,b)		__umulh((a),(b))
-#  define BN_UMULT_LOHI(low,high,a,b)	((low)=_umul128((a),(b),&(high)))
+# elif defined(_MSC_VER) && _MSC_VER >= 1400
+#  define BN_UMULT_HIGH(a, b) __umulh((a), (b))
+#  define BN_UMULT_LOHI(low, high, a, b) ((low) = _umul128((a), (b), &(high)))
 # endif
 #elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64)
 # if defined(__GNUC__) && __GNUC__>=2