Add an option to disable SSE2 intrinsics for testing.

We have some code that uses SSE2 intrinsics. Because intrinsics don't
have complicated build requirements, that code is enabled even with
OPENSSL_NO_ASM. x86_64 mandates SSE2 and people building for x86 tend to
mandate it anyway these days. This is great, but we still have generic
32-bit and 64-bit code configurations for other platforms.

32-bit generic code is covered by testing 32-bit ARM with NEON disabled.
However, 64-bit ARM always has NEON available, so we have no SIMD-less
64-bit platforms in our CI.

The immediate motivation is some bitsliced AES code I'm working on;
however, I believe this also applies to the existing HRSS code. This
also fixes the HRSS feature checks to only look at __SSE2__, not
__SSE__. __SSE__ isn't sufficient and we don't compile if GCC or Clang
is told -msse -mno-sse2.

Change-Id: Iebb23f1664a2f62e0b4333e0e99f7d5f6c7f384d
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/39204
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b7f468f..75bf998 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -458,6 +458,10 @@
   list(GET CMAKE_OSX_ARCHITECTURES 0 CMAKE_SYSTEM_PROCESSOR)
 endif()
 
+if(OPENSSL_NO_SSE2_FOR_TESTING)
+  add_definitions(-DOPENSSL_NO_SSE2_FOR_TESTING)
+endif()
+
 if(OPENSSL_NO_ASM)
   add_definitions(-DOPENSSL_NO_ASM)
   set(ARCH "generic")
diff --git a/crypto/fipsmodule/modes/gcm_nohw.c b/crypto/fipsmodule/modes/gcm_nohw.c
index 4dc3b27..f8618b8 100644
--- a/crypto/fipsmodule/modes/gcm_nohw.c
+++ b/crypto/fipsmodule/modes/gcm_nohw.c
@@ -17,7 +17,7 @@
 #include "../../internal.h"
 #include "internal.h"
 
-#if !defined(BORINGSSL_HAS_UINT128) && defined(__SSE2__)
+#if !defined(BORINGSSL_HAS_UINT128) && defined(OPENSSL_SSE2)
 #include <emmintrin.h>
 #endif
 
@@ -79,7 +79,7 @@
             ((uint64_t)(extra >> 64));
 }
 
-#elif defined(__SSE2__)
+#elif defined(OPENSSL_SSE2)
 
 static __m128i gcm_mul32_nohw(uint32_t a, uint32_t b) {
   // One term every four bits means the largest term is 32/4 = 8, which does not
@@ -146,7 +146,7 @@
   memcpy(out_hi, ((char*)&ret) + 8, 8);
 }
 
-#else  // !BORINGSSL_HAS_UINT128 && !__SSE2__
+#else  // !BORINGSSL_HAS_UINT128 && !OPENSSL_SSE2
 
 static uint64_t gcm_mul32_nohw(uint32_t a, uint32_t b) {
   // One term every four bits means the largest term is 32/4 = 8, which does not
diff --git a/crypto/hrss/hrss.c b/crypto/hrss/hrss.c
index d81a43f..0f66e97 100644
--- a/crypto/hrss/hrss.c
+++ b/crypto/hrss/hrss.c
@@ -24,15 +24,6 @@
 #include <openssl/mem.h>
 #include <openssl/sha.h>
 
-#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
-#include <emmintrin.h>
-#endif
-
-#if (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && \
-    (defined(__ARM_NEON__) || defined(__ARM_NEON))
-#include <arm_neon.h>
-#endif
-
 #if defined(_MSC_VER)
 #define RESTRICT
 #else
@@ -42,6 +33,15 @@
 #include "../internal.h"
 #include "internal.h"
 
+#if defined(OPENSSL_SSE2)
+#include <emmintrin.h>
+#endif
+
+#if (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && \
+    (defined(__ARM_NEON__) || defined(__ARM_NEON))
+#include <arm_neon.h>
+#endif
+
 // This is an implementation of [HRSS], but with a KEM transformation based on
 // [SXY]. The primary references are:
 
@@ -63,22 +63,15 @@
 // 128-bit vector. The following functions abstract over the differences between
 // NEON and SSE2 for implementing some vector operations.
 
-// TODO: MSVC can likely also be made to work with vector operations.
-#if ((defined(__SSE__) && defined(OPENSSL_X86)) || defined(OPENSSL_X86_64)) && \
-    (defined(__clang__) || !defined(_MSC_VER))
+// TODO: MSVC can likely also be made to work with vector operations, but ^ must
+// be replaced with _mm_xor_si128, etc.
+#if defined(OPENSSL_SSE2) && (defined(__clang__) || !defined(_MSC_VER))
 
 #define HRSS_HAVE_VECTOR_UNIT
 typedef __m128i vec_t;
 
 // vec_capable returns one iff the current platform supports SSE2.
-static int vec_capable(void) {
-#if defined(__SSE2__)
-  return 1;
-#else
-  int has_sse2 = (OPENSSL_ia32cap_P[0] & (1 << 26)) != 0;
-  return has_sse2;
-#endif
-}
+static int vec_capable(void) { return 1; }
 
 // vec_add performs a pair-wise addition of four uint16s from |a| and |b|.
 static inline vec_t vec_add(vec_t a, vec_t b) { return _mm_add_epi16(a, b); }
diff --git a/crypto/internal.h b/crypto/internal.h
index 1fba5b6..b75f9af 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -200,6 +200,14 @@
 #define OPENSSL_FALLTHROUGH
 #endif
 
+// For convenience in testing 64-bit generic code, we allow disabling SSE2
+// intrinsics via |OPENSSL_NO_SSE2_FOR_TESTING|. x86_64 always has SSE2
+// available, so we would otherwise need to test such code on a non-x86_64
+// platform.
+#if defined(__SSE2__) && !defined(OPENSSL_NO_SSE2_FOR_TESTING)
+#define OPENSSL_SSE2
+#endif
+
 // buffers_alias returns one if |a| and |b| alias and zero otherwise.
 static inline int buffers_alias(const uint8_t *a, size_t a_len,
                                 const uint8_t *b, size_t b_len) {