Replace cpuid assembly with C code.
Rather, take a leaf out of Chromium's book and use MSVC's __cpuid and
_xgetbv built-in, with an inline assembly emulated version for other
compilers.
This preserves the behavior of the original assembly with the following
differences:
- CPUs without cpuid aren't support. Chromium's base/cpu.cc doesn't
check, and SSE2 support is part of our baseline; the perlasm code
is always built with OPENSSL_IA32_SSE2.
- The clear_xmm block in cpu-x86-asm.pl is removed. This was used to
clear some XMM-using features if OSXSAVE was set but XCR0 reports the
OS doesn't use XSAVE to store SSE state. This wasn't present in the
x86_64 and seems wrong. Section 13.5.2 of the Intel manual, volume 1,
explicitly says SSE may still be used in this case; the OS may save
that state in FXSAVE instead. A side discussion on upstream's RT#2633
agrees.
- The old code ran some AMD CPUs through the "intel" codepath and some
went straight to "generic" after duplicating some, but not all, logic.
The AMD copy didn't clear some reserved bits and didn't query CPUID 7
for AVX2 support. This is moot since AMD CPUs today don't support
AVX2, but it seems they're expected to in the future?
- Setting bit 10 is dropped. This doesn't appear to be queried anywhere,
was 32-bit only, and seems a remnant of upstream's
14e21f863a3e3278bb8660ea9844e92e52e1f2f7.
Change-Id: I0548877c97e997f7beb25e15f3fea71c68a951d2
Reviewed-on: https://boringssl-review.googlesource.com/5434
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/CMakeLists.txt b/crypto/CMakeLists.txt
index 6858cbb..b31dc00 100644
--- a/crypto/CMakeLists.txt
+++ b/crypto/CMakeLists.txt
@@ -57,7 +57,6 @@
set(
CRYPTO_ARCH_SOURCES
- cpu-x86_64-asm.${ASM_EXT}
cpu-intel.c
)
endif()
@@ -66,7 +65,6 @@
set(
CRYPTO_ARCH_SOURCES
- cpu-x86-asm.${ASM_EXT}
cpu-intel.c
)
endif()
@@ -230,6 +228,3 @@
)
target_link_libraries(refcount_test crypto)
-
-perlasm(cpu-x86_64-asm.${ASM_EXT} cpu-x86_64-asm.pl)
-perlasm(cpu-x86-asm.${ASM_EXT} cpu-x86-asm.pl)
diff --git a/crypto/cpu-intel.c b/crypto/cpu-intel.c
index df0e127..6943999 100644
--- a/crypto/cpu-intel.c
+++ b/crypto/cpu-intel.c
@@ -63,13 +63,62 @@
#if !defined(OPENSSL_NO_ASM) && (defined(OPENSSL_X86) || defined(OPENSSL_X86_64))
+#include <assert.h>
#include <inttypes.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
-/* OPENSSL_ia32_cpuid is defined in cpu-x86_64-asm.pl. */
-extern uint64_t OPENSSL_ia32_cpuid(uint32_t*);
+#if defined(OPENSSL_WINDOWS)
+#include <immintrin.h>
+#include <intrin.h>
+#endif
+
+
+/* OPENSSL_cpuid runs the cpuid instruction. |leaf| is passed in as EAX and ECX
+ * is set to zero. It writes EAX, EBX, ECX, and EDX to |*out_eax| through
+ * |*out_edx|. */
+static void OPENSSL_cpuid(uint32_t *out_eax, uint32_t *out_ebx,
+ uint32_t *out_ecx, uint32_t *out_edx, uint32_t leaf) {
+#if defined(OPENSSL_WINDOWS)
+ int tmp[4];
+ __cpuid(tmp, (int)leaf);
+ *out_eax = (uint32_t)tmp[0];
+ *out_ebx = (uint32_t)tmp[1];
+ *out_ecx = (uint32_t)tmp[2];
+ *out_edx = (uint32_t)tmp[3];
+#elif defined(__pic__) && defined(OPENSSL_32_BIT)
+ /* Inline assembly may not clobber the PIC register. For 32-bit, this is EBX.
+ * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602. */
+ __asm__ volatile (
+ "xor %%ecx, %%ecx\n"
+ "mov %%ebx, %%edi\n"
+ "cpuid\n"
+ "xchg %%edi, %%ebx\n"
+ : "=a"(*out_eax), "=D"(*out_ebx), "=c"(*out_ecx), "=d"(*out_edx)
+ : "a"(leaf)
+ );
+#else
+ __asm__ volatile (
+ "xor %%ecx, %%ecx\n"
+ "cpuid\n"
+ : "=a"(*out_eax), "=b"(*out_ebx), "=c"(*out_ecx), "=d"(*out_edx)
+ : "a"(leaf)
+ );
+#endif
+}
+
+/* OPENSSL_xgetbv returns the value of an Intel Extended Control Register (XCR).
+ * Currently only XCR0 is defined by Intel so |xcr| should always be zero. */
+static uint64_t OPENSSL_xgetbv(uint32_t xcr) {
+#if defined(OPENSSL_WINDOWS)
+ return (uint64_t)_xgetbv(xcr);
+#else
+ uint32_t eax, edx;
+ __asm__ volatile ("xgetbv" : "=a"(eax), "=d"(edx) : "c"(xcr));
+ return (((uint64_t)edx) << 32) | eax;
+#endif
+}
/* handle_cpu_env applies the value from |in| to the CPUID values in |out[0]|
* and |out[1]|. See the comment in |OPENSSL_cpuid_setup| about this. */
@@ -91,18 +140,122 @@
}
void OPENSSL_cpuid_setup(void) {
+ /* Determine the vendor and maximum input value. */
+ uint32_t eax, ebx, ecx, edx;
+ OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 0);
+
+ uint32_t num_ids = eax;
+
+ int is_intel = ebx == 0x756e6547 /* Genu */ &&
+ edx == 0x49656e69 /* ineI */ &&
+ ecx == 0x6c65746e /* ntel */;
+ int is_amd = ebx == 0x68747541 /* Auth */ &&
+ edx == 0x69746e65 /* enti */ &&
+ ecx == 0x444d4163 /* cAMD */;
+
+ int has_amd_xop = 0;
+ uint32_t num_amd_extended_ids = 0;
+ if (is_amd) {
+ /* AMD-specific logic.
+ * See http://developer.amd.com/wordpress/media/2012/10/254811.pdf */
+ OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 0x80000000);
+ num_amd_extended_ids = eax;
+ if (num_amd_extended_ids >= 0x80000001) {
+ OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 0x80000001);
+ if (ecx & (1 << 11)) {
+ has_amd_xop = 1;
+ }
+ }
+ }
+
+ uint32_t extended_features = 0;
+ if (num_ids >= 7) {
+ OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 7);
+ extended_features = ebx;
+ }
+
+ /* Query the main feature flags and adjust the hyper-threading bit.
+ *
+ * TODO(davidben): Can the AMD half of logic be trimmed down? I haven't found
+ * evidence that any current AMD CPUs share an L1 data cache between threads,
+ * and the CPUID manual (see section 3) suggests this code will always clear
+ * HTT on AMD. Only aes-586.pl queries this, so hopefully any future CPUs will
+ * use better implementations anyway. */
+ if (num_amd_extended_ids >= 0x80000008) {
+ /* See AMD CPUID manual (2.34), page 27. */
+ assert(is_amd);
+ OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 0x80000008);
+ uint32_t num_physical_cores = 1 + (ecx & 0xff);
+
+ OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 1);
+
+ /* Correct the hyper-threading bit. */
+ if (edx & (1 << 28)) {
+ /* See AMD CPUID manual (2.34), pages 11 and 13. */
+ uint32_t num_logical_cores = (ebx >> 16) & 0xff;
+ if (num_logical_cores <= num_physical_cores) {
+ edx &= ~(1 << 28);
+ }
+ }
+ } else {
+ uint32_t cores_per_cache = 0;
+ if (num_ids >= 4) {
+ /* TODO(davidben): The Intel manual says this CPUID leaf enumerates all
+ * caches using ECX and doesn't say which is first. Does this matter? */
+ OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 4);
+ cores_per_cache = 1 + ((eax >> 14) & 0xfff);
+ }
+
+ OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 1);
+
+ /* Correct the hyper-threading bit if the data cache isn't shared between
+ * logical cores. */
+ if (edx & (1 << 28)) {
+ uint32_t num_logical_cores = (ebx >> 16) & 0xff;
+ if (cores_per_cache == 1 || num_logical_cores <= 1) {
+ edx &= ~(1 << 28);
+ }
+ }
+ }
+
+ /* Reserved bit #20 was historically repurposed to control the in-memory
+ * representation of RC4 state. Always set it to zero. */
+ edx &= ~(1 << 20);
+
+ /* Reserved bit #30 is repurposed to signal an Intel CPU. */
+ if (is_intel) {
+ edx |= (1 << 30);
+ } else {
+ edx &= ~(1 << 30);
+ }
+
+ /* The SDBG bit is repurposed to denote AMD XOP support. */
+ if (has_amd_xop) {
+ ecx |= (1 << 11);
+ } else {
+ ecx &= ~(1 << 11);
+ }
+
+ uint64_t xcr0 = 0;
+ if (ecx & (1 << 27)) {
+ /* XCR0 may only be queried if the OSXSAVE bit is set. */
+ xcr0 = OPENSSL_xgetbv(0);
+ }
+ /* See Intel manual, section 14.3. */
+ if ((xcr0 & 6) != 6) {
+ /* YMM registers cannot be used. */
+ ecx &= ~(1 << 28); /* AVX */
+ ecx &= ~(1 << 12); /* FMA */
+ ecx &= ~(1 << 11); /* AMD XOP */
+ extended_features &= ~(1 << 5); /* AVX2 */
+ }
+
+ OPENSSL_ia32cap_P[0] = edx;
+ OPENSSL_ia32cap_P[1] = ecx;
+ OPENSSL_ia32cap_P[2] = extended_features;
+ OPENSSL_ia32cap_P[3] = 0;
+
const char *env1, *env2;
-
-#if defined(OPENSSL_X86_64)
- OPENSSL_ia32_cpuid(OPENSSL_ia32cap_P);
-#else
- uint64_t vec = OPENSSL_ia32_cpuid(OPENSSL_ia32cap_P);
- /* 1<<10 sets a reserved bit to indicate that the variable
- * was already initialised. */
- OPENSSL_ia32cap_P[0] = ((uint32_t)vec) | (1 << 10);
- OPENSSL_ia32cap_P[1] = vec >> 32;
-#endif
-
env1 = getenv("OPENSSL_ia32cap");
if (env1 == NULL) {
return;
diff --git a/crypto/cpu-x86-asm.pl b/crypto/cpu-x86-asm.pl
deleted file mode 100644
index 319c436..0000000
--- a/crypto/cpu-x86-asm.pl
+++ /dev/null
@@ -1,334 +0,0 @@
-#!/usr/bin/env perl
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-push(@INC, "${dir}perlasm", "perlasm");
-require "x86asm.pl";
-
-&asm_init($ARGV[0],"crypto/cpu-x86-asm");
-
-for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
-
-&function_begin("OPENSSL_ia32_cpuid");
- &xor ("edx","edx");
- &pushf ();
- &pop ("eax");
- &mov ("ecx","eax");
- &xor ("eax",1<<21);
- &push ("eax");
- &popf ();
- &pushf ();
- &pop ("eax");
- &xor ("ecx","eax");
- &xor ("eax","eax");
- &bt ("ecx",21);
- &jnc (&label("nocpuid"));
- &mov ("esi",&wparam(0));
- &mov (&DWP(8,"esi"),"eax"); # clear 3rd word
- &cpuid ();
- &mov ("edi","eax"); # max value for standard query level
-
- &xor ("eax","eax");
- &cmp ("ebx",0x756e6547); # "Genu"
- &setne (&LB("eax"));
- &mov ("ebp","eax");
- &cmp ("edx",0x49656e69); # "ineI"
- &setne (&LB("eax"));
- &or ("ebp","eax");
- &cmp ("ecx",0x6c65746e); # "ntel"
- &setne (&LB("eax"));
- &or ("ebp","eax"); # 0 indicates Intel CPU
- &jz (&label("intel"));
-
- &cmp ("ebx",0x68747541); # "Auth"
- &setne (&LB("eax"));
- &mov ("esi","eax");
- &cmp ("edx",0x69746E65); # "enti"
- &setne (&LB("eax"));
- &or ("esi","eax");
- &cmp ("ecx",0x444D4163); # "cAMD"
- &setne (&LB("eax"));
- &or ("esi","eax"); # 0 indicates AMD CPU
- &jnz (&label("intel"));
-
- # AMD specific
- &mov ("eax",0x80000000);
- &cpuid ();
- &cmp ("eax",0x80000001);
- &jb (&label("intel"));
- &mov ("esi","eax");
- &mov ("eax",0x80000001);
- &cpuid ();
- &or ("ebp","ecx");
- &and ("ebp",1<<11|1); # isolate XOP bit
- &cmp ("esi",0x80000008);
- &jb (&label("intel"));
-
- &mov ("eax",0x80000008);
- &cpuid ();
- &movz ("esi",&LB("ecx")); # number of cores - 1
- &inc ("esi"); # number of cores
-
- &mov ("eax",1);
- &xor ("ecx","ecx");
- &cpuid ();
- &bt ("edx",28);
- &jnc (&label("generic"));
- &shr ("ebx",16);
- &and ("ebx",0xff);
- &cmp ("ebx","esi");
- &ja (&label("generic"));
- &and ("edx",0xefffffff); # clear hyper-threading bit
- &jmp (&label("generic"));
-
-&set_label("intel");
- &cmp ("edi",7);
- &jb (&label("cacheinfo"));
-
- &mov ("esi",&wparam(0));
- &mov ("eax",7);
- &xor ("ecx","ecx");
- &cpuid ();
- &mov (&DWP(8,"esi"),"ebx");
-
-&set_label("cacheinfo");
- &cmp ("edi",4);
- &mov ("edi",-1);
- &jb (&label("nocacheinfo"));
-
- &mov ("eax",4);
- &mov ("ecx",0); # query L1D
- &cpuid ();
- &mov ("edi","eax");
- &shr ("edi",14);
- &and ("edi",0xfff); # number of cores -1 per L1D
-
-&set_label("nocacheinfo");
- &mov ("eax",1);
- &xor ("ecx","ecx");
- &cpuid ();
- &and ("edx",0xbfefffff); # force reserved bits #20, #30 to 0
- &cmp ("ebp",0);
- &jne (&label("notintel"));
- &or ("edx",1<<30); # set reserved bit#30 on Intel CPUs
-&set_label("notintel");
- &bt ("edx",28); # test hyper-threading bit
- &jnc (&label("generic"));
- &and ("edx",0xefffffff);
- &cmp ("edi",0);
- &je (&label("generic"));
-
- &or ("edx",0x10000000);
- &shr ("ebx",16);
- &cmp (&LB("ebx"),1);
- &ja (&label("generic"));
- &and ("edx",0xefffffff); # clear hyper-threading bit if not
-
-&set_label("generic");
- &and ("ebp",1<<11); # isolate AMD XOP flag
- &and ("ecx",0xfffff7ff); # force 11th bit to 0
- &mov ("esi","edx");
- &or ("ebp","ecx"); # merge AMD XOP flag
-
- &bt ("ecx",27); # check OSXSAVE bit
- &jnc (&label("clear_avx"));
- &xor ("ecx","ecx");
- &data_byte(0x0f,0x01,0xd0); # xgetbv
- &and ("eax",6);
- &cmp ("eax",6);
- &je (&label("done"));
- &cmp ("eax",2);
- &je (&label("clear_avx"));
-&set_label("clear_xmm");
- &and ("ebp",0xfdfffffd); # clear AESNI and PCLMULQDQ bits
- &and ("esi",0xfeffffff); # clear FXSR
-&set_label("clear_avx");
- &and ("ebp",0xefffe7ff); # clear AVX, FMA and AMD XOP bits
- &mov ("edi",&wparam(0));
- &and (&DWP(8,"edi"),0xffffffdf); # clear AVX2
-&set_label("done");
- &mov ("eax","esi");
- &mov ("edx","ebp");
-&set_label("nocpuid");
-&function_end("OPENSSL_ia32_cpuid");
-
-&external_label("OPENSSL_ia32cap_P");
-
-&function_begin_B("OPENSSL_rdtsc","EXTRN\t_OPENSSL_ia32cap_P:DWORD");
- &xor ("eax","eax");
- &xor ("edx","edx");
- &picmeup("ecx","OPENSSL_ia32cap_P");
- &bt (&DWP(0,"ecx"),4);
- &jnc (&label("notsc"));
- &rdtsc ();
-&set_label("notsc");
- &ret ();
-&function_end_B("OPENSSL_rdtsc");
-
-# This works in Ring 0 only [read DJGPP+MS-DOS+privileged DPMI host],
-# but it's safe to call it on any [supported] 32-bit platform...
-# Just check for [non-]zero return value...
-&function_begin_B("OPENSSL_instrument_halt","EXTRN\t_OPENSSL_ia32cap_P:DWORD");
- &picmeup("ecx","OPENSSL_ia32cap_P");
- &bt (&DWP(0,"ecx"),4);
- &jnc (&label("nohalt")); # no TSC
-
- &data_word(0x9058900e); # push %cs; pop %eax
- &and ("eax",3);
- &jnz (&label("nohalt")); # not enough privileges
-
- &pushf ();
- &pop ("eax");
- &bt ("eax",9);
- &jnc (&label("nohalt")); # interrupts are disabled
-
- &rdtsc ();
- &push ("edx");
- &push ("eax");
- &halt ();
- &rdtsc ();
-
- &sub ("eax",&DWP(0,"esp"));
- &sbb ("edx",&DWP(4,"esp"));
- &add ("esp",8);
- &ret ();
-
-&set_label("nohalt");
- &xor ("eax","eax");
- &xor ("edx","edx");
- &ret ();
-&function_end_B("OPENSSL_instrument_halt");
-
-# Essentially there is only one use for this function. Under DJGPP:
-#
-# #include <go32.h>
-# ...
-# i=OPENSSL_far_spin(_dos_ds,0x46c);
-# ...
-# to obtain the number of spins till closest timer interrupt.
-
-&function_begin_B("OPENSSL_far_spin");
- &pushf ();
- &pop ("eax");
- &bt ("eax",9);
- &jnc (&label("nospin")); # interrupts are disabled
-
- &mov ("eax",&DWP(4,"esp"));
- &mov ("ecx",&DWP(8,"esp"));
- &data_word (0x90d88e1e); # push %ds, mov %eax,%ds
- &xor ("eax","eax");
- &mov ("edx",&DWP(0,"ecx"));
- &jmp (&label("spin"));
-
- &align (16);
-&set_label("spin");
- &inc ("eax");
- &cmp ("edx",&DWP(0,"ecx"));
- &je (&label("spin"));
-
- &data_word (0x1f909090); # pop %ds
- &ret ();
-
-&set_label("nospin");
- &xor ("eax","eax");
- &xor ("edx","edx");
- &ret ();
-&function_end_B("OPENSSL_far_spin");
-
-&function_begin_B("OPENSSL_wipe_cpu","EXTRN\t_OPENSSL_ia32cap_P:DWORD");
- &xor ("eax","eax");
- &xor ("edx","edx");
- &picmeup("ecx","OPENSSL_ia32cap_P");
- &mov ("ecx",&DWP(0,"ecx"));
- &bt (&DWP(0,"ecx"),1);
- &jnc (&label("no_x87"));
- if ($sse2) {
- &and ("ecx",1<<26|1<<24); # check SSE2 and FXSR bits
- &cmp ("ecx",1<<26|1<<24);
- &jne (&label("no_sse2"));
- &pxor ("xmm0","xmm0");
- &pxor ("xmm1","xmm1");
- &pxor ("xmm2","xmm2");
- &pxor ("xmm3","xmm3");
- &pxor ("xmm4","xmm4");
- &pxor ("xmm5","xmm5");
- &pxor ("xmm6","xmm6");
- &pxor ("xmm7","xmm7");
- &set_label("no_sse2");
- }
- # just a bunch of fldz to zap the fp/mm bank followed by finit...
- &data_word(0xeed9eed9,0xeed9eed9,0xeed9eed9,0xeed9eed9,0x90e3db9b);
-&set_label("no_x87");
- &lea ("eax",&DWP(4,"esp"));
- &ret ();
-&function_end_B("OPENSSL_wipe_cpu");
-
-&function_begin_B("OPENSSL_atomic_add");
- &mov ("edx",&DWP(4,"esp")); # fetch the pointer, 1st arg
- &mov ("ecx",&DWP(8,"esp")); # fetch the increment, 2nd arg
- &push ("ebx");
- &nop ();
- &mov ("eax",&DWP(0,"edx"));
-&set_label("spin");
- &lea ("ebx",&DWP(0,"eax","ecx"));
- &nop ();
- &data_word(0x1ab10ff0); # lock; cmpxchg %ebx,(%edx) # %eax is envolved and is always reloaded
- &jne (&label("spin"));
- &mov ("eax","ebx"); # OpenSSL expects the new value
- &pop ("ebx");
- &ret ();
-&function_end_B("OPENSSL_atomic_add");
-
-# This function can become handy under Win32 in situations when
-# we don't know which calling convention, __stdcall or __cdecl(*),
-# indirect callee is using. In C it can be deployed as
-#
-#ifdef OPENSSL_CPUID_OBJ
-# type OPENSSL_indirect_call(void *f,...);
-# ...
-# OPENSSL_indirect_call(func,[up to $max arguments]);
-#endif
-#
-# (*) it's designed to work even for __fastcall if number of
-# arguments is 1 or 2!
-&function_begin_B("OPENSSL_indirect_call");
- {
- my ($max,$i)=(7,); # $max has to be chosen as 4*n-1
- # in order to preserve eventual
- # stack alignment
- &push ("ebp");
- &mov ("ebp","esp");
- &sub ("esp",$max*4);
- &mov ("ecx",&DWP(12,"ebp"));
- &mov (&DWP(0,"esp"),"ecx");
- &mov ("edx",&DWP(16,"ebp"));
- &mov (&DWP(4,"esp"),"edx");
- for($i=2;$i<$max;$i++)
- {
- # Some copies will be redundant/bogus...
- &mov ("eax",&DWP(12+$i*4,"ebp"));
- &mov (&DWP(0+$i*4,"esp"),"eax");
- }
- &call_ptr (&DWP(8,"ebp"));# make the call...
- &mov ("esp","ebp"); # ... and just restore the stack pointer
- # without paying attention to what we called,
- # (__cdecl *func) or (__stdcall *one).
- &pop ("ebp");
- &ret ();
- }
-&function_end_B("OPENSSL_indirect_call");
-
-&function_begin_B("OPENSSL_ia32_rdrand");
- &mov ("ecx",8);
-&set_label("loop");
- &rdrand ("eax");
- &jc (&label("break"));
- &loop (&label("loop"));
-&set_label("break");
- &cmp ("eax",0);
- &cmove ("eax","ecx");
- &ret ();
-&function_end_B("OPENSSL_ia32_rdrand");
-
-&hidden("OPENSSL_ia32cap_P");
-
-&asm_finish();
diff --git a/crypto/cpu-x86_64-asm.pl b/crypto/cpu-x86_64-asm.pl
deleted file mode 100644
index 53d4774..0000000
--- a/crypto/cpu-x86_64-asm.pl
+++ /dev/null
@@ -1,162 +0,0 @@
-#!/usr/bin/env perl
-
-$flavour = shift;
-$output = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-
-$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
-( $xlate="${dir}perlasm/x86_64-xlate.pl" and -f $xlate) or
-die "can't locate x86_64-xlate.pl";
-
-open OUT,"| \"$^X\" $xlate $flavour $output";
-*STDOUT=*OUT;
-
-($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
- ("%rdi","%rsi","%rdx","%rcx"); # Unix order
-
-print<<___;
-.text
-
-.globl OPENSSL_ia32_cpuid
-.type OPENSSL_ia32_cpuid,\@function,1
-.align 16
-OPENSSL_ia32_cpuid:
- # On Windows, $arg1 is rcx, but that will be clobbered. So make Windows
- # use the same register as Unix.
- mov $arg1,%rdi
- mov %rbx,%r8 # save %rbx
-
- xor %eax,%eax
- mov %eax,8(%rdi) # clear 3rd word
- cpuid
- mov %eax,%r11d # max value for standard query level
-
- xor %eax,%eax
- cmp \$0x756e6547,%ebx # "Genu"
- setne %al
- mov %eax,%r9d
- cmp \$0x49656e69,%edx # "ineI"
- setne %al
- or %eax,%r9d
- cmp \$0x6c65746e,%ecx # "ntel"
- setne %al
- or %eax,%r9d # 0 indicates Intel CPU
- jz .Lintel
-
- cmp \$0x68747541,%ebx # "Auth"
- setne %al
- mov %eax,%r10d
- cmp \$0x69746E65,%edx # "enti"
- setne %al
- or %eax,%r10d
- cmp \$0x444D4163,%ecx # "cAMD"
- setne %al
- or %eax,%r10d # 0 indicates AMD CPU
- jnz .Lintel
-
- # AMD specific
- # See http://developer.amd.com/wordpress/media/2012/10/254811.pdf (1)
-
- mov \$0x80000000,%eax
- cpuid
- # Returns "The largest CPUID extended function input value supported by
- # the processor implementation." in EAX.
- cmp \$0x80000001,%eax
- jb .Lintel
- mov %eax,%r10d
- mov \$0x80000001,%eax
- cpuid
- # Returns feature bits in ECX. See page 20 of [1].
- or %ecx,%r9d
- and \$0x00000801,%r9d # isolate AMD XOP bit, 1<<11
-
- cmp \$0x80000008,%r10d
- jb .Lintel
-
- mov \$0x80000008,%eax
- cpuid
- # Returns APIC ID and number of cores in ECX. See page 27 of [1].
- movzb %cl,%r10 # number of cores - 1
- inc %r10 # number of cores
-
- mov \$1,%eax
- cpuid
- # See page 13 of [1].
- bt \$28,%edx # test hyper-threading bit
- jnc .Lgeneric
- shr \$16,%ebx # number of logical processors
- cmp %r10b,%bl
- ja .Lgeneric
- and \$0xefffffff,%edx # Clear hyper-threading bit.
- jmp .Lgeneric
-
-.Lintel:
- cmp \$4,%r11d
- mov \$-1,%r10d
- jb .Lnocacheinfo
-
- mov \$4,%eax
- mov \$0,%ecx # query L1D
- cpuid
- mov %eax,%r10d
- shr \$14,%r10d
- and \$0xfff,%r10d # number of cores -1 per L1D
-
- cmp \$7,%r11d
- jb .Lnocacheinfo
-
- mov \$7,%eax
- xor %ecx,%ecx
- cpuid
- mov %ebx,8(%rdi)
-
-.Lnocacheinfo:
- mov \$1,%eax
- cpuid
- # Gets feature information. See table 3-21 in the Intel manual.
- and \$0xbfefffff,%edx # force reserved bits to 0
- cmp \$0,%r9d
- jne .Lnotintel
- or \$0x40000000,%edx # set reserved bit#30 on Intel CPUs
-.Lnotintel:
- bt \$28,%edx # test hyper-threading bit
- jnc .Lgeneric
- and \$0xefffffff,%edx # ~(1<<28) - clear hyper-threading.
- cmp \$0,%r10d
- je .Lgeneric
-
- or \$0x10000000,%edx # 1<<28
- shr \$16,%ebx
- cmp \$1,%bl # see if cache is shared
- ja .Lgeneric
- and \$0xefffffff,%edx # ~(1<<28)
-.Lgeneric:
- and \$0x00000800,%r9d # isolate AMD XOP flag
- and \$0xfffff7ff,%ecx
- or %ecx,%r9d # merge AMD XOP flag
-
- mov %edx,%r10d # %r9d:%r10d is copy of %ecx:%edx
- bt \$27,%r9d # check OSXSAVE bit
- jnc .Lclear_avx
- xor %ecx,%ecx # XCR0
- .byte 0x0f,0x01,0xd0 # xgetbv
- and \$6,%eax # isolate XMM and YMM state support
- cmp \$6,%eax
- je .Ldone
-.Lclear_avx:
- mov \$0xefffe7ff,%eax # ~(1<<28|1<<12|1<<11)
- and %eax,%r9d # clear AVX, FMA and AMD XOP bits
- andl \$0xffffffdf,8(%rdi) # cleax AVX2, ~(1<<5)
-.Ldone:
- movl %r9d,4(%rdi)
- movl %r10d,0(%rdi)
- mov %r8,%rbx # restore %rbx
- ret
-.size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
-
-___
-
-close STDOUT; # flush
diff --git a/include/openssl/cpu.h b/include/openssl/cpu.h
index 08dbae0..981d246 100644
--- a/include/openssl/cpu.h
+++ b/include/openssl/cpu.h
@@ -89,8 +89,7 @@
* Index 3 is set to zero.
*
* Note: the CPUID bits are pre-adjusted for the OSXSAVE bit and the YMM and XMM
- * bits in XCR0, so it is not necessary to check those. However, instructions
- * requiring XMM support are not preadjusted for the FXSR bit. */
+ * bits in XCR0, so it is not necessary to check those. */
extern uint32_t OPENSSL_ia32cap_P[4];
#endif