Add ASM optimizations for Windows on Arm

Windows on Arm (WoA) builds currently use the C implementations of the
various functions within BoringSSL. This patch adds CPU feature
detection for the NEON and hardware crypto optimizations, and updates
the perlasm translator (arm-xlate.pl) and the build files to generate
AArch64 .S files for WoA.

Note that the generated files use GNU assembler syntax (tested
specifically with the Clang assembler), not armasm syntax.

Change-Id: Id8841f4db0498ec16215095a4e6bd60d427cd54b
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/43304
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: David Benjamin <davidben@google.com>
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 144516d..f58e853 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -484,6 +484,8 @@
   set(ARCH "x86")
 elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64")
   set(ARCH "aarch64")
+elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "ARM64")
+  set(ARCH "aarch64")
 elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "arm64")
   set(ARCH "aarch64")
 # Apple A12 Bionic chipset which is added in iPhone XS/XS Max/XR uses arm64e architecture.
diff --git a/crypto/CMakeLists.txt b/crypto/CMakeLists.txt
index d23c02e..5d1e13e 100644
--- a/crypto/CMakeLists.txt
+++ b/crypto/CMakeLists.txt
@@ -47,17 +47,23 @@
       endforeach()
     endif()
   else()
-    if(${ARCH} STREQUAL "x86_64")
-      set(PERLASM_STYLE nasm)
+    if(${ARCH} STREQUAL "aarch64")
+      set(PERLASM_STYLE win64)
+      set(ASM_EXT S)
+      enable_language(ASM)
     else()
-      set(PERLASM_STYLE win32n)
-      set(PERLASM_FLAGS "-DOPENSSL_IA32_SSE2")
-    endif()
-    set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -gcv8")
+      if(${ARCH} STREQUAL "x86_64")
+        set(PERLASM_STYLE nasm)
+      else()
+        set(PERLASM_STYLE win32n)
+        set(PERLASM_FLAGS "-DOPENSSL_IA32_SSE2")
+      endif()
+      set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -gcv8")
 
-    # On Windows, we use the NASM output.
-    set(ASM_EXT asm)
-    enable_language(ASM_NASM)
+      # On Windows, we use the NASM output.
+      set(ASM_EXT asm)
+      enable_language(ASM_NASM)
+    endif()
   endif()
 endif()
 
@@ -258,6 +264,7 @@
   conf/conf.c
   cpu-aarch64-fuchsia.c
   cpu-aarch64-linux.c
+  cpu-aarch64-win.c
   cpu-arm-linux.c
   cpu-arm.c
   cpu-intel.c
diff --git a/crypto/cpu-aarch64-win.c b/crypto/cpu-aarch64-win.c
new file mode 100644
index 0000000..ee7f8e0
--- /dev/null
+++ b/crypto/cpu-aarch64-win.c
@@ -0,0 +1,41 @@
+/* Copyright (c) 2018, Google Inc.
+ * Copyright (c) 2020, Arm Ltd.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <openssl/cpu.h>
+
+#if defined(OPENSSL_AARCH64) && defined(OPENSSL_WINDOWS) && \
+    !defined(OPENSSL_STATIC_ARMCAP)
+
+#include <windows.h>
+
+#include <openssl/arm_arch.h>
+
+#include "internal.h"
+
+extern uint32_t OPENSSL_armcap_P;
+void OPENSSL_cpuid_setup(void) {
+  // No need to check for the presence of NEON: Armv8-A always has it.
+  OPENSSL_armcap_P |= ARMV7_NEON;
+
+  if (IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)) {
+    // Windows reports AES, PMULL, SHA-1 and SHA-256 support via this one flag.
+    OPENSSL_armcap_P |= ARMV8_AES;
+    OPENSSL_armcap_P |= ARMV8_PMULL;
+    OPENSSL_armcap_P |= ARMV8_SHA1;
+    OPENSSL_armcap_P |= ARMV8_SHA256;
+  }
+}
+
+#endif
diff --git a/crypto/fipsmodule/bn/div.c b/crypto/fipsmodule/bn/div.c
index 27b591c..333c770 100644
--- a/crypto/fipsmodule/bn/div.c
+++ b/crypto/fipsmodule/bn/div.c
@@ -64,10 +64,10 @@
 #include "internal.h"
 
 
-#if !defined(BN_CAN_DIVIDE_ULLONG) && !defined(BN_CAN_USE_INLINE_ASM)
 // bn_div_words divides a double-width |h|,|l| by |d| and returns the result,
 // which must fit in a |BN_ULONG|.
-static BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
+OPENSSL_UNUSED static BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l,
+                                            BN_ULONG d) {
   BN_ULONG dh, dl, q, ret = 0, th, tl, t;
   int i, count = 2;
 
@@ -135,7 +135,6 @@
   ret |= q;
   return ret;
 }
-#endif  // !defined(BN_CAN_DIVIDE_ULLONG) && !defined(BN_CAN_USE_INLINE_ASM)
 
 static inline void bn_div_rem_words(BN_ULONG *quotient_out, BN_ULONG *rem_out,
                                     BN_ULONG n0, BN_ULONG n1, BN_ULONG d0) {
diff --git a/crypto/perlasm/arm-xlate.pl b/crypto/perlasm/arm-xlate.pl
index 58904d0..576f682 100755
--- a/crypto/perlasm/arm-xlate.pl
+++ b/crypto/perlasm/arm-xlate.pl
@@ -22,6 +22,7 @@
 ################################################################
 my $arch = sub {
     if ($flavour =~ /linux/)	{ ".arch\t".join(',',@_); }
+    elsif ($flavour =~ /win64/) { ".arch\t".join(',',@_); }
     else			{ ""; }
 };
 my $fpu = sub {
@@ -30,6 +31,7 @@
 };
 my $hidden = sub {
     if ($flavour =~ /ios/)	{ ".private_extern\t".join(',',@_); }
+    elsif ($flavour =~ /win64/) { ""; }
     else			{ ".hidden\t".join(',',@_); }
 };
 my $comm = sub {
@@ -80,6 +82,15 @@
 					"#endif";
 				  }
 			        }
+    elsif ($flavour =~ /win64/) { if (join(',',@_) =~ /(\w+),%function/) {
+                # See https://sourceware.org/binutils/docs/as/Pseudo-Ops.html
+                # Per https://docs.microsoft.com/en-us/windows/win32/debug/pe-format#coff-symbol-table,
+                # the type for functions is 0x20, or 32.
+                ".def $1\n".
+                "   .type 32\n".
+                ".endef";
+            }
+        }
     else			{ ""; }
 };
 my $size = sub {
@@ -155,7 +166,7 @@
 ___
 
 print "#if defined(__arm__)\n" if ($flavour eq "linux32");
-print "#if defined(__aarch64__)\n" if ($flavour eq "linux64");
+print "#if defined(__aarch64__)\n" if ($flavour eq "linux64" || $flavour eq "win64");
 
 print "#if defined(BORINGSSL_PREFIX)\n";
 print "#include <boringssl_prefix_symbols_asm.h>\n";
@@ -228,7 +239,7 @@
     print "\n";
 }
 
-print "#endif\n" if ($flavour eq "linux32" || $flavour eq "linux64");
+print "#endif\n" if ($flavour eq "linux32" || $flavour eq "linux64" || $flavour eq "win64");
 print "#endif  // !OPENSSL_NO_ASM\n";
 
 # See https://www.airs.com/blog/archives/518.
diff --git a/util/generate_build_files.py b/util/generate_build_files.py
index 1cc8af3..853a73a 100644
--- a/util/generate_build_files.py
+++ b/util/generate_build_files.py
@@ -37,6 +37,7 @@
     ('mac', 'x86_64', 'macosx', [], 'S'),
     ('win', 'x86', 'win32n', ['-DOPENSSL_IA32_SSE2'], 'asm'),
     ('win', 'x86_64', 'nasm', [], 'asm'),
+    ('win', 'aarch64', 'win64', [], 'S'),
 ]
 
 # NON_PERL_FILES enumerates assembly files that are not processed by the