Implement ABI testing for aarch64.

This caught a bug in bn_mul_mont. Tested manually on iOS and Android.

Change-Id: I1819fcd9ad34dbe3ba92bba952507d86dd12185a
Reviewed-on: https://boringssl-review.googlesource.com/c/34805
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/CMakeLists.txt b/crypto/CMakeLists.txt
index 2b9479b..5cdfa40 100644
--- a/crypto/CMakeLists.txt
+++ b/crypto/CMakeLists.txt
@@ -114,6 +114,7 @@
     CRYPTO_ARCH_SOURCES
 
     chacha/chacha-armv8.${ASM_EXT}
+    test/trampoline-armv8.${ASM_EXT}
   )
 endif()
 
@@ -145,6 +146,7 @@
 perlasm(cipher_extra/aes128gcmsiv-x86_64.${ASM_EXT} cipher_extra/asm/aes128gcmsiv-x86_64.pl)
 perlasm(cipher_extra/chacha20_poly1305_x86_64.${ASM_EXT} cipher_extra/asm/chacha20_poly1305_x86_64.pl)
 perlasm(test/trampoline-armv4.${ASM_EXT} test/asm/trampoline-armv4.pl)
+perlasm(test/trampoline-armv8.${ASM_EXT} test/asm/trampoline-armv8.pl)
 perlasm(test/trampoline-x86.${ASM_EXT} test/asm/trampoline-x86.pl)
 perlasm(test/trampoline-x86_64.${ASM_EXT} test/asm/trampoline-x86_64.pl)
 
diff --git a/crypto/abi_self_test.cc b/crypto/abi_self_test.cc
index d47f37c..0ea7b32 100644
--- a/crypto/abi_self_test.cc
+++ b/crypto/abi_self_test.cc
@@ -22,9 +22,9 @@
 
 static bool test_function_ok;
 static int TestFunction(int a1, int a2, int a3, int a4, int a5, int a6, int a7,
-                        int a8, int a9, int a10) {
-  test_function_ok = a1 == 1 || a2 == 2 || a3 == 3 || a4 == 4 || a5 == 5 ||
-                     a6 == 6 || a7 == 7 || a8 == 8 || a9 == 9 || a10 == 10;
+                        int a8) {
+  test_function_ok = a1 == 1 && a2 == 2 && a3 == 3 && a4 == 4 && a5 == 5 &&
+                     a6 == 6 && a7 == 7 && a8 == 8;  // all must match
   return 42;
 }
 
@@ -32,17 +32,17 @@
   EXPECT_NE(0, CHECK_ABI_NO_UNWIND(strcmp, "hello", "world"));
 
   test_function_ok = false;
-  EXPECT_EQ(42, CHECK_ABI_SEH(TestFunction, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10));
+  EXPECT_EQ(42, CHECK_ABI_SEH(TestFunction, 1, 2, 3, 4, 5, 6, 7, 8));
   EXPECT_TRUE(test_function_ok);
 
 #if defined(SUPPORTS_ABI_TEST)
   abi_test::internal::CallerState state;
   RAND_bytes(reinterpret_cast<uint8_t *>(&state), sizeof(state));
   crypto_word_t argv[] = {
-      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+      1, 2, 3, 4, 5, 6, 7, 8,
   };
   CHECK_ABI_SEH(abi_test_trampoline,
-                reinterpret_cast<crypto_word_t>(TestFunction), &state, argv, 10,
+                reinterpret_cast<crypto_word_t>(TestFunction), &state, argv, 8,
                 0 /* no breakpoint */);
 
 #if defined(OPENSSL_X86_64)
@@ -340,3 +340,190 @@
                           "d15 was not restored after return");
 }
 #endif   // OPENSSL_ARM && SUPPORTS_ABI_TEST
+
+
+#if defined(OPENSSL_AARCH64) && defined(SUPPORTS_ABI_TEST)
+// Assembly helpers generated from test/asm/trampoline-armv8.pl. Each zeros the
+// named register (or the upper half of the named vector register) and returns.
+extern "C" {
+void abi_test_clobber_x0(void);
+void abi_test_clobber_x1(void);
+void abi_test_clobber_x2(void);
+void abi_test_clobber_x3(void);
+void abi_test_clobber_x4(void);
+void abi_test_clobber_x5(void);
+void abi_test_clobber_x6(void);
+void abi_test_clobber_x7(void);
+void abi_test_clobber_x8(void);
+void abi_test_clobber_x9(void);
+void abi_test_clobber_x10(void);
+void abi_test_clobber_x11(void);
+void abi_test_clobber_x12(void);
+void abi_test_clobber_x13(void);
+void abi_test_clobber_x14(void);
+void abi_test_clobber_x15(void);
+void abi_test_clobber_x16(void);
+void abi_test_clobber_x17(void);
+// x18 is the platform register and off limits.
+void abi_test_clobber_x19(void);
+void abi_test_clobber_x20(void);
+void abi_test_clobber_x21(void);
+void abi_test_clobber_x22(void);
+void abi_test_clobber_x23(void);
+void abi_test_clobber_x24(void);
+void abi_test_clobber_x25(void);
+void abi_test_clobber_x26(void);
+void abi_test_clobber_x27(void);
+void abi_test_clobber_x28(void);
+void abi_test_clobber_x29(void);
+
+void abi_test_clobber_d0(void);
+void abi_test_clobber_d1(void);
+void abi_test_clobber_d2(void);
+void abi_test_clobber_d3(void);
+void abi_test_clobber_d4(void);
+void abi_test_clobber_d5(void);
+void abi_test_clobber_d6(void);
+void abi_test_clobber_d7(void);
+void abi_test_clobber_d8(void);
+void abi_test_clobber_d9(void);
+void abi_test_clobber_d10(void);
+void abi_test_clobber_d11(void);
+void abi_test_clobber_d12(void);
+void abi_test_clobber_d13(void);
+void abi_test_clobber_d14(void);
+void abi_test_clobber_d15(void);
+void abi_test_clobber_d16(void);
+void abi_test_clobber_d17(void);
+void abi_test_clobber_d18(void);
+void abi_test_clobber_d19(void);
+void abi_test_clobber_d20(void);
+void abi_test_clobber_d21(void);
+void abi_test_clobber_d22(void);
+void abi_test_clobber_d23(void);
+void abi_test_clobber_d24(void);
+void abi_test_clobber_d25(void);
+void abi_test_clobber_d26(void);
+void abi_test_clobber_d27(void);
+void abi_test_clobber_d28(void);
+void abi_test_clobber_d29(void);
+void abi_test_clobber_d30(void);
+void abi_test_clobber_d31(void);
+
+void abi_test_clobber_v8_upper(void);
+void abi_test_clobber_v9_upper(void);
+void abi_test_clobber_v10_upper(void);
+void abi_test_clobber_v11_upper(void);
+void abi_test_clobber_v12_upper(void);
+void abi_test_clobber_v13_upper(void);
+void abi_test_clobber_v14_upper(void);
+void abi_test_clobber_v15_upper(void);
+}  // extern "C"
+
+TEST(ABITest, AArch64) {
+  // abi_test_trampoline hides unsaved registers from the caller, so we can
+  // safely call the abi_test_clobber_* functions below.
+  abi_test::internal::CallerState state;
+  RAND_bytes(reinterpret_cast<uint8_t *>(&state), sizeof(state));
+  CHECK_ABI_NO_UNWIND(abi_test_trampoline,
+                      reinterpret_cast<crypto_word_t>(abi_test_clobber_x19),
+                      &state, nullptr, 0, 0 /* no breakpoint */);
+
+  // x0-x17 are not callee-saved, so clobbering them must not be flagged.
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_x0);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_x1);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_x2);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_x3);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_x4);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_x5);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_x6);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_x7);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_x8);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_x9);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_x10);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_x11);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_x12);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_x13);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_x14);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_x15);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_x16);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_x17);
+
+  // x19-x29 are callee-saved, so clobbering them must be reported.
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_x19),
+                          "x19 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_x20),
+                          "x20 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_x21),
+                          "x21 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_x22),
+                          "x22 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_x23),
+                          "x23 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_x24),
+                          "x24 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_x25),
+                          "x25 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_x26),
+                          "x26 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_x27),
+                          "x27 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_x28),
+                          "x28 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_x29),
+                          "x29 was not restored after return");
+
+  // Of the floating-point registers, only d8-d15 are callee-saved.
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d0);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d1);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d2);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d3);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d4);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d5);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d6);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d7);
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_d8),
+                          "d8 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_d9),
+                          "d9 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_d10),
+                          "d10 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_d11),
+                          "d11 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_d12),
+                          "d12 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_d13),
+                          "d13 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_d14),
+                          "d14 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_d15),
+                          "d15 was not restored after return");
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d16);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d17);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d18);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d19);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d20);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d21);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d22);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d23);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d24);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d25);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d26);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d27);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d28);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d29);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d30);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d31);
+
+  // The lower halves of v8-v15 (accessed as d8-d15) must be preserved, but not
+  // the upper halves.
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_v8_upper);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_v9_upper);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_v10_upper);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_v11_upper);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_v12_upper);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_v13_upper);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_v14_upper);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_v15_upper);
+}
+#endif   // OPENSSL_AARCH64 && SUPPORTS_ABI_TEST
diff --git a/crypto/test/abi_test.h b/crypto/test/abi_test.h
index e04b26c..0d685ed 100644
--- a/crypto/test/abi_test.h
+++ b/crypto/test/abi_test.h
@@ -55,12 +55,15 @@
 };
 
 // LOOP_CALLER_STATE_REGISTERS is a macro that iterates over all registers the
-// callee is expected to save for the caller.
-//
-// TODO(davidben): Add support for other architectures.
+// callee is expected to save for the caller, with the exception of the stack
+// pointer. The stack pointer is tested implicitly by the function successfully
+// returning at all.
 #if defined(OPENSSL_X86_64)
+
+// References:
+// SysV64: https://github.com/hjl-tools/x86-psABI/wiki/x86-64-psABI-1.0.pdf
+// Win64: https://docs.microsoft.com/en-us/cpp/build/x64-software-conventions?view=vs-2017#register-usage
 #if defined(OPENSSL_WINDOWS)
-// See https://docs.microsoft.com/en-us/cpp/build/x64-software-conventions?view=vs-2017#register-usage
 #define LOOP_CALLER_STATE_REGISTERS()  \
   CALLER_STATE_REGISTER(uint64_t, rbx) \
   CALLER_STATE_REGISTER(uint64_t, rbp) \
@@ -81,7 +84,6 @@
   CALLER_STATE_REGISTER(Reg128, xmm14) \
   CALLER_STATE_REGISTER(Reg128, xmm15)
 #else
-// See https://github.com/hjl-tools/x86-psABI/wiki/x86-64-psABI-1.0.pdf
 #define LOOP_CALLER_STATE_REGISTERS()  \
   CALLER_STATE_REGISTER(uint64_t, rbx) \
   CALLER_STATE_REGISTER(uint64_t, rbp) \
@@ -90,19 +92,31 @@
   CALLER_STATE_REGISTER(uint64_t, r14) \
   CALLER_STATE_REGISTER(uint64_t, r15)
 #endif  // OPENSSL_WINDOWS
+
 #elif defined(OPENSSL_X86)
-// See https://uclibc.org/docs/psABI-i386.pdf and
-// https://docs.microsoft.com/en-us/cpp/cpp/argument-passing-and-naming-conventions?view=vs-2017
+
+// References:
+// SysV32: https://uclibc.org/docs/psABI-i386.pdf
+// Win32: https://docs.microsoft.com/en-us/cpp/cpp/argument-passing-and-naming-conventions?view=vs-2017
 #define LOOP_CALLER_STATE_REGISTERS()  \
   CALLER_STATE_REGISTER(uint32_t, esi) \
   CALLER_STATE_REGISTER(uint32_t, edi) \
   CALLER_STATE_REGISTER(uint32_t, ebx) \
   CALLER_STATE_REGISTER(uint32_t, ebp)
+
 #elif defined(OPENSSL_ARM)
-// Unlike x86, ARM has a common ABI across all platforms, described in
-// http://infocenter.arm.com/help/topic/com.arm.doc.ihi0042f/IHI0042F_aapcs.pdf
-// It almost specifies the callee-saved registers, except r9 is left to the
-// platform. Android and iOS differ in handling of r9.
+
+// References:
+// AAPCS: http://infocenter.arm.com/help/topic/com.arm.doc.ihi0042f/IHI0042F_aapcs.pdf
+// iOS32: https://developer.apple.com/library/archive/documentation/Xcode/Conceptual/iPhoneOSABIReference/Articles/ARMv6FunctionCallingConventions.html
+//
+// ARM specifies a common calling convention, except r9 is left to the
+// platform. Linux and iOS differ in handling of r9. iOS's behavior is defined
+// below. We found no clear reference for Linux but observed behavior from
+// LLVM. iOS 3+ treats r9 as caller-saved, while Linux treats it as
+// callee-saved. Most of our assembly treats it as callee-saved to be uniform,
+// but we match the platform to avoid false positives when testing
+// compiler-generated output.
 #define LOOP_CALLER_STATE_REGISTERS_PRE_R9() \
   CALLER_STATE_REGISTER(uint64_t, d8)        \
   CALLER_STATE_REGISTER(uint64_t, d9)        \
@@ -121,24 +135,53 @@
   CALLER_STATE_REGISTER(uint32_t, r10)        \
   CALLER_STATE_REGISTER(uint32_t, r11)
 #if defined(OPENSSL_APPLE)
-// Starting iOS 3, r9 is treated as a caller-saved register. Before that, it
-// could not be used at all. Most of our assembly treats it as callee-saved
-// anyway to be uniform, but we match the platform to avoid false positives when
-// testing compiler-generated output.
-//
-// https://developer.apple.com/library/archive/documentation/Xcode/Conceptual/iPhoneOSABIReference/Articles/ARMv6FunctionCallingConventions.html
-#define LOOP_CALLER_STATE_REGISTERS() \
+#define LOOP_CALLER_STATE_REGISTERS()  \
   LOOP_CALLER_STATE_REGISTERS_PRE_R9() \
   LOOP_CALLER_STATE_REGISTERS_POST_R9()
-#else
-// We found no clear reference which defines Linux's use of r9, but LLVM treats
-// r9 as callee-saved on non-Apple ARM platforms.
-#define LOOP_CALLER_STATE_REGISTERS() \
+#else  // !OPENSSL_APPLE
+#define LOOP_CALLER_STATE_REGISTERS()  \
   LOOP_CALLER_STATE_REGISTERS_PRE_R9() \
-  CALLER_STATE_REGISTER(uint32_t, r9) \
+  CALLER_STATE_REGISTER(uint32_t, r9)  \
   LOOP_CALLER_STATE_REGISTERS_POST_R9()
 #endif  // OPENSSL_APPLE
-#endif  // X86_64 || X86 || ARM
+
+#elif defined(OPENSSL_AARCH64)
+
+// References:
+// AAPCS64: http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf
+// iOS64: https://developer.apple.com/library/archive/documentation/Xcode/Conceptual/iPhoneOSABIReference/Articles/ARM64FunctionCallingConventions.html
+//
+// In aarch64, r18 (x18 in a 64-bit context) is the platform register. iOS says
+// user code may not touch it. We found no clear reference for Linux. The iOS
+// behavior implies portable assembly cannot use it, and aarch64 has many
+// registers. Thus this framework ignores the register's existence. We can test
+// r18 violations with grep.
+#define LOOP_CALLER_STATE_REGISTERS()                                \
+  /* Per AAPCS64, section 5.1.2, only the bottom 64 bits of v8-v15 */ \
+  /* are preserved. These are accessed as dN. */                     \
+  CALLER_STATE_REGISTER(uint64_t, d8)                                \
+  CALLER_STATE_REGISTER(uint64_t, d9)                                \
+  CALLER_STATE_REGISTER(uint64_t, d10)                               \
+  CALLER_STATE_REGISTER(uint64_t, d11)                               \
+  CALLER_STATE_REGISTER(uint64_t, d12)                               \
+  CALLER_STATE_REGISTER(uint64_t, d13)                               \
+  CALLER_STATE_REGISTER(uint64_t, d14)                               \
+  CALLER_STATE_REGISTER(uint64_t, d15)                               \
+  /* For consistency with dN, use the 64-bit name xN, rather than */ \
+  /* the generic rN. */                                              \
+  CALLER_STATE_REGISTER(uint64_t, x19)                               \
+  CALLER_STATE_REGISTER(uint64_t, x20)                               \
+  CALLER_STATE_REGISTER(uint64_t, x21)                               \
+  CALLER_STATE_REGISTER(uint64_t, x22)                               \
+  CALLER_STATE_REGISTER(uint64_t, x23)                               \
+  CALLER_STATE_REGISTER(uint64_t, x24)                               \
+  CALLER_STATE_REGISTER(uint64_t, x25)                               \
+  CALLER_STATE_REGISTER(uint64_t, x26)                               \
+  CALLER_STATE_REGISTER(uint64_t, x27)                               \
+  CALLER_STATE_REGISTER(uint64_t, x28)                               \
+  CALLER_STATE_REGISTER(uint64_t, x29)
+
+#endif  // X86_64 || X86 || ARM || AARCH64
 
 // Enable ABI testing if all of the following are true.
 //
@@ -169,16 +212,28 @@
 
 template <typename T>
 inline crypto_word_t ToWord(T t) {
+#if !defined(OPENSSL_X86) && !defined(OPENSSL_X86_64) && \
+    !defined(OPENSSL_ARM) && !defined(OPENSSL_AARCH64)
+#error "Unknown architecture"
+#endif
   static_assert(sizeof(T) <= sizeof(crypto_word_t),
                 "T is larger than crypto_word_t");
-  // Functions declared to take arguments smaller than native words cannot
-  // assume anything about the unused bits.
+  static_assert(sizeof(T) >= 4, "types under four bytes are complicated");
+
+  // ABIs are complex around arguments that are smaller than native words. For
+  // 32-bit architectures, the rules above imply we only have word-sized
+  // arguments. For 64-bit architectures, we still have assembly functions which
+  // take |int|.
   //
-  // TODO(davidben): Find authoritative citations for all supported assembly
-  // architectures. This is based on observed behavior in Clang, GCC, and MSVC
-  // for x86_64. The results are complex.
+  // For aarch64, AAPCS64, section 5.4.2, clauses C.7 and C.14 say any
+  // remaining bits are unspecified. iOS64 contradicts this and says the callee
+  // extends arguments up to 32 bits, and only the upper 32 bits are
+  // unspecified. Rejecting parameters smaller than 32 bits avoids the
+  // divergence.
   //
-  // ABI rules here may be inferred from two kinds of experiments:
+  // TODO(davidben): Find authoritative citations for x86_64. For x86_64, I
+  // observed the behavior of Clang, GCC, and MSVC. ABI rules here may be
+  // inferred from two kinds of experiments:
   //
   // 1. When passing a value to a small-argument-taking function, does the
   //    compiler ensure unused bits are cleared, sign-extended, etc.? Tests for
@@ -188,28 +243,21 @@
   // 2. When compiling a small-argument-taking function, does the compiler make
   //    assumptions about unused bits of arguments?
   //
-  // Stack parameters are straightforward. As both caller and callee, all
-  // compilers consistently use the minimally-sized read and write. Both SysV
-  // and Windows ABIs tolerate and produce arbitrary values for unused stack
-  // parameter bits.
+  // MSVC for x86_64 is straightforward. It appears to tolerate and produce
+  // arbitrary values for unused bits, like AAPCS64.
   //
-  // MSVC also appears to tolerate and produce arbitrary values for unused
-  // register parameter bits. The SysV ABI is messier. GCC and Clang tolerate
-  // and produce arbitrary values for the upper 32 bits of each register, but
-  // types smaller than |int| are promoted before passing to a register. (Zero
-  // or sign extension depends on signedness of the type.) When compiling a
-  // callee, Clang takes advantage of this conversion, but I was unable to make
-  // GCC do so.
+  // GCC and Clang for x86_64 are more complex. They match MSVC for stack
+  // parameters. However, for register parameters, they behave like iOS64 and,
+  // as callers, extend up to 32 bits, leaving the remainder arbitrary. When
+  // compiling a callee, Clang takes advantage of this conversion, but I was
+  // unable to make GCC do so.
   //
   // Note that, although the Win64 rules are sufficient to require our assembly
   // be conservative, we wish for |CHECK_ABI| to support C-compiled functions,
   // so it must enforce the correct rules for each platform.
   //
-  // This is all a mess so, for now, do not support parameter types smaller than
-  // |int| in |CHECK_ABI|. In practice, assembly functions only use 4- and
-  // 8-byte values. (And, given this behavior, we should avoid parameters
-  // smaller than native words in all new code.)
-  static_assert(sizeof(T) >= 4, "types under four bytes are complicated");
+  // Fortunately, the |static_assert|s above cause all supported architectures
+  // to behave the same.
   crypto_word_t ret;
   // Filling extra bits with 0xaa will be vastly out of bounds for code
   // expecting either sign- or zero-extension. (0xaa is 0b10101010.)
@@ -230,7 +278,12 @@
 template <typename R, typename... Args>
 inline crypto_word_t CheckImpl(Result *out, bool unwind, R (*func)(Args...),
                                typename DeductionGuard<Args>::Type... args) {
-  static_assert(sizeof...(args) <= 10,
+  // We only support up to 8 arguments. This ensures all arguments on aarch64
+  // are passed in registers and avoids the iOS discrepancy around packing small
+  // arguments on the stack.
+  //
+  // https://developer.apple.com/library/archive/documentation/Xcode/Conceptual/iPhoneOSABIReference/Articles/ARM64FunctionCallingConventions.html
+  static_assert(sizeof...(args) <= 8,
                 "too many arguments for abi_test_trampoline");
 
   // Allocate one extra entry so MSVC does not complain about zero-size arrays.
diff --git a/crypto/test/asm/trampoline-armv8.pl b/crypto/test/asm/trampoline-armv8.pl
new file mode 100755
index 0000000..aab5250
--- /dev/null
+++ b/crypto/test/asm/trampoline-armv8.pl
@@ -0,0 +1,209 @@
+#!/usr/bin/env perl
+# Copyright (c) 2019, Google Inc.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+# This file defines helper functions for crypto/test/abi_test.h on aarch64. See
+# that header for details on how to use this.
+#
+# For convenience, this file is linked into libcrypto, where consuming builds
+# already support architecture-specific sources. The static linker should drop
+# this code in non-test binaries. This includes a shared library build of
+# libcrypto, provided --gc-sections (ELF), -dead_strip (iOS), or equivalent is
+# used.
+#
+# References:
+#
+# AAPCS64: http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf
+# iOS ARM64: https://developer.apple.com/library/archive/documentation/Xcode/Conceptual/iPhoneOSABIReference/Articles/ARM64FunctionCallingConventions.html
+
+use strict;
+
+my $flavour = shift;
+my $output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/;
+my $dir = $1;
+my $xlate;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
+*STDOUT = *OUT;  # everything printed below is piped through arm-xlate.pl
+
+my ($func, $state, $argv, $argc) = ("x0", "x1", "x2", "x3");
+my $code = <<____;
+.text
+
+// abi_test_trampoline loads callee-saved registers from |state|, calls |func|
+// with |argv|, then saves the callee-saved registers into |state|. It returns
+// the result of |func|. The |unwind| argument is unused.
+// uint64_t abi_test_trampoline(void (*func)(...), CallerState *state,
+//                              const uint64_t *argv, size_t argc,
+//                              uint64_t unwind);
+.type	abi_test_trampoline, %function
+.globl	abi_test_trampoline
+.align	4
+abi_test_trampoline:
+.Labi_test_trampoline_begin:
+	// Stack layout (low to high addresses)
+	//   x29,x30 (16 bytes)
+	//    d8-d15 (64 bytes)
+	//   x19-x28 (80 bytes)
+	//    $state (8 bytes)
+	//   padding (8 bytes)
+	stp	x29, x30, [sp, #-176]!
+	mov	x29, sp
+
+	// Save callee-saved registers and |state|.
+	stp	d8, d9, [sp, #16]
+	stp	d10, d11, [sp, #32]
+	stp	d12, d13, [sp, #48]
+	stp	d14, d15, [sp, #64]
+	stp	x19, x20, [sp, #80]
+	stp	x21, x22, [sp, #96]
+	stp	x23, x24, [sp, #112]
+	stp	x25, x26, [sp, #128]
+	stp	x27, x28, [sp, #144]
+	str	$state, [sp, #160]
+
+	// Load registers from |state|, with the exception of x29. x29 is the
+	// frame pointer and also callee-saved, but AAPCS64 allows platforms to
+	// mandate that x29 always point to a frame. iOS64 does so, which means
+	// we cannot fill x29 with entropy without violating ABI rules
+	// ourselves. x29 is tested separately below.
+	ldp	d8, d9, [$state], #16
+	ldp	d10, d11, [$state], #16
+	ldp	d12, d13, [$state], #16
+	ldp	d14, d15, [$state], #16
+	ldp	x19, x20, [$state], #16
+	ldp	x21, x22, [$state], #16
+	ldp	x23, x24, [$state], #16
+	ldp	x25, x26, [$state], #16
+	ldp	x27, x28, [$state], #16
+
+	// Move parameters into temporary registers.
+	mov	x9, $func
+	mov	x10, $argv
+	mov	x11, $argc
+
+	// Load parameters into registers.
+	cbz	x11, .Largs_done
+	ldr	x0, [x10], #8
+	subs	x11, x11, #1
+	b.eq	.Largs_done
+	ldr	x1, [x10], #8
+	subs	x11, x11, #1
+	b.eq	.Largs_done
+	ldr	x2, [x10], #8
+	subs	x11, x11, #1
+	b.eq	.Largs_done
+	ldr	x3, [x10], #8
+	subs	x11, x11, #1
+	b.eq	.Largs_done
+	ldr	x4, [x10], #8
+	subs	x11, x11, #1
+	b.eq	.Largs_done
+	ldr	x5, [x10], #8
+	subs	x11, x11, #1
+	b.eq	.Largs_done
+	ldr	x6, [x10], #8
+	subs	x11, x11, #1
+	b.eq	.Largs_done
+	ldr	x7, [x10], #8
+
+.Largs_done:
+	blr	x9
+
+	// Reload |state| and store registers.
+	ldr	$state, [sp, #160]
+	stp	d8, d9, [$state], #16
+	stp	d10, d11, [$state], #16
+	stp	d12, d13, [$state], #16
+	stp	d14, d15, [$state], #16
+	stp	x19, x20, [$state], #16
+	stp	x21, x22, [$state], #16
+	stp	x23, x24, [$state], #16
+	stp	x25, x26, [$state], #16
+	stp	x27, x28, [$state], #16
+
+	// |func| is required to preserve x29, the frame pointer. We cannot load
+	// random values into x29 (see comment above), so compare it against the
+	// expected value and zero the field of |state| if corrupted.
+	mov	x9, sp
+	cmp	x29, x9
+	b.eq	.Lx29_ok
+	str	xzr, [$state]
+
+.Lx29_ok:
+	// Restore callee-saved registers.
+	ldp	d8, d9, [sp, #16]
+	ldp	d10, d11, [sp, #32]
+	ldp	d12, d13, [sp, #48]
+	ldp	d14, d15, [sp, #64]
+	ldp	x19, x20, [sp, #80]
+	ldp	x21, x22, [sp, #96]
+	ldp	x23, x24, [sp, #112]
+	ldp	x25, x26, [sp, #128]
+	ldp	x27, x28, [sp, #144]
+
+	ldp	x29, x30, [sp], #176
+	ret
+.size	abi_test_trampoline,.-abi_test_trampoline
+____
+
+# Emit abi_test_clobber_xN for each general-purpose register. Each function
+# zeros xN and returns; these are used to test the ABI-testing framework.
+# x18 is the platform register and off limits, so it is skipped.
+foreach my $n (grep { $_ != 18 } 0..29) {
+  my $name = "abi_test_clobber_x$n";
+  $code .= <<____;
+.type	$name, %function
+.globl	$name
+.align	4
+${name}:
+	mov	x$n, xzr
+	ret
+.size	$name,.-$name
+____
+}
+foreach my $reg (map { "d$_" } 0..31) {
+  $code .= <<____;
+.type	abi_test_clobber_$reg, %function
+.globl	abi_test_clobber_$reg
+.align	4
+abi_test_clobber_${reg}:
+	fmov	$reg, xzr
+	ret
+.size	abi_test_clobber_$reg,.-abi_test_clobber_$reg
+____
+}
+
+# AAPCS64 only requires the lower half (d*) of v8-v15 be preserved, so emit
+# abi_test_clobber_v*_upper, which zeros only the upper half of v*.
+foreach my $v (map { "v$_" } 8..15) {
+  $code .= <<____;
+.type	abi_test_clobber_${v}_upper, %function
+.globl	abi_test_clobber_${v}_upper
+.align	4
+abi_test_clobber_${v}_upper:
+	fmov	${v}.d[1], xzr
+	ret
+.size	abi_test_clobber_${v}_upper,.-abi_test_clobber_${v}_upper
+____
+}
+
+print $code;  # STDOUT is the pipe into arm-xlate.pl
+close STDOUT or die "error closing STDOUT: $!";