Implement ABI testing for ARM.

Update-Note: There's some chance this'll break iOS since I was unable to
test it there. The iPad I have to test on is too new to run 32-bit code
at all.

Change-Id: I6593f91b67a5e8a82828237d3b69ed948b07922d
Reviewed-on: https://boringssl-review.googlesource.com/c/34725
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/CMakeLists.txt b/crypto/CMakeLists.txt
index a3cdc08..2b9479b 100644
--- a/crypto/CMakeLists.txt
+++ b/crypto/CMakeLists.txt
@@ -105,6 +105,7 @@
     chacha/chacha-armv4.${ASM_EXT}
     curve25519/asm/x25519-asm-arm.S
     poly1305/poly1305_arm_asm.S
+    test/trampoline-armv4.${ASM_EXT}
   )
 endif()
 
@@ -143,6 +144,7 @@
 perlasm(chacha/chacha-x86_64.${ASM_EXT} chacha/asm/chacha-x86_64.pl)
 perlasm(cipher_extra/aes128gcmsiv-x86_64.${ASM_EXT} cipher_extra/asm/aes128gcmsiv-x86_64.pl)
 perlasm(cipher_extra/chacha20_poly1305_x86_64.${ASM_EXT} cipher_extra/asm/chacha20_poly1305_x86_64.pl)
+perlasm(test/trampoline-armv4.${ASM_EXT} test/asm/trampoline-armv4.pl)
 perlasm(test/trampoline-x86.${ASM_EXT} test/asm/trampoline-x86.pl)
 perlasm(test/trampoline-x86_64.${ASM_EXT} test/asm/trampoline-x86_64.pl)
 
diff --git a/crypto/abi_self_test.cc b/crypto/abi_self_test.cc
index c5bace1..d47f37c 100644
--- a/crypto/abi_self_test.cc
+++ b/crypto/abi_self_test.cc
@@ -243,3 +243,100 @@
       << "CHECK_ABI did not insulate the caller from direction flag errors";
 }
 #endif   // OPENSSL_X86 && SUPPORTS_ABI_TEST
+
+#if defined(OPENSSL_ARM) && defined(SUPPORTS_ABI_TEST)
+extern "C" {
+void abi_test_clobber_r0(void);
+void abi_test_clobber_r1(void);
+void abi_test_clobber_r2(void);
+void abi_test_clobber_r3(void);
+void abi_test_clobber_r4(void);
+void abi_test_clobber_r5(void);
+void abi_test_clobber_r6(void);
+void abi_test_clobber_r7(void);
+void abi_test_clobber_r8(void);
+void abi_test_clobber_r9(void);
+void abi_test_clobber_r10(void);
+void abi_test_clobber_r11(void);
+void abi_test_clobber_r12(void);
+// r13, r14, and r15 are sp, lr, and pc, respectively.
+
+void abi_test_clobber_d0(void);
+void abi_test_clobber_d1(void);
+void abi_test_clobber_d2(void);
+void abi_test_clobber_d3(void);
+void abi_test_clobber_d4(void);
+void abi_test_clobber_d5(void);
+void abi_test_clobber_d6(void);
+void abi_test_clobber_d7(void);
+void abi_test_clobber_d8(void);
+void abi_test_clobber_d9(void);
+void abi_test_clobber_d10(void);
+void abi_test_clobber_d11(void);
+void abi_test_clobber_d12(void);
+void abi_test_clobber_d13(void);
+void abi_test_clobber_d14(void);
+void abi_test_clobber_d15(void);
+}  // extern "C"
+
+TEST(ABITest, ARM) {
+  // abi_test_trampoline hides unsaved registers from the caller, so we can
+  // safely call the abi_test_clobber_* functions below.
+  abi_test::internal::CallerState state;
+  RAND_bytes(reinterpret_cast<uint8_t *>(&state), sizeof(state));  // Arbitrary starting register values.
+  CHECK_ABI_NO_UNWIND(abi_test_trampoline,
+                      reinterpret_cast<crypto_word_t>(abi_test_clobber_r4),
+                      &state, nullptr, 0, 0 /* no breakpoint */);
+
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_r0);  // r0-r3 are not callee-saved, so no failure is expected.
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_r1);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_r2);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_r3);
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_r4),  // r4-r8 are callee-saved.
+                          "r4 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_r5),
+                          "r5 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_r6),
+                          "r6 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_r7),
+                          "r7 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_r8),
+                          "r8 was not restored after return");
+#if defined(OPENSSL_APPLE)
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_r9);  // r9 is caller-saved on Apple platforms.
+#else
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_r9),  // Callee-saved elsewhere.
+                          "r9 was not restored after return");
+#endif
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_r10),  // r10-r11 are callee-saved.
+                          "r10 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_r11),
+                          "r11 was not restored after return");
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_r12);  // r12 is not callee-saved.
+
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d0);  // d0-d7 are not callee-saved.
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d1);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d2);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d3);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d4);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d5);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d6);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_d7);
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_d8),  // d8-d15 are callee-saved.
+                          "d8 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_d9),
+                          "d9 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_d10),
+                          "d10 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_d11),
+                          "d11 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_d12),
+                          "d12 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_d13),
+                          "d13 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_d14),
+                          "d14 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_d15),
+                          "d15 was not restored after return");
+}
+#endif   // OPENSSL_ARM && SUPPORTS_ABI_TEST
diff --git a/crypto/fipsmodule/modes/gcm_test.cc b/crypto/fipsmodule/modes/gcm_test.cc
index 9283cd2..7110f1c 100644
--- a/crypto/fipsmodule/modes/gcm_test.cc
+++ b/crypto/fipsmodule/modes/gcm_test.cc
@@ -176,5 +176,23 @@
 #endif  // GHASH_ASM_X86_64
   }
 #endif  // GHASH_ASM_X86 || GHASH_ASM_X86_64
+
+#if defined(GHASH_ASM_ARM)
+  if (gcm_neon_capable()) {
+    CHECK_ABI(gcm_init_neon, Htable, kH);
+    CHECK_ABI(gcm_gmult_neon, X, Htable);
+    for (size_t blocks : kBlockCounts) {
+      CHECK_ABI(gcm_ghash_neon, X, Htable, buf, 16 * blocks);
+    }
+  }
+
+  if (gcm_pmull_capable()) {
+    CHECK_ABI(gcm_init_v8, Htable, kH);
+    CHECK_ABI(gcm_gmult_v8, X, Htable);
+    for (size_t blocks : kBlockCounts) {
+      CHECK_ABI(gcm_ghash_v8, X, Htable, buf, 16 * blocks);
+    }
+  }
+#endif  // GHASH_ASM_ARM
 }
 #endif  // SUPPORTS_ABI_TEST && GHASH_ASM
diff --git a/crypto/test/abi_test.h b/crypto/test/abi_test.h
index bf25552..e04b26c 100644
--- a/crypto/test/abi_test.h
+++ b/crypto/test/abi_test.h
@@ -98,7 +98,47 @@
   CALLER_STATE_REGISTER(uint32_t, edi) \
   CALLER_STATE_REGISTER(uint32_t, ebx) \
   CALLER_STATE_REGISTER(uint32_t, ebp)
-#endif  // X86_64 || X86
+#elif defined(OPENSSL_ARM)
+// Unlike x86, ARM has a common ABI across all platforms, described in
+// http://infocenter.arm.com/help/topic/com.arm.doc.ihi0042f/IHI0042F_aapcs.pdf
+// It specifies the callee-saved registers except for r9, which is left to the
+// platform. Android and iOS differ in their handling of r9.
+#define LOOP_CALLER_STATE_REGISTERS_PRE_R9() \
+  CALLER_STATE_REGISTER(uint64_t, d8)        \
+  CALLER_STATE_REGISTER(uint64_t, d9)        \
+  CALLER_STATE_REGISTER(uint64_t, d10)       \
+  CALLER_STATE_REGISTER(uint64_t, d11)       \
+  CALLER_STATE_REGISTER(uint64_t, d12)       \
+  CALLER_STATE_REGISTER(uint64_t, d13)       \
+  CALLER_STATE_REGISTER(uint64_t, d14)       \
+  CALLER_STATE_REGISTER(uint64_t, d15)       \
+  CALLER_STATE_REGISTER(uint32_t, r4)        \
+  CALLER_STATE_REGISTER(uint32_t, r5)        \
+  CALLER_STATE_REGISTER(uint32_t, r6)        \
+  CALLER_STATE_REGISTER(uint32_t, r7)        \
+  CALLER_STATE_REGISTER(uint32_t, r8)
+#define LOOP_CALLER_STATE_REGISTERS_POST_R9() \
+  CALLER_STATE_REGISTER(uint32_t, r10)        \
+  CALLER_STATE_REGISTER(uint32_t, r11)
+#if defined(OPENSSL_APPLE)
+// Starting iOS 3, r9 is treated as a caller-saved register. Before that, it
+// could not be used at all. Most of our assembly treats it as callee-saved
+// anyway to be uniform, but we match the platform to avoid false positives when
+// testing compiler-generated output.
+//
+// https://developer.apple.com/library/archive/documentation/Xcode/Conceptual/iPhoneOSABIReference/Articles/ARMv6FunctionCallingConventions.html
+#define LOOP_CALLER_STATE_REGISTERS() \
+  LOOP_CALLER_STATE_REGISTERS_PRE_R9() \
+  LOOP_CALLER_STATE_REGISTERS_POST_R9()
+#else
+// We found no clear reference which defines Linux's use of r9, but LLVM treats
+// r9 as callee-saved on non-Apple ARM platforms.
+#define LOOP_CALLER_STATE_REGISTERS() \
+  LOOP_CALLER_STATE_REGISTERS_PRE_R9() \
+  CALLER_STATE_REGISTER(uint32_t, r9) \
+  LOOP_CALLER_STATE_REGISTERS_POST_R9()
+#endif  // OPENSSL_APPLE
+#endif  // X86_64 || X86 || ARM
 
 // Enable ABI testing if all of the following are true.
 //
diff --git a/crypto/test/asm/trampoline-armv4.pl b/crypto/test/asm/trampoline-armv4.pl
new file mode 100755
index 0000000..bfa67e4
--- /dev/null
+++ b/crypto/test/asm/trampoline-armv4.pl
@@ -0,0 +1,181 @@
+#!/usr/bin/env perl
+# Copyright (c) 2019, Google Inc.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+# This file defines helper functions for crypto/test/abi_test.h on 32-bit
+# ARM. See that header for details on how to use this.
+#
+# For convenience, this file is linked into libcrypto, where consuming builds
+# already support architecture-specific sources. The static linker should drop
+# this code in non-test binaries. This includes a shared library build of
+# libcrypto, provided --gc-sections (ELF), -dead_strip (iOS), or equivalent is
+# used.
+#
+# References:
+#
+# AAPCS: http://infocenter.arm.com/help/topic/com.arm.doc.ihi0042f/IHI0042F_aapcs.pdf
+# iOS ARMv6: https://developer.apple.com/library/archive/documentation/Xcode/Conceptual/iPhoneOSABIReference/Articles/ARMv6FunctionCallingConventions.html
+# iOS ARMv7: https://developer.apple.com/library/archive/documentation/Xcode/Conceptual/iPhoneOSABIReference/Articles/ARMv7FunctionCallingConventions.html
+
+use strict;
+
+my $flavour = shift;  # Forwarded to arm-xlate.pl below.
+my $output  = shift;  # Forwarded to arm-xlate.pl below.
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }  # A first argument containing '.' is the output path.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/;  # Capture this script's directory, trailing separator included.
+my $dir = $1;
+my $xlate;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+# Route everything printed to STDOUT through arm-xlate.pl; die if the pipe cannot be created.
+open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
+*STDOUT = *OUT;
+
+my ($func, $state, $argv, $argc) = ("r0", "r1", "r2", "r3");
+my $code = <<____;
+.syntax	unified
+
+.arch	armv7-a
+.fpu	vfp
+
+.text
+
+@ abi_test_trampoline loads callee-saved registers from |state|, calls |func|
+@ with |argv|, then saves the callee-saved registers into |state|. It returns
+@ the result of |func|. The |unwind| argument is unused.
+@ uint32_t abi_test_trampoline(void (*func)(...), CallerState *state,
+@                              const uint32_t *argv, size_t argc,
+@                              int unwind);
+.type	abi_test_trampoline, %function
+.globl	abi_test_trampoline
+.align	4
+abi_test_trampoline:
+.Labi_test_trampoline_begin:
+	@ Save parameters and all callee-saved registers. For convenience, we
+	@ save r9 on iOS even though it's volatile.
+	vstmdb	sp!, {d8-d15}
+	stmdb	sp!, {r0-r11,lr}
+
+	@ Reserve stack space for six (10-4) stack parameters, plus an extra 4
+	@ bytes to keep it 8-byte-aligned (see AAPCS, section 5.3).
+	sub     sp, sp, #28
+
+	@ Every register in AAPCS is either non-volatile or a parameter (except
+	@ r9 on iOS), so this code, by the actual call, loses all its scratch
+	@ registers. First fill in stack parameters while there are registers
+	@ to spare.
+	cmp	$argc, #4
+	bls	.Lstack_args_done
+	mov	r4, sp				@ r4 is the output pointer.
+	add	r5, $argv, $argc, lsl #2	@ Set r5 to the end of argv.
+	add	$argv, $argv, #16		@ Skip four arguments.
+.Lstack_args_loop:
+	ldr	r6, [$argv], #4
+	cmp	$argv, r5
+	str	r6, [r4], #4
+	bne	.Lstack_args_loop
+
+.Lstack_args_done:
+	@ Load registers from |$state|.
+	vldmia	$state!, {d8-d15}
+#if defined(__APPLE__)
+	@ r9 is volatile on iOS, so it is not part of |state|.
+	ldmia	$state!, {r4-r8,r10-r11}
+#else
+	ldmia	$state!, {r4-r11}
+#endif
+
+	@ Load register parameters. This uses up our remaining registers, so we
+	@ repurpose lr as scratch space.
+	ldr	$argc, [sp, #40]	@ Reload argc.
+	ldr	lr, [sp, #36]		@ Load argv into lr.
+	cmp	$argc, #3
+	bhi	.Larg_r3
+	beq	.Larg_r2
+	cmp	$argc, #1
+	bhi	.Larg_r1
+	beq	.Larg_r0
+	b	.Largs_done
+
+.Larg_r3:
+	ldr	r3, [lr, #12]	@ argv[3]
+.Larg_r2:
+	ldr	r2, [lr, #8]	@ argv[2]
+.Larg_r1:
+	ldr	r1, [lr, #4]	@ argv[1]
+.Larg_r0:
+	ldr	r0, [lr]	@ argv[0]
+.Largs_done:
+
+	@ With every other register in use, load the function pointer into lr
+	@ and call the function.
+	ldr	lr, [sp, #28]
+	blx	lr
+
+	@ r1-r3 are free for use again. The trampoline only supports
+	@ single-return functions. Pass r4-r11 to the caller.
+	ldr	$state, [sp, #32]
+	vstmia	$state!, {d8-d15}
+#if defined(__APPLE__)
+	@ r9 is volatile on iOS, so it is not part of |state|.
+	stmia	$state!, {r4-r8,r10-r11}
+#else
+	stmia	$state!, {r4-r11}
+#endif
+
+	@ Unwind the stack and restore registers.
+	add	sp, sp, #44		@ 44 = 28+16
+	ldmia	sp!, {r4-r11,lr}	@ Skip r0-r3 (see +16 above).
+	vldmia	sp!, {d8-d15}
+
+	bx	lr
+.size	abi_test_trampoline,.-abi_test_trampoline
+____
+
+# abi_test_clobber_rN zeros rN and returns; these exercise the ABI-testing
+# framework itself. r13 (sp), r14 (lr, implicitly clobbered by every call),
+# and r15 (pc) are skipped.
+for my $n (0..12) {
+  my $sym = "abi_test_clobber_r$n";
+  $code .= <<____;
+.type	$sym, %function
+.globl	$sym
+.align	4
+$sym:
+	mov	r$n, #0
+	bx	lr
+.size	$sym,.-$sym
+____
+}
+
+# abi_test_clobber_dN zeros dN by clearing its two halves, s(2N) and s(2N+1).
+for my $n (0..15) {
+  my ($lo, $hi) = map { "s$_" } (2*$n, 2*$n+1);
+  $code .= <<____;
+.type	abi_test_clobber_d$n, %function
+.globl	abi_test_clobber_d$n
+.align	4
+abi_test_clobber_d$n:
+	mov	r0, #0
+	vmov	$lo, r0
+	vmov	$hi, r0
+	bx	lr
+.size	abi_test_clobber_d$n,.-abi_test_clobber_d$n
+____
+}
+
+print $code;  # STDOUT is the pipe into arm-xlate.pl set up earlier in this script.
+close STDOUT or die "error closing STDOUT: $!";  # Closing a pipe reports a failed child; do not ignore it.