Fix delocate with the aarch64 NO_ASM build

The compiler sometimes emits code like this:

	adrp x10, .Llocal_data2+16
	ldr q0, [x10, :lo12:.Llocal_data2+16]

We transform it into:

	adr x10, .Llocal_data2+16
	ldr	q0, [x10]

Note this makes some assumptions about the compiler, which I've documented
in comments. We already have a similar assumption on ADRP + ADD pairs,
but it is a little more likely for the compiler to do this in the ADRP +
LDR case.

Hopefully we can get the delocate replacement working soon and make all
this moot.

Change-Id: Icf4ed701142a52edf38d285c0bc5d52c17032d4f
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/71267
Auto-Submit: David Benjamin <davidben@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: Adam Langley <agl@google.com>
diff --git a/util/fipstools/delocate/delocate.go b/util/fipstools/delocate/delocate.go
index 884344d..64eb0cf 100644
--- a/util/fipstools/delocate/delocate.go
+++ b/util/fipstools/delocate/delocate.go
@@ -589,11 +589,15 @@
 							panic("Symbol reference outside of ldr instruction")
 						}
 
-						if skipWS(parts.next) != nil || parts.up.next != nil {
-							panic("can't handle tweak or post-increment with symbol references")
-						}
-
-						// Suppress the offset; adrp loaded the full address.
+						// Suppress the offset; adrp loaded the full address. This assumes the
+						// compiler does not emit code like the following:
+						//
+						//   adrp x0, symbol
+						//   ldr x1, [x0, :lo12:symbol]
+						//   ldr x2, [x0, :lo12:symbol+4]
+						//
+						// Such code would only work if lo12(symbol+4) = lo12(symbol) + 4, but
+						// this is true when symbol is sufficiently aligned.
 						args = append(args, "["+baseAddrReg+"]")
 						changed = true
 						continue
@@ -610,6 +614,15 @@
 				// The adrp instruction will have been turned into a sequence that loads
 				// the full address, above, thus the offset is turned into zero. If that
 				// results in the instruction being a nop, then it is deleted.
+				//
+				// This assumes the compiler does not emit code like the following:
+				//
+				//   adrp x0, symbol
+				//   add x1, x0, :lo12:symbol
+				//   add x2, x0, :lo12:symbol+4
+				//
+				// Such code would only work if lo12(symbol+4) = lo12(symbol) + 4, but
+				// this is true when symbol is sufficiently aligned.
 				if instructionName != "add" {
 					panic(fmt.Sprintf("unsure how to handle %q instruction using lo12", instructionName))
 				}
diff --git a/util/fipstools/delocate/testdata/aarch64-Basic/in.s b/util/fipstools/delocate/testdata/aarch64-Basic/in.s
index f151c23..f93a83f 100644
--- a/util/fipstools/delocate/testdata/aarch64-Basic/in.s
+++ b/util/fipstools/delocate/testdata/aarch64-Basic/in.s
@@ -37,6 +37,10 @@
 	adrp x10, .Llocal_data2
 	ldr q0, [x10, :lo12:.Llocal_data2]
 
+	// Load from local symbol with offset
+	adrp x10, .Llocal_data2+16
+	ldr q0, [x10, :lo12:.Llocal_data2+16]
+
 	bl local_function
 
 	bl remote_function
diff --git a/util/fipstools/delocate/testdata/aarch64-Basic/out.s b/util/fipstools/delocate/testdata/aarch64-Basic/out.s
index c024610..4c3ec6d 100644
--- a/util/fipstools/delocate/testdata/aarch64-Basic/out.s
+++ b/util/fipstools/delocate/testdata/aarch64-Basic/out.s
@@ -80,6 +80,12 @@
 // WAS ldr q0, [x10, :lo12:.Llocal_data2]
 	ldr	q0, [x10]
 
+	// Load from local symbol with offset
+// WAS adrp x10, .Llocal_data2+16
+	adr x10, .Llocal_data2+16
+// WAS ldr q0, [x10, :lo12:.Llocal_data2+16]
+	ldr	q0, [x10]
+
 // WAS bl local_function
 	bl	.Llocal_function_local_target