Revise OPENSSL_ia32cap_P strategy to avoid TEXTRELs.

OPENSSL_ia32cap_addr avoids any relocations within the module, at the
cost of a runtime TEXTREL, which causes problems in some cases.
(Notably, if someone links us into a binary which uses the GCC "ifunc"
attribute, the loader crashes.)

We add a OPENSSL_ia32cap_addr_delta symbol (which is reachable
relocation-free from the module) stores the difference between
OPENSSL_ia32cap_P and its own address.  Next, reference
OPENSSL_ia32cap_P in code as usual, but always doing LEAQ (or the
equivalent GOTPCREL MOVQ) into a register first. This pattern we can
then transform into a LEAQ and ADDQ on OPENSSL_ia32cap_addr_delta.

ADDQ modifies the FLAGS register, so this is only a safe transformation
if we safe and restore flags first. That, in turn, is only a safe
transformation if code always uses %rsp as a stack pointer (specifically
everything below the stack must be fair game for scribbling over). Linux
delivers signals on %rsp, so this should already be an ABI requirement.
Further, we must clear the red zone (using LEAQ to avoid touching FLAGS)
which signal handlers may not scribble over.

This also fixes the GOTTPOFF logic to clear the red zone.

Change-Id: I4ca6133ab936d5a13d5c8ef265a12ab6bd0073c9
Reviewed-on: https://boringssl-review.googlesource.com/15545
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: Adam Langley <agl@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
diff --git a/crypto/crypto.c b/crypto/crypto.c
index 5d35289..a52b808 100644
--- a/crypto/crypto.c
+++ b/crypto/crypto.c
@@ -58,9 +58,6 @@
  * initialising it to zero, it becomes a "data symbol", which isn't so
  * affected. */
 uint32_t OPENSSL_ia32cap_P[4] = {0};
-#if !defined(BORINGSSL_FIPS)
-uint32_t *OPENSSL_ia32cap_addr = OPENSSL_ia32cap_P;
-#endif
 
 #elif defined(OPENSSL_PPC64LE)
 
diff --git a/crypto/fipsmodule/aes/asm/aes-x86_64.pl b/crypto/fipsmodule/aes/asm/aes-x86_64.pl
index 362c1c9..4bf7db3 100755
--- a/crypto/fipsmodule/aes/asm/aes-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aes-x86_64.pl
@@ -1642,7 +1642,7 @@
 .align	16
 .globl	asm_AES_cbc_encrypt
 .type	asm_AES_cbc_encrypt,\@function,6
-.extern	OPENSSL_ia32cap_addr
+.extern	OPENSSL_ia32cap_P
 .hidden	asm_AES_cbc_encrypt
 asm_AES_cbc_encrypt:
 	cmp	\$0,%rdx	# check length
@@ -1664,7 +1664,7 @@
 	cmp	\$0,%r9
 	cmoveq	%r10,$sbox
 
-	mov	OPENSSL_ia32cap_addr(%rip),%r10
+	leaq	OPENSSL_ia32cap_P(%rip),%r10
 	mov	(%r10), %r10d
 	cmp	\$$speed_limit,%rdx
 	jb	.Lcbc_slow_prologue
diff --git a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
index 5810980..4ad0fb1 100644
--- a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
@@ -209,7 +209,7 @@
 		("%rdi","%rsi","%rdx","%rcx");	# Unix order
 
 $code=".text\n";
-$code.=".extern	OPENSSL_ia32cap_addr\n";
+$code.=".extern	OPENSSL_ia32cap_P\n";
 
 $rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
 # this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
@@ -1271,7 +1271,7 @@
 	lea	7($ctr),%r9
 	 mov	%r10d,0x60+12(%rsp)
 	bswap	%r9d
-	movq	OPENSSL_ia32cap_addr(%rip),%r10
+	leaq	OPENSSL_ia32cap_P(%rip),%r10
 	 mov	4(%r10),%r10d
 	xor	$key0,%r9d
 	 and	\$`1<<26|1<<22`,%r10d		# isolate XSAVE+MOVBE
@@ -3772,7 +3772,7 @@
 	movdqa	$inout3,$in3
 	movdqu	0x50($inp),$inout5
 	movdqa	$inout4,$in4
-	movq	OPENSSL_ia32cap_addr(%rip),%r9
+	leaq	OPENSSL_ia32cap_P(%rip),%r9
 	mov	4(%r9),%r9d
 	cmp	\$0x70,$len
 	jbe	.Lcbc_dec_six_or_seven
@@ -4278,7 +4278,7 @@
 
 	movups	($inp),%xmm0		# pull first 128 bits of *userKey
 	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
-	movq	OPENSSL_ia32cap_addr(%rip),%r10
+	leaq	OPENSSL_ia32cap_P(%rip),%r10
 	movl	4(%r10),%r10d
 	and	\$`1<<28|1<<11`,%r10d	# AVX and XOP bits
 	lea	16($key),%rax		# %rax is used as modifiable copy of $key
diff --git a/crypto/fipsmodule/delocate.go b/crypto/fipsmodule/delocate.go
index 2a69255..7b5454c 100644
--- a/crypto/fipsmodule/delocate.go
+++ b/crypto/fipsmodule/delocate.go
@@ -121,16 +121,6 @@
 	return symbols
 }
 
-func referencesIA32CapDirectly(line string) bool {
-	const symbol = "OPENSSL_ia32cap_P"
-	i := strings.Index(line, symbol)
-	if i < 0 {
-		return false
-	}
-	i += len(symbol)
-	return i == len(line) || line[i] == '+' || line[i] == '(' || line[i] == '@'
-}
-
 // threadLocalOffsetFunc describes a function that fetches the offset to symbol
 // in the thread-local space and writes it to the given target register.
 type threadLocalOffsetFunc struct {
@@ -231,9 +221,9 @@
 	// redirector function for that symbol.
 	redirectors := make(map[string]string)
 
-	// ia32capAddrNeeded is true iff OPENSSL_ia32cap_addr has been
-	// referenced and thus needs to be emitted outside the module.
-	ia32capAddrNeeded := false
+	// ia32capAddrDeltaNeeded is true iff OPENSSL_ia32cap_addr_delta has
+	// been referenced and thus needs to be emitted outside the module.
+	ia32capAddrDeltaNeeded := false
 
 	// ia32capGetNeeded is true iff OPENSSL_ia32cap_get has been referenced
 	// and thus needs to be emitted outside the module.
@@ -258,14 +248,6 @@
 			break
 		}
 
-		if referencesIA32CapDirectly(line) {
-			panic("reference to OPENSSL_ia32cap_P needs to be changed to indirect via OPENSSL_ia32cap_addr")
-		}
-
-		if strings.Contains(line, "OPENSSL_ia32cap_addr(%rip)") {
-			ia32capAddrNeeded = true
-		}
-
 		if strings.Contains(line, "OPENSSL_ia32cap_get@PLT") {
 			ia32capGetNeeded = true
 		}
@@ -326,7 +308,9 @@
 				symbol := strings.SplitN(args[0], "@", 2)[0]
 				functionName := fmt.Sprintf("BORINGSSL_bcm_tpoff_to_%s_for_%s", targetRegister, symbol)
 				threadLocalOffsets[functionName] = threadLocalOffsetFunc{target: targetRegister, symbol: symbol}
+				ret = append(ret, "leaq -128(%rsp), %rsp") // Clear the red zone.
 				ret = append(ret, "\tcallq "+functionName+"\n")
+				ret = append(ret, "leaq 128(%rsp), %rsp")
 				continue
 			}
 
@@ -351,6 +335,35 @@
 					// referencing the symbol directly,
 					// needs to be transformed into an LEA.
 					line = strings.Replace(line, "movq", "leaq", 1)
+					instr = "leaq"
+				}
+
+				if target == "OPENSSL_ia32cap_P" {
+					if instr != "leaq" {
+						panic("reference to OPENSSL_ia32cap_P needs to be changed to go through leaq or GOTPCREL")
+					}
+					if args[1][0] != '%' {
+						panic("reference to OPENSSL_ia32cap_P must target a register.")
+					}
+
+					// We assume pushfq is safe, after
+					// clearing the red zone, because any
+					// signals will be delivered using
+					// %rsp. Thus perlasm and
+					// compiler-generated code must not use
+					// %rsp as a general-purpose register.
+					//
+					// TODO(davidben): This messes up CFI
+					// for a small window if %rsp is the CFI
+					// register.
+					ia32capAddrDeltaNeeded = true
+					ret = append(ret, "leaq -128(%rsp), %rsp") // Clear the red zone.
+					ret = append(ret, "pushfq")
+					ret = append(ret, fmt.Sprintf("leaq OPENSSL_ia32cap_addr_delta(%%rip), %s", args[1]))
+					ret = append(ret, fmt.Sprintf("addq OPENSSL_ia32cap_addr_delta(%%rip), %s", args[1]))
+					ret = append(ret, "popfq")
+					ret = append(ret, "leaq 128(%rsp), %rsp")
+					continue
 				}
 			}
 
@@ -468,12 +481,12 @@
 	}
 
 	// Emit an indirect reference to OPENSSL_ia32cap_P.
-	if ia32capAddrNeeded {
+	if ia32capAddrDeltaNeeded {
 		ret = append(ret, ".extern OPENSSL_ia32cap_P")
-		ret = append(ret, ".type OPENSSL_ia32cap_addr,@object")
-		ret = append(ret, ".size OPENSSL_ia32cap_addr,8")
-		ret = append(ret, "OPENSSL_ia32cap_addr:")
-		ret = append(ret, "\t.quad OPENSSL_ia32cap_P")
+		ret = append(ret, ".type OPENSSL_ia32cap_addr_delta,@object")
+		ret = append(ret, ".size OPENSSL_ia32cap_addr_delta,8")
+		ret = append(ret, "OPENSSL_ia32cap_addr_delta:")
+		ret = append(ret, "\t.quad OPENSSL_ia32cap_P-OPENSSL_ia32cap_addr_delta")
 	}
 
 	// Emit accessors for thread-local offsets.
diff --git a/crypto/fipsmodule/modes/asm/ghash-x86_64.pl b/crypto/fipsmodule/modes/asm/ghash-x86_64.pl
index 1a74edf..1778ac0 100644
--- a/crypto/fipsmodule/modes/asm/ghash-x86_64.pl
+++ b/crypto/fipsmodule/modes/asm/ghash-x86_64.pl
@@ -212,7 +212,7 @@
 
 $code=<<___;
 .text
-.extern	OPENSSL_ia32cap_addr
+.extern	OPENSSL_ia32cap_P
 
 .globl	gcm_gmult_4bit
 .type	gcm_gmult_4bit,\@function,2
@@ -644,7 +644,7 @@
 my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
 
 $code.=<<___;
-	mov		OPENSSL_ia32cap_addr(%rip),%rax
+	leaq		OPENSSL_ia32cap_P(%rip),%rax
 	mov		4(%rax),%eax
 	cmp		\$0x30,$len
 	jb		.Lskip4x
diff --git a/crypto/fipsmodule/sha/asm/sha1-x86_64.pl b/crypto/fipsmodule/sha/asm/sha1-x86_64.pl
index 25a1cad..b269e84 100755
--- a/crypto/fipsmodule/sha/asm/sha1-x86_64.pl
+++ b/crypto/fipsmodule/sha/asm/sha1-x86_64.pl
@@ -236,14 +236,13 @@
 
 $code.=<<___;
 .text
-.extern	OPENSSL_ia32cap_addr
+.extern	OPENSSL_ia32cap_P
 
 .globl	sha1_block_data_order
 .type	sha1_block_data_order,\@function,3
 .align	16
 sha1_block_data_order:
-	lea	OPENSSL_ia32cap_addr(%rip),%r10
-	mov	(%r10),%r10
+	leaq	OPENSSL_ia32cap_P(%rip),%r10
 	mov	0(%r10),%r9d
 	mov	4(%r10),%r8d
 	mov	8(%r10),%r10d
diff --git a/crypto/fipsmodule/sha/asm/sha512-x86_64.pl b/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
index c638595..7ad7491 100755
--- a/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
+++ b/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
@@ -249,15 +249,14 @@
 $code=<<___;
 .text
 
-.extern	OPENSSL_ia32cap_addr
+.extern	OPENSSL_ia32cap_P
 .globl	$func
 .type	$func,\@function,3
 .align	16
 $func:
 ___
 $code.=<<___ if ($SZ==4 || $avx);
-	lea	OPENSSL_ia32cap_addr(%rip),%r11
-	mov	(%r11),%r11
+	leaq	OPENSSL_ia32cap_P(%rip),%r11
 	mov	0(%r11),%r9d
 	mov	4(%r11),%r10d
 	mov	8(%r11),%r11d