Support three-argument instructions on x86-64.

Change-Id: I81c855cd4805d4a5016999669a0cb5261838f23a
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/35224
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: David Benjamin <davidben@google.com>
diff --git a/util/fipstools/delocate/delocate.go b/util/fipstools/delocate/delocate.go
index ce1a8b2..593abec 100644
--- a/util/fipstools/delocate/delocate.go
+++ b/util/fipstools/delocate/delocate.go
@@ -801,6 +801,8 @@
 	// instrCombine merges the source and destination in some fashion, for example
 	// a 2-operand bitwise operation.
 	instrCombine
+	// instrThreeArg merges two sources into a destination in some fashion.
+	instrThreeArg
 	instrOther
 )
 
@@ -831,6 +833,11 @@
 			return instrCombine
 		}
 
+	case "sarxq", "shlxq", "shrxq":
+		if len(args) == 3 {
+			return instrThreeArg
+		}
+
 	case "vpbroadcastq":
 		if len(args) == 2 {
 			return instrTransformingMove
@@ -878,10 +885,24 @@
 	}
 }
 
-func saveRegister(w stringWriter, avoidReg string) (wrapperFunc, string) {
-	reg := "%rax"
-	if reg == avoidReg {
-		reg = "%rbx"
+func saveRegister(w stringWriter, avoidRegs []string) (wrapperFunc, string) {
+	candidates := []string{"%rax", "%rbx", "%rcx", "%rdx"}
+
+	var reg string
+NextCandidate:
+	for _, candidate := range candidates {
+		for _, avoid := range avoidRegs {
+			if candidate == avoid {
+				continue NextCandidate
+			}
+		}
+
+		reg = candidate
+		break
+	}
+
+	if len(reg) == 0 {
+		panic("too many excluded registers")
 	}
 
 	return func(k func()) {
@@ -918,6 +939,13 @@
 	}
 }
 
+func threeArgCombineOp(w stringWriter, instructionName, source1, source2, dest string) wrapperFunc {
+	return func(k func()) {
+		k() // Run the wrapped continuation first; the instruction is written after it.
+		w.WriteString("\t" + instructionName + " " + strings.Join([]string{source1, source2, dest}, ", ") + "\n")
+	}
+}
+
 func isValidLEATarget(reg string) bool {
 	return !strings.HasPrefix(reg, "%xmm") && !strings.HasPrefix(reg, "%ymm") && !strings.HasPrefix(reg, "%zmm")
 }
@@ -1033,9 +1061,6 @@
 				if len(offset) > 0 {
 					return nil, errors.New("loading from GOT with offset is unsupported")
 				}
-				if i != 0 {
-					return nil, errors.New("GOT access must be source operand")
-				}
 				if !d.isRIPRelative(memRef) {
 					return nil, errors.New("GOT access must be IP-relative")
 				}
@@ -1048,10 +1073,15 @@
 					useGOT = true
 				}
 
+				classification := classifyInstruction(instructionName, argNodes)
+				if classification != instrThreeArg && i != 0 {
+					return nil, errors.New("GOT access must be source operand")
+				}
+
 				// Reduce the instruction to movq symbol@GOTPCREL, targetReg.
 				var targetReg string
 				var redzoneCleared bool
-				switch classifyInstruction(instructionName, argNodes) {
+				switch classification {
 				case instrPush:
 					wrappers = append(wrappers, push(d.output))
 					targetReg = "%rax"
@@ -1073,12 +1103,36 @@
 					if !isValidLEATarget(targetReg) {
 						return nil, fmt.Errorf("cannot handle combining instructions targeting non-general registers")
 					}
-					saveRegWrapper, tempReg := saveRegister(d.output, targetReg)
+					saveRegWrapper, tempReg := saveRegister(d.output, []string{targetReg})
 					redzoneCleared = true
 					wrappers = append(wrappers, saveRegWrapper)
 
 					wrappers = append(wrappers, combineOp(d.output, instructionName, tempReg, targetReg))
 					targetReg = tempReg
+				case instrThreeArg:
+					if n := len(argNodes); n != 3 {
+						return nil, fmt.Errorf("three-argument instruction has %d arguments", n)
+					}
+					if i != 0 && i != 1 {
+						return nil, errors.New("GOT access must be from source operand")
+					}
+					targetReg = d.contents(argNodes[2])
+					// The non-GOT source is whichever of the first two operands is not operand i.
+					otherSource := d.contents(argNodes[1])
+					if i == 1 {
+						otherSource = d.contents(argNodes[0])
+					}
+					// Pick a scratch register distinct from both the destination and the other source.
+					saveRegWrapper, tempReg := saveRegister(d.output, []string{targetReg, otherSource})
+					redzoneCleared = true
+					wrappers = append(wrappers, saveRegWrapper)
+					// Re-emit the instruction with the scratch register standing in for the GOT operand.
+					if i == 0 {
+						wrappers = append(wrappers, threeArgCombineOp(d.output, instructionName, tempReg, otherSource, targetReg))
+					} else {
+						wrappers = append(wrappers, threeArgCombineOp(d.output, instructionName, otherSource, tempReg, targetReg))
+					}
+					targetReg = tempReg // The symbol address is loaded into the scratch register instead.
 				default:
 					return nil, fmt.Errorf("Cannot rewrite GOTPCREL reference for instruction %q", instructionName)
 				}
@@ -1087,7 +1141,7 @@
 					// Sometimes the compiler will load from the GOT to an
 					// XMM register, which is not a valid target of an LEA
 					// instruction.
-					saveRegWrapper, tempReg := saveRegister(d.output, "")
+					saveRegWrapper, tempReg := saveRegister(d.output, nil)
 					wrappers = append(wrappers, saveRegWrapper)
 					isAVX := strings.HasPrefix(instructionName, "v")
 					wrappers = append(wrappers, moveTo(d.output, targetReg, isAVX, tempReg))
diff --git a/util/fipstools/delocate/delocate_test.go b/util/fipstools/delocate/delocate_test.go
index e0ecc17..269b484 100644
--- a/util/fipstools/delocate/delocate_test.go
+++ b/util/fipstools/delocate/delocate_test.go
@@ -48,6 +48,7 @@
 	{"x86_64-GOTRewrite", []string{"in.s"}, "out.s"},
 	{"x86_64-LabelRewrite", []string{"in1.s", "in2.s"}, "out.s"},
 	{"x86_64-Sections", []string{"in.s"}, "out.s"},
+	{"x86_64-ThreeArg", []string{"in.s"}, "out.s"},
 }
 
 func TestDelocate(t *testing.T) {
diff --git a/util/fipstools/delocate/testdata/x86_64-ThreeArg/in.s b/util/fipstools/delocate/testdata/x86_64-ThreeArg/in.s
new file mode 100644
index 0000000..78a09de
--- /dev/null
+++ b/util/fipstools/delocate/testdata/x86_64-ThreeArg/in.s
@@ -0,0 +1,14 @@
+	.type foo, @function
+	.globl foo
+foo:
+	movq	%rax, %rax
+	shrxq	%rbx, kBoringSSLRSASqrtTwo@GOTPCREL(%rip), %rax
+	shrxq	kBoringSSLRSASqrtTwo@GOTPCREL(%rip), %rbx, %rax
+
+
+	.type	kBoringSSLRSASqrtTwo,@object # @kBoringSSLRSASqrtTwo
+	.section	.rodata,"a",@progbits,unique,760
+	.globl	kBoringSSLRSASqrtTwo
+	.p2align	4
+kBoringSSLRSASqrtTwo:
+	.quad	-2404814165548301886    # 0xdea06241f7aa81c2
diff --git a/util/fipstools/delocate/testdata/x86_64-ThreeArg/out.s b/util/fipstools/delocate/testdata/x86_64-ThreeArg/out.s
new file mode 100644
index 0000000..7ad8f76
--- /dev/null
+++ b/util/fipstools/delocate/testdata/x86_64-ThreeArg/out.s
@@ -0,0 +1,114 @@
+.text
+.file 1 "inserted_by_delocate.c"
+.loc 1 1 0
+BORINGSSL_bcm_text_start:
+	.type foo, @function
+	.globl foo
+.Lfoo_local_target:
+foo:
+	movq	%rax, %rax
+# WAS shrxq	%rbx, kBoringSSLRSASqrtTwo@GOTPCREL(%rip), %rax
+	leaq -128(%rsp), %rsp
+	pushq %rcx
+	leaq	.LkBoringSSLRSASqrtTwo_local_target(%rip), %rcx
+	shrxq %rbx, %rcx, %rax
+	popq %rcx
+	leaq 128(%rsp), %rsp
+# WAS shrxq	kBoringSSLRSASqrtTwo@GOTPCREL(%rip), %rbx, %rax
+	leaq -128(%rsp), %rsp
+	pushq %rcx
+	leaq	.LkBoringSSLRSASqrtTwo_local_target(%rip), %rcx
+	shrxq %rcx, %rbx, %rax
+	popq %rcx
+	leaq 128(%rsp), %rsp
+
+
+	.type	kBoringSSLRSASqrtTwo,@object # @kBoringSSLRSASqrtTwo
+# WAS .section	.rodata,"a",@progbits,unique,760
+.text
+	.globl	kBoringSSLRSASqrtTwo
+	.p2align	4
+.LkBoringSSLRSASqrtTwo_local_target:
+kBoringSSLRSASqrtTwo:
+	.quad	-2404814165548301886    # 0xdea06241f7aa81c2
+.text
+.loc 1 2 0
+BORINGSSL_bcm_text_end:
+.type OPENSSL_ia32cap_get, @function
+.globl OPENSSL_ia32cap_get
+.LOPENSSL_ia32cap_get_local_target:
+OPENSSL_ia32cap_get:
+	leaq OPENSSL_ia32cap_P(%rip), %rax
+	ret
+.extern OPENSSL_ia32cap_P
+.type OPENSSL_ia32cap_addr_delta, @object
+.size OPENSSL_ia32cap_addr_delta, 8
+OPENSSL_ia32cap_addr_delta:
+.quad OPENSSL_ia32cap_P-OPENSSL_ia32cap_addr_delta
+.type BORINGSSL_bcm_text_hash, @object
+.size BORINGSSL_bcm_text_hash, 64
+BORINGSSL_bcm_text_hash:
+.byte 0xae
+.byte 0x2c
+.byte 0xea
+.byte 0x2a
+.byte 0xbd
+.byte 0xa6
+.byte 0xf3
+.byte 0xec
+.byte 0x97
+.byte 0x7f
+.byte 0x9b
+.byte 0xf6
+.byte 0x94
+.byte 0x9a
+.byte 0xfc
+.byte 0x83
+.byte 0x68
+.byte 0x27
+.byte 0xcb
+.byte 0xa0
+.byte 0xa0
+.byte 0x9f
+.byte 0x6b
+.byte 0x6f
+.byte 0xde
+.byte 0x52
+.byte 0xcd
+.byte 0xe2
+.byte 0xcd
+.byte 0xff
+.byte 0x31
+.byte 0x80
+.byte 0xa2
+.byte 0xd4
+.byte 0xc3
+.byte 0x66
+.byte 0xf
+.byte 0xc2
+.byte 0x6a
+.byte 0x7b
+.byte 0xf4
+.byte 0xbe
+.byte 0x39
+.byte 0xa2
+.byte 0xd7
+.byte 0x25
+.byte 0xdb
+.byte 0x21
+.byte 0x98
+.byte 0xe9
+.byte 0xd5
+.byte 0x53
+.byte 0xbf
+.byte 0x5c
+.byte 0x32
+.byte 0x6
+.byte 0x83
+.byte 0x34
+.byte 0xc
+.byte 0x65
+.byte 0x89
+.byte 0x52
+.byte 0xbd
+.byte 0x1f