delocate: support Aarch64

Add Aarch64 support to delocate. Since it's a modern ISA, it's actually not
too bad once I understood the behaviour of the assembler.

Change-Id: I105fede43b5196b7ff7bdbf1ee71c6cfa2fc1aab
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/44848
Reviewed-by: David Benjamin <davidben@google.com>
diff --git a/util/fipstools/delocate/delocate.go b/util/fipstools/delocate/delocate.go index 95e0be5..dc34c68 100644 --- a/util/fipstools/delocate/delocate.go +++ b/util/fipstools/delocate/delocate.go
@@ -157,6 +157,8 @@ statement, err = d.processIntelInstruction(statement, node.up) case ppc64le: statement, err = d.processPPCInstruction(statement, node.up) + case aarch64: + statement, err = d.processAarch64Instruction(statement, node.up) default: panic("unknown processor") } @@ -348,6 +350,276 @@ return argNodes } +// Aarch64 support + +// gotHelperName returns the name of a synthesised function that returns an +// address from the GOT. +func gotHelperName(symbol string) string { + return ".Lboringssl_loadgot_" + symbol +} + +// loadAarch64Address emits instructions to put the address of |symbol| +// (optionally adjusted by |offsetStr|) into |targetReg|. +func (d *delocation) loadAarch64Address(statement *node32, targetReg string, symbol string, offsetStr string) (*node32, error) { + // There are two paths here: either the symbol is known to be local in which + // case adr is used to get the address (within 1MiB), or a GOT reference is + // really needed in which case the code needs to jump to a helper function. + // + // A helper function is needed because using code appears to be the only way + // to load a GOT value. On other platforms we have ".quad foo@GOT" outside of + // the module, but on Aarch64 that results in a "COPY" relocation and linker + // comments suggest it's a weird hack. So, for each GOT symbol needed, we emit + // a function outside of the module that returns the address from the GOT in + // x0. 
+ + d.writeCommentedNode(statement) + + _, isKnown := d.symbols[symbol] + isLocal := strings.HasPrefix(symbol, ".L") + if isKnown || isLocal || isSynthesized(symbol) { + if isLocal { + symbol = d.mapLocalSymbol(symbol) + } else if isKnown { + symbol = localTargetName(symbol) + } + + d.output.WriteString("\tadr " + targetReg + ", " + symbol + offsetStr + "\n") + + return statement, nil + } + + if len(offsetStr) != 0 { + panic("non-zero offset for helper-based reference") + } + + var helperFunc string + if symbol == "OPENSSL_armcap_P" { + helperFunc = ".LOPENSSL_armcap_P_addr" + } else { + // GOT helpers also dereference the GOT entry, thus the subsequent ldr + // instruction, which would normally do the dereferencing, needs to be + // dropped. GOT helpers have to include the dereference because the + // assembler doesn't support ":got_lo12:foo" offsets except in an ldr + // instruction. + d.gotExternalsNeeded[symbol] = struct{}{} + helperFunc = gotHelperName(symbol) + } + + // Clear the red-zone. I can't find a definitive answer about whether Linux + // Aarch64 includes a red-zone, but Microsoft has a 16-byte one and Apple a + // 128-byte one. Thus conservatively clear a 128-byte red-zone. + d.output.WriteString("\tsub sp, sp, 128\n") + + // Save x0 (which will be stomped by the return value) and the link register + // to the stack. Then save the program counter into the link register and + // jump to the helper function. + d.output.WriteString("\tstp x0, lr, [sp, #-16]!\n") + d.output.WriteString("\tbl " + helperFunc + "\n") + + if targetReg == "x0" { + // If the target happens to be x0 then restore the link register from the + // stack and send the saved value of x0 to the zero register. + d.output.WriteString("\tldp xzr, lr, [sp], #16\n") + } else { + // Otherwise move the result into place and restore registers. + d.output.WriteString("\tmov " + targetReg + ", x0\n") + d.output.WriteString("\tldp x0, lr, [sp], #16\n") + } + + // Revert the red-zone adjustment. 
+ d.output.WriteString("\tadd sp, sp, 128\n") + + return statement, nil +} + +func (d *delocation) processAarch64Instruction(statement, instruction *node32) (*node32, error) { + assertNodeType(instruction, ruleInstructionName) + instructionName := d.contents(instruction) + + argNodes := instructionArgs(instruction.next) + + switch instructionName { + case "cset", "csel", "csetm", "cneg", "csinv", "cinc", "csinc", "csneg": + // These functions are special because they take a condition-code name as + // an argument and that looks like a symbol reference. + d.writeNode(statement) + return statement, nil + + case "mrs": + // Functions that take special register names also look like a symbol + // reference to the parser. + d.writeNode(statement) + return statement, nil + + case "adrp": + // adrp always generates a relocation, even when the target symbol is in the + // same segment, because the page-offset of the code isn't known until link + // time. Thus adrp instructions are turned into either adr instructions + // (limiting the module to 1MiB offsets) or calls to helper functions, both of + // which load the full address. Later instructions, which add the low 12 bits + // of offset, are tweaked to remove the offset since it's already included. + // Loads of GOT symbols are slightly more complex because it's not possible to + // avoid dereferencing a GOT entry with Clang's assembler. Thus the later ldr + // instruction, which would normally do the dereferencing, is dropped + // completely. (Or turned into a mov if it targets a different register.) 
+ assertNodeType(argNodes[0], ruleRegisterOrConstant) + targetReg := d.contents(argNodes[0]) + if !strings.HasPrefix(targetReg, "x") { + panic("adrp targetting register " + targetReg + ", which has the wrong size") + } + + var symbol, offset string + switch argNodes[1].pegRule { + case ruleGOTSymbolOffset: + symbol = d.contents(argNodes[1].up) + case ruleMemoryRef: + assertNodeType(argNodes[1].up, ruleSymbolRef) + node, empty := d.gatherOffsets(argNodes[1].up.up, "") + if len(empty) != 0 { + panic("prefix offsets found for adrp") + } + symbol = d.contents(node) + _, offset = d.gatherOffsets(node.next, "") + default: + panic("Unhandled adrp argument type " + rul3s[argNodes[1].pegRule]) + } + + return d.loadAarch64Address(statement, targetReg, symbol, offset) + } + + var args []string + changed := false + + for _, arg := range argNodes { + fullArg := arg + + switch arg.pegRule { + case ruleRegisterOrConstant, ruleLocalLabelRef, ruleARMConstantTweak: + args = append(args, d.contents(fullArg)) + + case ruleGOTSymbolOffset: + // These should only be arguments to adrp and thus unreachable. + panic("unreachable") + + case ruleMemoryRef: + ref := arg.up + + switch ref.pegRule { + case ruleSymbolRef: + // This is a branch. Either the target needs to be written to a local + // version of the symbol to ensure that no relocations are emitted, or + // it needs to jump to a redirector function. 
+ symbol, _, _, didChange, symbolIsLocal, _ := d.parseMemRef(arg.up) + changed = didChange + + if _, knownSymbol := d.symbols[symbol]; knownSymbol { + symbol = localTargetName(symbol) + changed = true + } else if !symbolIsLocal && !isSynthesized(symbol) { + redirector := redirectorName(symbol) + d.redirectors[symbol] = redirector + symbol = redirector + changed = true + } + + args = append(args, symbol) + + case ruleARMBaseIndexScale: + parts := ref.up + assertNodeType(parts, ruleARMRegister) + baseAddrReg := d.contents(parts) + parts = skipWS(parts.next) + + // Only two forms need special handling. First there's memory references + // like "[x*, :got_lo12:foo]". The base register here will have been the + // target of an adrp instruction to load the page address, but the adrp + // will have turned into loading the full address *and dereferencing it*, + // above. Thus this instruction needs to be dropped otherwise we'll be + // dereferencing twice. + // + // Second there are forms like "[x*, :lo12:foo]" where the code has used + // adrp to load the page address into x*. That adrp will have been turned + // into loading the full address so just the offset needs to be dropped. + + if parts != nil { + if parts.pegRule == ruleARMGOTLow12 { + if instructionName != "ldr" { + panic("Symbol reference outside of ldr instruction") + } + + if skipWS(parts.next) != nil || parts.up.next != nil { + panic("can't handle tweak or post-increment with symbol references") + } + + // The GOT helper already dereferenced the entry so, at most, just a mov + // is needed to put things in the right register. 
+ d.writeCommentedNode(statement) + if baseAddrReg != args[0] { + d.output.WriteString("\tmov " + args[0] + ", " + baseAddrReg + "\n") + } + return statement, nil + } else if parts.pegRule == ruleLow12BitsSymbolRef { + if instructionName != "ldr" { + panic("Symbol reference outside of ldr instruction") + } + + if skipWS(parts.next) != nil || parts.up.next != nil { + panic("can't handle tweak or post-increment with symbol references") + } + + // Suppress the offset; adrp loaded the full address. + args = append(args, "["+baseAddrReg+"]") + changed = true + continue + } + } + + args = append(args, d.contents(fullArg)) + + case ruleLow12BitsSymbolRef: + // These are the second instruction in a pair: + // adrp x0, symbol // Load the page address into x0 + // add x1, x0, :lo12:symbol // Adds the page offset. + // + // The adrp instruction will have been turned into a sequence that loads + // the full address, above, thus the offset is turned into zero. If that + // results in the instruction being a nop, then it is deleted. + if instructionName != "add" { + panic(fmt.Sprintf("unsure how to handle %q instruction using lo12", instructionName)) + } + + if !strings.HasPrefix(args[0], "x") || !strings.HasPrefix(args[1], "x") { + panic("address arithmetic with incorrectly sized register") + } + + if args[0] == args[1] { + d.writeCommentedNode(statement) + return statement, nil + } + + args = append(args, "#0") + changed = true + + default: + panic(fmt.Sprintf("unhandled MemoryRef type %s", rul3s[ref.pegRule])) + } + + default: + panic(fmt.Sprintf("unknown instruction argument type %q", rul3s[arg.pegRule])) + } + } + + if changed { + d.writeCommentedNode(statement) + replacement := "\t" + instructionName + "\t" + strings.Join(args, ", ") + "\n" + d.output.WriteString(replacement) + } else { + d.writeNode(statement) + } + + return statement, nil +} + /* ppc64le [PABI]: “64-Bit ELF V2 ABI Specification. 
Power Architecture.” March 21st, @@ -1347,6 +1619,17 @@ return lastStatement, nil } +func writeAarch64Function(w stringWriter, funcName string, writeContents func(stringWriter)) { + w.WriteString(".p2align 2\n") + w.WriteString(".hidden " + funcName + "\n") + w.WriteString(".type " + funcName + ", @function\n") + w.WriteString(funcName + ":\n") + w.WriteString(".cfi_startproc\n") + writeContents(w) + w.WriteString(".cfi_endproc\n") + w.WriteString(".size " + funcName + ", .-" + funcName + "\n") +} + func transform(w stringWriter, inputs []inputFile) error { // symbols contains all defined symbols. symbols := make(map[string]struct{}) @@ -1481,7 +1764,8 @@ for _, name := range redirectorNames { redirector := d.redirectors[name] - if d.processor == ppc64le { + switch d.processor { + case ppc64le: w.WriteString(".section \".toc\", \"aw\"\n") w.WriteString(".Lredirector_toc_" + name + ":\n") w.WriteString(".quad " + name + "\n") @@ -1496,7 +1780,13 @@ w.WriteString("\tld 12, .Lredirector_toc_" + name + "@toc@l(12)\n") w.WriteString("\tmtctr 12\n") w.WriteString("\tbctr\n") - } else { + + case aarch64: + writeAarch64Function(w, redirector, func(w stringWriter) { + w.WriteString("\tb " + name + "\n") + }) + + case x86_64: w.WriteString(".type " + redirector + ", @function\n") w.WriteString(redirector + ":\n") w.WriteString("\tjmp\t" + name + "\n") @@ -1512,20 +1802,32 @@ // Emit BSS accessor functions. Each is a single LEA followed by RET. 
for _, name := range accessorNames { funcName := accessorName(name) - w.WriteString(".type " + funcName + ", @function\n") - w.WriteString(funcName + ":\n") target := d.bssAccessorsNeeded[name] - if d.processor == ppc64le { + switch d.processor { + case ppc64le: + w.WriteString(".type " + funcName + ", @function\n") + w.WriteString(funcName + ":\n") w.WriteString("\taddis 3, 2, " + target + "@toc@ha\n") w.WriteString("\taddi 3, 3, " + target + "@toc@l\n") w.WriteString("\tblr\n") - } else { + + case x86_64: + w.WriteString(".type " + funcName + ", @function\n") + w.WriteString(funcName + ":\n") w.WriteString("\tleaq\t" + target + "(%rip), %rax\n\tret\n") + + case aarch64: + writeAarch64Function(w, funcName, func(w stringWriter) { + w.WriteString("\tadrp x0, " + target + "\n") + w.WriteString("\tadd x0, x0, :lo12:" + target + "\n") + w.WriteString("\tret\n") + }) } } - if d.processor == ppc64le { + switch d.processor { + case ppc64le: loadTOCNames := sortedSet(d.tocLoaders) for _, symbolAndOffset := range loadTOCNames { parts := strings.SplitN(symbolAndOffset, "\x00", 2) @@ -1544,7 +1846,24 @@ w.WriteString(".LBORINGSSL_external_toc:\n") w.WriteString(".quad .TOC.-.LBORINGSSL_external_toc\n") - } else { + + case aarch64: + externalNames := sortedSet(d.gotExternalsNeeded) + for _, symbol := range externalNames { + writeAarch64Function(w, gotHelperName(symbol), func(w stringWriter) { + w.WriteString("\tadrp x0, :got:" + symbol + "\n") + w.WriteString("\tldr x0, [x0, :got_lo12:" + symbol + "]\n") + w.WriteString("\tret\n") + }) + } + + writeAarch64Function(w, ".LOPENSSL_armcap_P_addr", func(w stringWriter) { + w.WriteString("\tadrp x0, OPENSSL_armcap_P\n") + w.WriteString("\tadd x0, x0, :lo12:OPENSSL_armcap_P\n") + w.WriteString("\tret\n") + }) + + case x86_64: externalNames := sortedSet(d.gotExternalsNeeded) for _, name := range externalNames { parts := strings.SplitN(name, "@", 2) @@ -1819,6 +2138,8 @@ return x86_64 case "addis", "addi", "mflr": return ppc64le + case 
"str", "bl", "ldr", "st1": + return aarch64 } }