From 72301a9863fb43ff26e9779a086e02cf02031ceb Mon Sep 17 00:00:00 2001
From: "Paul E. Murphy" <murp@ibm.com>
Date: Tue, 9 Mar 2021 16:55:22 -0600
Subject: [PATCH] cmd/internal/obj: use prefix insn in MOV* opcodes for
 GOPPC64=power10

As background, Power10 adds prefixed load, store, and add immediate
instructions which encode 34b signed displacements. Likewise, they
also give the option to compute addresses against the PC. This enables
using simpler PC relative (PC-rel) relocations instead of maintaining a
dedicated pointer (the TOC) to the code/data blob on PPC64/linux.

Similarly, there are several Go opcodes where it can be advantageous to
use prefixed instructions instead of composite sequences like oris/ori/add
to implement "MOVD <big const>, Rx" or "ADD <big const>, Rx, Ry", or
large offset load/stores like "MOVD <big constant>(Rx), Ry" using the same
framework which dynamically configures optab.

When selecting prefixed instruction forms, the assembler must also use
new relocations. These new relocations are always PC-rel by design, thus
code assembled as such has no implicit requirement to maintain a TOC
pointer when assembling shared objects. Thus, we can safely avoid
situations where some Go objects use a TOC pointer, and some do not. This
greatly simplifies linking Go objects. For more details about the
challenges of linking TOC and PC-rel compiled code, see the PPC64 ELFv2
ABI.

The TOC pointer in R2 is still maintained in those build configurations
which previously required it (e.g. buildmode=pie). However, Go code built
with PC-rel relocations does not require the TOC pointer. A future
change could remove the overhead of maintaining a TOC pointer in those
build configurations.

This is enabled only for power10/ppc64le/linux.

A final noteworthy difference between the prefixed and regular load/store
instruction forms is the removal of the DS/DQ form restrictions. That
is, the immediate operand does not need to be aligned.

Updates #44549

Change-Id: If59c216d203c3eed963bfa08855e21771e6ed669
Reviewed-on: https://go-review.googlesource.com/c/go/+/355150
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Matthew Dempsky <mdempsky@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Run-TryBot: Paul Murphy <murp@ibm.com>
---
 src/cmd/asm/internal/asm/endtoend_test.go |  12 +-
 src/cmd/asm/internal/asm/testdata/ppc64.s |  27 +-
 src/cmd/internal/obj/ppc64/asm9.go        | 296 ++++++++++++++++++----
 src/cmd/internal/obj/ppc64/doc.go         |  29 +++
 4 files changed, 293 insertions(+), 71 deletions(-)
diff --git a/src/cmd/asm/internal/asm/endtoend_test.go b/src/cmd/asm/internal/asm/endtoend_test.go
index 3928e364ab..ef41667c8e 100644
--- a/src/cmd/asm/internal/asm/endtoend_test.go
+++ b/src/cmd/asm/internal/asm/endtoend_test.go
@@ -457,10 +457,14 @@ func TestLOONG64Encoder(t *testing.T) {
 }
 
 func TestPPC64EndToEnd(t *testing.T) {
-	testEndToEnd(t, "ppc64", "ppc64")
-
-	// The assembler accepts all instructions irrespective of the GOPPC64 value.
-	testEndToEnd(t, "ppc64", "ppc64_p10")
+	defer func(old int) { buildcfg.GOPPC64 = old }(buildcfg.GOPPC64)
+	for _, goppc64 := range []int{8, 9, 10} {
+		t.Logf("GOPPC64=power%d", goppc64)
+		buildcfg.GOPPC64 = goppc64
+		// Some pseudo-ops may assemble differently depending on GOPPC64
+		testEndToEnd(t, "ppc64", "ppc64")
+		testEndToEnd(t, "ppc64", "ppc64_p10")
+	}
 }
 
 func TestRISCVEndToEnd(t *testing.T) {
diff --git a/src/cmd/asm/internal/asm/testdata/ppc64.s b/src/cmd/asm/internal/asm/testdata/ppc64.s
index 367d7b77db..72ae796018 100644
--- a/src/cmd/asm/internal/asm/testdata/ppc64.s
+++ b/src/cmd/asm/internal/asm/testdata/ppc64.s
@@ -20,19 +20,19 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0
 	MOVD $65536, R6                 // 64060001
 	MOVD $-32767, R5                // 38a08001
 	MOVD $-32768, R6                // 38c08000
-	MOVD $1234567, R5               // 6405001260a5d687
+	MOVD $1234567, R5               // 6405001260a5d687 or 0600001238a0d687
 	MOVW $1, R3                     // 38600001
 	MOVW $-1, R4                    // 3880ffff
 	MOVW $65535, R5                 // 6005ffff
 	MOVW $65536, R6                 // 64060001
 	MOVW $-32767, R5                // 38a08001
 	MOVW $-32768, R6                // 38c08000
-	MOVW $1234567, R5               // 6405001260a5d687
+	MOVW $1234567, R5               // 6405001260a5d687 or 0600001238a0d687
 	// Hex constant 0x80000001
-	MOVW $2147483649, R5            // 6405800060a50001
-	MOVD $2147483649, R5            // 6405800060a50001
+	MOVW $2147483649, R5            // 6405800060a50001 or 0600800038a00001
+	MOVD $2147483649, R5            // 6405800060a50001 or 0600800038a00001
 	// Hex constant 0xFFFFFFFF80000001
-	MOVD $-2147483647, R5    	// 3ca0800060a50001
+	MOVD $-2147483647, R5    	// 3ca0800060a50001 or 0603800038a00001
 	MOVD 8(R3), R4                  // e8830008
 	MOVD (R3)(R4), R5               // 7ca4182a
 	MOVD (R3)(R0), R5               // 7ca0182a
@@ -71,8 +71,8 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0
 	MOVHBR (R3)(R4), R5             // 7ca41e2c
 	MOVHBR (R3)(R0), R5             // 7ca01e2c
 	MOVHBR (R3), R5                 // 7ca01e2c
-	MOVD $foo+4009806848(FP), R5    // 3ca1ef0138a5cc40
-	MOVD $foo(SB), R5               // 3ca0000038a50000
+	MOVD $foo+4009806848(FP), R5    // 3ca1ef0138a5cc40 or 0600ef0038a1cc40
+	MOVD $foo(SB), R5               // 3ca0000038a50000 or 0610000038a00000
 
 	MOVDU 8(R3), R4                 // e8830009
 	MOVDU (R3)(R4), R5              // 7ca4186a
@@ -156,16 +156,21 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0
 	ADD $1, R3, R4                  // 38830001
 	ADD $-1, R4                     // 3884ffff
 	ADD $-1, R4, R5                 // 38a4ffff
-	ADD $65535, R5                  // 601fffff7cbf2a14
-	ADD $65535, R5, R6              // 601fffff7cdf2a14
+	ADD $65535, R5                  // 601fffff7cbf2a14 or 0600000038a5ffff
+	ADD $65535, R5, R6              // 601fffff7cdf2a14 or 0600000038c5ffff
 	ADD $65536, R6                  // 3cc60001
 	ADD $65536, R6, R7              // 3ce60001
 	ADD $-32767, R5                 // 38a58001
 	ADD $-32767, R5, R4             // 38858001
 	ADD $-32768, R6                 // 38c68000
 	ADD $-32768, R6, R5             // 38a68000
-	ADD $1234567, R5                // 641f001263ffd6877cbf2a14
-	ADD $1234567, R5, R6            // 641f001263ffd6877cdf2a14
+
+	//TODO: this compiles to add r5,r6,r0. It should be addi r5,r6,0.
+	//      this is OK since r0 == $0, but the latter is preferred.
+	ADD $0, R6, R5             	// 7ca60214
+
+	ADD $1234567, R5                // 641f001263ffd6877cbf2a14 or 0600001238a5d687
+	ADD $1234567, R5, R6            // 641f001263ffd6877cdf2a14 or 0600001238c5d687
 	ADDEX R3, R5, $3, R6            // 7cc32f54
 	ADDEX R3, $3, R5, R6            // 7cc32f54
 	ADDIS $8, R3                    // 3c630008
diff --git a/src/cmd/internal/obj/ppc64/asm9.go b/src/cmd/internal/obj/ppc64/asm9.go
index 9469edaf4c..40258ca0b2 100644
--- a/src/cmd/internal/obj/ppc64/asm9.go
+++ b/src/cmd/internal/obj/ppc64/asm9.go
@@ -34,6 +34,7 @@ import (
 	"cmd/internal/objabi"
 	"encoding/binary"
 	"fmt"
+	"internal/buildcfg"
 	"log"
 	"math"
 	"math/bits"
@@ -58,6 +59,12 @@ const (
 	r0iszero = 1
 )
 
+const (
+	// R bit option in prefixed load/store/add D-form operations
+	PFX_R_ABS   = 0 // Offset is absolute
+	PFX_R_PCREL = 1 // Offset is relative to PC, RA should be 0
+)
+
 type Optab struct {
 	as    obj.As // Opcode
 	a1    uint8  // p.From argument (obj.Addr). p is of type obj.Prog.
@@ -108,8 +115,6 @@ var optab = []Optab{
 	{as: AADD, a1: C_UCON, a6: C_REG, type_: 20, size: 4},
 	{as: AADD, a1: C_ANDCON, a2: C_REG, a6: C_REG, type_: 22, size: 8},
 	{as: AADD, a1: C_ANDCON, a6: C_REG, type_: 22, size: 8},
-	{as: AADD, a1: C_LCON, a2: C_REG, a6: C_REG, type_: 22, size: 12},
-	{as: AADD, a1: C_LCON, a6: C_REG, type_: 22, size: 12},
 	{as: AADDIS, a1: C_ADDCON, a2: C_REG, a6: C_REG, type_: 20, size: 4},
 	{as: AADDIS, a1: C_ADDCON, a6: C_REG, type_: 20, size: 4},
 	{as: AADDC, a1: C_REG, a2: C_REG, a6: C_REG, type_: 2, size: 4},
@@ -211,64 +216,42 @@ var optab = []Optab{
 	{as: AMOVHBR, a1: C_REG, a6: C_XOREG, type_: 44, size: 4},
 	{as: AMOVHBR, a1: C_XOREG, a6: C_REG, type_: 45, size: 4},
 
-	{as: AMOVB, a1: C_ADDR, a6: C_REG, type_: 75, size: 12},
-	{as: AMOVB, a1: C_LOREG, a6: C_REG, type_: 36, size: 12},
 	{as: AMOVB, a1: C_SOREG, a6: C_REG, type_: 8, size: 8},
 	{as: AMOVB, a1: C_XOREG, a6: C_REG, type_: 109, size: 8},
-	{as: AMOVB, a1: C_REG, a6: C_ADDR, type_: 74, size: 8},
 	{as: AMOVB, a1: C_REG, a6: C_SOREG, type_: 7, size: 4},
-	{as: AMOVB, a1: C_REG, a6: C_LOREG, type_: 35, size: 8},
 	{as: AMOVB, a1: C_REG, a6: C_XOREG, type_: 108, size: 4},
 	{as: AMOVB, a1: C_REG, a6: C_REG, type_: 13, size: 4},
 
-	{as: AMOVBZ, a1: C_ADDR, a6: C_REG, type_: 75, size: 8},
-	{as: AMOVBZ, a1: C_LOREG, a6: C_REG, type_: 36, size: 8},
 	{as: AMOVBZ, a1: C_SOREG, a6: C_REG, type_: 8, size: 4},
 	{as: AMOVBZ, a1: C_XOREG, a6: C_REG, type_: 109, size: 4},
-	{as: AMOVBZ, a1: C_REG, a6: C_ADDR, type_: 74, size: 8},
 	{as: AMOVBZ, a1: C_REG, a6: C_SOREG, type_: 7, size: 4},
-	{as: AMOVBZ, a1: C_REG, a6: C_LOREG, type_: 35, size: 8},
 	{as: AMOVBZ, a1: C_REG, a6: C_XOREG, type_: 108, size: 4},
 	{as: AMOVBZ, a1: C_REG, a6: C_REG, type_: 13, size: 4},
 
 	{as: AMOVD, a1: C_ADDCON, a6: C_REG, type_: 3, size: 4},
 	{as: AMOVD, a1: C_ANDCON, a6: C_REG, type_: 3, size: 4},
 	{as: AMOVD, a1: C_UCON, a6: C_REG, type_: 3, size: 4},
-	{as: AMOVD, a1: C_LCON, a6: C_REG, type_: 19, size: 8},
 	{as: AMOVD, a1: C_SACON, a6: C_REG, type_: 3, size: 4},
-	{as: AMOVD, a1: C_LACON, a6: C_REG, type_: 26, size: 8},
-	{as: AMOVD, a1: C_ADDR, a6: C_REG, type_: 75, size: 8},
 	{as: AMOVD, a1: C_SOREG, a6: C_REG, type_: 8, size: 4},
 	{as: AMOVD, a1: C_XOREG, a6: C_REG, type_: 109, size: 4},
 	{as: AMOVD, a1: C_SOREG, a6: C_SPR, type_: 107, size: 8},
-	{as: AMOVD, a1: C_LOREG, a6: C_REG, type_: 36, size: 8},
-	{as: AMOVD, a1: C_TLS_LE, a6: C_REG, type_: 79, size: 8},
-	{as: AMOVD, a1: C_TLS_IE, a6: C_REG, type_: 80, size: 12},
 	{as: AMOVD, a1: C_SPR, a6: C_REG, type_: 66, size: 4},
-	{as: AMOVD, a1: C_REG, a6: C_ADDR, type_: 74, size: 8},
 	{as: AMOVD, a1: C_REG, a6: C_SOREG, type_: 7, size: 4},
 	{as: AMOVD, a1: C_REG, a6: C_XOREG, type_: 108, size: 4},
 	{as: AMOVD, a1: C_SPR, a6: C_SOREG, type_: 106, size: 8},
-	{as: AMOVD, a1: C_REG, a6: C_LOREG, type_: 35, size: 8},
 	{as: AMOVD, a1: C_REG, a6: C_SPR, type_: 66, size: 4},
 	{as: AMOVD, a1: C_REG, a6: C_REG, type_: 13, size: 4},
 
 	{as: AMOVW, a1: C_ADDCON, a6: C_REG, type_: 3, size: 4},
 	{as: AMOVW, a1: C_ANDCON, a6: C_REG, type_: 3, size: 4},
 	{as: AMOVW, a1: C_UCON, a6: C_REG, type_: 3, size: 4},
-	{as: AMOVW, a1: C_LCON, a6: C_REG, type_: 19, size: 8},
 	{as: AMOVW, a1: C_SACON, a6: C_REG, type_: 3, size: 4},
-	{as: AMOVW, a1: C_LACON, a6: C_REG, type_: 26, size: 8},
-	{as: AMOVW, a1: C_ADDR, a6: C_REG, type_: 75, size: 8},
 	{as: AMOVW, a1: C_CREG, a6: C_REG, type_: 68, size: 4},
 	{as: AMOVW, a1: C_SOREG, a6: C_REG, type_: 8, size: 4},
-	{as: AMOVW, a1: C_LOREG, a6: C_REG, type_: 36, size: 8},
 	{as: AMOVW, a1: C_XOREG, a6: C_REG, type_: 109, size: 4},
 	{as: AMOVW, a1: C_SPR, a6: C_REG, type_: 66, size: 4},
-	{as: AMOVW, a1: C_REG, a6: C_ADDR, type_: 74, size: 8},
 	{as: AMOVW, a1: C_REG, a6: C_CREG, type_: 69, size: 4},
 	{as: AMOVW, a1: C_REG, a6: C_SOREG, type_: 7, size: 4},
-	{as: AMOVW, a1: C_REG, a6: C_LOREG, type_: 35, size: 8},
 	{as: AMOVW, a1: C_REG, a6: C_XOREG, type_: 108, size: 4},
 	{as: AMOVW, a1: C_REG, a6: C_SPR, type_: 66, size: 4},
 	{as: AMOVW, a1: C_REG, a6: C_REG, type_: 13, size: 4},
@@ -276,14 +259,10 @@ var optab = []Optab{
 	{as: AFMOVD, a1: C_ADDCON, a6: C_FREG, type_: 24, size: 8},
 	{as: AFMOVD, a1: C_SOREG, a6: C_FREG, type_: 8, size: 4},
 	{as: AFMOVD, a1: C_XOREG, a6: C_FREG, type_: 109, size: 4},
-	{as: AFMOVD, a1: C_LOREG, a6: C_FREG, type_: 36, size: 8},
 	{as: AFMOVD, a1: C_ZCON, a6: C_FREG, type_: 24, size: 4},
-	{as: AFMOVD, a1: C_ADDR, a6: C_FREG, type_: 75, size: 8},
 	{as: AFMOVD, a1: C_FREG, a6: C_FREG, type_: 33, size: 4},
 	{as: AFMOVD, a1: C_FREG, a6: C_SOREG, type_: 7, size: 4},
 	{as: AFMOVD, a1: C_FREG, a6: C_XOREG, type_: 108, size: 4},
-	{as: AFMOVD, a1: C_FREG, a6: C_LOREG, type_: 35, size: 8},
-	{as: AFMOVD, a1: C_FREG, a6: C_ADDR, type_: 74, size: 8},
 
 	{as: AFMOVSX, a1: C_XOREG, a6: C_FREG, type_: 45, size: 4},
 	{as: AFMOVSX, a1: C_FREG, a6: C_XOREG, type_: 44, size: 4},
@@ -535,10 +514,71 @@ var optab = []Optab{
 	{as: obj.APCALIGN, a1: C_LCON, type_: 0, size: 0},   // align code
 }
 
+// These are opcodes above which may generate different sequences depending on whether prefix opcode support
+// is available
+type PrefixableOptab struct {
+	Optab
+	minGOPPC64 int  // Minimum GOPPC64 required to support this.
+	pfxsize    int8 // Instruction sequence size when prefixed opcodes are used
+}
+
+// The prefixable optab entry contains the pseudo-opcodes which generate relocations, or may generate
+// a more efficient sequence of instructions if a prefixed version exists (ex. paddi instead of oris/ori/add).
+//
+// This table is meant to transform all sequences which might be TOC-relative into an equivalent PC-relative
+// sequence. It also encompasses several transformations which do not involve relocations, those could be
+// separated and applied to AIX and other non-ELF targets. Likewise, the prefixed forms do not have encoding
+// restrictions on the offset, so they are also used for static binaries to allow better code generation. e.g.
+//
+//	MOVD something-byte-aligned(Rx), Ry
+//	MOVD 3(Rx), Ry
+//
+// is allowed when the prefixed forms are used.
+//
+// This requires an ISA 3.1 compatible cpu (e.g. Power10), and when linking externally an ELFv2 1.5 compliant linker.
+var prefixableOptab = []PrefixableOptab{
+	{Optab: Optab{as: AMOVD, a1: C_LCON, a6: C_REG, type_: 19, size: 8}, minGOPPC64: 10, pfxsize: 8},
+	{Optab: Optab{as: AMOVD, a1: C_ADDR, a6: C_REG, type_: 75, size: 8}, minGOPPC64: 10, pfxsize: 8},
+	{Optab: Optab{as: AMOVD, a1: C_TLS_LE, a6: C_REG, type_: 79, size: 8}, minGOPPC64: 10, pfxsize: 8},
+	{Optab: Optab{as: AMOVD, a1: C_TLS_IE, a6: C_REG, type_: 80, size: 12}, minGOPPC64: 10, pfxsize: 12},
+	{Optab: Optab{as: AMOVD, a1: C_LACON, a6: C_REG, type_: 26, size: 8}, minGOPPC64: 10, pfxsize: 8},
+	{Optab: Optab{as: AMOVD, a1: C_LOREG, a6: C_REG, type_: 36, size: 8}, minGOPPC64: 10, pfxsize: 8},
+	{Optab: Optab{as: AMOVD, a1: C_REG, a6: C_LOREG, type_: 35, size: 8}, minGOPPC64: 10, pfxsize: 8},
+	{Optab: Optab{as: AMOVD, a1: C_REG, a6: C_ADDR, type_: 74, size: 8}, minGOPPC64: 10, pfxsize: 8},
+
+	{Optab: Optab{as: AMOVW, a1: C_LCON, a6: C_REG, type_: 19, size: 8}, minGOPPC64: 10, pfxsize: 8},
+	{Optab: Optab{as: AMOVW, a1: C_LACON, a6: C_REG, type_: 26, size: 8}, minGOPPC64: 10, pfxsize: 8},
+	{Optab: Optab{as: AMOVW, a1: C_LOREG, a6: C_REG, type_: 36, size: 8}, minGOPPC64: 10, pfxsize: 8},
+	{Optab: Optab{as: AMOVW, a1: C_ADDR, a6: C_REG, type_: 75, size: 8}, minGOPPC64: 10, pfxsize: 8},
+	{Optab: Optab{as: AMOVW, a1: C_REG, a6: C_LOREG, type_: 35, size: 8}, minGOPPC64: 10, pfxsize: 8},
+	{Optab: Optab{as: AMOVW, a1: C_REG, a6: C_ADDR, type_: 74, size: 8}, minGOPPC64: 10, pfxsize: 8},
+
+	{Optab: Optab{as: AMOVB, a1: C_REG, a6: C_LOREG, type_: 35, size: 8}, minGOPPC64: 10, pfxsize: 8},
+	{Optab: Optab{as: AMOVB, a1: C_LOREG, a6: C_REG, type_: 36, size: 12}, minGOPPC64: 10, pfxsize: 12},
+	{Optab: Optab{as: AMOVB, a1: C_ADDR, a6: C_REG, type_: 75, size: 12}, minGOPPC64: 10, pfxsize: 12},
+	{Optab: Optab{as: AMOVB, a1: C_REG, a6: C_ADDR, type_: 74, size: 8}, minGOPPC64: 10, pfxsize: 8},
+
+	{Optab: Optab{as: AMOVBZ, a1: C_LOREG, a6: C_REG, type_: 36, size: 8}, minGOPPC64: 10, pfxsize: 8},
+	{Optab: Optab{as: AMOVBZ, a1: C_ADDR, a6: C_REG, type_: 75, size: 8}, minGOPPC64: 10, pfxsize: 8},
+	{Optab: Optab{as: AMOVBZ, a1: C_REG, a6: C_LOREG, type_: 35, size: 8}, minGOPPC64: 10, pfxsize: 8},
+	{Optab: Optab{as: AMOVBZ, a1: C_REG, a6: C_ADDR, type_: 74, size: 8}, minGOPPC64: 10, pfxsize: 8},
+
+	{Optab: Optab{as: AFMOVD, a1: C_LOREG, a6: C_FREG, type_: 36, size: 8}, minGOPPC64: 10, pfxsize: 8},
+	{Optab: Optab{as: AFMOVD, a1: C_ADDR, a6: C_FREG, type_: 75, size: 8}, minGOPPC64: 10, pfxsize: 8},
+	{Optab: Optab{as: AFMOVD, a1: C_FREG, a6: C_LOREG, type_: 35, size: 8}, minGOPPC64: 10, pfxsize: 8},
+	{Optab: Optab{as: AFMOVD, a1: C_FREG, a6: C_ADDR, type_: 74, size: 8}, minGOPPC64: 10, pfxsize: 8},
+
+	{Optab: Optab{as: AADD, a1: C_LCON, a2: C_REG, a6: C_REG, type_: 22, size: 12}, minGOPPC64: 10, pfxsize: 8},
+	{Optab: Optab{as: AADD, a1: C_LCON, a6: C_REG, type_: 22, size: 12}, minGOPPC64: 10, pfxsize: 8},
+}
+
 var oprange [ALAST & obj.AMask][]Optab
 
 var xcmp [C_NCLASS][C_NCLASS]bool
 
+var pfxEnabled = false // ISA 3.1 prefixed instructions are supported.
+var buildOpCfg = ""    // Save the os/cpu/arch tuple used to configure the assembler in buildop
+
 // padding bytes to add to align code as requested.
 func addpad(pc, a int64, ctxt *obj.Link, cursym *obj.LSym) int {
 	// For 16 and 32 byte alignment, there is a tradeoff
@@ -1256,12 +1296,34 @@ func opset(a, b0 obj.As) {
 
 // Build the opcode table
 func buildop(ctxt *obj.Link) {
-	if oprange[AANDN&obj.AMask] != nil {
-		// Already initialized; stop now.
+	// PC-rel relocation support is available only for targets which support
+	// ELFv2 1.5 (only power10/ppc64le/linux today).
+	pfxEnabled = buildcfg.GOPPC64 >= 10 && buildcfg.GOOS == "linux" && buildcfg.GOARCH == "ppc64le"
+	cfg := fmt.Sprintf("power%d/%s/%s", buildcfg.GOPPC64, buildcfg.GOARCH, buildcfg.GOOS)
+	if cfg == buildOpCfg {
+		// Already initialized to correct OS/cpu; stop now.
 		// This happens in the cmd/asm tests,
 		// each of which re-initializes the arch.
 		return
 	}
+	buildOpCfg = cfg
+
+	// Configure the optab entries which may generate prefix opcodes.
+	prefixOptab := make([]Optab, 0, len(prefixableOptab))
+	for _, entry := range prefixableOptab {
+		entry := entry
+		if pfxEnabled && buildcfg.GOPPC64 >= entry.minGOPPC64 {
+			// Enable prefix opcode generation and resize.
+			entry.ispfx = true
+			entry.size = entry.pfxsize
+		}
+		// Use the legacy assembler function if none provided.
+		if entry.asmout == nil {
+			entry.asmout = asmout
+		}
+		prefixOptab = append(prefixOptab, entry.Optab)
+
+	}
 
 	for i := 0; i < C_NCLASS; i++ {
 		for n := 0; n < C_NCLASS; n++ {
@@ -1278,7 +1340,9 @@ func buildop(ctxt *obj.Link) {
 	}
 	// Append the generated entries, sort, and fill out oprange.
 	optab = append(optab, optabGen...)
+	optab = append(optab, prefixOptab...)
 	sort.Slice(optab, optabLess)
+
 	for i := 0; i < len(optab); {
 		r := optab[i].as
 		r0 := r & obj.AMask
@@ -2227,6 +2291,13 @@ func AOP_ISEL(op uint32, t uint32, a uint32, b uint32, bc uint32) uint32 {
 	return op | (t&31)<<21 | (a&31)<<16 | (b&31)<<11 | (bc&0x1F)<<6
 }
 
+func AOP_PFX_00_8LS(r, ie uint32) uint32 {
+	return 1<<26 | 0<<24 | 0<<23 | (r&1)<<20 | (ie & 0x3FFFF)
+}
+func AOP_PFX_10_MLS(r, ie uint32) uint32 {
+	return 1<<26 | 2<<24 | 0<<23 | (r&1)<<20 | (ie & 0x3FFFF)
+}
+
 const (
 	/* each rhs is OPVCC(_, _, _, _) */
 	OP_ADD      = 31<<26 | 266<<1 | 0<<10 | 0
@@ -2266,6 +2337,52 @@ const (
 	OP_EXTSWSLI = 31<<26 | 445<<2
 )
 
+func pfxadd(rt, ra int16, r uint32, imm32 int64) (uint32, uint32) {
+	return AOP_PFX_10_MLS(r, uint32(imm32>>16)), AOP_IRR(14<<26, uint32(rt), uint32(ra), uint32(imm32))
+}
+
+func pfxload(a obj.As, reg int16, base int16, r uint32) (uint32, uint32) {
+	switch a {
+	case AMOVH:
+		return AOP_PFX_10_MLS(r, 0), AOP_IRR(42<<26, uint32(reg), uint32(base), 0)
+	case AMOVW:
+		return AOP_PFX_00_8LS(r, 0), AOP_IRR(41<<26, uint32(reg), uint32(base), 0)
+	case AMOVD:
+		return AOP_PFX_00_8LS(r, 0), AOP_IRR(57<<26, uint32(reg), uint32(base), 0)
+	case AMOVBZ, AMOVB:
+		return AOP_PFX_10_MLS(r, 0), AOP_IRR(34<<26, uint32(reg), uint32(base), 0)
+	case AMOVHZ:
+		return AOP_PFX_10_MLS(r, 0), AOP_IRR(40<<26, uint32(reg), uint32(base), 0)
+	case AMOVWZ:
+		return AOP_PFX_10_MLS(r, 0), AOP_IRR(32<<26, uint32(reg), uint32(base), 0)
+	case AFMOVS:
+		return AOP_PFX_10_MLS(r, 0), AOP_IRR(48<<26, uint32(reg), uint32(base), 0)
+	case AFMOVD:
+		return AOP_PFX_10_MLS(r, 0), AOP_IRR(50<<26, uint32(reg), uint32(base), 0)
+	}
+	log.Fatalf("Error no pfxload for %v\n", a)
+	return 0, 0
+}
+
+func pfxstore(a obj.As, reg int16, base int16, r uint32) (uint32, uint32) {
+	switch a {
+	case AMOVD:
+		return AOP_PFX_00_8LS(r, 0), AOP_IRR(61<<26, uint32(reg), uint32(base), 0)
+	case AMOVBZ, AMOVB:
+		return AOP_PFX_10_MLS(r, 0), AOP_IRR(38<<26, uint32(reg), uint32(base), 0)
+	case AMOVHZ, AMOVH:
+		return AOP_PFX_10_MLS(r, 0), AOP_IRR(44<<26, uint32(reg), uint32(base), 0)
+	case AMOVWZ, AMOVW:
+		return AOP_PFX_10_MLS(r, 0), AOP_IRR(36<<26, uint32(reg), uint32(base), 0)
+	case AFMOVS:
+		return AOP_PFX_10_MLS(r, 0), AOP_IRR(52<<26, uint32(reg), uint32(base), 0)
+	case AFMOVD:
+		return AOP_PFX_10_MLS(r, 0), AOP_IRR(54<<26, uint32(reg), uint32(base), 0)
+	}
+	log.Fatalf("Error no pfxstore for %v\n", a)
+	return 0, 0
+}
+
 func oclass(a *obj.Addr) int {
 	return int(a.Class) - 1
 }
@@ -2324,7 +2441,7 @@ func (c *ctxt9) opform(insn uint32) int {
 
 // Encode instructions and create relocation for accessing s+d according to the
 // instruction op with source or destination (as appropriate) register reg.
-func (c *ctxt9) symbolAccess(s *obj.LSym, d int64, reg int16, op uint32, reuse bool) (o1, o2 uint32) {
+func (c *ctxt9) symbolAccess(s *obj.LSym, d int64, reg int16, op uint32, reuse bool) (o1, o2 uint32, rel *obj.Reloc) {
 	if c.ctxt.Headtype == objabi.Haix {
 		// Every symbol access must be made via a TOC anchor.
 		c.ctxt.Diag("symbolAccess called for %s", s.Name)
@@ -2345,7 +2462,7 @@ func (c *ctxt9) symbolAccess(s *obj.LSym, d int64, reg int16, op uint32, reuse b
 		o1 = AOP_IRR(OP_ADDIS, uint32(reg), base, 0)
 		o2 = AOP_IRR(op, uint32(reg), uint32(reg), 0)
 	}
-	rel := obj.Addrel(c.cursym)
+	rel = obj.Addrel(c.cursym)
 	rel.Off = int32(c.pc)
 	rel.Siz = 8
 	rel.Sym = s
@@ -2841,8 +2958,12 @@ func asmout(c *ctxt9, p *obj.Prog, o *Optab, out *[5]uint32) {
 
 	case 19: /* mov $lcon,r ==> cau+or */
 		d := c.vregoff(&p.From)
-		o1 = loadu32(int(p.To.Reg), d)
-		o2 = LOP_IRR(OP_ORI, uint32(p.To.Reg), uint32(p.To.Reg), uint32(int32(d)))
+		if o.ispfx {
+			o1, o2 = pfxadd(p.To.Reg, REG_R0, PFX_R_ABS, d)
+		} else {
+			o1 = loadu32(int(p.To.Reg), d)
+			o2 = LOP_IRR(OP_ORI, uint32(p.To.Reg), uint32(p.To.Reg), uint32(int32(d)))
+		}
 
 	case 20: /* add $ucon,,r | addis $addcon,r,r */
 		v := c.regoff(&p.From)
@@ -2883,6 +3004,10 @@ func asmout(c *ctxt9, p *obj.Prog, o *Optab, out *[5]uint32) {
 			o3 = AOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), REGTMP, uint32(r))
 		}
 
+		if o.ispfx {
+			o1, o2 = pfxadd(int16(p.To.Reg), int16(r), PFX_R_ABS, d)
+		}
+
 	case 23: /* and $lcon/$addcon,r1,r2 ==> oris+ori+and/addi+and */
 		if p.To.Reg == REGTMP || p.Reg == REGTMP {
 			c.ctxt.Diag("can't synthesize large constant\n%v", p)
@@ -2962,11 +3087,12 @@ func asmout(c *ctxt9, p *obj.Prog, o *Optab, out *[5]uint32) {
 	case 26: /* mov $lsext/auto/oreg,,r2 ==> addis+addi */
 		v := c.vregoff(&p.From)
 		r := int(p.From.Reg)
+		var rel *obj.Reloc
 
 		switch p.From.Name {
 		case obj.NAME_EXTERN, obj.NAME_STATIC:
 			// Load a 32 bit constant, or relocation depending on if a symbol is attached
-			o1, o2 = c.symbolAccess(p.From.Sym, v, p.To.Reg, OP_ADDI, true)
+			o1, o2, rel = c.symbolAccess(p.From.Sym, v, p.To.Reg, OP_ADDI, true)
 		default:
 			if r == 0 {
 				r = c.getimpliedreg(&p.From, p)
@@ -2976,6 +3102,15 @@ func asmout(c *ctxt9, p *obj.Prog, o *Optab, out *[5]uint32) {
 			o2 = AOP_IRR(OP_ADDI, uint32(p.To.Reg), uint32(p.To.Reg), uint32(v))
 		}
 
+		if o.ispfx {
+			if rel == nil {
+				o1, o2 = pfxadd(int16(p.To.Reg), int16(r), PFX_R_ABS, v)
+			} else {
+				o1, o2 = pfxadd(int16(p.To.Reg), REG_R0, PFX_R_PCREL, 0)
+				rel.Type = objabi.R_ADDRPOWER_PCREL34
+			}
+		}
+
 	case 27: /* subc ra,$simm,rd => subfic rd,ra,$simm */
 		v := c.regoff(p.GetFrom3())
 
@@ -3118,12 +3253,18 @@ func asmout(c *ctxt9, p *obj.Prog, o *Optab, out *[5]uint32) {
 			r = c.getimpliedreg(&p.To, p)
 		}
 		// Offsets in DS form stores must be a multiple of 4
-		inst := c.opstore(p.As)
-		if c.opform(inst) == DS_FORM && v&0x3 != 0 {
-			log.Fatalf("invalid offset for DS form load/store %v", p)
+		if o.ispfx {
+			o1, o2 = pfxstore(p.As, p.From.Reg, int16(r), PFX_R_ABS)
+			o1 |= uint32((v >> 16) & 0x3FFFF)
+			o2 |= uint32(v & 0xFFFF)
+		} else {
+			inst := c.opstore(p.As)
+			if c.opform(inst) == DS_FORM && v&0x3 != 0 {
+				log.Fatalf("invalid offset for DS form load/store %v", p)
+			}
+			o1 = AOP_IRR(OP_ADDIS, REGTMP, uint32(r), uint32(high16adjusted(v)))
+			o2 = AOP_IRR(inst, uint32(p.From.Reg), REGTMP, uint32(v))
 		}
-		o1 = AOP_IRR(OP_ADDIS, REGTMP, uint32(r), uint32(high16adjusted(v)))
-		o2 = AOP_IRR(inst, uint32(p.From.Reg), REGTMP, uint32(v))
 
 	case 36: /* mov b/bz/h/hz lext/lauto/lreg,r ==> lbz+extsb/lbz/lha/lhz etc */
 		v := c.regoff(&p.From)
@@ -3132,8 +3273,15 @@ func asmout(c *ctxt9, p *obj.Prog, o *Optab, out *[5]uint32) {
 		if r == 0 {
 			r = c.getimpliedreg(&p.From, p)
 		}
-		o1 = AOP_IRR(OP_ADDIS, uint32(p.To.Reg), uint32(r), uint32(high16adjusted(v)))
-		o2 = AOP_IRR(c.opload(p.As), uint32(p.To.Reg), uint32(p.To.Reg), uint32(v))
+
+		if o.ispfx {
+			o1, o2 = pfxload(p.As, p.To.Reg, int16(r), PFX_R_ABS)
+			o1 |= uint32((v >> 16) & 0x3FFFF)
+			o2 |= uint32(v & 0xFFFF)
+		} else {
+			o1 = AOP_IRR(OP_ADDIS, uint32(p.To.Reg), uint32(r), uint32(high16adjusted(v)))
+			o2 = AOP_IRR(c.opload(p.As), uint32(p.To.Reg), uint32(p.To.Reg), uint32(v))
+		}
 
 		// Sign extend MOVB if needed
 		o3 = LOP_RRR(OP_EXTSB, uint32(p.To.Reg), uint32(p.To.Reg), 0)
@@ -3484,23 +3632,28 @@ func asmout(c *ctxt9, p *obj.Prog, o *Optab, out *[5]uint32) {
 
 	/* relocation operations */
 	case 74:
+		var rel *obj.Reloc
 		v := c.vregoff(&p.To)
 		// Offsets in DS form stores must be a multiple of 4
 		inst := c.opstore(p.As)
-		if c.opform(inst) == DS_FORM && v&0x3 != 0 {
+
+		// Can't reuse base for store instructions.
+		o1, o2, rel = c.symbolAccess(p.To.Sym, v, p.From.Reg, inst, false)
+
+		// Rewrite as a prefixed store if supported.
+		if o.ispfx {
+			o1, o2 = pfxstore(p.As, p.From.Reg, REG_R0, PFX_R_PCREL)
+			rel.Type = objabi.R_ADDRPOWER_PCREL34
+		} else if c.opform(inst) == DS_FORM && v&0x3 != 0 {
 			log.Fatalf("invalid offset for DS form load/store %v", p)
 		}
-		// Can't reuse base for store instructions.
-		o1, o2 = c.symbolAccess(p.To.Sym, v, p.From.Reg, inst, false)
 
 	case 75: // 32 bit offset symbol loads (got/toc/addr)
+		var rel *obj.Reloc
 		v := p.From.Offset
 
 		// Offsets in DS form loads must be a multiple of 4
 		inst := c.opload(p.As)
-		if c.opform(inst) == DS_FORM && v&0x3 != 0 {
-			log.Fatalf("invalid offset for DS form load/store %v", p)
-		}
 		switch p.From.Name {
 		case obj.NAME_GOTREF, obj.NAME_TOCREF:
 			if v != 0 {
@@ -3508,7 +3661,7 @@ func asmout(c *ctxt9, p *obj.Prog, o *Optab, out *[5]uint32) {
 			}
 			o1 = AOP_IRR(OP_ADDIS, uint32(p.To.Reg), REG_R2, 0)
 			o2 = AOP_IRR(inst, uint32(p.To.Reg), uint32(p.To.Reg), 0)
-			rel := obj.Addrel(c.cursym)
+			rel = obj.Addrel(c.cursym)
 			rel.Off = int32(c.pc)
 			rel.Siz = 8
 			rel.Sym = p.From.Sym
@@ -3521,7 +3674,28 @@ func asmout(c *ctxt9, p *obj.Prog, o *Optab, out *[5]uint32) {
 		default:
 			reuseBaseReg := p.As != AFMOVD && p.As != AFMOVS
 			// Reuse To.Reg as base register if not FP move.
-			o1, o2 = c.symbolAccess(p.From.Sym, v, p.To.Reg, inst, reuseBaseReg)
+			o1, o2, rel = c.symbolAccess(p.From.Sym, v, p.To.Reg, inst, reuseBaseReg)
+		}
+
+		// Convert to prefixed forms if supported.
+		if o.ispfx {
+			switch rel.Type {
+			case objabi.R_ADDRPOWER, objabi.R_ADDRPOWER_DS,
+				objabi.R_ADDRPOWER_TOCREL, objabi.R_ADDRPOWER_TOCREL_DS:
+				o1, o2 = pfxload(p.As, p.To.Reg, REG_R0, PFX_R_PCREL)
+				rel.Type = objabi.R_ADDRPOWER_PCREL34
+			case objabi.R_POWER_TLS_IE:
+				o1, o2 = pfxload(p.As, p.To.Reg, REG_R0, PFX_R_PCREL)
+				rel.Type = objabi.R_POWER_TLS_IE_PCREL34
+			case objabi.R_ADDRPOWER_GOT:
+				o1, o2 = pfxload(p.As, p.To.Reg, REG_R0, PFX_R_PCREL)
+				rel.Type = objabi.R_ADDRPOWER_GOT_PCREL34
+			default:
+				// We've failed to convert a TOC-relative relocation to a PC-relative one.
+				log.Fatalf("Unable convert TOC-relative relocation %v to PC-relative", rel.Type)
+			}
+		} else if c.opform(inst) == DS_FORM && v&0x3 != 0 {
+			log.Fatalf("invalid offset for DS form load/store %v", p)
 		}
 
 		o3 = LOP_RRR(OP_EXTSB, uint32(p.To.Reg), uint32(p.To.Reg), 0)
@@ -3530,26 +3704,36 @@ func asmout(c *ctxt9, p *obj.Prog, o *Optab, out *[5]uint32) {
 		if p.From.Offset != 0 {
 			c.ctxt.Diag("invalid offset against tls var %v", p)
 		}
-		o1 = AOP_IRR(OP_ADDIS, uint32(p.To.Reg), REG_R13, 0)
-		o2 = AOP_IRR(OP_ADDI, uint32(p.To.Reg), uint32(p.To.Reg), 0)
 		rel := obj.Addrel(c.cursym)
 		rel.Off = int32(c.pc)
 		rel.Siz = 8
 		rel.Sym = p.From.Sym
-		rel.Type = objabi.R_POWER_TLS_LE
+		if !o.ispfx {
+			o1 = AOP_IRR(OP_ADDIS, uint32(p.To.Reg), REG_R13, 0)
+			o2 = AOP_IRR(OP_ADDI, uint32(p.To.Reg), uint32(p.To.Reg), 0)
+			rel.Type = objabi.R_POWER_TLS_LE
+		} else {
+			o1, o2 = pfxadd(p.To.Reg, REG_R13, PFX_R_ABS, 0)
+			rel.Type = objabi.R_POWER_TLS_LE_TPREL34
+		}
 
 	case 80:
 		if p.From.Offset != 0 {
 			c.ctxt.Diag("invalid offset against tls var %v", p)
 		}
-		o1 = AOP_IRR(OP_ADDIS, uint32(p.To.Reg), REG_R2, 0)
-		o2 = AOP_IRR(c.opload(AMOVD), uint32(p.To.Reg), uint32(p.To.Reg), 0)
-		o3 = AOP_RRR(OP_ADD, uint32(p.To.Reg), uint32(p.To.Reg), REG_R13)
 		rel := obj.Addrel(c.cursym)
 		rel.Off = int32(c.pc)
 		rel.Siz = 8
 		rel.Sym = p.From.Sym
 		rel.Type = objabi.R_POWER_TLS_IE
+		if !o.ispfx {
+			o1 = AOP_IRR(OP_ADDIS, uint32(p.To.Reg), REG_R2, 0)
+			o2 = AOP_IRR(c.opload(AMOVD), uint32(p.To.Reg), uint32(p.To.Reg), 0)
+		} else {
+			o1, o2 = pfxload(p.As, p.To.Reg, REG_R0, PFX_R_PCREL)
+			rel.Type = objabi.R_POWER_TLS_IE_PCREL34
+		}
+		o3 = AOP_RRR(OP_ADD, uint32(p.To.Reg), uint32(p.To.Reg), REG_R13)
 		rel = obj.Addrel(c.cursym)
 		rel.Off = int32(c.pc) + 8
 		rel.Siz = 4
diff --git a/src/cmd/internal/obj/ppc64/doc.go b/src/cmd/internal/obj/ppc64/doc.go
index 28340e425d..835182bcc6 100644
--- a/src/cmd/internal/obj/ppc64/doc.go
+++ b/src/cmd/internal/obj/ppc64/doc.go
@@ -250,5 +250,34 @@ Register names:
 	CRnGT represents CR bit 1 of CR field n. (0-7)
 	CRnEQ represents CR bit 2 of CR field n. (0-7)
 	CRnSO represents CR bit 3 of CR field n. (0-7)
+
+# GOPPC64 >= power10 and its effects on Go asm
+
+When GOPPC64=power10 is used to compile a Go program for ppc64le/linux, MOV*, FMOV*, and ADD
+opcodes which would require 2 or more machine instructions to emulate a 32 bit constant, or
+symbolic reference are implemented using prefixed instructions.
+
+A user who wishes granular control over the generated machine code is advised to use Go asm
+opcodes which explicitly translate to one PPC64 machine instruction. Most common opcodes
+are supported.
+
+Some examples of how pseudo-op assembly changes with GOPPC64:
+
+	Go asm                       GOPPC64 <= power9          GOPPC64 >= power10
+	MOVD mypackage·foo(SB), R3   addis r2, r3, ...          pld r3, ...
+	                             ld    r3, r3, ...
+
+	MOVD 131072(R3), R4          addis r31, r4, 2           pld r4, 131072(r3)
+	                             ld    r4, 0(R3)
+
+	ADD $131073, R3              lis  r31, 2                paddi r3, r3, 131073
+	                             addi r31, 1
+	                             add  r3,r31,r3
+
+	MOVD $131073, R3             lis  r3, 2                 pli r3, 131073
+	                             addi r3, 1
+
+	MOVD $mypackage·foo(SB), R3  addis r2, r3, ...          pla r3, ...
+	                             addi  r3, r3, ...
 */
 package ppc64
-- 
2.51.0