From 5e1d0fcbed3060b00c33e48ae6fdbaa92965c287 Mon Sep 17 00:00:00 2001
From: Michael Hudson-Doyle <michael.hudson@canonical.com>
Date: Tue, 8 Sep 2015 15:21:58 +1200
Subject: [PATCH] cmd/internal/obj, cmd/link: handle the fact that a few
 store/loads on ppc64 are DS form

Change-Id: I4fe1af48ec1cd8a23e2f7f2a0257dc989ff7aced
Reviewed-on: https://go-review.googlesource.com/14235
Reviewed-by: Russ Cox <rsc@golang.org>
---
 src/cmd/internal/obj/link.go       | 12 ++++
 src/cmd/internal/obj/ppc64/asm9.go | 66 ++++++++++++++++------
 src/cmd/link/internal/ppc64/asm.go | 91 +++++++++++++++++-------------
 3 files changed, 113 insertions(+), 56 deletions(-)

diff --git a/src/cmd/internal/obj/link.go b/src/cmd/internal/obj/link.go
index ef63a7047b..367dc1247f 100644
--- a/src/cmd/internal/obj/link.go
+++ b/src/cmd/internal/obj/link.go
@@ -404,6 +404,11 @@ type Reloc struct {
 // Reloc.type
 const (
 	R_ADDR = 1 + iota
+	// R_ADDRPOWER relocates a pair of "D-form" instructions (instructions with 16-bit
+	// immediates in the low half of the instruction word), usually addis followed by
+	// another add or a load, inserting the "high adjusted" 16 bits of the address of
+	// the referenced symbol into the immediate field of the first instruction and the
+	// low 16 bits into that of the second instruction.
 	R_ADDRPOWER
 	R_ADDRARM64
 	R_SIZE
@@ -459,6 +464,13 @@ const (
 	// thread pointer (R13) and inserts this value into the low 16 bits of an
 	// instruction word.
 	R_POWER_TLS_LE
+
+	// R_ADDRPOWER_DS is similar to R_ADDRPOWER above, but assumes the second
+	// instruction is a "DS-form" instruction, which has an immediate field occupying
+	// bits [15:2] of the instruction word. Bits [15:2] of the address of the
+	// relocated symbol are inserted into this field; it is an error if the last two
+	// bits of the address are not 0.
+	R_ADDRPOWER_DS
 )
 
 type Auto struct {
diff --git a/src/cmd/internal/obj/ppc64/asm9.go b/src/cmd/internal/obj/ppc64/asm9.go
index 7ade888be7..3e3a020a7c 100644
--- a/src/cmd/internal/obj/ppc64/asm9.go
+++ b/src/cmd/internal/obj/ppc64/asm9.go
@@ -1377,14 +1377,57 @@ func oclass(a *obj.Addr) int {
 	return int(a.Class) - 1
 }
 
-// add R_ADDRPOWER relocation to symbol s with addend d
-func addaddrreloc(ctxt *obj.Link, s *obj.LSym, d int64) {
+const (
+	D_FORM = iota
+	DS_FORM
+)
+
+// opform returns the form (D_FORM or DS_FORM) of an instruction. Used to decide on
+// which relocation to use with a load or store and only supports the needed
+// instructions.
+func opform(ctxt *obj.Link, insn int32) int {
+	switch uint32(insn) {
+	default:
+		ctxt.Diag("bad insn in loadform: %x", insn)
+	case OPVCC(58, 0, 0, 0), // ld
+		OPVCC(58, 0, 0, 0) | 1<<1, // lwa
+		OPVCC(62, 0, 0, 0):        // std
+		return DS_FORM
+	case OP_ADDI, // add
+		OPVCC(32, 0, 0, 0), // lwz
+		OPVCC(42, 0, 0, 0), // lha
+		OPVCC(40, 0, 0, 0), // lhz
+		OPVCC(34, 0, 0, 0), // lbz
+		OPVCC(50, 0, 0, 0), // lfd
+		OPVCC(48, 0, 0, 0), // lfs
+		OPVCC(36, 0, 0, 0), // stw
+		OPVCC(44, 0, 0, 0), // sth
+		OPVCC(38, 0, 0, 0), // stb
+		OPVCC(54, 0, 0, 0), // stfd
+		OPVCC(52, 0, 0, 0): // stfs
+		return D_FORM
+	}
+	return 0
+}
+
+// Encode instructions and create relocation for accessing s+d according to the
+// instruction op with source or destination (as appropriate) register reg.
+func symbolAccess(ctxt *obj.Link, s *obj.LSym, d int64, reg int16, op int32) (o1, o2 uint32) {
+	form := opform(ctxt, op)
+	o1 = AOP_IRR(OP_ADDIS, REGTMP, REGZERO, 0)
+	o2 = AOP_IRR(uint32(op), uint32(reg), REGTMP, 0)
 	rel := obj.Addrel(ctxt.Cursym)
 	rel.Off = int32(ctxt.Pc)
 	rel.Siz = 8
 	rel.Sym = s
 	rel.Add = d
-	rel.Type = obj.R_ADDRPOWER
+	switch form {
+	case D_FORM:
+		rel.Type = obj.R_ADDRPOWER
+	case DS_FORM:
+		rel.Type = obj.R_ADDRPOWER_DS
+	}
+	return
 }
 
 /*
@@ -1810,9 +1853,7 @@ func asmout(ctxt *obj.Link, p *obj.Prog, o *Optab, out []uint32) {
 			o1 = loadu32(int(p.To.Reg), d)
 			o2 = LOP_IRR(OP_ORI, uint32(p.To.Reg), uint32(p.To.Reg), uint32(int32(d)))
 		} else {
-			o1 = AOP_IRR(OP_ADDIS, REGTMP, REGZERO, 0)
-			o2 = AOP_IRR(OP_ADDI, uint32(p.To.Reg), REGTMP, 0)
-			addaddrreloc(ctxt, p.From.Sym, d)
+			o1, o2 = symbolAccess(ctxt, p.From.Sym, d, p.To.Reg, OP_ADDI)
 		}
 
 	//if(dlm) reloc(&p->from, p->pc, 0);
@@ -2377,26 +2418,19 @@ func asmout(ctxt *obj.Link, p *obj.Prog, o *Optab, out []uint32) {
 	/* relocation operations */
 	case 74:
 		v := vregoff(ctxt, &p.To)
-
-		o1 = AOP_IRR(OP_ADDIS, REGTMP, REGZERO, 0)
-		o2 = AOP_IRR(uint32(opstore(ctxt, int(p.As))), uint32(p.From.Reg), REGTMP, 0)
-		addaddrreloc(ctxt, p.To.Sym, v)
+		o1, o2 = symbolAccess(ctxt, p.To.Sym, v, p.From.Reg, opstore(ctxt, int(p.As)))
 
 	//if(dlm) reloc(&p->to, p->pc, 1);
 
 	case 75:
 		v := vregoff(ctxt, &p.From)
-		o1 = AOP_IRR(OP_ADDIS, REGTMP, REGZERO, 0)
-		o2 = AOP_IRR(uint32(opload(ctxt, int(p.As))), uint32(p.To.Reg), REGTMP, 0)
-		addaddrreloc(ctxt, p.From.Sym, v)
+		o1, o2 = symbolAccess(ctxt, p.From.Sym, v, p.To.Reg, opload(ctxt, int(p.As)))
 
 	//if(dlm) reloc(&p->from, p->pc, 1);
 
 	case 76:
 		v := vregoff(ctxt, &p.From)
-		o1 = AOP_IRR(OP_ADDIS, REGTMP, REGZERO, 0)
-		o2 = AOP_IRR(uint32(opload(ctxt, int(p.As))), uint32(p.To.Reg), REGTMP, 0)
-		addaddrreloc(ctxt, p.From.Sym, v)
+		o1, o2 = symbolAccess(ctxt, p.From.Sym, v, p.To.Reg, opload(ctxt, int(p.As)))
 		o3 = LOP_RRR(OP_EXTSB, uint32(p.To.Reg), uint32(p.To.Reg), 0)
 
 		//if(dlm) reloc(&p->from, p->pc, 1);
diff --git a/src/cmd/link/internal/ppc64/asm.go b/src/cmd/link/internal/ppc64/asm.go
index b8ab534461..e6dbec23eb 100644
--- a/src/cmd/link/internal/ppc64/asm.go
+++ b/src/cmd/link/internal/ppc64/asm.go
@@ -330,6 +330,55 @@ func symtoc(s *ld.LSym) int64 {
 	return toc.Value
 }
 
+func archrelocaddr(r *ld.Reloc, s *ld.LSym, val *int64) int {
+	var o1, o2 uint32
+	if ld.Ctxt.Arch.ByteOrder == binary.BigEndian {
+		o1 = uint32(*val >> 32)
+		o2 = uint32(*val)
+	} else {
+		o1 = uint32(*val)
+		o2 = uint32(*val >> 32)
+	}
+
+	// We are spreading a 31-bit address across two instructions, putting the
+	// high (adjusted) part in the low 16 bits of the first instruction and the
+	// low part in the low 16 bits of the second instruction, or, in the DS case,
+	// bits 15-2 (inclusive) of the address into bits 15-2 of the second
+	// instruction (it is an error in this case if the low 2 bits of the address
+	// are non-zero).
+
+	t := ld.Symaddr(r.Sym) + r.Add
+	if t < 0 || t >= 1<<31 {
+		ld.Ctxt.Diag("relocation for %s is too big (>=2G): %d", s.Name, ld.Symaddr(r.Sym))
+	}
+	if t&0x8000 != 0 {
+		t += 0x10000
+	}
+
+	switch r.Type {
+	case obj.R_ADDRPOWER:
+		o1 |= (uint32(t) >> 16) & 0xffff
+		o2 |= uint32(t) & 0xffff
+
+	case obj.R_ADDRPOWER_DS:
+		o1 |= (uint32(t) >> 16) & 0xffff
+		if t&3 != 0 {
+			ld.Ctxt.Diag("bad DS reloc for %s: %d", s.Name, ld.Symaddr(r.Sym))
+		}
+		o2 |= uint32(t) & 0xfffc
+
+	default:
+		return -1
+	}
+
+	if ld.Ctxt.Arch.ByteOrder == binary.BigEndian {
+		*val = int64(o1)<<32 | int64(o2)
+	} else {
+		*val = int64(o2)<<32 | int64(o1)
+	}
+	return 0
+}
+
 func archreloc(r *ld.Reloc, s *ld.LSym, val *int64) int {
 	if ld.Linkmode == ld.LinkExternal {
 		// TODO(minux): translate R_ADDRPOWER and R_CALLPOWER into standard ELF relocations.
@@ -347,46 +396,8 @@ func archreloc(r *ld.Reloc, s *ld.LSym, val *int64) int {
 		*val = ld.Symaddr(r.Sym) + r.Add - ld.Symaddr(ld.Linklookup(ld.Ctxt, ".got", 0))
 		return 0
 
-	case obj.R_ADDRPOWER:
-		// We are spreading a 31-bit address across two instructions,
-		// putting the high (adjusted) part in the low 16 bits of the
-		// first instruction and the low part in the low 16 bits of the
-		// second instruction.
-		t := ld.Symaddr(r.Sym) + r.Add
-		if t < 0 || t >= 1<<31 {
-			ld.Ctxt.Diag("relocation for %s is too big (>=2G): %d", s.Name, ld.Symaddr(r.Sym))
-		}
-		var o1, o2 uint32
-		if ld.Ctxt.Arch.ByteOrder == binary.BigEndian {
-			o1 = uint32(*val >> 32)
-			o2 = uint32(*val)
-		} else {
-			o1 = uint32(*val)
-			o2 = uint32(*val >> 32)
-		}
-		if t&0x8000 != 0 {
-			t += 0x10000
-		}
-		// There is an almost-bug here. When R_ADDRPOWER is relocating a
-		// load, the two instructions are addi and then a load. addi and
-		// almost all loads are "D-form" instructions, which have a
-		// 16-bit immediate in the lower 16-bits of the instruction
-		// word. But the load doubleword instruction is a "DS-form"
-		// instruction: the immediate only occupies bits 16-29 of the
-		// instruction and is implicity padded with zeros on the
-		// right. The reason the belows isn't a bug is because we only
-		// ever use immediates that have zeros on in their lower bits
-		// with ld, and we combine the immediate with | so bits 30 and
-		// 31 are preserved.
-		o1 |= (uint32(t) >> 16) & 0xffff
-		o2 |= uint32(t) & 0xffff
-
-		if ld.Ctxt.Arch.ByteOrder == binary.BigEndian {
-			*val = int64(o1)<<32 | int64(o2)
-		} else {
-			*val = int64(o2)<<32 | int64(o1)
-		}
-		return 0
+	case obj.R_ADDRPOWER, obj.R_ADDRPOWER_DS:
+		return archrelocaddr(r, s, val)
 
 	case obj.R_CALLPOWER:
 		// Bits 6 through 29 = (S + A - P) >> 2
-- 
2.52.0