From d44749b65b47f42e7a5bae2e0e9b0ab6bb3d5f80 Mon Sep 17 00:00:00 2001 From: Xiaolin Zhao Date: Thu, 19 Jun 2025 20:32:10 +0800 Subject: [PATCH] cmd/internal/obj/loong64: add [X]VLDREPL.{B/H/W/D} instructions support Go asm syntax: VMOVQ offset(Rj), Vd.<T> XVMOVQ offset(Rj), Xd.<T> <T> can have the following values: B16, H8, W4, V2, B32, H16, W8, V4 Change-Id: I44af51d58bb62649d3fe360b3abb771565e78a8a Reviewed-on: https://go-review.googlesource.com/c/go/+/682895 Reviewed-by: abner chenc Reviewed-by: Michael Knyszek Reviewed-by: Meidan Li LUCI-TryBot-Result: Go LUCI Reviewed-by: Mark Freeman --- .../asm/internal/asm/testdata/loong64enc1.s | 10 ++ src/cmd/internal/obj/loong64/asm.go | 111 ++++++++++++------ src/cmd/internal/obj/loong64/doc.go | 17 +++ 3 files changed, 104 insertions(+), 34 deletions(-) diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s index bfff555782..dfb2a2f177 100644 --- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s +++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s @@ -510,6 +510,16 @@ lable2: VMOVQ V3.W[1], V7.W4 // 67e4f772 VMOVQ V4.V[0], V6.V2 // 86f0f772 + // Load data from memory and broadcast to each element of a vector register: VMOVQ offset(Rj), <Vd>.<T>
+ VMOVQ (R4), V0.B16 // 80008030 + VMOVQ 1(R4), V1.H8 // 81044030 + VMOVQ 2(R4), V2.W4 // 82082030 + VMOVQ 3(R4), V3.V2 // 830c1030 + XVMOVQ (R4), X0.B32 // 80008032 + XVMOVQ 1(R4), X1.H16 // 81044032 + XVMOVQ 2(R4), X2.W8 // 82082032 + XVMOVQ 3(R4), X3.V4 // 830c1032 + // VSEQ{B,H,W,V}, XVSEQ{B,H,W,V} instruction VSEQB V1, V2, V3 // 43040070 VSEQH V1, V2, V3 // 43840070 diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go index 6e09930183..d6e5a3f476 100644 --- a/src/cmd/internal/obj/loong64/asm.go +++ b/src/cmd/internal/obj/loong64/asm.go @@ -416,8 +416,11 @@ var optab = []Optab{ {AVMOVQ, C_ELEM, C_NONE, C_NONE, C_ARNG, C_NONE, 45, 4, 0, 0}, - {APRELD, C_SOREG, C_U5CON, C_NONE, C_NONE, C_NONE, 46, 4, 0, 0}, - {APRELDX, C_SOREG, C_DCON, C_U5CON, C_NONE, C_NONE, 47, 20, 0, 0}, + {AVMOVQ, C_SOREG, C_NONE, C_NONE, C_ARNG, C_NONE, 46, 4, 0, 0}, + {AXVMOVQ, C_SOREG, C_NONE, C_NONE, C_ARNG, C_NONE, 46, 4, 0, 0}, + + {APRELD, C_SOREG, C_U5CON, C_NONE, C_NONE, C_NONE, 47, 4, 0, 0}, + {APRELDX, C_SOREG, C_DCON, C_U5CON, C_NONE, C_NONE, 48, 20, 0, 0}, {obj.APCALIGN, C_U12CON, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0}, {obj.APCDATA, C_32CON, C_NONE, C_NONE, C_32CON, C_NONE, 0, 0, 0, 0}, @@ -2395,7 +2398,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) { o1 = uint32(c.regoff(&p.From)) case 39: // vmov Rn, Vd.[index] - v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg) + v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false) if v == 0 { c.ctxt.Diag("illegal arng type combination: %v\n", p) } @@ -2407,7 +2410,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) { o1 = v | (index << 10) | (Rj << 5) | Vd case 40: // vmov Vd.[index], Rn - v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg) + v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false) if v == 0 { c.ctxt.Diag("illegal arng type combination: %v\n", p) } @@ -2419,7 +2422,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) 
{ o1 = v | (index << 10) | (Vj << 5) | Rd case 41: // vmov Rn, Vd. - v, _ := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg) + v, _ := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false) if v == 0 { c.ctxt.Diag("illegal arng type combination: %v\n", p) } @@ -2429,7 +2432,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) { o1 = v | (Rj << 5) | Vd case 42: // vmov xj, xd. - v, _ := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg) + v, _ := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false) if v == 0 { c.ctxt.Diag("illegal arng type combination: %v\n", p) } @@ -2439,7 +2442,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) { o1 = v | (Xj << 5) | Xd case 43: // vmov xj, xd.[index] - v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg) + v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false) if v == 0 { c.ctxt.Diag("illegal arng type combination: %v\n", p) } @@ -2451,7 +2454,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) { o1 = v | (index << 10) | (Xj << 5) | Xd case 44: // vmov xj.[index], xd - v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg) + v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false) if v == 0 { c.ctxt.Diag("illegal arng type combination: %v\n", p) } @@ -2463,7 +2466,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) { o1 = v | (index << 10) | (Xj << 5) | Xd case 45: // vmov vj.[index], vd. - v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg) + v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false) if v == 0 { c.ctxt.Diag("illegal arng type combination: %v\n", p) } @@ -2474,12 +2477,23 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) { c.checkindex(p, index, m) o1 = v | (index << 10) | (vj << 5) | vd - case 46: // preld offset(Rbase), $hint + case 46: // vmov offset(vj), vd. 
+ v, _ := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, true) + if v == 0 { + c.ctxt.Diag("illegal arng type combination: %v\n", p) + } + + si := c.regoff(&p.From) + Rj := uint32(p.From.Reg & EXT_REG_MASK) + Vd := uint32(p.To.Reg & EXT_REG_MASK) + o1 = v | uint32(si<<10) | (Rj << 5) | Vd + + case 47: // preld offset(Rbase), $hint offs := c.regoff(&p.From) hint := p.GetFrom3().Offset o1 = OP_12IR_5I(c.opiir(p.As), uint32(offs), uint32(p.From.Reg), uint32(hint)) - case 47: // preldx offset(Rbase), $n, $hint + case 48: // preldx offset(Rbase), $n, $hint offs := c.regoff(&p.From) hint := p.RestArgs[1].Offset n := uint64(p.GetFrom3().Offset) @@ -4192,7 +4206,7 @@ func (c *ctxt0) specialFpMovInst(a obj.As, fclass int, tclass int) uint32 { return 0 } -func (c *ctxt0) specialLsxMovInst(a obj.As, fReg, tReg int16) (op_code, index_mask uint32) { +func (c *ctxt0) specialLsxMovInst(a obj.As, fReg, tReg int16, offset_flag bool) (op_code, index_mask uint32) { farng := (fReg >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK tarng := (tReg >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK fclass := c.rclass(fReg) @@ -4258,29 +4272,58 @@ func (c *ctxt0) specialLsxMovInst(a obj.As, fReg, tReg int16) (op_code, index_ma } case C_REG | (C_ARNG << 16): - // vmov Rn, Vd. - switch a { - case AVMOVQ: - switch tarng { - case ARNG_16B: - return (0x1CA7C0 << 10), 0x0 // vreplgr2vr.b - case ARNG_8H: - return (0x1CA7C1 << 10), 0x0 // vreplgr2vr.h - case ARNG_4W: - return (0x1CA7C2 << 10), 0x0 // vreplgr2vr.w - case ARNG_2V: - return (0x1CA7C3 << 10), 0x0 // vreplgr2vr.d + switch { + case offset_flag: + // vmov offset(vj), vd. 
+ switch a { + case AVMOVQ: + switch tarng { + case ARNG_16B: + return (0xC2 << 22), 0x0 // vldrepl.b + case ARNG_8H: + return (0x182 << 21), 0x0 // vldrepl.h + case ARNG_4W: + return (0x302 << 20), 0x0 // vldrepl.w + case ARNG_2V: + return (0x602 << 19), 0x0 // vldrepl.d + } + case AXVMOVQ: + switch tarng { + case ARNG_32B: + return (0xCA << 22), 0x0 // xvldrepl.b + case ARNG_16H: + return (0x192 << 21), 0x0 // xvldrepl.h + case ARNG_8W: + return (0x322 << 20), 0x0 // xvldrepl.w + case ARNG_4V: + return (0x642 << 19), 0x0 // xvldrepl.d + } } - case AXVMOVQ: - switch tarng { - case ARNG_32B: - return (0x1DA7C0 << 10), 0x0 // xvreplgr2vr.b - case ARNG_16H: - return (0x1DA7C1 << 10), 0x0 // xvreplgr2vr.h - case ARNG_8W: - return (0x1DA7C2 << 10), 0x0 // xvreplgr2vr.w - case ARNG_4V: - return (0x1DA7C3 << 10), 0x0 // xvreplgr2vr.d + default: + // vmov Rn, Vd. + switch a { + case AVMOVQ: + switch tarng { + case ARNG_16B: + return (0x1CA7C0 << 10), 0x0 // vreplgr2vr.b + case ARNG_8H: + return (0x1CA7C1 << 10), 0x0 // vreplgr2vr.h + case ARNG_4W: + return (0x1CA7C2 << 10), 0x0 // vreplgr2vr.w + case ARNG_2V: + return (0x1CA7C3 << 10), 0x0 // vreplgr2vr.d + } + case AXVMOVQ: + switch tarng { + case ARNG_32B: + return (0x1DA7C0 << 10), 0x0 // xvreplgr2vr.b + case ARNG_16H: + return (0x1DA7C1 << 10), 0x0 // xvreplgr2vr.h + case ARNG_8W: + return (0x1DA7C2 << 10), 0x0 // xvreplgr2vr.w + case ARNG_4V: + return (0x1DA7C3 << 10), 0x0 // xvreplgr2vr.d + } } } diff --git a/src/cmd/internal/obj/loong64/doc.go b/src/cmd/internal/obj/loong64/doc.go index 0818389c8d..a990b23089 100644 --- a/src/cmd/internal/obj/loong64/doc.go +++ b/src/cmd/internal/obj/loong64/doc.go @@ -203,6 +203,23 @@ Note: In the following sections 3.1 to 3.6, "ui4" (4-bit unsigned int immediate) VMOVQ Vj.W[index], Vd.W4 | vreplvei.w vd, vj, ui2 | for i in range(4) : VR[vd].w[i] = VR[vj].w[ui2] VMOVQ Vj.V[index], Vd.V2 | vreplvei.d vd, vj, ui1 | for i in range(2) : VR[vd].d[i] = VR[vj].d[ui1] +3.7 Load data from 
memory and broadcast to each element of a vector register. + + Instruction format: + VMOVQ offset(Rj), <Vd>.<T> + + Mapping between Go and platform assembly: + Go assembly | platform assembly | semantics + ------------------------------------------------------------------------------------------------------------------------------------------------------- + VMOVQ offset(Rj), Vd.B16 | vldrepl.b Vd, Rj, si12 | for i in range(16): VR[vd].b[i] = load 8 bit memory data from (GR[rj]+SignExtend(si12)) + VMOVQ offset(Rj), Vd.H8 | vldrepl.h Vd, Rj, si11 | for i in range(8) : VR[vd].h[i] = load 16 bit memory data from (GR[rj]+SignExtend(si11<<1)) + VMOVQ offset(Rj), Vd.W4 | vldrepl.w Vd, Rj, si10 | for i in range(4) : VR[vd].w[i] = load 32 bit memory data from (GR[rj]+SignExtend(si10<<2)) + VMOVQ offset(Rj), Vd.V2 | vldrepl.d Vd, Rj, si9 | for i in range(2) : VR[vd].d[i] = load 64 bit memory data from (GR[rj]+SignExtend(si9<<3)) + XVMOVQ offset(Rj), Xd.B32 | xvldrepl.b Xd, Rj, si12 | for i in range(32): XR[xd].b[i] = load 8 bit memory data from (GR[rj]+SignExtend(si12)) + XVMOVQ offset(Rj), Xd.H16 | xvldrepl.h Xd, Rj, si11 | for i in range(16): XR[xd].h[i] = load 16 bit memory data from (GR[rj]+SignExtend(si11<<1)) + XVMOVQ offset(Rj), Xd.W8 | xvldrepl.w Xd, Rj, si10 | for i in range(8) : XR[xd].w[i] = load 32 bit memory data from (GR[rj]+SignExtend(si10<<2)) + XVMOVQ offset(Rj), Xd.V4 | xvldrepl.d Xd, Rj, si9 | for i in range(4) : XR[xd].d[i] = load 64 bit memory data from (GR[rj]+SignExtend(si9<<3)) + # Special instruction encoding definition and description on LoongArch 1. DBAR hint encoding for LA664(Loongson 3A6000) and later micro-architectures, paraphrased -- 2.51.0