Cypherpunks repositories - gostls13.git/commitdiff
cmd/internal/obj/loong64: add [X]VLDREPL.{B/H/W/D} instructions support
author Xiaolin Zhao <zhaoxiaolin@loongson.cn>
Thu, 19 Jun 2025 12:32:10 +0000 (20:32 +0800)
committer abner chenc <chenguoqi@loongson.cn>
Mon, 4 Aug 2025 01:25:27 +0000 (18:25 -0700)
Go asm syntax:
 VMOVQ offset(Rj), Vd.<T>
XVMOVQ offset(Rj), Xd.<T>

<T> can have the following values:
B16, H8, W4, V2, B32, H16, W8, V4

Change-Id: I44af51d58bb62649d3fe360b3abb771565e78a8a
Reviewed-on: https://go-review.googlesource.com/c/go/+/682895
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Meidan Li <limeidan@loongson.cn>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Mark Freeman <mark@golang.org>
src/cmd/asm/internal/asm/testdata/loong64enc1.s
src/cmd/internal/obj/loong64/asm.go
src/cmd/internal/obj/loong64/doc.go

index bfff555782e9f7f6f978e3ec05e9701c5565f533..dfb2a2f1772079ce5f443d38f9ebc0ad80accaf2 100644 (file)
@@ -510,6 +510,16 @@ lable2:
        VMOVQ           V3.W[1], V7.W4  // 67e4f772
        VMOVQ           V4.V[0], V6.V2  // 86f0f772
 
+       // Load data from memory and broadcast to each element of a vector register: VMOVQ    offset(Rj), <Vd>.<T>
+       VMOVQ           (R4), V0.B16    // 80008030
+       VMOVQ           1(R4), V1.H8    // 81044030
+       VMOVQ           2(R4), V2.W4    // 82082030
+       VMOVQ           3(R4), V3.V2    // 830c1030
+       XVMOVQ          (R4), X0.B32    // 80008032
+       XVMOVQ          1(R4), X1.H16   // 81044032
+       XVMOVQ          2(R4), X2.W8    // 82082032
+       XVMOVQ          3(R4), X3.V4    // 830c1032
+
        // VSEQ{B,H,W,V}, XVSEQ{B,H,W,V} instruction
        VSEQB           V1, V2, V3      // 43040070
        VSEQH           V1, V2, V3      // 43840070
index 6e09930183383ca2834a89b13b70484592619b45..d6e5a3f476b1f90b8d90c4db7eee9b1c605566d5 100644 (file)
@@ -416,8 +416,11 @@ var optab = []Optab{
 
        {AVMOVQ, C_ELEM, C_NONE, C_NONE, C_ARNG, C_NONE, 45, 4, 0, 0},
 
-       {APRELD, C_SOREG, C_U5CON, C_NONE, C_NONE, C_NONE, 46, 4, 0, 0},
-       {APRELDX, C_SOREG, C_DCON, C_U5CON, C_NONE, C_NONE, 47, 20, 0, 0},
+       {AVMOVQ, C_SOREG, C_NONE, C_NONE, C_ARNG, C_NONE, 46, 4, 0, 0},
+       {AXVMOVQ, C_SOREG, C_NONE, C_NONE, C_ARNG, C_NONE, 46, 4, 0, 0},
+
+       {APRELD, C_SOREG, C_U5CON, C_NONE, C_NONE, C_NONE, 47, 4, 0, 0},
+       {APRELDX, C_SOREG, C_DCON, C_U5CON, C_NONE, C_NONE, 48, 20, 0, 0},
 
        {obj.APCALIGN, C_U12CON, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0},
        {obj.APCDATA, C_32CON, C_NONE, C_NONE, C_32CON, C_NONE, 0, 0, 0, 0},
@@ -2395,7 +2398,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
                o1 = uint32(c.regoff(&p.From))
 
        case 39: // vmov Rn, Vd.<T>[index]
-               v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
+               v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
                if v == 0 {
                        c.ctxt.Diag("illegal arng type combination: %v\n", p)
                }
@@ -2407,7 +2410,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
                o1 = v | (index << 10) | (Rj << 5) | Vd
 
        case 40: // vmov Vd.<T>[index], Rn
-               v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
+               v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
                if v == 0 {
                        c.ctxt.Diag("illegal arng type combination: %v\n", p)
                }
@@ -2419,7 +2422,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
                o1 = v | (index << 10) | (Vj << 5) | Rd
 
        case 41: // vmov Rn, Vd.<T>
-               v, _ := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
+               v, _ := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
                if v == 0 {
                        c.ctxt.Diag("illegal arng type combination: %v\n", p)
                }
@@ -2429,7 +2432,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
                o1 = v | (Rj << 5) | Vd
 
        case 42: // vmov  xj, xd.<T>
-               v, _ := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
+               v, _ := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
                if v == 0 {
                        c.ctxt.Diag("illegal arng type combination: %v\n", p)
                }
@@ -2439,7 +2442,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
                o1 = v | (Xj << 5) | Xd
 
        case 43: // vmov  xj, xd.<T>[index]
-               v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
+               v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
                if v == 0 {
                        c.ctxt.Diag("illegal arng type combination: %v\n", p)
                }
@@ -2451,7 +2454,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
                o1 = v | (index << 10) | (Xj << 5) | Xd
 
        case 44: // vmov  xj.<T>[index], xd
-               v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
+               v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
                if v == 0 {
                        c.ctxt.Diag("illegal arng type combination: %v\n", p)
                }
@@ -2463,7 +2466,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
                o1 = v | (index << 10) | (Xj << 5) | Xd
 
        case 45: // vmov  vj.<T>[index], vd.<T>
-               v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
+               v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
                if v == 0 {
                        c.ctxt.Diag("illegal arng type combination: %v\n", p)
                }
@@ -2474,12 +2477,23 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
                c.checkindex(p, index, m)
                o1 = v | (index << 10) | (vj << 5) | vd
 
-       case 46: // preld  offset(Rbase), $hint
+       case 46: // vmov offset(Rj), Vd.<T>
+               v, _ := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, true)
+               if v == 0 {
+                       c.ctxt.Diag("illegal arng type combination: %v\n", p)
+               }
+
+               si := c.regoff(&p.From)
+               Rj := uint32(p.From.Reg & EXT_REG_MASK)
+               Vd := uint32(p.To.Reg & EXT_REG_MASK)
+               o1 = v | uint32(si<<10) | (Rj << 5) | Vd
+
+       case 47: // preld  offset(Rbase), $hint
                offs := c.regoff(&p.From)
                hint := p.GetFrom3().Offset
                o1 = OP_12IR_5I(c.opiir(p.As), uint32(offs), uint32(p.From.Reg), uint32(hint))
 
-       case 47: // preldx offset(Rbase), $n, $hint
+       case 48: // preldx offset(Rbase), $n, $hint
                offs := c.regoff(&p.From)
                hint := p.RestArgs[1].Offset
                n := uint64(p.GetFrom3().Offset)
@@ -4192,7 +4206,7 @@ func (c *ctxt0) specialFpMovInst(a obj.As, fclass int, tclass int) uint32 {
        return 0
 }
 
-func (c *ctxt0) specialLsxMovInst(a obj.As, fReg, tReg int16) (op_code, index_mask uint32) {
+func (c *ctxt0) specialLsxMovInst(a obj.As, fReg, tReg int16, offset_flag bool) (op_code, index_mask uint32) {
        farng := (fReg >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK
        tarng := (tReg >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK
        fclass := c.rclass(fReg)
@@ -4258,29 +4272,58 @@ func (c *ctxt0) specialLsxMovInst(a obj.As, fReg, tReg int16) (op_code, index_ma
                }
 
        case C_REG | (C_ARNG << 16):
-               // vmov Rn, Vd.<T>
-               switch a {
-               case AVMOVQ:
-                       switch tarng {
-                       case ARNG_16B:
-                               return (0x1CA7C0 << 10), 0x0 // vreplgr2vr.b
-                       case ARNG_8H:
-                               return (0x1CA7C1 << 10), 0x0 // vreplgr2vr.h
-                       case ARNG_4W:
-                               return (0x1CA7C2 << 10), 0x0 // vreplgr2vr.w
-                       case ARNG_2V:
-                               return (0x1CA7C3 << 10), 0x0 // vreplgr2vr.d
+               switch {
+               case offset_flag:
+                       // vmov offset(Rj), Vd.<T>
+                       switch a {
+                       case AVMOVQ:
+                               switch tarng {
+                               case ARNG_16B:
+                                       return (0xC2 << 22), 0x0 // vldrepl.b
+                               case ARNG_8H:
+                                       return (0x182 << 21), 0x0 // vldrepl.h
+                               case ARNG_4W:
+                                       return (0x302 << 20), 0x0 // vldrepl.w
+                               case ARNG_2V:
+                                       return (0x602 << 19), 0x0 // vldrepl.d
+                               }
+                       case AXVMOVQ:
+                               switch tarng {
+                               case ARNG_32B:
+                                       return (0xCA << 22), 0x0 // xvldrepl.b
+                               case ARNG_16H:
+                                       return (0x192 << 21), 0x0 // xvldrepl.h
+                               case ARNG_8W:
+                                       return (0x322 << 20), 0x0 // xvldrepl.w
+                               case ARNG_4V:
+                                       return (0x642 << 19), 0x0 // xvldrepl.d
+                               }
                        }
-               case AXVMOVQ:
-                       switch tarng {
-                       case ARNG_32B:
-                               return (0x1DA7C0 << 10), 0x0 // xvreplgr2vr.b
-                       case ARNG_16H:
-                               return (0x1DA7C1 << 10), 0x0 // xvreplgr2vr.h
-                       case ARNG_8W:
-                               return (0x1DA7C2 << 10), 0x0 // xvreplgr2vr.w
-                       case ARNG_4V:
-                               return (0x1DA7C3 << 10), 0x0 // xvreplgr2vr.d
+               default:
+                       // vmov Rn, Vd.<T>
+                       switch a {
+                       case AVMOVQ:
+                               switch tarng {
+                               case ARNG_16B:
+                                       return (0x1CA7C0 << 10), 0x0 // vreplgr2vr.b
+                               case ARNG_8H:
+                                       return (0x1CA7C1 << 10), 0x0 // vreplgr2vr.h
+                               case ARNG_4W:
+                                       return (0x1CA7C2 << 10), 0x0 // vreplgr2vr.w
+                               case ARNG_2V:
+                                       return (0x1CA7C3 << 10), 0x0 // vreplgr2vr.d
+                               }
+                       case AXVMOVQ:
+                               switch tarng {
+                               case ARNG_32B:
+                                       return (0x1DA7C0 << 10), 0x0 // xvreplgr2vr.b
+                               case ARNG_16H:
+                                       return (0x1DA7C1 << 10), 0x0 // xvreplgr2vr.h
+                               case ARNG_8W:
+                                       return (0x1DA7C2 << 10), 0x0 // xvreplgr2vr.w
+                               case ARNG_4V:
+                                       return (0x1DA7C3 << 10), 0x0 // xvreplgr2vr.d
+                               }
                        }
                }
 
index 0818389c8d9366f2706a709254f7529fd85a4790..a990b230892623603d74350dc227b095a03dbaa7 100644 (file)
@@ -203,6 +203,23 @@ Note: In the following sections 3.1 to 3.6, "ui4" (4-bit unsigned int immediate)
        VMOVQ Vj.W[index], Vd.W4  | vreplvei.w vd, vj, ui2 | for i in range(4) : VR[vd].w[i] = VR[vj].w[ui2]
        VMOVQ Vj.V[index], Vd.V2  | vreplvei.d vd, vj, ui1 | for i in range(2) : VR[vd].d[i] = VR[vj].d[ui1]
 
+3.7 Load data from memory and broadcast to each element of a vector register.
+
+       Instruction format:
+               VMOVQ    offset(Rj), <Vd>.<T>
+
+       Mapping between Go and platform assembly:
+          Go assembly              |     platform assembly      |                                semantics
+       -------------------------------------------------------------------------------------------------------------------------------------------------------
+        VMOVQ  offset(Rj), Vd.B16  |   vldrepl.b  Vd, Rj, si12  |  for i in range(16): VR[vd].b[i] = load 8 bit memory data from (GR[rj]+SignExtend(si12))
+        VMOVQ  offset(Rj), Vd.H8   |   vldrepl.h  Vd, Rj, si11  |  for i in range(8) : VR[vd].h[i] = load 16 bit memory data from (GR[rj]+SignExtend(si11<<1))
+        VMOVQ  offset(Rj), Vd.W4   |   vldrepl.w  Vd, Rj, si10  |  for i in range(4) : VR[vd].w[i] = load 32 bit memory data from (GR[rj]+SignExtend(si10<<2))
+        VMOVQ  offset(Rj), Vd.V2   |   vldrepl.d  Vd, Rj, si9   |  for i in range(2) : VR[vd].d[i] = load 64 bit memory data from (GR[rj]+SignExtend(si9<<3))
+       XVMOVQ  offset(Rj), Xd.B32  |  xvldrepl.b  Xd, Rj, si12  |  for i in range(32): XR[xd].b[i] = load 8 bit memory data from (GR[rj]+SignExtend(si12))
+       XVMOVQ  offset(Rj), Xd.H16  |  xvldrepl.h  Xd, Rj, si11  |  for i in range(16): XR[xd].h[i] = load 16 bit memory data from (GR[rj]+SignExtend(si11<<1))
+       XVMOVQ  offset(Rj), Xd.W8   |  xvldrepl.w  Xd, Rj, si10  |  for i in range(8) : XR[xd].w[i] = load 32 bit memory data from (GR[rj]+SignExtend(si10<<2))
+       XVMOVQ  offset(Rj), Xd.V4   |  xvldrepl.d  Xd, Rj, si9   |  for i in range(4) : XR[xd].d[i] = load 64 bit memory data from (GR[rj]+SignExtend(si9<<3))
+
 # Special instruction encoding definition and description on LoongArch
 
  1. DBAR hint encoding for LA664(Loongson 3A6000) and later micro-architectures, paraphrased