Cypherpunks repositories - gostls13.git/commitdiff
cmd/internal/obj/loong64: fix the usage of offset in the instructions [X]VLDREPL...
author: Xiaolin Zhao <zhaoxiaolin@loongson.cn>
Fri, 29 Aug 2025 08:20:16 +0000 (16:20 +0800)
committer: Gopher Robot <gobot@golang.org>
Thu, 4 Sep 2025 16:22:33 +0000 (09:22 -0700)
The previously defined usage of offset was ambiguous and not easy to understand.
For example, to fetch 4 bytes of data from the address base+8 and
broadcast it to each word element of vector register V5, the assembly
implementation is as follows:
previous: VMOVQ 2(base), V5.W4
current:  VMOVQ 8(base), V5.W4

Change-Id: I8bc84e35033ab63bd10f4c61618789f94314f78c
Reviewed-on: https://go-review.googlesource.com/c/go/+/699875
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Auto-Submit: Michael Pratt <mpratt@google.com>
Reviewed-by: Meidan Li <limeidan@loongson.cn>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>

src/cmd/asm/internal/asm/testdata/loong64enc1.s
src/cmd/internal/obj/loong64/asm.go
src/cmd/internal/obj/loong64/doc.go
src/internal/chacha8rand/chacha8_loong64.s

index 63676cc785967c74af2c736bba93a1b9690700d5..c5c6a4479a40601b347fac3938024d569d801343 100644 (file)
@@ -538,13 +538,29 @@ lable2:
 
        // Load data from memory and broadcast to each element of a vector register: VMOVQ    offset(Rj), <Vd>.<T>
        VMOVQ           (R4), V0.B16    // 80008030
-       VMOVQ           1(R4), V1.H8    // 81044030
-       VMOVQ           2(R4), V2.W4    // 82082030
-       VMOVQ           3(R4), V3.V2    // 830c1030
+       VMOVQ           1(R4), V0.B16   // 80048030
+       VMOVQ           -3(R4), V0.B16  // 80f4bf30
+       VMOVQ           (R4), V1.H8     // 81004030
+       VMOVQ           2(R4), V1.H8    // 81044030
+       VMOVQ           -6(R4), V1.H8   // 81f45f30
+       VMOVQ           (R4), V2.W4     // 82002030
+       VMOVQ           8(R4), V2.W4    // 82082030
+       VMOVQ           -12(R4), V2.W4  // 82f42f30
+       VMOVQ           (R4), V3.V2     // 83001030
+       VMOVQ           24(R4), V3.V2   // 830c1030
+       VMOVQ           -16(R4), V3.V2  // 83f81730
        XVMOVQ          (R4), X0.B32    // 80008032
-       XVMOVQ          1(R4), X1.H16   // 81044032
-       XVMOVQ          2(R4), X2.W8    // 82082032
-       XVMOVQ          3(R4), X3.V4    // 830c1032
+       XVMOVQ          1(R4), X0.B32   // 80048032
+       XVMOVQ          -5(R4), X0.B32  // 80ecbf32
+       XVMOVQ          (R4), X1.H16    // 81004032
+       XVMOVQ          2(R4), X1.H16   // 81044032
+       XVMOVQ          -10(R4), X1.H16 // 81ec5f32
+       XVMOVQ          (R4), X2.W8     // 82002032
+       XVMOVQ          8(R4), X2.W8    // 82082032
+       XVMOVQ          -20(R4), X2.W8  // 82ec2f32
+       XVMOVQ          (R4), X3.V4     // 83001032
+       XVMOVQ          24(R4), X3.V4   // 830c1032
+       XVMOVQ          -24(R4), X3.V4  // 83f41732
 
        // VSEQ{B,H,W,V}, XVSEQ{B,H,W,V} instruction
        VSEQB           V1, V2, V3      // 43040070
index 1b982f6c86fa53b2c0d11d1aed647179b1470944..35b33b937684285aa3bdcb38187d34c26c8ffca4 100644 (file)
@@ -1983,6 +1983,18 @@ func OP_12IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
        return op | (i&0xFFF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
 }
 
+func OP_11IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
+       return op | (i&0x7FF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
+}
+
+func OP_10IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
+       return op | (i&0x3FF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
+}
+
+func OP_9IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
+       return op | (i&0x1FF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
+}
+
 func OP_8IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
        return op | (i&0xFF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
 }
@@ -2535,7 +2547,28 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
                si := c.regoff(&p.From)
                Rj := uint32(p.From.Reg & EXT_REG_MASK)
                Vd := uint32(p.To.Reg & EXT_REG_MASK)
-               o1 = v | uint32(si<<10) | (Rj << 5) | Vd
+               switch v & 0xc00000 {
+               case 0x800000: // [x]vldrepl.b
+                       o1 = OP_12IRR(v, uint32(si), Rj, Vd)
+               case 0x400000: // [x]vldrepl.h
+                       if si&1 != 0 {
+                               c.ctxt.Diag("%v: offset must be a multiple of 2.\n", p)
+                       }
+                       o1 = OP_11IRR(v, uint32(si>>1), Rj, Vd)
+               case 0x0:
+                       switch v & 0x300000 {
+                       case 0x200000: // [x]vldrepl.w
+                               if si&3 != 0 {
+                                       c.ctxt.Diag("%v: offset must be a multiple of 4.\n", p)
+                               }
+                               o1 = OP_10IRR(v, uint32(si>>2), Rj, Vd)
+                       case 0x100000: // [x]vldrepl.d
+                               if si&7 != 0 {
+                                       c.ctxt.Diag("%v: offset must be a multiple of 8.\n", p)
+                               }
+                               o1 = OP_9IRR(v, uint32(si>>3), Rj, Vd)
+                       }
+               }
 
        case 47: // preld  offset(Rbase), $hint
                offs := c.regoff(&p.From)
index 6c8f2618a2cb73c326f2733b2e8c2019d8c63fa1..20c5a9e0a6faa8fc02aa7464dab3ab7619557b72 100644 (file)
@@ -220,6 +220,15 @@ Note: In the following sections 3.1 to 3.6, "ui4" (4-bit unsigned int immediate)
        XVMOVQ  offset(Rj), Xd.W8   |  xvldrepl.w  Xd, Rj, si10  |  for i in range(8) : XR[xd].w[i] = load 32 bit memory data from (GR[rj]+SignExtend(si10<<2))
        XVMOVQ  offset(Rj), Xd.V4   |  xvldrepl.d  Xd, Rj, si9   |  for i in range(4) : XR[xd].d[i] = load 64 bit memory data from (GR[rj]+SignExtend(si9<<3))
 
+       note: In Go assembly, for ease of understanding, the offset represents the actual address offset in bytes.
+             However, during platform encoding, the offset is shifted right by log2 of the element size to increase the encodable offset range, as follows:
+
+          Go assembly           |      platform assembly
+         VMOVQ  1(R4), V5.B16    |      vldrepl.b  v5, r4, $1
+         VMOVQ  2(R4), V5.H8     |      vldrepl.h  v5, r4, $1
+         VMOVQ  8(R4), V5.W4     |      vldrepl.w  v5, r4, $2
+         VMOVQ  8(R4), V5.V2     |      vldrepl.d  v5, r4, $1
+
 # Special instruction encoding definition and description on LoongArch
 
  1. DBAR hint encoding for LA664(Loongson 3A6000) and later micro-architectures, paraphrased
index 5e6857ed3a65982f847c3446e3855a0f28361198..73a1e5bf05f659ec14020eab8180f1bbdbecc65c 100644 (file)
@@ -50,22 +50,22 @@ lsx_chacha8:
 
        // load constants
        VMOVQ   (R10), V0.W4
-       VMOVQ   1(R10), V1.W4
-       VMOVQ   2(R10), V2.W4
-       VMOVQ   3(R10), V3.W4
+       VMOVQ   4(R10), V1.W4
+       VMOVQ   8(R10), V2.W4
+       VMOVQ   12(R10), V3.W4
 
        // load 4-32bit data from incRotMatrix added to counter
        VMOVQ   (R11), V30
 
        // load seed
        VMOVQ   (R4), V4.W4
-       VMOVQ   1(R4), V5.W4
-       VMOVQ   2(R4), V6.W4
-       VMOVQ   3(R4), V7.W4
-       VMOVQ   4(R4), V8.W4
-       VMOVQ   5(R4), V9.W4
-       VMOVQ   6(R4), V10.W4
-       VMOVQ   7(R4), V11.W4
+       VMOVQ   4(R4), V5.W4
+       VMOVQ   8(R4), V6.W4
+       VMOVQ   12(R4), V7.W4
+       VMOVQ   16(R4), V8.W4
+       VMOVQ   20(R4), V9.W4
+       VMOVQ   24(R4), V10.W4
+       VMOVQ   28(R4), V11.W4
 
        // load counter and update counter
        VMOVQ   R6, V12.W4