cmd/internal/obj/loong64: fix the usage of offset in the instructions [X]VLDREPL...

author Xiaolin Zhao <zhaoxiaolin@loongson.cn>

Fri, 29 Aug 2025 08:20:16 +0000 (16:20 +0800)

committer Gopher Robot <gobot@golang.org>

Thu, 4 Sep 2025 16:22:33 +0000 (09:22 -0700)
author Xiaolin Zhao <zhaoxiaolin@loongson.cn>
Fri, 29 Aug 2025 08:20:16 +0000 (16:20 +0800)
committer Gopher Robot <gobot@golang.org>
Thu, 4 Sep 2025 16:22:33 +0000 (09:22 -0700)
diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s

index 63676cc785967c74af2c736bba93a1b9690700d5..c5c6a4479a40601b347fac3938024d569d801343 100644 (file)
--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s
+++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s
@@ -538,13 +538,29 @@ lable2:
  
         // Load data from memory and broadcast to each element of a vector register: VMOVQ    offset(Rj), <Vd>.<T>
         VMOVQ           (R4), V0.B16    // 80008030
-       VMOVQ           1(R4), V1.H8    // 81044030
-       VMOVQ           2(R4), V2.W4    // 82082030
-       VMOVQ           3(R4), V3.V2    // 830c1030
+       VMOVQ           1(R4), V0.B16   // 80048030
+       VMOVQ           -3(R4), V0.B16  // 80f4bf30
+       VMOVQ           (R4), V1.H8     // 81004030
+       VMOVQ           2(R4), V1.H8    // 81044030
+       VMOVQ           -6(R4), V1.H8   // 81f45f30
+       VMOVQ           (R4), V2.W4     // 82002030
+       VMOVQ           8(R4), V2.W4    // 82082030
+       VMOVQ           -12(R4), V2.W4  // 82f42f30
+       VMOVQ           (R4), V3.V2     // 83001030
+       VMOVQ           24(R4), V3.V2   // 830c1030
+       VMOVQ           -16(R4), V3.V2  // 83f81730
         XVMOVQ          (R4), X0.B32    // 80008032
-       XVMOVQ          1(R4), X1.H16   // 81044032
-       XVMOVQ          2(R4), X2.W8    // 82082032
-       XVMOVQ          3(R4), X3.V4    // 830c1032
+       XVMOVQ          1(R4), X0.B32   // 80048032
+       XVMOVQ          -5(R4), X0.B32  // 80ecbf32
+       XVMOVQ          (R4), X1.H16    // 81004032
+       XVMOVQ          2(R4), X1.H16   // 81044032
+       XVMOVQ          -10(R4), X1.H16 // 81ec5f32
+       XVMOVQ          (R4), X2.W8     // 82002032
+       XVMOVQ          8(R4), X2.W8    // 82082032
+       XVMOVQ          -20(R4), X2.W8  // 82ec2f32
+       XVMOVQ          (R4), X3.V4     // 83001032
+       XVMOVQ          24(R4), X3.V4   // 830c1032
+       XVMOVQ          -24(R4), X3.V4  // 83f41732
  
         // VSEQ{B,H,W,V}, XVSEQ{B,H,W,V} instruction
         VSEQB           V1, V2, V3      // 43040070
diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go

index 1b982f6c86fa53b2c0d11d1aed647179b1470944..35b33b937684285aa3bdcb38187d34c26c8ffca4 100644 (file)
--- a/src/cmd/internal/obj/loong64/asm.go
+++ b/src/cmd/internal/obj/loong64/asm.go
@@ -1983,6 +1983,18 @@ func OP_12IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
         return op | (i&0xFFF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
  }
  
+func OP_11IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
+       return op | (i&0x7FF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
+}
+
+func OP_10IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
+       return op | (i&0x3FF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
+}
+
+func OP_9IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
+       return op | (i&0x1FF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
+}
+
  func OP_8IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
         return op | (i&0xFF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
  }
@@ -2535,7 +2547,28 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
                 si := c.regoff(&p.From)
                 Rj := uint32(p.From.Reg & EXT_REG_MASK)
                 Vd := uint32(p.To.Reg & EXT_REG_MASK)
-               o1 = v | uint32(si<<10) | (Rj << 5) | Vd
+               switch v & 0xc00000 {
+               case 0x800000: // [x]vldrepl.b
+                       o1 = OP_12IRR(v, uint32(si), Rj, Vd)
+               case 0x400000: // [x]vldrepl.h
+                       if si&1 != 0 {
+                               c.ctxt.Diag("%v: offset must be a multiple of 2.\n", p)
+                       }
+                       o1 = OP_11IRR(v, uint32(si>>1), Rj, Vd)
+               case 0x0:
+                       switch v & 0x300000 {
+                       case 0x200000: // [x]vldrepl.w
+                               if si&3 != 0 {
+                                       c.ctxt.Diag("%v: offset must be a multiple of 4.\n", p)
+                               }
+                               o1 = OP_10IRR(v, uint32(si>>2), Rj, Vd)
+                       case 0x100000: // [x]vldrepl.d
+                               if si&7 != 0 {
+                                       c.ctxt.Diag("%v: offset must be a multiple of 8.\n", p)
+                               }
+                               o1 = OP_9IRR(v, uint32(si>>3), Rj, Vd)
+                       }
+               }
  
         case 47: // preld  offset(Rbase), $hint
                 offs := c.regoff(&p.From)
diff --git a/src/cmd/internal/obj/loong64/doc.go b/src/cmd/internal/obj/loong64/doc.go

index 6c8f2618a2cb73c326f2733b2e8c2019d8c63fa1..20c5a9e0a6faa8fc02aa7464dab3ab7619557b72 100644 (file)
--- a/src/cmd/internal/obj/loong64/doc.go
+++ b/src/cmd/internal/obj/loong64/doc.go
@@ -220,6 +220,15 @@ Note: In the following sections 3.1 to 3.6, "ui4" (4-bit unsigned int immediate)
         XVMOVQ  offset(Rj), Xd.W8   |  xvldrepl.w  Xd, Rj, si10  |  for i in range(8) : XR[xd].w[i] = load 32 bit memory data from (GR[rj]+SignExtend(si10<<2))
         XVMOVQ  offset(Rj), Xd.V4   |  xvldrepl.d  Xd, Rj, si9   |  for i in range(4) : XR[xd].d[i] = load 64 bit memory data from (GR[rj]+SignExtend(si9<<3))
  
+       Note: In Go assembly, for ease of understanding, the offset is the actual address offset in bytes.
+             In the platform encoding, however, the offset is shifted right by 0/1/2/3 bits (for B/H/W/V elements) to increase the encodable offset range, as follows:
+
+          Go assembly           |      platform assembly
+         VMOVQ  1(R4), V5.B16    |      vldrepl.b  v5, r4, $1
+         VMOVQ  2(R4), V5.H8     |      vldrepl.h  v5, r4, $1
+         VMOVQ  8(R4), V5.W4     |      vldrepl.w  v5, r4, $2
+         VMOVQ  8(R4), V5.V2     |      vldrepl.d  v5, r4, $1
+
  # Special instruction encoding definition and description on LoongArch
  
   1. DBAR hint encoding for LA664(Loongson 3A6000) and later micro-architectures, paraphrased
diff --git a/src/internal/chacha8rand/chacha8_loong64.s b/src/internal/chacha8rand/chacha8_loong64.s

index 5e6857ed3a65982f847c3446e3855a0f28361198..73a1e5bf05f659ec14020eab8180f1bbdbecc65c 100644 (file)
--- a/src/internal/chacha8rand/chacha8_loong64.s
+++ b/src/internal/chacha8rand/chacha8_loong64.s
@@ -50,22 +50,22 @@ lsx_chacha8:
  
         // load contants
         VMOVQ   (R10), V0.W4
-       VMOVQ   1(R10), V1.W4
-       VMOVQ   2(R10), V2.W4
-       VMOVQ   3(R10), V3.W4
+       VMOVQ   4(R10), V1.W4
+       VMOVQ   8(R10), V2.W4
+       VMOVQ   12(R10), V3.W4
  
         // load 4-32bit data from incRotMatrix added to counter
         VMOVQ   (R11), V30
  
         // load seed
         VMOVQ   (R4), V4.W4
-       VMOVQ   1(R4), V5.W4
-       VMOVQ   2(R4), V6.W4
-       VMOVQ   3(R4), V7.W4
-       VMOVQ   4(R4), V8.W4
-       VMOVQ   5(R4), V9.W4
-       VMOVQ   6(R4), V10.W4
-       VMOVQ   7(R4), V11.W4
+       VMOVQ   4(R4), V5.W4
+       VMOVQ   8(R4), V6.W4
+       VMOVQ   12(R4), V7.W4
+       VMOVQ   16(R4), V8.W4
+       VMOVQ   20(R4), V9.W4
+       VMOVQ   24(R4), V10.W4
+       VMOVQ   28(R4), V11.W4
  
         // load counter and update counter
         VMOVQ   R6, V12.W4
author	Xiaolin Zhao <zhaoxiaolin@loongson.cn>
	Fri, 29 Aug 2025 08:20:16 +0000 (16:20 +0800)
committer	Gopher Robot <gobot@golang.org>
	Thu, 4 Sep 2025 16:22:33 +0000 (09:22 -0700)
src/cmd/asm/internal/asm/testdata/loong64enc1.s		patch \| blob \| history
src/cmd/internal/obj/loong64/asm.go		patch \| blob \| history
src/cmd/internal/obj/loong64/doc.go		patch \| blob \| history
src/internal/chacha8rand/chacha8_loong64.s		patch \| blob \| history