From: Xiaolin Zhao
Date: Fri, 29 Aug 2025 08:20:16 +0000 (+0800)
Subject: cmd/internal/obj/loong64: fix the usage of offset in the instructions [X]VLDREPL...
X-Git-Tag: go1.26rc1~966
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=b8cc907425c4b851d2b941cf689cf8177ea8a153;p=gostls13.git

cmd/internal/obj/loong64: fix the usage of offset in the instructions [X]VLDREPL.{B/H/W/D}

The previously defined usage of offset was ambiguous and not easy to
understand. For example, to fetch 4 bytes of data from the address
base+8 and broadcast it to each word element of vector register V5,
the assembly implementation is as follows:

    previous: VMOVQ  2(base), V5.W4
    current:  VMOVQ  8(base), V5.W4

Change-Id: I8bc84e35033ab63bd10f4c61618789f94314f78c
Reviewed-on: https://go-review.googlesource.com/c/go/+/699875
Reviewed-by: Cherry Mui
Reviewed-by: Michael Pratt
Reviewed-by: abner chenc
Auto-Submit: Michael Pratt
Reviewed-by: Meidan Li
LUCI-TryBot-Result: Go LUCI
---

diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s
index 63676cc785..c5c6a4479a 100644
--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s
+++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s
@@ -538,13 +538,29 @@ lable2:
 	// Load data from memory and broadcast to each element of a vector register: VMOVQ offset(Rj), <Vd>.<T>
 	VMOVQ	(R4), V0.B16	// 80008030
-	VMOVQ	1(R4), V1.H8	// 81044030
-	VMOVQ	2(R4), V2.W4	// 82082030
-	VMOVQ	3(R4), V3.V2	// 830c1030
+	VMOVQ	1(R4), V0.B16	// 80048030
+	VMOVQ	-3(R4), V0.B16	// 80f4bf30
+	VMOVQ	(R4), V1.H8	// 81004030
+	VMOVQ	2(R4), V1.H8	// 81044030
+	VMOVQ	-6(R4), V1.H8	// 81f45f30
+	VMOVQ	(R4), V2.W4	// 82002030
+	VMOVQ	8(R4), V2.W4	// 82082030
+	VMOVQ	-12(R4), V2.W4	// 82f42f30
+	VMOVQ	(R4), V3.V2	// 83001030
+	VMOVQ	24(R4), V3.V2	// 830c1030
+	VMOVQ	-16(R4), V3.V2	// 83f81730
 	XVMOVQ	(R4), X0.B32	// 80008032
-	XVMOVQ	1(R4), X1.H16	// 81044032
-	XVMOVQ	2(R4), X2.W8	// 82082032
-	XVMOVQ	3(R4), X3.V4	// 830c1032
+	XVMOVQ	1(R4), X0.B32	// 80048032
+	XVMOVQ	-5(R4), X0.B32	// 80ecbf32
+	XVMOVQ	(R4), X1.H16	// 81004032
+	XVMOVQ	2(R4), X1.H16	// 81044032
+	XVMOVQ	-10(R4), X1.H16	// 81ec5f32
+	XVMOVQ	(R4), X2.W8	// 82002032
+	XVMOVQ	8(R4), X2.W8	// 82082032
+	XVMOVQ	-20(R4), X2.W8	// 82ec2f32
+	XVMOVQ	(R4), X3.V4	// 83001032
+	XVMOVQ	24(R4), X3.V4	// 830c1032
+	XVMOVQ	-24(R4), X3.V4	// 83f41732

 	// VSEQ{B,H,W,V}, XVSEQ{B,H,W,V} instruction
 	VSEQB	V1, V2, V3	// 43040070
diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go
index 1b982f6c86..35b33b9376 100644
--- a/src/cmd/internal/obj/loong64/asm.go
+++ b/src/cmd/internal/obj/loong64/asm.go
@@ -1983,6 +1983,18 @@ func OP_12IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
 	return op | (i&0xFFF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
 }

+func OP_11IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
+	return op | (i&0x7FF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
+}
+
+func OP_10IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
+	return op | (i&0x3FF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
+}
+
+func OP_9IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
+	return op | (i&0x1FF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
+}
+
 func OP_8IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
 	return op | (i&0xFF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
 }
@@ -2535,7 +2547,28 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		si := c.regoff(&p.From)
 		Rj := uint32(p.From.Reg & EXT_REG_MASK)
 		Vd := uint32(p.To.Reg & EXT_REG_MASK)
-		o1 = v | uint32(si<<10) | (Rj << 5) | Vd
+		switch v & 0xc00000 {
+		case 0x800000: // [x]vldrepl.b
+			o1 = OP_12IRR(v, uint32(si), Rj, Vd)
+		case 0x400000: // [x]vldrepl.h
+			if si&1 != 0 {
+				c.ctxt.Diag("%v: offset must be a multiple of 2.\n", p)
+			}
+			o1 = OP_11IRR(v, uint32(si>>1), Rj, Vd)
+		case 0x0:
+			switch v & 0x300000 {
+			case 0x200000: // [x]vldrepl.w
+				if si&3 != 0 {
+					c.ctxt.Diag("%v: offset must be a multiple of 4.\n", p)
+				}
+				o1 = OP_10IRR(v, uint32(si>>2), Rj, Vd)
+			case 0x100000: // [x]vldrepl.d
+				if si&7 != 0 {
+					c.ctxt.Diag("%v: offset must be a multiple of 8.\n", p)
+				}
+				o1 = OP_9IRR(v, uint32(si>>3), Rj, Vd)
+			}
+		}

 	case 47: // preld offset(Rbase), $hint
 		offs := c.regoff(&p.From)
diff --git a/src/cmd/internal/obj/loong64/doc.go b/src/cmd/internal/obj/loong64/doc.go
index 6c8f2618a2..20c5a9e0a6 100644
--- a/src/cmd/internal/obj/loong64/doc.go
+++ b/src/cmd/internal/obj/loong64/doc.go
@@ -220,6 +220,15 @@ Note: In the following sections 3.1 to 3.6, "ui4" (4-bit unsigned int immediate)
 	XVMOVQ	offset(Rj), Xd.W8 | xvldrepl.w Xd, Rj, si10 | for i in range(8) : XR[xd].w[i] = load 32 bit memory data from (GR[rj]+SignExtend(si10<<2))
 	XVMOVQ	offset(Rj), Xd.V4 | xvldrepl.d Xd, Rj, si9  | for i in range(4) : XR[xd].d[i] = load 64 bit memory data from (GR[rj]+SignExtend(si9<<3))

+	note: In Go assembly, for ease of understanding, the offset represents the actual address offset.
+	However, during platform encoding, the offset is shifted to increase the encodable offset range, as follows:
+
+	Go assembly            |  platform assembly
+	VMOVQ	1(R4), V5.B16  |  vldrepl.b	v5, r4, $1
+	VMOVQ	2(R4), V5.H8   |  vldrepl.h	v5, r4, $1
+	VMOVQ	8(R4), V5.W4   |  vldrepl.w	v5, r4, $2
+	VMOVQ	8(R4), V5.V2   |  vldrepl.d	v5, r4, $1
+
 # Special instruction encoding definition and description on LoongArch

 1. DBAR hint encoding for LA664(Loongson 3A6000) and later micro-architectures, paraphrased
diff --git a/src/internal/chacha8rand/chacha8_loong64.s b/src/internal/chacha8rand/chacha8_loong64.s
index 5e6857ed3a..73a1e5bf05 100644
--- a/src/internal/chacha8rand/chacha8_loong64.s
+++ b/src/internal/chacha8rand/chacha8_loong64.s
@@ -50,22 +50,22 @@ lsx_chacha8:
 	// load constants
 	VMOVQ	(R10), V0.W4
-	VMOVQ	1(R10), V1.W4
-	VMOVQ	2(R10), V2.W4
-	VMOVQ	3(R10), V3.W4
+	VMOVQ	4(R10), V1.W4
+	VMOVQ	8(R10), V2.W4
+	VMOVQ	12(R10), V3.W4

 	// load 4-32bit data from incRotMatrix added to counter
 	VMOVQ	(R11), V30

 	// load seed
 	VMOVQ	(R4), V4.W4
-	VMOVQ	1(R4), V5.W4
-	VMOVQ	2(R4), V6.W4
-	VMOVQ	3(R4), V7.W4
-	VMOVQ	4(R4), V8.W4
-	VMOVQ	5(R4), V9.W4
-	VMOVQ	6(R4), V10.W4
-	VMOVQ	7(R4), V11.W4
+	VMOVQ	4(R4), V5.W4
+	VMOVQ	8(R4), V6.W4
+	VMOVQ	12(R4), V7.W4
+	VMOVQ	16(R4), V8.W4
+	VMOVQ	20(R4), V9.W4
+	VMOVQ	24(R4), V10.W4
+	VMOVQ	28(R4), V11.W4

 	// load counter and update counter
 	VMOVQ	R6, V12.W4
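
As a rough illustration of the offset rule described above (the byte offset written in
Go assembly must be a multiple of the element size and is divided by that size to form
the si12/si11/si10/si9 immediate of [x]vldrepl.{b,h,w,d}), here is a minimal standalone
Go sketch of that scaling. It is not part of the patch; the function name
scaleVldreplOffset and the error text are hypothetical, introduced only for illustration.

    package main

    import "fmt"

    // scaleVldreplOffset is a sketch of the scaling performed by the patched
    // asmout case: the Go assembly offset must be a multiple of the element
    // size (1, 2, 4 or 8 bytes for [x]vldrepl.{b,h,w,d}), and the encoded
    // immediate is the offset divided by that size.
    func scaleVldreplOffset(offset, elemSize int32) (int32, error) {
    	if offset%elemSize != 0 {
    		return 0, fmt.Errorf("offset %d is not a multiple of %d", offset, elemSize)
    	}
    	return offset / elemSize, nil
    }

    func main() {
    	// VMOVQ 8(R4), V2.W4   -> vldrepl.w v2, r4, $2
    	fmt.Println(scaleVldreplOffset(8, 4))
    	// VMOVQ 24(R4), V3.V2  -> vldrepl.d v3, r4, $3
    	fmt.Println(scaleVldreplOffset(24, 8))
    	// VMOVQ 3(R4), V2.W4   -> rejected: 3 is not 4-byte aligned
    	fmt.Println(scaleVldreplOffset(3, 4))
    }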