offsetX86HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2)
offsetX86HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
+ offsetLOONG64HasLSX = unsafe.Offsetof(cpu.Loong64.HasLSX)
+ offsetLOONG64HasLASX = unsafe.Offsetof(cpu.Loong64.HasLASX)
+
offsetS390xHasVX = unsafe.Offsetof(cpu.S390X.HasVX)
offsetPPC64HasPOWER9 = unsafe.Offsetof(cpu.PPC64.IsPOWER9)
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-//go:build !amd64 && !arm && !arm64 && !mips64le && !mips64 && !ppc64le && !ppc64 && !riscv64 && !s390x
+//go:build !amd64 && !arm && !arm64 && !loong64 && !mips64le && !mips64 && !ppc64le && !ppc64 && !riscv64 && !s390x
package bytealg
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Count<ABIInternal>(SB),NOSPLIT,$0-40
+ // R4 = b_base
+ // R5 = b_len
+ // R6 = b_cap (unused)
+ // R7 = byte to count
+ AND $0xff, R7, R6
+ JMP countbody<>(SB)
+
+TEXT ·CountString<ABIInternal>(SB),NOSPLIT,$0-32
+ // R4 = s_base
+ // R5 = s_len
+ // R6 = byte to count
+ AND $0xff, R6
+ JMP countbody<>(SB)
+
+// input:
+// R4 = s_base
+// R5 = s_len
+// R6 = byte to count
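+// output:
+// R4 = count of bytes in s equal to the byte in R6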
+TEXT countbody<>(SB),NOSPLIT,$0
+ MOVV R0, R7 // count
+
+ // short path to handle 0-byte case
+ BEQ R5, done
+
+	// jump directly to tail if length < 8
+ MOVV $8, R8
+ BLT R5, R8, tail
+
+	// Implemented using 256-bit SIMD instructions
+lasxCountBody:
+ MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R8
+ BEQ R8, lsxCountBody
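+	// broadcast the byte to count into every byte lane of X0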
+ XVMOVQ R6, X0.B32
+
+ // jump directly to lasx32 if length < 128
+ MOVV $128, R8
+ BLT R5, R8, lasx32
+lasx128:
+lasx128Loop:
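+	// Each iteration: load 128 bytes, compare every byte against X0
+	// (XVSEQB yields 0xFF for a match), mask the result to 0/1 with
+	// XVANDB, popcount each 64-bit lane with XVPCNTV, then fold the
+	// lane sums into the running count in R7.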
+ XVMOVQ 0(R4), X1
+ XVMOVQ 32(R4), X2
+ XVMOVQ 64(R4), X3
+ XVMOVQ 96(R4), X4
+
+ XVSEQB X0, X1, X5
+ XVSEQB X0, X2, X6
+ XVSEQB X0, X3, X7
+ XVSEQB X0, X4, X8
+
+ XVANDB $1, X5, X5
+ XVANDB $1, X6, X6
+ XVANDB $1, X7, X7
+ XVANDB $1, X8, X8
+
+ XVPCNTV X5, X1
+ XVPCNTV X6, X2
+ XVPCNTV X7, X3
+ XVPCNTV X8, X4
+
+ XVADDV X2, X1
+ XVADDV X4, X3
+ XVADDV X3, X1
+
+ XVMOVQ X1.V[0], R9
+ XVMOVQ X1.V[1], R10
+ XVMOVQ X1.V[2], R11
+ XVMOVQ X1.V[3], R12
+
+ ADDV R9, R10
+ ADDV R11, R12
+ ADDV R10, R7
+ ADDV R12, R7
+
+ ADDV $-128, R5
+ ADDV $128, R4
+ BGE R5, R8, lasx128Loop
+
+lasx32:
+ // jump directly to lasx8 if length < 32
+ MOVV $32, R8
+ BLT R5, R8, lasx8
+lasx32Loop:
+ XVMOVQ 0(R4), X1
+ XVSEQB X0, X1, X2
+ XVANDB $1, X2, X2
+ XVPCNTV X2, X1
+ XVMOVQ X1.V[0], R9
+ XVMOVQ X1.V[1], R10
+ XVMOVQ X1.V[2], R11
+ XVMOVQ X1.V[3], R12
+ ADDV R9, R10
+ ADDV R11, R12
+ ADDV R10, R7
+ ADDV R12, R7
+ ADDV $-32, R5
+ ADDV $32, R4
+ BGE R5, R8, lasx32Loop
+lasx8:
+ // jump directly to tail if length < 8
+ MOVV $8, R8
+ BLT R5, R8, tail
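+	// V0 is the low 128 bits of X0 and already holds the byte to count
+	// in every lane, so the remaining 8..31 bytes are handled with LSX.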
+lasx8Loop:
+ MOVV 0(R4), R9
+ VMOVQ R9, V1.V[0]
+ VSEQB V0, V1, V2
+ VANDB $1, V2, V2
+ VPCNTV V2, V1
+
+ VMOVQ V1.V[0], R9
+ ADDV R9, R7
+ ADDV $-8, R5
+ ADDV $8, R4
+ BGE R5, R8, lasx8Loop
+ JMP tail
+
+	// Implemented using 128-bit SIMD instructions
+lsxCountBody:
+ MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R8
+ BEQ R8, genericCountBody
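+	// The LSX path mirrors the LASX path above with 128-bit vectors:
+	// 64 bytes per iteration, then 16-byte and 8-byte chunks.
+	// broadcast the byte to count into every byte lane of V0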
+ VMOVQ R6, V0.B16
+
+ // jump directly to lsx16 if length < 64
+ MOVV $64, R8
+ BLT R5, R8, lsx16
+lsx64:
+lsx64Loop:
+ VMOVQ 0(R4), V1
+ VMOVQ 16(R4), V2
+ VMOVQ 32(R4), V3
+ VMOVQ 48(R4), V4
+
+ VSEQB V0, V1, V5
+ VSEQB V0, V2, V6
+ VSEQB V0, V3, V7
+ VSEQB V0, V4, V8
+
+ VANDB $1, V5, V5
+ VANDB $1, V6, V6
+ VANDB $1, V7, V7
+ VANDB $1, V8, V8
+
+ VPCNTV V5, V1
+ VPCNTV V6, V2
+ VPCNTV V7, V3
+ VPCNTV V8, V4
+
+ VADDV V2, V1
+ VADDV V4, V3
+ VADDV V3, V1
+
+ VMOVQ V1.V[0], R9
+ VMOVQ V1.V[1], R10
+ ADDV R9, R7
+ ADDV R10, R7
+
+ ADDV $-64, R5
+ ADDV $64, R4
+ BGE R5, R8, lsx64Loop
+
+lsx16:
+ // jump directly to lsx8 if length < 16
+ MOVV $16, R8
+ BLT R5, R8, lsx8
+lsx16Loop:
+ VMOVQ 0(R4), V1
+ VSEQB V0, V1, V2
+ VANDB $1, V2, V2
+ VPCNTV V2, V1
+ VMOVQ V1.V[0], R9
+ VMOVQ V1.V[1], R10
+ ADDV R9, R7
+ ADDV R10, R7
+ ADDV $-16, R5
+ ADDV $16, R4
+ BGE R5, R8, lsx16Loop
+lsx8:
+ // jump directly to tail if length < 8
+ MOVV $8, R8
+ BLT R5, R8, tail
+lsx8Loop:
+ MOVV 0(R4), R9
+ VMOVQ R9, V1.V[0]
+ VSEQB V0, V1, V2
+ VANDB $1, V2, V2
+ VPCNTV V2, V1
+
+ VMOVQ V1.V[0], R9
+ ADDV R9, R7
+ ADDV $-8, R5
+ ADDV $8, R4
+ BGE R5, R8, lsx8Loop
+ JMP tail
+
+ // Implemented using general instructions
+genericCountBody:
+ MOVV $4, R8
+ MOVV $1, R9
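+	// Process 4 bytes per iteration, working backwards from the end of
+	// the buffer: extract each byte with BSTRPICKW, XOR it against the
+	// byte to count, and use MASKNEZ to add R9 (1) to the count only
+	// when the XOR result is zero.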
+genericLoop:
+ BLT R5, R8, tail
+ ADDV $-4, R5
+ MOVWU (R4)(R5), R10
+ BSTRPICKW $7, R10, $0, R11
+ BSTRPICKW $15, R10, $8, R12
+ XOR R6, R11
+ XOR R6, R12
+ MASKNEZ R11, R9, R13
+ MASKNEZ R12, R9, R14
+ ADDV R13, R7
+ ADDV R14, R7
+ BSTRPICKW $23, R10, $16, R11
+ BSTRPICKW $31, R10, $24, R12
+ XOR R6, R11
+ XOR R6, R12
+ MASKNEZ R11, R9, R13
+ MASKNEZ R12, R9, R14
+ ADDV R13, R7
+ ADDV R14, R7
+ JMP genericLoop
+
+ // Work with tail shorter than 8 bytes
+tail:
+ BEQ R5, done
+ ADDV $-1, R5
+ MOVBU (R4)(R5), R8
+ BNE R6, R8, tail
+ ADDV $1, R7
+ JMP tail
+done:
+ MOVV R7, R4
+ RET
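
For reference, every path above (LASX, LSX, generic, tail) computes the same
result as the portable fallback in count_generic.go. A minimal sketch of those
semantics in plain Go (illustration only, not the actual fallback source):

	// Count returns the number of bytes in b equal to c.
	// CountString does the same over a string.
	func Count(b []byte, c byte) int {
		n := 0
		for _, x := range b {
			if x == c {
				n++
			}
		}
		return n
	}
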
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-//go:build amd64 || arm || arm64 || mips64le || mips64 || ppc64le || ppc64 || riscv64 || s390x
+//go:build amd64 || arm || arm64 || loong64 || mips64le || mips64 || ppc64le || ppc64 || riscv64 || s390x
package bytealg
var Loong64 struct {
_ CacheLinePad
HasLSX bool // support 128-bit vector extension
+ HasLASX bool // support 256-bit vector extension
HasCRC32 bool // support CRC instruction
HasLAMCAS bool // support AMCAS[_DB].{B/H/W/D}
HasLAM_BH bool // support AM{SWAP/ADD}[_DB].{B/H} instruction
func doinit() {
options = []option{
{Name: "lsx", Feature: &Loong64.HasLSX},
+ {Name: "lasx", Feature: &Loong64.HasLASX},
{Name: "crc32", Feature: &Loong64.HasCRC32},
{Name: "lamcas", Feature: &Loong64.HasLAMCAS},
{Name: "lam_bh", Feature: &Loong64.HasLAM_BH},
// HWCAP bits. These are exposed by the Linux kernel.
const (
- hwcap_LOONGARCH_LSX = 1 << 4
+ hwcap_LOONGARCH_LSX = 1 << 4
+ hwcap_LOONGARCH_LASX = 1 << 5
)
func hwcapInit() {
// TODO: Features that require kernel support like LSX and LASX can
// be detected here once needed in std library or by the compiler.
Loong64.HasLSX = hwcIsSet(HWCap, hwcap_LOONGARCH_LSX)
+ Loong64.HasLASX = hwcIsSet(HWCap, hwcap_LOONGARCH_LASX)
}
func hwcIsSet(hwc uint, val uint) bool {