From bf00f5dfd6152c00881ce10275ed006e0b991c11 Mon Sep 17 00:00:00 2001 From: David Chase Date: Wed, 20 Aug 2025 17:29:04 -0400 Subject: [PATCH] [dev.simd] simd, cmd/compile: added simd methods for VSHUFP[DS] These are package private, and will be hidden behind other methods in a following CL with a more general interface. Change-Id: Id090a5de06a0e2aed5cc60a11ff627c5e3b9c52d Reviewed-on: https://go-review.googlesource.com/c/go/+/698577 Reviewed-by: Junyang Shao LUCI-TryBot-Result: Go LUCI --- src/cmd/compile/internal/amd64/simdssa.go | 12 +- .../compile/internal/ssa/_gen/simdAMD64.rules | 20 ++ .../compile/internal/ssa/_gen/simdAMD64ops.go | 8 + .../internal/ssa/_gen/simdgenericOps.go | 18 ++ src/cmd/compile/internal/ssa/opGen.go | 256 +++++++++++++++++ src/cmd/compile/internal/ssa/rewriteAMD64.go | 114 ++++++++ .../compile/internal/ssagen/simdintrinsics.go | 18 ++ src/go/build/deps_test.go | 3 + .../_gen/simdgen/ops/Moves/categories.yaml | 55 ++++ src/simd/_gen/simdgen/ops/Moves/go.yaml | 207 +++++++++++++ src/simd/internal/simd_test/helpers_test.go | 90 +----- src/simd/internal/test_helpers/checkslices.go | 123 ++++++++ src/simd/ops_amd64.go | 271 ++++++++++++++++++ src/simd/pkginternal_test.go | 48 ++++ 14 files changed, 1154 insertions(+), 89 deletions(-) create mode 100644 src/simd/internal/test_helpers/checkslices.go create mode 100644 src/simd/pkginternal_test.go diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index 462b046d37..d69740cd96 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -1074,7 +1074,13 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPSHRDD512, ssa.OpAMD64VPSHRDQ128, ssa.OpAMD64VPSHRDQ256, - ssa.OpAMD64VPSHRDQ512: + ssa.OpAMD64VPSHRDQ512, + ssa.OpAMD64VSHUFPS128, + ssa.OpAMD64VSHUFPD128, + ssa.OpAMD64VSHUFPS256, + ssa.OpAMD64VSHUFPS512, + ssa.OpAMD64VSHUFPD256, + ssa.OpAMD64VSHUFPD512: p = simdV21Imm8(s, v) case ssa.OpAMD64VCMPPS512, @@ -1878,7 +1884,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPSHRDD512load, ssa.OpAMD64VPSHRDQ128load, ssa.OpAMD64VPSHRDQ256load, - ssa.OpAMD64VPSHRDQ512load: + ssa.OpAMD64VPSHRDQ512load, + ssa.OpAMD64VSHUFPS512load, + ssa.OpAMD64VSHUFPD512load: p = simdV21loadImm8(s, v) case ssa.OpAMD64VCMPPS512load, diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index b6a7394a73..9db223c04f 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -1279,6 +1279,24 @@ (blendMaskedInt16x32 x y mask) => (VPBLENDMWMasked512 x y (VPMOVVec16x32ToM mask)) (blendMaskedInt32x16 x y mask) => (VPBLENDMDMasked512 x y (VPMOVVec32x16ToM mask)) (blendMaskedInt64x8 x y mask) => (VPBLENDMQMasked512 x y (VPMOVVec64x8ToM mask)) +(concatSelectedConstantFloat32x4 ...) => (VSHUFPS128 ...) +(concatSelectedConstantFloat64x2 ...) => (VSHUFPD128 ...) +(concatSelectedConstantInt32x4 ...) => (VSHUFPS128 ...) +(concatSelectedConstantInt64x2 ...) => (VSHUFPD128 ...) +(concatSelectedConstantUint32x4 ...) => (VSHUFPS128 ...) +(concatSelectedConstantUint64x2 ...) => (VSHUFPD128 ...) +(concatSelectedConstantGroupedFloat32x8 ...) => (VSHUFPS256 ...) +(concatSelectedConstantGroupedFloat32x16 ...) => (VSHUFPS512 ...) +(concatSelectedConstantGroupedFloat64x4 ...) => (VSHUFPD256 ...) +(concatSelectedConstantGroupedFloat64x8 ...) => (VSHUFPD512 ...) +(concatSelectedConstantGroupedInt32x8 ...) 
=> (VSHUFPS256 ...) +(concatSelectedConstantGroupedInt32x16 ...) => (VSHUFPS512 ...) +(concatSelectedConstantGroupedInt64x4 ...) => (VSHUFPD256 ...) +(concatSelectedConstantGroupedInt64x8 ...) => (VSHUFPD512 ...) +(concatSelectedConstantGroupedUint32x8 ...) => (VSHUFPS256 ...) +(concatSelectedConstantGroupedUint32x16 ...) => (VSHUFPS512 ...) +(concatSelectedConstantGroupedUint64x4 ...) => (VSHUFPD256 ...) +(concatSelectedConstantGroupedUint64x8 ...) => (VSHUFPD512 ...) (moveMaskedFloat32x16 x mask) => (VMOVUPSMasked512 x (VPMOVVec32x16ToM mask)) (moveMaskedFloat64x8 x mask) => (VMOVUPDMasked512 x (VPMOVVec64x8ToM mask)) (moveMaskedInt8x64 x mask) => (VMOVDQU8Masked512 x (VPMOVVec8x64ToM mask)) @@ -1993,6 +2011,8 @@ (VPXORQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPXORQMasked512load {sym} [off] x ptr mask mem) (VPBLENDMDMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPBLENDMDMasked512load {sym} [off] x ptr mask mem) (VPBLENDMQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPBLENDMQMasked512load {sym} [off] x ptr mask mem) +(VSHUFPS512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPS512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) +(VSHUFPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) (VPSLLD512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLD512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) (VPSLLQ512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLQ512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) (VPSLLDMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLDMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go index b9f0b866a0..ba91fb3fc9 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go @@ -1256,6 +1256,12 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPSHRDQMasked128", argLength: 3, reg: w2kw, asm: "VPSHRDQ", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSHRDQMasked256", argLength: 3, reg: w2kw, asm: "VPSHRDQ", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSHRDQMasked512", argLength: 3, reg: w2kw, asm: "VPSHRDQ", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VSHUFPS128", argLength: 2, reg: v21, asm: "VSHUFPS", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VSHUFPD128", argLength: 2, reg: v21, asm: "VSHUFPD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VSHUFPS256", argLength: 2, reg: v21, asm: "VSHUFPS", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VSHUFPS512", argLength: 2, reg: w21, asm: "VSHUFPS", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VSHUFPD256", argLength: 2, reg: v21, asm: "VSHUFPD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VSHUFPD512", argLength: 2, reg: 
w21, asm: "VSHUFPD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPSLLW128const", argLength: 1, reg: v11, asm: "VPSLLW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSLLW256const", argLength: 1, reg: v11, asm: "VPSLLW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSLLW512const", argLength: 1, reg: w11, asm: "VPSLLW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, @@ -1834,6 +1840,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPSHRDQMasked128load", argLength: 4, reg: w2kwload, asm: "VPSHRDQ", commutative: false, typ: "Vec128", aux: "SymValAndOff", symEffect: "Read", resultInArg0: false}, {name: "VPSHRDQMasked256load", argLength: 4, reg: w2kwload, asm: "VPSHRDQ", commutative: false, typ: "Vec256", aux: "SymValAndOff", symEffect: "Read", resultInArg0: false}, {name: "VPSHRDQMasked512load", argLength: 4, reg: w2kwload, asm: "VPSHRDQ", commutative: false, typ: "Vec512", aux: "SymValAndOff", symEffect: "Read", resultInArg0: false}, + {name: "VSHUFPS512load", argLength: 3, reg: w21load, asm: "VSHUFPS", commutative: false, typ: "Vec512", aux: "SymValAndOff", symEffect: "Read", resultInArg0: false}, + {name: "VSHUFPD512load", argLength: 3, reg: w21load, asm: "VSHUFPD", commutative: false, typ: "Vec512", aux: "SymValAndOff", symEffect: "Read", resultInArg0: false}, {name: "VPSLLD512constload", argLength: 2, reg: w11load, asm: "VPSLLD", commutative: false, typ: "Vec512", aux: "SymValAndOff", symEffect: "Read", resultInArg0: false}, {name: "VPSLLQ512constload", argLength: 2, reg: w11load, asm: "VPSLLQ", commutative: false, typ: "Vec512", aux: "SymValAndOff", symEffect: "Read", resultInArg0: false}, {name: "VPSLLDMasked128constload", argLength: 3, reg: wkwload, asm: "VPSLLD", commutative: false, typ: "Vec128", aux: "SymValAndOff", symEffect: "Read", resultInArg0: false}, diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index 7ee4989d89..81a1dff137 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -1257,5 +1257,23 @@ func simdGenericOps() []opData { {name: "TruncScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "UInt8"}, {name: "TruncScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "UInt8"}, {name: "TruncScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "concatSelectedConstantFloat32x4", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "concatSelectedConstantFloat64x2", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "concatSelectedConstantGroupedFloat32x8", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "concatSelectedConstantGroupedFloat32x16", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "concatSelectedConstantGroupedFloat64x4", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "concatSelectedConstantGroupedFloat64x8", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "concatSelectedConstantGroupedInt32x8", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "concatSelectedConstantGroupedInt32x16", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "concatSelectedConstantGroupedInt64x4", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "concatSelectedConstantGroupedInt64x8", argLength: 2, commutative: false, aux: "UInt8"}, + {name: 
"concatSelectedConstantGroupedUint32x8", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "concatSelectedConstantGroupedUint32x16", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "concatSelectedConstantGroupedUint64x4", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "concatSelectedConstantGroupedUint64x8", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "concatSelectedConstantInt32x4", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "concatSelectedConstantInt64x2", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "concatSelectedConstantUint32x4", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "concatSelectedConstantUint64x2", argLength: 2, commutative: false, aux: "UInt8"}, } } diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index cb0ffa8e80..792a1ca08f 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -2488,6 +2488,12 @@ const ( OpAMD64VPSHRDQMasked128 OpAMD64VPSHRDQMasked256 OpAMD64VPSHRDQMasked512 + OpAMD64VSHUFPS128 + OpAMD64VSHUFPD128 + OpAMD64VSHUFPS256 + OpAMD64VSHUFPS512 + OpAMD64VSHUFPD256 + OpAMD64VSHUFPD512 OpAMD64VPSLLW128const OpAMD64VPSLLW256const OpAMD64VPSLLW512const @@ -3066,6 +3072,8 @@ const ( OpAMD64VPSHRDQMasked128load OpAMD64VPSHRDQMasked256load OpAMD64VPSHRDQMasked512load + OpAMD64VSHUFPS512load + OpAMD64VSHUFPD512load OpAMD64VPSLLD512constload OpAMD64VPSLLQ512constload OpAMD64VPSLLDMasked128constload @@ -6644,6 +6652,24 @@ const ( OpTruncScaledResidueFloat64x2 OpTruncScaledResidueFloat64x4 OpTruncScaledResidueFloat64x8 + OpconcatSelectedConstantFloat32x4 + OpconcatSelectedConstantFloat64x2 + OpconcatSelectedConstantGroupedFloat32x8 + OpconcatSelectedConstantGroupedFloat32x16 + OpconcatSelectedConstantGroupedFloat64x4 + OpconcatSelectedConstantGroupedFloat64x8 + OpconcatSelectedConstantGroupedInt32x8 + OpconcatSelectedConstantGroupedInt32x16 + OpconcatSelectedConstantGroupedInt64x4 + OpconcatSelectedConstantGroupedInt64x8 + OpconcatSelectedConstantGroupedUint32x8 + OpconcatSelectedConstantGroupedUint32x16 + OpconcatSelectedConstantGroupedUint64x4 + OpconcatSelectedConstantGroupedUint64x8 + OpconcatSelectedConstantInt32x4 + OpconcatSelectedConstantInt64x2 + OpconcatSelectedConstantUint32x4 + OpconcatSelectedConstantUint64x2 ) var opcodeTable = [...]opInfo{ @@ -38308,6 +38334,96 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VSHUFPS128", + auxType: auxUInt8, + argLen: 2, + asm: x86.AVSHUFPS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VSHUFPD128", + auxType: auxUInt8, + argLen: 2, + asm: x86.AVSHUFPD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VSHUFPS256", + auxType: auxUInt8, + argLen: 2, + asm: x86.AVSHUFPS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 
X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VSHUFPS512", + auxType: auxUInt8, + argLen: 2, + asm: x86.AVSHUFPS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VSHUFPD256", + auxType: auxUInt8, + argLen: 2, + asm: x86.AVSHUFPD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VSHUFPD512", + auxType: auxUInt8, + argLen: 2, + asm: x86.AVSHUFPD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, { name: "VPSLLW128const", auxType: auxUInt8, @@ -47864,6 +47980,38 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VSHUFPS512load", + auxType: auxSymValAndOff, + argLen: 3, + symEffect: SymRead, + asm: x86.AVSHUFPS, + reg: regInfo{ + inputs: []inputInfo{ + {1, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VSHUFPD512load", + auxType: auxSymValAndOff, + argLen: 3, + symEffect: SymRead, + asm: x86.AVSHUFPD, + reg: regInfo{ + inputs: []inputInfo{ + {1, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, { name: "VPSLLD512constload", auxType: auxSymValAndOff, @@ -82560,6 +82708,114 @@ var opcodeTable = [...]opInfo{ argLen: 1, generic: true, }, + { + name: "concatSelectedConstantFloat32x4", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "concatSelectedConstantFloat64x2", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "concatSelectedConstantGroupedFloat32x8", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "concatSelectedConstantGroupedFloat32x16", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "concatSelectedConstantGroupedFloat64x4", + auxType: auxUInt8, + argLen: 2, + generic: true, + 
}, + { + name: "concatSelectedConstantGroupedFloat64x8", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "concatSelectedConstantGroupedInt32x8", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "concatSelectedConstantGroupedInt32x16", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "concatSelectedConstantGroupedInt64x4", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "concatSelectedConstantGroupedInt64x8", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "concatSelectedConstantGroupedUint32x8", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "concatSelectedConstantGroupedUint32x16", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "concatSelectedConstantGroupedUint64x4", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "concatSelectedConstantGroupedUint64x8", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "concatSelectedConstantInt32x4", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "concatSelectedConstantInt64x2", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "concatSelectedConstantUint32x4", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "concatSelectedConstantUint64x2", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, } func (o Op) Asm() obj.As { return opcodeTable[o].asm } diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 26a06fc3fc..747b337192 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -1715,6 +1715,10 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpAMD64VSCALEFPSMasked256(v) case OpAMD64VSCALEFPSMasked512: return rewriteValueAMD64_OpAMD64VSCALEFPSMasked512(v) + case OpAMD64VSHUFPD512: + return rewriteValueAMD64_OpAMD64VSHUFPD512(v) + case OpAMD64VSHUFPS512: + return rewriteValueAMD64_OpAMD64VSHUFPS512(v) case OpAMD64VSQRTPD512: return rewriteValueAMD64_OpAMD64VSQRTPD512(v) case OpAMD64VSQRTPDMasked128: @@ -5992,6 +5996,60 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpblendMaskedInt64x8(v) case OpblendMaskedInt8x64: return rewriteValueAMD64_OpblendMaskedInt8x64(v) + case OpconcatSelectedConstantFloat32x4: + v.Op = OpAMD64VSHUFPS128 + return true + case OpconcatSelectedConstantFloat64x2: + v.Op = OpAMD64VSHUFPD128 + return true + case OpconcatSelectedConstantGroupedFloat32x16: + v.Op = OpAMD64VSHUFPS512 + return true + case OpconcatSelectedConstantGroupedFloat32x8: + v.Op = OpAMD64VSHUFPS256 + return true + case OpconcatSelectedConstantGroupedFloat64x4: + v.Op = OpAMD64VSHUFPD256 + return true + case OpconcatSelectedConstantGroupedFloat64x8: + v.Op = OpAMD64VSHUFPD512 + return true + case OpconcatSelectedConstantGroupedInt32x16: + v.Op = OpAMD64VSHUFPS512 + return true + case OpconcatSelectedConstantGroupedInt32x8: + v.Op = OpAMD64VSHUFPS256 + return true + case OpconcatSelectedConstantGroupedInt64x4: + v.Op = OpAMD64VSHUFPD256 + return true + case OpconcatSelectedConstantGroupedInt64x8: + v.Op = OpAMD64VSHUFPD512 + return true + case OpconcatSelectedConstantGroupedUint32x16: + v.Op = OpAMD64VSHUFPS512 + return true + case OpconcatSelectedConstantGroupedUint32x8: + v.Op = OpAMD64VSHUFPS256 + return true + case OpconcatSelectedConstantGroupedUint64x4: + v.Op = OpAMD64VSHUFPD256 + return true + case OpconcatSelectedConstantGroupedUint64x8: + v.Op = OpAMD64VSHUFPD512 + 
return true + case OpconcatSelectedConstantInt32x4: + v.Op = OpAMD64VSHUFPS128 + return true + case OpconcatSelectedConstantInt64x2: + v.Op = OpAMD64VSHUFPD128 + return true + case OpconcatSelectedConstantUint32x4: + v.Op = OpAMD64VSHUFPS128 + return true + case OpconcatSelectedConstantUint64x2: + v.Op = OpAMD64VSHUFPD128 + return true case OpmoveMaskedFloat32x16: return rewriteValueAMD64_OpmoveMaskedFloat32x16(v) case OpmoveMaskedFloat64x8: @@ -47442,6 +47500,62 @@ func rewriteValueAMD64_OpAMD64VSCALEFPSMasked512(v *Value) bool { } return false } +func rewriteValueAMD64_OpAMD64VSHUFPD512(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VSHUFPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) + // cond: canMergeLoad(v, l) && clobber(l) + // result: (VSHUFPD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) + for { + c := auxIntToUint8(v.AuxInt) + x := v_0 + l := v_1 + if l.Op != OpAMD64VMOVDQUload512 { + break + } + off := auxIntToInt32(l.AuxInt) + sym := auxToSym(l.Aux) + mem := l.Args[1] + ptr := l.Args[0] + if !(canMergeLoad(v, l) && clobber(l)) { + break + } + v.reset(OpAMD64VSHUFPD512load) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.Aux = symToAux(sym) + v.AddArg3(x, ptr, mem) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VSHUFPS512(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VSHUFPS512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) + // cond: canMergeLoad(v, l) && clobber(l) + // result: (VSHUFPS512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) + for { + c := auxIntToUint8(v.AuxInt) + x := v_0 + l := v_1 + if l.Op != OpAMD64VMOVDQUload512 { + break + } + off := auxIntToInt32(l.AuxInt) + sym := auxToSym(l.Aux) + mem := l.Args[1] + ptr := l.Args[0] + if !(canMergeLoad(v, l) && clobber(l)) { + break + } + v.reset(OpAMD64VSHUFPS512load) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.Aux = symToAux(sym) + v.AddArg3(x, ptr, mem) + return true + } + return false +} func rewriteValueAMD64_OpAMD64VSQRTPD512(v *Value) bool { v_0 := v.Args[0] // match: (VSQRTPD512 l:(VMOVDQUload512 {sym} [off] ptr mem)) diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 4f933de008..41858a7745 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -1255,6 +1255,24 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Int16x32.blendMasked", opLen3(ssa.OpblendMaskedInt16x32, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int32x16.blendMasked", opLen3(ssa.OpblendMaskedInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int64x8.blendMasked", opLen3(ssa.OpblendMaskedInt64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.concatSelectedConstant", opLen2Imm8(ssa.OpconcatSelectedConstantFloat32x4, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Float64x2.concatSelectedConstant", opLen2Imm8(ssa.OpconcatSelectedConstantFloat64x2, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Int32x4.concatSelectedConstant", opLen2Imm8(ssa.OpconcatSelectedConstantInt32x4, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Int64x2.concatSelectedConstant", opLen2Imm8(ssa.OpconcatSelectedConstantInt64x2, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Uint32x4.concatSelectedConstant", opLen2Imm8(ssa.OpconcatSelectedConstantUint32x4, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Uint64x2.concatSelectedConstant", opLen2Imm8(ssa.OpconcatSelectedConstantUint64x2, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Float32x8.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedFloat32x8, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Float32x16.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedFloat32x16, types.TypeVec512, 0), sys.AMD64) + addF(simdPackage, "Float64x4.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedFloat64x4, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Float64x8.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedFloat64x8, types.TypeVec512, 0), sys.AMD64) + addF(simdPackage, "Int32x8.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedInt32x8, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Int32x16.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedInt32x16, types.TypeVec512, 0), sys.AMD64) + addF(simdPackage, "Int64x4.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedInt64x4, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Int64x8.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedInt64x8, types.TypeVec512, 0), sys.AMD64) + addF(simdPackage, "Uint32x8.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint32x8, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Uint32x16.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint32x16, types.TypeVec512, 0), sys.AMD64) + addF(simdPackage, "Uint64x4.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint64x4, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Uint64x8.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint64x8, types.TypeVec512, 0), sys.AMD64) addF(simdPackage, "Float32x16.moveMasked", opLen2(ssa.OpmoveMaskedFloat32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float64x8.moveMasked", opLen2(ssa.OpmoveMaskedFloat64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int8x64.moveMasked", opLen2(ssa.OpmoveMaskedInt8x64, types.TypeVec512), sys.AMD64) diff --git a/src/go/build/deps_test.go b/src/go/build/deps_test.go index 99e1554c83..fd4432b87e 100644 --- a/src/go/build/deps_test.go +++ b/src/go/build/deps_test.go @@ -687,6 +687,9 @@ var depsRules = ` FMT, DEBUG, flag, runtime/trace, internal/sysinfo, math/rand < testing; + 
testing, math + < simd/internal/test_helpers; + log/slog, testing < testing/slogtest;
diff --git a/src/simd/_gen/simdgen/ops/Moves/categories.yaml b/src/simd/_gen/simdgen/ops/Moves/categories.yaml
index 27e67f4787..e9a7fef202 100644
--- a/src/simd/_gen/simdgen/ops/Moves/categories.yaml
+++ b/src/simd/_gen/simdgen/ops/Moves/categories.yaml
@@ -120,3 +120,58 @@ documentation: !string |- // NAME interleaves the elements of the low half of each 128-bit subvector of x and y.
+- go: concatSelectedConstant
+ commutative: false
+ out:
+ - elemBits: 32
+ documentation: !string |-
+ // NAME concatenates selected elements from x and y into the lower and upper
+ // halves of the output. The selection is chosen by the constant parameter h1h0l1l0
+ // where each {h,l}{1,0} is a two-bit field specifying which element of x or y to select.
+ // For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns
+ // {2, 0, 5, 7} (don't forget that the binary constant is written big-endian).
+
+- go: concatSelectedConstant
+ commutative: false
+ out:
+ - elemBits: 64
+ documentation: !string |-
+ // NAME concatenates selected elements from x and y into the lower and upper
+ // halves of the output. The selection is chosen by the constant parameter hilo
+ // where hi and lo are each one bit specifying which 64-bit element to select
+ // from y and x. For example, {4,5}.concatSelectedConstant(0b10, {6,7})
+ // returns {4,7}: bit 0, which selects from x, is zero and selects 4, and bit 1,
+ // which selects from y, is one and selects 7.
+
+- go: concatSelectedConstantGrouped
+ commutative: false
+ out:
+ - elemBits: 32
+ documentation: !string |-
+ // NAME concatenates selected elements from 128-bit subvectors of x and y
+ // into the lower and upper halves of corresponding subvectors of the output.
+ // The selection is chosen by the constant parameter h1h0l1l0
+ // where each {h,l}{1,0} is a two-bit field specifying which element of x or y to select.
+ // For example,
+ // {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
+ // returns {2,0,5,7,10,8,13,15}
+ // (don't forget that the binary constant is written big-endian).
+
+- go: concatSelectedConstantGrouped
+ commutative: false
+ out:
+ - elemBits: 64
+ documentation: !string |-
+ // NAME concatenates selected elements from 128-bit subvectors of x and y
+ // into the lower and upper halves of corresponding subvectors of the output.
+ // The selections are specified by the constant parameter hilos, where each
+ // hi and lo pair selects 64-bit elements from the corresponding 128-bit
+ // subvectors of x and y.
+ //
+ // For example, {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11})
+ // returns {4,7,9,11}: bit 0 is zero and selects element 0 from x's low
+ // 128 bits (4), bit 1 is one and selects element 1 from y's low 128 bits (7),
+ // bit 2 is one and selects element 1 from x's upper 128 bits (9), and bit 3
+ // is one and selects element 1 from y's upper 128 bits (11).
+ // This differs from the same method applied to a 32x8 vector, where
+ // the 8-bit constant performs the same selection on both subvectors.
\ No newline at end of file diff --git a/src/simd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/_gen/simdgen/ops/Moves/go.yaml index eb14058a88..46599b7bd7 100644 --- a/src/simd/_gen/simdgen/ops/Moves/go.yaml +++ b/src/simd/_gen/simdgen/ops/Moves/go.yaml @@ -564,3 +564,210 @@ out: - *256Or512any +# These are all described separately to carry the name of the constant parameter + +- go: concatSelectedConstant + asm: VSHUFPS + width: 32 + in: + - &v + go: $t + class: vreg + base: float + bits: 128 + - *v + - class: immediate + immOffset: 0 + name: h1h0l1l0 + inVariant: [] + out: + - *v + +- go: concatSelectedConstant + asm: VSHUFPS + in: + - &v + go: $t + class: vreg + base: float + bits: 128 + OverwriteBase: int + - *v + - class: immediate + immOffset: 0 + name: h1h0l1l0 + inVariant: [] + out: + - *v + +- go: concatSelectedConstant + asm: VSHUFPS + in: + - &v + go: $t + class: vreg + base: float + bits: 128 + OverwriteBase: uint + - *v + - class: immediate + immOffset: 0 + name: h1h0l1l0 + inVariant: [] + out: + - *v + + +- go: concatSelectedConstantGrouped + asm: VSHUFPS + in: + - &v + go: $t + class: vreg + base: float + bits: "256|512" + - *v + - class: immediate + immOffset: 0 + name: h1h0l1l0 + inVariant: [] + out: + - *v + +- go: concatSelectedConstantGrouped + asm: VSHUFPS + in: + - &v + go: $t + class: vreg + base: float + bits: "256|512" + OverwriteBase: int + - *v + - class: immediate + immOffset: 0 + name: h1h0l1l0 + inVariant: [] + out: + - *v + +- go: concatSelectedConstantGrouped + asm: VSHUFPS + in: + - &v + go: $t + class: vreg + base: float + bits: "256|512" + OverwriteBase: uint + - *v + - class: immediate + immOffset: 0 + name: h1h0l1l0 + inVariant: [] + out: + - *v + + + # 64 bit versions + +- go: concatSelectedConstant + asm: VSHUFPD + in: + - &v + go: $t + class: vreg + base: float + bits: 128 + - *v + - class: immediate + immOffset: 0 + name: hilo + inVariant: [] + out: + - *v + +- go: concatSelectedConstant + asm: VSHUFPD + in: + - &v + go: $t + class: vreg + base: float + bits: 128 + OverwriteBase: int + - *v + - class: immediate + immOffset: 0 + name: hilo + inVariant: [] + out: + - *v + +- go: concatSelectedConstant + asm: VSHUFPD + in: + - &v + go: $t + class: vreg + base: float + bits: 128 + OverwriteBase: uint + - *v + - class: immediate + immOffset: 0 + name: hilo + inVariant: [] + out: + - *v + + +- go: concatSelectedConstantGrouped + asm: VSHUFPD + in: + - &v + go: $t + class: vreg + base: float + bits: "256|512" + - *v + - class: immediate + immOffset: 0 + name: hilos + inVariant: [] + out: + - *v + +- go: concatSelectedConstantGrouped + asm: VSHUFPD + in: + - &v + go: $t + class: vreg + base: float + bits: "256|512" + OverwriteBase: int + - *v + - class: immediate + immOffset: 0 + name: hilos + inVariant: [] + out: + - *v + +- go: concatSelectedConstantGrouped + asm: VSHUFPD + in: + - &v + go: $t + class: vreg + base: float + bits: "256|512" + OverwriteBase: uint + - *v + - class: immediate + immOffset: 0 + name: hilos + inVariant: [] + out: + - *v diff --git a/src/simd/internal/simd_test/helpers_test.go b/src/simd/internal/simd_test/helpers_test.go index 6c681abe98..0a246e0d7d 100644 --- a/src/simd/internal/simd_test/helpers_test.go +++ b/src/simd/internal/simd_test/helpers_test.go @@ -8,6 +8,7 @@ package simd_test import ( "math" + "simd/internal/test_helpers" "testing" ) @@ -29,97 +30,12 @@ type number interface { func checkSlices[T number](t *testing.T, got, want []T) bool { t.Helper() - return checkSlicesLogInput[T](t, got, want, 0.0, nil) + return 
test_helpers.CheckSlicesLogInput[T](t, got, want, 0.0, nil) } -// checkSlices compares two slices for equality, -// reporting a test error if there is a problem, -// and also consumes the two slices so that a -// test/benchmark won't be dead-code eliminated. func checkSlicesLogInput[T number](t *testing.T, got, want []T, flakiness float64, logInput func()) bool { t.Helper() - var z T - for i := range want { - if got[i] != want[i] { - var ia any = got[i] - var ib any = want[i] - switch x := ia.(type) { - case float32: - y := ib.(float32) - if math.IsNaN(float64(x)) && math.IsNaN(float64(y)) { - continue - } - if flakiness > 0 { - if y == 0 { - if math.Abs(float64(x)) < flakiness { - continue - } - } else { - if math.Abs(float64((x-y)/y)) < flakiness { - continue - } - } - } - case float64: - y := ib.(float64) - if math.IsNaN(x) && math.IsNaN(y) { - continue - } - if flakiness > 0 { - if y == 0 { - if math.Abs(x) < flakiness { - continue - } - } else if math.Abs((x-y)/y) < flakiness { - continue - } - } - - default: - } - - t.Logf("For %T vector elements:", z) - t.Logf("got =%v", got) - t.Logf("want=%v", want) - if logInput != nil { - logInput() - } - t.Errorf("at index %d, got=%v, want=%v", i, got[i], want[i]) - return false - } else if got[i] == 0 { // for floating point, 0.0 == -0.0 but a bitwise check can see the difference - var ia any = got[i] - var ib any = want[i] - switch x := ia.(type) { - case float32: - y := ib.(float32) - if math.Float32bits(x) != math.Float32bits(y) { - t.Logf("For %T vector elements:", z) - t.Logf("got =%v", got) - t.Logf("want=%v", want) - if logInput != nil { - logInput() - } - t.Errorf("at index %d, different signs of zero", i) - return false - } - case float64: - y := ib.(float64) - if math.Float64bits(x) != math.Float64bits(y) { - t.Logf("For %T vector elements:", z) - t.Logf("got =%v", got) - t.Logf("want=%v", want) - if logInput != nil { - logInput() - } - t.Errorf("at index %d, different signs of zero", i) - return false - } - default: - } - - } - } - return true + return test_helpers.CheckSlicesLogInput[T](t, got, want, flakiness, logInput) } // sliceOf returns a slice n T's, with each diff --git a/src/simd/internal/test_helpers/checkslices.go b/src/simd/internal/test_helpers/checkslices.go new file mode 100644 index 0000000000..54453798a2 --- /dev/null +++ b/src/simd/internal/test_helpers/checkslices.go @@ -0,0 +1,123 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build goexperiment.simd && amd64 + +package test_helpers + +import ( + "math" + "testing" +) + +type signed interface { + ~int | ~int8 | ~int16 | ~int32 | ~int64 +} + +type integer interface { + ~int | ~int8 | ~int16 | ~int32 | ~int64 | ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | ~uintptr +} + +type float interface { + ~float32 | ~float64 +} + +type number interface { + ~int | ~int8 | ~int16 | ~int32 | ~int64 | ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | ~uintptr | ~float32 | ~float64 +} + +func CheckSlices[T number](t *testing.T, got, want []T) bool { + t.Helper() + return CheckSlicesLogInput[T](t, got, want, 0.0, nil) +} + +// CheckSlices compares two slices for equality, +// reporting a test error if there is a problem, +// and also consumes the two slices so that a +// test/benchmark won't be dead-code eliminated. 
+func CheckSlicesLogInput[T number](t *testing.T, got, want []T, flakiness float64, logInput func()) bool { + t.Helper() + var z T + for i := range want { + if got[i] != want[i] { + var ia any = got[i] + var ib any = want[i] + switch x := ia.(type) { + case float32: + y := ib.(float32) + if math.IsNaN(float64(x)) && math.IsNaN(float64(y)) { + continue + } + if flakiness > 0 { + if y == 0 { + if math.Abs(float64(x)) < flakiness { + continue + } + } else { + if math.Abs(float64((x-y)/y)) < flakiness { + continue + } + } + } + case float64: + y := ib.(float64) + if math.IsNaN(x) && math.IsNaN(y) { + continue + } + if flakiness > 0 { + if y == 0 { + if math.Abs(x) < flakiness { + continue + } + } else if math.Abs((x-y)/y) < flakiness { + continue + } + } + + default: + } + + t.Logf("For %T vector elements:", z) + t.Logf("got =%v", got) + t.Logf("want=%v", want) + if logInput != nil { + logInput() + } + t.Errorf("at index %d, got=%v, want=%v", i, got[i], want[i]) + return false + } else if got[i] == 0 { // for floating point, 0.0 == -0.0 but a bitwise check can see the difference + var ia any = got[i] + var ib any = want[i] + switch x := ia.(type) { + case float32: + y := ib.(float32) + if math.Float32bits(x) != math.Float32bits(y) { + t.Logf("For %T vector elements:", z) + t.Logf("got =%v", got) + t.Logf("want=%v", want) + if logInput != nil { + logInput() + } + t.Errorf("at index %d, different signs of zero", i) + return false + } + case float64: + y := ib.(float64) + if math.Float64bits(x) != math.Float64bits(y) { + t.Logf("For %T vector elements:", z) + t.Logf("got =%v", got) + t.Logf("want=%v", want) + if logInput != nil { + logInput() + } + t.Errorf("at index %d, different signs of zero", i) + return false + } + default: + } + + } + } + return true +} diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index c1d0e8338a..a104601ed7 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -7369,6 +7369,277 @@ func (x Int32x16) blendMasked(y Int32x16, mask Mask32x16) Int32x16 // Asm: VPBLENDMQ, CPU Feature: AVX512 func (x Int64x8) blendMasked(y Int64x8, mask Mask64x8) Int64x8 +/* concatSelectedConstant */ + +// concatSelectedConstant concatenates selected elements from x and y into the lower and upper +// halves of the output. The selection is chosen by the constant parameter h1h0l1l0 +// where each {h,l}{1,0} is two bits specify which element from y or x to select. +// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns +// {2, 1, 4, 6} (don't forget that the binary constant is written big-endian). +// +// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPS, CPU Feature: AVX +func (x Float32x4) concatSelectedConstant(h1h0l1l0 uint8, y Float32x4) Float32x4 + +// concatSelectedConstant concatenates selected elements from x and y into the lower and upper +// halves of the output. The selection is chosen by the constant parameter hilo +// where hi and lo are each one bit specifying which 64-bit element to select +// from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7}) +// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1, +// selecting from y, is 1, and selects 7. +// +// hilo results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
+// +// Asm: VSHUFPD, CPU Feature: AVX +func (x Float64x2) concatSelectedConstant(hilo uint8, y Float64x2) Float64x2 + +// concatSelectedConstant concatenates selected elements from x and y into the lower and upper +// halves of the output. The selection is chosen by the constant parameter h1h0l1l0 +// where each {h,l}{1,0} is two bits specify which element from y or x to select. +// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns +// {2, 1, 4, 6} (don't forget that the binary constant is written big-endian). +// +// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPS, CPU Feature: AVX +func (x Int32x4) concatSelectedConstant(h1h0l1l0 uint8, y Int32x4) Int32x4 + +// concatSelectedConstant concatenates selected elements from x and y into the lower and upper +// halves of the output. The selection is chosen by the constant parameter hilo +// where hi and lo are each one bit specifying which 64-bit element to select +// from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7}) +// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1, +// selecting from y, is 1, and selects 7. +// +// hilo results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPD, CPU Feature: AVX +func (x Int64x2) concatSelectedConstant(hilo uint8, y Int64x2) Int64x2 + +// concatSelectedConstant concatenates selected elements from x and y into the lower and upper +// halves of the output. The selection is chosen by the constant parameter h1h0l1l0 +// where each {h,l}{1,0} is two bits specify which element from y or x to select. +// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns +// {2, 1, 4, 6} (don't forget that the binary constant is written big-endian). +// +// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPS, CPU Feature: AVX +func (x Uint32x4) concatSelectedConstant(h1h0l1l0 uint8, y Uint32x4) Uint32x4 + +// concatSelectedConstant concatenates selected elements from x and y into the lower and upper +// halves of the output. The selection is chosen by the constant parameter hilo +// where hi and lo are each one bit specifying which 64-bit element to select +// from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7}) +// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1, +// selecting from y, is 1, and selects 7. +// +// hilo results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPD, CPU Feature: AVX +func (x Uint64x2) concatSelectedConstant(hilo uint8, y Uint64x2) Uint64x2 + +/* concatSelectedConstantGrouped */ + +// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y +// into the lower and upper halves of corresponding subvectors of the output. +// The selection is chosen by the constant parameter h1h0l1l0 +// where each {h,l}{1,0} is two bits specify which element from y or x to select. +// For example, +// {0,1,2,3,8,9,10,11}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7,12,13,14,15}) +// returns {2,0,5,7,10,8,13,15} +// (don't forget that the binary constant is written big-endian). +// +// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
+// +// Asm: VSHUFPS, CPU Feature: AVX +func (x Float32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x8) Float32x8 + +// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y +// into the lower and upper halves of corresponding subvectors of the output. +// The selection is chosen by the constant parameter h1h0l1l0 +// where each {h,l}{1,0} is two bits specify which element from y or x to select. +// For example, +// {0,1,2,3,8,9,10,11}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7,12,13,14,15}) +// returns {2,0,5,7,10,8,13,15} +// (don't forget that the binary constant is written big-endian). +// +// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPS, CPU Feature: AVX512 +func (x Float32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x16) Float32x16 + +// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y +// into the lower and upper halves of corresponding subvectors of the output. +// The selections are specified by the constant parameter hilos where each +// hi and lo pair select 64-bit elements from the corresponding 128-bit +// subvectors of x and y. +// +// For example {4,5,8,9}.concatSelectedConstant(0b_11_10, {6,7,10,11}) +// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least +// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), +// then 1, selecting element 1 from x's upper 128 bits (9), then 1, +// selecting element 1 from y's upper 128 bits (11). +// This differs from the same method applied to a 32x8 vector, where +// the 8-bit constant performs the same selection on both subvectors. +// +// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPD, CPU Feature: AVX +func (x Float64x4) concatSelectedConstantGrouped(hilos uint8, y Float64x4) Float64x4 + +// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y +// into the lower and upper halves of corresponding subvectors of the output. +// The selections are specified by the constant parameter hilos where each +// hi and lo pair select 64-bit elements from the corresponding 128-bit +// subvectors of x and y. +// +// For example {4,5,8,9}.concatSelectedConstant(0b_11_10, {6,7,10,11}) +// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least +// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), +// then 1, selecting element 1 from x's upper 128 bits (9), then 1, +// selecting element 1 from y's upper 128 bits (11). +// This differs from the same method applied to a 32x8 vector, where +// the 8-bit constant performs the same selection on both subvectors. +// +// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPD, CPU Feature: AVX512 +func (x Float64x8) concatSelectedConstantGrouped(hilos uint8, y Float64x8) Float64x8 + +// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y +// into the lower and upper halves of corresponding subvectors of the output. +// The selection is chosen by the constant parameter h1h0l1l0 +// where each {h,l}{1,0} is two bits specify which element from y or x to select. 
+// For example, +// {0,1,2,3,8,9,10,11}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7,12,13,14,15}) +// returns {2,0,5,7,10,8,13,15} +// (don't forget that the binary constant is written big-endian). +// +// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPS, CPU Feature: AVX +func (x Int32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x8) Int32x8 + +// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y +// into the lower and upper halves of corresponding subvectors of the output. +// The selection is chosen by the constant parameter h1h0l1l0 +// where each {h,l}{1,0} is two bits specify which element from y or x to select. +// For example, +// {0,1,2,3,8,9,10,11}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7,12,13,14,15}) +// returns {2,0,5,7,10,8,13,15} +// (don't forget that the binary constant is written big-endian). +// +// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPS, CPU Feature: AVX512 +func (x Int32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x16) Int32x16 + +// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y +// into the lower and upper halves of corresponding subvectors of the output. +// The selections are specified by the constant parameter hilos where each +// hi and lo pair select 64-bit elements from the corresponding 128-bit +// subvectors of x and y. +// +// For example {4,5,8,9}.concatSelectedConstant(0b_11_10, {6,7,10,11}) +// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least +// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), +// then 1, selecting element 1 from x's upper 128 bits (9), then 1, +// selecting element 1 from y's upper 128 bits (11). +// This differs from the same method applied to a 32x8 vector, where +// the 8-bit constant performs the same selection on both subvectors. +// +// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPD, CPU Feature: AVX +func (x Int64x4) concatSelectedConstantGrouped(hilos uint8, y Int64x4) Int64x4 + +// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y +// into the lower and upper halves of corresponding subvectors of the output. +// The selections are specified by the constant parameter hilos where each +// hi and lo pair select 64-bit elements from the corresponding 128-bit +// subvectors of x and y. +// +// For example {4,5,8,9}.concatSelectedConstant(0b_11_10, {6,7,10,11}) +// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least +// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), +// then 1, selecting element 1 from x's upper 128 bits (9), then 1, +// selecting element 1 from y's upper 128 bits (11). +// This differs from the same method applied to a 32x8 vector, where +// the 8-bit constant performs the same selection on both subvectors. +// +// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
+// +// Asm: VSHUFPD, CPU Feature: AVX512 +func (x Int64x8) concatSelectedConstantGrouped(hilos uint8, y Int64x8) Int64x8 + +// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y +// into the lower and upper halves of corresponding subvectors of the output. +// The selection is chosen by the constant parameter h1h0l1l0 +// where each {h,l}{1,0} is two bits specify which element from y or x to select. +// For example, +// {0,1,2,3,8,9,10,11}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7,12,13,14,15}) +// returns {2,0,5,7,10,8,13,15} +// (don't forget that the binary constant is written big-endian). +// +// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPS, CPU Feature: AVX +func (x Uint32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x8) Uint32x8 + +// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y +// into the lower and upper halves of corresponding subvectors of the output. +// The selection is chosen by the constant parameter h1h0l1l0 +// where each {h,l}{1,0} is two bits specify which element from y or x to select. +// For example, +// {0,1,2,3,8,9,10,11}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7,12,13,14,15}) +// returns {2,0,5,7,10,8,13,15} +// (don't forget that the binary constant is written big-endian). +// +// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPS, CPU Feature: AVX512 +func (x Uint32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x16) Uint32x16 + +// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y +// into the lower and upper halves of corresponding subvectors of the output. +// The selections are specified by the constant parameter hilos where each +// hi and lo pair select 64-bit elements from the corresponding 128-bit +// subvectors of x and y. +// +// For example {4,5,8,9}.concatSelectedConstant(0b_11_10, {6,7,10,11}) +// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least +// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), +// then 1, selecting element 1 from x's upper 128 bits (9), then 1, +// selecting element 1 from y's upper 128 bits (11). +// This differs from the same method applied to a 32x8 vector, where +// the 8-bit constant performs the same selection on both subvectors. +// +// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPD, CPU Feature: AVX +func (x Uint64x4) concatSelectedConstantGrouped(hilos uint8, y Uint64x4) Uint64x4 + +// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y +// into the lower and upper halves of corresponding subvectors of the output. +// The selections are specified by the constant parameter hilos where each +// hi and lo pair select 64-bit elements from the corresponding 128-bit +// subvectors of x and y. +// +// For example {4,5,8,9}.concatSelectedConstant(0b_11_10, {6,7,10,11}) +// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least +// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), +// then 1, selecting element 1 from x's upper 128 bits (9), then 1, +// selecting element 1 from y's upper 128 bits (11). 
+// This differs from the same method applied to a 32x8 vector, where +// the 8-bit constant performs the same selection on both subvectors. +// +// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPD, CPU Feature: AVX512 +func (x Uint64x8) concatSelectedConstantGrouped(hilos uint8, y Uint64x8) Uint64x8 + /* moveMasked */ // moveMasked blends a vector with zero, with the original value where the mask is true diff --git a/src/simd/pkginternal_test.go b/src/simd/pkginternal_test.go new file mode 100644 index 0000000000..801cd0d17a --- /dev/null +++ b/src/simd/pkginternal_test.go @@ -0,0 +1,48 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build goexperiment.simd && amd64 + +package simd + +import ( + "simd/internal/test_helpers" + "testing" +) + +func TestConcatSelectedConstant64(t *testing.T) { + a := make([]int64, 2) + x := LoadInt64x2Slice([]int64{4, 5}) + y := LoadInt64x2Slice([]int64{6, 7}) + z := x.concatSelectedConstant(0b10, y) + z.StoreSlice(a) + test_helpers.CheckSlices[int64](t, a, []int64{4, 7}) +} + +func TestConcatSelectedConstantGrouped64(t *testing.T) { + a := make([]float64, 4) + x := LoadFloat64x4Slice([]float64{4, 5, 8, 9}) + y := LoadFloat64x4Slice([]float64{6, 7, 10, 11}) + z := x.concatSelectedConstantGrouped(0b_11_10, y) + z.StoreSlice(a) + test_helpers.CheckSlices[float64](t, a, []float64{4, 7, 9, 11}) +} + +func TestConcatSelectedConstant32(t *testing.T) { + a := make([]float32, 4) + x := LoadFloat32x4Slice([]float32{4, 5, 8, 9}) + y := LoadFloat32x4Slice([]float32{6, 7, 10, 11}) + z := x.concatSelectedConstant(0b_11_01_10_00, y) + z.StoreSlice(a) + test_helpers.CheckSlices[float32](t, a, []float32{4, 8, 7, 11}) +} + +func TestConcatSelectedConstantGrouped32(t *testing.T) { + a := make([]uint32, 8) + x := LoadUint32x8Slice([]uint32{0, 1, 2, 3, 8, 9, 10, 11}) + y := LoadUint32x8Slice([]uint32{4, 5, 6, 7, 12, 13, 14, 15}) + z := x.concatSelectedConstantGrouped(0b_11_01_00_10, y) + z.StoreSlice(a) + test_helpers.CheckSlices[uint32](t, a, []uint32{2, 0, 5, 7, 10, 8, 13, 15}) +} -- 2.52.0
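The selection semantics documented above can be checked against a plain scalar model. The sketch below is not part of this CL; the function names csc32, cscGrouped32, and cscGrouped64 are invented for illustration. It assumes the standard VSHUFPS/VSHUFPD field order that the CL's pkginternal_test.go exercises: immediate fields are consumed from the least-significant bits upward, the low half of each 128-bit group comes from x, and the high half from y. Running it reproduces the expected values used in those tests.

// Scalar reference model (not part of the CL) for the selection semantics
// documented in ops_amd64.go and categories.yaml above.
package main

import "fmt"

// csc32 models Float32x4.concatSelectedConstant (VSHUFPS xmm):
// result = {x[l0], x[l1], y[h0], y[h1]} with imm = h1h0l1l0.
func csc32(x, y [4]float32, imm uint8) [4]float32 {
	return [4]float32{x[imm>>0&3], x[imm>>2&3], y[imm>>4&3], y[imm>>6&3]}
}

// cscGrouped32 models the 256-bit, 32-bit-element variant (VSHUFPS ymm):
// the same 8-bit selection is applied to each 128-bit group independently.
func cscGrouped32(x, y [8]uint32, imm uint8) [8]uint32 {
	var out [8]uint32
	for g := 0; g < 2; g++ {
		xs, ys := x[4*g:4*g+4], y[4*g:4*g+4]
		out[4*g+0], out[4*g+1] = xs[imm>>0&3], xs[imm>>2&3]
		out[4*g+2], out[4*g+3] = ys[imm>>4&3], ys[imm>>6&3]
	}
	return out
}

// cscGrouped64 models the 256-bit, 64-bit-element variant (VSHUFPD ymm):
// unlike the 32-bit form, each 128-bit group consumes its own pair of bits.
func cscGrouped64(x, y [4]float64, imm uint8) [4]float64 {
	var out [4]float64
	for g := 0; g < 2; g++ {
		lo := int(imm>>(2*g)) & 1
		hi := int(imm>>(2*g+1)) & 1
		out[2*g] = x[2*g+lo]
		out[2*g+1] = y[2*g+hi]
	}
	return out
}

func main() {
	// TestConcatSelectedConstant32 expects {4, 8, 7, 11}.
	fmt.Println(csc32([4]float32{4, 5, 8, 9}, [4]float32{6, 7, 10, 11}, 0b_11_01_10_00))
	// TestConcatSelectedConstantGrouped32 expects {2, 0, 5, 7, 10, 8, 13, 15}.
	fmt.Println(cscGrouped32(
		[8]uint32{0, 1, 2, 3, 8, 9, 10, 11},
		[8]uint32{4, 5, 6, 7, 12, 13, 14, 15}, 0b_11_01_00_10))
	// TestConcatSelectedConstantGrouped64 expects {4, 7, 9, 11}.
	fmt.Println(cscGrouped64([4]float64{4, 5, 8, 9}, [4]float64{6, 7, 10, 11}, 0b_11_10))
}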
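The new load-folding rules (VSHUFPS512/VSHUFPD512 consuming a VMOVDQUload512) have to carry two constants on the merged op: the shuffle immediate and the load offset. The SymValAndOff aux holds both in one int64, which is why the rules write makeValAndOff(int32(int8(c)), off). The sketch below only mirrors that idea with an invented valAndOff type; it is not the compiler's actual ValAndOff definition.

package main

import "fmt"

// valAndOff mimics the idea behind the SymValAndOff aux: one int64 carrying a
// small constant (here, the VSHUFP imm8) and a 32-bit load offset. This is an
// illustration only, not the compiler's ValAndOff.
type valAndOff int64

func makeValAndOff(val, off int32) valAndOff {
	return valAndOff(int64(val)<<32 | int64(uint32(off)))
}

func (x valAndOff) Val() int32 { return int32(int64(x) >> 32) }
func (x valAndOff) Off() int32 { return int32(int64(x)) }

func main() {
	// The rule packs the op's uint8 selector (reinterpreted as a signed 8-bit
	// value) next to the folded load's offset, so the merged VSHUFPS512load
	// op needs only a single aux field.
	c := uint8(0b_11_01_00_10)
	vo := makeValAndOff(int32(int8(c)), 64)
	fmt.Println(vo.Val(), vo.Off()) // -46 64
}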
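Because these methods are package private (and, per the commit message, will be wrapped by a more general interface in a later CL), any direct caller has to live in package simd, as pkginternal_test.go does. The doc comments also note that the selector should be a compile-time constant to get a single imm8-encoded instruction rather than the jump-table fallback for non-constant selectors. A hypothetical in-package helper (reverse64 is an invented name, not part of this CL) might look like:

//go:build goexperiment.simd && amd64

package simd

// reverse64 is a hypothetical helper showing the intended call pattern: the
// selector is an untyped constant, so the intrinsic lowers to one VSHUFPD
// with an imm8 operand.
func reverse64(x Float64x2) Float64x2 {
	// hilo = 0b01: bit 0 (lo) is 1 and picks x[1] for the low result; bit 1
	// (hi) is 0 and picks element 0 of the second operand (x again), so the
	// two elements swap places.
	return x.concatSelectedConstant(0b01, x)
}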