From b509516b2e96654be4e6a2dc979414df5df7d14b Mon Sep 17 00:00:00 2001 From: David Chase Date: Wed, 20 Aug 2025 16:58:55 -0400 Subject: [PATCH] [dev.simd] simd, cmd/compile: add Interleave{Hi,Lo} (VPUNPCK*) these are building blocks for transpose, not sure of their best names yet. Change-Id: I3800a55de9fa7fde2590ca822894c8a75387dec3 Reviewed-on: https://go-review.googlesource.com/c/go/+/698576 Reviewed-by: Junyang Shao LUCI-TryBot-Result: Go LUCI --- src/cmd/compile/internal/amd64/simdssa.go | 18 + .../compile/internal/ssa/_gen/simdAMD64.rules | 36 ++ .../compile/internal/ssa/_gen/simdAMD64ops.go | 18 + .../internal/ssa/_gen/simdgenericOps.go | 36 ++ src/cmd/compile/internal/ssa/opGen.go | 486 ++++++++++++++++++ src/cmd/compile/internal/ssa/rewriteAMD64.go | 108 ++++ .../compile/internal/ssagen/simdintrinsics.go | 36 ++ .../_gen/simdgen/ops/Moves/categories.yaml | 19 +- src/simd/_gen/simdgen/ops/Moves/go.yaml | 39 +- src/simd/internal/simd_test/simd_test.go | 24 + src/simd/ops_amd64.go | 188 +++++++ src/simd/shuffles_amd64.go | 15 + 12 files changed, 1021 insertions(+), 2 deletions(-) create mode 100644 src/simd/shuffles_amd64.go diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index 8698387235..33f6669300 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -243,6 +243,24 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPCMPGTD256, ssa.OpAMD64VPCMPGTQ128, ssa.OpAMD64VPCMPGTQ256, + ssa.OpAMD64VPUNPCKHWD128, + ssa.OpAMD64VPUNPCKHDQ128, + ssa.OpAMD64VPUNPCKHQDQ128, + ssa.OpAMD64VPUNPCKHWD256, + ssa.OpAMD64VPUNPCKHWD512, + ssa.OpAMD64VPUNPCKHDQ256, + ssa.OpAMD64VPUNPCKHDQ512, + ssa.OpAMD64VPUNPCKHQDQ256, + ssa.OpAMD64VPUNPCKHQDQ512, + ssa.OpAMD64VPUNPCKLWD128, + ssa.OpAMD64VPUNPCKLDQ128, + ssa.OpAMD64VPUNPCKLQDQ128, + ssa.OpAMD64VPUNPCKLWD256, + ssa.OpAMD64VPUNPCKLWD512, + ssa.OpAMD64VPUNPCKLDQ256, + ssa.OpAMD64VPUNPCKLDQ512, + ssa.OpAMD64VPUNPCKLQDQ256, + ssa.OpAMD64VPUNPCKLQDQ512, ssa.OpAMD64VMAXPS128, ssa.OpAMD64VMAXPS256, ssa.OpAMD64VMAXPS512, diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index 5757278f62..35ef1d35b6 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -520,6 +520,42 @@ (GreaterEqualUint16x32 x y) => (VPMOVMToVec16x32 (VPCMPUW512 [13] x y)) (GreaterEqualUint32x16 x y) => (VPMOVMToVec32x16 (VPCMPUD512 [13] x y)) (GreaterEqualUint64x8 x y) => (VPMOVMToVec64x8 (VPCMPUQ512 [13] x y)) +(InterleaveHiInt16x8 ...) => (VPUNPCKHWD128 ...) +(InterleaveHiInt32x4 ...) => (VPUNPCKHDQ128 ...) +(InterleaveHiInt64x2 ...) => (VPUNPCKHQDQ128 ...) +(InterleaveHiUint16x8 ...) => (VPUNPCKHWD128 ...) +(InterleaveHiUint32x4 ...) => (VPUNPCKHDQ128 ...) +(InterleaveHiUint64x2 ...) => (VPUNPCKHQDQ128 ...) +(InterleaveHiGroupedInt16x16 ...) => (VPUNPCKHWD256 ...) +(InterleaveHiGroupedInt16x32 ...) => (VPUNPCKHWD512 ...) +(InterleaveHiGroupedInt32x8 ...) => (VPUNPCKHDQ256 ...) +(InterleaveHiGroupedInt32x16 ...) => (VPUNPCKHDQ512 ...) +(InterleaveHiGroupedInt64x4 ...) => (VPUNPCKHQDQ256 ...) +(InterleaveHiGroupedInt64x8 ...) => (VPUNPCKHQDQ512 ...) +(InterleaveHiGroupedUint16x16 ...) => (VPUNPCKHWD256 ...) +(InterleaveHiGroupedUint16x32 ...) => (VPUNPCKHWD512 ...) +(InterleaveHiGroupedUint32x8 ...) => (VPUNPCKHDQ256 ...) +(InterleaveHiGroupedUint32x16 ...) => (VPUNPCKHDQ512 ...) +(InterleaveHiGroupedUint64x4 ...) => (VPUNPCKHQDQ256 ...) 
+(InterleaveHiGroupedUint64x8 ...) => (VPUNPCKHQDQ512 ...) +(InterleaveLoInt16x8 ...) => (VPUNPCKLWD128 ...) +(InterleaveLoInt32x4 ...) => (VPUNPCKLDQ128 ...) +(InterleaveLoInt64x2 ...) => (VPUNPCKLQDQ128 ...) +(InterleaveLoUint16x8 ...) => (VPUNPCKLWD128 ...) +(InterleaveLoUint32x4 ...) => (VPUNPCKLDQ128 ...) +(InterleaveLoUint64x2 ...) => (VPUNPCKLQDQ128 ...) +(InterleaveLoGroupedInt16x16 ...) => (VPUNPCKLWD256 ...) +(InterleaveLoGroupedInt16x32 ...) => (VPUNPCKLWD512 ...) +(InterleaveLoGroupedInt32x8 ...) => (VPUNPCKLDQ256 ...) +(InterleaveLoGroupedInt32x16 ...) => (VPUNPCKLDQ512 ...) +(InterleaveLoGroupedInt64x4 ...) => (VPUNPCKLQDQ256 ...) +(InterleaveLoGroupedInt64x8 ...) => (VPUNPCKLQDQ512 ...) +(InterleaveLoGroupedUint16x16 ...) => (VPUNPCKLWD256 ...) +(InterleaveLoGroupedUint16x32 ...) => (VPUNPCKLWD512 ...) +(InterleaveLoGroupedUint32x8 ...) => (VPUNPCKLDQ256 ...) +(InterleaveLoGroupedUint32x16 ...) => (VPUNPCKLDQ512 ...) +(InterleaveLoGroupedUint64x4 ...) => (VPUNPCKLQDQ256 ...) +(InterleaveLoGroupedUint64x8 ...) => (VPUNPCKLQDQ512 ...) (IsNanFloat32x4 x y) => (VCMPPS128 [3] x y) (IsNanFloat32x8 x y) => (VCMPPS256 [3] x y) (IsNanFloat32x16 x y) => (VPMOVMToVec32x16 (VCMPPS512 [3] x y)) diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go index d473e2c2a9..1448f8776a 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go @@ -983,6 +983,24 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPSUBWMasked128", argLength: 3, reg: w2kw, asm: "VPSUBW", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSUBWMasked256", argLength: 3, reg: w2kw, asm: "VPSUBW", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSUBWMasked512", argLength: 3, reg: w2kw, asm: "VPSUBW", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPUNPCKHDQ128", argLength: 2, reg: v21, asm: "VPUNPCKHDQ", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPUNPCKHDQ256", argLength: 2, reg: v21, asm: "VPUNPCKHDQ", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPUNPCKHDQ512", argLength: 2, reg: w21, asm: "VPUNPCKHDQ", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPUNPCKHQDQ128", argLength: 2, reg: v21, asm: "VPUNPCKHQDQ", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPUNPCKHQDQ256", argLength: 2, reg: v21, asm: "VPUNPCKHQDQ", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPUNPCKHQDQ512", argLength: 2, reg: w21, asm: "VPUNPCKHQDQ", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPUNPCKHWD128", argLength: 2, reg: v21, asm: "VPUNPCKHWD", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPUNPCKHWD256", argLength: 2, reg: v21, asm: "VPUNPCKHWD", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPUNPCKHWD512", argLength: 2, reg: w21, asm: "VPUNPCKHWD", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPUNPCKLDQ128", argLength: 2, reg: v21, asm: "VPUNPCKLDQ", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPUNPCKLDQ256", argLength: 2, reg: v21, asm: "VPUNPCKLDQ", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPUNPCKLDQ512", argLength: 2, reg: w21, asm: "VPUNPCKLDQ", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPUNPCKLQDQ128", argLength: 2, reg: v21, asm: "VPUNPCKLQDQ", commutative: 
false, typ: "Vec128", resultInArg0: false}, + {name: "VPUNPCKLQDQ256", argLength: 2, reg: v21, asm: "VPUNPCKLQDQ", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPUNPCKLQDQ512", argLength: 2, reg: w21, asm: "VPUNPCKLQDQ", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPUNPCKLWD128", argLength: 2, reg: v21, asm: "VPUNPCKLWD", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPUNPCKLWD256", argLength: 2, reg: v21, asm: "VPUNPCKLWD", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPUNPCKLWD512", argLength: 2, reg: w21, asm: "VPUNPCKLWD", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPXOR128", argLength: 2, reg: v21, asm: "VPXOR", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VPXOR256", argLength: 2, reg: v21, asm: "VPXOR", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPXORD512", argLength: 2, reg: w21, asm: "VPXORD", commutative: true, typ: "Vec512", resultInArg0: false}, diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index 774fb5cce7..11c5785f7d 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -484,6 +484,42 @@ func simdGenericOps() []opData { {name: "GreaterUint16x32", argLength: 2, commutative: false}, {name: "GreaterUint32x16", argLength: 2, commutative: false}, {name: "GreaterUint64x8", argLength: 2, commutative: false}, + {name: "InterleaveHiGroupedInt16x16", argLength: 2, commutative: false}, + {name: "InterleaveHiGroupedInt16x32", argLength: 2, commutative: false}, + {name: "InterleaveHiGroupedInt32x8", argLength: 2, commutative: false}, + {name: "InterleaveHiGroupedInt32x16", argLength: 2, commutative: false}, + {name: "InterleaveHiGroupedInt64x4", argLength: 2, commutative: false}, + {name: "InterleaveHiGroupedInt64x8", argLength: 2, commutative: false}, + {name: "InterleaveHiGroupedUint16x16", argLength: 2, commutative: false}, + {name: "InterleaveHiGroupedUint16x32", argLength: 2, commutative: false}, + {name: "InterleaveHiGroupedUint32x8", argLength: 2, commutative: false}, + {name: "InterleaveHiGroupedUint32x16", argLength: 2, commutative: false}, + {name: "InterleaveHiGroupedUint64x4", argLength: 2, commutative: false}, + {name: "InterleaveHiGroupedUint64x8", argLength: 2, commutative: false}, + {name: "InterleaveHiInt16x8", argLength: 2, commutative: false}, + {name: "InterleaveHiInt32x4", argLength: 2, commutative: false}, + {name: "InterleaveHiInt64x2", argLength: 2, commutative: false}, + {name: "InterleaveHiUint16x8", argLength: 2, commutative: false}, + {name: "InterleaveHiUint32x4", argLength: 2, commutative: false}, + {name: "InterleaveHiUint64x2", argLength: 2, commutative: false}, + {name: "InterleaveLoGroupedInt16x16", argLength: 2, commutative: false}, + {name: "InterleaveLoGroupedInt16x32", argLength: 2, commutative: false}, + {name: "InterleaveLoGroupedInt32x8", argLength: 2, commutative: false}, + {name: "InterleaveLoGroupedInt32x16", argLength: 2, commutative: false}, + {name: "InterleaveLoGroupedInt64x4", argLength: 2, commutative: false}, + {name: "InterleaveLoGroupedInt64x8", argLength: 2, commutative: false}, + {name: "InterleaveLoGroupedUint16x16", argLength: 2, commutative: false}, + {name: "InterleaveLoGroupedUint16x32", argLength: 2, commutative: false}, + {name: "InterleaveLoGroupedUint32x8", argLength: 2, commutative: false}, + {name: "InterleaveLoGroupedUint32x16", 
argLength: 2, commutative: false}, + {name: "InterleaveLoGroupedUint64x4", argLength: 2, commutative: false}, + {name: "InterleaveLoGroupedUint64x8", argLength: 2, commutative: false}, + {name: "InterleaveLoInt16x8", argLength: 2, commutative: false}, + {name: "InterleaveLoInt32x4", argLength: 2, commutative: false}, + {name: "InterleaveLoInt64x2", argLength: 2, commutative: false}, + {name: "InterleaveLoUint16x8", argLength: 2, commutative: false}, + {name: "InterleaveLoUint32x4", argLength: 2, commutative: false}, + {name: "InterleaveLoUint64x2", argLength: 2, commutative: false}, {name: "IsNanFloat32x4", argLength: 2, commutative: true}, {name: "IsNanFloat32x8", argLength: 2, commutative: true}, {name: "IsNanFloat32x16", argLength: 2, commutative: true}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index f0c18d0816..b584d1509d 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -2215,6 +2215,24 @@ const ( OpAMD64VPSUBWMasked128 OpAMD64VPSUBWMasked256 OpAMD64VPSUBWMasked512 + OpAMD64VPUNPCKHDQ128 + OpAMD64VPUNPCKHDQ256 + OpAMD64VPUNPCKHDQ512 + OpAMD64VPUNPCKHQDQ128 + OpAMD64VPUNPCKHQDQ256 + OpAMD64VPUNPCKHQDQ512 + OpAMD64VPUNPCKHWD128 + OpAMD64VPUNPCKHWD256 + OpAMD64VPUNPCKHWD512 + OpAMD64VPUNPCKLDQ128 + OpAMD64VPUNPCKLDQ256 + OpAMD64VPUNPCKLDQ512 + OpAMD64VPUNPCKLQDQ128 + OpAMD64VPUNPCKLQDQ256 + OpAMD64VPUNPCKLQDQ512 + OpAMD64VPUNPCKLWD128 + OpAMD64VPUNPCKLWD256 + OpAMD64VPUNPCKLWD512 OpAMD64VPXOR128 OpAMD64VPXOR256 OpAMD64VPXORD512 @@ -5288,6 +5306,42 @@ const ( OpGreaterUint16x32 OpGreaterUint32x16 OpGreaterUint64x8 + OpInterleaveHiGroupedInt16x16 + OpInterleaveHiGroupedInt16x32 + OpInterleaveHiGroupedInt32x8 + OpInterleaveHiGroupedInt32x16 + OpInterleaveHiGroupedInt64x4 + OpInterleaveHiGroupedInt64x8 + OpInterleaveHiGroupedUint16x16 + OpInterleaveHiGroupedUint16x32 + OpInterleaveHiGroupedUint32x8 + OpInterleaveHiGroupedUint32x16 + OpInterleaveHiGroupedUint64x4 + OpInterleaveHiGroupedUint64x8 + OpInterleaveHiInt16x8 + OpInterleaveHiInt32x4 + OpInterleaveHiInt64x2 + OpInterleaveHiUint16x8 + OpInterleaveHiUint32x4 + OpInterleaveHiUint64x2 + OpInterleaveLoGroupedInt16x16 + OpInterleaveLoGroupedInt16x32 + OpInterleaveLoGroupedInt32x8 + OpInterleaveLoGroupedInt32x16 + OpInterleaveLoGroupedInt64x4 + OpInterleaveLoGroupedInt64x8 + OpInterleaveLoGroupedUint16x16 + OpInterleaveLoGroupedUint16x32 + OpInterleaveLoGroupedUint32x8 + OpInterleaveLoGroupedUint32x16 + OpInterleaveLoGroupedUint64x4 + OpInterleaveLoGroupedUint64x8 + OpInterleaveLoInt16x8 + OpInterleaveLoInt32x4 + OpInterleaveLoInt64x2 + OpInterleaveLoUint16x8 + OpInterleaveLoUint32x4 + OpInterleaveLoUint64x2 OpIsNanFloat32x4 OpIsNanFloat32x8 OpIsNanFloat32x16 @@ -33629,6 +33683,258 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPUNPCKHDQ128", + argLen: 2, + asm: x86.AVPUNPCKHDQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPUNPCKHDQ256", + argLen: 2, + asm: x86.AVPUNPCKHDQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + 
}, + }, + }, + { + name: "VPUNPCKHDQ512", + argLen: 2, + asm: x86.AVPUNPCKHDQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPUNPCKHQDQ128", + argLen: 2, + asm: x86.AVPUNPCKHQDQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPUNPCKHQDQ256", + argLen: 2, + asm: x86.AVPUNPCKHQDQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPUNPCKHQDQ512", + argLen: 2, + asm: x86.AVPUNPCKHQDQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPUNPCKHWD128", + argLen: 2, + asm: x86.AVPUNPCKHWD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPUNPCKHWD256", + argLen: 2, + asm: x86.AVPUNPCKHWD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPUNPCKHWD512", + argLen: 2, + asm: x86.AVPUNPCKHWD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPUNPCKLDQ128", + argLen: 2, + asm: x86.AVPUNPCKLDQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + }, + outputs: []outputInfo{ + {0, 
2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPUNPCKLDQ256", + argLen: 2, + asm: x86.AVPUNPCKLDQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPUNPCKLDQ512", + argLen: 2, + asm: x86.AVPUNPCKLDQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPUNPCKLQDQ128", + argLen: 2, + asm: x86.AVPUNPCKLQDQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPUNPCKLQDQ256", + argLen: 2, + asm: x86.AVPUNPCKLQDQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPUNPCKLQDQ512", + argLen: 2, + asm: x86.AVPUNPCKLQDQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPUNPCKLWD128", + argLen: 2, + asm: x86.AVPUNPCKLWD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPUNPCKLWD256", + argLen: 2, + asm: x86.AVPUNPCKLWD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPUNPCKLWD512", + argLen: 2, + asm: x86.AVPUNPCKLWD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 
281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, { name: "VPXOR128", argLen: 2, @@ -68116,6 +68422,186 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "InterleaveHiGroupedInt16x16", + argLen: 2, + generic: true, + }, + { + name: "InterleaveHiGroupedInt16x32", + argLen: 2, + generic: true, + }, + { + name: "InterleaveHiGroupedInt32x8", + argLen: 2, + generic: true, + }, + { + name: "InterleaveHiGroupedInt32x16", + argLen: 2, + generic: true, + }, + { + name: "InterleaveHiGroupedInt64x4", + argLen: 2, + generic: true, + }, + { + name: "InterleaveHiGroupedInt64x8", + argLen: 2, + generic: true, + }, + { + name: "InterleaveHiGroupedUint16x16", + argLen: 2, + generic: true, + }, + { + name: "InterleaveHiGroupedUint16x32", + argLen: 2, + generic: true, + }, + { + name: "InterleaveHiGroupedUint32x8", + argLen: 2, + generic: true, + }, + { + name: "InterleaveHiGroupedUint32x16", + argLen: 2, + generic: true, + }, + { + name: "InterleaveHiGroupedUint64x4", + argLen: 2, + generic: true, + }, + { + name: "InterleaveHiGroupedUint64x8", + argLen: 2, + generic: true, + }, + { + name: "InterleaveHiInt16x8", + argLen: 2, + generic: true, + }, + { + name: "InterleaveHiInt32x4", + argLen: 2, + generic: true, + }, + { + name: "InterleaveHiInt64x2", + argLen: 2, + generic: true, + }, + { + name: "InterleaveHiUint16x8", + argLen: 2, + generic: true, + }, + { + name: "InterleaveHiUint32x4", + argLen: 2, + generic: true, + }, + { + name: "InterleaveHiUint64x2", + argLen: 2, + generic: true, + }, + { + name: "InterleaveLoGroupedInt16x16", + argLen: 2, + generic: true, + }, + { + name: "InterleaveLoGroupedInt16x32", + argLen: 2, + generic: true, + }, + { + name: "InterleaveLoGroupedInt32x8", + argLen: 2, + generic: true, + }, + { + name: "InterleaveLoGroupedInt32x16", + argLen: 2, + generic: true, + }, + { + name: "InterleaveLoGroupedInt64x4", + argLen: 2, + generic: true, + }, + { + name: "InterleaveLoGroupedInt64x8", + argLen: 2, + generic: true, + }, + { + name: "InterleaveLoGroupedUint16x16", + argLen: 2, + generic: true, + }, + { + name: "InterleaveLoGroupedUint16x32", + argLen: 2, + generic: true, + }, + { + name: "InterleaveLoGroupedUint32x8", + argLen: 2, + generic: true, + }, + { + name: "InterleaveLoGroupedUint32x16", + argLen: 2, + generic: true, + }, + { + name: "InterleaveLoGroupedUint64x4", + argLen: 2, + generic: true, + }, + { + name: "InterleaveLoGroupedUint64x8", + argLen: 2, + generic: true, + }, + { + name: "InterleaveLoInt16x8", + argLen: 2, + generic: true, + }, + { + name: "InterleaveLoInt32x4", + argLen: 2, + generic: true, + }, + { + name: "InterleaveLoInt64x2", + argLen: 2, + generic: true, + }, + { + name: "InterleaveLoUint16x8", + argLen: 2, + generic: true, + }, + { + name: "InterleaveLoUint32x4", + argLen: 2, + generic: true, + }, + { + name: "InterleaveLoUint64x2", + argLen: 2, + generic: true, + }, { name: "IsNanFloat32x4", argLen: 2, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 8fec5d5b9a..236eed8629 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -2363,6 +2363,114 @@ func rewriteValueAMD64(v *Value) bool { case OpInterCall: v.Op = OpAMD64CALLinter return true + case OpInterleaveHiGroupedInt16x16: + v.Op = OpAMD64VPUNPCKHWD256 + return true + case OpInterleaveHiGroupedInt16x32: + v.Op = OpAMD64VPUNPCKHWD512 + return true + case 
OpInterleaveHiGroupedInt32x16: + v.Op = OpAMD64VPUNPCKHDQ512 + return true + case OpInterleaveHiGroupedInt32x8: + v.Op = OpAMD64VPUNPCKHDQ256 + return true + case OpInterleaveHiGroupedInt64x4: + v.Op = OpAMD64VPUNPCKHQDQ256 + return true + case OpInterleaveHiGroupedInt64x8: + v.Op = OpAMD64VPUNPCKHQDQ512 + return true + case OpInterleaveHiGroupedUint16x16: + v.Op = OpAMD64VPUNPCKHWD256 + return true + case OpInterleaveHiGroupedUint16x32: + v.Op = OpAMD64VPUNPCKHWD512 + return true + case OpInterleaveHiGroupedUint32x16: + v.Op = OpAMD64VPUNPCKHDQ512 + return true + case OpInterleaveHiGroupedUint32x8: + v.Op = OpAMD64VPUNPCKHDQ256 + return true + case OpInterleaveHiGroupedUint64x4: + v.Op = OpAMD64VPUNPCKHQDQ256 + return true + case OpInterleaveHiGroupedUint64x8: + v.Op = OpAMD64VPUNPCKHQDQ512 + return true + case OpInterleaveHiInt16x8: + v.Op = OpAMD64VPUNPCKHWD128 + return true + case OpInterleaveHiInt32x4: + v.Op = OpAMD64VPUNPCKHDQ128 + return true + case OpInterleaveHiInt64x2: + v.Op = OpAMD64VPUNPCKHQDQ128 + return true + case OpInterleaveHiUint16x8: + v.Op = OpAMD64VPUNPCKHWD128 + return true + case OpInterleaveHiUint32x4: + v.Op = OpAMD64VPUNPCKHDQ128 + return true + case OpInterleaveHiUint64x2: + v.Op = OpAMD64VPUNPCKHQDQ128 + return true + case OpInterleaveLoGroupedInt16x16: + v.Op = OpAMD64VPUNPCKLWD256 + return true + case OpInterleaveLoGroupedInt16x32: + v.Op = OpAMD64VPUNPCKLWD512 + return true + case OpInterleaveLoGroupedInt32x16: + v.Op = OpAMD64VPUNPCKLDQ512 + return true + case OpInterleaveLoGroupedInt32x8: + v.Op = OpAMD64VPUNPCKLDQ256 + return true + case OpInterleaveLoGroupedInt64x4: + v.Op = OpAMD64VPUNPCKLQDQ256 + return true + case OpInterleaveLoGroupedInt64x8: + v.Op = OpAMD64VPUNPCKLQDQ512 + return true + case OpInterleaveLoGroupedUint16x16: + v.Op = OpAMD64VPUNPCKLWD256 + return true + case OpInterleaveLoGroupedUint16x32: + v.Op = OpAMD64VPUNPCKLWD512 + return true + case OpInterleaveLoGroupedUint32x16: + v.Op = OpAMD64VPUNPCKLDQ512 + return true + case OpInterleaveLoGroupedUint32x8: + v.Op = OpAMD64VPUNPCKLDQ256 + return true + case OpInterleaveLoGroupedUint64x4: + v.Op = OpAMD64VPUNPCKLQDQ256 + return true + case OpInterleaveLoGroupedUint64x8: + v.Op = OpAMD64VPUNPCKLQDQ512 + return true + case OpInterleaveLoInt16x8: + v.Op = OpAMD64VPUNPCKLWD128 + return true + case OpInterleaveLoInt32x4: + v.Op = OpAMD64VPUNPCKLDQ128 + return true + case OpInterleaveLoInt64x2: + v.Op = OpAMD64VPUNPCKLQDQ128 + return true + case OpInterleaveLoUint16x8: + v.Op = OpAMD64VPUNPCKLWD128 + return true + case OpInterleaveLoUint32x4: + v.Op = OpAMD64VPUNPCKLDQ128 + return true + case OpInterleaveLoUint64x2: + v.Op = OpAMD64VPUNPCKLQDQ128 + return true case OpIsInBounds: return rewriteValueAMD64_OpIsInBounds(v) case OpIsNanFloat32x16: diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 4ce329e1a4..d75dc440d2 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -532,6 +532,42 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Uint16x32.GreaterEqual", opLen2(ssa.OpGreaterEqualUint16x32, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint32x16.GreaterEqual", opLen2(ssa.OpGreaterEqualUint32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint64x8.GreaterEqual", opLen2(ssa.OpGreaterEqualUint64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.InterleaveHi", opLen2(ssa.OpInterleaveHiInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x4.InterleaveHi", opLen2(ssa.OpInterleaveHiInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int64x2.InterleaveHi", opLen2(ssa.OpInterleaveHiInt64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x8.InterleaveHi", opLen2(ssa.OpInterleaveHiUint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x4.InterleaveHi", opLen2(ssa.OpInterleaveHiUint32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint64x2.InterleaveHi", opLen2(ssa.OpInterleaveHiUint64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.InterleaveHiGrouped", opLen2(ssa.OpInterleaveHiGroupedInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.InterleaveHiGrouped", opLen2(ssa.OpInterleaveHiGroupedInt16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x8.InterleaveHiGrouped", opLen2(ssa.OpInterleaveHiGroupedInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x16.InterleaveHiGrouped", opLen2(ssa.OpInterleaveHiGroupedInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int64x4.InterleaveHiGrouped", opLen2(ssa.OpInterleaveHiGroupedInt64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int64x8.InterleaveHiGrouped", opLen2(ssa.OpInterleaveHiGroupedInt64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint16x16.InterleaveHiGrouped", opLen2(ssa.OpInterleaveHiGroupedUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x32.InterleaveHiGrouped", opLen2(ssa.OpInterleaveHiGroupedUint16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint32x8.InterleaveHiGrouped", opLen2(ssa.OpInterleaveHiGroupedUint32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x16.InterleaveHiGrouped", opLen2(ssa.OpInterleaveHiGroupedUint32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint64x4.InterleaveHiGrouped", opLen2(ssa.OpInterleaveHiGroupedUint64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint64x8.InterleaveHiGrouped", opLen2(ssa.OpInterleaveHiGroupedUint64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.InterleaveLo", opLen2(ssa.OpInterleaveLoInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x4.InterleaveLo", opLen2(ssa.OpInterleaveLoInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int64x2.InterleaveLo", opLen2(ssa.OpInterleaveLoInt64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x8.InterleaveLo", opLen2(ssa.OpInterleaveLoUint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x4.InterleaveLo", opLen2(ssa.OpInterleaveLoUint32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint64x2.InterleaveLo", opLen2(ssa.OpInterleaveLoUint64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedInt16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x8.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x16.InterleaveLoGrouped", 
opLen2(ssa.OpInterleaveLoGroupedInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int64x4.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedInt64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int64x8.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedInt64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint16x16.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x32.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedUint16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint32x8.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedUint32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x16.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedUint32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint64x4.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedUint64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint64x8.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedUint64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.IsNan", opLen2(ssa.OpIsNanFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.IsNan", opLen2(ssa.OpIsNanFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.IsNan", opLen2(ssa.OpIsNanFloat32x16, types.TypeVec512), sys.AMD64) diff --git a/src/simd/_gen/simdgen/ops/Moves/categories.yaml b/src/simd/_gen/simdgen/ops/Moves/categories.yaml index 556562b51a..27e67f4787 100644 --- a/src/simd/_gen/simdgen/ops/Moves/categories.yaml +++ b/src/simd/_gen/simdgen/ops/Moves/categories.yaml @@ -102,4 +102,21 @@ - go: PermuteConstantHiGrouped commutative: false documentation: !string |- # Detailed documentation will rely on the specific ops. - // NAME performs a grouped permutation of vector x using constant indices: \ No newline at end of file + // NAME performs a grouped permutation of vector x using constant indices: +- go: InterleaveHi + commutative: false + documentation: !string |- + // NAME interleaves the elements of the high halves of x and y. +- go: InterleaveLo + commutative: false + documentation: !string |- + // NAME interleaves the elements of the low halves of x and y. +- go: InterleaveHiGrouped + commutative: false + documentation: !string |- + // NAME interleaves the elements of the high half of each 128-bit subvector of x and y. +- go: InterleaveLoGrouped + commutative: false + documentation: !string |- + // NAME interleaves the elements of the low half of each 128-bit subvector of x and y. 
+ diff --git a/src/simd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/_gen/simdgen/ops/Moves/go.yaml index 3d471ec480..eb14058a88 100644 --- a/src/simd/_gen/simdgen/ops/Moves/go.yaml +++ b/src/simd/_gen/simdgen/ops/Moves/go.yaml @@ -526,4 +526,41 @@ immOffset: 0 name: indices out: - - *256Or512any \ No newline at end of file + - *256Or512any + +- go: InterleaveHi + asm: VPUNPCKH(QDQ|DQ|WD|WB) + in: + - *128any + - *128any + inVariant: [] + out: + - *128any + +- go: InterleaveLo + asm: VPUNPCKL(QDQ|DQ|WD|WB) + in: + - *128any + - *128any + inVariant: [] + out: + - *128any + +- go: InterleaveHiGrouped + asm: VPUNPCKH(QDQ|DQ|WD|WB) + in: + - *256Or512any + - *256Or512any + inVariant: [] + out: + - *256Or512any + +- go: InterleaveLoGrouped + asm: VPUNPCKL(QDQ|DQ|WD|WB) + in: + - *256Or512any + - *256Or512any + inVariant: [] + out: + - *256Or512any + diff --git a/src/simd/internal/simd_test/simd_test.go b/src/simd/internal/simd_test/simd_test.go index 3dcb5c6a27..98cfd55ac5 100644 --- a/src/simd/internal/simd_test/simd_test.go +++ b/src/simd/internal/simd_test/simd_test.go @@ -494,3 +494,27 @@ func TestMaskOpt512(t *testing.T) { checkSlices[int64](t, k, []int64{-1, 0, -1, 0, -1, 0, -1, 0}) checkSlices[float64](t, s, []float64{3, 0, 9, 0, 15, 0, 21, 0}) } + +// flattenedTranspose tranposes x and y, regarded as a pair of 2x2 +// matrices, but then flattens the rows in order, i.e +// x: ABCD ==> a: A1B2 +// y: 1234 b: C3D4 +func flattenedTranspose(x, y simd.Int32x4) (a, b simd.Int32x4) { + return x.InterleaveLo(y), x.InterleaveHi(y) +} + +func TestFlattenedTranspose(t *testing.T) { + r := make([]int32, 4, 4) + s := make([]int32, 4, 4) + + x := simd.LoadInt32x4Slice([]int32{0xA, 0xB, 0xC, 0xD}) + y := simd.LoadInt32x4Slice([]int32{1, 2, 3, 4}) + a, b := flattenedTranspose(x, y) + + a.StoreSlice(r) + b.StoreSlice(s) + + checkSlices[int32](t, r, []int32{0xA, 1, 0xB, 2}) + checkSlices[int32](t, s, []int32{0xC, 3, 0xD, 4}) + +} diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index bce30aa2cb..39552131bf 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -3078,6 +3078,194 @@ func (x Uint32x16) GreaterEqual(y Uint32x16) Mask32x16 // Asm: VPCMPUQ, CPU Feature: AVX512 func (x Uint64x8) GreaterEqual(y Uint64x8) Mask64x8 +/* InterleaveHi */ + +// InterleaveHi interleaves the elements of the high halves of x and y. +// +// Asm: VPUNPCKHWD, CPU Feature: AVX +func (x Int16x8) InterleaveHi(y Int16x8) Int16x8 + +// InterleaveHi interleaves the elements of the high halves of x and y. +// +// Asm: VPUNPCKHDQ, CPU Feature: AVX +func (x Int32x4) InterleaveHi(y Int32x4) Int32x4 + +// InterleaveHi interleaves the elements of the high halves of x and y. +// +// Asm: VPUNPCKHQDQ, CPU Feature: AVX +func (x Int64x2) InterleaveHi(y Int64x2) Int64x2 + +// InterleaveHi interleaves the elements of the high halves of x and y. +// +// Asm: VPUNPCKHWD, CPU Feature: AVX +func (x Uint16x8) InterleaveHi(y Uint16x8) Uint16x8 + +// InterleaveHi interleaves the elements of the high halves of x and y. +// +// Asm: VPUNPCKHDQ, CPU Feature: AVX +func (x Uint32x4) InterleaveHi(y Uint32x4) Uint32x4 + +// InterleaveHi interleaves the elements of the high halves of x and y. +// +// Asm: VPUNPCKHQDQ, CPU Feature: AVX +func (x Uint64x2) InterleaveHi(y Uint64x2) Uint64x2 + +/* InterleaveHiGrouped */ + +// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y. 
+// +// Asm: VPUNPCKHWD, CPU Feature: AVX2 +func (x Int16x16) InterleaveHiGrouped(y Int16x16) Int16x16 + +// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y. +// +// Asm: VPUNPCKHWD, CPU Feature: AVX512 +func (x Int16x32) InterleaveHiGrouped(y Int16x32) Int16x32 + +// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y. +// +// Asm: VPUNPCKHDQ, CPU Feature: AVX2 +func (x Int32x8) InterleaveHiGrouped(y Int32x8) Int32x8 + +// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y. +// +// Asm: VPUNPCKHDQ, CPU Feature: AVX512 +func (x Int32x16) InterleaveHiGrouped(y Int32x16) Int32x16 + +// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y. +// +// Asm: VPUNPCKHQDQ, CPU Feature: AVX2 +func (x Int64x4) InterleaveHiGrouped(y Int64x4) Int64x4 + +// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y. +// +// Asm: VPUNPCKHQDQ, CPU Feature: AVX512 +func (x Int64x8) InterleaveHiGrouped(y Int64x8) Int64x8 + +// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y. +// +// Asm: VPUNPCKHWD, CPU Feature: AVX2 +func (x Uint16x16) InterleaveHiGrouped(y Uint16x16) Uint16x16 + +// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y. +// +// Asm: VPUNPCKHWD, CPU Feature: AVX512 +func (x Uint16x32) InterleaveHiGrouped(y Uint16x32) Uint16x32 + +// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y. +// +// Asm: VPUNPCKHDQ, CPU Feature: AVX2 +func (x Uint32x8) InterleaveHiGrouped(y Uint32x8) Uint32x8 + +// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y. +// +// Asm: VPUNPCKHDQ, CPU Feature: AVX512 +func (x Uint32x16) InterleaveHiGrouped(y Uint32x16) Uint32x16 + +// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y. +// +// Asm: VPUNPCKHQDQ, CPU Feature: AVX2 +func (x Uint64x4) InterleaveHiGrouped(y Uint64x4) Uint64x4 + +// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y. +// +// Asm: VPUNPCKHQDQ, CPU Feature: AVX512 +func (x Uint64x8) InterleaveHiGrouped(y Uint64x8) Uint64x8 + +/* InterleaveLo */ + +// InterleaveLo interleaves the elements of the low halves of x and y. +// +// Asm: VPUNPCKLWD, CPU Feature: AVX +func (x Int16x8) InterleaveLo(y Int16x8) Int16x8 + +// InterleaveLo interleaves the elements of the low halves of x and y. +// +// Asm: VPUNPCKLDQ, CPU Feature: AVX +func (x Int32x4) InterleaveLo(y Int32x4) Int32x4 + +// InterleaveLo interleaves the elements of the low halves of x and y. +// +// Asm: VPUNPCKLQDQ, CPU Feature: AVX +func (x Int64x2) InterleaveLo(y Int64x2) Int64x2 + +// InterleaveLo interleaves the elements of the low halves of x and y. +// +// Asm: VPUNPCKLWD, CPU Feature: AVX +func (x Uint16x8) InterleaveLo(y Uint16x8) Uint16x8 + +// InterleaveLo interleaves the elements of the low halves of x and y. +// +// Asm: VPUNPCKLDQ, CPU Feature: AVX +func (x Uint32x4) InterleaveLo(y Uint32x4) Uint32x4 + +// InterleaveLo interleaves the elements of the low halves of x and y. 
+//
+// Asm: VPUNPCKLQDQ, CPU Feature: AVX
+func (x Uint64x2) InterleaveLo(y Uint64x2) Uint64x2
+
+/* InterleaveLoGrouped */
+
+// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKLWD, CPU Feature: AVX2
+func (x Int16x16) InterleaveLoGrouped(y Int16x16) Int16x16
+
+// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKLWD, CPU Feature: AVX512
+func (x Int16x32) InterleaveLoGrouped(y Int16x32) Int16x32
+
+// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKLDQ, CPU Feature: AVX2
+func (x Int32x8) InterleaveLoGrouped(y Int32x8) Int32x8
+
+// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKLDQ, CPU Feature: AVX512
+func (x Int32x16) InterleaveLoGrouped(y Int32x16) Int32x16
+
+// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKLQDQ, CPU Feature: AVX2
+func (x Int64x4) InterleaveLoGrouped(y Int64x4) Int64x4
+
+// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKLQDQ, CPU Feature: AVX512
+func (x Int64x8) InterleaveLoGrouped(y Int64x8) Int64x8
+
+// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKLWD, CPU Feature: AVX2
+func (x Uint16x16) InterleaveLoGrouped(y Uint16x16) Uint16x16
+
+// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKLWD, CPU Feature: AVX512
+func (x Uint16x32) InterleaveLoGrouped(y Uint16x32) Uint16x32
+
+// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKLDQ, CPU Feature: AVX2
+func (x Uint32x8) InterleaveLoGrouped(y Uint32x8) Uint32x8
+
+// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKLDQ, CPU Feature: AVX512
+func (x Uint32x16) InterleaveLoGrouped(y Uint32x16) Uint32x16
+
+// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKLQDQ, CPU Feature: AVX2
+func (x Uint64x4) InterleaveLoGrouped(y Uint64x4) Uint64x4
+
+// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKLQDQ, CPU Feature: AVX512
+func (x Uint64x8) InterleaveLoGrouped(y Uint64x8) Uint64x8
+
 /* IsNan */
 
 // IsNan checks if elements are NaN. Use as x.IsNan(x).
diff --git a/src/simd/shuffles_amd64.go b/src/simd/shuffles_amd64.go
new file mode 100644
index 0000000000..4445a88f31
--- /dev/null
+++ b/src/simd/shuffles_amd64.go
@@ -0,0 +1,15 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && amd64
+
+package simd
+
+// FlattenedTranspose transposes x and y, regarded as a pair of 2x2
+// matrices, but then flattens the rows in order, i.e.
+// x: ABCD ==> a: A1B2
+// y: 1234     b: C3D4
+func (x Int32x4) FlattenedTranspose(y Int32x4) (a, b Int32x4) {
+	return x.InterleaveLo(y), x.InterleaveHi(y)
+}
-- 
2.52.0
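
A rough usage sketch, separate from the change itself, showing the element ordering these intrinsics are documented to produce. It assumes a dev.simd toolchain built with GOEXPERIMENT=simd, that the simd package's LoadInt32x4Slice/LoadInt32x8Slice/StoreSlice helpers are available as in the test above, and that the Grouped results follow the per-128-bit-group behavior of VPUNPCK* described in the doc comments.

package main

import (
	"fmt"
	"simd"
)

func main() {
	// 128-bit vectors: InterleaveLo pairs up the low halves of x and y
	// element-wise, InterleaveHi pairs up the high halves.
	x := simd.LoadInt32x4Slice([]int32{0xA, 0xB, 0xC, 0xD})
	y := simd.LoadInt32x4Slice([]int32{1, 2, 3, 4})

	lo := make([]int32, 4)
	hi := make([]int32, 4)
	x.InterleaveLo(y).StoreSlice(lo) // lo = {0xA, 1, 0xB, 2}
	x.InterleaveHi(y).StoreSlice(hi) // hi = {0xC, 3, 0xD, 4}

	// FlattenedTranspose (added in shuffles_amd64.go) is just this Lo/Hi
	// pair: the rows of the transposed 2x2 matrices, flattened in order.
	a, b := x.FlattenedTranspose(y)
	a.StoreSlice(lo)
	b.StoreSlice(hi)
	fmt.Println(lo, hi) // [10 1 11 2] [12 3 13 4]

	// 256-bit "Grouped" variants interleave within each 128-bit group
	// independently, matching the per-lane behavior of VPUNPCK* on
	// wider vectors (assumed ordering, inferred from the instruction).
	x8 := simd.LoadInt32x8Slice([]int32{0, 1, 2, 3, 4, 5, 6, 7})
	y8 := simd.LoadInt32x8Slice([]int32{10, 11, 12, 13, 14, 15, 16, 17})
	lo8 := make([]int32, 8)
	hi8 := make([]int32, 8)
	x8.InterleaveLoGrouped(y8).StoreSlice(lo8) // {0, 10, 1, 11, 4, 14, 5, 15}
	x8.InterleaveHiGrouped(y8).StoreSlice(hi8) // {2, 12, 3, 13, 6, 16, 7, 17}
	fmt.Println(lo8, hi8)
}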