From 5b0ef7fcdc18bcec16b50e4ebc220f3ee3a9a4cb Mon Sep 17 00:00:00 2001 From: Junyang Shao Date: Tue, 5 Aug 2025 19:42:12 +0000 Subject: [PATCH] [dev.simd] cmd/compile, simd: add Expand This CL is generated by CL 693336. Change-Id: Ic1712d49fcad0544fa3c19b0249d8bc65b347104 Reviewed-on: https://go-review.googlesource.com/c/go/+/693375 Reviewed-by: David Chase LUCI-TryBot-Result: Go LUCI --- src/cmd/compile/internal/amd64/simdssa.go | 36 ++ .../compile/internal/ssa/_gen/simdAMD64.rules | 30 + .../compile/internal/ssa/_gen/simdAMD64ops.go | 18 + .../internal/ssa/_gen/simdgenericOps.go | 30 + src/cmd/compile/internal/ssa/opGen.go | 450 +++++++++++++++ src/cmd/compile/internal/ssa/rewriteAMD64.go | 540 ++++++++++++++++++ .../compile/internal/ssagen/simdintrinsics.go | 30 + src/simd/ops_amd64.go | 182 ++++++ src/simd/simd_test.go | 16 + 9 files changed, 1332 insertions(+) diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index 7a0a0be58f..b778cd7994 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -644,6 +644,24 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VCVTPS2UDQMasked128, ssa.OpAMD64VCVTPS2UDQMasked256, ssa.OpAMD64VCVTPS2UDQMasked512, + ssa.OpAMD64VEXPANDPSMasked128, + ssa.OpAMD64VEXPANDPSMasked256, + ssa.OpAMD64VEXPANDPSMasked512, + ssa.OpAMD64VEXPANDPDMasked128, + ssa.OpAMD64VEXPANDPDMasked256, + ssa.OpAMD64VEXPANDPDMasked512, + ssa.OpAMD64VPEXPANDBMasked128, + ssa.OpAMD64VPEXPANDBMasked256, + ssa.OpAMD64VPEXPANDBMasked512, + ssa.OpAMD64VPEXPANDWMasked128, + ssa.OpAMD64VPEXPANDWMasked256, + ssa.OpAMD64VPEXPANDWMasked512, + ssa.OpAMD64VPEXPANDDMasked128, + ssa.OpAMD64VPEXPANDDMasked256, + ssa.OpAMD64VPEXPANDDMasked512, + ssa.OpAMD64VPEXPANDQMasked128, + ssa.OpAMD64VPEXPANDQMasked256, + ssa.OpAMD64VPEXPANDQMasked512, ssa.OpAMD64VPOPCNTBMasked128, ssa.OpAMD64VPOPCNTBMasked256, ssa.OpAMD64VPOPCNTBMasked512, @@ -1229,6 +1247,24 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VDIVPDMasked128, ssa.OpAMD64VDIVPDMasked256, ssa.OpAMD64VDIVPDMasked512, + ssa.OpAMD64VEXPANDPSMasked128, + ssa.OpAMD64VEXPANDPSMasked256, + ssa.OpAMD64VEXPANDPSMasked512, + ssa.OpAMD64VEXPANDPDMasked128, + ssa.OpAMD64VEXPANDPDMasked256, + ssa.OpAMD64VEXPANDPDMasked512, + ssa.OpAMD64VPEXPANDBMasked128, + ssa.OpAMD64VPEXPANDBMasked256, + ssa.OpAMD64VPEXPANDBMasked512, + ssa.OpAMD64VPEXPANDWMasked128, + ssa.OpAMD64VPEXPANDWMasked256, + ssa.OpAMD64VPEXPANDWMasked512, + ssa.OpAMD64VPEXPANDDMasked128, + ssa.OpAMD64VPEXPANDDMasked256, + ssa.OpAMD64VPEXPANDDMasked512, + ssa.OpAMD64VPEXPANDQMasked128, + ssa.OpAMD64VPEXPANDQMasked256, + ssa.OpAMD64VPEXPANDQMasked512, ssa.OpAMD64VFMADD213PSMasked128, ssa.OpAMD64VFMADD213PSMasked256, ssa.OpAMD64VFMADD213PSMasked512, diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index 316db1b841..ae29a9117e 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -385,6 +385,36 @@ (EqualMaskedUint64x2 x y mask) => (VPMOVMToVec64x2 (VPCMPUQMasked128 [0] x y (VPMOVVec64x2ToM mask))) (EqualMaskedUint64x4 x y mask) => (VPMOVMToVec64x4 (VPCMPUQMasked256 [0] x y (VPMOVVec64x4ToM mask))) (EqualMaskedUint64x8 x y mask) => (VPMOVMToVec64x8 (VPCMPUQMasked512 [0] x y (VPMOVVec64x8ToM mask))) +(ExpandFloat32x4 x mask) => (VEXPANDPSMasked128 x (VPMOVVec32x4ToM mask)) +(ExpandFloat32x8 x mask) => (VEXPANDPSMasked256 x 
(VPMOVVec32x8ToM mask)) +(ExpandFloat32x16 x mask) => (VEXPANDPSMasked512 x (VPMOVVec32x16ToM mask)) +(ExpandFloat64x2 x mask) => (VEXPANDPDMasked128 x (VPMOVVec64x2ToM mask)) +(ExpandFloat64x4 x mask) => (VEXPANDPDMasked256 x (VPMOVVec64x4ToM mask)) +(ExpandFloat64x8 x mask) => (VEXPANDPDMasked512 x (VPMOVVec64x8ToM mask)) +(ExpandInt8x16 x mask) => (VPEXPANDBMasked128 x (VPMOVVec8x16ToM mask)) +(ExpandInt8x32 x mask) => (VPEXPANDBMasked256 x (VPMOVVec8x32ToM mask)) +(ExpandInt8x64 x mask) => (VPEXPANDBMasked512 x (VPMOVVec8x64ToM mask)) +(ExpandInt16x8 x mask) => (VPEXPANDWMasked128 x (VPMOVVec16x8ToM mask)) +(ExpandInt16x16 x mask) => (VPEXPANDWMasked256 x (VPMOVVec16x16ToM mask)) +(ExpandInt16x32 x mask) => (VPEXPANDWMasked512 x (VPMOVVec16x32ToM mask)) +(ExpandInt32x4 x mask) => (VPEXPANDDMasked128 x (VPMOVVec32x4ToM mask)) +(ExpandInt32x8 x mask) => (VPEXPANDDMasked256 x (VPMOVVec32x8ToM mask)) +(ExpandInt32x16 x mask) => (VPEXPANDDMasked512 x (VPMOVVec32x16ToM mask)) +(ExpandInt64x2 x mask) => (VPEXPANDQMasked128 x (VPMOVVec64x2ToM mask)) +(ExpandInt64x4 x mask) => (VPEXPANDQMasked256 x (VPMOVVec64x4ToM mask)) +(ExpandInt64x8 x mask) => (VPEXPANDQMasked512 x (VPMOVVec64x8ToM mask)) +(ExpandUint8x16 x mask) => (VPEXPANDBMasked128 x (VPMOVVec8x16ToM mask)) +(ExpandUint8x32 x mask) => (VPEXPANDBMasked256 x (VPMOVVec8x32ToM mask)) +(ExpandUint8x64 x mask) => (VPEXPANDBMasked512 x (VPMOVVec8x64ToM mask)) +(ExpandUint16x8 x mask) => (VPEXPANDWMasked128 x (VPMOVVec16x8ToM mask)) +(ExpandUint16x16 x mask) => (VPEXPANDWMasked256 x (VPMOVVec16x16ToM mask)) +(ExpandUint16x32 x mask) => (VPEXPANDWMasked512 x (VPMOVVec16x32ToM mask)) +(ExpandUint32x4 x mask) => (VPEXPANDDMasked128 x (VPMOVVec32x4ToM mask)) +(ExpandUint32x8 x mask) => (VPEXPANDDMasked256 x (VPMOVVec32x8ToM mask)) +(ExpandUint32x16 x mask) => (VPEXPANDDMasked512 x (VPMOVVec32x16ToM mask)) +(ExpandUint64x2 x mask) => (VPEXPANDQMasked128 x (VPMOVVec64x2ToM mask)) +(ExpandUint64x4 x mask) => (VPEXPANDQMasked256 x (VPMOVVec64x4ToM mask)) +(ExpandUint64x8 x mask) => (VPEXPANDQMasked512 x (VPMOVVec64x8ToM mask)) (FloorFloat32x4 x) => (VROUNDPS128 [1] x) (FloorFloat32x8 x) => (VROUNDPS256 [1] x) (FloorFloat64x2 x) => (VROUNDPD128 [1] x) diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go index 591f8a5bca..ccda39f59d 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go @@ -49,6 +49,12 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VDIVPSMasked128", argLength: 3, reg: w2kw, asm: "VDIVPS", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VDIVPSMasked256", argLength: 3, reg: w2kw, asm: "VDIVPS", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VDIVPSMasked512", argLength: 3, reg: w2kw, asm: "VDIVPS", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VEXPANDPDMasked128", argLength: 2, reg: wkw, asm: "VEXPANDPD", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VEXPANDPDMasked256", argLength: 2, reg: wkw, asm: "VEXPANDPD", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VEXPANDPDMasked512", argLength: 2, reg: wkw, asm: "VEXPANDPD", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VEXPANDPSMasked128", argLength: 2, reg: wkw, asm: "VEXPANDPS", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VEXPANDPSMasked256", argLength: 2, reg: wkw, asm: "VEXPANDPS", 
commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VEXPANDPSMasked512", argLength: 2, reg: wkw, asm: "VEXPANDPS", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VFMADD213PD128", argLength: 3, reg: w31, asm: "VFMADD213PD", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VFMADD213PD256", argLength: 3, reg: w31, asm: "VFMADD213PD", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VFMADD213PD512", argLength: 3, reg: w31, asm: "VFMADD213PD", commutative: false, typ: "Vec512", resultInArg0: true}, @@ -357,6 +363,18 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPERMWMasked128", argLength: 3, reg: w2kw, asm: "VPERMW", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPERMWMasked256", argLength: 3, reg: w2kw, asm: "VPERMW", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMWMasked512", argLength: 3, reg: w2kw, asm: "VPERMW", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPEXPANDBMasked128", argLength: 2, reg: wkw, asm: "VPEXPANDB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPEXPANDBMasked256", argLength: 2, reg: wkw, asm: "VPEXPANDB", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPEXPANDBMasked512", argLength: 2, reg: wkw, asm: "VPEXPANDB", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPEXPANDDMasked128", argLength: 2, reg: wkw, asm: "VPEXPANDD", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPEXPANDDMasked256", argLength: 2, reg: wkw, asm: "VPEXPANDD", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPEXPANDDMasked512", argLength: 2, reg: wkw, asm: "VPEXPANDD", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPEXPANDQMasked128", argLength: 2, reg: wkw, asm: "VPEXPANDQ", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPEXPANDQMasked256", argLength: 2, reg: wkw, asm: "VPEXPANDQ", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPEXPANDQMasked512", argLength: 2, reg: wkw, asm: "VPEXPANDQ", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPEXPANDWMasked128", argLength: 2, reg: wkw, asm: "VPEXPANDW", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPEXPANDWMasked256", argLength: 2, reg: wkw, asm: "VPEXPANDW", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPEXPANDWMasked512", argLength: 2, reg: wkw, asm: "VPEXPANDW", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPHADDD128", argLength: 2, reg: v21, asm: "VPHADDD", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPHADDD256", argLength: 2, reg: v21, asm: "VPHADDD", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPHADDSW128", argLength: 2, reg: v21, asm: "VPHADDSW", commutative: false, typ: "Vec128", resultInArg0: false}, diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index e132b058a4..d0a4a494b1 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -364,6 +364,36 @@ func simdGenericOps() []opData { {name: "EqualUint64x2", argLength: 2, commutative: true}, {name: "EqualUint64x4", argLength: 2, commutative: true}, {name: "EqualUint64x8", argLength: 2, commutative: true}, + {name: "ExpandFloat32x4", argLength: 2, commutative: false}, + {name: "ExpandFloat32x8", argLength: 2, 
commutative: false}, + {name: "ExpandFloat32x16", argLength: 2, commutative: false}, + {name: "ExpandFloat64x2", argLength: 2, commutative: false}, + {name: "ExpandFloat64x4", argLength: 2, commutative: false}, + {name: "ExpandFloat64x8", argLength: 2, commutative: false}, + {name: "ExpandInt8x16", argLength: 2, commutative: false}, + {name: "ExpandInt8x32", argLength: 2, commutative: false}, + {name: "ExpandInt8x64", argLength: 2, commutative: false}, + {name: "ExpandInt16x8", argLength: 2, commutative: false}, + {name: "ExpandInt16x16", argLength: 2, commutative: false}, + {name: "ExpandInt16x32", argLength: 2, commutative: false}, + {name: "ExpandInt32x4", argLength: 2, commutative: false}, + {name: "ExpandInt32x8", argLength: 2, commutative: false}, + {name: "ExpandInt32x16", argLength: 2, commutative: false}, + {name: "ExpandInt64x2", argLength: 2, commutative: false}, + {name: "ExpandInt64x4", argLength: 2, commutative: false}, + {name: "ExpandInt64x8", argLength: 2, commutative: false}, + {name: "ExpandUint8x16", argLength: 2, commutative: false}, + {name: "ExpandUint8x32", argLength: 2, commutative: false}, + {name: "ExpandUint8x64", argLength: 2, commutative: false}, + {name: "ExpandUint16x8", argLength: 2, commutative: false}, + {name: "ExpandUint16x16", argLength: 2, commutative: false}, + {name: "ExpandUint16x32", argLength: 2, commutative: false}, + {name: "ExpandUint32x4", argLength: 2, commutative: false}, + {name: "ExpandUint32x8", argLength: 2, commutative: false}, + {name: "ExpandUint32x16", argLength: 2, commutative: false}, + {name: "ExpandUint64x2", argLength: 2, commutative: false}, + {name: "ExpandUint64x4", argLength: 2, commutative: false}, + {name: "ExpandUint64x8", argLength: 2, commutative: false}, {name: "FloorFloat32x4", argLength: 1, commutative: false}, {name: "FloorFloat32x8", argLength: 1, commutative: false}, {name: "FloorFloat64x2", argLength: 1, commutative: false}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index b39311cd90..2fafe10ea5 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1268,6 +1268,12 @@ const ( OpAMD64VDIVPSMasked128 OpAMD64VDIVPSMasked256 OpAMD64VDIVPSMasked512 + OpAMD64VEXPANDPDMasked128 + OpAMD64VEXPANDPDMasked256 + OpAMD64VEXPANDPDMasked512 + OpAMD64VEXPANDPSMasked128 + OpAMD64VEXPANDPSMasked256 + OpAMD64VEXPANDPSMasked512 OpAMD64VFMADD213PD128 OpAMD64VFMADD213PD256 OpAMD64VFMADD213PD512 @@ -1576,6 +1582,18 @@ const ( OpAMD64VPERMWMasked128 OpAMD64VPERMWMasked256 OpAMD64VPERMWMasked512 + OpAMD64VPEXPANDBMasked128 + OpAMD64VPEXPANDBMasked256 + OpAMD64VPEXPANDBMasked512 + OpAMD64VPEXPANDDMasked128 + OpAMD64VPEXPANDDMasked256 + OpAMD64VPEXPANDDMasked512 + OpAMD64VPEXPANDQMasked128 + OpAMD64VPEXPANDQMasked256 + OpAMD64VPEXPANDQMasked512 + OpAMD64VPEXPANDWMasked128 + OpAMD64VPEXPANDWMasked256 + OpAMD64VPEXPANDWMasked512 OpAMD64VPHADDD128 OpAMD64VPHADDD256 OpAMD64VPHADDSW128 @@ -4925,6 +4943,36 @@ const ( OpEqualUint64x2 OpEqualUint64x4 OpEqualUint64x8 + OpExpandFloat32x4 + OpExpandFloat32x8 + OpExpandFloat32x16 + OpExpandFloat64x2 + OpExpandFloat64x4 + OpExpandFloat64x8 + OpExpandInt8x16 + OpExpandInt8x32 + OpExpandInt8x64 + OpExpandInt16x8 + OpExpandInt16x16 + OpExpandInt16x32 + OpExpandInt32x4 + OpExpandInt32x8 + OpExpandInt32x16 + OpExpandInt64x2 + OpExpandInt64x4 + OpExpandInt64x8 + OpExpandUint8x16 + OpExpandUint8x32 + OpExpandUint8x64 + OpExpandUint16x8 + OpExpandUint16x16 + OpExpandUint16x32 + OpExpandUint32x4 + OpExpandUint32x8 + 
OpExpandUint32x16 + OpExpandUint64x2 + OpExpandUint64x4 + OpExpandUint64x8 OpFloorFloat32x4 OpFloorFloat32x8 OpFloorFloat64x2 @@ -20065,6 +20113,90 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VEXPANDPDMasked128", + argLen: 2, + asm: x86.AVEXPANDPD, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VEXPANDPDMasked256", + argLen: 2, + asm: x86.AVEXPANDPD, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VEXPANDPDMasked512", + argLen: 2, + asm: x86.AVEXPANDPD, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VEXPANDPSMasked128", + argLen: 2, + asm: x86.AVEXPANDPS, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VEXPANDPSMasked256", + argLen: 2, + asm: x86.AVEXPANDPS, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VEXPANDPSMasked512", + argLen: 2, + asm: x86.AVEXPANDPS, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VFMADD213PD128", argLen: 3, @@ -24788,6 +24920,174 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPEXPANDBMasked128", + argLen: 2, + asm: x86.AVPEXPANDB, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPEXPANDBMasked256", + argLen: 2, + asm: x86.AVPEXPANDB, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPEXPANDBMasked512", + argLen: 2, + asm: x86.AVPEXPANDB, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPEXPANDDMasked128", + argLen: 2, + asm: x86.AVPEXPANDD, + reg: regInfo{ + inputs: []inputInfo{ + {1, 
71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPEXPANDDMasked256", + argLen: 2, + asm: x86.AVPEXPANDD, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPEXPANDDMasked512", + argLen: 2, + asm: x86.AVPEXPANDD, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPEXPANDQMasked128", + argLen: 2, + asm: x86.AVPEXPANDQ, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPEXPANDQMasked256", + argLen: 2, + asm: x86.AVPEXPANDQ, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPEXPANDQMasked512", + argLen: 2, + asm: x86.AVPEXPANDQ, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPEXPANDWMasked128", + argLen: 2, + asm: x86.AVPEXPANDW, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPEXPANDWMasked256", + argLen: 2, + asm: x86.AVPEXPANDW, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPEXPANDWMasked512", + argLen: 2, + asm: x86.AVPEXPANDW, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPHADDD128", argLen: 2, @@ -64829,6 +65129,156 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "ExpandFloat32x4", + argLen: 2, + generic: true, + }, + { + name: "ExpandFloat32x8", + argLen: 2, + generic: true, + }, + { + name: "ExpandFloat32x16", + argLen: 2, + generic: true, + }, + { + name: "ExpandFloat64x2", + argLen: 2, + generic: true, + }, + { + name: "ExpandFloat64x4", + argLen: 2, + generic: true, + }, + { + name: "ExpandFloat64x8", + argLen: 2, + generic: true, + }, + { + name: 
"ExpandInt8x16", + argLen: 2, + generic: true, + }, + { + name: "ExpandInt8x32", + argLen: 2, + generic: true, + }, + { + name: "ExpandInt8x64", + argLen: 2, + generic: true, + }, + { + name: "ExpandInt16x8", + argLen: 2, + generic: true, + }, + { + name: "ExpandInt16x16", + argLen: 2, + generic: true, + }, + { + name: "ExpandInt16x32", + argLen: 2, + generic: true, + }, + { + name: "ExpandInt32x4", + argLen: 2, + generic: true, + }, + { + name: "ExpandInt32x8", + argLen: 2, + generic: true, + }, + { + name: "ExpandInt32x16", + argLen: 2, + generic: true, + }, + { + name: "ExpandInt64x2", + argLen: 2, + generic: true, + }, + { + name: "ExpandInt64x4", + argLen: 2, + generic: true, + }, + { + name: "ExpandInt64x8", + argLen: 2, + generic: true, + }, + { + name: "ExpandUint8x16", + argLen: 2, + generic: true, + }, + { + name: "ExpandUint8x32", + argLen: 2, + generic: true, + }, + { + name: "ExpandUint8x64", + argLen: 2, + generic: true, + }, + { + name: "ExpandUint16x8", + argLen: 2, + generic: true, + }, + { + name: "ExpandUint16x16", + argLen: 2, + generic: true, + }, + { + name: "ExpandUint16x32", + argLen: 2, + generic: true, + }, + { + name: "ExpandUint32x4", + argLen: 2, + generic: true, + }, + { + name: "ExpandUint32x8", + argLen: 2, + generic: true, + }, + { + name: "ExpandUint32x16", + argLen: 2, + generic: true, + }, + { + name: "ExpandUint64x2", + argLen: 2, + generic: true, + }, + { + name: "ExpandUint64x4", + argLen: 2, + generic: true, + }, + { + name: "ExpandUint64x8", + argLen: 2, + generic: true, + }, { name: "FloorFloat32x4", argLen: 1, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 91fd3fb470..6b63b70245 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -1754,6 +1754,66 @@ func rewriteValueAMD64(v *Value) bool { return true case OpEqualUint8x64: return rewriteValueAMD64_OpEqualUint8x64(v) + case OpExpandFloat32x16: + return rewriteValueAMD64_OpExpandFloat32x16(v) + case OpExpandFloat32x4: + return rewriteValueAMD64_OpExpandFloat32x4(v) + case OpExpandFloat32x8: + return rewriteValueAMD64_OpExpandFloat32x8(v) + case OpExpandFloat64x2: + return rewriteValueAMD64_OpExpandFloat64x2(v) + case OpExpandFloat64x4: + return rewriteValueAMD64_OpExpandFloat64x4(v) + case OpExpandFloat64x8: + return rewriteValueAMD64_OpExpandFloat64x8(v) + case OpExpandInt16x16: + return rewriteValueAMD64_OpExpandInt16x16(v) + case OpExpandInt16x32: + return rewriteValueAMD64_OpExpandInt16x32(v) + case OpExpandInt16x8: + return rewriteValueAMD64_OpExpandInt16x8(v) + case OpExpandInt32x16: + return rewriteValueAMD64_OpExpandInt32x16(v) + case OpExpandInt32x4: + return rewriteValueAMD64_OpExpandInt32x4(v) + case OpExpandInt32x8: + return rewriteValueAMD64_OpExpandInt32x8(v) + case OpExpandInt64x2: + return rewriteValueAMD64_OpExpandInt64x2(v) + case OpExpandInt64x4: + return rewriteValueAMD64_OpExpandInt64x4(v) + case OpExpandInt64x8: + return rewriteValueAMD64_OpExpandInt64x8(v) + case OpExpandInt8x16: + return rewriteValueAMD64_OpExpandInt8x16(v) + case OpExpandInt8x32: + return rewriteValueAMD64_OpExpandInt8x32(v) + case OpExpandInt8x64: + return rewriteValueAMD64_OpExpandInt8x64(v) + case OpExpandUint16x16: + return rewriteValueAMD64_OpExpandUint16x16(v) + case OpExpandUint16x32: + return rewriteValueAMD64_OpExpandUint16x32(v) + case OpExpandUint16x8: + return rewriteValueAMD64_OpExpandUint16x8(v) + case OpExpandUint32x16: + return rewriteValueAMD64_OpExpandUint32x16(v) + 
case OpExpandUint32x4: + return rewriteValueAMD64_OpExpandUint32x4(v) + case OpExpandUint32x8: + return rewriteValueAMD64_OpExpandUint32x8(v) + case OpExpandUint64x2: + return rewriteValueAMD64_OpExpandUint64x2(v) + case OpExpandUint64x4: + return rewriteValueAMD64_OpExpandUint64x4(v) + case OpExpandUint64x8: + return rewriteValueAMD64_OpExpandUint64x8(v) + case OpExpandUint8x16: + return rewriteValueAMD64_OpExpandUint8x16(v) + case OpExpandUint8x32: + return rewriteValueAMD64_OpExpandUint8x32(v) + case OpExpandUint8x64: + return rewriteValueAMD64_OpExpandUint8x64(v) case OpFMA: return rewriteValueAMD64_OpFMA(v) case OpFloor: @@ -34479,6 +34539,486 @@ func rewriteValueAMD64_OpEqualUint8x64(v *Value) bool { return true } } +func rewriteValueAMD64_OpExpandFloat32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandFloat32x16 x mask) + // result: (VEXPANDPSMasked512 x (VPMOVVec32x16ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VEXPANDPSMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandFloat32x4 x mask) + // result: (VEXPANDPSMasked128 x (VPMOVVec32x4ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VEXPANDPSMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandFloat32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandFloat32x8 x mask) + // result: (VEXPANDPSMasked256 x (VPMOVVec32x8ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VEXPANDPSMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandFloat64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandFloat64x2 x mask) + // result: (VEXPANDPDMasked128 x (VPMOVVec64x2ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VEXPANDPDMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandFloat64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandFloat64x4 x mask) + // result: (VEXPANDPDMasked256 x (VPMOVVec64x4ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VEXPANDPDMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandFloat64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandFloat64x8 x mask) + // result: (VEXPANDPDMasked512 x (VPMOVVec64x8ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VEXPANDPDMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandInt16x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandInt16x16 x mask) + // result: (VPEXPANDWMasked256 x (VPMOVVec16x16ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPEXPANDWMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } 
+} +func rewriteValueAMD64_OpExpandInt16x32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandInt16x32 x mask) + // result: (VPEXPANDWMasked512 x (VPMOVVec16x32ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPEXPANDWMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandInt16x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandInt16x8 x mask) + // result: (VPEXPANDWMasked128 x (VPMOVVec16x8ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPEXPANDWMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandInt32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandInt32x16 x mask) + // result: (VPEXPANDDMasked512 x (VPMOVVec32x16ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPEXPANDDMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandInt32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandInt32x4 x mask) + // result: (VPEXPANDDMasked128 x (VPMOVVec32x4ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPEXPANDDMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandInt32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandInt32x8 x mask) + // result: (VPEXPANDDMasked256 x (VPMOVVec32x8ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPEXPANDDMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandInt64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandInt64x2 x mask) + // result: (VPEXPANDQMasked128 x (VPMOVVec64x2ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPEXPANDQMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandInt64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandInt64x4 x mask) + // result: (VPEXPANDQMasked256 x (VPMOVVec64x4ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPEXPANDQMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandInt64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandInt64x8 x mask) + // result: (VPEXPANDQMasked512 x (VPMOVVec64x8ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPEXPANDQMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandInt8x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandInt8x16 x mask) + // result: (VPEXPANDBMasked128 x (VPMOVVec8x16ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPEXPANDBMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) + 
v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandInt8x32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandInt8x32 x mask) + // result: (VPEXPANDBMasked256 x (VPMOVVec8x32ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPEXPANDBMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandInt8x64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandInt8x64 x mask) + // result: (VPEXPANDBMasked512 x (VPMOVVec8x64ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPEXPANDBMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandUint16x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandUint16x16 x mask) + // result: (VPEXPANDWMasked256 x (VPMOVVec16x16ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPEXPANDWMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandUint16x32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandUint16x32 x mask) + // result: (VPEXPANDWMasked512 x (VPMOVVec16x32ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPEXPANDWMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandUint16x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandUint16x8 x mask) + // result: (VPEXPANDWMasked128 x (VPMOVVec16x8ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPEXPANDWMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandUint32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandUint32x16 x mask) + // result: (VPEXPANDDMasked512 x (VPMOVVec32x16ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPEXPANDDMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandUint32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandUint32x4 x mask) + // result: (VPEXPANDDMasked128 x (VPMOVVec32x4ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPEXPANDDMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandUint32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandUint32x8 x mask) + // result: (VPEXPANDDMasked256 x (VPMOVVec32x8ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPEXPANDDMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandUint64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandUint64x2 x mask) + // result: (VPEXPANDQMasked128 x (VPMOVVec64x2ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPEXPANDQMasked128) + v0 
:= b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandUint64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandUint64x4 x mask) + // result: (VPEXPANDQMasked256 x (VPMOVVec64x4ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPEXPANDQMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandUint64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandUint64x8 x mask) + // result: (VPEXPANDQMasked512 x (VPMOVVec64x8ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPEXPANDQMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandUint8x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandUint8x16 x mask) + // result: (VPEXPANDBMasked128 x (VPMOVVec8x16ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPEXPANDBMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandUint8x32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandUint8x32 x mask) + // result: (VPEXPANDBMasked256 x (VPMOVVec8x32ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPEXPANDBMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpExpandUint8x64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ExpandUint8x64 x mask) + // result: (VPEXPANDBMasked512 x (VPMOVVec8x64ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPEXPANDBMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} func rewriteValueAMD64_OpFMA(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 873bb8e2de..0f65b4500a 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -396,6 +396,36 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Uint64x2.EqualMasked", opLen3(ssa.OpEqualMaskedUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.EqualMasked", opLen3(ssa.OpEqualMaskedUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.EqualMasked", opLen3(ssa.OpEqualMaskedUint64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.Expand", opLen2(ssa.OpExpandFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.Expand", opLen2(ssa.OpExpandFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x16.Expand", opLen2(ssa.OpExpandFloat32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float64x2.Expand", opLen2(ssa.OpExpandFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x4.Expand", opLen2(ssa.OpExpandFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x8.Expand", opLen2(ssa.OpExpandFloat64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.Expand", opLen2(ssa.OpExpandInt8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x32.Expand", opLen2(ssa.OpExpandInt8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.Expand", opLen2(ssa.OpExpandInt8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.Expand", opLen2(ssa.OpExpandInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.Expand", opLen2(ssa.OpExpandInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.Expand", opLen2(ssa.OpExpandInt16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x4.Expand", opLen2(ssa.OpExpandInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x8.Expand", opLen2(ssa.OpExpandInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x16.Expand", opLen2(ssa.OpExpandInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int64x2.Expand", opLen2(ssa.OpExpandInt64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int64x4.Expand", opLen2(ssa.OpExpandInt64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int64x8.Expand", opLen2(ssa.OpExpandInt64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint8x16.Expand", opLen2(ssa.OpExpandUint8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint8x32.Expand", opLen2(ssa.OpExpandUint8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x64.Expand", opLen2(ssa.OpExpandUint8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint16x8.Expand", opLen2(ssa.OpExpandUint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x16.Expand", opLen2(ssa.OpExpandUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x32.Expand", opLen2(ssa.OpExpandUint16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint32x4.Expand", opLen2(ssa.OpExpandUint32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x8.Expand", opLen2(ssa.OpExpandUint32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x16.Expand", opLen2(ssa.OpExpandUint32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint64x2.Expand", opLen2(ssa.OpExpandUint64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint64x4.Expand", opLen2(ssa.OpExpandUint64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint64x8.Expand", opLen2(ssa.OpExpandUint64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.Floor", opLen1(ssa.OpFloorFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.Floor", opLen1(ssa.OpFloorFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x2.Floor", opLen1(ssa.OpFloorFloat64x2, types.TypeVec128), sys.AMD64) diff 
--git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index 5eb8fea476..2138271769 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -2399,6 +2399,188 @@ func (x Uint64x4) EqualMasked(y Uint64x4, mask Mask64x4) Mask64x4 // Asm: VPCMPUQ, CPU Feature: AVX512F func (x Uint64x8) EqualMasked(y Uint64x8, mask Mask64x8) Mask64x8 +/* Expand */ + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VEXPANDPS, CPU Feature: AVX512F +func (x Float32x4) Expand(mask Mask32x4) Float32x4 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VEXPANDPS, CPU Feature: AVX512F +func (x Float32x8) Expand(mask Mask32x8) Float32x8 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VEXPANDPS, CPU Feature: AVX512F +func (x Float32x16) Expand(mask Mask32x16) Float32x16 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VEXPANDPD, CPU Feature: AVX512F +func (x Float64x2) Expand(mask Mask64x2) Float64x2 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VEXPANDPD, CPU Feature: AVX512F +func (x Float64x4) Expand(mask Mask64x4) Float64x4 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VEXPANDPD, CPU Feature: AVX512F +func (x Float64x8) Expand(mask Mask64x8) Float64x8 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VPEXPANDB, CPU Feature: AVX512VBMI2 +func (x Int8x16) Expand(mask Mask8x16) Int8x16 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VPEXPANDB, CPU Feature: AVX512VBMI2 +func (x Int8x32) Expand(mask Mask8x32) Int8x32 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VPEXPANDB, CPU Feature: AVX512VBMI2 +func (x Int8x64) Expand(mask Mask8x64) Int8x64 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VPEXPANDW, CPU Feature: AVX512VBMI2 +func (x Int16x8) Expand(mask Mask16x8) Int16x8 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. 
+// +// Asm: VPEXPANDW, CPU Feature: AVX512VBMI2 +func (x Int16x16) Expand(mask Mask16x16) Int16x16 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VPEXPANDW, CPU Feature: AVX512VBMI2 +func (x Int16x32) Expand(mask Mask16x32) Int16x32 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VPEXPANDD, CPU Feature: AVX512F +func (x Int32x4) Expand(mask Mask32x4) Int32x4 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VPEXPANDD, CPU Feature: AVX512F +func (x Int32x8) Expand(mask Mask32x8) Int32x8 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VPEXPANDD, CPU Feature: AVX512F +func (x Int32x16) Expand(mask Mask32x16) Int32x16 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VPEXPANDQ, CPU Feature: AVX512F +func (x Int64x2) Expand(mask Mask64x2) Int64x2 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VPEXPANDQ, CPU Feature: AVX512F +func (x Int64x4) Expand(mask Mask64x4) Int64x4 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VPEXPANDQ, CPU Feature: AVX512F +func (x Int64x8) Expand(mask Mask64x8) Int64x8 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VPEXPANDB, CPU Feature: AVX512VBMI2 +func (x Uint8x16) Expand(mask Mask8x16) Uint8x16 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VPEXPANDB, CPU Feature: AVX512VBMI2 +func (x Uint8x32) Expand(mask Mask8x32) Uint8x32 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VPEXPANDB, CPU Feature: AVX512VBMI2 +func (x Uint8x64) Expand(mask Mask8x64) Uint8x64 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VPEXPANDW, CPU Feature: AVX512VBMI2 +func (x Uint16x8) Expand(mask Mask16x8) Uint16x8 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. 
+// +// Asm: VPEXPANDW, CPU Feature: AVX512VBMI2 +func (x Uint16x16) Expand(mask Mask16x16) Uint16x16 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VPEXPANDW, CPU Feature: AVX512VBMI2 +func (x Uint16x32) Expand(mask Mask16x32) Uint16x32 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VPEXPANDD, CPU Feature: AVX512F +func (x Uint32x4) Expand(mask Mask32x4) Uint32x4 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VPEXPANDD, CPU Feature: AVX512F +func (x Uint32x8) Expand(mask Mask32x8) Uint32x8 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VPEXPANDD, CPU Feature: AVX512F +func (x Uint32x16) Expand(mask Mask32x16) Uint32x16 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VPEXPANDQ, CPU Feature: AVX512F +func (x Uint64x2) Expand(mask Mask64x2) Uint64x2 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VPEXPANDQ, CPU Feature: AVX512F +func (x Uint64x4) Expand(mask Mask64x4) Uint64x4 + +// Expand performs an expansion on a vector x whose elements are packed to lower parts. +// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. +// +// Asm: VPEXPANDQ, CPU Feature: AVX512F +func (x Uint64x8) Expand(mask Mask64x8) Uint64x8 + /* Floor */ // Floor rounds elements down to the nearest integer. diff --git a/src/simd/simd_test.go b/src/simd/simd_test.go index 5718347838..9e9b45b5b8 100644 --- a/src/simd/simd_test.go +++ b/src/simd/simd_test.go @@ -187,6 +187,22 @@ func TestCompress(t *testing.T) { } } +func TestExpand(t *testing.T) { + if !simd.HasAVX512() { + t.Skip("Test requires HasAVX512, not available on this hardware") + return + } + v3400 := simd.LoadInt32x4Slice([]int32{3, 4, 0, 0}) + v0101 := simd.LoadInt32x4Slice([]int32{0, -1, 0, -1}) + v2400 := v3400.Expand(v0101.AsMask32x4()) + got := make([]int32, 4) + v2400.StoreSlice(got) + want := []int32{0, 3, 0, 4} + if !slices.Equal(got, want) { + t.Errorf("want and got differ, want=%v, got=%v", want, got) + } +} + func TestPairDotProdAccumulate(t *testing.T) { if !simd.HasAVX512GFNI() { // TODO: this function is actually VNNI, let's implement and call the right check. -- 2.52.0
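For context on the new API surface, here is a small standalone usage sketch (not part of the CL) illustrating the distribute-by-mask behavior described in the Expand doc comments. It uses only helpers that already appear in this patch or in simd_test.go (HasAVX512, LoadInt32x4Slice, AsMask32x4, Expand, StoreSlice) and assumes a toolchain built from the dev.simd branch where the simd package is available on amd64.

	package main

	import (
		"fmt"
		"simd"
	)

	func main() {
		if !simd.HasAVX512() {
			fmt.Println("AVX-512 not available; Expand cannot run here")
			return
		}
		// Source values are packed into the low lanes; the upper lanes are don't-cares.
		src := simd.LoadInt32x4Slice([]int32{10, 20, 0, 0})
		// Lanes 0 and 2 are selected (-1); lanes 1 and 3 are cleared (0).
		sel := simd.LoadInt32x4Slice([]int32{-1, 0, -1, 0})
		// Expand walks the selected lanes from low to high and fills them with
		// successive packed source elements: 10 goes to lane 0, 20 to lane 2.
		// Unselected lanes come back zeroed, matching TestExpand above.
		dst := src.Expand(sel.AsMask32x4())
		out := make([]int32, 4)
		dst.StoreSlice(out)
		fmt.Println(out) // expected output: [10 0 20 0]
	}

The expected result mirrors the TestExpand case in this patch, where the mask {0, -1, 0, -1} yields {0, 3, 0, 4} from the packed input {3, 4, 0, 0}.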