From ddb689c7bb681023491109c7d9673f389d6e06ee Mon Sep 17 00:00:00 2001 From: David Chase Date: Tue, 5 Aug 2025 17:34:05 -0400 Subject: [PATCH] [dev.simd] simd, cmd/compile: generated code for Broadcast Generated by simdgen CL 693599 This turned out to require some additional work in other places, including filling in missing methods (use OverwriteBase to get FP versions). Also includes a test. Change-Id: I2efe8967837834745f9cae661d4d4dcbb5390b6f Reviewed-on: https://go-review.googlesource.com/c/go/+/693758 LUCI-TryBot-Result: Go LUCI Reviewed-by: Junyang Shao --- src/cmd/compile/internal/amd64/simdssa.go | 59 +- .../compile/internal/ssa/_gen/simdAMD64.rules | 62 ++ .../compile/internal/ssa/_gen/simdAMD64ops.go | 38 +- .../internal/ssa/_gen/simdgenericOps.go | 62 ++ src/cmd/compile/internal/ssa/opGen.go | 887 +++++++++++++++++- src/cmd/compile/internal/ssa/rewriteAMD64.go | 636 +++++++++++++ .../compile/internal/ssagen/simdintrinsics.go | 62 ++ src/simd/genfiles.go | 79 +- src/simd/ops_amd64.go | 446 +++++++++ src/simd/simd_test.go | 12 + src/simd/slice_amd64.go | 270 ++++++ 11 files changed, 2575 insertions(+), 38 deletions(-) diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index e6bbdc03de..73a947a88a 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -24,6 +24,23 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPABSQ128, ssa.OpAMD64VPABSQ256, ssa.OpAMD64VPABSQ512, + ssa.OpAMD64VBROADCASTSS128, + ssa.OpAMD64VPBROADCASTQ128, + ssa.OpAMD64VPBROADCASTB128, + ssa.OpAMD64VPBROADCASTW128, + ssa.OpAMD64VPBROADCASTD128, + ssa.OpAMD64VBROADCASTSS256, + ssa.OpAMD64VBROADCASTSD256, + ssa.OpAMD64VPBROADCASTB256, + ssa.OpAMD64VPBROADCASTW256, + ssa.OpAMD64VPBROADCASTD256, + ssa.OpAMD64VPBROADCASTQ256, + ssa.OpAMD64VBROADCASTSS512, + ssa.OpAMD64VBROADCASTSD512, + ssa.OpAMD64VPBROADCASTB512, + ssa.OpAMD64VPBROADCASTW512, + ssa.OpAMD64VPBROADCASTD512, + ssa.OpAMD64VPBROADCASTQ512, ssa.OpAMD64VCVTTPS2DQ128, ssa.OpAMD64VCVTTPS2DQ256, ssa.OpAMD64VCVTTPS2DQ512, @@ -624,6 +641,23 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPABSQMasked128, ssa.OpAMD64VPABSQMasked256, ssa.OpAMD64VPABSQMasked512, + ssa.OpAMD64VBROADCASTSSMasked128, + ssa.OpAMD64VPBROADCASTQMasked128, + ssa.OpAMD64VPBROADCASTBMasked128, + ssa.OpAMD64VPBROADCASTWMasked128, + ssa.OpAMD64VPBROADCASTDMasked128, + ssa.OpAMD64VBROADCASTSSMasked256, + ssa.OpAMD64VBROADCASTSDMasked256, + ssa.OpAMD64VPBROADCASTBMasked256, + ssa.OpAMD64VPBROADCASTWMasked256, + ssa.OpAMD64VPBROADCASTDMasked256, + ssa.OpAMD64VPBROADCASTQMasked256, + ssa.OpAMD64VBROADCASTSSMasked512, + ssa.OpAMD64VBROADCASTSDMasked512, + ssa.OpAMD64VPBROADCASTBMasked512, + ssa.OpAMD64VPBROADCASTWMasked512, + ssa.OpAMD64VPBROADCASTDMasked512, + ssa.OpAMD64VPBROADCASTQMasked512, ssa.OpAMD64VCOMPRESSPSMasked128, ssa.OpAMD64VCOMPRESSPSMasked256, ssa.OpAMD64VCOMPRESSPSMasked512, @@ -1104,10 +1138,10 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPSRLQMasked512: p = simdVfpkv(s, v) - case ssa.OpAMD64VPINSRB128, - ssa.OpAMD64VPINSRW128, - ssa.OpAMD64VPINSRD128, - ssa.OpAMD64VPINSRQ128: + case ssa.OpAMD64VPINSRD128, + ssa.OpAMD64VPINSRQ128, + ssa.OpAMD64VPINSRB128, + ssa.OpAMD64VPINSRW128: p = simdVgpvImm8(s, v) case ssa.OpAMD64VPEXTRB128, @@ -1221,6 +1255,23 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPAVGWMasked128, ssa.OpAMD64VPAVGWMasked256, ssa.OpAMD64VPAVGWMasked512, + ssa.OpAMD64VBROADCASTSSMasked128, + ssa.OpAMD64VPBROADCASTQMasked128, + ssa.OpAMD64VPBROADCASTBMasked128, + ssa.OpAMD64VPBROADCASTWMasked128, + ssa.OpAMD64VPBROADCASTDMasked128, + ssa.OpAMD64VBROADCASTSSMasked256, + ssa.OpAMD64VBROADCASTSDMasked256, + ssa.OpAMD64VPBROADCASTBMasked256, + ssa.OpAMD64VPBROADCASTWMasked256, + ssa.OpAMD64VPBROADCASTDMasked256, + ssa.OpAMD64VPBROADCASTQMasked256, + ssa.OpAMD64VBROADCASTSSMasked512, + ssa.OpAMD64VBROADCASTSDMasked512, + ssa.OpAMD64VPBROADCASTBMasked512, + ssa.OpAMD64VPBROADCASTWMasked512, + ssa.OpAMD64VPBROADCASTDMasked512, + ssa.OpAMD64VPBROADCASTQMasked512, ssa.OpAMD64VRNDSCALEPSMasked128, ssa.OpAMD64VRNDSCALEPSMasked256, ssa.OpAMD64VRNDSCALEPSMasked512, diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index 80cddaae79..e7c5a1a97d 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -228,6 +228,66 @@ (AverageMaskedUint16x8 x y mask) => (VPAVGWMasked128 x y (VPMOVVec16x8ToM mask)) (AverageMaskedUint16x16 x y mask) => (VPAVGWMasked256 x y (VPMOVVec16x16ToM mask)) (AverageMaskedUint16x32 x y mask) => (VPAVGWMasked512 x y (VPMOVVec16x32ToM mask)) +(Broadcast128Float32x4 ...) => (VBROADCASTSS128 ...) +(Broadcast128Float64x2 ...) => (VPBROADCASTQ128 ...) +(Broadcast128Int8x16 ...) => (VPBROADCASTB128 ...) +(Broadcast128Int16x8 ...) => (VPBROADCASTW128 ...) +(Broadcast128Int32x4 ...) => (VPBROADCASTD128 ...) +(Broadcast128Int64x2 ...) => (VPBROADCASTQ128 ...) +(Broadcast128Uint8x16 ...) => (VPBROADCASTB128 ...) +(Broadcast128Uint16x8 ...) => (VPBROADCASTW128 ...) +(Broadcast128Uint32x4 ...) => (VPBROADCASTD128 ...) +(Broadcast128Uint64x2 ...) => (VPBROADCASTQ128 ...) +(Broadcast128MaskedFloat32x4 x mask) => (VBROADCASTSSMasked128 x (VPMOVVec32x4ToM mask)) +(Broadcast128MaskedFloat64x2 x mask) => (VPBROADCASTQMasked128 x (VPMOVVec64x2ToM mask)) +(Broadcast128MaskedInt8x16 x mask) => (VPBROADCASTBMasked128 x (VPMOVVec8x16ToM mask)) +(Broadcast128MaskedInt16x8 x mask) => (VPBROADCASTWMasked128 x (VPMOVVec16x8ToM mask)) +(Broadcast128MaskedInt32x4 x mask) => (VPBROADCASTDMasked128 x (VPMOVVec32x4ToM mask)) +(Broadcast128MaskedInt64x2 x mask) => (VPBROADCASTQMasked128 x (VPMOVVec64x2ToM mask)) +(Broadcast128MaskedUint8x16 x mask) => (VPBROADCASTBMasked128 x (VPMOVVec8x16ToM mask)) +(Broadcast128MaskedUint16x8 x mask) => (VPBROADCASTWMasked128 x (VPMOVVec16x8ToM mask)) +(Broadcast128MaskedUint32x4 x mask) => (VPBROADCASTDMasked128 x (VPMOVVec32x4ToM mask)) +(Broadcast128MaskedUint64x2 x mask) => (VPBROADCASTQMasked128 x (VPMOVVec64x2ToM mask)) +(Broadcast256Float32x4 ...) => (VBROADCASTSS256 ...) +(Broadcast256Float64x2 ...) => (VBROADCASTSD256 ...) +(Broadcast256Int8x16 ...) => (VPBROADCASTB256 ...) +(Broadcast256Int16x8 ...) => (VPBROADCASTW256 ...) +(Broadcast256Int32x4 ...) => (VPBROADCASTD256 ...) +(Broadcast256Int64x2 ...) => (VPBROADCASTQ256 ...) +(Broadcast256Uint8x16 ...) => (VPBROADCASTB256 ...) +(Broadcast256Uint16x8 ...) => (VPBROADCASTW256 ...) +(Broadcast256Uint32x4 ...) => (VPBROADCASTD256 ...) +(Broadcast256Uint64x2 ...) => (VPBROADCASTQ256 ...) +(Broadcast256MaskedFloat32x4 x mask) => (VBROADCASTSSMasked256 x (VPMOVVec32x4ToM mask)) +(Broadcast256MaskedFloat64x2 x mask) => (VBROADCASTSDMasked256 x (VPMOVVec64x2ToM mask)) +(Broadcast256MaskedInt8x16 x mask) => (VPBROADCASTBMasked256 x (VPMOVVec8x16ToM mask)) +(Broadcast256MaskedInt16x8 x mask) => (VPBROADCASTWMasked256 x (VPMOVVec16x8ToM mask)) +(Broadcast256MaskedInt32x4 x mask) => (VPBROADCASTDMasked256 x (VPMOVVec32x4ToM mask)) +(Broadcast256MaskedInt64x2 x mask) => (VPBROADCASTQMasked256 x (VPMOVVec64x2ToM mask)) +(Broadcast256MaskedUint8x16 x mask) => (VPBROADCASTBMasked256 x (VPMOVVec8x16ToM mask)) +(Broadcast256MaskedUint16x8 x mask) => (VPBROADCASTWMasked256 x (VPMOVVec16x8ToM mask)) +(Broadcast256MaskedUint32x4 x mask) => (VPBROADCASTDMasked256 x (VPMOVVec32x4ToM mask)) +(Broadcast256MaskedUint64x2 x mask) => (VPBROADCASTQMasked256 x (VPMOVVec64x2ToM mask)) +(Broadcast512Float32x4 ...) => (VBROADCASTSS512 ...) +(Broadcast512Float64x2 ...) => (VBROADCASTSD512 ...) +(Broadcast512Int8x16 ...) => (VPBROADCASTB512 ...) +(Broadcast512Int16x8 ...) => (VPBROADCASTW512 ...) +(Broadcast512Int32x4 ...) => (VPBROADCASTD512 ...) +(Broadcast512Int64x2 ...) => (VPBROADCASTQ512 ...) +(Broadcast512Uint8x16 ...) => (VPBROADCASTB512 ...) +(Broadcast512Uint16x8 ...) => (VPBROADCASTW512 ...) +(Broadcast512Uint32x4 ...) => (VPBROADCASTD512 ...) +(Broadcast512Uint64x2 ...) => (VPBROADCASTQ512 ...) +(Broadcast512MaskedFloat32x4 x mask) => (VBROADCASTSSMasked512 x (VPMOVVec32x4ToM mask)) +(Broadcast512MaskedFloat64x2 x mask) => (VBROADCASTSDMasked512 x (VPMOVVec64x2ToM mask)) +(Broadcast512MaskedInt8x16 x mask) => (VPBROADCASTBMasked512 x (VPMOVVec8x16ToM mask)) +(Broadcast512MaskedInt16x8 x mask) => (VPBROADCASTWMasked512 x (VPMOVVec16x8ToM mask)) +(Broadcast512MaskedInt32x4 x mask) => (VPBROADCASTDMasked512 x (VPMOVVec32x4ToM mask)) +(Broadcast512MaskedInt64x2 x mask) => (VPBROADCASTQMasked512 x (VPMOVVec64x2ToM mask)) +(Broadcast512MaskedUint8x16 x mask) => (VPBROADCASTBMasked512 x (VPMOVVec8x16ToM mask)) +(Broadcast512MaskedUint16x8 x mask) => (VPBROADCASTWMasked512 x (VPMOVVec16x8ToM mask)) +(Broadcast512MaskedUint32x4 x mask) => (VPBROADCASTDMasked512 x (VPMOVVec32x4ToM mask)) +(Broadcast512MaskedUint64x2 x mask) => (VPBROADCASTQMasked512 x (VPMOVVec64x2ToM mask)) (CeilFloat32x4 x) => (VROUNDPS128 [2] x) (CeilFloat32x8 x) => (VROUNDPS256 [2] x) (CeilFloat64x2 x) => (VROUNDPD128 [2] x) @@ -1396,6 +1456,8 @@ (ScaleMaskedFloat64x2 x y mask) => (VSCALEFPDMasked128 x y (VPMOVVec64x2ToM mask)) (ScaleMaskedFloat64x4 x y mask) => (VSCALEFPDMasked256 x y (VPMOVVec64x4ToM mask)) (ScaleMaskedFloat64x8 x y mask) => (VSCALEFPDMasked512 x y (VPMOVVec64x8ToM mask)) +(SetElemFloat32x4 ...) => (VPINSRD128 ...) +(SetElemFloat64x2 ...) => (VPINSRQ128 ...) (SetElemInt8x16 ...) => (VPINSRB128 ...) (SetElemInt16x8 ...) => (VPINSRW128 ...) (SetElemInt32x4 ...) => (VPINSRD128 ...) diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go index afea4c0a46..5d388a4531 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go @@ -20,6 +20,16 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VADDSUBPD256", argLength: 2, reg: v21, asm: "VADDSUBPD", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VADDSUBPS128", argLength: 2, reg: v21, asm: "VADDSUBPS", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VADDSUBPS256", argLength: 2, reg: v21, asm: "VADDSUBPS", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VBROADCASTSD256", argLength: 1, reg: v11, asm: "VBROADCASTSD", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VBROADCASTSD512", argLength: 1, reg: w11, asm: "VBROADCASTSD", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VBROADCASTSDMasked256", argLength: 2, reg: wkw, asm: "VBROADCASTSD", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VBROADCASTSDMasked512", argLength: 2, reg: wkw, asm: "VBROADCASTSD", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VBROADCASTSS128", argLength: 1, reg: v11, asm: "VBROADCASTSS", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VBROADCASTSS256", argLength: 1, reg: v11, asm: "VBROADCASTSS", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VBROADCASTSS512", argLength: 1, reg: w11, asm: "VBROADCASTSS", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VBROADCASTSSMasked128", argLength: 2, reg: wkw, asm: "VBROADCASTSS", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VBROADCASTSSMasked256", argLength: 2, reg: wkw, asm: "VBROADCASTSS", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VBROADCASTSSMasked512", argLength: 2, reg: wkw, asm: "VBROADCASTSS", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VCOMPRESSPDMasked128", argLength: 2, reg: wkw, asm: "VCOMPRESSPD", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VCOMPRESSPDMasked256", argLength: 2, reg: wkw, asm: "VCOMPRESSPD", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VCOMPRESSPDMasked512", argLength: 2, reg: wkw, asm: "VCOMPRESSPD", commutative: false, typ: "Vec512", resultInArg0: false}, @@ -252,6 +262,30 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPBLENDMWMasked512", argLength: 3, reg: w2kw, asm: "VPBLENDMW", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPBLENDVB128", argLength: 3, reg: v31, asm: "VPBLENDVB", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPBLENDVB256", argLength: 3, reg: v31, asm: "VPBLENDVB", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPBROADCASTB128", argLength: 1, reg: v11, asm: "VPBROADCASTB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPBROADCASTB256", argLength: 1, reg: v11, asm: "VPBROADCASTB", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPBROADCASTB512", argLength: 1, reg: w11, asm: "VPBROADCASTB", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPBROADCASTBMasked128", argLength: 2, reg: wkw, asm: "VPBROADCASTB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPBROADCASTBMasked256", argLength: 2, reg: wkw, asm: "VPBROADCASTB", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPBROADCASTBMasked512", argLength: 2, reg: wkw, asm: "VPBROADCASTB", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPBROADCASTD128", argLength: 1, reg: v11, asm: "VPBROADCASTD", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPBROADCASTD256", argLength: 1, reg: v11, asm: "VPBROADCASTD", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPBROADCASTD512", argLength: 1, reg: w11, asm: "VPBROADCASTD", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPBROADCASTDMasked128", argLength: 2, reg: wkw, asm: "VPBROADCASTD", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPBROADCASTDMasked256", argLength: 2, reg: wkw, asm: "VPBROADCASTD", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPBROADCASTDMasked512", argLength: 2, reg: wkw, asm: "VPBROADCASTD", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPBROADCASTQ128", argLength: 1, reg: v11, asm: "VPBROADCASTQ", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPBROADCASTQ256", argLength: 1, reg: v11, asm: "VPBROADCASTQ", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPBROADCASTQ512", argLength: 1, reg: w11, asm: "VPBROADCASTQ", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPBROADCASTQMasked128", argLength: 2, reg: wkw, asm: "VPBROADCASTQ", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPBROADCASTQMasked256", argLength: 2, reg: wkw, asm: "VPBROADCASTQ", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPBROADCASTQMasked512", argLength: 2, reg: wkw, asm: "VPBROADCASTQ", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPBROADCASTW128", argLength: 1, reg: v11, asm: "VPBROADCASTW", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPBROADCASTW256", argLength: 1, reg: v11, asm: "VPBROADCASTW", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPBROADCASTW512", argLength: 1, reg: w11, asm: "VPBROADCASTW", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPBROADCASTWMasked128", argLength: 2, reg: wkw, asm: "VPBROADCASTW", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPBROADCASTWMasked256", argLength: 2, reg: wkw, asm: "VPBROADCASTW", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPBROADCASTWMasked512", argLength: 2, reg: wkw, asm: "VPBROADCASTW", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPCMPEQB128", argLength: 2, reg: v21, asm: "VPCMPEQB", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VPCMPEQB256", argLength: 2, reg: v21, asm: "VPCMPEQB", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPCMPEQB512", argLength: 2, reg: w2k, asm: "VPCMPEQB", commutative: true, typ: "Mask", resultInArg0: false}, @@ -1000,10 +1034,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPRORQMasked128", argLength: 2, reg: wkw, asm: "VPRORQ", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPRORQMasked256", argLength: 2, reg: wkw, asm: "VPRORQ", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPRORQMasked512", argLength: 2, reg: wkw, asm: "VPRORQ", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, - {name: "VPINSRB128", argLength: 2, reg: vgpv, asm: "VPINSRB", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, - {name: "VPINSRW128", argLength: 2, reg: vgpv, asm: "VPINSRW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPINSRD128", argLength: 2, reg: vgpv, asm: "VPINSRD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPINSRQ128", argLength: 2, reg: vgpv, asm: "VPINSRQ", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPINSRB128", argLength: 2, reg: vgpv, asm: "VPINSRB", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPINSRW128", argLength: 2, reg: vgpv, asm: "VPINSRW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VINSERTF128256", argLength: 2, reg: v21, asm: "VINSERTF128", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VINSERTF64X4512", argLength: 2, reg: w21, asm: "VINSERTF64X4", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VINSERTI128256", argLength: 2, reg: v21, asm: "VINSERTI128", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index fea701e174..f120dcddd0 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -232,6 +232,66 @@ func simdGenericOps() []opData { {name: "AverageUint16x8", argLength: 2, commutative: true}, {name: "AverageUint16x16", argLength: 2, commutative: true}, {name: "AverageUint16x32", argLength: 2, commutative: true}, + {name: "Broadcast128Float32x4", argLength: 1, commutative: false}, + {name: "Broadcast128Float64x2", argLength: 1, commutative: false}, + {name: "Broadcast128Int8x16", argLength: 1, commutative: false}, + {name: "Broadcast128Int16x8", argLength: 1, commutative: false}, + {name: "Broadcast128Int32x4", argLength: 1, commutative: false}, + {name: "Broadcast128Int64x2", argLength: 1, commutative: false}, + {name: "Broadcast128MaskedFloat32x4", argLength: 2, commutative: false}, + {name: "Broadcast128MaskedFloat64x2", argLength: 2, commutative: false}, + {name: "Broadcast128MaskedInt8x16", argLength: 2, commutative: false}, + {name: "Broadcast128MaskedInt16x8", argLength: 2, commutative: false}, + {name: "Broadcast128MaskedInt32x4", argLength: 2, commutative: false}, + {name: "Broadcast128MaskedInt64x2", argLength: 2, commutative: false}, + {name: "Broadcast128MaskedUint8x16", argLength: 2, commutative: false}, + {name: "Broadcast128MaskedUint16x8", argLength: 2, commutative: false}, + {name: "Broadcast128MaskedUint32x4", argLength: 2, commutative: false}, + {name: "Broadcast128MaskedUint64x2", argLength: 2, commutative: false}, + {name: "Broadcast128Uint8x16", argLength: 1, commutative: false}, + {name: "Broadcast128Uint16x8", argLength: 1, commutative: false}, + {name: "Broadcast128Uint32x4", argLength: 1, commutative: false}, + {name: "Broadcast128Uint64x2", argLength: 1, commutative: false}, + {name: "Broadcast256Float32x4", argLength: 1, commutative: false}, + {name: "Broadcast256Float64x2", argLength: 1, commutative: false}, + {name: "Broadcast256Int8x16", argLength: 1, commutative: false}, + {name: "Broadcast256Int16x8", argLength: 1, commutative: false}, + {name: "Broadcast256Int32x4", argLength: 1, commutative: false}, + {name: "Broadcast256Int64x2", argLength: 1, commutative: false}, + {name: "Broadcast256MaskedFloat32x4", argLength: 2, commutative: false}, + {name: "Broadcast256MaskedFloat64x2", argLength: 2, commutative: false}, + {name: "Broadcast256MaskedInt8x16", argLength: 2, commutative: false}, + {name: "Broadcast256MaskedInt16x8", argLength: 2, commutative: false}, + {name: "Broadcast256MaskedInt32x4", argLength: 2, commutative: false}, + {name: "Broadcast256MaskedInt64x2", argLength: 2, commutative: false}, + {name: "Broadcast256MaskedUint8x16", argLength: 2, commutative: false}, + {name: "Broadcast256MaskedUint16x8", argLength: 2, commutative: false}, + {name: "Broadcast256MaskedUint32x4", argLength: 2, commutative: false}, + {name: "Broadcast256MaskedUint64x2", argLength: 2, commutative: false}, + {name: "Broadcast256Uint8x16", argLength: 1, commutative: false}, + {name: "Broadcast256Uint16x8", argLength: 1, commutative: false}, + {name: "Broadcast256Uint32x4", argLength: 1, commutative: false}, + {name: "Broadcast256Uint64x2", argLength: 1, commutative: false}, + {name: "Broadcast512Float32x4", argLength: 1, commutative: false}, + {name: "Broadcast512Float64x2", argLength: 1, commutative: false}, + {name: "Broadcast512Int8x16", argLength: 1, commutative: false}, + {name: "Broadcast512Int16x8", argLength: 1, commutative: false}, + {name: "Broadcast512Int32x4", argLength: 1, commutative: false}, + {name: "Broadcast512Int64x2", argLength: 1, commutative: false}, + {name: "Broadcast512MaskedFloat32x4", argLength: 2, commutative: false}, + {name: "Broadcast512MaskedFloat64x2", argLength: 2, commutative: false}, + {name: "Broadcast512MaskedInt8x16", argLength: 2, commutative: false}, + {name: "Broadcast512MaskedInt16x8", argLength: 2, commutative: false}, + {name: "Broadcast512MaskedInt32x4", argLength: 2, commutative: false}, + {name: "Broadcast512MaskedInt64x2", argLength: 2, commutative: false}, + {name: "Broadcast512MaskedUint8x16", argLength: 2, commutative: false}, + {name: "Broadcast512MaskedUint16x8", argLength: 2, commutative: false}, + {name: "Broadcast512MaskedUint32x4", argLength: 2, commutative: false}, + {name: "Broadcast512MaskedUint64x2", argLength: 2, commutative: false}, + {name: "Broadcast512Uint8x16", argLength: 1, commutative: false}, + {name: "Broadcast512Uint16x8", argLength: 1, commutative: false}, + {name: "Broadcast512Uint32x4", argLength: 1, commutative: false}, + {name: "Broadcast512Uint64x2", argLength: 1, commutative: false}, {name: "CeilFloat32x4", argLength: 1, commutative: false}, {name: "CeilFloat32x8", argLength: 1, commutative: false}, {name: "CeilFloat64x2", argLength: 1, commutative: false}, @@ -1812,6 +1872,8 @@ func simdGenericOps() []opData { {name: "RoundToEvenScaledResidueMaskedFloat64x2", argLength: 2, commutative: false, aux: "UInt8"}, {name: "RoundToEvenScaledResidueMaskedFloat64x4", argLength: 2, commutative: false, aux: "UInt8"}, {name: "RoundToEvenScaledResidueMaskedFloat64x8", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "SetElemFloat32x4", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "SetElemFloat64x2", argLength: 2, commutative: false, aux: "UInt8"}, {name: "SetElemInt8x16", argLength: 2, commutative: false, aux: "UInt8"}, {name: "SetElemInt16x8", argLength: 2, commutative: false, aux: "UInt8"}, {name: "SetElemInt32x4", argLength: 2, commutative: false, aux: "UInt8"}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 77527c83b8..6e0ffd1540 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1242,6 +1242,16 @@ const ( OpAMD64VADDSUBPD256 OpAMD64VADDSUBPS128 OpAMD64VADDSUBPS256 + OpAMD64VBROADCASTSD256 + OpAMD64VBROADCASTSD512 + OpAMD64VBROADCASTSDMasked256 + OpAMD64VBROADCASTSDMasked512 + OpAMD64VBROADCASTSS128 + OpAMD64VBROADCASTSS256 + OpAMD64VBROADCASTSS512 + OpAMD64VBROADCASTSSMasked128 + OpAMD64VBROADCASTSSMasked256 + OpAMD64VBROADCASTSSMasked512 OpAMD64VCOMPRESSPDMasked128 OpAMD64VCOMPRESSPDMasked256 OpAMD64VCOMPRESSPDMasked512 @@ -1474,6 +1484,30 @@ const ( OpAMD64VPBLENDMWMasked512 OpAMD64VPBLENDVB128 OpAMD64VPBLENDVB256 + OpAMD64VPBROADCASTB128 + OpAMD64VPBROADCASTB256 + OpAMD64VPBROADCASTB512 + OpAMD64VPBROADCASTBMasked128 + OpAMD64VPBROADCASTBMasked256 + OpAMD64VPBROADCASTBMasked512 + OpAMD64VPBROADCASTD128 + OpAMD64VPBROADCASTD256 + OpAMD64VPBROADCASTD512 + OpAMD64VPBROADCASTDMasked128 + OpAMD64VPBROADCASTDMasked256 + OpAMD64VPBROADCASTDMasked512 + OpAMD64VPBROADCASTQ128 + OpAMD64VPBROADCASTQ256 + OpAMD64VPBROADCASTQ512 + OpAMD64VPBROADCASTQMasked128 + OpAMD64VPBROADCASTQMasked256 + OpAMD64VPBROADCASTQMasked512 + OpAMD64VPBROADCASTW128 + OpAMD64VPBROADCASTW256 + OpAMD64VPBROADCASTW512 + OpAMD64VPBROADCASTWMasked128 + OpAMD64VPBROADCASTWMasked256 + OpAMD64VPBROADCASTWMasked512 OpAMD64VPCMPEQB128 OpAMD64VPCMPEQB256 OpAMD64VPCMPEQB512 @@ -2222,10 +2256,10 @@ const ( OpAMD64VPRORQMasked128 OpAMD64VPRORQMasked256 OpAMD64VPRORQMasked512 - OpAMD64VPINSRB128 - OpAMD64VPINSRW128 OpAMD64VPINSRD128 OpAMD64VPINSRQ128 + OpAMD64VPINSRB128 + OpAMD64VPINSRW128 OpAMD64VINSERTF128256 OpAMD64VINSERTF64X4512 OpAMD64VINSERTI128256 @@ -4839,6 +4873,66 @@ const ( OpAverageUint16x8 OpAverageUint16x16 OpAverageUint16x32 + OpBroadcast128Float32x4 + OpBroadcast128Float64x2 + OpBroadcast128Int8x16 + OpBroadcast128Int16x8 + OpBroadcast128Int32x4 + OpBroadcast128Int64x2 + OpBroadcast128MaskedFloat32x4 + OpBroadcast128MaskedFloat64x2 + OpBroadcast128MaskedInt8x16 + OpBroadcast128MaskedInt16x8 + OpBroadcast128MaskedInt32x4 + OpBroadcast128MaskedInt64x2 + OpBroadcast128MaskedUint8x16 + OpBroadcast128MaskedUint16x8 + OpBroadcast128MaskedUint32x4 + OpBroadcast128MaskedUint64x2 + OpBroadcast128Uint8x16 + OpBroadcast128Uint16x8 + OpBroadcast128Uint32x4 + OpBroadcast128Uint64x2 + OpBroadcast256Float32x4 + OpBroadcast256Float64x2 + OpBroadcast256Int8x16 + OpBroadcast256Int16x8 + OpBroadcast256Int32x4 + OpBroadcast256Int64x2 + OpBroadcast256MaskedFloat32x4 + OpBroadcast256MaskedFloat64x2 + OpBroadcast256MaskedInt8x16 + OpBroadcast256MaskedInt16x8 + OpBroadcast256MaskedInt32x4 + OpBroadcast256MaskedInt64x2 + OpBroadcast256MaskedUint8x16 + OpBroadcast256MaskedUint16x8 + OpBroadcast256MaskedUint32x4 + OpBroadcast256MaskedUint64x2 + OpBroadcast256Uint8x16 + OpBroadcast256Uint16x8 + OpBroadcast256Uint32x4 + OpBroadcast256Uint64x2 + OpBroadcast512Float32x4 + OpBroadcast512Float64x2 + OpBroadcast512Int8x16 + OpBroadcast512Int16x8 + OpBroadcast512Int32x4 + OpBroadcast512Int64x2 + OpBroadcast512MaskedFloat32x4 + OpBroadcast512MaskedFloat64x2 + OpBroadcast512MaskedInt8x16 + OpBroadcast512MaskedInt16x8 + OpBroadcast512MaskedInt32x4 + OpBroadcast512MaskedInt64x2 + OpBroadcast512MaskedUint8x16 + OpBroadcast512MaskedUint16x8 + OpBroadcast512MaskedUint32x4 + OpBroadcast512MaskedUint64x2 + OpBroadcast512Uint8x16 + OpBroadcast512Uint16x8 + OpBroadcast512Uint32x4 + OpBroadcast512Uint64x2 OpCeilFloat32x4 OpCeilFloat32x8 OpCeilFloat64x2 @@ -6419,6 +6513,8 @@ const ( OpRoundToEvenScaledResidueMaskedFloat64x2 OpRoundToEvenScaledResidueMaskedFloat64x4 OpRoundToEvenScaledResidueMaskedFloat64x8 + OpSetElemFloat32x4 + OpSetElemFloat64x2 OpSetElemInt8x16 OpSetElemInt16x8 OpSetElemInt32x4 @@ -19771,6 +19867,141 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VBROADCASTSD256", + argLen: 1, + asm: x86.AVBROADCASTSD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VBROADCASTSD512", + argLen: 1, + asm: x86.AVBROADCASTSD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VBROADCASTSDMasked256", + argLen: 2, + asm: x86.AVBROADCASTSD, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VBROADCASTSDMasked512", + argLen: 2, + asm: x86.AVBROADCASTSD, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VBROADCASTSS128", + argLen: 1, + asm: x86.AVBROADCASTSS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VBROADCASTSS256", + argLen: 1, + asm: x86.AVBROADCASTSS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VBROADCASTSS512", + argLen: 1, + asm: x86.AVBROADCASTSS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VBROADCASTSSMasked128", + argLen: 2, + asm: x86.AVBROADCASTSS, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VBROADCASTSSMasked256", + argLen: 2, + asm: x86.AVBROADCASTSS, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VBROADCASTSSMasked512", + argLen: 2, + asm: x86.AVBROADCASTSS, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VCOMPRESSPDMasked128", argLen: 2, @@ -23272,6 +23503,330 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPBROADCASTB128", + argLen: 1, + asm: x86.AVPBROADCASTB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPBROADCASTB256", + argLen: 1, + asm: x86.AVPBROADCASTB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPBROADCASTB512", + argLen: 1, + asm: x86.AVPBROADCASTB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPBROADCASTBMasked128", + argLen: 2, + asm: x86.AVPBROADCASTB, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPBROADCASTBMasked256", + argLen: 2, + asm: x86.AVPBROADCASTB, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPBROADCASTBMasked512", + argLen: 2, + asm: x86.AVPBROADCASTB, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPBROADCASTD128", + argLen: 1, + asm: x86.AVPBROADCASTD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPBROADCASTD256", + argLen: 1, + asm: x86.AVPBROADCASTD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPBROADCASTD512", + argLen: 1, + asm: x86.AVPBROADCASTD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPBROADCASTDMasked128", + argLen: 2, + asm: x86.AVPBROADCASTD, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPBROADCASTDMasked256", + argLen: 2, + asm: x86.AVPBROADCASTD, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPBROADCASTDMasked512", + argLen: 2, + asm: x86.AVPBROADCASTD, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPBROADCASTQ128", + argLen: 1, + asm: x86.AVPBROADCASTQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPBROADCASTQ256", + argLen: 1, + asm: x86.AVPBROADCASTQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPBROADCASTQ512", + argLen: 1, + asm: x86.AVPBROADCASTQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPBROADCASTQMasked128", + argLen: 2, + asm: x86.AVPBROADCASTQ, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPBROADCASTQMasked256", + argLen: 2, + asm: x86.AVPBROADCASTQ, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPBROADCASTQMasked512", + argLen: 2, + asm: x86.AVPBROADCASTQ, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPBROADCASTW128", + argLen: 1, + asm: x86.AVPBROADCASTW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPBROADCASTW256", + argLen: 1, + asm: x86.AVPBROADCASTW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPBROADCASTW512", + argLen: 1, + asm: x86.AVPBROADCASTW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPBROADCASTWMasked128", + argLen: 2, + asm: x86.AVPBROADCASTW, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPBROADCASTWMasked256", + argLen: 2, + asm: x86.AVPBROADCASTW, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPBROADCASTWMasked512", + argLen: 2, + asm: x86.AVPBROADCASTW, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPCMPEQB128", argLen: 2, @@ -34482,10 +35037,10 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPINSRB128", + name: "VPINSRD128", auxType: auxUInt8, argLen: 2, - asm: x86.AVPINSRB, + asm: x86.AVPINSRD, reg: regInfo{ inputs: []inputInfo{ {1, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 @@ -34497,10 +35052,10 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPINSRW128", + name: "VPINSRQ128", auxType: auxUInt8, argLen: 2, - asm: x86.AVPINSRW, + asm: x86.AVPINSRQ, reg: regInfo{ inputs: []inputInfo{ {1, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 @@ -34512,10 +35067,10 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPINSRD128", + name: "VPINSRB128", auxType: auxUInt8, argLen: 2, - asm: x86.AVPINSRD, + asm: x86.AVPINSRB, reg: regInfo{ inputs: []inputInfo{ {1, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 @@ -34527,10 +35082,10 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPINSRQ128", + name: "VPINSRW128", auxType: auxUInt8, argLen: 2, - asm: x86.AVPINSRQ, + asm: x86.AVPINSRW, reg: regInfo{ inputs: []inputInfo{ {1, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 @@ -64725,6 +65280,306 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "Broadcast128Float32x4", + argLen: 1, + generic: true, + }, + { + name: "Broadcast128Float64x2", + argLen: 1, + generic: true, + }, + { + name: "Broadcast128Int8x16", + argLen: 1, + generic: true, + }, + { + name: "Broadcast128Int16x8", + argLen: 1, + generic: true, + }, + { + name: "Broadcast128Int32x4", + argLen: 1, + generic: true, + }, + { + name: "Broadcast128Int64x2", + argLen: 1, + generic: true, + }, + { + name: "Broadcast128MaskedFloat32x4", + argLen: 2, + generic: true, + }, + { + name: "Broadcast128MaskedFloat64x2", + argLen: 2, + generic: true, + }, + { + name: "Broadcast128MaskedInt8x16", + argLen: 2, + generic: true, + }, + { + name: "Broadcast128MaskedInt16x8", + argLen: 2, + generic: true, + }, + { + name: "Broadcast128MaskedInt32x4", + argLen: 2, + generic: true, + }, + { + name: "Broadcast128MaskedInt64x2", + argLen: 2, + generic: true, + }, + { + name: "Broadcast128MaskedUint8x16", + argLen: 2, + generic: true, + }, + { + name: "Broadcast128MaskedUint16x8", + argLen: 2, + generic: true, + }, + { + name: "Broadcast128MaskedUint32x4", + argLen: 2, + generic: true, + }, + { + name: "Broadcast128MaskedUint64x2", + argLen: 2, + generic: true, + }, + { + name: "Broadcast128Uint8x16", + argLen: 1, + generic: true, + }, + { + name: "Broadcast128Uint16x8", + argLen: 1, + generic: true, + }, + { + name: "Broadcast128Uint32x4", + argLen: 1, + generic: true, + }, + { + name: "Broadcast128Uint64x2", + argLen: 1, + generic: true, + }, + { + name: "Broadcast256Float32x4", + argLen: 1, + generic: true, + }, + { + name: "Broadcast256Float64x2", + argLen: 1, + generic: true, + }, + { + name: "Broadcast256Int8x16", + argLen: 1, + generic: true, + }, + { + name: "Broadcast256Int16x8", + argLen: 1, + generic: true, + }, + { + name: "Broadcast256Int32x4", + argLen: 1, + generic: true, + }, + { + name: "Broadcast256Int64x2", + argLen: 1, + generic: true, + }, + { + name: "Broadcast256MaskedFloat32x4", + argLen: 2, + generic: true, + }, + { + name: "Broadcast256MaskedFloat64x2", + argLen: 2, + generic: true, + }, + { + name: "Broadcast256MaskedInt8x16", + argLen: 2, + generic: true, + }, + { + name: "Broadcast256MaskedInt16x8", + argLen: 2, + generic: true, + }, + { + name: "Broadcast256MaskedInt32x4", + argLen: 2, + generic: true, + }, + { + name: "Broadcast256MaskedInt64x2", + argLen: 2, + generic: true, + }, + { + name: "Broadcast256MaskedUint8x16", + argLen: 2, + generic: true, + }, + { + name: "Broadcast256MaskedUint16x8", + argLen: 2, + generic: true, + }, + { + name: "Broadcast256MaskedUint32x4", + argLen: 2, + generic: true, + }, + { + name: "Broadcast256MaskedUint64x2", + argLen: 2, + generic: true, + }, + { + name: "Broadcast256Uint8x16", + argLen: 1, + generic: true, + }, + { + name: "Broadcast256Uint16x8", + argLen: 1, + generic: true, + }, + { + name: "Broadcast256Uint32x4", + argLen: 1, + generic: true, + }, + { + name: "Broadcast256Uint64x2", + argLen: 1, + generic: true, + }, + { + name: "Broadcast512Float32x4", + argLen: 1, + generic: true, + }, + { + name: "Broadcast512Float64x2", + argLen: 1, + generic: true, + }, + { + name: "Broadcast512Int8x16", + argLen: 1, + generic: true, + }, + { + name: "Broadcast512Int16x8", + argLen: 1, + generic: true, + }, + { + name: "Broadcast512Int32x4", + argLen: 1, + generic: true, + }, + { + name: "Broadcast512Int64x2", + argLen: 1, + generic: true, + }, + { + name: "Broadcast512MaskedFloat32x4", + argLen: 2, + generic: true, + }, + { + name: "Broadcast512MaskedFloat64x2", + argLen: 2, + generic: true, + }, + { + name: "Broadcast512MaskedInt8x16", + argLen: 2, + generic: true, + }, + { + name: "Broadcast512MaskedInt16x8", + argLen: 2, + generic: true, + }, + { + name: "Broadcast512MaskedInt32x4", + argLen: 2, + generic: true, + }, + { + name: "Broadcast512MaskedInt64x2", + argLen: 2, + generic: true, + }, + { + name: "Broadcast512MaskedUint8x16", + argLen: 2, + generic: true, + }, + { + name: "Broadcast512MaskedUint16x8", + argLen: 2, + generic: true, + }, + { + name: "Broadcast512MaskedUint32x4", + argLen: 2, + generic: true, + }, + { + name: "Broadcast512MaskedUint64x2", + argLen: 2, + generic: true, + }, + { + name: "Broadcast512Uint8x16", + argLen: 1, + generic: true, + }, + { + name: "Broadcast512Uint16x8", + argLen: 1, + generic: true, + }, + { + name: "Broadcast512Uint32x4", + argLen: 1, + generic: true, + }, + { + name: "Broadcast512Uint64x2", + argLen: 1, + generic: true, + }, { name: "CeilFloat32x4", argLen: 1, @@ -73153,6 +74008,18 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "SetElemFloat32x4", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "SetElemFloat64x2", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, { name: "SetElemInt8x16", auxType: auxUInt8, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index c5367adefe..0bdc0e63b7 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -1317,6 +1317,156 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpBitLen64(v) case OpBitLen8: return rewriteValueAMD64_OpBitLen8(v) + case OpBroadcast128Float32x4: + v.Op = OpAMD64VBROADCASTSS128 + return true + case OpBroadcast128Float64x2: + v.Op = OpAMD64VPBROADCASTQ128 + return true + case OpBroadcast128Int16x8: + v.Op = OpAMD64VPBROADCASTW128 + return true + case OpBroadcast128Int32x4: + v.Op = OpAMD64VPBROADCASTD128 + return true + case OpBroadcast128Int64x2: + v.Op = OpAMD64VPBROADCASTQ128 + return true + case OpBroadcast128Int8x16: + v.Op = OpAMD64VPBROADCASTB128 + return true + case OpBroadcast128MaskedFloat32x4: + return rewriteValueAMD64_OpBroadcast128MaskedFloat32x4(v) + case OpBroadcast128MaskedFloat64x2: + return rewriteValueAMD64_OpBroadcast128MaskedFloat64x2(v) + case OpBroadcast128MaskedInt16x8: + return rewriteValueAMD64_OpBroadcast128MaskedInt16x8(v) + case OpBroadcast128MaskedInt32x4: + return rewriteValueAMD64_OpBroadcast128MaskedInt32x4(v) + case OpBroadcast128MaskedInt64x2: + return rewriteValueAMD64_OpBroadcast128MaskedInt64x2(v) + case OpBroadcast128MaskedInt8x16: + return rewriteValueAMD64_OpBroadcast128MaskedInt8x16(v) + case OpBroadcast128MaskedUint16x8: + return rewriteValueAMD64_OpBroadcast128MaskedUint16x8(v) + case OpBroadcast128MaskedUint32x4: + return rewriteValueAMD64_OpBroadcast128MaskedUint32x4(v) + case OpBroadcast128MaskedUint64x2: + return rewriteValueAMD64_OpBroadcast128MaskedUint64x2(v) + case OpBroadcast128MaskedUint8x16: + return rewriteValueAMD64_OpBroadcast128MaskedUint8x16(v) + case OpBroadcast128Uint16x8: + v.Op = OpAMD64VPBROADCASTW128 + return true + case OpBroadcast128Uint32x4: + v.Op = OpAMD64VPBROADCASTD128 + return true + case OpBroadcast128Uint64x2: + v.Op = OpAMD64VPBROADCASTQ128 + return true + case OpBroadcast128Uint8x16: + v.Op = OpAMD64VPBROADCASTB128 + return true + case OpBroadcast256Float32x4: + v.Op = OpAMD64VBROADCASTSS256 + return true + case OpBroadcast256Float64x2: + v.Op = OpAMD64VBROADCASTSD256 + return true + case OpBroadcast256Int16x8: + v.Op = OpAMD64VPBROADCASTW256 + return true + case OpBroadcast256Int32x4: + v.Op = OpAMD64VPBROADCASTD256 + return true + case OpBroadcast256Int64x2: + v.Op = OpAMD64VPBROADCASTQ256 + return true + case OpBroadcast256Int8x16: + v.Op = OpAMD64VPBROADCASTB256 + return true + case OpBroadcast256MaskedFloat32x4: + return rewriteValueAMD64_OpBroadcast256MaskedFloat32x4(v) + case OpBroadcast256MaskedFloat64x2: + return rewriteValueAMD64_OpBroadcast256MaskedFloat64x2(v) + case OpBroadcast256MaskedInt16x8: + return rewriteValueAMD64_OpBroadcast256MaskedInt16x8(v) + case OpBroadcast256MaskedInt32x4: + return rewriteValueAMD64_OpBroadcast256MaskedInt32x4(v) + case OpBroadcast256MaskedInt64x2: + return rewriteValueAMD64_OpBroadcast256MaskedInt64x2(v) + case OpBroadcast256MaskedInt8x16: + return rewriteValueAMD64_OpBroadcast256MaskedInt8x16(v) + case OpBroadcast256MaskedUint16x8: + return rewriteValueAMD64_OpBroadcast256MaskedUint16x8(v) + case OpBroadcast256MaskedUint32x4: + return rewriteValueAMD64_OpBroadcast256MaskedUint32x4(v) + case OpBroadcast256MaskedUint64x2: + return rewriteValueAMD64_OpBroadcast256MaskedUint64x2(v) + case OpBroadcast256MaskedUint8x16: + return rewriteValueAMD64_OpBroadcast256MaskedUint8x16(v) + case OpBroadcast256Uint16x8: + v.Op = OpAMD64VPBROADCASTW256 + return true + case OpBroadcast256Uint32x4: + v.Op = OpAMD64VPBROADCASTD256 + return true + case OpBroadcast256Uint64x2: + v.Op = OpAMD64VPBROADCASTQ256 + return true + case OpBroadcast256Uint8x16: + v.Op = OpAMD64VPBROADCASTB256 + return true + case OpBroadcast512Float32x4: + v.Op = OpAMD64VBROADCASTSS512 + return true + case OpBroadcast512Float64x2: + v.Op = OpAMD64VBROADCASTSD512 + return true + case OpBroadcast512Int16x8: + v.Op = OpAMD64VPBROADCASTW512 + return true + case OpBroadcast512Int32x4: + v.Op = OpAMD64VPBROADCASTD512 + return true + case OpBroadcast512Int64x2: + v.Op = OpAMD64VPBROADCASTQ512 + return true + case OpBroadcast512Int8x16: + v.Op = OpAMD64VPBROADCASTB512 + return true + case OpBroadcast512MaskedFloat32x4: + return rewriteValueAMD64_OpBroadcast512MaskedFloat32x4(v) + case OpBroadcast512MaskedFloat64x2: + return rewriteValueAMD64_OpBroadcast512MaskedFloat64x2(v) + case OpBroadcast512MaskedInt16x8: + return rewriteValueAMD64_OpBroadcast512MaskedInt16x8(v) + case OpBroadcast512MaskedInt32x4: + return rewriteValueAMD64_OpBroadcast512MaskedInt32x4(v) + case OpBroadcast512MaskedInt64x2: + return rewriteValueAMD64_OpBroadcast512MaskedInt64x2(v) + case OpBroadcast512MaskedInt8x16: + return rewriteValueAMD64_OpBroadcast512MaskedInt8x16(v) + case OpBroadcast512MaskedUint16x8: + return rewriteValueAMD64_OpBroadcast512MaskedUint16x8(v) + case OpBroadcast512MaskedUint32x4: + return rewriteValueAMD64_OpBroadcast512MaskedUint32x4(v) + case OpBroadcast512MaskedUint64x2: + return rewriteValueAMD64_OpBroadcast512MaskedUint64x2(v) + case OpBroadcast512MaskedUint8x16: + return rewriteValueAMD64_OpBroadcast512MaskedUint8x16(v) + case OpBroadcast512Uint16x8: + v.Op = OpAMD64VPBROADCASTW512 + return true + case OpBroadcast512Uint32x4: + v.Op = OpAMD64VPBROADCASTD512 + return true + case OpBroadcast512Uint64x2: + v.Op = OpAMD64VPBROADCASTQ512 + return true + case OpBroadcast512Uint8x16: + v.Op = OpAMD64VPBROADCASTB512 + return true case OpBswap16: return rewriteValueAMD64_OpBswap16(v) case OpBswap32: @@ -4539,6 +4689,12 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpSelect1(v) case OpSelectN: return rewriteValueAMD64_OpSelectN(v) + case OpSetElemFloat32x4: + v.Op = OpAMD64VPINSRD128 + return true + case OpSetElemFloat64x2: + v.Op = OpAMD64VPINSRQ128 + return true case OpSetElemInt16x8: v.Op = OpAMD64VPINSRW128 return true @@ -31628,6 +31784,486 @@ func rewriteValueAMD64_OpBitLen8(v *Value) bool { } return false } +func rewriteValueAMD64_OpBroadcast128MaskedFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast128MaskedFloat32x4 x mask) + // result: (VBROADCASTSSMasked128 x (VPMOVVec32x4ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VBROADCASTSSMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast128MaskedFloat64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast128MaskedFloat64x2 x mask) + // result: (VPBROADCASTQMasked128 x (VPMOVVec64x2ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPBROADCASTQMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast128MaskedInt16x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast128MaskedInt16x8 x mask) + // result: (VPBROADCASTWMasked128 x (VPMOVVec16x8ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPBROADCASTWMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast128MaskedInt32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast128MaskedInt32x4 x mask) + // result: (VPBROADCASTDMasked128 x (VPMOVVec32x4ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPBROADCASTDMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast128MaskedInt64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast128MaskedInt64x2 x mask) + // result: (VPBROADCASTQMasked128 x (VPMOVVec64x2ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPBROADCASTQMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast128MaskedInt8x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast128MaskedInt8x16 x mask) + // result: (VPBROADCASTBMasked128 x (VPMOVVec8x16ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPBROADCASTBMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast128MaskedUint16x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast128MaskedUint16x8 x mask) + // result: (VPBROADCASTWMasked128 x (VPMOVVec16x8ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPBROADCASTWMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast128MaskedUint32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast128MaskedUint32x4 x mask) + // result: (VPBROADCASTDMasked128 x (VPMOVVec32x4ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPBROADCASTDMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast128MaskedUint64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast128MaskedUint64x2 x mask) + // result: (VPBROADCASTQMasked128 x (VPMOVVec64x2ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPBROADCASTQMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast128MaskedUint8x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast128MaskedUint8x16 x mask) + // result: (VPBROADCASTBMasked128 x (VPMOVVec8x16ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPBROADCASTBMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast256MaskedFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast256MaskedFloat32x4 x mask) + // result: (VBROADCASTSSMasked256 x (VPMOVVec32x4ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VBROADCASTSSMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast256MaskedFloat64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast256MaskedFloat64x2 x mask) + // result: (VBROADCASTSDMasked256 x (VPMOVVec64x2ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VBROADCASTSDMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast256MaskedInt16x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast256MaskedInt16x8 x mask) + // result: (VPBROADCASTWMasked256 x (VPMOVVec16x8ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPBROADCASTWMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast256MaskedInt32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast256MaskedInt32x4 x mask) + // result: (VPBROADCASTDMasked256 x (VPMOVVec32x4ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPBROADCASTDMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast256MaskedInt64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast256MaskedInt64x2 x mask) + // result: (VPBROADCASTQMasked256 x (VPMOVVec64x2ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPBROADCASTQMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast256MaskedInt8x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast256MaskedInt8x16 x mask) + // result: (VPBROADCASTBMasked256 x (VPMOVVec8x16ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPBROADCASTBMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast256MaskedUint16x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast256MaskedUint16x8 x mask) + // result: (VPBROADCASTWMasked256 x (VPMOVVec16x8ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPBROADCASTWMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast256MaskedUint32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast256MaskedUint32x4 x mask) + // result: (VPBROADCASTDMasked256 x (VPMOVVec32x4ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPBROADCASTDMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast256MaskedUint64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast256MaskedUint64x2 x mask) + // result: (VPBROADCASTQMasked256 x (VPMOVVec64x2ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPBROADCASTQMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast256MaskedUint8x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast256MaskedUint8x16 x mask) + // result: (VPBROADCASTBMasked256 x (VPMOVVec8x16ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPBROADCASTBMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast512MaskedFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast512MaskedFloat32x4 x mask) + // result: (VBROADCASTSSMasked512 x (VPMOVVec32x4ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VBROADCASTSSMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast512MaskedFloat64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast512MaskedFloat64x2 x mask) + // result: (VBROADCASTSDMasked512 x (VPMOVVec64x2ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VBROADCASTSDMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast512MaskedInt16x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast512MaskedInt16x8 x mask) + // result: (VPBROADCASTWMasked512 x (VPMOVVec16x8ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPBROADCASTWMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast512MaskedInt32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast512MaskedInt32x4 x mask) + // result: (VPBROADCASTDMasked512 x (VPMOVVec32x4ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPBROADCASTDMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast512MaskedInt64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast512MaskedInt64x2 x mask) + // result: (VPBROADCASTQMasked512 x (VPMOVVec64x2ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPBROADCASTQMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast512MaskedInt8x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast512MaskedInt8x16 x mask) + // result: (VPBROADCASTBMasked512 x (VPMOVVec8x16ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPBROADCASTBMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast512MaskedUint16x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast512MaskedUint16x8 x mask) + // result: (VPBROADCASTWMasked512 x (VPMOVVec16x8ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPBROADCASTWMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast512MaskedUint32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast512MaskedUint32x4 x mask) + // result: (VPBROADCASTDMasked512 x (VPMOVVec32x4ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPBROADCASTDMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast512MaskedUint64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast512MaskedUint64x2 x mask) + // result: (VPBROADCASTQMasked512 x (VPMOVVec64x2ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPBROADCASTQMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpBroadcast512MaskedUint8x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (Broadcast512MaskedUint8x16 x mask) + // result: (VPBROADCASTBMasked512 x (VPMOVVec8x16ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPBROADCASTBMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} func rewriteValueAMD64_OpBswap16(v *Value) bool { v_0 := v.Args[0] // match: (Bswap16 x) diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index e14e02a71e..7a95a4450d 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -240,6 +240,66 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Uint16x8.AverageMasked", opLen3(ssa.OpAverageMaskedUint16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint16x16.AverageMasked", opLen3(ssa.OpAverageMaskedUint16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint16x32.AverageMasked", opLen3(ssa.OpAverageMaskedUint16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.Broadcast128", opLen1(ssa.OpBroadcast128Float32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x2.Broadcast128", opLen1(ssa.OpBroadcast128Float64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x16.Broadcast128", opLen1(ssa.OpBroadcast128Int8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x8.Broadcast128", opLen1(ssa.OpBroadcast128Int16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x4.Broadcast128", opLen1(ssa.OpBroadcast128Int32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int64x2.Broadcast128", opLen1(ssa.OpBroadcast128Int64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint8x16.Broadcast128", opLen1(ssa.OpBroadcast128Uint8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x8.Broadcast128", opLen1(ssa.OpBroadcast128Uint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x4.Broadcast128", opLen1(ssa.OpBroadcast128Uint32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint64x2.Broadcast128", opLen1(ssa.OpBroadcast128Uint64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x4.Broadcast128Masked", opLen2(ssa.OpBroadcast128MaskedFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x2.Broadcast128Masked", opLen2(ssa.OpBroadcast128MaskedFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x16.Broadcast128Masked", opLen2(ssa.OpBroadcast128MaskedInt8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x8.Broadcast128Masked", opLen2(ssa.OpBroadcast128MaskedInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x4.Broadcast128Masked", opLen2(ssa.OpBroadcast128MaskedInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int64x2.Broadcast128Masked", opLen2(ssa.OpBroadcast128MaskedInt64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint8x16.Broadcast128Masked", opLen2(ssa.OpBroadcast128MaskedUint8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x8.Broadcast128Masked", opLen2(ssa.OpBroadcast128MaskedUint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x4.Broadcast128Masked", opLen2(ssa.OpBroadcast128MaskedUint32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint64x2.Broadcast128Masked", opLen2(ssa.OpBroadcast128MaskedUint64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x4.Broadcast256", opLen1(ssa.OpBroadcast256Float32x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x2.Broadcast256", opLen1(ssa.OpBroadcast256Float64x2, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x16.Broadcast256", opLen1(ssa.OpBroadcast256Int8x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x8.Broadcast256", opLen1(ssa.OpBroadcast256Int16x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x4.Broadcast256", opLen1(ssa.OpBroadcast256Int32x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int64x2.Broadcast256", opLen1(ssa.OpBroadcast256Int64x2, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x16.Broadcast256", opLen1(ssa.OpBroadcast256Uint8x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x8.Broadcast256", opLen1(ssa.OpBroadcast256Uint16x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x4.Broadcast256", opLen1(ssa.OpBroadcast256Uint32x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint64x2.Broadcast256", opLen1(ssa.OpBroadcast256Uint64x2, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x4.Broadcast256Masked", opLen2(ssa.OpBroadcast256MaskedFloat32x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x2.Broadcast256Masked", opLen2(ssa.OpBroadcast256MaskedFloat64x2, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x16.Broadcast256Masked", opLen2(ssa.OpBroadcast256MaskedInt8x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x8.Broadcast256Masked", opLen2(ssa.OpBroadcast256MaskedInt16x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x4.Broadcast256Masked", opLen2(ssa.OpBroadcast256MaskedInt32x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int64x2.Broadcast256Masked", opLen2(ssa.OpBroadcast256MaskedInt64x2, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x16.Broadcast256Masked", opLen2(ssa.OpBroadcast256MaskedUint8x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x8.Broadcast256Masked", opLen2(ssa.OpBroadcast256MaskedUint16x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x4.Broadcast256Masked", opLen2(ssa.OpBroadcast256MaskedUint32x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint64x2.Broadcast256Masked", opLen2(ssa.OpBroadcast256MaskedUint64x2, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x4.Broadcast512", opLen1(ssa.OpBroadcast512Float32x4, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float64x2.Broadcast512", opLen1(ssa.OpBroadcast512Float64x2, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.Broadcast512", opLen1(ssa.OpBroadcast512Int8x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.Broadcast512", opLen1(ssa.OpBroadcast512Int16x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x4.Broadcast512", opLen1(ssa.OpBroadcast512Int32x4, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int64x2.Broadcast512", opLen1(ssa.OpBroadcast512Int64x2, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint8x16.Broadcast512", opLen1(ssa.OpBroadcast512Uint8x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint16x8.Broadcast512", opLen1(ssa.OpBroadcast512Uint16x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint32x4.Broadcast512", opLen1(ssa.OpBroadcast512Uint32x4, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint64x2.Broadcast512", opLen1(ssa.OpBroadcast512Uint64x2, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.Broadcast512Masked", opLen2(ssa.OpBroadcast512MaskedFloat32x4, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float64x2.Broadcast512Masked", opLen2(ssa.OpBroadcast512MaskedFloat64x2, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.Broadcast512Masked", opLen2(ssa.OpBroadcast512MaskedInt8x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.Broadcast512Masked", opLen2(ssa.OpBroadcast512MaskedInt16x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x4.Broadcast512Masked", opLen2(ssa.OpBroadcast512MaskedInt32x4, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int64x2.Broadcast512Masked", opLen2(ssa.OpBroadcast512MaskedInt64x2, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint8x16.Broadcast512Masked", opLen2(ssa.OpBroadcast512MaskedUint8x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint16x8.Broadcast512Masked", opLen2(ssa.OpBroadcast512MaskedUint16x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint32x4.Broadcast512Masked", opLen2(ssa.OpBroadcast512MaskedUint32x4, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint64x2.Broadcast512Masked", opLen2(ssa.OpBroadcast512MaskedUint64x2, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.Ceil", opLen1(ssa.OpCeilFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.Ceil", opLen1(ssa.OpCeilFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x2.Ceil", opLen1(ssa.OpCeilFloat64x2, types.TypeVec128), sys.AMD64) @@ -1408,6 +1468,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Float64x2.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x4.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x8.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.SetElem", opLen2Imm8(ssa.OpSetElemFloat32x4, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Float64x2.SetElem", opLen2Imm8(ssa.OpSetElemFloat64x2, types.TypeVec128, 0), sys.AMD64) addF(simdPackage, "Int8x16.SetElem", opLen2Imm8(ssa.OpSetElemInt8x16, types.TypeVec128, 0), sys.AMD64) addF(simdPackage, "Int16x8.SetElem", opLen2Imm8(ssa.OpSetElemInt16x8, types.TypeVec128, 0), sys.AMD64) addF(simdPackage, "Int32x4.SetElem", opLen2Imm8(ssa.OpSetElemInt32x4, types.TypeVec128, 0), sys.AMD64) diff --git a/src/simd/genfiles.go b/src/simd/genfiles.go index c7c6aae374..8b36da71ab 100644 --- a/src/simd/genfiles.go +++ b/src/simd/genfiles.go @@ -87,6 +87,23 @@ var ternaryFlaky = &shapes{ // for tests that support flaky equality floats: []int{32}, } +type templateData struct { + Vec string // the type of the vector, e.g. Float32x4 + AOrAn string // for documentation, the article "a" or "an" + Width int // the bit width of the element type, e.g. 32 + Vwidth int // the width of the vector type, e.g. 128 + Count int // the number of elements, e.g. 4 + WxC string // the width-by-type string, e.g., "32x4" + BxC string // as if bytes, in the proper count, e.g., "8x16" (W==8) + Base string // the capitalized Base Type of the vector, e.g., "Float" + Type string // the element type, e.g. "float32" + OxFF string // a mask for the lowest 'count' bits +} + +func (t templateData) As128BitVec() string { + return fmt.Sprintf("%s%dx%d", t.Base, t.Width, 128/t.Width) +} + func oneTemplate(t *template.Template, baseType string, width, count int, out io.Writer) { b := width * count if b < 128 || b > 512 { @@ -102,26 +119,17 @@ func oneTemplate(t *template.Template, baseType string, width, count int, out io aOrAn = "an" } oxFF := fmt.Sprintf("0x%x", uint64((1<