From 8ac4477d83672af8c3d39399685731ee6b81ce2f Mon Sep 17 00:00:00 2001 From: Cherry Mui Date: Thu, 8 Jan 2026 11:57:28 -0500 Subject: [PATCH] simd/archsimd: rename Broadcast methods Currently the Broadcast128/256/512 methods broadcast the lowest element of the input vector to a vector of the corresponding width. There are also variations of broadcast operations that broadcast the whole (128- or 256-bit) vector to a larger vector, which we don't yet support. Our current naming is unclear which version it is, though. Rename the current ones to Broadcast1ToN, to be clear that they broadcast one element. The vector version probably will be named BroadcastAllToN (not included in this CL). Change-Id: I47a21e367f948ec0b578d63706a40d20f5a9f46d Reviewed-on: https://go-review.googlesource.com/c/go/+/734840 LUCI-TryBot-Result: Go LUCI Reviewed-by: Junyang Shao --- src/cmd/compile/internal/amd64/simdssa.go | 88 +++---- .../compile/internal/ssa/_gen/simdAMD64.rules | 82 +++--- .../internal/ssa/_gen/simdgenericOps.go | 60 ++--- src/cmd/compile/internal/ssa/opGen.go | 120 ++++----- src/cmd/compile/internal/ssa/rewriteAMD64.go | 112 ++++---- .../compile/internal/ssagen/simdintrinsics.go | 60 ++--- .../_gen/simdgen/ops/Moves/categories.yaml | 33 ++- .../archsimd/_gen/simdgen/ops/Moves/go.yaml | 79 ++++-- src/simd/archsimd/_gen/tmplgen/main.go | 2 +- src/simd/archsimd/ops_amd64.go | 240 +++++++++--------- src/simd/archsimd/other_gen_amd64.go | 60 ++--- 11 files changed, 490 insertions(+), 446 deletions(-) diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index c4d0fd69c6..a028cbe86d 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -25,23 +25,23 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPABSQ128, ssa.OpAMD64VPABSQ256, ssa.OpAMD64VPABSQ512, - ssa.OpAMD64VBROADCASTSS128, ssa.OpAMD64VPBROADCASTQ128, - ssa.OpAMD64VPBROADCASTB128, - 
ssa.OpAMD64VPBROADCASTW128, + ssa.OpAMD64VBROADCASTSS128, + ssa.OpAMD64VBROADCASTSD256, ssa.OpAMD64VPBROADCASTD128, + ssa.OpAMD64VPBROADCASTQ256, ssa.OpAMD64VBROADCASTSS256, - ssa.OpAMD64VBROADCASTSD256, - ssa.OpAMD64VPBROADCASTB256, - ssa.OpAMD64VPBROADCASTW256, + ssa.OpAMD64VBROADCASTSD512, + ssa.OpAMD64VPBROADCASTW128, ssa.OpAMD64VPBROADCASTD256, - ssa.OpAMD64VPBROADCASTQ256, + ssa.OpAMD64VPBROADCASTQ512, ssa.OpAMD64VBROADCASTSS512, - ssa.OpAMD64VBROADCASTSD512, - ssa.OpAMD64VPBROADCASTB512, - ssa.OpAMD64VPBROADCASTW512, + ssa.OpAMD64VPBROADCASTB128, + ssa.OpAMD64VPBROADCASTW256, ssa.OpAMD64VPBROADCASTD512, - ssa.OpAMD64VPBROADCASTQ512, + ssa.OpAMD64VPBROADCASTB256, + ssa.OpAMD64VPBROADCASTW512, + ssa.OpAMD64VPBROADCASTB512, ssa.OpAMD64VCVTPD2PSX128, ssa.OpAMD64VCVTPD2PSY128, ssa.OpAMD64VCVTPD2PS256, @@ -832,23 +832,23 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPABSQMasked128, ssa.OpAMD64VPABSQMasked256, ssa.OpAMD64VPABSQMasked512, - ssa.OpAMD64VBROADCASTSSMasked128, ssa.OpAMD64VPBROADCASTQMasked128, - ssa.OpAMD64VPBROADCASTBMasked128, - ssa.OpAMD64VPBROADCASTWMasked128, + ssa.OpAMD64VBROADCASTSSMasked128, + ssa.OpAMD64VBROADCASTSDMasked256, ssa.OpAMD64VPBROADCASTDMasked128, + ssa.OpAMD64VPBROADCASTQMasked256, ssa.OpAMD64VBROADCASTSSMasked256, - ssa.OpAMD64VBROADCASTSDMasked256, - ssa.OpAMD64VPBROADCASTBMasked256, - ssa.OpAMD64VPBROADCASTWMasked256, + ssa.OpAMD64VBROADCASTSDMasked512, + ssa.OpAMD64VPBROADCASTWMasked128, ssa.OpAMD64VPBROADCASTDMasked256, - ssa.OpAMD64VPBROADCASTQMasked256, + ssa.OpAMD64VPBROADCASTQMasked512, ssa.OpAMD64VBROADCASTSSMasked512, - ssa.OpAMD64VBROADCASTSDMasked512, - ssa.OpAMD64VPBROADCASTBMasked512, - ssa.OpAMD64VPBROADCASTWMasked512, + ssa.OpAMD64VPBROADCASTBMasked128, + ssa.OpAMD64VPBROADCASTWMasked256, ssa.OpAMD64VPBROADCASTDMasked512, - ssa.OpAMD64VPBROADCASTQMasked512, + ssa.OpAMD64VPBROADCASTBMasked256, + ssa.OpAMD64VPBROADCASTWMasked512, + ssa.OpAMD64VPBROADCASTBMasked512, 
ssa.OpAMD64VCOMPRESSPSMasked128, ssa.OpAMD64VCOMPRESSPSMasked256, ssa.OpAMD64VCOMPRESSPSMasked512, @@ -2460,23 +2460,23 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPABSQMasked128Merging, ssa.OpAMD64VPABSQMasked256Merging, ssa.OpAMD64VPABSQMasked512Merging, - ssa.OpAMD64VBROADCASTSSMasked128Merging, ssa.OpAMD64VPBROADCASTQMasked128Merging, - ssa.OpAMD64VPBROADCASTBMasked128Merging, - ssa.OpAMD64VPBROADCASTWMasked128Merging, + ssa.OpAMD64VBROADCASTSSMasked128Merging, + ssa.OpAMD64VBROADCASTSDMasked256Merging, ssa.OpAMD64VPBROADCASTDMasked128Merging, + ssa.OpAMD64VPBROADCASTQMasked256Merging, ssa.OpAMD64VBROADCASTSSMasked256Merging, - ssa.OpAMD64VBROADCASTSDMasked256Merging, - ssa.OpAMD64VPBROADCASTBMasked256Merging, - ssa.OpAMD64VPBROADCASTWMasked256Merging, + ssa.OpAMD64VBROADCASTSDMasked512Merging, + ssa.OpAMD64VPBROADCASTWMasked128Merging, ssa.OpAMD64VPBROADCASTDMasked256Merging, - ssa.OpAMD64VPBROADCASTQMasked256Merging, + ssa.OpAMD64VPBROADCASTQMasked512Merging, ssa.OpAMD64VBROADCASTSSMasked512Merging, - ssa.OpAMD64VBROADCASTSDMasked512Merging, - ssa.OpAMD64VPBROADCASTBMasked512Merging, - ssa.OpAMD64VPBROADCASTWMasked512Merging, + ssa.OpAMD64VPBROADCASTBMasked128Merging, + ssa.OpAMD64VPBROADCASTWMasked256Merging, ssa.OpAMD64VPBROADCASTDMasked512Merging, - ssa.OpAMD64VPBROADCASTQMasked512Merging, + ssa.OpAMD64VPBROADCASTBMasked256Merging, + ssa.OpAMD64VPBROADCASTWMasked512Merging, + ssa.OpAMD64VPBROADCASTBMasked512Merging, ssa.OpAMD64VRNDSCALEPSMasked128Merging, ssa.OpAMD64VRNDSCALEPSMasked256Merging, ssa.OpAMD64VRNDSCALEPSMasked512Merging, @@ -2817,23 +2817,23 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPAVGWMasked128, ssa.OpAMD64VPAVGWMasked256, ssa.OpAMD64VPAVGWMasked512, - ssa.OpAMD64VBROADCASTSSMasked128, ssa.OpAMD64VPBROADCASTQMasked128, - ssa.OpAMD64VPBROADCASTBMasked128, - ssa.OpAMD64VPBROADCASTWMasked128, + ssa.OpAMD64VBROADCASTSSMasked128, + ssa.OpAMD64VBROADCASTSDMasked256, 
ssa.OpAMD64VPBROADCASTDMasked128, + ssa.OpAMD64VPBROADCASTQMasked256, ssa.OpAMD64VBROADCASTSSMasked256, - ssa.OpAMD64VBROADCASTSDMasked256, - ssa.OpAMD64VPBROADCASTBMasked256, - ssa.OpAMD64VPBROADCASTWMasked256, + ssa.OpAMD64VBROADCASTSDMasked512, + ssa.OpAMD64VPBROADCASTWMasked128, ssa.OpAMD64VPBROADCASTDMasked256, - ssa.OpAMD64VPBROADCASTQMasked256, + ssa.OpAMD64VPBROADCASTQMasked512, ssa.OpAMD64VBROADCASTSSMasked512, - ssa.OpAMD64VBROADCASTSDMasked512, - ssa.OpAMD64VPBROADCASTBMasked512, - ssa.OpAMD64VPBROADCASTWMasked512, + ssa.OpAMD64VPBROADCASTBMasked128, + ssa.OpAMD64VPBROADCASTWMasked256, ssa.OpAMD64VPBROADCASTDMasked512, - ssa.OpAMD64VPBROADCASTQMasked512, + ssa.OpAMD64VPBROADCASTBMasked256, + ssa.OpAMD64VPBROADCASTWMasked512, + ssa.OpAMD64VPBROADCASTBMasked512, ssa.OpAMD64VRNDSCALEPSMasked128, ssa.OpAMD64VRNDSCALEPSMasked128load, ssa.OpAMD64VRNDSCALEPSMasked256, diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index 5c83f39a1f..799461610d 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -140,36 +140,36 @@ (AverageUint16x8 ...) => (VPAVGW128 ...) (AverageUint16x16 ...) => (VPAVGW256 ...) (AverageUint16x32 ...) => (VPAVGW512 ...) -(Broadcast128Float32x4 ...) => (VBROADCASTSS128 ...) -(Broadcast128Float64x2 ...) => (VPBROADCASTQ128 ...) -(Broadcast128Int8x16 ...) => (VPBROADCASTB128 ...) -(Broadcast128Int16x8 ...) => (VPBROADCASTW128 ...) -(Broadcast128Int32x4 ...) => (VPBROADCASTD128 ...) -(Broadcast128Int64x2 ...) => (VPBROADCASTQ128 ...) -(Broadcast128Uint8x16 ...) => (VPBROADCASTB128 ...) -(Broadcast128Uint16x8 ...) => (VPBROADCASTW128 ...) -(Broadcast128Uint32x4 ...) => (VPBROADCASTD128 ...) -(Broadcast128Uint64x2 ...) => (VPBROADCASTQ128 ...) -(Broadcast256Float32x4 ...) => (VBROADCASTSS256 ...) -(Broadcast256Float64x2 ...) => (VBROADCASTSD256 ...) -(Broadcast256Int8x16 ...) => (VPBROADCASTB256 ...) 
-(Broadcast256Int16x8 ...) => (VPBROADCASTW256 ...) -(Broadcast256Int32x4 ...) => (VPBROADCASTD256 ...) -(Broadcast256Int64x2 ...) => (VPBROADCASTQ256 ...) -(Broadcast256Uint8x16 ...) => (VPBROADCASTB256 ...) -(Broadcast256Uint16x8 ...) => (VPBROADCASTW256 ...) -(Broadcast256Uint32x4 ...) => (VPBROADCASTD256 ...) -(Broadcast256Uint64x2 ...) => (VPBROADCASTQ256 ...) -(Broadcast512Float32x4 ...) => (VBROADCASTSS512 ...) -(Broadcast512Float64x2 ...) => (VBROADCASTSD512 ...) -(Broadcast512Int8x16 ...) => (VPBROADCASTB512 ...) -(Broadcast512Int16x8 ...) => (VPBROADCASTW512 ...) -(Broadcast512Int32x4 ...) => (VPBROADCASTD512 ...) -(Broadcast512Int64x2 ...) => (VPBROADCASTQ512 ...) -(Broadcast512Uint8x16 ...) => (VPBROADCASTB512 ...) -(Broadcast512Uint16x8 ...) => (VPBROADCASTW512 ...) -(Broadcast512Uint32x4 ...) => (VPBROADCASTD512 ...) -(Broadcast512Uint64x2 ...) => (VPBROADCASTQ512 ...) +(Broadcast1To2Float64x2 ...) => (VPBROADCASTQ128 ...) +(Broadcast1To2Int64x2 ...) => (VPBROADCASTQ128 ...) +(Broadcast1To2Uint64x2 ...) => (VPBROADCASTQ128 ...) +(Broadcast1To4Float32x4 ...) => (VBROADCASTSS128 ...) +(Broadcast1To4Float64x2 ...) => (VBROADCASTSD256 ...) +(Broadcast1To4Int32x4 ...) => (VPBROADCASTD128 ...) +(Broadcast1To4Int64x2 ...) => (VPBROADCASTQ256 ...) +(Broadcast1To4Uint32x4 ...) => (VPBROADCASTD128 ...) +(Broadcast1To4Uint64x2 ...) => (VPBROADCASTQ256 ...) +(Broadcast1To8Float32x4 ...) => (VBROADCASTSS256 ...) +(Broadcast1To8Float64x2 ...) => (VBROADCASTSD512 ...) +(Broadcast1To8Int16x8 ...) => (VPBROADCASTW128 ...) +(Broadcast1To8Int32x4 ...) => (VPBROADCASTD256 ...) +(Broadcast1To8Int64x2 ...) => (VPBROADCASTQ512 ...) +(Broadcast1To8Uint16x8 ...) => (VPBROADCASTW128 ...) +(Broadcast1To8Uint32x4 ...) => (VPBROADCASTD256 ...) +(Broadcast1To8Uint64x2 ...) => (VPBROADCASTQ512 ...) +(Broadcast1To16Float32x4 ...) => (VBROADCASTSS512 ...) +(Broadcast1To16Int8x16 ...) => (VPBROADCASTB128 ...) +(Broadcast1To16Int16x8 ...) => (VPBROADCASTW256 ...) 
+(Broadcast1To16Int32x4 ...) => (VPBROADCASTD512 ...) +(Broadcast1To16Uint8x16 ...) => (VPBROADCASTB128 ...) +(Broadcast1To16Uint16x8 ...) => (VPBROADCASTW256 ...) +(Broadcast1To16Uint32x4 ...) => (VPBROADCASTD512 ...) +(Broadcast1To32Int8x16 ...) => (VPBROADCASTB256 ...) +(Broadcast1To32Int16x8 ...) => (VPBROADCASTW512 ...) +(Broadcast1To32Uint8x16 ...) => (VPBROADCASTB256 ...) +(Broadcast1To32Uint16x8 ...) => (VPBROADCASTW512 ...) +(Broadcast1To64Int8x16 ...) => (VPBROADCASTB512 ...) +(Broadcast1To64Uint8x16 ...) => (VPBROADCASTB512 ...) (CeilFloat32x4 x) => (VROUNDPS128 [2] x) (CeilFloat32x8 x) => (VROUNDPS256 [2] x) (CeilFloat64x2 x) => (VROUNDPD128 [2] x) @@ -1424,23 +1424,23 @@ (VMOVDQU16Masked128 (VPAVGW128 x y) mask) => (VPAVGWMasked128 x y mask) (VMOVDQU16Masked256 (VPAVGW256 x y) mask) => (VPAVGWMasked256 x y mask) (VMOVDQU16Masked512 (VPAVGW512 x y) mask) => (VPAVGWMasked512 x y mask) -(VMOVDQU32Masked128 (VBROADCASTSS128 x) mask) => (VBROADCASTSSMasked128 x mask) (VMOVDQU64Masked128 (VPBROADCASTQ128 x) mask) => (VPBROADCASTQMasked128 x mask) -(VMOVDQU8Masked128 (VPBROADCASTB128 x) mask) => (VPBROADCASTBMasked128 x mask) -(VMOVDQU16Masked128 (VPBROADCASTW128 x) mask) => (VPBROADCASTWMasked128 x mask) +(VMOVDQU32Masked128 (VBROADCASTSS128 x) mask) => (VBROADCASTSSMasked128 x mask) +(VMOVDQU64Masked256 (VBROADCASTSD256 x) mask) => (VBROADCASTSDMasked256 x mask) (VMOVDQU32Masked128 (VPBROADCASTD128 x) mask) => (VPBROADCASTDMasked128 x mask) +(VMOVDQU64Masked256 (VPBROADCASTQ256 x) mask) => (VPBROADCASTQMasked256 x mask) (VMOVDQU32Masked256 (VBROADCASTSS256 x) mask) => (VBROADCASTSSMasked256 x mask) -(VMOVDQU64Masked256 (VBROADCASTSD256 x) mask) => (VBROADCASTSDMasked256 x mask) -(VMOVDQU8Masked256 (VPBROADCASTB256 x) mask) => (VPBROADCASTBMasked256 x mask) -(VMOVDQU16Masked256 (VPBROADCASTW256 x) mask) => (VPBROADCASTWMasked256 x mask) +(VMOVDQU64Masked512 (VBROADCASTSD512 x) mask) => (VBROADCASTSDMasked512 x mask) +(VMOVDQU16Masked128 (VPBROADCASTW128 x) 
mask) => (VPBROADCASTWMasked128 x mask) (VMOVDQU32Masked256 (VPBROADCASTD256 x) mask) => (VPBROADCASTDMasked256 x mask) -(VMOVDQU64Masked256 (VPBROADCASTQ256 x) mask) => (VPBROADCASTQMasked256 x mask) +(VMOVDQU64Masked512 (VPBROADCASTQ512 x) mask) => (VPBROADCASTQMasked512 x mask) (VMOVDQU32Masked512 (VBROADCASTSS512 x) mask) => (VBROADCASTSSMasked512 x mask) -(VMOVDQU64Masked512 (VBROADCASTSD512 x) mask) => (VBROADCASTSDMasked512 x mask) -(VMOVDQU8Masked512 (VPBROADCASTB512 x) mask) => (VPBROADCASTBMasked512 x mask) -(VMOVDQU16Masked512 (VPBROADCASTW512 x) mask) => (VPBROADCASTWMasked512 x mask) +(VMOVDQU8Masked128 (VPBROADCASTB128 x) mask) => (VPBROADCASTBMasked128 x mask) +(VMOVDQU16Masked256 (VPBROADCASTW256 x) mask) => (VPBROADCASTWMasked256 x mask) (VMOVDQU32Masked512 (VPBROADCASTD512 x) mask) => (VPBROADCASTDMasked512 x mask) -(VMOVDQU64Masked512 (VPBROADCASTQ512 x) mask) => (VPBROADCASTQMasked512 x mask) +(VMOVDQU8Masked256 (VPBROADCASTB256 x) mask) => (VPBROADCASTBMasked256 x mask) +(VMOVDQU16Masked512 (VPBROADCASTW512 x) mask) => (VPBROADCASTWMasked512 x mask) +(VMOVDQU8Masked512 (VPBROADCASTB512 x) mask) => (VPBROADCASTBMasked512 x mask) (VMOVDQU32Masked128 (VRNDSCALEPS128 [a] x) mask) => (VRNDSCALEPSMasked128 [a] x mask) (VMOVDQU32Masked256 (VRNDSCALEPS256 [a] x) mask) => (VRNDSCALEPSMasked256 [a] x mask) (VMOVDQU32Masked512 (VRNDSCALEPS512 [a] x) mask) => (VRNDSCALEPSMasked512 [a] x mask) diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index 889ab0d84f..ff863a389f 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -143,36 +143,36 @@ func simdGenericOps() []opData { {name: "AverageUint16x8", argLength: 2, commutative: true}, {name: "AverageUint16x16", argLength: 2, commutative: true}, {name: "AverageUint16x32", argLength: 2, commutative: true}, - {name: "Broadcast128Float32x4", argLength: 1, commutative: 
false}, - {name: "Broadcast128Float64x2", argLength: 1, commutative: false}, - {name: "Broadcast128Int8x16", argLength: 1, commutative: false}, - {name: "Broadcast128Int16x8", argLength: 1, commutative: false}, - {name: "Broadcast128Int32x4", argLength: 1, commutative: false}, - {name: "Broadcast128Int64x2", argLength: 1, commutative: false}, - {name: "Broadcast128Uint8x16", argLength: 1, commutative: false}, - {name: "Broadcast128Uint16x8", argLength: 1, commutative: false}, - {name: "Broadcast128Uint32x4", argLength: 1, commutative: false}, - {name: "Broadcast128Uint64x2", argLength: 1, commutative: false}, - {name: "Broadcast256Float32x4", argLength: 1, commutative: false}, - {name: "Broadcast256Float64x2", argLength: 1, commutative: false}, - {name: "Broadcast256Int8x16", argLength: 1, commutative: false}, - {name: "Broadcast256Int16x8", argLength: 1, commutative: false}, - {name: "Broadcast256Int32x4", argLength: 1, commutative: false}, - {name: "Broadcast256Int64x2", argLength: 1, commutative: false}, - {name: "Broadcast256Uint8x16", argLength: 1, commutative: false}, - {name: "Broadcast256Uint16x8", argLength: 1, commutative: false}, - {name: "Broadcast256Uint32x4", argLength: 1, commutative: false}, - {name: "Broadcast256Uint64x2", argLength: 1, commutative: false}, - {name: "Broadcast512Float32x4", argLength: 1, commutative: false}, - {name: "Broadcast512Float64x2", argLength: 1, commutative: false}, - {name: "Broadcast512Int8x16", argLength: 1, commutative: false}, - {name: "Broadcast512Int16x8", argLength: 1, commutative: false}, - {name: "Broadcast512Int32x4", argLength: 1, commutative: false}, - {name: "Broadcast512Int64x2", argLength: 1, commutative: false}, - {name: "Broadcast512Uint8x16", argLength: 1, commutative: false}, - {name: "Broadcast512Uint16x8", argLength: 1, commutative: false}, - {name: "Broadcast512Uint32x4", argLength: 1, commutative: false}, - {name: "Broadcast512Uint64x2", argLength: 1, commutative: false}, + {name: 
"Broadcast1To2Float64x2", argLength: 1, commutative: false}, + {name: "Broadcast1To2Int64x2", argLength: 1, commutative: false}, + {name: "Broadcast1To2Uint64x2", argLength: 1, commutative: false}, + {name: "Broadcast1To4Float32x4", argLength: 1, commutative: false}, + {name: "Broadcast1To4Float64x2", argLength: 1, commutative: false}, + {name: "Broadcast1To4Int32x4", argLength: 1, commutative: false}, + {name: "Broadcast1To4Int64x2", argLength: 1, commutative: false}, + {name: "Broadcast1To4Uint32x4", argLength: 1, commutative: false}, + {name: "Broadcast1To4Uint64x2", argLength: 1, commutative: false}, + {name: "Broadcast1To8Float32x4", argLength: 1, commutative: false}, + {name: "Broadcast1To8Float64x2", argLength: 1, commutative: false}, + {name: "Broadcast1To8Int16x8", argLength: 1, commutative: false}, + {name: "Broadcast1To8Int32x4", argLength: 1, commutative: false}, + {name: "Broadcast1To8Int64x2", argLength: 1, commutative: false}, + {name: "Broadcast1To8Uint16x8", argLength: 1, commutative: false}, + {name: "Broadcast1To8Uint32x4", argLength: 1, commutative: false}, + {name: "Broadcast1To8Uint64x2", argLength: 1, commutative: false}, + {name: "Broadcast1To16Float32x4", argLength: 1, commutative: false}, + {name: "Broadcast1To16Int8x16", argLength: 1, commutative: false}, + {name: "Broadcast1To16Int16x8", argLength: 1, commutative: false}, + {name: "Broadcast1To16Int32x4", argLength: 1, commutative: false}, + {name: "Broadcast1To16Uint8x16", argLength: 1, commutative: false}, + {name: "Broadcast1To16Uint16x8", argLength: 1, commutative: false}, + {name: "Broadcast1To16Uint32x4", argLength: 1, commutative: false}, + {name: "Broadcast1To32Int8x16", argLength: 1, commutative: false}, + {name: "Broadcast1To32Int16x8", argLength: 1, commutative: false}, + {name: "Broadcast1To32Uint8x16", argLength: 1, commutative: false}, + {name: "Broadcast1To32Uint16x8", argLength: 1, commutative: false}, + {name: "Broadcast1To64Int8x16", argLength: 1, commutative: false}, + 
{name: "Broadcast1To64Uint8x16", argLength: 1, commutative: false}, {name: "CeilFloat32x4", argLength: 1, commutative: false}, {name: "CeilFloat32x8", argLength: 1, commutative: false}, {name: "CeilFloat64x2", argLength: 1, commutative: false}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 7b70dc2686..9e5fdb1fc1 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -6309,36 +6309,36 @@ const ( OpAverageUint16x8 OpAverageUint16x16 OpAverageUint16x32 - OpBroadcast128Float32x4 - OpBroadcast128Float64x2 - OpBroadcast128Int8x16 - OpBroadcast128Int16x8 - OpBroadcast128Int32x4 - OpBroadcast128Int64x2 - OpBroadcast128Uint8x16 - OpBroadcast128Uint16x8 - OpBroadcast128Uint32x4 - OpBroadcast128Uint64x2 - OpBroadcast256Float32x4 - OpBroadcast256Float64x2 - OpBroadcast256Int8x16 - OpBroadcast256Int16x8 - OpBroadcast256Int32x4 - OpBroadcast256Int64x2 - OpBroadcast256Uint8x16 - OpBroadcast256Uint16x8 - OpBroadcast256Uint32x4 - OpBroadcast256Uint64x2 - OpBroadcast512Float32x4 - OpBroadcast512Float64x2 - OpBroadcast512Int8x16 - OpBroadcast512Int16x8 - OpBroadcast512Int32x4 - OpBroadcast512Int64x2 - OpBroadcast512Uint8x16 - OpBroadcast512Uint16x8 - OpBroadcast512Uint32x4 - OpBroadcast512Uint64x2 + OpBroadcast1To2Float64x2 + OpBroadcast1To2Int64x2 + OpBroadcast1To2Uint64x2 + OpBroadcast1To4Float32x4 + OpBroadcast1To4Float64x2 + OpBroadcast1To4Int32x4 + OpBroadcast1To4Int64x2 + OpBroadcast1To4Uint32x4 + OpBroadcast1To4Uint64x2 + OpBroadcast1To8Float32x4 + OpBroadcast1To8Float64x2 + OpBroadcast1To8Int16x8 + OpBroadcast1To8Int32x4 + OpBroadcast1To8Int64x2 + OpBroadcast1To8Uint16x8 + OpBroadcast1To8Uint32x4 + OpBroadcast1To8Uint64x2 + OpBroadcast1To16Float32x4 + OpBroadcast1To16Int8x16 + OpBroadcast1To16Int16x8 + OpBroadcast1To16Int32x4 + OpBroadcast1To16Uint8x16 + OpBroadcast1To16Uint16x8 + OpBroadcast1To16Uint32x4 + OpBroadcast1To32Int8x16 + OpBroadcast1To32Int16x8 + OpBroadcast1To32Uint8x16 
+ OpBroadcast1To32Uint16x8 + OpBroadcast1To64Int8x16 + OpBroadcast1To64Uint8x16 OpCeilFloat32x4 OpCeilFloat32x8 OpCeilFloat64x2 @@ -89875,152 +89875,152 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "Broadcast128Float32x4", + name: "Broadcast1To2Float64x2", argLen: 1, generic: true, }, { - name: "Broadcast128Float64x2", + name: "Broadcast1To2Int64x2", argLen: 1, generic: true, }, { - name: "Broadcast128Int8x16", + name: "Broadcast1To2Uint64x2", argLen: 1, generic: true, }, { - name: "Broadcast128Int16x8", + name: "Broadcast1To4Float32x4", argLen: 1, generic: true, }, { - name: "Broadcast128Int32x4", + name: "Broadcast1To4Float64x2", argLen: 1, generic: true, }, { - name: "Broadcast128Int64x2", + name: "Broadcast1To4Int32x4", argLen: 1, generic: true, }, { - name: "Broadcast128Uint8x16", + name: "Broadcast1To4Int64x2", argLen: 1, generic: true, }, { - name: "Broadcast128Uint16x8", + name: "Broadcast1To4Uint32x4", argLen: 1, generic: true, }, { - name: "Broadcast128Uint32x4", + name: "Broadcast1To4Uint64x2", argLen: 1, generic: true, }, { - name: "Broadcast128Uint64x2", + name: "Broadcast1To8Float32x4", argLen: 1, generic: true, }, { - name: "Broadcast256Float32x4", + name: "Broadcast1To8Float64x2", argLen: 1, generic: true, }, { - name: "Broadcast256Float64x2", + name: "Broadcast1To8Int16x8", argLen: 1, generic: true, }, { - name: "Broadcast256Int8x16", + name: "Broadcast1To8Int32x4", argLen: 1, generic: true, }, { - name: "Broadcast256Int16x8", + name: "Broadcast1To8Int64x2", argLen: 1, generic: true, }, { - name: "Broadcast256Int32x4", + name: "Broadcast1To8Uint16x8", argLen: 1, generic: true, }, { - name: "Broadcast256Int64x2", + name: "Broadcast1To8Uint32x4", argLen: 1, generic: true, }, { - name: "Broadcast256Uint8x16", + name: "Broadcast1To8Uint64x2", argLen: 1, generic: true, }, { - name: "Broadcast256Uint16x8", + name: "Broadcast1To16Float32x4", argLen: 1, generic: true, }, { - name: "Broadcast256Uint32x4", + name: "Broadcast1To16Int8x16", 
argLen: 1, generic: true, }, { - name: "Broadcast256Uint64x2", + name: "Broadcast1To16Int16x8", argLen: 1, generic: true, }, { - name: "Broadcast512Float32x4", + name: "Broadcast1To16Int32x4", argLen: 1, generic: true, }, { - name: "Broadcast512Float64x2", + name: "Broadcast1To16Uint8x16", argLen: 1, generic: true, }, { - name: "Broadcast512Int8x16", + name: "Broadcast1To16Uint16x8", argLen: 1, generic: true, }, { - name: "Broadcast512Int16x8", + name: "Broadcast1To16Uint32x4", argLen: 1, generic: true, }, { - name: "Broadcast512Int32x4", + name: "Broadcast1To32Int8x16", argLen: 1, generic: true, }, { - name: "Broadcast512Int64x2", + name: "Broadcast1To32Int16x8", argLen: 1, generic: true, }, { - name: "Broadcast512Uint8x16", + name: "Broadcast1To32Uint8x16", argLen: 1, generic: true, }, { - name: "Broadcast512Uint16x8", + name: "Broadcast1To32Uint16x8", argLen: 1, generic: true, }, { - name: "Broadcast512Uint32x4", + name: "Broadcast1To64Int8x16", argLen: 1, generic: true, }, { - name: "Broadcast512Uint64x2", + name: "Broadcast1To64Uint8x16", argLen: 1, generic: true, }, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index e84bf19c83..fe0005bb05 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -2479,96 +2479,96 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpBitLen64(v) case OpBitLen8: return rewriteValueAMD64_OpBitLen8(v) - case OpBroadcast128Float32x4: - v.Op = OpAMD64VBROADCASTSS128 + case OpBroadcast1To16Float32x4: + v.Op = OpAMD64VBROADCASTSS512 return true - case OpBroadcast128Float64x2: - v.Op = OpAMD64VPBROADCASTQ128 + case OpBroadcast1To16Int16x8: + v.Op = OpAMD64VPBROADCASTW256 return true - case OpBroadcast128Int16x8: - v.Op = OpAMD64VPBROADCASTW128 + case OpBroadcast1To16Int32x4: + v.Op = OpAMD64VPBROADCASTD512 return true - case OpBroadcast128Int32x4: - v.Op = OpAMD64VPBROADCASTD128 + case 
OpBroadcast1To16Int8x16: + v.Op = OpAMD64VPBROADCASTB128 return true - case OpBroadcast128Int64x2: - v.Op = OpAMD64VPBROADCASTQ128 + case OpBroadcast1To16Uint16x8: + v.Op = OpAMD64VPBROADCASTW256 return true - case OpBroadcast128Int8x16: + case OpBroadcast1To16Uint32x4: + v.Op = OpAMD64VPBROADCASTD512 + return true + case OpBroadcast1To16Uint8x16: v.Op = OpAMD64VPBROADCASTB128 return true - case OpBroadcast128Uint16x8: - v.Op = OpAMD64VPBROADCASTW128 + case OpBroadcast1To2Float64x2: + v.Op = OpAMD64VPBROADCASTQ128 return true - case OpBroadcast128Uint32x4: - v.Op = OpAMD64VPBROADCASTD128 + case OpBroadcast1To2Int64x2: + v.Op = OpAMD64VPBROADCASTQ128 return true - case OpBroadcast128Uint64x2: + case OpBroadcast1To2Uint64x2: v.Op = OpAMD64VPBROADCASTQ128 return true - case OpBroadcast128Uint8x16: - v.Op = OpAMD64VPBROADCASTB128 + case OpBroadcast1To32Int16x8: + v.Op = OpAMD64VPBROADCASTW512 return true - case OpBroadcast256Float32x4: - v.Op = OpAMD64VBROADCASTSS256 + case OpBroadcast1To32Int8x16: + v.Op = OpAMD64VPBROADCASTB256 return true - case OpBroadcast256Float64x2: - v.Op = OpAMD64VBROADCASTSD256 + case OpBroadcast1To32Uint16x8: + v.Op = OpAMD64VPBROADCASTW512 return true - case OpBroadcast256Int16x8: - v.Op = OpAMD64VPBROADCASTW256 + case OpBroadcast1To32Uint8x16: + v.Op = OpAMD64VPBROADCASTB256 return true - case OpBroadcast256Int32x4: - v.Op = OpAMD64VPBROADCASTD256 + case OpBroadcast1To4Float32x4: + v.Op = OpAMD64VBROADCASTSS128 return true - case OpBroadcast256Int64x2: - v.Op = OpAMD64VPBROADCASTQ256 + case OpBroadcast1To4Float64x2: + v.Op = OpAMD64VBROADCASTSD256 return true - case OpBroadcast256Int8x16: - v.Op = OpAMD64VPBROADCASTB256 + case OpBroadcast1To4Int32x4: + v.Op = OpAMD64VPBROADCASTD128 return true - case OpBroadcast256Uint16x8: - v.Op = OpAMD64VPBROADCASTW256 + case OpBroadcast1To4Int64x2: + v.Op = OpAMD64VPBROADCASTQ256 return true - case OpBroadcast256Uint32x4: - v.Op = OpAMD64VPBROADCASTD256 + case OpBroadcast1To4Uint32x4: + v.Op = 
OpAMD64VPBROADCASTD128 return true - case OpBroadcast256Uint64x2: + case OpBroadcast1To4Uint64x2: v.Op = OpAMD64VPBROADCASTQ256 return true - case OpBroadcast256Uint8x16: - v.Op = OpAMD64VPBROADCASTB256 + case OpBroadcast1To64Int8x16: + v.Op = OpAMD64VPBROADCASTB512 return true - case OpBroadcast512Float32x4: - v.Op = OpAMD64VBROADCASTSS512 + case OpBroadcast1To64Uint8x16: + v.Op = OpAMD64VPBROADCASTB512 + return true + case OpBroadcast1To8Float32x4: + v.Op = OpAMD64VBROADCASTSS256 return true - case OpBroadcast512Float64x2: + case OpBroadcast1To8Float64x2: v.Op = OpAMD64VBROADCASTSD512 return true - case OpBroadcast512Int16x8: - v.Op = OpAMD64VPBROADCASTW512 + case OpBroadcast1To8Int16x8: + v.Op = OpAMD64VPBROADCASTW128 return true - case OpBroadcast512Int32x4: - v.Op = OpAMD64VPBROADCASTD512 + case OpBroadcast1To8Int32x4: + v.Op = OpAMD64VPBROADCASTD256 return true - case OpBroadcast512Int64x2: + case OpBroadcast1To8Int64x2: v.Op = OpAMD64VPBROADCASTQ512 return true - case OpBroadcast512Int8x16: - v.Op = OpAMD64VPBROADCASTB512 - return true - case OpBroadcast512Uint16x8: - v.Op = OpAMD64VPBROADCASTW512 + case OpBroadcast1To8Uint16x8: + v.Op = OpAMD64VPBROADCASTW128 return true - case OpBroadcast512Uint32x4: - v.Op = OpAMD64VPBROADCASTD512 + case OpBroadcast1To8Uint32x4: + v.Op = OpAMD64VPBROADCASTD256 return true - case OpBroadcast512Uint64x2: + case OpBroadcast1To8Uint64x2: v.Op = OpAMD64VPBROADCASTQ512 return true - case OpBroadcast512Uint8x16: - v.Op = OpAMD64VPBROADCASTB512 - return true case OpBswap16: return rewriteValueAMD64_OpBswap16(v) case OpBswap32: diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 4ad0c6032c..e50561845b 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -152,36 +152,36 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Uint16x8.Average", opLen2(ssa.OpAverageUint16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint16x16.Average", opLen2(ssa.OpAverageUint16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint16x32.Average", opLen2(ssa.OpAverageUint16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float32x4.Broadcast128", opLen1(ssa.OpBroadcast128Float32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x2.Broadcast128", opLen1(ssa.OpBroadcast128Float64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x16.Broadcast128", opLen1(ssa.OpBroadcast128Int8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x8.Broadcast128", opLen1(ssa.OpBroadcast128Int16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x4.Broadcast128", opLen1(ssa.OpBroadcast128Int32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int64x2.Broadcast128", opLen1(ssa.OpBroadcast128Int64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint8x16.Broadcast128", opLen1(ssa.OpBroadcast128Uint8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint16x8.Broadcast128", opLen1(ssa.OpBroadcast128Uint16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint32x4.Broadcast128", opLen1(ssa.OpBroadcast128Uint32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint64x2.Broadcast128", opLen1(ssa.OpBroadcast128Uint64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x4.Broadcast256", opLen1(ssa.OpBroadcast256Float32x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float64x2.Broadcast256", opLen1(ssa.OpBroadcast256Float64x2, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x16.Broadcast256", opLen1(ssa.OpBroadcast256Int8x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x8.Broadcast256", opLen1(ssa.OpBroadcast256Int16x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x4.Broadcast256", opLen1(ssa.OpBroadcast256Int32x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int64x2.Broadcast256", 
opLen1(ssa.OpBroadcast256Int64x2, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint8x16.Broadcast256", opLen1(ssa.OpBroadcast256Uint8x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint16x8.Broadcast256", opLen1(ssa.OpBroadcast256Uint16x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint32x4.Broadcast256", opLen1(ssa.OpBroadcast256Uint32x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint64x2.Broadcast256", opLen1(ssa.OpBroadcast256Uint64x2, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x4.Broadcast512", opLen1(ssa.OpBroadcast512Float32x4, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float64x2.Broadcast512", opLen1(ssa.OpBroadcast512Float64x2, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int8x16.Broadcast512", opLen1(ssa.OpBroadcast512Int8x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.Broadcast512", opLen1(ssa.OpBroadcast512Int16x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int32x4.Broadcast512", opLen1(ssa.OpBroadcast512Int32x4, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int64x2.Broadcast512", opLen1(ssa.OpBroadcast512Int64x2, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint8x16.Broadcast512", opLen1(ssa.OpBroadcast512Uint8x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint16x8.Broadcast512", opLen1(ssa.OpBroadcast512Uint16x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint32x4.Broadcast512", opLen1(ssa.OpBroadcast512Uint32x4, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint64x2.Broadcast512", opLen1(ssa.OpBroadcast512Uint64x2, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float64x2.Broadcast1To2", opLen1(ssa.OpBroadcast1To2Float64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int64x2.Broadcast1To2", opLen1(ssa.OpBroadcast1To2Int64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint64x2.Broadcast1To2", opLen1(ssa.OpBroadcast1To2Uint64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x4.Broadcast1To4", 
opLen1(ssa.OpBroadcast1To4Float32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x2.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Float64x2, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x4.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Int32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int64x2.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Int64x2, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x4.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Uint32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint64x2.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Uint64x2, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x4.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Float32x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x2.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Float64x2, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Int16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x4.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Int32x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int64x2.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Int64x2, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint16x8.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Uint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x4.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Uint32x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint64x2.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Uint64x2, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Float32x4, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Int8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x8.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Int16x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x4.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Int32x4, types.TypeVec512), sys.AMD64) + addF(simdPackage, 
"Uint8x16.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Uint8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x8.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Uint16x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x4.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Uint32x4, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.Broadcast1To32", opLen1(ssa.OpBroadcast1To32Int8x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x8.Broadcast1To32", opLen1(ssa.OpBroadcast1To32Int16x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint8x16.Broadcast1To32", opLen1(ssa.OpBroadcast1To32Uint8x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x8.Broadcast1To32", opLen1(ssa.OpBroadcast1To32Uint16x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.Broadcast1To64", opLen1(ssa.OpBroadcast1To64Int8x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint8x16.Broadcast1To64", opLen1(ssa.OpBroadcast1To64Uint8x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.Ceil", opLen1(ssa.OpCeilFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.Ceil", opLen1(ssa.OpCeilFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x2.Ceil", opLen1(ssa.OpCeilFloat64x2, types.TypeVec128), sys.AMD64) diff --git a/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml index 38bc9374cc..3cba01ef95 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml @@ -69,21 +69,36 @@ documentation: !string |- // NAME performs an expansion on a vector x whose elements are packed to lower parts. // The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order. 
-- go: Broadcast128 +- go: Broadcast1To2 commutative: false documentation: !string |- - // NAME copies element zero of its (128-bit) input to all elements of - // the 128-bit output vector. -- go: Broadcast256 + // NAME copies the lowest element of its input to all 2 elements of + // the output vector. +- go: Broadcast1To4 commutative: false documentation: !string |- - // NAME copies element zero of its (128-bit) input to all elements of - // the 256-bit output vector. -- go: Broadcast512 + // NAME copies the lowest element of its input to all 4 elements of + // the output vector. +- go: Broadcast1To8 commutative: false documentation: !string |- - // NAME copies element zero of its (128-bit) input to all elements of - // the 512-bit output vector. + // NAME copies the lowest element of its input to all 8 elements of + // the output vector. +- go: Broadcast1To16 + commutative: false + documentation: !string |- + // NAME copies the lowest element of its input to all 16 elements of + // the output vector. +- go: Broadcast1To32 + commutative: false + documentation: !string |- + // NAME copies the lowest element of its input to all 32 elements of + // the output vector. +- go: Broadcast1To64 + commutative: false + documentation: !string |- + // NAME copies the lowest element of its input to all 64 elements of + // the output vector. - go: PermuteOrZeroGrouped commutative: false documentation: !string |- # Detailed documentation will rely on the specific ops. 
diff --git a/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml index e1fd184ed7..02daa2ea1e 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml @@ -376,21 +376,21 @@ out: - *any -- go: Broadcast128 - asm: VPBROADCAST[BWDQ] +- go: Broadcast1To2 + asm: VPBROADCASTQ in: - class: vreg bits: 128 - elemBits: $e + elemBits: 64 base: $b out: - class: vreg bits: 128 - elemBits: $e + elemBits: 64 base: $b # weirdly, this one case on AVX2 is memory-operand-only -- go: Broadcast128 +- go: Broadcast1To2 asm: VPBROADCASTQ in: - class: vreg @@ -405,71 +405,94 @@ base: int OverwriteBase: float -- go: Broadcast256 +- go: Broadcast1To4 asm: VPBROADCAST[BWDQ] in: - class: vreg bits: 128 - elemBits: $e base: $b out: - class: vreg - bits: 256 - elemBits: $e + lanes: 4 base: $b -- go: Broadcast512 +- go: Broadcast1To8 asm: VPBROADCAST[BWDQ] in: - class: vreg bits: 128 - elemBits: $e base: $b out: - class: vreg - bits: 512 - elemBits: $e + lanes: 8 base: $b -- go: Broadcast128 - asm: VBROADCASTS[SD] +- go: Broadcast1To16 + asm: VPBROADCAST[BWDQ] in: - class: vreg bits: 128 - elemBits: $e base: $b out: - class: vreg - bits: 128 - elemBits: $e + lanes: 16 base: $b -- go: Broadcast256 - asm: VBROADCASTS[SD] +- go: Broadcast1To32 + asm: VPBROADCAST[BWDQ] in: - class: vreg bits: 128 - elemBits: $e base: $b out: - class: vreg - bits: 256 - elemBits: $e + lanes: 32 base: $b -- go: Broadcast512 - asm: VBROADCASTS[SD] +- go: Broadcast1To64 + asm: VPBROADCASTB in: - class: vreg bits: 128 - elemBits: $e base: $b out: - class: vreg - bits: 512 - elemBits: $e + lanes: 64 base: $b +- go: Broadcast1To4 + asm: VBROADCASTS[SD] + in: + - class: vreg + bits: 128 + base: float + out: + - class: vreg + lanes: 4 + base: float + +- go: Broadcast1To8 + asm: VBROADCASTS[SD] + in: + - class: vreg + bits: 128 + base: float + out: + - class: vreg + lanes: 8 + base: float + +- go: Broadcast1To16 + asm: 
VBROADCASTS[SD] + in: + - class: vreg + bits: 128 + base: float + out: + - class: vreg + lanes: 16 + base: float + # VPSHUFB for 128-bit byte shuffles will be picked with higher priority than VPERMB, given its lower CPU feature requirement. (It's AVX) - go: PermuteOrZero asm: VPSHUFB diff --git a/src/simd/archsimd/_gen/tmplgen/main.go b/src/simd/archsimd/_gen/tmplgen/main.go index 8db185e1e0..45338b765d 100644 --- a/src/simd/archsimd/_gen/tmplgen/main.go +++ b/src/simd/archsimd/_gen/tmplgen/main.go @@ -873,7 +873,7 @@ var broadcastTemplate = templateOf("Broadcast functions", ` // Emulated, CPU Feature: {{.CPUfeatureBC}} func Broadcast{{.VType}}(x {{.Etype}}) {{.VType}} { var z {{.As128BitVec }} - return z.SetElem(0, x).Broadcast{{.Vwidth}}() + return z.SetElem(0, x).Broadcast1To{{.Count}}() } `) diff --git a/src/simd/archsimd/ops_amd64.go b/src/simd/archsimd/ops_amd64.go index eba340c793..bb162c4ff9 100644 --- a/src/simd/archsimd/ops_amd64.go +++ b/src/simd/archsimd/ops_amd64.go @@ -805,191 +805,197 @@ func (x Uint16x16) Average(y Uint16x16) Uint16x16 // Asm: VPAVGW, CPU Feature: AVX512 func (x Uint16x32) Average(y Uint16x32) Uint16x32 -/* Broadcast128 */ +/* Broadcast1To2 */ -// Broadcast128 copies element zero of its (128-bit) input to all elements of -// the 128-bit output vector. +// Broadcast1To2 copies the lowest element of its input to all 2 elements of +// the output vector. // -// Asm: VBROADCASTSS, CPU Feature: AVX2 -func (x Float32x4) Broadcast128() Float32x4 +// Asm: VPBROADCASTQ, CPU Feature: AVX2 +func (x Float64x2) Broadcast1To2() Float64x2 -// Broadcast128 copies element zero of its (128-bit) input to all elements of -// the 128-bit output vector. +// Broadcast1To2 copies the lowest element of its input to all 2 elements of +// the output vector. 
// // Asm: VPBROADCASTQ, CPU Feature: AVX2 -func (x Float64x2) Broadcast128() Float64x2 +func (x Int64x2) Broadcast1To2() Int64x2 -// Broadcast128 copies element zero of its (128-bit) input to all elements of -// the 128-bit output vector. +// Broadcast1To2 copies the lowest element of its input to all 2 elements of +// the output vector. // -// Asm: VPBROADCASTB, CPU Feature: AVX2 -func (x Int8x16) Broadcast128() Int8x16 +// Asm: VPBROADCASTQ, CPU Feature: AVX2 +func (x Uint64x2) Broadcast1To2() Uint64x2 -// Broadcast128 copies element zero of its (128-bit) input to all elements of -// the 128-bit output vector. -// -// Asm: VPBROADCASTW, CPU Feature: AVX2 -func (x Int16x8) Broadcast128() Int16x8 +/* Broadcast1To4 */ -// Broadcast128 copies element zero of its (128-bit) input to all elements of -// the 128-bit output vector. +// Broadcast1To4 copies the lowest element of its input to all 4 elements of +// the output vector. // -// Asm: VPBROADCASTD, CPU Feature: AVX2 -func (x Int32x4) Broadcast128() Int32x4 +// Asm: VBROADCASTSS, CPU Feature: AVX2 +func (x Float32x4) Broadcast1To4() Float32x4 -// Broadcast128 copies element zero of its (128-bit) input to all elements of -// the 128-bit output vector. +// Broadcast1To4 copies the lowest element of its input to all 4 elements of +// the output vector. // -// Asm: VPBROADCASTQ, CPU Feature: AVX2 -func (x Int64x2) Broadcast128() Int64x2 +// Asm: VBROADCASTSD, CPU Feature: AVX2 +func (x Float64x2) Broadcast1To4() Float64x4 -// Broadcast128 copies element zero of its (128-bit) input to all elements of -// the 128-bit output vector. +// Broadcast1To4 copies the lowest element of its input to all 4 elements of +// the output vector. // -// Asm: VPBROADCASTB, CPU Feature: AVX2 -func (x Uint8x16) Broadcast128() Uint8x16 +// Asm: VPBROADCASTD, CPU Feature: AVX2 +func (x Int32x4) Broadcast1To4() Int32x4 -// Broadcast128 copies element zero of its (128-bit) input to all elements of -// the 128-bit output vector. 
+// Broadcast1To4 copies the lowest element of its input to all 4 elements of +// the output vector. // -// Asm: VPBROADCASTW, CPU Feature: AVX2 -func (x Uint16x8) Broadcast128() Uint16x8 +// Asm: VPBROADCASTQ, CPU Feature: AVX2 +func (x Int64x2) Broadcast1To4() Int64x4 -// Broadcast128 copies element zero of its (128-bit) input to all elements of -// the 128-bit output vector. +// Broadcast1To4 copies the lowest element of its input to all 4 elements of +// the output vector. // // Asm: VPBROADCASTD, CPU Feature: AVX2 -func (x Uint32x4) Broadcast128() Uint32x4 +func (x Uint32x4) Broadcast1To4() Uint32x4 -// Broadcast128 copies element zero of its (128-bit) input to all elements of -// the 128-bit output vector. +// Broadcast1To4 copies the lowest element of its input to all 4 elements of +// the output vector. // // Asm: VPBROADCASTQ, CPU Feature: AVX2 -func (x Uint64x2) Broadcast128() Uint64x2 +func (x Uint64x2) Broadcast1To4() Uint64x4 -/* Broadcast256 */ +/* Broadcast1To8 */ -// Broadcast256 copies element zero of its (128-bit) input to all elements of -// the 256-bit output vector. +// Broadcast1To8 copies the lowest element of its input to all 8 elements of +// the output vector. // // Asm: VBROADCASTSS, CPU Feature: AVX2 -func (x Float32x4) Broadcast256() Float32x8 - -// Broadcast256 copies element zero of its (128-bit) input to all elements of -// the 256-bit output vector. -// -// Asm: VBROADCASTSD, CPU Feature: AVX2 -func (x Float64x2) Broadcast256() Float64x4 +func (x Float32x4) Broadcast1To8() Float32x8 -// Broadcast256 copies element zero of its (128-bit) input to all elements of -// the 256-bit output vector. +// Broadcast1To8 copies the lowest element of its input to all 8 elements of +// the output vector. 
// -// Asm: VPBROADCASTB, CPU Feature: AVX2 -func (x Int8x16) Broadcast256() Int8x32 +// Asm: VBROADCASTSD, CPU Feature: AVX512 +func (x Float64x2) Broadcast1To8() Float64x8 -// Broadcast256 copies element zero of its (128-bit) input to all elements of -// the 256-bit output vector. +// Broadcast1To8 copies the lowest element of its input to all 8 elements of +// the output vector. // // Asm: VPBROADCASTW, CPU Feature: AVX2 -func (x Int16x8) Broadcast256() Int16x16 +func (x Int16x8) Broadcast1To8() Int16x8 -// Broadcast256 copies element zero of its (128-bit) input to all elements of -// the 256-bit output vector. +// Broadcast1To8 copies the lowest element of its input to all 8 elements of +// the output vector. // // Asm: VPBROADCASTD, CPU Feature: AVX2 -func (x Int32x4) Broadcast256() Int32x8 +func (x Int32x4) Broadcast1To8() Int32x8 -// Broadcast256 copies element zero of its (128-bit) input to all elements of -// the 256-bit output vector. +// Broadcast1To8 copies the lowest element of its input to all 8 elements of +// the output vector. // -// Asm: VPBROADCASTQ, CPU Feature: AVX2 -func (x Int64x2) Broadcast256() Int64x4 - -// Broadcast256 copies element zero of its (128-bit) input to all elements of -// the 256-bit output vector. -// -// Asm: VPBROADCASTB, CPU Feature: AVX2 -func (x Uint8x16) Broadcast256() Uint8x32 +// Asm: VPBROADCASTQ, CPU Feature: AVX512 +func (x Int64x2) Broadcast1To8() Int64x8 -// Broadcast256 copies element zero of its (128-bit) input to all elements of -// the 256-bit output vector. +// Broadcast1To8 copies the lowest element of its input to all 8 elements of +// the output vector. // // Asm: VPBROADCASTW, CPU Feature: AVX2 -func (x Uint16x8) Broadcast256() Uint16x16 +func (x Uint16x8) Broadcast1To8() Uint16x8 -// Broadcast256 copies element zero of its (128-bit) input to all elements of -// the 256-bit output vector. +// Broadcast1To8 copies the lowest element of its input to all 8 elements of +// the output vector. 
// // Asm: VPBROADCASTD, CPU Feature: AVX2 -func (x Uint32x4) Broadcast256() Uint32x8 +func (x Uint32x4) Broadcast1To8() Uint32x8 -// Broadcast256 copies element zero of its (128-bit) input to all elements of -// the 256-bit output vector. +// Broadcast1To8 copies the lowest element of its input to all 8 elements of +// the output vector. // -// Asm: VPBROADCASTQ, CPU Feature: AVX2 -func (x Uint64x2) Broadcast256() Uint64x4 +// Asm: VPBROADCASTQ, CPU Feature: AVX512 +func (x Uint64x2) Broadcast1To8() Uint64x8 -/* Broadcast512 */ +/* Broadcast1To16 */ -// Broadcast512 copies element zero of its (128-bit) input to all elements of -// the 512-bit output vector. +// Broadcast1To16 copies the lowest element of its input to all 16 elements of +// the output vector. // // Asm: VBROADCASTSS, CPU Feature: AVX512 -func (x Float32x4) Broadcast512() Float32x16 +func (x Float32x4) Broadcast1To16() Float32x16 -// Broadcast512 copies element zero of its (128-bit) input to all elements of -// the 512-bit output vector. +// Broadcast1To16 copies the lowest element of its input to all 16 elements of +// the output vector. // -// Asm: VBROADCASTSD, CPU Feature: AVX512 -func (x Float64x2) Broadcast512() Float64x8 +// Asm: VPBROADCASTB, CPU Feature: AVX2 +func (x Int8x16) Broadcast1To16() Int8x16 -// Broadcast512 copies element zero of its (128-bit) input to all elements of -// the 512-bit output vector. +// Broadcast1To16 copies the lowest element of its input to all 16 elements of +// the output vector. // -// Asm: VPBROADCASTB, CPU Feature: AVX512 -func (x Int8x16) Broadcast512() Int8x64 +// Asm: VPBROADCASTW, CPU Feature: AVX2 +func (x Int16x8) Broadcast1To16() Int16x16 -// Broadcast512 copies element zero of its (128-bit) input to all elements of -// the 512-bit output vector. +// Broadcast1To16 copies the lowest element of its input to all 16 elements of +// the output vector. 
// -// Asm: VPBROADCASTW, CPU Feature: AVX512 -func (x Int16x8) Broadcast512() Int16x32 +// Asm: VPBROADCASTD, CPU Feature: AVX512 +func (x Int32x4) Broadcast1To16() Int32x16 -// Broadcast512 copies element zero of its (128-bit) input to all elements of -// the 512-bit output vector. +// Broadcast1To16 copies the lowest element of its input to all 16 elements of +// the output vector. +// +// Asm: VPBROADCASTB, CPU Feature: AVX2 +func (x Uint8x16) Broadcast1To16() Uint8x16 + +// Broadcast1To16 copies the lowest element of its input to all 16 elements of +// the output vector. +// +// Asm: VPBROADCASTW, CPU Feature: AVX2 +func (x Uint16x8) Broadcast1To16() Uint16x16 + +// Broadcast1To16 copies the lowest element of its input to all 16 elements of +// the output vector. // // Asm: VPBROADCASTD, CPU Feature: AVX512 -func (x Int32x4) Broadcast512() Int32x16 +func (x Uint32x4) Broadcast1To16() Uint32x16 + +/* Broadcast1To32 */ -// Broadcast512 copies element zero of its (128-bit) input to all elements of -// the 512-bit output vector. +// Broadcast1To32 copies the lowest element of its input to all 32 elements of +// the output vector. // -// Asm: VPBROADCASTQ, CPU Feature: AVX512 -func (x Int64x2) Broadcast512() Int64x8 +// Asm: VPBROADCASTB, CPU Feature: AVX2 +func (x Int8x16) Broadcast1To32() Int8x32 -// Broadcast512 copies element zero of its (128-bit) input to all elements of -// the 512-bit output vector. +// Broadcast1To32 copies the lowest element of its input to all 32 elements of +// the output vector. // -// Asm: VPBROADCASTB, CPU Feature: AVX512 -func (x Uint8x16) Broadcast512() Uint8x64 +// Asm: VPBROADCASTW, CPU Feature: AVX512 +func (x Int16x8) Broadcast1To32() Int16x32 + +// Broadcast1To32 copies the lowest element of its input to all 32 elements of +// the output vector. 
+// +// Asm: VPBROADCASTB, CPU Feature: AVX2 +func (x Uint8x16) Broadcast1To32() Uint8x32 -// Broadcast512 copies element zero of its (128-bit) input to all elements of -// the 512-bit output vector. +// Broadcast1To32 copies the lowest element of its input to all 32 elements of +// the output vector. // // Asm: VPBROADCASTW, CPU Feature: AVX512 -func (x Uint16x8) Broadcast512() Uint16x32 +func (x Uint16x8) Broadcast1To32() Uint16x32 + +/* Broadcast1To64 */ -// Broadcast512 copies element zero of its (128-bit) input to all elements of -// the 512-bit output vector. +// Broadcast1To64 copies the lowest element of its input to all 64 elements of +// the output vector. // -// Asm: VPBROADCASTD, CPU Feature: AVX512 -func (x Uint32x4) Broadcast512() Uint32x16 +// Asm: VPBROADCASTB, CPU Feature: AVX512 +func (x Int8x16) Broadcast1To64() Int8x64 -// Broadcast512 copies element zero of its (128-bit) input to all elements of -// the 512-bit output vector. +// Broadcast1To64 copies the lowest element of its input to all 64 elements of +// the output vector. 
// -// Asm: VPBROADCASTQ, CPU Feature: AVX512 -func (x Uint64x2) Broadcast512() Uint64x8 +// Asm: VPBROADCASTB, CPU Feature: AVX512 +func (x Uint8x16) Broadcast1To64() Uint8x64 /* Ceil */ diff --git a/src/simd/archsimd/other_gen_amd64.go b/src/simd/archsimd/other_gen_amd64.go index 647001acce..c250dc2436 100644 --- a/src/simd/archsimd/other_gen_amd64.go +++ b/src/simd/archsimd/other_gen_amd64.go @@ -10,7 +10,7 @@ package archsimd // Emulated, CPU Feature: AVX2 func BroadcastInt8x16(x int8) Int8x16 { var z Int8x16 - return z.SetElem(0, x).Broadcast128() + return z.SetElem(0, x).Broadcast1To16() } // BroadcastInt16x8 returns a vector with the input @@ -19,7 +19,7 @@ func BroadcastInt8x16(x int8) Int8x16 { // Emulated, CPU Feature: AVX2 func BroadcastInt16x8(x int16) Int16x8 { var z Int16x8 - return z.SetElem(0, x).Broadcast128() + return z.SetElem(0, x).Broadcast1To8() } // BroadcastInt32x4 returns a vector with the input @@ -28,7 +28,7 @@ func BroadcastInt16x8(x int16) Int16x8 { // Emulated, CPU Feature: AVX2 func BroadcastInt32x4(x int32) Int32x4 { var z Int32x4 - return z.SetElem(0, x).Broadcast128() + return z.SetElem(0, x).Broadcast1To4() } // BroadcastInt64x2 returns a vector with the input @@ -37,7 +37,7 @@ func BroadcastInt32x4(x int32) Int32x4 { // Emulated, CPU Feature: AVX2 func BroadcastInt64x2(x int64) Int64x2 { var z Int64x2 - return z.SetElem(0, x).Broadcast128() + return z.SetElem(0, x).Broadcast1To2() } // BroadcastUint8x16 returns a vector with the input @@ -46,7 +46,7 @@ func BroadcastInt64x2(x int64) Int64x2 { // Emulated, CPU Feature: AVX2 func BroadcastUint8x16(x uint8) Uint8x16 { var z Uint8x16 - return z.SetElem(0, x).Broadcast128() + return z.SetElem(0, x).Broadcast1To16() } // BroadcastUint16x8 returns a vector with the input @@ -55,7 +55,7 @@ func BroadcastUint8x16(x uint8) Uint8x16 { // Emulated, CPU Feature: AVX2 func BroadcastUint16x8(x uint16) Uint16x8 { var z Uint16x8 - return z.SetElem(0, x).Broadcast128() + return z.SetElem(0, 
x).Broadcast1To8() } // BroadcastUint32x4 returns a vector with the input @@ -64,7 +64,7 @@ func BroadcastUint16x8(x uint16) Uint16x8 { // Emulated, CPU Feature: AVX2 func BroadcastUint32x4(x uint32) Uint32x4 { var z Uint32x4 - return z.SetElem(0, x).Broadcast128() + return z.SetElem(0, x).Broadcast1To4() } // BroadcastUint64x2 returns a vector with the input @@ -73,7 +73,7 @@ func BroadcastUint32x4(x uint32) Uint32x4 { // Emulated, CPU Feature: AVX2 func BroadcastUint64x2(x uint64) Uint64x2 { var z Uint64x2 - return z.SetElem(0, x).Broadcast128() + return z.SetElem(0, x).Broadcast1To2() } // BroadcastFloat32x4 returns a vector with the input @@ -82,7 +82,7 @@ func BroadcastUint64x2(x uint64) Uint64x2 { // Emulated, CPU Feature: AVX2 func BroadcastFloat32x4(x float32) Float32x4 { var z Float32x4 - return z.SetElem(0, x).Broadcast128() + return z.SetElem(0, x).Broadcast1To4() } // BroadcastFloat64x2 returns a vector with the input @@ -91,7 +91,7 @@ func BroadcastFloat32x4(x float32) Float32x4 { // Emulated, CPU Feature: AVX2 func BroadcastFloat64x2(x float64) Float64x2 { var z Float64x2 - return z.SetElem(0, x).Broadcast128() + return z.SetElem(0, x).Broadcast1To2() } // BroadcastInt8x32 returns a vector with the input @@ -100,7 +100,7 @@ func BroadcastFloat64x2(x float64) Float64x2 { // Emulated, CPU Feature: AVX2 func BroadcastInt8x32(x int8) Int8x32 { var z Int8x16 - return z.SetElem(0, x).Broadcast256() + return z.SetElem(0, x).Broadcast1To32() } // BroadcastInt16x16 returns a vector with the input @@ -109,7 +109,7 @@ func BroadcastInt8x32(x int8) Int8x32 { // Emulated, CPU Feature: AVX2 func BroadcastInt16x16(x int16) Int16x16 { var z Int16x8 - return z.SetElem(0, x).Broadcast256() + return z.SetElem(0, x).Broadcast1To16() } // BroadcastInt32x8 returns a vector with the input @@ -118,7 +118,7 @@ func BroadcastInt16x16(x int16) Int16x16 { // Emulated, CPU Feature: AVX2 func BroadcastInt32x8(x int32) Int32x8 { var z Int32x4 - return z.SetElem(0, x).Broadcast256() 
+ return z.SetElem(0, x).Broadcast1To8() } // BroadcastInt64x4 returns a vector with the input @@ -127,7 +127,7 @@ func BroadcastInt32x8(x int32) Int32x8 { // Emulated, CPU Feature: AVX2 func BroadcastInt64x4(x int64) Int64x4 { var z Int64x2 - return z.SetElem(0, x).Broadcast256() + return z.SetElem(0, x).Broadcast1To4() } // BroadcastUint8x32 returns a vector with the input @@ -136,7 +136,7 @@ func BroadcastInt64x4(x int64) Int64x4 { // Emulated, CPU Feature: AVX2 func BroadcastUint8x32(x uint8) Uint8x32 { var z Uint8x16 - return z.SetElem(0, x).Broadcast256() + return z.SetElem(0, x).Broadcast1To32() } // BroadcastUint16x16 returns a vector with the input @@ -145,7 +145,7 @@ func BroadcastUint8x32(x uint8) Uint8x32 { // Emulated, CPU Feature: AVX2 func BroadcastUint16x16(x uint16) Uint16x16 { var z Uint16x8 - return z.SetElem(0, x).Broadcast256() + return z.SetElem(0, x).Broadcast1To16() } // BroadcastUint32x8 returns a vector with the input @@ -154,7 +154,7 @@ func BroadcastUint16x16(x uint16) Uint16x16 { // Emulated, CPU Feature: AVX2 func BroadcastUint32x8(x uint32) Uint32x8 { var z Uint32x4 - return z.SetElem(0, x).Broadcast256() + return z.SetElem(0, x).Broadcast1To8() } // BroadcastUint64x4 returns a vector with the input @@ -163,7 +163,7 @@ func BroadcastUint32x8(x uint32) Uint32x8 { // Emulated, CPU Feature: AVX2 func BroadcastUint64x4(x uint64) Uint64x4 { var z Uint64x2 - return z.SetElem(0, x).Broadcast256() + return z.SetElem(0, x).Broadcast1To4() } // BroadcastFloat32x8 returns a vector with the input @@ -172,7 +172,7 @@ func BroadcastUint64x4(x uint64) Uint64x4 { // Emulated, CPU Feature: AVX2 func BroadcastFloat32x8(x float32) Float32x8 { var z Float32x4 - return z.SetElem(0, x).Broadcast256() + return z.SetElem(0, x).Broadcast1To8() } // BroadcastFloat64x4 returns a vector with the input @@ -181,7 +181,7 @@ func BroadcastFloat32x8(x float32) Float32x8 { // Emulated, CPU Feature: AVX2 func BroadcastFloat64x4(x float64) Float64x4 { var z Float64x2 - 
return z.SetElem(0, x).Broadcast256() + return z.SetElem(0, x).Broadcast1To4() } // BroadcastInt8x64 returns a vector with the input @@ -190,7 +190,7 @@ func BroadcastFloat64x4(x float64) Float64x4 { // Emulated, CPU Feature: AVX512BW func BroadcastInt8x64(x int8) Int8x64 { var z Int8x16 - return z.SetElem(0, x).Broadcast512() + return z.SetElem(0, x).Broadcast1To64() } // BroadcastInt16x32 returns a vector with the input @@ -199,7 +199,7 @@ func BroadcastInt8x64(x int8) Int8x64 { // Emulated, CPU Feature: AVX512BW func BroadcastInt16x32(x int16) Int16x32 { var z Int16x8 - return z.SetElem(0, x).Broadcast512() + return z.SetElem(0, x).Broadcast1To32() } // BroadcastInt32x16 returns a vector with the input @@ -208,7 +208,7 @@ func BroadcastInt16x32(x int16) Int16x32 { // Emulated, CPU Feature: AVX512F func BroadcastInt32x16(x int32) Int32x16 { var z Int32x4 - return z.SetElem(0, x).Broadcast512() + return z.SetElem(0, x).Broadcast1To16() } // BroadcastInt64x8 returns a vector with the input @@ -217,7 +217,7 @@ func BroadcastInt32x16(x int32) Int32x16 { // Emulated, CPU Feature: AVX512F func BroadcastInt64x8(x int64) Int64x8 { var z Int64x2 - return z.SetElem(0, x).Broadcast512() + return z.SetElem(0, x).Broadcast1To8() } // BroadcastUint8x64 returns a vector with the input @@ -226,7 +226,7 @@ func BroadcastInt64x8(x int64) Int64x8 { // Emulated, CPU Feature: AVX512BW func BroadcastUint8x64(x uint8) Uint8x64 { var z Uint8x16 - return z.SetElem(0, x).Broadcast512() + return z.SetElem(0, x).Broadcast1To64() } // BroadcastUint16x32 returns a vector with the input @@ -235,7 +235,7 @@ func BroadcastUint8x64(x uint8) Uint8x64 { // Emulated, CPU Feature: AVX512BW func BroadcastUint16x32(x uint16) Uint16x32 { var z Uint16x8 - return z.SetElem(0, x).Broadcast512() + return z.SetElem(0, x).Broadcast1To32() } // BroadcastUint32x16 returns a vector with the input @@ -244,7 +244,7 @@ func BroadcastUint16x32(x uint16) Uint16x32 { // Emulated, CPU Feature: AVX512F func 
BroadcastUint32x16(x uint32) Uint32x16 { var z Uint32x4 - return z.SetElem(0, x).Broadcast512() + return z.SetElem(0, x).Broadcast1To16() } // BroadcastUint64x8 returns a vector with the input @@ -253,7 +253,7 @@ func BroadcastUint32x16(x uint32) Uint32x16 { // Emulated, CPU Feature: AVX512F func BroadcastUint64x8(x uint64) Uint64x8 { var z Uint64x2 - return z.SetElem(0, x).Broadcast512() + return z.SetElem(0, x).Broadcast1To8() } // BroadcastFloat32x16 returns a vector with the input @@ -262,7 +262,7 @@ func BroadcastUint64x8(x uint64) Uint64x8 { // Emulated, CPU Feature: AVX512F func BroadcastFloat32x16(x float32) Float32x16 { var z Float32x4 - return z.SetElem(0, x).Broadcast512() + return z.SetElem(0, x).Broadcast1To16() } // BroadcastFloat64x8 returns a vector with the input @@ -271,7 +271,7 @@ func BroadcastFloat32x16(x float32) Float32x16 { // Emulated, CPU Feature: AVX512F func BroadcastFloat64x8(x float64) Float64x8 { var z Float64x2 - return z.SetElem(0, x).Broadcast512() + return z.SetElem(0, x).Broadcast1To8() } // ToMask converts from Int8x16 to Mask8x16, mask element is set to true when the corresponding vector element is non-zero. -- 2.52.0