From 3f92aa1ecae1f935731cffefcfe3a400e284ab82 Mon Sep 17 00:00:00 2001 From: Junyang Shao Date: Fri, 1 Aug 2025 19:13:13 +0000 Subject: [PATCH] [dev.simd] cmd/compile, simd: make bitwise logic ops available to all u?int vectors This CL is generated by CL 692555. Change-Id: I24e6de83e0408576f385a1c8e861b08c583f9098 Reviewed-on: https://go-review.googlesource.com/c/go/+/692356 Reviewed-by: David Chase LUCI-TryBot-Result: Go LUCI --- .../compile/internal/ssa/_gen/simdAMD64.rules | 16 +++ .../internal/ssa/_gen/simdgenericOps.go | 16 +++ src/cmd/compile/internal/ssa/opGen.go | 108 ++++++++++++++++++ src/cmd/compile/internal/ssa/rewriteAMD64.go | 48 ++++++++ .../compile/internal/ssagen/simdintrinsics.go | 16 +++ src/simd/binary_test.go | 8 +- src/simd/ops_amd64.go | 80 +++++++++++++ 7 files changed, 288 insertions(+), 4 deletions(-) diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index 7b7cbb9dc7..1d54cfcdbd 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -96,8 +96,10 @@ (AddSubFloat64x4 ...) => (VADDSUBPD256 ...) (AndInt8x16 ...) => (VPAND128 ...) (AndInt8x32 ...) => (VPAND256 ...) +(AndInt8x64 ...) => (VPANDD512 ...) (AndInt16x8 ...) => (VPAND128 ...) (AndInt16x16 ...) => (VPAND256 ...) +(AndInt16x32 ...) => (VPANDD512 ...) (AndInt32x4 ...) => (VPAND128 ...) (AndInt32x8 ...) => (VPAND256 ...) (AndInt32x16 ...) => (VPANDD512 ...) @@ -106,8 +108,10 @@ (AndInt64x8 ...) => (VPANDQ512 ...) (AndUint8x16 ...) => (VPAND128 ...) (AndUint8x32 ...) => (VPAND256 ...) +(AndUint8x64 ...) => (VPANDD512 ...) (AndUint16x8 ...) => (VPAND128 ...) (AndUint16x16 ...) => (VPAND256 ...) +(AndUint16x32 ...) => (VPANDD512 ...) (AndUint32x4 ...) => (VPAND128 ...) (AndUint32x8 ...) => (VPAND256 ...) (AndUint32x16 ...) => (VPANDD512 ...) @@ -128,8 +132,10 @@ (AndMaskedUint64x8 x y mask) => (VPANDQMasked512 x y (VPMOVVec64x8ToM mask)) (AndNotInt8x16 ...) => (VPANDN128 ...) (AndNotInt8x32 ...) => (VPANDN256 ...) +(AndNotInt8x64 ...) => (VPANDND512 ...) (AndNotInt16x8 ...) => (VPANDN128 ...) (AndNotInt16x16 ...) => (VPANDN256 ...) +(AndNotInt16x32 ...) => (VPANDND512 ...) (AndNotInt32x4 ...) => (VPANDN128 ...) (AndNotInt32x8 ...) => (VPANDN256 ...) (AndNotInt32x16 ...) => (VPANDND512 ...) @@ -138,8 +144,10 @@ (AndNotInt64x8 ...) => (VPANDNQ512 ...) (AndNotUint8x16 ...) => (VPANDN128 ...) (AndNotUint8x32 ...) => (VPANDN256 ...) +(AndNotUint8x64 ...) => (VPANDND512 ...) (AndNotUint16x8 ...) => (VPANDN128 ...) (AndNotUint16x16 ...) => (VPANDN256 ...) +(AndNotUint16x32 ...) => (VPANDND512 ...) (AndNotUint32x4 ...) => (VPANDN128 ...) (AndNotUint32x8 ...) => (VPANDN256 ...) (AndNotUint32x16 ...) => (VPANDND512 ...) @@ -967,8 +975,10 @@ (NotEqualMaskedUint64x8 x y mask) => (VPMOVMToVec64x8 (VPCMPUQMasked512 [4] x y (VPMOVVec64x8ToM mask))) (OrInt8x16 ...) => (VPOR128 ...) (OrInt8x32 ...) => (VPOR256 ...) +(OrInt8x64 ...) => (VPORD512 ...) (OrInt16x8 ...) => (VPOR128 ...) (OrInt16x16 ...) => (VPOR256 ...) +(OrInt16x32 ...) => (VPORD512 ...) (OrInt32x4 ...) => (VPOR128 ...) (OrInt32x8 ...) => (VPOR256 ...) (OrInt32x16 ...) => (VPORD512 ...) @@ -977,8 +987,10 @@ (OrInt64x8 ...) => (VPORQ512 ...) (OrUint8x16 ...) => (VPOR128 ...) (OrUint8x32 ...) => (VPOR256 ...) +(OrUint8x64 ...) => (VPORD512 ...) (OrUint16x8 ...) => (VPOR128 ...) (OrUint16x16 ...) => (VPOR256 ...) +(OrUint16x32 ...) => (VPORD512 ...) (OrUint32x4 ...) => (VPOR128 ...) (OrUint32x8 ...) => (VPOR256 ...) (OrUint32x16 ...) 
=> (VPORD512 ...) @@ -1773,8 +1785,10 @@ (UnsignedSignedQuadDotProdAccumulateMaskedInt32x16 x y z mask) => (VPDPBUSDMasked512 x y z (VPMOVVec32x16ToM mask)) (XorInt8x16 ...) => (VPXOR128 ...) (XorInt8x32 ...) => (VPXOR256 ...) +(XorInt8x64 ...) => (VPXORD512 ...) (XorInt16x8 ...) => (VPXOR128 ...) (XorInt16x16 ...) => (VPXOR256 ...) +(XorInt16x32 ...) => (VPXORD512 ...) (XorInt32x4 ...) => (VPXOR128 ...) (XorInt32x8 ...) => (VPXOR256 ...) (XorInt32x16 ...) => (VPXORD512 ...) @@ -1783,8 +1797,10 @@ (XorInt64x8 ...) => (VPXORQ512 ...) (XorUint8x16 ...) => (VPXOR128 ...) (XorUint8x32 ...) => (VPXOR256 ...) +(XorUint8x64 ...) => (VPXORD512 ...) (XorUint16x8 ...) => (VPXOR128 ...) (XorUint16x16 ...) => (VPXOR256 ...) +(XorUint16x32 ...) => (VPXORD512 ...) (XorUint32x4 ...) => (VPXOR128 ...) (XorUint32x8 ...) => (VPXOR256 ...) (XorUint32x16 ...) => (VPXORD512 ...) diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index 6853c3b091..492a994e93 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -99,8 +99,10 @@ func simdGenericOps() []opData { {name: "AddUint64x8", argLength: 2, commutative: true}, {name: "AndInt8x16", argLength: 2, commutative: true}, {name: "AndInt8x32", argLength: 2, commutative: true}, + {name: "AndInt8x64", argLength: 2, commutative: true}, {name: "AndInt16x8", argLength: 2, commutative: true}, {name: "AndInt16x16", argLength: 2, commutative: true}, + {name: "AndInt16x32", argLength: 2, commutative: true}, {name: "AndInt32x4", argLength: 2, commutative: true}, {name: "AndInt32x8", argLength: 2, commutative: true}, {name: "AndInt32x16", argLength: 2, commutative: true}, @@ -121,8 +123,10 @@ func simdGenericOps() []opData { {name: "AndMaskedUint64x8", argLength: 3, commutative: true}, {name: "AndNotInt8x16", argLength: 2, commutative: false}, {name: "AndNotInt8x32", argLength: 2, commutative: false}, + {name: "AndNotInt8x64", argLength: 2, commutative: false}, {name: "AndNotInt16x8", argLength: 2, commutative: false}, {name: "AndNotInt16x16", argLength: 2, commutative: false}, + {name: "AndNotInt16x32", argLength: 2, commutative: false}, {name: "AndNotInt32x4", argLength: 2, commutative: false}, {name: "AndNotInt32x8", argLength: 2, commutative: false}, {name: "AndNotInt32x16", argLength: 2, commutative: false}, @@ -143,8 +147,10 @@ func simdGenericOps() []opData { {name: "AndNotMaskedUint64x8", argLength: 3, commutative: false}, {name: "AndNotUint8x16", argLength: 2, commutative: false}, {name: "AndNotUint8x32", argLength: 2, commutative: false}, + {name: "AndNotUint8x64", argLength: 2, commutative: false}, {name: "AndNotUint16x8", argLength: 2, commutative: false}, {name: "AndNotUint16x16", argLength: 2, commutative: false}, + {name: "AndNotUint16x32", argLength: 2, commutative: false}, {name: "AndNotUint32x4", argLength: 2, commutative: false}, {name: "AndNotUint32x8", argLength: 2, commutative: false}, {name: "AndNotUint32x16", argLength: 2, commutative: false}, @@ -153,8 +159,10 @@ func simdGenericOps() []opData { {name: "AndNotUint64x8", argLength: 2, commutative: false}, {name: "AndUint8x16", argLength: 2, commutative: true}, {name: "AndUint8x32", argLength: 2, commutative: true}, + {name: "AndUint8x64", argLength: 2, commutative: true}, {name: "AndUint16x8", argLength: 2, commutative: true}, {name: "AndUint16x16", argLength: 2, commutative: true}, + {name: "AndUint16x32", argLength: 2, commutative: true}, {name: "AndUint32x4", 
argLength: 2, commutative: true}, {name: "AndUint32x8", argLength: 2, commutative: true}, {name: "AndUint32x16", argLength: 2, commutative: true}, @@ -868,8 +876,10 @@ func simdGenericOps() []opData { {name: "NotEqualUint64x8", argLength: 2, commutative: true}, {name: "OrInt8x16", argLength: 2, commutative: true}, {name: "OrInt8x32", argLength: 2, commutative: true}, + {name: "OrInt8x64", argLength: 2, commutative: true}, {name: "OrInt16x8", argLength: 2, commutative: true}, {name: "OrInt16x16", argLength: 2, commutative: true}, + {name: "OrInt16x32", argLength: 2, commutative: true}, {name: "OrInt32x4", argLength: 2, commutative: true}, {name: "OrInt32x8", argLength: 2, commutative: true}, {name: "OrInt32x16", argLength: 2, commutative: true}, @@ -890,8 +900,10 @@ func simdGenericOps() []opData { {name: "OrMaskedUint64x8", argLength: 3, commutative: true}, {name: "OrUint8x16", argLength: 2, commutative: true}, {name: "OrUint8x32", argLength: 2, commutative: true}, + {name: "OrUint8x64", argLength: 2, commutative: true}, {name: "OrUint16x8", argLength: 2, commutative: true}, {name: "OrUint16x16", argLength: 2, commutative: true}, + {name: "OrUint16x32", argLength: 2, commutative: true}, {name: "OrUint32x4", argLength: 2, commutative: true}, {name: "OrUint32x8", argLength: 2, commutative: true}, {name: "OrUint32x16", argLength: 2, commutative: true}, @@ -1512,8 +1524,10 @@ func simdGenericOps() []opData { {name: "UnsignedSignedQuadDotProdAccumulateMaskedInt32x16", argLength: 4, commutative: false}, {name: "XorInt8x16", argLength: 2, commutative: true}, {name: "XorInt8x32", argLength: 2, commutative: true}, + {name: "XorInt8x64", argLength: 2, commutative: true}, {name: "XorInt16x8", argLength: 2, commutative: true}, {name: "XorInt16x16", argLength: 2, commutative: true}, + {name: "XorInt16x32", argLength: 2, commutative: true}, {name: "XorInt32x4", argLength: 2, commutative: true}, {name: "XorInt32x8", argLength: 2, commutative: true}, {name: "XorInt32x16", argLength: 2, commutative: true}, @@ -1534,8 +1548,10 @@ func simdGenericOps() []opData { {name: "XorMaskedUint64x8", argLength: 3, commutative: true}, {name: "XorUint8x16", argLength: 2, commutative: true}, {name: "XorUint8x32", argLength: 2, commutative: true}, + {name: "XorUint8x64", argLength: 2, commutative: true}, {name: "XorUint16x8", argLength: 2, commutative: true}, {name: "XorUint16x16", argLength: 2, commutative: true}, + {name: "XorUint16x32", argLength: 2, commutative: true}, {name: "XorUint32x4", argLength: 2, commutative: true}, {name: "XorUint32x8", argLength: 2, commutative: true}, {name: "XorUint32x16", argLength: 2, commutative: true}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 7427137b22..e8a5354c00 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -4585,8 +4585,10 @@ const ( OpAddUint64x8 OpAndInt8x16 OpAndInt8x32 + OpAndInt8x64 OpAndInt16x8 OpAndInt16x16 + OpAndInt16x32 OpAndInt32x4 OpAndInt32x8 OpAndInt32x16 @@ -4607,8 +4609,10 @@ const ( OpAndMaskedUint64x8 OpAndNotInt8x16 OpAndNotInt8x32 + OpAndNotInt8x64 OpAndNotInt16x8 OpAndNotInt16x16 + OpAndNotInt16x32 OpAndNotInt32x4 OpAndNotInt32x8 OpAndNotInt32x16 @@ -4629,8 +4633,10 @@ const ( OpAndNotMaskedUint64x8 OpAndNotUint8x16 OpAndNotUint8x32 + OpAndNotUint8x64 OpAndNotUint16x8 OpAndNotUint16x16 + OpAndNotUint16x32 OpAndNotUint32x4 OpAndNotUint32x8 OpAndNotUint32x16 @@ -4639,8 +4645,10 @@ const ( OpAndNotUint64x8 OpAndUint8x16 OpAndUint8x32 + OpAndUint8x64 
OpAndUint16x8 OpAndUint16x16 + OpAndUint16x32 OpAndUint32x4 OpAndUint32x8 OpAndUint32x16 @@ -5354,8 +5362,10 @@ const ( OpNotEqualUint64x8 OpOrInt8x16 OpOrInt8x32 + OpOrInt8x64 OpOrInt16x8 OpOrInt16x16 + OpOrInt16x32 OpOrInt32x4 OpOrInt32x8 OpOrInt32x16 @@ -5376,8 +5386,10 @@ const ( OpOrMaskedUint64x8 OpOrUint8x16 OpOrUint8x32 + OpOrUint8x64 OpOrUint16x8 OpOrUint16x16 + OpOrUint16x32 OpOrUint32x4 OpOrUint32x8 OpOrUint32x16 @@ -5998,8 +6010,10 @@ const ( OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x16 OpXorInt8x16 OpXorInt8x32 + OpXorInt8x64 OpXorInt16x8 OpXorInt16x16 + OpXorInt16x32 OpXorInt32x4 OpXorInt32x8 OpXorInt32x16 @@ -6020,8 +6034,10 @@ const ( OpXorMaskedUint64x8 OpXorUint8x16 OpXorUint8x32 + OpXorUint8x64 OpXorUint16x8 OpXorUint16x16 + OpXorUint16x32 OpXorUint32x4 OpXorUint32x8 OpXorUint32x16 @@ -62211,6 +62227,12 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "AndInt8x64", + argLen: 2, + commutative: true, + generic: true, + }, { name: "AndInt16x8", argLen: 2, @@ -62223,6 +62245,12 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "AndInt16x32", + argLen: 2, + commutative: true, + generic: true, + }, { name: "AndInt32x4", argLen: 2, @@ -62341,6 +62369,11 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "AndNotInt8x64", + argLen: 2, + generic: true, + }, { name: "AndNotInt16x8", argLen: 2, @@ -62351,6 +62384,11 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "AndNotInt16x32", + argLen: 2, + generic: true, + }, { name: "AndNotInt32x4", argLen: 2, @@ -62451,6 +62489,11 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "AndNotUint8x64", + argLen: 2, + generic: true, + }, { name: "AndNotUint16x8", argLen: 2, @@ -62461,6 +62504,11 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "AndNotUint16x32", + argLen: 2, + generic: true, + }, { name: "AndNotUint32x4", argLen: 2, @@ -62503,6 +62551,12 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "AndUint8x64", + argLen: 2, + commutative: true, + generic: true, + }, { name: "AndUint16x8", argLen: 2, @@ -62515,6 +62569,12 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "AndUint16x32", + argLen: 2, + commutative: true, + generic: true, + }, { name: "AndUint32x4", argLen: 2, @@ -66413,6 +66473,12 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "OrInt8x64", + argLen: 2, + commutative: true, + generic: true, + }, { name: "OrInt16x8", argLen: 2, @@ -66425,6 +66491,12 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "OrInt16x32", + argLen: 2, + commutative: true, + generic: true, + }, { name: "OrInt32x4", argLen: 2, @@ -66545,6 +66617,12 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "OrUint8x64", + argLen: 2, + commutative: true, + generic: true, + }, { name: "OrUint16x8", argLen: 2, @@ -66557,6 +66635,12 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "OrUint16x32", + argLen: 2, + commutative: true, + generic: true, + }, { name: "OrUint32x4", argLen: 2, @@ -69689,6 +69773,12 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "XorInt8x64", + argLen: 2, + commutative: true, + generic: true, + }, { name: "XorInt16x8", argLen: 2, @@ -69701,6 +69791,12 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: 
"XorInt16x32", + argLen: 2, + commutative: true, + generic: true, + }, { name: "XorInt32x4", argLen: 2, @@ -69821,6 +69917,12 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "XorUint8x64", + argLen: 2, + commutative: true, + generic: true, + }, { name: "XorUint16x8", argLen: 2, @@ -69833,6 +69935,12 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "XorUint16x32", + argLen: 2, + commutative: true, + generic: true, + }, { name: "XorUint32x4", argLen: 2, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 5abb50ab71..82f13b43c6 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -831,6 +831,9 @@ func rewriteValueAMD64(v *Value) bool { case OpAndInt16x16: v.Op = OpAMD64VPAND256 return true + case OpAndInt16x32: + v.Op = OpAMD64VPANDD512 + return true case OpAndInt16x8: v.Op = OpAMD64VPAND128 return true @@ -858,6 +861,9 @@ func rewriteValueAMD64(v *Value) bool { case OpAndInt8x32: v.Op = OpAMD64VPAND256 return true + case OpAndInt8x64: + v.Op = OpAMD64VPANDD512 + return true case OpAndMaskedInt32x16: return rewriteValueAMD64_OpAndMaskedInt32x16(v) case OpAndMaskedInt32x4: @@ -885,6 +891,9 @@ func rewriteValueAMD64(v *Value) bool { case OpAndNotInt16x16: v.Op = OpAMD64VPANDN256 return true + case OpAndNotInt16x32: + v.Op = OpAMD64VPANDND512 + return true case OpAndNotInt16x8: v.Op = OpAMD64VPANDN128 return true @@ -912,6 +921,9 @@ func rewriteValueAMD64(v *Value) bool { case OpAndNotInt8x32: v.Op = OpAMD64VPANDN256 return true + case OpAndNotInt8x64: + v.Op = OpAMD64VPANDND512 + return true case OpAndNotMaskedInt32x16: return rewriteValueAMD64_OpAndNotMaskedInt32x16(v) case OpAndNotMaskedInt32x4: @@ -939,6 +951,9 @@ func rewriteValueAMD64(v *Value) bool { case OpAndNotUint16x16: v.Op = OpAMD64VPANDN256 return true + case OpAndNotUint16x32: + v.Op = OpAMD64VPANDND512 + return true case OpAndNotUint16x8: v.Op = OpAMD64VPANDN128 return true @@ -966,9 +981,15 @@ func rewriteValueAMD64(v *Value) bool { case OpAndNotUint8x32: v.Op = OpAMD64VPANDN256 return true + case OpAndNotUint8x64: + v.Op = OpAMD64VPANDND512 + return true case OpAndUint16x16: v.Op = OpAMD64VPAND256 return true + case OpAndUint16x32: + v.Op = OpAMD64VPANDD512 + return true case OpAndUint16x8: v.Op = OpAMD64VPAND128 return true @@ -996,6 +1017,9 @@ func rewriteValueAMD64(v *Value) bool { case OpAndUint8x32: v.Op = OpAMD64VPAND256 return true + case OpAndUint8x64: + v.Op = OpAMD64VPANDD512 + return true case OpApproximateReciprocalFloat32x16: v.Op = OpAMD64VRCP14PS512 return true @@ -3274,6 +3298,9 @@ func rewriteValueAMD64(v *Value) bool { case OpOrInt16x16: v.Op = OpAMD64VPOR256 return true + case OpOrInt16x32: + v.Op = OpAMD64VPORD512 + return true case OpOrInt16x8: v.Op = OpAMD64VPOR128 return true @@ -3301,6 +3328,9 @@ func rewriteValueAMD64(v *Value) bool { case OpOrInt8x32: v.Op = OpAMD64VPOR256 return true + case OpOrInt8x64: + v.Op = OpAMD64VPORD512 + return true case OpOrMaskedInt32x16: return rewriteValueAMD64_OpOrMaskedInt32x16(v) case OpOrMaskedInt32x4: @@ -3328,6 +3358,9 @@ func rewriteValueAMD64(v *Value) bool { case OpOrUint16x16: v.Op = OpAMD64VPOR256 return true + case OpOrUint16x32: + v.Op = OpAMD64VPORD512 + return true case OpOrUint16x8: v.Op = OpAMD64VPOR128 return true @@ -3355,6 +3388,9 @@ func rewriteValueAMD64(v *Value) bool { case OpOrUint8x32: v.Op = OpAMD64VPOR256 return true + case OpOrUint8x64: + v.Op = 
OpAMD64VPORD512 + return true case OpPairDotProdInt16x16: v.Op = OpAMD64VPMADDWD256 return true @@ -5537,6 +5573,9 @@ func rewriteValueAMD64(v *Value) bool { case OpXorInt16x16: v.Op = OpAMD64VPXOR256 return true + case OpXorInt16x32: + v.Op = OpAMD64VPXORD512 + return true case OpXorInt16x8: v.Op = OpAMD64VPXOR128 return true @@ -5564,6 +5603,9 @@ func rewriteValueAMD64(v *Value) bool { case OpXorInt8x32: v.Op = OpAMD64VPXOR256 return true + case OpXorInt8x64: + v.Op = OpAMD64VPXORD512 + return true case OpXorMaskedInt32x16: return rewriteValueAMD64_OpXorMaskedInt32x16(v) case OpXorMaskedInt32x4: @@ -5591,6 +5633,9 @@ func rewriteValueAMD64(v *Value) bool { case OpXorUint16x16: v.Op = OpAMD64VPXOR256 return true + case OpXorUint16x32: + v.Op = OpAMD64VPXORD512 + return true case OpXorUint16x8: v.Op = OpAMD64VPXOR128 return true @@ -5618,6 +5663,9 @@ func rewriteValueAMD64(v *Value) bool { case OpXorUint8x32: v.Op = OpAMD64VPXOR256 return true + case OpXorUint8x64: + v.Op = OpAMD64VPXORD512 + return true case OpZero: return rewriteValueAMD64_OpZero(v) case OpZeroExt16to32: diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 12c388ca91..7a7367ee1e 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -107,8 +107,10 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Float64x4.AddSub", opLen2(ssa.OpAddSubFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x16.And", opLen2(ssa.OpAndInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.And", opLen2(ssa.OpAndInt8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.And", opLen2(ssa.OpAndInt8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int16x8.And", opLen2(ssa.OpAndInt16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int16x16.And", opLen2(ssa.OpAndInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.And", opLen2(ssa.OpAndInt16x32, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int32x4.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int32x8.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int32x16.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64) @@ -117,8 +119,10 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Int64x8.And", opLen2(ssa.OpAndInt64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint8x16.And", opLen2(ssa.OpAndUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint8x32.And", opLen2(ssa.OpAndUint8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x64.And", opLen2(ssa.OpAndUint8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint16x8.And", opLen2(ssa.OpAndUint16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint16x16.And", opLen2(ssa.OpAndUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x32.And", opLen2(ssa.OpAndUint16x32, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint32x4.And", opLen2(ssa.OpAndUint32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint32x8.And", opLen2(ssa.OpAndUint32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint32x16.And", opLen2(ssa.OpAndUint32x16, types.TypeVec512), sys.AMD64) @@ -139,8 +143,10 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Uint64x8.AndMasked", opLen3(ssa.OpAndMaskedUint64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int8x16.AndNot", opLen2_21(ssa.OpAndNotInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.AndNot", opLen2_21(ssa.OpAndNotInt8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.AndNot", opLen2_21(ssa.OpAndNotInt8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int16x8.AndNot", opLen2_21(ssa.OpAndNotInt16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int16x16.AndNot", opLen2_21(ssa.OpAndNotInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.AndNot", opLen2_21(ssa.OpAndNotInt16x32, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int32x4.AndNot", opLen2_21(ssa.OpAndNotInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int32x8.AndNot", opLen2_21(ssa.OpAndNotInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int32x16.AndNot", opLen2_21(ssa.OpAndNotInt32x16, types.TypeVec512), sys.AMD64) @@ -149,8 +155,10 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Int64x8.AndNot", opLen2_21(ssa.OpAndNotInt64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint8x16.AndNot", opLen2_21(ssa.OpAndNotUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint8x32.AndNot", opLen2_21(ssa.OpAndNotUint8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x64.AndNot", opLen2_21(ssa.OpAndNotUint8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint16x8.AndNot", opLen2_21(ssa.OpAndNotUint16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint16x16.AndNot", opLen2_21(ssa.OpAndNotUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x32.AndNot", opLen2_21(ssa.OpAndNotUint16x32, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint32x4.AndNot", opLen2_21(ssa.OpAndNotUint32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint32x8.AndNot", opLen2_21(ssa.OpAndNotUint32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint32x16.AndNot", opLen2_21(ssa.OpAndNotUint32x16, types.TypeVec512), sys.AMD64) @@ -978,8 +986,10 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Uint64x8.NotEqualMasked", opLen3(ssa.OpNotEqualMaskedUint64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int8x16.Or", opLen2(ssa.OpOrInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.Or", opLen2(ssa.OpOrInt8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.Or", opLen2(ssa.OpOrInt8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int16x8.Or", opLen2(ssa.OpOrInt16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int16x16.Or", opLen2(ssa.OpOrInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.Or", opLen2(ssa.OpOrInt16x32, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int32x4.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int32x8.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int32x16.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64) @@ -988,8 +998,10 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Int64x8.Or", opLen2(ssa.OpOrInt64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint8x16.Or", opLen2(ssa.OpOrUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint8x32.Or", opLen2(ssa.OpOrUint8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x64.Or", opLen2(ssa.OpOrUint8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint16x8.Or", opLen2(ssa.OpOrUint16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint16x16.Or", opLen2(ssa.OpOrUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x32.Or", opLen2(ssa.OpOrUint16x32, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint32x4.Or", opLen2(ssa.OpOrUint32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint32x8.Or", opLen2(ssa.OpOrUint32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint32x16.Or", opLen2(ssa.OpOrUint32x16, types.TypeVec512), sys.AMD64) @@ -1784,8 +1796,10 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Int8x64.UnsignedSignedQuadDotProdAccumulateMasked", opLen4_31(ssa.OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int8x16.Xor", opLen2(ssa.OpXorInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.Xor", opLen2(ssa.OpXorInt8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.Xor", opLen2(ssa.OpXorInt8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int16x8.Xor", opLen2(ssa.OpXorInt16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int16x16.Xor", opLen2(ssa.OpXorInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.Xor", opLen2(ssa.OpXorInt16x32, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int32x4.Xor", opLen2(ssa.OpXorInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int32x8.Xor", opLen2(ssa.OpXorInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int32x16.Xor", opLen2(ssa.OpXorInt32x16, types.TypeVec512), sys.AMD64) @@ -1794,8 +1808,10 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Int64x8.Xor", opLen2(ssa.OpXorInt64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint8x16.Xor", opLen2(ssa.OpXorUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint8x32.Xor", opLen2(ssa.OpXorUint8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x64.Xor", opLen2(ssa.OpXorUint8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint16x8.Xor", opLen2(ssa.OpXorUint16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint16x16.Xor", opLen2(ssa.OpXorUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x32.Xor", opLen2(ssa.OpXorUint16x32, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint32x4.Xor", opLen2(ssa.OpXorUint32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint32x8.Xor", opLen2(ssa.OpXorUint32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint32x16.Xor", opLen2(ssa.OpXorUint32x16, types.TypeVec512), sys.AMD64) diff --git a/src/simd/binary_test.go b/src/simd/binary_test.go index 4221e74144..b7daf736f4 100644 --- a/src/simd/binary_test.go +++ b/src/simd/binary_test.go @@ -230,12 +230,12 @@ func TestAndNot(t *testing.T) { testUint8x32Binary(t, simd.Uint8x32.AndNot, andNotSlice[uint8]) if simd.HasAVX512() { - // testInt8x64Binary(t, simd.Int8x64.AndNot, andNotSlice[int8]) // missing - // testInt16x32Binary(t, simd.Int16x32.AndNot, andNotSlice[int16]) // missing + testInt8x64Binary(t, simd.Int8x64.AndNot, andNotSlice[int8]) + testInt16x32Binary(t, simd.Int16x32.AndNot, andNotSlice[int16]) testInt32x16Binary(t, simd.Int32x16.AndNot, andNotSlice[int32]) testInt64x8Binary(t, simd.Int64x8.AndNot, andNotSlice[int64]) - // testUint8x64Binary(t, simd.Uint8x64.AndNot, andNotSlice[uint8]) // missing - // testUint16x32Binary(t, simd.Uint16x32.AndNot, andNotSlice[uint16]) // missing + testUint8x64Binary(t, simd.Uint8x64.AndNot, andNotSlice[uint8]) + testUint16x32Binary(t, simd.Uint16x32.AndNot, andNotSlice[uint16]) testUint32x16Binary(t, simd.Uint32x16.AndNot, andNotSlice[uint32]) testUint64x8Binary(t, simd.Uint64x8.AndNot, andNotSlice[uint64]) } diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index ea0c598157..5776350fe9 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -590,6 +590,11 @@ func (x Int8x16) And(y Int8x16) Int8x16 // Asm: VPAND, CPU Feature: AVX2 func (x Int8x32) And(y Int8x32) Int8x32 +// And performs a bitwise AND operation between two vectors. +// +// Asm: VPANDD, CPU Feature: AVX512F +func (x Int8x64) And(y Int8x64) Int8x64 + // And performs a bitwise AND operation between two vectors. // // Asm: VPAND, CPU Feature: AVX @@ -600,6 +605,11 @@ func (x Int16x8) And(y Int16x8) Int16x8 // Asm: VPAND, CPU Feature: AVX2 func (x Int16x16) And(y Int16x16) Int16x16 +// And performs a bitwise AND operation between two vectors. +// +// Asm: VPANDD, CPU Feature: AVX512F +func (x Int16x32) And(y Int16x32) Int16x32 + // And performs a bitwise AND operation between two vectors. // // Asm: VPAND, CPU Feature: AVX @@ -640,6 +650,11 @@ func (x Uint8x16) And(y Uint8x16) Uint8x16 // Asm: VPAND, CPU Feature: AVX2 func (x Uint8x32) And(y Uint8x32) Uint8x32 +// And performs a bitwise AND operation between two vectors. +// +// Asm: VPANDD, CPU Feature: AVX512F +func (x Uint8x64) And(y Uint8x64) Uint8x64 + // And performs a bitwise AND operation between two vectors. // // Asm: VPAND, CPU Feature: AVX @@ -650,6 +665,11 @@ func (x Uint16x8) And(y Uint16x8) Uint16x8 // Asm: VPAND, CPU Feature: AVX2 func (x Uint16x16) And(y Uint16x16) Uint16x16 +// And performs a bitwise AND operation between two vectors. 
+// +// Asm: VPANDD, CPU Feature: AVX512F +func (x Uint16x32) And(y Uint16x32) Uint16x32 + // And performs a bitwise AND operation between two vectors. // // Asm: VPAND, CPU Feature: AVX @@ -778,6 +798,11 @@ func (x Int8x16) AndNot(y Int8x16) Int8x16 // Asm: VPANDN, CPU Feature: AVX2 func (x Int8x32) AndNot(y Int8x32) Int8x32 +// AndNot performs a bitwise x &^ y. +// +// Asm: VPANDND, CPU Feature: AVX512F +func (x Int8x64) AndNot(y Int8x64) Int8x64 + // AndNot performs a bitwise x &^ y. // // Asm: VPANDN, CPU Feature: AVX @@ -788,6 +813,11 @@ func (x Int16x8) AndNot(y Int16x8) Int16x8 // Asm: VPANDN, CPU Feature: AVX2 func (x Int16x16) AndNot(y Int16x16) Int16x16 +// AndNot performs a bitwise x &^ y. +// +// Asm: VPANDND, CPU Feature: AVX512F +func (x Int16x32) AndNot(y Int16x32) Int16x32 + // AndNot performs a bitwise x &^ y. // // Asm: VPANDN, CPU Feature: AVX @@ -828,6 +858,11 @@ func (x Uint8x16) AndNot(y Uint8x16) Uint8x16 // Asm: VPANDN, CPU Feature: AVX2 func (x Uint8x32) AndNot(y Uint8x32) Uint8x32 +// AndNot performs a bitwise x &^ y. +// +// Asm: VPANDND, CPU Feature: AVX512F +func (x Uint8x64) AndNot(y Uint8x64) Uint8x64 + // AndNot performs a bitwise x &^ y. // // Asm: VPANDN, CPU Feature: AVX @@ -838,6 +873,11 @@ func (x Uint16x8) AndNot(y Uint16x8) Uint16x8 // Asm: VPANDN, CPU Feature: AVX2 func (x Uint16x16) AndNot(y Uint16x16) Uint16x16 +// AndNot performs a bitwise x &^ y. +// +// Asm: VPANDND, CPU Feature: AVX512F +func (x Uint16x32) AndNot(y Uint16x32) Uint16x32 + // AndNot performs a bitwise x &^ y. // // Asm: VPANDN, CPU Feature: AVX @@ -6183,6 +6223,11 @@ func (x Int8x16) Or(y Int8x16) Int8x16 // Asm: VPOR, CPU Feature: AVX2 func (x Int8x32) Or(y Int8x32) Int8x32 +// Or performs a bitwise OR operation between two vectors. +// +// Asm: VPORD, CPU Feature: AVX512F +func (x Int8x64) Or(y Int8x64) Int8x64 + // Or performs a bitwise OR operation between two vectors. // // Asm: VPOR, CPU Feature: AVX @@ -6193,6 +6238,11 @@ func (x Int16x8) Or(y Int16x8) Int16x8 // Asm: VPOR, CPU Feature: AVX2 func (x Int16x16) Or(y Int16x16) Int16x16 +// Or performs a bitwise OR operation between two vectors. +// +// Asm: VPORD, CPU Feature: AVX512F +func (x Int16x32) Or(y Int16x32) Int16x32 + // Or performs a bitwise OR operation between two vectors. // // Asm: VPOR, CPU Feature: AVX @@ -6233,6 +6283,11 @@ func (x Uint8x16) Or(y Uint8x16) Uint8x16 // Asm: VPOR, CPU Feature: AVX2 func (x Uint8x32) Or(y Uint8x32) Uint8x32 +// Or performs a bitwise OR operation between two vectors. +// +// Asm: VPORD, CPU Feature: AVX512F +func (x Uint8x64) Or(y Uint8x64) Uint8x64 + // Or performs a bitwise OR operation between two vectors. // // Asm: VPOR, CPU Feature: AVX @@ -6243,6 +6298,11 @@ func (x Uint16x8) Or(y Uint16x8) Uint16x8 // Asm: VPOR, CPU Feature: AVX2 func (x Uint16x16) Or(y Uint16x16) Uint16x16 +// Or performs a bitwise OR operation between two vectors. +// +// Asm: VPORD, CPU Feature: AVX512F +func (x Uint16x32) Or(y Uint16x32) Uint16x32 + // Or performs a bitwise OR operation between two vectors. // // Asm: VPOR, CPU Feature: AVX @@ -11867,6 +11927,11 @@ func (x Int8x16) Xor(y Int8x16) Int8x16 // Asm: VPXOR, CPU Feature: AVX2 func (x Int8x32) Xor(y Int8x32) Int8x32 +// Xor performs a bitwise XOR operation between two vectors. +// +// Asm: VPXORD, CPU Feature: AVX512F +func (x Int8x64) Xor(y Int8x64) Int8x64 + // Xor performs a bitwise XOR operation between two vectors. 
// // Asm: VPXOR, CPU Feature: AVX @@ -11877,6 +11942,11 @@ func (x Int16x8) Xor(y Int16x8) Int16x8 // Asm: VPXOR, CPU Feature: AVX2 func (x Int16x16) Xor(y Int16x16) Int16x16 +// Xor performs a bitwise XOR operation between two vectors. +// +// Asm: VPXORD, CPU Feature: AVX512F +func (x Int16x32) Xor(y Int16x32) Int16x32 + // Xor performs a bitwise XOR operation between two vectors. // // Asm: VPXOR, CPU Feature: AVX @@ -11917,6 +11987,11 @@ func (x Uint8x16) Xor(y Uint8x16) Uint8x16 // Asm: VPXOR, CPU Feature: AVX2 func (x Uint8x32) Xor(y Uint8x32) Uint8x32 +// Xor performs a bitwise XOR operation between two vectors. +// +// Asm: VPXORD, CPU Feature: AVX512F +func (x Uint8x64) Xor(y Uint8x64) Uint8x64 + // Xor performs a bitwise XOR operation between two vectors. // // Asm: VPXOR, CPU Feature: AVX @@ -11927,6 +12002,11 @@ func (x Uint16x8) Xor(y Uint16x8) Uint16x8 // Asm: VPXOR, CPU Feature: AVX2 func (x Uint16x16) Xor(y Uint16x16) Uint16x16 +// Xor performs a bitwise XOR operation between two vectors. +// +// Asm: VPXORD, CPU Feature: AVX512F +func (x Uint16x32) Xor(y Uint16x32) Uint16x32 + // Xor performs a bitwise XOR operation between two vectors. // // Asm: VPXOR, CPU Feature: AVX -- 2.52.0
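For reviewers trying the change out, here is a minimal usage sketch of the newly exposed 512-bit bitwise methods. It is not part of the CL: it assumes a dev.simd toolchain with the simd package enabled (GOEXPERIMENT=simd), and the slice load/store helpers it uses (LoadUint8x64Slice, StoreSlice) are assumed names following the package's existing Load*Slice pattern rather than something this CL adds. The And/AndNot/Or/Xor methods and simd.HasAVX512 are taken directly from the diff above.

    package main

    import (
        "fmt"
        "simd"
    )

    func main() {
        // The 512-bit vector types need AVX-512 at run time.
        if !simd.HasAVX512() {
            fmt.Println("AVX-512 not available; skipping 512-bit bitwise ops")
            return
        }

        a := make([]uint8, 64)
        b := make([]uint8, 64)
        for i := range a {
            a[i] = uint8(i)
            b[i] = 0x0F
        }

        // LoadUint8x64Slice/StoreSlice are assumed helper names from the
        // dev.simd package, not part of this CL.
        x := simd.LoadUint8x64Slice(a)
        y := simd.LoadUint8x64Slice(b)

        out := make([]uint8, 64)

        x.And(y).StoreSlice(out)    // lowers to VPANDD: out[i] = a[i] & b[i]
        fmt.Println("and:   ", out[:8])

        x.AndNot(y).StoreSlice(out) // lowers to VPANDND: out[i] = a[i] &^ b[i]
        fmt.Println("andnot:", out[:8])

        x.Or(y).StoreSlice(out)     // lowers to VPORD: out[i] = a[i] | b[i]
        fmt.Println("or:    ", out[:8])

        x.Xor(y).StoreSlice(out)    // lowers to VPXORD: out[i] = a[i] ^ b[i]
        fmt.Println("xor:   ", out[:8])
    }

The same method set now exists on Int8x64, Int16x32, and Uint16x32 as well, so callers no longer need to fall back to the 32- or 64-bit element 512-bit types just to get bitwise logic on byte and word vectors.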