From: Junyang Shao Date: Thu, 21 Aug 2025 19:11:30 +0000 (+0000) Subject: [dev.simd] cmd/compile, simd: make Permute 128-bit use AVX VPSHUFB X-Git-Tag: go1.26rc1~147^2~99 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=fa1e78c9adf6377fd2797ee50cb8210f0bd34781;p=gostls13.git [dev.simd] cmd/compile, simd: make Permute 128-bit use AVX VPSHUFB Change-Id: Ib89f602f797065e411eb0cbc95ccf2748b25fdec Reviewed-on: https://go-review.googlesource.com/c/go/+/698295 LUCI-TryBot-Result: Go LUCI Reviewed-by: David Chase --- diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index e4b0ca7a23..5930ec9965 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -332,7 +332,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPOR256, ssa.OpAMD64VPORD512, ssa.OpAMD64VPORQ512, - ssa.OpAMD64VPERMB128, + ssa.OpAMD64VPSHUFB128, ssa.OpAMD64VPERMB256, ssa.OpAMD64VPERMB512, ssa.OpAMD64VPERMW128, @@ -606,7 +606,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPORQMasked128, ssa.OpAMD64VPORQMasked256, ssa.OpAMD64VPORQMasked512, - ssa.OpAMD64VPERMBMasked128, + ssa.OpAMD64VPSHUFBMasked128, ssa.OpAMD64VPERMBMasked256, ssa.OpAMD64VPERMBMasked512, ssa.OpAMD64VPERMWMasked128, @@ -1682,7 +1682,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPERMI2QMasked256, ssa.OpAMD64VPERMI2PDMasked512, ssa.OpAMD64VPERMI2QMasked512, - ssa.OpAMD64VPERMBMasked128, + ssa.OpAMD64VPSHUFBMasked128, ssa.OpAMD64VPERMBMasked256, ssa.OpAMD64VPERMBMasked512, ssa.OpAMD64VPERMWMasked128, diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index c6dd5a38ce..f1337d70be 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -732,7 +732,7 @@ (PermuteFloat32x16 ...) => (VPERMPS512 ...) (PermuteFloat64x4 ...) 
=> (VPERMPD256 ...) (PermuteFloat64x8 ...) => (VPERMPD512 ...) -(PermuteInt8x16 ...) => (VPERMB128 ...) +(PermuteInt8x16 ...) => (VPSHUFB128 ...) (PermuteInt8x32 ...) => (VPERMB256 ...) (PermuteInt8x64 ...) => (VPERMB512 ...) (PermuteInt16x8 ...) => (VPERMW128 ...) @@ -742,7 +742,7 @@ (PermuteInt32x16 ...) => (VPERMD512 ...) (PermuteInt64x4 ...) => (VPERMQ256 ...) (PermuteInt64x8 ...) => (VPERMQ512 ...) -(PermuteUint8x16 ...) => (VPERMB128 ...) +(PermuteUint8x16 ...) => (VPSHUFB128 ...) (PermuteUint8x32 ...) => (VPERMB256 ...) (PermuteUint8x64 ...) => (VPERMB512 ...) (PermuteUint16x8 ...) => (VPERMW128 ...) diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go index c4ef39a30e..96bb3ac032 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go @@ -364,10 +364,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPDPWSSDSMasked128", argLength: 4, reg: w3kw, asm: "VPDPWSSDS", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPDPWSSDSMasked256", argLength: 4, reg: w3kw, asm: "VPDPWSSDS", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPDPWSSDSMasked512", argLength: 4, reg: w3kw, asm: "VPDPWSSDS", commutative: false, typ: "Vec512", resultInArg0: true}, - {name: "VPERMB128", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPERMB256", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMB512", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false}, - {name: "VPERMBMasked128", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPERMBMasked256", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMBMasked512", argLength: 
3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPERMD256", argLength: 2, reg: v21, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false}, @@ -817,6 +815,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPSHRDVWMasked128", argLength: 4, reg: w3kw, asm: "VPSHRDVW", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPSHRDVWMasked256", argLength: 4, reg: w3kw, asm: "VPSHRDVW", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPSHRDVWMasked512", argLength: 4, reg: w3kw, asm: "VPSHRDVW", commutative: false, typ: "Vec512", resultInArg0: true}, + {name: "VPSHUFB128", argLength: 2, reg: v21, asm: "VPSHUFB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPSHUFBMasked128", argLength: 3, reg: w2kw, asm: "VPSHUFB", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSIGNB128", argLength: 2, reg: v21, asm: "VPSIGNB", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSIGNB256", argLength: 2, reg: v21, asm: "VPSIGNB", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSIGND128", argLength: 2, reg: v21, asm: "VPSIGND", commutative: false, typ: "Vec128", resultInArg0: false}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 7249752130..9212b17a35 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1587,10 +1587,8 @@ const ( OpAMD64VPDPWSSDSMasked128 OpAMD64VPDPWSSDSMasked256 OpAMD64VPDPWSSDSMasked512 - OpAMD64VPERMB128 OpAMD64VPERMB256 OpAMD64VPERMB512 - OpAMD64VPERMBMasked128 OpAMD64VPERMBMasked256 OpAMD64VPERMBMasked512 OpAMD64VPERMD256 @@ -2040,6 +2038,8 @@ const ( OpAMD64VPSHRDVWMasked128 OpAMD64VPSHRDVWMasked256 OpAMD64VPSHRDVWMasked512 + OpAMD64VPSHUFB128 + OpAMD64VPSHUFBMasked128 OpAMD64VPSIGNB128 OpAMD64VPSIGNB256 OpAMD64VPSIGND128 @@ -24358,20 +24358,6 @@ var opcodeTable = 
[...]opInfo{ }, }, }, - { - name: "VPERMB128", - argLen: 2, - asm: x86.AVPERMB, - reg: regInfo{ - inputs: []inputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - }, - }, { name: "VPERMB256", argLen: 2, @@ -24400,21 +24386,6 @@ var opcodeTable = [...]opInfo{ }, }, }, - { - name: "VPERMBMasked128", - argLen: 3, - asm: x86.AVPERMB, - reg: regInfo{ - inputs: []inputInfo{ - {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - outputs: []outputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - }, - }, { name: "VPERMBMasked256", argLen: 3, @@ -31046,6 +31017,35 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPSHUFB128", + argLen: 2, + asm: x86.AVPSHUFB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPSHUFBMasked128", + argLen: 3, + asm: x86.AVPSHUFB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPSIGNB128", 
argLen: 2, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index fea6b047d1..e31b5f981f 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -3257,7 +3257,7 @@ func rewriteValueAMD64(v *Value) bool { v.Op = OpAMD64VPERMQ512 return true case OpPermuteInt8x16: - v.Op = OpAMD64VPERMB128 + v.Op = OpAMD64VPSHUFB128 return true case OpPermuteInt8x32: v.Op = OpAMD64VPERMB256 @@ -3287,7 +3287,7 @@ func rewriteValueAMD64(v *Value) bool { v.Op = OpAMD64VPERMQ512 return true case OpPermuteUint8x16: - v.Op = OpAMD64VPERMB128 + v.Op = OpAMD64VPSHUFB128 return true case OpPermuteUint8x32: v.Op = OpAMD64VPERMB256 diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 0bd4a27606..1c2b22a7fe 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -740,8 +740,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Uint64x2.Or", opLen2(ssa.OpOrUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.Or", opLen2(ssa.OpOrUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.Or", opLen2(ssa.OpOrUint64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int8x16.Permute", opLen2_21(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint8x16.Permute", opLen2_21(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x16.Permute", opLen2(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint8x16.Permute", opLen2(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.Permute", opLen2_21(ssa.OpPermuteInt8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint8x32.Permute", opLen2_21(ssa.OpPermuteUint8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x64.Permute", opLen2_21(ssa.OpPermuteInt8x64, types.TypeVec512), sys.AMD64) diff --git a/src/simd/_gen/simdgen/ops/Moves/categories.yaml b/src/simd/_gen/simdgen/ops/Moves/categories.yaml index 438c1ef309..a576829e8f 100644 --- a/src/simd/_gen/simdgen/ops/Moves/categories.yaml +++ b/src/simd/_gen/simdgen/ops/Moves/categories.yaml @@ -74,4 +74,4 @@ commutative: false documentation: !string |- // NAME copies element zero of its (128-bit) input to all elements of - // the 512-bit output vector. + // the 512-bit output vector. \ No newline at end of file diff --git a/src/simd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/_gen/simdgen/ops/Moves/go.yaml index 2398e53415..3cdb9efe27 100644 --- a/src/simd/_gen/simdgen/ops/Moves/go.yaml +++ b/src/simd/_gen/simdgen/ops/Moves/go.yaml @@ -418,3 +418,18 @@ bits: 512 elemBits: $e base: $b + +# VPSHUFB for 128-bit byte shuffles will be picked with higher priority than VPERMB, given its lower CPU feature requirement. 
(It's AVX) +- go: Permute + asm: VPSHUFB + addDoc: !string |- + // However when the top bit is set, the low bits will be disregarded and the respective element in the result vector will be zeroed. + in: + - &128any + bits: 128 + go: $t + - bits: 128 + go: $t + name: indices + out: + - *128any \ No newline at end of file diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index 7366aabd32..e0e580bd27 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -4155,15 +4155,17 @@ func (x Uint64x8) Or(y Uint64x8) Uint64x8 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // Only the needed bits to represent x's index are used in indices' elements. +// However when the top bit is set, the low bits will be disregarded and the respective element in the result vector will be zeroed. // -// Asm: VPERMB, CPU Feature: AVX512VBMI -func (x Int8x16) Permute(indices Uint8x16) Int8x16 +// Asm: VPSHUFB, CPU Feature: AVX +func (x Int8x16) Permute(indices Int8x16) Int8x16 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // Only the needed bits to represent x's index are used in indices' elements. +// However when the top bit is set, the low bits will be disregarded and the respective element in the result vector will be zeroed. // -// Asm: VPERMB, CPU Feature: AVX512VBMI +// Asm: VPSHUFB, CPU Feature: AVX func (x Uint8x16) Permute(indices Uint8x16) Uint8x16 // Permute performs a full permutation of vector x using indices: