ssa.OpAMD64VPOR256,
ssa.OpAMD64VPORD512,
ssa.OpAMD64VPORQ512,
- ssa.OpAMD64VPERMB128,
+ ssa.OpAMD64VPSHUFB128,
ssa.OpAMD64VPERMB256,
ssa.OpAMD64VPERMB512,
ssa.OpAMD64VPERMW128,
ssa.OpAMD64VPORQMasked128,
ssa.OpAMD64VPORQMasked256,
ssa.OpAMD64VPORQMasked512,
- ssa.OpAMD64VPERMBMasked128,
+ ssa.OpAMD64VPSHUFBMasked128,
ssa.OpAMD64VPERMBMasked256,
ssa.OpAMD64VPERMBMasked512,
ssa.OpAMD64VPERMWMasked128,
ssa.OpAMD64VPERMI2QMasked256,
ssa.OpAMD64VPERMI2PDMasked512,
ssa.OpAMD64VPERMI2QMasked512,
- ssa.OpAMD64VPERMBMasked128,
+ ssa.OpAMD64VPSHUFBMasked128,
ssa.OpAMD64VPERMBMasked256,
ssa.OpAMD64VPERMBMasked512,
ssa.OpAMD64VPERMWMasked128,
(PermuteFloat32x16 ...) => (VPERMPS512 ...)
(PermuteFloat64x4 ...) => (VPERMPD256 ...)
(PermuteFloat64x8 ...) => (VPERMPD512 ...)
-(PermuteInt8x16 ...) => (VPERMB128 ...)
+(PermuteInt8x16 ...) => (VPSHUFB128 ...)
(PermuteInt8x32 ...) => (VPERMB256 ...)
(PermuteInt8x64 ...) => (VPERMB512 ...)
(PermuteInt16x8 ...) => (VPERMW128 ...)
(PermuteInt32x16 ...) => (VPERMD512 ...)
(PermuteInt64x4 ...) => (VPERMQ256 ...)
(PermuteInt64x8 ...) => (VPERMQ512 ...)
-(PermuteUint8x16 ...) => (VPERMB128 ...)
+(PermuteUint8x16 ...) => (VPSHUFB128 ...)
(PermuteUint8x32 ...) => (VPERMB256 ...)
(PermuteUint8x64 ...) => (VPERMB512 ...)
(PermuteUint16x8 ...) => (VPERMW128 ...)
{name: "VPDPWSSDSMasked128", argLength: 4, reg: w3kw, asm: "VPDPWSSDS", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPDPWSSDSMasked256", argLength: 4, reg: w3kw, asm: "VPDPWSSDS", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPDPWSSDSMasked512", argLength: 4, reg: w3kw, asm: "VPDPWSSDS", commutative: false, typ: "Vec512", resultInArg0: true},
- {name: "VPERMB128", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPERMB256", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMB512", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false},
- {name: "VPERMBMasked128", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPERMBMasked256", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMBMasked512", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPERMD256", argLength: 2, reg: v21, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHRDVWMasked128", argLength: 4, reg: w3kw, asm: "VPSHRDVW", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPSHRDVWMasked256", argLength: 4, reg: w3kw, asm: "VPSHRDVW", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPSHRDVWMasked512", argLength: 4, reg: w3kw, asm: "VPSHRDVW", commutative: false, typ: "Vec512", resultInArg0: true},
+ {name: "VPSHUFB128", argLength: 2, reg: v21, asm: "VPSHUFB", commutative: false, typ: "Vec128", resultInArg0: false},
+ {name: "VPSHUFBMasked128", argLength: 3, reg: w2kw, asm: "VPSHUFB", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSIGNB128", argLength: 2, reg: v21, asm: "VPSIGNB", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSIGNB256", argLength: 2, reg: v21, asm: "VPSIGNB", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSIGND128", argLength: 2, reg: v21, asm: "VPSIGND", commutative: false, typ: "Vec128", resultInArg0: false},
OpAMD64VPDPWSSDSMasked128
OpAMD64VPDPWSSDSMasked256
OpAMD64VPDPWSSDSMasked512
- OpAMD64VPERMB128
OpAMD64VPERMB256
OpAMD64VPERMB512
- OpAMD64VPERMBMasked128
OpAMD64VPERMBMasked256
OpAMD64VPERMBMasked512
OpAMD64VPERMD256
OpAMD64VPSHRDVWMasked128
OpAMD64VPSHRDVWMasked256
OpAMD64VPSHRDVWMasked512
+ OpAMD64VPSHUFB128
+ OpAMD64VPSHUFBMasked128
OpAMD64VPSIGNB128
OpAMD64VPSIGNB256
OpAMD64VPSIGND128
},
},
},
- {
- name: "VPERMB128",
- argLen: 2,
- asm: x86.AVPERMB,
- reg: regInfo{
- inputs: []inputInfo{
- {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- outputs: []outputInfo{
- {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- },
- },
{
name: "VPERMB256",
argLen: 2,
},
},
},
- {
- name: "VPERMBMasked128",
- argLen: 3,
- asm: x86.AVPERMB,
- reg: regInfo{
- inputs: []inputInfo{
- {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
- {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
- {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
- },
- outputs: []outputInfo{
- {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
- },
- },
- },
{
name: "VPERMBMasked256",
argLen: 3,
},
},
},
+ {
+ name: "VPSHUFB128",
+ argLen: 2,
+ asm: x86.AVPSHUFB,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ outputs: []outputInfo{
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ },
+ },
+ {
+ name: "VPSHUFBMasked128",
+ argLen: 3,
+ asm: x86.AVPSHUFB,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ outputs: []outputInfo{
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ },
+ },
{
name: "VPSIGNB128",
argLen: 2,
v.Op = OpAMD64VPERMQ512
return true
case OpPermuteInt8x16:
- v.Op = OpAMD64VPERMB128
+ v.Op = OpAMD64VPSHUFB128
return true
case OpPermuteInt8x32:
v.Op = OpAMD64VPERMB256
v.Op = OpAMD64VPERMQ512
return true
case OpPermuteUint8x16:
- v.Op = OpAMD64VPERMB128
+ v.Op = OpAMD64VPSHUFB128
return true
case OpPermuteUint8x32:
v.Op = OpAMD64VPERMB256
addF(simdPackage, "Uint64x2.Or", opLen2(ssa.OpOrUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.Or", opLen2(ssa.OpOrUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Or", opLen2(ssa.OpOrUint64x8, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int8x16.Permute", opLen2_21(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint8x16.Permute", opLen2_21(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int8x16.Permute", opLen2(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint8x16.Permute", opLen2(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.Permute", opLen2_21(ssa.OpPermuteInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint8x32.Permute", opLen2_21(ssa.OpPermuteUint8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.Permute", opLen2_21(ssa.OpPermuteInt8x64, types.TypeVec512), sys.AMD64)
commutative: false
documentation: !string |-
// NAME copies element zero of its (128-bit) input to all elements of
- // the 512-bit output vector.
+ // the 512-bit output vector.
\ No newline at end of file
bits: 512
elemBits: $e
base: $b
+
+# For 128-bit byte shuffles, VPSHUFB takes priority over VPERMB because it has a lower CPU feature requirement (AVX rather than AVX512VBMI).
+- go: Permute
+ asm: VPSHUFB
+ addDoc: !string |-
+ // However, when the top bit of an index is set, its low bits are disregarded and the respective element in the result vector is zeroed.
+ in:
+ - &128any
+ bits: 128
+ go: $t
+ - bits: 128
+ go: $t
+ name: indices
+ out:
+ - *128any
\ No newline at end of file
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
+// However, when the top bit of an index is set, its low bits are disregarded and the respective element in the result vector is zeroed.
//
-// Asm: VPERMB, CPU Feature: AVX512VBMI
-func (x Int8x16) Permute(indices Uint8x16) Int8x16
+// Asm: VPSHUFB, CPU Feature: AVX
+func (x Int8x16) Permute(indices Int8x16) Int8x16
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
+// However, when the top bit of an index is set, its low bits are disregarded and the respective element in the result vector is zeroed.
//
-// Asm: VPERMB, CPU Feature: AVX512VBMI
+// Asm: VPSHUFB, CPU Feature: AVX
func (x Uint8x16) Permute(indices Uint8x16) Uint8x16
// Permute performs a full permutation of vector x using indices: