ssa.OpAMD64VPSUBUSW128,
ssa.OpAMD64VPSUBUSW256,
ssa.OpAMD64VPSUBUSW512,
+ ssa.OpAMD64VPSADBW128,
+ ssa.OpAMD64VPSADBW256,
+ ssa.OpAMD64VPSADBW512,
ssa.OpAMD64VPXOR128,
ssa.OpAMD64VPXOR256,
ssa.OpAMD64VPXORD512,
(SubSaturatedUint16x8 ...) => (VPSUBUSW128 ...)
(SubSaturatedUint16x16 ...) => (VPSUBUSW256 ...)
(SubSaturatedUint16x32 ...) => (VPSUBUSW512 ...)
+(SumAbsDiffUint8x16 ...) => (VPSADBW128 ...)
+(SumAbsDiffUint8x32 ...) => (VPSADBW256 ...)
+(SumAbsDiffUint8x64 ...) => (VPSADBW512 ...)
(TruncFloat32x4 x) => (VROUNDPS128 [3] x)
(TruncFloat32x8 x) => (VROUNDPS256 [3] x)
(TruncFloat64x2 x) => (VROUNDPD128 [3] x)
{name: "VPRORVQMasked128", argLength: 3, reg: w2kw, asm: "VPRORVQ", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPRORVQMasked256", argLength: 3, reg: w2kw, asm: "VPRORVQ", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPRORVQMasked512", argLength: 3, reg: w2kw, asm: "VPRORVQ", commutative: false, typ: "Vec512", resultInArg0: false},
+ {name: "VPSADBW128", argLength: 2, reg: v21, asm: "VPSADBW", commutative: false, typ: "Vec128", resultInArg0: false},
+ {name: "VPSADBW256", argLength: 2, reg: v21, asm: "VPSADBW", commutative: false, typ: "Vec256", resultInArg0: false},
+ {name: "VPSADBW512", argLength: 2, reg: w21, asm: "VPSADBW", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSHLDVD128", argLength: 3, reg: w31, asm: "VPSHLDVD", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPSHLDVD256", argLength: 3, reg: w31, asm: "VPSHLDVD", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPSHLDVD512", argLength: 3, reg: w31, asm: "VPSHLDVD", commutative: false, typ: "Vec512", resultInArg0: true},
{name: "SubUint64x2", argLength: 2, commutative: false},
{name: "SubUint64x4", argLength: 2, commutative: false},
{name: "SubUint64x8", argLength: 2, commutative: false},
+ {name: "SumAbsDiffUint8x16", argLength: 2, commutative: false},
+ {name: "SumAbsDiffUint8x32", argLength: 2, commutative: false},
+ {name: "SumAbsDiffUint8x64", argLength: 2, commutative: false},
{name: "TruncFloat32x4", argLength: 1, commutative: false},
{name: "TruncFloat32x8", argLength: 1, commutative: false},
{name: "TruncFloat64x2", argLength: 1, commutative: false},
OpAMD64VPRORVQMasked128
OpAMD64VPRORVQMasked256
OpAMD64VPRORVQMasked512
+ OpAMD64VPSADBW128
+ OpAMD64VPSADBW256
+ OpAMD64VPSADBW512
OpAMD64VPSHLDVD128
OpAMD64VPSHLDVD256
OpAMD64VPSHLDVD512
OpSubUint64x2
OpSubUint64x4
OpSubUint64x8
+ OpSumAbsDiffUint8x16
+ OpSumAbsDiffUint8x32
+ OpSumAbsDiffUint8x64
OpTruncFloat32x4
OpTruncFloat32x8
OpTruncFloat64x2
},
},
},
+ {
+ name: "VPSADBW128",
+ argLen: 2,
+ asm: x86.AVPSADBW,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ outputs: []outputInfo{
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ },
+ },
+ {
+ name: "VPSADBW256",
+ argLen: 2,
+ asm: x86.AVPSADBW,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ outputs: []outputInfo{
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ },
+ },
+ {
+ name: "VPSADBW512",
+ argLen: 2,
+ asm: x86.AVPSADBW,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
{
name: "VPSHLDVD128",
argLen: 3,
argLen: 2,
generic: true,
},
+ {
+ name: "SumAbsDiffUint8x16",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "SumAbsDiffUint8x32",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "SumAbsDiffUint8x64",
+ argLen: 2,
+ generic: true,
+ },
{
name: "TruncFloat32x4",
argLen: 1,
case OpSubUint8x64:
v.Op = OpAMD64VPSUBB512
return true
+ case OpSumAbsDiffUint8x16:
+ v.Op = OpAMD64VPSADBW128
+ return true
+ case OpSumAbsDiffUint8x32:
+ v.Op = OpAMD64VPSADBW256
+ return true
+ case OpSumAbsDiffUint8x64:
+ v.Op = OpAMD64VPSADBW512
+ return true
case OpTailCall:
v.Op = OpAMD64CALLtail
return true
addF(simdPackage, "Uint16x8.SubSaturated", opLen2(ssa.OpSubSaturatedUint16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint16x16.SubSaturated", opLen2(ssa.OpSubSaturatedUint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x32.SubSaturated", opLen2(ssa.OpSubSaturatedUint16x32, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint8x16.SumAbsDiff", opLen2(ssa.OpSumAbsDiffUint8x16, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint8x32.SumAbsDiff", opLen2(ssa.OpSumAbsDiffUint8x32, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint8x64.SumAbsDiff", opLen2(ssa.OpSumAbsDiffUint8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.Trunc", opLen1(ssa.OpTruncFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.Trunc", opLen1(ssa.OpTruncFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x2.Trunc", opLen1(ssa.OpTruncFloat64x2, types.TypeVec128), sys.AMD64)
commutative: false
documentation: !string |-
// NAME performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements.
+- go: SumAbsDiff
+ commutative: false
+ documentation: !string |-
+ // NAME sums the absolute differences of corresponding bytes of the two input vectors, treating each adjacent 8 bytes as a group.
+ // The output is a vector of word-sized elements whose 4*n-th element holds the sum of the n-th input group; all other elements are zero.
+ // Each sum is the L1 distance between the corresponding 8-byte groups of the two input vectors.
- *fma_op
- *fma_op
out:
- - *fma_op
\ No newline at end of file
+ - *fma_op
+- go: SumAbsDiff
+ asm: "VPSADBW"
+ in:
+ - go: $t
+ base: uint
+ - go: $t
+ base: uint
+ out:
+ - go: $t2
+ base: uint
\ No newline at end of file
// Asm: VPSUBUSW, CPU Feature: AVX512
func (x Uint16x32) SubSaturated(y Uint16x32) Uint16x32
+/* SumAbsDiff */
+
+// SumAbsDiff sums the absolute differences of corresponding bytes of the two input vectors, treating each adjacent 8 bytes as a group.
+// The output is a vector of word-sized elements whose 4*n-th element holds the sum of the n-th input group; all other elements are zero.
+// Each sum is the L1 distance between the corresponding 8-byte groups of the two input vectors.
+//
+// Asm: VPSADBW, CPU Feature: AVX
+func (x Uint8x16) SumAbsDiff(y Uint8x16) Uint16x8
+
+// SumAbsDiff sums the absolute differences of corresponding bytes of the two input vectors, treating each adjacent 8 bytes as a group.
+// The output is a vector of word-sized elements whose 4*n-th element holds the sum of the n-th input group; all other elements are zero.
+// Each sum is the L1 distance between the corresponding 8-byte groups of the two input vectors.
+//
+// Asm: VPSADBW, CPU Feature: AVX2
+func (x Uint8x32) SumAbsDiff(y Uint8x32) Uint16x16
+
+// SumAbsDiff sums the absolute differences of corresponding bytes of the two input vectors, treating each adjacent 8 bytes as a group.
+// The output is a vector of word-sized elements whose 4*n-th element holds the sum of the n-th input group; all other elements are zero.
+// Each sum is the L1 distance between the corresponding 8-byte groups of the two input vectors.
+//
+// Asm: VPSADBW, CPU Feature: AVX512
+func (x Uint8x64) SumAbsDiff(y Uint8x64) Uint16x32
+
/* Trunc */
// Trunc truncates elements towards zero.