This CL is generated by CL 692219.
Change-Id: I50fa919f1edc5c6505bc6d3238f65b37fc7628b5
Reviewed-on: https://go-review.googlesource.com/c/go/+/692156
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
ssa.OpAMD64VPCMPUQMasked512:
p = simdV2kkImm8(s, v)
- case ssa.OpAMD64VFMADD213PS128,
+ case ssa.OpAMD64VPDPWSSD128,
+ ssa.OpAMD64VPDPWSSD256,
+ ssa.OpAMD64VPDPWSSD512,
+ ssa.OpAMD64VFMADD213PS128,
ssa.OpAMD64VFMADD213PS256,
ssa.OpAMD64VFMADD213PS512,
ssa.OpAMD64VFMADD213PD128,
ssa.OpAMD64VFMSUBADD213PD128,
ssa.OpAMD64VFMSUBADD213PD256,
ssa.OpAMD64VFMSUBADD213PD512,
- ssa.OpAMD64VPDPWSSD128,
- ssa.OpAMD64VPDPWSSD256,
- ssa.OpAMD64VPDPWSSD512,
ssa.OpAMD64VPERMI2B128,
ssa.OpAMD64VPERMI2B256,
ssa.OpAMD64VPERMI2B512,
ssa.OpAMD64VPDPBUSD512:
p = simdV31ResultInArg0(s, v)
- case ssa.OpAMD64VFMADD213PSMasked128,
+ case ssa.OpAMD64VPDPWSSDMasked128,
+ ssa.OpAMD64VPDPWSSDMasked256,
+ ssa.OpAMD64VPDPWSSDMasked512,
+ ssa.OpAMD64VFMADD213PSMasked128,
ssa.OpAMD64VFMADD213PSMasked256,
ssa.OpAMD64VFMADD213PSMasked512,
ssa.OpAMD64VFMADD213PDMasked128,
ssa.OpAMD64VFMSUBADD213PDMasked128,
ssa.OpAMD64VFMSUBADD213PDMasked256,
ssa.OpAMD64VFMSUBADD213PDMasked512,
- ssa.OpAMD64VPDPWSSDMasked128,
- ssa.OpAMD64VPDPWSSDMasked256,
- ssa.OpAMD64VPDPWSSDMasked512,
ssa.OpAMD64VPERMI2BMasked128,
ssa.OpAMD64VPERMI2BMasked256,
ssa.OpAMD64VPERMI2BMasked512,
ssa.OpAMD64VPABSQMasked128,
ssa.OpAMD64VPABSQMasked256,
ssa.OpAMD64VPABSQMasked512,
+ ssa.OpAMD64VPDPWSSDMasked128,
+ ssa.OpAMD64VPDPWSSDMasked256,
+ ssa.OpAMD64VPDPWSSDMasked512,
ssa.OpAMD64VADDPSMasked128,
ssa.OpAMD64VADDPSMasked256,
ssa.OpAMD64VADDPSMasked512,
ssa.OpAMD64VPORQMasked128,
ssa.OpAMD64VPORQMasked256,
ssa.OpAMD64VPORQMasked512,
- ssa.OpAMD64VPDPWSSDMasked128,
- ssa.OpAMD64VPDPWSSDMasked256,
- ssa.OpAMD64VPDPWSSDMasked512,
ssa.OpAMD64VPMADDWDMasked128,
ssa.OpAMD64VPMADDWDMasked256,
ssa.OpAMD64VPMADDWDMasked512,
ssa.OpAMD64VPRORVQMasked128,
ssa.OpAMD64VPRORVQMasked256,
ssa.OpAMD64VPRORVQMasked512,
+ ssa.OpAMD64VPDPWSSDSMasked128,
+ ssa.OpAMD64VPDPWSSDSMasked256,
+ ssa.OpAMD64VPDPWSSDSMasked512,
ssa.OpAMD64VPADDSBMasked128,
ssa.OpAMD64VPADDSBMasked256,
ssa.OpAMD64VPADDSBMasked512,
ssa.OpAMD64VPADDSWMasked128,
ssa.OpAMD64VPADDSWMasked256,
ssa.OpAMD64VPADDSWMasked512,
- ssa.OpAMD64VPDPWSSDSMasked128,
- ssa.OpAMD64VPDPWSSDSMasked256,
- ssa.OpAMD64VPDPWSSDSMasked512,
ssa.OpAMD64VPSUBSBMasked128,
ssa.OpAMD64VPSUBSBMasked256,
ssa.OpAMD64VPSUBSBMasked512,
(AddUint64x2 ...) => (VPADDQ128 ...)
(AddUint64x4 ...) => (VPADDQ256 ...)
(AddUint64x8 ...) => (VPADDQ512 ...)
+(AddDotProdInt32x4 ...) => (VPDPWSSD128 ...)
+(AddDotProdInt32x8 ...) => (VPDPWSSD256 ...)
+(AddDotProdInt32x16 ...) => (VPDPWSSD512 ...)
+(AddDotProdMaskedInt32x4 x y z mask) => (VPDPWSSDMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+(AddDotProdMaskedInt32x8 x y z mask) => (VPDPWSSDMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
+(AddDotProdMaskedInt32x16 x y z mask) => (VPDPWSSDMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
(AddMaskedFloat32x4 x y mask) => (VADDPSMasked128 x y (VPMOVVec32x4ToM <types.TypeMask> mask))
(AddMaskedFloat32x8 x y mask) => (VADDPSMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
(AddMaskedFloat32x16 x y mask) => (VADDPSMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
(PairDotProdInt16x8 ...) => (VPMADDWD128 ...)
(PairDotProdInt16x16 ...) => (VPMADDWD256 ...)
(PairDotProdInt16x32 ...) => (VPMADDWD512 ...)
-(PairDotProdAccumulateInt32x4 ...) => (VPDPWSSD128 ...)
-(PairDotProdAccumulateInt32x8 ...) => (VPDPWSSD256 ...)
-(PairDotProdAccumulateInt32x16 ...) => (VPDPWSSD512 ...)
-(PairDotProdAccumulateMaskedInt32x4 x y z mask) => (VPDPWSSDMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
-(PairDotProdAccumulateMaskedInt32x8 x y z mask) => (VPDPWSSDMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
-(PairDotProdAccumulateMaskedInt32x16 x y z mask) => (VPDPWSSDMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
(PairDotProdMaskedInt16x8 x y mask) => (VPMADDWDMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
(PairDotProdMaskedInt16x16 x y mask) => (VPMADDWDMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
(PairDotProdMaskedInt16x32 x y mask) => (VPMADDWDMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
(SaturatedAddUint16x8 ...) => (VPADDSW128 ...)
(SaturatedAddUint16x16 ...) => (VPADDSW256 ...)
(SaturatedAddUint16x32 ...) => (VPADDSW512 ...)
+(SaturatedAddDotProdInt32x4 ...) => (VPDPWSSDS128 ...)
+(SaturatedAddDotProdInt32x8 ...) => (VPDPWSSDS256 ...)
+(SaturatedAddDotProdInt32x16 ...) => (VPDPWSSDS512 ...)
+(SaturatedAddDotProdMaskedInt32x4 x y z mask) => (VPDPWSSDSMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+(SaturatedAddDotProdMaskedInt32x8 x y z mask) => (VPDPWSSDSMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
+(SaturatedAddDotProdMaskedInt32x16 x y z mask) => (VPDPWSSDSMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
(SaturatedAddMaskedInt8x16 x y mask) => (VPADDSBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
(SaturatedAddMaskedInt8x32 x y mask) => (VPADDSBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
(SaturatedAddMaskedInt8x64 x y mask) => (VPADDSBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
(SaturatedAddMaskedUint16x8 x y mask) => (VPADDSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
(SaturatedAddMaskedUint16x16 x y mask) => (VPADDSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
(SaturatedAddMaskedUint16x32 x y mask) => (VPADDSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
-(SaturatedPairDotProdAccumulateInt32x4 ...) => (VPDPWSSDS128 ...)
-(SaturatedPairDotProdAccumulateInt32x8 ...) => (VPDPWSSDS256 ...)
-(SaturatedPairDotProdAccumulateInt32x16 ...) => (VPDPWSSDS512 ...)
-(SaturatedPairDotProdAccumulateMaskedInt32x4 x y z mask) => (VPDPWSSDSMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
-(SaturatedPairDotProdAccumulateMaskedInt32x8 x y z mask) => (VPDPWSSDSMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
-(SaturatedPairDotProdAccumulateMaskedInt32x16 x y z mask) => (VPDPWSSDSMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
(SaturatedPairwiseAddInt16x8 ...) => (VPHADDSW128 ...)
(SaturatedPairwiseAddInt16x16 ...) => (VPHADDSW256 ...)
(SaturatedPairwiseSubInt16x8 ...) => (VPHSUBSW128 ...)
{name: "AbsoluteMaskedInt64x2", argLength: 2, commutative: false},
{name: "AbsoluteMaskedInt64x4", argLength: 2, commutative: false},
{name: "AbsoluteMaskedInt64x8", argLength: 2, commutative: false},
+ {name: "AddDotProdInt32x4", argLength: 3, commutative: false},
+ {name: "AddDotProdInt32x8", argLength: 3, commutative: false},
+ {name: "AddDotProdInt32x16", argLength: 3, commutative: false},
+ {name: "AddDotProdMaskedInt32x4", argLength: 4, commutative: false},
+ {name: "AddDotProdMaskedInt32x8", argLength: 4, commutative: false},
+ {name: "AddDotProdMaskedInt32x16", argLength: 4, commutative: false},
{name: "AddFloat32x4", argLength: 2, commutative: true},
{name: "AddFloat32x8", argLength: 2, commutative: true},
{name: "AddFloat32x16", argLength: 2, commutative: true},
{name: "OrUint64x2", argLength: 2, commutative: true},
{name: "OrUint64x4", argLength: 2, commutative: true},
{name: "OrUint64x8", argLength: 2, commutative: true},
- {name: "PairDotProdAccumulateInt32x4", argLength: 3, commutative: false},
- {name: "PairDotProdAccumulateInt32x8", argLength: 3, commutative: false},
- {name: "PairDotProdAccumulateInt32x16", argLength: 3, commutative: false},
- {name: "PairDotProdAccumulateMaskedInt32x4", argLength: 4, commutative: false},
- {name: "PairDotProdAccumulateMaskedInt32x8", argLength: 4, commutative: false},
- {name: "PairDotProdAccumulateMaskedInt32x16", argLength: 4, commutative: false},
{name: "PairDotProdInt16x8", argLength: 2, commutative: false},
{name: "PairDotProdInt16x16", argLength: 2, commutative: false},
{name: "PairDotProdInt16x32", argLength: 2, commutative: false},
{name: "RoundFloat32x8", argLength: 1, commutative: false},
{name: "RoundFloat64x2", argLength: 1, commutative: false},
{name: "RoundFloat64x4", argLength: 1, commutative: false},
+ {name: "SaturatedAddDotProdInt32x4", argLength: 3, commutative: false},
+ {name: "SaturatedAddDotProdInt32x8", argLength: 3, commutative: false},
+ {name: "SaturatedAddDotProdInt32x16", argLength: 3, commutative: false},
+ {name: "SaturatedAddDotProdMaskedInt32x4", argLength: 4, commutative: false},
+ {name: "SaturatedAddDotProdMaskedInt32x8", argLength: 4, commutative: false},
+ {name: "SaturatedAddDotProdMaskedInt32x16", argLength: 4, commutative: false},
{name: "SaturatedAddInt8x16", argLength: 2, commutative: true},
{name: "SaturatedAddInt8x32", argLength: 2, commutative: true},
{name: "SaturatedAddInt8x64", argLength: 2, commutative: true},
{name: "SaturatedAddUint16x8", argLength: 2, commutative: true},
{name: "SaturatedAddUint16x16", argLength: 2, commutative: true},
{name: "SaturatedAddUint16x32", argLength: 2, commutative: true},
- {name: "SaturatedPairDotProdAccumulateInt32x4", argLength: 3, commutative: false},
- {name: "SaturatedPairDotProdAccumulateInt32x8", argLength: 3, commutative: false},
- {name: "SaturatedPairDotProdAccumulateInt32x16", argLength: 3, commutative: false},
- {name: "SaturatedPairDotProdAccumulateMaskedInt32x4", argLength: 4, commutative: false},
- {name: "SaturatedPairDotProdAccumulateMaskedInt32x8", argLength: 4, commutative: false},
- {name: "SaturatedPairDotProdAccumulateMaskedInt32x16", argLength: 4, commutative: false},
{name: "SaturatedPairwiseAddInt16x8", argLength: 2, commutative: false},
{name: "SaturatedPairwiseAddInt16x16", argLength: 2, commutative: false},
{name: "SaturatedPairwiseSubInt16x8", argLength: 2, commutative: false},
OpAbsoluteMaskedInt64x2
OpAbsoluteMaskedInt64x4
OpAbsoluteMaskedInt64x8
+ OpAddDotProdInt32x4
+ OpAddDotProdInt32x8
+ OpAddDotProdInt32x16
+ OpAddDotProdMaskedInt32x4
+ OpAddDotProdMaskedInt32x8
+ OpAddDotProdMaskedInt32x16
OpAddFloat32x4
OpAddFloat32x8
OpAddFloat32x16
OpOrUint64x2
OpOrUint64x4
OpOrUint64x8
- OpPairDotProdAccumulateInt32x4
- OpPairDotProdAccumulateInt32x8
- OpPairDotProdAccumulateInt32x16
- OpPairDotProdAccumulateMaskedInt32x4
- OpPairDotProdAccumulateMaskedInt32x8
- OpPairDotProdAccumulateMaskedInt32x16
OpPairDotProdInt16x8
OpPairDotProdInt16x16
OpPairDotProdInt16x32
OpRoundFloat32x8
OpRoundFloat64x2
OpRoundFloat64x4
+ OpSaturatedAddDotProdInt32x4
+ OpSaturatedAddDotProdInt32x8
+ OpSaturatedAddDotProdInt32x16
+ OpSaturatedAddDotProdMaskedInt32x4
+ OpSaturatedAddDotProdMaskedInt32x8
+ OpSaturatedAddDotProdMaskedInt32x16
OpSaturatedAddInt8x16
OpSaturatedAddInt8x32
OpSaturatedAddInt8x64
OpSaturatedAddUint16x8
OpSaturatedAddUint16x16
OpSaturatedAddUint16x32
- OpSaturatedPairDotProdAccumulateInt32x4
- OpSaturatedPairDotProdAccumulateInt32x8
- OpSaturatedPairDotProdAccumulateInt32x16
- OpSaturatedPairDotProdAccumulateMaskedInt32x4
- OpSaturatedPairDotProdAccumulateMaskedInt32x8
- OpSaturatedPairDotProdAccumulateMaskedInt32x16
OpSaturatedPairwiseAddInt16x8
OpSaturatedPairwiseAddInt16x16
OpSaturatedPairwiseSubInt16x8
argLen: 2,
generic: true,
},
+ {
+ name: "AddDotProdInt32x4",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "AddDotProdInt32x8",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "AddDotProdInt32x16",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "AddDotProdMaskedInt32x4",
+ argLen: 4,
+ generic: true,
+ },
+ {
+ name: "AddDotProdMaskedInt32x8",
+ argLen: 4,
+ generic: true,
+ },
+ {
+ name: "AddDotProdMaskedInt32x16",
+ argLen: 4,
+ generic: true,
+ },
{
name: "AddFloat32x4",
argLen: 2,
commutative: true,
generic: true,
},
- {
- name: "PairDotProdAccumulateInt32x4",
- argLen: 3,
- generic: true,
- },
- {
- name: "PairDotProdAccumulateInt32x8",
- argLen: 3,
- generic: true,
- },
- {
- name: "PairDotProdAccumulateInt32x16",
- argLen: 3,
- generic: true,
- },
- {
- name: "PairDotProdAccumulateMaskedInt32x4",
- argLen: 4,
- generic: true,
- },
- {
- name: "PairDotProdAccumulateMaskedInt32x8",
- argLen: 4,
- generic: true,
- },
- {
- name: "PairDotProdAccumulateMaskedInt32x16",
- argLen: 4,
- generic: true,
- },
{
name: "PairDotProdInt16x8",
argLen: 2,
argLen: 1,
generic: true,
},
+ {
+ name: "SaturatedAddDotProdInt32x4",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "SaturatedAddDotProdInt32x8",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "SaturatedAddDotProdInt32x16",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "SaturatedAddDotProdMaskedInt32x4",
+ argLen: 4,
+ generic: true,
+ },
+ {
+ name: "SaturatedAddDotProdMaskedInt32x8",
+ argLen: 4,
+ generic: true,
+ },
+ {
+ name: "SaturatedAddDotProdMaskedInt32x16",
+ argLen: 4,
+ generic: true,
+ },
{
name: "SaturatedAddInt8x16",
argLen: 2,
commutative: true,
generic: true,
},
- {
- name: "SaturatedPairDotProdAccumulateInt32x4",
- argLen: 3,
- generic: true,
- },
- {
- name: "SaturatedPairDotProdAccumulateInt32x8",
- argLen: 3,
- generic: true,
- },
- {
- name: "SaturatedPairDotProdAccumulateInt32x16",
- argLen: 3,
- generic: true,
- },
- {
- name: "SaturatedPairDotProdAccumulateMaskedInt32x4",
- argLen: 4,
- generic: true,
- },
- {
- name: "SaturatedPairDotProdAccumulateMaskedInt32x8",
- argLen: 4,
- generic: true,
- },
- {
- name: "SaturatedPairDotProdAccumulateMaskedInt32x16",
- argLen: 4,
- generic: true,
- },
{
name: "SaturatedPairwiseAddInt16x8",
argLen: 2,
case OpAdd8:
v.Op = OpAMD64ADDL
return true
+ case OpAddDotProdInt32x16:
+ v.Op = OpAMD64VPDPWSSD512
+ return true
+ case OpAddDotProdInt32x4:
+ v.Op = OpAMD64VPDPWSSD128
+ return true
+ case OpAddDotProdInt32x8:
+ v.Op = OpAMD64VPDPWSSD256
+ return true
+ case OpAddDotProdMaskedInt32x16:
+ return rewriteValueAMD64_OpAddDotProdMaskedInt32x16(v)
+ case OpAddDotProdMaskedInt32x4:
+ return rewriteValueAMD64_OpAddDotProdMaskedInt32x4(v)
+ case OpAddDotProdMaskedInt32x8:
+ return rewriteValueAMD64_OpAddDotProdMaskedInt32x8(v)
case OpAddFloat32x16:
v.Op = OpAMD64VADDPS512
return true
case OpOrUint8x32:
v.Op = OpAMD64VPOR256
return true
- case OpPairDotProdAccumulateInt32x16:
- v.Op = OpAMD64VPDPWSSD512
- return true
- case OpPairDotProdAccumulateInt32x4:
- v.Op = OpAMD64VPDPWSSD128
- return true
- case OpPairDotProdAccumulateInt32x8:
- v.Op = OpAMD64VPDPWSSD256
- return true
- case OpPairDotProdAccumulateMaskedInt32x16:
- return rewriteValueAMD64_OpPairDotProdAccumulateMaskedInt32x16(v)
- case OpPairDotProdAccumulateMaskedInt32x4:
- return rewriteValueAMD64_OpPairDotProdAccumulateMaskedInt32x4(v)
- case OpPairDotProdAccumulateMaskedInt32x8:
- return rewriteValueAMD64_OpPairDotProdAccumulateMaskedInt32x8(v)
case OpPairDotProdInt16x16:
v.Op = OpAMD64VPMADDWD256
return true
return rewriteValueAMD64_OpRsh8x64(v)
case OpRsh8x8:
return rewriteValueAMD64_OpRsh8x8(v)
+ case OpSaturatedAddDotProdInt32x16:
+ v.Op = OpAMD64VPDPWSSDS512
+ return true
+ case OpSaturatedAddDotProdInt32x4:
+ v.Op = OpAMD64VPDPWSSDS128
+ return true
+ case OpSaturatedAddDotProdInt32x8:
+ v.Op = OpAMD64VPDPWSSDS256
+ return true
+ case OpSaturatedAddDotProdMaskedInt32x16:
+ return rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x16(v)
+ case OpSaturatedAddDotProdMaskedInt32x4:
+ return rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x4(v)
+ case OpSaturatedAddDotProdMaskedInt32x8:
+ return rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x8(v)
case OpSaturatedAddInt16x16:
v.Op = OpAMD64VPADDSW256
return true
case OpSaturatedAddUint8x64:
v.Op = OpAMD64VPADDSB512
return true
- case OpSaturatedPairDotProdAccumulateInt32x16:
- v.Op = OpAMD64VPDPWSSDS512
- return true
- case OpSaturatedPairDotProdAccumulateInt32x4:
- v.Op = OpAMD64VPDPWSSDS128
- return true
- case OpSaturatedPairDotProdAccumulateInt32x8:
- v.Op = OpAMD64VPDPWSSDS256
- return true
- case OpSaturatedPairDotProdAccumulateMaskedInt32x16:
- return rewriteValueAMD64_OpSaturatedPairDotProdAccumulateMaskedInt32x16(v)
- case OpSaturatedPairDotProdAccumulateMaskedInt32x4:
- return rewriteValueAMD64_OpSaturatedPairDotProdAccumulateMaskedInt32x4(v)
- case OpSaturatedPairDotProdAccumulateMaskedInt32x8:
- return rewriteValueAMD64_OpSaturatedPairDotProdAccumulateMaskedInt32x8(v)
case OpSaturatedPairwiseAddInt16x16:
v.Op = OpAMD64VPHADDSW256
return true
return true
}
}
+func rewriteValueAMD64_OpAddDotProdMaskedInt32x16(v *Value) bool {
+ v_3 := v.Args[3]
+ v_2 := v.Args[2]
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (AddDotProdMaskedInt32x16 x y z mask)
+ // result: (VPDPWSSDMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ y := v_1
+ z := v_2
+ mask := v_3
+ v.reset(OpAMD64VPDPWSSDMasked512)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg4(x, y, z, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_OpAddDotProdMaskedInt32x4(v *Value) bool {
+ v_3 := v.Args[3]
+ v_2 := v.Args[2]
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (AddDotProdMaskedInt32x4 x y z mask)
+ // result: (VPDPWSSDMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ y := v_1
+ z := v_2
+ mask := v_3
+ v.reset(OpAMD64VPDPWSSDMasked128)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg4(x, y, z, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_OpAddDotProdMaskedInt32x8(v *Value) bool {
+ v_3 := v.Args[3]
+ v_2 := v.Args[2]
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (AddDotProdMaskedInt32x8 x y z mask)
+ // result: (VPDPWSSDMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ y := v_1
+ z := v_2
+ mask := v_3
+ v.reset(OpAMD64VPDPWSSDMasked256)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg4(x, y, z, v0)
+ return true
+ }
+}
func rewriteValueAMD64_OpAddMaskedFloat32x16(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
return true
}
}
-func rewriteValueAMD64_OpPairDotProdAccumulateMaskedInt32x16(v *Value) bool {
- v_3 := v.Args[3]
- v_2 := v.Args[2]
- v_1 := v.Args[1]
- v_0 := v.Args[0]
- b := v.Block
- // match: (PairDotProdAccumulateMaskedInt32x16 x y z mask)
- // result: (VPDPWSSDMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
- for {
- x := v_0
- y := v_1
- z := v_2
- mask := v_3
- v.reset(OpAMD64VPDPWSSDMasked512)
- v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
- v0.AddArg(mask)
- v.AddArg4(x, y, z, v0)
- return true
- }
-}
-func rewriteValueAMD64_OpPairDotProdAccumulateMaskedInt32x4(v *Value) bool {
- v_3 := v.Args[3]
- v_2 := v.Args[2]
- v_1 := v.Args[1]
- v_0 := v.Args[0]
- b := v.Block
- // match: (PairDotProdAccumulateMaskedInt32x4 x y z mask)
- // result: (VPDPWSSDMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
- for {
- x := v_0
- y := v_1
- z := v_2
- mask := v_3
- v.reset(OpAMD64VPDPWSSDMasked128)
- v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
- v0.AddArg(mask)
- v.AddArg4(x, y, z, v0)
- return true
- }
-}
-func rewriteValueAMD64_OpPairDotProdAccumulateMaskedInt32x8(v *Value) bool {
- v_3 := v.Args[3]
- v_2 := v.Args[2]
- v_1 := v.Args[1]
- v_0 := v.Args[0]
- b := v.Block
- // match: (PairDotProdAccumulateMaskedInt32x8 x y z mask)
- // result: (VPDPWSSDMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
- for {
- x := v_0
- y := v_1
- z := v_2
- mask := v_3
- v.reset(OpAMD64VPDPWSSDMasked256)
- v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
- v0.AddArg(mask)
- v.AddArg4(x, y, z, v0)
- return true
- }
-}
func rewriteValueAMD64_OpPairDotProdMaskedInt16x16(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
}
return false
}
+func rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x16(v *Value) bool {
+ v_3 := v.Args[3]
+ v_2 := v.Args[2]
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (SaturatedAddDotProdMaskedInt32x16 x y z mask)
+ // result: (VPDPWSSDSMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ y := v_1
+ z := v_2
+ mask := v_3
+ v.reset(OpAMD64VPDPWSSDSMasked512)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg4(x, y, z, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x4(v *Value) bool {
+ v_3 := v.Args[3]
+ v_2 := v.Args[2]
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (SaturatedAddDotProdMaskedInt32x4 x y z mask)
+ // result: (VPDPWSSDSMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ y := v_1
+ z := v_2
+ mask := v_3
+ v.reset(OpAMD64VPDPWSSDSMasked128)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg4(x, y, z, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x8(v *Value) bool {
+ v_3 := v.Args[3]
+ v_2 := v.Args[2]
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (SaturatedAddDotProdMaskedInt32x8 x y z mask)
+ // result: (VPDPWSSDSMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ y := v_1
+ z := v_2
+ mask := v_3
+ v.reset(OpAMD64VPDPWSSDSMasked256)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg4(x, y, z, v0)
+ return true
+ }
+}
func rewriteValueAMD64_OpSaturatedAddMaskedInt16x16(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
return true
}
}
-func rewriteValueAMD64_OpSaturatedPairDotProdAccumulateMaskedInt32x16(v *Value) bool {
- v_3 := v.Args[3]
- v_2 := v.Args[2]
- v_1 := v.Args[1]
- v_0 := v.Args[0]
- b := v.Block
- // match: (SaturatedPairDotProdAccumulateMaskedInt32x16 x y z mask)
- // result: (VPDPWSSDSMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
- for {
- x := v_0
- y := v_1
- z := v_2
- mask := v_3
- v.reset(OpAMD64VPDPWSSDSMasked512)
- v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
- v0.AddArg(mask)
- v.AddArg4(x, y, z, v0)
- return true
- }
-}
-func rewriteValueAMD64_OpSaturatedPairDotProdAccumulateMaskedInt32x4(v *Value) bool {
- v_3 := v.Args[3]
- v_2 := v.Args[2]
- v_1 := v.Args[1]
- v_0 := v.Args[0]
- b := v.Block
- // match: (SaturatedPairDotProdAccumulateMaskedInt32x4 x y z mask)
- // result: (VPDPWSSDSMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
- for {
- x := v_0
- y := v_1
- z := v_2
- mask := v_3
- v.reset(OpAMD64VPDPWSSDSMasked128)
- v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
- v0.AddArg(mask)
- v.AddArg4(x, y, z, v0)
- return true
- }
-}
-func rewriteValueAMD64_OpSaturatedPairDotProdAccumulateMaskedInt32x8(v *Value) bool {
- v_3 := v.Args[3]
- v_2 := v.Args[2]
- v_1 := v.Args[1]
- v_0 := v.Args[0]
- b := v.Block
- // match: (SaturatedPairDotProdAccumulateMaskedInt32x8 x y z mask)
- // result: (VPDPWSSDSMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
- for {
- x := v_0
- y := v_1
- z := v_2
- mask := v_3
- v.reset(OpAMD64VPDPWSSDSMasked256)
- v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
- v0.AddArg(mask)
- v.AddArg4(x, y, z, v0)
- return true
- }
-}
func rewriteValueAMD64_OpSaturatedSubMaskedInt16x16(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
addF(simdPackage, "Uint64x2.Add", opLen2(ssa.OpAddUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.Add", opLen2(ssa.OpAddUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Add", opLen2(ssa.OpAddUint64x8, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int32x4.AddDotProd", opLen3(ssa.OpAddDotProdInt32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int32x8.AddDotProd", opLen3(ssa.OpAddDotProdInt32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int32x16.AddDotProd", opLen3(ssa.OpAddDotProdInt32x16, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int32x4.AddDotProdMasked", opLen4(ssa.OpAddDotProdMaskedInt32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int32x8.AddDotProdMasked", opLen4(ssa.OpAddDotProdMaskedInt32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int32x16.AddDotProdMasked", opLen4(ssa.OpAddDotProdMaskedInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.AddMasked", opLen3(ssa.OpAddMaskedFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.AddMasked", opLen3(ssa.OpAddMaskedFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x16.AddMasked", opLen3(ssa.OpAddMaskedFloat32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x8.PairDotProd", opLen2(ssa.OpPairDotProdInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.PairDotProd", opLen2(ssa.OpPairDotProdInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.PairDotProd", opLen2(ssa.OpPairDotProdInt16x32, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int16x8.PairDotProdAccumulate", opLen3_31(ssa.OpPairDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int16x16.PairDotProdAccumulate", opLen3_31(ssa.OpPairDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int16x32.PairDotProdAccumulate", opLen3_31(ssa.OpPairDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int16x8.PairDotProdAccumulateMasked", opLen4_31(ssa.OpPairDotProdAccumulateMaskedInt32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int16x16.PairDotProdAccumulateMasked", opLen4_31(ssa.OpPairDotProdAccumulateMaskedInt32x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int16x32.PairDotProdAccumulateMasked", opLen4_31(ssa.OpPairDotProdAccumulateMaskedInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x8.PairDotProdMasked", opLen3(ssa.OpPairDotProdMaskedInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.PairDotProdMasked", opLen3(ssa.OpPairDotProdMaskedInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.PairDotProdMasked", opLen3(ssa.OpPairDotProdMaskedInt16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint16x8.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint16x16.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x32.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint16x32, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int32x4.SaturatedAddDotProd", opLen3(ssa.OpSaturatedAddDotProdInt32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int32x8.SaturatedAddDotProd", opLen3(ssa.OpSaturatedAddDotProdInt32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int32x16.SaturatedAddDotProd", opLen3(ssa.OpSaturatedAddDotProdInt32x16, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int32x4.SaturatedAddDotProdMasked", opLen4(ssa.OpSaturatedAddDotProdMaskedInt32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int32x8.SaturatedAddDotProdMasked", opLen4(ssa.OpSaturatedAddDotProdMaskedInt32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int32x16.SaturatedAddDotProdMasked", opLen4(ssa.OpSaturatedAddDotProdMaskedInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int8x16.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint16x8.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint16x16.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x32.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint16x32, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int16x8.SaturatedPairDotProdAccumulate", opLen3_31(ssa.OpSaturatedPairDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int16x16.SaturatedPairDotProdAccumulate", opLen3_31(ssa.OpSaturatedPairDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int16x32.SaturatedPairDotProdAccumulate", opLen3_31(ssa.OpSaturatedPairDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int16x8.SaturatedPairDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedPairDotProdAccumulateMaskedInt32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int16x16.SaturatedPairDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedPairDotProdAccumulateMaskedInt32x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int16x32.SaturatedPairDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedPairDotProdAccumulateMaskedInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x8.SaturatedPairwiseAdd", opLen2(ssa.OpSaturatedPairwiseAddInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.SaturatedPairwiseAdd", opLen2(ssa.OpSaturatedPairwiseAddInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x8.SaturatedPairwiseSub", opLen2(ssa.OpSaturatedPairwiseSubInt16x8, types.TypeVec128), sys.AMD64)
// Asm: VPADDQ, CPU Feature: AVX512F
func (x Uint64x8) Add(y Uint64x8) Uint64x8
+/* AddDotProd */
+
+// AddDotProd performs dot products on pairs of elements of y and z and then adds x.
+//
+// Asm: VPDPWSSD, CPU Feature: AVXVNNI
+func (x Int32x4) AddDotProd(y Int16x8, z Int16x8) Int32x4
+
+// AddDotProd performs dot products on pairs of elements of y and z and then adds x.
+//
+// Asm: VPDPWSSD, CPU Feature: AVXVNNI
+func (x Int32x8) AddDotProd(y Int16x16, z Int16x16) Int32x8
+
+// AddDotProd performs dot products on pairs of elements of y and z and then adds x.
+//
+// Asm: VPDPWSSD, CPU Feature: AVX512VNNI
+func (x Int32x16) AddDotProd(y Int16x32, z Int16x32) Int32x16
+
+/* AddDotProdMasked */
+
+// AddDotProdMasked performs dot products on pairs of elements of y and z and then adds x.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPDPWSSD, CPU Feature: AVX512VNNI
+func (x Int32x4) AddDotProdMasked(y Int16x8, z Int16x8, mask Mask32x4) Int32x4
+
+// AddDotProdMasked performs dot products on pairs of elements of y and z and then adds x.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPDPWSSD, CPU Feature: AVX512VNNI
+func (x Int32x8) AddDotProdMasked(y Int16x16, z Int16x16, mask Mask32x8) Int32x8
+
+// AddDotProdMasked performs dot products on pairs of elements of y and z and then adds x.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPDPWSSD, CPU Feature: AVX512VNNI
+func (x Int32x16) AddDotProdMasked(y Int16x32, z Int16x32, mask Mask32x16) Int32x16
+
/* AddMasked */
// AddMasked adds corresponding elements of two vectors.
// Asm: VPMADDWD, CPU Feature: AVX512BW
func (x Int16x32) PairDotProd(y Int16x32) Int32x16
-/* PairDotProdAccumulate */
-
-// PairDotProdAccumulate performs dot products on pairs of elements of x and y and then adds z.
-//
-// Asm: VPDPWSSD, CPU Feature: AVXVNNI
-func (x Int16x8) PairDotProdAccumulate(y Int16x8, z Int32x4) Int32x4
-
-// PairDotProdAccumulate performs dot products on pairs of elements of x and y and then adds z.
-//
-// Asm: VPDPWSSD, CPU Feature: AVXVNNI
-func (x Int16x16) PairDotProdAccumulate(y Int16x16, z Int32x8) Int32x8
-
-// PairDotProdAccumulate performs dot products on pairs of elements of x and y and then adds z.
-//
-// Asm: VPDPWSSD, CPU Feature: AVX512VNNI
-func (x Int16x32) PairDotProdAccumulate(y Int16x32, z Int32x16) Int32x16
-
-/* PairDotProdAccumulateMasked */
-
-// PairDotProdAccumulateMasked performs dot products on pairs of elements of x and y and then adds z.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPDPWSSD, CPU Feature: AVX512VNNI
-func (x Int16x8) PairDotProdAccumulateMasked(y Int16x8, z Int32x4, mask Mask32x4) Int32x4
-
-// PairDotProdAccumulateMasked performs dot products on pairs of elements of x and y and then adds z.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPDPWSSD, CPU Feature: AVX512VNNI
-func (x Int16x16) PairDotProdAccumulateMasked(y Int16x16, z Int32x8, mask Mask32x8) Int32x8
-
-// PairDotProdAccumulateMasked performs dot products on pairs of elements of x and y and then adds z.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPDPWSSD, CPU Feature: AVX512VNNI
-func (x Int16x32) PairDotProdAccumulateMasked(y Int16x32, z Int32x16, mask Mask32x16) Int32x16
-
/* PairDotProdMasked */
// PairDotProdMasked multiplies the elements and add the pairs together,
// Asm: VPADDSW, CPU Feature: AVX512BW
func (x Uint16x32) SaturatedAdd(y Uint16x32) Uint16x32
+/* SaturatedAddDotProd */
+
+// SaturatedAddDotProd performs dot products on pairs of elements of y and z and then adds x with signed saturation.
+//
+// Asm: VPDPWSSDS, CPU Feature: AVXVNNI
+func (x Int32x4) SaturatedAddDotProd(y Int16x8, z Int16x8) Int32x4
+
+// SaturatedAddDotProd performs dot products on pairs of elements of y and z and then adds x with signed saturation.
+//
+// Asm: VPDPWSSDS, CPU Feature: AVXVNNI
+func (x Int32x8) SaturatedAddDotProd(y Int16x16, z Int16x16) Int32x8
+
+// SaturatedAddDotProd performs dot products on pairs of elements of y and z and then adds x with signed saturation.
+//
+// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI
+func (x Int32x16) SaturatedAddDotProd(y Int16x32, z Int16x32) Int32x16
+
+/* SaturatedAddDotProdMasked */
+
+// SaturatedAddDotProdMasked performs dot products on pairs of elements of y and z and then adds x with signed saturation.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI
+func (x Int32x4) SaturatedAddDotProdMasked(y Int16x8, z Int16x8, mask Mask32x4) Int32x4
+
+// SaturatedAddDotProdMasked performs dot products on pairs of elements of y and z and then adds x with signed saturation.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI
+func (x Int32x8) SaturatedAddDotProdMasked(y Int16x16, z Int16x16, mask Mask32x8) Int32x8
+
+// SaturatedAddDotProdMasked performs dot products on pairs of elements of y and z and then adds x with signed saturation.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI
+func (x Int32x16) SaturatedAddDotProdMasked(y Int16x32, z Int16x32, mask Mask32x16) Int32x16
+
/* SaturatedAddMasked */
// SaturatedAddMasked adds corresponding elements of two vectors with saturation.
// Asm: VPADDSW, CPU Feature: AVX512BW
func (x Uint16x32) SaturatedAddMasked(y Uint16x32, mask Mask16x32) Uint16x32
-/* SaturatedPairDotProdAccumulate */
-
-// SaturatedPairDotProdAccumulate performs dot products on pairs of elements of x and y and then adds z.
-//
-// Asm: VPDPWSSDS, CPU Feature: AVXVNNI
-func (x Int16x8) SaturatedPairDotProdAccumulate(y Int16x8, z Int32x4) Int32x4
-
-// SaturatedPairDotProdAccumulate performs dot products on pairs of elements of x and y and then adds z.
-//
-// Asm: VPDPWSSDS, CPU Feature: AVXVNNI
-func (x Int16x16) SaturatedPairDotProdAccumulate(y Int16x16, z Int32x8) Int32x8
-
-// SaturatedPairDotProdAccumulate performs dot products on pairs of elements of x and y and then adds z.
-//
-// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI
-func (x Int16x32) SaturatedPairDotProdAccumulate(y Int16x32, z Int32x16) Int32x16
-
-/* SaturatedPairDotProdAccumulateMasked */
-
-// SaturatedPairDotProdAccumulateMasked performs dot products on pairs of elements of x and y and then adds z.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI
-func (x Int16x8) SaturatedPairDotProdAccumulateMasked(y Int16x8, z Int32x4, mask Mask32x4) Int32x4
-
-// SaturatedPairDotProdAccumulateMasked performs dot products on pairs of elements of x and y and then adds z.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI
-func (x Int16x16) SaturatedPairDotProdAccumulateMasked(y Int16x16, z Int32x8, mask Mask32x8) Int32x8
-
-// SaturatedPairDotProdAccumulateMasked performs dot products on pairs of elements of x and y and then adds z.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI
-func (x Int16x32) SaturatedPairDotProdAccumulateMasked(y Int16x32, z Int32x16, mask Mask32x16) Int32x16
-
/* SaturatedPairwiseAdd */
// SaturatedPairwiseAdd horizontally adds adjacent pairs of elements with saturation.
z := simd.LoadInt32x4Slice([]int32{3, 3, 3, 3})
want := []int32{11, 11, 11, 11}
got := make([]int32, 4)
- z = x.PairDotProdAccumulate(x, z)
+ z = z.AddDotProd(x, x)
z.StoreSlice(got)
for i := range 4 {
if got[i] != want[i] {