From 6f7a1164e797f694c535ebf5f2c9722845a732cd Mon Sep 17 00:00:00 2001 From: Junyang Shao Date: Wed, 23 Jul 2025 07:37:14 +0000 Subject: [PATCH] [dev.simd] cmd/compile, simd: support store to bits for mask This CL is partially generated by CL 689775. Change-Id: I0c36fd2a44706c88db1a1d5ea4a6d0b9f891d85f Reviewed-on: https://go-review.googlesource.com/c/go/+/689795 Reviewed-by: David Chase LUCI-TryBot-Result: Go LUCI --- src/cmd/compile/internal/amd64/simdssa.go | 32 +- src/cmd/compile/internal/amd64/ssa.go | 4 +- src/cmd/compile/internal/ssa/_gen/AMD64.rules | 16 + src/cmd/compile/internal/ssa/_gen/AMD64Ops.go | 4 +- .../compile/internal/ssa/_gen/genericOps.go | 13 + .../compile/internal/ssa/_gen/simdAMD64.rules | 28 +- .../compile/internal/ssa/_gen/simdAMD64ops.go | 44 +- .../internal/ssa/_gen/simdgenericOps.go | 34 +- src/cmd/compile/internal/ssa/opGen.go | 635 ++++++++++++------ src/cmd/compile/internal/ssa/rewriteAMD64.go | 316 +++++++-- src/cmd/compile/internal/ssagen/intrinsics.go | 17 + .../compile/internal/ssagen/simdintrinsics.go | 36 +- src/simd/ops_amd64.go | 226 +++---- src/simd/simd_test.go | 18 +- src/simd/types_amd64.go | 144 +++- 15 files changed, 1118 insertions(+), 449 deletions(-) diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index 67179ef12d..f374cd25d0 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -24,8 +24,8 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPABSQ128, ssa.OpAMD64VPABSQ256, ssa.OpAMD64VPABSQ512, - ssa.OpAMD64VRCP14PS128, - ssa.OpAMD64VRCP14PS256, + ssa.OpAMD64VRCPPS128, + ssa.OpAMD64VRCPPS256, ssa.OpAMD64VRCP14PS512, ssa.OpAMD64VRCP14PD128, ssa.OpAMD64VRCP14PD256, @@ -335,6 +335,16 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPXORQ512: p = simdV21(s, v) + case ssa.OpAMD64VPCMPEQB512, + ssa.OpAMD64VPCMPEQW512, + ssa.OpAMD64VPCMPEQD512, + ssa.OpAMD64VPCMPEQQ512, + ssa.OpAMD64VPCMPGTB512, + ssa.OpAMD64VPCMPGTW512, + ssa.OpAMD64VPCMPGTD512, + ssa.OpAMD64VPCMPGTQ512: + p = simdV2k(s, v) + case ssa.OpAMD64VADDPSMasked128, ssa.OpAMD64VADDPSMasked256, ssa.OpAMD64VADDPSMasked512, @@ -733,30 +743,30 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { case ssa.OpAMD64VCMPPS512, ssa.OpAMD64VCMPPD512, - ssa.OpAMD64VPCMPB512, - ssa.OpAMD64VPCMPW512, - ssa.OpAMD64VPCMPD512, - ssa.OpAMD64VPCMPQ512, - ssa.OpAMD64VPCMPUB512, - ssa.OpAMD64VPCMPUW512, - ssa.OpAMD64VPCMPUD512, - ssa.OpAMD64VPCMPUQ512, ssa.OpAMD64VPCMPUB128, ssa.OpAMD64VPCMPUB256, + ssa.OpAMD64VPCMPUB512, ssa.OpAMD64VPCMPUW128, ssa.OpAMD64VPCMPUW256, + ssa.OpAMD64VPCMPUW512, ssa.OpAMD64VPCMPUD128, ssa.OpAMD64VPCMPUD256, + ssa.OpAMD64VPCMPUD512, ssa.OpAMD64VPCMPUQ128, ssa.OpAMD64VPCMPUQ256, + ssa.OpAMD64VPCMPUQ512, ssa.OpAMD64VPCMPB128, ssa.OpAMD64VPCMPB256, + ssa.OpAMD64VPCMPB512, ssa.OpAMD64VPCMPW128, ssa.OpAMD64VPCMPW256, + ssa.OpAMD64VPCMPW512, ssa.OpAMD64VPCMPD128, ssa.OpAMD64VPCMPD256, + ssa.OpAMD64VPCMPD512, ssa.OpAMD64VPCMPQ128, - ssa.OpAMD64VPCMPQ256: + ssa.OpAMD64VPCMPQ256, + ssa.OpAMD64VPCMPQ512: p = simdV2kImm8(s, v) case ssa.OpAMD64VCMPPSMasked128, diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go index 0fafd69f54..7338c16cda 100644 --- a/src/cmd/compile/internal/amd64/ssa.go +++ b/src/cmd/compile/internal/amd64/ssa.go @@ -1468,10 +1468,10 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { ssagen.AddAux(&p.From, v) p.To.Type = obj.TYPE_REG p.To.Reg = simdOrMaskReg(v) - case 
ssa.OpAMD64VMOVDQUstore128, ssa.OpAMD64VMOVDQUstore256, ssa.OpAMD64VMOVDQUstore512: + case ssa.OpAMD64VMOVDQUstore128, ssa.OpAMD64VMOVDQUstore256, ssa.OpAMD64VMOVDQUstore512, ssa.OpAMD64KMOVQstore: p := s.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_REG - p.From.Reg = simdReg(v.Args[1]) + p.From.Reg = simdOrMaskReg(v.Args[1]) p.To.Type = obj.TYPE_MEM p.To.Reg = v.Args[0].Reg() ssagen.AddAux(&p.To, v) diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64.rules b/src/cmd/compile/internal/ssa/_gen/AMD64.rules index bb7513795d..5a21c95df9 100644 --- a/src/cmd/compile/internal/ssa/_gen/AMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules @@ -1698,6 +1698,22 @@ (LoadMask64x4 ptr mem) => (VPMOVMToVec64x4 (KMOVQload ptr mem)) (LoadMask64x8 ptr mem) => (VPMOVMToVec64x8 (KMOVQload ptr mem)) +(StoreMask8x16 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec8x16ToM val) mem) +(StoreMask8x32 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec8x32ToM val) mem) +(StoreMask8x64 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec8x64ToM val) mem) + +(StoreMask16x8 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec16x8ToM val) mem) +(StoreMask16x16 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec16x16ToM val) mem) +(StoreMask16x32 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec16x32ToM val) mem) + +(StoreMask32x4 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec32x4ToM val) mem) +(StoreMask32x8 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec32x8ToM val) mem) +(StoreMask32x16 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec32x16ToM val) mem) + +(StoreMask64x2 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec64x2ToM val) mem) +(StoreMask64x4 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec64x4ToM val) mem) +(StoreMask64x8 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec64x8ToM val) mem) + (Load ptr mem) && t.Size() == 16 => (VMOVDQUload128 ptr mem) (Store {t} ptr val mem) && t.Size() == 16 => (VMOVDQUstore128 ptr val mem) diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go index ec335f67f8..cd4b5b2a06 100644 --- a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go +++ b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go @@ -234,7 +234,8 @@ func init() { wfpw = regInfo{inputs: []regMask{w, fp}, outputs: wonly} wfpkw = regInfo{inputs: []regMask{w, fp, mask}, outputs: wonly} - kload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: maskonly} + kload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: maskonly} + kstore = regInfo{inputs: []regMask{gpspsb, mask, 0}} prefreg = regInfo{inputs: []regMask{gpspsbg}} ) @@ -1318,6 +1319,7 @@ func init() { {name: "VZEROALL", argLength: 0, asm: "VZEROALL"}, {name: "KMOVQload", argLength: 2, reg: kload, asm: "KMOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, + {name: "KMOVQstore", argLength: 3, reg: kstore, asm: "KMOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, } var AMD64blocks = []blockData{ diff --git a/src/cmd/compile/internal/ssa/_gen/genericOps.go b/src/cmd/compile/internal/ssa/_gen/genericOps.go index 6257396a6f..716fe9b881 100644 --- a/src/cmd/compile/internal/ssa/_gen/genericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/genericOps.go @@ -678,6 +678,19 @@ var genericOps = []opData{ {name: "LoadMask64x2", argLength: 2}, // arg0 = ptr, arg1 = mem {name: "LoadMask64x4", argLength: 2}, // arg0 = ptr, arg1 = mem {name: "LoadMask64x8", argLength: 2}, // arg0 = ptr, arg1 = mem + + {name: "StoreMask8x16", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. 
+ {name: "StoreMask8x32", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. + {name: "StoreMask8x64", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. + {name: "StoreMask16x8", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. + {name: "StoreMask16x16", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. + {name: "StoreMask16x32", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. + {name: "StoreMask32x4", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. + {name: "StoreMask32x8", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. + {name: "StoreMask32x16", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. + {name: "StoreMask64x2", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. + {name: "StoreMask64x4", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. + {name: "StoreMask64x8", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. } // kind controls successors implicit exit diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index e5f17bdb1b..fb153acf66 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -152,8 +152,8 @@ (AndNotMaskedUint64x2 x y mask) => (VPANDNQMasked128 x y (VPMOVVec64x2ToM mask)) (AndNotMaskedUint64x4 x y mask) => (VPANDNQMasked256 x y (VPMOVVec64x4ToM mask)) (AndNotMaskedUint64x8 x y mask) => (VPANDNQMasked512 x y (VPMOVVec64x8ToM mask)) -(ApproximateReciprocalFloat32x4 ...) => (VRCP14PS128 ...) -(ApproximateReciprocalFloat32x8 ...) => (VRCP14PS256 ...) +(ApproximateReciprocalFloat32x4 ...) => (VRCPPS128 ...) +(ApproximateReciprocalFloat32x8 ...) => (VRCPPS256 ...) (ApproximateReciprocalFloat32x16 ...) => (VRCP14PS512 ...) (ApproximateReciprocalFloat64x2 ...) => (VRCP14PD128 ...) (ApproximateReciprocalFloat64x4 ...) => (VRCP14PD256 ...) @@ -305,28 +305,28 @@ (EqualFloat64x8 x y) => (VPMOVMToVec64x8 (VCMPPD512 [0] x y)) (EqualInt8x16 ...) => (VPCMPEQB128 ...) (EqualInt8x32 ...) => (VPCMPEQB256 ...) -(EqualInt8x64 x y) => (VPMOVMToVec8x64 (VPCMPB512 [0] x y)) +(EqualInt8x64 x y) => (VPMOVMToVec8x64 (VPCMPEQB512 x y)) (EqualInt16x8 ...) => (VPCMPEQW128 ...) (EqualInt16x16 ...) => (VPCMPEQW256 ...) -(EqualInt16x32 x y) => (VPMOVMToVec16x32 (VPCMPW512 [0] x y)) +(EqualInt16x32 x y) => (VPMOVMToVec16x32 (VPCMPEQW512 x y)) (EqualInt32x4 ...) => (VPCMPEQD128 ...) (EqualInt32x8 ...) => (VPCMPEQD256 ...) -(EqualInt32x16 x y) => (VPMOVMToVec32x16 (VPCMPD512 [0] x y)) +(EqualInt32x16 x y) => (VPMOVMToVec32x16 (VPCMPEQD512 x y)) (EqualInt64x2 ...) => (VPCMPEQQ128 ...) (EqualInt64x4 ...) => (VPCMPEQQ256 ...) -(EqualInt64x8 x y) => (VPMOVMToVec64x8 (VPCMPQ512 [0] x y)) +(EqualInt64x8 x y) => (VPMOVMToVec64x8 (VPCMPEQQ512 x y)) (EqualUint8x16 ...) => (VPCMPEQB128 ...) (EqualUint8x32 ...) => (VPCMPEQB256 ...) -(EqualUint8x64 x y) => (VPMOVMToVec8x64 (VPCMPUB512 [0] x y)) +(EqualUint8x64 x y) => (VPMOVMToVec8x64 (VPCMPEQB512 x y)) (EqualUint16x8 ...) => (VPCMPEQW128 ...) (EqualUint16x16 ...) 
=> (VPCMPEQW256 ...) -(EqualUint16x32 x y) => (VPMOVMToVec16x32 (VPCMPUW512 [0] x y)) +(EqualUint16x32 x y) => (VPMOVMToVec16x32 (VPCMPEQW512 x y)) (EqualUint32x4 ...) => (VPCMPEQD128 ...) (EqualUint32x8 ...) => (VPCMPEQD256 ...) -(EqualUint32x16 x y) => (VPMOVMToVec32x16 (VPCMPUD512 [0] x y)) +(EqualUint32x16 x y) => (VPMOVMToVec32x16 (VPCMPEQD512 x y)) (EqualUint64x2 ...) => (VPCMPEQQ128 ...) (EqualUint64x4 ...) => (VPCMPEQQ256 ...) -(EqualUint64x8 x y) => (VPMOVMToVec64x8 (VPCMPUQ512 [0] x y)) +(EqualUint64x8 x y) => (VPMOVMToVec64x8 (VPCMPEQQ512 x y)) (EqualMaskedFloat32x4 x y mask) => (VPMOVMToVec32x4 (VCMPPSMasked128 [0] x y (VPMOVVec32x4ToM mask))) (EqualMaskedFloat32x8 x y mask) => (VPMOVMToVec32x8 (VCMPPSMasked256 [0] x y (VPMOVVec32x8ToM mask))) (EqualMaskedFloat32x16 x y mask) => (VPMOVMToVec32x16 (VCMPPSMasked512 [0] x y (VPMOVVec32x16ToM mask))) @@ -453,16 +453,16 @@ (GreaterFloat64x8 x y) => (VPMOVMToVec64x8 (VCMPPD512 [14] x y)) (GreaterInt8x16 ...) => (VPCMPGTB128 ...) (GreaterInt8x32 ...) => (VPCMPGTB256 ...) -(GreaterInt8x64 x y) => (VPMOVMToVec8x64 (VPCMPB512 [14] x y)) +(GreaterInt8x64 x y) => (VPMOVMToVec8x64 (VPCMPGTB512 x y)) (GreaterInt16x8 ...) => (VPCMPGTW128 ...) (GreaterInt16x16 ...) => (VPCMPGTW256 ...) -(GreaterInt16x32 x y) => (VPMOVMToVec16x32 (VPCMPW512 [14] x y)) +(GreaterInt16x32 x y) => (VPMOVMToVec16x32 (VPCMPGTW512 x y)) (GreaterInt32x4 ...) => (VPCMPGTD128 ...) (GreaterInt32x8 ...) => (VPCMPGTD256 ...) -(GreaterInt32x16 x y) => (VPMOVMToVec32x16 (VPCMPD512 [14] x y)) +(GreaterInt32x16 x y) => (VPMOVMToVec32x16 (VPCMPGTD512 x y)) (GreaterInt64x2 ...) => (VPCMPGTQ128 ...) (GreaterInt64x4 ...) => (VPCMPGTQ256 ...) -(GreaterInt64x8 x y) => (VPMOVMToVec64x8 (VPCMPQ512 [14] x y)) +(GreaterInt64x8 x y) => (VPMOVMToVec64x8 (VPCMPGTQ512 x y)) (GreaterUint8x16 x y) => (VPMOVMToVec8x16 (VPCMPUB128 [14] x y)) (GreaterUint8x32 x y) => (VPMOVMToVec8x32 (VPCMPUB256 [14] x y)) (GreaterUint8x64 x y) => (VPMOVMToVec8x64 (VPCMPUB512 [14] x y)) diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go index a7a3c9715c..5a51e4400a 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go @@ -33,7 +33,7 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VADDPS128", argLength: 2, reg: v21, asm: "VADDPS", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VADDPSMasked128", argLength: 3, reg: w2kw, asm: "VADDPS", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VADDSUBPS128", argLength: 2, reg: v21, asm: "VADDSUBPS", commutative: false, typ: "Vec128", resultInArg0: false}, - {name: "VRCP14PS128", argLength: 1, reg: w11, asm: "VRCP14PS", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VRCPPS128", argLength: 1, reg: v11, asm: "VRCPPS", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VRCP14PSMasked128", argLength: 2, reg: wkw, asm: "VRCP14PS", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VRSQRTPS128", argLength: 1, reg: v11, asm: "VRSQRTPS", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VRSQRT14PSMasked128", argLength: 2, reg: wkw, asm: "VRSQRT14PS", commutative: false, typ: "Vec128", resultInArg0: false}, @@ -63,7 +63,7 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VADDPS256", argLength: 2, reg: v21, asm: "VADDPS", commutative: true, typ: "Vec256", resultInArg0: false}, 
{name: "VADDPSMasked256", argLength: 3, reg: w2kw, asm: "VADDPS", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VADDSUBPS256", argLength: 2, reg: v21, asm: "VADDSUBPS", commutative: false, typ: "Vec256", resultInArg0: false}, - {name: "VRCP14PS256", argLength: 1, reg: w11, asm: "VRCP14PS", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VRCPPS256", argLength: 1, reg: v11, asm: "VRCPPS", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VRCP14PSMasked256", argLength: 2, reg: wkw, asm: "VRCP14PS", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VRSQRTPS256", argLength: 1, reg: v11, asm: "VRSQRTPS", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VRSQRT14PSMasked256", argLength: 2, reg: wkw, asm: "VRSQRT14PS", commutative: false, typ: "Vec256", resultInArg0: false}, @@ -224,6 +224,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPADDW512", argLength: 2, reg: w21, asm: "VPADDW", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPADDWMasked512", argLength: 3, reg: w2kw, asm: "VPADDW", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPCOMPRESSWMasked512", argLength: 2, reg: wkw, asm: "VPCOMPRESSW", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPCMPEQW512", argLength: 2, reg: w2k, asm: "VPCMPEQW", commutative: true, typ: "Mask", resultInArg0: false}, + {name: "VPCMPGTW512", argLength: 2, reg: w2k, asm: "VPCMPGTW", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPMAXSW512", argLength: 2, reg: w21, asm: "VPMAXSW", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMAXSWMasked512", argLength: 3, reg: w2kw, asm: "VPMAXSW", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMINSW512", argLength: 2, reg: w21, asm: "VPMINSW", commutative: true, typ: "Vec512", resultInArg0: false}, @@ -305,6 +307,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPANDND512", argLength: 2, reg: w21, asm: "VPANDND", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPANDNDMasked512", argLength: 3, reg: w2kw, asm: "VPANDND", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPCOMPRESSDMasked512", argLength: 2, reg: wkw, asm: "VPCOMPRESSD", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPCMPEQD512", argLength: 2, reg: w2k, asm: "VPCMPEQD", commutative: true, typ: "Mask", resultInArg0: false}, + {name: "VPCMPGTD512", argLength: 2, reg: w2k, asm: "VPCMPGTD", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPMAXSD512", argLength: 2, reg: w21, asm: "VPMAXSD", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMAXSDMasked512", argLength: 3, reg: w2kw, asm: "VPMAXSD", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMINSD512", argLength: 2, reg: w21, asm: "VPMINSD", commutative: true, typ: "Vec512", resultInArg0: false}, @@ -526,6 +530,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPANDNQ512", argLength: 2, reg: w21, asm: "VPANDNQ", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPANDNQMasked512", argLength: 3, reg: w2kw, asm: "VPANDNQ", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPCOMPRESSQMasked512", argLength: 2, reg: wkw, asm: "VPCOMPRESSQ", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPCMPEQQ512", argLength: 2, reg: w2k, asm: "VPCMPEQQ", commutative: 
true, typ: "Mask", resultInArg0: false}, + {name: "VPCMPGTQ512", argLength: 2, reg: w2k, asm: "VPCMPGTQ", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPMAXSQ512", argLength: 2, reg: w21, asm: "VPMAXSQ", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMAXSQMasked512", argLength: 3, reg: w2kw, asm: "VPMAXSQ", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMINSQ512", argLength: 2, reg: w21, asm: "VPMINSQ", commutative: true, typ: "Vec512", resultInArg0: false}, @@ -611,6 +617,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPADDB512", argLength: 2, reg: w21, asm: "VPADDB", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPADDBMasked512", argLength: 3, reg: w2kw, asm: "VPADDB", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPCOMPRESSBMasked512", argLength: 2, reg: wkw, asm: "VPCOMPRESSB", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPCMPEQB512", argLength: 2, reg: w2k, asm: "VPCMPEQB", commutative: true, typ: "Mask", resultInArg0: false}, + {name: "VPCMPGTB512", argLength: 2, reg: w2k, asm: "VPCMPGTB", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPMAXSB512", argLength: 2, reg: w21, asm: "VPMAXSB", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMAXSBMasked512", argLength: 3, reg: w2kw, asm: "VPMAXSB", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMINSB512", argLength: 2, reg: w21, asm: "VPMINSB", commutative: true, typ: "Vec512", resultInArg0: false}, @@ -692,10 +700,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPMINUD128", argLength: 2, reg: v21, asm: "VPMINUD", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VPMINUDMasked128", argLength: 3, reg: w2kw, asm: "VPMINUD", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VPMULUDQ128", argLength: 2, reg: v21, asm: "VPMULUDQ", commutative: true, typ: "Vec128", resultInArg0: false}, - {name: "VPERMI2D128", argLength: 3, reg: w31, asm: "VPERMI2D", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPERMI2PS128", argLength: 3, reg: w31, asm: "VPERMI2PS", commutative: false, typ: "Vec128", resultInArg0: true}, - {name: "VPERMI2PSMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2PS", commutative: false, typ: "Vec128", resultInArg0: true}, + {name: "VPERMI2D128", argLength: 3, reg: w31, asm: "VPERMI2D", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPERMI2DMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2D", commutative: false, typ: "Vec128", resultInArg0: true}, + {name: "VPERMI2PSMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2PS", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPSRLD128", argLength: 2, reg: vfpv, asm: "VPSRLD", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSRLDMasked128", argLength: 3, reg: wfpkw, asm: "VPSRLD", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSRLVD128", argLength: 2, reg: v21, asm: "VPSRLVD", commutative: false, typ: "Vec128", resultInArg0: false}, @@ -705,12 +713,12 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPMINUD256", argLength: 2, reg: v21, asm: "VPMINUD", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPMINUDMasked256", argLength: 3, reg: w2kw, asm: "VPMINUD", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPMULUDQ256", argLength: 2, reg: v21, asm: 
"VPMULUDQ", commutative: true, typ: "Vec256", resultInArg0: false}, - {name: "VPERMPS256", argLength: 2, reg: v21, asm: "VPERMPS", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMD256", argLength: 2, reg: v21, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false}, - {name: "VPERMI2D256", argLength: 3, reg: w31, asm: "VPERMI2D", commutative: false, typ: "Vec256", resultInArg0: true}, + {name: "VPERMPS256", argLength: 2, reg: v21, asm: "VPERMPS", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMI2PS256", argLength: 3, reg: w31, asm: "VPERMI2PS", commutative: false, typ: "Vec256", resultInArg0: true}, - {name: "VPERMI2DMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2D", commutative: false, typ: "Vec256", resultInArg0: true}, + {name: "VPERMI2D256", argLength: 3, reg: w31, asm: "VPERMI2D", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPERMI2PSMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2PS", commutative: false, typ: "Vec256", resultInArg0: true}, + {name: "VPERMI2DMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2D", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPERMPSMasked256", argLength: 3, reg: w2kw, asm: "VPERMPS", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMDMasked256", argLength: 3, reg: w2kw, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSRLD256", argLength: 2, reg: vfpv, asm: "VPSRLD", commutative: false, typ: "Vec256", resultInArg0: false}, @@ -735,10 +743,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPMINUQ256", argLength: 2, reg: w21, asm: "VPMINUQ", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPMINUQMasked256", argLength: 3, reg: w2kw, asm: "VPMINUQ", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPMULUDQMasked256", argLength: 3, reg: w2kw, asm: "VPMULUDQ", commutative: true, typ: "Vec256", resultInArg0: false}, - {name: "VPERMQ256", argLength: 2, reg: w21, asm: "VPERMQ", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMPD256", argLength: 2, reg: w21, asm: "VPERMPD", commutative: false, typ: "Vec256", resultInArg0: false}, - {name: "VPERMI2Q256", argLength: 3, reg: w31, asm: "VPERMI2Q", commutative: false, typ: "Vec256", resultInArg0: true}, + {name: "VPERMQ256", argLength: 2, reg: w21, asm: "VPERMQ", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMI2PD256", argLength: 3, reg: w31, asm: "VPERMI2PD", commutative: false, typ: "Vec256", resultInArg0: true}, + {name: "VPERMI2Q256", argLength: 3, reg: w31, asm: "VPERMI2Q", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPERMI2PDMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2PD", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPERMI2QMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2Q", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPERMQMasked256", argLength: 3, reg: w2kw, asm: "VPERMQ", commutative: false, typ: "Vec256", resultInArg0: false}, @@ -759,8 +767,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPERMI2PD512", argLength: 3, reg: w31, asm: "VPERMI2PD", commutative: false, typ: "Vec512", resultInArg0: true}, {name: "VPERMI2QMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2Q", commutative: false, typ: "Vec512", resultInArg0: true}, {name: "VPERMI2PDMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2PD", commutative: false, typ: "Vec512", 
resultInArg0: true}, - {name: "VPERMQMasked512", argLength: 3, reg: w2kw, asm: "VPERMQ", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPERMPDMasked512", argLength: 3, reg: w2kw, asm: "VPERMPD", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPERMQMasked512", argLength: 3, reg: w2kw, asm: "VPERMQ", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPSRLQ512", argLength: 2, reg: wfpw, asm: "VPSRLQ", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPSRLQMasked512", argLength: 3, reg: wfpkw, asm: "VPSRLQ", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPSRLVQ512", argLength: 2, reg: w21, asm: "VPSRLVQ", commutative: false, typ: "Vec512", resultInArg0: false}, @@ -858,8 +866,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPSHLDWMasked256", argLength: 3, reg: w2kw, asm: "VPSHLDW", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSHRDW256", argLength: 2, reg: w21, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSHRDWMasked256", argLength: 3, reg: w2kw, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, - {name: "VPCMPW512", argLength: 2, reg: w2k, asm: "VPCMPW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPWMasked512", argLength: 3, reg: w2kk, asm: "VPCMPW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, + {name: "VPCMPW512", argLength: 2, reg: w2k, asm: "VPCMPW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPSHLDW512", argLength: 2, reg: w21, asm: "VPSHLDW", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPSHLDWMasked512", argLength: 3, reg: w2kw, asm: "VPSHLDW", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPSHRDW512", argLength: 2, reg: w21, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, @@ -872,8 +880,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPSHLDWMasked128", argLength: 3, reg: w2kw, asm: "VPSHLDW", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSHRDW128", argLength: 2, reg: w21, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSHRDWMasked128", argLength: 3, reg: w2kw, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, - {name: "VPCMPD512", argLength: 2, reg: w2k, asm: "VPCMPD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPDMasked512", argLength: 3, reg: w2kk, asm: "VPCMPD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, + {name: "VPCMPD512", argLength: 2, reg: w2k, asm: "VPCMPD", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPROLD512", argLength: 1, reg: w11, asm: "VPROLD", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPROLDMasked512", argLength: 2, reg: wkw, asm: "VPROLD", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPRORD512", argLength: 1, reg: w11, asm: "VPRORD", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, @@ -926,8 +934,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPSHLDQMasked256", argLength: 3, reg: w2kw, asm: "VPSHLDQ", aux: "Int8", commutative: false, typ: 
"Vec256", resultInArg0: false}, {name: "VPSHRDQ256", argLength: 2, reg: w21, asm: "VPSHRDQ", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSHRDQMasked256", argLength: 3, reg: w2kw, asm: "VPSHRDQ", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, - {name: "VPCMPQ512", argLength: 2, reg: w2k, asm: "VPCMPQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPQMasked512", argLength: 3, reg: w2kk, asm: "VPCMPQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, + {name: "VPCMPQ512", argLength: 2, reg: w2k, asm: "VPCMPQ", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPROLQ512", argLength: 1, reg: w11, asm: "VPROLQ", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPROLQMasked512", argLength: 2, reg: wkw, asm: "VPROLQ", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPRORQ512", argLength: 1, reg: w11, asm: "VPRORQ", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, @@ -944,16 +952,16 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VEXTRACTI128128", argLength: 1, reg: v11, asm: "VEXTRACTI128", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPCMPB256", argLength: 2, reg: w2k, asm: "VPCMPB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VINSERTI128256", argLength: 2, reg: v21, asm: "VINSERTI128", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, - {name: "VPCMPB512", argLength: 2, reg: w2k, asm: "VPCMPB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPBMasked512", argLength: 3, reg: w2kk, asm: "VPCMPB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, + {name: "VPCMPB512", argLength: 2, reg: w2k, asm: "VPCMPB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPCMPUWMasked256", argLength: 3, reg: w2kk, asm: "VPCMPUW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUW256", argLength: 2, reg: w2k, asm: "VPCMPUW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, - {name: "VPCMPUW512", argLength: 2, reg: w2k, asm: "VPCMPUW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUWMasked512", argLength: 3, reg: w2kk, asm: "VPCMPUW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, + {name: "VPCMPUW512", argLength: 2, reg: w2k, asm: "VPCMPUW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPCMPUWMasked128", argLength: 3, reg: w2kk, asm: "VPCMPUW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUW128", argLength: 2, reg: w2k, asm: "VPCMPUW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, - {name: "VPCMPUD512", argLength: 2, reg: w2k, asm: "VPCMPUD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUDMasked512", argLength: 3, reg: w2kk, asm: "VPCMPUD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, + {name: "VPCMPUD512", argLength: 2, reg: w2k, asm: "VPCMPUD", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPCMPUDMasked128", argLength: 3, reg: w2kk, asm: "VPCMPUD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUD128", argLength: 2, reg: w2k, asm: "VPCMPUD", aux: "Int8", commutative: false, typ: "Mask", 
resultInArg0: false}, {name: "VPCMPUDMasked256", argLength: 3, reg: w2kk, asm: "VPCMPUD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, @@ -962,8 +970,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPCMPUQ128", argLength: 2, reg: w2k, asm: "VPCMPUQ", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPCMPUQMasked256", argLength: 3, reg: w2kk, asm: "VPCMPUQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUQ256", argLength: 2, reg: w2k, asm: "VPCMPUQ", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, - {name: "VPCMPUQ512", argLength: 2, reg: w2k, asm: "VPCMPUQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUQMasked512", argLength: 3, reg: w2kk, asm: "VPCMPUQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, + {name: "VPCMPUQ512", argLength: 2, reg: w2k, asm: "VPCMPUQ", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPCMPUBMasked128", argLength: 3, reg: w2kk, asm: "VPCMPUB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VGF2P8AFFINEQB128", argLength: 2, reg: w21, asm: "VGF2P8AFFINEQB", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VGF2P8AFFINEINVQB128", argLength: 2, reg: w21, asm: "VGF2P8AFFINEINVQB", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, @@ -976,11 +984,11 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VGF2P8AFFINEINVQBMasked256", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEINVQB", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VGF2P8AFFINEQBMasked256", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPCMPUB256", argLength: 2, reg: w2k, asm: "VPCMPUB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, - {name: "VPCMPUB512", argLength: 2, reg: w2k, asm: "VPCMPUB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUBMasked512", argLength: 3, reg: w2kk, asm: "VPCMPUB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VGF2P8AFFINEQB512", argLength: 2, reg: w21, asm: "VGF2P8AFFINEQB", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VGF2P8AFFINEINVQB512", argLength: 2, reg: w21, asm: "VGF2P8AFFINEINVQB", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VGF2P8AFFINEINVQBMasked512", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEINVQB", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VGF2P8AFFINEQBMasked512", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPCMPUB512", argLength: 2, reg: w2k, asm: "VPCMPUB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, } } diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index c8fe1e9eee..7b016b517d 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -912,10 +912,10 @@ func simdGenericOps() []opData { {name: "PermuteUint16x16", argLength: 2, commutative: false}, {name: "Permute2Uint16x16", argLength: 3, commutative: false}, {name: "Permute2Int16x16", argLength: 3, commutative: false}, - {name: 
"Permute2MaskedInt16x16", argLength: 4, commutative: false}, {name: "Permute2MaskedUint16x16", argLength: 4, commutative: false}, - {name: "PermuteMaskedUint16x16", argLength: 3, commutative: false}, + {name: "Permute2MaskedInt16x16", argLength: 4, commutative: false}, {name: "PermuteMaskedInt16x16", argLength: 3, commutative: false}, + {name: "PermuteMaskedUint16x16", argLength: 3, commutative: false}, {name: "PopCountUint16x16", argLength: 1, commutative: false}, {name: "PopCountMaskedUint16x16", argLength: 2, commutative: false}, {name: "SaturatedAddUint16x16", argLength: 2, commutative: true}, @@ -966,8 +966,8 @@ func simdGenericOps() []opData { {name: "Permute2Int16x32", argLength: 3, commutative: false}, {name: "Permute2MaskedUint16x32", argLength: 4, commutative: false}, {name: "Permute2MaskedInt16x32", argLength: 4, commutative: false}, - {name: "PermuteMaskedUint16x32", argLength: 3, commutative: false}, {name: "PermuteMaskedInt16x32", argLength: 3, commutative: false}, + {name: "PermuteMaskedUint16x32", argLength: 3, commutative: false}, {name: "PopCountUint16x32", argLength: 1, commutative: false}, {name: "PopCountMaskedUint16x32", argLength: 2, commutative: false}, {name: "SaturatedAddUint16x32", argLength: 2, commutative: true}, @@ -1018,12 +1018,12 @@ func simdGenericOps() []opData { {name: "PairwiseSubUint16x8", argLength: 2, commutative: false}, {name: "PermuteInt16x8", argLength: 2, commutative: false}, {name: "PermuteUint16x8", argLength: 2, commutative: false}, - {name: "Permute2Int16x8", argLength: 3, commutative: false}, {name: "Permute2Uint16x8", argLength: 3, commutative: false}, + {name: "Permute2Int16x8", argLength: 3, commutative: false}, {name: "Permute2MaskedInt16x8", argLength: 4, commutative: false}, {name: "Permute2MaskedUint16x8", argLength: 4, commutative: false}, - {name: "PermuteMaskedUint16x8", argLength: 3, commutative: false}, {name: "PermuteMaskedInt16x8", argLength: 3, commutative: false}, + {name: "PermuteMaskedUint16x8", argLength: 3, commutative: false}, {name: "PopCountUint16x8", argLength: 1, commutative: false}, {name: "PopCountMaskedUint16x8", argLength: 2, commutative: false}, {name: "SaturatedAddUint16x8", argLength: 2, commutative: true}, @@ -1070,17 +1070,17 @@ func simdGenericOps() []opData { {name: "NotEqualMaskedUint32x16", argLength: 3, commutative: true}, {name: "OrUint32x16", argLength: 2, commutative: true}, {name: "OrMaskedUint32x16", argLength: 3, commutative: true}, - {name: "PermuteFloat32x16", argLength: 2, commutative: false}, {name: "PermuteInt32x16", argLength: 2, commutative: false}, + {name: "PermuteFloat32x16", argLength: 2, commutative: false}, {name: "PermuteUint32x16", argLength: 2, commutative: false}, {name: "Permute2Uint32x16", argLength: 3, commutative: false}, {name: "Permute2Float32x16", argLength: 3, commutative: false}, {name: "Permute2Int32x16", argLength: 3, commutative: false}, + {name: "Permute2MaskedUint32x16", argLength: 4, commutative: false}, {name: "Permute2MaskedInt32x16", argLength: 4, commutative: false}, {name: "Permute2MaskedFloat32x16", argLength: 4, commutative: false}, - {name: "Permute2MaskedUint32x16", argLength: 4, commutative: false}, - {name: "PermuteMaskedInt32x16", argLength: 3, commutative: false}, {name: "PermuteMaskedFloat32x16", argLength: 3, commutative: false}, + {name: "PermuteMaskedInt32x16", argLength: 3, commutative: false}, {name: "PermuteMaskedUint32x16", argLength: 3, commutative: false}, {name: "PopCountUint32x16", argLength: 1, commutative: false}, {name: 
"PopCountMaskedUint32x16", argLength: 2, commutative: false}, @@ -1307,15 +1307,15 @@ func simdGenericOps() []opData { {name: "PermuteUint64x4", argLength: 2, commutative: false}, {name: "PermuteInt64x4", argLength: 2, commutative: false}, {name: "PermuteFloat64x4", argLength: 2, commutative: false}, - {name: "Permute2Float64x4", argLength: 3, commutative: false}, - {name: "Permute2Int64x4", argLength: 3, commutative: false}, {name: "Permute2Uint64x4", argLength: 3, commutative: false}, - {name: "Permute2MaskedFloat64x4", argLength: 4, commutative: false}, + {name: "Permute2Int64x4", argLength: 3, commutative: false}, + {name: "Permute2Float64x4", argLength: 3, commutative: false}, {name: "Permute2MaskedUint64x4", argLength: 4, commutative: false}, + {name: "Permute2MaskedFloat64x4", argLength: 4, commutative: false}, {name: "Permute2MaskedInt64x4", argLength: 4, commutative: false}, + {name: "PermuteMaskedUint64x4", argLength: 3, commutative: false}, {name: "PermuteMaskedFloat64x4", argLength: 3, commutative: false}, {name: "PermuteMaskedInt64x4", argLength: 3, commutative: false}, - {name: "PermuteMaskedUint64x4", argLength: 3, commutative: false}, {name: "PopCountUint64x4", argLength: 1, commutative: false}, {name: "PopCountMaskedUint64x4", argLength: 2, commutative: false}, {name: "RotateLeftUint64x4", argLength: 2, commutative: false}, @@ -1365,18 +1365,18 @@ func simdGenericOps() []opData { {name: "NotEqualMaskedUint64x8", argLength: 3, commutative: true}, {name: "OrUint64x8", argLength: 2, commutative: true}, {name: "OrMaskedUint64x8", argLength: 3, commutative: true}, + {name: "PermuteUint64x8", argLength: 2, commutative: false}, {name: "PermuteFloat64x8", argLength: 2, commutative: false}, {name: "PermuteInt64x8", argLength: 2, commutative: false}, - {name: "PermuteUint64x8", argLength: 2, commutative: false}, - {name: "Permute2Int64x8", argLength: 3, commutative: false}, {name: "Permute2Float64x8", argLength: 3, commutative: false}, {name: "Permute2Uint64x8", argLength: 3, commutative: false}, + {name: "Permute2Int64x8", argLength: 3, commutative: false}, + {name: "Permute2MaskedFloat64x8", argLength: 4, commutative: false}, {name: "Permute2MaskedUint64x8", argLength: 4, commutative: false}, {name: "Permute2MaskedInt64x8", argLength: 4, commutative: false}, - {name: "Permute2MaskedFloat64x8", argLength: 4, commutative: false}, - {name: "PermuteMaskedUint64x8", argLength: 3, commutative: false}, - {name: "PermuteMaskedFloat64x8", argLength: 3, commutative: false}, {name: "PermuteMaskedInt64x8", argLength: 3, commutative: false}, + {name: "PermuteMaskedFloat64x8", argLength: 3, commutative: false}, + {name: "PermuteMaskedUint64x8", argLength: 3, commutative: false}, {name: "PopCountUint64x8", argLength: 1, commutative: false}, {name: "PopCountMaskedUint64x8", argLength: 2, commutative: false}, {name: "RotateLeftUint64x8", argLength: 2, commutative: false}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index d69e714082..9db3dbaf57 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1199,6 +1199,7 @@ const ( OpAMD64VZEROUPPER OpAMD64VZEROALL OpAMD64KMOVQload + OpAMD64KMOVQstore OpAMD64VADDPS512 OpAMD64VADDPSMasked512 OpAMD64VRCP14PS512 @@ -1229,7 +1230,7 @@ const ( OpAMD64VADDPS128 OpAMD64VADDPSMasked128 OpAMD64VADDSUBPS128 - OpAMD64VRCP14PS128 + OpAMD64VRCPPS128 OpAMD64VRCP14PSMasked128 OpAMD64VRSQRTPS128 OpAMD64VRSQRT14PSMasked128 @@ -1259,7 +1260,7 @@ const ( OpAMD64VADDPS256 
OpAMD64VADDPSMasked256 OpAMD64VADDSUBPS256 - OpAMD64VRCP14PS256 + OpAMD64VRCPPS256 OpAMD64VRCP14PSMasked256 OpAMD64VRSQRTPS256 OpAMD64VRSQRT14PSMasked256 @@ -1420,6 +1421,8 @@ const ( OpAMD64VPADDW512 OpAMD64VPADDWMasked512 OpAMD64VPCOMPRESSWMasked512 + OpAMD64VPCMPEQW512 + OpAMD64VPCMPGTW512 OpAMD64VPMAXSW512 OpAMD64VPMAXSWMasked512 OpAMD64VPMINSW512 @@ -1501,6 +1504,8 @@ const ( OpAMD64VPANDND512 OpAMD64VPANDNDMasked512 OpAMD64VPCOMPRESSDMasked512 + OpAMD64VPCMPEQD512 + OpAMD64VPCMPGTD512 OpAMD64VPMAXSD512 OpAMD64VPMAXSDMasked512 OpAMD64VPMINSD512 @@ -1722,6 +1727,8 @@ const ( OpAMD64VPANDNQ512 OpAMD64VPANDNQMasked512 OpAMD64VPCOMPRESSQMasked512 + OpAMD64VPCMPEQQ512 + OpAMD64VPCMPGTQ512 OpAMD64VPMAXSQ512 OpAMD64VPMAXSQMasked512 OpAMD64VPMINSQ512 @@ -1807,6 +1814,8 @@ const ( OpAMD64VPADDB512 OpAMD64VPADDBMasked512 OpAMD64VPCOMPRESSBMasked512 + OpAMD64VPCMPEQB512 + OpAMD64VPCMPGTB512 OpAMD64VPMAXSB512 OpAMD64VPMAXSBMasked512 OpAMD64VPMINSB512 @@ -1888,10 +1897,10 @@ const ( OpAMD64VPMINUD128 OpAMD64VPMINUDMasked128 OpAMD64VPMULUDQ128 - OpAMD64VPERMI2D128 OpAMD64VPERMI2PS128 - OpAMD64VPERMI2PSMasked128 + OpAMD64VPERMI2D128 OpAMD64VPERMI2DMasked128 + OpAMD64VPERMI2PSMasked128 OpAMD64VPSRLD128 OpAMD64VPSRLDMasked128 OpAMD64VPSRLVD128 @@ -1901,12 +1910,12 @@ const ( OpAMD64VPMINUD256 OpAMD64VPMINUDMasked256 OpAMD64VPMULUDQ256 - OpAMD64VPERMPS256 OpAMD64VPERMD256 - OpAMD64VPERMI2D256 + OpAMD64VPERMPS256 OpAMD64VPERMI2PS256 - OpAMD64VPERMI2DMasked256 + OpAMD64VPERMI2D256 OpAMD64VPERMI2PSMasked256 + OpAMD64VPERMI2DMasked256 OpAMD64VPERMPSMasked256 OpAMD64VPERMDMasked256 OpAMD64VPSRLD256 @@ -1931,10 +1940,10 @@ const ( OpAMD64VPMINUQ256 OpAMD64VPMINUQMasked256 OpAMD64VPMULUDQMasked256 - OpAMD64VPERMQ256 OpAMD64VPERMPD256 - OpAMD64VPERMI2Q256 + OpAMD64VPERMQ256 OpAMD64VPERMI2PD256 + OpAMD64VPERMI2Q256 OpAMD64VPERMI2PDMasked256 OpAMD64VPERMI2QMasked256 OpAMD64VPERMQMasked256 @@ -1955,8 +1964,8 @@ const ( OpAMD64VPERMI2PD512 OpAMD64VPERMI2QMasked512 OpAMD64VPERMI2PDMasked512 - OpAMD64VPERMQMasked512 OpAMD64VPERMPDMasked512 + OpAMD64VPERMQMasked512 OpAMD64VPSRLQ512 OpAMD64VPSRLQMasked512 OpAMD64VPSRLVQ512 @@ -2054,8 +2063,8 @@ const ( OpAMD64VPSHLDWMasked256 OpAMD64VPSHRDW256 OpAMD64VPSHRDWMasked256 - OpAMD64VPCMPW512 OpAMD64VPCMPWMasked512 + OpAMD64VPCMPW512 OpAMD64VPSHLDW512 OpAMD64VPSHLDWMasked512 OpAMD64VPSHRDW512 @@ -2068,8 +2077,8 @@ const ( OpAMD64VPSHLDWMasked128 OpAMD64VPSHRDW128 OpAMD64VPSHRDWMasked128 - OpAMD64VPCMPD512 OpAMD64VPCMPDMasked512 + OpAMD64VPCMPD512 OpAMD64VPROLD512 OpAMD64VPROLDMasked512 OpAMD64VPRORD512 @@ -2122,8 +2131,8 @@ const ( OpAMD64VPSHLDQMasked256 OpAMD64VPSHRDQ256 OpAMD64VPSHRDQMasked256 - OpAMD64VPCMPQ512 OpAMD64VPCMPQMasked512 + OpAMD64VPCMPQ512 OpAMD64VPROLQ512 OpAMD64VPROLQMasked512 OpAMD64VPRORQ512 @@ -2140,16 +2149,16 @@ const ( OpAMD64VEXTRACTI128128 OpAMD64VPCMPB256 OpAMD64VINSERTI128256 - OpAMD64VPCMPB512 OpAMD64VPCMPBMasked512 + OpAMD64VPCMPB512 OpAMD64VPCMPUWMasked256 OpAMD64VPCMPUW256 - OpAMD64VPCMPUW512 OpAMD64VPCMPUWMasked512 + OpAMD64VPCMPUW512 OpAMD64VPCMPUWMasked128 OpAMD64VPCMPUW128 - OpAMD64VPCMPUD512 OpAMD64VPCMPUDMasked512 + OpAMD64VPCMPUD512 OpAMD64VPCMPUDMasked128 OpAMD64VPCMPUD128 OpAMD64VPCMPUDMasked256 @@ -2158,8 +2167,8 @@ const ( OpAMD64VPCMPUQ128 OpAMD64VPCMPUQMasked256 OpAMD64VPCMPUQ256 - OpAMD64VPCMPUQ512 OpAMD64VPCMPUQMasked512 + OpAMD64VPCMPUQ512 OpAMD64VPCMPUBMasked128 OpAMD64VGF2P8AFFINEQB128 OpAMD64VGF2P8AFFINEINVQB128 @@ -2172,12 +2181,12 @@ const ( OpAMD64VGF2P8AFFINEINVQBMasked256 OpAMD64VGF2P8AFFINEQBMasked256 
OpAMD64VPCMPUB256 - OpAMD64VPCMPUB512 OpAMD64VPCMPUBMasked512 OpAMD64VGF2P8AFFINEQB512 OpAMD64VGF2P8AFFINEINVQB512 OpAMD64VGF2P8AFFINEINVQBMasked512 OpAMD64VGF2P8AFFINEQBMasked512 + OpAMD64VPCMPUB512 OpARMADD OpARMADDconst @@ -4416,6 +4425,18 @@ const ( OpLoadMask64x2 OpLoadMask64x4 OpLoadMask64x8 + OpStoreMask8x16 + OpStoreMask8x32 + OpStoreMask8x64 + OpStoreMask16x8 + OpStoreMask16x16 + OpStoreMask16x32 + OpStoreMask32x4 + OpStoreMask32x8 + OpStoreMask32x16 + OpStoreMask64x2 + OpStoreMask64x4 + OpStoreMask64x8 OpAddFloat32x16 OpAddMaskedFloat32x16 OpApproximateReciprocalFloat32x16 @@ -5325,10 +5346,10 @@ const ( OpPermuteUint16x16 OpPermute2Uint16x16 OpPermute2Int16x16 - OpPermute2MaskedInt16x16 OpPermute2MaskedUint16x16 - OpPermuteMaskedUint16x16 + OpPermute2MaskedInt16x16 OpPermuteMaskedInt16x16 + OpPermuteMaskedUint16x16 OpPopCountUint16x16 OpPopCountMaskedUint16x16 OpSaturatedAddUint16x16 @@ -5379,8 +5400,8 @@ const ( OpPermute2Int16x32 OpPermute2MaskedUint16x32 OpPermute2MaskedInt16x32 - OpPermuteMaskedUint16x32 OpPermuteMaskedInt16x32 + OpPermuteMaskedUint16x32 OpPopCountUint16x32 OpPopCountMaskedUint16x32 OpSaturatedAddUint16x32 @@ -5431,12 +5452,12 @@ const ( OpPairwiseSubUint16x8 OpPermuteInt16x8 OpPermuteUint16x8 - OpPermute2Int16x8 OpPermute2Uint16x8 + OpPermute2Int16x8 OpPermute2MaskedInt16x8 OpPermute2MaskedUint16x8 - OpPermuteMaskedUint16x8 OpPermuteMaskedInt16x8 + OpPermuteMaskedUint16x8 OpPopCountUint16x8 OpPopCountMaskedUint16x8 OpSaturatedAddUint16x8 @@ -5483,17 +5504,17 @@ const ( OpNotEqualMaskedUint32x16 OpOrUint32x16 OpOrMaskedUint32x16 - OpPermuteFloat32x16 OpPermuteInt32x16 + OpPermuteFloat32x16 OpPermuteUint32x16 OpPermute2Uint32x16 OpPermute2Float32x16 OpPermute2Int32x16 + OpPermute2MaskedUint32x16 OpPermute2MaskedInt32x16 OpPermute2MaskedFloat32x16 - OpPermute2MaskedUint32x16 - OpPermuteMaskedInt32x16 OpPermuteMaskedFloat32x16 + OpPermuteMaskedInt32x16 OpPermuteMaskedUint32x16 OpPopCountUint32x16 OpPopCountMaskedUint32x16 @@ -5720,15 +5741,15 @@ const ( OpPermuteUint64x4 OpPermuteInt64x4 OpPermuteFloat64x4 - OpPermute2Float64x4 - OpPermute2Int64x4 OpPermute2Uint64x4 - OpPermute2MaskedFloat64x4 + OpPermute2Int64x4 + OpPermute2Float64x4 OpPermute2MaskedUint64x4 + OpPermute2MaskedFloat64x4 OpPermute2MaskedInt64x4 + OpPermuteMaskedUint64x4 OpPermuteMaskedFloat64x4 OpPermuteMaskedInt64x4 - OpPermuteMaskedUint64x4 OpPopCountUint64x4 OpPopCountMaskedUint64x4 OpRotateLeftUint64x4 @@ -5778,18 +5799,18 @@ const ( OpNotEqualMaskedUint64x8 OpOrUint64x8 OpOrMaskedUint64x8 + OpPermuteUint64x8 OpPermuteFloat64x8 OpPermuteInt64x8 - OpPermuteUint64x8 - OpPermute2Int64x8 OpPermute2Float64x8 OpPermute2Uint64x8 + OpPermute2Int64x8 + OpPermute2MaskedFloat64x8 OpPermute2MaskedUint64x8 OpPermute2MaskedInt64x8 - OpPermute2MaskedFloat64x8 - OpPermuteMaskedUint64x8 - OpPermuteMaskedFloat64x8 OpPermuteMaskedInt64x8 + OpPermuteMaskedFloat64x8 + OpPermuteMaskedUint64x8 OpPopCountUint64x8 OpPopCountMaskedUint64x8 OpRotateLeftUint64x8 @@ -18830,6 +18851,20 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "KMOVQstore", + auxType: auxSymOff, + argLen: 3, + faultOnNilArg0: true, + symEffect: SymWrite, + asm: x86.AKMOVQ, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB + }, + }, + }, { name: "VADDPS512", argLen: 2, @@ -19281,15 +19316,15 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VRCP14PS128", + name: "VRCPPS128", argLen: 1, - asm: x86.AVRCP14PS, + asm: 
x86.AVRCPPS, reg: regInfo{ inputs: []inputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 }, outputs: []outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 }, }, }, @@ -19728,15 +19763,15 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VRCP14PS256", + name: "VRCPPS256", argLen: 1, - asm: x86.AVRCP14PS, + asm: x86.AVRCPPS, reg: regInfo{ inputs: []inputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 }, outputs: []outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 }, }, }, @@ -22122,6 +22157,35 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPCMPEQW512", + argLen: 2, + commutative: true, + asm: x86.AVPCMPEQW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + }, + }, + }, + { + name: "VPCMPGTW512", + argLen: 2, + asm: x86.AVPCMPGTW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + }, + }, + }, { name: "VPMAXSW512", argLen: 2, @@ -23327,6 +23391,35 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPCMPEQD512", + argLen: 2, + commutative: true, + asm: x86.AVPCMPEQD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + }, + }, + }, + { + name: "VPCMPGTD512", + argLen: 2, + asm: x86.AVPCMPGTD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + }, + }, + }, { name: "VPMAXSD512", argLen: 2, @@ -26664,6 +26757,35 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPCMPEQQ512", + argLen: 2, + commutative: true, + asm: x86.AVPCMPEQQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + }, + }, + }, + { + name: "VPCMPGTQ512", + argLen: 2, + asm: x86.AVPCMPGTQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + }, + }, + }, { name: "VPMAXSQ512", 
argLen: 2, @@ -27922,6 +28044,35 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPCMPEQB512", + argLen: 2, + commutative: true, + asm: x86.AVPCMPEQB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + }, + }, + }, + { + name: "VPCMPGTB512", + argLen: 2, + asm: x86.AVPCMPGTB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + }, + }, + }, { name: "VPMAXSB512", argLen: 2, @@ -29154,10 +29305,10 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPERMI2D128", + name: "VPERMI2PS128", argLen: 3, resultInArg0: true, - asm: x86.AVPERMI2D, + asm: x86.AVPERMI2PS, reg: regInfo{ inputs: []inputInfo{ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 @@ -29170,10 +29321,10 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPERMI2PS128", + name: "VPERMI2D128", argLen: 3, resultInArg0: true, - asm: x86.AVPERMI2PS, + asm: x86.AVPERMI2D, reg: regInfo{ inputs: []inputInfo{ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 @@ -29186,10 +29337,10 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPERMI2PSMasked128", + name: "VPERMI2DMasked128", argLen: 4, resultInArg0: true, - asm: x86.AVPERMI2PS, + asm: x86.AVPERMI2D, reg: regInfo{ inputs: []inputInfo{ {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 @@ -29203,10 +29354,10 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPERMI2DMasked128", + name: "VPERMI2PSMasked128", argLen: 4, resultInArg0: true, - asm: x86.AVPERMI2D, + asm: x86.AVPERMI2PS, reg: regInfo{ inputs: []inputInfo{ {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 @@ -29355,9 +29506,9 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPERMPS256", + name: "VPERMD256", argLen: 2, - asm: x86.AVPERMPS, + asm: x86.AVPERMD, reg: regInfo{ inputs: []inputInfo{ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 @@ -29369,9 +29520,9 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPERMD256", + name: "VPERMPS256", argLen: 2, - asm: x86.AVPERMD, + asm: x86.AVPERMPS, reg: regInfo{ inputs: []inputInfo{ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 @@ -29383,10 +29534,10 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPERMI2D256", + name: "VPERMI2PS256", argLen: 3, resultInArg0: true, - asm: x86.AVPERMI2D, + asm: x86.AVPERMI2PS, reg: regInfo{ inputs: []inputInfo{ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 @@ -29399,10 +29550,10 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPERMI2PS256", + name: "VPERMI2D256", argLen: 3, resultInArg0: true, - asm: x86.AVPERMI2PS, + asm: x86.AVPERMI2D, reg: regInfo{ inputs: []inputInfo{ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 @@ -29415,10 +29566,10 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPERMI2DMasked256", + name: "VPERMI2PSMasked256", argLen: 4, resultInArg0: true, - asm: x86.AVPERMI2D, + asm: x86.AVPERMI2PS, reg: regInfo{ inputs: []inputInfo{ {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 @@ -29432,10 +29583,10 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPERMI2PSMasked256", + name: "VPERMI2DMasked256", argLen: 4, resultInArg0: true, - asm: x86.AVPERMI2PS, + asm: 
x86.AVPERMI2D, reg: regInfo{ inputs: []inputInfo{ {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 @@ -29817,9 +29968,9 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPERMQ256", + name: "VPERMPD256", argLen: 2, - asm: x86.AVPERMQ, + asm: x86.AVPERMPD, reg: regInfo{ inputs: []inputInfo{ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 @@ -29831,9 +29982,9 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPERMPD256", + name: "VPERMQ256", argLen: 2, - asm: x86.AVPERMPD, + asm: x86.AVPERMQ, reg: regInfo{ inputs: []inputInfo{ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 @@ -29845,10 +29996,10 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPERMI2Q256", + name: "VPERMI2PD256", argLen: 3, resultInArg0: true, - asm: x86.AVPERMI2Q, + asm: x86.AVPERMI2PD, reg: regInfo{ inputs: []inputInfo{ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 @@ -29861,10 +30012,10 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPERMI2PD256", + name: "VPERMI2Q256", argLen: 3, resultInArg0: true, - asm: x86.AVPERMI2PD, + asm: x86.AVPERMI2Q, reg: regInfo{ inputs: []inputInfo{ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 @@ -30186,9 +30337,9 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPERMQMasked512", + name: "VPERMPDMasked512", argLen: 3, - asm: x86.AVPERMQ, + asm: x86.AVPERMPD, reg: regInfo{ inputs: []inputInfo{ {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 @@ -30201,9 +30352,9 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPERMPDMasked512", + name: "VPERMQMasked512", argLen: 3, - asm: x86.AVPERMPD, + asm: x86.AVPERMQ, reg: regInfo{ inputs: []inputInfo{ {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 @@ -31686,15 +31837,16 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPCMPW512", + name: "VPCMPWMasked512", auxType: auxInt8, - argLen: 2, + argLen: 3, commutative: true, asm: x86.AVPCMPW, reg: regInfo{ inputs: []inputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 }, outputs: []outputInfo{ {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 @@ -31702,16 +31854,14 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPCMPWMasked512", - auxType: auxInt8, - argLen: 3, - commutative: true, - asm: x86.AVPCMPW, + name: "VPCMPW512", + auxType: auxInt8, + argLen: 2, + asm: x86.AVPCMPW, reg: regInfo{ inputs: []inputInfo{ - {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 }, outputs: []outputInfo{ {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 @@ -31904,15 +32054,16 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPCMPD512", + name: "VPCMPDMasked512", auxType: auxInt8, - argLen: 2, + argLen: 3, commutative: true, asm: x86.AVPCMPD, reg: regInfo{ inputs: []inputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 
+ {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 }, outputs: []outputInfo{ {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 @@ -31920,16 +32071,14 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPCMPDMasked512", - auxType: auxInt8, - argLen: 3, - commutative: true, - asm: x86.AVPCMPD, + name: "VPCMPD512", + auxType: auxInt8, + argLen: 2, + asm: x86.AVPCMPD, reg: regInfo{ inputs: []inputInfo{ - {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 }, outputs: []outputInfo{ {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 @@ -32723,15 +32872,16 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPCMPQ512", + name: "VPCMPQMasked512", auxType: auxInt8, - argLen: 2, + argLen: 3, commutative: true, asm: x86.AVPCMPQ, reg: regInfo{ inputs: []inputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 }, outputs: []outputInfo{ {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 @@ -32739,16 +32889,14 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPCMPQMasked512", - auxType: auxInt8, - argLen: 3, - commutative: true, - asm: x86.AVPCMPQ, + name: "VPCMPQ512", + auxType: auxInt8, + argLen: 2, + asm: x86.AVPCMPQ, reg: regInfo{ inputs: []inputInfo{ - {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 }, outputs: []outputInfo{ {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 @@ -32998,15 +33146,16 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPCMPB512", + name: "VPCMPBMasked512", auxType: auxInt8, - argLen: 2, + argLen: 3, commutative: true, asm: x86.AVPCMPB, reg: regInfo{ inputs: []inputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 }, outputs: []outputInfo{ {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 @@ -33014,16 +33163,14 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPCMPBMasked512", - auxType: auxInt8, - argLen: 3, - commutative: true, - asm: x86.AVPCMPB, + name: "VPCMPB512", + auxType: auxInt8, + argLen: 2, + asm: x86.AVPCMPB, reg: regInfo{ inputs: []inputInfo{ - {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 }, outputs: 
[]outputInfo{ {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 @@ -33063,15 +33210,16 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPCMPUW512", + name: "VPCMPUWMasked512", auxType: auxInt8, - argLen: 2, + argLen: 3, commutative: true, asm: x86.AVPCMPUW, reg: regInfo{ inputs: []inputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 }, outputs: []outputInfo{ {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 @@ -33079,16 +33227,14 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPCMPUWMasked512", - auxType: auxInt8, - argLen: 3, - commutative: true, - asm: x86.AVPCMPUW, + name: "VPCMPUW512", + auxType: auxInt8, + argLen: 2, + asm: x86.AVPCMPUW, reg: regInfo{ inputs: []inputInfo{ - {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 }, outputs: []outputInfo{ {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 @@ -33128,15 +33274,16 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPCMPUD512", + name: "VPCMPUDMasked512", auxType: auxInt8, - argLen: 2, + argLen: 3, commutative: true, asm: x86.AVPCMPUD, reg: regInfo{ inputs: []inputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 }, outputs: []outputInfo{ {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 @@ -33144,16 +33291,14 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPCMPUDMasked512", - auxType: auxInt8, - argLen: 3, - commutative: true, - asm: x86.AVPCMPUD, + name: "VPCMPUD512", + auxType: auxInt8, + argLen: 2, + asm: x86.AVPCMPUD, reg: regInfo{ inputs: []inputInfo{ - {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 }, outputs: []outputInfo{ {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 @@ -33289,15 +33434,16 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPCMPUQ512", + name: "VPCMPUQMasked512", auxType: auxInt8, - argLen: 2, + argLen: 3, commutative: true, asm: x86.AVPCMPUQ, reg: regInfo{ inputs: []inputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 }, outputs: []outputInfo{ {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 @@ -33305,16 +33451,14 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "VPCMPUQMasked512", - auxType: auxInt8, - argLen: 3, - commutative: true, - asm: x86.AVPCMPUQ, + name: 
"VPCMPUQ512", + auxType: auxInt8, + argLen: 2, + asm: x86.AVPCMPUQ, reg: regInfo{ inputs: []inputInfo{ - {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 }, outputs: []outputInfo{ {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 @@ -33509,22 +33653,6 @@ var opcodeTable = [...]opInfo{ }, }, }, - { - name: "VPCMPUB512", - auxType: auxInt8, - argLen: 2, - commutative: true, - asm: x86.AVPCMPUB, - reg: regInfo{ - inputs: []inputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - outputs: []outputInfo{ - {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - }, - }, - }, { name: "VPCMPUBMasked512", auxType: auxInt8, @@ -33604,6 +33732,21 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPCMPUB512", + auxType: auxInt8, + argLen: 2, + asm: x86.AVPCMPUB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + }, + }, + }, { name: "ADD", @@ -60816,6 +60959,78 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "StoreMask8x16", + auxType: auxTyp, + argLen: 3, + generic: true, + }, + { + name: "StoreMask8x32", + auxType: auxTyp, + argLen: 3, + generic: true, + }, + { + name: "StoreMask8x64", + auxType: auxTyp, + argLen: 3, + generic: true, + }, + { + name: "StoreMask16x8", + auxType: auxTyp, + argLen: 3, + generic: true, + }, + { + name: "StoreMask16x16", + auxType: auxTyp, + argLen: 3, + generic: true, + }, + { + name: "StoreMask16x32", + auxType: auxTyp, + argLen: 3, + generic: true, + }, + { + name: "StoreMask32x4", + auxType: auxTyp, + argLen: 3, + generic: true, + }, + { + name: "StoreMask32x8", + auxType: auxTyp, + argLen: 3, + generic: true, + }, + { + name: "StoreMask32x16", + auxType: auxTyp, + argLen: 3, + generic: true, + }, + { + name: "StoreMask64x2", + auxType: auxTyp, + argLen: 3, + generic: true, + }, + { + name: "StoreMask64x4", + auxType: auxTyp, + argLen: 3, + generic: true, + }, + { + name: "StoreMask64x8", + auxType: auxTyp, + argLen: 3, + generic: true, + }, { name: "AddFloat32x16", argLen: 2, @@ -65677,22 +65892,22 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "Permute2MaskedInt16x16", + name: "Permute2MaskedUint16x16", argLen: 4, generic: true, }, { - name: "Permute2MaskedUint16x16", + name: "Permute2MaskedInt16x16", argLen: 4, generic: true, }, { - name: "PermuteMaskedUint16x16", + name: "PermuteMaskedInt16x16", argLen: 3, generic: true, }, { - name: "PermuteMaskedInt16x16", + name: "PermuteMaskedUint16x16", argLen: 3, generic: true, }, @@ -65964,12 +66179,12 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "PermuteMaskedUint16x32", + name: "PermuteMaskedInt16x32", argLen: 3, generic: true, }, { - name: "PermuteMaskedInt16x32", + name: "PermuteMaskedUint16x32", argLen: 3, generic: true, }, @@ -66242,12 +66457,12 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "Permute2Int16x8", + name: "Permute2Uint16x8", argLen: 3, generic: true, }, { - name: "Permute2Uint16x8", + name: "Permute2Int16x8", argLen: 3, generic: true, }, @@ 
-66262,12 +66477,12 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "PermuteMaskedUint16x8", + name: "PermuteMaskedInt16x8", argLen: 3, generic: true, }, { - name: "PermuteMaskedInt16x8", + name: "PermuteMaskedUint16x8", argLen: 3, generic: true, }, @@ -66519,12 +66734,12 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "PermuteFloat32x16", + name: "PermuteInt32x16", argLen: 2, generic: true, }, { - name: "PermuteInt32x16", + name: "PermuteFloat32x16", argLen: 2, generic: true, }, @@ -66549,27 +66764,27 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "Permute2MaskedInt32x16", + name: "Permute2MaskedUint32x16", argLen: 4, generic: true, }, { - name: "Permute2MaskedFloat32x16", + name: "Permute2MaskedInt32x16", argLen: 4, generic: true, }, { - name: "Permute2MaskedUint32x16", + name: "Permute2MaskedFloat32x16", argLen: 4, generic: true, }, { - name: "PermuteMaskedInt32x16", + name: "PermuteMaskedFloat32x16", argLen: 3, generic: true, }, { - name: "PermuteMaskedFloat32x16", + name: "PermuteMaskedInt32x16", argLen: 3, generic: true, }, @@ -67774,7 +67989,7 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "Permute2Float64x4", + name: "Permute2Uint64x4", argLen: 3, generic: true, }, @@ -67784,17 +67999,17 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "Permute2Uint64x4", + name: "Permute2Float64x4", argLen: 3, generic: true, }, { - name: "Permute2MaskedFloat64x4", + name: "Permute2MaskedUint64x4", argLen: 4, generic: true, }, { - name: "Permute2MaskedUint64x4", + name: "Permute2MaskedFloat64x4", argLen: 4, generic: true, }, @@ -67804,17 +68019,17 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "PermuteMaskedFloat64x4", + name: "PermuteMaskedUint64x4", argLen: 3, generic: true, }, { - name: "PermuteMaskedInt64x4", + name: "PermuteMaskedFloat64x4", argLen: 3, generic: true, }, { - name: "PermuteMaskedUint64x4", + name: "PermuteMaskedInt64x4", argLen: 3, generic: true, }, @@ -68082,52 +68297,52 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "PermuteFloat64x8", + name: "PermuteUint64x8", argLen: 2, generic: true, }, { - name: "PermuteInt64x8", + name: "PermuteFloat64x8", argLen: 2, generic: true, }, { - name: "PermuteUint64x8", + name: "PermuteInt64x8", argLen: 2, generic: true, }, { - name: "Permute2Int64x8", + name: "Permute2Float64x8", argLen: 3, generic: true, }, { - name: "Permute2Float64x8", + name: "Permute2Uint64x8", argLen: 3, generic: true, }, { - name: "Permute2Uint64x8", + name: "Permute2Int64x8", argLen: 3, generic: true, }, { - name: "Permute2MaskedUint64x8", + name: "Permute2MaskedFloat64x8", argLen: 4, generic: true, }, { - name: "Permute2MaskedInt64x8", + name: "Permute2MaskedUint64x8", argLen: 4, generic: true, }, { - name: "Permute2MaskedFloat64x8", + name: "Permute2MaskedInt64x8", argLen: 4, generic: true, }, { - name: "PermuteMaskedUint64x8", + name: "PermuteMaskedInt64x8", argLen: 3, generic: true, }, @@ -68137,7 +68352,7 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "PermuteMaskedInt64x8", + name: "PermuteMaskedUint64x8", argLen: 3, generic: true, }, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 0ff19a680e..ecd4a21f43 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -985,10 +985,10 @@ func rewriteValueAMD64(v *Value) bool { v.Op = OpAMD64VRCP14PS512 return true case OpApproximateReciprocalFloat32x4: - v.Op = OpAMD64VRCP14PS128 + v.Op = 
OpAMD64VRCPPS128 return true case OpApproximateReciprocalFloat32x8: - v.Op = OpAMD64VRCP14PS256 + v.Op = OpAMD64VRCPPS256 return true case OpApproximateReciprocalFloat64x2: v.Op = OpAMD64VRCP14PD128 @@ -5184,6 +5184,30 @@ func rewriteValueAMD64(v *Value) bool { return true case OpStore: return rewriteValueAMD64_OpStore(v) + case OpStoreMask16x16: + return rewriteValueAMD64_OpStoreMask16x16(v) + case OpStoreMask16x32: + return rewriteValueAMD64_OpStoreMask16x32(v) + case OpStoreMask16x8: + return rewriteValueAMD64_OpStoreMask16x8(v) + case OpStoreMask32x16: + return rewriteValueAMD64_OpStoreMask32x16(v) + case OpStoreMask32x4: + return rewriteValueAMD64_OpStoreMask32x4(v) + case OpStoreMask32x8: + return rewriteValueAMD64_OpStoreMask32x8(v) + case OpStoreMask64x2: + return rewriteValueAMD64_OpStoreMask64x2(v) + case OpStoreMask64x4: + return rewriteValueAMD64_OpStoreMask64x4(v) + case OpStoreMask64x8: + return rewriteValueAMD64_OpStoreMask64x8(v) + case OpStoreMask8x16: + return rewriteValueAMD64_OpStoreMask8x16(v) + case OpStoreMask8x32: + return rewriteValueAMD64_OpStoreMask8x32(v) + case OpStoreMask8x64: + return rewriteValueAMD64_OpStoreMask8x64(v) case OpSub16: v.Op = OpAMD64SUBL return true @@ -33388,13 +33412,12 @@ func rewriteValueAMD64_OpEqualInt16x32(v *Value) bool { b := v.Block typ := &b.Func.Config.Types // match: (EqualInt16x32 x y) - // result: (VPMOVMToVec16x32 (VPCMPW512 [0] x y)) + // result: (VPMOVMToVec16x32 (VPCMPEQW512 x y)) for { x := v_0 y := v_1 v.reset(OpAMD64VPMOVMToVec16x32) - v0 := b.NewValue0(v.Pos, OpAMD64VPCMPW512, typ.Mask) - v0.AuxInt = int8ToAuxInt(0) + v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQW512, typ.Mask) v0.AddArg2(x, y) v.AddArg(v0) return true @@ -33406,13 +33429,12 @@ func rewriteValueAMD64_OpEqualInt32x16(v *Value) bool { b := v.Block typ := &b.Func.Config.Types // match: (EqualInt32x16 x y) - // result: (VPMOVMToVec32x16 (VPCMPD512 [0] x y)) + // result: (VPMOVMToVec32x16 (VPCMPEQD512 x y)) for { x := v_0 y := v_1 v.reset(OpAMD64VPMOVMToVec32x16) - v0 := b.NewValue0(v.Pos, OpAMD64VPCMPD512, typ.Mask) - v0.AuxInt = int8ToAuxInt(0) + v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQD512, typ.Mask) v0.AddArg2(x, y) v.AddArg(v0) return true @@ -33424,13 +33446,12 @@ func rewriteValueAMD64_OpEqualInt64x8(v *Value) bool { b := v.Block typ := &b.Func.Config.Types // match: (EqualInt64x8 x y) - // result: (VPMOVMToVec64x8 (VPCMPQ512 [0] x y)) + // result: (VPMOVMToVec64x8 (VPCMPEQQ512 x y)) for { x := v_0 y := v_1 v.reset(OpAMD64VPMOVMToVec64x8) - v0 := b.NewValue0(v.Pos, OpAMD64VPCMPQ512, typ.Mask) - v0.AuxInt = int8ToAuxInt(0) + v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQQ512, typ.Mask) v0.AddArg2(x, y) v.AddArg(v0) return true @@ -33442,13 +33463,12 @@ func rewriteValueAMD64_OpEqualInt8x64(v *Value) bool { b := v.Block typ := &b.Func.Config.Types // match: (EqualInt8x64 x y) - // result: (VPMOVMToVec8x64 (VPCMPB512 [0] x y)) + // result: (VPMOVMToVec8x64 (VPCMPEQB512 x y)) for { x := v_0 y := v_1 v.reset(OpAMD64VPMOVMToVec8x64) - v0 := b.NewValue0(v.Pos, OpAMD64VPCMPB512, typ.Mask) - v0.AuxInt = int8ToAuxInt(0) + v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQB512, typ.Mask) v0.AddArg2(x, y) v.AddArg(v0) return true @@ -34120,13 +34140,12 @@ func rewriteValueAMD64_OpEqualUint16x32(v *Value) bool { b := v.Block typ := &b.Func.Config.Types // match: (EqualUint16x32 x y) - // result: (VPMOVMToVec16x32 (VPCMPUW512 [0] x y)) + // result: (VPMOVMToVec16x32 (VPCMPEQW512 x y)) for { x := v_0 y := v_1 v.reset(OpAMD64VPMOVMToVec16x32) - v0 := b.NewValue0(v.Pos, OpAMD64VPCMPUW512, 
typ.Mask) - v0.AuxInt = int8ToAuxInt(0) + v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQW512, typ.Mask) v0.AddArg2(x, y) v.AddArg(v0) return true @@ -34138,13 +34157,12 @@ func rewriteValueAMD64_OpEqualUint32x16(v *Value) bool { b := v.Block typ := &b.Func.Config.Types // match: (EqualUint32x16 x y) - // result: (VPMOVMToVec32x16 (VPCMPUD512 [0] x y)) + // result: (VPMOVMToVec32x16 (VPCMPEQD512 x y)) for { x := v_0 y := v_1 v.reset(OpAMD64VPMOVMToVec32x16) - v0 := b.NewValue0(v.Pos, OpAMD64VPCMPUD512, typ.Mask) - v0.AuxInt = int8ToAuxInt(0) + v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQD512, typ.Mask) v0.AddArg2(x, y) v.AddArg(v0) return true @@ -34156,13 +34174,12 @@ func rewriteValueAMD64_OpEqualUint64x8(v *Value) bool { b := v.Block typ := &b.Func.Config.Types // match: (EqualUint64x8 x y) - // result: (VPMOVMToVec64x8 (VPCMPUQ512 [0] x y)) + // result: (VPMOVMToVec64x8 (VPCMPEQQ512 x y)) for { x := v_0 y := v_1 v.reset(OpAMD64VPMOVMToVec64x8) - v0 := b.NewValue0(v.Pos, OpAMD64VPCMPUQ512, typ.Mask) - v0.AuxInt = int8ToAuxInt(0) + v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQQ512, typ.Mask) v0.AddArg2(x, y) v.AddArg(v0) return true @@ -34174,13 +34191,12 @@ func rewriteValueAMD64_OpEqualUint8x64(v *Value) bool { b := v.Block typ := &b.Func.Config.Types // match: (EqualUint8x64 x y) - // result: (VPMOVMToVec8x64 (VPCMPUB512 [0] x y)) + // result: (VPMOVMToVec8x64 (VPCMPEQB512 x y)) for { x := v_0 y := v_1 v.reset(OpAMD64VPMOVMToVec8x64) - v0 := b.NewValue0(v.Pos, OpAMD64VPCMPUB512, typ.Mask) - v0.AuxInt = int8ToAuxInt(0) + v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQB512, typ.Mask) v0.AddArg2(x, y) v.AddArg(v0) return true @@ -36279,13 +36295,12 @@ func rewriteValueAMD64_OpGreaterInt16x32(v *Value) bool { b := v.Block typ := &b.Func.Config.Types // match: (GreaterInt16x32 x y) - // result: (VPMOVMToVec16x32 (VPCMPW512 [14] x y)) + // result: (VPMOVMToVec16x32 (VPCMPGTW512 x y)) for { x := v_0 y := v_1 v.reset(OpAMD64VPMOVMToVec16x32) - v0 := b.NewValue0(v.Pos, OpAMD64VPCMPW512, typ.Mask) - v0.AuxInt = int8ToAuxInt(14) + v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTW512, typ.Mask) v0.AddArg2(x, y) v.AddArg(v0) return true @@ -36297,13 +36312,12 @@ func rewriteValueAMD64_OpGreaterInt32x16(v *Value) bool { b := v.Block typ := &b.Func.Config.Types // match: (GreaterInt32x16 x y) - // result: (VPMOVMToVec32x16 (VPCMPD512 [14] x y)) + // result: (VPMOVMToVec32x16 (VPCMPGTD512 x y)) for { x := v_0 y := v_1 v.reset(OpAMD64VPMOVMToVec32x16) - v0 := b.NewValue0(v.Pos, OpAMD64VPCMPD512, typ.Mask) - v0.AuxInt = int8ToAuxInt(14) + v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTD512, typ.Mask) v0.AddArg2(x, y) v.AddArg(v0) return true @@ -36315,13 +36329,12 @@ func rewriteValueAMD64_OpGreaterInt64x8(v *Value) bool { b := v.Block typ := &b.Func.Config.Types // match: (GreaterInt64x8 x y) - // result: (VPMOVMToVec64x8 (VPCMPQ512 [14] x y)) + // result: (VPMOVMToVec64x8 (VPCMPGTQ512 x y)) for { x := v_0 y := v_1 v.reset(OpAMD64VPMOVMToVec64x8) - v0 := b.NewValue0(v.Pos, OpAMD64VPCMPQ512, typ.Mask) - v0.AuxInt = int8ToAuxInt(14) + v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTQ512, typ.Mask) v0.AddArg2(x, y) v.AddArg(v0) return true @@ -36333,13 +36346,12 @@ func rewriteValueAMD64_OpGreaterInt8x64(v *Value) bool { b := v.Block typ := &b.Func.Config.Types // match: (GreaterInt8x64 x y) - // result: (VPMOVMToVec8x64 (VPCMPB512 [14] x y)) + // result: (VPMOVMToVec8x64 (VPCMPGTB512 x y)) for { x := v_0 y := v_1 v.reset(OpAMD64VPMOVMToVec8x64) - v0 := b.NewValue0(v.Pos, OpAMD64VPCMPB512, typ.Mask) - v0.AuxInt = int8ToAuxInt(14) + v0 := b.NewValue0(v.Pos, 
OpAMD64VPCMPGTB512, typ.Mask) v0.AddArg2(x, y) v.AddArg(v0) return true @@ -53277,6 +53289,234 @@ func rewriteValueAMD64_OpStore(v *Value) bool { } return false } +func rewriteValueAMD64_OpStoreMask16x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (StoreMask16x16 {t} ptr val mem) + // result: (KMOVQstore ptr (VPMOVVec16x16ToM val) mem) + for { + t := auxToType(v.Aux) + ptr := v_0 + val := v_1 + mem := v_2 + v.reset(OpAMD64KMOVQstore) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, t) + v0.AddArg(val) + v.AddArg3(ptr, v0, mem) + return true + } +} +func rewriteValueAMD64_OpStoreMask16x32(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (StoreMask16x32 {t} ptr val mem) + // result: (KMOVQstore ptr (VPMOVVec16x32ToM val) mem) + for { + t := auxToType(v.Aux) + ptr := v_0 + val := v_1 + mem := v_2 + v.reset(OpAMD64KMOVQstore) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, t) + v0.AddArg(val) + v.AddArg3(ptr, v0, mem) + return true + } +} +func rewriteValueAMD64_OpStoreMask16x8(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (StoreMask16x8 {t} ptr val mem) + // result: (KMOVQstore ptr (VPMOVVec16x8ToM val) mem) + for { + t := auxToType(v.Aux) + ptr := v_0 + val := v_1 + mem := v_2 + v.reset(OpAMD64KMOVQstore) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, t) + v0.AddArg(val) + v.AddArg3(ptr, v0, mem) + return true + } +} +func rewriteValueAMD64_OpStoreMask32x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (StoreMask32x16 {t} ptr val mem) + // result: (KMOVQstore ptr (VPMOVVec32x16ToM val) mem) + for { + t := auxToType(v.Aux) + ptr := v_0 + val := v_1 + mem := v_2 + v.reset(OpAMD64KMOVQstore) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, t) + v0.AddArg(val) + v.AddArg3(ptr, v0, mem) + return true + } +} +func rewriteValueAMD64_OpStoreMask32x4(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (StoreMask32x4 {t} ptr val mem) + // result: (KMOVQstore ptr (VPMOVVec32x4ToM val) mem) + for { + t := auxToType(v.Aux) + ptr := v_0 + val := v_1 + mem := v_2 + v.reset(OpAMD64KMOVQstore) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, t) + v0.AddArg(val) + v.AddArg3(ptr, v0, mem) + return true + } +} +func rewriteValueAMD64_OpStoreMask32x8(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (StoreMask32x8 {t} ptr val mem) + // result: (KMOVQstore ptr (VPMOVVec32x8ToM val) mem) + for { + t := auxToType(v.Aux) + ptr := v_0 + val := v_1 + mem := v_2 + v.reset(OpAMD64KMOVQstore) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, t) + v0.AddArg(val) + v.AddArg3(ptr, v0, mem) + return true + } +} +func rewriteValueAMD64_OpStoreMask64x2(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (StoreMask64x2 {t} ptr val mem) + // result: (KMOVQstore ptr (VPMOVVec64x2ToM val) mem) + for { + t := auxToType(v.Aux) + ptr := v_0 + val := v_1 + mem := v_2 + v.reset(OpAMD64KMOVQstore) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, t) + v0.AddArg(val) + v.AddArg3(ptr, v0, mem) + return true + } +} +func rewriteValueAMD64_OpStoreMask64x4(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (StoreMask64x4 {t} ptr val mem) + // result: (KMOVQstore ptr (VPMOVVec64x4ToM val) mem) + for { + t := 
auxToType(v.Aux) + ptr := v_0 + val := v_1 + mem := v_2 + v.reset(OpAMD64KMOVQstore) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, t) + v0.AddArg(val) + v.AddArg3(ptr, v0, mem) + return true + } +} +func rewriteValueAMD64_OpStoreMask64x8(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (StoreMask64x8 {t} ptr val mem) + // result: (KMOVQstore ptr (VPMOVVec64x8ToM val) mem) + for { + t := auxToType(v.Aux) + ptr := v_0 + val := v_1 + mem := v_2 + v.reset(OpAMD64KMOVQstore) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, t) + v0.AddArg(val) + v.AddArg3(ptr, v0, mem) + return true + } +} +func rewriteValueAMD64_OpStoreMask8x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (StoreMask8x16 {t} ptr val mem) + // result: (KMOVQstore ptr (VPMOVVec8x16ToM val) mem) + for { + t := auxToType(v.Aux) + ptr := v_0 + val := v_1 + mem := v_2 + v.reset(OpAMD64KMOVQstore) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, t) + v0.AddArg(val) + v.AddArg3(ptr, v0, mem) + return true + } +} +func rewriteValueAMD64_OpStoreMask8x32(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (StoreMask8x32 {t} ptr val mem) + // result: (KMOVQstore ptr (VPMOVVec8x32ToM val) mem) + for { + t := auxToType(v.Aux) + ptr := v_0 + val := v_1 + mem := v_2 + v.reset(OpAMD64KMOVQstore) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, t) + v0.AddArg(val) + v.AddArg3(ptr, v0, mem) + return true + } +} +func rewriteValueAMD64_OpStoreMask8x64(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (StoreMask8x64 {t} ptr val mem) + // result: (KMOVQstore ptr (VPMOVVec8x64ToM val) mem) + for { + t := auxToType(v.Aux) + ptr := v_0 + val := v_1 + mem := v_2 + v.reset(OpAMD64KMOVQstore) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, t) + v0.AddArg(val) + v.AddArg3(ptr, v0, mem) + return true + } +} func rewriteValueAMD64_OpSubMaskedFloat32x16(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go index e012b536b5..0284729a52 100644 --- a/src/cmd/compile/internal/ssagen/intrinsics.go +++ b/src/cmd/compile/internal/ssagen/intrinsics.go @@ -1791,6 +1791,23 @@ func simdLoadMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ss } } +func simdStoreMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + opCodes := map[int]map[int]ssa.Op{ + 8: {16: ssa.OpStoreMask8x16, 32: ssa.OpStoreMask8x32, 64: ssa.OpStoreMask8x64}, + 16: {8: ssa.OpStoreMask16x8, 16: ssa.OpStoreMask16x16, 32: ssa.OpStoreMask16x32}, + 32: {4: ssa.OpStoreMask32x4, 8: ssa.OpStoreMask32x8, 16: ssa.OpStoreMask32x16}, + 64: {2: ssa.OpStoreMask64x2, 4: ssa.OpStoreMask64x4, 8: ssa.OpStoreMask64x8}, + } + op := opCodes[elemBits][lanes] + if op == 0 { + panic(fmt.Sprintf("Unknown mask shape: Mask%dx%d", elemBits, lanes)) + } + s.vars[memVar] = s.newValue3A(op, types.TypeMem, types.TypeMask, args[1], args[0], s.mem()) + return nil + } +} + // findIntrinsic returns a function which builds the SSA equivalent of the // function identified by the symbol sym. If sym is not an intrinsic call, returns nil. 
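A minimal caller-side sketch of what the simdStoreMask builder above wires up, assuming StoreToBits takes a *uint64 destination that mirrors LoadMask8x16FromBits (the snippet itself is illustrative, not part of this CL):

package masks

import "simd"

// equalBits writes the lane-wise equality mask of two 512-bit vectors
// to a uint64. Equal lowers to the VPCMPEQB form added in this CL;
// StoreToBits lowers through StoreMask8x64 to KMOVQstore.
func equalBits(x, y simd.Int8x64) uint64 {
	var bits uint64
	x.Equal(y).StoreToBits(&bits)
	return bits
}
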
func findIntrinsic(sym *types.Sym) intrinsicBuilder { diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 8040a187bd..8b3b08f886 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -310,34 +310,34 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Float64x2.DotProdBroadcast", opLen2(ssa.OpDotProdBroadcastFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x16.Equal", opLen2(ssa.OpEqualInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.Equal", opLen2(ssa.OpEqualInt8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.Equal", opLen2(ssa.OpEqualInt8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int16x8.Equal", opLen2(ssa.OpEqualInt16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int16x16.Equal", opLen2(ssa.OpEqualInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.Equal", opLen2(ssa.OpEqualInt16x32, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int32x4.Equal", opLen2(ssa.OpEqualInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int32x8.Equal", opLen2(ssa.OpEqualInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x16.Equal", opLen2(ssa.OpEqualInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int64x2.Equal", opLen2(ssa.OpEqualInt64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int64x4.Equal", opLen2(ssa.OpEqualInt64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int64x8.Equal", opLen2(ssa.OpEqualInt64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint8x16.Equal", opLen2(ssa.OpEqualUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint8x32.Equal", opLen2(ssa.OpEqualUint8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x64.Equal", opLen2(ssa.OpEqualUint8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint16x8.Equal", opLen2(ssa.OpEqualUint16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint16x16.Equal", opLen2(ssa.OpEqualUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x32.Equal", opLen2(ssa.OpEqualUint16x32, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint32x4.Equal", opLen2(ssa.OpEqualUint32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint32x8.Equal", opLen2(ssa.OpEqualUint32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x16.Equal", opLen2(ssa.OpEqualUint32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint64x2.Equal", opLen2(ssa.OpEqualUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.Equal", opLen2(ssa.OpEqualUint64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint64x8.Equal", opLen2(ssa.OpEqualUint64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.Equal", opLen2(ssa.OpEqualFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.Equal", opLen2(ssa.OpEqualFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.Equal", opLen2(ssa.OpEqualFloat32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float64x2.Equal", opLen2(ssa.OpEqualFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x4.Equal", opLen2(ssa.OpEqualFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x8.Equal", opLen2(ssa.OpEqualFloat64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int8x64.Equal", opLen2(ssa.OpEqualInt8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x32.Equal", opLen2(ssa.OpEqualInt16x32, types.TypeVec512), 
sys.AMD64) - addF(simdPackage, "Int32x16.Equal", opLen2(ssa.OpEqualInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int64x8.Equal", opLen2(ssa.OpEqualInt64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint8x64.Equal", opLen2(ssa.OpEqualUint8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint16x32.Equal", opLen2(ssa.OpEqualUint16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint32x16.Equal", opLen2(ssa.OpEqualUint32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint64x8.Equal", opLen2(ssa.OpEqualUint64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.EqualMasked", opLen3(ssa.OpEqualMaskedFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.EqualMasked", opLen3(ssa.OpEqualMaskedFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.EqualMasked", opLen3(ssa.OpEqualMaskedFloat32x16, types.TypeVec512), sys.AMD64) @@ -458,22 +458,22 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Uint64x2.GetElem", opLen1Imm8(ssa.OpGetElemUint64x2, types.Types[types.TUINT64], 0), sys.AMD64) addF(simdPackage, "Int8x16.Greater", opLen2(ssa.OpGreaterInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.Greater", opLen2(ssa.OpGreaterInt8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.Greater", opLen2(ssa.OpGreaterInt8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int16x8.Greater", opLen2(ssa.OpGreaterInt16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int16x16.Greater", opLen2(ssa.OpGreaterInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.Greater", opLen2(ssa.OpGreaterInt16x32, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int32x4.Greater", opLen2(ssa.OpGreaterInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int32x8.Greater", opLen2(ssa.OpGreaterInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x16.Greater", opLen2(ssa.OpGreaterInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int64x2.Greater", opLen2(ssa.OpGreaterInt64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int64x4.Greater", opLen2(ssa.OpGreaterInt64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int64x8.Greater", opLen2(ssa.OpGreaterInt64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.Greater", opLen2(ssa.OpGreaterFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.Greater", opLen2(ssa.OpGreaterFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.Greater", opLen2(ssa.OpGreaterFloat32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float64x2.Greater", opLen2(ssa.OpGreaterFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x4.Greater", opLen2(ssa.OpGreaterFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x8.Greater", opLen2(ssa.OpGreaterFloat64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int8x64.Greater", opLen2(ssa.OpGreaterInt8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x32.Greater", opLen2(ssa.OpGreaterInt16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int32x16.Greater", opLen2(ssa.OpGreaterInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int64x8.Greater", opLen2(ssa.OpGreaterInt64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint8x16.Greater", opLen2(ssa.OpGreaterUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint8x32.Greater", opLen2(ssa.OpGreaterUint8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint8x64.Greater", 
opLen2(ssa.OpGreaterUint8x64, types.TypeVec512), sys.AMD64) @@ -2137,59 +2137,71 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Mask8x16.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask8x16.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "LoadMask8x16FromBits", simdLoadMask(8, 16), sys.AMD64) + addF(simdPackage, "Mask8x16.StoreToBits", simdStoreMask(8, 16), sys.AMD64) addF(simdPackage, "Mask8x32.AsInt8x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int8x32.AsMask8x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask8x32.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask8x32.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "LoadMask8x32FromBits", simdLoadMask(8, 32), sys.AMD64) + addF(simdPackage, "Mask8x32.StoreToBits", simdStoreMask(8, 32), sys.AMD64) addF(simdPackage, "Mask8x64.AsInt8x64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int8x64.AsMask8x64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask8x64.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask8x64.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "LoadMask8x64FromBits", simdLoadMask(8, 64), sys.AMD64) + addF(simdPackage, "Mask8x64.StoreToBits", simdStoreMask(8, 64), sys.AMD64) addF(simdPackage, "Mask16x8.AsInt16x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int16x8.AsMask16x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask16x8.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask16x8.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "LoadMask16x8FromBits", simdLoadMask(16, 8), sys.AMD64) + addF(simdPackage, "Mask16x8.StoreToBits", simdStoreMask(16, 8), sys.AMD64) addF(simdPackage, "Mask16x16.AsInt16x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int16x16.AsMask16x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask16x16.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask16x16.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "LoadMask16x16FromBits", simdLoadMask(16, 16), sys.AMD64) + addF(simdPackage, "Mask16x16.StoreToBits", simdStoreMask(16, 16), sys.AMD64) addF(simdPackage, "Mask16x32.AsInt16x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int16x32.AsMask16x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask16x32.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask16x32.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "LoadMask16x32FromBits", simdLoadMask(16, 32), sys.AMD64) + addF(simdPackage, "Mask16x32.StoreToBits", simdStoreMask(16, 32), sys.AMD64) addF(simdPackage, "Mask32x4.AsInt32x4", func(s *state, n *ir.CallExpr, args 
[]*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int32x4.AsMask32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask32x4.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask32x4.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "LoadMask32x4FromBits", simdLoadMask(32, 4), sys.AMD64) + addF(simdPackage, "Mask32x4.StoreToBits", simdStoreMask(32, 4), sys.AMD64) addF(simdPackage, "Mask32x8.AsInt32x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int32x8.AsMask32x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask32x8.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask32x8.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "LoadMask32x8FromBits", simdLoadMask(32, 8), sys.AMD64) + addF(simdPackage, "Mask32x8.StoreToBits", simdStoreMask(32, 8), sys.AMD64) addF(simdPackage, "Mask32x16.AsInt32x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int32x16.AsMask32x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask32x16.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask32x16.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "LoadMask32x16FromBits", simdLoadMask(32, 16), sys.AMD64) + addF(simdPackage, "Mask32x16.StoreToBits", simdStoreMask(32, 16), sys.AMD64) addF(simdPackage, "Mask64x2.AsInt64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int64x2.AsMask64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask64x2.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask64x2.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "LoadMask64x2FromBits", simdLoadMask(64, 2), sys.AMD64) + addF(simdPackage, "Mask64x2.StoreToBits", simdStoreMask(64, 2), sys.AMD64) addF(simdPackage, "Mask64x4.AsInt64x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int64x4.AsMask64x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask64x4.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask64x4.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "LoadMask64x4FromBits", simdLoadMask(64, 4), sys.AMD64) + addF(simdPackage, "Mask64x4.StoreToBits", simdStoreMask(64, 4), sys.AMD64) addF(simdPackage, "Mask64x8.AsInt64x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int64x8.AsMask64x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask64x8.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask64x8.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "LoadMask64x8FromBits", simdLoadMask(64, 8), sys.AMD64) + addF(simdPackage, "Mask64x8.StoreToBits", simdStoreMask(64, 8), sys.AMD64) } diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index 
a5c2f2d5c2..318883ea19 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -918,12 +918,12 @@ func (x Uint64x8) AndNotMasked(y Uint64x8, mask Mask64x8) Uint64x8 // ApproximateReciprocal computes an approximate reciprocal of each element. // -// Asm: VRCP14PS, CPU Feature: AVX512F +// Asm: VRCPPS, CPU Feature: AVX func (x Float32x4) ApproximateReciprocal() Float32x4 // ApproximateReciprocal computes an approximate reciprocal of each element. // -// Asm: VRCP14PS, CPU Feature: AVX512F +// Asm: VRCPPS, CPU Feature: AVX func (x Float32x8) ApproximateReciprocal() Float32x8 // ApproximateReciprocal computes an approximate reciprocal of each element. @@ -1951,6 +1951,11 @@ func (x Int8x16) Equal(y Int8x16) Mask8x16 // Asm: VPCMPEQB, CPU Feature: AVX2 func (x Int8x32) Equal(y Int8x32) Mask8x32 +// Equal compares for equality. +// +// Asm: VPCMPEQB, CPU Feature: AVX512BW +func (x Int8x64) Equal(y Int8x64) Mask8x64 + // Equal compares for equality. // // Asm: VPCMPEQW, CPU Feature: AVX @@ -1961,6 +1966,11 @@ func (x Int16x8) Equal(y Int16x8) Mask16x8 // Asm: VPCMPEQW, CPU Feature: AVX2 func (x Int16x16) Equal(y Int16x16) Mask16x16 +// Equal compares for equality. +// +// Asm: VPCMPEQW, CPU Feature: AVX512BW +func (x Int16x32) Equal(y Int16x32) Mask16x32 + // Equal compares for equality. // // Asm: VPCMPEQD, CPU Feature: AVX @@ -1971,6 +1981,11 @@ func (x Int32x4) Equal(y Int32x4) Mask32x4 // Asm: VPCMPEQD, CPU Feature: AVX2 func (x Int32x8) Equal(y Int32x8) Mask32x8 +// Equal compares for equality. +// +// Asm: VPCMPEQD, CPU Feature: AVX512F +func (x Int32x16) Equal(y Int32x16) Mask32x16 + // Equal compares for equality. // // Asm: VPCMPEQQ, CPU Feature: AVX @@ -1981,6 +1996,11 @@ func (x Int64x2) Equal(y Int64x2) Mask64x2 // Asm: VPCMPEQQ, CPU Feature: AVX2 func (x Int64x4) Equal(y Int64x4) Mask64x4 +// Equal compares for equality. +// +// Asm: VPCMPEQQ, CPU Feature: AVX512F +func (x Int64x8) Equal(y Int64x8) Mask64x8 + // Equal compares for equality. // // Asm: VPCMPEQB, CPU Feature: AVX @@ -1991,6 +2011,11 @@ func (x Uint8x16) Equal(y Uint8x16) Mask8x16 // Asm: VPCMPEQB, CPU Feature: AVX2 func (x Uint8x32) Equal(y Uint8x32) Mask8x32 +// Equal compares for equality. +// +// Asm: VPCMPEQB, CPU Feature: AVX512BW +func (x Uint8x64) Equal(y Uint8x64) Mask8x64 + // Equal compares for equality. // // Asm: VPCMPEQW, CPU Feature: AVX @@ -2001,6 +2026,11 @@ func (x Uint16x8) Equal(y Uint16x8) Mask16x8 // Asm: VPCMPEQW, CPU Feature: AVX2 func (x Uint16x16) Equal(y Uint16x16) Mask16x16 +// Equal compares for equality. +// +// Asm: VPCMPEQW, CPU Feature: AVX512BW +func (x Uint16x32) Equal(y Uint16x32) Mask16x32 + // Equal compares for equality. // // Asm: VPCMPEQD, CPU Feature: AVX @@ -2011,6 +2041,11 @@ func (x Uint32x4) Equal(y Uint32x4) Mask32x4 // Asm: VPCMPEQD, CPU Feature: AVX2 func (x Uint32x8) Equal(y Uint32x8) Mask32x8 +// Equal compares for equality. +// +// Asm: VPCMPEQD, CPU Feature: AVX512F +func (x Uint32x16) Equal(y Uint32x16) Mask32x16 + // Equal compares for equality. // // Asm: VPCMPEQQ, CPU Feature: AVX @@ -2021,6 +2056,11 @@ func (x Uint64x2) Equal(y Uint64x2) Mask64x2 // Asm: VPCMPEQQ, CPU Feature: AVX2 func (x Uint64x4) Equal(y Uint64x4) Mask64x4 +// Equal compares for equality. +// +// Asm: VPCMPEQQ, CPU Feature: AVX512F +func (x Uint64x8) Equal(y Uint64x8) Mask64x8 + // Equal compares for equality. 
// // Asm: VCMPPS, CPU Feature: AVX @@ -2051,46 +2091,6 @@ func (x Float64x4) Equal(y Float64x4) Mask64x4 // Asm: VCMPPD, CPU Feature: AVX512F func (x Float64x8) Equal(y Float64x8) Mask64x8 -// Equal compares for equality. -// -// Asm: VPCMPB, CPU Feature: AVX512BW -func (x Int8x64) Equal(y Int8x64) Mask8x64 - -// Equal compares for equality. -// -// Asm: VPCMPW, CPU Feature: AVX512BW -func (x Int16x32) Equal(y Int16x32) Mask16x32 - -// Equal compares for equality. -// -// Asm: VPCMPD, CPU Feature: AVX512F -func (x Int32x16) Equal(y Int32x16) Mask32x16 - -// Equal compares for equality. -// -// Asm: VPCMPQ, CPU Feature: AVX512F -func (x Int64x8) Equal(y Int64x8) Mask64x8 - -// Equal compares for equality. -// -// Asm: VPCMPUB, CPU Feature: AVX512BW -func (x Uint8x64) Equal(y Uint8x64) Mask8x64 - -// Equal compares for equality. -// -// Asm: VPCMPUW, CPU Feature: AVX512BW -func (x Uint16x32) Equal(y Uint16x32) Mask16x32 - -// Equal compares for equality. -// -// Asm: VPCMPUD, CPU Feature: AVX512F -func (x Uint32x16) Equal(y Uint32x16) Mask32x16 - -// Equal compares for equality. -// -// Asm: VPCMPUQ, CPU Feature: AVX512F -func (x Uint64x8) Equal(y Uint64x8) Mask64x8 - /* EqualMasked */ // EqualMasked compares for equality. @@ -2733,7 +2733,7 @@ func (x Uint8x64) GaloisFieldAffineTransformInverse(y Uint64x8, b uint8) Uint8x6 // b is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VGF2P8AFFINEINVQB, CPU Feature: AVX512GFNI -func (x Uint8x16) GaloisFieldAffineTransformInverseMasked(y Uint64x2, b uint8, m Mask8x16) Uint8x16 +func (x Uint8x16) GaloisFieldAffineTransformInverseMasked(y Uint64x2, b uint8, mask Mask8x16) Uint8x16 // GaloisFieldAffineTransformInverseMasked computes an affine transformation in GF(2^8), // with x inverted with respect to reduction polynomial x^8 + x^4 + x^3 + x + 1: @@ -2746,7 +2746,7 @@ func (x Uint8x16) GaloisFieldAffineTransformInverseMasked(y Uint64x2, b uint8, m // b is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VGF2P8AFFINEINVQB, CPU Feature: AVX512GFNI -func (x Uint8x32) GaloisFieldAffineTransformInverseMasked(y Uint64x4, b uint8, m Mask8x32) Uint8x32 +func (x Uint8x32) GaloisFieldAffineTransformInverseMasked(y Uint64x4, b uint8, mask Mask8x32) Uint8x32 // GaloisFieldAffineTransformInverseMasked computes an affine transformation in GF(2^8), // with x inverted with respect to reduction polynomial x^8 + x^4 + x^3 + x + 1: @@ -2759,7 +2759,7 @@ func (x Uint8x32) GaloisFieldAffineTransformInverseMasked(y Uint64x4, b uint8, m // b is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VGF2P8AFFINEINVQB, CPU Feature: AVX512GFNI -func (x Uint8x64) GaloisFieldAffineTransformInverseMasked(y Uint64x8, b uint8, m Mask8x64) Uint8x64 +func (x Uint8x64) GaloisFieldAffineTransformInverseMasked(y Uint64x8, b uint8, mask Mask8x64) Uint8x64 /* GaloisFieldAffineTransformMasked */ @@ -2773,7 +2773,7 @@ func (x Uint8x64) GaloisFieldAffineTransformInverseMasked(y Uint64x8, b uint8, m // b is expected to be a constant, non-constant value will trigger a runtime panic. 
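An illustrative (hypothetical) use of the affine-transform form documented here: bit-reversing each byte with the well-known 0x8040201008040201 matrix. LoadUint64x2 is assumed to be the package's array-pointer loader, and b must be the constant 0:

package gf

import "simd"

// bitReverseBytes reverses the bit order inside every byte of x.
// VGF2P8AFFINEQB computes A*x + b over GF(2) per byte, and this A is
// the bit-reversal matrix.
func bitReverseBytes(x simd.Uint8x16) simd.Uint8x16 {
	a := [2]uint64{0x8040201008040201, 0x8040201008040201}
	return x.GaloisFieldAffineTransform(simd.LoadUint64x2(&a), 0)
}
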
// // Asm: VGF2P8AFFINEQB, CPU Feature: AVX512GFNI -func (x Uint8x16) GaloisFieldAffineTransformMasked(y Uint64x2, b uint8, m Mask8x16) Uint8x16 +func (x Uint8x16) GaloisFieldAffineTransformMasked(y Uint64x2, b uint8, mask Mask8x16) Uint8x16 // GaloisFieldAffineTransformMasked computes an affine transformation in GF(2^8): // x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrixes; @@ -2785,7 +2785,7 @@ func (x Uint8x16) GaloisFieldAffineTransformMasked(y Uint64x2, b uint8, m Mask8x // b is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VGF2P8AFFINEQB, CPU Feature: AVX512GFNI -func (x Uint8x32) GaloisFieldAffineTransformMasked(y Uint64x4, b uint8, m Mask8x32) Uint8x32 +func (x Uint8x32) GaloisFieldAffineTransformMasked(y Uint64x4, b uint8, mask Mask8x32) Uint8x32 // GaloisFieldAffineTransformMasked computes an affine transformation in GF(2^8): // x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrixes; @@ -2797,7 +2797,7 @@ func (x Uint8x32) GaloisFieldAffineTransformMasked(y Uint64x4, b uint8, m Mask8x // b is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VGF2P8AFFINEQB, CPU Feature: AVX512GFNI -func (x Uint8x64) GaloisFieldAffineTransformMasked(y Uint64x8, b uint8, m Mask8x64) Uint8x64 +func (x Uint8x64) GaloisFieldAffineTransformMasked(y Uint64x8, b uint8, mask Mask8x64) Uint8x64 /* GaloisFieldMul */ @@ -2987,6 +2987,11 @@ func (x Int8x16) Greater(y Int8x16) Mask8x16 // Asm: VPCMPGTB, CPU Feature: AVX2 func (x Int8x32) Greater(y Int8x32) Mask8x32 +// Greater compares for greater than. +// +// Asm: VPCMPGTB, CPU Feature: AVX512BW +func (x Int8x64) Greater(y Int8x64) Mask8x64 + // Greater compares for greater than. // // Asm: VPCMPGTW, CPU Feature: AVX @@ -2997,6 +3002,11 @@ func (x Int16x8) Greater(y Int16x8) Mask16x8 // Asm: VPCMPGTW, CPU Feature: AVX2 func (x Int16x16) Greater(y Int16x16) Mask16x16 +// Greater compares for greater than. +// +// Asm: VPCMPGTW, CPU Feature: AVX512BW +func (x Int16x32) Greater(y Int16x32) Mask16x32 + // Greater compares for greater than. // // Asm: VPCMPGTD, CPU Feature: AVX @@ -3007,6 +3017,11 @@ func (x Int32x4) Greater(y Int32x4) Mask32x4 // Asm: VPCMPGTD, CPU Feature: AVX2 func (x Int32x8) Greater(y Int32x8) Mask32x8 +// Greater compares for greater than. +// +// Asm: VPCMPGTD, CPU Feature: AVX512F +func (x Int32x16) Greater(y Int32x16) Mask32x16 + // Greater compares for greater than. // // Asm: VPCMPGTQ, CPU Feature: AVX @@ -3017,6 +3032,11 @@ func (x Int64x2) Greater(y Int64x2) Mask64x2 // Asm: VPCMPGTQ, CPU Feature: AVX2 func (x Int64x4) Greater(y Int64x4) Mask64x4 +// Greater compares for greater than. +// +// Asm: VPCMPGTQ, CPU Feature: AVX512F +func (x Int64x8) Greater(y Int64x8) Mask64x8 + // Greater compares for greater than. // // Asm: VCMPPS, CPU Feature: AVX @@ -3047,26 +3067,6 @@ func (x Float64x4) Greater(y Float64x4) Mask64x4 // Asm: VCMPPD, CPU Feature: AVX512F func (x Float64x8) Greater(y Float64x8) Mask64x8 -// Greater compares for greater than. -// -// Asm: VPCMPB, CPU Feature: AVX512BW -func (x Int8x64) Greater(y Int8x64) Mask8x64 - -// Greater compares for greater than. -// -// Asm: VPCMPW, CPU Feature: AVX512BW -func (x Int16x32) Greater(y Int16x32) Mask16x32 - -// Greater compares for greater than. -// -// Asm: VPCMPD, CPU Feature: AVX512F -func (x Int32x16) Greater(y Int32x16) Mask32x16 - -// Greater compares for greater than. 
-// -// Asm: VPCMPQ, CPU Feature: AVX512F -func (x Int64x8) Greater(y Int64x8) Mask64x8 - // Greater compares for greater than. // // Asm: VPCMPUB, CPU Feature: AVX512BW @@ -6475,84 +6475,84 @@ func (x Uint32x8) PairwiseSub(y Uint32x8) Uint32x8 /* Permute */ -// Permute performs a full permutation of vector y using indices: +// Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // Only the needed bits to represent x's index are used in indices' elements. // // Asm: VPERMB, CPU Feature: AVX512VBMI func (x Int8x16) Permute(indices Uint8x16) Int8x16 -// Permute performs a full permutation of vector y using indices: +// Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // Only the needed bits to represent x's index are used in indices' elements. // // Asm: VPERMB, CPU Feature: AVX512VBMI func (x Uint8x16) Permute(indices Uint8x16) Uint8x16 -// Permute performs a full permutation of vector y using indices: +// Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // Only the needed bits to represent x's index are used in indices' elements. // // Asm: VPERMB, CPU Feature: AVX512VBMI func (x Int8x32) Permute(indices Uint8x32) Int8x32 -// Permute performs a full permutation of vector y using indices: +// Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // Only the needed bits to represent x's index are used in indices' elements. // // Asm: VPERMB, CPU Feature: AVX512VBMI func (x Uint8x32) Permute(indices Uint8x32) Uint8x32 -// Permute performs a full permutation of vector y using indices: +// Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // Only the needed bits to represent x's index are used in indices' elements. // // Asm: VPERMB, CPU Feature: AVX512VBMI func (x Int8x64) Permute(indices Uint8x64) Int8x64 -// Permute performs a full permutation of vector y using indices: +// Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // Only the needed bits to represent x's index are used in indices' elements. // // Asm: VPERMB, CPU Feature: AVX512VBMI func (x Uint8x64) Permute(indices Uint8x64) Uint8x64 -// Permute performs a full permutation of vector y using indices: +// Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // Only the needed bits to represent x's index are used in indices' elements. // // Asm: VPERMW, CPU Feature: AVX512BW func (x Int16x8) Permute(indices Uint16x8) Int16x8 -// Permute performs a full permutation of vector y using indices: +// Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // Only the needed bits to represent x's index are used in indices' elements. // // Asm: VPERMW, CPU Feature: AVX512BW func (x Uint16x8) Permute(indices Uint16x8) Uint16x8 -// Permute performs a full permutation of vector y using indices: +// Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // Only the needed bits to represent x's index are used in indices' elements. 
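The doc comment reads as a gather from x; for instance, reversing lanes (a sketch assuming the package's LoadUint16x16 array-pointer loader, which is outside this diff):

package perm

import "simd"

// reverseLanes returns x with its 16 word lanes reversed, i.e.
// result[i] = x[idx[i]] with idx = {15, ..., 0} (VPERMW).
func reverseLanes(x simd.Int16x16) simd.Int16x16 {
	idx := [16]uint16{15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}
	return x.Permute(simd.LoadUint16x16(&idx))
}
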
// // Asm: VPERMW, CPU Feature: AVX512BW func (x Int16x16) Permute(indices Uint16x16) Int16x16 -// Permute performs a full permutation of vector y using indices: +// Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // Only the needed bits to represent x's index are used in indices' elements. // // Asm: VPERMW, CPU Feature: AVX512BW func (x Uint16x16) Permute(indices Uint16x16) Uint16x16 -// Permute performs a full permutation of vector y using indices: +// Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // Only the needed bits to represent x's index are used in indices' elements. // // Asm: VPERMW, CPU Feature: AVX512BW func (x Int16x32) Permute(indices Uint16x32) Int16x32 -// Permute performs a full permutation of vector y using indices: +// Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // Only the needed bits to represent x's index are used in indices' elements. // @@ -6580,63 +6580,63 @@ func (x Int32x8) Permute(indices Uint32x8) Int32x8 // Asm: VPERMD, CPU Feature: AVX2 func (x Uint32x8) Permute(indices Uint32x8) Uint32x8 -// Permute performs a full permutation of vector y using indices: +// Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // Only the needed bits to represent x's index are used in indices' elements. // // Asm: VPERMPS, CPU Feature: AVX512F func (x Float32x16) Permute(indices Uint32x16) Float32x16 -// Permute performs a full permutation of vector y using indices: +// Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // Only the needed bits to represent x's index are used in indices' elements. // // Asm: VPERMD, CPU Feature: AVX512F func (x Int32x16) Permute(indices Uint32x16) Int32x16 -// Permute performs a full permutation of vector y using indices: +// Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // Only the needed bits to represent x's index are used in indices' elements. // // Asm: VPERMD, CPU Feature: AVX512F func (x Uint32x16) Permute(indices Uint32x16) Uint32x16 -// Permute performs a full permutation of vector y using indices: +// Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // Only the needed bits to represent x's index are used in indices' elements. // // Asm: VPERMPD, CPU Feature: AVX512F func (x Float64x4) Permute(indices Uint64x4) Float64x4 -// Permute performs a full permutation of vector y using indices: +// Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // Only the needed bits to represent x's index are used in indices' elements. // // Asm: VPERMQ, CPU Feature: AVX512F func (x Int64x4) Permute(indices Uint64x4) Int64x4 -// Permute performs a full permutation of vector y using indices: +// Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // Only the needed bits to represent x's index are used in indices' elements. 
 //
 // Asm: VPERMQ, CPU Feature: AVX512F
 func (x Uint64x4) Permute(indices Uint64x4) Uint64x4
 
-// Permute performs a full permutation of vector y using indices:
+// Permute performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMPD, CPU Feature: AVX512F
 func (x Float64x8) Permute(indices Uint64x8) Float64x8
 
-// Permute performs a full permutation of vector y using indices:
+// Permute performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMQ, CPU Feature: AVX512F
 func (x Int64x8) Permute(indices Uint64x8) Int64x8
 
-// Permute performs a full permutation of vector y using indices:
+// Permute performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
@@ -7189,7 +7189,7 @@ func (x Uint64x8) Permute2Masked(y Uint64x8, indices Uint64x8, mask Mask64x8) Ui
 
 /* PermuteMasked */
 
-// PermuteMasked performs a full permutation of vector y using indices:
+// PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
@@ -7198,7 +7198,7 @@ func (x Uint64x8) Permute2Masked(y Uint64x8, indices Uint64x8, mask Mask64x8) Ui
 // Asm: VPERMB, CPU Feature: AVX512VBMI
 func (x Int8x16) PermuteMasked(indices Uint8x16, mask Mask8x16) Int8x16
 
-// PermuteMasked performs a full permutation of vector y using indices:
+// PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
@@ -7207,7 +7207,7 @@ func (x Int8x16) PermuteMasked(indices Uint8x16, mask Mask8x16) Int8x16
 // Asm: VPERMB, CPU Feature: AVX512VBMI
 func (x Uint8x16) PermuteMasked(indices Uint8x16, mask Mask8x16) Uint8x16
 
-// PermuteMasked performs a full permutation of vector y using indices:
+// PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
@@ -7216,7 +7216,7 @@ func (x Uint8x16) PermuteMasked(indices Uint8x16, mask Mask8x16) Uint8x16
 // Asm: VPERMB, CPU Feature: AVX512VBMI
 func (x Int8x32) PermuteMasked(indices Uint8x32, mask Mask8x32) Int8x32
 
-// PermuteMasked performs a full permutation of vector y using indices:
+// PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
@@ -7225,7 +7225,7 @@ func (x Int8x32) PermuteMasked(indices Uint8x32, mask Mask8x32) Int8x32
 // Asm: VPERMB, CPU Feature: AVX512VBMI
 func (x Uint8x32) PermuteMasked(indices Uint8x32, mask Mask8x32) Uint8x32
 
-// PermuteMasked performs a full permutation of vector y using indices:
+// PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
@@ -7234,7 +7234,7 @@ func (x Uint8x32) PermuteMasked(indices Uint8x32, mask Mask8x32) Uint8x32
 // Asm: VPERMB, CPU Feature: AVX512VBMI
 func (x Int8x64) PermuteMasked(indices Uint8x64, mask Mask8x64) Int8x64
 
-// PermuteMasked performs a full permutation of vector y using indices:
+// PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
@@ -7243,7 +7243,7 @@ func (x Int8x64) PermuteMasked(indices Uint8x64, mask Mask8x64) Int8x64
 // Asm: VPERMB, CPU Feature: AVX512VBMI
 func (x Uint8x64) PermuteMasked(indices Uint8x64, mask Mask8x64) Uint8x64
 
-// PermuteMasked performs a full permutation of vector y using indices:
+// PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
@@ -7252,7 +7252,7 @@ func (x Uint8x64) PermuteMasked(indices Uint8x64, mask Mask8x64) Uint8x64
 // Asm: VPERMW, CPU Feature: AVX512BW
 func (x Int16x8) PermuteMasked(indices Uint16x8, mask Mask16x8) Int16x8
 
-// PermuteMasked performs a full permutation of vector y using indices:
+// PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
@@ -7261,7 +7261,7 @@ func (x Int16x8) PermuteMasked(indices Uint16x8, mask Mask16x8) Int16x8
 // Asm: VPERMW, CPU Feature: AVX512BW
 func (x Uint16x8) PermuteMasked(indices Uint16x8, mask Mask16x8) Uint16x8
 
-// PermuteMasked performs a full permutation of vector y using indices:
+// PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
@@ -7270,7 +7270,7 @@ func (x Uint16x8) PermuteMasked(indices Uint16x8, mask Mask16x8) Uint16x8
 // Asm: VPERMW, CPU Feature: AVX512BW
 func (x Int16x16) PermuteMasked(indices Uint16x16, mask Mask16x16) Int16x16
 
-// PermuteMasked performs a full permutation of vector y using indices:
+// PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
@@ -7279,7 +7279,7 @@ func (x Int16x16) PermuteMasked(indices Uint16x16, mask Mask16x16) Int16x16
 // Asm: VPERMW, CPU Feature: AVX512BW
 func (x Uint16x16) PermuteMasked(indices Uint16x16, mask Mask16x16) Uint16x16
 
-// PermuteMasked performs a full permutation of vector y using indices:
+// PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
@@ -7288,7 +7288,7 @@ func (x Uint16x16) PermuteMasked(indices Uint16x16, mask Mask16x16) Uint16x16
 // Asm: VPERMW, CPU Feature: AVX512BW
 func (x Int16x32) PermuteMasked(indices Uint16x32, mask Mask16x32) Int16x32
 
-// PermuteMasked performs a full permutation of vector y using indices:
+// PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
@@ -7297,7 +7297,7 @@ func (x Int16x32) PermuteMasked(indices Uint16x32, mask Mask16x32) Int16x32
 // Asm: VPERMW, CPU Feature: AVX512BW
 func (x Uint16x32) PermuteMasked(indices Uint16x32, mask Mask16x32) Uint16x32
 
-// PermuteMasked performs a full permutation of vector y using indices:
+// PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
@@ -7306,7 +7306,7 @@ func (x Uint16x32) PermuteMasked(indices Uint16x32, mask Mask16x32) Uint16x32
 // Asm: VPERMPS, CPU Feature: AVX512F
 func (x Float32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Float32x8
 
-// PermuteMasked performs a full permutation of vector y using indices:
+// PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
@@ -7315,7 +7315,7 @@ func (x Float32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Float32x8
 // Asm: VPERMD, CPU Feature: AVX512F
 func (x Int32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Int32x8
 
-// PermuteMasked performs a full permutation of vector y using indices:
+// PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
@@ -7324,7 +7324,7 @@ func (x Int32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Int32x8
 // Asm: VPERMD, CPU Feature: AVX512F
 func (x Uint32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Uint32x8
 
-// PermuteMasked performs a full permutation of vector y using indices:
+// PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
@@ -7333,7 +7333,7 @@ func (x Uint32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Uint32x8
 // Asm: VPERMPS, CPU Feature: AVX512F
 func (x Float32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Float32x16
 
-// PermuteMasked performs a full permutation of vector y using indices:
+// PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
@@ -7342,7 +7342,7 @@ func (x Float32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Float32x16
 // Asm: VPERMD, CPU Feature: AVX512F
 func (x Int32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Int32x16
 
-// PermuteMasked performs a full permutation of vector y using indices:
+// PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
@@ -7351,7 +7351,7 @@ func (x Int32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Int32x16
 // Asm: VPERMD, CPU Feature: AVX512F
 func (x Uint32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Uint32x16
 
-// PermuteMasked performs a full permutation of vector y using indices:
+// PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
@@ -7360,7 +7360,7 @@ func (x Uint32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Uint32x16
 // Asm: VPERMPD, CPU Feature: AVX512F
 func (x Float64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Float64x4
 
-// PermuteMasked performs a full permutation of vector y using indices:
+// PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
@@ -7369,7 +7369,7 @@ func (x Float64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Float64x4
 // Asm: VPERMQ, CPU Feature: AVX512F
 func (x Int64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Int64x4
 
-// PermuteMasked performs a full permutation of vector y using indices:
+// PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
@@ -7378,7 +7378,7 @@ func (x Int64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Int64x4
 // Asm: VPERMQ, CPU Feature: AVX512F
 func (x Uint64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Uint64x4
 
-// PermuteMasked performs a full permutation of vector y using indices:
+// PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
@@ -7387,7 +7387,7 @@ func (x Uint64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Uint64x4
 // Asm: VPERMPD, CPU Feature: AVX512F
 func (x Float64x8) PermuteMasked(indices Uint64x8, mask Mask64x8) Float64x8
 
-// PermuteMasked performs a full permutation of vector y using indices:
+// PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
@@ -7396,7 +7396,7 @@ func (x Float64x8) PermuteMasked(indices Uint64x8, mask Mask64x8) Float64x8
 // Asm: VPERMQ, CPU Feature: AVX512F
 func (x Int64x8) PermuteMasked(indices Uint64x8, mask Mask64x8) Int64x8
 
-// PermuteMasked performs a full permutation of vector y using indices:
+// PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
diff --git a/src/simd/simd_test.go b/src/simd/simd_test.go
index 276ae9ed5d..d4f539eea2 100644
--- a/src/simd/simd_test.go
+++ b/src/simd/simd_test.go
@@ -461,7 +461,7 @@ func testMergeLocalswrapper(t *testing.T, op func(simd.Int64x4, simd.Int64x4) si
 	}
 }
 
-func TestBitMask(t *testing.T) {
+func TestBitMaskLoad(t *testing.T) {
 	if !simd.HasAVX512() {
 		t.Skip("Test requires HasAVX512, not available on this hardware")
 		return
@@ -477,3 +477,19 @@ func TestBitMask(t *testing.T) {
 		}
 	}
 }
+
+func TestBitMaskStore(t *testing.T) {
+	if !simd.HasAVX512() {
+		t.Skip("Test requires HasAVX512, not available on this hardware")
+		return
+	}
+	var want uint64 = 0b101
+	var got uint64
+	x := simd.LoadInt32x4Slice([]int32{1, 2, 3, 4})
+	y := simd.LoadInt32x4Slice([]int32{5, 0, 5, 0})
+	m := y.Greater(x)
+	m.StoreToBits(&got)
+	if got != want {
+		t.Errorf("Result incorrect: want %b, got %b", want, got)
+	}
+}
diff --git a/src/simd/types_amd64.go b/src/simd/types_amd64.go
index ccc8427bb3..998a8f9fe1 100644
--- a/src/simd/types_amd64.go
+++ b/src/simd/types_amd64.go
@@ -205,48 +205,88 @@ type Mask8x16 struct {
 	vals [16]int8
 }
 
-// Mask8x16FromBits constructs a Mask8x16 from an a bitmap, where 1 means set for the indexed element, 0 means unset.
+// LoadMask8x16FromBits constructs a Mask8x16 from a bitmap, where 1 means set for the indexed element, 0 means unset.
 // Only the lower 16 bits of y are used.
 //
+// CPU Features: AVX512
+//
 //go:noescape
 func LoadMask8x16FromBits(y *uint64) Mask8x16
 
+// StoreToBits stores a Mask8x16 as a bitmap, where 1 means set for the indexed element, 0 means unset.
+// Only the lower 16 bits of y are used.
+//
+// CPU Features: AVX512
+//
+//go:noescape
+func (x Mask8x16) StoreToBits(y *uint64)
+
 // Mask16x8 is a 128-bit SIMD vector of 8 int16
 type Mask16x8 struct {
 	int16x8 v128
 	vals [8]int16
 }
 
-// Mask16x8FromBits constructs a Mask16x8 from an a bitmap, where 1 means set for the indexed element, 0 means unset.
+// LoadMask16x8FromBits constructs a Mask16x8 from a bitmap, where 1 means set for the indexed element, 0 means unset.
 // Only the lower 8 bits of y are used.
 //
+// CPU Features: AVX512
+//
 //go:noescape
 func LoadMask16x8FromBits(y *uint64) Mask16x8
 
+// StoreToBits stores a Mask16x8 as a bitmap, where 1 means set for the indexed element, 0 means unset.
+// Only the lower 8 bits of y are used.
+//
+// CPU Features: AVX512
+//
+//go:noescape
+func (x Mask16x8) StoreToBits(y *uint64)
+
 // Mask32x4 is a 128-bit SIMD vector of 4 int32
 type Mask32x4 struct {
 	int32x4 v128
 	vals [4]int32
 }
 
-// Mask32x4FromBits constructs a Mask32x4 from an a bitmap, where 1 means set for the indexed element, 0 means unset.
+// LoadMask32x4FromBits constructs a Mask32x4 from a bitmap, where 1 means set for the indexed element, 0 means unset.
 // Only the lower 4 bits of y are used.
 //
+// CPU Features: AVX512
+//
 //go:noescape
 func LoadMask32x4FromBits(y *uint64) Mask32x4
 
+// StoreToBits stores a Mask32x4 as a bitmap, where 1 means set for the indexed element, 0 means unset.
+// Only the lower 4 bits of y are used.
+//
+// CPU Features: AVX512
+//
+//go:noescape
+func (x Mask32x4) StoreToBits(y *uint64)
+
 // Mask64x2 is a 128-bit SIMD vector of 2 int64
 type Mask64x2 struct {
 	int64x2 v128
 	vals [2]int64
 }
 
-// Mask64x2FromBits constructs a Mask64x2 from an a bitmap, where 1 means set for the indexed element, 0 means unset.
+// LoadMask64x2FromBits constructs a Mask64x2 from a bitmap, where 1 means set for the indexed element, 0 means unset.
 // Only the lower 2 bits of y are used.
 //
+// CPU Features: AVX512
+//
 //go:noescape
 func LoadMask64x2FromBits(y *uint64) Mask64x2
 
+// StoreToBits stores a Mask64x2 as a bitmap, where 1 means set for the indexed element, 0 means unset.
+// Only the lower 2 bits of y are used.
+//
+// CPU Features: AVX512
+//
+//go:noescape
+func (x Mask64x2) StoreToBits(y *uint64)
+
 // v256 is a tag type that tells the compiler that this is really 256-bit SIMD
 type v256 struct {
 	_256 struct{}
@@ -448,48 +488,88 @@ type Mask8x32 struct {
 	vals [32]int8
 }
 
-// Mask8x32FromBits constructs a Mask8x32 from an a bitmap, where 1 means set for the indexed element, 0 means unset.
+// LoadMask8x32FromBits constructs a Mask8x32 from a bitmap, where 1 means set for the indexed element, 0 means unset.
 // Only the lower 32 bits of y are used.
 //
+// CPU Features: AVX512
+//
 //go:noescape
 func LoadMask8x32FromBits(y *uint64) Mask8x32
 
+// StoreToBits stores a Mask8x32 as a bitmap, where 1 means set for the indexed element, 0 means unset.
+// Only the lower 32 bits of y are used.
+//
+// CPU Features: AVX512
+//
+//go:noescape
+func (x Mask8x32) StoreToBits(y *uint64)
+
 // Mask16x16 is a 256-bit SIMD vector of 16 int16
 type Mask16x16 struct {
 	int16x16 v256
 	vals [16]int16
 }
 
-// Mask16x16FromBits constructs a Mask16x16 from an a bitmap, where 1 means set for the indexed element, 0 means unset.
+// LoadMask16x16FromBits constructs a Mask16x16 from a bitmap, where 1 means set for the indexed element, 0 means unset.
 // Only the lower 16 bits of y are used.
 //
+// CPU Features: AVX512
+//
 //go:noescape
 func LoadMask16x16FromBits(y *uint64) Mask16x16
 
+// StoreToBits stores a Mask16x16 as a bitmap, where 1 means set for the indexed element, 0 means unset.
+// Only the lower 16 bits of y are used.
+//
+// CPU Features: AVX512
+//
+//go:noescape
+func (x Mask16x16) StoreToBits(y *uint64)
+
 // Mask32x8 is a 256-bit SIMD vector of 8 int32
 type Mask32x8 struct {
 	int32x8 v256
 	vals [8]int32
 }
 
-// Mask32x8FromBits constructs a Mask32x8 from an a bitmap, where 1 means set for the indexed element, 0 means unset.
+// LoadMask32x8FromBits constructs a Mask32x8 from a bitmap, where 1 means set for the indexed element, 0 means unset.
 // Only the lower 8 bits of y are used.
 //
+// CPU Features: AVX512
+//
 //go:noescape
 func LoadMask32x8FromBits(y *uint64) Mask32x8
 
+// StoreToBits stores a Mask32x8 as a bitmap, where 1 means set for the indexed element, 0 means unset.
+// Only the lower 8 bits of y are used.
+//
+// CPU Features: AVX512
+//
+//go:noescape
+func (x Mask32x8) StoreToBits(y *uint64)
+
 // Mask64x4 is a 256-bit SIMD vector of 4 int64
 type Mask64x4 struct {
 	int64x4 v256
 	vals [4]int64
 }
 
-// Mask64x4FromBits constructs a Mask64x4 from an a bitmap, where 1 means set for the indexed element, 0 means unset.
+// LoadMask64x4FromBits constructs a Mask64x4 from a bitmap, where 1 means set for the indexed element, 0 means unset.
 // Only the lower 4 bits of y are used.
 //
+// CPU Features: AVX512
+//
 //go:noescape
 func LoadMask64x4FromBits(y *uint64) Mask64x4
 
+// StoreToBits stores a Mask64x4 as a bitmap, where 1 means set for the indexed element, 0 means unset.
+// Only the lower 4 bits of y are used.
+//
+// CPU Features: AVX512
+//
+//go:noescape
+func (x Mask64x4) StoreToBits(y *uint64)
+
 // v512 is a tag type that tells the compiler that this is really 512-bit SIMD
 type v512 struct {
 	_512 struct{}
@@ -691,44 +771,84 @@ type Mask8x64 struct {
 	vals [64]int8
 }
 
-// Mask8x64FromBits constructs a Mask8x64 from an a bitmap, where 1 means set for the indexed element, 0 means unset.
+// LoadMask8x64FromBits constructs a Mask8x64 from a bitmap, where 1 means set for the indexed element, 0 means unset.
 // Only the lower 64 bits of y are used.
 //
+// CPU Features: AVX512
+//
 //go:noescape
 func LoadMask8x64FromBits(y *uint64) Mask8x64
 
+// StoreToBits stores a Mask8x64 as a bitmap, where 1 means set for the indexed element, 0 means unset.
+// Only the lower 64 bits of y are used.
+//
+// CPU Features: AVX512
+//
+//go:noescape
+func (x Mask8x64) StoreToBits(y *uint64)
+
 // Mask16x32 is a 512-bit SIMD vector of 32 int16
 type Mask16x32 struct {
 	int16x32 v512
 	vals [32]int16
 }
 
-// Mask16x32FromBits constructs a Mask16x32 from an a bitmap, where 1 means set for the indexed element, 0 means unset.
+// LoadMask16x32FromBits constructs a Mask16x32 from a bitmap, where 1 means set for the indexed element, 0 means unset.
 // Only the lower 32 bits of y are used.
 //
+// CPU Features: AVX512
+//
 //go:noescape
 func LoadMask16x32FromBits(y *uint64) Mask16x32
 
+// StoreToBits stores a Mask16x32 as a bitmap, where 1 means set for the indexed element, 0 means unset.
+// Only the lower 32 bits of y are used.
+//
+// CPU Features: AVX512
+//
+//go:noescape
+func (x Mask16x32) StoreToBits(y *uint64)
+
 // Mask32x16 is a 512-bit SIMD vector of 16 int32
 type Mask32x16 struct {
 	int32x16 v512
 	vals [16]int32
 }
 
-// Mask32x16FromBits constructs a Mask32x16 from an a bitmap, where 1 means set for the indexed element, 0 means unset.
+// LoadMask32x16FromBits constructs a Mask32x16 from a bitmap, where 1 means set for the indexed element, 0 means unset.
 // Only the lower 16 bits of y are used.
 //
+// CPU Features: AVX512
+//
 //go:noescape
 func LoadMask32x16FromBits(y *uint64) Mask32x16
 
+// StoreToBits stores a Mask32x16 as a bitmap, where 1 means set for the indexed element, 0 means unset.
+// Only the lower 16 bits of y are used.
+//
+// CPU Features: AVX512
+//
+//go:noescape
+func (x Mask32x16) StoreToBits(y *uint64)
+
 // Mask64x8 is a 512-bit SIMD vector of 8 int64
 type Mask64x8 struct {
 	int64x8 v512
 	vals [8]int64
 }
 
-// Mask64x8FromBits constructs a Mask64x8 from an a bitmap, where 1 means set for the indexed element, 0 means unset.
+// LoadMask64x8FromBits constructs a Mask64x8 from a bitmap, where 1 means set for the indexed element, 0 means unset.
 // Only the lower 8 bits of y are used.
 //
+// CPU Features: AVX512
+//
 //go:noescape
 func LoadMask64x8FromBits(y *uint64) Mask64x8
+
+// StoreToBits stores a Mask64x8 as a bitmap, where 1 means set for the indexed element, 0 means unset.
+// Only the lower 8 bits of y are used.
+//
+// CPU Features: AVX512
+//
+//go:noescape
+func (x Mask64x8) StoreToBits(y *uint64)
-- 
2.52.0
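
For readers experimenting with this change, here is a minimal sketch of the new mask round trip, mirroring TestBitMaskStore above. It is not part of the patch: it assumes a toolchain built from this branch with GOEXPERIMENT=simd on AVX-512 hardware, and uses only API that appears in this CL (HasAVX512, LoadInt32x4Slice, Greater, StoreToBits, LoadMask32x4FromBits).

	package main

	import (
		"fmt"
		"simd"
	)

	func main() {
		if !simd.HasAVX512() {
			fmt.Println("AVX512 not available on this hardware")
			return
		}
		x := simd.LoadInt32x4Slice([]int32{1, 2, 3, 4})
		y := simd.LoadInt32x4Slice([]int32{5, 0, 5, 0})

		// Lanes 0 and 2 satisfy y > x, so the expected bitmap is 0b0101.
		m := y.Greater(x)

		// New in this CL: the mask is moved into a k register and written
		// out with KMOVQ; only the low 4 bits are meaningful for Mask32x4.
		var bits uint64
		m.StoreToBits(&bits)
		fmt.Printf("%04b\n", bits) // prints 0101

		// The pre-existing load direction reconstructs an equivalent mask.
		_ = simd.LoadMask32x4FromBits(&bits)
	}

Because every StoreMask shape lowers to the same KMOVQstore rule, StoreToBits takes a *uint64 regardless of lane count, which is why the doc comments above note that only the lower N bits are significant for each mask type.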