From 08bec02907cf59c3fd60e5c5e31b2d6c30b462b7 Mon Sep 17 00:00:00 2001 From: David Chase Date: Wed, 23 Jul 2025 13:47:08 -0400 Subject: [PATCH] [dev.simd] cmd/compile: add register-to-mask moves, other simd glue This includes code generated by simdgen CL 689955, here because of git-facilitated pilot error (the generated file should have been in the next CL but that is related to this one, so, oh well). Change-Id: Ibfea3f1cd93ca9cd12970edf15a013471677a6ba Reviewed-on: https://go-review.googlesource.com/c/go/+/689936 Reviewed-by: Cherry Mui Reviewed-by: Junyang Shao LUCI-TryBot-Result: Go LUCI --- src/cmd/compile/internal/amd64/ssa.go | 8 + src/cmd/compile/internal/ssa/_gen/AMD64.rules | 47 +++- src/cmd/compile/internal/ssa/_gen/AMD64Ops.go | 7 + .../compile/internal/ssa/_gen/genericOps.go | 14 + src/cmd/compile/internal/ssa/opGen.go | 128 +++++++++ src/cmd/compile/internal/ssa/rewriteAMD64.go | 264 +++++++++++++++--- src/cmd/compile/internal/ssagen/intrinsics.go | 32 ++- .../compile/internal/ssagen/simdintrinsics.go | 12 + src/simd/types_amd64.go | 48 ++++ 9 files changed, 505 insertions(+), 55 deletions(-) diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go index efa7895e97..5b2df50b13 100644 --- a/src/cmd/compile/internal/amd64/ssa.go +++ b/src/cmd/compile/internal/amd64/ssa.go @@ -1530,6 +1530,14 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { p.To.Type = obj.TYPE_REG p.To.Reg = v.Reg() + case ssa.OpAMD64KMOVQ, ssa.OpAMD64KMOVD, ssa.OpAMD64KMOVW, ssa.OpAMD64KMOVB: + // See also ssa.OpAMD64KMOVQload + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + default: if !ssaGenSIMDValue(s, v) { v.Fatalf("genValue not implemented: %s", v.LongString()) diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64.rules b/src/cmd/compile/internal/ssa/_gen/AMD64.rules index 0136e41af7..1195c0de7f 100644 --- a/src/cmd/compile/internal/ssa/_gen/AMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules @@ -1682,21 +1682,23 @@ (Select0 a:(ADD(Q|L)constflags [c] x)) && a.Uses == 1 => (ADD(Q|L)const [c] x) // XXX SIMD -(LoadMask8x16 ptr mem) => (VPMOVMToVec8x16 (KMOVQload ptr mem)) -(LoadMask8x32 ptr mem) => (VPMOVMToVec8x32 (KMOVQload ptr mem)) -(LoadMask8x64 ptr mem) => (VPMOVMToVec8x64 (KMOVQload ptr mem)) -(LoadMask16x8 ptr mem) => (VPMOVMToVec16x8 (KMOVQload ptr mem)) -(LoadMask16x16 ptr mem) => (VPMOVMToVec16x16 (KMOVQload ptr mem)) -(LoadMask16x32 ptr mem) => (VPMOVMToVec16x32 (KMOVQload ptr mem)) +// Mask loads +(LoadMask8x16 ptr mem) => (VPMOVMToVec8x16 (KMOVQload ptr mem)) +(LoadMask8x32 ptr mem) => (VPMOVMToVec8x32 (KMOVQload ptr mem)) +(LoadMask8x64 ptr mem) => (VPMOVMToVec8x64 (KMOVQload ptr mem)) -(LoadMask32x4 ptr mem) => (VPMOVMToVec32x4 (KMOVQload ptr mem)) -(LoadMask32x8 ptr mem) => (VPMOVMToVec32x8 (KMOVQload ptr mem)) -(LoadMask32x16 ptr mem) => (VPMOVMToVec32x16 (KMOVQload ptr mem)) +(LoadMask16x8 ptr mem) => (VPMOVMToVec16x8 (KMOVQload ptr mem)) +(LoadMask16x16 ptr mem) => (VPMOVMToVec16x16 (KMOVQload ptr mem)) +(LoadMask16x32 ptr mem) => (VPMOVMToVec16x32 (KMOVQload ptr mem)) -(LoadMask64x2 ptr mem) => (VPMOVMToVec64x2 (KMOVQload ptr mem)) -(LoadMask64x4 ptr mem) => (VPMOVMToVec64x4 (KMOVQload ptr mem)) -(LoadMask64x8 ptr mem) => (VPMOVMToVec64x8 (KMOVQload ptr mem)) +(LoadMask32x4 ptr mem) => (VPMOVMToVec32x4 (KMOVQload ptr mem)) +(LoadMask32x8 ptr mem) => (VPMOVMToVec32x8 (KMOVQload ptr mem)) +(LoadMask32x16 ptr mem) => (VPMOVMToVec32x16 (KMOVQload ptr mem)) 
+ +(LoadMask64x2 ptr mem) => (VPMOVMToVec64x2 (KMOVQload ptr mem)) +(LoadMask64x4 ptr mem) => (VPMOVMToVec64x4 (KMOVQload ptr mem)) +(LoadMask64x8 ptr mem) => (VPMOVMToVec64x8 (KMOVQload ptr mem)) (StoreMask8x16 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec8x16ToM val) mem) (StoreMask8x32 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec8x32ToM val) mem) @@ -1714,6 +1716,26 @@ (StoreMask64x4 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec64x4ToM val) mem) (StoreMask64x8 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec64x8ToM val) mem) +// TODO is this correct? Should we just do it all from 64-bits? + +// Mask conversions (from integers) +(Cvt16toMask8x16 x) => (VPMOVMToVec8x16 (KMOVW x)) +(Cvt32toMask8x32 x) => (VPMOVMToVec8x32 (KMOVD x)) +(Cvt64toMask8x64 x) => (VPMOVMToVec8x64 (KMOVQ x)) + +(Cvt8toMask16x8 x) => (VPMOVMToVec16x8 (KMOVB x)) +(Cvt16toMask16x16 x) => (VPMOVMToVec16x16 (KMOVW x)) +(Cvt32toMask16x32 x) => (VPMOVMToVec16x32 (KMOVD x)) + +(Cvt8toMask32x4 x) => (VPMOVMToVec32x4 (KMOVB x)) +(Cvt8toMask32x8 x) => (VPMOVMToVec32x8 (KMOVB x)) +(Cvt16toMask32x16 x) => (VPMOVMToVec32x16 (KMOVW x)) + +(Cvt8toMask64x2 x) => (VPMOVMToVec64x2 (KMOVB x)) +(Cvt8toMask64x4 x) => (VPMOVMToVec64x4 (KMOVB x)) +(Cvt8toMask64x8 x) => (VPMOVMToVec64x8 (KMOVB x)) + +// SIMD vector loads and stores (Load ptr mem) && t.Size() == 16 => (VMOVDQUload128 ptr mem) (Store {t} ptr val mem) && t.Size() == 16 => (VMOVDQUstore128 ptr val mem) @@ -1723,6 +1745,7 @@ (Load ptr mem) && t.Size() == 64 => (VMOVDQUload512 ptr mem) (Store {t} ptr val mem) && t.Size() == 64 => (VMOVDQUstore512 ptr val mem) +// SIMD vector integer-vector-masked loads and stores. (LoadMasked32 ptr mask mem) && t.Size() == 16 => (VPMASK32load128 ptr mask mem) (LoadMasked32 ptr mask mem) && t.Size() == 32 => (VPMASK32load256 ptr mask mem) (LoadMasked64 ptr mask mem) && t.Size() == 16 => (VPMASK64load128 ptr mask mem) diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go index 66c37a495f..8ab0b82351 100644 --- a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go +++ b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go @@ -242,6 +242,7 @@ func init() { kload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: maskonly} kstore = regInfo{inputs: []regMask{gpspsb, mask, 0}} + gpk = regInfo{inputs: gponly, outputs: maskonly} prefreg = regInfo{inputs: []regMask{gpspsbg}} ) @@ -1337,6 +1338,12 @@ func init() { {name: "KMOVQload", argLength: 2, reg: kload, asm: "KMOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, {name: "KMOVQstore", argLength: 3, reg: kstore, asm: "KMOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, + + // Move GP directly to mask register + {name: "KMOVQ", argLength: 1, reg: gpk, asm: "KMOVQ"}, + {name: "KMOVD", argLength: 1, reg: gpk, asm: "KMOVD"}, + {name: "KMOVW", argLength: 1, reg: gpk, asm: "KMOVW"}, + {name: "KMOVB", argLength: 1, reg: gpk, asm: "KMOVB"}, } var AMD64blocks = []blockData{ diff --git a/src/cmd/compile/internal/ssa/_gen/genericOps.go b/src/cmd/compile/internal/ssa/_gen/genericOps.go index c1383199c4..e714e347e2 100644 --- a/src/cmd/compile/internal/ssa/_gen/genericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/genericOps.go @@ -699,6 +699,20 @@ var genericOps = []opData{ {name: "StoreMask64x2", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. {name: "StoreMask64x4", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. 
{name: "StoreMask64x8", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. + + // Convert integers to masks + {name: "Cvt16toMask8x16", argLength: 1}, // arg0 = integer mask value + {name: "Cvt32toMask8x32", argLength: 1}, // arg0 = integer mask value + {name: "Cvt64toMask8x64", argLength: 1}, // arg0 = integer mask value + {name: "Cvt8toMask16x8", argLength: 1}, // arg0 = integer mask value + {name: "Cvt16toMask16x16", argLength: 1}, // arg0 = integer mask value + {name: "Cvt32toMask16x32", argLength: 1}, // arg0 = integer mask value + {name: "Cvt8toMask32x4", argLength: 1}, // arg0 = integer mask value + {name: "Cvt8toMask32x8", argLength: 1}, // arg0 = integer mask value + {name: "Cvt16toMask32x16", argLength: 1}, // arg0 = integer mask value + {name: "Cvt8toMask64x2", argLength: 1}, // arg0 = integer mask value + {name: "Cvt8toMask64x4", argLength: 1}, // arg0 = integer mask value + {name: "Cvt8toMask64x8", argLength: 1}, // arg0 = integer mask value } // kind controls successors implicit exit diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index b9dc41e860..61ce06203a 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1208,6 +1208,10 @@ const ( OpAMD64VZEROALL OpAMD64KMOVQload OpAMD64KMOVQstore + OpAMD64KMOVQ + OpAMD64KMOVD + OpAMD64KMOVW + OpAMD64KMOVB OpAMD64VADDPD128 OpAMD64VADDPD256 OpAMD64VADDPD512 @@ -4461,6 +4465,18 @@ const ( OpStoreMask64x2 OpStoreMask64x4 OpStoreMask64x8 + OpCvt16toMask8x16 + OpCvt32toMask8x32 + OpCvt64toMask8x64 + OpCvt8toMask16x8 + OpCvt16toMask16x16 + OpCvt32toMask16x32 + OpCvt8toMask32x4 + OpCvt8toMask32x8 + OpCvt16toMask32x16 + OpCvt8toMask64x2 + OpCvt8toMask64x4 + OpCvt8toMask64x8 OpAbsoluteInt8x16 OpAbsoluteInt8x32 OpAbsoluteInt8x64 @@ -19029,6 +19045,58 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "KMOVQ", + argLen: 1, + asm: x86.AKMOVQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + outputs: []outputInfo{ + {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + }, + }, + }, + { + name: "KMOVD", + argLen: 1, + asm: x86.AKMOVD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + outputs: []outputInfo{ + {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + }, + }, + }, + { + name: "KMOVW", + argLen: 1, + asm: x86.AKMOVW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + outputs: []outputInfo{ + {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + }, + }, + }, + { + name: "KMOVB", + argLen: 1, + asm: x86.AKMOVB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + outputs: []outputInfo{ + {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + }, + }, + }, { name: "VADDPD128", argLen: 2, @@ -61379,6 +61447,66 @@ var opcodeTable = [...]opInfo{ argLen: 3, generic: true, }, + { + name: "Cvt16toMask8x16", + argLen: 1, + generic: true, + }, + { + name: "Cvt32toMask8x32", + argLen: 1, + generic: true, + }, + { + name: "Cvt64toMask8x64", + argLen: 1, + generic: true, + }, + { + name: "Cvt8toMask16x8", + argLen: 1, + generic: true, + }, + { + name: "Cvt16toMask16x16", + argLen: 1, + generic: true, + }, + { + name: "Cvt32toMask16x32", + argLen: 1, + generic: true, + }, + { + name: "Cvt8toMask32x4", + argLen: 1, + generic: true, + }, + { + name: "Cvt8toMask32x8", + argLen: 1, + generic: true, + }, 
+ { + name: "Cvt16toMask32x16", + argLen: 1, + generic: true, + }, + { + name: "Cvt8toMask64x2", + argLen: 1, + generic: true, + }, + { + name: "Cvt8toMask64x4", + argLen: 1, + generic: true, + }, + { + name: "Cvt8toMask64x8", + argLen: 1, + generic: true, + }, { name: "AbsoluteInt8x16", argLen: 1, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 11c7c20db2..d79c856ae8 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -1313,6 +1313,12 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpCtz8(v) case OpCtz8NonZero: return rewriteValueAMD64_OpCtz8NonZero(v) + case OpCvt16toMask16x16: + return rewriteValueAMD64_OpCvt16toMask16x16(v) + case OpCvt16toMask32x16: + return rewriteValueAMD64_OpCvt16toMask32x16(v) + case OpCvt16toMask8x16: + return rewriteValueAMD64_OpCvt16toMask8x16(v) case OpCvt32Fto32: v.Op = OpAMD64CVTTSS2SL return true @@ -1328,6 +1334,10 @@ func rewriteValueAMD64(v *Value) bool { case OpCvt32to64F: v.Op = OpAMD64CVTSL2SD return true + case OpCvt32toMask16x32: + return rewriteValueAMD64_OpCvt32toMask16x32(v) + case OpCvt32toMask8x32: + return rewriteValueAMD64_OpCvt32toMask8x32(v) case OpCvt64Fto32: v.Op = OpAMD64CVTTSD2SL return true @@ -1343,6 +1353,20 @@ func rewriteValueAMD64(v *Value) bool { case OpCvt64to64F: v.Op = OpAMD64CVTSQ2SD return true + case OpCvt64toMask8x64: + return rewriteValueAMD64_OpCvt64toMask8x64(v) + case OpCvt8toMask16x8: + return rewriteValueAMD64_OpCvt8toMask16x8(v) + case OpCvt8toMask32x4: + return rewriteValueAMD64_OpCvt8toMask32x4(v) + case OpCvt8toMask32x8: + return rewriteValueAMD64_OpCvt8toMask32x8(v) + case OpCvt8toMask64x2: + return rewriteValueAMD64_OpCvt8toMask64x2(v) + case OpCvt8toMask64x4: + return rewriteValueAMD64_OpCvt8toMask64x4(v) + case OpCvt8toMask64x8: + return rewriteValueAMD64_OpCvt8toMask64x8(v) case OpCvtBoolToUint8: v.Op = OpCopy return true @@ -32276,6 +32300,186 @@ func rewriteValueAMD64_OpCtz8NonZero(v *Value) bool { } return false } +func rewriteValueAMD64_OpCvt16toMask16x16(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (Cvt16toMask16x16 x) + // result: (VPMOVMToVec16x16 (KMOVW x)) + for { + x := v_0 + v.reset(OpAMD64VPMOVMToVec16x16) + v.Type = types.TypeVec256 + v0 := b.NewValue0(v.Pos, OpAMD64KMOVW, types.TypeMask) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} +func rewriteValueAMD64_OpCvt16toMask32x16(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (Cvt16toMask32x16 x) + // result: (VPMOVMToVec32x16 (KMOVW x)) + for { + x := v_0 + v.reset(OpAMD64VPMOVMToVec32x16) + v.Type = types.TypeVec512 + v0 := b.NewValue0(v.Pos, OpAMD64KMOVW, types.TypeMask) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} +func rewriteValueAMD64_OpCvt16toMask8x16(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (Cvt16toMask8x16 x) + // result: (VPMOVMToVec8x16 (KMOVW x)) + for { + x := v_0 + v.reset(OpAMD64VPMOVMToVec8x16) + v.Type = types.TypeVec128 + v0 := b.NewValue0(v.Pos, OpAMD64KMOVW, types.TypeMask) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} +func rewriteValueAMD64_OpCvt32toMask16x32(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (Cvt32toMask16x32 x) + // result: (VPMOVMToVec16x32 (KMOVD x)) + for { + x := v_0 + v.reset(OpAMD64VPMOVMToVec16x32) + v.Type = types.TypeVec512 + v0 := b.NewValue0(v.Pos, OpAMD64KMOVD, types.TypeMask) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} +func rewriteValueAMD64_OpCvt32toMask8x32(v 
*Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (Cvt32toMask8x32 x) + // result: (VPMOVMToVec8x32 (KMOVD x)) + for { + x := v_0 + v.reset(OpAMD64VPMOVMToVec8x32) + v.Type = types.TypeVec256 + v0 := b.NewValue0(v.Pos, OpAMD64KMOVD, types.TypeMask) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} +func rewriteValueAMD64_OpCvt64toMask8x64(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (Cvt64toMask8x64 x) + // result: (VPMOVMToVec8x64 (KMOVQ x)) + for { + x := v_0 + v.reset(OpAMD64VPMOVMToVec8x64) + v.Type = types.TypeVec512 + v0 := b.NewValue0(v.Pos, OpAMD64KMOVQ, types.TypeMask) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} +func rewriteValueAMD64_OpCvt8toMask16x8(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (Cvt8toMask16x8 x) + // result: (VPMOVMToVec16x8 (KMOVB x)) + for { + x := v_0 + v.reset(OpAMD64VPMOVMToVec16x8) + v.Type = types.TypeVec128 + v0 := b.NewValue0(v.Pos, OpAMD64KMOVB, types.TypeMask) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} +func rewriteValueAMD64_OpCvt8toMask32x4(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (Cvt8toMask32x4 x) + // result: (VPMOVMToVec32x4 (KMOVB x)) + for { + x := v_0 + v.reset(OpAMD64VPMOVMToVec32x4) + v.Type = types.TypeVec128 + v0 := b.NewValue0(v.Pos, OpAMD64KMOVB, types.TypeMask) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} +func rewriteValueAMD64_OpCvt8toMask32x8(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (Cvt8toMask32x8 x) + // result: (VPMOVMToVec32x8 (KMOVB x)) + for { + x := v_0 + v.reset(OpAMD64VPMOVMToVec32x8) + v.Type = types.TypeVec256 + v0 := b.NewValue0(v.Pos, OpAMD64KMOVB, types.TypeMask) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} +func rewriteValueAMD64_OpCvt8toMask64x2(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (Cvt8toMask64x2 x) + // result: (VPMOVMToVec64x2 (KMOVB x)) + for { + x := v_0 + v.reset(OpAMD64VPMOVMToVec64x2) + v.Type = types.TypeVec128 + v0 := b.NewValue0(v.Pos, OpAMD64KMOVB, types.TypeMask) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} +func rewriteValueAMD64_OpCvt8toMask64x4(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (Cvt8toMask64x4 x) + // result: (VPMOVMToVec64x4 (KMOVB x)) + for { + x := v_0 + v.reset(OpAMD64VPMOVMToVec64x4) + v.Type = types.TypeVec256 + v0 := b.NewValue0(v.Pos, OpAMD64KMOVB, types.TypeMask) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} +func rewriteValueAMD64_OpCvt8toMask64x8(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (Cvt8toMask64x8 x) + // result: (VPMOVMToVec64x8 (KMOVB x)) + for { + x := v_0 + v.reset(OpAMD64VPMOVMToVec64x8) + v.Type = types.TypeVec512 + v0 := b.NewValue0(v.Pos, OpAMD64KMOVB, types.TypeMask) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} func rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat32x16(v *Value) bool { v_0 := v.Args[0] // match: (DiffWithCeilWithPrecisionFloat32x16 [a] x) @@ -40478,14 +40682,13 @@ func rewriteValueAMD64_OpLoadMask16x16(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (LoadMask16x16 ptr mem) - // result: (VPMOVMToVec16x16 (KMOVQload ptr mem)) + // result: (VPMOVMToVec16x16 (KMOVQload ptr mem)) for { - t := v.Type ptr := v_0 mem := v_1 v.reset(OpAMD64VPMOVMToVec16x16) v.Type = types.TypeVec256 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, types.TypeMask) v0.AddArg2(ptr, mem) v.AddArg(v0) return true @@ -40496,14 +40699,13 @@ func rewriteValueAMD64_OpLoadMask16x32(v *Value) bool { v_0 := v.Args[0] b := v.Block // 
match: (LoadMask16x32 ptr mem) - // result: (VPMOVMToVec16x32 (KMOVQload ptr mem)) + // result: (VPMOVMToVec16x32 (KMOVQload ptr mem)) for { - t := v.Type ptr := v_0 mem := v_1 v.reset(OpAMD64VPMOVMToVec16x32) v.Type = types.TypeVec512 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, types.TypeMask) v0.AddArg2(ptr, mem) v.AddArg(v0) return true @@ -40514,14 +40716,13 @@ func rewriteValueAMD64_OpLoadMask16x8(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (LoadMask16x8 ptr mem) - // result: (VPMOVMToVec16x8 (KMOVQload ptr mem)) + // result: (VPMOVMToVec16x8 (KMOVQload ptr mem)) for { - t := v.Type ptr := v_0 mem := v_1 v.reset(OpAMD64VPMOVMToVec16x8) v.Type = types.TypeVec128 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, types.TypeMask) v0.AddArg2(ptr, mem) v.AddArg(v0) return true @@ -40532,14 +40733,13 @@ func rewriteValueAMD64_OpLoadMask32x16(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (LoadMask32x16 ptr mem) - // result: (VPMOVMToVec32x16 (KMOVQload ptr mem)) + // result: (VPMOVMToVec32x16 (KMOVQload ptr mem)) for { - t := v.Type ptr := v_0 mem := v_1 v.reset(OpAMD64VPMOVMToVec32x16) v.Type = types.TypeVec512 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, types.TypeMask) v0.AddArg2(ptr, mem) v.AddArg(v0) return true @@ -40550,14 +40750,13 @@ func rewriteValueAMD64_OpLoadMask32x4(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (LoadMask32x4 ptr mem) - // result: (VPMOVMToVec32x4 (KMOVQload ptr mem)) + // result: (VPMOVMToVec32x4 (KMOVQload ptr mem)) for { - t := v.Type ptr := v_0 mem := v_1 v.reset(OpAMD64VPMOVMToVec32x4) v.Type = types.TypeVec128 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, types.TypeMask) v0.AddArg2(ptr, mem) v.AddArg(v0) return true @@ -40568,14 +40767,13 @@ func rewriteValueAMD64_OpLoadMask32x8(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (LoadMask32x8 ptr mem) - // result: (VPMOVMToVec32x8 (KMOVQload ptr mem)) + // result: (VPMOVMToVec32x8 (KMOVQload ptr mem)) for { - t := v.Type ptr := v_0 mem := v_1 v.reset(OpAMD64VPMOVMToVec32x8) v.Type = types.TypeVec256 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, types.TypeMask) v0.AddArg2(ptr, mem) v.AddArg(v0) return true @@ -40586,14 +40784,13 @@ func rewriteValueAMD64_OpLoadMask64x2(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (LoadMask64x2 ptr mem) - // result: (VPMOVMToVec64x2 (KMOVQload ptr mem)) + // result: (VPMOVMToVec64x2 (KMOVQload ptr mem)) for { - t := v.Type ptr := v_0 mem := v_1 v.reset(OpAMD64VPMOVMToVec64x2) v.Type = types.TypeVec128 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, types.TypeMask) v0.AddArg2(ptr, mem) v.AddArg(v0) return true @@ -40604,14 +40801,13 @@ func rewriteValueAMD64_OpLoadMask64x4(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (LoadMask64x4 ptr mem) - // result: (VPMOVMToVec64x4 (KMOVQload ptr mem)) + // result: (VPMOVMToVec64x4 (KMOVQload ptr mem)) for { - t := v.Type ptr := v_0 mem := v_1 v.reset(OpAMD64VPMOVMToVec64x4) v.Type = types.TypeVec256 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, types.TypeMask) v0.AddArg2(ptr, mem) v.AddArg(v0) return true @@ -40622,14 +40818,13 @@ func rewriteValueAMD64_OpLoadMask64x8(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (LoadMask64x8 ptr mem) - // 
result: (VPMOVMToVec64x8 (KMOVQload ptr mem)) + // result: (VPMOVMToVec64x8 (KMOVQload ptr mem)) for { - t := v.Type ptr := v_0 mem := v_1 v.reset(OpAMD64VPMOVMToVec64x8) v.Type = types.TypeVec512 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, types.TypeMask) v0.AddArg2(ptr, mem) v.AddArg(v0) return true @@ -40640,14 +40835,13 @@ func rewriteValueAMD64_OpLoadMask8x16(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (LoadMask8x16 ptr mem) - // result: (VPMOVMToVec8x16 (KMOVQload ptr mem)) + // result: (VPMOVMToVec8x16 (KMOVQload ptr mem)) for { - t := v.Type ptr := v_0 mem := v_1 v.reset(OpAMD64VPMOVMToVec8x16) v.Type = types.TypeVec128 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, types.TypeMask) v0.AddArg2(ptr, mem) v.AddArg(v0) return true @@ -40658,14 +40852,13 @@ func rewriteValueAMD64_OpLoadMask8x32(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (LoadMask8x32 ptr mem) - // result: (VPMOVMToVec8x32 (KMOVQload ptr mem)) + // result: (VPMOVMToVec8x32 (KMOVQload ptr mem)) for { - t := v.Type ptr := v_0 mem := v_1 v.reset(OpAMD64VPMOVMToVec8x32) v.Type = types.TypeVec256 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, types.TypeMask) v0.AddArg2(ptr, mem) v.AddArg(v0) return true @@ -40676,14 +40869,13 @@ func rewriteValueAMD64_OpLoadMask8x64(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (LoadMask8x64 ptr mem) - // result: (VPMOVMToVec8x64 (KMOVQload ptr mem)) + // result: (VPMOVMToVec8x64 (KMOVQload ptr mem)) for { - t := v.Type ptr := v_0 mem := v_1 v.reset(OpAMD64VPMOVMToVec8x64) v.Type = types.TypeVec512 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, types.TypeMask) v0.AddArg2(ptr, mem) v.AddArg(v0) return true diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go index 7326ae2485..d7b25f2ab1 100644 --- a/src/cmd/compile/internal/ssagen/intrinsics.go +++ b/src/cmd/compile/internal/ssagen/intrinsics.go @@ -1775,15 +1775,23 @@ func simdStore() func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { } } +var loadMaskOpcodes = map[int]map[int]ssa.Op{ + 8: {16: ssa.OpLoadMask8x16, 32: ssa.OpLoadMask8x32, 64: ssa.OpLoadMask8x64}, + 16: {8: ssa.OpLoadMask16x8, 16: ssa.OpLoadMask16x16, 32: ssa.OpLoadMask16x32}, + 32: {4: ssa.OpLoadMask32x4, 8: ssa.OpLoadMask32x8, 16: ssa.OpLoadMask32x16}, + 64: {2: ssa.OpLoadMask64x2, 4: ssa.OpLoadMask64x4, 8: ssa.OpLoadMask64x8}, +} + +var cvtMaskOpcodes = map[int]map[int]ssa.Op{ + 8: {16: ssa.OpCvt16toMask8x16, 32: ssa.OpCvt32toMask8x32, 64: ssa.OpCvt64toMask8x64}, + 16: {8: ssa.OpCvt8toMask16x8, 16: ssa.OpCvt16toMask16x16, 32: ssa.OpCvt32toMask16x32}, + 32: {4: ssa.OpCvt8toMask32x4, 8: ssa.OpCvt8toMask32x8, 16: ssa.OpCvt16toMask32x16}, + 64: {2: ssa.OpCvt8toMask64x2, 4: ssa.OpCvt8toMask64x4, 8: ssa.OpCvt8toMask64x8}, +} + func simdLoadMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { - opCodes := map[int]map[int]ssa.Op{ - 8: {16: ssa.OpLoadMask8x16, 32: ssa.OpLoadMask8x32, 64: ssa.OpLoadMask8x64}, - 16: {8: ssa.OpLoadMask16x8, 16: ssa.OpLoadMask16x16, 32: ssa.OpLoadMask16x32}, - 32: {4: ssa.OpLoadMask32x4, 8: ssa.OpLoadMask32x8, 16: ssa.OpLoadMask32x16}, - 64: {2: ssa.OpLoadMask64x2, 4: ssa.OpLoadMask64x4, 8: ssa.OpLoadMask64x8}, - } - op := opCodes[elemBits][lanes] + op := 
loadMaskOpcodes[elemBits][lanes] if op == 0 { panic(fmt.Sprintf("Unknown mask shape: Mask%dx%d", elemBits, lanes)) } @@ -1808,6 +1816,16 @@ func simdStoreMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*s } } +func simdCvtMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + op := cvtMaskOpcodes[elemBits][lanes] + if op == 0 { + panic(fmt.Sprintf("Unknown mask shape: Mask%dx%d", elemBits, lanes)) + } + return s.newValue1(op, types.TypeMask, args[0]) + } +} + func simdMaskedLoad(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue3(op, n.Type(), args[0], args[1], s.mem()) diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index a8a2ff9142..dddfab5b71 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -2174,70 +2174,82 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Mask8x16.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "LoadMask8x16FromBits", simdLoadMask(8, 16), sys.AMD64) addF(simdPackage, "Mask8x16.StoreToBits", simdStoreMask(8, 16), sys.AMD64) + addF(simdPackage, "Mask8x16FromBits", simdCvtMask(8, 16), sys.AMD64) addF(simdPackage, "Mask8x32.AsInt8x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int8x32.AsMask8x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask8x32.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask8x32.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "LoadMask8x32FromBits", simdLoadMask(8, 32), sys.AMD64) addF(simdPackage, "Mask8x32.StoreToBits", simdStoreMask(8, 32), sys.AMD64) + addF(simdPackage, "Mask8x32FromBits", simdCvtMask(8, 32), sys.AMD64) addF(simdPackage, "Mask8x64.AsInt8x64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int8x64.AsMask8x64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask8x64.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask8x64.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "LoadMask8x64FromBits", simdLoadMask(8, 64), sys.AMD64) addF(simdPackage, "Mask8x64.StoreToBits", simdStoreMask(8, 64), sys.AMD64) + addF(simdPackage, "Mask8x64FromBits", simdCvtMask(8, 64), sys.AMD64) addF(simdPackage, "Mask16x8.AsInt16x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int16x8.AsMask16x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask16x8.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask16x8.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "LoadMask16x8FromBits", simdLoadMask(16, 8), sys.AMD64) addF(simdPackage, "Mask16x8.StoreToBits", simdStoreMask(16, 8), sys.AMD64) + addF(simdPackage, "Mask16x8FromBits", simdCvtMask(16, 8), sys.AMD64) addF(simdPackage, "Mask16x16.AsInt16x16", func(s *state, n *ir.CallExpr, args 
[]*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int16x16.AsMask16x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask16x16.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask16x16.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "LoadMask16x16FromBits", simdLoadMask(16, 16), sys.AMD64) addF(simdPackage, "Mask16x16.StoreToBits", simdStoreMask(16, 16), sys.AMD64) + addF(simdPackage, "Mask16x16FromBits", simdCvtMask(16, 16), sys.AMD64) addF(simdPackage, "Mask16x32.AsInt16x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int16x32.AsMask16x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask16x32.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask16x32.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "LoadMask16x32FromBits", simdLoadMask(16, 32), sys.AMD64) addF(simdPackage, "Mask16x32.StoreToBits", simdStoreMask(16, 32), sys.AMD64) + addF(simdPackage, "Mask16x32FromBits", simdCvtMask(16, 32), sys.AMD64) addF(simdPackage, "Mask32x4.AsInt32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int32x4.AsMask32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask32x4.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask32x4.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "LoadMask32x4FromBits", simdLoadMask(32, 4), sys.AMD64) addF(simdPackage, "Mask32x4.StoreToBits", simdStoreMask(32, 4), sys.AMD64) + addF(simdPackage, "Mask32x4FromBits", simdCvtMask(32, 4), sys.AMD64) addF(simdPackage, "Mask32x8.AsInt32x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int32x8.AsMask32x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask32x8.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask32x8.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "LoadMask32x8FromBits", simdLoadMask(32, 8), sys.AMD64) addF(simdPackage, "Mask32x8.StoreToBits", simdStoreMask(32, 8), sys.AMD64) + addF(simdPackage, "Mask32x8FromBits", simdCvtMask(32, 8), sys.AMD64) addF(simdPackage, "Mask32x16.AsInt32x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int32x16.AsMask32x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask32x16.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask32x16.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "LoadMask32x16FromBits", simdLoadMask(32, 16), sys.AMD64) addF(simdPackage, "Mask32x16.StoreToBits", simdStoreMask(32, 16), sys.AMD64) + addF(simdPackage, "Mask32x16FromBits", simdCvtMask(32, 16), sys.AMD64) addF(simdPackage, "Mask64x2.AsInt64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int64x2.AsMask64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, 
"Mask64x2.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask64x2.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "LoadMask64x2FromBits", simdLoadMask(64, 2), sys.AMD64) addF(simdPackage, "Mask64x2.StoreToBits", simdStoreMask(64, 2), sys.AMD64) + addF(simdPackage, "Mask64x2FromBits", simdCvtMask(64, 2), sys.AMD64) addF(simdPackage, "Mask64x4.AsInt64x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int64x4.AsMask64x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask64x4.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask64x4.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "LoadMask64x4FromBits", simdLoadMask(64, 4), sys.AMD64) addF(simdPackage, "Mask64x4.StoreToBits", simdStoreMask(64, 4), sys.AMD64) + addF(simdPackage, "Mask64x4FromBits", simdCvtMask(64, 4), sys.AMD64) addF(simdPackage, "Mask64x8.AsInt64x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int64x8.AsMask64x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask64x8.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask64x8.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "LoadMask64x8FromBits", simdLoadMask(64, 8), sys.AMD64) addF(simdPackage, "Mask64x8.StoreToBits", simdStoreMask(64, 8), sys.AMD64) + addF(simdPackage, "Mask64x8FromBits", simdCvtMask(64, 8), sys.AMD64) } diff --git a/src/simd/types_amd64.go b/src/simd/types_amd64.go index c1676ff34e..252da021e2 100644 --- a/src/simd/types_amd64.go +++ b/src/simd/types_amd64.go @@ -293,6 +293,10 @@ func LoadMask8x16FromBits(y *uint64) Mask8x16 //go:noescape func (x Mask8x16) StoreToBits(y *uint64) +// Mask8x16FromBits constructs a Mask8x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset. +// Only the lower 16 bits of y are used. +func Mask8x16FromBits(y uint16) Mask8x16 + // Mask16x8 is a 128-bit SIMD vector of 8 int16 type Mask16x8 struct { int16x8 v128 @@ -315,6 +319,10 @@ func LoadMask16x8FromBits(y *uint64) Mask16x8 //go:noescape func (x Mask16x8) StoreToBits(y *uint64) +// Mask16x8FromBits constructs a Mask16x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset. +// Only the lower 8 bits of y are used. +func Mask16x8FromBits(y uint8) Mask16x8 + // Mask32x4 is a 128-bit SIMD vector of 4 int32 type Mask32x4 struct { int32x4 v128 @@ -337,6 +345,10 @@ func LoadMask32x4FromBits(y *uint64) Mask32x4 //go:noescape func (x Mask32x4) StoreToBits(y *uint64) +// Mask32x4FromBits constructs a Mask32x4 from a bitmap value, where 1 means set for the indexed element, 0 means unset. +// Only the lower 4 bits of y are used. +func Mask32x4FromBits(y uint8) Mask32x4 + // Mask64x2 is a 128-bit SIMD vector of 2 int64 type Mask64x2 struct { int64x2 v128 @@ -359,6 +371,10 @@ func LoadMask64x2FromBits(y *uint64) Mask64x2 //go:noescape func (x Mask64x2) StoreToBits(y *uint64) +// Mask64x2FromBits constructs a Mask64x2 from a bitmap value, where 1 means set for the indexed element, 0 means unset. +// Only the lower 2 bits of y are used. 
+func Mask64x2FromBits(y uint8) Mask64x2 + // v256 is a tag type that tells the compiler that this is really 256-bit SIMD type v256 struct { _256 struct{} @@ -648,6 +664,10 @@ func LoadMask8x32FromBits(y *uint64) Mask8x32 //go:noescape func (x Mask8x32) StoreToBits(y *uint64) +// Mask8x32FromBits constructs a Mask8x32 from a bitmap value, where 1 means set for the indexed element, 0 means unset. +// Only the lower 32 bits of y are used. +func Mask8x32FromBits(y uint32) Mask8x32 + // Mask16x16 is a 256-bit SIMD vector of 16 int16 type Mask16x16 struct { int16x16 v256 @@ -670,6 +690,10 @@ func LoadMask16x16FromBits(y *uint64) Mask16x16 //go:noescape func (x Mask16x16) StoreToBits(y *uint64) +// Mask16x16FromBits constructs a Mask16x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset. +// Only the lower 16 bits of y are used. +func Mask16x16FromBits(y uint16) Mask16x16 + // Mask32x8 is a 256-bit SIMD vector of 8 int32 type Mask32x8 struct { int32x8 v256 @@ -692,6 +716,10 @@ func LoadMask32x8FromBits(y *uint64) Mask32x8 //go:noescape func (x Mask32x8) StoreToBits(y *uint64) +// Mask32x8FromBits constructs a Mask32x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset. +// Only the lower 8 bits of y are used. +func Mask32x8FromBits(y uint8) Mask32x8 + // Mask64x4 is a 256-bit SIMD vector of 4 int64 type Mask64x4 struct { int64x4 v256 @@ -714,6 +742,10 @@ func LoadMask64x4FromBits(y *uint64) Mask64x4 //go:noescape func (x Mask64x4) StoreToBits(y *uint64) +// Mask64x4FromBits constructs a Mask64x4 from a bitmap value, where 1 means set for the indexed element, 0 means unset. +// Only the lower 4 bits of y are used. +func Mask64x4FromBits(y uint8) Mask64x4 + // v512 is a tag type that tells the compiler that this is really 512-bit SIMD type v512 struct { _512 struct{} @@ -931,6 +963,10 @@ func LoadMask8x64FromBits(y *uint64) Mask8x64 //go:noescape func (x Mask8x64) StoreToBits(y *uint64) +// Mask8x64FromBits constructs a Mask8x64 from a bitmap value, where 1 means set for the indexed element, 0 means unset. +// Only the lower 64 bits of y are used. +func Mask8x64FromBits(y uint64) Mask8x64 + // Mask16x32 is a 512-bit SIMD vector of 32 int16 type Mask16x32 struct { int16x32 v512 @@ -953,6 +989,10 @@ func LoadMask16x32FromBits(y *uint64) Mask16x32 //go:noescape func (x Mask16x32) StoreToBits(y *uint64) +// Mask16x32FromBits constructs a Mask16x32 from a bitmap value, where 1 means set for the indexed element, 0 means unset. +// Only the lower 32 bits of y are used. +func Mask16x32FromBits(y uint32) Mask16x32 + // Mask32x16 is a 512-bit SIMD vector of 16 int32 type Mask32x16 struct { int32x16 v512 @@ -975,6 +1015,10 @@ func LoadMask32x16FromBits(y *uint64) Mask32x16 //go:noescape func (x Mask32x16) StoreToBits(y *uint64) +// Mask32x16FromBits constructs a Mask32x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset. +// Only the lower 16 bits of y are used. +func Mask32x16FromBits(y uint16) Mask32x16 + // Mask64x8 is a 512-bit SIMD vector of 8 int64 type Mask64x8 struct { int64x8 v512 @@ -996,3 +1040,7 @@ func LoadMask64x8FromBits(y *uint64) Mask64x8 // //go:noescape func (x Mask64x8) StoreToBits(y *uint64) + +// Mask64x8FromBits constructs a Mask64x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset. +// Only the lower 8 bits of y are used. +func Mask64x8FromBits(y uint8) Mask64x8 -- 2.52.0
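
Illustrative usage (not part of the CL): a minimal sketch of the new MaskNxMFromBits constructors declared above in src/simd/types_amd64.go, assuming an amd64 toolchain built with the simd GOEXPERIMENT so that package simd and its intrinsics are available. Under the hood, Mask32x4FromBits lowers to the new generic Cvt8toMask32x4 op, which the AMD64 rules rewrite to VPMOVMToVec32x4 (KMOVB x), i.e. a GP-to-K-register move followed by a mask-to-vector move. The round trip through the existing StoreToBits method is only meant to make the bit layout visible, not to suggest a recommended pattern.

	package main

	import (
		"fmt"
		"simd"
	)

	func main() {
		// Build a 4-lane mask from an integer bitmap: lanes 1 and 3 set.
		m := simd.Mask32x4FromBits(0b1010)

		// StoreToBits writes the mask back out as a bitmap; only the
		// low 4 bits are meaningful for a Mask32x4.
		var bits uint64
		m.StoreToBits(&bits)
		fmt.Printf("%04b\n", bits) // expected: 1010
	}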