[dev.simd] cmd/compile: peephole simd mask load/stores from bits
author    Junyang Shao <shaojunyang@google.com>
Fri, 10 Oct 2025 19:10:31 +0000 (19:10 +0000)
committer Junyang Shao <shaojunyang@google.com>
Tue, 14 Oct 2025 19:26:41 +0000 (12:26 -0700)
The added tests were manually checked to confirm that the peepholes are triggered.

Change-Id: Ibd29eac449869b52c2376f9eafd83410b5266890
Reviewed-on: https://go-review.googlesource.com/c/go/+/710916
Reviewed-by: Cherry Mui <cherryyz@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
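
For context, a minimal sketch of the kind of Go code these peepholes target, written against the simd package API exercised by the tests in this CL (the import path and the simd.Int16x8 type name are assumptions taken from the Load*Slice and *FromBits constructors used in the tests below; the instruction selection described in the comments is the expected effect of the new rules, not a guarantee, and requires AVX-512 at runtime):

	package masksketch

	import "simd"

	var bits uint8 = 0b10

	// maskedAdd builds a mask from bits held in memory. With this CL the
	// compiler can load the byte directly into a K register (KMOVBload)
	// instead of loading it into a GP register and then moving it to K.
	func maskedAdd(a, b []int64) [2]int64 {
		var out [2]int64
		m := simd.Mask64x2FromBits(bits)
		simd.LoadInt64x2Slice(a).Add(simd.LoadInt64x2Slice(b)).Masked(m).Store(&out)
		return out
	}

	// storeMaskBits goes the other way: the mask-to-GP move produced by
	// ToBits feeding a byte store can be folded into a single KMOVBstore.
	func storeMaskBits(dst *uint8, v simd.Int16x8) {
		*dst = v.ToMask().ToBits()
	}

Whether the folds actually fire can be checked manually, for example by compiling with -gcflags=-S or disassembling the test binary with go tool objdump and looking for KMOVB/KMOVW/KMOVD/KMOVQ with a memory operand instead of a GP round trip.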

src/cmd/compile/internal/amd64/ssa.go
src/cmd/compile/internal/ssa/_gen/AMD64.rules
src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteAMD64.go
src/simd/internal/simd_test/simd_test.go

index 0159d8ec07ad5fadcd68ed40510c13a0fa450b53..25fa7b695a2f3d188a1a4e5a5fb17e578890680a 100644 (file)
@@ -1755,14 +1755,16 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
                p.To.Type = obj.TYPE_REG
                p.To.Reg = simdReg(v)
 
-       case ssa.OpAMD64VMOVDQUload128, ssa.OpAMD64VMOVDQUload256, ssa.OpAMD64VMOVDQUload512, ssa.OpAMD64KMOVQload:
+       case ssa.OpAMD64VMOVDQUload128, ssa.OpAMD64VMOVDQUload256, ssa.OpAMD64VMOVDQUload512,
+               ssa.OpAMD64KMOVBload, ssa.OpAMD64KMOVWload, ssa.OpAMD64KMOVDload, ssa.OpAMD64KMOVQload:
                p := s.Prog(v.Op.Asm())
                p.From.Type = obj.TYPE_MEM
                p.From.Reg = v.Args[0].Reg()
                ssagen.AddAux(&p.From, v)
                p.To.Type = obj.TYPE_REG
                p.To.Reg = simdOrMaskReg(v)
-       case ssa.OpAMD64VMOVDQUstore128, ssa.OpAMD64VMOVDQUstore256, ssa.OpAMD64VMOVDQUstore512, ssa.OpAMD64KMOVQstore:
+       case ssa.OpAMD64VMOVDQUstore128, ssa.OpAMD64VMOVDQUstore256, ssa.OpAMD64VMOVDQUstore512,
+               ssa.OpAMD64KMOVBstore, ssa.OpAMD64KMOVWstore, ssa.OpAMD64KMOVDstore, ssa.OpAMD64KMOVQstore:
                p := s.Prog(v.Op.Asm())
                p.From.Type = obj.TYPE_REG
                p.From.Reg = simdOrMaskReg(v.Args[1])
index 2b4487196031e04c81dca1e0f01e12cd7e1bfe1b..30c31eb865ee7f9b17139941b6e42d48057380c9 100644 (file)
 (CvtMask64x4to8 <t> x) => (KMOVBi <t> (VPMOVVec64x4ToM <types.TypeMask> x))
 (CvtMask64x8to8 <t> x) => (KMOVBi <t> (VPMOVVec64x8ToM <types.TypeMask> x))
 
+// Peephole optimizations: fold mask<->GP register moves into direct mask loads/stores.
+(MOVBstore [off] {sym} ptr (KMOVBi mask) mem) => (KMOVBstore [off] {sym} ptr mask mem)
+(MOVWstore [off] {sym} ptr (KMOVWi mask) mem) => (KMOVWstore [off] {sym} ptr mask mem)
+(MOVLstore [off] {sym} ptr (KMOVDi mask) mem) => (KMOVDstore [off] {sym} ptr mask mem)
+(MOVQstore [off] {sym} ptr (KMOVQi mask) mem) => (KMOVQstore [off] {sym} ptr mask mem)
+
+(KMOVBk l:(MOVBload [off] {sym} ptr mem)) && canMergeLoad(v, l) && clobber(l) => (KMOVBload [off] {sym} ptr mem)
+(KMOVWk l:(MOVWload [off] {sym} ptr mem)) && canMergeLoad(v, l) && clobber(l) => (KMOVWload [off] {sym} ptr mem)
+(KMOVDk l:(MOVLload [off] {sym} ptr mem)) && canMergeLoad(v, l) && clobber(l) => (KMOVDload [off] {sym} ptr mem)
+(KMOVQk l:(MOVQload [off] {sym} ptr mem)) && canMergeLoad(v, l) && clobber(l) => (KMOVQload [off] {sym} ptr mem)
+
 // SIMD vector loads and stores
 (Load <t> ptr mem) && t.Size() == 16 => (VMOVDQUload128 ptr mem)
 (Store {t} ptr val mem) && t.Size() == 16 => (VMOVDQUstore128 ptr val mem)
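
The two rule groups fold in opposite directions: the store rules match a mask value that is moved to a GP register (KMOVBi/KMOVWi/KMOVDi/KMOVQi) only to be stored, and replace the pair with a single mask store; the load rules match an integer load whose result is immediately moved into a mask register (KMOVBk/KMOVWk/KMOVDk/KMOVQk), and replace the pair with a single mask load. Roughly, the expected effect for the byte-sized case (illustrative only; the actual registers are chosen by the allocator):

	// store side, before:            store side, after:
	//   KMOVB   K1, AX                 KMOVB   K1, (BX)
	//   MOVB    AL, (BX)
	//
	// load side, before:             load side, after:
	//   MOVBLZX (BX), AX               KMOVB   (BX), K1
	//   KMOVB   AX, K1
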
index 027b9832ac21f7ea20e2dc205366663b26641f64..c92f1b8531e8c1ab4cbce4ef475c411863287bfd 100644 (file)
@@ -1415,7 +1415,20 @@ func init() {
                {name: "VZEROUPPER", argLength: 1, reg: regInfo{clobbers: v}, asm: "VZEROUPPER"}, // arg=mem, returns mem
                {name: "VZEROALL", argLength: 1, reg: regInfo{clobbers: v}, asm: "VZEROALL"},     // arg=mem, returns mem
 
+               // KMOVxload: loads masks
+               // Load (Q=8,D=4,W=2,B=1) bytes from (arg0+auxint+aux), arg1=mem.
+               // "+auxint+aux" == add auxint and the offset of the symbol in aux (if any) to the effective address
+               {name: "KMOVBload", argLength: 2, reg: kload, asm: "KMOVB", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+               {name: "KMOVWload", argLength: 2, reg: kload, asm: "KMOVW", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+               {name: "KMOVDload", argLength: 2, reg: kload, asm: "KMOVD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
                {name: "KMOVQload", argLength: 2, reg: kload, asm: "KMOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+
+               // KMOVxstore: stores masks
+               // Store (Q=8,D=4,W=2,B=1) low bytes of arg1.
+               // Does *(arg0+auxint+aux) = arg1, arg2=mem.
+               {name: "KMOVBstore", argLength: 3, reg: kstore, asm: "KMOVB", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"},
+               {name: "KMOVWstore", argLength: 3, reg: kstore, asm: "KMOVW", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"},
+               {name: "KMOVDstore", argLength: 3, reg: kstore, asm: "KMOVD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"},
                {name: "KMOVQstore", argLength: 3, reg: kstore, asm: "KMOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"},
 
                // Move GP directly to mask register
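
For a concrete sense of what these ops assemble to (a sketch in Go assembler syntax; the K register shown is arbitrary and the real one is picked by the register allocator), a KMOVBload with auxint 8 and its matching KMOVBstore, base pointer in AX, would look roughly like:

	// KMOVBload,  base AX, offset 8:   KMOVB 8(AX), K1
	// KMOVBstore, base AX, offset 8:   KMOVB K1, 8(AX)
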
index 08b6bffd0ef20e65104cdc31372ca3fdaa19e14b..30831e828a811c1c88bf05ebbe009da6c3fde02d 100644 (file)
@@ -1228,7 +1228,13 @@ const (
        OpAMD64VMOVSDconst
        OpAMD64VZEROUPPER
        OpAMD64VZEROALL
+       OpAMD64KMOVBload
+       OpAMD64KMOVWload
+       OpAMD64KMOVDload
        OpAMD64KMOVQload
+       OpAMD64KMOVBstore
+       OpAMD64KMOVWstore
+       OpAMD64KMOVDstore
        OpAMD64KMOVQstore
        OpAMD64KMOVQk
        OpAMD64KMOVDk
@@ -19698,6 +19704,54 @@ var opcodeTable = [...]opInfo{
                        clobbers: 2147418112, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
                },
        },
+       {
+               name:           "KMOVBload",
+               auxType:        auxSymOff,
+               argLen:         2,
+               faultOnNilArg0: true,
+               symEffect:      SymRead,
+               asm:            x86.AKMOVB,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
+                       },
+                       outputs: []outputInfo{
+                               {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                       },
+               },
+       },
+       {
+               name:           "KMOVWload",
+               auxType:        auxSymOff,
+               argLen:         2,
+               faultOnNilArg0: true,
+               symEffect:      SymRead,
+               asm:            x86.AKMOVW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
+                       },
+                       outputs: []outputInfo{
+                               {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                       },
+               },
+       },
+       {
+               name:           "KMOVDload",
+               auxType:        auxSymOff,
+               argLen:         2,
+               faultOnNilArg0: true,
+               symEffect:      SymRead,
+               asm:            x86.AKMOVD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
+                       },
+                       outputs: []outputInfo{
+                               {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                       },
+               },
+       },
        {
                name:           "KMOVQload",
                auxType:        auxSymOff,
@@ -19714,6 +19768,48 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:           "KMOVBstore",
+               auxType:        auxSymOff,
+               argLen:         3,
+               faultOnNilArg0: true,
+               symEffect:      SymWrite,
+               asm:            x86.AKMOVB,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
+                       },
+               },
+       },
+       {
+               name:           "KMOVWstore",
+               auxType:        auxSymOff,
+               argLen:         3,
+               faultOnNilArg0: true,
+               symEffect:      SymWrite,
+               asm:            x86.AKMOVW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
+                       },
+               },
+       },
+       {
+               name:           "KMOVDstore",
+               auxType:        auxSymOff,
+               argLen:         3,
+               faultOnNilArg0: true,
+               symEffect:      SymWrite,
+               asm:            x86.AKMOVD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
+                       },
+               },
+       },
        {
                name:           "KMOVQstore",
                auxType:        auxSymOff,
index 5220a0a73c2dbe5e2328caa184ad665a1f0c60c1..908fd71b783a1a3979b9faab7d03b6e94826cd1e 100644 (file)
@@ -225,6 +225,14 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpAMD64HMULQ(v)
        case OpAMD64HMULQU:
                return rewriteValueAMD64_OpAMD64HMULQU(v)
+       case OpAMD64KMOVBk:
+               return rewriteValueAMD64_OpAMD64KMOVBk(v)
+       case OpAMD64KMOVDk:
+               return rewriteValueAMD64_OpAMD64KMOVDk(v)
+       case OpAMD64KMOVQk:
+               return rewriteValueAMD64_OpAMD64KMOVQk(v)
+       case OpAMD64KMOVWk:
+               return rewriteValueAMD64_OpAMD64KMOVWk(v)
        case OpAMD64LEAL:
                return rewriteValueAMD64_OpAMD64LEAL(v)
        case OpAMD64LEAL1:
@@ -13351,6 +13359,106 @@ func rewriteValueAMD64_OpAMD64HMULQU(v *Value) bool {
        }
        return false
 }
+func rewriteValueAMD64_OpAMD64KMOVBk(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (KMOVBk l:(MOVBload [off] {sym} ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (KMOVBload [off] {sym} ptr mem)
+       for {
+               l := v_0
+               if l.Op != OpAMD64MOVBload {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64KMOVBload)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg2(ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64KMOVDk(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (KMOVDk l:(MOVLload [off] {sym} ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (KMOVDload [off] {sym} ptr mem)
+       for {
+               l := v_0
+               if l.Op != OpAMD64MOVLload {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64KMOVDload)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg2(ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64KMOVQk(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (KMOVQk l:(MOVQload [off] {sym} ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (KMOVQload [off] {sym} ptr mem)
+       for {
+               l := v_0
+               if l.Op != OpAMD64MOVQload {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64KMOVQload)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg2(ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64KMOVWk(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (KMOVWk l:(MOVWload [off] {sym} ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (KMOVWload [off] {sym} ptr mem)
+       for {
+               l := v_0
+               if l.Op != OpAMD64MOVWload {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64KMOVWload)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg2(ptr, mem)
+               return true
+       }
+       return false
+}
 func rewriteValueAMD64_OpAMD64LEAL(v *Value) bool {
        v_0 := v.Args[0]
        // match: (LEAL [c] {s} (ADDLconst [d] x))
@@ -15447,6 +15555,23 @@ func rewriteValueAMD64_OpAMD64MOVBstore(v *Value) bool {
                v.AddArg3(base, val, mem)
                return true
        }
+       // match: (MOVBstore [off] {sym} ptr (KMOVBi mask) mem)
+       // result: (KMOVBstore [off] {sym} ptr mask mem)
+       for {
+               off := auxIntToInt32(v.AuxInt)
+               sym := auxToSym(v.Aux)
+               ptr := v_0
+               if v_1.Op != OpAMD64KMOVBi {
+                       break
+               }
+               mask := v_1.Args[0]
+               mem := v_2
+               v.reset(OpAMD64KMOVBstore)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
+       }
        return false
 }
 func rewriteValueAMD64_OpAMD64MOVBstoreconst(v *Value) bool {
@@ -16477,6 +16602,23 @@ func rewriteValueAMD64_OpAMD64MOVLstore(v *Value) bool {
                v.AddArg3(p, w, mem)
                return true
        }
+       // match: (MOVLstore [off] {sym} ptr (KMOVDi mask) mem)
+       // result: (KMOVDstore [off] {sym} ptr mask mem)
+       for {
+               off := auxIntToInt32(v.AuxInt)
+               sym := auxToSym(v.Aux)
+               ptr := v_0
+               if v_1.Op != OpAMD64KMOVDi {
+                       break
+               }
+               mask := v_1.Args[0]
+               mem := v_2
+               v.reset(OpAMD64KMOVDstore)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
+       }
        return false
 }
 func rewriteValueAMD64_OpAMD64MOVLstoreconst(v *Value) bool {
@@ -17460,6 +17602,23 @@ func rewriteValueAMD64_OpAMD64MOVQstore(v *Value) bool {
                v.AddArg3(p, w, mem)
                return true
        }
+       // match: (MOVQstore [off] {sym} ptr (KMOVQi mask) mem)
+       // result: (KMOVQstore [off] {sym} ptr mask mem)
+       for {
+               off := auxIntToInt32(v.AuxInt)
+               sym := auxToSym(v.Aux)
+               ptr := v_0
+               if v_1.Op != OpAMD64KMOVQi {
+                       break
+               }
+               mask := v_1.Args[0]
+               mem := v_2
+               v.reset(OpAMD64KMOVQstore)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
+       }
        return false
 }
 func rewriteValueAMD64_OpAMD64MOVQstoreconst(v *Value) bool {
@@ -18386,6 +18545,23 @@ func rewriteValueAMD64_OpAMD64MOVWstore(v *Value) bool {
                v.AddArg3(p, w, mem)
                return true
        }
+       // match: (MOVWstore [off] {sym} ptr (KMOVWi mask) mem)
+       // result: (KMOVWstore [off] {sym} ptr mask mem)
+       for {
+               off := auxIntToInt32(v.AuxInt)
+               sym := auxToSym(v.Aux)
+               ptr := v_0
+               if v_1.Op != OpAMD64KMOVWi {
+                       break
+               }
+               mask := v_1.Args[0]
+               mem := v_2
+               v.reset(OpAMD64KMOVWstore)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
+       }
        return false
 }
 func rewriteValueAMD64_OpAMD64MOVWstoreconst(v *Value) bool {
index 2c866ad68b31920efc438684e04f3e404ef00c4e..422378eebe44d8def1c9093ca5c52f257ef6195d 100644 (file)
@@ -348,6 +348,24 @@ func TestBitMaskFromBits(t *testing.T) {
        }
 }
 
+var maskForTestBitMaskFromBitsLoad = uint8(0b10)
+
+func TestBitMaskFromBitsLoad(t *testing.T) {
+       if !simd.HasAVX512() {
+               t.Skip("Test requires HasAVX512, not available on this hardware")
+               return
+       }
+       results := [2]int64{}
+       want := [2]int64{0, 6}
+       m := simd.Mask64x2FromBits(maskForTestBitMaskFromBitsLoad)
+       simd.LoadInt64x2Slice([]int64{1, 2}).Add(simd.LoadInt64x2Slice([]int64{3, 4})).Masked(m).Store(&results)
+       for i := range 2 {
+               if results[i] != want[i] {
+                       t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], results[i])
+               }
+       }
+}
+
 func TestBitMaskToBits(t *testing.T) {
        if !simd.HasAVX512() {
                t.Skip("Test requires HasAVX512, not available on this hardware")
@@ -358,6 +376,19 @@ func TestBitMaskToBits(t *testing.T) {
        }
 }
 
+var maskForTestBitMaskFromBitsStore uint8
+
+func TestBitMaskToBitsStore(t *testing.T) {
+       if !simd.HasAVX512() {
+               t.Skip("Test requires HasAVX512, not available on this hardware")
+               return
+       }
+       maskForTestBitMaskFromBitsStore = simd.LoadInt16x8Slice([]int16{1, 0, 1, 0, 0, 0, 0, 0}).ToMask().ToBits()
+       if maskForTestBitMaskFromBitsStore != 0b101 {
+               t.Errorf("Want 0b101, got %b", maskForTestBitMaskFromBitsStore)
+       }
+}
+
 func TestMergeFloat(t *testing.T) {
        k := make([]int64, 4, 4)
        s := make([]float64, 4, 4)