Cypherpunks repositories - gostls13.git/commitdiff
[dev.simd] cmd/compile: fix holes in mask peepholes
author: Junyang Shao <shaojunyang@google.com>
Sun, 14 Sep 2025 20:17:55 +0000 (20:17 +0000)
committer: Junyang Shao <shaojunyang@google.com>
Mon, 15 Sep 2025 16:40:42 +0000 (09:40 -0700)
It turns out that ".Masked" is implemented by VPANDQ *and* VPANDD.
The shape of the bitwise AND doesn't matter; the correctness of the rules is
guaranteed by the way the mask is generated.

This CL fixes the holes in the peephole rules.

Change-Id: I2d15c4d17afed6fdbb2f3905a51b2c5c2f673348
Reviewed-on: https://go-review.googlesource.com/c/go/+/703257
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
src/cmd/compile/internal/ssa/_gen/AMD64.rules
src/cmd/compile/internal/ssa/rewriteAMD64.go

index ad84ba755595d718d5a16aa9362a37e73ea8d8f5..a508395825040c0353d314d02416a2436e06c344 100644 (file)
 (VPANDQ512 x (VPMOVMToVec32x16 k)) => (VMOVDQU32Masked512 x k)
 (VPANDQ512 x (VPMOVMToVec16x32 k)) => (VMOVDQU16Masked512 x k)
 (VPANDQ512 x (VPMOVMToVec8x64 k)) => (VMOVDQU8Masked512 x k)
+(VPANDD512 x (VPMOVMToVec64x8 k)) => (VMOVDQU64Masked512 x k)
+(VPANDD512 x (VPMOVMToVec32x16 k)) => (VMOVDQU32Masked512 x k)
+(VPANDD512 x (VPMOVMToVec16x32 k)) => (VMOVDQU16Masked512 x k)
+(VPANDD512 x (VPMOVMToVec8x64 k)) => (VMOVDQU8Masked512 x k)
 
 // Insert to zero of 32/64 bit floats and ints to a zero is just MOVS[SD]
 (VPINSRQ128 [0] (Zero128 <t>) y) && y.Type.IsFloat() => (VMOVSDf2v <types.TypeVec128> y)
index 01227793278952c079e9df2343141f69a6f8597c..187b3ed9d64ae77c34eaa2ac5addcfddab89ed51 100644 (file)
@@ -34681,6 +34681,66 @@ func rewriteValueAMD64_OpAMD64VPADDQMasked512(v *Value) bool {
 func rewriteValueAMD64_OpAMD64VPANDD512(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
+       // match: (VPANDD512 x (VPMOVMToVec64x8 k))
+       // result: (VMOVDQU64Masked512 x k)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       if v_1.Op != OpAMD64VPMOVMToVec64x8 {
+                               continue
+                       }
+                       k := v_1.Args[0]
+                       v.reset(OpAMD64VMOVDQU64Masked512)
+                       v.AddArg2(x, k)
+                       return true
+               }
+               break
+       }
+       // match: (VPANDD512 x (VPMOVMToVec32x16 k))
+       // result: (VMOVDQU32Masked512 x k)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       if v_1.Op != OpAMD64VPMOVMToVec32x16 {
+                               continue
+                       }
+                       k := v_1.Args[0]
+                       v.reset(OpAMD64VMOVDQU32Masked512)
+                       v.AddArg2(x, k)
+                       return true
+               }
+               break
+       }
+       // match: (VPANDD512 x (VPMOVMToVec16x32 k))
+       // result: (VMOVDQU16Masked512 x k)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       if v_1.Op != OpAMD64VPMOVMToVec16x32 {
+                               continue
+                       }
+                       k := v_1.Args[0]
+                       v.reset(OpAMD64VMOVDQU16Masked512)
+                       v.AddArg2(x, k)
+                       return true
+               }
+               break
+       }
+       // match: (VPANDD512 x (VPMOVMToVec8x64 k))
+       // result: (VMOVDQU8Masked512 x k)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       if v_1.Op != OpAMD64VPMOVMToVec8x64 {
+                               continue
+                       }
+                       k := v_1.Args[0]
+                       v.reset(OpAMD64VMOVDQU8Masked512)
+                       v.AddArg2(x, k)
+                       return true
+               }
+               break
+       }
        // match: (VPANDD512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
        // result: (VPANDD512load {sym} [off] x ptr mem)