From e34ad6de42d32c6be78e0c31780977cca3ddc9f4 Mon Sep 17 00:00:00 2001
From: Junyang Shao
Date: Tue, 16 Sep 2025 17:27:36 +0000
Subject: [PATCH] [dev.simd] cmd/compile: optimize VPTEST for 2-operand cases

Change-Id: Ica2d5ee48082c69e86b12b519ba8df7a2556392f
Reviewed-on: https://go-review.googlesource.com/c/go/+/704355
LUCI-TryBot-Result: Go LUCI
Reviewed-by: David Chase
---
 src/cmd/compile/internal/ssa/_gen/AMD64.rules |  10 +
 src/cmd/compile/internal/ssa/rewriteAMD64.go  | 368 ++++++++++++++++++
 test/codegen/simd.go                          |  29 ++
 3 files changed, 407 insertions(+)
 create mode 100644 test/codegen/simd.go

diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64.rules b/src/cmd/compile/internal/ssa/_gen/AMD64.rules
index 3c73737dc0..05fc64d486 100644
--- a/src/cmd/compile/internal/ssa/_gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules
@@ -1802,3 +1802,13 @@
 (VMOVDQUstore(128|256|512) [off1] {sym} x:(ADDQconst [off2] ptr) val mem) && is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 => (VMOVDQUstore(128|256|512) [off1+off2] {sym} ptr val mem)
 (VMOVDQUload(128|256|512) [off1] {sym1} x:(LEAQ [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2) => (VMOVDQUload(128|256|512) [off1+off2] {mergeSym(sym1, sym2)} base mem)
 (VMOVDQUstore(128|256|512) [off1] {sym1} x:(LEAQ [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2) => (VMOVDQUstore(128|256|512) [off1+off2] {mergeSym(sym1, sym2)} base val mem)
+
+// 2-op VPTEST optimizations
+(SETEQ (VPTEST x:(VPAND(128|256) j k) y)) && x == y && x.Uses == 2 => (SETEQ (VPTEST j k))
+(SETEQ (VPTEST x:(VPAND(D|Q)512 j k) y)) && x == y && x.Uses == 2 => (SETEQ (VPTEST j k))
+(SETEQ (VPTEST x:(VPANDN(128|256) j k) y)) && x == y && x.Uses == 2 => (SETB (VPTEST k j)) // AndNot has swapped its operand order
+(SETEQ (VPTEST x:(VPANDN(D|Q)512 j k) y)) && x == y && x.Uses == 2 => (SETB (VPTEST k j)) // AndNot has swapped its operand order
+(EQ (VPTEST x:(VPAND(128|256) j k) y) yes no) && x == y && x.Uses == 2 => (EQ (VPTEST j k) yes no)
+(EQ (VPTEST x:(VPAND(D|Q)512 j k) y) yes no) && x == y && x.Uses == 2 => (EQ (VPTEST j k) yes no)
+(EQ (VPTEST x:(VPANDN(128|256) j k) y) yes no) && x == y && x.Uses == 2 => (ULT (VPTEST k j) yes no) // AndNot has swapped its operand order
+(EQ (VPTEST x:(VPANDN(D|Q)512 j k) y) yes no) && x == y && x.Uses == 2 => (ULT (VPTEST k j) yes no) // AndNot has swapped its operand order
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index 70b6d549fb..26a06fc3fc 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -22607,6 +22607,190 @@ func rewriteValueAMD64_OpAMD64SETEQ(v *Value) bool {
 		}
 		break
 	}
+	// match: (SETEQ (VPTEST x:(VPAND128 j k) y))
+	// cond: x == y && x.Uses == 2
+	// result: (SETEQ (VPTEST j k))
+	for {
+		if v_0.Op != OpAMD64VPTEST {
+			break
+		}
+		y := v_0.Args[1]
+		x := v_0.Args[0]
+		if x.Op != OpAMD64VPAND128 {
+			break
+		}
+		k := x.Args[1]
+		j := x.Args[0]
+		if !(x == y && x.Uses == 2) {
+			break
+		}
+		v.reset(OpAMD64SETEQ)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPTEST, types.TypeFlags)
+		v0.AddArg2(j, k)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (SETEQ (VPTEST x:(VPAND256 j k) y))
+	// cond: x == y && x.Uses == 2
+	// result: (SETEQ (VPTEST j k))
+	for {
+		if v_0.Op != OpAMD64VPTEST {
+			break
+		}
+		y := v_0.Args[1]
+		x := v_0.Args[0]
+		if x.Op != OpAMD64VPAND256 {
+			break
+		}
+		k := x.Args[1]
+		j := x.Args[0]
+		if !(x == y && x.Uses == 2) {
+			break
+		}
+		v.reset(OpAMD64SETEQ)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPTEST, types.TypeFlags)
+		v0.AddArg2(j, k)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (SETEQ (VPTEST x:(VPANDD512 j k) y))
+	// cond: x == y && x.Uses == 2
+	// result: (SETEQ (VPTEST j k))
+	for {
+		if v_0.Op != OpAMD64VPTEST {
+			break
+		}
+		y := v_0.Args[1]
+		x := v_0.Args[0]
+		if x.Op != OpAMD64VPANDD512 {
+			break
+		}
+		k := x.Args[1]
+		j := x.Args[0]
+		if !(x == y && x.Uses == 2) {
+			break
+		}
+		v.reset(OpAMD64SETEQ)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPTEST, types.TypeFlags)
+		v0.AddArg2(j, k)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (SETEQ (VPTEST x:(VPANDQ512 j k) y))
+	// cond: x == y && x.Uses == 2
+	// result: (SETEQ (VPTEST j k))
+	for {
+		if v_0.Op != OpAMD64VPTEST {
+			break
+		}
+		y := v_0.Args[1]
+		x := v_0.Args[0]
+		if x.Op != OpAMD64VPANDQ512 {
+			break
+		}
+		k := x.Args[1]
+		j := x.Args[0]
+		if !(x == y && x.Uses == 2) {
+			break
+		}
+		v.reset(OpAMD64SETEQ)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPTEST, types.TypeFlags)
+		v0.AddArg2(j, k)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (SETEQ (VPTEST x:(VPANDN128 j k) y))
+	// cond: x == y && x.Uses == 2
+	// result: (SETB (VPTEST k j))
+	for {
+		if v_0.Op != OpAMD64VPTEST {
+			break
+		}
+		y := v_0.Args[1]
+		x := v_0.Args[0]
+		if x.Op != OpAMD64VPANDN128 {
+			break
+		}
+		k := x.Args[1]
+		j := x.Args[0]
+		if !(x == y && x.Uses == 2) {
+			break
+		}
+		v.reset(OpAMD64SETB)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPTEST, types.TypeFlags)
+		v0.AddArg2(k, j)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (SETEQ (VPTEST x:(VPANDN256 j k) y))
+	// cond: x == y && x.Uses == 2
+	// result: (SETB (VPTEST k j))
+	for {
+		if v_0.Op != OpAMD64VPTEST {
+			break
+		}
+		y := v_0.Args[1]
+		x := v_0.Args[0]
+		if x.Op != OpAMD64VPANDN256 {
+			break
+		}
+		k := x.Args[1]
+		j := x.Args[0]
+		if !(x == y && x.Uses == 2) {
+			break
+		}
+		v.reset(OpAMD64SETB)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPTEST, types.TypeFlags)
+		v0.AddArg2(k, j)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (SETEQ (VPTEST x:(VPANDND512 j k) y))
+	// cond: x == y && x.Uses == 2
+	// result: (SETB (VPTEST k j))
+	for {
+		if v_0.Op != OpAMD64VPTEST {
+			break
+		}
+		y := v_0.Args[1]
+		x := v_0.Args[0]
+		if x.Op != OpAMD64VPANDND512 {
+			break
+		}
+		k := x.Args[1]
+		j := x.Args[0]
+		if !(x == y && x.Uses == 2) {
+			break
+		}
+		v.reset(OpAMD64SETB)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPTEST, types.TypeFlags)
+		v0.AddArg2(k, j)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (SETEQ (VPTEST x:(VPANDNQ512 j k) y))
+	// cond: x == y && x.Uses == 2
+	// result: (SETB (VPTEST k j))
+	for {
+		if v_0.Op != OpAMD64VPTEST {
+			break
+		}
+		y := v_0.Args[1]
+		x := v_0.Args[0]
+		if x.Op != OpAMD64VPANDNQ512 {
+			break
+		}
+		k := x.Args[1]
+		j := x.Args[0]
+		if !(x == y && x.Uses == 2) {
+			break
+		}
+		v.reset(OpAMD64SETB)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPTEST, types.TypeFlags)
+		v0.AddArg2(k, j)
+		v.AddArg(v0)
+		return true
+	}
 	return false
 }
 func rewriteValueAMD64_OpAMD64SETEQstore(v *Value) bool {
@@ -61066,6 +61250,190 @@ func rewriteBlockAMD64(b *Block) bool {
 			}
 			break
 		}
+		// match: (EQ (VPTEST x:(VPAND128 j k) y) yes no)
+		// cond: x == y && x.Uses == 2
+		// result: (EQ (VPTEST j k) yes no)
+		for b.Controls[0].Op == OpAMD64VPTEST {
+			v_0 := b.Controls[0]
+			y := v_0.Args[1]
+			x := v_0.Args[0]
+			if x.Op != OpAMD64VPAND128 {
+				break
+			}
+			_ = x.Args[1]
+			x_0 := x.Args[0]
+			x_1 := x.Args[1]
+			for _i0 := 0; _i0 <= 1; _i0, x_0, x_1 = _i0+1, x_1, x_0 {
+				j := x_0
+				k := x_1
+				if !(x == y && x.Uses == 2) {
+					continue
+				}
+				v0 := b.NewValue0(v_0.Pos, OpAMD64VPTEST, types.TypeFlags)
+				v0.AddArg2(j, k)
+				b.resetWithControl(BlockAMD64EQ, v0)
+				return true
+			}
+			break
+		}
+		// match: (EQ (VPTEST x:(VPAND256 j k) y) yes no)
+		// cond: x == y && x.Uses == 2
+		// result: (EQ (VPTEST j k) yes no)
+		for b.Controls[0].Op == OpAMD64VPTEST {
+			v_0 := b.Controls[0]
+			y := v_0.Args[1]
+			x := v_0.Args[0]
+			if x.Op != OpAMD64VPAND256 {
+				break
+			}
+			_ = x.Args[1]
+			x_0 := x.Args[0]
+			x_1 := x.Args[1]
+			for _i0 := 0; _i0 <= 1; _i0, x_0, x_1 = _i0+1, x_1, x_0 {
+				j := x_0
+				k := x_1
+				if !(x == y && x.Uses == 2) {
+					continue
+				}
+				v0 := b.NewValue0(v_0.Pos, OpAMD64VPTEST, types.TypeFlags)
+				v0.AddArg2(j, k)
+				b.resetWithControl(BlockAMD64EQ, v0)
+				return true
+			}
+			break
+		}
+		// match: (EQ (VPTEST x:(VPANDD512 j k) y) yes no)
+		// cond: x == y && x.Uses == 2
+		// result: (EQ (VPTEST j k) yes no)
+		for b.Controls[0].Op == OpAMD64VPTEST {
+			v_0 := b.Controls[0]
+			y := v_0.Args[1]
+			x := v_0.Args[0]
+			if x.Op != OpAMD64VPANDD512 {
+				break
+			}
+			_ = x.Args[1]
+			x_0 := x.Args[0]
+			x_1 := x.Args[1]
+			for _i0 := 0; _i0 <= 1; _i0, x_0, x_1 = _i0+1, x_1, x_0 {
+				j := x_0
+				k := x_1
+				if !(x == y && x.Uses == 2) {
+					continue
+				}
+				v0 := b.NewValue0(v_0.Pos, OpAMD64VPTEST, types.TypeFlags)
+				v0.AddArg2(j, k)
+				b.resetWithControl(BlockAMD64EQ, v0)
+				return true
+			}
+			break
+		}
+		// match: (EQ (VPTEST x:(VPANDQ512 j k) y) yes no)
+		// cond: x == y && x.Uses == 2
+		// result: (EQ (VPTEST j k) yes no)
+		for b.Controls[0].Op == OpAMD64VPTEST {
+			v_0 := b.Controls[0]
+			y := v_0.Args[1]
+			x := v_0.Args[0]
+			if x.Op != OpAMD64VPANDQ512 {
+				break
+			}
+			_ = x.Args[1]
+			x_0 := x.Args[0]
+			x_1 := x.Args[1]
+			for _i0 := 0; _i0 <= 1; _i0, x_0, x_1 = _i0+1, x_1, x_0 {
+				j := x_0
+				k := x_1
+				if !(x == y && x.Uses == 2) {
+					continue
+				}
+				v0 := b.NewValue0(v_0.Pos, OpAMD64VPTEST, types.TypeFlags)
+				v0.AddArg2(j, k)
+				b.resetWithControl(BlockAMD64EQ, v0)
+				return true
+			}
+			break
+		}
+		// match: (EQ (VPTEST x:(VPANDN128 j k) y) yes no)
+		// cond: x == y && x.Uses == 2
+		// result: (ULT (VPTEST k j) yes no)
+		for b.Controls[0].Op == OpAMD64VPTEST {
+			v_0 := b.Controls[0]
+			y := v_0.Args[1]
+			x := v_0.Args[0]
+			if x.Op != OpAMD64VPANDN128 {
+				break
+			}
+			k := x.Args[1]
+			j := x.Args[0]
+			if !(x == y && x.Uses == 2) {
+				break
+			}
+			v0 := b.NewValue0(v_0.Pos, OpAMD64VPTEST, types.TypeFlags)
+			v0.AddArg2(k, j)
+			b.resetWithControl(BlockAMD64ULT, v0)
+			return true
+		}
+		// match: (EQ (VPTEST x:(VPANDN256 j k) y) yes no)
+		// cond: x == y && x.Uses == 2
+		// result: (ULT (VPTEST k j) yes no)
+		for b.Controls[0].Op == OpAMD64VPTEST {
+			v_0 := b.Controls[0]
+			y := v_0.Args[1]
+			x := v_0.Args[0]
+			if x.Op != OpAMD64VPANDN256 {
+				break
+			}
+			k := x.Args[1]
+			j := x.Args[0]
+			if !(x == y && x.Uses == 2) {
+				break
+			}
+			v0 := b.NewValue0(v_0.Pos, OpAMD64VPTEST, types.TypeFlags)
+			v0.AddArg2(k, j)
+			b.resetWithControl(BlockAMD64ULT, v0)
+			return true
+		}
+		// match: (EQ (VPTEST x:(VPANDND512 j k) y) yes no)
+		// cond: x == y && x.Uses == 2
+		// result: (ULT (VPTEST k j) yes no)
+		for b.Controls[0].Op == OpAMD64VPTEST {
+			v_0 := b.Controls[0]
+			y := v_0.Args[1]
+			x := v_0.Args[0]
+			if x.Op != OpAMD64VPANDND512 {
+				break
+			}
+			k := x.Args[1]
+			j := x.Args[0]
+			if !(x == y && x.Uses == 2) {
+				break
+			}
+			v0 := b.NewValue0(v_0.Pos, OpAMD64VPTEST, types.TypeFlags)
+			v0.AddArg2(k, j)
+			b.resetWithControl(BlockAMD64ULT, v0)
+			return true
+		}
+		// match: (EQ (VPTEST x:(VPANDNQ512 j k) y) yes no)
+		// cond: x == y && x.Uses == 2
+		// result: (ULT (VPTEST k j) yes no)
+		for b.Controls[0].Op == OpAMD64VPTEST {
+			v_0 := b.Controls[0]
+			y := v_0.Args[1]
+			x := v_0.Args[0]
+			if x.Op != OpAMD64VPANDNQ512 {
+				break
+			}
+			k := x.Args[1]
+			j := x.Args[0]
+			if !(x == y && x.Uses == 2) {
+				break
+			}
+			v0 := b.NewValue0(v_0.Pos, OpAMD64VPTEST, types.TypeFlags)
+			v0.AddArg2(k, j)
+			b.resetWithControl(BlockAMD64ULT, v0)
+			return true
+		}
 	case BlockAMD64GE:
 		// match: (GE c:(CMPQconst [128] z) yes no)
 		// cond: c.Uses == 1
diff --git a/test/codegen/simd.go b/test/codegen/simd.go
new file mode 100644
index 0000000000..0d617bfc46
--- /dev/null
+++ b/test/codegen/simd.go
@@ -0,0 +1,29 @@
+// asmcheck
+
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// These tests check code generation of simd peephole optimizations.
+
+//go:build goexperiment.simd
+
+package codegen
+
+import "simd"
+
+func vptest1() bool {
+	v1 := simd.LoadUint64x2Slice([]uint64{0, 1})
+	v2 := simd.LoadUint64x2Slice([]uint64{0, 0})
+	// amd64:`VPTEST\s(.*)(.*)$`
+	// amd64:`SETCS\s(.*)$`
+	return v1.AndNot(v2).IsZero()
+}
+
+func vptest2() bool {
+	v1 := simd.LoadUint64x2Slice([]uint64{0, 1})
+	v2 := simd.LoadUint64x2Slice([]uint64{0, 0})
+	// amd64:`VPTEST\s(.*)(.*)$`
+	// amd64:`SETEQ\s(.*)$`
+	return v1.And(v2).IsZero()
+}
-- 
2.52.0
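
A note on why the AndNot rules flip to SETB/ULT with swapped VPTEST operands: PTEST/VPTEST sets ZF when (second source AND first source) is all zeros, and CF when (second source AND NOT first source) is all zeros, so an all-zero AndNot result shows up in the carry flag rather than the zero flag. The sketch below is a minimal scalar model of those flag semantics, not part of the CL; the helper name vptestFlags and the two-uint64-lane representation of a 128-bit vector are made up for illustration.

package main

import "fmt"

// vptestFlags models the flag outputs of VPTEST on a 128-bit value
// represented as two uint64 lanes (illustration only, not compiler code).
// ZF := (src AND dst) == 0
// CF := (src AND NOT dst) == 0
func vptestFlags(dst, src [2]uint64) (zf, cf bool) {
	zf = ((src[0] & dst[0]) | (src[1] & dst[1])) == 0
	cf = ((src[0] &^ dst[0]) | (src[1] &^ dst[1])) == 0
	return zf, cf
}

func main() {
	a := [2]uint64{0, 1}
	b := [2]uint64{0, 0}

	// a.And(b).IsZero() corresponds to ZF of a single VPTEST over a and b,
	// which is why the And rules keep SETEQ/EQ.
	zf, _ := vptestFlags(a, b)
	fmt.Println("And is zero:", zf) // true: {0,1} AND {0,0} is all zeros

	// a.AndNot(b).IsZero() is (a AND NOT b) == 0, which is CF of a VPTEST
	// with b as the first operand, hence the swapped operands and the
	// SETB/ULT (carry set) forms in the rules.
	_, cf := vptestFlags(b, a)
	fmt.Println("AndNot is zero:", cf) // false: lane 1 of a survives
}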