From c7258178cda55e3903ad8e4b6cc51861fb689457 Mon Sep 17 00:00:00 2001 From: Alexander Musman Date: Sun, 13 Jul 2025 11:09:58 +0300 Subject: [PATCH] cmd/compile: optimize small constant-sized MemEq MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Add optimization patterns for MemEq with small constant sizes (3-32 bytes). These patterns help to avoid runtime calls for small sizes. For sizes 3-16, load and compare two chunks (which may overlap) and combine the results. For sizes 17-32, combine a 16-byte comparison with a comparison of the remaining bytes. This change may increase binary size slightly due to inline expansion, but improves performance for code with many small memequal calls, e.g. the DecodehealingTracker benchmark on arm64: shortname: minio pkg: github.com/minio/minio/cmd │ Orig.res │ Uexp.res │ │ sec/op │ sec/op vs base │ DecodehealingTracker-4 842.5n ± 1% 794.0n ± 3% -5.75% (p=0.000 n=10) AppendMsgResyncTargetsInfo-4 8.472n ± 0% 8.472n ± 0% ~ (p=0.582 n=10) DataUpdateTracker-4 2.856µ ± 2% 2.804µ ± 3% ~ (p=0.210 n=10) MarshalMsgdataUsageCacheInfo-4 131.2n ± 1% 131.6n ± 2% ~ (p=0.494 n=10) geomean 227.4n 223.2n -1.86% │ Orig.res │ Uexp.res │ │ B/s │ B/s vs base │ DecodehealingTracker-4 352.0Mi ± 1% 373.5Mi ± 3% +6.10% (p=0.000 n=10) AppendMsgResyncTargetsInfo-4 1.099Gi ± 0% 1.099Gi ± 0% ~ (p=0.183 n=10) DataUpdateTracker-4 341.8Ki ± 3% 351.6Ki ± 3% ~ (p=0.286 n=10) geomean 50.95Mi 52.46Mi +2.96% Change-Id: If3d7e7395656d5f36e3ab303a71044293d17bc3e Reviewed-on: https://go-review.googlesource.com/c/go/+/688195 Reviewed-by: Keith Randall LUCI-TryBot-Result: Go LUCI Reviewed-by: Carlos Amedee Reviewed-by: Keith Randall --- .../compile/internal/ssa/_gen/generic.rules | 37 ++++ src/cmd/compile/internal/ssa/rewrite.go | 9 + .../compile/internal/ssa/rewritegeneric.go | 168 ++++++++++++++++++ test/codegen/comparisons.go | 16 +- 4 files changed, 225 insertions(+), 5 deletions(-) diff --git a/src/cmd/compile/internal/ssa/_gen/generic.rules 
b/src/cmd/compile/internal/ssa/_gen/generic.rules index fe8fc5b262..7d0d4fbddc 100644 --- a/src/cmd/compile/internal/ssa/_gen/generic.rules +++ b/src/cmd/compile/internal/ssa/_gen/generic.rules @@ -1560,6 +1560,43 @@ (MemEq p q _ _) && isSamePtr(p, q) => (ConstBool [true]) +// 3-32 bytes memeq (enabled only with support of unaligned loads and 8-byte max word size) + +(MemEq p q (Const64 [c]) mem) + && (c == 3 || c == 5 || c == 9 || c == 17) + && canLoadUnaligned(config) + && config.RegSize == 8 + => (AndB (MemEq p q (Const64 [c-1]) mem) + (Eq8 (Load (OffPtr p [c-1]) mem) (Load (OffPtr q [c-1]) mem))) + +(MemEq p q (Const64 [c]) mem) + && (c == 6 || c == 10 || c == 18) + && canLoadUnaligned(config) + && config.RegSize == 8 + => (AndB (MemEq p q (Const64 [c-2]) mem) + (Eq16 (Load (OffPtr p [c-2]) mem) (Load (OffPtr q [c-2]) mem))) + +(MemEq p q (Const64 [c]) mem) + && (c == 7 || c == 11 || c == 19 || c == 20) + && canLoadUnaligned(config) + && config.RegSize == 8 + => (AndB (MemEq p q (Const64 [min(c-3,16)]) mem) + (Eq32 (Load (OffPtr p [c-4]) mem) (Load (OffPtr q [c-4]) mem))) + +(MemEq p q (Const64 [c]) mem) + && ((c >= 12 && c <= 16) || (c >= 21 && c <= 24)) + && canLoadUnaligned(config) + && config.RegSize == 8 + => (AndB (MemEq p q (Const64 [8 + int64(bool2int(c>16))*8]) mem) + (Eq64 (Load (OffPtr p [c-8]) mem) (Load (OffPtr q [c-8]) mem))) + +(MemEq p q (Const64 [c]) mem) + && c >= 25 && c <= 32 + && canLoadUnaligned(config) + && config.RegSize == 8 + => (AndB (MemEq p q (Const64 [16]) mem) + (MemEq (OffPtr p [16]) (OffPtr q [16]) (Const64 [c-16]) mem)) + // Turn known-size calls to memclrNoHeapPointers into a Zero. // Note that we are using types.Types[types.TUINT8] instead of sptr.Type.Elem() - see issue 55122 and CL 431496 for more details. 
(SelectN [0] call:(StaticCall {sym} sptr (Const(64|32) [c]) mem)) diff --git a/src/cmd/compile/internal/ssa/rewrite.go b/src/cmd/compile/internal/ssa/rewrite.go index 032915f701..6f415e9760 100644 --- a/src/cmd/compile/internal/ssa/rewrite.go +++ b/src/cmd/compile/internal/ssa/rewrite.go @@ -2786,3 +2786,12 @@ func imakeOfStructMake(v *Value) *Value { } return v.Block.NewValue2(v.Pos, OpIMake, v.Type, v.Args[0], arg) } + +// bool2int converts bool to int: true to 1, false to 0 +func bool2int(x bool) int { + var b int + if x { + b = 1 + } + return b +} diff --git a/src/cmd/compile/internal/ssa/rewritegeneric.go b/src/cmd/compile/internal/ssa/rewritegeneric.go index dbbb7105af..49c5facc32 100644 --- a/src/cmd/compile/internal/ssa/rewritegeneric.go +++ b/src/cmd/compile/internal/ssa/rewritegeneric.go @@ -14983,6 +14983,174 @@ func rewriteValuegeneric_OpMemEq(v *Value) bool { } break } + // match: (MemEq p q (Const64 [c]) mem) + // cond: (c == 3 || c == 5 || c == 9 || c == 17) && canLoadUnaligned(config) && config.RegSize == 8 + // result: (AndB (MemEq p q (Const64 [c-1]) mem) (Eq8 (Load (OffPtr p [c-1]) mem) (Load (OffPtr q [c-1]) mem))) + for { + p := v_0 + q := v_1 + if v_2.Op != OpConst64 { + break + } + c := auxIntToInt64(v_2.AuxInt) + mem := v_3 + if !((c == 3 || c == 5 || c == 9 || c == 17) && canLoadUnaligned(config) && config.RegSize == 8) { + break + } + v.reset(OpAndB) + v0 := b.NewValue0(v.Pos, OpMemEq, typ.Bool) + v1 := b.NewValue0(v.Pos, OpConst64, typ.Int64) + v1.AuxInt = int64ToAuxInt(c - 1) + v0.AddArg4(p, q, v1, mem) + v2 := b.NewValue0(v.Pos, OpEq8, typ.Bool) + v3 := b.NewValue0(v.Pos, OpLoad, typ.Int8) + v4 := b.NewValue0(v.Pos, OpOffPtr, p.Type) + v4.AuxInt = int64ToAuxInt(c - 1) + v4.AddArg(p) + v3.AddArg2(v4, mem) + v5 := b.NewValue0(v.Pos, OpLoad, typ.Int8) + v6 := b.NewValue0(v.Pos, OpOffPtr, q.Type) + v6.AuxInt = int64ToAuxInt(c - 1) + v6.AddArg(q) + v5.AddArg2(v6, mem) + v2.AddArg2(v3, v5) + v.AddArg2(v0, v2) + return true + } + // match: 
(MemEq p q (Const64 [c]) mem) + // cond: (c == 6 || c == 10 || c == 18) && canLoadUnaligned(config) && config.RegSize == 8 + // result: (AndB (MemEq p q (Const64 [c-2]) mem) (Eq16 (Load (OffPtr p [c-2]) mem) (Load (OffPtr q [c-2]) mem))) + for { + p := v_0 + q := v_1 + if v_2.Op != OpConst64 { + break + } + c := auxIntToInt64(v_2.AuxInt) + mem := v_3 + if !((c == 6 || c == 10 || c == 18) && canLoadUnaligned(config) && config.RegSize == 8) { + break + } + v.reset(OpAndB) + v0 := b.NewValue0(v.Pos, OpMemEq, typ.Bool) + v1 := b.NewValue0(v.Pos, OpConst64, typ.Int64) + v1.AuxInt = int64ToAuxInt(c - 2) + v0.AddArg4(p, q, v1, mem) + v2 := b.NewValue0(v.Pos, OpEq16, typ.Bool) + v3 := b.NewValue0(v.Pos, OpLoad, typ.Int16) + v4 := b.NewValue0(v.Pos, OpOffPtr, p.Type) + v4.AuxInt = int64ToAuxInt(c - 2) + v4.AddArg(p) + v3.AddArg2(v4, mem) + v5 := b.NewValue0(v.Pos, OpLoad, typ.Int16) + v6 := b.NewValue0(v.Pos, OpOffPtr, q.Type) + v6.AuxInt = int64ToAuxInt(c - 2) + v6.AddArg(q) + v5.AddArg2(v6, mem) + v2.AddArg2(v3, v5) + v.AddArg2(v0, v2) + return true + } + // match: (MemEq p q (Const64 [c]) mem) + // cond: (c == 7 || c == 11 || c == 19 || c == 20) && canLoadUnaligned(config) && config.RegSize == 8 + // result: (AndB (MemEq p q (Const64 [min(c-3,16)]) mem) (Eq32 (Load (OffPtr p [c-4]) mem) (Load (OffPtr q [c-4]) mem))) + for { + p := v_0 + q := v_1 + if v_2.Op != OpConst64 { + break + } + c := auxIntToInt64(v_2.AuxInt) + mem := v_3 + if !((c == 7 || c == 11 || c == 19 || c == 20) && canLoadUnaligned(config) && config.RegSize == 8) { + break + } + v.reset(OpAndB) + v0 := b.NewValue0(v.Pos, OpMemEq, typ.Bool) + v1 := b.NewValue0(v.Pos, OpConst64, typ.Int64) + v1.AuxInt = int64ToAuxInt(min(c-3, 16)) + v0.AddArg4(p, q, v1, mem) + v2 := b.NewValue0(v.Pos, OpEq32, typ.Bool) + v3 := b.NewValue0(v.Pos, OpLoad, typ.Int32) + v4 := b.NewValue0(v.Pos, OpOffPtr, p.Type) + v4.AuxInt = int64ToAuxInt(c - 4) + v4.AddArg(p) + v3.AddArg2(v4, mem) + v5 := b.NewValue0(v.Pos, OpLoad, typ.Int32) 
+ v6 := b.NewValue0(v.Pos, OpOffPtr, q.Type) + v6.AuxInt = int64ToAuxInt(c - 4) + v6.AddArg(q) + v5.AddArg2(v6, mem) + v2.AddArg2(v3, v5) + v.AddArg2(v0, v2) + return true + } + // match: (MemEq p q (Const64 [c]) mem) + // cond: ((c >= 12 && c <= 16) || (c >= 21 && c <= 24)) && canLoadUnaligned(config) && config.RegSize == 8 + // result: (AndB (MemEq p q (Const64 [8 + int64(bool2int(c>16))*8]) mem) (Eq64 (Load (OffPtr p [c-8]) mem) (Load (OffPtr q [c-8]) mem))) + for { + p := v_0 + q := v_1 + if v_2.Op != OpConst64 { + break + } + c := auxIntToInt64(v_2.AuxInt) + mem := v_3 + if !(((c >= 12 && c <= 16) || (c >= 21 && c <= 24)) && canLoadUnaligned(config) && config.RegSize == 8) { + break + } + v.reset(OpAndB) + v0 := b.NewValue0(v.Pos, OpMemEq, typ.Bool) + v1 := b.NewValue0(v.Pos, OpConst64, typ.Int64) + v1.AuxInt = int64ToAuxInt(8 + int64(bool2int(c > 16))*8) + v0.AddArg4(p, q, v1, mem) + v2 := b.NewValue0(v.Pos, OpEq64, typ.Bool) + v3 := b.NewValue0(v.Pos, OpLoad, typ.Int64) + v4 := b.NewValue0(v.Pos, OpOffPtr, p.Type) + v4.AuxInt = int64ToAuxInt(c - 8) + v4.AddArg(p) + v3.AddArg2(v4, mem) + v5 := b.NewValue0(v.Pos, OpLoad, typ.Int64) + v6 := b.NewValue0(v.Pos, OpOffPtr, q.Type) + v6.AuxInt = int64ToAuxInt(c - 8) + v6.AddArg(q) + v5.AddArg2(v6, mem) + v2.AddArg2(v3, v5) + v.AddArg2(v0, v2) + return true + } + // match: (MemEq p q (Const64 [c]) mem) + // cond: c >= 25 && c <= 32 && canLoadUnaligned(config) && config.RegSize == 8 + // result: (AndB (MemEq p q (Const64 [16]) mem) (MemEq (OffPtr p [16]) (OffPtr q [16]) (Const64 [c-16]) mem)) + for { + p := v_0 + q := v_1 + if v_2.Op != OpConst64 { + break + } + c := auxIntToInt64(v_2.AuxInt) + mem := v_3 + if !(c >= 25 && c <= 32 && canLoadUnaligned(config) && config.RegSize == 8) { + break + } + v.reset(OpAndB) + v0 := b.NewValue0(v.Pos, OpMemEq, typ.Bool) + v1 := b.NewValue0(v.Pos, OpConst64, typ.Int64) + v1.AuxInt = int64ToAuxInt(16) + v0.AddArg4(p, q, v1, mem) + v2 := b.NewValue0(v.Pos, OpMemEq, typ.Bool) + v3 := 
b.NewValue0(v.Pos, OpOffPtr, p.Type) + v3.AuxInt = int64ToAuxInt(16) + v3.AddArg(p) + v4 := b.NewValue0(v.Pos, OpOffPtr, q.Type) + v4.AuxInt = int64ToAuxInt(16) + v4.AddArg(q) + v5 := b.NewValue0(v.Pos, OpConst64, typ.Int64) + v5.AuxInt = int64ToAuxInt(c - 16) + v2.AddArg4(v3, v4, v5, mem) + v.AddArg2(v0, v2) + return true + } return false } func rewriteValuegeneric_OpMod16(v *Value) bool { diff --git a/test/codegen/comparisons.go b/test/codegen/comparisons.go index 0b550adc05..70852377f7 100644 --- a/test/codegen/comparisons.go +++ b/test/codegen/comparisons.go @@ -661,16 +661,22 @@ func equalVarString8(a string) bool { } func equalVarStringNoSpill(a, b string) bool { - s := string("ZZZZZZZZZ") + s := string("123456789012345678901234567890123") // arm64:".*memequal" - memeq1 := a[:9] == s + memeq1 := a[:33] == s // arm64:-".*" - memeq2 := s == a[:9] - // arm64:-"MOVB R0,.*SP",".*memequal" - memeq3 := s == b[:9] + memeq2 := s == a[:33] + // arm64:-"MOVB R0,.*SP" ".*memequal" + memeq3 := s == b[:33] return memeq1 && memeq2 && memeq3 } +func equalVarString17(a string) bool { + b := string("12345678901234567") + // arm64:-".*memequal" "CMPW [$]55," "MOVD [$]3906085646303834169," "MOVD [$]4050765991979987505," + return a[:17] == b +} + func cmpToCmn(a, b, c, d int) int { var c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11 int // arm64:`CMN`,-`CMP` -- 2.52.0