From e071617222e373f59db8995ce171bae708e0dcef Mon Sep 17 00:00:00 2001 From: Xiaolin Zhao Date: Thu, 22 May 2025 16:21:10 +0800 Subject: [PATCH] cmd/compile: optimize multiplication rules on loong64 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Improve multiplication strength reduction (see CL 626998) and add 3 additional linear combination instructions for loong64. goos: linux goarch: loong64 pkg: cmd/compile/internal/test cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | MulconstI32/3 1.6010n ± 0% 0.8005n ± 0% -50.00% (p=0.000 n=10) MulconstI32/5 1.6010n ± 0% 0.8005n ± 0% -50.00% (p=0.000 n=10) MulconstI32/12 1.601n ± 0% 1.201n ± 0% -24.98% (p=0.000 n=10) MulconstI32/120 1.6010n ± 0% 0.8130n ± 0% -49.22% (p=0.000 n=10) MulconstI32/-120 1.6010n ± 0% 0.8109n ± 0% -49.35% (p=0.000 n=10) MulconstI32/65537 1.6275n ± 0% 0.8005n ± 0% -50.81% (p=0.000 n=10) MulconstI32/65538 1.6290n ± 0% 0.8004n ± 0% -50.87% (p=0.000 n=10) MulconstI64/3 1.6010n ± 0% 0.8004n ± 0% -50.01% (p=0.000 n=10) MulconstI64/5 1.6010n ± 0% 0.8004n ± 0% -50.01% (p=0.000 n=10) MulconstI64/12 1.601n ± 0% 1.201n ± 0% -24.98% (p=0.000 n=10) MulconstI64/120 1.6010n ± 0% 0.8005n ± 0% -50.00% (p=0.000 n=10) MulconstI64/-120 1.6010n ± 0% 0.8005n ± 0% -50.00% (p=0.000 n=10) MulconstI64/65537 1.6270n ± 0% 0.8005n ± 0% -50.80% (p=0.000 n=10) MulconstI64/65538 1.6290n ± 0% 0.8071n ± 1% -50.45% (p=0.000 n=10) MulconstU32/3 1.6010n ± 0% 0.8004n ± 0% -50.01% (p=0.000 n=10) MulconstU32/5 1.6010n ± 0% 0.8004n ± 0% -50.01% (p=0.000 n=10) MulconstU32/12 1.601n ± 0% 1.201n ± 0% -24.98% (p=0.000 n=10) MulconstU32/120 1.6010n ± 0% 0.8066n ± 0% -49.62% (p=0.000 n=10) MulconstU32/65537 1.6290n ± 0% 0.8005n ± 0% -50.86% (p=0.000 n=10) MulconstU32/65538 1.6280n ± 0% 0.8005n ± 0% -50.83% (p=0.000 n=10) MulconstU64/3 1.6010n ± 0% 0.8005n ± 0% -50.00% (p=0.000 n=10) MulconstU64/5 1.6010n ± 0% 0.8005n ± 0% -50.00% (p=0.000 n=10) MulconstU64/12 
1.601n ± 0% 1.201n ± 0% -24.98% (p=0.000 n=10) MulconstU64/120 1.6010n ± 0% 0.8005n ± 0% -50.00% (p=0.000 n=10) MulconstU64/65537 1.6290n ± 0% 0.8005n ± 0% -50.86% (p=0.000 n=10) MulconstU64/65538 1.6300n ± 0% 0.8067n ± 0% -50.51% (p=0.000 n=10) geomean 1.609n 0.8537n -46.95% goos: linux goarch: loong64 pkg: cmd/compile/internal/test cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | MulconstI32/3 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstI32/5 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstI32/12 1.601n ± 0% 1.202n ± 0% -24.92% (p=0.000 n=10) MulconstI32/120 1.6020n ± 0% 0.8012n ± 0% -49.99% (p=0.000 n=10) MulconstI32/-120 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstI32/65537 1.6020n ± 0% 0.8007n ± 0% -50.02% (p=0.000 n=10) MulconstI32/65538 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstI64/3 1.6015n ± 0% 0.8007n ± 0% -50.00% (p=0.000 n=10) MulconstI64/5 1.6020n ± 0% 0.8007n ± 0% -50.02% (p=0.000 n=10) MulconstI64/12 1.602n ± 0% 1.202n ± 0% -25.00% (p=0.000 n=10) MulconstI64/120 1.6030n ± 0% 0.8011n ± 0% -50.02% (p=0.000 n=10) MulconstI64/-120 1.6020n ± 0% 0.8007n ± 0% -50.02% (p=0.000 n=10) MulconstI64/65537 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstI64/65538 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstU32/3 1.6010n ± 0% 0.8006n ± 0% -49.99% (p=0.000 n=10) MulconstU32/5 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstU32/12 1.601n ± 0% 1.202n ± 0% -24.92% (p=0.000 n=10) MulconstU32/120 1.6010n ± 0% 0.8006n ± 0% -49.99% (p=0.000 n=10) MulconstU32/65537 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstU32/65538 1.6020n ± 0% 0.8009n ± 0% -50.01% (p=0.000 n=10) MulconstU64/3 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstU64/5 1.6010n ± 0% 0.8007n ± 0% -49.98% (p=0.000 n=10) MulconstU64/12 1.601n ± 0% 1.201n ± 0% -24.98% (p=0.000 n=10) MulconstU64/120 1.6020n ± 0% 0.8007n ± 0% -50.02% (p=0.000 n=10) MulconstU64/65537 1.6010n ± 
0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstU64/65538 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) geomean 1.601n 0.8523n -46.77% Change-Id: I9fb0e47ca57875da171a347bf4828adfab41b875 Reviewed-on: https://go-review.googlesource.com/c/go/+/675455 Reviewed-by: Mark Freeman Reviewed-by: abner chenc Reviewed-by: Keith Randall Reviewed-by: Keith Randall LUCI-TryBot-Result: Go LUCI Auto-Submit: Keith Randall --- .../compile/internal/ssa/_gen/LOONG64.rules | 4 +- .../internal/ssa/_gen/LOONG64latelower.rules | 6 ++ src/cmd/compile/internal/ssa/config.go | 86 ++++++++++++++++++- .../compile/internal/ssa/rewriteLOONG64.go | 26 ++---- .../internal/ssa/rewriteLOONG64latelower.go | 29 +++++++ test/codegen/arithmetic.go | 4 + test/codegen/multiply.go | 60 +++++++++++++ 7 files changed, 190 insertions(+), 25 deletions(-) create mode 100644 src/cmd/compile/internal/ssa/_gen/LOONG64latelower.rules create mode 100644 src/cmd/compile/internal/ssa/rewriteLOONG64latelower.go diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64.rules b/src/cmd/compile/internal/ssa/_gen/LOONG64.rules index 3232af1e55..9d0ad0148f 100644 --- a/src/cmd/compile/internal/ssa/_gen/LOONG64.rules +++ b/src/cmd/compile/internal/ssa/_gen/LOONG64.rules @@ -750,10 +750,10 @@ (SRLVconst [rc] (MOVBUreg x)) && rc >= 8 => (MOVVconst [0]) // mul by constant -(MULV x (MOVVconst [-1])) => (NEGV x) (MULV _ (MOVVconst [0])) => (MOVVconst [0]) (MULV x (MOVVconst [1])) => x -(MULV x (MOVVconst [c])) && isPowerOfTwo(c) => (SLLVconst [log64(c)] x) + +(MULV x (MOVVconst [c])) && canMulStrengthReduce(config, c) => {mulStrengthReduce(v, x, c)} // div by constant (DIVVU x (MOVVconst [1])) => x diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64latelower.rules b/src/cmd/compile/internal/ssa/_gen/LOONG64latelower.rules new file mode 100644 index 0000000000..95844381c2 --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/LOONG64latelower.rules @@ -0,0 +1,6 @@ +// Copyright 2025 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Prefer addition when shifting left by one. +(SLLVconst [1] x) => (ADDV x x) diff --git a/src/cmd/compile/internal/ssa/config.go b/src/cmd/compile/internal/ssa/config.go index d4cd32a0d7..50ec2ec177 100644 --- a/src/cmd/compile/internal/ssa/config.go +++ b/src/cmd/compile/internal/ssa/config.go @@ -283,6 +283,8 @@ func NewConfig(arch string, types Types, ctxt *obj.Link, optimize, softfloat boo c.RegSize = 8 c.lowerBlock = rewriteBlockLOONG64 c.lowerValue = rewriteValueLOONG64 + c.lateLowerBlock = rewriteBlockLOONG64latelower + c.lateLowerValue = rewriteValueLOONG64latelower c.registers = registersLOONG64[:] c.gpRegMask = gpRegMaskLOONG64 c.fpRegMask = fpRegMaskLOONG64 @@ -562,6 +564,43 @@ func (c *Config) buildRecipes(arch string) { return m.Block.NewValue2I(m.Pos, OpARM64SUBshiftLL, m.Type, int64(i), x, y) }) } + case "loong64": + // - multiply is 4 cycles. + // - add/sub/shift are 1 cycle. + // On loong64, using a multiply also needs to load the constant into a register. + // TODO: figure out a happy medium. + mulCost = 45 + + // add + r(1, 1, 10, + func(m, x, y *Value) *Value { + return m.Block.NewValue2(m.Pos, OpLOONG64ADDV, m.Type, x, y) + }) + // neg + r(-1, 0, 10, + func(m, x, y *Value) *Value { + return m.Block.NewValue1(m.Pos, OpLOONG64NEGV, m.Type, x) + }) + // sub + r(1, -1, 10, + func(m, x, y *Value) *Value { + return m.Block.NewValue2(m.Pos, OpLOONG64SUBV, m.Type, x, y) + }) + + // regular shifts + for i := 1; i < 64; i++ { + c := 10 + if i == 1 { + // Prefer x<<1 over x+x. + // Note that we eventually reverse this decision in LOONG64latelower.rules, + // but this makes shift combining rules in LOONG64.rules simpler. + c-- + } + r(1<