From 9eb53ab9bc3b89d960c23ab47b3d7bc3fc201fd7 Mon Sep 17 00:00:00 2001 From: Brian Kessler Date: Tue, 14 Aug 2018 16:41:22 -0600 Subject: [PATCH] cmd/compile: intrinsify math/bits.Mul MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Add SSA rules to intrinsify Mul/Mul64 (AMD64 and ARM64). SSA rules for other functions and architectures are left as a future optimization. Benchmark results on AMD64/ARM64 before and after SSA implementation are below. amd64 name old time/op new time/op delta Add-4 1.78ns ± 0% 1.85ns ±12% ~ (p=0.397 n=4+5) Add32-4 1.71ns ± 1% 1.70ns ± 0% ~ (p=0.683 n=5+5) Add64-4 1.80ns ± 2% 1.77ns ± 0% -1.22% (p=0.048 n=5+5) Sub-4 1.78ns ± 0% 1.78ns ± 0% ~ (all equal) Sub32-4 1.78ns ± 1% 1.78ns ± 0% ~ (p=1.000 n=5+5) Sub64-4 1.78ns ± 1% 1.78ns ± 0% ~ (p=0.968 n=5+4) Mul-4 11.5ns ± 1% 1.8ns ± 2% -84.39% (p=0.008 n=5+5) Mul32-4 1.39ns ± 0% 1.38ns ± 3% ~ (p=0.175 n=5+5) Mul64-4 6.85ns ± 1% 1.78ns ± 1% -73.97% (p=0.008 n=5+5) Div-4 57.1ns ± 1% 56.7ns ± 0% ~ (p=0.087 n=5+5) Div32-4 18.0ns ± 0% 18.0ns ± 0% ~ (all equal) Div64-4 56.4ns ±10% 53.6ns ± 1% ~ (p=0.071 n=5+5) arm64 name old time/op new time/op delta Add-96 5.51ns ± 0% 5.51ns ± 0% ~ (all equal) Add32-96 5.51ns ± 0% 5.51ns ± 0% ~ (all equal) Add64-96 5.52ns ± 0% 5.51ns ± 0% ~ (p=0.444 n=5+5) Sub-96 5.51ns ± 0% 5.51ns ± 0% ~ (all equal) Sub32-96 5.51ns ± 0% 5.51ns ± 0% ~ (all equal) Sub64-96 5.51ns ± 0% 5.51ns ± 0% ~ (all equal) Mul-96 34.6ns ± 0% 5.0ns ± 0% -85.52% (p=0.008 n=5+5) Mul32-96 4.51ns ± 0% 4.51ns ± 0% ~ (all equal) Mul64-96 21.1ns ± 0% 5.0ns ± 0% -76.26% (p=0.008 n=5+5) Div-96 64.7ns ± 0% 64.7ns ± 0% ~ (all equal) Div32-96 17.0ns ± 0% 17.0ns ± 0% ~ (all equal) Div64-96 53.1ns ± 0% 53.1ns ± 0% ~ (all equal) Updates #24813 Change-Id: I9bda6d2102f65cae3d436a2087b47ed8bafeb068 Reviewed-on: https://go-review.googlesource.com/129415 Run-TryBot: Keith Randall TryBot-Result: Gobot Gobot Reviewed-by: Keith Randall --- src/cmd/compile/internal/gc/ssa.go | 6 ++++++ test/codegen/mathbits.go | 16 ++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go index 2ee966d890..8df8023d18 100644 --- a/src/cmd/compile/internal/gc/ssa.go +++ b/src/cmd/compile/internal/gc/ssa.go @@ -3435,6 +3435,12 @@ func init() { addF("math/bits", "OnesCount", makeOnesCountAMD64(ssa.OpPopCount64, ssa.OpPopCount32), sys.AMD64) + alias("math/bits", "Mul", "math/bits", "Mul64", sys.ArchAMD64, sys.ArchARM64) + addF("math/bits", "Mul64", + func(s *state, n *Node, args []*ssa.Value) *ssa.Value { + return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[TUINT64], types.Types[TUINT64]), args[0], args[1]) + }, + sys.AMD64, sys.ARM64) /******** sync/atomic ********/ diff --git a/test/codegen/mathbits.go b/test/codegen/mathbits.go index 28354ed651..834b08f101 100644 --- a/test/codegen/mathbits.go +++ b/test/codegen/mathbits.go @@ -302,3 +302,19 @@ func IterateBits8(n uint8) int { } return i } + +// --------------- // +// bits.Mul* // +// --------------- // + +func Mul(x, y uint) (hi, lo uint) { + // amd64:"MULQ" + // arm64:"UMULH","MUL" + return bits.Mul(x, y) +} + +func Mul64(x, y uint64) (hi, lo uint64) { + // amd64:"MULQ" + // arm64:"UMULH","MUL" + return bits.Mul64(x, y) +} -- 2.50.0