From: Alexandru Moșoi Date: Thu, 24 Mar 2016 21:46:37 +0000 (+0100) Subject: cmd/compile: generalize strength reduction of mulq X-Git-Tag: go1.7beta1~1003 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=d8f1f8d8568d680be0845379d477264fd09324c3;p=gostls13.git cmd/compile: generalize strength reduction of mulq * This is an improved version of an earlier patch. * Verified with gcc up to 100. * Limited to two instructions based on costs from https://gmplib.org/~tege/x86-timing.pdf Change-Id: Ib7c37de6fd8e0ba554459b15c7409508cbcf6728 Reviewed-on: https://go-review.googlesource.com/21103 Reviewed-by: Keith Randall Run-TryBot: Alexandru Moșoi TryBot-Result: Gobot Gobot --- diff --git a/src/cmd/compile/internal/ssa/gen/AMD64.rules b/src/cmd/compile/internal/ssa/gen/AMD64.rules index 7ed2027135..0b1ce13e9e 100644 --- a/src/cmd/compile/internal/ssa/gen/AMD64.rules +++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules @@ -583,14 +583,35 @@ (CMPB (MOVBconst [c]) x) -> (InvertFlags (CMPBconst x [c])) // strength reduction +// Assumes that the following costs from https://gmplib.org/~tege/x86-timing.pdf: +// 1 - addq, shlq, leaq, negq +// 3 - imulq +// This limits the rewrites to two instructions. 
+// TODO: 27, 81 (MULQconst [-1] x) -> (NEGQ x) (MULQconst [0] _) -> (MOVQconst [0]) (MULQconst [1] x) -> x (MULQconst [3] x) -> (LEAQ2 x x) (MULQconst [5] x) -> (LEAQ4 x x) +(MULQconst [7] x) -> (LEAQ8 (NEGQ x) x) (MULQconst [9] x) -> (LEAQ8 x x) -(MULQconst [24] x) -> (SHLQconst [3] (LEAQ2 x x)) // Useful for [][]T accesses +(MULQconst [11] x) -> (LEAQ2 x (LEAQ4 x x)) +(MULQconst [13] x) -> (LEAQ4 x (LEAQ2 x x)) +(MULQconst [21] x) -> (LEAQ4 x (LEAQ4 x x)) +(MULQconst [25] x) -> (LEAQ8 x (LEAQ2 x x)) +(MULQconst [37] x) -> (LEAQ4 x (LEAQ8 x x)) +(MULQconst [41] x) -> (LEAQ8 x (LEAQ4 x x)) +(MULQconst [73] x) -> (LEAQ8 x (LEAQ8 x x)) + (MULQconst [c] x) && isPowerOfTwo(c) -> (SHLQconst [log2(c)] x) +(MULQconst [c] x) && isPowerOfTwo(c+1) && c >= 15 -> (SUBQ (SHLQconst [log2(c+1)] x) x) +(MULQconst [c] x) && isPowerOfTwo(c-1) && c >= 17 -> (LEAQ1 (SHLQconst [log2(c-1)] x) x) +(MULQconst [c] x) && isPowerOfTwo(c-2) && c >= 34 -> (LEAQ2 (SHLQconst [log2(c-2)] x) x) +(MULQconst [c] x) && isPowerOfTwo(c-4) && c >= 68 -> (LEAQ4 (SHLQconst [log2(c-4)] x) x) +(MULQconst [c] x) && isPowerOfTwo(c-8) && c >= 136 -> (LEAQ8 (SHLQconst [log2(c-8)] x) x) +(MULQconst [c] x) && c%3 == 0 && isPowerOfTwo(c/3)-> (SHLQconst [log2(c/3)] (LEAQ2 x x)) +(MULQconst [c] x) && c%5 == 0 && isPowerOfTwo(c/5)-> (SHLQconst [log2(c/5)] (LEAQ4 x x)) +(MULQconst [c] x) && c%9 == 0 && isPowerOfTwo(c/9)-> (SHLQconst [log2(c/9)] (LEAQ8 x x)) // combine add/shift into LEAQ (ADDQ x (SHLQconst [3] y)) -> (LEAQ8 x y) diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 48257f5402..99ffb66f65 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -9392,6 +9392,21 @@ func rewriteValueAMD64_OpAMD64MULQconst(v *Value, config *Config) bool { v.AddArg(x) return true } + // match: (MULQconst [7] x) + // cond: + // result: (LEAQ8 (NEGQ x) x) + for { + if v.AuxInt != 7 { + break + } + x := v.Args[0] + 
v.reset(OpAMD64LEAQ8) + v0 := b.NewValue0(v.Line, OpAMD64NEGQ, v.Type) + v0.AddArg(x) + v.AddArg(v0) + v.AddArg(x) + return true + } // match: (MULQconst [9] x) // cond: // result: (LEAQ8 x x) @@ -9405,22 +9420,118 @@ func rewriteValueAMD64_OpAMD64MULQconst(v *Value, config *Config) bool { v.AddArg(x) return true } - // match: (MULQconst [24] x) + // match: (MULQconst [11] x) // cond: - // result: (SHLQconst [3] (LEAQ2 x x)) + // result: (LEAQ2 x (LEAQ4 x x)) for { - if v.AuxInt != 24 { + if v.AuxInt != 11 { break } x := v.Args[0] - v.reset(OpAMD64SHLQconst) - v.AuxInt = 3 + v.reset(OpAMD64LEAQ2) + v.AddArg(x) + v0 := b.NewValue0(v.Line, OpAMD64LEAQ4, v.Type) + v0.AddArg(x) + v0.AddArg(x) + v.AddArg(v0) + return true + } + // match: (MULQconst [13] x) + // cond: + // result: (LEAQ4 x (LEAQ2 x x)) + for { + if v.AuxInt != 13 { + break + } + x := v.Args[0] + v.reset(OpAMD64LEAQ4) + v.AddArg(x) + v0 := b.NewValue0(v.Line, OpAMD64LEAQ2, v.Type) + v0.AddArg(x) + v0.AddArg(x) + v.AddArg(v0) + return true + } + // match: (MULQconst [21] x) + // cond: + // result: (LEAQ4 x (LEAQ4 x x)) + for { + if v.AuxInt != 21 { + break + } + x := v.Args[0] + v.reset(OpAMD64LEAQ4) + v.AddArg(x) + v0 := b.NewValue0(v.Line, OpAMD64LEAQ4, v.Type) + v0.AddArg(x) + v0.AddArg(x) + v.AddArg(v0) + return true + } + // match: (MULQconst [25] x) + // cond: + // result: (LEAQ8 x (LEAQ2 x x)) + for { + if v.AuxInt != 25 { + break + } + x := v.Args[0] + v.reset(OpAMD64LEAQ8) + v.AddArg(x) v0 := b.NewValue0(v.Line, OpAMD64LEAQ2, v.Type) v0.AddArg(x) v0.AddArg(x) v.AddArg(v0) return true } + // match: (MULQconst [37] x) + // cond: + // result: (LEAQ4 x (LEAQ8 x x)) + for { + if v.AuxInt != 37 { + break + } + x := v.Args[0] + v.reset(OpAMD64LEAQ4) + v.AddArg(x) + v0 := b.NewValue0(v.Line, OpAMD64LEAQ8, v.Type) + v0.AddArg(x) + v0.AddArg(x) + v.AddArg(v0) + return true + } + // match: (MULQconst [41] x) + // cond: + // result: (LEAQ8 x (LEAQ4 x x)) + for { + if v.AuxInt != 41 { + break + } + x := 
v.Args[0] + v.reset(OpAMD64LEAQ8) + v.AddArg(x) + v0 := b.NewValue0(v.Line, OpAMD64LEAQ4, v.Type) + v0.AddArg(x) + v0.AddArg(x) + v.AddArg(v0) + return true + } + // match: (MULQconst [73] x) + // cond: + // result: (LEAQ8 x (LEAQ8 x x)) + for { + if v.AuxInt != 73 { + break + } + x := v.Args[0] + v.reset(OpAMD64LEAQ8) + v.AddArg(x) + v0 := b.NewValue0(v.Line, OpAMD64LEAQ8, v.Type) + v0.AddArg(x) + v0.AddArg(x) + v.AddArg(v0) + return true + } // match: (MULQconst [c] x) // cond: isPowerOfTwo(c) // result: (SHLQconst [log2(c)] x) @@ -9435,6 +9546,142 @@ func rewriteValueAMD64_OpAMD64MULQconst(v *Value, config *Config) bool { v.AddArg(x) return true } + // match: (MULQconst [c] x) + // cond: isPowerOfTwo(c+1) && c >= 15 + // result: (SUBQ (SHLQconst [log2(c+1)] x) x) + for { + c := v.AuxInt + x := v.Args[0] + if !(isPowerOfTwo(c+1) && c >= 15) { + break + } + v.reset(OpAMD64SUBQ) + v0 := b.NewValue0(v.Line, OpAMD64SHLQconst, v.Type) + v0.AuxInt = log2(c + 1) + v0.AddArg(x) + v.AddArg(v0) + v.AddArg(x) + return true + } + // match: (MULQconst [c] x) + // cond: isPowerOfTwo(c-1) && c >= 17 + // result: (LEAQ1 (SHLQconst [log2(c-1)] x) x) + for { + c := v.AuxInt + x := v.Args[0] + if !(isPowerOfTwo(c-1) && c >= 17) { + break + } + v.reset(OpAMD64LEAQ1) + v0 := b.NewValue0(v.Line, OpAMD64SHLQconst, v.Type) + v0.AuxInt = log2(c - 1) + v0.AddArg(x) + v.AddArg(v0) + v.AddArg(x) + return true + } + // match: (MULQconst [c] x) + // cond: isPowerOfTwo(c-2) && c >= 34 + // result: (LEAQ2 (SHLQconst [log2(c-2)] x) x) + for { + c := v.AuxInt + x := v.Args[0] + if !(isPowerOfTwo(c-2) && c >= 34) { + break + } + v.reset(OpAMD64LEAQ2) + v0 := b.NewValue0(v.Line, OpAMD64SHLQconst, v.Type) + v0.AuxInt = log2(c - 2) + v0.AddArg(x) + v.AddArg(v0) + v.AddArg(x) + return true + } + // match: (MULQconst [c] x) + // cond: isPowerOfTwo(c-4) && c >= 68 + // result: (LEAQ4 (SHLQconst [log2(c-4)] x) x) + for { + c := v.AuxInt + x := v.Args[0] + if !(isPowerOfTwo(c-4) && c >= 68) { + break + } 
+ v.reset(OpAMD64LEAQ4) + v0 := b.NewValue0(v.Line, OpAMD64SHLQconst, v.Type) + v0.AuxInt = log2(c - 4) + v0.AddArg(x) + v.AddArg(v0) + v.AddArg(x) + return true + } + // match: (MULQconst [c] x) + // cond: isPowerOfTwo(c-8) && c >= 136 + // result: (LEAQ8 (SHLQconst [log2(c-8)] x) x) + for { + c := v.AuxInt + x := v.Args[0] + if !(isPowerOfTwo(c-8) && c >= 136) { + break + } + v.reset(OpAMD64LEAQ8) + v0 := b.NewValue0(v.Line, OpAMD64SHLQconst, v.Type) + v0.AuxInt = log2(c - 8) + v0.AddArg(x) + v.AddArg(v0) + v.AddArg(x) + return true + } + // match: (MULQconst [c] x) + // cond: c%3 == 0 && isPowerOfTwo(c/3) + // result: (SHLQconst [log2(c/3)] (LEAQ2 x x)) + for { + c := v.AuxInt + x := v.Args[0] + if !(c%3 == 0 && isPowerOfTwo(c/3)) { + break + } + v.reset(OpAMD64SHLQconst) + v.AuxInt = log2(c / 3) + v0 := b.NewValue0(v.Line, OpAMD64LEAQ2, v.Type) + v0.AddArg(x) + v0.AddArg(x) + v.AddArg(v0) + return true + } + // match: (MULQconst [c] x) + // cond: c%5 == 0 && isPowerOfTwo(c/5) + // result: (SHLQconst [log2(c/5)] (LEAQ4 x x)) + for { + c := v.AuxInt + x := v.Args[0] + if !(c%5 == 0 && isPowerOfTwo(c/5)) { + break + } + v.reset(OpAMD64SHLQconst) + v.AuxInt = log2(c / 5) + v0 := b.NewValue0(v.Line, OpAMD64LEAQ4, v.Type) + v0.AddArg(x) + v0.AddArg(x) + v.AddArg(v0) + return true + } + // match: (MULQconst [c] x) + // cond: c%9 == 0 && isPowerOfTwo(c/9) + // result: (SHLQconst [log2(c/9)] (LEAQ8 x x)) + for { + c := v.AuxInt + x := v.Args[0] + if !(c%9 == 0 && isPowerOfTwo(c/9)) { + break + } + v.reset(OpAMD64SHLQconst) + v.AuxInt = log2(c / 9) + v0 := b.NewValue0(v.Line, OpAMD64LEAQ8, v.Type) + v0.AddArg(x) + v0.AddArg(x) + v.AddArg(v0) + return true + } // match: (MULQconst [c] (MOVQconst [d])) // cond: // result: (MOVQconst [c*d]) diff --git a/test/strength.go b/test/strength.go new file mode 100644 index 0000000000..94d589c240 --- /dev/null +++ b/test/strength.go @@ -0,0 +1,45 @@ +// runoutput + +// Copyright 2016 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Generate test of strength reduction for multiplications +// with constants. Especially useful for amd64/386. + +package main + +import "fmt" + +func testMul(fact, bits int) string { + n := fmt.Sprintf("testMul_%d_%d", fact, bits) + fmt.Printf("func %s(s int%d) {\n", n, bits) + + want := 0 + for i := 0; i < 200; i++ { + fmt.Printf(` if want, got := int%d(%d), s*%d; want != got { + failed = true + fmt.Printf("got %d * %%d == %%d, wanted %d\n", s, got) + } +`, bits, want, i, i, want) + want += fact + } + + fmt.Printf("}\n") + return fmt.Sprintf("%s(%d)", n, fact) +} + +func main() { + fmt.Printf("package main\n") + fmt.Printf("import \"fmt\"\n") + fmt.Printf("var failed = false\n") + + f1 := testMul(17, 32) + f2 := testMul(131, 64) + + fmt.Printf("func main() {\n") + fmt.Println(f1) + fmt.Println(f2) + fmt.Printf("if failed {\n panic(\"multiplication failed\")\n}\n") + fmt.Printf("}\n") +}