When lowering some ops on amd64 we generate additional NEGQ.
This may result in code like this:
NEGQ R12
NEGQ R12
Optimize it away. Gain is not significant, about ~0.5% gain in geomean
in compress/flate and 200 bytes codesize reduction in go tool.
Full results below:
name old time/op new time/op delta
Encode/Digits/Huffman/1e4-6 65.8µs ± 0% 65.7µs ± 0% -0.21% (p=0.010 n=10+9)
Encode/Digits/Huffman/1e5-6 633µs ± 0% 632µs ± 0% ~ (p=0.370 n=8+9)
Encode/Digits/Huffman/1e6-6 6.30ms ± 1% 6.29ms ± 1% ~ (p=0.796 n=10+10)
Encode/Digits/Speed/1e4-6 281µs ± 0% 280µs ± 1% -0.34% (p=0.043 n=8+10)
Encode/Digits/Speed/1e5-6 2.66ms ± 0% 2.66ms ± 0% -0.09% (p=0.043 n=10+10)
Encode/Digits/Speed/1e6-6 26.3ms ± 0% 26.3ms ± 0% ~ (p=0.190 n=10+10)
Encode/Digits/Default/1e4-6 554µs ± 0% 557µs ± 0% +0.46% (p=0.001 n=9+10)
Encode/Digits/Default/1e5-6 8.63ms ± 1% 8.62ms ± 1% ~ (p=0.912 n=10+10)
Encode/Digits/Default/1e6-6 92.7ms ± 1% 92.2ms ± 1% ~ (p=0.052 n=10+10)
Encode/Digits/Compression/1e4-6 558µs ± 1% 557µs ± 1% ~ (p=0.481 n=10+10)
Encode/Digits/Compression/1e5-6 8.58ms ± 0% 8.61ms ± 1% ~ (p=0.315 n=8+10)
Encode/Digits/Compression/1e6-6 92.3ms ± 1% 92.4ms ± 1% ~ (p=0.971 n=10+10)
Encode/Twain/Huffman/1e4-6 89.5µs ± 0% 89.0µs ± 1% -0.48% (p=0.001 n=9+9)
Encode/Twain/Huffman/1e5-6 727µs ± 1% 728µs ± 0% ~ (p=0.604 n=10+9)
Encode/Twain/Huffman/1e6-6 7.21ms ± 0% 7.19ms ± 1% ~ (p=0.696 n=8+10)
Encode/Twain/Speed/1e4-6 320µs ± 1% 321µs ± 1% ~ (p=0.353 n=10+10)
Encode/Twain/Speed/1e5-6 2.63ms ± 0% 2.62ms ± 1% -0.33% (p=0.016 n=8+10)
Encode/Twain/Speed/1e6-6 25.8ms ± 0% 25.8ms ± 0% ~ (p=0.360 n=10+8)
Encode/Twain/Default/1e4-6 677µs ± 1% 671µs ± 1% -0.88% (p=0.000 n=10+10)
Encode/Twain/Default/1e5-6 10.5ms ± 1% 10.3ms ± 0% -2.06% (p=0.000 n=10+10)
Encode/Twain/Default/1e6-6 113ms ± 1% 111ms ± 1% -1.96% (p=0.000 n=10+9)
Encode/Twain/Compression/1e4-6 688µs ± 0% 679µs ± 1% -1.30% (p=0.000 n=7+10)
Encode/Twain/Compression/1e5-6 11.6ms ± 1% 11.3ms ± 1% -2.10% (p=0.000 n=10+10)
Encode/Twain/Compression/1e6-6 126ms ± 1% 124ms ± 0% -1.57% (p=0.000 n=10+10)
[Geo mean] 3.45ms 3.44ms -0.46%
name old speed new speed delta
Encode/Digits/Huffman/1e4-6 152MB/s ± 0% 152MB/s ± 0% +0.21% (p=0.009 n=10+9)
Encode/Digits/Huffman/1e5-6 158MB/s ± 0% 158MB/s ± 0% ~ (p=0.336 n=8+9)
Encode/Digits/Huffman/1e6-6 159MB/s ± 1% 159MB/s ± 1% ~ (p=0.781 n=10+10)
Encode/Digits/Speed/1e4-6 35.6MB/s ± 0% 35.7MB/s ± 1% +0.34% (p=0.020 n=8+10)
Encode/Digits/Speed/1e5-6 37.6MB/s ± 0% 37.7MB/s ± 0% +0.09% (p=0.049 n=10+10)
Encode/Digits/Speed/1e6-6 38.0MB/s ± 0% 38.0MB/s ± 0% ~ (p=0.146 n=10+10)
Encode/Digits/Default/1e4-6 18.0MB/s ± 0% 18.0MB/s ± 0% -0.45% (p=0.002 n=9+10)
Encode/Digits/Default/1e5-6 11.6MB/s ± 1% 11.6MB/s ± 1% ~ (p=0.644 n=10+10)
Encode/Digits/Default/1e6-6 10.8MB/s ± 1% 10.8MB/s ± 1% +0.51% (p=0.044 n=10+10)
Encode/Digits/Compression/1e4-6 17.9MB/s ± 1% 17.9MB/s ± 1% ~ (p=0.468 n=10+10)
Encode/Digits/Compression/1e5-6 11.7MB/s ± 0% 11.6MB/s ± 1% ~ (p=0.322 n=8+10)
Encode/Digits/Compression/1e6-6 10.8MB/s ± 1% 10.8MB/s ± 1% ~ (p=0.983 n=10+10)
Encode/Twain/Huffman/1e4-6 112MB/s ± 0% 112MB/s ± 1% +0.42% (p=0.002 n=8+9)
Encode/Twain/Huffman/1e5-6 138MB/s ± 1% 137MB/s ± 0% ~ (p=0.616 n=10+9)
Encode/Twain/Huffman/1e6-6 139MB/s ± 0% 139MB/s ± 1% ~ (p=0.652 n=8+10)
Encode/Twain/Speed/1e4-6 31.3MB/s ± 1% 31.2MB/s ± 1% ~ (p=0.342 n=10+10)
Encode/Twain/Speed/1e5-6 38.0MB/s ± 0% 38.1MB/s ± 1% +0.33% (p=0.011 n=8+10)
Encode/Twain/Speed/1e6-6 38.8MB/s ± 0% 38.7MB/s ± 0% ~ (p=0.325 n=10+8)
Encode/Twain/Default/1e4-6 14.8MB/s ± 1% 14.9MB/s ± 1% +0.88% (p=0.000 n=10+10)
Encode/Twain/Default/1e5-6 9.48MB/s ± 1% 9.68MB/s ± 0% +2.11% (p=0.000 n=10+10)
Encode/Twain/Default/1e6-6 8.86MB/s ± 1% 9.03MB/s ± 1% +1.97% (p=0.000 n=10+9)
Encode/Twain/Compression/1e4-6 14.5MB/s ± 0% 14.7MB/s ± 1% +1.31% (p=0.000 n=7+10)
Encode/Twain/Compression/1e5-6 8.63MB/s ± 1% 8.82MB/s ± 1% +2.17% (p=0.000 n=10+10)
Encode/Twain/Compression/1e6-6 7.92MB/s ± 1% 8.05MB/s ± 1% +1.59% (p=0.000 n=10+10)
[Geo mean] 29.0MB/s 29.1MB/s +0.47%
// symSizeComp `which go` go_old:
section differences:
global text (code) = 203 bytes (0.005131%)
read-only data = 1 bytes (0.000057%)
Total difference 204 bytes (0.003297%)
Change-Id: Ie2cdfa1216472d78694fff44d215b3b8e71cf7bf
Reviewed-on: https://go-review.googlesource.com/102277
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
// If we cared, we might do:
// (ANDLconst <t> [c] x) && t.Size()==1 && int8(x)==0 -> (MOVLconst [0])
+// Remove redundant ops
+// Not in generic rules, because they may appear after lowering e. g. Slicemask
+(NEG(Q|L) (NEG(Q|L) x)) -> x
+
// Convert constant subtracts to constant adds
(SUBQconst [c] x) && c != -(1<<31) -> (ADDQconst [-c] x)
(SUBLconst [c] x) -> (ADDLconst [int64(int32(-c))] x)
return false
}
func rewriteValueAMD64_OpAMD64NEGL_0(v *Value) bool {
+ // match: (NEGL (NEGL x))
+ // cond:
+ // result: x
+ for {
+ v_0 := v.Args[0]
+ if v_0.Op != OpAMD64NEGL {
+ break
+ }
+ x := v_0.Args[0]
+ v.reset(OpCopy)
+ v.Type = x.Type
+ v.AddArg(x)
+ return true
+ }
// match: (NEGL (MOVLconst [c]))
// cond:
// result: (MOVLconst [int64(int32(-c))])
return false
}
func rewriteValueAMD64_OpAMD64NEGQ_0(v *Value) bool {
+ // match: (NEGQ (NEGQ x))
+ // cond:
+ // result: x
+ for {
+ v_0 := v.Args[0]
+ if v_0.Op != OpAMD64NEGQ {
+ break
+ }
+ x := v_0.Args[0]
+ v.reset(OpCopy)
+ v.Type = x.Type
+ v.AddArg(x)
+ return true
+ }
// match: (NEGQ (MOVQconst [c]))
// cond:
// result: (MOVQconst [-c])