Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: improve fractional word zeroing
author Balaram Makam <bmakam.qdt@qualcommdatacenter.com>
Fri, 23 Feb 2018 18:28:48 +0000 (13:28 -0500)
committer Cherry Zhang <cherryyz@google.com>
Wed, 28 Feb 2018 23:28:39 +0000 (23:28 +0000)
This change improves fractional word zeroing by
using overlapping MOVDs for the fractions.

Performance of go1 benchmarks on Amberwing was all noise:
name                   old time/op    new time/op    delta
RegexpMatchEasy0_32       247ns ± 0%     246ns ± 0%  -0.40%  (p=0.008 n=5+5)
RegexpMatchEasy0_1K       581ns ± 0%     579ns ± 0%  -0.34%  (p=0.000 n=5+4)
RegexpMatchEasy1_32       244ns ± 0%     242ns ± 0%    ~     (p=0.079 n=4+5)
RegexpMatchEasy1_1K       804ns ± 0%     805ns ± 0%    ~     (p=0.238 n=5+4)
RegexpMatchMedium_32      313ns ± 0%     311ns ± 0%  -0.64%  (p=0.008 n=5+5)
RegexpMatchMedium_1K     52.2µs ± 0%    51.9µs ± 0%  -0.52%  (p=0.016 n=5+4)
RegexpMatchHard_32       2.75µs ± 0%    2.74µs ± 0%    ~     (p=0.603 n=5+5)
RegexpMatchHard_1K       78.8µs ± 0%    78.9µs ± 0%  +0.05%  (p=0.008 n=5+5)
FmtFprintfEmpty          58.6ns ± 0%    58.6ns ± 0%    ~     (p=0.159 n=5+5)
FmtFprintfString          118ns ± 0%     119ns ± 0%  +0.85%  (p=0.008 n=5+5)
FmtFprintfInt             119ns ± 0%     123ns ± 0%  +3.36%  (p=0.016 n=5+4)
FmtFprintfIntInt          192ns ± 0%     200ns ± 0%  +4.17%  (p=0.008 n=5+5)
FmtFprintfPrefixedInt     224ns ± 0%     209ns ± 0%  -6.70%  (p=0.008 n=5+5)
FmtFprintfFloat           335ns ± 0%     335ns ± 0%    ~     (all equal)
FmtManyArgs               775ns ± 0%     811ns ± 1%  +4.67%  (p=0.016 n=4+5)
Gzip                      437ms ± 0%     438ms ± 0%  +0.19%  (p=0.008 n=5+5)
HTTPClientServer         88.7µs ± 1%    90.3µs ± 1%  +1.75%  (p=0.016 n=5+5)
JSONEncode               20.1ms ± 1%    20.1ms ± 0%    ~     (p=1.000 n=5+5)
JSONDecode               94.7ms ± 1%    94.8ms ± 1%    ~     (p=0.548 n=5+5)
GobDecode                12.8ms ± 1%    12.8ms ± 1%    ~     (p=0.548 n=5+5)
GobEncode                12.1ms ± 0%    12.1ms ± 0%    ~     (p=0.151 n=5+5)
Mandelbrot200            5.37ms ± 0%    5.37ms ± 0%  -0.03%  (p=0.008 n=5+5)
TimeParse                 450ns ± 0%     451ns ± 1%    ~     (p=0.635 n=4+5)
TimeFormat                485ns ± 0%     484ns ± 0%    ~     (p=0.508 n=5+5)
Template                 90.4ms ± 0%    90.2ms ± 0%  -0.24%  (p=0.016 n=5+5)
GoParse                  5.98ms ± 0%    5.98ms ± 0%    ~     (p=1.000 n=5+5)
BinaryTree17              11.8s ± 0%     11.8s ± 0%    ~     (p=0.841 n=5+5)
Revcomp                   669ms ± 0%     669ms ± 0%    ~     (p=0.310 n=5+5)
Fannkuch11                3.28s ± 0%     3.34s ± 0%  +1.64%  (p=0.008 n=5+5)

name                   old speed      new speed      delta
RegexpMatchEasy0_32     129MB/s ± 0%   130MB/s ± 0%  +0.30%  (p=0.016 n=4+5)
RegexpMatchEasy0_1K    1.76GB/s ± 0%  1.77GB/s ± 0%  +0.27%  (p=0.016 n=5+4)
RegexpMatchEasy1_32     131MB/s ± 0%   132MB/s ± 0%  +0.71%  (p=0.016 n=4+5)
RegexpMatchEasy1_1K    1.27GB/s ± 0%  1.27GB/s ± 0%  -0.17%  (p=0.016 n=5+4)
RegexpMatchMedium_32   3.19MB/s ± 0%  3.21MB/s ± 0%  +0.63%  (p=0.008 n=5+5)
RegexpMatchMedium_1K   19.6MB/s ± 0%  19.7MB/s ± 0%  +0.52%  (p=0.016 n=5+4)
RegexpMatchHard_32     11.7MB/s ± 0%  11.7MB/s ± 0%    ~     (p=0.643 n=5+5)
RegexpMatchHard_1K     13.0MB/s ± 0%  13.0MB/s ± 0%    ~     (p=0.079 n=4+5)
Gzip                   44.4MB/s ± 0%  44.3MB/s ± 0%  -0.19%  (p=0.008 n=5+5)
JSONEncode             96.3MB/s ± 1%  96.4MB/s ± 0%    ~     (p=1.000 n=5+5)
JSONDecode             20.5MB/s ± 1%  20.5MB/s ± 1%    ~     (p=0.460 n=5+5)
GobDecode              60.1MB/s ± 1%  59.9MB/s ± 1%    ~     (p=0.548 n=5+5)
GobEncode              63.5MB/s ± 0%  63.7MB/s ± 0%    ~     (p=0.135 n=5+5)
Template               21.5MB/s ± 0%  21.5MB/s ± 0%  +0.24%  (p=0.016 n=5+5)
GoParse                9.68MB/s ± 0%  9.69MB/s ± 0%    ~     (p=0.786 n=5+5)
Revcomp                 380MB/s ± 0%   380MB/s ± 0%    ~     (p=0.310 n=5+5)
Change-Id: I596eee6421cdbad1a0189cdb9fe0628bba534eaf
Reviewed-on: https://go-review.googlesource.com/96775
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>

src/cmd/compile/internal/gc/asm_test.go
src/cmd/compile/internal/ssa/gen/ARM64.rules
src/cmd/compile/internal/ssa/rewriteARM64.go

index c45615ae3a08123da2c5f979c4910b97c9fc8521..c2fc9862f3b08a772b7a3fb38a47f0da5773dbae 100644 (file)
@@ -3245,6 +3245,24 @@ var linuxARM64Tests = []*asmTest{
                pos: []string{"STP"},
                neg: []string{"MOVB", "MOVH"},
        },
+       {
+               fn: `
+               func $(a *[39]byte) {
+                       *a = [39]byte{}
+               }
+               `,
+               pos: []string{"MOVD"},
+               neg: []string{"MOVB", "MOVH", "MOVW"},
+       },
+       {
+               fn: `
+               func $(a *[30]byte) {
+                       *a = [30]byte{}
+               }
+               `,
+               pos: []string{"STP"},
+               neg: []string{"MOVB", "MOVH", "MOVW"},
+       },
 }
 
 var linuxMIPSTests = []*asmTest{
index 9f6ef57d434b94134e5947b03f8e905483864926..c6057f246130b6481575f227827fcd6a83a63f90 100644 (file)
                                (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem))))
 
 // strip off fractional word zeroing
-(Zero [s] ptr mem) && s%16 != 0 && s > 16 ->
-       (Zero [s-s%16]
-               (OffPtr <ptr.Type> ptr [s%16])
-               (Zero [s%16] ptr mem))
+(Zero [s] ptr mem) && s%16 != 0 && s%16 <= 8 && s > 16 ->
+       (Zero [8]
+               (OffPtr <ptr.Type> ptr [s-8])
+               (Zero [s-s%16] ptr mem))
+(Zero [s] ptr mem) && s%16 != 0 && s%16 > 8 && s > 16 ->
+       (Zero [16]
+               (OffPtr <ptr.Type> ptr [s-16])
+               (Zero [s-s%16] ptr mem))
 
 // medium zeroing uses a duff device
 // 4, 16, and 64 are magic constants, see runtime/mkduff.go
index 1bb21d8a2c30f9b9f178d2ac4f8192a4f5ec385d..53331eda31218f8b964b63cf0c0c949eb4e44787 100644 (file)
@@ -18551,24 +18551,48 @@ func rewriteValueARM64_OpZero_20(v *Value) bool {
        config := b.Func.Config
        _ = config
        // match: (Zero [s] ptr mem)
-       // cond: s%16 != 0 && s > 16
-       // result: (Zero [s-s%16] (OffPtr <ptr.Type> ptr [s%16]) (Zero [s%16] ptr mem))
+       // cond: s%16 != 0 && s%16 <= 8 && s > 16
+       // result: (Zero [8] (OffPtr <ptr.Type> ptr [s-8]) (Zero [s-s%16] ptr mem))
        for {
                s := v.AuxInt
                _ = v.Args[1]
                ptr := v.Args[0]
                mem := v.Args[1]
-               if !(s%16 != 0 && s > 16) {
+               if !(s%16 != 0 && s%16 <= 8 && s > 16) {
                        break
                }
                v.reset(OpZero)
-               v.AuxInt = s - s%16
+               v.AuxInt = 8
+               v0 := b.NewValue0(v.Pos, OpOffPtr, ptr.Type)
+               v0.AuxInt = s - 8
+               v0.AddArg(ptr)
+               v.AddArg(v0)
+               v1 := b.NewValue0(v.Pos, OpZero, types.TypeMem)
+               v1.AuxInt = s - s%16
+               v1.AddArg(ptr)
+               v1.AddArg(mem)
+               v.AddArg(v1)
+               return true
+       }
+       // match: (Zero [s] ptr mem)
+       // cond: s%16 != 0 && s%16 > 8 && s > 16
+       // result: (Zero [16] (OffPtr <ptr.Type> ptr [s-16]) (Zero [s-s%16] ptr mem))
+       for {
+               s := v.AuxInt
+               _ = v.Args[1]
+               ptr := v.Args[0]
+               mem := v.Args[1]
+               if !(s%16 != 0 && s%16 > 8 && s > 16) {
+                       break
+               }
+               v.reset(OpZero)
+               v.AuxInt = 16
                v0 := b.NewValue0(v.Pos, OpOffPtr, ptr.Type)
-               v0.AuxInt = s % 16
+               v0.AuxInt = s - 16
                v0.AddArg(ptr)
                v.AddArg(v0)
                v1 := b.NewValue0(v.Pos, OpZero, types.TypeMem)
-               v1.AuxInt = s % 16
+               v1.AuxInt = s - s%16
                v1.AddArg(ptr)
                v1.AddArg(mem)
                v.AddArg(v1)