]> Cypherpunks repositories - gostls13.git/commitdiff
cmd/internal/obj/wasm: cache SP in a local
authorRichard Musiol <mail@richard-musiol.de>
Fri, 26 Apr 2019 23:16:10 +0000 (01:16 +0200)
committerRichard Musiol <neelance@gmail.com>
Tue, 30 Apr 2019 11:29:43 +0000 (11:29 +0000)
We use Wasm global variables extensively for simulating
registers, especially SP. V8 does not handle global variables
efficiently.

This CL reduces global variable accesses by caching the global SP
in a local variable in each function. The local cache is set on
function entry and updated after each call (where the stack could
have moved). Within a function, the SP access will use the local
variable.

Supersedes https://golang.org/cl/173979.

Running on Chrome Version 73.0.3683.103 on darwin/amd64:

name                   old time/op    new time/op     delta
BinaryTree17              15.3s ± 2%      14.5s ± 3%   -5.20%  (p=0.000 n=9+10)
Fannkuch11                8.91s ± 2%      9.48s ± 2%   +6.41%  (p=0.000 n=9+10)
FmtFprintfEmpty           197ns ± 5%      165ns ± 3%  -16.09%  (p=0.000 n=9+8)
FmtFprintfString          354ns ± 8%      325ns ± 7%   -8.33%  (p=0.001 n=10+10)
FmtFprintfInt             400ns ± 4%      368ns ± 6%   -8.01%  (p=0.000 n=10+10)
FmtFprintfIntInt          618ns ± 3%      587ns ± 6%   -4.97%  (p=0.001 n=10+10)
FmtFprintfPrefixedInt     637ns ± 4%      606ns ± 4%   -4.88%  (p=0.000 n=10+10)
FmtFprintfFloat           965ns ± 7%      898ns ± 4%   -6.97%  (p=0.000 n=10+10)
FmtManyArgs              2.34µs ± 1%     2.24µs ± 3%   -4.40%  (p=0.000 n=9+10)
GobDecode                29.8ms ± 3%     28.8ms ± 6%   -3.60%  (p=0.006 n=9+10)
GobEncode                20.5ms ± 8%     17.6ms ± 3%  -14.32%  (p=0.000 n=10+10)
Gzip                      714ms ± 3%      718ms ± 8%     ~     (p=0.971 n=10+10)
Gunzip                    148ms ± 3%      136ms ± 3%   -7.99%  (p=0.000 n=10+9)
HTTPClientServer          219µs ± 3%      215µs ± 4%     ~     (p=0.190 n=10+10)
JSONEncode               35.1ms ± 2%     31.8ms ±13%   -9.52%  (p=0.002 n=10+10)
JSONDecode                220ms ± 3%      207ms ± 5%   -5.87%  (p=0.000 n=10+10)
Mandelbrot200            5.22ms ± 1%     5.11ms ± 4%   -2.11%  (p=0.027 n=8+10)
GoParse                  17.2ms ± 6%     16.1ms ± 5%   -6.63%  (p=0.000 n=10+9)
RegexpMatchEasy0_32       375ns ± 3%      340ns ± 3%   -9.25%  (p=0.000 n=10+10)
RegexpMatchEasy0_1K      2.70µs ± 3%     2.65µs ± 4%     ~     (p=0.118 n=10+10)
RegexpMatchEasy1_32       341ns ± 2%      305ns ± 4%  -10.62%  (p=0.000 n=9+10)
RegexpMatchEasy1_1K      3.20µs ± 3%     2.99µs ± 3%   -6.35%  (p=0.000 n=10+10)
RegexpMatchMedium_32      520ns ± 3%      501ns ± 4%   -3.64%  (p=0.002 n=9+10)
RegexpMatchMedium_1K      145µs ± 7%      128µs ± 3%  -11.57%  (p=0.000 n=9+10)
RegexpMatchHard_32       7.88µs ± 3%     7.01µs ± 5%  -10.97%  (p=0.000 n=10+10)
RegexpMatchHard_1K        237µs ± 5%      207µs ± 4%  -12.71%  (p=0.000 n=9+10)
Revcomp                   2.34s ± 1%      2.31s ± 5%     ~     (p=0.230 n=7+10)
Template                  261ms ± 7%      246ms ± 5%   -5.93%  (p=0.007 n=10+10)
TimeParse                1.47µs ± 3%     1.39µs ± 5%   -5.75%  (p=0.000 n=9+10)
TimeFormat               1.52µs ± 3%     1.43µs ± 4%   -6.42%  (p=0.000 n=8+10)

name                   old speed      new speed       delta
GobDecode              25.7MB/s ± 3%   26.7MB/s ± 5%   +3.77%  (p=0.006 n=9+10)
GobEncode              37.5MB/s ± 8%   43.7MB/s ± 3%  +16.61%  (p=0.000 n=10+10)
Gzip                   27.2MB/s ± 3%   27.0MB/s ± 7%     ~     (p=0.971 n=10+10)
Gunzip                  131MB/s ± 3%    142MB/s ± 5%   +8.07%  (p=0.000 n=10+10)
JSONEncode             55.2MB/s ± 2%   61.2MB/s ±12%  +10.80%  (p=0.002 n=10+10)
JSONDecode             8.84MB/s ± 3%   9.39MB/s ± 5%   +6.28%  (p=0.000 n=10+10)
GoParse                3.37MB/s ± 6%   3.61MB/s ± 5%   +7.09%  (p=0.000 n=10+9)
RegexpMatchEasy0_32    85.3MB/s ± 3%   94.0MB/s ± 3%  +10.20%  (p=0.000 n=10+10)
RegexpMatchEasy0_1K     379MB/s ± 3%    387MB/s ± 4%     ~     (p=0.123 n=10+10)
RegexpMatchEasy1_32    93.9MB/s ± 2%  105.1MB/s ± 4%  +11.96%  (p=0.000 n=9+10)
RegexpMatchEasy1_1K     320MB/s ± 3%    342MB/s ± 3%   +6.79%  (p=0.000 n=10+10)
RegexpMatchMedium_32   1.92MB/s ± 2%   2.00MB/s ± 3%   +3.94%  (p=0.001 n=9+10)
RegexpMatchMedium_1K   7.09MB/s ± 6%   8.01MB/s ± 3%  +13.00%  (p=0.000 n=9+10)
RegexpMatchHard_32     4.06MB/s ± 3%   4.56MB/s ± 5%  +12.38%  (p=0.000 n=10+10)
RegexpMatchHard_1K     4.32MB/s ± 4%   4.96MB/s ± 4%  +14.60%  (p=0.000 n=9+10)
Revcomp                 109MB/s ± 1%    110MB/s ± 5%     ~     (p=0.219 n=7+10)
Template               7.44MB/s ± 8%   7.91MB/s ± 5%   +6.30%  (p=0.007 n=10+10)

Change-Id: I5828cf6b23ce104c02addc2642aba48dd6c48aab
Reviewed-on: https://go-review.googlesource.com/c/go/+/174062
Run-TryBot: Richard Musiol <neelance@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
src/cmd/internal/obj/wasm/wasmobj.go

index 0474e3b4b1266b39ec8088be66d8c0cafcc40d12..0ad883470ef253760de376b2a952240b053f7efd 100644 (file)
@@ -774,21 +774,34 @@ func countRegisters(s *obj.LSym) (numI, numF int16) {
 func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
        w := new(bytes.Buffer)
 
-       numI, numF := countRegisters(s)
+       hasLocalSP := false
+       var r0, f0 int16
 
        // Function starts with declaration of locals: numbers and types.
+       // Some functions use a special calling convention.
        switch s.Name {
-       // memchr and memcmp don't use the normal Go calling convention and need i32 variables.
-       case "memchr":
+       case "wasm_export_run", "runtime.wasmMove", "runtime.wasmZero", "runtime.wasmDiv", "runtime.wasmTruncS", "runtime.wasmTruncU", "memeqbody":
+               writeUleb128(w, 0) // number of sets of locals
+       case "memchr", "memcmp":
                writeUleb128(w, 1) // number of sets of locals
-               writeUleb128(w, 3) // number of locals
+               writeUleb128(w, 2) // number of locals
                w.WriteByte(0x7F)  // i32
-       case "memcmp":
+       case "cmpbody":
                writeUleb128(w, 1) // number of sets of locals
                writeUleb128(w, 2) // number of locals
-               w.WriteByte(0x7F)  // i32
+               w.WriteByte(0x7E)  // i64
+       case "runtime.gcWriteBarrier":
+               writeUleb128(w, 1) // number of sets of locals
+               writeUleb128(w, 4) // number of locals
+               w.WriteByte(0x7E)  // i64
        default:
-               numTypes := 0
+               // Normal calling convention: No WebAssembly parameters. First local variable is local SP cache.
+               hasLocalSP = true
+               numI, numF := countRegisters(s)
+               r0 = 1
+               f0 = 1 + numI
+
+               numTypes := 1
                if numI > 0 {
                        numTypes++
                }
@@ -797,6 +810,8 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
                }
 
                writeUleb128(w, uint64(numTypes))
+               writeUleb128(w, 1) // number of locals (SP)
+               w.WriteByte(0x7F)  // i32
                if numI > 0 {
                        writeUleb128(w, uint64(numI)) // number of locals
                        w.WriteByte(0x7E)             // i64
@@ -807,6 +822,10 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
                }
        }
 
+       if hasLocalSP {
+               // Copy SP from its global variable into a local variable. Accessing a local variable is more efficient.
+               updateLocalSP(w)
+       }
        for p := s.Func.Text; p != nil; p = p.Link {
                switch p.As {
                case AGet:
@@ -815,15 +834,18 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
                        }
                        reg := p.From.Reg
                        switch {
+                       case reg == REG_SP && hasLocalSP:
+                               w.WriteByte(0x20)  // local.get
+                               writeUleb128(w, 0) // local SP
                        case reg >= REG_PC_F && reg <= REG_PAUSE:
                                w.WriteByte(0x23) // global.get
                                writeUleb128(w, uint64(reg-REG_PC_F))
                        case reg >= REG_R0 && reg <= REG_R15:
                                w.WriteByte(0x20) // local.get (i64)
-                               writeUleb128(w, uint64(reg-REG_R0))
+                               writeUleb128(w, uint64(r0+(reg-REG_R0)))
                        case reg >= REG_F0 && reg <= REG_F15:
                                w.WriteByte(0x20) // local.get (f64)
-                               writeUleb128(w, uint64(numI+(reg-REG_F0)))
+                               writeUleb128(w, uint64(f0+(reg-REG_F0)))
                        default:
                                panic("bad Get: invalid register")
                        }
@@ -836,6 +858,10 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
                        reg := p.To.Reg
                        switch {
                        case reg >= REG_PC_F && reg <= REG_PAUSE:
+                               if reg == REG_SP && hasLocalSP {
+                                       w.WriteByte(0x22)  // local.tee
+                                       writeUleb128(w, 0) // local SP
+                               }
                                w.WriteByte(0x24) // global.set
                                writeUleb128(w, uint64(reg-REG_PC_F))
                        case reg >= REG_R0 && reg <= REG_F15:
@@ -846,9 +872,9 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
                                        w.WriteByte(0x21) // local.set
                                }
                                if reg <= REG_R15 {
-                                       writeUleb128(w, uint64(reg-REG_R0))
+                                       writeUleb128(w, uint64(r0+(reg-REG_R0)))
                                } else {
-                                       writeUleb128(w, uint64(numI+(reg-REG_F0)))
+                                       writeUleb128(w, uint64(f0+(reg-REG_F0)))
                                }
                        default:
                                panic("bad Set: invalid register")
@@ -863,10 +889,10 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
                        switch {
                        case reg >= REG_R0 && reg <= REG_R15:
                                w.WriteByte(0x22) // local.tee (i64)
-                               writeUleb128(w, uint64(reg-REG_R0))
+                               writeUleb128(w, uint64(r0+(reg-REG_R0)))
                        case reg >= REG_F0 && reg <= REG_F15:
                                w.WriteByte(0x22) // local.tee (f64)
-                               writeUleb128(w, uint64(numI+(reg-REG_F0)))
+                               writeUleb128(w, uint64(f0+(reg-REG_F0)))
                        default:
                                panic("bad Tee: invalid register")
                        }
@@ -942,6 +968,10 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
                                        r.Type = objabi.R_WASMIMPORT
                                }
                                r.Sym = p.To.Sym
+                               if hasLocalSP {
+                                       // The stack may have moved, which changes SP. Update the local SP variable.
+                                       updateLocalSP(w)
+                               }
 
                        default:
                                panic("bad type for Call")
@@ -950,6 +980,10 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
                case ACallIndirect:
                        writeUleb128(w, uint64(p.To.Offset))
                        w.WriteByte(0x00) // reserved value
+                       if hasLocalSP {
+                               // The stack may have moved, which changes SP. Update the local SP variable.
+                               updateLocalSP(w)
+                       }
 
                case AI32Const, AI64Const:
                        if p.From.Name == obj.NAME_EXTERN {
@@ -1001,6 +1035,13 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
        s.P = w.Bytes()
 }
 
+func updateLocalSP(w *bytes.Buffer) {
+       w.WriteByte(0x23)                        // global.get
+       writeUleb128(w, uint64(REG_SP-REG_PC_F)) // SP
+       w.WriteByte(0x21)                        // local.set
+       writeUleb128(w, 0)                       // local SP
+}
+
 func align(as obj.As) uint64 {
        switch as {
        case AI32Load8S, AI32Load8U, AI64Load8S, AI64Load8U, AI32Store8, AI64Store8: