From 203e18883726409205df4af11935d6393ed9ad52 Mon Sep 17 00:00:00 2001 From: Richard Musiol Date: Sat, 27 Apr 2019 01:16:10 +0200 Subject: [PATCH] cmd/internal/obj/wasm: cache SP in a local MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit We use Wasm global variables extensively for simulating registers, especially SP. V8 does not handle global variables efficiently. This CL reduces global variable accesses by caching the global SP in a local variable in each function. The local cache is set on function entry and updated after each call (where the stack could have moved). Within a function, the SP access will use the local variable. Supersedes https://golang.org/cl/173979. Running on Chrome Version 73.0.3683.103 on darwin/amd64: name old time/op new time/op delta BinaryTree17 15.3s ± 2% 14.5s ± 3% -5.20% (p=0.000 n=9+10) Fannkuch11 8.91s ± 2% 9.48s ± 2% +6.41% (p=0.000 n=9+10) FmtFprintfEmpty 197ns ± 5% 165ns ± 3% -16.09% (p=0.000 n=9+8) FmtFprintfString 354ns ± 8% 325ns ± 7% -8.33% (p=0.001 n=10+10) FmtFprintfInt 400ns ± 4% 368ns ± 6% -8.01% (p=0.000 n=10+10) FmtFprintfIntInt 618ns ± 3% 587ns ± 6% -4.97% (p=0.001 n=10+10) FmtFprintfPrefixedInt 637ns ± 4% 606ns ± 4% -4.88% (p=0.000 n=10+10) FmtFprintfFloat 965ns ± 7% 898ns ± 4% -6.97% (p=0.000 n=10+10) FmtManyArgs 2.34µs ± 1% 2.24µs ± 3% -4.40% (p=0.000 n=9+10) GobDecode 29.8ms ± 3% 28.8ms ± 6% -3.60% (p=0.006 n=9+10) GobEncode 20.5ms ± 8% 17.6ms ± 3% -14.32% (p=0.000 n=10+10) Gzip 714ms ± 3% 718ms ± 8% ~ (p=0.971 n=10+10) Gunzip 148ms ± 3% 136ms ± 3% -7.99% (p=0.000 n=10+9) HTTPClientServer 219µs ± 3% 215µs ± 4% ~ (p=0.190 n=10+10) JSONEncode 35.1ms ± 2% 31.8ms ±13% -9.52% (p=0.002 n=10+10) JSONDecode 220ms ± 3% 207ms ± 5% -5.87% (p=0.000 n=10+10) Mandelbrot200 5.22ms ± 1% 5.11ms ± 4% -2.11% (p=0.027 n=8+10) GoParse 17.2ms ± 6% 16.1ms ± 5% -6.63% (p=0.000 n=10+9) RegexpMatchEasy0_32 375ns ± 3% 340ns ± 3% -9.25% (p=0.000 n=10+10) RegexpMatchEasy0_1K 2.70µs ± 3% 2.65µs ± 4% ~ (p=0.118 n=10+10) RegexpMatchEasy1_32 341ns ± 2% 305ns ± 4% -10.62% (p=0.000 n=9+10) RegexpMatchEasy1_1K 3.20µs ± 3% 2.99µs ± 3% -6.35% (p=0.000 n=10+10) RegexpMatchMedium_32 520ns ± 3% 501ns ± 4% -3.64% (p=0.002 n=9+10) RegexpMatchMedium_1K 145µs ± 7% 128µs ± 3% -11.57% (p=0.000 n=9+10) RegexpMatchHard_32 7.88µs ± 3% 7.01µs ± 5% -10.97% (p=0.000 n=10+10) RegexpMatchHard_1K 237µs ± 5% 207µs ± 4% -12.71% (p=0.000 n=9+10) Revcomp 2.34s ± 1% 2.31s ± 5% ~ (p=0.230 n=7+10) Template 261ms ± 7% 246ms ± 5% -5.93% (p=0.007 n=10+10) TimeParse 1.47µs ± 3% 1.39µs ± 5% -5.75% (p=0.000 n=9+10) TimeFormat 1.52µs ± 3% 1.43µs ± 4% -6.42% (p=0.000 n=8+10) name old speed new speed delta GobDecode 25.7MB/s ± 3% 26.7MB/s ± 5% +3.77% (p=0.006 n=9+10) GobEncode 37.5MB/s ± 8% 43.7MB/s ± 3% +16.61% (p=0.000 n=10+10) Gzip 27.2MB/s ± 3% 27.0MB/s ± 7% ~ (p=0.971 n=10+10) Gunzip 131MB/s ± 3% 142MB/s ± 5% +8.07% (p=0.000 n=10+10) JSONEncode 55.2MB/s ± 2% 61.2MB/s ±12% +10.80% (p=0.002 n=10+10) JSONDecode 8.84MB/s ± 3% 9.39MB/s ± 5% +6.28% (p=0.000 n=10+10) GoParse 3.37MB/s ± 6% 3.61MB/s ± 5% +7.09% (p=0.000 n=10+9) RegexpMatchEasy0_32 85.3MB/s ± 3% 94.0MB/s ± 3% +10.20% (p=0.000 n=10+10) RegexpMatchEasy0_1K 379MB/s ± 3% 387MB/s ± 4% ~ (p=0.123 n=10+10) RegexpMatchEasy1_32 93.9MB/s ± 2% 105.1MB/s ± 4% +11.96% (p=0.000 n=9+10) RegexpMatchEasy1_1K 320MB/s ± 3% 342MB/s ± 3% +6.79% (p=0.000 n=10+10) RegexpMatchMedium_32 1.92MB/s ± 2% 2.00MB/s ± 3% +3.94% (p=0.001 n=9+10) RegexpMatchMedium_1K 7.09MB/s ± 6% 8.01MB/s ± 3% +13.00% (p=0.000 n=9+10) RegexpMatchHard_32 4.06MB/s ± 3% 4.56MB/s ± 5% +12.38% (p=0.000 n=10+10) RegexpMatchHard_1K 4.32MB/s ± 4% 4.96MB/s ± 4% +14.60% (p=0.000 n=9+10) Revcomp 109MB/s ± 1% 110MB/s ± 5% ~ (p=0.219 n=7+10) Template 7.44MB/s ± 8% 7.91MB/s ± 5% +6.30% (p=0.007 n=10+10) Change-Id: I5828cf6b23ce104c02addc2642aba48dd6c48aab Reviewed-on: https://go-review.googlesource.com/c/go/+/174062 Run-TryBot: Richard Musiol TryBot-Result: Gobot Gobot Reviewed-by: Cherry Zhang --- src/cmd/internal/obj/wasm/wasmobj.go | 67 ++++++++++++++++++++++------ 1 file changed, 54 insertions(+), 13 deletions(-) diff --git a/src/cmd/internal/obj/wasm/wasmobj.go b/src/cmd/internal/obj/wasm/wasmobj.go index 0474e3b4b1..0ad883470e 100644 --- a/src/cmd/internal/obj/wasm/wasmobj.go +++ b/src/cmd/internal/obj/wasm/wasmobj.go @@ -774,21 +774,34 @@ func countRegisters(s *obj.LSym) (numI, numF int16) { func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) { w := new(bytes.Buffer) - numI, numF := countRegisters(s) + hasLocalSP := false + var r0, f0 int16 // Function starts with declaration of locals: numbers and types. + // Some functions use a special calling convention. switch s.Name { - // memchr and memcmp don't use the normal Go calling convention and need i32 variables. - case "memchr": + case "wasm_export_run", "runtime.wasmMove", "runtime.wasmZero", "runtime.wasmDiv", "runtime.wasmTruncS", "runtime.wasmTruncU", "memeqbody": + writeUleb128(w, 0) // number of sets of locals + case "memchr", "memcmp": writeUleb128(w, 1) // number of sets of locals - writeUleb128(w, 3) // number of locals + writeUleb128(w, 2) // number of locals w.WriteByte(0x7F) // i32 - case "memcmp": + case "cmpbody": writeUleb128(w, 1) // number of sets of locals writeUleb128(w, 2) // number of locals - w.WriteByte(0x7F) // i32 + w.WriteByte(0x7E) // i64 + case "runtime.gcWriteBarrier": + writeUleb128(w, 1) // number of sets of locals + writeUleb128(w, 4) // number of locals + w.WriteByte(0x7E) // i64 default: - numTypes := 0 + // Normal calling convention: No WebAssembly parameters. First local variable is local SP cache. + hasLocalSP = true + numI, numF := countRegisters(s) + r0 = 1 + f0 = 1 + numI + + numTypes := 1 if numI > 0 { numTypes++ } @@ -797,6 +810,8 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) { } writeUleb128(w, uint64(numTypes)) + writeUleb128(w, 1) // number of locals (SP) + w.WriteByte(0x7F) // i32 if numI > 0 { writeUleb128(w, uint64(numI)) // number of locals w.WriteByte(0x7E) // i64 @@ -807,6 +822,10 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) { } } + if hasLocalSP { + // Copy SP from its global variable into a local variable. Accessing a local variable is more efficient. + updateLocalSP(w) + } for p := s.Func.Text; p != nil; p = p.Link { switch p.As { case AGet: @@ -815,15 +834,18 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) { } reg := p.From.Reg switch { + case reg == REG_SP && hasLocalSP: + w.WriteByte(0x20) // local.get + writeUleb128(w, 0) // local SP case reg >= REG_PC_F && reg <= REG_PAUSE: w.WriteByte(0x23) // global.get writeUleb128(w, uint64(reg-REG_PC_F)) case reg >= REG_R0 && reg <= REG_R15: w.WriteByte(0x20) // local.get (i64) - writeUleb128(w, uint64(reg-REG_R0)) + writeUleb128(w, uint64(r0+(reg-REG_R0))) case reg >= REG_F0 && reg <= REG_F15: w.WriteByte(0x20) // local.get (f64) - writeUleb128(w, uint64(numI+(reg-REG_F0))) + writeUleb128(w, uint64(f0+(reg-REG_F0))) default: panic("bad Get: invalid register") } @@ -836,6 +858,10 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) { reg := p.To.Reg switch { case reg >= REG_PC_F && reg <= REG_PAUSE: + if reg == REG_SP && hasLocalSP { + w.WriteByte(0x22) // local.tee + writeUleb128(w, 0) // local SP + } w.WriteByte(0x24) // global.set writeUleb128(w, uint64(reg-REG_PC_F)) case reg >= REG_R0 && reg <= REG_F15: @@ -846,9 +872,9 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) { w.WriteByte(0x21) // local.set } if reg <= REG_R15 { - writeUleb128(w, uint64(reg-REG_R0)) + writeUleb128(w, uint64(r0+(reg-REG_R0))) } else { - writeUleb128(w, uint64(numI+(reg-REG_F0))) + writeUleb128(w, uint64(f0+(reg-REG_F0))) } default: panic("bad Set: invalid register") @@ -863,10 +889,10 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) { switch { case reg >= REG_R0 && reg <= REG_R15: w.WriteByte(0x22) // local.tee (i64) - writeUleb128(w, uint64(reg-REG_R0)) + writeUleb128(w, uint64(r0+(reg-REG_R0))) case reg >= REG_F0 && reg <= REG_F15: w.WriteByte(0x22) // local.tee (f64) - writeUleb128(w, uint64(numI+(reg-REG_F0))) + writeUleb128(w, uint64(f0+(reg-REG_F0))) default: panic("bad Tee: invalid register") } @@ -942,6 +968,10 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) { r.Type = objabi.R_WASMIMPORT } r.Sym = p.To.Sym + if hasLocalSP { + // The stack may have moved, which changes SP. Update the local SP variable. + updateLocalSP(w) + } default: panic("bad type for Call") @@ -950,6 +980,10 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) { case ACallIndirect: writeUleb128(w, uint64(p.To.Offset)) w.WriteByte(0x00) // reserved value + if hasLocalSP { + // The stack may have moved, which changes SP. Update the local SP variable. + updateLocalSP(w) + } case AI32Const, AI64Const: if p.From.Name == obj.NAME_EXTERN { @@ -1001,6 +1035,13 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) { s.P = w.Bytes() } +func updateLocalSP(w *bytes.Buffer) { + w.WriteByte(0x23) // global.get + writeUleb128(w, uint64(REG_SP-REG_PC_F)) // SP + w.WriteByte(0x21) // local.set + writeUleb128(w, 0) // local SP +} + func align(as obj.As) uint64 { switch as { case AI32Load8S, AI32Load8U, AI64Load8S, AI64Load8U, AI32Store8, AI64Store8: -- 2.48.1